diff --git "a/sft/revise_Full_smoe_plus_plus/checkpoint-16632/trainer_state.json" "b/sft/revise_Full_smoe_plus_plus/checkpoint-16632/trainer_state.json" new file mode 100644--- /dev/null +++ "b/sft/revise_Full_smoe_plus_plus/checkpoint-16632/trainer_state.json" @@ -0,0 +1,282777 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999969938373666, + "eval_steps": 500, + "global_step": 16632, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.07718985, + "auxiliary_loss_mlp": 1.70153117, + "balance_loss_clip": 3.41505909, + "balance_loss_mlp": 2.47734356, + "epoch": 6.012325266796934e-05, + "flos": 24466939372800.0, + "grad_norm": 80.750913664829, + "language_loss": 3.30601239, + "learning_rate": 0.0, + "loss": 3.35351992, + "num_input_tokens_seen": 19155, + "router_z_loss_clip": 43.03125, + "router_z_loss_mlp": 1679.0, + "step": 1, + "time_per_iteration": 19.43519115447998 + }, + { + "auxiliary_loss_clip": 0.05053996, + "auxiliary_loss_mlp": 0.99255323, + "balance_loss_clip": 2.26067638, + "balance_loss_mlp": 1.55024052, + "epoch": 0.00012024650533593868, + "flos": 20234506619520.0, + "grad_norm": 65.25129112022476, + "language_loss": 2.05704641, + "learning_rate": 4.4628432569317594e-07, + "loss": 3.10013962, + "num_input_tokens_seen": 36175, + "router_z_loss_clip": 27.890625, + "router_z_loss_mlp": 977.0, + "step": 2, + "time_per_iteration": 2.70206618309021 + }, + { + "auxiliary_loss_clip": 0.05078094, + "auxiliary_loss_mlp": 1.09766936, + "balance_loss_clip": 2.2767148, + "balance_loss_mlp": 1.5638144, + "epoch": 0.000180369758003908, + "flos": 22320171780480.0, + "grad_norm": 72.49628618074873, + "language_loss": 1.8270725, + "learning_rate": 7.073439208833112e-07, + "loss": 2.97552299, + "num_input_tokens_seen": 54870, + "router_z_loss_clip": 28.0, + "router_z_loss_mlp": 1082.5, + "step": 3, + "time_per_iteration": 2.5937623977661133 + }, + { + "auxiliary_loss_clip": 0.05120585, + "auxiliary_loss_mlp": 0.9843846, + "balance_loss_clip": 2.2918458, + "balance_loss_mlp": 1.56345916, + "epoch": 0.00024049301067187735, + "flos": 22423683179520.0, + "grad_norm": 71.5263710431209, + "language_loss": 1.99349856, + "learning_rate": 8.925686513863519e-07, + "loss": 3.02908897, + "num_input_tokens_seen": 74575, + "router_z_loss_clip": 28.28125, + "router_z_loss_mlp": 969.0, + "step": 4, + "time_per_iteration": 2.8225133419036865 + }, + { + "auxiliary_loss_clip": 0.05109305, + "auxiliary_loss_mlp": 1.06129885, + "balance_loss_clip": 2.28268504, + "balance_loss_mlp": 1.58887625, + "epoch": 0.0003006162633398467, + "flos": 21406766878080.0, + "grad_norm": 74.13974949679341, + "language_loss": 2.2624855, + "learning_rate": 1.0362401141348472e-06, + "loss": 3.37487745, + "num_input_tokens_seen": 92580, + "router_z_loss_clip": 28.28125, + "router_z_loss_mlp": 1046.0, + "step": 5, + "time_per_iteration": 2.9290919303894043 + }, + { + "auxiliary_loss_clip": 0.05097248, + "auxiliary_loss_mlp": 1.13614821, + "balance_loss_clip": 2.2707057, + "balance_loss_mlp": 1.55427611, + "epoch": 0.000360739516007816, + "flos": 21662228874240.0, + "grad_norm": 65.4975568880652, + "language_loss": 1.89590144, + "learning_rate": 1.153628246576487e-06, + "loss": 3.08302212, + "num_input_tokens_seen": 109705, + "router_z_loss_clip": 28.25, + "router_z_loss_mlp": 1122.0, + "step": 6, + "time_per_iteration": 3.007115364074707 + }, + { + "auxiliary_loss_clip": 0.05072753, + "auxiliary_loss_mlp": 1.1702292, + "balance_loss_clip": 2.24336386, + "balance_loss_mlp": 1.59323204, + "epoch": 0.0004208627686757854, + "flos": 27170511102720.0, + "grad_norm": 66.3124465874403, + "language_loss": 1.7740798, + "learning_rate": 1.2528784983718962e-06, + "loss": 2.9950366, + "num_input_tokens_seen": 129425, + "router_z_loss_clip": 28.265625, + "router_z_loss_mlp": 1154.0, + "step": 7, + "time_per_iteration": 3.123600482940674 + }, + { + "auxiliary_loss_clip": 0.04936389, + "auxiliary_loss_mlp": 0.77675629, + "balance_loss_clip": 2.25196505, + "balance_loss_mlp": 1.4061029, + "epoch": 0.0004809860213437547, + "flos": 31330937427840.0, + "grad_norm": 56.04965158379521, + "language_loss": 1.75922132, + "learning_rate": 1.338852977079528e-06, + "loss": 2.58534145, + "num_input_tokens_seen": 149210, + "router_z_loss_clip": 26.84375, + "router_z_loss_mlp": 763.0, + "step": 8, + "time_per_iteration": 2.997500419616699 + }, + { + "auxiliary_loss_clip": 0.0494687, + "auxiliary_loss_mlp": 0.89586616, + "balance_loss_clip": 2.2349596, + "balance_loss_mlp": 1.45184994, + "epoch": 0.000541109274011724, + "flos": 32173027246080.0, + "grad_norm": 56.731733031942944, + "language_loss": 1.84469974, + "learning_rate": 1.4146878417666224e-06, + "loss": 2.79003477, + "num_input_tokens_seen": 169055, + "router_z_loss_clip": 27.09375, + "router_z_loss_mlp": 882.0, + "step": 9, + "time_per_iteration": 3.1255698204040527 + }, + { + "auxiliary_loss_clip": 0.04919174, + "auxiliary_loss_mlp": 0.80704236, + "balance_loss_clip": 2.23462343, + "balance_loss_mlp": 1.40736294, + "epoch": 0.0006012325266796934, + "flos": 18926176688640.0, + "grad_norm": 47.71260207760991, + "language_loss": 1.74780297, + "learning_rate": 1.4825244398280232e-06, + "loss": 2.60403705, + "num_input_tokens_seen": 188045, + "router_z_loss_clip": 26.859375, + "router_z_loss_mlp": 793.0, + "step": 10, + "time_per_iteration": 2.9445340633392334 + }, + { + "auxiliary_loss_clip": 0.04901214, + "auxiliary_loss_mlp": 0.73953408, + "balance_loss_clip": 2.26035595, + "balance_loss_mlp": 1.39481604, + "epoch": 0.0006613557793476627, + "flos": 20784006443520.0, + "grad_norm": 48.1537727282832, + "language_loss": 1.75507092, + "learning_rate": 1.5438901072051983e-06, + "loss": 2.54361701, + "num_input_tokens_seen": 207035, + "router_z_loss_clip": 26.390625, + "router_z_loss_mlp": 726.0, + "step": 11, + "time_per_iteration": 2.9828543663024902 + }, + { + "auxiliary_loss_clip": 0.0488665, + "auxiliary_loss_mlp": 0.71851075, + "balance_loss_clip": 2.25469971, + "balance_loss_mlp": 1.39208663, + "epoch": 0.000721479032015632, + "flos": 16590433662720.0, + "grad_norm": 43.00170113714157, + "language_loss": 1.68411064, + "learning_rate": 1.5999125722696629e-06, + "loss": 2.45148778, + "num_input_tokens_seen": 223225, + "router_z_loss_clip": 26.296875, + "router_z_loss_mlp": 704.0, + "step": 12, + "time_per_iteration": 2.938222646713257 + }, + { + "auxiliary_loss_clip": 0.04851279, + "auxiliary_loss_mlp": 0.79162264, + "balance_loss_clip": 2.27056551, + "balance_loss_mlp": 1.33023047, + "epoch": 0.0007816022846836014, + "flos": 23815996738560.0, + "grad_norm": 44.59606807630656, + "language_loss": 1.61133099, + "learning_rate": 1.6514482443788434e-06, + "loss": 2.45146656, + "num_input_tokens_seen": 242570, + "router_z_loss_clip": 25.78125, + "router_z_loss_mlp": 778.5, + "step": 13, + "time_per_iteration": 2.975118637084961 + }, + { + "auxiliary_loss_clip": 0.0482919, + "auxiliary_loss_mlp": 0.4812108, + "balance_loss_clip": 2.38134813, + "balance_loss_mlp": 1.34373498, + "epoch": 0.0008417255373515708, + "flos": 19181638684800.0, + "grad_norm": 24.752099696183123, + "language_loss": 1.42605567, + "learning_rate": 1.6991628240650723e-06, + "loss": 1.95555842, + "num_input_tokens_seen": 261215, + "router_z_loss_clip": 24.46875, + "router_z_loss_mlp": 467.5, + "step": 14, + "time_per_iteration": 3.005387783050537 + }, + { + "auxiliary_loss_clip": 0.04843317, + "auxiliary_loss_mlp": 0.42258424, + "balance_loss_clip": 2.42058206, + "balance_loss_mlp": 1.34045625, + "epoch": 0.00090184879001954, + "flos": 26406658823040.0, + "grad_norm": 21.522688589637365, + "language_loss": 1.34224033, + "learning_rate": 1.7435840350181584e-06, + "loss": 1.81325781, + "num_input_tokens_seen": 280035, + "router_z_loss_clip": 24.1875, + "router_z_loss_mlp": 409.5, + "step": 15, + "time_per_iteration": 4.444719314575195 + }, + { + "auxiliary_loss_clip": 0.0485227, + "auxiliary_loss_mlp": 0.40017533, + "balance_loss_clip": 2.44129586, + "balance_loss_mlp": 1.37007236, + "epoch": 0.0009619720426875094, + "flos": 24689830423680.0, + "grad_norm": 20.54737274620766, + "language_loss": 1.28753221, + "learning_rate": 1.7851373027727038e-06, + "loss": 1.73623025, + "num_input_tokens_seen": 300265, + "router_z_loss_clip": 24.125, + "router_z_loss_mlp": 386.75, + "step": 16, + "time_per_iteration": 5.87269401550293 + }, + { + "auxiliary_loss_clip": 0.04792338, + "auxiliary_loss_mlp": 0.3982912, + "balance_loss_clip": 2.41458082, + "balance_loss_mlp": 1.32814205, + "epoch": 0.0010220952953554788, + "flos": 18633722428800.0, + "grad_norm": 20.218220196956235, + "language_loss": 1.39150214, + "learning_rate": 1.8241705979033208e-06, + "loss": 1.83771682, + "num_input_tokens_seen": 317375, + "router_z_loss_clip": 23.734375, + "router_z_loss_mlp": 385.0, + "step": 17, + "time_per_iteration": 2.999260902404785 + }, + { + "auxiliary_loss_clip": 0.04784064, + "auxiliary_loss_mlp": 0.32224673, + "balance_loss_clip": 2.46232653, + "balance_loss_mlp": 1.29205656, + "epoch": 0.001082218548023448, + "flos": 26154182983680.0, + "grad_norm": 16.732686601673127, + "language_loss": 1.23413396, + "learning_rate": 1.860972167459798e-06, + "loss": 1.60422134, + "num_input_tokens_seen": 337975, + "router_z_loss_clip": 23.203125, + "router_z_loss_mlp": 309.5, + "step": 18, + "time_per_iteration": 3.023937463760376 + }, + { + "auxiliary_loss_clip": 0.04811134, + "auxiliary_loss_mlp": 0.31757289, + "balance_loss_clip": 2.48741102, + "balance_loss_mlp": 1.31295526, + "epoch": 0.0011423418006914173, + "flos": 19619256821760.0, + "grad_norm": 16.316076017634586, + "language_loss": 1.22099996, + "learning_rate": 1.89578346593066e-06, + "loss": 1.58668423, + "num_input_tokens_seen": 356635, + "router_z_loss_clip": 23.203125, + "router_z_loss_mlp": 304.5, + "step": 19, + "time_per_iteration": 3.008336067199707 + }, + { + "auxiliary_loss_clip": 0.04806993, + "auxiliary_loss_mlp": 0.33392182, + "balance_loss_clip": 2.49775839, + "balance_loss_mlp": 1.29989696, + "epoch": 0.0012024650533593868, + "flos": 17904509683200.0, + "grad_norm": 17.142970550289178, + "language_loss": 1.31938076, + "learning_rate": 1.928808765521199e-06, + "loss": 1.70137239, + "num_input_tokens_seen": 375625, + "router_z_loss_clip": 23.078125, + "router_z_loss_mlp": 321.125, + "step": 20, + "time_per_iteration": 2.952523946762085 + }, + { + "auxiliary_loss_clip": 0.04744621, + "auxiliary_loss_mlp": 0.27582109, + "balance_loss_clip": 2.52278638, + "balance_loss_mlp": 1.22713017, + "epoch": 0.001262588306027356, + "flos": 21262055448960.0, + "grad_norm": 14.549432931011598, + "language_loss": 1.28722882, + "learning_rate": 1.9602224192552076e-06, + "loss": 1.61049604, + "num_input_tokens_seen": 394350, + "router_z_loss_clip": 22.21875, + "router_z_loss_mlp": 263.5, + "step": 21, + "time_per_iteration": 3.0091187953948975 + }, + { + "auxiliary_loss_clip": 0.04678907, + "auxiliary_loss_mlp": 0.13504824, + "balance_loss_clip": 2.73326254, + "balance_loss_mlp": 1.35882723, + "epoch": 0.0013227115586953253, + "flos": 26115199948800.0, + "grad_norm": 7.4316860915247265, + "language_loss": 1.28335118, + "learning_rate": 1.9901744328983746e-06, + "loss": 1.4651885, + "num_input_tokens_seen": 413255, + "router_z_loss_clip": 19.4375, + "router_z_loss_mlp": 121.4375, + "step": 22, + "time_per_iteration": 3.0606420040130615 + }, + { + "auxiliary_loss_clip": 0.04738327, + "auxiliary_loss_mlp": 0.10603125, + "balance_loss_clip": 2.8633604, + "balance_loss_mlp": 1.39291942, + "epoch": 0.0013828348113632948, + "flos": 23961929777280.0, + "grad_norm": 5.122555311389441, + "language_loss": 1.08021665, + "learning_rate": 2.018794797290208e-06, + "loss": 1.23363113, + "num_input_tokens_seen": 433065, + "router_z_loss_clip": 18.734375, + "router_z_loss_mlp": 92.0625, + "step": 23, + "time_per_iteration": 2.9850363731384277 + }, + { + "auxiliary_loss_clip": 0.04788831, + "auxiliary_loss_mlp": 0.09025525, + "balance_loss_clip": 2.97458267, + "balance_loss_mlp": 1.44801044, + "epoch": 0.001442958064031264, + "flos": 15968035186560.0, + "grad_norm": 4.53283244496276, + "language_loss": 1.19218946, + "learning_rate": 2.046196897962839e-06, + "loss": 1.33033299, + "num_input_tokens_seen": 451175, + "router_z_loss_clip": 18.171875, + "router_z_loss_mlp": 75.75, + "step": 24, + "time_per_iteration": 2.898371696472168 + }, + { + "auxiliary_loss_clip": 0.04741364, + "auxiliary_loss_mlp": 0.09615618, + "balance_loss_clip": 2.94651365, + "balance_loss_mlp": 1.44911408, + "epoch": 0.0015030813166992333, + "flos": 18116011716480.0, + "grad_norm": 5.890458377108494, + "language_loss": 1.21581137, + "learning_rate": 2.0724802282696944e-06, + "loss": 1.3593812, + "num_input_tokens_seen": 468775, + "router_z_loss_clip": 17.96875, + "router_z_loss_mlp": 81.625, + "step": 25, + "time_per_iteration": 2.886779546737671 + }, + { + "auxiliary_loss_clip": 0.0476236, + "auxiliary_loss_mlp": 0.09070724, + "balance_loss_clip": 2.99907613, + "balance_loss_mlp": 1.4748987, + "epoch": 0.0015632045693672028, + "flos": 22244151237120.0, + "grad_norm": 4.4364931290458, + "language_loss": 1.18391669, + "learning_rate": 2.0977325700720194e-06, + "loss": 1.32224751, + "num_input_tokens_seen": 488530, + "router_z_loss_clip": 17.609375, + "router_z_loss_mlp": 75.96875, + "step": 26, + "time_per_iteration": 3.0007307529449463 + }, + { + "auxiliary_loss_clip": 0.04789963, + "auxiliary_loss_mlp": 0.08426295, + "balance_loss_clip": 3.04724741, + "balance_loss_mlp": 1.47133934, + "epoch": 0.001623327822035172, + "flos": 24003582255360.0, + "grad_norm": 3.7605434031046165, + "language_loss": 1.05572927, + "learning_rate": 2.122031762649933e-06, + "loss": 1.18789196, + "num_input_tokens_seen": 510495, + "router_z_loss_clip": 17.4375, + "router_z_loss_mlp": 69.5625, + "step": 27, + "time_per_iteration": 2.9833555221557617 + }, + { + "auxiliary_loss_clip": 0.0477493, + "auxiliary_loss_mlp": 0.06973341, + "balance_loss_clip": 3.07912683, + "balance_loss_mlp": 1.48628032, + "epoch": 0.0016834510747031415, + "flos": 19685821201920.0, + "grad_norm": 3.2178699917375533, + "language_loss": 1.18026257, + "learning_rate": 2.1454471497582483e-06, + "loss": 1.29774523, + "num_input_tokens_seen": 528605, + "router_z_loss_clip": 16.953125, + "router_z_loss_mlp": 54.84375, + "step": 28, + "time_per_iteration": 2.917419672012329 + }, + { + "auxiliary_loss_clip": 0.04737539, + "auxiliary_loss_mlp": 0.07382212, + "balance_loss_clip": 3.0716362, + "balance_loss_mlp": 1.49842286, + "epoch": 0.0017435743273711108, + "flos": 20933785290240.0, + "grad_norm": 3.11924325803866, + "language_loss": 1.15037429, + "learning_rate": 2.1680407726407727e-06, + "loss": 1.27157176, + "num_input_tokens_seen": 548515, + "router_z_loss_clip": 16.671875, + "router_z_loss_mlp": 58.84375, + "step": 29, + "time_per_iteration": 2.983382225036621 + }, + { + "auxiliary_loss_clip": 0.04715473, + "auxiliary_loss_mlp": 0.07599083, + "balance_loss_clip": 3.05350828, + "balance_loss_mlp": 1.55354989, + "epoch": 0.00180369758003908, + "flos": 19536404313600.0, + "grad_norm": 3.824928631446582, + "language_loss": 1.37850773, + "learning_rate": 2.189868360711334e-06, + "loss": 1.50165331, + "num_input_tokens_seen": 564025, + "router_z_loss_clip": 16.65625, + "router_z_loss_mlp": 60.46875, + "step": 30, + "time_per_iteration": 2.9731624126434326 + }, + { + "auxiliary_loss_clip": 0.04724666, + "auxiliary_loss_mlp": 0.06635766, + "balance_loss_clip": 3.10167885, + "balance_loss_mlp": 1.65071881, + "epoch": 0.0018638208327070496, + "flos": 27464413196160.0, + "grad_norm": 3.153053452271225, + "language_loss": 1.18014097, + "learning_rate": 2.2109801597326265e-06, + "loss": 1.29374528, + "num_input_tokens_seen": 583345, + "router_z_loss_clip": 16.234375, + "router_z_loss_mlp": 49.84375, + "step": 31, + "time_per_iteration": 2.966228723526001 + }, + { + "auxiliary_loss_clip": 0.04728903, + "auxiliary_loss_mlp": 0.0608696, + "balance_loss_clip": 3.12140417, + "balance_loss_mlp": 1.71684301, + "epoch": 0.0019239440853750188, + "flos": 13597335912960.0, + "grad_norm": 2.6499782485116, + "language_loss": 1.06642807, + "learning_rate": 2.2314216284658796e-06, + "loss": 1.17458677, + "num_input_tokens_seen": 600010, + "router_z_loss_clip": 16.078125, + "router_z_loss_mlp": 43.71875, + "step": 32, + "time_per_iteration": 2.940821647644043 + }, + { + "auxiliary_loss_clip": 0.04738251, + "auxiliary_loss_mlp": 0.06605868, + "balance_loss_clip": 3.12230968, + "balance_loss_mlp": 1.73221064, + "epoch": 0.001984067338042988, + "flos": 11261004704640.0, + "grad_norm": 3.2304432423690685, + "language_loss": 1.12521827, + "learning_rate": 2.2512340280885094e-06, + "loss": 1.23865938, + "num_input_tokens_seen": 616295, + "router_z_loss_clip": 16.15625, + "router_z_loss_mlp": 48.765625, + "step": 33, + "time_per_iteration": 2.955939292907715 + }, + { + "auxiliary_loss_clip": 0.04739591, + "auxiliary_loss_mlp": 0.05051812, + "balance_loss_clip": 3.15371943, + "balance_loss_mlp": 1.92681217, + "epoch": 0.0020441905907109576, + "flos": 22397368688640.0, + "grad_norm": 2.193599177835473, + "language_loss": 0.99594772, + "learning_rate": 2.270454923596497e-06, + "loss": 1.09386158, + "num_input_tokens_seen": 637640, + "router_z_loss_clip": 15.859375, + "router_z_loss_mlp": 31.265625, + "step": 34, + "time_per_iteration": 3.0313732624053955 + }, + { + "auxiliary_loss_clip": 0.04648167, + "auxiliary_loss_mlp": 0.04997702, + "balance_loss_clip": 3.08843613, + "balance_loss_mlp": 2.02986765, + "epoch": 0.0021043138433789266, + "flos": 49794611339520.0, + "grad_norm": 2.2977998165312448, + "language_loss": 0.8940649, + "learning_rate": 2.2891186125067434e-06, + "loss": 0.99052364, + "num_input_tokens_seen": 659710, + "router_z_loss_clip": 15.6015625, + "router_z_loss_mlp": 29.671875, + "step": 35, + "time_per_iteration": 3.249321937561035 + }, + { + "auxiliary_loss_clip": 0.04644082, + "auxiliary_loss_mlp": 0.04726864, + "balance_loss_clip": 3.1093595, + "balance_loss_mlp": 2.17559481, + "epoch": 0.002164437096046896, + "flos": 20567572727040.0, + "grad_norm": 2.2566672745340646, + "language_loss": 1.00816774, + "learning_rate": 2.307256493152974e-06, + "loss": 1.10187721, + "num_input_tokens_seen": 679670, + "router_z_loss_clip": 15.34375, + "router_z_loss_mlp": 25.515625, + "step": 36, + "time_per_iteration": 3.08524489402771 + }, + { + "auxiliary_loss_clip": 0.04630248, + "auxiliary_loss_mlp": 0.04990356, + "balance_loss_clip": 3.12265873, + "balance_loss_mlp": 2.22088623, + "epoch": 0.0022245603487148656, + "flos": 26553632492160.0, + "grad_norm": 2.5399102877001334, + "language_loss": 1.0566318, + "learning_rate": 2.3248973825097614e-06, + "loss": 1.15283775, + "num_input_tokens_seen": 700170, + "router_z_loss_clip": 15.078125, + "router_z_loss_mlp": 27.65625, + "step": 37, + "time_per_iteration": 2.983510971069336 + }, + { + "auxiliary_loss_clip": 0.04611097, + "auxiliary_loss_mlp": 0.03961299, + "balance_loss_clip": 3.16870356, + "balance_loss_mlp": 2.31106138, + "epoch": 0.0022846836013828346, + "flos": 20347519426560.0, + "grad_norm": 2.0424402484536337, + "language_loss": 1.11575007, + "learning_rate": 2.3420677916238357e-06, + "loss": 1.20147407, + "num_input_tokens_seen": 718545, + "router_z_loss_clip": 14.4453125, + "router_z_loss_mlp": 16.5, + "step": 38, + "time_per_iteration": 2.9655370712280273 + }, + { + "auxiliary_loss_clip": 0.04506429, + "auxiliary_loss_mlp": 0.03775921, + "balance_loss_clip": 3.14303923, + "balance_loss_mlp": 2.16230488, + "epoch": 0.002344806854050804, + "flos": 26258101585920.0, + "grad_norm": 2.027333862960444, + "language_loss": 0.95167744, + "learning_rate": 2.358792165262154e-06, + "loss": 1.03450096, + "num_input_tokens_seen": 739865, + "router_z_loss_clip": 13.6328125, + "router_z_loss_mlp": 16.140625, + "step": 39, + "time_per_iteration": 3.008152723312378 + }, + { + "auxiliary_loss_clip": 0.0438172, + "auxiliary_loss_mlp": 0.03697147, + "balance_loss_clip": 3.07590675, + "balance_loss_mlp": 1.97595596, + "epoch": 0.0024049301067187736, + "flos": 11808920960640.0, + "grad_norm": 2.8344510332509296, + "language_loss": 1.06198907, + "learning_rate": 2.3750930912143747e-06, + "loss": 1.14277792, + "num_input_tokens_seen": 755770, + "router_z_loss_clip": 13.078125, + "router_z_loss_mlp": 17.2109375, + "step": 40, + "time_per_iteration": 2.933713436126709 + }, + { + "auxiliary_loss_clip": 0.04263376, + "auxiliary_loss_mlp": 0.03392493, + "balance_loss_clip": 3.03415275, + "balance_loss_mlp": 1.76819491, + "epoch": 0.0024650533593867426, + "flos": 20641285785600.0, + "grad_norm": 2.038095027841407, + "language_loss": 1.05127823, + "learning_rate": 2.3909914837471044e-06, + "loss": 1.12783694, + "num_input_tokens_seen": 773440, + "router_z_loss_clip": 12.3046875, + "router_z_loss_mlp": 16.234375, + "step": 41, + "time_per_iteration": 3.0024852752685547 + }, + { + "auxiliary_loss_clip": 0.04179972, + "auxiliary_loss_mlp": 0.03160781, + "balance_loss_clip": 3.00309896, + "balance_loss_mlp": 1.64405775, + "epoch": 0.002525176612054712, + "flos": 18415388430720.0, + "grad_norm": 2.064139333797363, + "language_loss": 1.06096089, + "learning_rate": 2.4065067449483835e-06, + "loss": 1.13436842, + "num_input_tokens_seen": 790455, + "router_z_loss_clip": 11.7734375, + "router_z_loss_mlp": 15.15625, + "step": 42, + "time_per_iteration": 2.8881754875183105 + }, + { + "auxiliary_loss_clip": 0.04125922, + "auxiliary_loss_mlp": 0.02867808, + "balance_loss_clip": 2.97397232, + "balance_loss_mlp": 1.53953028, + "epoch": 0.0025852998647226816, + "flos": 28195752447360.0, + "grad_norm": 2.0903387315667756, + "language_loss": 1.1070857, + "learning_rate": 2.4216569070848724e-06, + "loss": 1.17702317, + "num_input_tokens_seen": 810645, + "router_z_loss_clip": 11.515625, + "router_z_loss_mlp": 13.2890625, + "step": 43, + "time_per_iteration": 2.9615306854248047 + }, + { + "auxiliary_loss_clip": 0.04024056, + "auxiliary_loss_mlp": 0.03335081, + "balance_loss_clip": 2.8991127, + "balance_loss_mlp": 1.48190141, + "epoch": 0.0026454231173906506, + "flos": 14291275697280.0, + "grad_norm": 2.4122634317931295, + "language_loss": 1.08213997, + "learning_rate": 2.4364587585915504e-06, + "loss": 1.15573144, + "num_input_tokens_seen": 827470, + "router_z_loss_clip": 11.2421875, + "router_z_loss_mlp": 18.53125, + "step": 44, + "time_per_iteration": 3.0093536376953125 + }, + { + "auxiliary_loss_clip": 0.03949196, + "auxiliary_loss_mlp": 0.02542776, + "balance_loss_clip": 2.86159801, + "balance_loss_mlp": 1.42735875, + "epoch": 0.00270554637005862, + "flos": 22429429269120.0, + "grad_norm": 1.8787174249155538, + "language_loss": 1.09263754, + "learning_rate": 2.450927955901469e-06, + "loss": 1.15755725, + "num_input_tokens_seen": 847285, + "router_z_loss_clip": 10.875, + "router_z_loss_mlp": 11.1640625, + "step": 45, + "time_per_iteration": 2.9262454509735107 + }, + { + "auxiliary_loss_clip": 0.03814146, + "auxiliary_loss_mlp": 0.02616596, + "balance_loss_clip": 2.76267052, + "balance_loss_mlp": 1.39818192, + "epoch": 0.0027656696227265896, + "flos": 23995800149760.0, + "grad_norm": 1.6735211082321715, + "language_loss": 1.10298002, + "learning_rate": 2.465079122983384e-06, + "loss": 1.16728747, + "num_input_tokens_seen": 867545, + "router_z_loss_clip": 10.515625, + "router_z_loss_mlp": 12.1796875, + "step": 46, + "time_per_iteration": 3.021876096725464 + }, + { + "auxiliary_loss_clip": 0.03728038, + "auxiliary_loss_mlp": 0.0228702, + "balance_loss_clip": 2.70781136, + "balance_loss_mlp": 1.39628839, + "epoch": 0.0028257928753945586, + "flos": 37683481224960.0, + "grad_norm": 2.1245409214462296, + "language_loss": 0.99205673, + "learning_rate": 2.4789259401737868e-06, + "loss": 1.05220735, + "num_input_tokens_seen": 889915, + "router_z_loss_clip": 10.1953125, + "router_z_loss_mlp": 8.90234375, + "step": 47, + "time_per_iteration": 3.1279637813568115 + }, + { + "auxiliary_loss_clip": 0.03609342, + "auxiliary_loss_mlp": 0.02256737, + "balance_loss_clip": 2.62699032, + "balance_loss_mlp": 1.43886566, + "epoch": 0.002885916128062528, + "flos": 22464476006400.0, + "grad_norm": 1.7315270840674621, + "language_loss": 0.95409727, + "learning_rate": 2.492481223656015e-06, + "loss": 1.01275802, + "num_input_tokens_seen": 908975, + "router_z_loss_clip": 9.8359375, + "router_z_loss_mlp": 8.18359375, + "step": 48, + "time_per_iteration": 2.9573256969451904 + }, + { + "auxiliary_loss_clip": 0.03492365, + "auxiliary_loss_mlp": 0.0239595, + "balance_loss_clip": 2.54325104, + "balance_loss_mlp": 1.47241127, + "epoch": 0.0029460393807304976, + "flos": 27023265964800.0, + "grad_norm": 1.8450388987066868, + "language_loss": 0.98168725, + "learning_rate": 2.5057569967437924e-06, + "loss": 1.04057026, + "num_input_tokens_seen": 929810, + "router_z_loss_clip": 9.484375, + "router_z_loss_mlp": 9.23828125, + "step": 49, + "time_per_iteration": 3.0204553604125977 + }, + { + "auxiliary_loss_clip": 0.03365751, + "auxiliary_loss_mlp": 0.02232909, + "balance_loss_clip": 2.44827795, + "balance_loss_mlp": 1.51917887, + "epoch": 0.0030061626333984666, + "flos": 15860361265920.0, + "grad_norm": 1.9479088444717867, + "language_loss": 0.99412388, + "learning_rate": 2.51876455396287e-06, + "loss": 1.05011046, + "num_input_tokens_seen": 948650, + "router_z_loss_clip": 9.1875, + "router_z_loss_mlp": 7.140625, + "step": 50, + "time_per_iteration": 5.775744438171387 + }, + { + "auxiliary_loss_clip": 0.03264582, + "auxiliary_loss_mlp": 0.02216198, + "balance_loss_clip": 2.39411044, + "balance_loss_mlp": 1.5341301, + "epoch": 0.003066285886066436, + "flos": 31838287080960.0, + "grad_norm": 1.9753530157624453, + "language_loss": 1.0056417, + "learning_rate": 2.5315145187866316e-06, + "loss": 1.0604496, + "num_input_tokens_seen": 966455, + "router_z_loss_clip": 8.703125, + "router_z_loss_mlp": 6.82421875, + "step": 51, + "time_per_iteration": 4.440434455871582 + }, + { + "auxiliary_loss_clip": 0.03135578, + "auxiliary_loss_mlp": 0.02258513, + "balance_loss_clip": 2.30240512, + "balance_loss_mlp": 1.54440165, + "epoch": 0.0031264091387344056, + "flos": 41442422025600.0, + "grad_norm": 1.8739398191585512, + "language_loss": 1.02895951, + "learning_rate": 2.5440168957651953e-06, + "loss": 1.08290029, + "num_input_tokens_seen": 988110, + "router_z_loss_clip": 8.3359375, + "router_z_loss_mlp": 7.140625, + "step": 52, + "time_per_iteration": 3.1230568885803223 + }, + { + "auxiliary_loss_clip": 0.03017838, + "auxiliary_loss_mlp": 0.02039637, + "balance_loss_clip": 2.23043537, + "balance_loss_mlp": 1.52331805, + "epoch": 0.0031865323914023747, + "flos": 23451458232960.0, + "grad_norm": 1.8070749898571337, + "language_loss": 1.01873517, + "learning_rate": 2.5562811176888872e-06, + "loss": 1.06930995, + "num_input_tokens_seen": 1008550, + "router_z_loss_clip": 7.88671875, + "router_z_loss_mlp": 5.16210938, + "step": 53, + "time_per_iteration": 2.9906809329986572 + }, + { + "auxiliary_loss_clip": 0.0290796, + "auxiliary_loss_mlp": 0.02149796, + "balance_loss_clip": 2.17462349, + "balance_loss_mlp": 1.49252415, + "epoch": 0.003246655644070344, + "flos": 14437434960000.0, + "grad_norm": 1.9792945125495223, + "language_loss": 0.93568718, + "learning_rate": 2.5683160883431093e-06, + "loss": 0.98626477, + "num_input_tokens_seen": 1026840, + "router_z_loss_clip": 7.33203125, + "router_z_loss_mlp": 6.5625, + "step": 54, + "time_per_iteration": 2.9702131748199463 + }, + { + "auxiliary_loss_clip": 0.02827891, + "auxiliary_loss_mlp": 0.02136015, + "balance_loss_clip": 2.13814402, + "balance_loss_mlp": 1.44975078, + "epoch": 0.0033067788967383136, + "flos": 35931787067520.0, + "grad_norm": 2.0477595097481287, + "language_loss": 0.92444384, + "learning_rate": 2.580130221340046e-06, + "loss": 0.97408283, + "num_input_tokens_seen": 1048875, + "router_z_loss_clip": 6.90234375, + "router_z_loss_mlp": 6.86328125, + "step": 55, + "time_per_iteration": 3.1214632987976074 + }, + { + "auxiliary_loss_clip": 0.02756652, + "auxiliary_loss_mlp": 0.02227438, + "balance_loss_clip": 2.10026264, + "balance_loss_mlp": 1.41948545, + "epoch": 0.003366902149406283, + "flos": 22967074955520.0, + "grad_norm": 2.3362825082226046, + "language_loss": 1.02704704, + "learning_rate": 2.5917314754514246e-06, + "loss": 1.07688785, + "num_input_tokens_seen": 1066435, + "router_z_loss_clip": 6.55859375, + "router_z_loss_mlp": 8.0703125, + "step": 56, + "time_per_iteration": 3.0924503803253174 + }, + { + "auxiliary_loss_clip": 0.02700366, + "auxiliary_loss_mlp": 0.02074311, + "balance_loss_clip": 2.07978201, + "balance_loss_mlp": 1.38308835, + "epoch": 0.003427025402074252, + "flos": 26595239725440.0, + "grad_norm": 4.372956429906548, + "language_loss": 1.04043138, + "learning_rate": 2.6031273868139713e-06, + "loss": 1.08817816, + "num_input_tokens_seen": 1090330, + "router_z_loss_clip": 6.20703125, + "router_z_loss_mlp": 6.9140625, + "step": 57, + "time_per_iteration": 3.024188756942749 + }, + { + "auxiliary_loss_clip": 0.02633189, + "auxiliary_loss_mlp": 0.02124644, + "balance_loss_clip": 2.02887058, + "balance_loss_mlp": 1.40175891, + "epoch": 0.0034871486547422216, + "flos": 23961703553280.0, + "grad_norm": 2.0204378954054003, + "language_loss": 1.08251572, + "learning_rate": 2.614325098333948e-06, + "loss": 1.13009405, + "num_input_tokens_seen": 1109840, + "router_z_loss_clip": 6.046875, + "router_z_loss_mlp": 7.23046875, + "step": 58, + "time_per_iteration": 3.0998780727386475 + }, + { + "auxiliary_loss_clip": 0.02581605, + "auxiliary_loss_mlp": 0.02027596, + "balance_loss_clip": 1.99360061, + "balance_loss_mlp": 1.37871587, + "epoch": 0.003547271907410191, + "flos": 21224836961280.0, + "grad_norm": 1.8913346586629278, + "language_loss": 0.98887563, + "learning_rate": 2.625331386578098e-06, + "loss": 1.03496754, + "num_input_tokens_seen": 1128415, + "router_z_loss_clip": 5.8828125, + "router_z_loss_mlp": 6.4921875, + "step": 59, + "time_per_iteration": 2.9935126304626465 + }, + { + "auxiliary_loss_clip": 0.02565694, + "auxiliary_loss_mlp": 0.02080627, + "balance_loss_clip": 1.97923756, + "balance_loss_mlp": 1.42659712, + "epoch": 0.00360739516007816, + "flos": 16512648572160.0, + "grad_norm": 1.8217685591083403, + "language_loss": 1.03651714, + "learning_rate": 2.63615268640451e-06, + "loss": 1.08298028, + "num_input_tokens_seen": 1146515, + "router_z_loss_clip": 5.875, + "router_z_loss_mlp": 6.54101562, + "step": 60, + "time_per_iteration": 2.921691656112671 + }, + { + "auxiliary_loss_clip": 0.02525818, + "auxiliary_loss_mlp": 0.01972963, + "balance_loss_clip": 1.94882774, + "balance_loss_mlp": 1.43432772, + "epoch": 0.0036675184127461296, + "flos": 19474500147840.0, + "grad_norm": 2.012211350550266, + "language_loss": 1.00783324, + "learning_rate": 2.6467951135575943e-06, + "loss": 1.05282104, + "num_input_tokens_seen": 1166330, + "router_z_loss_clip": 5.7734375, + "router_z_loss_mlp": 5.390625, + "step": 61, + "time_per_iteration": 2.9752259254455566 + }, + { + "auxiliary_loss_clip": 0.02525007, + "auxiliary_loss_mlp": 0.01933186, + "balance_loss_clip": 1.95319808, + "balance_loss_mlp": 1.42296982, + "epoch": 0.003727641665414099, + "flos": 20966524542720.0, + "grad_norm": 1.7233977874020083, + "language_loss": 0.97835529, + "learning_rate": 2.657264485425803e-06, + "loss": 1.0229373, + "num_input_tokens_seen": 1186010, + "router_z_loss_clip": 5.71875, + "router_z_loss_mlp": 5.09960938, + "step": 62, + "time_per_iteration": 2.9686691761016846 + }, + { + "auxiliary_loss_clip": 0.02493904, + "auxiliary_loss_mlp": 0.01856282, + "balance_loss_clip": 1.92791915, + "balance_loss_mlp": 1.39317799, + "epoch": 0.003787764918082068, + "flos": 18415297941120.0, + "grad_norm": 1.686873854699331, + "language_loss": 1.02980542, + "learning_rate": 2.6675663401385186e-06, + "loss": 1.07330728, + "num_input_tokens_seen": 1204985, + "router_z_loss_clip": 5.65625, + "router_z_loss_mlp": 4.6328125, + "step": 63, + "time_per_iteration": 2.951929807662964 + }, + { + "auxiliary_loss_clip": 0.02488836, + "auxiliary_loss_mlp": 0.01824781, + "balance_loss_clip": 1.93110788, + "balance_loss_mlp": 1.38437414, + "epoch": 0.0038478881707500376, + "flos": 12467044846080.0, + "grad_norm": 1.9303686318249158, + "language_loss": 1.11379838, + "learning_rate": 2.677705954159056e-06, + "loss": 1.15693462, + "num_input_tokens_seen": 1223545, + "router_z_loss_clip": 5.5859375, + "router_z_loss_mlp": 4.40234375, + "step": 64, + "time_per_iteration": 2.9584901332855225 + }, + { + "auxiliary_loss_clip": 0.0245959, + "auxiliary_loss_mlp": 0.01820107, + "balance_loss_clip": 1.91570032, + "balance_loss_mlp": 1.36882806, + "epoch": 0.003908011423418007, + "flos": 13561293790080.0, + "grad_norm": 1.9293013249698734, + "language_loss": 1.00365043, + "learning_rate": 2.6876883585136904e-06, + "loss": 1.0464474, + "num_input_tokens_seen": 1241175, + "router_z_loss_clip": 5.4453125, + "router_z_loss_mlp": 4.51171875, + "step": 65, + "time_per_iteration": 2.9369256496429443 + }, + { + "auxiliary_loss_clip": 0.02428748, + "auxiliary_loss_mlp": 0.01830616, + "balance_loss_clip": 1.90669298, + "balance_loss_mlp": 1.3783834, + "epoch": 0.003968134676085976, + "flos": 18342806492160.0, + "grad_norm": 1.6295432689463107, + "language_loss": 0.98608351, + "learning_rate": 2.697518353781685e-06, + "loss": 1.02867711, + "num_input_tokens_seen": 1259315, + "router_z_loss_clip": 5.21484375, + "router_z_loss_mlp": 4.5234375, + "step": 66, + "time_per_iteration": 3.0221219062805176 + }, + { + "auxiliary_loss_clip": 0.02392277, + "auxiliary_loss_mlp": 0.01814904, + "balance_loss_clip": 1.89489579, + "balance_loss_mlp": 1.3765955, + "epoch": 0.004028257928753946, + "flos": 20494900298880.0, + "grad_norm": 1.9643108818682868, + "language_loss": 1.10504806, + "learning_rate": 2.7072005239581103e-06, + "loss": 1.14712, + "num_input_tokens_seen": 1277055, + "router_z_loss_clip": 4.96875, + "router_z_loss_mlp": 4.38476562, + "step": 67, + "time_per_iteration": 2.9501683712005615 + }, + { + "auxiliary_loss_clip": 0.02350519, + "auxiliary_loss_mlp": 0.01734376, + "balance_loss_clip": 1.87984216, + "balance_loss_mlp": 1.37255144, + "epoch": 0.004088381181421915, + "flos": 18853187546880.0, + "grad_norm": 1.7730617643888935, + "language_loss": 1.01648676, + "learning_rate": 2.7167392492896727e-06, + "loss": 1.05733562, + "num_input_tokens_seen": 1294355, + "router_z_loss_clip": 4.69921875, + "router_z_loss_mlp": 3.62109375, + "step": 68, + "time_per_iteration": 2.964909553527832 + }, + { + "auxiliary_loss_clip": 0.02310557, + "auxiliary_loss_mlp": 0.01778697, + "balance_loss_clip": 1.86005521, + "balance_loss_mlp": 1.36899877, + "epoch": 0.004148504434089885, + "flos": 19437372149760.0, + "grad_norm": 1.7152465305511237, + "language_loss": 1.06029165, + "learning_rate": 2.7261387181735195e-06, + "loss": 1.10118413, + "num_input_tokens_seen": 1313525, + "router_z_loss_clip": 4.51171875, + "router_z_loss_mlp": 4.09570312, + "step": 69, + "time_per_iteration": 2.9616217613220215 + }, + { + "auxiliary_loss_clip": 0.02292586, + "auxiliary_loss_mlp": 0.01737615, + "balance_loss_clip": 1.86308813, + "balance_loss_mlp": 1.36358428, + "epoch": 0.004208627686757853, + "flos": 20820093811200.0, + "grad_norm": 2.0847122496335153, + "language_loss": 1.0963186, + "learning_rate": 2.7354029381999196e-06, + "loss": 1.13662052, + "num_input_tokens_seen": 1330505, + "router_z_loss_clip": 4.2890625, + "router_z_loss_mlp": 3.74023438, + "step": 70, + "time_per_iteration": 2.942866086959839 + }, + { + "auxiliary_loss_clip": 0.02260396, + "auxiliary_loss_mlp": 0.01844666, + "balance_loss_clip": 1.84152269, + "balance_loss_mlp": 1.35981774, + "epoch": 0.004268750939425823, + "flos": 19107880381440.0, + "grad_norm": 2.080099157641855, + "language_loss": 1.17807257, + "learning_rate": 2.7445357464116983e-06, + "loss": 1.21912324, + "num_input_tokens_seen": 1349615, + "router_z_loss_clip": 4.1875, + "router_z_loss_mlp": 4.84570312, + "step": 71, + "time_per_iteration": 2.967806816101074 + }, + { + "auxiliary_loss_clip": 0.02468186, + "auxiliary_loss_mlp": 0.01745151, + "balance_loss_clip": 2.12512517, + "balance_loss_mlp": 1.37741411, + "epoch": 0.004328874192093792, + "flos": 52465203002880.0, + "grad_norm": 2.627424663235757, + "language_loss": 0.67090684, + "learning_rate": 2.75354081884615e-06, + "loss": 0.71304029, + "num_input_tokens_seen": 1410275, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 3.671875, + "step": 72, + "time_per_iteration": 3.3788864612579346 + }, + { + "auxiliary_loss_clip": 0.02465628, + "auxiliary_loss_mlp": 0.01579652, + "balance_loss_clip": 2.12709475, + "balance_loss_mlp": 1.34771872, + "epoch": 0.004388997444761762, + "flos": 66508436851200.0, + "grad_norm": 2.4574586075941327, + "language_loss": 0.64710939, + "learning_rate": 2.7624216794188286e-06, + "loss": 0.68756223, + "num_input_tokens_seen": 1473020, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 2.3125, + "step": 73, + "time_per_iteration": 3.451493978500366 + }, + { + "auxiliary_loss_clip": 0.02220821, + "auxiliary_loss_mlp": 0.0171673, + "balance_loss_clip": 1.83534443, + "balance_loss_mlp": 1.36444235, + "epoch": 0.004449120697429731, + "flos": 18962264056320.0, + "grad_norm": 1.8122737928190449, + "language_loss": 0.98491162, + "learning_rate": 2.771181708202938e-06, + "loss": 1.02428722, + "num_input_tokens_seen": 1490385, + "router_z_loss_clip": 3.859375, + "router_z_loss_mlp": 3.5234375, + "step": 74, + "time_per_iteration": 3.0129518508911133 + }, + { + "auxiliary_loss_clip": 0.0220345, + "auxiliary_loss_mlp": 0.01728201, + "balance_loss_clip": 1.81980813, + "balance_loss_mlp": 1.35340667, + "epoch": 0.004509243950097701, + "flos": 21115308003840.0, + "grad_norm": 1.7986986515436891, + "language_loss": 1.07602251, + "learning_rate": 2.779824149153005e-06, + "loss": 1.11533904, + "num_input_tokens_seen": 1509725, + "router_z_loss_clip": 3.8359375, + "router_z_loss_mlp": 3.75, + "step": 75, + "time_per_iteration": 2.9826927185058594 + }, + { + "auxiliary_loss_clip": 0.02203747, + "auxiliary_loss_mlp": 0.0172515, + "balance_loss_clip": 1.82403791, + "balance_loss_mlp": 1.36046553, + "epoch": 0.004569367202765669, + "flos": 20707035759360.0, + "grad_norm": 1.824084662435007, + "language_loss": 0.9909516, + "learning_rate": 2.788352117317012e-06, + "loss": 1.03024054, + "num_input_tokens_seen": 1527245, + "router_z_loss_clip": 3.79882812, + "router_z_loss_mlp": 3.6484375, + "step": 76, + "time_per_iteration": 3.0636885166168213 + }, + { + "auxiliary_loss_clip": 0.02194324, + "auxiliary_loss_mlp": 0.01687185, + "balance_loss_clip": 1.81513953, + "balance_loss_mlp": 1.35587883, + "epoch": 0.004629490455433639, + "flos": 28670362848000.0, + "grad_norm": 1.7042981619568738, + "language_loss": 1.02161932, + "learning_rate": 2.796768605577095e-06, + "loss": 1.06043446, + "num_input_tokens_seen": 1548930, + "router_z_loss_clip": 3.7890625, + "router_z_loss_mlp": 3.31054688, + "step": 77, + "time_per_iteration": 3.0720362663269043 + }, + { + "auxiliary_loss_clip": 0.02171185, + "auxiliary_loss_mlp": 0.01636194, + "balance_loss_clip": 1.80305076, + "balance_loss_mlp": 1.36592269, + "epoch": 0.004689613708101608, + "flos": 11079753459840.0, + "grad_norm": 1.989328546204836, + "language_loss": 1.04702544, + "learning_rate": 2.80507649095533e-06, + "loss": 1.08509922, + "num_input_tokens_seen": 1565695, + "router_z_loss_clip": 3.68164062, + "router_z_loss_mlp": 2.703125, + "step": 78, + "time_per_iteration": 2.9188883304595947 + }, + { + "auxiliary_loss_clip": 0.02167085, + "auxiliary_loss_mlp": 0.01657007, + "balance_loss_clip": 1.79622591, + "balance_loss_mlp": 1.36308455, + "epoch": 0.004749736960769578, + "flos": 21809202543360.0, + "grad_norm": 2.1524239162120753, + "language_loss": 0.96298873, + "learning_rate": 2.813278540517843e-06, + "loss": 1.00122976, + "num_input_tokens_seen": 1582625, + "router_z_loss_clip": 3.70898438, + "router_z_loss_mlp": 2.94140625, + "step": 79, + "time_per_iteration": 2.968212127685547 + }, + { + "auxiliary_loss_clip": 0.02152993, + "auxiliary_loss_mlp": 0.01706955, + "balance_loss_clip": 1.78584027, + "balance_loss_mlp": 1.38385034, + "epoch": 0.004809860213437547, + "flos": 19802227368960.0, + "grad_norm": 1.6417094800798346, + "language_loss": 0.99399751, + "learning_rate": 2.8213774169075505e-06, + "loss": 1.03259695, + "num_input_tokens_seen": 1601725, + "router_z_loss_clip": 3.671875, + "router_z_loss_mlp": 3.22851562, + "step": 80, + "time_per_iteration": 2.999403238296509 + }, + { + "auxiliary_loss_clip": 0.02145505, + "auxiliary_loss_mlp": 0.01643657, + "balance_loss_clip": 1.78738248, + "balance_loss_mlp": 1.3602246, + "epoch": 0.004869983466105517, + "flos": 26584516707840.0, + "grad_norm": 1.8651087935124366, + "language_loss": 1.05145454, + "learning_rate": 2.829375683533245e-06, + "loss": 1.08934617, + "num_input_tokens_seen": 1622420, + "router_z_loss_clip": 3.58203125, + "router_z_loss_mlp": 2.83398438, + "step": 81, + "time_per_iteration": 2.990549087524414 + }, + { + "auxiliary_loss_clip": 0.02134164, + "auxiliary_loss_mlp": 0.01654487, + "balance_loss_clip": 1.77317882, + "balance_loss_mlp": 1.35713112, + "epoch": 0.004930106718773485, + "flos": 12831085658880.0, + "grad_norm": 2.3593275805769856, + "language_loss": 1.13555384, + "learning_rate": 2.8372758094402803e-06, + "loss": 1.17344034, + "num_input_tokens_seen": 1640715, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 2.97265625, + "step": 82, + "time_per_iteration": 2.936556816101074 + }, + { + "auxiliary_loss_clip": 0.02119173, + "auxiliary_loss_mlp": 0.0163481, + "balance_loss_clip": 1.76616168, + "balance_loss_mlp": 1.35500205, + "epoch": 0.004990229971441455, + "flos": 25785753425280.0, + "grad_norm": 1.790325806226351, + "language_loss": 0.94502568, + "learning_rate": 2.84508017388607e-06, + "loss": 0.98256558, + "num_input_tokens_seen": 1662210, + "router_z_loss_clip": 3.52929688, + "router_z_loss_mlp": 2.79882812, + "step": 83, + "time_per_iteration": 3.0076515674591064 + }, + { + "auxiliary_loss_clip": 0.02100869, + "auxiliary_loss_mlp": 0.01634419, + "balance_loss_clip": 1.75793338, + "balance_loss_mlp": 1.3485074, + "epoch": 0.005050353224109424, + "flos": 17466212874240.0, + "grad_norm": 2.018160282273101, + "language_loss": 1.04113054, + "learning_rate": 2.852791070641559e-06, + "loss": 1.07848334, + "num_input_tokens_seen": 1681070, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 2.859375, + "step": 84, + "time_per_iteration": 2.968029737472534 + }, + { + "auxiliary_loss_clip": 0.02327821, + "auxiliary_loss_mlp": 0.01425662, + "balance_loss_clip": 2.03096032, + "balance_loss_mlp": 1.33906794, + "epoch": 0.005110476476777394, + "flos": 69835170908160.0, + "grad_norm": 1.6250768740902481, + "language_loss": 0.63235581, + "learning_rate": 2.8604107120381682e-06, + "loss": 0.66989064, + "num_input_tokens_seen": 1747140, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.8671875, + "step": 85, + "time_per_iteration": 6.263977289199829 + }, + { + "auxiliary_loss_clip": 0.02069861, + "auxiliary_loss_mlp": 0.01643009, + "balance_loss_clip": 1.73488259, + "balance_loss_mlp": 1.35576189, + "epoch": 0.005170599729445363, + "flos": 24800761969920.0, + "grad_norm": 1.4888853033301752, + "language_loss": 0.96089995, + "learning_rate": 2.8679412327780482e-06, + "loss": 0.99802858, + "num_input_tokens_seen": 1767475, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 2.87109375, + "step": 86, + "time_per_iteration": 4.483587741851807 + }, + { + "auxiliary_loss_clip": 0.02060358, + "auxiliary_loss_mlp": 0.01618375, + "balance_loss_clip": 1.72654772, + "balance_loss_mlp": 1.35106003, + "epoch": 0.005230722982113333, + "flos": 23268397196160.0, + "grad_norm": 2.031680688672302, + "language_loss": 0.95613682, + "learning_rate": 2.8753846935240833e-06, + "loss": 0.99292409, + "num_input_tokens_seen": 1784980, + "router_z_loss_clip": 3.33789062, + "router_z_loss_mlp": 2.67285156, + "step": 87, + "time_per_iteration": 2.964277982711792 + }, + { + "auxiliary_loss_clip": 0.02037031, + "auxiliary_loss_mlp": 0.01560917, + "balance_loss_clip": 1.71697092, + "balance_loss_mlp": 1.35339785, + "epoch": 0.005290846234781301, + "flos": 16736773904640.0, + "grad_norm": 1.585897659189128, + "language_loss": 1.03720117, + "learning_rate": 2.8827430842847267e-06, + "loss": 1.07318068, + "num_input_tokens_seen": 1803030, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 2.07714844, + "step": 88, + "time_per_iteration": 2.954820394515991 + }, + { + "auxiliary_loss_clip": 0.02026895, + "auxiliary_loss_mlp": 0.01580087, + "balance_loss_clip": 1.70528781, + "balance_loss_mlp": 1.34843946, + "epoch": 0.005350969487449271, + "flos": 20895707151360.0, + "grad_norm": 1.675591249683518, + "language_loss": 0.95536608, + "learning_rate": 2.8900183276075957e-06, + "loss": 0.99143589, + "num_input_tokens_seen": 1822865, + "router_z_loss_clip": 3.21875, + "router_z_loss_mlp": 2.31640625, + "step": 89, + "time_per_iteration": 2.976139783859253 + }, + { + "auxiliary_loss_clip": 0.02013525, + "auxiliary_loss_mlp": 0.01567141, + "balance_loss_clip": 1.69726038, + "balance_loss_mlp": 1.35256433, + "epoch": 0.00541109274011724, + "flos": 26220204426240.0, + "grad_norm": 1.6140515868788872, + "language_loss": 1.0082438, + "learning_rate": 2.8972122815946455e-06, + "loss": 1.04405046, + "num_input_tokens_seen": 1842435, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 2.14746094, + "step": 90, + "time_per_iteration": 2.970111846923828 + }, + { + "auxiliary_loss_clip": 0.01994247, + "auxiliary_loss_mlp": 0.01596177, + "balance_loss_clip": 1.68450856, + "balance_loss_mlp": 1.35499239, + "epoch": 0.00547121599278521, + "flos": 21188432880000.0, + "grad_norm": 1.8403228620390235, + "language_loss": 0.94911039, + "learning_rate": 2.90432674275074e-06, + "loss": 0.98501468, + "num_input_tokens_seen": 1860065, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 2.41308594, + "step": 91, + "time_per_iteration": 2.971445322036743 + }, + { + "auxiliary_loss_clip": 0.01995211, + "auxiliary_loss_mlp": 0.01589467, + "balance_loss_clip": 1.68318522, + "balance_loss_mlp": 1.35467279, + "epoch": 0.005531339245453179, + "flos": 19728197596800.0, + "grad_norm": 2.068406962320896, + "language_loss": 1.00197899, + "learning_rate": 2.91136344867656e-06, + "loss": 1.03782582, + "num_input_tokens_seen": 1878135, + "router_z_loss_clip": 3.125, + "router_z_loss_mlp": 2.34667969, + "step": 92, + "time_per_iteration": 2.9482290744781494 + }, + { + "auxiliary_loss_clip": 0.0198818, + "auxiliary_loss_mlp": 0.01574655, + "balance_loss_clip": 1.67300308, + "balance_loss_mlp": 1.35340309, + "epoch": 0.005591462498121149, + "flos": 17644161248640.0, + "grad_norm": 2.2334664655249714, + "language_loss": 1.11671638, + "learning_rate": 2.918324080615938e-06, + "loss": 1.15234458, + "num_input_tokens_seen": 1894895, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 2.21289062, + "step": 93, + "time_per_iteration": 2.943408489227295 + }, + { + "auxiliary_loss_clip": 0.01991725, + "auxiliary_loss_mlp": 0.01604058, + "balance_loss_clip": 1.67189646, + "balance_loss_mlp": 1.35667479, + "epoch": 0.005651585750789117, + "flos": 20020832835840.0, + "grad_norm": 2.253313963518644, + "language_loss": 1.00802302, + "learning_rate": 2.925210265866963e-06, + "loss": 1.04398084, + "num_input_tokens_seen": 1913220, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 2.47558594, + "step": 94, + "time_per_iteration": 3.011888027191162 + }, + { + "auxiliary_loss_clip": 0.02080452, + "auxiliary_loss_mlp": 0.01412516, + "balance_loss_clip": 1.81388235, + "balance_loss_mlp": 1.33240759, + "epoch": 0.005711709003457087, + "flos": 59841268842240.0, + "grad_norm": 1.4637687301060223, + "language_loss": 0.68633616, + "learning_rate": 2.932023580065507e-06, + "loss": 0.72126579, + "num_input_tokens_seen": 1970970, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.80078125, + "step": 95, + "time_per_iteration": 3.239597797393799 + }, + { + "auxiliary_loss_clip": 0.01981705, + "auxiliary_loss_mlp": 0.01586513, + "balance_loss_clip": 1.66928828, + "balance_loss_mlp": 1.34933424, + "epoch": 0.005771832256125056, + "flos": 15567952250880.0, + "grad_norm": 1.764136207853058, + "language_loss": 1.02087808, + "learning_rate": 2.9387655493491906e-06, + "loss": 1.0565604, + "num_input_tokens_seen": 1988930, + "router_z_loss_clip": 3.12109375, + "router_z_loss_mlp": 2.37207031, + "step": 96, + "time_per_iteration": 2.994492769241333 + }, + { + "auxiliary_loss_clip": 0.01974666, + "auxiliary_loss_mlp": 0.01579647, + "balance_loss_clip": 1.66532207, + "balance_loss_mlp": 1.35477054, + "epoch": 0.005831955508793026, + "flos": 22538551023360.0, + "grad_norm": 2.0376589297783205, + "language_loss": 1.05534458, + "learning_rate": 2.9454376524092147e-06, + "loss": 1.09088778, + "num_input_tokens_seen": 2006285, + "router_z_loss_clip": 3.09375, + "router_z_loss_mlp": 2.25, + "step": 97, + "time_per_iteration": 2.9825820922851562 + }, + { + "auxiliary_loss_clip": 0.0196442, + "auxiliary_loss_mlp": 0.01601222, + "balance_loss_clip": 1.65420675, + "balance_loss_mlp": 1.35002446, + "epoch": 0.005892078761460995, + "flos": 22058782715520.0, + "grad_norm": 1.8372653349549066, + "language_loss": 0.84753704, + "learning_rate": 2.952041322436969e-06, + "loss": 0.88319343, + "num_input_tokens_seen": 2024905, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 2.51660156, + "step": 98, + "time_per_iteration": 3.0707077980041504 + }, + { + "auxiliary_loss_clip": 0.01981568, + "auxiliary_loss_mlp": 0.01431458, + "balance_loss_clip": 1.73114038, + "balance_loss_mlp": 1.35287571, + "epoch": 0.005952202014128965, + "flos": 68571298632960.0, + "grad_norm": 1.0903465547067244, + "language_loss": 0.65863246, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.69276273, + "num_input_tokens_seen": 2086220, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.78515625, + "step": 99, + "time_per_iteration": 3.4346020221710205 + }, + { + "auxiliary_loss_clip": 0.01945887, + "auxiliary_loss_mlp": 0.01549802, + "balance_loss_clip": 1.64866436, + "balance_loss_mlp": 1.34991217, + "epoch": 0.006012325266796933, + "flos": 22969789643520.0, + "grad_norm": 1.7755210256902771, + "language_loss": 1.03168666, + "learning_rate": 2.9650488796560464e-06, + "loss": 1.0666436, + "num_input_tokens_seen": 2103365, + "router_z_loss_clip": 2.97265625, + "router_z_loss_mlp": 1.99902344, + "step": 100, + "time_per_iteration": 2.9528040885925293 + }, + { + "auxiliary_loss_clip": 0.01944294, + "auxiliary_loss_mlp": 0.01559737, + "balance_loss_clip": 1.64519811, + "balance_loss_mlp": 1.3622309, + "epoch": 0.006072448519464903, + "flos": 17356910140800.0, + "grad_norm": 1.851347115892186, + "language_loss": 1.01091123, + "learning_rate": 2.971455421902446e-06, + "loss": 1.0459516, + "num_input_tokens_seen": 2121995, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 1.97460938, + "step": 101, + "time_per_iteration": 2.959949016571045 + }, + { + "auxiliary_loss_clip": 0.01937535, + "auxiliary_loss_mlp": 0.01559691, + "balance_loss_clip": 1.64149296, + "balance_loss_mlp": 1.35837042, + "epoch": 0.006132571772132872, + "flos": 24691866439680.0, + "grad_norm": 1.7239595815909345, + "language_loss": 1.03446138, + "learning_rate": 2.9777988444798075e-06, + "loss": 1.06943369, + "num_input_tokens_seen": 2141815, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 2.01367188, + "step": 102, + "time_per_iteration": 3.0102484226226807 + }, + { + "auxiliary_loss_clip": 0.01935015, + "auxiliary_loss_mlp": 0.01614495, + "balance_loss_clip": 1.63839757, + "balance_loss_mlp": 1.36320138, + "epoch": 0.006192695024800842, + "flos": 21474371888640.0, + "grad_norm": 1.9250394651509122, + "language_loss": 0.97853917, + "learning_rate": 2.9840803790210285e-06, + "loss": 1.01403427, + "num_input_tokens_seen": 2161125, + "router_z_loss_clip": 2.96679688, + "router_z_loss_mlp": 2.515625, + "step": 103, + "time_per_iteration": 2.9484057426452637 + }, + { + "auxiliary_loss_clip": 0.01927277, + "auxiliary_loss_mlp": 0.01590141, + "balance_loss_clip": 1.63277698, + "balance_loss_mlp": 1.36345267, + "epoch": 0.006252818277468811, + "flos": 17429265855360.0, + "grad_norm": 1.664814914521033, + "language_loss": 1.00576234, + "learning_rate": 2.990301221458371e-06, + "loss": 1.04093659, + "num_input_tokens_seen": 2179510, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 2.26757812, + "step": 104, + "time_per_iteration": 2.9664459228515625 + }, + { + "auxiliary_loss_clip": 0.01933122, + "auxiliary_loss_mlp": 0.01557284, + "balance_loss_clip": 1.64216352, + "balance_loss_mlp": 1.36664486, + "epoch": 0.006312941530136781, + "flos": 19109056746240.0, + "grad_norm": 1.8866654983844848, + "language_loss": 1.0761739, + "learning_rate": 2.9964625333900544e-06, + "loss": 1.11107802, + "num_input_tokens_seen": 2197870, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 1.90527344, + "step": 105, + "time_per_iteration": 2.984001636505127 + }, + { + "auxiliary_loss_clip": 0.0192205, + "auxiliary_loss_mlp": 0.01555855, + "balance_loss_clip": 1.63220656, + "balance_loss_mlp": 1.3480494, + "epoch": 0.006373064782804749, + "flos": 24071413489920.0, + "grad_norm": 1.9538587272618906, + "language_loss": 1.04559231, + "learning_rate": 3.002565443382063e-06, + "loss": 1.08037138, + "num_input_tokens_seen": 2217495, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 2.08007812, + "step": 106, + "time_per_iteration": 3.048912763595581 + }, + { + "auxiliary_loss_clip": 0.01920889, + "auxiliary_loss_mlp": 0.01538576, + "balance_loss_clip": 1.62742102, + "balance_loss_mlp": 1.34860444, + "epoch": 0.006433188035472719, + "flos": 18341494392960.0, + "grad_norm": 1.862783685773675, + "language_loss": 0.99073768, + "learning_rate": 3.008611048208843e-06, + "loss": 1.02533245, + "num_input_tokens_seen": 2236520, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 1.90039062, + "step": 107, + "time_per_iteration": 3.03747820854187 + }, + { + "auxiliary_loss_clip": 0.01922726, + "auxiliary_loss_mlp": 0.01489736, + "balance_loss_clip": 1.69902253, + "balance_loss_mlp": 1.42870128, + "epoch": 0.006493311288140688, + "flos": 62594785520640.0, + "grad_norm": 1.0550469301656948, + "language_loss": 0.65021467, + "learning_rate": 3.014600414036285e-06, + "loss": 0.68433928, + "num_input_tokens_seen": 2300140, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.609375, + "step": 108, + "time_per_iteration": 3.3504483699798584 + }, + { + "auxiliary_loss_clip": 0.0191401, + "auxiliary_loss_mlp": 0.0154659, + "balance_loss_clip": 1.62987506, + "balance_loss_mlp": 1.36272204, + "epoch": 0.006553434540808658, + "flos": 19509546885120.0, + "grad_norm": 1.792568200840074, + "language_loss": 1.08283758, + "learning_rate": 3.0205345775501937e-06, + "loss": 1.11744368, + "num_input_tokens_seen": 2317320, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 1.83886719, + "step": 109, + "time_per_iteration": 2.962038040161133 + }, + { + "auxiliary_loss_clip": 0.01898747, + "auxiliary_loss_mlp": 0.01584883, + "balance_loss_clip": 1.61868119, + "balance_loss_mlp": 1.38804448, + "epoch": 0.006613557793476627, + "flos": 21114855555840.0, + "grad_norm": 1.4914829380756742, + "language_loss": 0.94560754, + "learning_rate": 3.0264145470332218e-06, + "loss": 0.98044389, + "num_input_tokens_seen": 2337820, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 1.96679688, + "step": 110, + "time_per_iteration": 2.996743679046631 + }, + { + "auxiliary_loss_clip": 0.01886148, + "auxiliary_loss_mlp": 0.01583833, + "balance_loss_clip": 1.60763562, + "balance_loss_mlp": 1.39538682, + "epoch": 0.006673681046144597, + "flos": 26041758359040.0, + "grad_norm": 1.8055190977799156, + "language_loss": 0.89272934, + "learning_rate": 3.032241303393073e-06, + "loss": 0.9274292, + "num_input_tokens_seen": 2358560, + "router_z_loss_clip": 2.7890625, + "router_z_loss_mlp": 1.88476562, + "step": 111, + "time_per_iteration": 3.0377683639526367 + }, + { + "auxiliary_loss_clip": 0.01886626, + "auxiliary_loss_mlp": 0.01576551, + "balance_loss_clip": 1.60440516, + "balance_loss_mlp": 1.37570679, + "epoch": 0.006733804298812566, + "flos": 23157872853120.0, + "grad_norm": 1.64758868341786, + "language_loss": 1.01985645, + "learning_rate": 3.0380158011446e-06, + "loss": 1.05448818, + "num_input_tokens_seen": 2379005, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 2.00976562, + "step": 112, + "time_per_iteration": 3.0487022399902344 + }, + { + "auxiliary_loss_clip": 0.01875265, + "auxiliary_loss_mlp": 0.01545316, + "balance_loss_clip": 1.59934902, + "balance_loss_mlp": 1.3559165, + "epoch": 0.006793927551480535, + "flos": 11771657228160.0, + "grad_norm": 2.0481936995810166, + "language_loss": 0.94080055, + "learning_rate": 3.0437389693482466e-06, + "loss": 0.97500634, + "num_input_tokens_seen": 2395610, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 1.89257812, + "step": 113, + "time_per_iteration": 2.9369571208953857 + }, + { + "auxiliary_loss_clip": 0.01871203, + "auxiliary_loss_mlp": 0.01505836, + "balance_loss_clip": 1.59276152, + "balance_loss_mlp": 1.36998463, + "epoch": 0.006854050804148504, + "flos": 19181321971200.0, + "grad_norm": 1.653140138910595, + "language_loss": 1.03894496, + "learning_rate": 3.0494117125071475e-06, + "loss": 1.0727154, + "num_input_tokens_seen": 2415005, + "router_z_loss_clip": 2.78515625, + "router_z_loss_mlp": 1.35888672, + "step": 114, + "time_per_iteration": 2.9705049991607666 + }, + { + "auxiliary_loss_clip": 0.01866799, + "auxiliary_loss_mlp": 0.01551521, + "balance_loss_clip": 1.58869743, + "balance_loss_mlp": 1.37094283, + "epoch": 0.006914174056816474, + "flos": 21992127845760.0, + "grad_norm": 1.7331904134594145, + "language_loss": 1.05513215, + "learning_rate": 3.055034911425055e-06, + "loss": 1.08931541, + "num_input_tokens_seen": 2433965, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 1.80712891, + "step": 115, + "time_per_iteration": 3.013333320617676 + }, + { + "auxiliary_loss_clip": 0.01878037, + "auxiliary_loss_mlp": 0.01671563, + "balance_loss_clip": 1.59157312, + "balance_loss_mlp": 1.38674784, + "epoch": 0.006974297309484443, + "flos": 16297662689280.0, + "grad_norm": 1.8038799674825148, + "language_loss": 0.96423781, + "learning_rate": 3.0606094240271244e-06, + "loss": 0.99973392, + "num_input_tokens_seen": 2451605, + "router_z_loss_clip": 2.86914062, + "router_z_loss_mlp": 2.85107422, + "step": 116, + "time_per_iteration": 2.9376189708709717 + }, + { + "auxiliary_loss_clip": 0.01846166, + "auxiliary_loss_mlp": 0.01523902, + "balance_loss_clip": 1.57392287, + "balance_loss_mlp": 1.37779856, + "epoch": 0.007034420562152413, + "flos": 26115109459200.0, + "grad_norm": 1.8016206569490554, + "language_loss": 1.02880001, + "learning_rate": 3.0661360861454656e-06, + "loss": 1.06250072, + "num_input_tokens_seen": 2472035, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 1.46191406, + "step": 117, + "time_per_iteration": 2.9882030487060547 + }, + { + "auxiliary_loss_clip": 0.01839617, + "auxiliary_loss_mlp": 0.01510947, + "balance_loss_clip": 1.56601071, + "balance_loss_mlp": 1.36379528, + "epoch": 0.007094543814820382, + "flos": 14211318856320.0, + "grad_norm": 1.8741808881633313, + "language_loss": 0.98083568, + "learning_rate": 3.071615712271274e-06, + "loss": 1.01434135, + "num_input_tokens_seen": 2489285, + "router_z_loss_clip": 2.74023438, + "router_z_loss_mlp": 1.47167969, + "step": 118, + "time_per_iteration": 2.9267263412475586 + }, + { + "auxiliary_loss_clip": 0.01833788, + "auxiliary_loss_mlp": 0.01527576, + "balance_loss_clip": 1.56065798, + "balance_loss_mlp": 1.37022018, + "epoch": 0.007154667067488351, + "flos": 14984265340800.0, + "grad_norm": 1.8385648159151324, + "language_loss": 1.08662391, + "learning_rate": 3.0770490962752172e-06, + "loss": 1.12023759, + "num_input_tokens_seen": 2506460, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 1.57421875, + "step": 119, + "time_per_iteration": 2.9875338077545166 + }, + { + "auxiliary_loss_clip": 0.01828494, + "auxiliary_loss_mlp": 0.01523588, + "balance_loss_clip": 1.55392456, + "balance_loss_mlp": 1.36537349, + "epoch": 0.00721479032015632, + "flos": 20202853242240.0, + "grad_norm": 2.02862627281142, + "language_loss": 1.10576797, + "learning_rate": 3.082437012097686e-06, + "loss": 1.1392889, + "num_input_tokens_seen": 2525565, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 1.58203125, + "step": 120, + "time_per_iteration": 7.1235082149505615 + }, + { + "auxiliary_loss_clip": 0.01809403, + "auxiliary_loss_mlp": 0.01510399, + "balance_loss_clip": 1.54465103, + "balance_loss_mlp": 1.37345147, + "epoch": 0.00727491357282429, + "flos": 23157194181120.0, + "grad_norm": 1.6537702279639956, + "language_loss": 0.98594522, + "learning_rate": 3.0877802144103967e-06, + "loss": 1.01914322, + "num_input_tokens_seen": 2546605, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 1.37011719, + "step": 121, + "time_per_iteration": 4.4040961265563965 + }, + { + "auxiliary_loss_clip": 0.01813054, + "auxiliary_loss_mlp": 0.01535398, + "balance_loss_clip": 1.54480803, + "balance_loss_mlp": 1.36926746, + "epoch": 0.007335036825492259, + "flos": 15529647888000.0, + "grad_norm": 2.1449802853017257, + "language_loss": 1.050385, + "learning_rate": 3.09307943925077e-06, + "loss": 1.08386946, + "num_input_tokens_seen": 2560730, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 1.66210938, + "step": 122, + "time_per_iteration": 2.9283711910247803 + }, + { + "auxiliary_loss_clip": 0.0180488, + "auxiliary_loss_mlp": 0.01494523, + "balance_loss_clip": 1.54170537, + "balance_loss_mlp": 1.35519147, + "epoch": 0.007395160078160229, + "flos": 24254022078720.0, + "grad_norm": 1.881524266622204, + "language_loss": 1.04281998, + "learning_rate": 3.0983354046304154e-06, + "loss": 1.07581413, + "num_input_tokens_seen": 2579550, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 1.39257812, + "step": 123, + "time_per_iteration": 3.007431983947754 + }, + { + "auxiliary_loss_clip": 0.01806349, + "auxiliary_loss_mlp": 0.0150086, + "balance_loss_clip": 1.53740692, + "balance_loss_mlp": 1.35742736, + "epoch": 0.007455283330828198, + "flos": 31772491862400.0, + "grad_norm": 1.7140578427898214, + "language_loss": 0.84739226, + "learning_rate": 3.103548811118979e-06, + "loss": 0.88046438, + "num_input_tokens_seen": 2600390, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 1.43603516, + "step": 124, + "time_per_iteration": 3.075922966003418 + }, + { + "auxiliary_loss_clip": 0.01793709, + "auxiliary_loss_mlp": 0.01520004, + "balance_loss_clip": 1.52771068, + "balance_loss_mlp": 1.3671298, + "epoch": 0.007515406583496167, + "flos": 26626485899520.0, + "grad_norm": 1.8289813203271517, + "language_loss": 1.00631356, + "learning_rate": 3.108720342404542e-06, + "loss": 1.03945065, + "num_input_tokens_seen": 2620770, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 1.52929688, + "step": 125, + "time_per_iteration": 3.031848430633545 + }, + { + "auxiliary_loss_clip": 0.01795018, + "auxiliary_loss_mlp": 0.01515022, + "balance_loss_clip": 1.52735257, + "balance_loss_mlp": 1.37402105, + "epoch": 0.007575529836164136, + "flos": 18232915576320.0, + "grad_norm": 3.125298966529811, + "language_loss": 1.01942134, + "learning_rate": 3.1138506658316945e-06, + "loss": 1.05252171, + "num_input_tokens_seen": 2639900, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 1.40917969, + "step": 126, + "time_per_iteration": 3.087946891784668 + }, + { + "auxiliary_loss_clip": 0.01793919, + "auxiliary_loss_mlp": 0.01507171, + "balance_loss_clip": 1.52451944, + "balance_loss_mlp": 1.38552988, + "epoch": 0.007635653088832106, + "flos": 21590506586880.0, + "grad_norm": 2.0968660593484723, + "language_loss": 0.8475852, + "learning_rate": 3.1189404329183404e-06, + "loss": 0.8805961, + "num_input_tokens_seen": 2657450, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 1.21582031, + "step": 127, + "time_per_iteration": 3.032813310623169 + }, + { + "auxiliary_loss_clip": 0.01798734, + "auxiliary_loss_mlp": 0.01537827, + "balance_loss_clip": 1.53317142, + "balance_loss_mlp": 1.37689507, + "epoch": 0.007695776341500075, + "flos": 25386168182400.0, + "grad_norm": 1.7634310780686029, + "language_loss": 0.96107388, + "learning_rate": 3.1239902798522317e-06, + "loss": 0.99443942, + "num_input_tokens_seen": 2678150, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 1.60986328, + "step": 128, + "time_per_iteration": 2.994908332824707 + }, + { + "auxiliary_loss_clip": 0.01775609, + "auxiliary_loss_mlp": 0.01485588, + "balance_loss_clip": 1.51239061, + "balance_loss_mlp": 1.34921277, + "epoch": 0.007755899594168045, + "flos": 22353499215360.0, + "grad_norm": 1.5447440556940202, + "language_loss": 0.91816604, + "learning_rate": 3.129000827968184e-06, + "loss": 0.95077795, + "num_input_tokens_seen": 2698290, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 1.36376953, + "step": 129, + "time_per_iteration": 3.036813497543335 + }, + { + "auxiliary_loss_clip": 0.01771995, + "auxiliary_loss_mlp": 0.0147112, + "balance_loss_clip": 1.51097655, + "balance_loss_mlp": 1.34804797, + "epoch": 0.007816022846836013, + "flos": 22648532428800.0, + "grad_norm": 1.740957399337759, + "language_loss": 1.06647468, + "learning_rate": 3.133972684206866e-06, + "loss": 1.0989058, + "num_input_tokens_seen": 2717630, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 1.22949219, + "step": 130, + "time_per_iteration": 3.003504514694214 + }, + { + "auxiliary_loss_clip": 0.01766666, + "auxiliary_loss_mlp": 0.01488458, + "balance_loss_clip": 1.50428295, + "balance_loss_mlp": 1.35494328, + "epoch": 0.007876146099503984, + "flos": 18190991629440.0, + "grad_norm": 1.633591787982827, + "language_loss": 0.91260767, + "learning_rate": 3.138906441556014e-06, + "loss": 0.94515896, + "num_input_tokens_seen": 2735835, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 1.3359375, + "step": 131, + "time_per_iteration": 2.9374780654907227 + }, + { + "auxiliary_loss_clip": 0.01769541, + "auxiliary_loss_mlp": 0.01500769, + "balance_loss_clip": 1.51053822, + "balance_loss_mlp": 1.36224818, + "epoch": 0.007936269352171952, + "flos": 27129265827840.0, + "grad_norm": 1.7986380249471208, + "language_loss": 0.91651535, + "learning_rate": 3.143802679474861e-06, + "loss": 0.94921839, + "num_input_tokens_seen": 2756335, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 1.38623047, + "step": 132, + "time_per_iteration": 3.0551247596740723 + }, + { + "auxiliary_loss_clip": 0.01759184, + "auxiliary_loss_mlp": 0.01509459, + "balance_loss_clip": 1.49819088, + "balance_loss_mlp": 1.3616395, + "epoch": 0.007996392604839923, + "flos": 19035750890880.0, + "grad_norm": 1.7781138983719795, + "language_loss": 1.04958653, + "learning_rate": 3.1486619643025565e-06, + "loss": 1.08227301, + "num_input_tokens_seen": 2775090, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 1.47851562, + "step": 133, + "time_per_iteration": 3.035219430923462 + }, + { + "auxiliary_loss_clip": 0.01743115, + "auxiliary_loss_mlp": 0.01522793, + "balance_loss_clip": 1.48822713, + "balance_loss_mlp": 1.35036814, + "epoch": 0.008056515857507891, + "flos": 25495199447040.0, + "grad_norm": 1.4242186650854611, + "language_loss": 0.80555087, + "learning_rate": 3.153484849651286e-06, + "loss": 0.83820999, + "num_input_tokens_seen": 2795320, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 1.72363281, + "step": 134, + "time_per_iteration": 2.9813473224639893 + }, + { + "auxiliary_loss_clip": 0.01744416, + "auxiliary_loss_mlp": 0.01483782, + "balance_loss_clip": 1.48811674, + "balance_loss_mlp": 1.35069668, + "epoch": 0.00811663911017586, + "flos": 20567075034240.0, + "grad_norm": 2.360031493174988, + "language_loss": 1.03100455, + "learning_rate": 3.1582718767847806e-06, + "loss": 1.06328654, + "num_input_tokens_seen": 2812815, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 1.33105469, + "step": 135, + "time_per_iteration": 2.951490879058838 + }, + { + "auxiliary_loss_clip": 0.01739459, + "auxiliary_loss_mlp": 0.01474254, + "balance_loss_clip": 1.48635364, + "balance_loss_mlp": 1.34856021, + "epoch": 0.00817676236284383, + "flos": 18807101078400.0, + "grad_norm": 2.0791676106209125, + "language_loss": 1.03863001, + "learning_rate": 3.1630235749828485e-06, + "loss": 1.07076716, + "num_input_tokens_seen": 2830445, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 1.25585938, + "step": 136, + "time_per_iteration": 2.939040184020996 + }, + { + "auxiliary_loss_clip": 0.01731514, + "auxiliary_loss_mlp": 0.01481807, + "balance_loss_clip": 1.47920465, + "balance_loss_mlp": 1.34752953, + "epoch": 0.008236885615511799, + "flos": 23882877832320.0, + "grad_norm": 1.8035601242269115, + "language_loss": 0.96884584, + "learning_rate": 3.1677404618925676e-06, + "loss": 1.00097907, + "num_input_tokens_seen": 2846965, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 1.34228516, + "step": 137, + "time_per_iteration": 3.0082385540008545 + }, + { + "auxiliary_loss_clip": 0.01728285, + "auxiliary_loss_mlp": 0.01482164, + "balance_loss_clip": 1.4768219, + "balance_loss_mlp": 1.34960294, + "epoch": 0.00829700886817977, + "flos": 24654105014400.0, + "grad_norm": 1.4912579024644579, + "language_loss": 0.97569871, + "learning_rate": 3.1724230438666953e-06, + "loss": 1.0078032, + "num_input_tokens_seen": 2867520, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 1.32519531, + "step": 138, + "time_per_iteration": 2.988132953643799 + }, + { + "auxiliary_loss_clip": 0.01720391, + "auxiliary_loss_mlp": 0.01497493, + "balance_loss_clip": 1.46845675, + "balance_loss_mlp": 1.34943545, + "epoch": 0.008357132120847738, + "flos": 25272159989760.0, + "grad_norm": 1.7767025349051841, + "language_loss": 0.98709744, + "learning_rate": 3.177071816289865e-06, + "loss": 1.01927638, + "num_input_tokens_seen": 2885675, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 1.48046875, + "step": 139, + "time_per_iteration": 3.0043931007385254 + }, + { + "auxiliary_loss_clip": 0.01720858, + "auxiliary_loss_mlp": 0.01470096, + "balance_loss_clip": 1.46969604, + "balance_loss_mlp": 1.34850264, + "epoch": 0.008417255373515706, + "flos": 27356422561920.0, + "grad_norm": 1.8838243025125034, + "language_loss": 1.01136756, + "learning_rate": 3.181687263893095e-06, + "loss": 1.04327714, + "num_input_tokens_seen": 2905960, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 1.21630859, + "step": 140, + "time_per_iteration": 3.014277458190918 + }, + { + "auxiliary_loss_clip": 0.01716456, + "auxiliary_loss_mlp": 0.01467969, + "balance_loss_clip": 1.4673841, + "balance_loss_mlp": 1.34856856, + "epoch": 0.008477378626183677, + "flos": 17647916567040.0, + "grad_norm": 2.0274133416777254, + "language_loss": 0.9790231, + "learning_rate": 3.186269861057098e-06, + "loss": 1.01086736, + "num_input_tokens_seen": 2922780, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 1.19433594, + "step": 141, + "time_per_iteration": 2.9487948417663574 + }, + { + "auxiliary_loss_clip": 0.01712216, + "auxiliary_loss_mlp": 0.01469021, + "balance_loss_clip": 1.46430659, + "balance_loss_mlp": 1.34814322, + "epoch": 0.008537501878851645, + "flos": 13889292480000.0, + "grad_norm": 2.2138094256594463, + "language_loss": 0.98319864, + "learning_rate": 3.1908200721048745e-06, + "loss": 1.01501107, + "num_input_tokens_seen": 2938765, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 1.20898438, + "step": 142, + "time_per_iteration": 3.000056028366089 + }, + { + "auxiliary_loss_clip": 0.01760019, + "auxiliary_loss_mlp": 0.01493213, + "balance_loss_clip": 1.54942608, + "balance_loss_mlp": 1.42950761, + "epoch": 0.008597625131519616, + "flos": 71283298567680.0, + "grad_norm": 1.1725613867174547, + "language_loss": 0.6717701, + "learning_rate": 3.195338351584042e-06, + "loss": 0.70430243, + "num_input_tokens_seen": 3006665, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.63671875, + "step": 143, + "time_per_iteration": 3.505128860473633 + }, + { + "auxiliary_loss_clip": 0.01713448, + "auxiliary_loss_mlp": 0.01477787, + "balance_loss_clip": 1.46557307, + "balance_loss_mlp": 1.35872102, + "epoch": 0.008657748384187584, + "flos": 17611738709760.0, + "grad_norm": 1.7054812241713622, + "language_loss": 0.9672817, + "learning_rate": 3.1998251445393258e-06, + "loss": 0.99919403, + "num_input_tokens_seen": 3024335, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 1.19140625, + "step": 144, + "time_per_iteration": 3.0075223445892334 + }, + { + "auxiliary_loss_clip": 0.01715618, + "auxiliary_loss_mlp": 0.01546279, + "balance_loss_clip": 1.46553135, + "balance_loss_mlp": 1.39264166, + "epoch": 0.008717871636855555, + "flos": 19723808851200.0, + "grad_norm": 1.5138212695528415, + "language_loss": 0.99771917, + "learning_rate": 3.204280886775619e-06, + "loss": 1.03033805, + "num_input_tokens_seen": 3043300, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 1.53613281, + "step": 145, + "time_per_iteration": 3.086247205734253 + }, + { + "auxiliary_loss_clip": 0.01730241, + "auxiliary_loss_mlp": 0.01551944, + "balance_loss_clip": 1.47565234, + "balance_loss_mlp": 1.41995525, + "epoch": 0.008777994889523523, + "flos": 24728134786560.0, + "grad_norm": 1.520543892784519, + "language_loss": 0.97562361, + "learning_rate": 3.208706005112005e-06, + "loss": 1.0084455, + "num_input_tokens_seen": 3064610, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 1.31933594, + "step": 146, + "time_per_iteration": 2.9851768016815186 + }, + { + "auxiliary_loss_clip": 0.01717858, + "auxiliary_loss_mlp": 0.0147494, + "balance_loss_clip": 1.51347804, + "balance_loss_mlp": 1.4371742, + "epoch": 0.008838118142191492, + "flos": 70161405050880.0, + "grad_norm": 0.9327860305439852, + "language_loss": 0.60406101, + "learning_rate": 3.213100917627104e-06, + "loss": 0.63598901, + "num_input_tokens_seen": 3130385, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.37695312, + "step": 147, + "time_per_iteration": 3.480670928955078 + }, + { + "auxiliary_loss_clip": 0.01712017, + "auxiliary_loss_mlp": 0.01493891, + "balance_loss_clip": 1.46028781, + "balance_loss_mlp": 1.37749517, + "epoch": 0.008898241394859462, + "flos": 20053662577920.0, + "grad_norm": 1.7733948706889147, + "language_loss": 0.91982633, + "learning_rate": 3.2174660338961135e-06, + "loss": 0.9518854, + "num_input_tokens_seen": 3149760, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 1.16455078, + "step": 148, + "time_per_iteration": 2.989441394805908 + }, + { + "auxiliary_loss_clip": 0.01709266, + "auxiliary_loss_mlp": 0.01483898, + "balance_loss_clip": 1.46102834, + "balance_loss_mlp": 1.3573451, + "epoch": 0.008958364647527431, + "flos": 10750759384320.0, + "grad_norm": 1.846657737799584, + "language_loss": 0.98415065, + "learning_rate": 3.2218017552198588e-06, + "loss": 1.01608229, + "num_input_tokens_seen": 3164500, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 1.265625, + "step": 149, + "time_per_iteration": 2.9582393169403076 + }, + { + "auxiliary_loss_clip": 0.01707349, + "auxiliary_loss_mlp": 0.01485741, + "balance_loss_clip": 1.4617424, + "balance_loss_mlp": 1.37788033, + "epoch": 0.009018487900195401, + "flos": 29138096039040.0, + "grad_norm": 3.5482986287921734, + "language_loss": 1.04718316, + "learning_rate": 3.226108474846181e-06, + "loss": 1.07911408, + "num_input_tokens_seen": 3182455, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 1.07861328, + "step": 150, + "time_per_iteration": 3.035717725753784 + }, + { + "auxiliary_loss_clip": 0.01698054, + "auxiliary_loss_mlp": 0.01502167, + "balance_loss_clip": 1.45451343, + "balance_loss_mlp": 1.40126812, + "epoch": 0.00907861115286337, + "flos": 32976631722240.0, + "grad_norm": 1.7070340963942663, + "language_loss": 0.85386127, + "learning_rate": 3.2303865781839817e-06, + "loss": 0.88586354, + "num_input_tokens_seen": 3203995, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 1.00927734, + "step": 151, + "time_per_iteration": 3.0584869384765625 + }, + { + "auxiliary_loss_clip": 0.01714564, + "auxiliary_loss_mlp": 0.01537387, + "balance_loss_clip": 1.46676886, + "balance_loss_mlp": 1.42208743, + "epoch": 0.009138734405531338, + "flos": 21772481748480.0, + "grad_norm": 1.8963026642612548, + "language_loss": 1.01383948, + "learning_rate": 3.234636443010188e-06, + "loss": 1.04635906, + "num_input_tokens_seen": 3222575, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 1.15283203, + "step": 152, + "time_per_iteration": 3.015085458755493 + }, + { + "auxiliary_loss_clip": 0.01697658, + "auxiliary_loss_mlp": 0.01505996, + "balance_loss_clip": 1.453601, + "balance_loss_mlp": 1.4037149, + "epoch": 0.009198857658199309, + "flos": 20850299354880.0, + "grad_norm": 2.5095207462199944, + "language_loss": 1.0263381, + "learning_rate": 3.238858439669943e-06, + "loss": 1.05837464, + "num_input_tokens_seen": 3240180, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 1.02294922, + "step": 153, + "time_per_iteration": 2.9863834381103516 + }, + { + "auxiliary_loss_clip": 0.01686311, + "auxiliary_loss_mlp": 0.01482566, + "balance_loss_clip": 1.44646311, + "balance_loss_mlp": 1.36826825, + "epoch": 0.009258980910867277, + "flos": 24838251926400.0, + "grad_norm": 1.58784688596587, + "language_loss": 0.9567551, + "learning_rate": 3.2430529312702712e-06, + "loss": 0.98844391, + "num_input_tokens_seen": 3259800, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 1.14306641, + "step": 154, + "time_per_iteration": 2.988161325454712 + }, + { + "auxiliary_loss_clip": 0.01683941, + "auxiliary_loss_mlp": 0.01451792, + "balance_loss_clip": 1.44340765, + "balance_loss_mlp": 1.34278738, + "epoch": 0.009319104163535248, + "flos": 28779665581440.0, + "grad_norm": 1.7612335960786463, + "language_loss": 0.96971238, + "learning_rate": 3.2472202738674737e-06, + "loss": 1.00106966, + "num_input_tokens_seen": 3280400, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 1.09033203, + "step": 155, + "time_per_iteration": 5.857283592224121 + }, + { + "auxiliary_loss_clip": 0.01700492, + "auxiliary_loss_mlp": 0.01467731, + "balance_loss_clip": 1.45578206, + "balance_loss_mlp": 1.3586781, + "epoch": 0.009379227416203216, + "flos": 16590840865920.0, + "grad_norm": 1.9042846459747682, + "language_loss": 0.99431241, + "learning_rate": 3.2513608166485063e-06, + "loss": 1.02599466, + "num_input_tokens_seen": 3297600, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 1.09033203, + "step": 156, + "time_per_iteration": 5.756657600402832 + }, + { + "auxiliary_loss_clip": 0.01692196, + "auxiliary_loss_mlp": 0.015152, + "balance_loss_clip": 1.44911659, + "balance_loss_mlp": 1.37849081, + "epoch": 0.009439350668871187, + "flos": 18338327256960.0, + "grad_norm": 2.102312069099806, + "language_loss": 1.1224153, + "learning_rate": 3.2554749021065498e-06, + "loss": 1.15448916, + "num_input_tokens_seen": 3313635, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 1.36767578, + "step": 157, + "time_per_iteration": 2.929694652557373 + }, + { + "auxiliary_loss_clip": 0.01687177, + "auxiliary_loss_mlp": 0.01494498, + "balance_loss_clip": 1.44721782, + "balance_loss_mlp": 1.38263166, + "epoch": 0.009499473921539155, + "flos": 24359840962560.0, + "grad_norm": 1.7898147420392636, + "language_loss": 0.99449217, + "learning_rate": 3.2595628662110186e-06, + "loss": 1.02630901, + "num_input_tokens_seen": 3333735, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 1.11865234, + "step": 158, + "time_per_iteration": 3.1394481658935547 + }, + { + "auxiliary_loss_clip": 0.01677624, + "auxiliary_loss_mlp": 0.01512768, + "balance_loss_clip": 1.43753374, + "balance_loss_mlp": 1.38488019, + "epoch": 0.009559597174207124, + "flos": 16408051297920.0, + "grad_norm": 2.0099245126604957, + "language_loss": 0.99955839, + "learning_rate": 3.2636250385721982e-06, + "loss": 1.03146219, + "num_input_tokens_seen": 3348800, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 1.27783203, + "step": 159, + "time_per_iteration": 3.36445689201355 + }, + { + "auxiliary_loss_clip": 0.01676278, + "auxiliary_loss_mlp": 0.01492735, + "balance_loss_clip": 1.43387544, + "balance_loss_mlp": 1.3758142, + "epoch": 0.009619720426875094, + "flos": 22867183140480.0, + "grad_norm": 1.4472853637188445, + "language_loss": 0.93441355, + "learning_rate": 3.2676617426007263e-06, + "loss": 0.96610367, + "num_input_tokens_seen": 3368595, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 1.16943359, + "step": 160, + "time_per_iteration": 3.147494316101074 + }, + { + "auxiliary_loss_clip": 0.01671263, + "auxiliary_loss_mlp": 0.01479541, + "balance_loss_clip": 1.43019044, + "balance_loss_mlp": 1.36600614, + "epoch": 0.009679843679543063, + "flos": 19144239217920.0, + "grad_norm": 1.9611011875678692, + "language_loss": 1.0494926, + "learning_rate": 3.2716732956621042e-06, + "loss": 1.08100057, + "num_input_tokens_seen": 3384975, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 1.13623047, + "step": 161, + "time_per_iteration": 3.083754539489746 + }, + { + "auxiliary_loss_clip": 0.01667278, + "auxiliary_loss_mlp": 0.01456613, + "balance_loss_clip": 1.42786336, + "balance_loss_mlp": 1.35352039, + "epoch": 0.009739966932211033, + "flos": 20312653668480.0, + "grad_norm": 1.5627629857261016, + "language_loss": 1.02950001, + "learning_rate": 3.2756600092264203e-06, + "loss": 1.0607388, + "num_input_tokens_seen": 3404755, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 1.03076172, + "step": 162, + "time_per_iteration": 3.0321624279022217 + }, + { + "auxiliary_loss_clip": 0.01646538, + "auxiliary_loss_mlp": 0.0147447, + "balance_loss_clip": 1.45340502, + "balance_loss_mlp": 1.41229022, + "epoch": 0.009800090184879002, + "flos": 67063664782080.0, + "grad_norm": 1.2254356704368674, + "language_loss": 0.72704124, + "learning_rate": 3.279622189013474e-06, + "loss": 0.75825131, + "num_input_tokens_seen": 3467210, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.62109375, + "step": 163, + "time_per_iteration": 3.41477370262146 + }, + { + "auxiliary_loss_clip": 0.01654993, + "auxiliary_loss_mlp": 0.01472794, + "balance_loss_clip": 1.41852486, + "balance_loss_mlp": 1.36169064, + "epoch": 0.00986021343754697, + "flos": 17173396656000.0, + "grad_norm": 1.8904270217147379, + "language_loss": 0.97955823, + "learning_rate": 3.283560135133457e-06, + "loss": 1.01083612, + "num_input_tokens_seen": 3483220, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 1.11083984, + "step": 164, + "time_per_iteration": 2.9637222290039062 + }, + { + "auxiliary_loss_clip": 0.01657604, + "auxiliary_loss_mlp": 0.01501523, + "balance_loss_clip": 1.41857445, + "balance_loss_mlp": 1.38236129, + "epoch": 0.00992033669021494, + "flos": 17758531399680.0, + "grad_norm": 1.8437212215427194, + "language_loss": 1.00707221, + "learning_rate": 3.2874741422233565e-06, + "loss": 1.03866363, + "num_input_tokens_seen": 3501465, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 1.19238281, + "step": 165, + "time_per_iteration": 3.0621936321258545 + }, + { + "auxiliary_loss_clip": 0.01648986, + "auxiliary_loss_mlp": 0.01488579, + "balance_loss_clip": 1.41218567, + "balance_loss_mlp": 1.39478469, + "epoch": 0.00998045994288291, + "flos": 25306980503040.0, + "grad_norm": 1.5728534755454364, + "language_loss": 0.90479118, + "learning_rate": 3.2913644995792465e-06, + "loss": 0.93616682, + "num_input_tokens_seen": 3520480, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.93847656, + "step": 166, + "time_per_iteration": 2.993326425552368 + }, + { + "auxiliary_loss_clip": 0.01646425, + "auxiliary_loss_mlp": 0.01485213, + "balance_loss_clip": 1.41008055, + "balance_loss_mlp": 1.38297963, + "epoch": 0.01004058319555088, + "flos": 32309685100800.0, + "grad_norm": 1.8408636647865255, + "language_loss": 1.01632929, + "learning_rate": 3.2952314912845914e-06, + "loss": 1.04764569, + "num_input_tokens_seen": 3539570, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 1.02197266, + "step": 167, + "time_per_iteration": 3.109844207763672 + }, + { + "auxiliary_loss_clip": 0.0164675, + "auxiliary_loss_mlp": 0.01458415, + "balance_loss_clip": 1.41309142, + "balance_loss_mlp": 1.37029552, + "epoch": 0.010100706448218848, + "flos": 11325306844800.0, + "grad_norm": 2.0017223838491023, + "language_loss": 1.06033587, + "learning_rate": 3.299075396334735e-06, + "loss": 1.09138751, + "num_input_tokens_seen": 3555465, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.88085938, + "step": 168, + "time_per_iteration": 3.099179267883301 + }, + { + "auxiliary_loss_clip": 0.01648768, + "auxiliary_loss_mlp": 0.01443044, + "balance_loss_clip": 1.41379213, + "balance_loss_mlp": 1.35058498, + "epoch": 0.010160829700886819, + "flos": 29732415477120.0, + "grad_norm": 1.3823934475823065, + "language_loss": 0.94224906, + "learning_rate": 3.3028964887576868e-06, + "loss": 0.97316718, + "num_input_tokens_seen": 3578970, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.92431641, + "step": 169, + "time_per_iteration": 3.0935239791870117 + }, + { + "auxiliary_loss_clip": 0.01650068, + "auxiliary_loss_mlp": 0.01440419, + "balance_loss_clip": 1.41612494, + "balance_loss_mlp": 1.35382497, + "epoch": 0.010220952953554787, + "flos": 20422001646720.0, + "grad_norm": 1.4773365170470347, + "language_loss": 0.94947594, + "learning_rate": 3.306695037731344e-06, + "loss": 0.98038083, + "num_input_tokens_seen": 3597275, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.86523438, + "step": 170, + "time_per_iteration": 3.008519172668457 + }, + { + "auxiliary_loss_clip": 0.0167029, + "auxiliary_loss_mlp": 0.01460026, + "balance_loss_clip": 1.4290185, + "balance_loss_mlp": 1.36637545, + "epoch": 0.010281076206222756, + "flos": 31297609992960.0, + "grad_norm": 1.6130596964371056, + "language_loss": 1.00798678, + "learning_rate": 3.3104713076972827e-06, + "loss": 1.03928983, + "num_input_tokens_seen": 3618905, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.93701172, + "step": 171, + "time_per_iteration": 3.111489772796631 + }, + { + "auxiliary_loss_clip": 0.01650145, + "auxiliary_loss_mlp": 0.01454002, + "balance_loss_clip": 1.41761374, + "balance_loss_mlp": 1.36197269, + "epoch": 0.010341199458890726, + "flos": 21992625538560.0, + "grad_norm": 1.6574259950964503, + "language_loss": 0.98412812, + "learning_rate": 3.314225558471224e-06, + "loss": 1.0151695, + "num_input_tokens_seen": 3639610, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.92041016, + "step": 172, + "time_per_iteration": 3.0119850635528564 + }, + { + "auxiliary_loss_clip": 0.0164008, + "auxiliary_loss_mlp": 0.01411296, + "balance_loss_clip": 1.41633415, + "balance_loss_mlp": 1.3497839, + "epoch": 0.010401322711558695, + "flos": 30822818613120.0, + "grad_norm": 1.4251684275897927, + "language_loss": 0.89418882, + "learning_rate": 3.317958045350308e-06, + "loss": 0.92470258, + "num_input_tokens_seen": 3664030, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.61425781, + "step": 173, + "time_per_iteration": 3.2355563640594482 + }, + { + "auxiliary_loss_clip": 0.01645054, + "auxiliary_loss_mlp": 0.01452845, + "balance_loss_clip": 1.41507065, + "balance_loss_mlp": 1.35022962, + "epoch": 0.010461445964226665, + "flos": 24725239119360.0, + "grad_norm": 1.5776203616061641, + "language_loss": 0.92519748, + "learning_rate": 3.3216690192172596e-06, + "loss": 0.95617652, + "num_input_tokens_seen": 3683615, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 1.02587891, + "step": 174, + "time_per_iteration": 3.2617673873901367 + }, + { + "auxiliary_loss_clip": 0.01643694, + "auxiliary_loss_mlp": 0.01442153, + "balance_loss_clip": 1.40976715, + "balance_loss_mlp": 1.34359097, + "epoch": 0.010521569216894634, + "flos": 27722137432320.0, + "grad_norm": 1.6991857998138788, + "language_loss": 0.82230949, + "learning_rate": 3.325358726641591e-06, + "loss": 0.85316801, + "num_input_tokens_seen": 3704540, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.98535156, + "step": 175, + "time_per_iteration": 3.1029117107391357 + }, + { + "auxiliary_loss_clip": 0.01628773, + "auxiliary_loss_mlp": 0.01463245, + "balance_loss_clip": 1.40014338, + "balance_loss_mlp": 1.34775496, + "epoch": 0.010581692469562603, + "flos": 12465913726080.0, + "grad_norm": 3.1192036637042633, + "language_loss": 1.0955677, + "learning_rate": 3.329027409977902e-06, + "loss": 1.12648797, + "num_input_tokens_seen": 3721320, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 1.15576172, + "step": 176, + "time_per_iteration": 3.086076498031616 + }, + { + "auxiliary_loss_clip": 0.01620001, + "auxiliary_loss_mlp": 0.01462779, + "balance_loss_clip": 1.39549148, + "balance_loss_mlp": 1.35844743, + "epoch": 0.010641815722230573, + "flos": 19437100680960.0, + "grad_norm": 1.8462771906213882, + "language_loss": 0.89843351, + "learning_rate": 3.3326753074614087e-06, + "loss": 0.92926133, + "num_input_tokens_seen": 3739385, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 1.04394531, + "step": 177, + "time_per_iteration": 3.0327537059783936 + }, + { + "auxiliary_loss_clip": 0.01618232, + "auxiliary_loss_mlp": 0.0143988, + "balance_loss_clip": 1.39239311, + "balance_loss_mlp": 1.35686231, + "epoch": 0.010701938974898541, + "flos": 18341675372160.0, + "grad_norm": 2.103736868146801, + "language_loss": 0.94473076, + "learning_rate": 3.3363026533007716e-06, + "loss": 0.97531188, + "num_input_tokens_seen": 3756360, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.83056641, + "step": 178, + "time_per_iteration": 2.995852470397949 + }, + { + "auxiliary_loss_clip": 0.01610333, + "auxiliary_loss_mlp": 0.01445442, + "balance_loss_clip": 1.38644528, + "balance_loss_mlp": 1.36266279, + "epoch": 0.010762062227566512, + "flos": 19212206186880.0, + "grad_norm": 1.8843429627550965, + "language_loss": 0.95800889, + "learning_rate": 3.3399096777683303e-06, + "loss": 0.98856664, + "num_input_tokens_seen": 3773930, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.82861328, + "step": 179, + "time_per_iteration": 3.0423085689544678 + }, + { + "auxiliary_loss_clip": 0.01607489, + "auxiliary_loss_mlp": 0.01454743, + "balance_loss_clip": 1.38320935, + "balance_loss_mlp": 1.3504113, + "epoch": 0.01082218548023448, + "flos": 31436484842880.0, + "grad_norm": 1.812687952719425, + "language_loss": 0.97297537, + "learning_rate": 3.3434966072878213e-06, + "loss": 1.00359762, + "num_input_tokens_seen": 3793630, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 1.04443359, + "step": 180, + "time_per_iteration": 3.0637238025665283 + }, + { + "auxiliary_loss_clip": 0.01603919, + "auxiliary_loss_mlp": 0.01428238, + "balance_loss_clip": 1.37961936, + "balance_loss_mlp": 1.34560251, + "epoch": 0.01088230873290245, + "flos": 25057400330880.0, + "grad_norm": 1.7641694181066094, + "language_loss": 0.88037115, + "learning_rate": 3.3470636645196674e-06, + "loss": 0.91069275, + "num_input_tokens_seen": 3813610, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.82568359, + "step": 181, + "time_per_iteration": 3.085604429244995 + }, + { + "auxiliary_loss_clip": 0.01605751, + "auxiliary_loss_mlp": 0.01464032, + "balance_loss_clip": 1.37691689, + "balance_loss_mlp": 1.34372616, + "epoch": 0.01094243198557042, + "flos": 22903722956160.0, + "grad_norm": 2.1307819986005034, + "language_loss": 0.99614644, + "learning_rate": 3.3506110684439156e-06, + "loss": 1.02684426, + "num_input_tokens_seen": 3831390, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 1.20410156, + "step": 182, + "time_per_iteration": 2.995382308959961 + }, + { + "auxiliary_loss_clip": 0.01600262, + "auxiliary_loss_mlp": 0.01448144, + "balance_loss_clip": 1.37345481, + "balance_loss_mlp": 1.35206163, + "epoch": 0.011002555238238388, + "flos": 17173849104000.0, + "grad_norm": 1.8525369895595036, + "language_loss": 1.01507986, + "learning_rate": 3.3541390344409054e-06, + "loss": 1.04556394, + "num_input_tokens_seen": 3849705, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.9609375, + "step": 183, + "time_per_iteration": 2.994041919708252 + }, + { + "auxiliary_loss_clip": 0.01596406, + "auxiliary_loss_mlp": 0.01471138, + "balance_loss_clip": 1.37230945, + "balance_loss_mlp": 1.36027384, + "epoch": 0.011062678490906358, + "flos": 22320443249280.0, + "grad_norm": 1.7988334841039793, + "language_loss": 0.98383915, + "learning_rate": 3.357647774369736e-06, + "loss": 1.01451468, + "num_input_tokens_seen": 3869230, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 1.10839844, + "step": 184, + "time_per_iteration": 3.105968475341797 + }, + { + "auxiliary_loss_clip": 0.01589508, + "auxiliary_loss_mlp": 0.0145589, + "balance_loss_clip": 1.37041354, + "balance_loss_mlp": 1.35399008, + "epoch": 0.011122801743574327, + "flos": 24398823997440.0, + "grad_norm": 1.606355745395951, + "language_loss": 0.94632804, + "learning_rate": 3.3611374966446085e-06, + "loss": 0.97678202, + "num_input_tokens_seen": 3889735, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 1.01904297, + "step": 185, + "time_per_iteration": 2.9795873165130615 + }, + { + "auxiliary_loss_clip": 0.01590443, + "auxiliary_loss_mlp": 0.01423736, + "balance_loss_clip": 1.36884439, + "balance_loss_mlp": 1.34000301, + "epoch": 0.011182924996242297, + "flos": 18159338252160.0, + "grad_norm": 1.8106894182956066, + "language_loss": 0.86341554, + "learning_rate": 3.3646084063091142e-06, + "loss": 0.89355731, + "num_input_tokens_seen": 3908855, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.83789062, + "step": 186, + "time_per_iteration": 2.9593544006347656 + }, + { + "auxiliary_loss_clip": 0.01596973, + "auxiliary_loss_mlp": 0.01433203, + "balance_loss_clip": 1.37253225, + "balance_loss_mlp": 1.34403408, + "epoch": 0.011243048248910266, + "flos": 15495189333120.0, + "grad_norm": 2.1098877541394394, + "language_loss": 1.16447306, + "learning_rate": 3.3680607051085194e-06, + "loss": 1.19477475, + "num_input_tokens_seen": 3923865, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.89111328, + "step": 187, + "time_per_iteration": 2.933722734451294 + }, + { + "auxiliary_loss_clip": 0.01586543, + "auxiliary_loss_mlp": 0.01437228, + "balance_loss_clip": 1.36790895, + "balance_loss_mlp": 1.34367251, + "epoch": 0.011303171501578235, + "flos": 40931724257280.0, + "grad_norm": 1.3705166734980192, + "language_loss": 0.82476997, + "learning_rate": 3.371494591560139e-06, + "loss": 0.85500765, + "num_input_tokens_seen": 3946870, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.93603516, + "step": 188, + "time_per_iteration": 3.143360137939453 + }, + { + "auxiliary_loss_clip": 0.01586828, + "auxiliary_loss_mlp": 0.01475275, + "balance_loss_clip": 1.40591264, + "balance_loss_mlp": 1.41767311, + "epoch": 0.011363294754246205, + "flos": 66331465879680.0, + "grad_norm": 0.867892790654316, + "language_loss": 0.56381094, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.594432, + "num_input_tokens_seen": 4010005, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.57421875, + "step": 189, + "time_per_iteration": 4.888100624084473 + }, + { + "auxiliary_loss_clip": 0.01582753, + "auxiliary_loss_mlp": 0.01423865, + "balance_loss_clip": 1.36430097, + "balance_loss_mlp": 1.34556842, + "epoch": 0.011423418006914174, + "flos": 24911150578560.0, + "grad_norm": 1.792213522382581, + "language_loss": 1.06865633, + "learning_rate": 3.3783079057586833e-06, + "loss": 1.09872258, + "num_input_tokens_seen": 4029035, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.78320312, + "step": 190, + "time_per_iteration": 5.986846923828125 + }, + { + "auxiliary_loss_clip": 0.01588894, + "auxiliary_loss_mlp": 0.0144313, + "balance_loss_clip": 1.36869991, + "balance_loss_mlp": 1.36383212, + "epoch": 0.011483541259582144, + "flos": 19801322472960.0, + "grad_norm": 1.9985818129320894, + "language_loss": 0.98708963, + "learning_rate": 3.3816877150079665e-06, + "loss": 1.0174098, + "num_input_tokens_seen": 4046995, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.79345703, + "step": 191, + "time_per_iteration": 3.0297658443450928 + }, + { + "auxiliary_loss_clip": 0.01586426, + "auxiliary_loss_mlp": 0.01482982, + "balance_loss_clip": 1.36770689, + "balance_loss_mlp": 1.3854686, + "epoch": 0.011543664512250112, + "flos": 26188053356160.0, + "grad_norm": 1.7106597154035308, + "language_loss": 1.00054991, + "learning_rate": 3.385049875042367e-06, + "loss": 1.03124392, + "num_input_tokens_seen": 4065865, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.97460938, + "step": 192, + "time_per_iteration": 4.4699742794036865 + }, + { + "auxiliary_loss_clip": 0.0159415, + "auxiliary_loss_mlp": 0.0145886, + "balance_loss_clip": 1.37441766, + "balance_loss_mlp": 1.38027692, + "epoch": 0.011603787764918083, + "flos": 23779502167680.0, + "grad_norm": 1.8966645310430514, + "language_loss": 0.98729956, + "learning_rate": 3.3883945692315938e-06, + "loss": 1.01782966, + "num_input_tokens_seen": 4085305, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.78564453, + "step": 193, + "time_per_iteration": 3.0018441677093506 + }, + { + "auxiliary_loss_clip": 0.01588637, + "auxiliary_loss_mlp": 0.01457368, + "balance_loss_clip": 1.3726126, + "balance_loss_mlp": 1.38269556, + "epoch": 0.011663911017586051, + "flos": 25964878164480.0, + "grad_norm": 1.7196389491649033, + "language_loss": 1.03056097, + "learning_rate": 3.3917219781023906e-06, + "loss": 1.06102109, + "num_input_tokens_seen": 4105185, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.74707031, + "step": 194, + "time_per_iteration": 3.0725462436676025 + }, + { + "auxiliary_loss_clip": 0.01600153, + "auxiliary_loss_mlp": 0.01436216, + "balance_loss_clip": 1.37777197, + "balance_loss_mlp": 1.35343742, + "epoch": 0.01172403427025402, + "flos": 17904238214400.0, + "grad_norm": 1.8620773395163162, + "language_loss": 1.06341648, + "learning_rate": 3.3950322793970014e-06, + "loss": 1.09378028, + "num_input_tokens_seen": 4123160, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.82714844, + "step": 195, + "time_per_iteration": 2.960056781768799 + }, + { + "auxiliary_loss_clip": 0.01598722, + "auxiliary_loss_mlp": 0.01445266, + "balance_loss_clip": 1.37920976, + "balance_loss_mlp": 1.34513021, + "epoch": 0.01178415752292199, + "flos": 17903469052800.0, + "grad_norm": 2.0132020838664912, + "language_loss": 0.99307179, + "learning_rate": 3.3983256481301445e-06, + "loss": 1.02351165, + "num_input_tokens_seen": 4140425, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 1.00244141, + "step": 196, + "time_per_iteration": 2.994088649749756 + }, + { + "auxiliary_loss_clip": 0.01595048, + "auxiliary_loss_mlp": 0.01447349, + "balance_loss_clip": 1.37689674, + "balance_loss_mlp": 1.34354186, + "epoch": 0.011844280775589959, + "flos": 22903903935360.0, + "grad_norm": 1.7019718340320535, + "language_loss": 1.06016111, + "learning_rate": 3.4016022566445335e-06, + "loss": 1.09058499, + "num_input_tokens_seen": 4159555, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 1.03710938, + "step": 197, + "time_per_iteration": 3.001882791519165 + }, + { + "auxiliary_loss_clip": 0.01588873, + "auxiliary_loss_mlp": 0.01428747, + "balance_loss_clip": 1.37480688, + "balance_loss_mlp": 1.35250032, + "epoch": 0.01190440402825793, + "flos": 26991838811520.0, + "grad_norm": 1.7998534599724627, + "language_loss": 0.90103662, + "learning_rate": 3.4048622746649966e-06, + "loss": 0.93121278, + "num_input_tokens_seen": 4180480, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.76269531, + "step": 198, + "time_per_iteration": 3.0731801986694336 + }, + { + "auxiliary_loss_clip": 0.01594941, + "auxiliary_loss_mlp": 0.01455012, + "balance_loss_clip": 1.3803103, + "balance_loss_mlp": 1.36860907, + "epoch": 0.011964527280925898, + "flos": 20531213890560.0, + "grad_norm": 1.5120346432146654, + "language_loss": 0.94269764, + "learning_rate": 3.4081058693512278e-06, + "loss": 0.97319716, + "num_input_tokens_seen": 4198835, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.86425781, + "step": 199, + "time_per_iteration": 3.017489194869995 + }, + { + "auxiliary_loss_clip": 0.01589983, + "auxiliary_loss_mlp": 0.0149015, + "balance_loss_clip": 1.375211, + "balance_loss_mlp": 1.38085949, + "epoch": 0.012024650533593867, + "flos": 27757546128000.0, + "grad_norm": 1.5399294366883403, + "language_loss": 0.92364168, + "learning_rate": 3.411333205349222e-06, + "loss": 0.95444298, + "num_input_tokens_seen": 4219335, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 1.09179688, + "step": 200, + "time_per_iteration": 3.12546706199646 + }, + { + "auxiliary_loss_clip": 0.01579263, + "auxiliary_loss_mlp": 0.01453223, + "balance_loss_clip": 1.36549819, + "balance_loss_mlp": 1.36553311, + "epoch": 0.012084773786261837, + "flos": 10459164775680.0, + "grad_norm": 1.6793761058769672, + "language_loss": 1.0211432, + "learning_rate": 3.4145444448414217e-06, + "loss": 1.05146813, + "num_input_tokens_seen": 4236940, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.87695312, + "step": 201, + "time_per_iteration": 3.0311992168426514 + }, + { + "auxiliary_loss_clip": 0.01580439, + "auxiliary_loss_mlp": 0.01461107, + "balance_loss_clip": 1.3682791, + "balance_loss_mlp": 1.36335516, + "epoch": 0.012144897038929806, + "flos": 23114410583040.0, + "grad_norm": 1.5656129433252899, + "language_loss": 0.93370402, + "learning_rate": 3.4177397475956223e-06, + "loss": 0.96411949, + "num_input_tokens_seen": 4256755, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.97851562, + "step": 202, + "time_per_iteration": 3.074474334716797 + }, + { + "auxiliary_loss_clip": 0.01568137, + "auxiliary_loss_mlp": 0.01426323, + "balance_loss_clip": 1.35298061, + "balance_loss_mlp": 1.34588087, + "epoch": 0.012205020291597776, + "flos": 21043178513280.0, + "grad_norm": 1.6077091105372745, + "language_loss": 0.99539471, + "learning_rate": 3.4209192710126685e-06, + "loss": 1.02533925, + "num_input_tokens_seen": 4276505, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.80517578, + "step": 203, + "time_per_iteration": 2.967900037765503 + }, + { + "auxiliary_loss_clip": 0.01578951, + "auxiliary_loss_mlp": 0.01460392, + "balance_loss_clip": 1.40472007, + "balance_loss_mlp": 1.37341678, + "epoch": 0.012265143544265745, + "flos": 68475144153600.0, + "grad_norm": 1.0529902784300085, + "language_loss": 0.61530805, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.64570153, + "num_input_tokens_seen": 4330965, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.87109375, + "step": 204, + "time_per_iteration": 3.2920563220977783 + }, + { + "auxiliary_loss_clip": 0.0156772, + "auxiliary_loss_mlp": 0.01448896, + "balance_loss_clip": 1.35566282, + "balance_loss_mlp": 1.35147834, + "epoch": 0.012325266796933715, + "flos": 17028051799680.0, + "grad_norm": 1.901813848608763, + "language_loss": 1.03444135, + "learning_rate": 3.4272315978819516e-06, + "loss": 1.06460762, + "num_input_tokens_seen": 4348200, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.97412109, + "step": 205, + "time_per_iteration": 2.985851526260376 + }, + { + "auxiliary_loss_clip": 0.01570419, + "auxiliary_loss_mlp": 0.01464171, + "balance_loss_clip": 1.35819101, + "balance_loss_mlp": 1.3507787, + "epoch": 0.012385390049601683, + "flos": 20198781210240.0, + "grad_norm": 1.8404623470408281, + "language_loss": 1.01589632, + "learning_rate": 3.4303647047142043e-06, + "loss": 1.04624224, + "num_input_tokens_seen": 4365460, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 1.13427734, + "step": 206, + "time_per_iteration": 2.9894423484802246 + }, + { + "auxiliary_loss_clip": 0.01570843, + "auxiliary_loss_mlp": 0.01455271, + "balance_loss_clip": 1.35698056, + "balance_loss_mlp": 1.35885477, + "epoch": 0.012445513302269652, + "flos": 16261756300800.0, + "grad_norm": 1.7020108087685581, + "language_loss": 1.08177805, + "learning_rate": 3.43348263905683e-06, + "loss": 1.11203933, + "num_input_tokens_seen": 4383650, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.96435547, + "step": 207, + "time_per_iteration": 2.9436988830566406 + }, + { + "auxiliary_loss_clip": 0.0156513, + "auxiliary_loss_mlp": 0.01454605, + "balance_loss_clip": 1.35251713, + "balance_loss_mlp": 1.34679234, + "epoch": 0.012505636554937622, + "flos": 23779999860480.0, + "grad_norm": 1.5428073141987484, + "language_loss": 0.85805249, + "learning_rate": 3.436585547151547e-06, + "loss": 0.88824981, + "num_input_tokens_seen": 4403765, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 1.07910156, + "step": 208, + "time_per_iteration": 3.064143657684326 + }, + { + "auxiliary_loss_clip": 0.01555433, + "auxiliary_loss_mlp": 0.01426469, + "balance_loss_clip": 1.34742963, + "balance_loss_mlp": 1.34531164, + "epoch": 0.012565759807605591, + "flos": 30603760698240.0, + "grad_norm": 1.851827969060184, + "language_loss": 1.10349154, + "learning_rate": 3.4396735731358586e-06, + "loss": 1.13331056, + "num_input_tokens_seen": 4421935, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.81201172, + "step": 209, + "time_per_iteration": 2.996107816696167 + }, + { + "auxiliary_loss_clip": 0.01555773, + "auxiliary_loss_mlp": 0.01444747, + "balance_loss_clip": 1.34767008, + "balance_loss_mlp": 1.34737718, + "epoch": 0.012625883060273561, + "flos": 40127984046720.0, + "grad_norm": 2.183608990792774, + "language_loss": 0.97831374, + "learning_rate": 3.4427468590832302e-06, + "loss": 1.00831902, + "num_input_tokens_seen": 4441470, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.97363281, + "step": 210, + "time_per_iteration": 3.157759428024292 + }, + { + "auxiliary_loss_clip": 0.01549514, + "auxiliary_loss_mlp": 0.01449906, + "balance_loss_clip": 1.3399713, + "balance_loss_mlp": 1.34628963, + "epoch": 0.01268600631294153, + "flos": 27100553362560.0, + "grad_norm": 1.8298789375655278, + "language_loss": 1.04656506, + "learning_rate": 3.445805545042314e-06, + "loss": 1.07655931, + "num_input_tokens_seen": 4459950, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 1.03515625, + "step": 211, + "time_per_iteration": 2.976926565170288 + }, + { + "auxiliary_loss_clip": 0.01545301, + "auxiliary_loss_mlp": 0.01429967, + "balance_loss_clip": 1.33997726, + "balance_loss_mlp": 1.35996699, + "epoch": 0.012746129565609499, + "flos": 16991557228800.0, + "grad_norm": 1.9047585529259456, + "language_loss": 1.09408629, + "learning_rate": 3.448849769075239e-06, + "loss": 1.1238389, + "num_input_tokens_seen": 4478390, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.69970703, + "step": 212, + "time_per_iteration": 2.9629440307617188 + }, + { + "auxiliary_loss_clip": 0.01542534, + "auxiliary_loss_mlp": 0.01446048, + "balance_loss_clip": 1.33732605, + "balance_loss_mlp": 1.36150515, + "epoch": 0.012806252818277469, + "flos": 46549218729600.0, + "grad_norm": 1.5140581898646686, + "language_loss": 0.86952293, + "learning_rate": 3.4518796672950093e-06, + "loss": 0.89940876, + "num_input_tokens_seen": 4501665, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.84472656, + "step": 213, + "time_per_iteration": 3.3281195163726807 + }, + { + "auxiliary_loss_clip": 0.01543328, + "auxiliary_loss_mlp": 0.01443704, + "balance_loss_clip": 1.33556056, + "balance_loss_mlp": 1.34929061, + "epoch": 0.012866376070945438, + "flos": 14395782481920.0, + "grad_norm": 2.0071758723376036, + "language_loss": 1.00026679, + "learning_rate": 3.4548953739020187e-06, + "loss": 1.03013706, + "num_input_tokens_seen": 4519055, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.94482422, + "step": 214, + "time_per_iteration": 3.098564386367798 + }, + { + "auxiliary_loss_clip": 0.01540683, + "auxiliary_loss_mlp": 0.01431085, + "balance_loss_clip": 1.33509338, + "balance_loss_mlp": 1.34492087, + "epoch": 0.012926499323613408, + "flos": 26152011233280.0, + "grad_norm": 1.7264916970998099, + "language_loss": 0.90226293, + "learning_rate": 3.4578970212197196e-06, + "loss": 0.93198061, + "num_input_tokens_seen": 4540870, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.86181641, + "step": 215, + "time_per_iteration": 3.114870309829712 + }, + { + "auxiliary_loss_clip": 0.0154069, + "auxiliary_loss_mlp": 0.01419949, + "balance_loss_clip": 1.3332057, + "balance_loss_mlp": 1.34408414, + "epoch": 0.012986622576281377, + "flos": 30129286032000.0, + "grad_norm": 1.8948763808052436, + "language_loss": 1.05602026, + "learning_rate": 3.460884739729461e-06, + "loss": 1.0856266, + "num_input_tokens_seen": 4560395, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.75878906, + "step": 216, + "time_per_iteration": 3.1361331939697266 + }, + { + "auxiliary_loss_clip": 0.01547965, + "auxiliary_loss_mlp": 0.01434006, + "balance_loss_clip": 1.33968568, + "balance_loss_mlp": 1.34560049, + "epoch": 0.013046745828949347, + "flos": 13961919663360.0, + "grad_norm": 2.1322378241289504, + "language_loss": 1.11902142, + "learning_rate": 3.463858658104523e-06, + "loss": 1.14884114, + "num_input_tokens_seen": 4575785, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.88525391, + "step": 217, + "time_per_iteration": 3.0592234134674072 + }, + { + "auxiliary_loss_clip": 0.01547375, + "auxiliary_loss_mlp": 0.01429443, + "balance_loss_clip": 1.34041226, + "balance_loss_mlp": 1.35262465, + "epoch": 0.013106869081617315, + "flos": 17356774406400.0, + "grad_norm": 1.6561432610172337, + "language_loss": 1.03728271, + "learning_rate": 3.4668189032433696e-06, + "loss": 1.06705093, + "num_input_tokens_seen": 4594985, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.76855469, + "step": 218, + "time_per_iteration": 3.036843776702881 + }, + { + "auxiliary_loss_clip": 0.01547172, + "auxiliary_loss_mlp": 0.01439344, + "balance_loss_clip": 1.34101033, + "balance_loss_mlp": 1.35761392, + "epoch": 0.013166992334285284, + "flos": 25895915809920.0, + "grad_norm": 1.6917249011666626, + "language_loss": 0.97110111, + "learning_rate": 3.46976560030214e-06, + "loss": 1.00096631, + "num_input_tokens_seen": 4616125, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.81640625, + "step": 219, + "time_per_iteration": 3.0055127143859863 + }, + { + "auxiliary_loss_clip": 0.01548611, + "auxiliary_loss_mlp": 0.0142526, + "balance_loss_clip": 1.3441056, + "balance_loss_mlp": 1.35444951, + "epoch": 0.013227115586953254, + "flos": 31188488238720.0, + "grad_norm": 1.514770208155254, + "language_loss": 0.9790594, + "learning_rate": 3.4726988727263976e-06, + "loss": 1.00879812, + "num_input_tokens_seen": 4637795, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.70703125, + "step": 220, + "time_per_iteration": 3.127387285232544 + }, + { + "auxiliary_loss_clip": 0.01550258, + "auxiliary_loss_mlp": 0.01412156, + "balance_loss_clip": 1.34800196, + "balance_loss_mlp": 1.3456372, + "epoch": 0.013287238839621223, + "flos": 20418382062720.0, + "grad_norm": 1.624010904639624, + "language_loss": 0.97542638, + "learning_rate": 3.475618842282164e-06, + "loss": 1.00505042, + "num_input_tokens_seen": 4656835, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.66552734, + "step": 221, + "time_per_iteration": 3.010651111602783 + }, + { + "auxiliary_loss_clip": 0.01545648, + "auxiliary_loss_mlp": 0.01412944, + "balance_loss_clip": 1.34445381, + "balance_loss_mlp": 1.33970201, + "epoch": 0.013347362092289193, + "flos": 14145252168960.0, + "grad_norm": 1.7979664062556922, + "language_loss": 1.06572032, + "learning_rate": 3.4785256290862486e-06, + "loss": 1.0953064, + "num_input_tokens_seen": 4673015, + "router_z_loss_clip": 2.01269531, + "router_z_loss_mlp": 0.73291016, + "step": 222, + "time_per_iteration": 2.9702508449554443 + }, + { + "auxiliary_loss_clip": 0.01544633, + "auxiliary_loss_mlp": 0.0143031, + "balance_loss_clip": 1.34095025, + "balance_loss_mlp": 1.35449314, + "epoch": 0.013407485344957162, + "flos": 21807437996160.0, + "grad_norm": 1.8929668477572879, + "language_loss": 1.0565865, + "learning_rate": 3.481419351635897e-06, + "loss": 1.08633602, + "num_input_tokens_seen": 4692355, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.75830078, + "step": 223, + "time_per_iteration": 3.011080265045166 + }, + { + "auxiliary_loss_clip": 0.01538105, + "auxiliary_loss_mlp": 0.01440734, + "balance_loss_clip": 1.33741093, + "balance_loss_mlp": 1.36997163, + "epoch": 0.013467608597625132, + "flos": 18630057600000.0, + "grad_norm": 2.0340043540875357, + "language_loss": 1.01779819, + "learning_rate": 3.484300126837776e-06, + "loss": 1.04758656, + "num_input_tokens_seen": 4710080, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.70703125, + "step": 224, + "time_per_iteration": 4.353667736053467 + }, + { + "auxiliary_loss_clip": 0.01537377, + "auxiliary_loss_mlp": 0.01461071, + "balance_loss_clip": 1.33349156, + "balance_loss_mlp": 1.38396645, + "epoch": 0.013527731850293101, + "flos": 18561683427840.0, + "grad_norm": 1.6696514359732253, + "language_loss": 1.0311991, + "learning_rate": 3.487168070036317e-06, + "loss": 1.06118357, + "num_input_tokens_seen": 4728980, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.77197266, + "step": 225, + "time_per_iteration": 6.091062307357788 + }, + { + "auxiliary_loss_clip": 0.01545256, + "auxiliary_loss_mlp": 0.01462484, + "balance_loss_clip": 1.33913267, + "balance_loss_mlp": 1.3785603, + "epoch": 0.01358785510296107, + "flos": 19173042172800.0, + "grad_norm": 1.6628444424216993, + "language_loss": 1.0870378, + "learning_rate": 3.4900232950414224e-06, + "loss": 1.11711514, + "num_input_tokens_seen": 4747020, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.83935547, + "step": 226, + "time_per_iteration": 4.461630582809448 + }, + { + "auxiliary_loss_clip": 0.01542054, + "auxiliary_loss_mlp": 0.01419012, + "balance_loss_clip": 1.33768368, + "balance_loss_mlp": 1.35344672, + "epoch": 0.01364797835562904, + "flos": 23340028993920.0, + "grad_norm": 1.9115911600531583, + "language_loss": 1.03947318, + "learning_rate": 3.4928659141555727e-06, + "loss": 1.06908381, + "num_input_tokens_seen": 4765000, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.65625, + "step": 227, + "time_per_iteration": 3.015738010406494 + }, + { + "auxiliary_loss_clip": 0.01533437, + "auxiliary_loss_mlp": 0.01392338, + "balance_loss_clip": 1.36927652, + "balance_loss_mlp": 1.34503591, + "epoch": 0.013708101608297009, + "flos": 71029148670720.0, + "grad_norm": 0.9499424665603502, + "language_loss": 0.57789373, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.60715151, + "num_input_tokens_seen": 4833210, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.47265625, + "step": 228, + "time_per_iteration": 3.4632973670959473 + }, + { + "auxiliary_loss_clip": 0.01538427, + "auxiliary_loss_mlp": 0.01401956, + "balance_loss_clip": 1.33955169, + "balance_loss_mlp": 1.3410641, + "epoch": 0.013768224860964979, + "flos": 16333569077760.0, + "grad_norm": 2.0534891648260056, + "language_loss": 1.01966214, + "learning_rate": 3.4985137765422354e-06, + "loss": 1.04906595, + "num_input_tokens_seen": 4850120, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.60839844, + "step": 229, + "time_per_iteration": 3.0026321411132812 + }, + { + "auxiliary_loss_clip": 0.01546391, + "auxiliary_loss_mlp": 0.01402634, + "balance_loss_clip": 1.34813881, + "balance_loss_mlp": 1.34107423, + "epoch": 0.013828348113632948, + "flos": 20202536528640.0, + "grad_norm": 2.039958872605183, + "language_loss": 0.98587745, + "learning_rate": 3.501319237118231e-06, + "loss": 1.01536775, + "num_input_tokens_seen": 4866215, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.61572266, + "step": 230, + "time_per_iteration": 2.9712865352630615 + }, + { + "auxiliary_loss_clip": 0.01537479, + "auxiliary_loss_mlp": 0.01412425, + "balance_loss_clip": 1.33900666, + "balance_loss_mlp": 1.34106636, + "epoch": 0.013888471366300916, + "flos": 20750905232640.0, + "grad_norm": 1.5512726377315005, + "language_loss": 1.00041687, + "learning_rate": 3.5041125264604056e-06, + "loss": 1.02991593, + "num_input_tokens_seen": 4885630, + "router_z_loss_clip": 1.99023438, + "router_z_loss_mlp": 0.71337891, + "step": 231, + "time_per_iteration": 3.0177319049835205 + }, + { + "auxiliary_loss_clip": 0.01543093, + "auxiliary_loss_mlp": 0.01407073, + "balance_loss_clip": 1.34452498, + "balance_loss_mlp": 1.34355879, + "epoch": 0.013948594618968886, + "flos": 22100706662400.0, + "grad_norm": 1.6557998835614465, + "language_loss": 0.986265, + "learning_rate": 3.5068937497203002e-06, + "loss": 1.01576674, + "num_input_tokens_seen": 4905570, + "router_z_loss_clip": 1.98535156, + "router_z_loss_mlp": 0.63476562, + "step": 232, + "time_per_iteration": 3.034668445587158 + }, + { + "auxiliary_loss_clip": 0.01538857, + "auxiliary_loss_mlp": 0.01426975, + "balance_loss_clip": 1.34251785, + "balance_loss_mlp": 1.34004736, + "epoch": 0.014008717871636855, + "flos": 19072607420160.0, + "grad_norm": 2.360152341163756, + "language_loss": 0.89624757, + "learning_rate": 3.509663010692652e-06, + "loss": 0.92590582, + "num_input_tokens_seen": 4923535, + "router_z_loss_clip": 1.96289062, + "router_z_loss_mlp": 0.86865234, + "step": 233, + "time_per_iteration": 3.021674394607544 + }, + { + "auxiliary_loss_clip": 0.01544517, + "auxiliary_loss_mlp": 0.01423078, + "balance_loss_clip": 1.34542036, + "balance_loss_mlp": 1.34616458, + "epoch": 0.014068841124304825, + "flos": 14537055306240.0, + "grad_norm": 1.823594101367171, + "language_loss": 0.99260366, + "learning_rate": 3.512420411838642e-06, + "loss": 1.02227962, + "num_input_tokens_seen": 4939200, + "router_z_loss_clip": 1.99023438, + "router_z_loss_mlp": 0.76953125, + "step": 234, + "time_per_iteration": 2.9990434646606445 + }, + { + "auxiliary_loss_clip": 0.01541841, + "auxiliary_loss_mlp": 0.0141043, + "balance_loss_clip": 1.34432316, + "balance_loss_mlp": 1.34381568, + "epoch": 0.014128964376972794, + "flos": 18086484844800.0, + "grad_norm": 2.106158128252329, + "language_loss": 1.06949103, + "learning_rate": 3.515166054308634e-06, + "loss": 1.09901381, + "num_input_tokens_seen": 4956620, + "router_z_loss_clip": 1.97460938, + "router_z_loss_mlp": 0.66625977, + "step": 235, + "time_per_iteration": 2.953233480453491 + }, + { + "auxiliary_loss_clip": 0.01540034, + "auxiliary_loss_mlp": 0.01425278, + "balance_loss_clip": 1.33978438, + "balance_loss_mlp": 1.34030616, + "epoch": 0.014189087629640764, + "flos": 25344334725120.0, + "grad_norm": 1.973853259890551, + "language_loss": 0.95837873, + "learning_rate": 3.5179000379644498e-06, + "loss": 0.98803186, + "num_input_tokens_seen": 4975650, + "router_z_loss_clip": 2.00390625, + "router_z_loss_mlp": 0.85009766, + "step": 236, + "time_per_iteration": 3.108091354370117 + }, + { + "auxiliary_loss_clip": 0.01534465, + "auxiliary_loss_mlp": 0.01411695, + "balance_loss_clip": 1.33595824, + "balance_loss_mlp": 1.3408848, + "epoch": 0.014249210882308733, + "flos": 36154012118400.0, + "grad_norm": 1.6472033711304293, + "language_loss": 0.94469661, + "learning_rate": 3.520622461401154e-06, + "loss": 0.97415823, + "num_input_tokens_seen": 4997415, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.70703125, + "step": 237, + "time_per_iteration": 3.147148370742798 + }, + { + "auxiliary_loss_clip": 0.01531647, + "auxiliary_loss_mlp": 0.01441091, + "balance_loss_clip": 1.33193135, + "balance_loss_mlp": 1.34105003, + "epoch": 0.014309334134976702, + "flos": 12940750350720.0, + "grad_norm": 1.6857618712516538, + "language_loss": 0.92148811, + "learning_rate": 3.5233334219683935e-06, + "loss": 0.95121545, + "num_input_tokens_seen": 5013905, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 1.00048828, + "step": 238, + "time_per_iteration": 3.0487539768218994 + }, + { + "auxiliary_loss_clip": 0.01526684, + "auxiliary_loss_mlp": 0.01433568, + "balance_loss_clip": 1.33139622, + "balance_loss_mlp": 1.35121799, + "epoch": 0.014369457387644672, + "flos": 20787354558720.0, + "grad_norm": 1.4702974274791385, + "language_loss": 0.97015762, + "learning_rate": 3.526033015791284e-06, + "loss": 0.99976009, + "num_input_tokens_seen": 5033645, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.82324219, + "step": 239, + "time_per_iteration": 3.03141188621521 + }, + { + "auxiliary_loss_clip": 0.01517616, + "auxiliary_loss_mlp": 0.0139697, + "balance_loss_clip": 1.32820475, + "balance_loss_mlp": 1.34435058, + "epoch": 0.01442958064031264, + "flos": 25859556973440.0, + "grad_norm": 1.8281698220235636, + "language_loss": 1.02019739, + "learning_rate": 3.528721337790862e-06, + "loss": 1.04934335, + "num_input_tokens_seen": 5052875, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.52636719, + "step": 240, + "time_per_iteration": 3.03023624420166 + }, + { + "auxiliary_loss_clip": 0.01537564, + "auxiliary_loss_mlp": 0.01433203, + "balance_loss_clip": 1.33819675, + "balance_loss_mlp": 1.34317613, + "epoch": 0.014489703892980611, + "flos": 28231251632640.0, + "grad_norm": 1.5706241175056228, + "language_loss": 0.95799619, + "learning_rate": 3.531398481704111e-06, + "loss": 0.98770386, + "num_input_tokens_seen": 5075005, + "router_z_loss_clip": 1.99511719, + "router_z_loss_mlp": 0.90014648, + "step": 241, + "time_per_iteration": 3.0657451152801514 + }, + { + "auxiliary_loss_clip": 0.01537643, + "auxiliary_loss_mlp": 0.01391238, + "balance_loss_clip": 1.34420371, + "balance_loss_mlp": 1.34145594, + "epoch": 0.01454982714564858, + "flos": 22500834842880.0, + "grad_norm": 1.57702522257575, + "language_loss": 0.97568578, + "learning_rate": 3.534064540103573e-06, + "loss": 1.0049746, + "num_input_tokens_seen": 5091875, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.49755859, + "step": 242, + "time_per_iteration": 3.0432698726654053 + }, + { + "auxiliary_loss_clip": 0.01535947, + "auxiliary_loss_mlp": 0.01395094, + "balance_loss_clip": 1.34300327, + "balance_loss_mlp": 1.34161711, + "epoch": 0.014609950398316548, + "flos": 21663269504640.0, + "grad_norm": 1.7671406578339897, + "language_loss": 0.96698225, + "learning_rate": 3.536719604416555e-06, + "loss": 0.99629271, + "num_input_tokens_seen": 5111290, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.53515625, + "step": 243, + "time_per_iteration": 2.973078966140747 + }, + { + "auxiliary_loss_clip": 0.0153846, + "auxiliary_loss_mlp": 0.01402678, + "balance_loss_clip": 1.34361792, + "balance_loss_mlp": 1.34672117, + "epoch": 0.014670073650984519, + "flos": 21879567486720.0, + "grad_norm": 1.5513554859779264, + "language_loss": 0.94136047, + "learning_rate": 3.5393637649439464e-06, + "loss": 0.97077185, + "num_input_tokens_seen": 5132265, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.55981445, + "step": 244, + "time_per_iteration": 3.0259532928466797 + }, + { + "auxiliary_loss_clip": 0.01549616, + "auxiliary_loss_mlp": 0.01429884, + "balance_loss_clip": 1.34524202, + "balance_loss_mlp": 1.35182548, + "epoch": 0.014730196903652487, + "flos": 23193688752000.0, + "grad_norm": 2.136069430411029, + "language_loss": 0.96741927, + "learning_rate": 3.54199711087864e-06, + "loss": 0.9972142, + "num_input_tokens_seen": 5148575, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.78027344, + "step": 245, + "time_per_iteration": 2.927734136581421 + }, + { + "auxiliary_loss_clip": 0.01521465, + "auxiliary_loss_mlp": 0.01422144, + "balance_loss_clip": 1.3281002, + "balance_loss_mlp": 1.35114336, + "epoch": 0.014790320156320457, + "flos": 23233214724480.0, + "grad_norm": 1.742875451921813, + "language_loss": 0.94243884, + "learning_rate": 3.5446197303235913e-06, + "loss": 0.97187495, + "num_input_tokens_seen": 5170415, + "router_z_loss_clip": 1.93359375, + "router_z_loss_mlp": 0.7097168, + "step": 246, + "time_per_iteration": 3.0417020320892334 + }, + { + "auxiliary_loss_clip": 0.01514138, + "auxiliary_loss_mlp": 0.01425263, + "balance_loss_clip": 1.32048953, + "balance_loss_mlp": 1.35464299, + "epoch": 0.014850443408988426, + "flos": 15824047674240.0, + "grad_norm": 1.516718986174222, + "language_loss": 1.00428391, + "learning_rate": 3.5472317103095034e-06, + "loss": 1.03367805, + "num_input_tokens_seen": 5188565, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.70654297, + "step": 247, + "time_per_iteration": 3.0131680965423584 + }, + { + "auxiliary_loss_clip": 0.01503631, + "auxiliary_loss_mlp": 0.01416549, + "balance_loss_clip": 1.3083998, + "balance_loss_mlp": 1.34864712, + "epoch": 0.014910566661656396, + "flos": 22791343576320.0, + "grad_norm": 1.7280001728929737, + "language_loss": 0.90848935, + "learning_rate": 3.549833136812155e-06, + "loss": 0.93769115, + "num_input_tokens_seen": 5207810, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.67871094, + "step": 248, + "time_per_iteration": 3.0582101345062256 + }, + { + "auxiliary_loss_clip": 0.01501963, + "auxiliary_loss_mlp": 0.01417489, + "balance_loss_clip": 1.30928552, + "balance_loss_mlp": 1.34977841, + "epoch": 0.014970689914324365, + "flos": 26874980196480.0, + "grad_norm": 1.620061910370753, + "language_loss": 0.93865281, + "learning_rate": 3.552424094769381e-06, + "loss": 0.96784729, + "num_input_tokens_seen": 5226210, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.67626953, + "step": 249, + "time_per_iteration": 3.1427114009857178 + }, + { + "auxiliary_loss_clip": 0.0149497, + "auxiliary_loss_mlp": 0.014032, + "balance_loss_clip": 1.30651999, + "balance_loss_mlp": 1.34166384, + "epoch": 0.015030813166992334, + "flos": 13992306186240.0, + "grad_norm": 1.9939782907251342, + "language_loss": 1.06728542, + "learning_rate": 3.5550046680977174e-06, + "loss": 1.09626722, + "num_input_tokens_seen": 5241660, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.61547852, + "step": 250, + "time_per_iteration": 2.955688714981079 + }, + { + "auxiliary_loss_clip": 0.01507225, + "auxiliary_loss_mlp": 0.01430628, + "balance_loss_clip": 1.31432784, + "balance_loss_mlp": 1.34761024, + "epoch": 0.015090936419660304, + "flos": 24728632479360.0, + "grad_norm": 1.75651532953761, + "language_loss": 1.09728301, + "learning_rate": 3.5575749397087034e-06, + "loss": 1.12666154, + "num_input_tokens_seen": 5261090, + "router_z_loss_clip": 1.92773438, + "router_z_loss_mlp": 0.83105469, + "step": 251, + "time_per_iteration": 3.048722505569458 + }, + { + "auxiliary_loss_clip": 0.01499759, + "auxiliary_loss_mlp": 0.0141744, + "balance_loss_clip": 1.30865979, + "balance_loss_mlp": 1.34953868, + "epoch": 0.015151059672328273, + "flos": 25749711302400.0, + "grad_norm": 1.7049632383354647, + "language_loss": 0.97725958, + "learning_rate": 3.5601349915248707e-06, + "loss": 1.00643158, + "num_input_tokens_seen": 5279175, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.6796875, + "step": 252, + "time_per_iteration": 3.11375093460083 + }, + { + "auxiliary_loss_clip": 0.01496236, + "auxiliary_loss_mlp": 0.01421074, + "balance_loss_clip": 1.30496168, + "balance_loss_mlp": 1.3498348, + "epoch": 0.015211182924996243, + "flos": 21881105809920.0, + "grad_norm": 1.9149137336692048, + "language_loss": 1.12649786, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.15567088, + "num_input_tokens_seen": 5296975, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.71240234, + "step": 253, + "time_per_iteration": 2.958259344100952 + }, + { + "auxiliary_loss_clip": 0.01521347, + "auxiliary_loss_mlp": 0.01427505, + "balance_loss_clip": 1.36887252, + "balance_loss_mlp": 1.39221895, + "epoch": 0.015271306177664212, + "flos": 66926237765760.0, + "grad_norm": 0.9097627796219767, + "language_loss": 0.55826509, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.58775359, + "num_input_tokens_seen": 5358375, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.35351562, + "step": 254, + "time_per_iteration": 3.3988163471221924 + }, + { + "auxiliary_loss_clip": 0.01493778, + "auxiliary_loss_mlp": 0.01430392, + "balance_loss_clip": 1.30201626, + "balance_loss_mlp": 1.36096442, + "epoch": 0.01533142943033218, + "flos": 26845453324800.0, + "grad_norm": 1.67899170853955, + "language_loss": 1.0334152, + "learning_rate": 3.567754632921479e-06, + "loss": 1.06265688, + "num_input_tokens_seen": 5377255, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.69433594, + "step": 255, + "time_per_iteration": 3.1387276649475098 + }, + { + "auxiliary_loss_clip": 0.01492574, + "auxiliary_loss_mlp": 0.01451185, + "balance_loss_clip": 1.30023885, + "balance_loss_mlp": 1.38118553, + "epoch": 0.01539155268300015, + "flos": 20823532416000.0, + "grad_norm": 1.8500275296933248, + "language_loss": 0.97365046, + "learning_rate": 3.5702746055454075e-06, + "loss": 1.003088, + "num_input_tokens_seen": 5395320, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.70019531, + "step": 256, + "time_per_iteration": 3.1409034729003906 + }, + { + "auxiliary_loss_clip": 0.01499804, + "auxiliary_loss_mlp": 0.01443717, + "balance_loss_clip": 1.30421281, + "balance_loss_mlp": 1.36346483, + "epoch": 0.01545167593566812, + "flos": 15970568895360.0, + "grad_norm": 2.306947397733322, + "language_loss": 0.92229998, + "learning_rate": 3.5727847536897254e-06, + "loss": 0.95173526, + "num_input_tokens_seen": 5411970, + "router_z_loss_clip": 1.95703125, + "router_z_loss_mlp": 0.80273438, + "step": 257, + "time_per_iteration": 2.9621479511260986 + }, + { + "auxiliary_loss_clip": 0.01494134, + "auxiliary_loss_mlp": 0.01451152, + "balance_loss_clip": 1.2997936, + "balance_loss_mlp": 1.34953809, + "epoch": 0.01551179918833609, + "flos": 22612037857920.0, + "grad_norm": 1.8186499191267613, + "language_loss": 1.05892444, + "learning_rate": 3.5752851536613596e-06, + "loss": 1.08837736, + "num_input_tokens_seen": 5430245, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 1.01464844, + "step": 258, + "time_per_iteration": 3.1616013050079346 + }, + { + "auxiliary_loss_clip": 0.01493301, + "auxiliary_loss_mlp": 0.01412727, + "balance_loss_clip": 1.3009882, + "balance_loss_mlp": 1.34010458, + "epoch": 0.015571922441004058, + "flos": 22826028355200.0, + "grad_norm": 1.9649827033688687, + "language_loss": 1.02177918, + "learning_rate": 3.577775880881658e-06, + "loss": 1.05083942, + "num_input_tokens_seen": 5448905, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.72607422, + "step": 259, + "time_per_iteration": 4.476131916046143 + }, + { + "auxiliary_loss_clip": 0.01496346, + "auxiliary_loss_mlp": 0.01412601, + "balance_loss_clip": 1.30462313, + "balance_loss_mlp": 1.3415997, + "epoch": 0.015632045693672027, + "flos": 18955296357120.0, + "grad_norm": 1.6180244478368793, + "language_loss": 1.03253531, + "learning_rate": 3.5802570099000424e-06, + "loss": 1.06162488, + "num_input_tokens_seen": 5466405, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.71044922, + "step": 260, + "time_per_iteration": 4.6127049922943115 + }, + { + "auxiliary_loss_clip": 0.01504015, + "auxiliary_loss_mlp": 0.01423853, + "balance_loss_clip": 1.30994344, + "balance_loss_mlp": 1.34684408, + "epoch": 0.015692168946339995, + "flos": 29983895930880.0, + "grad_norm": 1.7491298145402792, + "language_loss": 1.02591562, + "learning_rate": 3.5827286144073947e-06, + "loss": 1.05519426, + "num_input_tokens_seen": 5487055, + "router_z_loss_clip": 1.94140625, + "router_z_loss_mlp": 0.76904297, + "step": 261, + "time_per_iteration": 4.452184438705444 + }, + { + "auxiliary_loss_clip": 0.01516066, + "auxiliary_loss_mlp": 0.01442714, + "balance_loss_clip": 1.32149994, + "balance_loss_mlp": 1.35960126, + "epoch": 0.015752292199007967, + "flos": 19401827719680.0, + "grad_norm": 1.5811001112028633, + "language_loss": 0.77230787, + "learning_rate": 3.5851907672491904e-06, + "loss": 0.80189562, + "num_input_tokens_seen": 5506600, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.83105469, + "step": 262, + "time_per_iteration": 3.0023179054260254 + }, + { + "auxiliary_loss_clip": 0.01512664, + "auxiliary_loss_mlp": 0.01441328, + "balance_loss_clip": 1.31741226, + "balance_loss_mlp": 1.35325599, + "epoch": 0.015812415451675936, + "flos": 20349555442560.0, + "grad_norm": 1.6833239508127824, + "language_loss": 0.8233102, + "learning_rate": 3.587643540438383e-06, + "loss": 0.85285014, + "num_input_tokens_seen": 5524350, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.88085938, + "step": 263, + "time_per_iteration": 2.983851432800293 + }, + { + "auxiliary_loss_clip": 0.01510672, + "auxiliary_loss_mlp": 0.01433856, + "balance_loss_clip": 1.31800413, + "balance_loss_mlp": 1.35145867, + "epoch": 0.015872538704343905, + "flos": 17533139212800.0, + "grad_norm": 2.086961113888894, + "language_loss": 1.03318727, + "learning_rate": 3.590087005168037e-06, + "loss": 1.06263256, + "num_input_tokens_seen": 5542145, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.82421875, + "step": 264, + "time_per_iteration": 2.9981093406677246 + }, + { + "auxiliary_loss_clip": 0.01508437, + "auxiliary_loss_mlp": 0.01396569, + "balance_loss_clip": 1.31915498, + "balance_loss_mlp": 1.34268665, + "epoch": 0.015932661957011873, + "flos": 15266901479040.0, + "grad_norm": 1.9130482137158251, + "language_loss": 1.14311886, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.17216897, + "num_input_tokens_seen": 5557920, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.53833008, + "step": 265, + "time_per_iteration": 2.989739179611206 + }, + { + "auxiliary_loss_clip": 0.01511034, + "auxiliary_loss_mlp": 0.01413019, + "balance_loss_clip": 1.31729448, + "balance_loss_mlp": 1.34027791, + "epoch": 0.015992785209679845, + "flos": 20312065486080.0, + "grad_norm": 1.7922365078398237, + "language_loss": 0.91443074, + "learning_rate": 3.5949462899957323e-06, + "loss": 0.94367129, + "num_input_tokens_seen": 5576290, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.7277832, + "step": 266, + "time_per_iteration": 3.0518651008605957 + }, + { + "auxiliary_loss_clip": 0.01497402, + "auxiliary_loss_mlp": 0.01397294, + "balance_loss_clip": 1.30961037, + "balance_loss_mlp": 1.34286296, + "epoch": 0.016052908462347814, + "flos": 23371410902400.0, + "grad_norm": 1.899517010506227, + "language_loss": 0.97124958, + "learning_rate": 3.5973622484909068e-06, + "loss": 1.00019646, + "num_input_tokens_seen": 5595205, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.54394531, + "step": 267, + "time_per_iteration": 3.119874954223633 + }, + { + "auxiliary_loss_clip": 0.01500339, + "auxiliary_loss_mlp": 0.01401077, + "balance_loss_clip": 1.30966687, + "balance_loss_mlp": 1.34035194, + "epoch": 0.016113031715015783, + "flos": 21296106800640.0, + "grad_norm": 1.8967484384952342, + "language_loss": 0.99606454, + "learning_rate": 3.599769175344462e-06, + "loss": 1.02507877, + "num_input_tokens_seen": 5612645, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.60693359, + "step": 268, + "time_per_iteration": 3.0038843154907227 + }, + { + "auxiliary_loss_clip": 0.01499467, + "auxiliary_loss_mlp": 0.01415009, + "balance_loss_clip": 1.3107481, + "balance_loss_mlp": 1.34324479, + "epoch": 0.01617315496768375, + "flos": 18923507245440.0, + "grad_norm": 1.7709569959392844, + "language_loss": 0.99513739, + "learning_rate": 3.602167137831432e-06, + "loss": 1.0242821, + "num_input_tokens_seen": 5628345, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.71728516, + "step": 269, + "time_per_iteration": 3.0612123012542725 + }, + { + "auxiliary_loss_clip": 0.01497056, + "auxiliary_loss_mlp": 0.01410551, + "balance_loss_clip": 1.3065927, + "balance_loss_mlp": 1.34124255, + "epoch": 0.01623327822035172, + "flos": 16555567904640.0, + "grad_norm": 1.8287034574259209, + "language_loss": 1.07515216, + "learning_rate": 3.6045562024779565e-06, + "loss": 1.10422826, + "num_input_tokens_seen": 5645940, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.69311523, + "step": 270, + "time_per_iteration": 3.061020612716675 + }, + { + "auxiliary_loss_clip": 0.01488502, + "auxiliary_loss_mlp": 0.01410139, + "balance_loss_clip": 1.30125809, + "balance_loss_mlp": 1.34247553, + "epoch": 0.016293401473019692, + "flos": 23523904437120.0, + "grad_norm": 1.7254651216900898, + "language_loss": 0.98590124, + "learning_rate": 3.606936435072361e-06, + "loss": 1.01488769, + "num_input_tokens_seen": 5665690, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.67675781, + "step": 271, + "time_per_iteration": 3.0728659629821777 + }, + { + "auxiliary_loss_clip": 0.01495146, + "auxiliary_loss_mlp": 0.01404902, + "balance_loss_clip": 1.30626762, + "balance_loss_mlp": 1.34489191, + "epoch": 0.01635352472568766, + "flos": 29026169107200.0, + "grad_norm": 1.9776424046432988, + "language_loss": 0.96737254, + "learning_rate": 3.609307900676025e-06, + "loss": 0.99637306, + "num_input_tokens_seen": 5683190, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.59985352, + "step": 272, + "time_per_iteration": 3.103900909423828 + }, + { + "auxiliary_loss_clip": 0.01490369, + "auxiliary_loss_mlp": 0.01414603, + "balance_loss_clip": 1.30223227, + "balance_loss_mlp": 1.34786987, + "epoch": 0.01641364797835563, + "flos": 13378323242880.0, + "grad_norm": 1.8709334937543125, + "language_loss": 0.9319911, + "learning_rate": 3.611670663634051e-06, + "loss": 0.96104085, + "num_input_tokens_seen": 5699780, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.66772461, + "step": 273, + "time_per_iteration": 3.0304439067840576 + }, + { + "auxiliary_loss_clip": 0.01499912, + "auxiliary_loss_mlp": 0.01423625, + "balance_loss_clip": 1.30913794, + "balance_loss_mlp": 1.35326767, + "epoch": 0.016473771231023598, + "flos": 18887057919360.0, + "grad_norm": 1.7943621424665783, + "language_loss": 1.05686402, + "learning_rate": 3.614024787585744e-06, + "loss": 1.08609939, + "num_input_tokens_seen": 5716980, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.70410156, + "step": 274, + "time_per_iteration": 3.044799566268921 + }, + { + "auxiliary_loss_clip": 0.01496866, + "auxiliary_loss_mlp": 0.01425705, + "balance_loss_clip": 1.30865598, + "balance_loss_mlp": 1.3518424, + "epoch": 0.016533894483691566, + "flos": 22611902123520.0, + "grad_norm": 1.6615583293566911, + "language_loss": 0.99816883, + "learning_rate": 3.6163703354748927e-06, + "loss": 1.02739453, + "num_input_tokens_seen": 5737780, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.73925781, + "step": 275, + "time_per_iteration": 3.052940607070923 + }, + { + "auxiliary_loss_clip": 0.014945, + "auxiliary_loss_mlp": 0.01414059, + "balance_loss_clip": 1.30802441, + "balance_loss_mlp": 1.34932864, + "epoch": 0.01659401773635954, + "flos": 21517200731520.0, + "grad_norm": 1.475723944904423, + "language_loss": 0.90201354, + "learning_rate": 3.6187073695598707e-06, + "loss": 0.93109918, + "num_input_tokens_seen": 5758330, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.64672852, + "step": 276, + "time_per_iteration": 3.0280468463897705 + }, + { + "auxiliary_loss_clip": 0.01505257, + "auxiliary_loss_mlp": 0.01410711, + "balance_loss_clip": 1.31645894, + "balance_loss_mlp": 1.34423971, + "epoch": 0.016654140989027507, + "flos": 32863075977600.0, + "grad_norm": 1.5797113314272933, + "language_loss": 0.8910737, + "learning_rate": 3.621035951423551e-06, + "loss": 0.92023337, + "num_input_tokens_seen": 5778340, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.66455078, + "step": 277, + "time_per_iteration": 3.2062783241271973 + }, + { + "auxiliary_loss_clip": 0.01498274, + "auxiliary_loss_mlp": 0.01396574, + "balance_loss_clip": 1.31197357, + "balance_loss_mlp": 1.34066498, + "epoch": 0.016714264241695476, + "flos": 12312153336960.0, + "grad_norm": 1.8414944991782476, + "language_loss": 0.9014163, + "learning_rate": 3.623356141983041e-06, + "loss": 0.93036473, + "num_input_tokens_seen": 5794295, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.55883789, + "step": 278, + "time_per_iteration": 3.0505058765411377 + }, + { + "auxiliary_loss_clip": 0.01503943, + "auxiliary_loss_mlp": 0.01408682, + "balance_loss_clip": 1.31156635, + "balance_loss_mlp": 1.34001732, + "epoch": 0.016774387494363444, + "flos": 27134695203840.0, + "grad_norm": 1.6753241897745959, + "language_loss": 1.02659738, + "learning_rate": 3.6256680014992486e-06, + "loss": 1.05572367, + "num_input_tokens_seen": 5814405, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.68701172, + "step": 279, + "time_per_iteration": 3.133037805557251 + }, + { + "auxiliary_loss_clip": 0.01497442, + "auxiliary_loss_mlp": 0.01421199, + "balance_loss_clip": 1.30784976, + "balance_loss_mlp": 1.34256911, + "epoch": 0.016834510747031413, + "flos": 20200591002240.0, + "grad_norm": 1.8016591020591932, + "language_loss": 1.07473278, + "learning_rate": 3.6279715895862713e-06, + "loss": 1.10391927, + "num_input_tokens_seen": 5832795, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.78613281, + "step": 280, + "time_per_iteration": 3.116851329803467 + }, + { + "auxiliary_loss_clip": 0.01487226, + "auxiliary_loss_mlp": 0.01413694, + "balance_loss_clip": 1.29906964, + "balance_loss_mlp": 1.34302628, + "epoch": 0.016894633999699385, + "flos": 27286374332160.0, + "grad_norm": 1.5058306002681243, + "language_loss": 0.84433413, + "learning_rate": 3.6302669652206183e-06, + "loss": 0.87334335, + "num_input_tokens_seen": 5855750, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.70654297, + "step": 281, + "time_per_iteration": 3.167877435684204 + }, + { + "auxiliary_loss_clip": 0.01485974, + "auxiliary_loss_mlp": 0.01417188, + "balance_loss_clip": 1.29811001, + "balance_loss_mlp": 1.34556723, + "epoch": 0.016954757252367354, + "flos": 14911185709440.0, + "grad_norm": 2.1072468692071835, + "language_loss": 1.01152253, + "learning_rate": 3.632554186750274e-06, + "loss": 1.04055417, + "num_input_tokens_seen": 5872610, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.71582031, + "step": 282, + "time_per_iteration": 3.1417341232299805 + }, + { + "auxiliary_loss_clip": 0.0148228, + "auxiliary_loss_mlp": 0.01412605, + "balance_loss_clip": 1.29277134, + "balance_loss_mlp": 1.34155655, + "epoch": 0.017014880505035322, + "flos": 21368145801600.0, + "grad_norm": 1.6627071864352685, + "language_loss": 0.9007085, + "learning_rate": 3.6348333119035937e-06, + "loss": 0.92965734, + "num_input_tokens_seen": 5892985, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.71044922, + "step": 283, + "time_per_iteration": 3.3196709156036377 + }, + { + "auxiliary_loss_clip": 0.01472782, + "auxiliary_loss_mlp": 0.01415784, + "balance_loss_clip": 1.28571892, + "balance_loss_mlp": 1.34268534, + "epoch": 0.01707500375770329, + "flos": 35346199875840.0, + "grad_norm": 1.784532941800421, + "language_loss": 0.94354904, + "learning_rate": 3.6371043977980503e-06, + "loss": 0.97243464, + "num_input_tokens_seen": 5914060, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.73095703, + "step": 284, + "time_per_iteration": 3.245760917663574 + }, + { + "auxiliary_loss_clip": 0.01465664, + "auxiliary_loss_mlp": 0.01411844, + "balance_loss_clip": 1.28227806, + "balance_loss_mlp": 1.34632707, + "epoch": 0.01713512701037126, + "flos": 23591373713280.0, + "grad_norm": 1.9563699841610152, + "language_loss": 1.09732497, + "learning_rate": 3.639367500948819e-06, + "loss": 1.12609994, + "num_input_tokens_seen": 5932860, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.65527344, + "step": 285, + "time_per_iteration": 3.0912346839904785 + }, + { + "auxiliary_loss_clip": 0.01463259, + "auxiliary_loss_mlp": 0.01408419, + "balance_loss_clip": 1.28129053, + "balance_loss_mlp": 1.3433789, + "epoch": 0.01719525026303923, + "flos": 27645709685760.0, + "grad_norm": 1.6831460302368153, + "language_loss": 1.05088341, + "learning_rate": 3.6416226772772178e-06, + "loss": 1.07960033, + "num_input_tokens_seen": 5952725, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.64990234, + "step": 286, + "time_per_iteration": 3.253262996673584 + }, + { + "auxiliary_loss_clip": 0.01469827, + "auxiliary_loss_mlp": 0.01402503, + "balance_loss_clip": 1.28674436, + "balance_loss_mlp": 1.34287429, + "epoch": 0.0172553735157072, + "flos": 26991069649920.0, + "grad_norm": 1.489969933521248, + "language_loss": 0.9993127, + "learning_rate": 3.643869982119001e-06, + "loss": 1.028036, + "num_input_tokens_seen": 5970560, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.59643555, + "step": 287, + "time_per_iteration": 3.0771868228912354 + }, + { + "auxiliary_loss_clip": 0.01479763, + "auxiliary_loss_mlp": 0.01405626, + "balance_loss_clip": 1.29350686, + "balance_loss_mlp": 1.33984613, + "epoch": 0.01731549676837517, + "flos": 14062761619200.0, + "grad_norm": 2.082134750542835, + "language_loss": 1.17210579, + "learning_rate": 3.646109470232502e-06, + "loss": 1.20095968, + "num_input_tokens_seen": 5982980, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.6574707, + "step": 288, + "time_per_iteration": 3.045431613922119 + }, + { + "auxiliary_loss_clip": 0.01477235, + "auxiliary_loss_mlp": 0.01368571, + "balance_loss_clip": 1.32320094, + "balance_loss_mlp": 1.33900678, + "epoch": 0.017375620021043137, + "flos": 66546153031680.0, + "grad_norm": 0.9375127621114936, + "language_loss": 0.64246446, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.67092252, + "num_input_tokens_seen": 6049445, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.29492188, + "step": 289, + "time_per_iteration": 3.5431721210479736 + }, + { + "auxiliary_loss_clip": 0.01486309, + "auxiliary_loss_mlp": 0.01396211, + "balance_loss_clip": 1.30509067, + "balance_loss_mlp": 1.34256721, + "epoch": 0.01743574327371111, + "flos": 15231854741760.0, + "grad_norm": 2.154210799896163, + "language_loss": 1.00688875, + "learning_rate": 3.6505652124687957e-06, + "loss": 1.03571403, + "num_input_tokens_seen": 6064150, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.53686523, + "step": 290, + "time_per_iteration": 3.0662574768066406 + }, + { + "auxiliary_loss_clip": 0.01508007, + "auxiliary_loss_mlp": 0.01405689, + "balance_loss_clip": 1.32217836, + "balance_loss_mlp": 1.3489933, + "epoch": 0.017495866526379078, + "flos": 25384675104000.0, + "grad_norm": 1.5705570950264276, + "language_loss": 0.97315371, + "learning_rate": 3.6527815732925258e-06, + "loss": 1.00229073, + "num_input_tokens_seen": 6083920, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.56738281, + "step": 291, + "time_per_iteration": 3.085784435272217 + }, + { + "auxiliary_loss_clip": 0.01506241, + "auxiliary_loss_mlp": 0.01397565, + "balance_loss_clip": 1.32405698, + "balance_loss_mlp": 1.34291995, + "epoch": 0.017555989779047047, + "flos": 26370661944960.0, + "grad_norm": 1.5352469087520764, + "language_loss": 0.81681895, + "learning_rate": 3.6549903308051806e-06, + "loss": 0.84585702, + "num_input_tokens_seen": 6105460, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.54663086, + "step": 292, + "time_per_iteration": 3.007880449295044 + }, + { + "auxiliary_loss_clip": 0.01498802, + "auxiliary_loss_mlp": 0.01395748, + "balance_loss_clip": 1.31978095, + "balance_loss_mlp": 1.3505919, + "epoch": 0.017616113031715015, + "flos": 22347074453760.0, + "grad_norm": 1.8670207636289677, + "language_loss": 0.9868027, + "learning_rate": 3.6571915369953646e-06, + "loss": 1.01574814, + "num_input_tokens_seen": 6122890, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.45166016, + "step": 293, + "time_per_iteration": 4.44026780128479 + }, + { + "auxiliary_loss_clip": 0.01496488, + "auxiliary_loss_mlp": 0.01393086, + "balance_loss_clip": 1.31870604, + "balance_loss_mlp": 1.34685707, + "epoch": 0.017676236284382984, + "flos": 20166494405760.0, + "grad_norm": 1.520819554894164, + "language_loss": 0.93919694, + "learning_rate": 3.6593852433202797e-06, + "loss": 0.96809268, + "num_input_tokens_seen": 6142890, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.46240234, + "step": 294, + "time_per_iteration": 3.053316831588745 + }, + { + "auxiliary_loss_clip": 0.01490006, + "auxiliary_loss_mlp": 0.01400349, + "balance_loss_clip": 1.31014943, + "balance_loss_mlp": 1.34548938, + "epoch": 0.017736359537050956, + "flos": 25232905486080.0, + "grad_norm": 2.065134015617671, + "language_loss": 0.9333865, + "learning_rate": 3.6615715007129453e-06, + "loss": 0.96229005, + "num_input_tokens_seen": 6162030, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.54833984, + "step": 295, + "time_per_iteration": 4.520801305770874 + }, + { + "auxiliary_loss_clip": 0.0151354, + "auxiliary_loss_mlp": 0.01393216, + "balance_loss_clip": 1.33275843, + "balance_loss_mlp": 1.34600925, + "epoch": 0.017796482789718925, + "flos": 20348288588160.0, + "grad_norm": 1.8143509918538279, + "language_loss": 0.93494141, + "learning_rate": 3.6637503595892897e-06, + "loss": 0.96400905, + "num_input_tokens_seen": 6180540, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.47216797, + "step": 296, + "time_per_iteration": 6.0985267162323 + }, + { + "auxiliary_loss_clip": 0.01510128, + "auxiliary_loss_mlp": 0.01390328, + "balance_loss_clip": 1.3312403, + "balance_loss_mlp": 1.33944988, + "epoch": 0.017856606042386893, + "flos": 22388500707840.0, + "grad_norm": 1.7342284669151882, + "language_loss": 0.98527503, + "learning_rate": 3.665921869855132e-06, + "loss": 1.0142796, + "num_input_tokens_seen": 6199425, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.50830078, + "step": 297, + "time_per_iteration": 3.0394985675811768 + }, + { + "auxiliary_loss_clip": 0.01520207, + "auxiliary_loss_mlp": 0.01388346, + "balance_loss_clip": 1.34330201, + "balance_loss_mlp": 1.34318995, + "epoch": 0.017916729295054862, + "flos": 20239709771520.0, + "grad_norm": 1.6885585161482857, + "language_loss": 1.01833534, + "learning_rate": 3.6680860809130346e-06, + "loss": 1.04742098, + "num_input_tokens_seen": 6219170, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.45166016, + "step": 298, + "time_per_iteration": 2.981853485107422 + }, + { + "auxiliary_loss_clip": 0.01522472, + "auxiliary_loss_mlp": 0.01406211, + "balance_loss_clip": 1.34464145, + "balance_loss_mlp": 1.35023046, + "epoch": 0.01797685254772283, + "flos": 19400425130880.0, + "grad_norm": 1.496350976297131, + "language_loss": 0.97697282, + "learning_rate": 3.6702430416690516e-06, + "loss": 1.00625968, + "num_input_tokens_seen": 6237930, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.55957031, + "step": 299, + "time_per_iteration": 3.018101692199707 + }, + { + "auxiliary_loss_clip": 0.01523025, + "auxiliary_loss_mlp": 0.01420803, + "balance_loss_clip": 1.34110975, + "balance_loss_mlp": 1.36007786, + "epoch": 0.018036975800390802, + "flos": 24436766401920.0, + "grad_norm": 2.0791540134678446, + "language_loss": 0.82400465, + "learning_rate": 3.672392800539357e-06, + "loss": 0.85344297, + "num_input_tokens_seen": 6257170, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.60742188, + "step": 300, + "time_per_iteration": 3.007295608520508 + }, + { + "auxiliary_loss_clip": 0.01533687, + "auxiliary_loss_mlp": 0.01417113, + "balance_loss_clip": 1.35504127, + "balance_loss_mlp": 1.35934424, + "epoch": 0.01809709905305877, + "flos": 15787462613760.0, + "grad_norm": 1.7327589260779785, + "language_loss": 1.01556432, + "learning_rate": 3.6745354054567686e-06, + "loss": 1.04507232, + "num_input_tokens_seen": 6274780, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.5769043, + "step": 301, + "time_per_iteration": 3.038264274597168 + }, + { + "auxiliary_loss_clip": 0.01549346, + "auxiliary_loss_mlp": 0.01361224, + "balance_loss_clip": 1.39953136, + "balance_loss_mlp": 1.34176922, + "epoch": 0.01815722230572674, + "flos": 67383673125120.0, + "grad_norm": 0.8555211435471284, + "language_loss": 0.62499094, + "learning_rate": 3.676670903877158e-06, + "loss": 0.65409666, + "num_input_tokens_seen": 6340435, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.19433594, + "step": 302, + "time_per_iteration": 3.5381991863250732 + }, + { + "auxiliary_loss_clip": 0.01497692, + "auxiliary_loss_mlp": 0.01442441, + "balance_loss_clip": 1.31959164, + "balance_loss_mlp": 1.38352799, + "epoch": 0.01821734555839471, + "flos": 15493696254720.0, + "grad_norm": 1.7998762991168389, + "language_loss": 1.02892613, + "learning_rate": 3.6787993427857567e-06, + "loss": 1.05832756, + "num_input_tokens_seen": 6358160, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.58911133, + "step": 303, + "time_per_iteration": 3.114450216293335 + }, + { + "auxiliary_loss_clip": 0.014903, + "auxiliary_loss_mlp": 0.01449872, + "balance_loss_clip": 1.31330061, + "balance_loss_mlp": 1.40366626, + "epoch": 0.018277468811062677, + "flos": 24108043795200.0, + "grad_norm": 1.5777160398055636, + "language_loss": 0.91424119, + "learning_rate": 3.680920768703364e-06, + "loss": 0.94364297, + "num_input_tokens_seen": 6378485, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.46166992, + "step": 304, + "time_per_iteration": 3.2559657096862793 + }, + { + "auxiliary_loss_clip": 0.01488879, + "auxiliary_loss_mlp": 0.01437166, + "balance_loss_clip": 1.31249475, + "balance_loss_mlp": 1.38705063, + "epoch": 0.01833759206373065, + "flos": 20969058251520.0, + "grad_norm": 1.4462071373287906, + "language_loss": 0.88289249, + "learning_rate": 3.6830352276924415e-06, + "loss": 0.91215301, + "num_input_tokens_seen": 6397845, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.5012207, + "step": 305, + "time_per_iteration": 3.1393206119537354 + }, + { + "auxiliary_loss_clip": 0.01482995, + "auxiliary_loss_mlp": 0.01403499, + "balance_loss_clip": 1.30736363, + "balance_loss_mlp": 1.35412252, + "epoch": 0.018397715316398618, + "flos": 19400153662080.0, + "grad_norm": 1.5734879561229687, + "language_loss": 0.99025142, + "learning_rate": 3.685142765363119e-06, + "loss": 1.0191164, + "num_input_tokens_seen": 6416475, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.49389648, + "step": 306, + "time_per_iteration": 2.957449436187744 + }, + { + "auxiliary_loss_clip": 0.01483647, + "auxiliary_loss_mlp": 0.01400988, + "balance_loss_clip": 1.30548954, + "balance_loss_mlp": 1.3478446, + "epoch": 0.018457838569066586, + "flos": 29144475555840.0, + "grad_norm": 1.6645942564336902, + "language_loss": 0.98157763, + "learning_rate": 3.687243426879095e-06, + "loss": 1.0104239, + "num_input_tokens_seen": 6437520, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.53125, + "step": 307, + "time_per_iteration": 3.1539502143859863 + }, + { + "auxiliary_loss_clip": 0.01487184, + "auxiliary_loss_mlp": 0.01418091, + "balance_loss_clip": 1.30950236, + "balance_loss_mlp": 1.3624208, + "epoch": 0.018517961821734555, + "flos": 19218088010880.0, + "grad_norm": 1.7528135992564817, + "language_loss": 0.8607589, + "learning_rate": 3.6893372569634466e-06, + "loss": 0.88981164, + "num_input_tokens_seen": 6455680, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.55688477, + "step": 308, + "time_per_iteration": 3.064777374267578 + }, + { + "auxiliary_loss_clip": 0.01496588, + "auxiliary_loss_mlp": 0.01460572, + "balance_loss_clip": 1.31005168, + "balance_loss_mlp": 1.3750751, + "epoch": 0.018578085074402523, + "flos": 19872004129920.0, + "grad_norm": 1.6750426611418898, + "language_loss": 0.9778837, + "learning_rate": 3.6914242999043395e-06, + "loss": 1.00745535, + "num_input_tokens_seen": 6474880, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.85498047, + "step": 309, + "time_per_iteration": 3.1307332515716553 + }, + { + "auxiliary_loss_clip": 0.0149202, + "auxiliary_loss_mlp": 0.01443482, + "balance_loss_clip": 1.30658054, + "balance_loss_mlp": 1.37324381, + "epoch": 0.018638208327070496, + "flos": 29619040711680.0, + "grad_norm": 1.7402551650750409, + "language_loss": 0.88026601, + "learning_rate": 3.69350459956065e-06, + "loss": 0.909621, + "num_input_tokens_seen": 6495945, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.70263672, + "step": 310, + "time_per_iteration": 3.207200050354004 + }, + { + "auxiliary_loss_clip": 0.01476897, + "auxiliary_loss_mlp": 0.01411427, + "balance_loss_clip": 1.29884362, + "balance_loss_mlp": 1.36209834, + "epoch": 0.018698331579738464, + "flos": 45747288311040.0, + "grad_norm": 1.5071328675910307, + "language_loss": 0.83718276, + "learning_rate": 3.695578199367497e-06, + "loss": 0.86606598, + "num_input_tokens_seen": 6519930, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.49389648, + "step": 311, + "time_per_iteration": 3.3521690368652344 + }, + { + "auxiliary_loss_clip": 0.01474421, + "auxiliary_loss_mlp": 0.01408315, + "balance_loss_clip": 1.29499292, + "balance_loss_mlp": 1.34997427, + "epoch": 0.018758454832406433, + "flos": 20492909527680.0, + "grad_norm": 2.07319267649501, + "language_loss": 1.02580142, + "learning_rate": 3.6976451423416825e-06, + "loss": 1.05462885, + "num_input_tokens_seen": 6535070, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.58374023, + "step": 312, + "time_per_iteration": 2.9914748668670654 + }, + { + "auxiliary_loss_clip": 0.01469237, + "auxiliary_loss_mlp": 0.01400846, + "balance_loss_clip": 1.29221463, + "balance_loss_mlp": 1.34636736, + "epoch": 0.0188185780850744, + "flos": 15786014780160.0, + "grad_norm": 1.7539398931101273, + "language_loss": 1.03918719, + "learning_rate": 3.699705471087043e-06, + "loss": 1.06788802, + "num_input_tokens_seen": 6554135, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.54492188, + "step": 313, + "time_per_iteration": 2.9839365482330322 + }, + { + "auxiliary_loss_clip": 0.01486014, + "auxiliary_loss_mlp": 0.01410003, + "balance_loss_clip": 1.30269504, + "balance_loss_mlp": 1.34787118, + "epoch": 0.018878701337742373, + "flos": 22465969084800.0, + "grad_norm": 1.911464390569888, + "language_loss": 0.94432068, + "learning_rate": 3.7017592277997256e-06, + "loss": 0.97328079, + "num_input_tokens_seen": 6572275, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.62182617, + "step": 314, + "time_per_iteration": 3.0625627040863037 + }, + { + "auxiliary_loss_clip": 0.01478496, + "auxiliary_loss_mlp": 0.01405175, + "balance_loss_clip": 1.30213809, + "balance_loss_mlp": 1.35143518, + "epoch": 0.018938824590410342, + "flos": 31006015384320.0, + "grad_norm": 2.3323486976310037, + "language_loss": 1.04136896, + "learning_rate": 3.7038064542733654e-06, + "loss": 1.07020569, + "num_input_tokens_seen": 6594520, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.53710938, + "step": 315, + "time_per_iteration": 3.095252275466919 + }, + { + "auxiliary_loss_clip": 0.01497209, + "auxiliary_loss_mlp": 0.01440329, + "balance_loss_clip": 1.31497765, + "balance_loss_mlp": 1.35404527, + "epoch": 0.01899894784307831, + "flos": 23269483071360.0, + "grad_norm": 1.5835644772275472, + "language_loss": 0.91874284, + "learning_rate": 3.7058471919041945e-06, + "loss": 0.94811821, + "num_input_tokens_seen": 6614245, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.86254883, + "step": 316, + "time_per_iteration": 2.9922244548797607 + }, + { + "auxiliary_loss_clip": 0.01493459, + "auxiliary_loss_mlp": 0.01410624, + "balance_loss_clip": 1.31684017, + "balance_loss_mlp": 1.35547793, + "epoch": 0.01905907109574628, + "flos": 17467027280640.0, + "grad_norm": 1.6459503298263989, + "language_loss": 0.96892136, + "learning_rate": 3.7078814816960605e-06, + "loss": 0.99796218, + "num_input_tokens_seen": 6632015, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.55126953, + "step": 317, + "time_per_iteration": 3.0228896141052246 + }, + { + "auxiliary_loss_clip": 0.01513492, + "auxiliary_loss_mlp": 0.01444572, + "balance_loss_clip": 1.33365369, + "balance_loss_mlp": 1.35380578, + "epoch": 0.019119194348414248, + "flos": 14976980928000.0, + "grad_norm": 1.9903374222222234, + "language_loss": 1.05596328, + "learning_rate": 3.709909364265374e-06, + "loss": 1.08554399, + "num_input_tokens_seen": 6649015, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.90771484, + "step": 318, + "time_per_iteration": 3.077000856399536 + }, + { + "auxiliary_loss_clip": 0.01514164, + "auxiliary_loss_mlp": 0.01392695, + "balance_loss_clip": 1.33813977, + "balance_loss_mlp": 1.34253144, + "epoch": 0.01917931760108222, + "flos": 25493525389440.0, + "grad_norm": 1.9090698910439856, + "language_loss": 1.05335855, + "learning_rate": 3.7119308798459706e-06, + "loss": 1.08242726, + "num_input_tokens_seen": 6669225, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.50170898, + "step": 319, + "time_per_iteration": 3.0136423110961914 + }, + { + "auxiliary_loss_clip": 0.01523239, + "auxiliary_loss_mlp": 0.01398995, + "balance_loss_clip": 1.38626301, + "balance_loss_mlp": 1.37038481, + "epoch": 0.01923944085375019, + "flos": 71587181779200.0, + "grad_norm": 0.9624002766621472, + "language_loss": 0.59982169, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.62904394, + "num_input_tokens_seen": 6725775, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.28515625, + "step": 320, + "time_per_iteration": 3.3411078453063965 + }, + { + "auxiliary_loss_clip": 0.01509156, + "auxiliary_loss_mlp": 0.01399642, + "balance_loss_clip": 1.33212852, + "balance_loss_mlp": 1.35262609, + "epoch": 0.019299564106418157, + "flos": 19691929249920.0, + "grad_norm": 1.771317087982245, + "language_loss": 1.06636834, + "learning_rate": 3.715954969092154e-06, + "loss": 1.09545624, + "num_input_tokens_seen": 6744170, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.4699707, + "step": 321, + "time_per_iteration": 3.104935646057129 + }, + { + "auxiliary_loss_clip": 0.01507086, + "auxiliary_loss_mlp": 0.01425781, + "balance_loss_clip": 1.33182752, + "balance_loss_mlp": 1.38305664, + "epoch": 0.019359687359086126, + "flos": 24397285674240.0, + "grad_norm": 1.7814399770519727, + "language_loss": 0.95840096, + "learning_rate": 3.7179576213552805e-06, + "loss": 0.98772955, + "num_input_tokens_seen": 6764565, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.42700195, + "step": 322, + "time_per_iteration": 3.078573226928711 + }, + { + "auxiliary_loss_clip": 0.01497133, + "auxiliary_loss_mlp": 0.01431927, + "balance_loss_clip": 1.32489681, + "balance_loss_mlp": 1.38007081, + "epoch": 0.019419810611754094, + "flos": 23961929777280.0, + "grad_norm": 1.931863835200685, + "language_loss": 0.87765032, + "learning_rate": 3.719954063833981e-06, + "loss": 0.90694094, + "num_input_tokens_seen": 6785310, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.51928711, + "step": 323, + "time_per_iteration": 3.072577953338623 + }, + { + "auxiliary_loss_clip": 0.01501231, + "auxiliary_loss_mlp": 0.01419917, + "balance_loss_clip": 1.32731128, + "balance_loss_mlp": 1.38041091, + "epoch": 0.019479933864422067, + "flos": 22169442792960.0, + "grad_norm": 1.5837378961118929, + "language_loss": 1.01682043, + "learning_rate": 3.721944334919596e-06, + "loss": 1.04603183, + "num_input_tokens_seen": 6803290, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.39526367, + "step": 324, + "time_per_iteration": 3.0561697483062744 + }, + { + "auxiliary_loss_clip": 0.01505146, + "auxiliary_loss_mlp": 0.01408207, + "balance_loss_clip": 1.33186126, + "balance_loss_mlp": 1.3645525, + "epoch": 0.019540057117090035, + "flos": 22247001659520.0, + "grad_norm": 1.7131676642877194, + "language_loss": 0.82971179, + "learning_rate": 3.7239284726485375e-06, + "loss": 0.85884535, + "num_input_tokens_seen": 6822570, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.43652344, + "step": 325, + "time_per_iteration": 3.0457406044006348 + }, + { + "auxiliary_loss_clip": 0.01493283, + "auxiliary_loss_mlp": 0.01386142, + "balance_loss_clip": 1.32303524, + "balance_loss_mlp": 1.34499121, + "epoch": 0.019600180369758004, + "flos": 23087372175360.0, + "grad_norm": 1.5096052595433331, + "language_loss": 0.86447024, + "learning_rate": 3.72590651470665e-06, + "loss": 0.89326453, + "num_input_tokens_seen": 6841910, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.41162109, + "step": 326, + "time_per_iteration": 3.2306222915649414 + }, + { + "auxiliary_loss_clip": 0.01477275, + "auxiliary_loss_mlp": 0.01416236, + "balance_loss_clip": 1.30882549, + "balance_loss_mlp": 1.36712146, + "epoch": 0.019660303622425972, + "flos": 25421803102080.0, + "grad_norm": 1.7092177694503095, + "language_loss": 0.89608318, + "learning_rate": 3.727878498433505e-06, + "loss": 0.92501831, + "num_input_tokens_seen": 6862480, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.49121094, + "step": 327, + "time_per_iteration": 3.0549449920654297 + }, + { + "auxiliary_loss_clip": 0.01485842, + "auxiliary_loss_mlp": 0.01467873, + "balance_loss_clip": 1.3133111, + "balance_loss_mlp": 1.42088056, + "epoch": 0.01972042687509394, + "flos": 23667484746240.0, + "grad_norm": 1.8342278057771246, + "language_loss": 0.89528757, + "learning_rate": 3.7298444608266328e-06, + "loss": 0.92482471, + "num_input_tokens_seen": 6882015, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.4699707, + "step": 328, + "time_per_iteration": 3.0483720302581787 + }, + { + "auxiliary_loss_clip": 0.01474137, + "auxiliary_loss_mlp": 0.01510382, + "balance_loss_clip": 1.29972148, + "balance_loss_mlp": 1.45692909, + "epoch": 0.019780550127761913, + "flos": 18232644107520.0, + "grad_norm": 2.0624704831482807, + "language_loss": 1.14614737, + "learning_rate": 3.731804438545683e-06, + "loss": 1.17599261, + "num_input_tokens_seen": 6899785, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.53491211, + "step": 329, + "time_per_iteration": 4.45777440071106 + }, + { + "auxiliary_loss_clip": 0.01464542, + "auxiliary_loss_mlp": 0.01512852, + "balance_loss_clip": 1.29056382, + "balance_loss_mlp": 1.45408154, + "epoch": 0.01984067338042988, + "flos": 22428886331520.0, + "grad_norm": 1.906942680224435, + "language_loss": 0.86636215, + "learning_rate": 3.7337584679165324e-06, + "loss": 0.89613605, + "num_input_tokens_seen": 6918575, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.58789062, + "step": 330, + "time_per_iteration": 4.598118782043457 + }, + { + "auxiliary_loss_clip": 0.01464744, + "auxiliary_loss_mlp": 0.0147667, + "balance_loss_clip": 1.28726912, + "balance_loss_mlp": 1.41484797, + "epoch": 0.01990079663309785, + "flos": 17063867698560.0, + "grad_norm": 1.9352809832485176, + "language_loss": 1.11481023, + "learning_rate": 3.7357065849353186e-06, + "loss": 1.14422441, + "num_input_tokens_seen": 6936965, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.61791992, + "step": 331, + "time_per_iteration": 6.723036050796509 + }, + { + "auxiliary_loss_clip": 0.01463144, + "auxiliary_loss_mlp": 0.01464907, + "balance_loss_clip": 1.28754973, + "balance_loss_mlp": 1.40639949, + "epoch": 0.01996091988576582, + "flos": 15970478405760.0, + "grad_norm": 1.6518732858021503, + "language_loss": 1.03887153, + "learning_rate": 3.737648825272422e-06, + "loss": 1.06815219, + "num_input_tokens_seen": 6953475, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.58496094, + "step": 332, + "time_per_iteration": 3.059453248977661 + }, + { + "auxiliary_loss_clip": 0.01469571, + "auxiliary_loss_mlp": 0.01460515, + "balance_loss_clip": 1.28973866, + "balance_loss_mlp": 1.38603282, + "epoch": 0.02002104313843379, + "flos": 23596893578880.0, + "grad_norm": 2.067335079294609, + "language_loss": 0.93447161, + "learning_rate": 3.739585224276384e-06, + "loss": 0.96377254, + "num_input_tokens_seen": 6971630, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.74414062, + "step": 333, + "time_per_iteration": 3.004992723464966 + }, + { + "auxiliary_loss_clip": 0.01472687, + "auxiliary_loss_mlp": 0.01455281, + "balance_loss_clip": 1.29409552, + "balance_loss_mlp": 1.38203859, + "epoch": 0.02008116639110176, + "flos": 34108189643520.0, + "grad_norm": 1.6350103243163914, + "language_loss": 0.94027656, + "learning_rate": 3.7415158169777673e-06, + "loss": 0.96955621, + "num_input_tokens_seen": 6992775, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.73291016, + "step": 334, + "time_per_iteration": 3.2349965572357178 + }, + { + "auxiliary_loss_clip": 0.0147642, + "auxiliary_loss_mlp": 0.01427596, + "balance_loss_clip": 1.29691744, + "balance_loss_mlp": 1.35897923, + "epoch": 0.020141289643769728, + "flos": 19693467573120.0, + "grad_norm": 1.5715361575108515, + "language_loss": 0.92593014, + "learning_rate": 3.7434406380929575e-06, + "loss": 0.9549703, + "num_input_tokens_seen": 7011425, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.68603516, + "step": 335, + "time_per_iteration": 3.1878459453582764 + }, + { + "auxiliary_loss_clip": 0.01478459, + "auxiliary_loss_mlp": 0.01440047, + "balance_loss_clip": 1.29956186, + "balance_loss_mlp": 1.37476826, + "epoch": 0.020201412896437697, + "flos": 20750271805440.0, + "grad_norm": 1.96762225571726, + "language_loss": 1.04488349, + "learning_rate": 3.745359722027911e-06, + "loss": 1.07406855, + "num_input_tokens_seen": 7029450, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.65380859, + "step": 336, + "time_per_iteration": 3.0201406478881836 + }, + { + "auxiliary_loss_clip": 0.01472405, + "auxiliary_loss_mlp": 0.0145465, + "balance_loss_clip": 1.29384494, + "balance_loss_mlp": 1.39587986, + "epoch": 0.020261536149105665, + "flos": 20276159097600.0, + "grad_norm": 1.5103736592518329, + "language_loss": 0.96229464, + "learning_rate": 3.7472731028818428e-06, + "loss": 0.99156523, + "num_input_tokens_seen": 7047555, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.58837891, + "step": 337, + "time_per_iteration": 3.1171329021453857 + }, + { + "auxiliary_loss_clip": 0.01459495, + "auxiliary_loss_mlp": 0.01435409, + "balance_loss_clip": 1.28435302, + "balance_loss_mlp": 1.38021469, + "epoch": 0.020321659401773638, + "flos": 25859828442240.0, + "grad_norm": 1.2721854678417928, + "language_loss": 0.95535713, + "learning_rate": 3.7491808144508626e-06, + "loss": 0.98430622, + "num_input_tokens_seen": 7068185, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.55175781, + "step": 338, + "time_per_iteration": 3.07940673828125 + }, + { + "auxiliary_loss_clip": 0.01454658, + "auxiliary_loss_mlp": 0.01400357, + "balance_loss_clip": 1.28217638, + "balance_loss_mlp": 1.35126662, + "epoch": 0.020381782654441606, + "flos": 17504336257920.0, + "grad_norm": 1.6471674983854503, + "language_loss": 0.96002573, + "learning_rate": 3.7510828902315576e-06, + "loss": 0.98857582, + "num_input_tokens_seen": 7085955, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.49145508, + "step": 339, + "time_per_iteration": 3.021568536758423 + }, + { + "auxiliary_loss_clip": 0.01459348, + "auxiliary_loss_mlp": 0.01405556, + "balance_loss_clip": 1.28401589, + "balance_loss_mlp": 1.3539623, + "epoch": 0.020441905907109575, + "flos": 24254745995520.0, + "grad_norm": 1.4845437094376257, + "language_loss": 0.9766953, + "learning_rate": 3.75297936342452e-06, + "loss": 1.00534439, + "num_input_tokens_seen": 7106345, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.51611328, + "step": 340, + "time_per_iteration": 3.0535926818847656 + }, + { + "auxiliary_loss_clip": 0.01477674, + "auxiliary_loss_mlp": 0.01421407, + "balance_loss_clip": 1.2992878, + "balance_loss_mlp": 1.35937035, + "epoch": 0.020502029159777543, + "flos": 22242567669120.0, + "grad_norm": 1.6394163382626339, + "language_loss": 0.97501427, + "learning_rate": 3.7548702669378253e-06, + "loss": 1.00400507, + "num_input_tokens_seen": 7125070, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.62036133, + "step": 341, + "time_per_iteration": 3.0693359375 + }, + { + "auxiliary_loss_clip": 0.01487616, + "auxiliary_loss_mlp": 0.01424647, + "balance_loss_clip": 1.31366706, + "balance_loss_mlp": 1.3724339, + "epoch": 0.020562152412445512, + "flos": 23998424348160.0, + "grad_norm": 1.8438483522548366, + "language_loss": 0.97410512, + "learning_rate": 3.756755633390458e-06, + "loss": 1.00322771, + "num_input_tokens_seen": 7144675, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.52197266, + "step": 342, + "time_per_iteration": 3.195490598678589 + }, + { + "auxiliary_loss_clip": 0.01497527, + "auxiliary_loss_mlp": 0.01427227, + "balance_loss_clip": 1.32412255, + "balance_loss_mlp": 1.37463176, + "epoch": 0.020622275665113484, + "flos": 26985504539520.0, + "grad_norm": 1.4232196874778749, + "language_loss": 0.97543871, + "learning_rate": 3.7586354951156886e-06, + "loss": 1.00468612, + "num_input_tokens_seen": 7165505, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.52587891, + "step": 343, + "time_per_iteration": 3.173346996307373 + }, + { + "auxiliary_loss_clip": 0.01502068, + "auxiliary_loss_mlp": 0.01416996, + "balance_loss_clip": 1.32960653, + "balance_loss_mlp": 1.37036109, + "epoch": 0.020682398917781453, + "flos": 22610725758720.0, + "grad_norm": 1.523119582949282, + "language_loss": 0.86881483, + "learning_rate": 3.7605098841644e-06, + "loss": 0.89800549, + "num_input_tokens_seen": 7184605, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.46679688, + "step": 344, + "time_per_iteration": 3.075843572616577 + }, + { + "auxiliary_loss_clip": 0.01515132, + "auxiliary_loss_mlp": 0.01397996, + "balance_loss_clip": 1.34014606, + "balance_loss_mlp": 1.3530302, + "epoch": 0.02074252217044942, + "flos": 15022162500480.0, + "grad_norm": 1.4009184950519529, + "language_loss": 0.88423657, + "learning_rate": 3.7623788323083666e-06, + "loss": 0.91336793, + "num_input_tokens_seen": 7203065, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.44921875, + "step": 345, + "time_per_iteration": 3.0539114475250244 + }, + { + "auxiliary_loss_clip": 0.01522942, + "auxiliary_loss_mlp": 0.01401536, + "balance_loss_clip": 1.34606242, + "balance_loss_mlp": 1.35089564, + "epoch": 0.02080264542311739, + "flos": 25348904449920.0, + "grad_norm": 1.7071571912615158, + "language_loss": 0.97690898, + "learning_rate": 3.7642423710434837e-06, + "loss": 1.0061537, + "num_input_tokens_seen": 7222995, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.50610352, + "step": 346, + "time_per_iteration": 3.038041114807129 + }, + { + "auxiliary_loss_clip": 0.01537242, + "auxiliary_loss_mlp": 0.01403328, + "balance_loss_clip": 1.35929668, + "balance_loss_mlp": 1.34503472, + "epoch": 0.02086276867578536, + "flos": 24399366935040.0, + "grad_norm": 1.6685732937941726, + "language_loss": 0.91264701, + "learning_rate": 3.7661005315929563e-06, + "loss": 0.9420526, + "num_input_tokens_seen": 7244625, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.58325195, + "step": 347, + "time_per_iteration": 3.0891525745391846 + }, + { + "auxiliary_loss_clip": 0.0153872, + "auxiliary_loss_mlp": 0.01402186, + "balance_loss_clip": 1.35928476, + "balance_loss_mlp": 1.34801722, + "epoch": 0.02092289192845333, + "flos": 24472899014400.0, + "grad_norm": 1.5293046524872853, + "language_loss": 0.83135027, + "learning_rate": 3.7679533449104354e-06, + "loss": 0.86075932, + "num_input_tokens_seen": 7263255, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.54150391, + "step": 348, + "time_per_iteration": 3.0590052604675293 + }, + { + "auxiliary_loss_clip": 0.01541126, + "auxiliary_loss_mlp": 0.01410458, + "balance_loss_clip": 1.3610636, + "balance_loss_mlp": 1.34961414, + "epoch": 0.0209830151811213, + "flos": 17458566503040.0, + "grad_norm": 1.851508479393449, + "language_loss": 0.92335117, + "learning_rate": 3.7698008416831116e-06, + "loss": 0.95286703, + "num_input_tokens_seen": 7279275, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.60864258, + "step": 349, + "time_per_iteration": 2.971245050430298 + }, + { + "auxiliary_loss_clip": 0.01533332, + "auxiliary_loss_mlp": 0.01385645, + "balance_loss_clip": 1.35909522, + "balance_loss_mlp": 1.34301567, + "epoch": 0.021043138433789268, + "flos": 24585323639040.0, + "grad_norm": 1.5157037398508373, + "language_loss": 0.93425339, + "learning_rate": 3.7716430523347664e-06, + "loss": 0.9634431, + "num_input_tokens_seen": 7300180, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.42626953, + "step": 350, + "time_per_iteration": 3.0633199214935303 + }, + { + "auxiliary_loss_clip": 0.01527433, + "auxiliary_loss_mlp": 0.01390161, + "balance_loss_clip": 1.35112095, + "balance_loss_mlp": 1.34352672, + "epoch": 0.021103261686457236, + "flos": 24462628444800.0, + "grad_norm": 1.78754116670291, + "language_loss": 0.89276218, + "learning_rate": 3.773480007028776e-06, + "loss": 0.92193812, + "num_input_tokens_seen": 7317430, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.46655273, + "step": 351, + "time_per_iteration": 3.0954644680023193 + }, + { + "auxiliary_loss_clip": 0.01531319, + "auxiliary_loss_mlp": 0.01396814, + "balance_loss_clip": 1.35297775, + "balance_loss_mlp": 1.34302652, + "epoch": 0.021163384939125205, + "flos": 14690996674560.0, + "grad_norm": 1.5979956049716375, + "language_loss": 0.96232545, + "learning_rate": 3.775311735671078e-06, + "loss": 0.99160677, + "num_input_tokens_seen": 7334875, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.5378418, + "step": 352, + "time_per_iteration": 3.1323671340942383 + }, + { + "auxiliary_loss_clip": 0.01527189, + "auxiliary_loss_mlp": 0.01393762, + "balance_loss_clip": 1.34998524, + "balance_loss_mlp": 1.34681737, + "epoch": 0.021223508191793177, + "flos": 24502471130880.0, + "grad_norm": 1.556924245880262, + "language_loss": 0.90965623, + "learning_rate": 3.7771382679130878e-06, + "loss": 0.93886578, + "num_input_tokens_seen": 7355185, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.46948242, + "step": 353, + "time_per_iteration": 3.077589750289917 + }, + { + "auxiliary_loss_clip": 0.01514095, + "auxiliary_loss_mlp": 0.01411254, + "balance_loss_clip": 1.33765483, + "balance_loss_mlp": 1.35842061, + "epoch": 0.021283631444461146, + "flos": 24135806119680.0, + "grad_norm": 1.7590114882651124, + "language_loss": 0.88947845, + "learning_rate": 3.7789596331545845e-06, + "loss": 0.91873199, + "num_input_tokens_seen": 7374425, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.52856445, + "step": 354, + "time_per_iteration": 2.9961435794830322 + }, + { + "auxiliary_loss_clip": 0.01498583, + "auxiliary_loss_mlp": 0.01418702, + "balance_loss_clip": 1.32054603, + "balance_loss_mlp": 1.35728502, + "epoch": 0.021343754697129114, + "flos": 25203197635200.0, + "grad_norm": 1.760801692489904, + "language_loss": 0.91473269, + "learning_rate": 3.780775860546545e-06, + "loss": 0.94390559, + "num_input_tokens_seen": 7394175, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.61401367, + "step": 355, + "time_per_iteration": 3.031515121459961 + }, + { + "auxiliary_loss_clip": 0.01495181, + "auxiliary_loss_mlp": 0.01425081, + "balance_loss_clip": 1.3175869, + "balance_loss_mlp": 1.36430788, + "epoch": 0.021403877949797083, + "flos": 17282925613440.0, + "grad_norm": 1.8409357690168766, + "language_loss": 1.02948475, + "learning_rate": 3.7825869789939474e-06, + "loss": 1.05868733, + "num_input_tokens_seen": 7412645, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.60742188, + "step": 356, + "time_per_iteration": 2.9889063835144043 + }, + { + "auxiliary_loss_clip": 0.01469107, + "auxiliary_loss_mlp": 0.01428397, + "balance_loss_clip": 1.29765463, + "balance_loss_mlp": 1.37205863, + "epoch": 0.021464001202465055, + "flos": 30929089944960.0, + "grad_norm": 1.6436245803737446, + "language_loss": 0.90599585, + "learning_rate": 3.784393017158528e-06, + "loss": 0.93497086, + "num_input_tokens_seen": 7432275, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.56323242, + "step": 357, + "time_per_iteration": 3.0896823406219482 + }, + { + "auxiliary_loss_clip": 0.01462172, + "auxiliary_loss_mlp": 0.01420497, + "balance_loss_clip": 1.28911722, + "balance_loss_mlp": 1.36740136, + "epoch": 0.021524124455133024, + "flos": 18195380375040.0, + "grad_norm": 1.9704477456252407, + "language_loss": 0.8992632, + "learning_rate": 3.786194003461506e-06, + "loss": 0.92808992, + "num_input_tokens_seen": 7450245, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.53051758, + "step": 358, + "time_per_iteration": 3.0209639072418213 + }, + { + "auxiliary_loss_clip": 0.01457875, + "auxiliary_loss_mlp": 0.01414363, + "balance_loss_clip": 1.28148413, + "balance_loss_mlp": 1.35685611, + "epoch": 0.021584247707800992, + "flos": 13813045712640.0, + "grad_norm": 1.7466749884829962, + "language_loss": 1.01175082, + "learning_rate": 3.787989966086264e-06, + "loss": 1.0404731, + "num_input_tokens_seen": 7466845, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.57519531, + "step": 359, + "time_per_iteration": 3.0023770332336426 + }, + { + "auxiliary_loss_clip": 0.01468335, + "auxiliary_loss_mlp": 0.01407062, + "balance_loss_clip": 1.28754759, + "balance_loss_mlp": 1.34664679, + "epoch": 0.02164437096046896, + "flos": 23305117991040.0, + "grad_norm": 1.9815774087440103, + "language_loss": 0.95779788, + "learning_rate": 3.789780932980997e-06, + "loss": 0.98655182, + "num_input_tokens_seen": 7485450, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.60400391, + "step": 360, + "time_per_iteration": 3.0000691413879395 + }, + { + "auxiliary_loss_clip": 0.0149684, + "auxiliary_loss_mlp": 0.0139793, + "balance_loss_clip": 1.34285188, + "balance_loss_mlp": 1.36817586, + "epoch": 0.02170449421313693, + "flos": 68930679231360.0, + "grad_norm": 0.8714581387023677, + "language_loss": 0.65157449, + "learning_rate": 3.79156693186132e-06, + "loss": 0.6805222, + "num_input_tokens_seen": 7553780, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.296875, + "step": 361, + "time_per_iteration": 3.525885581970215 + }, + { + "auxiliary_loss_clip": 0.01439667, + "auxiliary_loss_mlp": 0.01410311, + "balance_loss_clip": 1.26543677, + "balance_loss_mlp": 1.34460247, + "epoch": 0.0217646174658049, + "flos": 25239465982080.0, + "grad_norm": 2.079753360825001, + "language_loss": 0.9527486, + "learning_rate": 3.7933479902128433e-06, + "loss": 0.98124838, + "num_input_tokens_seen": 7574155, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.65771484, + "step": 362, + "time_per_iteration": 3.015835762023926 + }, + { + "auxiliary_loss_clip": 0.01438653, + "auxiliary_loss_mlp": 0.01416791, + "balance_loss_clip": 1.26202929, + "balance_loss_mlp": 1.34927106, + "epoch": 0.02182474071847287, + "flos": 22903360997760.0, + "grad_norm": 1.677656112524445, + "language_loss": 1.01712966, + "learning_rate": 3.7951241352937077e-06, + "loss": 1.04568422, + "num_input_tokens_seen": 7592320, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.67529297, + "step": 363, + "time_per_iteration": 4.384565591812134 + }, + { + "auxiliary_loss_clip": 0.01436558, + "auxiliary_loss_mlp": 0.01424127, + "balance_loss_clip": 1.26181901, + "balance_loss_mlp": 1.35512853, + "epoch": 0.02188486397114084, + "flos": 23669339783040.0, + "grad_norm": 1.6760356263019067, + "language_loss": 1.00927758, + "learning_rate": 3.7968953941370915e-06, + "loss": 1.03788447, + "num_input_tokens_seen": 7611185, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.68945312, + "step": 364, + "time_per_iteration": 3.041700839996338 + }, + { + "auxiliary_loss_clip": 0.01431452, + "auxiliary_loss_mlp": 0.01434458, + "balance_loss_clip": 1.25763702, + "balance_loss_mlp": 1.3497721, + "epoch": 0.021944987223808807, + "flos": 21553650057600.0, + "grad_norm": 1.6876051991043017, + "language_loss": 0.93163347, + "learning_rate": 3.798661793553676e-06, + "loss": 0.96029258, + "num_input_tokens_seen": 7631970, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.84619141, + "step": 365, + "time_per_iteration": 4.378239393234253 + }, + { + "auxiliary_loss_clip": 0.01430169, + "auxiliary_loss_mlp": 0.01413898, + "balance_loss_clip": 1.25651252, + "balance_loss_mlp": 1.34013104, + "epoch": 0.022005110476476776, + "flos": 16079509670400.0, + "grad_norm": 1.4857720072229692, + "language_loss": 0.91796309, + "learning_rate": 3.8004233601340808e-06, + "loss": 0.94640374, + "num_input_tokens_seen": 7649745, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.73681641, + "step": 366, + "time_per_iteration": 6.557047367095947 + }, + { + "auxiliary_loss_clip": 0.01431823, + "auxiliary_loss_mlp": 0.01405276, + "balance_loss_clip": 1.25627637, + "balance_loss_mlp": 1.34128475, + "epoch": 0.022065233729144748, + "flos": 21443578162560.0, + "grad_norm": 1.7479548207497289, + "language_loss": 1.00543082, + "learning_rate": 3.8021801202512694e-06, + "loss": 1.03380179, + "num_input_tokens_seen": 7668830, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.63891602, + "step": 367, + "time_per_iteration": 2.987262725830078 + }, + { + "auxiliary_loss_clip": 0.01433545, + "auxiliary_loss_mlp": 0.01410776, + "balance_loss_clip": 1.25360799, + "balance_loss_mlp": 1.34535384, + "epoch": 0.022125356981812717, + "flos": 21553740547200.0, + "grad_norm": 1.5688373830425824, + "language_loss": 0.9672749, + "learning_rate": 3.803932100062912e-06, + "loss": 0.99571806, + "num_input_tokens_seen": 7687240, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.65380859, + "step": 368, + "time_per_iteration": 2.9796643257141113 + }, + { + "auxiliary_loss_clip": 0.01429409, + "auxiliary_loss_mlp": 0.01409932, + "balance_loss_clip": 1.25231242, + "balance_loss_mlp": 1.35161448, + "epoch": 0.022185480234480685, + "flos": 20713867724160.0, + "grad_norm": 2.1678097600846558, + "language_loss": 0.96813887, + "learning_rate": 3.8056793255137264e-06, + "loss": 0.99653232, + "num_input_tokens_seen": 7704440, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.58349609, + "step": 369, + "time_per_iteration": 3.066594362258911 + }, + { + "auxiliary_loss_clip": 0.01420545, + "auxiliary_loss_mlp": 0.01423145, + "balance_loss_clip": 1.24796224, + "balance_loss_mlp": 1.34437203, + "epoch": 0.022245603487148654, + "flos": 25204419244800.0, + "grad_norm": 1.7243772129759378, + "language_loss": 0.94816613, + "learning_rate": 3.8074218223377844e-06, + "loss": 0.97660303, + "num_input_tokens_seen": 7727160, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.78637695, + "step": 370, + "time_per_iteration": 3.007535934448242 + }, + { + "auxiliary_loss_clip": 0.01420156, + "auxiliary_loss_mlp": 0.01405832, + "balance_loss_clip": 1.24880195, + "balance_loss_mlp": 1.34541714, + "epoch": 0.022305726739816623, + "flos": 21405454778880.0, + "grad_norm": 1.396836741858576, + "language_loss": 0.8987062, + "learning_rate": 3.8091596160607834e-06, + "loss": 0.92696607, + "num_input_tokens_seen": 7747730, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.60498047, + "step": 371, + "time_per_iteration": 2.941368818283081 + }, + { + "auxiliary_loss_clip": 0.01424207, + "auxiliary_loss_mlp": 0.01405674, + "balance_loss_clip": 1.25183988, + "balance_loss_mlp": 1.34494925, + "epoch": 0.022365849992484595, + "flos": 22502373166080.0, + "grad_norm": 2.0712633649793264, + "language_loss": 0.98791945, + "learning_rate": 3.8108927320022896e-06, + "loss": 1.01621819, + "num_input_tokens_seen": 7766765, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.60717773, + "step": 372, + "time_per_iteration": 2.941314935684204 + }, + { + "auxiliary_loss_clip": 0.01418251, + "auxiliary_loss_mlp": 0.01412321, + "balance_loss_clip": 1.24824047, + "balance_loss_mlp": 1.35059404, + "epoch": 0.022425973245152563, + "flos": 17865119445120.0, + "grad_norm": 2.0194244935171204, + "language_loss": 0.9441303, + "learning_rate": 3.8126211952779548e-06, + "loss": 0.97243607, + "num_input_tokens_seen": 7784010, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.61791992, + "step": 373, + "time_per_iteration": 2.9012603759765625 + }, + { + "auxiliary_loss_clip": 0.01428781, + "auxiliary_loss_mlp": 0.01416824, + "balance_loss_clip": 1.25678647, + "balance_loss_mlp": 1.34770656, + "epoch": 0.022486096497820532, + "flos": 15490438629120.0, + "grad_norm": 1.9337030588875195, + "language_loss": 0.95945686, + "learning_rate": 3.8143450308016952e-06, + "loss": 0.98791289, + "num_input_tokens_seen": 7801305, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.69140625, + "step": 374, + "time_per_iteration": 2.9126269817352295 + }, + { + "auxiliary_loss_clip": 0.01419895, + "auxiliary_loss_mlp": 0.01409538, + "balance_loss_clip": 1.25123644, + "balance_loss_mlp": 1.34819293, + "epoch": 0.0225462197504885, + "flos": 27796167204480.0, + "grad_norm": 1.5101593797601496, + "language_loss": 0.92323256, + "learning_rate": 3.8160642632878525e-06, + "loss": 0.95152688, + "num_input_tokens_seen": 7823965, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.61328125, + "step": 375, + "time_per_iteration": 3.106255054473877 + }, + { + "auxiliary_loss_clip": 0.01423486, + "auxiliary_loss_mlp": 0.01388757, + "balance_loss_clip": 1.25807095, + "balance_loss_mlp": 1.34092987, + "epoch": 0.02260634300315647, + "flos": 19985288405760.0, + "grad_norm": 1.9035116013405038, + "language_loss": 0.98437035, + "learning_rate": 3.817778917253314e-06, + "loss": 1.01249278, + "num_input_tokens_seen": 7842115, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.4777832, + "step": 376, + "time_per_iteration": 3.031017780303955 + }, + { + "auxiliary_loss_clip": 0.01428907, + "auxiliary_loss_mlp": 0.01398933, + "balance_loss_clip": 1.26043415, + "balance_loss_mlp": 1.33966231, + "epoch": 0.02266646625582444, + "flos": 16035187749120.0, + "grad_norm": 2.487692859361529, + "language_loss": 0.93319046, + "learning_rate": 3.8194890170196155e-06, + "loss": 0.96146894, + "num_input_tokens_seen": 7857830, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.59228516, + "step": 377, + "time_per_iteration": 2.947862148284912 + }, + { + "auxiliary_loss_clip": 0.01424874, + "auxiliary_loss_mlp": 0.01390467, + "balance_loss_clip": 1.26091862, + "balance_loss_mlp": 1.34213948, + "epoch": 0.02272658950849241, + "flos": 20412274014720.0, + "grad_norm": 1.7820099384838644, + "language_loss": 1.08027172, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.10842514, + "num_input_tokens_seen": 7875840, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.48339844, + "step": 378, + "time_per_iteration": 2.962033987045288 + }, + { + "auxiliary_loss_clip": 0.01415652, + "auxiliary_loss_mlp": 0.01397076, + "balance_loss_clip": 1.27318418, + "balance_loss_mlp": 1.36598623, + "epoch": 0.02278671276116038, + "flos": 69878452199040.0, + "grad_norm": 1.0108386916639804, + "language_loss": 0.75607866, + "learning_rate": 3.822895650276492e-06, + "loss": 0.78420597, + "num_input_tokens_seen": 7940190, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.31054688, + "step": 379, + "time_per_iteration": 3.4685165882110596 + }, + { + "auxiliary_loss_clip": 0.0142333, + "auxiliary_loss_mlp": 0.01417852, + "balance_loss_clip": 1.25590694, + "balance_loss_mlp": 1.35641193, + "epoch": 0.022846836013828347, + "flos": 38522992089600.0, + "grad_norm": 1.929103013988106, + "language_loss": 0.92802167, + "learning_rate": 3.824592231451859e-06, + "loss": 0.95643353, + "num_input_tokens_seen": 7960840, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.61450195, + "step": 380, + "time_per_iteration": 3.1631617546081543 + }, + { + "auxiliary_loss_clip": 0.01418127, + "auxiliary_loss_mlp": 0.01417501, + "balance_loss_clip": 1.25637436, + "balance_loss_mlp": 1.36984134, + "epoch": 0.02290695926649632, + "flos": 20969239230720.0, + "grad_norm": 1.7691760748065462, + "language_loss": 1.0748477, + "learning_rate": 3.826284353801652e-06, + "loss": 1.10320401, + "num_input_tokens_seen": 7975500, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.47680664, + "step": 381, + "time_per_iteration": 3.0069222450256348 + }, + { + "auxiliary_loss_clip": 0.01420619, + "auxiliary_loss_mlp": 0.01423419, + "balance_loss_clip": 1.25696027, + "balance_loss_mlp": 1.36169195, + "epoch": 0.022967082519164288, + "flos": 24032475699840.0, + "grad_norm": 1.7801391124094386, + "language_loss": 0.9878068, + "learning_rate": 3.827972040701142e-06, + "loss": 1.01624715, + "num_input_tokens_seen": 7993880, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.61791992, + "step": 382, + "time_per_iteration": 3.013899087905884 + }, + { + "auxiliary_loss_clip": 0.01410668, + "auxiliary_loss_mlp": 0.01395707, + "balance_loss_clip": 1.24989867, + "balance_loss_mlp": 1.34382701, + "epoch": 0.023027205771832256, + "flos": 21007226880000.0, + "grad_norm": 1.5384667393928166, + "language_loss": 0.96763223, + "learning_rate": 3.829655315342268e-06, + "loss": 0.99569595, + "num_input_tokens_seen": 8012730, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.51904297, + "step": 383, + "time_per_iteration": 2.913273572921753 + }, + { + "auxiliary_loss_clip": 0.01410722, + "auxiliary_loss_mlp": 0.01392522, + "balance_loss_clip": 1.24868751, + "balance_loss_mlp": 1.33923507, + "epoch": 0.023087329024500225, + "flos": 21370679510400.0, + "grad_norm": 1.6365186973659018, + "language_loss": 0.96944022, + "learning_rate": 3.831334200735543e-06, + "loss": 0.9974727, + "num_input_tokens_seen": 8031275, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.53271484, + "step": 384, + "time_per_iteration": 2.9126317501068115 + }, + { + "auxiliary_loss_clip": 0.01408058, + "auxiliary_loss_mlp": 0.01405683, + "balance_loss_clip": 1.24840367, + "balance_loss_mlp": 1.36012149, + "epoch": 0.023147452277168194, + "flos": 21882644133120.0, + "grad_norm": 1.962256499625677, + "language_loss": 0.97008049, + "learning_rate": 3.8330087197119426e-06, + "loss": 0.998218, + "num_input_tokens_seen": 8051600, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.45581055, + "step": 385, + "time_per_iteration": 2.906268835067749 + }, + { + "auxiliary_loss_clip": 0.01411438, + "auxiliary_loss_mlp": 0.01425657, + "balance_loss_clip": 1.25042713, + "balance_loss_mlp": 1.37039185, + "epoch": 0.023207575529836166, + "flos": 18926040954240.0, + "grad_norm": 1.5169892406388585, + "language_loss": 0.76595026, + "learning_rate": 3.83467889492477e-06, + "loss": 0.79432124, + "num_input_tokens_seen": 8070600, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.55273438, + "step": 386, + "time_per_iteration": 2.958756923675537 + }, + { + "auxiliary_loss_clip": 0.01414276, + "auxiliary_loss_mlp": 0.01446646, + "balance_loss_clip": 1.24932146, + "balance_loss_mlp": 1.36954165, + "epoch": 0.023267698782504134, + "flos": 25056857393280.0, + "grad_norm": 1.6164475051241824, + "language_loss": 0.96341622, + "learning_rate": 3.836344748851495e-06, + "loss": 0.99202549, + "num_input_tokens_seen": 8090680, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.77050781, + "step": 387, + "time_per_iteration": 3.0218093395233154 + }, + { + "auxiliary_loss_clip": 0.01417861, + "auxiliary_loss_mlp": 0.01397459, + "balance_loss_clip": 1.25201678, + "balance_loss_mlp": 1.34610343, + "epoch": 0.023327822035172103, + "flos": 28891637758080.0, + "grad_norm": 1.5233004525436982, + "language_loss": 0.94490337, + "learning_rate": 3.838006303795566e-06, + "loss": 0.9730565, + "num_input_tokens_seen": 8114610, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.51318359, + "step": 388, + "time_per_iteration": 3.0295302867889404 + }, + { + "auxiliary_loss_clip": 0.01413909, + "auxiliary_loss_mlp": 0.01388534, + "balance_loss_clip": 1.25124776, + "balance_loss_mlp": 1.34168434, + "epoch": 0.02338794528784007, + "flos": 27131844781440.0, + "grad_norm": 1.8298004973904598, + "language_loss": 1.06572104, + "learning_rate": 3.839663581888206e-06, + "loss": 1.09374535, + "num_input_tokens_seen": 8133975, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.46875, + "step": 389, + "time_per_iteration": 2.9822375774383545 + }, + { + "auxiliary_loss_clip": 0.01416686, + "auxiliary_loss_mlp": 0.01390751, + "balance_loss_clip": 1.2534982, + "balance_loss_mlp": 1.34297156, + "epoch": 0.02344806854050804, + "flos": 21331696475520.0, + "grad_norm": 1.6355058499117958, + "language_loss": 0.97054803, + "learning_rate": 3.841316605090178e-06, + "loss": 0.99862236, + "num_input_tokens_seen": 8153570, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.47827148, + "step": 390, + "time_per_iteration": 3.0225462913513184 + }, + { + "auxiliary_loss_clip": 0.01417435, + "auxiliary_loss_mlp": 0.01394344, + "balance_loss_clip": 1.25555551, + "balance_loss_mlp": 1.34859204, + "epoch": 0.023508191793176012, + "flos": 24800626235520.0, + "grad_norm": 1.8998978110571239, + "language_loss": 1.04456878, + "learning_rate": 3.842965395193529e-06, + "loss": 1.07268655, + "num_input_tokens_seen": 8170075, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.45776367, + "step": 391, + "time_per_iteration": 3.075310468673706 + }, + { + "auxiliary_loss_clip": 0.01424582, + "auxiliary_loss_mlp": 0.01397412, + "balance_loss_clip": 1.25918221, + "balance_loss_mlp": 1.34455442, + "epoch": 0.02356831504584398, + "flos": 26006666376960.0, + "grad_norm": 1.6777613334588555, + "language_loss": 0.9790647, + "learning_rate": 3.84460997382332e-06, + "loss": 1.00728464, + "num_input_tokens_seen": 8190420, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.52832031, + "step": 392, + "time_per_iteration": 3.035109519958496 + }, + { + "auxiliary_loss_clip": 0.01420273, + "auxiliary_loss_mlp": 0.0139555, + "balance_loss_clip": 1.25692046, + "balance_loss_mlp": 1.34574389, + "epoch": 0.02362843829851195, + "flos": 19071793013760.0, + "grad_norm": 1.6557398565930521, + "language_loss": 0.97907925, + "learning_rate": 3.8462503624393256e-06, + "loss": 1.00723743, + "num_input_tokens_seen": 8208790, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.49780273, + "step": 393, + "time_per_iteration": 2.9025425910949707 + }, + { + "auxiliary_loss_clip": 0.01417072, + "auxiliary_loss_mlp": 0.01389342, + "balance_loss_clip": 1.25435269, + "balance_loss_mlp": 1.34072864, + "epoch": 0.023688561551179918, + "flos": 16079690649600.0, + "grad_norm": 1.5429131146218331, + "language_loss": 0.89719033, + "learning_rate": 3.84788658233771e-06, + "loss": 0.92525446, + "num_input_tokens_seen": 8226885, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.48632812, + "step": 394, + "time_per_iteration": 2.9892895221710205 + }, + { + "auxiliary_loss_clip": 0.01414507, + "auxiliary_loss_mlp": 0.01406078, + "balance_loss_clip": 1.25088549, + "balance_loss_mlp": 1.34158587, + "epoch": 0.023748684803847887, + "flos": 21733996406400.0, + "grad_norm": 1.4948832172421616, + "language_loss": 0.93770057, + "learning_rate": 3.84951865465269e-06, + "loss": 0.96590644, + "num_input_tokens_seen": 8246825, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.64428711, + "step": 395, + "time_per_iteration": 2.9781875610351562 + }, + { + "auxiliary_loss_clip": 0.01374215, + "auxiliary_loss_mlp": 0.01363218, + "balance_loss_clip": 1.24009013, + "balance_loss_mlp": 1.33966208, + "epoch": 0.02380880805651586, + "flos": 61954289124480.0, + "grad_norm": 0.9497876900872289, + "language_loss": 0.64010942, + "learning_rate": 3.851146600358172e-06, + "loss": 0.66748369, + "num_input_tokens_seen": 8302835, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.23535156, + "step": 396, + "time_per_iteration": 3.201403856277466 + }, + { + "auxiliary_loss_clip": 0.01408417, + "auxiliary_loss_mlp": 0.01391393, + "balance_loss_clip": 1.24388313, + "balance_loss_mlp": 1.34204006, + "epoch": 0.023868931309183827, + "flos": 20275932873600.0, + "grad_norm": 1.9052428541560735, + "language_loss": 0.9938761, + "learning_rate": 3.852770440269372e-06, + "loss": 1.02187419, + "num_input_tokens_seen": 8320745, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.49365234, + "step": 397, + "time_per_iteration": 2.9417977333068848 + }, + { + "auxiliary_loss_clip": 0.01409678, + "auxiliary_loss_mlp": 0.01392491, + "balance_loss_clip": 1.24542904, + "balance_loss_mlp": 1.34537947, + "epoch": 0.023929054561851796, + "flos": 21148182990720.0, + "grad_norm": 1.7170932428235586, + "language_loss": 1.0027554, + "learning_rate": 3.854390195044404e-06, + "loss": 1.0307771, + "num_input_tokens_seen": 8339540, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.47119141, + "step": 398, + "time_per_iteration": 4.419039726257324 + }, + { + "auxiliary_loss_clip": 0.01404018, + "auxiliary_loss_mlp": 0.01420851, + "balance_loss_clip": 1.23968101, + "balance_loss_mlp": 1.34408045, + "epoch": 0.023989177814519765, + "flos": 13706321932800.0, + "grad_norm": 2.0404837169316954, + "language_loss": 1.07362795, + "learning_rate": 3.856005885185868e-06, + "loss": 1.10187662, + "num_input_tokens_seen": 8354890, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.76708984, + "step": 399, + "time_per_iteration": 2.8647799491882324 + }, + { + "auxiliary_loss_clip": 0.01402744, + "auxiliary_loss_mlp": 0.0138984, + "balance_loss_clip": 1.24073505, + "balance_loss_mlp": 1.34396887, + "epoch": 0.024049301067187733, + "flos": 26332945764480.0, + "grad_norm": 1.6443111007752744, + "language_loss": 0.93580818, + "learning_rate": 3.857617531042398e-06, + "loss": 0.96373403, + "num_input_tokens_seen": 8375845, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.45874023, + "step": 400, + "time_per_iteration": 4.4383416175842285 + }, + { + "auxiliary_loss_clip": 0.01408905, + "auxiliary_loss_mlp": 0.01395649, + "balance_loss_clip": 1.24383187, + "balance_loss_mlp": 1.34152746, + "epoch": 0.024109424319855705, + "flos": 24436042485120.0, + "grad_norm": 1.5000425070600032, + "language_loss": 0.8928504, + "learning_rate": 3.8592251528102065e-06, + "loss": 0.92089593, + "num_input_tokens_seen": 8395240, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.54125977, + "step": 401, + "time_per_iteration": 5.888904333114624 + }, + { + "auxiliary_loss_clip": 0.01403965, + "auxiliary_loss_mlp": 0.01394754, + "balance_loss_clip": 1.24218559, + "balance_loss_mlp": 1.3442812, + "epoch": 0.024169547572523674, + "flos": 29615828330880.0, + "grad_norm": 1.6399373181979282, + "language_loss": 0.90155423, + "learning_rate": 3.8608287705345976e-06, + "loss": 0.92954147, + "num_input_tokens_seen": 8416950, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.50488281, + "step": 402, + "time_per_iteration": 3.050919532775879 + }, + { + "auxiliary_loss_clip": 0.01409755, + "auxiliary_loss_mlp": 0.01393463, + "balance_loss_clip": 1.24203622, + "balance_loss_mlp": 1.34112978, + "epoch": 0.024229670825191642, + "flos": 22611675899520.0, + "grad_norm": 2.0705636146900686, + "language_loss": 1.08263421, + "learning_rate": 3.86242840411147e-06, + "loss": 1.11066628, + "num_input_tokens_seen": 8433660, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.52270508, + "step": 403, + "time_per_iteration": 2.9243438243865967 + }, + { + "auxiliary_loss_clip": 0.01399575, + "auxiliary_loss_mlp": 0.01398846, + "balance_loss_clip": 1.23578787, + "balance_loss_mlp": 1.34207821, + "epoch": 0.02428979407785961, + "flos": 18159338252160.0, + "grad_norm": 1.8548026670563853, + "language_loss": 1.1123991, + "learning_rate": 3.864024073288798e-06, + "loss": 1.14038324, + "num_input_tokens_seen": 8450180, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.56787109, + "step": 404, + "time_per_iteration": 2.909031629562378 + }, + { + "auxiliary_loss_clip": 0.01400673, + "auxiliary_loss_mlp": 0.01396473, + "balance_loss_clip": 1.23807812, + "balance_loss_mlp": 1.34602344, + "epoch": 0.024349917330527583, + "flos": 15313485640320.0, + "grad_norm": 1.6281389854642492, + "language_loss": 1.00279319, + "learning_rate": 3.865615797668091e-06, + "loss": 1.03076482, + "num_input_tokens_seen": 8467775, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.50439453, + "step": 405, + "time_per_iteration": 2.8969004154205322 + }, + { + "auxiliary_loss_clip": 0.01400338, + "auxiliary_loss_mlp": 0.01389689, + "balance_loss_clip": 1.23801303, + "balance_loss_mlp": 1.33988297, + "epoch": 0.024410040583195552, + "flos": 20782784833920.0, + "grad_norm": 1.7097613299573007, + "language_loss": 1.04452968, + "learning_rate": 3.867203596705844e-06, + "loss": 1.0724299, + "num_input_tokens_seen": 8486765, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.49853516, + "step": 406, + "time_per_iteration": 2.934605360031128 + }, + { + "auxiliary_loss_clip": 0.01398272, + "auxiliary_loss_mlp": 0.01386491, + "balance_loss_clip": 1.23508644, + "balance_loss_mlp": 1.33930767, + "epoch": 0.02447016383586352, + "flos": 21808885829760.0, + "grad_norm": 1.4899744765643543, + "language_loss": 0.97820926, + "learning_rate": 3.86878748971496e-06, + "loss": 1.00605679, + "num_input_tokens_seen": 8506515, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.47167969, + "step": 407, + "time_per_iteration": 2.9809410572052 + }, + { + "auxiliary_loss_clip": 0.01393676, + "auxiliary_loss_mlp": 0.0140858, + "balance_loss_clip": 1.2330811, + "balance_loss_mlp": 1.34523225, + "epoch": 0.02453028708853149, + "flos": 33961170729600.0, + "grad_norm": 1.3913886376998021, + "language_loss": 0.82262158, + "learning_rate": 3.8703674958661596e-06, + "loss": 0.85064411, + "num_input_tokens_seen": 8528035, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.63305664, + "step": 408, + "time_per_iteration": 3.0335466861724854 + }, + { + "auxiliary_loss_clip": 0.0139531, + "auxiliary_loss_mlp": 0.01388018, + "balance_loss_clip": 1.23351383, + "balance_loss_mlp": 1.34176445, + "epoch": 0.024590410341199458, + "flos": 21801510927360.0, + "grad_norm": 3.086162629915322, + "language_loss": 1.06479311, + "learning_rate": 3.871943634189376e-06, + "loss": 1.09262645, + "num_input_tokens_seen": 8546455, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.46289062, + "step": 409, + "time_per_iteration": 2.96094012260437 + }, + { + "auxiliary_loss_clip": 0.01402858, + "auxiliary_loss_mlp": 0.01397965, + "balance_loss_clip": 1.23773444, + "balance_loss_mlp": 1.34656167, + "epoch": 0.02465053359386743, + "flos": 35127911122560.0, + "grad_norm": 1.7839932470057396, + "language_loss": 0.93995953, + "learning_rate": 3.873515923575128e-06, + "loss": 0.96796781, + "num_input_tokens_seen": 8568450, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.51391602, + "step": 410, + "time_per_iteration": 3.063957452774048 + }, + { + "auxiliary_loss_clip": 0.01393456, + "auxiliary_loss_mlp": 0.01409216, + "balance_loss_clip": 1.23241997, + "balance_loss_mlp": 1.34956384, + "epoch": 0.0247106568465354, + "flos": 27462196200960.0, + "grad_norm": 1.9660835906450607, + "language_loss": 0.91820765, + "learning_rate": 3.875084382775879e-06, + "loss": 0.94623435, + "num_input_tokens_seen": 8589340, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.59643555, + "step": 411, + "time_per_iteration": 3.0753536224365234 + }, + { + "auxiliary_loss_clip": 0.01395271, + "auxiliary_loss_mlp": 0.01407578, + "balance_loss_clip": 1.23251987, + "balance_loss_mlp": 1.35293245, + "epoch": 0.024770780099203367, + "flos": 20713415276160.0, + "grad_norm": 1.8564675148160124, + "language_loss": 0.98956716, + "learning_rate": 3.87664903040738e-06, + "loss": 1.01759565, + "num_input_tokens_seen": 8607150, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.54711914, + "step": 412, + "time_per_iteration": 2.9543721675872803 + }, + { + "auxiliary_loss_clip": 0.01367631, + "auxiliary_loss_mlp": 0.01409191, + "balance_loss_clip": 1.23895931, + "balance_loss_mlp": 1.38096189, + "epoch": 0.024830903351871336, + "flos": 69581971152000.0, + "grad_norm": 0.8686447461050046, + "language_loss": 0.5884645, + "learning_rate": 3.878209884949994e-06, + "loss": 0.61623269, + "num_input_tokens_seen": 8669865, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.28320312, + "step": 413, + "time_per_iteration": 3.42899751663208 + }, + { + "auxiliary_loss_clip": 0.01393279, + "auxiliary_loss_mlp": 0.01409888, + "balance_loss_clip": 1.23052263, + "balance_loss_mlp": 1.34217715, + "epoch": 0.024891026604539304, + "flos": 32283280120320.0, + "grad_norm": 1.3623774937065019, + "language_loss": 0.88302422, + "learning_rate": 3.879766964750006e-06, + "loss": 0.91105592, + "num_input_tokens_seen": 8690235, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.67749023, + "step": 414, + "time_per_iteration": 3.034453868865967 + }, + { + "auxiliary_loss_clip": 0.01390613, + "auxiliary_loss_mlp": 0.01382706, + "balance_loss_clip": 1.23197889, + "balance_loss_mlp": 1.34036326, + "epoch": 0.024951149857207276, + "flos": 18848708311680.0, + "grad_norm": 1.6416095718939654, + "language_loss": 0.91087341, + "learning_rate": 3.881320288020917e-06, + "loss": 0.93860662, + "num_input_tokens_seen": 8706295, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.42333984, + "step": 415, + "time_per_iteration": 2.92095947265625 + }, + { + "auxiliary_loss_clip": 0.01402184, + "auxiliary_loss_mlp": 0.01395512, + "balance_loss_clip": 1.23761499, + "balance_loss_mlp": 1.34439492, + "epoch": 0.025011273109875245, + "flos": 15385524641280.0, + "grad_norm": 1.915516112430771, + "language_loss": 1.17814624, + "learning_rate": 3.882869872844723e-06, + "loss": 1.20612323, + "num_input_tokens_seen": 8724200, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.51123047, + "step": 416, + "time_per_iteration": 2.9286961555480957 + }, + { + "auxiliary_loss_clip": 0.01396875, + "auxiliary_loss_mlp": 0.01417205, + "balance_loss_clip": 1.23453426, + "balance_loss_mlp": 1.35264099, + "epoch": 0.025071396362543213, + "flos": 18924547875840.0, + "grad_norm": 1.3825791710074287, + "language_loss": 0.87238657, + "learning_rate": 3.884415737173176e-06, + "loss": 0.90052736, + "num_input_tokens_seen": 8744170, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.64599609, + "step": 417, + "time_per_iteration": 3.029409646987915 + }, + { + "auxiliary_loss_clip": 0.01397121, + "auxiliary_loss_mlp": 0.01393547, + "balance_loss_clip": 1.2375468, + "balance_loss_mlp": 1.34958267, + "epoch": 0.025131519615211182, + "flos": 25348904449920.0, + "grad_norm": 1.5118812616867696, + "language_loss": 0.8642205, + "learning_rate": 3.8859578988290344e-06, + "loss": 0.89212716, + "num_input_tokens_seen": 8765120, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.43969727, + "step": 418, + "time_per_iteration": 2.9942479133605957 + }, + { + "auxiliary_loss_clip": 0.01401634, + "auxiliary_loss_mlp": 0.01392822, + "balance_loss_clip": 1.24175024, + "balance_loss_mlp": 1.34382665, + "epoch": 0.02519164286787915, + "flos": 18962490280320.0, + "grad_norm": 1.9549665690522424, + "language_loss": 1.01676965, + "learning_rate": 3.887496375507294e-06, + "loss": 1.04471409, + "num_input_tokens_seen": 8783500, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.49023438, + "step": 419, + "time_per_iteration": 3.0266122817993164 + }, + { + "auxiliary_loss_clip": 0.01398229, + "auxiliary_loss_mlp": 0.01387285, + "balance_loss_clip": 1.24019802, + "balance_loss_mlp": 1.33995903, + "epoch": 0.025251766120547123, + "flos": 17430125506560.0, + "grad_norm": 1.4827028826682658, + "language_loss": 0.84841162, + "learning_rate": 3.8890311847764065e-06, + "loss": 0.87626672, + "num_input_tokens_seen": 8801175, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.47314453, + "step": 420, + "time_per_iteration": 2.925823926925659 + }, + { + "auxiliary_loss_clip": 0.01403063, + "auxiliary_loss_mlp": 0.01389467, + "balance_loss_clip": 1.24438667, + "balance_loss_mlp": 1.33997166, + "epoch": 0.02531188937321509, + "flos": 25056133476480.0, + "grad_norm": 1.5284838003167749, + "language_loss": 0.88954908, + "learning_rate": 3.890562344079484e-06, + "loss": 0.91747439, + "num_input_tokens_seen": 8820215, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.49487305, + "step": 421, + "time_per_iteration": 2.99147367477417 + }, + { + "auxiliary_loss_clip": 0.01404969, + "auxiliary_loss_mlp": 0.01412558, + "balance_loss_clip": 1.24627173, + "balance_loss_mlp": 1.34668326, + "epoch": 0.02537201262588306, + "flos": 30604620349440.0, + "grad_norm": 1.6931249223096987, + "language_loss": 0.97400844, + "learning_rate": 3.89208987073549e-06, + "loss": 1.00218368, + "num_input_tokens_seen": 8839660, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.65893555, + "step": 422, + "time_per_iteration": 3.0301687717437744 + }, + { + "auxiliary_loss_clip": 0.01404636, + "auxiliary_loss_mlp": 0.01395875, + "balance_loss_clip": 1.24663889, + "balance_loss_mlp": 1.35062301, + "epoch": 0.02543213587855103, + "flos": 26075900200320.0, + "grad_norm": 1.4579814032477771, + "language_loss": 0.93270713, + "learning_rate": 3.893613781940409e-06, + "loss": 0.96071231, + "num_input_tokens_seen": 8859280, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.45263672, + "step": 423, + "time_per_iteration": 2.9439830780029297 + }, + { + "auxiliary_loss_clip": 0.01401004, + "auxiliary_loss_mlp": 0.01395924, + "balance_loss_clip": 1.24583828, + "balance_loss_mlp": 1.34914601, + "epoch": 0.025492259131218997, + "flos": 36035117487360.0, + "grad_norm": 1.473053770949093, + "language_loss": 0.84502828, + "learning_rate": 3.895134094768415e-06, + "loss": 0.87299764, + "num_input_tokens_seen": 8880560, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.46728516, + "step": 424, + "time_per_iteration": 3.1030938625335693 + }, + { + "auxiliary_loss_clip": 0.01398381, + "auxiliary_loss_mlp": 0.01398871, + "balance_loss_clip": 1.24246073, + "balance_loss_mlp": 1.35521662, + "epoch": 0.02555238238388697, + "flos": 18597227857920.0, + "grad_norm": 1.7119743807831254, + "language_loss": 0.97581804, + "learning_rate": 3.896650826173015e-06, + "loss": 1.00379062, + "num_input_tokens_seen": 8899155, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.43652344, + "step": 425, + "time_per_iteration": 2.9173872470855713 + }, + { + "auxiliary_loss_clip": 0.01401549, + "auxiliary_loss_mlp": 0.01394044, + "balance_loss_clip": 1.24450099, + "balance_loss_mlp": 1.35215354, + "epoch": 0.025612505636554938, + "flos": 24253795854720.0, + "grad_norm": 1.8073242196451935, + "language_loss": 0.99149489, + "learning_rate": 3.898163992988186e-06, + "loss": 1.01945078, + "num_input_tokens_seen": 8917890, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.41894531, + "step": 426, + "time_per_iteration": 2.956328868865967 + }, + { + "auxiliary_loss_clip": 0.01364835, + "auxiliary_loss_mlp": 0.0135326, + "balance_loss_clip": 1.24177933, + "balance_loss_mlp": 1.33027601, + "epoch": 0.025672628889222907, + "flos": 60617472952320.0, + "grad_norm": 0.8757884556402402, + "language_loss": 0.57315564, + "learning_rate": 3.899673611929491e-06, + "loss": 0.60033655, + "num_input_tokens_seen": 8978260, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.22949219, + "step": 427, + "time_per_iteration": 3.5057671070098877 + }, + { + "auxiliary_loss_clip": 0.01395341, + "auxiliary_loss_mlp": 0.0138328, + "balance_loss_clip": 1.24001908, + "balance_loss_mlp": 1.34282017, + "epoch": 0.025732752141890875, + "flos": 19582717006080.0, + "grad_norm": 1.8159926443199037, + "language_loss": 1.01561594, + "learning_rate": 3.901179699595194e-06, + "loss": 1.04340219, + "num_input_tokens_seen": 8994460, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.40429688, + "step": 428, + "time_per_iteration": 2.901517391204834 + }, + { + "auxiliary_loss_clip": 0.01392683, + "auxiliary_loss_mlp": 0.01386575, + "balance_loss_clip": 1.23747241, + "balance_loss_mlp": 1.34106064, + "epoch": 0.025792875394558847, + "flos": 31296524117760.0, + "grad_norm": 1.385685399917727, + "language_loss": 0.92941296, + "learning_rate": 3.902682272467353e-06, + "loss": 0.95720553, + "num_input_tokens_seen": 9016670, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.45507812, + "step": 429, + "time_per_iteration": 3.0726804733276367 + }, + { + "auxiliary_loss_clip": 0.01397484, + "auxiliary_loss_mlp": 0.0138321, + "balance_loss_clip": 1.23908448, + "balance_loss_mlp": 1.34217799, + "epoch": 0.025852998647226816, + "flos": 32392356629760.0, + "grad_norm": 1.6523392557458885, + "language_loss": 0.96333861, + "learning_rate": 3.904181346912895e-06, + "loss": 0.99114561, + "num_input_tokens_seen": 9039720, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.41040039, + "step": 430, + "time_per_iteration": 3.0614352226257324 + }, + { + "auxiliary_loss_clip": 0.01404627, + "auxiliary_loss_mlp": 0.0139908, + "balance_loss_clip": 1.24561357, + "balance_loss_mlp": 1.34443426, + "epoch": 0.025913121899894784, + "flos": 20202853242240.0, + "grad_norm": 1.2954869332652592, + "language_loss": 0.93149275, + "learning_rate": 3.905676939184698e-06, + "loss": 0.95952982, + "num_input_tokens_seen": 9059850, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.54711914, + "step": 431, + "time_per_iteration": 2.9164254665374756 + }, + { + "auxiliary_loss_clip": 0.01399544, + "auxiliary_loss_mlp": 0.0138687, + "balance_loss_clip": 1.24134183, + "balance_loss_mlp": 1.34173787, + "epoch": 0.025973245152562753, + "flos": 14728577120640.0, + "grad_norm": 1.7946940387086892, + "language_loss": 1.02198172, + "learning_rate": 3.907169065422638e-06, + "loss": 1.04984593, + "num_input_tokens_seen": 9077590, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.45166016, + "step": 432, + "time_per_iteration": 2.9389102458953857 + }, + { + "auxiliary_loss_clip": 0.013944, + "auxiliary_loss_mlp": 0.01380363, + "balance_loss_clip": 1.23937201, + "balance_loss_mlp": 1.34104741, + "epoch": 0.02603336840523072, + "flos": 31005336712320.0, + "grad_norm": 1.5179713286549281, + "language_loss": 0.88159579, + "learning_rate": 3.908657741654636e-06, + "loss": 0.90934348, + "num_input_tokens_seen": 9099880, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.39331055, + "step": 433, + "time_per_iteration": 4.500396013259888 + }, + { + "auxiliary_loss_clip": 0.0139473, + "auxiliary_loss_mlp": 0.01393043, + "balance_loss_clip": 1.23726118, + "balance_loss_mlp": 1.34073412, + "epoch": 0.026093491657898694, + "flos": 17682782325120.0, + "grad_norm": 1.720657523640355, + "language_loss": 1.00807548, + "learning_rate": 3.910142983797699e-06, + "loss": 1.03595328, + "num_input_tokens_seen": 9118620, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.52294922, + "step": 434, + "time_per_iteration": 2.913849353790283 + }, + { + "auxiliary_loss_clip": 0.01397763, + "auxiliary_loss_mlp": 0.01390417, + "balance_loss_clip": 1.23954082, + "balance_loss_mlp": 1.34719181, + "epoch": 0.026153614910566662, + "flos": 17866748257920.0, + "grad_norm": 1.7119908151992933, + "language_loss": 0.94537556, + "learning_rate": 3.9116248076589305e-06, + "loss": 0.9732573, + "num_input_tokens_seen": 9135655, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.43212891, + "step": 435, + "time_per_iteration": 5.990097761154175 + }, + { + "auxiliary_loss_clip": 0.01391564, + "auxiliary_loss_mlp": 0.01390757, + "balance_loss_clip": 1.23408782, + "balance_loss_mlp": 1.34517109, + "epoch": 0.02621373816323463, + "flos": 20020697101440.0, + "grad_norm": 1.7765572630960804, + "language_loss": 1.00634158, + "learning_rate": 3.913103228936546e-06, + "loss": 1.03416479, + "num_input_tokens_seen": 9153520, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.45556641, + "step": 436, + "time_per_iteration": 4.30284571647644 + }, + { + "auxiliary_loss_clip": 0.01386504, + "auxiliary_loss_mlp": 0.01390833, + "balance_loss_clip": 1.23267043, + "balance_loss_mlp": 1.34429348, + "epoch": 0.0262738614159026, + "flos": 19290534215040.0, + "grad_norm": 1.7667629061327987, + "language_loss": 0.90416002, + "learning_rate": 3.914578263220868e-06, + "loss": 0.93193334, + "num_input_tokens_seen": 9170750, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.46533203, + "step": 437, + "time_per_iteration": 2.9344027042388916 + }, + { + "auxiliary_loss_clip": 0.01392165, + "auxiliary_loss_mlp": 0.01395532, + "balance_loss_clip": 1.23491108, + "balance_loss_mlp": 1.34195948, + "epoch": 0.026333984668570568, + "flos": 18816511996800.0, + "grad_norm": 1.871606671805799, + "language_loss": 1.0253365, + "learning_rate": 3.916049925995316e-06, + "loss": 1.05321348, + "num_input_tokens_seen": 9188430, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.53540039, + "step": 438, + "time_per_iteration": 2.9069290161132812 + }, + { + "auxiliary_loss_clip": 0.01341885, + "auxiliary_loss_mlp": 0.01368823, + "balance_loss_clip": 1.21738458, + "balance_loss_mlp": 1.35127509, + "epoch": 0.02639410792123854, + "flos": 64605696992640.0, + "grad_norm": 0.8881325415306814, + "language_loss": 0.62714016, + "learning_rate": 3.917518232637377e-06, + "loss": 0.65424728, + "num_input_tokens_seen": 9255835, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.17578125, + "step": 439, + "time_per_iteration": 3.4116811752319336 + }, + { + "auxiliary_loss_clip": 0.0139053, + "auxiliary_loss_mlp": 0.0139002, + "balance_loss_clip": 1.23565936, + "balance_loss_mlp": 1.34619856, + "epoch": 0.02645423117390651, + "flos": 28484632368000.0, + "grad_norm": 1.6358271261305932, + "language_loss": 0.87253404, + "learning_rate": 3.918983198419573e-06, + "loss": 0.90033948, + "num_input_tokens_seen": 9276835, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.43823242, + "step": 440, + "time_per_iteration": 3.0350475311279297 + }, + { + "auxiliary_loss_clip": 0.01391654, + "auxiliary_loss_mlp": 0.01393591, + "balance_loss_clip": 1.23760378, + "balance_loss_mlp": 1.35191536, + "epoch": 0.026514354426574478, + "flos": 18560326083840.0, + "grad_norm": 1.545352241203962, + "language_loss": 0.94404054, + "learning_rate": 3.920444838510415e-06, + "loss": 0.97189295, + "num_input_tokens_seen": 9295075, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.41674805, + "step": 441, + "time_per_iteration": 2.8807730674743652 + }, + { + "auxiliary_loss_clip": 0.01395765, + "auxiliary_loss_mlp": 0.01409837, + "balance_loss_clip": 1.24056935, + "balance_loss_mlp": 1.36437011, + "epoch": 0.026574477679242446, + "flos": 20677554132480.0, + "grad_norm": 1.6466820553337695, + "language_loss": 0.89875805, + "learning_rate": 3.92190316797534e-06, + "loss": 0.92681408, + "num_input_tokens_seen": 9314205, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.45507812, + "step": 442, + "time_per_iteration": 2.946699619293213 + }, + { + "auxiliary_loss_clip": 0.01330035, + "auxiliary_loss_mlp": 0.01364959, + "balance_loss_clip": 1.20807445, + "balance_loss_mlp": 1.34769762, + "epoch": 0.026634600931910415, + "flos": 57984977410560.0, + "grad_norm": 0.961870205521472, + "language_loss": 0.64679015, + "learning_rate": 3.92335820177765e-06, + "loss": 0.67374015, + "num_input_tokens_seen": 9367395, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.17285156, + "step": 443, + "time_per_iteration": 3.2001097202301025 + }, + { + "auxiliary_loss_clip": 0.01393847, + "auxiliary_loss_mlp": 0.01413296, + "balance_loss_clip": 1.23834443, + "balance_loss_mlp": 1.37104845, + "epoch": 0.026694724184578387, + "flos": 15823595226240.0, + "grad_norm": 1.696066373000528, + "language_loss": 0.98958337, + "learning_rate": 3.924809954779425e-06, + "loss": 1.01765478, + "num_input_tokens_seen": 9385185, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.42236328, + "step": 444, + "time_per_iteration": 2.8932900428771973 + }, + { + "auxiliary_loss_clip": 0.01399802, + "auxiliary_loss_mlp": 0.01415403, + "balance_loss_clip": 1.24138057, + "balance_loss_mlp": 1.36862528, + "epoch": 0.026754847437246355, + "flos": 23450598581760.0, + "grad_norm": 1.891215762411875, + "language_loss": 1.05967486, + "learning_rate": 3.9262584417424425e-06, + "loss": 1.08782685, + "num_input_tokens_seen": 9403225, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.46801758, + "step": 445, + "time_per_iteration": 2.998979091644287 + }, + { + "auxiliary_loss_clip": 0.01395505, + "auxiliary_loss_mlp": 0.01414267, + "balance_loss_clip": 1.23962259, + "balance_loss_mlp": 1.36517704, + "epoch": 0.026814970689914324, + "flos": 17349173280000.0, + "grad_norm": 1.7231729153345738, + "language_loss": 1.06627774, + "learning_rate": 3.9277036773290725e-06, + "loss": 1.09437537, + "num_input_tokens_seen": 9420540, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.49121094, + "step": 446, + "time_per_iteration": 2.9196202754974365 + }, + { + "auxiliary_loss_clip": 0.01396812, + "auxiliary_loss_mlp": 0.01418474, + "balance_loss_clip": 1.2432183, + "balance_loss_mlp": 1.37448537, + "epoch": 0.026875093942582293, + "flos": 17903695276800.0, + "grad_norm": 1.60495784845932, + "language_loss": 0.90625036, + "learning_rate": 3.92914567610317e-06, + "loss": 0.93440318, + "num_input_tokens_seen": 9438840, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.43994141, + "step": 447, + "time_per_iteration": 3.000779390335083 + }, + { + "auxiliary_loss_clip": 0.01402573, + "auxiliary_loss_mlp": 0.01407204, + "balance_loss_clip": 1.24788427, + "balance_loss_mlp": 1.3569926, + "epoch": 0.026935217195250265, + "flos": 21733498713600.0, + "grad_norm": 1.6459954513717325, + "language_loss": 0.97970629, + "learning_rate": 3.930584452530952e-06, + "loss": 1.00780404, + "num_input_tokens_seen": 9457215, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.50244141, + "step": 448, + "time_per_iteration": 2.9637033939361572 + }, + { + "auxiliary_loss_clip": 0.01407288, + "auxiliary_loss_mlp": 0.01387741, + "balance_loss_clip": 1.25114012, + "balance_loss_mlp": 1.34592271, + "epoch": 0.026995340447918233, + "flos": 23633071436160.0, + "grad_norm": 1.9923106691001402, + "language_loss": 0.97877586, + "learning_rate": 3.9320200209818755e-06, + "loss": 1.00672626, + "num_input_tokens_seen": 9475615, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.41845703, + "step": 449, + "time_per_iteration": 2.9294071197509766 + }, + { + "auxiliary_loss_clip": 0.01422271, + "auxiliary_loss_mlp": 0.0140812, + "balance_loss_clip": 1.26023197, + "balance_loss_mlp": 1.34660769, + "epoch": 0.027055463700586202, + "flos": 17940280337280.0, + "grad_norm": 1.6448201583312032, + "language_loss": 0.95780754, + "learning_rate": 3.933452395729493e-06, + "loss": 0.9861114, + "num_input_tokens_seen": 9493975, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.61572266, + "step": 450, + "time_per_iteration": 2.943301200866699 + }, + { + "auxiliary_loss_clip": 0.01420491, + "auxiliary_loss_mlp": 0.0140163, + "balance_loss_clip": 1.26398015, + "balance_loss_mlp": 1.35666466, + "epoch": 0.02711558695325417, + "flos": 25129756045440.0, + "grad_norm": 1.3423579669924286, + "language_loss": 0.87928998, + "learning_rate": 3.934881590952304e-06, + "loss": 0.90751117, + "num_input_tokens_seen": 9514810, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.44970703, + "step": 451, + "time_per_iteration": 3.01282000541687 + }, + { + "auxiliary_loss_clip": 0.01413283, + "auxiliary_loss_mlp": 0.01404288, + "balance_loss_clip": 1.25681424, + "balance_loss_mlp": 1.36220717, + "epoch": 0.02717571020592214, + "flos": 24249226129920.0, + "grad_norm": 1.3995550304768931, + "language_loss": 0.82752109, + "learning_rate": 3.936307620734599e-06, + "loss": 0.8556968, + "num_input_tokens_seen": 9533635, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.42114258, + "step": 452, + "time_per_iteration": 3.0229203701019287 + }, + { + "auxiliary_loss_clip": 0.01414978, + "auxiliary_loss_mlp": 0.01409309, + "balance_loss_clip": 1.25545239, + "balance_loss_mlp": 1.3611964, + "epoch": 0.02723583345859011, + "flos": 25129801290240.0, + "grad_norm": 1.3968705017557292, + "language_loss": 0.79238933, + "learning_rate": 3.937730499067294e-06, + "loss": 0.82063222, + "num_input_tokens_seen": 9555420, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.48120117, + "step": 453, + "time_per_iteration": 3.002072334289551 + }, + { + "auxiliary_loss_clip": 0.01400529, + "auxiliary_loss_mlp": 0.01393735, + "balance_loss_clip": 1.245749, + "balance_loss_mlp": 1.35251212, + "epoch": 0.02729595671125808, + "flos": 42757040983680.0, + "grad_norm": 1.5344341816424516, + "language_loss": 0.93651056, + "learning_rate": 3.939150239848748e-06, + "loss": 0.96445322, + "num_input_tokens_seen": 9578950, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.41235352, + "step": 454, + "time_per_iteration": 3.089520215988159 + }, + { + "auxiliary_loss_clip": 0.01395591, + "auxiliary_loss_mlp": 0.01401952, + "balance_loss_clip": 1.24100053, + "balance_loss_mlp": 1.3535533, + "epoch": 0.02735607996392605, + "flos": 21440139557760.0, + "grad_norm": 1.3592123467678543, + "language_loss": 0.83798641, + "learning_rate": 3.9405668568855866e-06, + "loss": 0.86596179, + "num_input_tokens_seen": 9598160, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.48388672, + "step": 455, + "time_per_iteration": 2.975954532623291 + }, + { + "auxiliary_loss_clip": 0.01395962, + "auxiliary_loss_mlp": 0.01386229, + "balance_loss_clip": 1.23839784, + "balance_loss_mlp": 1.34321856, + "epoch": 0.027416203216594017, + "flos": 20861339086080.0, + "grad_norm": 1.5838004258745126, + "language_loss": 0.92867589, + "learning_rate": 3.941980363893499e-06, + "loss": 0.95649779, + "num_input_tokens_seen": 9616010, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.42993164, + "step": 456, + "time_per_iteration": 2.9180235862731934 + }, + { + "auxiliary_loss_clip": 0.01395749, + "auxiliary_loss_mlp": 0.01406482, + "balance_loss_clip": 1.23816264, + "balance_loss_mlp": 1.355937, + "epoch": 0.027476326469261986, + "flos": 13232254469760.0, + "grad_norm": 1.4667757240477777, + "language_loss": 0.9012388, + "learning_rate": 3.9433907744980384e-06, + "loss": 0.92926109, + "num_input_tokens_seen": 9634000, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.50537109, + "step": 457, + "time_per_iteration": 2.953564405441284 + }, + { + "auxiliary_loss_clip": 0.01393254, + "auxiliary_loss_mlp": 0.01415634, + "balance_loss_clip": 1.23626959, + "balance_loss_mlp": 1.36966717, + "epoch": 0.027536449721929958, + "flos": 24035145143040.0, + "grad_norm": 1.7860379551689107, + "language_loss": 1.04052508, + "learning_rate": 3.944798102235412e-06, + "loss": 1.06861389, + "num_input_tokens_seen": 9653455, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.45922852, + "step": 458, + "time_per_iteration": 3.1575000286102295 + }, + { + "auxiliary_loss_clip": 0.01392861, + "auxiliary_loss_mlp": 0.01424783, + "balance_loss_clip": 1.23459816, + "balance_loss_mlp": 1.37028027, + "epoch": 0.027596572974597926, + "flos": 13013875226880.0, + "grad_norm": 1.7813968559654139, + "language_loss": 0.95093888, + "learning_rate": 3.9462023605532545e-06, + "loss": 0.97911531, + "num_input_tokens_seen": 9669650, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.54516602, + "step": 459, + "time_per_iteration": 2.9447152614593506 + }, + { + "auxiliary_loss_clip": 0.01388817, + "auxiliary_loss_mlp": 0.01383823, + "balance_loss_clip": 1.23493648, + "balance_loss_mlp": 1.3450563, + "epoch": 0.027656696227265895, + "flos": 26154861655680.0, + "grad_norm": 1.4659047235051597, + "language_loss": 0.91302001, + "learning_rate": 3.947603562811407e-06, + "loss": 0.94074649, + "num_input_tokens_seen": 9691415, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.38745117, + "step": 460, + "time_per_iteration": 3.053457260131836 + }, + { + "auxiliary_loss_clip": 0.01345289, + "auxiliary_loss_mlp": 0.01352943, + "balance_loss_clip": 1.22423649, + "balance_loss_mlp": 1.3336786, + "epoch": 0.027716819479933864, + "flos": 60727997295360.0, + "grad_norm": 1.5583128175533738, + "language_loss": 0.73720002, + "learning_rate": 3.949001722282675e-06, + "loss": 0.76418233, + "num_input_tokens_seen": 9755605, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.19238281, + "step": 461, + "time_per_iteration": 3.3921680450439453 + }, + { + "auxiliary_loss_clip": 0.01393577, + "auxiliary_loss_mlp": 0.01393772, + "balance_loss_clip": 1.23783088, + "balance_loss_mlp": 1.35154796, + "epoch": 0.027776942732601832, + "flos": 31224349382400.0, + "grad_norm": 1.8583306578271923, + "language_loss": 0.95236367, + "learning_rate": 3.950396852153582e-06, + "loss": 0.98023719, + "num_input_tokens_seen": 9776270, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.42236328, + "step": 462, + "time_per_iteration": 3.0480215549468994 + }, + { + "auxiliary_loss_clip": 0.01385656, + "auxiliary_loss_mlp": 0.0140004, + "balance_loss_clip": 1.23103058, + "balance_loss_mlp": 1.35893643, + "epoch": 0.027837065985269804, + "flos": 22684936510080.0, + "grad_norm": 1.812960128268221, + "language_loss": 1.04469562, + "learning_rate": 3.951788965525118e-06, + "loss": 1.07255244, + "num_input_tokens_seen": 9794465, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.41064453, + "step": 463, + "time_per_iteration": 3.029693841934204 + }, + { + "auxiliary_loss_clip": 0.01342879, + "auxiliary_loss_mlp": 0.01374128, + "balance_loss_clip": 1.22463417, + "balance_loss_mlp": 1.35209787, + "epoch": 0.027897189237937773, + "flos": 62210882240640.0, + "grad_norm": 0.8934585770852488, + "language_loss": 0.59342104, + "learning_rate": 3.953178075413476e-06, + "loss": 0.6205911, + "num_input_tokens_seen": 9849685, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.22070312, + "step": 464, + "time_per_iteration": 3.3332359790802 + }, + { + "auxiliary_loss_clip": 0.01402421, + "auxiliary_loss_mlp": 0.01401433, + "balance_loss_clip": 1.24197197, + "balance_loss_mlp": 1.35327208, + "epoch": 0.02795731249060574, + "flos": 24502425886080.0, + "grad_norm": 1.631430914523931, + "language_loss": 0.97409999, + "learning_rate": 3.954564194750784e-06, + "loss": 1.00213861, + "num_input_tokens_seen": 9869505, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.48144531, + "step": 465, + "time_per_iteration": 3.0281388759613037 + }, + { + "auxiliary_loss_clip": 0.01386456, + "auxiliary_loss_mlp": 0.01382408, + "balance_loss_clip": 1.23317468, + "balance_loss_mlp": 1.34128129, + "epoch": 0.02801743574327371, + "flos": 23743641024000.0, + "grad_norm": 1.62046004387898, + "language_loss": 0.87181485, + "learning_rate": 3.955947336385828e-06, + "loss": 0.89950347, + "num_input_tokens_seen": 9890950, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.41137695, + "step": 466, + "time_per_iteration": 3.0103933811187744 + }, + { + "auxiliary_loss_clip": 0.01384214, + "auxiliary_loss_mlp": 0.01385689, + "balance_loss_clip": 1.23188627, + "balance_loss_mlp": 1.34317899, + "epoch": 0.02807755899594168, + "flos": 20638254384000.0, + "grad_norm": 1.592974291934706, + "language_loss": 0.94272721, + "learning_rate": 3.957327513084761e-06, + "loss": 0.9704262, + "num_input_tokens_seen": 9911265, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.42504883, + "step": 467, + "time_per_iteration": 2.9819204807281494 + }, + { + "auxiliary_loss_clip": 0.01391666, + "auxiliary_loss_mlp": 0.01401553, + "balance_loss_clip": 1.23797643, + "balance_loss_mlp": 1.35858989, + "epoch": 0.02813768224860965, + "flos": 19253858664960.0, + "grad_norm": 1.7023492181544089, + "language_loss": 0.96316338, + "learning_rate": 3.958704737531818e-06, + "loss": 0.99109554, + "num_input_tokens_seen": 9929025, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.42944336, + "step": 468, + "time_per_iteration": 4.536728858947754 + }, + { + "auxiliary_loss_clip": 0.0139219, + "auxiliary_loss_mlp": 0.01397046, + "balance_loss_clip": 1.23710251, + "balance_loss_mlp": 1.35136557, + "epoch": 0.02819780550127762, + "flos": 20823803884800.0, + "grad_norm": 1.7877427659459773, + "language_loss": 1.04302728, + "learning_rate": 3.9600790223300065e-06, + "loss": 1.07091963, + "num_input_tokens_seen": 9945190, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.45678711, + "step": 469, + "time_per_iteration": 3.0760984420776367 + }, + { + "auxiliary_loss_clip": 0.01386101, + "auxiliary_loss_mlp": 0.01400987, + "balance_loss_clip": 1.23119903, + "balance_loss_mlp": 1.35416174, + "epoch": 0.028257928753945588, + "flos": 19983569103360.0, + "grad_norm": 1.741033123569448, + "language_loss": 0.98274845, + "learning_rate": 3.96145038000181e-06, + "loss": 1.0106194, + "num_input_tokens_seen": 9962820, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.46850586, + "step": 470, + "time_per_iteration": 6.044286251068115 + }, + { + "auxiliary_loss_clip": 0.01392183, + "auxiliary_loss_mlp": 0.01396169, + "balance_loss_clip": 1.23528922, + "balance_loss_mlp": 1.35284841, + "epoch": 0.028318052006613557, + "flos": 20493950158080.0, + "grad_norm": 1.5820727714074132, + "language_loss": 1.04733515, + "learning_rate": 3.962818822989861e-06, + "loss": 1.07521868, + "num_input_tokens_seen": 9982595, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.43334961, + "step": 471, + "time_per_iteration": 3.0550925731658936 + }, + { + "auxiliary_loss_clip": 0.01387844, + "auxiliary_loss_mlp": 0.01416427, + "balance_loss_clip": 1.23063016, + "balance_loss_mlp": 1.36204338, + "epoch": 0.02837817525928153, + "flos": 28527280231680.0, + "grad_norm": 1.5595663105136184, + "language_loss": 0.86103892, + "learning_rate": 3.964184363657625e-06, + "loss": 0.8890816, + "num_input_tokens_seen": 10004645, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.54418945, + "step": 472, + "time_per_iteration": 3.018585205078125 + }, + { + "auxiliary_loss_clip": 0.01388575, + "auxiliary_loss_mlp": 0.01418744, + "balance_loss_clip": 1.23426294, + "balance_loss_mlp": 1.3634789, + "epoch": 0.028438298511949497, + "flos": 18560869021440.0, + "grad_norm": 1.5753504677617136, + "language_loss": 1.0451405, + "learning_rate": 3.965547014290071e-06, + "loss": 1.07321382, + "num_input_tokens_seen": 10022555, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.55297852, + "step": 473, + "time_per_iteration": 2.959052085876465 + }, + { + "auxiliary_loss_clip": 0.0139158, + "auxiliary_loss_mlp": 0.01415794, + "balance_loss_clip": 1.23660386, + "balance_loss_mlp": 1.36901641, + "epoch": 0.028498421764617466, + "flos": 16919111024640.0, + "grad_norm": 1.7869397134839766, + "language_loss": 1.04671931, + "learning_rate": 3.96690678709433e-06, + "loss": 1.07479298, + "num_input_tokens_seen": 10041025, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.4675293, + "step": 474, + "time_per_iteration": 2.8996384143829346 + }, + { + "auxiliary_loss_clip": 0.01385194, + "auxiliary_loss_mlp": 0.0139348, + "balance_loss_clip": 1.23230219, + "balance_loss_mlp": 1.3457005, + "epoch": 0.028558545017285435, + "flos": 27789289994880.0, + "grad_norm": 1.7831213893897782, + "language_loss": 0.89693069, + "learning_rate": 3.968263694200355e-06, + "loss": 0.92471743, + "num_input_tokens_seen": 10060775, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.47753906, + "step": 475, + "time_per_iteration": 3.032871723175049 + }, + { + "auxiliary_loss_clip": 0.01346133, + "auxiliary_loss_mlp": 0.01425698, + "balance_loss_clip": 1.22225761, + "balance_loss_mlp": 1.39651585, + "epoch": 0.028618668269953403, + "flos": 65685150887040.0, + "grad_norm": 0.9710452519682823, + "language_loss": 0.67209744, + "learning_rate": 3.969617747661569e-06, + "loss": 0.69981575, + "num_input_tokens_seen": 10120225, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.29101562, + "step": 476, + "time_per_iteration": 3.3518314361572266 + }, + { + "auxiliary_loss_clip": 0.01397256, + "auxiliary_loss_mlp": 0.01419079, + "balance_loss_clip": 1.23945737, + "balance_loss_mlp": 1.36536384, + "epoch": 0.028678791522621375, + "flos": 21945136481280.0, + "grad_norm": 1.7313659423111778, + "language_loss": 0.98940587, + "learning_rate": 3.970968959455509e-06, + "loss": 1.01756918, + "num_input_tokens_seen": 10137880, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.53735352, + "step": 477, + "time_per_iteration": 2.969330310821533 + }, + { + "auxiliary_loss_clip": 0.01391901, + "auxiliary_loss_mlp": 0.01432608, + "balance_loss_clip": 1.23873854, + "balance_loss_mlp": 1.37786734, + "epoch": 0.028738914775289344, + "flos": 24582970909440.0, + "grad_norm": 1.8757472518641047, + "language_loss": 0.95890367, + "learning_rate": 3.97231734148446e-06, + "loss": 0.98714876, + "num_input_tokens_seen": 10156930, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.54760742, + "step": 478, + "time_per_iteration": 2.9957146644592285 + }, + { + "auxiliary_loss_clip": 0.01387514, + "auxiliary_loss_mlp": 0.01433185, + "balance_loss_clip": 1.23384476, + "balance_loss_mlp": 1.37596464, + "epoch": 0.028799038027957313, + "flos": 23268306706560.0, + "grad_norm": 1.4470676800656015, + "language_loss": 0.92238498, + "learning_rate": 3.973662905576082e-06, + "loss": 0.95059198, + "num_input_tokens_seen": 10176295, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.57226562, + "step": 479, + "time_per_iteration": 3.0036611557006836 + }, + { + "auxiliary_loss_clip": 0.0138214, + "auxiliary_loss_mlp": 0.01407357, + "balance_loss_clip": 1.22935414, + "balance_loss_mlp": 1.36384535, + "epoch": 0.02885916128062528, + "flos": 22174329231360.0, + "grad_norm": 1.7120811404400147, + "language_loss": 0.8532092, + "learning_rate": 3.975005663484038e-06, + "loss": 0.88110411, + "num_input_tokens_seen": 10195790, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.43505859, + "step": 480, + "time_per_iteration": 3.052868366241455 + }, + { + "auxiliary_loss_clip": 0.0138378, + "auxiliary_loss_mlp": 0.01387542, + "balance_loss_clip": 1.23049438, + "balance_loss_mlp": 1.34474635, + "epoch": 0.02891928453329325, + "flos": 22943927600640.0, + "grad_norm": 1.435140853335933, + "language_loss": 0.95851576, + "learning_rate": 3.976345626888605e-06, + "loss": 0.98622894, + "num_input_tokens_seen": 10218405, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.42797852, + "step": 481, + "time_per_iteration": 3.131495475769043 + }, + { + "auxiliary_loss_clip": 0.01328846, + "auxiliary_loss_mlp": 0.0136574, + "balance_loss_clip": 1.20733905, + "balance_loss_mlp": 1.34428215, + "epoch": 0.028979407785961222, + "flos": 57458443962240.0, + "grad_norm": 0.8443228180500754, + "language_loss": 0.6636076, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.69055343, + "num_input_tokens_seen": 10271005, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.21484375, + "step": 482, + "time_per_iteration": 3.1530187129974365 + }, + { + "auxiliary_loss_clip": 0.01387044, + "auxiliary_loss_mlp": 0.01398808, + "balance_loss_clip": 1.2309773, + "balance_loss_mlp": 1.34986019, + "epoch": 0.02903953103862919, + "flos": 16730258653440.0, + "grad_norm": 1.9488692586413054, + "language_loss": 0.98439902, + "learning_rate": 3.979017216545415e-06, + "loss": 1.01225758, + "num_input_tokens_seen": 10288405, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.48974609, + "step": 483, + "time_per_iteration": 2.965873956680298 + }, + { + "auxiliary_loss_clip": 0.01393316, + "auxiliary_loss_mlp": 0.01407522, + "balance_loss_clip": 1.23676753, + "balance_loss_mlp": 1.36038697, + "epoch": 0.02909965429129716, + "flos": 16772318334720.0, + "grad_norm": 1.4540944066892914, + "language_loss": 0.87004936, + "learning_rate": 3.980348865796749e-06, + "loss": 0.8980577, + "num_input_tokens_seen": 10306875, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.47119141, + "step": 484, + "time_per_iteration": 3.045238971710205 + }, + { + "auxiliary_loss_clip": 0.01391126, + "auxiliary_loss_mlp": 0.01416725, + "balance_loss_clip": 1.23546386, + "balance_loss_mlp": 1.35139823, + "epoch": 0.029159777543965128, + "flos": 19793087919360.0, + "grad_norm": 1.8418630867811154, + "language_loss": 0.94000095, + "learning_rate": 3.9816777665440615e-06, + "loss": 0.96807945, + "num_input_tokens_seen": 10323965, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.65356445, + "step": 485, + "time_per_iteration": 3.074298143386841 + }, + { + "auxiliary_loss_clip": 0.01386934, + "auxiliary_loss_mlp": 0.01386102, + "balance_loss_clip": 1.23393691, + "balance_loss_mlp": 1.34340167, + "epoch": 0.029219900796633096, + "flos": 19651226912640.0, + "grad_norm": 1.6911738387022914, + "language_loss": 1.00434935, + "learning_rate": 3.983003930109732e-06, + "loss": 1.0320797, + "num_input_tokens_seen": 10342620, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.42675781, + "step": 486, + "time_per_iteration": 3.085508108139038 + }, + { + "auxiliary_loss_clip": 0.01389204, + "auxiliary_loss_mlp": 0.01388613, + "balance_loss_clip": 1.2342732, + "balance_loss_mlp": 1.34555435, + "epoch": 0.02928002404930107, + "flos": 25896911195520.0, + "grad_norm": 1.4838180744284526, + "language_loss": 0.98852217, + "learning_rate": 3.984327367746315e-06, + "loss": 1.01630032, + "num_input_tokens_seen": 10364610, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.43041992, + "step": 487, + "time_per_iteration": 2.9954655170440674 + }, + { + "auxiliary_loss_clip": 0.01396275, + "auxiliary_loss_mlp": 0.01394573, + "balance_loss_clip": 1.24080586, + "balance_loss_mlp": 1.3440758, + "epoch": 0.029340147301969037, + "flos": 20668007479680.0, + "grad_norm": 2.1816299400909656, + "language_loss": 1.05241919, + "learning_rate": 3.985648090637122e-06, + "loss": 1.08032775, + "num_input_tokens_seen": 10380910, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.50537109, + "step": 488, + "time_per_iteration": 2.9837381839752197 + }, + { + "auxiliary_loss_clip": 0.01393009, + "auxiliary_loss_mlp": 0.01388786, + "balance_loss_clip": 1.23978245, + "balance_loss_mlp": 1.34417808, + "epoch": 0.029400270554637006, + "flos": 24439300110720.0, + "grad_norm": 1.5692332047972928, + "language_loss": 0.94955957, + "learning_rate": 3.986966109896785e-06, + "loss": 0.97737747, + "num_input_tokens_seen": 10400665, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.44604492, + "step": 489, + "time_per_iteration": 3.0294408798217773 + }, + { + "auxiliary_loss_clip": 0.01393313, + "auxiliary_loss_mlp": 0.01393123, + "balance_loss_clip": 1.23912573, + "balance_loss_mlp": 1.34446192, + "epoch": 0.029460393807304974, + "flos": 20130588017280.0, + "grad_norm": 1.4491188411076936, + "language_loss": 0.95083821, + "learning_rate": 3.988281436571815e-06, + "loss": 0.97870255, + "num_input_tokens_seen": 10420150, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.48681641, + "step": 490, + "time_per_iteration": 2.9073665142059326 + }, + { + "auxiliary_loss_clip": 0.01398045, + "auxiliary_loss_mlp": 0.01391866, + "balance_loss_clip": 1.24218082, + "balance_loss_mlp": 1.34520733, + "epoch": 0.029520517059972943, + "flos": 17684863585920.0, + "grad_norm": 1.8406277389500665, + "language_loss": 1.05263448, + "learning_rate": 3.989594081641164e-06, + "loss": 1.0805335, + "num_input_tokens_seen": 10438210, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.46606445, + "step": 491, + "time_per_iteration": 3.0073463916778564 + }, + { + "auxiliary_loss_clip": 0.01392238, + "auxiliary_loss_mlp": 0.01392325, + "balance_loss_clip": 1.23919392, + "balance_loss_mlp": 1.34933782, + "epoch": 0.029580640312640915, + "flos": 18962535525120.0, + "grad_norm": 1.5519043220247328, + "language_loss": 0.93984282, + "learning_rate": 3.9909040560167675e-06, + "loss": 0.9676885, + "num_input_tokens_seen": 10455125, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.4296875, + "step": 492, + "time_per_iteration": 2.9352288246154785 + }, + { + "auxiliary_loss_clip": 0.0140917, + "auxiliary_loss_mlp": 0.01427459, + "balance_loss_clip": 1.25452554, + "balance_loss_mlp": 1.35343051, + "epoch": 0.029640763565308884, + "flos": 18733297530240.0, + "grad_norm": 1.9499404470537487, + "language_loss": 0.99370396, + "learning_rate": 3.992211370544093e-06, + "loss": 1.02207017, + "num_input_tokens_seen": 10470990, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.73974609, + "step": 493, + "time_per_iteration": 2.9547882080078125 + }, + { + "auxiliary_loss_clip": 0.01408833, + "auxiliary_loss_mlp": 0.01389746, + "balance_loss_clip": 1.25428677, + "balance_loss_mlp": 1.3501687, + "epoch": 0.029700886817976852, + "flos": 20605153173120.0, + "grad_norm": 1.444415619017881, + "language_loss": 0.98846906, + "learning_rate": 3.99351603600268e-06, + "loss": 1.01645494, + "num_input_tokens_seen": 10490685, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.39575195, + "step": 494, + "time_per_iteration": 2.949354887008667 + }, + { + "auxiliary_loss_clip": 0.01412453, + "auxiliary_loss_mlp": 0.01389366, + "balance_loss_clip": 1.25701046, + "balance_loss_mlp": 1.3465941, + "epoch": 0.02976101007064482, + "flos": 22247046904320.0, + "grad_norm": 1.8480463043505453, + "language_loss": 0.97758102, + "learning_rate": 3.994818063106668e-06, + "loss": 1.00559926, + "num_input_tokens_seen": 10509435, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.42797852, + "step": 495, + "time_per_iteration": 2.9766459465026855 + }, + { + "auxiliary_loss_clip": 0.01401994, + "auxiliary_loss_mlp": 0.01379683, + "balance_loss_clip": 1.2488637, + "balance_loss_mlp": 1.3427043, + "epoch": 0.029821133323312793, + "flos": 23743188576000.0, + "grad_norm": 1.4243236317464374, + "language_loss": 0.72786117, + "learning_rate": 3.99611746250533e-06, + "loss": 0.75567794, + "num_input_tokens_seen": 10530050, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.36987305, + "step": 496, + "time_per_iteration": 3.0946428775787354 + }, + { + "auxiliary_loss_clip": 0.01406656, + "auxiliary_loss_mlp": 0.01392992, + "balance_loss_clip": 1.25194752, + "balance_loss_mlp": 1.35229349, + "epoch": 0.02988125657598076, + "flos": 22429791227520.0, + "grad_norm": 1.3815379439829476, + "language_loss": 0.96899009, + "learning_rate": 3.997414244783595e-06, + "loss": 0.99698657, + "num_input_tokens_seen": 10551370, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.40673828, + "step": 497, + "time_per_iteration": 2.9681286811828613 + }, + { + "auxiliary_loss_clip": 0.0139576, + "auxiliary_loss_mlp": 0.01397834, + "balance_loss_clip": 1.24123824, + "balance_loss_mlp": 1.3505795, + "epoch": 0.02994137982864873, + "flos": 13853114622720.0, + "grad_norm": 1.9164314637955884, + "language_loss": 1.02092266, + "learning_rate": 3.998708420462557e-06, + "loss": 1.04885864, + "num_input_tokens_seen": 10569225, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.47265625, + "step": 498, + "time_per_iteration": 2.9890737533569336 + }, + { + "auxiliary_loss_clip": 0.01391419, + "auxiliary_loss_mlp": 0.01401417, + "balance_loss_clip": 1.23638237, + "balance_loss_mlp": 1.35637999, + "epoch": 0.0300015030813167, + "flos": 23917200652800.0, + "grad_norm": 1.9508563847400384, + "language_loss": 0.97575581, + "learning_rate": 4e-06, + "loss": 1.00368428, + "num_input_tokens_seen": 10586170, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.45043945, + "step": 499, + "time_per_iteration": 2.9326705932617188 + }, + { + "auxiliary_loss_clip": 0.01382613, + "auxiliary_loss_mlp": 0.01395076, + "balance_loss_clip": 1.23100603, + "balance_loss_mlp": 1.34834599, + "epoch": 0.030061626333984667, + "flos": 22026631645440.0, + "grad_norm": 1.3859092865242784, + "language_loss": 0.90384626, + "learning_rate": 3.9999999620799e-06, + "loss": 0.9316231, + "num_input_tokens_seen": 10606205, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.46777344, + "step": 500, + "time_per_iteration": 3.1282660961151123 + }, + { + "auxiliary_loss_clip": 0.01385132, + "auxiliary_loss_mlp": 0.01417048, + "balance_loss_clip": 1.23113632, + "balance_loss_mlp": 1.35374844, + "epoch": 0.03012174958665264, + "flos": 23050198932480.0, + "grad_norm": 1.8813208962994148, + "language_loss": 1.03797114, + "learning_rate": 3.9999998483196e-06, + "loss": 1.06599295, + "num_input_tokens_seen": 10625995, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.63354492, + "step": 501, + "time_per_iteration": 2.9867758750915527 + }, + { + "auxiliary_loss_clip": 0.01386401, + "auxiliary_loss_mlp": 0.01398615, + "balance_loss_clip": 1.23308694, + "balance_loss_mlp": 1.3543644, + "epoch": 0.030181872839320608, + "flos": 18962626014720.0, + "grad_norm": 1.8442159602792667, + "language_loss": 0.98640454, + "learning_rate": 3.9999996587191065e-06, + "loss": 1.01425469, + "num_input_tokens_seen": 10644105, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.44262695, + "step": 502, + "time_per_iteration": 2.873424768447876 + }, + { + "auxiliary_loss_clip": 0.01383441, + "auxiliary_loss_mlp": 0.01396099, + "balance_loss_clip": 1.23239124, + "balance_loss_mlp": 1.35573423, + "epoch": 0.030241996091988577, + "flos": 16736909639040.0, + "grad_norm": 1.9187942634401132, + "language_loss": 0.95452714, + "learning_rate": 3.999999393278425e-06, + "loss": 0.98232257, + "num_input_tokens_seen": 10661090, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.40380859, + "step": 503, + "time_per_iteration": 3.0077877044677734 + }, + { + "auxiliary_loss_clip": 0.01372528, + "auxiliary_loss_mlp": 0.01395638, + "balance_loss_clip": 1.2272408, + "balance_loss_mlp": 1.35920751, + "epoch": 0.030302119344656545, + "flos": 28632963381120.0, + "grad_norm": 1.478112613598757, + "language_loss": 0.97800672, + "learning_rate": 3.999999051997567e-06, + "loss": 1.00568843, + "num_input_tokens_seen": 10682380, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.36425781, + "step": 504, + "time_per_iteration": 4.583301067352295 + }, + { + "auxiliary_loss_clip": 0.01380949, + "auxiliary_loss_mlp": 0.0138645, + "balance_loss_clip": 1.23060441, + "balance_loss_mlp": 1.34892309, + "epoch": 0.030362242597324514, + "flos": 15677797921920.0, + "grad_norm": 1.4534440519363268, + "language_loss": 0.84831536, + "learning_rate": 3.9999986348765425e-06, + "loss": 0.87598926, + "num_input_tokens_seen": 10699925, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.375, + "step": 505, + "time_per_iteration": 5.2347893714904785 + }, + { + "auxiliary_loss_clip": 0.0138243, + "auxiliary_loss_mlp": 0.01409152, + "balance_loss_clip": 1.26197577, + "balance_loss_mlp": 1.38187671, + "epoch": 0.030422365849992486, + "flos": 72161973446400.0, + "grad_norm": 0.8825622931093922, + "language_loss": 0.5520798, + "learning_rate": 3.999998141915371e-06, + "loss": 0.57999557, + "num_input_tokens_seen": 10766525, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.2734375, + "step": 506, + "time_per_iteration": 5.04321551322937 + }, + { + "auxiliary_loss_clip": 0.01384931, + "auxiliary_loss_mlp": 0.01419956, + "balance_loss_clip": 1.23507762, + "balance_loss_mlp": 1.36287892, + "epoch": 0.030482489102660455, + "flos": 19437372149760.0, + "grad_norm": 1.4770197951000967, + "language_loss": 0.90798068, + "learning_rate": 3.999997573114069e-06, + "loss": 0.93602955, + "num_input_tokens_seen": 10786725, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.57080078, + "step": 507, + "time_per_iteration": 2.9115123748779297 + }, + { + "auxiliary_loss_clip": 0.0138816, + "auxiliary_loss_mlp": 0.0142399, + "balance_loss_clip": 1.2396543, + "balance_loss_mlp": 1.38388777, + "epoch": 0.030542612355328423, + "flos": 20385461831040.0, + "grad_norm": 1.801841636527473, + "language_loss": 1.00916982, + "learning_rate": 3.999996928472659e-06, + "loss": 1.03729141, + "num_input_tokens_seen": 10805390, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.40136719, + "step": 508, + "time_per_iteration": 2.9402434825897217 + }, + { + "auxiliary_loss_clip": 0.01389368, + "auxiliary_loss_mlp": 0.01437864, + "balance_loss_clip": 1.23873377, + "balance_loss_mlp": 1.39454341, + "epoch": 0.030602735607996392, + "flos": 34690473964800.0, + "grad_norm": 1.6227045912075388, + "language_loss": 0.79833537, + "learning_rate": 3.999996207991165e-06, + "loss": 0.8266077, + "num_input_tokens_seen": 10828030, + "router_z_loss_clip": 1.50390625, + "router_z_loss_mlp": 0.43310547, + "step": 509, + "time_per_iteration": 3.067511558532715 + }, + { + "auxiliary_loss_clip": 0.01380324, + "auxiliary_loss_mlp": 0.0139072, + "balance_loss_clip": 1.23200727, + "balance_loss_mlp": 1.35123801, + "epoch": 0.03066285886066436, + "flos": 23668932579840.0, + "grad_norm": 1.7821230854378898, + "language_loss": 0.92262346, + "learning_rate": 3.999995411669614e-06, + "loss": 0.95033383, + "num_input_tokens_seen": 10845240, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.39477539, + "step": 510, + "time_per_iteration": 2.9140372276306152 + }, + { + "auxiliary_loss_clip": 0.01382837, + "auxiliary_loss_mlp": 0.01378616, + "balance_loss_clip": 1.23423159, + "balance_loss_mlp": 1.34130335, + "epoch": 0.030722982113332332, + "flos": 23013342403200.0, + "grad_norm": 1.7832758260586248, + "language_loss": 0.97462076, + "learning_rate": 3.999994539508036e-06, + "loss": 1.00223517, + "num_input_tokens_seen": 10864325, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.37329102, + "step": 511, + "time_per_iteration": 2.9507713317871094 + }, + { + "auxiliary_loss_clip": 0.01385504, + "auxiliary_loss_mlp": 0.01401197, + "balance_loss_clip": 1.23218274, + "balance_loss_mlp": 1.35859168, + "epoch": 0.0307831053660003, + "flos": 24760783549440.0, + "grad_norm": 1.5482808632358973, + "language_loss": 0.93070579, + "learning_rate": 3.9999935915064655e-06, + "loss": 0.95857286, + "num_input_tokens_seen": 10883860, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.42626953, + "step": 512, + "time_per_iteration": 2.9955101013183594 + }, + { + "auxiliary_loss_clip": 0.01381079, + "auxiliary_loss_mlp": 0.01424607, + "balance_loss_clip": 1.22898185, + "balance_loss_mlp": 1.38009429, + "epoch": 0.03084322861866827, + "flos": 26152101722880.0, + "grad_norm": 1.6439685943218116, + "language_loss": 0.96728939, + "learning_rate": 3.9999925676649374e-06, + "loss": 0.99534625, + "num_input_tokens_seen": 10904555, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.44555664, + "step": 513, + "time_per_iteration": 3.0988457202911377 + }, + { + "auxiliary_loss_clip": 0.01381079, + "auxiliary_loss_mlp": 0.01428278, + "balance_loss_clip": 1.22732019, + "balance_loss_mlp": 1.38819945, + "epoch": 0.03090335187133624, + "flos": 18780741342720.0, + "grad_norm": 1.2908774511688603, + "language_loss": 0.90234423, + "learning_rate": 3.999991467983491e-06, + "loss": 0.9304378, + "num_input_tokens_seen": 10923700, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.40087891, + "step": 514, + "time_per_iteration": 3.0078907012939453 + }, + { + "auxiliary_loss_clip": 0.0139261, + "auxiliary_loss_mlp": 0.01415255, + "balance_loss_clip": 1.23924172, + "balance_loss_mlp": 1.37319827, + "epoch": 0.030963475124004207, + "flos": 23232309828480.0, + "grad_norm": 2.378660491198348, + "language_loss": 0.92467397, + "learning_rate": 3.999990292462167e-06, + "loss": 0.95275259, + "num_input_tokens_seen": 10942730, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.42089844, + "step": 515, + "time_per_iteration": 2.931396245956421 + }, + { + "auxiliary_loss_clip": 0.01382406, + "auxiliary_loss_mlp": 0.01401265, + "balance_loss_clip": 1.2282238, + "balance_loss_mlp": 1.35479772, + "epoch": 0.03102359837667218, + "flos": 42541738387200.0, + "grad_norm": 1.5105274688297254, + "language_loss": 0.95988578, + "learning_rate": 3.999989041101011e-06, + "loss": 0.98772246, + "num_input_tokens_seen": 10967120, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.46459961, + "step": 516, + "time_per_iteration": 3.1434574127197266 + }, + { + "auxiliary_loss_clip": 0.01381521, + "auxiliary_loss_mlp": 0.01393495, + "balance_loss_clip": 1.22707498, + "balance_loss_mlp": 1.35317791, + "epoch": 0.031083721629340148, + "flos": 21186396864000.0, + "grad_norm": 1.5355368633898163, + "language_loss": 0.86996794, + "learning_rate": 3.999987713900071e-06, + "loss": 0.89771807, + "num_input_tokens_seen": 10986775, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.40332031, + "step": 517, + "time_per_iteration": 2.9121408462524414 + }, + { + "auxiliary_loss_clip": 0.01377672, + "auxiliary_loss_mlp": 0.01412103, + "balance_loss_clip": 1.22401285, + "balance_loss_mlp": 1.36637402, + "epoch": 0.031143844882008116, + "flos": 29728252955520.0, + "grad_norm": 1.3408313006073282, + "language_loss": 0.96561956, + "learning_rate": 3.999986310859396e-06, + "loss": 0.99351734, + "num_input_tokens_seen": 11011360, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.45703125, + "step": 518, + "time_per_iteration": 3.0218162536621094 + }, + { + "auxiliary_loss_clip": 0.01386037, + "auxiliary_loss_mlp": 0.01405725, + "balance_loss_clip": 1.23136044, + "balance_loss_mlp": 1.35971022, + "epoch": 0.031203968134676085, + "flos": 23122871360640.0, + "grad_norm": 1.6397878325037119, + "language_loss": 0.96773589, + "learning_rate": 3.999984831979039e-06, + "loss": 0.99565351, + "num_input_tokens_seen": 11030150, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.46020508, + "step": 519, + "time_per_iteration": 2.99403977394104 + }, + { + "auxiliary_loss_clip": 0.01379204, + "auxiliary_loss_mlp": 0.01398535, + "balance_loss_clip": 1.22603059, + "balance_loss_mlp": 1.35747933, + "epoch": 0.03126409138734405, + "flos": 20962814469120.0, + "grad_norm": 1.63570805755852, + "language_loss": 0.96088082, + "learning_rate": 3.999983277259057e-06, + "loss": 0.98865819, + "num_input_tokens_seen": 11049145, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.41064453, + "step": 520, + "time_per_iteration": 2.8942577838897705 + }, + { + "auxiliary_loss_clip": 0.01385572, + "auxiliary_loss_mlp": 0.01386406, + "balance_loss_clip": 1.23213184, + "balance_loss_mlp": 1.34420586, + "epoch": 0.031324214640012026, + "flos": 21659378451840.0, + "grad_norm": 1.5367765307014523, + "language_loss": 0.96037579, + "learning_rate": 3.999981646699509e-06, + "loss": 0.98809552, + "num_input_tokens_seen": 11068835, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.42163086, + "step": 521, + "time_per_iteration": 2.9566125869750977 + }, + { + "auxiliary_loss_clip": 0.01388699, + "auxiliary_loss_mlp": 0.01389588, + "balance_loss_clip": 1.23249733, + "balance_loss_mlp": 1.3469348, + "epoch": 0.03138433789267999, + "flos": 23451955925760.0, + "grad_norm": 1.5779766309577878, + "language_loss": 0.79782134, + "learning_rate": 3.999979940300456e-06, + "loss": 0.8256042, + "num_input_tokens_seen": 11088980, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.42700195, + "step": 522, + "time_per_iteration": 2.943603754043579 + }, + { + "auxiliary_loss_clip": 0.01395642, + "auxiliary_loss_mlp": 0.01390611, + "balance_loss_clip": 1.23521614, + "balance_loss_mlp": 1.34986567, + "epoch": 0.03144446114534796, + "flos": 18989573932800.0, + "grad_norm": 2.0136101770400527, + "language_loss": 1.00219035, + "learning_rate": 3.999978158061963e-06, + "loss": 1.0300529, + "num_input_tokens_seen": 11104300, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.40673828, + "step": 523, + "time_per_iteration": 2.9485507011413574 + }, + { + "auxiliary_loss_clip": 0.01397135, + "auxiliary_loss_mlp": 0.01410399, + "balance_loss_clip": 1.23499906, + "balance_loss_mlp": 1.36416936, + "epoch": 0.031504584398015935, + "flos": 22647853756800.0, + "grad_norm": 1.674414843428846, + "language_loss": 1.03941453, + "learning_rate": 3.999976299984099e-06, + "loss": 1.06748986, + "num_input_tokens_seen": 11123335, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.4621582, + "step": 524, + "time_per_iteration": 3.0019829273223877 + }, + { + "auxiliary_loss_clip": 0.013954, + "auxiliary_loss_mlp": 0.0140097, + "balance_loss_clip": 1.23676395, + "balance_loss_mlp": 1.36408627, + "epoch": 0.0315647076506839, + "flos": 25306844768640.0, + "grad_norm": 1.8008962303306546, + "language_loss": 0.94083709, + "learning_rate": 3.999974366066933e-06, + "loss": 0.96880078, + "num_input_tokens_seen": 11140880, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.36889648, + "step": 525, + "time_per_iteration": 3.0052995681762695 + }, + { + "auxiliary_loss_clip": 0.01390021, + "auxiliary_loss_mlp": 0.01404591, + "balance_loss_clip": 1.23248565, + "balance_loss_mlp": 1.35461831, + "epoch": 0.03162483090335187, + "flos": 16991647718400.0, + "grad_norm": 1.661271286255701, + "language_loss": 0.90558642, + "learning_rate": 3.999972356310538e-06, + "loss": 0.9335326, + "num_input_tokens_seen": 11158710, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.49975586, + "step": 526, + "time_per_iteration": 2.934609889984131 + }, + { + "auxiliary_loss_clip": 0.0138842, + "auxiliary_loss_mlp": 0.01384785, + "balance_loss_clip": 1.23024106, + "balance_loss_mlp": 1.3406775, + "epoch": 0.03168495415601984, + "flos": 18743839568640.0, + "grad_norm": 1.601443643064492, + "language_loss": 0.95155841, + "learning_rate": 3.999970270714991e-06, + "loss": 0.97929043, + "num_input_tokens_seen": 11177550, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.44116211, + "step": 527, + "time_per_iteration": 2.9149956703186035 + }, + { + "auxiliary_loss_clip": 0.01392083, + "auxiliary_loss_mlp": 0.0138915, + "balance_loss_clip": 1.23354363, + "balance_loss_mlp": 1.34432757, + "epoch": 0.03174507740868781, + "flos": 21224701226880.0, + "grad_norm": 1.6999017800949616, + "language_loss": 1.08205235, + "learning_rate": 3.999968109280371e-06, + "loss": 1.10986471, + "num_input_tokens_seen": 11196230, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.44824219, + "step": 528, + "time_per_iteration": 2.883425235748291 + }, + { + "auxiliary_loss_clip": 0.01393634, + "auxiliary_loss_mlp": 0.01389864, + "balance_loss_clip": 1.23388791, + "balance_loss_mlp": 1.34980965, + "epoch": 0.03180520066135578, + "flos": 24797685323520.0, + "grad_norm": 1.6407552048578695, + "language_loss": 0.93865645, + "learning_rate": 3.99996587200676e-06, + "loss": 0.96649146, + "num_input_tokens_seen": 11214935, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.40039062, + "step": 529, + "time_per_iteration": 3.064373254776001 + }, + { + "auxiliary_loss_clip": 0.01396485, + "auxiliary_loss_mlp": 0.01394397, + "balance_loss_clip": 1.23704076, + "balance_loss_mlp": 1.34957457, + "epoch": 0.03186532391402375, + "flos": 24875244190080.0, + "grad_norm": 1.5404479783076748, + "language_loss": 1.00049734, + "learning_rate": 3.999963558894243e-06, + "loss": 1.02840614, + "num_input_tokens_seen": 11235310, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.44824219, + "step": 530, + "time_per_iteration": 2.948352575302124 + }, + { + "auxiliary_loss_clip": 0.01398473, + "auxiliary_loss_mlp": 0.01393585, + "balance_loss_clip": 1.23581767, + "balance_loss_mlp": 1.34988248, + "epoch": 0.03192544716669172, + "flos": 21224927450880.0, + "grad_norm": 1.7580558168853224, + "language_loss": 0.89432806, + "learning_rate": 3.999961169942907e-06, + "loss": 0.92224866, + "num_input_tokens_seen": 11254425, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.43676758, + "step": 531, + "time_per_iteration": 2.9925355911254883 + }, + { + "auxiliary_loss_clip": 0.01399831, + "auxiliary_loss_mlp": 0.01389245, + "balance_loss_clip": 1.23727727, + "balance_loss_mlp": 1.34988201, + "epoch": 0.03198557041935969, + "flos": 24364003484160.0, + "grad_norm": 1.5905847935105442, + "language_loss": 1.01604247, + "learning_rate": 3.999958705152843e-06, + "loss": 1.04393327, + "num_input_tokens_seen": 11274595, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.39355469, + "step": 532, + "time_per_iteration": 3.0931036472320557 + }, + { + "auxiliary_loss_clip": 0.01360488, + "auxiliary_loss_mlp": 0.01363705, + "balance_loss_clip": 1.23154283, + "balance_loss_mlp": 1.34100759, + "epoch": 0.032045693672027656, + "flos": 61858197872640.0, + "grad_norm": 0.7286025446375463, + "language_loss": 0.58068454, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.60792649, + "num_input_tokens_seen": 11336705, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.2265625, + "step": 533, + "time_per_iteration": 3.486196756362915 + }, + { + "auxiliary_loss_clip": 0.01402596, + "auxiliary_loss_mlp": 0.01390816, + "balance_loss_clip": 1.23838091, + "balance_loss_mlp": 1.35119104, + "epoch": 0.03210581692469563, + "flos": 28412502877440.0, + "grad_norm": 1.4797032329372963, + "language_loss": 0.96599734, + "learning_rate": 3.999953548056907e-06, + "loss": 0.99393147, + "num_input_tokens_seen": 11356820, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.39599609, + "step": 534, + "time_per_iteration": 2.998612403869629 + }, + { + "auxiliary_loss_clip": 0.01403492, + "auxiliary_loss_mlp": 0.01387605, + "balance_loss_clip": 1.2360146, + "balance_loss_mlp": 1.34597707, + "epoch": 0.03216594017736359, + "flos": 24728496744960.0, + "grad_norm": 1.7414919972863243, + "language_loss": 0.90999508, + "learning_rate": 3.999950855751232e-06, + "loss": 0.93790615, + "num_input_tokens_seen": 11376645, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.41625977, + "step": 535, + "time_per_iteration": 2.98520565032959 + }, + { + "auxiliary_loss_clip": 0.01404672, + "auxiliary_loss_mlp": 0.01391438, + "balance_loss_clip": 1.23800397, + "balance_loss_mlp": 1.34849834, + "epoch": 0.032226063430031565, + "flos": 31187854811520.0, + "grad_norm": 1.9769038313455278, + "language_loss": 0.93482828, + "learning_rate": 3.999948087607219e-06, + "loss": 0.96278942, + "num_input_tokens_seen": 11397310, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.42944336, + "step": 536, + "time_per_iteration": 3.1017744541168213 + }, + { + "auxiliary_loss_clip": 0.01414027, + "auxiliary_loss_mlp": 0.01395451, + "balance_loss_clip": 1.24216807, + "balance_loss_mlp": 1.35069966, + "epoch": 0.03228618668269954, + "flos": 32211195874560.0, + "grad_norm": 1.4828893014279971, + "language_loss": 0.83592248, + "learning_rate": 3.999945243624975e-06, + "loss": 0.86401731, + "num_input_tokens_seen": 11418475, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.44750977, + "step": 537, + "time_per_iteration": 3.110311269760132 + }, + { + "auxiliary_loss_clip": 0.01409134, + "auxiliary_loss_mlp": 0.01385102, + "balance_loss_clip": 1.2376653, + "balance_loss_mlp": 1.34295011, + "epoch": 0.0323463099353675, + "flos": 22679280910080.0, + "grad_norm": 1.6555354543230907, + "language_loss": 0.94436759, + "learning_rate": 3.999942323804607e-06, + "loss": 0.97230995, + "num_input_tokens_seen": 11436630, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.42138672, + "step": 538, + "time_per_iteration": 3.068390130996704 + }, + { + "auxiliary_loss_clip": 0.01405225, + "auxiliary_loss_mlp": 0.01387194, + "balance_loss_clip": 1.23500729, + "balance_loss_mlp": 1.34518492, + "epoch": 0.032406433188035474, + "flos": 26916089736960.0, + "grad_norm": 1.6173157136304888, + "language_loss": 0.88633031, + "learning_rate": 3.999939328146225e-06, + "loss": 0.91425443, + "num_input_tokens_seen": 11457275, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.41992188, + "step": 539, + "time_per_iteration": 4.450425148010254 + }, + { + "auxiliary_loss_clip": 0.01408529, + "auxiliary_loss_mlp": 0.01394137, + "balance_loss_clip": 1.23601818, + "balance_loss_mlp": 1.34392619, + "epoch": 0.03246655644070344, + "flos": 31516441683840.0, + "grad_norm": 1.6390056716875077, + "language_loss": 0.88359332, + "learning_rate": 3.999936256649943e-06, + "loss": 0.9116199, + "num_input_tokens_seen": 11476925, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.50170898, + "step": 540, + "time_per_iteration": 4.552294731140137 + }, + { + "auxiliary_loss_clip": 0.01417582, + "auxiliary_loss_mlp": 0.01396456, + "balance_loss_clip": 1.24124503, + "balance_loss_mlp": 1.3509419, + "epoch": 0.03252667969337141, + "flos": 23227830593280.0, + "grad_norm": 1.7019306959655565, + "language_loss": 0.9771533, + "learning_rate": 3.999933109315878e-06, + "loss": 1.00529373, + "num_input_tokens_seen": 11496830, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.45532227, + "step": 541, + "time_per_iteration": 5.889995813369751 + }, + { + "auxiliary_loss_clip": 0.01415401, + "auxiliary_loss_mlp": 0.01391975, + "balance_loss_clip": 1.23799074, + "balance_loss_mlp": 1.3518014, + "epoch": 0.032586802946039384, + "flos": 14765750363520.0, + "grad_norm": 1.7455096562493742, + "language_loss": 0.9773379, + "learning_rate": 3.9999298861441496e-06, + "loss": 1.00541162, + "num_input_tokens_seen": 11515605, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.40185547, + "step": 542, + "time_per_iteration": 2.981987476348877 + }, + { + "auxiliary_loss_clip": 0.01419446, + "auxiliary_loss_mlp": 0.01406006, + "balance_loss_clip": 1.23915958, + "balance_loss_mlp": 1.36175561, + "epoch": 0.03264692619870735, + "flos": 24291557280000.0, + "grad_norm": 1.5078472738815625, + "language_loss": 0.81774366, + "learning_rate": 3.999926587134879e-06, + "loss": 0.84599829, + "num_input_tokens_seen": 11536230, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.44262695, + "step": 543, + "time_per_iteration": 2.9540932178497314 + }, + { + "auxiliary_loss_clip": 0.01430083, + "auxiliary_loss_mlp": 0.01400923, + "balance_loss_clip": 1.24284554, + "balance_loss_mlp": 1.35643399, + "epoch": 0.03270704945137532, + "flos": 22903406242560.0, + "grad_norm": 1.9370127086916347, + "language_loss": 1.05949759, + "learning_rate": 3.999923212288192e-06, + "loss": 1.08780766, + "num_input_tokens_seen": 11554715, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.44458008, + "step": 544, + "time_per_iteration": 3.057410478591919 + }, + { + "auxiliary_loss_clip": 0.01425491, + "auxiliary_loss_mlp": 0.01409114, + "balance_loss_clip": 1.24187613, + "balance_loss_mlp": 1.36028552, + "epoch": 0.032767172704043286, + "flos": 18050126008320.0, + "grad_norm": 1.9524213903566685, + "language_loss": 0.84328485, + "learning_rate": 3.999919761604216e-06, + "loss": 0.87163097, + "num_input_tokens_seen": 11571370, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.48803711, + "step": 545, + "time_per_iteration": 2.934523344039917 + }, + { + "auxiliary_loss_clip": 0.01432438, + "auxiliary_loss_mlp": 0.01395592, + "balance_loss_clip": 1.24188435, + "balance_loss_mlp": 1.35062671, + "epoch": 0.03282729595671126, + "flos": 22539003471360.0, + "grad_norm": 1.8226463613501447, + "language_loss": 1.03038907, + "learning_rate": 3.999916235083083e-06, + "loss": 1.05866933, + "num_input_tokens_seen": 11588560, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.44946289, + "step": 546, + "time_per_iteration": 2.908083438873291 + }, + { + "auxiliary_loss_clip": 0.01446037, + "auxiliary_loss_mlp": 0.01393571, + "balance_loss_clip": 1.25105596, + "balance_loss_mlp": 1.34991646, + "epoch": 0.03288741920937923, + "flos": 20419829896320.0, + "grad_norm": 1.8617514936145956, + "language_loss": 0.99947137, + "learning_rate": 3.999912632724925e-06, + "loss": 1.02786732, + "num_input_tokens_seen": 11605685, + "router_z_loss_clip": 1.94921875, + "router_z_loss_mlp": 0.4362793, + "step": 547, + "time_per_iteration": 2.9383485317230225 + }, + { + "auxiliary_loss_clip": 0.01439037, + "auxiliary_loss_mlp": 0.0140349, + "balance_loss_clip": 1.24744642, + "balance_loss_mlp": 1.34700871, + "epoch": 0.032947542462047195, + "flos": 20787942741120.0, + "grad_norm": 1.462629543571229, + "language_loss": 0.95405847, + "learning_rate": 3.999908954529881e-06, + "loss": 0.9824838, + "num_input_tokens_seen": 11626290, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.56445312, + "step": 548, + "time_per_iteration": 2.965420722961426 + }, + { + "auxiliary_loss_clip": 0.01454649, + "auxiliary_loss_mlp": 0.01399372, + "balance_loss_clip": 1.25546622, + "balance_loss_mlp": 1.35009098, + "epoch": 0.03300766571471517, + "flos": 19910715696000.0, + "grad_norm": 1.913275016463635, + "language_loss": 0.87104952, + "learning_rate": 3.999905200498087e-06, + "loss": 0.89958966, + "num_input_tokens_seen": 11643950, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.49316406, + "step": 549, + "time_per_iteration": 2.972935438156128 + }, + { + "auxiliary_loss_clip": 0.01450822, + "auxiliary_loss_mlp": 0.01399339, + "balance_loss_clip": 1.25678253, + "balance_loss_mlp": 1.34898472, + "epoch": 0.03306778896738313, + "flos": 17976774908160.0, + "grad_norm": 1.5685326313515922, + "language_loss": 0.94064575, + "learning_rate": 3.999901370629689e-06, + "loss": 0.96914738, + "num_input_tokens_seen": 11662560, + "router_z_loss_clip": 1.94140625, + "router_z_loss_mlp": 0.50366211, + "step": 550, + "time_per_iteration": 2.9491541385650635 + }, + { + "auxiliary_loss_clip": 0.01469297, + "auxiliary_loss_mlp": 0.01401381, + "balance_loss_clip": 1.27222669, + "balance_loss_mlp": 1.34778476, + "epoch": 0.033127912220051105, + "flos": 21663495728640.0, + "grad_norm": 1.4016472453168107, + "language_loss": 0.86652619, + "learning_rate": 3.99989746492483e-06, + "loss": 0.89523292, + "num_input_tokens_seen": 11682265, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.53588867, + "step": 551, + "time_per_iteration": 2.9162535667419434 + }, + { + "auxiliary_loss_clip": 0.01474119, + "auxiliary_loss_mlp": 0.01391931, + "balance_loss_clip": 1.27341855, + "balance_loss_mlp": 1.34729874, + "epoch": 0.03318803547271908, + "flos": 30200012933760.0, + "grad_norm": 1.9473256714319205, + "language_loss": 1.06166327, + "learning_rate": 3.999893483383658e-06, + "loss": 1.09032369, + "num_input_tokens_seen": 11699300, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.4465332, + "step": 552, + "time_per_iteration": 3.022292137145996 + }, + { + "auxiliary_loss_clip": 0.01476424, + "auxiliary_loss_mlp": 0.01391541, + "balance_loss_clip": 1.27649856, + "balance_loss_mlp": 1.34228325, + "epoch": 0.03324815872538704, + "flos": 20385733299840.0, + "grad_norm": 1.9504226313854764, + "language_loss": 1.04576027, + "learning_rate": 3.999889426006326e-06, + "loss": 1.07444, + "num_input_tokens_seen": 11716955, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.49291992, + "step": 553, + "time_per_iteration": 2.897512912750244 + }, + { + "auxiliary_loss_clip": 0.01491939, + "auxiliary_loss_mlp": 0.01388291, + "balance_loss_clip": 1.28249943, + "balance_loss_mlp": 1.34239566, + "epoch": 0.033308281978055014, + "flos": 24504190433280.0, + "grad_norm": 1.745131349360767, + "language_loss": 0.8820616, + "learning_rate": 3.999885292792986e-06, + "loss": 0.91086394, + "num_input_tokens_seen": 11736130, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.45898438, + "step": 554, + "time_per_iteration": 3.0998427867889404 + }, + { + "auxiliary_loss_clip": 0.01493839, + "auxiliary_loss_mlp": 0.01399226, + "balance_loss_clip": 1.2877872, + "balance_loss_mlp": 1.3488009, + "epoch": 0.03336840523072298, + "flos": 23409760510080.0, + "grad_norm": 1.708469788618555, + "language_loss": 0.90916187, + "learning_rate": 3.999881083743795e-06, + "loss": 0.93809253, + "num_input_tokens_seen": 11754425, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.50415039, + "step": 555, + "time_per_iteration": 2.939497232437134 + }, + { + "auxiliary_loss_clip": 0.01525754, + "auxiliary_loss_mlp": 0.01395023, + "balance_loss_clip": 1.30556285, + "balance_loss_mlp": 1.34731567, + "epoch": 0.03342852848339095, + "flos": 30561610527360.0, + "grad_norm": 1.9202911351776153, + "language_loss": 1.02395892, + "learning_rate": 3.999876798858914e-06, + "loss": 1.05316663, + "num_input_tokens_seen": 11772845, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.47705078, + "step": 556, + "time_per_iteration": 3.023836612701416 + }, + { + "auxiliary_loss_clip": 0.01518416, + "auxiliary_loss_mlp": 0.01397441, + "balance_loss_clip": 1.3020649, + "balance_loss_mlp": 1.35016227, + "epoch": 0.03348865173605892, + "flos": 22903587221760.0, + "grad_norm": 1.8669638468318388, + "language_loss": 0.94562364, + "learning_rate": 3.999872438138503e-06, + "loss": 0.97478217, + "num_input_tokens_seen": 11792850, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.47314453, + "step": 557, + "time_per_iteration": 2.9554860591888428 + }, + { + "auxiliary_loss_clip": 0.01553419, + "auxiliary_loss_mlp": 0.01404297, + "balance_loss_clip": 1.3264966, + "balance_loss_mlp": 1.34936523, + "epoch": 0.03354877498872689, + "flos": 17684999320320.0, + "grad_norm": 2.080247149894823, + "language_loss": 1.10047805, + "learning_rate": 3.999868001582729e-06, + "loss": 1.13005519, + "num_input_tokens_seen": 11809670, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.54907227, + "step": 558, + "time_per_iteration": 2.8692195415496826 + }, + { + "auxiliary_loss_clip": 0.0152942, + "auxiliary_loss_mlp": 0.01397831, + "balance_loss_clip": 1.30539191, + "balance_loss_mlp": 1.34938407, + "epoch": 0.03360889824139486, + "flos": 21662998035840.0, + "grad_norm": 1.844863204598611, + "language_loss": 0.91878843, + "learning_rate": 3.99986348919176e-06, + "loss": 0.94806093, + "num_input_tokens_seen": 11829665, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.48510742, + "step": 559, + "time_per_iteration": 2.9709036350250244 + }, + { + "auxiliary_loss_clip": 0.01548703, + "auxiliary_loss_mlp": 0.01394481, + "balance_loss_clip": 1.31528199, + "balance_loss_mlp": 1.34245849, + "epoch": 0.033669021494062826, + "flos": 21804949532160.0, + "grad_norm": 1.5933235318591445, + "language_loss": 0.93449032, + "learning_rate": 3.9998589009657675e-06, + "loss": 0.96392214, + "num_input_tokens_seen": 11848190, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.52001953, + "step": 560, + "time_per_iteration": 2.90649151802063 + }, + { + "auxiliary_loss_clip": 0.01542154, + "auxiliary_loss_mlp": 0.01392166, + "balance_loss_clip": 1.31243253, + "balance_loss_mlp": 1.34658027, + "epoch": 0.0337291447467308, + "flos": 21874816782720.0, + "grad_norm": 1.95546844922186, + "language_loss": 0.90095866, + "learning_rate": 3.999854236904925e-06, + "loss": 0.9303019, + "num_input_tokens_seen": 11864795, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.45581055, + "step": 561, + "time_per_iteration": 2.9607489109039307 + }, + { + "auxiliary_loss_clip": 0.0157492, + "auxiliary_loss_mlp": 0.013993, + "balance_loss_clip": 1.32602048, + "balance_loss_mlp": 1.35545468, + "epoch": 0.03378926799939877, + "flos": 24257279704320.0, + "grad_norm": 1.5364889026421389, + "language_loss": 0.87829781, + "learning_rate": 3.999849497009409e-06, + "loss": 0.90804005, + "num_input_tokens_seen": 11885275, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.4387207, + "step": 562, + "time_per_iteration": 3.085115432739258 + }, + { + "auxiliary_loss_clip": 0.01574845, + "auxiliary_loss_mlp": 0.01415154, + "balance_loss_clip": 1.32363439, + "balance_loss_mlp": 1.35640764, + "epoch": 0.033849391252066735, + "flos": 16516313400960.0, + "grad_norm": 1.685787422507711, + "language_loss": 0.92255175, + "learning_rate": 3.999844681279401e-06, + "loss": 0.95245177, + "num_input_tokens_seen": 11903595, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.58789062, + "step": 563, + "time_per_iteration": 2.916097640991211 + }, + { + "auxiliary_loss_clip": 0.01566128, + "auxiliary_loss_mlp": 0.01397724, + "balance_loss_clip": 1.31944573, + "balance_loss_mlp": 1.34949231, + "epoch": 0.03390951450473471, + "flos": 15677752677120.0, + "grad_norm": 1.8652635008686627, + "language_loss": 0.99611664, + "learning_rate": 3.99983978971508e-06, + "loss": 1.02575517, + "num_input_tokens_seen": 11917815, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.48266602, + "step": 564, + "time_per_iteration": 2.9636433124542236 + }, + { + "auxiliary_loss_clip": 0.01554717, + "auxiliary_loss_mlp": 0.01413911, + "balance_loss_clip": 1.31344008, + "balance_loss_mlp": 1.34708214, + "epoch": 0.03396963775740267, + "flos": 22685117489280.0, + "grad_norm": 1.9772971195326738, + "language_loss": 1.03355336, + "learning_rate": 3.999834822316635e-06, + "loss": 1.06323981, + "num_input_tokens_seen": 11936305, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.66821289, + "step": 565, + "time_per_iteration": 2.902076005935669 + }, + { + "auxiliary_loss_clip": 0.01426553, + "auxiliary_loss_mlp": 0.01361786, + "balance_loss_clip": 1.27183127, + "balance_loss_mlp": 1.33737159, + "epoch": 0.034029761010070644, + "flos": 64427115191040.0, + "grad_norm": 0.9737345547455311, + "language_loss": 0.550174, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.57805741, + "num_input_tokens_seen": 11998940, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.24414062, + "step": 566, + "time_per_iteration": 3.475527048110962 + }, + { + "auxiliary_loss_clip": 0.01544389, + "auxiliary_loss_mlp": 0.01393325, + "balance_loss_clip": 1.30670536, + "balance_loss_mlp": 1.3473103, + "epoch": 0.034089884262738616, + "flos": 25013666592000.0, + "grad_norm": 1.977168972354823, + "language_loss": 0.89368403, + "learning_rate": 3.999824660018126e-06, + "loss": 0.92306119, + "num_input_tokens_seen": 12018860, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.46044922, + "step": 567, + "time_per_iteration": 3.06400728225708 + }, + { + "auxiliary_loss_clip": 0.0152702, + "auxiliary_loss_mlp": 0.01411977, + "balance_loss_clip": 1.30008912, + "balance_loss_mlp": 1.36541343, + "epoch": 0.03415000751540658, + "flos": 28450807240320.0, + "grad_norm": 1.8598290678917226, + "language_loss": 0.88551795, + "learning_rate": 3.999819465118447e-06, + "loss": 0.91490793, + "num_input_tokens_seen": 12039675, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.46606445, + "step": 568, + "time_per_iteration": 3.1752214431762695 + }, + { + "auxiliary_loss_clip": 0.0154429, + "auxiliary_loss_mlp": 0.01419374, + "balance_loss_clip": 1.31187963, + "balance_loss_mlp": 1.37157059, + "epoch": 0.034210130768074554, + "flos": 21478217696640.0, + "grad_norm": 1.5523308431901075, + "language_loss": 0.94625556, + "learning_rate": 3.999814194385413e-06, + "loss": 0.97589225, + "num_input_tokens_seen": 12057680, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.47827148, + "step": 569, + "time_per_iteration": 3.0447850227355957 + }, + { + "auxiliary_loss_clip": 0.01553742, + "auxiliary_loss_mlp": 0.01414346, + "balance_loss_clip": 1.31067061, + "balance_loss_mlp": 1.36244202, + "epoch": 0.03427025402074252, + "flos": 18706711570560.0, + "grad_norm": 1.5836588248020789, + "language_loss": 1.01278734, + "learning_rate": 3.9998088478192255e-06, + "loss": 1.04246819, + "num_input_tokens_seen": 12076135, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.51953125, + "step": 570, + "time_per_iteration": 2.9158406257629395 + }, + { + "auxiliary_loss_clip": 0.01547575, + "auxiliary_loss_mlp": 0.01411431, + "balance_loss_clip": 1.31243038, + "balance_loss_mlp": 1.36091006, + "epoch": 0.03433037727341049, + "flos": 20859710273280.0, + "grad_norm": 1.7575915748496391, + "language_loss": 0.90065736, + "learning_rate": 3.9998034254200846e-06, + "loss": 0.93024743, + "num_input_tokens_seen": 12094785, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.50537109, + "step": 571, + "time_per_iteration": 2.9750373363494873 + }, + { + "auxiliary_loss_clip": 0.01531726, + "auxiliary_loss_mlp": 0.01404086, + "balance_loss_clip": 1.30553555, + "balance_loss_mlp": 1.35287333, + "epoch": 0.03439050052607846, + "flos": 25421079185280.0, + "grad_norm": 1.8091068039489129, + "language_loss": 0.91639858, + "learning_rate": 3.999797927188199e-06, + "loss": 0.94575667, + "num_input_tokens_seen": 12114590, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.51196289, + "step": 572, + "time_per_iteration": 3.0706939697265625 + }, + { + "auxiliary_loss_clip": 0.01544496, + "auxiliary_loss_mlp": 0.01387988, + "balance_loss_clip": 1.31299996, + "balance_loss_mlp": 1.34585905, + "epoch": 0.03445062377874643, + "flos": 17649273911040.0, + "grad_norm": 1.6600077499875048, + "language_loss": 0.92382163, + "learning_rate": 3.999792353123774e-06, + "loss": 0.95314646, + "num_input_tokens_seen": 12132390, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.42138672, + "step": 573, + "time_per_iteration": 4.458945035934448 + }, + { + "auxiliary_loss_clip": 0.01552719, + "auxiliary_loss_mlp": 0.01399822, + "balance_loss_clip": 1.3191011, + "balance_loss_mlp": 1.35199523, + "epoch": 0.0345107470314144, + "flos": 16773404209920.0, + "grad_norm": 1.8508423895187767, + "language_loss": 0.91364622, + "learning_rate": 3.999786703227023e-06, + "loss": 0.94317156, + "num_input_tokens_seen": 12149035, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.47851562, + "step": 574, + "time_per_iteration": 2.9615890979766846 + }, + { + "auxiliary_loss_clip": 0.01561784, + "auxiliary_loss_mlp": 0.01406269, + "balance_loss_clip": 1.32681191, + "balance_loss_mlp": 1.36120772, + "epoch": 0.03457087028408237, + "flos": 14692715976960.0, + "grad_norm": 1.95884858121731, + "language_loss": 0.94009417, + "learning_rate": 3.9997809774981606e-06, + "loss": 0.96977472, + "num_input_tokens_seen": 12167530, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.45043945, + "step": 575, + "time_per_iteration": 5.783670663833618 + }, + { + "auxiliary_loss_clip": 0.0156227, + "auxiliary_loss_mlp": 0.01412789, + "balance_loss_clip": 1.328969, + "balance_loss_mlp": 1.36531985, + "epoch": 0.03463099353675034, + "flos": 20020923325440.0, + "grad_norm": 1.8911038298399365, + "language_loss": 0.89538419, + "learning_rate": 3.9997751759374025e-06, + "loss": 0.92513484, + "num_input_tokens_seen": 12186340, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.47460938, + "step": 576, + "time_per_iteration": 4.725090026855469 + }, + { + "auxiliary_loss_clip": 0.01566596, + "auxiliary_loss_mlp": 0.01415614, + "balance_loss_clip": 1.33039618, + "balance_loss_mlp": 1.37067223, + "epoch": 0.03469111678941831, + "flos": 25312002675840.0, + "grad_norm": 1.8773244416313495, + "language_loss": 0.92319477, + "learning_rate": 3.99976929854497e-06, + "loss": 0.95301682, + "num_input_tokens_seen": 12204090, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.44946289, + "step": 577, + "time_per_iteration": 3.0688271522521973 + }, + { + "auxiliary_loss_clip": 0.0159292, + "auxiliary_loss_mlp": 0.01412019, + "balance_loss_clip": 1.34544909, + "balance_loss_mlp": 1.36533618, + "epoch": 0.034751240042086275, + "flos": 23270116498560.0, + "grad_norm": 1.7727618356935604, + "language_loss": 0.78086531, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.81091475, + "num_input_tokens_seen": 12224850, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.46704102, + "step": 578, + "time_per_iteration": 2.976106643676758 + }, + { + "auxiliary_loss_clip": 0.01620519, + "auxiliary_loss_mlp": 0.01419709, + "balance_loss_clip": 1.35702312, + "balance_loss_mlp": 1.36635125, + "epoch": 0.03481136329475425, + "flos": 23779818881280.0, + "grad_norm": 1.63921187455204, + "language_loss": 0.83574361, + "learning_rate": 3.999757316265973e-06, + "loss": 0.86614585, + "num_input_tokens_seen": 12244935, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.53369141, + "step": 579, + "time_per_iteration": 2.9703712463378906 + }, + { + "auxiliary_loss_clip": 0.01603844, + "auxiliary_loss_mlp": 0.01404163, + "balance_loss_clip": 1.34932995, + "balance_loss_mlp": 1.35512018, + "epoch": 0.03487148654742222, + "flos": 20167263567360.0, + "grad_norm": 1.742623779028622, + "language_loss": 0.93522108, + "learning_rate": 3.999751211379863e-06, + "loss": 0.96530116, + "num_input_tokens_seen": 12262140, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.49072266, + "step": 580, + "time_per_iteration": 2.944106340408325 + }, + { + "auxiliary_loss_clip": 0.01630414, + "auxiliary_loss_mlp": 0.01403878, + "balance_loss_clip": 1.36538768, + "balance_loss_mlp": 1.35750556, + "epoch": 0.034931609800090184, + "flos": 15678114635520.0, + "grad_norm": 1.911331932963442, + "language_loss": 0.93035293, + "learning_rate": 3.999745030662987e-06, + "loss": 0.96069586, + "num_input_tokens_seen": 12280930, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.46386719, + "step": 581, + "time_per_iteration": 3.028916835784912 + }, + { + "auxiliary_loss_clip": 0.01657324, + "auxiliary_loss_mlp": 0.01406587, + "balance_loss_clip": 1.38635778, + "balance_loss_mlp": 1.35499287, + "epoch": 0.034991733052758156, + "flos": 16370968544640.0, + "grad_norm": 1.6989088379021084, + "language_loss": 0.84250855, + "learning_rate": 3.99973877411558e-06, + "loss": 0.87314767, + "num_input_tokens_seen": 12299125, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.51635742, + "step": 582, + "time_per_iteration": 3.1410257816314697 + }, + { + "auxiliary_loss_clip": 0.01646788, + "auxiliary_loss_mlp": 0.01407181, + "balance_loss_clip": 1.37820065, + "balance_loss_mlp": 1.35289311, + "epoch": 0.03505185630542612, + "flos": 19395810161280.0, + "grad_norm": 1.7233054618047592, + "language_loss": 0.93704462, + "learning_rate": 3.999732441737877e-06, + "loss": 0.96758431, + "num_input_tokens_seen": 12316905, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.54272461, + "step": 583, + "time_per_iteration": 2.9909024238586426 + }, + { + "auxiliary_loss_clip": 0.01662865, + "auxiliary_loss_mlp": 0.01406104, + "balance_loss_clip": 1.38821793, + "balance_loss_mlp": 1.34740579, + "epoch": 0.03511197955809409, + "flos": 21333551512320.0, + "grad_norm": 2.222458601761601, + "language_loss": 0.91606987, + "learning_rate": 3.99972603353012e-06, + "loss": 0.94675964, + "num_input_tokens_seen": 12335070, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.5871582, + "step": 584, + "time_per_iteration": 2.9337964057922363 + }, + { + "auxiliary_loss_clip": 0.01673411, + "auxiliary_loss_mlp": 0.01400228, + "balance_loss_clip": 1.39251792, + "balance_loss_mlp": 1.34596384, + "epoch": 0.035172102810762065, + "flos": 14144573496960.0, + "grad_norm": 2.4465801816659063, + "language_loss": 1.06761026, + "learning_rate": 3.999719549492551e-06, + "loss": 1.09834671, + "num_input_tokens_seen": 12350315, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.54248047, + "step": 585, + "time_per_iteration": 2.921839952468872 + }, + { + "auxiliary_loss_clip": 0.01683938, + "auxiliary_loss_mlp": 0.01408947, + "balance_loss_clip": 1.39824033, + "balance_loss_mlp": 1.34638631, + "epoch": 0.03523222606343003, + "flos": 20304962052480.0, + "grad_norm": 2.1229876399611443, + "language_loss": 0.96464878, + "learning_rate": 3.9997129896254165e-06, + "loss": 0.99557763, + "num_input_tokens_seen": 12366030, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.62597656, + "step": 586, + "time_per_iteration": 2.8853392601013184 + }, + { + "auxiliary_loss_clip": 0.01720813, + "auxiliary_loss_mlp": 0.01407195, + "balance_loss_clip": 1.41742241, + "balance_loss_mlp": 1.34930742, + "epoch": 0.035292349316098, + "flos": 20385461831040.0, + "grad_norm": 1.951628450169371, + "language_loss": 0.86362016, + "learning_rate": 3.999706353928965e-06, + "loss": 0.89490026, + "num_input_tokens_seen": 12384895, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.57885742, + "step": 587, + "time_per_iteration": 2.9347589015960693 + }, + { + "auxiliary_loss_clip": 0.01743331, + "auxiliary_loss_mlp": 0.01408699, + "balance_loss_clip": 1.4238832, + "balance_loss_mlp": 1.34649539, + "epoch": 0.03535247256876597, + "flos": 21478398675840.0, + "grad_norm": 1.5591245626304466, + "language_loss": 0.86218131, + "learning_rate": 3.999699642403449e-06, + "loss": 0.89370161, + "num_input_tokens_seen": 12404980, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.62231445, + "step": 588, + "time_per_iteration": 2.9129388332366943 + }, + { + "auxiliary_loss_clip": 0.01756896, + "auxiliary_loss_mlp": 0.01414689, + "balance_loss_clip": 1.43345726, + "balance_loss_mlp": 1.3465488, + "epoch": 0.03541259582143394, + "flos": 23633523884160.0, + "grad_norm": 1.848168934006035, + "language_loss": 1.05603802, + "learning_rate": 3.99969285504912e-06, + "loss": 1.08775389, + "num_input_tokens_seen": 12423835, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.68164062, + "step": 589, + "time_per_iteration": 3.0794179439544678 + }, + { + "auxiliary_loss_clip": 0.01793693, + "auxiliary_loss_mlp": 0.01420307, + "balance_loss_clip": 1.4631784, + "balance_loss_mlp": 1.35908079, + "epoch": 0.03547271907410191, + "flos": 33738628965120.0, + "grad_norm": 2.287648887012532, + "language_loss": 0.91153169, + "learning_rate": 3.99968599186624e-06, + "loss": 0.9436717, + "num_input_tokens_seen": 12443135, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.61279297, + "step": 590, + "time_per_iteration": 3.0532338619232178 + }, + { + "auxiliary_loss_clip": 0.01821535, + "auxiliary_loss_mlp": 0.01429073, + "balance_loss_clip": 1.49155951, + "balance_loss_mlp": 1.36021757, + "epoch": 0.03553284232676988, + "flos": 21152707470720.0, + "grad_norm": 1.9762038539876163, + "language_loss": 0.94691497, + "learning_rate": 3.999679052855065e-06, + "loss": 0.97942102, + "num_input_tokens_seen": 12462895, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.68847656, + "step": 591, + "time_per_iteration": 2.9257352352142334 + }, + { + "auxiliary_loss_clip": 0.01862717, + "auxiliary_loss_mlp": 0.01451327, + "balance_loss_clip": 1.51514554, + "balance_loss_mlp": 1.37427056, + "epoch": 0.03559296557943785, + "flos": 20055924817920.0, + "grad_norm": 2.1810165776733474, + "language_loss": 0.90176773, + "learning_rate": 3.999672038015861e-06, + "loss": 0.93490815, + "num_input_tokens_seen": 12481515, + "router_z_loss_clip": 3.47460938, + "router_z_loss_mlp": 0.77099609, + "step": 592, + "time_per_iteration": 2.947638988494873 + }, + { + "auxiliary_loss_clip": 0.01740423, + "auxiliary_loss_mlp": 0.0148205, + "balance_loss_clip": 1.48756504, + "balance_loss_mlp": 1.41605544, + "epoch": 0.035653088832105814, + "flos": 60365856764160.0, + "grad_norm": 0.873922115681388, + "language_loss": 0.59890729, + "learning_rate": 3.999664947348893e-06, + "loss": 0.63113207, + "num_input_tokens_seen": 12548220, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.66015625, + "step": 593, + "time_per_iteration": 3.5285894870758057 + }, + { + "auxiliary_loss_clip": 0.01888832, + "auxiliary_loss_mlp": 0.01433082, + "balance_loss_clip": 1.54534328, + "balance_loss_mlp": 1.35206783, + "epoch": 0.035713212084773786, + "flos": 20120815140480.0, + "grad_norm": 1.9080466413344728, + "language_loss": 0.91681135, + "learning_rate": 3.999657780854429e-06, + "loss": 0.95003045, + "num_input_tokens_seen": 12566105, + "router_z_loss_clip": 3.43164062, + "router_z_loss_mlp": 0.80957031, + "step": 594, + "time_per_iteration": 2.970724582672119 + }, + { + "auxiliary_loss_clip": 0.01893908, + "auxiliary_loss_mlp": 0.01432502, + "balance_loss_clip": 1.54917574, + "balance_loss_mlp": 1.35105872, + "epoch": 0.03577333533744176, + "flos": 26296767907200.0, + "grad_norm": 1.837331053138254, + "language_loss": 0.90361065, + "learning_rate": 3.999650538532742e-06, + "loss": 0.93687475, + "num_input_tokens_seen": 12586680, + "router_z_loss_clip": 3.44140625, + "router_z_loss_mlp": 0.81494141, + "step": 595, + "time_per_iteration": 3.101132869720459 + }, + { + "auxiliary_loss_clip": 0.01892363, + "auxiliary_loss_mlp": 0.01432611, + "balance_loss_clip": 1.55467236, + "balance_loss_mlp": 1.35035682, + "epoch": 0.035833458590109724, + "flos": 10896963891840.0, + "grad_norm": 2.6076380909210455, + "language_loss": 1.06050766, + "learning_rate": 3.999643220384106e-06, + "loss": 1.09375751, + "num_input_tokens_seen": 12601605, + "router_z_loss_clip": 3.37304688, + "router_z_loss_mlp": 0.82226562, + "step": 596, + "time_per_iteration": 3.223883867263794 + }, + { + "auxiliary_loss_clip": 0.01901068, + "auxiliary_loss_mlp": 0.01447962, + "balance_loss_clip": 1.5656594, + "balance_loss_mlp": 1.35741055, + "epoch": 0.035893581842777696, + "flos": 22100163724800.0, + "grad_norm": 2.7167327480983103, + "language_loss": 0.89809322, + "learning_rate": 3.999635826408799e-06, + "loss": 0.93158352, + "num_input_tokens_seen": 12620365, + "router_z_loss_clip": 3.35546875, + "router_z_loss_mlp": 0.90429688, + "step": 597, + "time_per_iteration": 3.0605852603912354 + }, + { + "auxiliary_loss_clip": 0.01884526, + "auxiliary_loss_mlp": 0.01437787, + "balance_loss_clip": 1.55691886, + "balance_loss_mlp": 1.36197007, + "epoch": 0.03595370509544566, + "flos": 23048434385280.0, + "grad_norm": 1.5508005042904893, + "language_loss": 0.84828365, + "learning_rate": 3.999628356607101e-06, + "loss": 0.8815068, + "num_input_tokens_seen": 12641140, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.7578125, + "step": 598, + "time_per_iteration": 3.0795273780822754 + }, + { + "auxiliary_loss_clip": 0.01906464, + "auxiliary_loss_mlp": 0.01420613, + "balance_loss_clip": 1.57834888, + "balance_loss_mlp": 1.35600185, + "epoch": 0.03601382834811363, + "flos": 20787626027520.0, + "grad_norm": 2.2941943231481536, + "language_loss": 0.85047519, + "learning_rate": 3.999620810979295e-06, + "loss": 0.88374597, + "num_input_tokens_seen": 12661080, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.64599609, + "step": 599, + "time_per_iteration": 3.1124086380004883 + }, + { + "auxiliary_loss_clip": 0.01947726, + "auxiliary_loss_mlp": 0.0144355, + "balance_loss_clip": 1.59417224, + "balance_loss_mlp": 1.36515772, + "epoch": 0.036073951600781605, + "flos": 23962246490880.0, + "grad_norm": 2.3527894683130315, + "language_loss": 0.94584858, + "learning_rate": 3.999613189525668e-06, + "loss": 0.97976142, + "num_input_tokens_seen": 12678270, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.78320312, + "step": 600, + "time_per_iteration": 3.151994228363037 + }, + { + "auxiliary_loss_clip": 0.01930073, + "auxiliary_loss_mlp": 0.0144983, + "balance_loss_clip": 1.5954659, + "balance_loss_mlp": 1.37463236, + "epoch": 0.03613407485344957, + "flos": 18920475843840.0, + "grad_norm": 1.539176234993382, + "language_loss": 0.86816466, + "learning_rate": 3.999605492246508e-06, + "loss": 0.90196371, + "num_input_tokens_seen": 12697295, + "router_z_loss_clip": 3.34570312, + "router_z_loss_mlp": 0.75195312, + "step": 601, + "time_per_iteration": 3.00903058052063 + }, + { + "auxiliary_loss_clip": 0.01929101, + "auxiliary_loss_mlp": 0.01440509, + "balance_loss_clip": 1.59515429, + "balance_loss_mlp": 1.36302269, + "epoch": 0.03619419810611754, + "flos": 23048660609280.0, + "grad_norm": 2.2456939514783096, + "language_loss": 0.83116972, + "learning_rate": 3.999597719142107e-06, + "loss": 0.86486578, + "num_input_tokens_seen": 12716165, + "router_z_loss_clip": 3.34179688, + "router_z_loss_mlp": 0.77392578, + "step": 602, + "time_per_iteration": 3.0253920555114746 + }, + { + "auxiliary_loss_clip": 0.01944493, + "auxiliary_loss_mlp": 0.01440998, + "balance_loss_clip": 1.59861147, + "balance_loss_mlp": 1.37180924, + "epoch": 0.03625432135878551, + "flos": 29468990396160.0, + "grad_norm": 1.8356301208711294, + "language_loss": 0.84219337, + "learning_rate": 3.999589870212761e-06, + "loss": 0.87604827, + "num_input_tokens_seen": 12735475, + "router_z_loss_clip": 3.45898438, + "router_z_loss_mlp": 0.69140625, + "step": 603, + "time_per_iteration": 3.133394479751587 + }, + { + "auxiliary_loss_clip": 0.01932313, + "auxiliary_loss_mlp": 0.01452384, + "balance_loss_clip": 1.60335541, + "balance_loss_mlp": 1.35291553, + "epoch": 0.03631444461145348, + "flos": 23517660654720.0, + "grad_norm": 1.8812945710816888, + "language_loss": 0.90467238, + "learning_rate": 3.9995819454587664e-06, + "loss": 0.93851942, + "num_input_tokens_seen": 12754540, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.99462891, + "step": 604, + "time_per_iteration": 2.9759998321533203 + }, + { + "auxiliary_loss_clip": 0.01928248, + "auxiliary_loss_mlp": 0.01443333, + "balance_loss_clip": 1.59704578, + "balance_loss_mlp": 1.35526109, + "epoch": 0.03637456786412145, + "flos": 16626882988800.0, + "grad_norm": 1.9062668440924413, + "language_loss": 0.8915112, + "learning_rate": 3.999573944880424e-06, + "loss": 0.92522705, + "num_input_tokens_seen": 12773050, + "router_z_loss_clip": 3.3125, + "router_z_loss_mlp": 0.88037109, + "step": 605, + "time_per_iteration": 3.019666910171509 + }, + { + "auxiliary_loss_clip": 0.01930871, + "auxiliary_loss_mlp": 0.01424753, + "balance_loss_clip": 1.59140468, + "balance_loss_mlp": 1.35794806, + "epoch": 0.03643469111678942, + "flos": 15860361265920.0, + "grad_norm": 2.8454459646878076, + "language_loss": 0.91682136, + "learning_rate": 3.9995658684780375e-06, + "loss": 0.95037758, + "num_input_tokens_seen": 12791240, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.66748047, + "step": 606, + "time_per_iteration": 2.983440399169922 + }, + { + "auxiliary_loss_clip": 0.01934032, + "auxiliary_loss_mlp": 0.01440346, + "balance_loss_clip": 1.59333563, + "balance_loss_mlp": 1.36409986, + "epoch": 0.03649481436945739, + "flos": 23630175768960.0, + "grad_norm": 2.23284893499446, + "language_loss": 0.8723774, + "learning_rate": 3.999557716251912e-06, + "loss": 0.90612119, + "num_input_tokens_seen": 12812245, + "router_z_loss_clip": 3.40429688, + "router_z_loss_mlp": 0.76220703, + "step": 607, + "time_per_iteration": 3.020555019378662 + }, + { + "auxiliary_loss_clip": 0.01912361, + "auxiliary_loss_mlp": 0.01424373, + "balance_loss_clip": 1.58429635, + "balance_loss_mlp": 1.35260868, + "epoch": 0.036554937622125354, + "flos": 21763839991680.0, + "grad_norm": 2.286078587468741, + "language_loss": 0.8705709, + "learning_rate": 3.999549488202358e-06, + "loss": 0.90393817, + "num_input_tokens_seen": 12831085, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.71826172, + "step": 608, + "time_per_iteration": 4.450667381286621 + }, + { + "auxiliary_loss_clip": 0.0191477, + "auxiliary_loss_mlp": 0.01426967, + "balance_loss_clip": 1.58949876, + "balance_loss_mlp": 1.35396314, + "epoch": 0.036615060874793326, + "flos": 17828217671040.0, + "grad_norm": 2.323977553959948, + "language_loss": 0.87034273, + "learning_rate": 3.999541184329688e-06, + "loss": 0.90376008, + "num_input_tokens_seen": 12849115, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.72998047, + "step": 609, + "time_per_iteration": 2.9909160137176514 + }, + { + "auxiliary_loss_clip": 0.01931804, + "auxiliary_loss_mlp": 0.01418835, + "balance_loss_clip": 1.59376013, + "balance_loss_mlp": 1.34921646, + "epoch": 0.0366751841274613, + "flos": 26763279488640.0, + "grad_norm": 2.1419707970279918, + "language_loss": 0.85734242, + "learning_rate": 3.999532804634215e-06, + "loss": 0.89084876, + "num_input_tokens_seen": 12868005, + "router_z_loss_clip": 3.3828125, + "router_z_loss_mlp": 0.69628906, + "step": 610, + "time_per_iteration": 5.774108409881592 + }, + { + "auxiliary_loss_clip": 0.01940297, + "auxiliary_loss_mlp": 0.01436349, + "balance_loss_clip": 1.60339344, + "balance_loss_mlp": 1.35905361, + "epoch": 0.03673530738012926, + "flos": 22206254077440.0, + "grad_norm": 1.926008475677365, + "language_loss": 0.92885894, + "learning_rate": 3.9995243491162575e-06, + "loss": 0.96262538, + "num_input_tokens_seen": 12886890, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.77392578, + "step": 611, + "time_per_iteration": 4.349674463272095 + }, + { + "auxiliary_loss_clip": 0.01946333, + "auxiliary_loss_mlp": 0.01431096, + "balance_loss_clip": 1.61029577, + "balance_loss_mlp": 1.35842562, + "epoch": 0.036795430632797235, + "flos": 24692545111680.0, + "grad_norm": 3.94706731067507, + "language_loss": 0.77191204, + "learning_rate": 3.999515817776136e-06, + "loss": 0.80568635, + "num_input_tokens_seen": 12906130, + "router_z_loss_clip": 3.36132812, + "router_z_loss_mlp": 0.72607422, + "step": 612, + "time_per_iteration": 2.924837350845337 + }, + { + "auxiliary_loss_clip": 0.01955085, + "auxiliary_loss_mlp": 0.01435615, + "balance_loss_clip": 1.6044724, + "balance_loss_mlp": 1.36351705, + "epoch": 0.0368555538854652, + "flos": 17757897972480.0, + "grad_norm": 2.5676435594147384, + "language_loss": 0.85787749, + "learning_rate": 3.999507210614175e-06, + "loss": 0.89178449, + "num_input_tokens_seen": 12925260, + "router_z_loss_clip": 3.50585938, + "router_z_loss_mlp": 0.72119141, + "step": 613, + "time_per_iteration": 3.089292287826538 + }, + { + "auxiliary_loss_clip": 0.01939063, + "auxiliary_loss_mlp": 0.01430163, + "balance_loss_clip": 1.60057998, + "balance_loss_mlp": 1.35835099, + "epoch": 0.03691567713813317, + "flos": 20604203032320.0, + "grad_norm": 3.155582607142536, + "language_loss": 0.9845022, + "learning_rate": 3.9994985276307e-06, + "loss": 1.01819444, + "num_input_tokens_seen": 12944590, + "router_z_loss_clip": 3.38671875, + "router_z_loss_mlp": 0.71826172, + "step": 614, + "time_per_iteration": 3.069502592086792 + }, + { + "auxiliary_loss_clip": 0.0197522, + "auxiliary_loss_mlp": 0.0143698, + "balance_loss_clip": 1.61723971, + "balance_loss_mlp": 1.35515499, + "epoch": 0.036975800390801145, + "flos": 33661839260160.0, + "grad_norm": 3.0095886195578037, + "language_loss": 0.82343149, + "learning_rate": 3.999489768826041e-06, + "loss": 0.85755348, + "num_input_tokens_seen": 12964785, + "router_z_loss_clip": 3.578125, + "router_z_loss_mlp": 0.81835938, + "step": 615, + "time_per_iteration": 3.0997555255889893 + }, + { + "auxiliary_loss_clip": 0.01984445, + "auxiliary_loss_mlp": 0.01453485, + "balance_loss_clip": 1.62321246, + "balance_loss_mlp": 1.37618971, + "epoch": 0.03703592364346911, + "flos": 28305507628800.0, + "grad_norm": 1.601603677165002, + "language_loss": 0.87257296, + "learning_rate": 3.999480934200528e-06, + "loss": 0.90695226, + "num_input_tokens_seen": 12986705, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 0.7734375, + "step": 616, + "time_per_iteration": 3.0701889991760254 + }, + { + "auxiliary_loss_clip": 0.0199195, + "auxiliary_loss_mlp": 0.01430348, + "balance_loss_clip": 1.62976766, + "balance_loss_mlp": 1.3622086, + "epoch": 0.03709604689613708, + "flos": 31516803642240.0, + "grad_norm": 2.177943192355028, + "language_loss": 0.73765069, + "learning_rate": 3.999472023754499e-06, + "loss": 0.77187365, + "num_input_tokens_seen": 13010560, + "router_z_loss_clip": 3.61914062, + "router_z_loss_mlp": 0.68115234, + "step": 617, + "time_per_iteration": 3.031184673309326 + }, + { + "auxiliary_loss_clip": 0.01965922, + "auxiliary_loss_mlp": 0.0143351, + "balance_loss_clip": 1.62356257, + "balance_loss_mlp": 1.36660957, + "epoch": 0.03715617014880505, + "flos": 19618759128960.0, + "grad_norm": 2.112586893724527, + "language_loss": 0.85774469, + "learning_rate": 3.99946303748829e-06, + "loss": 0.89173907, + "num_input_tokens_seen": 13028935, + "router_z_loss_clip": 3.41796875, + "router_z_loss_mlp": 0.66894531, + "step": 618, + "time_per_iteration": 3.038404941558838 + }, + { + "auxiliary_loss_clip": 0.0198962, + "auxiliary_loss_mlp": 0.01546613, + "balance_loss_clip": 1.62751937, + "balance_loss_mlp": 1.38229489, + "epoch": 0.03721629340147302, + "flos": 15932581246080.0, + "grad_norm": 2.2050070208031958, + "language_loss": 0.95398563, + "learning_rate": 3.999453975402242e-06, + "loss": 0.98934793, + "num_input_tokens_seen": 13046000, + "router_z_loss_clip": 3.62304688, + "router_z_loss_mlp": 1.64404297, + "step": 619, + "time_per_iteration": 2.9268627166748047 + }, + { + "auxiliary_loss_clip": 0.01990775, + "auxiliary_loss_mlp": 0.01464894, + "balance_loss_clip": 1.62606525, + "balance_loss_mlp": 1.38869524, + "epoch": 0.03727641665414099, + "flos": 21113543456640.0, + "grad_norm": 2.093550364324197, + "language_loss": 0.98569179, + "learning_rate": 3.9994448374967e-06, + "loss": 1.02024841, + "num_input_tokens_seen": 13062995, + "router_z_loss_clip": 3.6484375, + "router_z_loss_mlp": 0.76171875, + "step": 620, + "time_per_iteration": 2.9641520977020264 + }, + { + "auxiliary_loss_clip": 0.02010425, + "auxiliary_loss_mlp": 0.01458263, + "balance_loss_clip": 1.63691294, + "balance_loss_mlp": 1.3900274, + "epoch": 0.037336539906808956, + "flos": 24141733188480.0, + "grad_norm": 1.8063927247271854, + "language_loss": 0.8125475, + "learning_rate": 3.999435623772008e-06, + "loss": 0.84723437, + "num_input_tokens_seen": 13084120, + "router_z_loss_clip": 3.73632812, + "router_z_loss_mlp": 0.68212891, + "step": 621, + "time_per_iteration": 3.092707633972168 + }, + { + "auxiliary_loss_clip": 0.01988741, + "auxiliary_loss_mlp": 0.01445774, + "balance_loss_clip": 1.63122034, + "balance_loss_mlp": 1.37977982, + "epoch": 0.03739666315947693, + "flos": 22356621106560.0, + "grad_norm": 2.5836089076134385, + "language_loss": 0.9271642, + "learning_rate": 3.999426334228518e-06, + "loss": 0.96150929, + "num_input_tokens_seen": 13100035, + "router_z_loss_clip": 3.57226562, + "router_z_loss_mlp": 0.65917969, + "step": 622, + "time_per_iteration": 3.104374647140503 + }, + { + "auxiliary_loss_clip": 0.01991309, + "auxiliary_loss_mlp": 0.01435521, + "balance_loss_clip": 1.63254762, + "balance_loss_mlp": 1.36900246, + "epoch": 0.0374567864121449, + "flos": 20459355868800.0, + "grad_norm": 2.056702133791348, + "language_loss": 0.9466567, + "learning_rate": 3.999416968866581e-06, + "loss": 0.98092502, + "num_input_tokens_seen": 13118070, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.66503906, + "step": 623, + "time_per_iteration": 3.034270763397217 + }, + { + "auxiliary_loss_clip": 0.02000277, + "auxiliary_loss_mlp": 0.01465206, + "balance_loss_clip": 1.63302922, + "balance_loss_mlp": 1.38400054, + "epoch": 0.037516909664812866, + "flos": 19217318849280.0, + "grad_norm": 2.7509971422018205, + "language_loss": 0.87283587, + "learning_rate": 3.999407527686551e-06, + "loss": 0.90749079, + "num_input_tokens_seen": 13136355, + "router_z_loss_clip": 3.67578125, + "router_z_loss_mlp": 0.81152344, + "step": 624, + "time_per_iteration": 3.065488815307617 + }, + { + "auxiliary_loss_clip": 0.01989801, + "auxiliary_loss_mlp": 0.01431066, + "balance_loss_clip": 1.62796307, + "balance_loss_mlp": 1.36645436, + "epoch": 0.03757703291748084, + "flos": 35018291675520.0, + "grad_norm": 2.338898533237047, + "language_loss": 0.7313059, + "learning_rate": 3.999398010688788e-06, + "loss": 0.76551461, + "num_input_tokens_seen": 13155435, + "router_z_loss_clip": 3.6171875, + "router_z_loss_mlp": 0.64648438, + "step": 625, + "time_per_iteration": 3.1954362392425537 + }, + { + "auxiliary_loss_clip": 0.01981862, + "auxiliary_loss_mlp": 0.0143992, + "balance_loss_clip": 1.62162209, + "balance_loss_mlp": 1.36601055, + "epoch": 0.0376371561701488, + "flos": 25494882733440.0, + "grad_norm": 1.9093250920311804, + "language_loss": 0.80136228, + "learning_rate": 3.999388417873652e-06, + "loss": 0.83558005, + "num_input_tokens_seen": 13174295, + "router_z_loss_clip": 3.59765625, + "router_z_loss_mlp": 0.73925781, + "step": 626, + "time_per_iteration": 3.1030514240264893 + }, + { + "auxiliary_loss_clip": 0.01965828, + "auxiliary_loss_mlp": 0.0142893, + "balance_loss_clip": 1.60747957, + "balance_loss_mlp": 1.36059904, + "epoch": 0.037697279422816775, + "flos": 18194430234240.0, + "grad_norm": 2.83098725595125, + "language_loss": 0.84185112, + "learning_rate": 3.999378749241506e-06, + "loss": 0.87579876, + "num_input_tokens_seen": 13192500, + "router_z_loss_clip": 3.58398438, + "router_z_loss_mlp": 0.68359375, + "step": 627, + "time_per_iteration": 3.173980951309204 + }, + { + "auxiliary_loss_clip": 0.01965883, + "auxiliary_loss_mlp": 0.01439176, + "balance_loss_clip": 1.60075998, + "balance_loss_mlp": 1.36946237, + "epoch": 0.03775740267548475, + "flos": 24654919420800.0, + "grad_norm": 1.705400091856147, + "language_loss": 0.91859508, + "learning_rate": 3.999369004792719e-06, + "loss": 0.95264566, + "num_input_tokens_seen": 13213470, + "router_z_loss_clip": 3.65039062, + "router_z_loss_mlp": 0.69628906, + "step": 628, + "time_per_iteration": 3.044499397277832 + }, + { + "auxiliary_loss_clip": 0.01937686, + "auxiliary_loss_mlp": 0.01440135, + "balance_loss_clip": 1.58500576, + "balance_loss_mlp": 1.36493731, + "epoch": 0.03781752592815271, + "flos": 21298142816640.0, + "grad_norm": 2.0619044621492413, + "language_loss": 0.83408713, + "learning_rate": 3.999359184527658e-06, + "loss": 0.86786532, + "num_input_tokens_seen": 13232365, + "router_z_loss_clip": 3.5234375, + "router_z_loss_mlp": 0.75195312, + "step": 629, + "time_per_iteration": 2.999502420425415 + }, + { + "auxiliary_loss_clip": 0.01938917, + "auxiliary_loss_mlp": 0.01425595, + "balance_loss_clip": 1.5889039, + "balance_loss_mlp": 1.35807514, + "epoch": 0.037877649180820684, + "flos": 22099439808000.0, + "grad_norm": 1.7071060015623658, + "language_loss": 0.82495666, + "learning_rate": 3.999349288446696e-06, + "loss": 0.85860181, + "num_input_tokens_seen": 13251920, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 0.67480469, + "step": 630, + "time_per_iteration": 2.98579740524292 + }, + { + "auxiliary_loss_clip": 0.0194274, + "auxiliary_loss_mlp": 0.01420214, + "balance_loss_clip": 1.58717513, + "balance_loss_mlp": 1.3508817, + "epoch": 0.03793777243348865, + "flos": 14509383471360.0, + "grad_norm": 2.6494900699367276, + "language_loss": 0.98618186, + "learning_rate": 3.99933931655021e-06, + "loss": 1.01981139, + "num_input_tokens_seen": 13267440, + "router_z_loss_clip": 3.5546875, + "router_z_loss_mlp": 0.69335938, + "step": 631, + "time_per_iteration": 3.008906126022339 + }, + { + "auxiliary_loss_clip": 0.01899916, + "auxiliary_loss_mlp": 0.01432099, + "balance_loss_clip": 1.56636965, + "balance_loss_mlp": 1.36104989, + "epoch": 0.03799789568615662, + "flos": 21918505276800.0, + "grad_norm": 1.8301356800961104, + "language_loss": 0.938528, + "learning_rate": 3.999329268838575e-06, + "loss": 0.97184813, + "num_input_tokens_seen": 13287850, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.71044922, + "step": 632, + "time_per_iteration": 2.9540345668792725 + }, + { + "auxiliary_loss_clip": 0.01927582, + "auxiliary_loss_mlp": 0.01439588, + "balance_loss_clip": 1.5803442, + "balance_loss_mlp": 1.36529696, + "epoch": 0.03805801893882459, + "flos": 24837799478400.0, + "grad_norm": 1.7480285827524378, + "language_loss": 0.8796767, + "learning_rate": 3.999319145312175e-06, + "loss": 0.91334844, + "num_input_tokens_seen": 13307760, + "router_z_loss_clip": 3.47070312, + "router_z_loss_mlp": 0.7421875, + "step": 633, + "time_per_iteration": 2.9677391052246094 + }, + { + "auxiliary_loss_clip": 0.01935275, + "auxiliary_loss_mlp": 0.01432711, + "balance_loss_clip": 1.58136535, + "balance_loss_mlp": 1.35918283, + "epoch": 0.03811814219149256, + "flos": 30495362860800.0, + "grad_norm": 2.3455166878697504, + "language_loss": 0.73889506, + "learning_rate": 3.999308945971392e-06, + "loss": 0.77257496, + "num_input_tokens_seen": 13331230, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.73583984, + "step": 634, + "time_per_iteration": 3.0682854652404785 + }, + { + "auxiliary_loss_clip": 0.01734273, + "auxiliary_loss_mlp": 0.0138631, + "balance_loss_clip": 1.46508932, + "balance_loss_mlp": 1.33290446, + "epoch": 0.03817826544416053, + "flos": 67020654960000.0, + "grad_norm": 0.9123790216905204, + "language_loss": 0.61735392, + "learning_rate": 3.999298670816614e-06, + "loss": 0.64855975, + "num_input_tokens_seen": 13394760, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.53515625, + "step": 635, + "time_per_iteration": 3.4068970680236816 + }, + { + "auxiliary_loss_clip": 0.019051, + "auxiliary_loss_mlp": 0.01442545, + "balance_loss_clip": 1.5672183, + "balance_loss_mlp": 1.35857415, + "epoch": 0.038238388696828496, + "flos": 20495036033280.0, + "grad_norm": 2.299456674945861, + "language_loss": 0.89363319, + "learning_rate": 3.9992883198482294e-06, + "loss": 0.9271096, + "num_input_tokens_seen": 13412775, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.83984375, + "step": 636, + "time_per_iteration": 2.9538116455078125 + }, + { + "auxiliary_loss_clip": 0.01904595, + "auxiliary_loss_mlp": 0.01423407, + "balance_loss_clip": 1.55946088, + "balance_loss_mlp": 1.35374093, + "epoch": 0.03829851194949647, + "flos": 17974422178560.0, + "grad_norm": 2.527381808393012, + "language_loss": 0.86820251, + "learning_rate": 3.999277893066632e-06, + "loss": 0.90148252, + "num_input_tokens_seen": 13427835, + "router_z_loss_clip": 3.453125, + "router_z_loss_mlp": 0.69677734, + "step": 637, + "time_per_iteration": 3.0464611053466797 + }, + { + "auxiliary_loss_clip": 0.01913999, + "auxiliary_loss_mlp": 0.01440562, + "balance_loss_clip": 1.56176019, + "balance_loss_mlp": 1.36412489, + "epoch": 0.03835863520216444, + "flos": 22466964470400.0, + "grad_norm": 1.8864941358407028, + "language_loss": 0.89376199, + "learning_rate": 3.999267390472215e-06, + "loss": 0.92730761, + "num_input_tokens_seen": 13447295, + "router_z_loss_clip": 3.52148438, + "router_z_loss_mlp": 0.76513672, + "step": 638, + "time_per_iteration": 2.938405990600586 + }, + { + "auxiliary_loss_clip": 0.01917575, + "auxiliary_loss_mlp": 0.01436482, + "balance_loss_clip": 1.55777884, + "balance_loss_mlp": 1.35947239, + "epoch": 0.038418758454832405, + "flos": 22174736434560.0, + "grad_norm": 2.5582157337740594, + "language_loss": 0.75675887, + "learning_rate": 3.999256812065381e-06, + "loss": 0.79029942, + "num_input_tokens_seen": 13468455, + "router_z_loss_clip": 3.59570312, + "router_z_loss_mlp": 0.77050781, + "step": 639, + "time_per_iteration": 3.016514539718628 + }, + { + "auxiliary_loss_clip": 0.01907902, + "auxiliary_loss_mlp": 0.0144234, + "balance_loss_clip": 1.55320704, + "balance_loss_mlp": 1.35608017, + "epoch": 0.03847888170750038, + "flos": 22757699427840.0, + "grad_norm": 2.1302036355985834, + "language_loss": 0.92118192, + "learning_rate": 3.999246157846526e-06, + "loss": 0.95468438, + "num_input_tokens_seen": 13489085, + "router_z_loss_clip": 3.54882812, + "router_z_loss_mlp": 0.86230469, + "step": 640, + "time_per_iteration": 2.9296979904174805 + }, + { + "auxiliary_loss_clip": 0.01899025, + "auxiliary_loss_mlp": 0.01429637, + "balance_loss_clip": 1.55108666, + "balance_loss_mlp": 1.3654542, + "epoch": 0.03853900496016834, + "flos": 22721521570560.0, + "grad_norm": 1.8983695019685312, + "language_loss": 0.87132448, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.90461111, + "num_input_tokens_seen": 13509120, + "router_z_loss_clip": 3.47851562, + "router_z_loss_mlp": 0.64160156, + "step": 641, + "time_per_iteration": 3.0239412784576416 + }, + { + "auxiliary_loss_clip": 0.01695042, + "auxiliary_loss_mlp": 0.01373602, + "balance_loss_clip": 1.43437839, + "balance_loss_mlp": 1.32839739, + "epoch": 0.038599128212836314, + "flos": 70431571607040.0, + "grad_norm": 0.9020523739176979, + "language_loss": 0.65704423, + "learning_rate": 3.999224621974381e-06, + "loss": 0.68773061, + "num_input_tokens_seen": 13562005, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.45117188, + "step": 642, + "time_per_iteration": 3.3538568019866943 + }, + { + "auxiliary_loss_clip": 0.01864784, + "auxiliary_loss_mlp": 0.01436553, + "balance_loss_clip": 1.52845466, + "balance_loss_mlp": 1.37122655, + "epoch": 0.03865925146550429, + "flos": 23305660928640.0, + "grad_norm": 1.7744260865943782, + "language_loss": 0.84141541, + "learning_rate": 3.999213740321906e-06, + "loss": 0.87442875, + "num_input_tokens_seen": 13582185, + "router_z_loss_clip": 3.36523438, + "router_z_loss_mlp": 0.65332031, + "step": 643, + "time_per_iteration": 4.3217246532440186 + }, + { + "auxiliary_loss_clip": 0.01863137, + "auxiliary_loss_mlp": 0.01426457, + "balance_loss_clip": 1.51912498, + "balance_loss_mlp": 1.35311985, + "epoch": 0.03871937471817225, + "flos": 21439460885760.0, + "grad_norm": 1.716062926633632, + "language_loss": 0.85317433, + "learning_rate": 3.999202782859046e-06, + "loss": 0.88607025, + "num_input_tokens_seen": 13599555, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 0.73291016, + "step": 644, + "time_per_iteration": 2.926156759262085 + }, + { + "auxiliary_loss_clip": 0.01866781, + "auxiliary_loss_mlp": 0.01420001, + "balance_loss_clip": 1.52869511, + "balance_loss_mlp": 1.35748792, + "epoch": 0.038779497970840224, + "flos": 34290662497920.0, + "grad_norm": 2.0923708374500243, + "language_loss": 0.85123581, + "learning_rate": 3.9991917495862165e-06, + "loss": 0.88410366, + "num_input_tokens_seen": 13621160, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.62451172, + "step": 645, + "time_per_iteration": 4.45543360710144 + }, + { + "auxiliary_loss_clip": 0.01864505, + "auxiliary_loss_mlp": 0.0142313, + "balance_loss_clip": 1.52902365, + "balance_loss_mlp": 1.35823202, + "epoch": 0.03883962122350819, + "flos": 22758649568640.0, + "grad_norm": 2.7283505714124883, + "language_loss": 0.86663806, + "learning_rate": 3.9991806405038345e-06, + "loss": 0.89951444, + "num_input_tokens_seen": 13641915, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.64868164, + "step": 646, + "time_per_iteration": 4.4281065464019775 + }, + { + "auxiliary_loss_clip": 0.01845573, + "auxiliary_loss_mlp": 0.01421225, + "balance_loss_clip": 1.51264119, + "balance_loss_mlp": 1.35833037, + "epoch": 0.03889974447617616, + "flos": 21956176212480.0, + "grad_norm": 3.326580865876682, + "language_loss": 0.85393858, + "learning_rate": 3.999169455612323e-06, + "loss": 0.88660657, + "num_input_tokens_seen": 13661410, + "router_z_loss_clip": 3.33007812, + "router_z_loss_mlp": 0.62890625, + "step": 647, + "time_per_iteration": 2.881925582885742 + }, + { + "auxiliary_loss_clip": 0.01845996, + "auxiliary_loss_mlp": 0.01415517, + "balance_loss_clip": 1.51274228, + "balance_loss_mlp": 1.35128653, + "epoch": 0.03895986772884413, + "flos": 31517572803840.0, + "grad_norm": 2.263818984224509, + "language_loss": 0.89751983, + "learning_rate": 3.999158194912106e-06, + "loss": 0.93013501, + "num_input_tokens_seen": 13681705, + "router_z_loss_clip": 3.33203125, + "router_z_loss_mlp": 0.64208984, + "step": 648, + "time_per_iteration": 2.986081838607788 + }, + { + "auxiliary_loss_clip": 0.01847286, + "auxiliary_loss_mlp": 0.01415413, + "balance_loss_clip": 1.51501799, + "balance_loss_mlp": 1.35318518, + "epoch": 0.0390199909815121, + "flos": 19910398982400.0, + "grad_norm": 1.8080214588962245, + "language_loss": 0.87803572, + "learning_rate": 3.9991468584036086e-06, + "loss": 0.91066277, + "num_input_tokens_seen": 13700400, + "router_z_loss_clip": 3.31835938, + "router_z_loss_mlp": 0.62207031, + "step": 649, + "time_per_iteration": 2.902810573577881 + }, + { + "auxiliary_loss_clip": 0.01867056, + "auxiliary_loss_mlp": 0.01418014, + "balance_loss_clip": 1.52446318, + "balance_loss_mlp": 1.35678792, + "epoch": 0.03908011423418007, + "flos": 21621436047360.0, + "grad_norm": 1.7147108180707142, + "language_loss": 0.82984376, + "learning_rate": 3.999135446087263e-06, + "loss": 0.8626945, + "num_input_tokens_seen": 13720145, + "router_z_loss_clip": 3.42382812, + "router_z_loss_mlp": 0.61230469, + "step": 650, + "time_per_iteration": 2.916159152984619 + }, + { + "auxiliary_loss_clip": 0.01848007, + "auxiliary_loss_mlp": 0.01419496, + "balance_loss_clip": 1.51558495, + "balance_loss_mlp": 1.35622001, + "epoch": 0.039140237486848035, + "flos": 18670714692480.0, + "grad_norm": 2.1479990763192647, + "language_loss": 0.82980067, + "learning_rate": 3.9991239579635e-06, + "loss": 0.86247569, + "num_input_tokens_seen": 13737500, + "router_z_loss_clip": 3.3203125, + "router_z_loss_mlp": 0.63232422, + "step": 651, + "time_per_iteration": 2.916741132736206 + }, + { + "auxiliary_loss_clip": 0.01865738, + "auxiliary_loss_mlp": 0.01422804, + "balance_loss_clip": 1.52524722, + "balance_loss_mlp": 1.36148286, + "epoch": 0.03920036073951601, + "flos": 18670352734080.0, + "grad_norm": 2.189500565319255, + "language_loss": 0.91284573, + "learning_rate": 3.999112394032757e-06, + "loss": 0.94573104, + "num_input_tokens_seen": 13754750, + "router_z_loss_clip": 3.40820312, + "router_z_loss_mlp": 0.61328125, + "step": 652, + "time_per_iteration": 2.8671233654022217 + }, + { + "auxiliary_loss_clip": 0.01856369, + "auxiliary_loss_mlp": 0.01414307, + "balance_loss_clip": 1.51763535, + "balance_loss_mlp": 1.35079229, + "epoch": 0.03926048399218398, + "flos": 31366029409920.0, + "grad_norm": 2.516393821812266, + "language_loss": 0.86500233, + "learning_rate": 3.999100754295471e-06, + "loss": 0.89770901, + "num_input_tokens_seen": 13771990, + "router_z_loss_clip": 3.390625, + "router_z_loss_mlp": 0.63574219, + "step": 653, + "time_per_iteration": 3.0481810569763184 + }, + { + "auxiliary_loss_clip": 0.01910817, + "auxiliary_loss_mlp": 0.01429159, + "balance_loss_clip": 1.54609156, + "balance_loss_mlp": 1.36268783, + "epoch": 0.039320607244851945, + "flos": 29614697210880.0, + "grad_norm": 1.9226558760436334, + "language_loss": 0.90441477, + "learning_rate": 3.999089038752085e-06, + "loss": 0.93781459, + "num_input_tokens_seen": 13792750, + "router_z_loss_clip": 3.64453125, + "router_z_loss_mlp": 0.6640625, + "step": 654, + "time_per_iteration": 3.0295088291168213 + }, + { + "auxiliary_loss_clip": 0.0166176, + "auxiliary_loss_mlp": 0.01389718, + "balance_loss_clip": 1.41711056, + "balance_loss_mlp": 1.35214365, + "epoch": 0.03938073049751992, + "flos": 66566811922560.0, + "grad_norm": 0.7550562334137961, + "language_loss": 0.50148559, + "learning_rate": 3.999077247403041e-06, + "loss": 0.53200042, + "num_input_tokens_seen": 13858570, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.375, + "step": 655, + "time_per_iteration": 3.4123547077178955 + }, + { + "auxiliary_loss_clip": 0.01862617, + "auxiliary_loss_mlp": 0.0142485, + "balance_loss_clip": 1.52751708, + "balance_loss_mlp": 1.36872637, + "epoch": 0.03944085375018788, + "flos": 23378288112000.0, + "grad_norm": 2.0602852960731664, + "language_loss": 0.82973337, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.86260808, + "num_input_tokens_seen": 13876335, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.56103516, + "step": 656, + "time_per_iteration": 2.993741750717163 + }, + { + "auxiliary_loss_clip": 0.01920202, + "auxiliary_loss_mlp": 0.01433585, + "balance_loss_clip": 1.55846417, + "balance_loss_mlp": 1.36959374, + "epoch": 0.039500977002855854, + "flos": 18556616010240.0, + "grad_norm": 2.418791301111852, + "language_loss": 0.84842271, + "learning_rate": 3.999053437289776e-06, + "loss": 0.88196057, + "num_input_tokens_seen": 13892640, + "router_z_loss_clip": 3.61523438, + "router_z_loss_mlp": 0.64013672, + "step": 657, + "time_per_iteration": 2.9691553115844727 + }, + { + "auxiliary_loss_clip": 0.01898823, + "auxiliary_loss_mlp": 0.01431643, + "balance_loss_clip": 1.54260051, + "balance_loss_mlp": 1.36979771, + "epoch": 0.039561100255523826, + "flos": 25349175918720.0, + "grad_norm": 2.005841512242625, + "language_loss": 0.85498816, + "learning_rate": 3.999041418526457e-06, + "loss": 0.88829285, + "num_input_tokens_seen": 13910085, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.61865234, + "step": 658, + "time_per_iteration": 2.9596269130706787 + }, + { + "auxiliary_loss_clip": 0.01888085, + "auxiliary_loss_mlp": 0.01426321, + "balance_loss_clip": 1.54105771, + "balance_loss_mlp": 1.36728859, + "epoch": 0.03962122350819179, + "flos": 18228119627520.0, + "grad_norm": 1.8234646566849932, + "language_loss": 0.9473331, + "learning_rate": 3.999029323959287e-06, + "loss": 0.98047709, + "num_input_tokens_seen": 13928800, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.59033203, + "step": 659, + "time_per_iteration": 2.960434675216675 + }, + { + "auxiliary_loss_clip": 0.01911292, + "auxiliary_loss_mlp": 0.01426167, + "balance_loss_clip": 1.55172038, + "balance_loss_mlp": 1.36546612, + "epoch": 0.03968134676085976, + "flos": 20532028296960.0, + "grad_norm": 2.123162369032689, + "language_loss": 0.83870721, + "learning_rate": 3.999017153588724e-06, + "loss": 0.87208176, + "num_input_tokens_seen": 13948325, + "router_z_loss_clip": 3.59375, + "router_z_loss_mlp": 0.60742188, + "step": 660, + "time_per_iteration": 3.0618982315063477 + }, + { + "auxiliary_loss_clip": 0.01913527, + "auxiliary_loss_mlp": 0.01430668, + "balance_loss_clip": 1.55949771, + "balance_loss_mlp": 1.36434054, + "epoch": 0.03974147001352773, + "flos": 22433682280320.0, + "grad_norm": 1.9628177620222127, + "language_loss": 0.84569657, + "learning_rate": 3.999004907415231e-06, + "loss": 0.87913853, + "num_input_tokens_seen": 13969090, + "router_z_loss_clip": 3.53710938, + "router_z_loss_mlp": 0.66308594, + "step": 661, + "time_per_iteration": 3.0876083374023438 + }, + { + "auxiliary_loss_clip": 0.0169653, + "auxiliary_loss_mlp": 0.01372212, + "balance_loss_clip": 1.44540238, + "balance_loss_mlp": 1.3430295, + "epoch": 0.0398015932661957, + "flos": 71161372535040.0, + "grad_norm": 0.9321118218152142, + "language_loss": 0.69561112, + "learning_rate": 3.998992585439272e-06, + "loss": 0.72629851, + "num_input_tokens_seen": 14037555, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.29101562, + "step": 662, + "time_per_iteration": 3.5154201984405518 + }, + { + "auxiliary_loss_clip": 0.01934405, + "auxiliary_loss_mlp": 0.01436495, + "balance_loss_clip": 1.57016182, + "balance_loss_mlp": 1.37202692, + "epoch": 0.03986171651886367, + "flos": 16809853536000.0, + "grad_norm": 1.7963993071818343, + "language_loss": 0.86277544, + "learning_rate": 3.998980187661314e-06, + "loss": 0.89648449, + "num_input_tokens_seen": 14055765, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.64453125, + "step": 663, + "time_per_iteration": 2.8989055156707764 + }, + { + "auxiliary_loss_clip": 0.01926693, + "auxiliary_loss_mlp": 0.01420133, + "balance_loss_clip": 1.56250846, + "balance_loss_mlp": 1.35881209, + "epoch": 0.03992183977153164, + "flos": 24545752421760.0, + "grad_norm": 1.9511389514799267, + "language_loss": 0.90215009, + "learning_rate": 3.998967714081826e-06, + "loss": 0.93561834, + "num_input_tokens_seen": 14074195, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.61376953, + "step": 664, + "time_per_iteration": 3.0310490131378174 + }, + { + "auxiliary_loss_clip": 0.01870944, + "auxiliary_loss_mlp": 0.01412269, + "balance_loss_clip": 1.54272759, + "balance_loss_mlp": 1.35221159, + "epoch": 0.03998196302419961, + "flos": 15604220597760.0, + "grad_norm": 1.9292840558995679, + "language_loss": 0.8748585, + "learning_rate": 3.998955164701281e-06, + "loss": 0.90769064, + "num_input_tokens_seen": 14090215, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.60107422, + "step": 665, + "time_per_iteration": 2.916304111480713 + }, + { + "auxiliary_loss_clip": 0.01928323, + "auxiliary_loss_mlp": 0.01431631, + "balance_loss_clip": 1.5622592, + "balance_loss_mlp": 1.36563671, + "epoch": 0.04004208627686758, + "flos": 25316934359040.0, + "grad_norm": 1.94374697864009, + "language_loss": 0.84413898, + "learning_rate": 3.998942539520158e-06, + "loss": 0.87773848, + "num_input_tokens_seen": 14112150, + "router_z_loss_clip": 3.66210938, + "router_z_loss_mlp": 0.65966797, + "step": 666, + "time_per_iteration": 2.979661226272583 + }, + { + "auxiliary_loss_clip": 0.01892872, + "auxiliary_loss_mlp": 0.01415505, + "balance_loss_clip": 1.54545093, + "balance_loss_mlp": 1.35475636, + "epoch": 0.04010220952953555, + "flos": 23485962032640.0, + "grad_norm": 1.8909717933166827, + "language_loss": 0.90105104, + "learning_rate": 3.998929838538932e-06, + "loss": 0.93413484, + "num_input_tokens_seen": 14131475, + "router_z_loss_clip": 3.47265625, + "router_z_loss_mlp": 0.60742188, + "step": 667, + "time_per_iteration": 2.935433864593506 + }, + { + "auxiliary_loss_clip": 0.01877965, + "auxiliary_loss_mlp": 0.01422149, + "balance_loss_clip": 1.54295158, + "balance_loss_mlp": 1.3608036, + "epoch": 0.04016233278220352, + "flos": 18624266265600.0, + "grad_norm": 2.211125468405515, + "language_loss": 0.85424697, + "learning_rate": 3.998917061758087e-06, + "loss": 0.8872481, + "num_input_tokens_seen": 14146165, + "router_z_loss_clip": 3.34960938, + "router_z_loss_mlp": 0.61352539, + "step": 668, + "time_per_iteration": 2.887366771697998 + }, + { + "auxiliary_loss_clip": 0.01661343, + "auxiliary_loss_mlp": 0.01386464, + "balance_loss_clip": 1.41890478, + "balance_loss_mlp": 1.34755397, + "epoch": 0.040222456034871484, + "flos": 70937790140160.0, + "grad_norm": 0.8589919875942349, + "language_loss": 0.60309774, + "learning_rate": 3.998904209178107e-06, + "loss": 0.6335758, + "num_input_tokens_seen": 14215005, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.38867188, + "step": 669, + "time_per_iteration": 3.48913311958313 + }, + { + "auxiliary_loss_clip": 0.01886383, + "auxiliary_loss_mlp": 0.01420693, + "balance_loss_clip": 1.53702879, + "balance_loss_mlp": 1.35202837, + "epoch": 0.040282579287539456, + "flos": 23774118036480.0, + "grad_norm": 1.8331547380817905, + "language_loss": 0.89137757, + "learning_rate": 3.9988912807994785e-06, + "loss": 0.92444831, + "num_input_tokens_seen": 14235510, + "router_z_loss_clip": 3.4921875, + "router_z_loss_mlp": 0.68701172, + "step": 670, + "time_per_iteration": 2.9291563034057617 + }, + { + "auxiliary_loss_clip": 0.01876047, + "auxiliary_loss_mlp": 0.01417794, + "balance_loss_clip": 1.53944075, + "balance_loss_mlp": 1.3616941, + "epoch": 0.04034270254020743, + "flos": 18487653655680.0, + "grad_norm": 4.428579487525348, + "language_loss": 0.77549636, + "learning_rate": 3.998878276622692e-06, + "loss": 0.80843484, + "num_input_tokens_seen": 14254565, + "router_z_loss_clip": 3.3671875, + "router_z_loss_mlp": 0.56079102, + "step": 671, + "time_per_iteration": 2.951420783996582 + }, + { + "auxiliary_loss_clip": 0.01881956, + "auxiliary_loss_mlp": 0.0142333, + "balance_loss_clip": 1.5373559, + "balance_loss_mlp": 1.36105514, + "epoch": 0.040402825792875394, + "flos": 17210750878080.0, + "grad_norm": 2.1542671324686564, + "language_loss": 0.96062577, + "learning_rate": 3.998865196648242e-06, + "loss": 0.99367857, + "num_input_tokens_seen": 14271885, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.62255859, + "step": 672, + "time_per_iteration": 2.8764255046844482 + }, + { + "auxiliary_loss_clip": 0.01877707, + "auxiliary_loss_mlp": 0.01441526, + "balance_loss_clip": 1.53363895, + "balance_loss_mlp": 1.36971474, + "epoch": 0.040462949045543366, + "flos": 19181593440000.0, + "grad_norm": 1.934278569793815, + "language_loss": 0.9276436, + "learning_rate": 3.998852040876622e-06, + "loss": 0.96083587, + "num_input_tokens_seen": 14289670, + "router_z_loss_clip": 3.44140625, + "router_z_loss_mlp": 0.7175293, + "step": 673, + "time_per_iteration": 2.9121477603912354 + }, + { + "auxiliary_loss_clip": 0.018724, + "auxiliary_loss_mlp": 0.01432367, + "balance_loss_clip": 1.53451514, + "balance_loss_mlp": 1.36961508, + "epoch": 0.04052307229821133, + "flos": 24029218074240.0, + "grad_norm": 1.803658760214416, + "language_loss": 0.77880013, + "learning_rate": 3.998838809308334e-06, + "loss": 0.81184781, + "num_input_tokens_seen": 14309285, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.62744141, + "step": 674, + "time_per_iteration": 2.9818060398101807 + }, + { + "auxiliary_loss_clip": 0.01900931, + "auxiliary_loss_mlp": 0.01423454, + "balance_loss_clip": 1.54986596, + "balance_loss_mlp": 1.3602736, + "epoch": 0.0405831955508793, + "flos": 16445541254400.0, + "grad_norm": 2.4015005255456257, + "language_loss": 0.82635844, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.85960227, + "num_input_tokens_seen": 14328300, + "router_z_loss_clip": 3.51171875, + "router_z_loss_mlp": 0.63208008, + "step": 675, + "time_per_iteration": 2.879110813140869 + }, + { + "auxiliary_loss_clip": 0.01857873, + "auxiliary_loss_mlp": 0.01427985, + "balance_loss_clip": 1.52898538, + "balance_loss_mlp": 1.37038267, + "epoch": 0.040643318803547275, + "flos": 24290607139200.0, + "grad_norm": 1.580471145716994, + "language_loss": 0.79494107, + "learning_rate": 3.998812118783757e-06, + "loss": 0.82779962, + "num_input_tokens_seen": 14346395, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.57543945, + "step": 676, + "time_per_iteration": 2.9655325412750244 + }, + { + "auxiliary_loss_clip": 0.01889619, + "auxiliary_loss_mlp": 0.01434585, + "balance_loss_clip": 1.54212189, + "balance_loss_mlp": 1.37977242, + "epoch": 0.04070344205621524, + "flos": 17720950953600.0, + "grad_norm": 2.105352965424125, + "language_loss": 0.88311815, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.9163602, + "num_input_tokens_seen": 14364605, + "router_z_loss_clip": 3.47265625, + "router_z_loss_mlp": 0.54736328, + "step": 677, + "time_per_iteration": 4.350664377212524 + }, + { + "auxiliary_loss_clip": 0.01890635, + "auxiliary_loss_mlp": 0.01419903, + "balance_loss_clip": 1.55372143, + "balance_loss_mlp": 1.36110914, + "epoch": 0.04076356530888321, + "flos": 26188958252160.0, + "grad_norm": 1.9219578950959613, + "language_loss": 0.79372543, + "learning_rate": 3.998785125078559e-06, + "loss": 0.82683086, + "num_input_tokens_seen": 14385265, + "router_z_loss_clip": 3.37109375, + "router_z_loss_mlp": 0.58837891, + "step": 678, + "time_per_iteration": 2.9264941215515137 + }, + { + "auxiliary_loss_clip": 0.01881921, + "auxiliary_loss_mlp": 0.01432368, + "balance_loss_clip": 1.54716015, + "balance_loss_mlp": 1.38184738, + "epoch": 0.04082368856155118, + "flos": 35787256617600.0, + "grad_norm": 1.6239066200833818, + "language_loss": 0.84643549, + "learning_rate": 3.998771514534505e-06, + "loss": 0.87957841, + "num_input_tokens_seen": 14406090, + "router_z_loss_clip": 3.34765625, + "router_z_loss_mlp": 0.50439453, + "step": 679, + "time_per_iteration": 3.057399034500122 + }, + { + "auxiliary_loss_clip": 0.01881568, + "auxiliary_loss_mlp": 0.01423099, + "balance_loss_clip": 1.55263519, + "balance_loss_mlp": 1.36800086, + "epoch": 0.04088381181421915, + "flos": 28158126756480.0, + "grad_norm": 1.8513016466523973, + "language_loss": 0.79441047, + "learning_rate": 3.998757828196835e-06, + "loss": 0.82745719, + "num_input_tokens_seen": 14425130, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.55126953, + "step": 680, + "time_per_iteration": 5.890503883361816 + }, + { + "auxiliary_loss_clip": 0.01901783, + "auxiliary_loss_mlp": 0.01433088, + "balance_loss_clip": 1.55541158, + "balance_loss_mlp": 1.37403142, + "epoch": 0.04094393506688712, + "flos": 27608581687680.0, + "grad_norm": 1.6542279445311474, + "language_loss": 0.8598094, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.89315808, + "num_input_tokens_seen": 14447355, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.59057617, + "step": 681, + "time_per_iteration": 3.004376173019409 + }, + { + "auxiliary_loss_clip": 0.01911731, + "auxiliary_loss_mlp": 0.0141565, + "balance_loss_clip": 1.56634855, + "balance_loss_mlp": 1.35943127, + "epoch": 0.04100405831955509, + "flos": 23122328423040.0, + "grad_norm": 1.6818273753128772, + "language_loss": 0.75456005, + "learning_rate": 3.998730228142726e-06, + "loss": 0.78783393, + "num_input_tokens_seen": 14466790, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 0.56225586, + "step": 682, + "time_per_iteration": 2.9098241329193115 + }, + { + "auxiliary_loss_clip": 0.01882517, + "auxiliary_loss_mlp": 0.01412526, + "balance_loss_clip": 1.5453794, + "balance_loss_mlp": 1.35377991, + "epoch": 0.04106418157222306, + "flos": 20166268181760.0, + "grad_norm": 1.672625039716468, + "language_loss": 0.75341654, + "learning_rate": 3.998716314427333e-06, + "loss": 0.78636694, + "num_input_tokens_seen": 14485195, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.58764648, + "step": 683, + "time_per_iteration": 2.9536118507385254 + }, + { + "auxiliary_loss_clip": 0.01877363, + "auxiliary_loss_mlp": 0.01412375, + "balance_loss_clip": 1.54620409, + "balance_loss_mlp": 1.36068606, + "epoch": 0.041124304824891024, + "flos": 17429627813760.0, + "grad_norm": 2.44255990949796, + "language_loss": 0.84105563, + "learning_rate": 3.998702324920417e-06, + "loss": 0.87395304, + "num_input_tokens_seen": 14503370, + "router_z_loss_clip": 3.3125, + "router_z_loss_mlp": 0.51733398, + "step": 684, + "time_per_iteration": 2.8678042888641357 + }, + { + "auxiliary_loss_clip": 0.0186854, + "auxiliary_loss_mlp": 0.01403294, + "balance_loss_clip": 1.53617358, + "balance_loss_mlp": 1.34902954, + "epoch": 0.041184428077558996, + "flos": 25792359166080.0, + "grad_norm": 1.7904478562112298, + "language_loss": 0.92644894, + "learning_rate": 3.9986882596225085e-06, + "loss": 0.9591673, + "num_input_tokens_seen": 14526415, + "router_z_loss_clip": 3.32226562, + "router_z_loss_mlp": 0.54223633, + "step": 685, + "time_per_iteration": 2.9945783615112305 + }, + { + "auxiliary_loss_clip": 0.01888534, + "auxiliary_loss_mlp": 0.01408772, + "balance_loss_clip": 1.54666662, + "balance_loss_mlp": 1.35319638, + "epoch": 0.04124455133022697, + "flos": 22974676081920.0, + "grad_norm": 2.2106828472277673, + "language_loss": 0.91037917, + "learning_rate": 3.998674118534141e-06, + "loss": 0.94335222, + "num_input_tokens_seen": 14546595, + "router_z_loss_clip": 3.421875, + "router_z_loss_mlp": 0.5559082, + "step": 686, + "time_per_iteration": 2.9504990577697754 + }, + { + "auxiliary_loss_clip": 0.01885941, + "auxiliary_loss_mlp": 0.01419542, + "balance_loss_clip": 1.54158914, + "balance_loss_mlp": 1.36232173, + "epoch": 0.04130467458289493, + "flos": 21299228691840.0, + "grad_norm": 2.016049254943442, + "language_loss": 0.73946941, + "learning_rate": 3.998659901655851e-06, + "loss": 0.77252424, + "num_input_tokens_seen": 14566590, + "router_z_loss_clip": 3.43945312, + "router_z_loss_mlp": 0.5715332, + "step": 687, + "time_per_iteration": 2.9170024394989014 + }, + { + "auxiliary_loss_clip": 0.0183868, + "auxiliary_loss_mlp": 0.01408747, + "balance_loss_clip": 1.52008343, + "balance_loss_mlp": 1.35875034, + "epoch": 0.041364797835562905, + "flos": 19983704837760.0, + "grad_norm": 1.420124221429297, + "language_loss": 0.88659251, + "learning_rate": 3.998645608988177e-06, + "loss": 0.91906679, + "num_input_tokens_seen": 14585965, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.50024414, + "step": 688, + "time_per_iteration": 2.93145751953125 + }, + { + "auxiliary_loss_clip": 0.01838327, + "auxiliary_loss_mlp": 0.01403956, + "balance_loss_clip": 1.52289879, + "balance_loss_mlp": 1.35341144, + "epoch": 0.04142492108823087, + "flos": 21915745344000.0, + "grad_norm": 1.9017765394804462, + "language_loss": 0.8579247, + "learning_rate": 3.998631240531661e-06, + "loss": 0.89034754, + "num_input_tokens_seen": 14606015, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.50537109, + "step": 689, + "time_per_iteration": 2.898087739944458 + }, + { + "auxiliary_loss_clip": 0.01842574, + "auxiliary_loss_mlp": 0.01424882, + "balance_loss_clip": 1.51690888, + "balance_loss_mlp": 1.36561096, + "epoch": 0.04148504434089884, + "flos": 27651953468160.0, + "grad_norm": 1.956580045528937, + "language_loss": 0.71512437, + "learning_rate": 3.998616796286848e-06, + "loss": 0.74779892, + "num_input_tokens_seen": 14629955, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.59326172, + "step": 690, + "time_per_iteration": 3.041569471359253 + }, + { + "auxiliary_loss_clip": 0.01846391, + "auxiliary_loss_mlp": 0.01408434, + "balance_loss_clip": 1.52527702, + "balance_loss_mlp": 1.35917687, + "epoch": 0.041545167593566815, + "flos": 20527187103360.0, + "grad_norm": 1.6961855779836625, + "language_loss": 0.77274734, + "learning_rate": 3.998602276254286e-06, + "loss": 0.80529559, + "num_input_tokens_seen": 14648000, + "router_z_loss_clip": 3.20898438, + "router_z_loss_mlp": 0.49194336, + "step": 691, + "time_per_iteration": 2.936180830001831 + }, + { + "auxiliary_loss_clip": 0.01835314, + "auxiliary_loss_mlp": 0.01409073, + "balance_loss_clip": 1.51812124, + "balance_loss_mlp": 1.35523796, + "epoch": 0.04160529084623478, + "flos": 11874851913600.0, + "grad_norm": 2.1722637304999552, + "language_loss": 0.86083645, + "learning_rate": 3.998587680434526e-06, + "loss": 0.89328033, + "num_input_tokens_seen": 14662235, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.53857422, + "step": 692, + "time_per_iteration": 2.8621623516082764 + }, + { + "auxiliary_loss_clip": 0.0185828, + "auxiliary_loss_mlp": 0.01409178, + "balance_loss_clip": 1.5180285, + "balance_loss_mlp": 1.35050273, + "epoch": 0.04166541409890275, + "flos": 14836070062080.0, + "grad_norm": 2.266761029064862, + "language_loss": 0.93417192, + "learning_rate": 3.99857300882812e-06, + "loss": 0.96684647, + "num_input_tokens_seen": 14676065, + "router_z_loss_clip": 3.40234375, + "router_z_loss_mlp": 0.58691406, + "step": 693, + "time_per_iteration": 2.884765863418579 + }, + { + "auxiliary_loss_clip": 0.01848824, + "auxiliary_loss_mlp": 0.01401082, + "balance_loss_clip": 1.51981378, + "balance_loss_mlp": 1.34488702, + "epoch": 0.04172553735157072, + "flos": 25818130719360.0, + "grad_norm": 1.9467063765605717, + "language_loss": 0.85516679, + "learning_rate": 3.998558261435626e-06, + "loss": 0.88766587, + "num_input_tokens_seen": 14694955, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.56176758, + "step": 694, + "time_per_iteration": 2.947791337966919 + }, + { + "auxiliary_loss_clip": 0.01872993, + "auxiliary_loss_mlp": 0.01411732, + "balance_loss_clip": 1.5277288, + "balance_loss_mlp": 1.3636663, + "epoch": 0.04178566060423869, + "flos": 24290471404800.0, + "grad_norm": 2.6033693232401256, + "language_loss": 0.8734439, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.90629113, + "num_input_tokens_seen": 14715510, + "router_z_loss_clip": 3.45507812, + "router_z_loss_mlp": 0.48046875, + "step": 695, + "time_per_iteration": 2.991652250289917 + }, + { + "auxiliary_loss_clip": 0.01834696, + "auxiliary_loss_mlp": 0.01412095, + "balance_loss_clip": 1.51167929, + "balance_loss_mlp": 1.35902286, + "epoch": 0.04184578385690666, + "flos": 18230789070720.0, + "grad_norm": 2.246575441005443, + "language_loss": 0.8801986, + "learning_rate": 3.99852853929461e-06, + "loss": 0.91266656, + "num_input_tokens_seen": 14731755, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.53125, + "step": 696, + "time_per_iteration": 2.9283676147460938 + }, + { + "auxiliary_loss_clip": 0.01840252, + "auxiliary_loss_mlp": 0.01404549, + "balance_loss_clip": 1.51386762, + "balance_loss_mlp": 1.35007024, + "epoch": 0.041905907109574626, + "flos": 22784647345920.0, + "grad_norm": 1.9688901479121834, + "language_loss": 0.95167482, + "learning_rate": 3.998513564547216e-06, + "loss": 0.98412281, + "num_input_tokens_seen": 14750810, + "router_z_loss_clip": 3.26367188, + "router_z_loss_mlp": 0.54418945, + "step": 697, + "time_per_iteration": 2.942450523376465 + }, + { + "auxiliary_loss_clip": 0.018095, + "auxiliary_loss_mlp": 0.01403152, + "balance_loss_clip": 1.49613905, + "balance_loss_mlp": 1.35050964, + "epoch": 0.0419660303622426, + "flos": 20166539650560.0, + "grad_norm": 2.6527356008532714, + "language_loss": 0.87111253, + "learning_rate": 3.998498514015987e-06, + "loss": 0.90323907, + "num_input_tokens_seen": 14768435, + "router_z_loss_clip": 3.13476562, + "router_z_loss_mlp": 0.52685547, + "step": 698, + "time_per_iteration": 2.943557024002075 + }, + { + "auxiliary_loss_clip": 0.01834468, + "auxiliary_loss_mlp": 0.01415486, + "balance_loss_clip": 1.50868201, + "balance_loss_mlp": 1.36525154, + "epoch": 0.042026153614910564, + "flos": 23086874482560.0, + "grad_norm": 2.235809062833807, + "language_loss": 0.9316889, + "learning_rate": 3.998483387701495e-06, + "loss": 0.96418834, + "num_input_tokens_seen": 14786690, + "router_z_loss_clip": 3.2578125, + "router_z_loss_mlp": 0.50268555, + "step": 699, + "time_per_iteration": 2.893601894378662 + }, + { + "auxiliary_loss_clip": 0.01656688, + "auxiliary_loss_mlp": 0.01428847, + "balance_loss_clip": 1.43760383, + "balance_loss_mlp": 1.40347934, + "epoch": 0.042086276867578536, + "flos": 64527911902080.0, + "grad_norm": 0.9084685749167275, + "language_loss": 0.67912495, + "learning_rate": 3.998468185604312e-06, + "loss": 0.70998031, + "num_input_tokens_seen": 14853840, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.25390625, + "step": 700, + "time_per_iteration": 3.380621910095215 + }, + { + "auxiliary_loss_clip": 0.01835094, + "auxiliary_loss_mlp": 0.01423899, + "balance_loss_clip": 1.5081811, + "balance_loss_mlp": 1.36326933, + "epoch": 0.04214640012024651, + "flos": 15495460801920.0, + "grad_norm": 2.200241090378552, + "language_loss": 0.91733503, + "learning_rate": 3.998452907725016e-06, + "loss": 0.94992495, + "num_input_tokens_seen": 14869580, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.60644531, + "step": 701, + "time_per_iteration": 2.894984006881714 + }, + { + "auxiliary_loss_clip": 0.01838811, + "auxiliary_loss_mlp": 0.01414127, + "balance_loss_clip": 1.50803781, + "balance_loss_mlp": 1.35859895, + "epoch": 0.04220652337291447, + "flos": 23887221333120.0, + "grad_norm": 1.7841697170120672, + "language_loss": 0.70937371, + "learning_rate": 3.998437554064184e-06, + "loss": 0.74190307, + "num_input_tokens_seen": 14891065, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.5546875, + "step": 702, + "time_per_iteration": 2.91745662689209 + }, + { + "auxiliary_loss_clip": 0.0160452, + "auxiliary_loss_mlp": 0.0137902, + "balance_loss_clip": 1.38627684, + "balance_loss_mlp": 1.35517788, + "epoch": 0.042266646625582445, + "flos": 63826235256960.0, + "grad_norm": 0.8520686425842733, + "language_loss": 0.6106168, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.64045215, + "num_input_tokens_seen": 14954815, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.23828125, + "step": 703, + "time_per_iteration": 3.423539638519287 + }, + { + "auxiliary_loss_clip": 0.0159064, + "auxiliary_loss_mlp": 0.01371895, + "balance_loss_clip": 1.37667143, + "balance_loss_mlp": 1.35177267, + "epoch": 0.04232676987825041, + "flos": 50049973566720.0, + "grad_norm": 1.0395438235276593, + "language_loss": 0.57906008, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.60868543, + "num_input_tokens_seen": 15003050, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.20117188, + "step": 704, + "time_per_iteration": 3.164651870727539 + }, + { + "auxiliary_loss_clip": 0.01844929, + "auxiliary_loss_mlp": 0.01418002, + "balance_loss_clip": 1.51236641, + "balance_loss_mlp": 1.36020899, + "epoch": 0.04238689313091838, + "flos": 21626005772160.0, + "grad_norm": 2.7090755696354125, + "language_loss": 0.91177702, + "learning_rate": 3.998391038398319e-06, + "loss": 0.94440639, + "num_input_tokens_seen": 15021990, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.57788086, + "step": 705, + "time_per_iteration": 2.936997890472412 + }, + { + "auxiliary_loss_clip": 0.01815578, + "auxiliary_loss_mlp": 0.01435747, + "balance_loss_clip": 1.50898385, + "balance_loss_mlp": 1.38312769, + "epoch": 0.042447016383586354, + "flos": 19144374952320.0, + "grad_norm": 1.8025493171591411, + "language_loss": 0.74006224, + "learning_rate": 3.998375381617201e-06, + "loss": 0.7725755, + "num_input_tokens_seen": 15040700, + "router_z_loss_clip": 3.06640625, + "router_z_loss_mlp": 0.52612305, + "step": 706, + "time_per_iteration": 2.9508841037750244 + }, + { + "auxiliary_loss_clip": 0.01840996, + "auxiliary_loss_mlp": 0.01432665, + "balance_loss_clip": 1.51314437, + "balance_loss_mlp": 1.37391901, + "epoch": 0.04250713963625432, + "flos": 24436766401920.0, + "grad_norm": 2.514182690714767, + "language_loss": 0.95214689, + "learning_rate": 3.9983596490574875e-06, + "loss": 0.98488343, + "num_input_tokens_seen": 15056725, + "router_z_loss_clip": 3.27734375, + "router_z_loss_mlp": 0.58789062, + "step": 707, + "time_per_iteration": 2.974381446838379 + }, + { + "auxiliary_loss_clip": 0.01873567, + "auxiliary_loss_mlp": 0.01428884, + "balance_loss_clip": 1.53425312, + "balance_loss_mlp": 1.36417675, + "epoch": 0.04256726288892229, + "flos": 30378187532160.0, + "grad_norm": 1.7254142115679436, + "language_loss": 0.83380532, + "learning_rate": 3.998343840719776e-06, + "loss": 0.86682975, + "num_input_tokens_seen": 15077550, + "router_z_loss_clip": 3.38867188, + "router_z_loss_mlp": 0.64697266, + "step": 708, + "time_per_iteration": 3.045398473739624 + }, + { + "auxiliary_loss_clip": 0.0187123, + "auxiliary_loss_mlp": 0.01414633, + "balance_loss_clip": 1.52891517, + "balance_loss_mlp": 1.35359836, + "epoch": 0.04262738614159026, + "flos": 16371330503040.0, + "grad_norm": 2.165310153970776, + "language_loss": 0.85613942, + "learning_rate": 3.998327956604666e-06, + "loss": 0.88899803, + "num_input_tokens_seen": 15094955, + "router_z_loss_clip": 3.42382812, + "router_z_loss_mlp": 0.61083984, + "step": 709, + "time_per_iteration": 2.9153432846069336 + }, + { + "auxiliary_loss_clip": 0.01899001, + "auxiliary_loss_mlp": 0.01412544, + "balance_loss_clip": 1.54926682, + "balance_loss_mlp": 1.34798002, + "epoch": 0.04268750939425823, + "flos": 20422046891520.0, + "grad_norm": 2.6497421113705313, + "language_loss": 0.89039063, + "learning_rate": 3.99831199671276e-06, + "loss": 0.92350614, + "num_input_tokens_seen": 15113395, + "router_z_loss_clip": 3.49804688, + "router_z_loss_mlp": 0.64501953, + "step": 710, + "time_per_iteration": 2.986358404159546 + }, + { + "auxiliary_loss_clip": 0.01890333, + "auxiliary_loss_mlp": 0.0141115, + "balance_loss_clip": 1.55238152, + "balance_loss_mlp": 1.35431099, + "epoch": 0.0427476326469262, + "flos": 20312608423680.0, + "grad_norm": 2.0018863062527354, + "language_loss": 0.86911535, + "learning_rate": 3.998295961044662e-06, + "loss": 0.90213013, + "num_input_tokens_seen": 15132920, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.56762695, + "step": 711, + "time_per_iteration": 2.94836163520813 + }, + { + "auxiliary_loss_clip": 0.0187944, + "auxiliary_loss_mlp": 0.01406386, + "balance_loss_clip": 1.5419724, + "balance_loss_mlp": 1.34973764, + "epoch": 0.042807755899594166, + "flos": 21660238103040.0, + "grad_norm": 1.686725212099122, + "language_loss": 0.87494421, + "learning_rate": 3.9982798496009804e-06, + "loss": 0.90780246, + "num_input_tokens_seen": 15153115, + "router_z_loss_clip": 3.375, + "router_z_loss_mlp": 0.56640625, + "step": 712, + "time_per_iteration": 4.401049375534058 + }, + { + "auxiliary_loss_clip": 0.01913099, + "auxiliary_loss_mlp": 0.01429716, + "balance_loss_clip": 1.56280684, + "balance_loss_mlp": 1.36598706, + "epoch": 0.04286787915226214, + "flos": 21445206975360.0, + "grad_norm": 2.2050732453180215, + "language_loss": 0.94239867, + "learning_rate": 3.998263662382328e-06, + "loss": 0.9758268, + "num_input_tokens_seen": 15172770, + "router_z_loss_clip": 3.5, + "router_z_loss_mlp": 0.63745117, + "step": 713, + "time_per_iteration": 2.935232400894165 + }, + { + "auxiliary_loss_clip": 0.0161998, + "auxiliary_loss_mlp": 0.0138637, + "balance_loss_clip": 1.40543556, + "balance_loss_mlp": 1.35890424, + "epoch": 0.04292800240493011, + "flos": 66432552042240.0, + "grad_norm": 0.8892425080044829, + "language_loss": 0.63811636, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.66817987, + "num_input_tokens_seen": 15240055, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.27539062, + "step": 714, + "time_per_iteration": 3.4342894554138184 + }, + { + "auxiliary_loss_clip": 0.018662, + "auxiliary_loss_mlp": 0.01416303, + "balance_loss_clip": 1.53611696, + "balance_loss_mlp": 1.35793781, + "epoch": 0.042988125657598075, + "flos": 31662917660160.0, + "grad_norm": 1.7261391371845218, + "language_loss": 0.769256, + "learning_rate": 3.998231060622563e-06, + "loss": 0.80208111, + "num_input_tokens_seen": 15261585, + "router_z_loss_clip": 3.30078125, + "router_z_loss_mlp": 0.58447266, + "step": 715, + "time_per_iteration": 5.839420795440674 + }, + { + "auxiliary_loss_clip": 0.01892364, + "auxiliary_loss_mlp": 0.01423129, + "balance_loss_clip": 1.54726517, + "balance_loss_mlp": 1.36953259, + "epoch": 0.04304824891026605, + "flos": 33259675063680.0, + "grad_norm": 2.0276379052797586, + "language_loss": 0.75453043, + "learning_rate": 3.998214646082688e-06, + "loss": 0.78768539, + "num_input_tokens_seen": 15281160, + "router_z_loss_clip": 3.44726562, + "router_z_loss_mlp": 0.53613281, + "step": 716, + "time_per_iteration": 3.0408191680908203 + }, + { + "auxiliary_loss_clip": 0.01597324, + "auxiliary_loss_mlp": 0.01351057, + "balance_loss_clip": 1.38407516, + "balance_loss_mlp": 1.33131552, + "epoch": 0.04310837216293401, + "flos": 64099252235520.0, + "grad_norm": 0.9088058074208603, + "language_loss": 0.65822923, + "learning_rate": 3.998198155770314e-06, + "loss": 0.68771303, + "num_input_tokens_seen": 15344505, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.19726562, + "step": 717, + "time_per_iteration": 3.354990243911743 + }, + { + "auxiliary_loss_clip": 0.01578036, + "auxiliary_loss_mlp": 0.01349515, + "balance_loss_clip": 1.36827695, + "balance_loss_mlp": 1.32996488, + "epoch": 0.043168495415601985, + "flos": 61372050048000.0, + "grad_norm": 0.9809814160912551, + "language_loss": 0.58976793, + "learning_rate": 3.998181589686065e-06, + "loss": 0.61904347, + "num_input_tokens_seen": 15404050, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.1953125, + "step": 718, + "time_per_iteration": 3.2009143829345703 + }, + { + "auxiliary_loss_clip": 0.01867079, + "auxiliary_loss_mlp": 0.01419621, + "balance_loss_clip": 1.53106618, + "balance_loss_mlp": 1.36421227, + "epoch": 0.04322861866826996, + "flos": 20714003458560.0, + "grad_norm": 2.076529409708467, + "language_loss": 0.94171143, + "learning_rate": 3.99816494783057e-06, + "loss": 0.97457844, + "num_input_tokens_seen": 15424190, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.55395508, + "step": 719, + "time_per_iteration": 2.9428839683532715 + }, + { + "auxiliary_loss_clip": 0.01882313, + "auxiliary_loss_mlp": 0.01427644, + "balance_loss_clip": 1.53601754, + "balance_loss_mlp": 1.3676343, + "epoch": 0.04328874192093792, + "flos": 30385698168960.0, + "grad_norm": 1.6273678316622155, + "language_loss": 0.67888725, + "learning_rate": 3.99814823020446e-06, + "loss": 0.7119869, + "num_input_tokens_seen": 15446500, + "router_z_loss_clip": 3.46289062, + "router_z_loss_mlp": 0.59985352, + "step": 720, + "time_per_iteration": 3.049025058746338 + }, + { + "auxiliary_loss_clip": 0.0185489, + "auxiliary_loss_mlp": 0.01424686, + "balance_loss_clip": 1.5253253, + "balance_loss_mlp": 1.3678236, + "epoch": 0.043348865173605894, + "flos": 21954864113280.0, + "grad_norm": 1.7857669997040797, + "language_loss": 0.799528, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.83232379, + "num_input_tokens_seen": 15465830, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.56860352, + "step": 721, + "time_per_iteration": 2.9399807453155518 + }, + { + "auxiliary_loss_clip": 0.01885821, + "auxiliary_loss_mlp": 0.01438031, + "balance_loss_clip": 1.54395938, + "balance_loss_mlp": 1.38162088, + "epoch": 0.04340898842627386, + "flos": 15271606938240.0, + "grad_norm": 2.4318385868713466, + "language_loss": 0.91326874, + "learning_rate": 3.998114567642933e-06, + "loss": 0.94650722, + "num_input_tokens_seen": 15479985, + "router_z_loss_clip": 3.41796875, + "router_z_loss_mlp": 0.56420898, + "step": 722, + "time_per_iteration": 3.0237135887145996 + }, + { + "auxiliary_loss_clip": 0.0188476, + "auxiliary_loss_mlp": 0.01418902, + "balance_loss_clip": 1.54072654, + "balance_loss_mlp": 1.36160994, + "epoch": 0.04346911167894183, + "flos": 27977192225280.0, + "grad_norm": 1.6611790582638595, + "language_loss": 0.87100774, + "learning_rate": 3.998097622708792e-06, + "loss": 0.90404439, + "num_input_tokens_seen": 15501545, + "router_z_loss_clip": 3.43945312, + "router_z_loss_mlp": 0.57299805, + "step": 723, + "time_per_iteration": 2.9661364555358887 + }, + { + "auxiliary_loss_clip": 0.01876483, + "auxiliary_loss_mlp": 0.01421548, + "balance_loss_clip": 1.54038143, + "balance_loss_mlp": 1.36175275, + "epoch": 0.0435292349316098, + "flos": 29254230737280.0, + "grad_norm": 1.9287581479853886, + "language_loss": 0.84676349, + "learning_rate": 3.99808060200659e-06, + "loss": 0.87974381, + "num_input_tokens_seen": 15521725, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.59814453, + "step": 724, + "time_per_iteration": 2.921574831008911 + }, + { + "auxiliary_loss_clip": 0.01857113, + "auxiliary_loss_mlp": 0.01426174, + "balance_loss_clip": 1.52804804, + "balance_loss_mlp": 1.36487687, + "epoch": 0.04358935818427777, + "flos": 20568070419840.0, + "grad_norm": 2.1401418010799445, + "language_loss": 0.82473868, + "learning_rate": 3.998063505536971e-06, + "loss": 0.8575716, + "num_input_tokens_seen": 15540910, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.61328125, + "step": 725, + "time_per_iteration": 2.8997013568878174 + }, + { + "auxiliary_loss_clip": 0.0188996, + "auxiliary_loss_mlp": 0.01409792, + "balance_loss_clip": 1.5404017, + "balance_loss_mlp": 1.35414529, + "epoch": 0.04364948143694574, + "flos": 14472979390080.0, + "grad_norm": 1.9426785383783498, + "language_loss": 0.89713764, + "learning_rate": 3.998046333300584e-06, + "loss": 0.93013513, + "num_input_tokens_seen": 15558640, + "router_z_loss_clip": 3.49414062, + "router_z_loss_mlp": 0.55615234, + "step": 726, + "time_per_iteration": 2.8861801624298096 + }, + { + "auxiliary_loss_clip": 0.01545121, + "auxiliary_loss_mlp": 0.01399475, + "balance_loss_clip": 1.3435781, + "balance_loss_mlp": 1.38040137, + "epoch": 0.043709604689613706, + "flos": 50092015265280.0, + "grad_norm": 0.9066111944712127, + "language_loss": 0.5606811, + "learning_rate": 3.998029085298079e-06, + "loss": 0.59012711, + "num_input_tokens_seen": 15612975, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.19042969, + "step": 727, + "time_per_iteration": 3.4231810569763184 + }, + { + "auxiliary_loss_clip": 0.01878452, + "auxiliary_loss_mlp": 0.01415019, + "balance_loss_clip": 1.54053032, + "balance_loss_mlp": 1.36266208, + "epoch": 0.04376972794228168, + "flos": 13999771578240.0, + "grad_norm": 2.0011747958995967, + "language_loss": 0.85094321, + "learning_rate": 3.998011761530112e-06, + "loss": 0.88387787, + "num_input_tokens_seen": 15631070, + "router_z_loss_clip": 3.37695312, + "router_z_loss_mlp": 0.52368164, + "step": 728, + "time_per_iteration": 2.8805689811706543 + }, + { + "auxiliary_loss_clip": 0.01881251, + "auxiliary_loss_mlp": 0.01409838, + "balance_loss_clip": 1.5514549, + "balance_loss_mlp": 1.35950804, + "epoch": 0.04382985119494965, + "flos": 22017944643840.0, + "grad_norm": 1.9612928068288507, + "language_loss": 0.77830446, + "learning_rate": 3.997994361997338e-06, + "loss": 0.81121528, + "num_input_tokens_seen": 15647825, + "router_z_loss_clip": 3.29882812, + "router_z_loss_mlp": 0.50268555, + "step": 729, + "time_per_iteration": 2.9253108501434326 + }, + { + "auxiliary_loss_clip": 0.0188017, + "auxiliary_loss_mlp": 0.01437363, + "balance_loss_clip": 1.54280567, + "balance_loss_mlp": 1.37465882, + "epoch": 0.043889974447617615, + "flos": 24217075059840.0, + "grad_norm": 4.75827974862781, + "language_loss": 0.97816634, + "learning_rate": 3.997976886700417e-06, + "loss": 1.01134169, + "num_input_tokens_seen": 15668260, + "router_z_loss_clip": 3.36914062, + "router_z_loss_mlp": 0.62670898, + "step": 730, + "time_per_iteration": 2.910001754760742 + }, + { + "auxiliary_loss_clip": 0.0190936, + "auxiliary_loss_mlp": 0.0144303, + "balance_loss_clip": 1.56059241, + "balance_loss_mlp": 1.38750231, + "epoch": 0.04395009770028559, + "flos": 17283378061440.0, + "grad_norm": 1.9971074187491038, + "language_loss": 0.90748799, + "learning_rate": 3.997959335640013e-06, + "loss": 0.94101191, + "num_input_tokens_seen": 15685630, + "router_z_loss_clip": 3.484375, + "router_z_loss_mlp": 0.55493164, + "step": 731, + "time_per_iteration": 2.9167802333831787 + }, + { + "auxiliary_loss_clip": 0.01892854, + "auxiliary_loss_mlp": 0.01423794, + "balance_loss_clip": 1.55966783, + "balance_loss_mlp": 1.3691721, + "epoch": 0.04401022095295355, + "flos": 12317265999360.0, + "grad_norm": 2.686306618743098, + "language_loss": 0.93006575, + "learning_rate": 3.997941708816791e-06, + "loss": 0.96323228, + "num_input_tokens_seen": 15698645, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.54663086, + "step": 732, + "time_per_iteration": 2.8737337589263916 + }, + { + "auxiliary_loss_clip": 0.01894209, + "auxiliary_loss_mlp": 0.01442912, + "balance_loss_clip": 1.55442023, + "balance_loss_mlp": 1.38936293, + "epoch": 0.044070344205621524, + "flos": 20969465454720.0, + "grad_norm": 2.1442995468662414, + "language_loss": 0.88115162, + "learning_rate": 3.997924006231419e-06, + "loss": 0.91452283, + "num_input_tokens_seen": 15716775, + "router_z_loss_clip": 3.3984375, + "router_z_loss_mlp": 0.53588867, + "step": 733, + "time_per_iteration": 2.9321916103363037 + }, + { + "auxiliary_loss_clip": 0.01892767, + "auxiliary_loss_mlp": 0.01448693, + "balance_loss_clip": 1.56099737, + "balance_loss_mlp": 1.39774346, + "epoch": 0.044130467458289496, + "flos": 13853340846720.0, + "grad_norm": 2.8942354956444145, + "language_loss": 0.94055319, + "learning_rate": 3.9979062278845685e-06, + "loss": 0.97396773, + "num_input_tokens_seen": 15733320, + "router_z_loss_clip": 3.31054688, + "router_z_loss_mlp": 0.50976562, + "step": 734, + "time_per_iteration": 2.8936517238616943 + }, + { + "auxiliary_loss_clip": 0.01874641, + "auxiliary_loss_mlp": 0.01439175, + "balance_loss_clip": 1.55348217, + "balance_loss_mlp": 1.38870168, + "epoch": 0.04419059071095746, + "flos": 28666064592000.0, + "grad_norm": 2.150201630083173, + "language_loss": 0.81715298, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.85029113, + "num_input_tokens_seen": 15752705, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.50463867, + "step": 735, + "time_per_iteration": 2.996674060821533 + }, + { + "auxiliary_loss_clip": 0.01876597, + "auxiliary_loss_mlp": 0.01426291, + "balance_loss_clip": 1.54754567, + "balance_loss_mlp": 1.37627029, + "epoch": 0.04425071396362543, + "flos": 28194847551360.0, + "grad_norm": 2.3276130780406974, + "language_loss": 0.91140306, + "learning_rate": 3.9978704439091305e-06, + "loss": 0.9444319, + "num_input_tokens_seen": 15772800, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.50048828, + "step": 736, + "time_per_iteration": 2.995058059692383 + }, + { + "auxiliary_loss_clip": 0.01869817, + "auxiliary_loss_mlp": 0.01423546, + "balance_loss_clip": 1.55083799, + "balance_loss_mlp": 1.3730973, + "epoch": 0.0443108372162934, + "flos": 23668661111040.0, + "grad_norm": 1.7285589272321336, + "language_loss": 0.87185109, + "learning_rate": 3.997852438281901e-06, + "loss": 0.90478474, + "num_input_tokens_seen": 15793665, + "router_z_loss_clip": 3.18164062, + "router_z_loss_mlp": 0.50415039, + "step": 737, + "time_per_iteration": 2.9087588787078857 + }, + { + "auxiliary_loss_clip": 0.01872622, + "auxiliary_loss_mlp": 0.01409727, + "balance_loss_clip": 1.55355358, + "balance_loss_mlp": 1.35925376, + "epoch": 0.04437096046896137, + "flos": 33991692986880.0, + "grad_norm": 1.7775664963375757, + "language_loss": 0.87357605, + "learning_rate": 3.997834356895906e-06, + "loss": 0.90639949, + "num_input_tokens_seen": 15813175, + "router_z_loss_clip": 3.19140625, + "router_z_loss_mlp": 0.50512695, + "step": 738, + "time_per_iteration": 3.0028109550476074 + }, + { + "auxiliary_loss_clip": 0.01588643, + "auxiliary_loss_mlp": 0.01377567, + "balance_loss_clip": 1.40529346, + "balance_loss_mlp": 1.35973382, + "epoch": 0.04443108372162934, + "flos": 67426619719680.0, + "grad_norm": 0.8774947206118064, + "language_loss": 0.59251225, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.62217426, + "num_input_tokens_seen": 15872050, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.17871094, + "step": 739, + "time_per_iteration": 3.331681489944458 + }, + { + "auxiliary_loss_clip": 0.01850388, + "auxiliary_loss_mlp": 0.01406158, + "balance_loss_clip": 1.53516042, + "balance_loss_mlp": 1.35053515, + "epoch": 0.04449120697429731, + "flos": 29764747526400.0, + "grad_norm": 2.146540833101568, + "language_loss": 0.94922328, + "learning_rate": 3.997797966850369e-06, + "loss": 0.98178881, + "num_input_tokens_seen": 15891085, + "router_z_loss_clip": 3.15234375, + "router_z_loss_mlp": 0.55664062, + "step": 740, + "time_per_iteration": 2.9626245498657227 + }, + { + "auxiliary_loss_clip": 0.01834548, + "auxiliary_loss_mlp": 0.01405322, + "balance_loss_clip": 1.52528477, + "balance_loss_mlp": 1.36042786, + "epoch": 0.04455133022696528, + "flos": 36515835936000.0, + "grad_norm": 1.8109677680183802, + "language_loss": 0.74031162, + "learning_rate": 3.997779658192205e-06, + "loss": 0.77271032, + "num_input_tokens_seen": 15914225, + "router_z_loss_clip": 3.09375, + "router_z_loss_mlp": 0.44897461, + "step": 741, + "time_per_iteration": 3.0658676624298096 + }, + { + "auxiliary_loss_clip": 0.01816081, + "auxiliary_loss_mlp": 0.01412371, + "balance_loss_clip": 1.51785052, + "balance_loss_mlp": 1.36196911, + "epoch": 0.044611453479633245, + "flos": 28815390990720.0, + "grad_norm": 1.6545422574039155, + "language_loss": 0.89935893, + "learning_rate": 3.997761273778037e-06, + "loss": 0.93164349, + "num_input_tokens_seen": 15934540, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.50390625, + "step": 742, + "time_per_iteration": 2.944192886352539 + }, + { + "auxiliary_loss_clip": 0.01829265, + "auxiliary_loss_mlp": 0.01406171, + "balance_loss_clip": 1.52322972, + "balance_loss_mlp": 1.35584092, + "epoch": 0.04467157673230122, + "flos": 20020516122240.0, + "grad_norm": 1.6689098756707486, + "language_loss": 0.86307043, + "learning_rate": 3.997742813608561e-06, + "loss": 0.89542484, + "num_input_tokens_seen": 15952560, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.50341797, + "step": 743, + "time_per_iteration": 2.9224765300750732 + }, + { + "auxiliary_loss_clip": 0.01844098, + "auxiliary_loss_mlp": 0.01419169, + "balance_loss_clip": 1.5290345, + "balance_loss_mlp": 1.36676478, + "epoch": 0.04473169998496919, + "flos": 18014174375040.0, + "grad_norm": 2.1167540475477704, + "language_loss": 0.82972831, + "learning_rate": 3.997724277684479e-06, + "loss": 0.86236095, + "num_input_tokens_seen": 15970620, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.52392578, + "step": 744, + "time_per_iteration": 2.8655498027801514 + }, + { + "auxiliary_loss_clip": 0.01828467, + "auxiliary_loss_mlp": 0.01428365, + "balance_loss_clip": 1.51996446, + "balance_loss_mlp": 1.37312317, + "epoch": 0.044791823237637154, + "flos": 20641557254400.0, + "grad_norm": 1.8840597342492515, + "language_loss": 0.87216598, + "learning_rate": 3.99770566600649e-06, + "loss": 0.90473431, + "num_input_tokens_seen": 15987325, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.55297852, + "step": 745, + "time_per_iteration": 2.8988864421844482 + }, + { + "auxiliary_loss_clip": 0.01821615, + "auxiliary_loss_mlp": 0.01400654, + "balance_loss_clip": 1.51276243, + "balance_loss_mlp": 1.35223126, + "epoch": 0.04485194649030513, + "flos": 31188759707520.0, + "grad_norm": 1.5940569941983864, + "language_loss": 0.70484185, + "learning_rate": 3.997686978575302e-06, + "loss": 0.73706448, + "num_input_tokens_seen": 16008310, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.48413086, + "step": 746, + "time_per_iteration": 2.9693877696990967 + }, + { + "auxiliary_loss_clip": 0.01829242, + "auxiliary_loss_mlp": 0.01408519, + "balance_loss_clip": 1.51754999, + "balance_loss_mlp": 1.35756886, + "epoch": 0.04491206974297309, + "flos": 26155133124480.0, + "grad_norm": 4.50444213445697, + "language_loss": 0.72110617, + "learning_rate": 3.997668215391625e-06, + "loss": 0.75348377, + "num_input_tokens_seen": 16029620, + "router_z_loss_clip": 3.11523438, + "router_z_loss_mlp": 0.50952148, + "step": 747, + "time_per_iteration": 4.337121486663818 + }, + { + "auxiliary_loss_clip": 0.01831279, + "auxiliary_loss_mlp": 0.01406919, + "balance_loss_clip": 1.51720095, + "balance_loss_mlp": 1.35830593, + "epoch": 0.044972192995641064, + "flos": 20677373153280.0, + "grad_norm": 1.7543771950286817, + "language_loss": 0.69462407, + "learning_rate": 3.997649376456168e-06, + "loss": 0.72700608, + "num_input_tokens_seen": 16049065, + "router_z_loss_clip": 3.14453125, + "router_z_loss_mlp": 0.48583984, + "step": 748, + "time_per_iteration": 3.0355687141418457 + }, + { + "auxiliary_loss_clip": 0.01821218, + "auxiliary_loss_mlp": 0.01410234, + "balance_loss_clip": 1.50347352, + "balance_loss_mlp": 1.36045265, + "epoch": 0.045032316248309036, + "flos": 16115325569280.0, + "grad_norm": 2.098925474326493, + "language_loss": 0.79656464, + "learning_rate": 3.997630461769647e-06, + "loss": 0.82887918, + "num_input_tokens_seen": 16066765, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.49780273, + "step": 749, + "time_per_iteration": 4.484265565872192 + }, + { + "auxiliary_loss_clip": 0.01818191, + "auxiliary_loss_mlp": 0.01399623, + "balance_loss_clip": 1.50725865, + "balance_loss_mlp": 1.35062838, + "epoch": 0.045092439500977, + "flos": 17867969867520.0, + "grad_norm": 2.2087650294882195, + "language_loss": 0.91904932, + "learning_rate": 3.997611471332778e-06, + "loss": 0.95122743, + "num_input_tokens_seen": 16085980, + "router_z_loss_clip": 3.109375, + "router_z_loss_mlp": 0.49023438, + "step": 750, + "time_per_iteration": 4.368940114974976 + }, + { + "auxiliary_loss_clip": 0.01828831, + "auxiliary_loss_mlp": 0.014155, + "balance_loss_clip": 1.50582111, + "balance_loss_mlp": 1.36173701, + "epoch": 0.04515256275364497, + "flos": 24473668176000.0, + "grad_norm": 1.8919848037991864, + "language_loss": 0.76874387, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.80118716, + "num_input_tokens_seen": 16106260, + "router_z_loss_clip": 3.22460938, + "router_z_loss_mlp": 0.53735352, + "step": 751, + "time_per_iteration": 2.9266202449798584 + }, + { + "auxiliary_loss_clip": 0.01827861, + "auxiliary_loss_mlp": 0.01398185, + "balance_loss_clip": 1.50911379, + "balance_loss_mlp": 1.34806991, + "epoch": 0.04521268600631294, + "flos": 20924419616640.0, + "grad_norm": 2.044618179238129, + "language_loss": 0.72757006, + "learning_rate": 3.997573263210883e-06, + "loss": 0.75983059, + "num_input_tokens_seen": 16123475, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.50073242, + "step": 752, + "time_per_iteration": 2.948711633682251 + }, + { + "auxiliary_loss_clip": 0.01837854, + "auxiliary_loss_mlp": 0.01407321, + "balance_loss_clip": 1.51439106, + "balance_loss_mlp": 1.35505939, + "epoch": 0.04527280925898091, + "flos": 13379409118080.0, + "grad_norm": 2.94500083802943, + "language_loss": 0.95578623, + "learning_rate": 3.997554045527305e-06, + "loss": 0.98823798, + "num_input_tokens_seen": 16138335, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.52270508, + "step": 753, + "time_per_iteration": 2.946030378341675 + }, + { + "auxiliary_loss_clip": 0.01839672, + "auxiliary_loss_mlp": 0.01416577, + "balance_loss_clip": 1.51202047, + "balance_loss_mlp": 1.36693859, + "epoch": 0.04533293251164888, + "flos": 23264008450560.0, + "grad_norm": 1.864918050199659, + "language_loss": 0.93731219, + "learning_rate": 3.997534752096277e-06, + "loss": 0.96987468, + "num_input_tokens_seen": 16157110, + "router_z_loss_clip": 3.2734375, + "router_z_loss_mlp": 0.49658203, + "step": 754, + "time_per_iteration": 2.9738311767578125 + }, + { + "auxiliary_loss_clip": 0.01799635, + "auxiliary_loss_mlp": 0.0141314, + "balance_loss_clip": 1.48793459, + "balance_loss_mlp": 1.36476433, + "epoch": 0.04539305576431685, + "flos": 12429600134400.0, + "grad_norm": 2.1802391860680417, + "language_loss": 0.82788706, + "learning_rate": 3.997515382918531e-06, + "loss": 0.8600148, + "num_input_tokens_seen": 16174155, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.48339844, + "step": 755, + "time_per_iteration": 2.904738426208496 + }, + { + "auxiliary_loss_clip": 0.01861373, + "auxiliary_loss_mlp": 0.01415794, + "balance_loss_clip": 1.5292325, + "balance_loss_mlp": 1.36059988, + "epoch": 0.04545317901698482, + "flos": 16079645404800.0, + "grad_norm": 2.2249185344744586, + "language_loss": 0.81624317, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.84901482, + "num_input_tokens_seen": 16192240, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.55175781, + "step": 756, + "time_per_iteration": 2.874044418334961 + }, + { + "auxiliary_loss_clip": 0.01527385, + "auxiliary_loss_mlp": 0.01363339, + "balance_loss_clip": 1.34259439, + "balance_loss_mlp": 1.34331167, + "epoch": 0.045513302269652785, + "flos": 66430244557440.0, + "grad_norm": 0.8143196125274184, + "language_loss": 0.62874645, + "learning_rate": 3.997476417325827e-06, + "loss": 0.65765369, + "num_input_tokens_seen": 16255775, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.20019531, + "step": 757, + "time_per_iteration": 3.380831003189087 + }, + { + "auxiliary_loss_clip": 0.0182952, + "auxiliary_loss_mlp": 0.01402036, + "balance_loss_clip": 1.5164001, + "balance_loss_mlp": 1.35635519, + "epoch": 0.04557342552232076, + "flos": 21481294343040.0, + "grad_norm": 1.4304769051346566, + "language_loss": 0.85912132, + "learning_rate": 3.997456820912346e-06, + "loss": 0.89143693, + "num_input_tokens_seen": 16277015, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.45703125, + "step": 758, + "time_per_iteration": 2.9174766540527344 + }, + { + "auxiliary_loss_clip": 0.0182339, + "auxiliary_loss_mlp": 0.01403253, + "balance_loss_clip": 1.51454926, + "balance_loss_mlp": 1.35471058, + "epoch": 0.04563354877498873, + "flos": 23743233820800.0, + "grad_norm": 1.572253467288795, + "language_loss": 0.90430075, + "learning_rate": 3.997437148755101e-06, + "loss": 0.93656719, + "num_input_tokens_seen": 16296005, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.4855957, + "step": 759, + "time_per_iteration": 2.9826741218566895 + }, + { + "auxiliary_loss_clip": 0.01851553, + "auxiliary_loss_mlp": 0.01411191, + "balance_loss_clip": 1.52170324, + "balance_loss_mlp": 1.36283958, + "epoch": 0.045693672027656694, + "flos": 25745865494400.0, + "grad_norm": 1.9694097359266136, + "language_loss": 0.76808208, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.80070955, + "num_input_tokens_seen": 16315300, + "router_z_loss_clip": 3.30273438, + "router_z_loss_mlp": 0.48339844, + "step": 760, + "time_per_iteration": 2.9721078872680664 + }, + { + "auxiliary_loss_clip": 0.0182119, + "auxiliary_loss_mlp": 0.01414614, + "balance_loss_clip": 1.50802028, + "balance_loss_mlp": 1.36538029, + "epoch": 0.045753795280324666, + "flos": 19728469065600.0, + "grad_norm": 2.02199454136294, + "language_loss": 0.85850781, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.8908658, + "num_input_tokens_seen": 16333820, + "router_z_loss_clip": 3.13085938, + "router_z_loss_mlp": 0.49267578, + "step": 761, + "time_per_iteration": 2.88466215133667 + }, + { + "auxiliary_loss_clip": 0.01823041, + "auxiliary_loss_mlp": 0.01403745, + "balance_loss_clip": 1.51203454, + "balance_loss_mlp": 1.35312915, + "epoch": 0.04581391853299264, + "flos": 23265546773760.0, + "grad_norm": 1.8711302250722732, + "language_loss": 0.81321716, + "learning_rate": 3.997377677828266e-06, + "loss": 0.84548503, + "num_input_tokens_seen": 16355290, + "router_z_loss_clip": 3.109375, + "router_z_loss_mlp": 0.50634766, + "step": 762, + "time_per_iteration": 3.0334715843200684 + }, + { + "auxiliary_loss_clip": 0.01533627, + "auxiliary_loss_mlp": 0.01350375, + "balance_loss_clip": 1.35450578, + "balance_loss_mlp": 1.33311367, + "epoch": 0.0458740417856606, + "flos": 64261500664320.0, + "grad_norm": 1.0230698571538615, + "language_loss": 0.58958972, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.61842972, + "num_input_tokens_seen": 16415995, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.17285156, + "step": 763, + "time_per_iteration": 3.4223034381866455 + }, + { + "auxiliary_loss_clip": 0.0182625, + "auxiliary_loss_mlp": 0.01391582, + "balance_loss_clip": 1.50772262, + "balance_loss_mlp": 1.34225309, + "epoch": 0.045934165038328575, + "flos": 20778034129920.0, + "grad_norm": 2.1360309516049054, + "language_loss": 0.90989161, + "learning_rate": 3.9973376518386475e-06, + "loss": 0.94206989, + "num_input_tokens_seen": 16433120, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.49365234, + "step": 764, + "time_per_iteration": 2.913908004760742 + }, + { + "auxiliary_loss_clip": 0.01852021, + "auxiliary_loss_mlp": 0.01406893, + "balance_loss_clip": 1.52348483, + "balance_loss_mlp": 1.35525167, + "epoch": 0.04599428829099654, + "flos": 30274947601920.0, + "grad_norm": 2.4258240181928388, + "language_loss": 0.90691137, + "learning_rate": 3.997317525234592e-06, + "loss": 0.93950057, + "num_input_tokens_seen": 16453360, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.51635742, + "step": 765, + "time_per_iteration": 3.030827760696411 + }, + { + "auxiliary_loss_clip": 0.01860152, + "auxiliary_loss_mlp": 0.01406598, + "balance_loss_clip": 1.52474284, + "balance_loss_mlp": 1.35595822, + "epoch": 0.04605441154366451, + "flos": 23049158302080.0, + "grad_norm": 2.4334746267586627, + "language_loss": 0.9224034, + "learning_rate": 3.997297322892056e-06, + "loss": 0.95507097, + "num_input_tokens_seen": 16471160, + "router_z_loss_clip": 3.3515625, + "router_z_loss_mlp": 0.50634766, + "step": 766, + "time_per_iteration": 2.925028085708618 + }, + { + "auxiliary_loss_clip": 0.01845157, + "auxiliary_loss_mlp": 0.01391694, + "balance_loss_clip": 1.51981235, + "balance_loss_mlp": 1.34703779, + "epoch": 0.046114534796332485, + "flos": 22027400807040.0, + "grad_norm": 2.1255610463993126, + "language_loss": 0.86258286, + "learning_rate": 3.997277044811806e-06, + "loss": 0.89495134, + "num_input_tokens_seen": 16488940, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.44677734, + "step": 767, + "time_per_iteration": 2.928607225418091 + }, + { + "auxiliary_loss_clip": 0.01830262, + "auxiliary_loss_mlp": 0.01390609, + "balance_loss_clip": 1.51123428, + "balance_loss_mlp": 1.34316409, + "epoch": 0.04617465804900045, + "flos": 29874547952640.0, + "grad_norm": 2.6164681187059013, + "language_loss": 0.90550232, + "learning_rate": 3.99725669099461e-06, + "loss": 0.937711, + "num_input_tokens_seen": 16509505, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.47485352, + "step": 768, + "time_per_iteration": 3.0374767780303955 + }, + { + "auxiliary_loss_clip": 0.01825738, + "auxiliary_loss_mlp": 0.01393484, + "balance_loss_clip": 1.5027678, + "balance_loss_mlp": 1.34560978, + "epoch": 0.04623478130166842, + "flos": 25641132485760.0, + "grad_norm": 1.855598811037543, + "language_loss": 0.77536201, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.80755424, + "num_input_tokens_seen": 16528840, + "router_z_loss_clip": 3.23046875, + "router_z_loss_mlp": 0.47875977, + "step": 769, + "time_per_iteration": 2.930849313735962 + }, + { + "auxiliary_loss_clip": 0.01840537, + "auxiliary_loss_mlp": 0.01396492, + "balance_loss_clip": 1.52008522, + "balance_loss_mlp": 1.34954739, + "epoch": 0.04629490455433639, + "flos": 20458812931200.0, + "grad_norm": 1.7108334592982801, + "language_loss": 0.88691765, + "learning_rate": 3.997215756152471e-06, + "loss": 0.91928792, + "num_input_tokens_seen": 16548335, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.46923828, + "step": 770, + "time_per_iteration": 2.9216597080230713 + }, + { + "auxiliary_loss_clip": 0.01859199, + "auxiliary_loss_mlp": 0.01394573, + "balance_loss_clip": 1.52304351, + "balance_loss_mlp": 1.34536326, + "epoch": 0.04635502780700436, + "flos": 23159094462720.0, + "grad_norm": 1.9048594332260138, + "language_loss": 0.89956772, + "learning_rate": 3.99719517512908e-06, + "loss": 0.93210554, + "num_input_tokens_seen": 16567725, + "router_z_loss_clip": 3.36328125, + "router_z_loss_mlp": 0.49194336, + "step": 771, + "time_per_iteration": 2.9021871089935303 + }, + { + "auxiliary_loss_clip": 0.018788, + "auxiliary_loss_mlp": 0.01402846, + "balance_loss_clip": 1.53030348, + "balance_loss_mlp": 1.34755707, + "epoch": 0.04641515105967233, + "flos": 23301724631040.0, + "grad_norm": 2.458990416584798, + "language_loss": 0.86799014, + "learning_rate": 3.997174518371848e-06, + "loss": 0.90080661, + "num_input_tokens_seen": 16588175, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.5534668, + "step": 772, + "time_per_iteration": 2.9592127799987793 + }, + { + "auxiliary_loss_clip": 0.0184297, + "auxiliary_loss_mlp": 0.0140165, + "balance_loss_clip": 1.51696324, + "balance_loss_mlp": 1.35384727, + "epoch": 0.046475274312340296, + "flos": 25125095831040.0, + "grad_norm": 2.0053639612289373, + "language_loss": 0.76235378, + "learning_rate": 3.997153785881557e-06, + "loss": 0.79480004, + "num_input_tokens_seen": 16607735, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.47802734, + "step": 773, + "time_per_iteration": 2.937987804412842 + }, + { + "auxiliary_loss_clip": 0.01826529, + "auxiliary_loss_mlp": 0.01398626, + "balance_loss_clip": 1.51024282, + "balance_loss_mlp": 1.35087037, + "epoch": 0.04653539756500827, + "flos": 25275824818560.0, + "grad_norm": 1.892682913212188, + "language_loss": 0.80694079, + "learning_rate": 3.997132977658996e-06, + "loss": 0.83919227, + "num_input_tokens_seen": 16627225, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.47753906, + "step": 774, + "time_per_iteration": 2.9249420166015625 + }, + { + "auxiliary_loss_clip": 0.01847365, + "auxiliary_loss_mlp": 0.01397992, + "balance_loss_clip": 1.5220468, + "balance_loss_mlp": 1.3462069, + "epoch": 0.046595520817676234, + "flos": 35416700553600.0, + "grad_norm": 1.9830329975893493, + "language_loss": 0.75557733, + "learning_rate": 3.997112093704952e-06, + "loss": 0.78803086, + "num_input_tokens_seen": 16647785, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.51806641, + "step": 775, + "time_per_iteration": 3.0458877086639404 + }, + { + "auxiliary_loss_clip": 0.01845925, + "auxiliary_loss_mlp": 0.01396654, + "balance_loss_clip": 1.51606452, + "balance_loss_mlp": 1.34501219, + "epoch": 0.046655644070344206, + "flos": 18121260113280.0, + "grad_norm": 1.577608297709537, + "language_loss": 0.79408789, + "learning_rate": 3.997091134020217e-06, + "loss": 0.82651371, + "num_input_tokens_seen": 16667555, + "router_z_loss_clip": 3.29296875, + "router_z_loss_mlp": 0.5168457, + "step": 776, + "time_per_iteration": 2.8755991458892822 + }, + { + "auxiliary_loss_clip": 0.01814775, + "auxiliary_loss_mlp": 0.01393668, + "balance_loss_clip": 1.49961662, + "balance_loss_mlp": 1.34336162, + "epoch": 0.04671576732301218, + "flos": 29217193228800.0, + "grad_norm": 2.0017917932076426, + "language_loss": 0.74597222, + "learning_rate": 3.997070098605585e-06, + "loss": 0.77805662, + "num_input_tokens_seen": 16686875, + "router_z_loss_clip": 3.15234375, + "router_z_loss_mlp": 0.50244141, + "step": 777, + "time_per_iteration": 2.9886343479156494 + }, + { + "auxiliary_loss_clip": 0.01824814, + "auxiliary_loss_mlp": 0.01390522, + "balance_loss_clip": 1.50977933, + "balance_loss_mlp": 1.33969092, + "epoch": 0.04677589057568014, + "flos": 30489526281600.0, + "grad_norm": 1.8400168204724396, + "language_loss": 0.78576303, + "learning_rate": 3.997048987461856e-06, + "loss": 0.81791633, + "num_input_tokens_seen": 16706420, + "router_z_loss_clip": 3.1484375, + "router_z_loss_mlp": 0.50927734, + "step": 778, + "time_per_iteration": 2.9512522220611572 + }, + { + "auxiliary_loss_clip": 0.01831366, + "auxiliary_loss_mlp": 0.01393039, + "balance_loss_clip": 1.51449943, + "balance_loss_mlp": 1.34392512, + "epoch": 0.046836013828348115, + "flos": 20567210768640.0, + "grad_norm": 1.7805126195342975, + "language_loss": 0.81811976, + "learning_rate": 3.997027800589829e-06, + "loss": 0.85036385, + "num_input_tokens_seen": 16726390, + "router_z_loss_clip": 3.17382812, + "router_z_loss_mlp": 0.49169922, + "step": 779, + "time_per_iteration": 2.9345955848693848 + }, + { + "auxiliary_loss_clip": 0.01817615, + "auxiliary_loss_mlp": 0.01395925, + "balance_loss_clip": 1.50507855, + "balance_loss_mlp": 1.34731126, + "epoch": 0.04689613708101608, + "flos": 25458342917760.0, + "grad_norm": 1.6001421332730201, + "language_loss": 0.78782463, + "learning_rate": 3.997006537990308e-06, + "loss": 0.81996012, + "num_input_tokens_seen": 16748965, + "router_z_loss_clip": 3.12304688, + "router_z_loss_mlp": 0.4855957, + "step": 780, + "time_per_iteration": 2.9785804748535156 + }, + { + "auxiliary_loss_clip": 0.01809633, + "auxiliary_loss_mlp": 0.01391016, + "balance_loss_clip": 1.50308776, + "balance_loss_mlp": 1.34574008, + "epoch": 0.04695626033368405, + "flos": 23011351632000.0, + "grad_norm": 1.8454841549771226, + "language_loss": 0.78052449, + "learning_rate": 3.996985199664099e-06, + "loss": 0.81253099, + "num_input_tokens_seen": 16768620, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.45288086, + "step": 781, + "time_per_iteration": 4.298590898513794 + }, + { + "auxiliary_loss_clip": 0.01864387, + "auxiliary_loss_mlp": 0.01403717, + "balance_loss_clip": 1.52601779, + "balance_loss_mlp": 1.35219526, + "epoch": 0.047016383586352024, + "flos": 29145516186240.0, + "grad_norm": 3.0953733130967858, + "language_loss": 0.78691792, + "learning_rate": 3.99696378561201e-06, + "loss": 0.81959897, + "num_input_tokens_seen": 16789755, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.515625, + "step": 782, + "time_per_iteration": 2.9480690956115723 + }, + { + "auxiliary_loss_clip": 0.0184835, + "auxiliary_loss_mlp": 0.0140193, + "balance_loss_clip": 1.5195384, + "balance_loss_mlp": 1.35467517, + "epoch": 0.04707650683901999, + "flos": 14984672544000.0, + "grad_norm": 2.630421796350862, + "language_loss": 0.83736241, + "learning_rate": 3.996942295834855e-06, + "loss": 0.86986518, + "num_input_tokens_seen": 16807585, + "router_z_loss_clip": 3.28710938, + "router_z_loss_mlp": 0.47216797, + "step": 783, + "time_per_iteration": 2.8645083904266357 + }, + { + "auxiliary_loss_clip": 0.01821279, + "auxiliary_loss_mlp": 0.01397082, + "balance_loss_clip": 1.51320028, + "balance_loss_mlp": 1.35278392, + "epoch": 0.04713663009168796, + "flos": 21660645306240.0, + "grad_norm": 1.8812322655608364, + "language_loss": 0.83666897, + "learning_rate": 3.996920730333448e-06, + "loss": 0.86885256, + "num_input_tokens_seen": 16827220, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.44311523, + "step": 784, + "time_per_iteration": 4.293826580047607 + }, + { + "auxiliary_loss_clip": 0.01840727, + "auxiliary_loss_mlp": 0.01407437, + "balance_loss_clip": 1.51957297, + "balance_loss_mlp": 1.3555814, + "epoch": 0.04719675334435593, + "flos": 21335542283520.0, + "grad_norm": 2.038855176247955, + "language_loss": 0.82196027, + "learning_rate": 3.996899089108607e-06, + "loss": 0.85444188, + "num_input_tokens_seen": 16846230, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.51855469, + "step": 785, + "time_per_iteration": 5.750572919845581 + }, + { + "auxiliary_loss_clip": 0.01862031, + "auxiliary_loss_mlp": 0.0140669, + "balance_loss_clip": 1.53198457, + "balance_loss_mlp": 1.3582201, + "epoch": 0.0472568765970239, + "flos": 17940416071680.0, + "grad_norm": 2.371061068008672, + "language_loss": 0.92075324, + "learning_rate": 3.996877372161152e-06, + "loss": 0.95344049, + "num_input_tokens_seen": 16865325, + "router_z_loss_clip": 3.29882812, + "router_z_loss_mlp": 0.48461914, + "step": 786, + "time_per_iteration": 2.8435044288635254 + }, + { + "auxiliary_loss_clip": 0.01864519, + "auxiliary_loss_mlp": 0.0140156, + "balance_loss_clip": 1.52150631, + "balance_loss_mlp": 1.35070515, + "epoch": 0.04731699984969187, + "flos": 18086394355200.0, + "grad_norm": 2.3130492437649974, + "language_loss": 0.8175329, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.85019368, + "num_input_tokens_seen": 16882930, + "router_z_loss_clip": 3.4296875, + "router_z_loss_mlp": 0.50854492, + "step": 787, + "time_per_iteration": 2.906297445297241 + }, + { + "auxiliary_loss_clip": 0.01858608, + "auxiliary_loss_mlp": 0.01400122, + "balance_loss_clip": 1.53087068, + "balance_loss_mlp": 1.34805119, + "epoch": 0.047377123102359836, + "flos": 23195408054400.0, + "grad_norm": 2.1134192986518854, + "language_loss": 0.83571285, + "learning_rate": 3.996833711101698e-06, + "loss": 0.8683002, + "num_input_tokens_seen": 16900710, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.52099609, + "step": 788, + "time_per_iteration": 2.8946852684020996 + }, + { + "auxiliary_loss_clip": 0.01841545, + "auxiliary_loss_mlp": 0.01396683, + "balance_loss_clip": 1.52317727, + "balance_loss_mlp": 1.34568512, + "epoch": 0.04743724635502781, + "flos": 22758151875840.0, + "grad_norm": 1.7962856076281197, + "language_loss": 0.86332119, + "learning_rate": 3.996811766991355e-06, + "loss": 0.89570343, + "num_input_tokens_seen": 16919210, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.51025391, + "step": 789, + "time_per_iteration": 2.9510281085968018 + }, + { + "auxiliary_loss_clip": 0.01851716, + "auxiliary_loss_mlp": 0.01405319, + "balance_loss_clip": 1.526088, + "balance_loss_mlp": 1.35043561, + "epoch": 0.04749736960769577, + "flos": 17247697896960.0, + "grad_norm": 2.234319510346056, + "language_loss": 0.84486669, + "learning_rate": 3.996789747161709e-06, + "loss": 0.87743706, + "num_input_tokens_seen": 16937125, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.54931641, + "step": 790, + "time_per_iteration": 3.005979299545288 + }, + { + "auxiliary_loss_clip": 0.01838206, + "auxiliary_loss_mlp": 0.01404923, + "balance_loss_clip": 1.51464772, + "balance_loss_mlp": 1.35344887, + "epoch": 0.047557492860363745, + "flos": 40494060875520.0, + "grad_norm": 1.8067560405250707, + "language_loss": 0.90425384, + "learning_rate": 3.996767651613597e-06, + "loss": 0.9366852, + "num_input_tokens_seen": 16958610, + "router_z_loss_clip": 3.234375, + "router_z_loss_mlp": 0.515625, + "step": 791, + "time_per_iteration": 3.03739595413208 + }, + { + "auxiliary_loss_clip": 0.01845247, + "auxiliary_loss_mlp": 0.0139394, + "balance_loss_clip": 1.5269959, + "balance_loss_mlp": 1.3427515, + "epoch": 0.04761761611303172, + "flos": 18707344997760.0, + "grad_norm": 1.9780024267043066, + "language_loss": 0.91547269, + "learning_rate": 3.996745480347854e-06, + "loss": 0.94786447, + "num_input_tokens_seen": 16977300, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.51123047, + "step": 792, + "time_per_iteration": 2.91386079788208 + }, + { + "auxiliary_loss_clip": 0.01851402, + "auxiliary_loss_mlp": 0.01397307, + "balance_loss_clip": 1.52458668, + "balance_loss_mlp": 1.3437109, + "epoch": 0.04767773936569968, + "flos": 20931477805440.0, + "grad_norm": 1.746686072804086, + "language_loss": 0.75069177, + "learning_rate": 3.996723233365324e-06, + "loss": 0.78317893, + "num_input_tokens_seen": 16994950, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.53588867, + "step": 793, + "time_per_iteration": 2.9040868282318115 + }, + { + "auxiliary_loss_clip": 0.01870766, + "auxiliary_loss_mlp": 0.01389, + "balance_loss_clip": 1.53725767, + "balance_loss_mlp": 1.33907485, + "epoch": 0.047737862618367655, + "flos": 23743233820800.0, + "grad_norm": 1.8813575652980163, + "language_loss": 0.89648056, + "learning_rate": 3.996700910666847e-06, + "loss": 0.92907822, + "num_input_tokens_seen": 17014760, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.4987793, + "step": 794, + "time_per_iteration": 2.917881488800049 + }, + { + "auxiliary_loss_clip": 0.01864417, + "auxiliary_loss_mlp": 0.01406163, + "balance_loss_clip": 1.53375578, + "balance_loss_mlp": 1.35018277, + "epoch": 0.04779798587103562, + "flos": 23706196312320.0, + "grad_norm": 2.4200086282525017, + "language_loss": 0.73541319, + "learning_rate": 3.996678512253272e-06, + "loss": 0.76811898, + "num_input_tokens_seen": 17032715, + "router_z_loss_clip": 3.3046875, + "router_z_loss_mlp": 0.55932617, + "step": 795, + "time_per_iteration": 2.9173803329467773 + }, + { + "auxiliary_loss_clip": 0.01827111, + "auxiliary_loss_mlp": 0.0140686, + "balance_loss_clip": 1.51393044, + "balance_loss_mlp": 1.33936346, + "epoch": 0.04785810912370359, + "flos": 23193598262400.0, + "grad_norm": 2.2317957737512293, + "language_loss": 0.82829022, + "learning_rate": 3.996656038125449e-06, + "loss": 0.86062998, + "num_input_tokens_seen": 17052215, + "router_z_loss_clip": 3.13671875, + "router_z_loss_mlp": 0.67504883, + "step": 796, + "time_per_iteration": 2.9835941791534424 + }, + { + "auxiliary_loss_clip": 0.01853467, + "auxiliary_loss_mlp": 0.01400692, + "balance_loss_clip": 1.52777743, + "balance_loss_mlp": 1.34390104, + "epoch": 0.047918232376371564, + "flos": 18049628315520.0, + "grad_norm": 2.072032321952671, + "language_loss": 0.84482729, + "learning_rate": 3.996633488284228e-06, + "loss": 0.87736887, + "num_input_tokens_seen": 17069225, + "router_z_loss_clip": 3.25585938, + "router_z_loss_mlp": 0.56787109, + "step": 797, + "time_per_iteration": 2.8903167247772217 + }, + { + "auxiliary_loss_clip": 0.01616355, + "auxiliary_loss_mlp": 0.01391907, + "balance_loss_clip": 1.41686726, + "balance_loss_mlp": 1.36730194, + "epoch": 0.04797835562903953, + "flos": 62472271305600.0, + "grad_norm": 0.9351957346055234, + "language_loss": 0.64653492, + "learning_rate": 3.996610862730465e-06, + "loss": 0.6766175, + "num_input_tokens_seen": 17126680, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.24609375, + "step": 798, + "time_per_iteration": 3.2863686084747314 + }, + { + "auxiliary_loss_clip": 0.01867027, + "auxiliary_loss_mlp": 0.01392628, + "balance_loss_clip": 1.52881086, + "balance_loss_mlp": 1.33745825, + "epoch": 0.0480384788817075, + "flos": 21517336465920.0, + "grad_norm": 1.9815362972049368, + "language_loss": 0.93685377, + "learning_rate": 3.996588161465018e-06, + "loss": 0.96945035, + "num_input_tokens_seen": 17144835, + "router_z_loss_clip": 3.38085938, + "router_z_loss_mlp": 0.55224609, + "step": 799, + "time_per_iteration": 2.9071173667907715 + }, + { + "auxiliary_loss_clip": 0.01850862, + "auxiliary_loss_mlp": 0.01389532, + "balance_loss_clip": 1.52861166, + "balance_loss_mlp": 1.34079969, + "epoch": 0.048098602134375466, + "flos": 21736937318400.0, + "grad_norm": 2.1364826074748073, + "language_loss": 0.88807458, + "learning_rate": 3.996565384488748e-06, + "loss": 0.92047846, + "num_input_tokens_seen": 17165030, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.48779297, + "step": 800, + "time_per_iteration": 2.979412078857422 + }, + { + "auxiliary_loss_clip": 0.01866784, + "auxiliary_loss_mlp": 0.01395592, + "balance_loss_clip": 1.53351831, + "balance_loss_mlp": 1.34561944, + "epoch": 0.04815872538704344, + "flos": 22941710605440.0, + "grad_norm": 2.061690072831917, + "language_loss": 0.86406684, + "learning_rate": 3.996542531802518e-06, + "loss": 0.89669061, + "num_input_tokens_seen": 17184895, + "router_z_loss_clip": 3.3359375, + "router_z_loss_mlp": 0.50048828, + "step": 801, + "time_per_iteration": 2.905458927154541 + }, + { + "auxiliary_loss_clip": 0.01863367, + "auxiliary_loss_mlp": 0.01404691, + "balance_loss_clip": 1.52951956, + "balance_loss_mlp": 1.35092807, + "epoch": 0.04821884863971141, + "flos": 43189048765440.0, + "grad_norm": 1.7019540161384774, + "language_loss": 0.82406479, + "learning_rate": 3.996519603407196e-06, + "loss": 0.85674536, + "num_input_tokens_seen": 17208225, + "router_z_loss_clip": 3.33984375, + "router_z_loss_mlp": 0.53759766, + "step": 802, + "time_per_iteration": 3.1033570766448975 + }, + { + "auxiliary_loss_clip": 0.01823356, + "auxiliary_loss_mlp": 0.01400601, + "balance_loss_clip": 1.5065794, + "balance_loss_mlp": 1.35375166, + "epoch": 0.048278971892379376, + "flos": 18628021584000.0, + "grad_norm": 1.739054873352336, + "language_loss": 0.8805483, + "learning_rate": 3.996496599303649e-06, + "loss": 0.91278785, + "num_input_tokens_seen": 17226305, + "router_z_loss_clip": 3.16601562, + "router_z_loss_mlp": 0.46875, + "step": 803, + "time_per_iteration": 2.910318613052368 + }, + { + "auxiliary_loss_clip": 0.01859458, + "auxiliary_loss_mlp": 0.01391074, + "balance_loss_clip": 1.53174305, + "balance_loss_mlp": 1.34138751, + "epoch": 0.04833909514504735, + "flos": 20239709771520.0, + "grad_norm": 2.2358467299767324, + "language_loss": 0.88474125, + "learning_rate": 3.996473519492753e-06, + "loss": 0.91724658, + "num_input_tokens_seen": 17244545, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.49682617, + "step": 804, + "time_per_iteration": 2.914029598236084 + }, + { + "auxiliary_loss_clip": 0.01839535, + "auxiliary_loss_mlp": 0.01416621, + "balance_loss_clip": 1.51489258, + "balance_loss_mlp": 1.36471725, + "epoch": 0.04839921839771532, + "flos": 24656050540800.0, + "grad_norm": 1.9013931102613904, + "language_loss": 0.88015497, + "learning_rate": 3.99645036397538e-06, + "loss": 0.91271645, + "num_input_tokens_seen": 17265730, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.51904297, + "step": 805, + "time_per_iteration": 3.0111243724823 + }, + { + "auxiliary_loss_clip": 0.01839249, + "auxiliary_loss_mlp": 0.0140756, + "balance_loss_clip": 1.52127957, + "balance_loss_mlp": 1.35575116, + "epoch": 0.048459341650383285, + "flos": 24838342416000.0, + "grad_norm": 2.1353189903897465, + "language_loss": 0.70154035, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.73400843, + "num_input_tokens_seen": 17284820, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.51855469, + "step": 806, + "time_per_iteration": 2.932370901107788 + }, + { + "auxiliary_loss_clip": 0.01817468, + "auxiliary_loss_mlp": 0.01407163, + "balance_loss_clip": 1.50604773, + "balance_loss_mlp": 1.35337567, + "epoch": 0.04851946490305126, + "flos": 22173198111360.0, + "grad_norm": 2.0724064528612103, + "language_loss": 0.78897703, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.82122338, + "num_input_tokens_seen": 17305085, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.53759766, + "step": 807, + "time_per_iteration": 2.962855339050293 + }, + { + "auxiliary_loss_clip": 0.0182054, + "auxiliary_loss_mlp": 0.01398479, + "balance_loss_clip": 1.50889182, + "balance_loss_mlp": 1.35072351, + "epoch": 0.04857958815571922, + "flos": 19801503452160.0, + "grad_norm": 2.1480360511518506, + "language_loss": 0.89356852, + "learning_rate": 3.9963804431932005e-06, + "loss": 0.92575872, + "num_input_tokens_seen": 17322715, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.4777832, + "step": 808, + "time_per_iteration": 2.9136788845062256 + }, + { + "auxiliary_loss_clip": 0.01850997, + "auxiliary_loss_mlp": 0.01401411, + "balance_loss_clip": 1.52032948, + "balance_loss_mlp": 1.34979367, + "epoch": 0.048639711408387194, + "flos": 18707299752960.0, + "grad_norm": 1.6030490585742618, + "language_loss": 0.91894817, + "learning_rate": 3.996356984858732e-06, + "loss": 0.95147228, + "num_input_tokens_seen": 17341455, + "router_z_loss_clip": 3.30859375, + "router_z_loss_mlp": 0.51635742, + "step": 809, + "time_per_iteration": 2.9515011310577393 + }, + { + "auxiliary_loss_clip": 0.01837949, + "auxiliary_loss_mlp": 0.01405112, + "balance_loss_clip": 1.5183723, + "balance_loss_mlp": 1.35707045, + "epoch": 0.048699834661055166, + "flos": 24874294049280.0, + "grad_norm": 1.785782932243432, + "language_loss": 0.87767208, + "learning_rate": 3.996333450822208e-06, + "loss": 0.91010273, + "num_input_tokens_seen": 17360765, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.48022461, + "step": 810, + "time_per_iteration": 3.045872688293457 + }, + { + "auxiliary_loss_clip": 0.01847411, + "auxiliary_loss_mlp": 0.01397066, + "balance_loss_clip": 1.52350378, + "balance_loss_mlp": 1.34683156, + "epoch": 0.04875995791372313, + "flos": 20713686744960.0, + "grad_norm": 1.8408782498077456, + "language_loss": 0.82856613, + "learning_rate": 3.99630984108452e-06, + "loss": 0.86101091, + "num_input_tokens_seen": 17380625, + "router_z_loss_clip": 3.23828125, + "router_z_loss_mlp": 0.50219727, + "step": 811, + "time_per_iteration": 2.903075933456421 + }, + { + "auxiliary_loss_clip": 0.01818789, + "auxiliary_loss_mlp": 0.01390545, + "balance_loss_clip": 1.50683105, + "balance_loss_mlp": 1.34019077, + "epoch": 0.048820081166391104, + "flos": 18597544571520.0, + "grad_norm": 1.6497366446825628, + "language_loss": 0.76484424, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.79693758, + "num_input_tokens_seen": 17399355, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.50292969, + "step": 812, + "time_per_iteration": 3.0392627716064453 + }, + { + "auxiliary_loss_clip": 0.01798738, + "auxiliary_loss_mlp": 0.01396716, + "balance_loss_clip": 1.5012238, + "balance_loss_mlp": 1.3534193, + "epoch": 0.04888020441905907, + "flos": 22716635132160.0, + "grad_norm": 1.7809746580975487, + "language_loss": 0.92108989, + "learning_rate": 3.996262394509233e-06, + "loss": 0.95304447, + "num_input_tokens_seen": 17418240, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.43310547, + "step": 813, + "time_per_iteration": 2.8799514770507812 + }, + { + "auxiliary_loss_clip": 0.01808244, + "auxiliary_loss_mlp": 0.01386942, + "balance_loss_clip": 1.49865031, + "balance_loss_mlp": 1.34359753, + "epoch": 0.04894032767172704, + "flos": 22794736936320.0, + "grad_norm": 2.336550220074976, + "language_loss": 0.77508688, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.80703872, + "num_input_tokens_seen": 17436250, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.43334961, + "step": 814, + "time_per_iteration": 2.940197706222534 + }, + { + "auxiliary_loss_clip": 0.01799965, + "auxiliary_loss_mlp": 0.01394469, + "balance_loss_clip": 1.49066353, + "balance_loss_mlp": 1.34566522, + "epoch": 0.04900045092439501, + "flos": 25526038417920.0, + "grad_norm": 3.910530535185917, + "language_loss": 0.86133736, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.8932817, + "num_input_tokens_seen": 17455750, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.48852539, + "step": 815, + "time_per_iteration": 2.9385859966278076 + }, + { + "auxiliary_loss_clip": 0.01822635, + "auxiliary_loss_mlp": 0.01388171, + "balance_loss_clip": 1.50163722, + "balance_loss_mlp": 1.33993936, + "epoch": 0.04906057417706298, + "flos": 25969221665280.0, + "grad_norm": 2.0774461332568364, + "language_loss": 0.94099939, + "learning_rate": 3.996190656910043e-06, + "loss": 0.97310746, + "num_input_tokens_seen": 17474995, + "router_z_loss_clip": 3.20703125, + "router_z_loss_mlp": 0.48217773, + "step": 816, + "time_per_iteration": 4.376782417297363 + }, + { + "auxiliary_loss_clip": 0.01814921, + "auxiliary_loss_mlp": 0.01388295, + "balance_loss_clip": 1.49504447, + "balance_loss_mlp": 1.34132648, + "epoch": 0.04912069742973095, + "flos": 18633858163200.0, + "grad_norm": 2.1324283968048285, + "language_loss": 0.82597482, + "learning_rate": 3.996166592984268e-06, + "loss": 0.85800701, + "num_input_tokens_seen": 17493395, + "router_z_loss_clip": 3.203125, + "router_z_loss_mlp": 0.46948242, + "step": 817, + "time_per_iteration": 2.895831823348999 + }, + { + "auxiliary_loss_clip": 0.01800808, + "auxiliary_loss_mlp": 0.01402223, + "balance_loss_clip": 1.49211407, + "balance_loss_mlp": 1.352036, + "epoch": 0.049180820682398915, + "flos": 23711263729920.0, + "grad_norm": 1.5462755635436127, + "language_loss": 0.86202067, + "learning_rate": 3.996142453363656e-06, + "loss": 0.89405096, + "num_input_tokens_seen": 17514565, + "router_z_loss_clip": 3.08398438, + "router_z_loss_mlp": 0.50219727, + "step": 818, + "time_per_iteration": 2.9735167026519775 + }, + { + "auxiliary_loss_clip": 0.01826964, + "auxiliary_loss_mlp": 0.01393234, + "balance_loss_clip": 1.49724865, + "balance_loss_mlp": 1.34652781, + "epoch": 0.04924094393506689, + "flos": 22430922347520.0, + "grad_norm": 2.7282464017841166, + "language_loss": 0.79514813, + "learning_rate": 3.996118238049124e-06, + "loss": 0.82735014, + "num_input_tokens_seen": 17534590, + "router_z_loss_clip": 3.296875, + "router_z_loss_mlp": 0.46679688, + "step": 819, + "time_per_iteration": 4.409702301025391 + }, + { + "auxiliary_loss_clip": 0.0180604, + "auxiliary_loss_mlp": 0.01396224, + "balance_loss_clip": 1.48923528, + "balance_loss_mlp": 1.3488977, + "epoch": 0.04930106718773486, + "flos": 15745674401280.0, + "grad_norm": 2.1236838463034413, + "language_loss": 0.86490518, + "learning_rate": 3.996093947041586e-06, + "loss": 0.89692783, + "num_input_tokens_seen": 17551900, + "router_z_loss_clip": 3.1640625, + "router_z_loss_mlp": 0.47363281, + "step": 820, + "time_per_iteration": 4.201559543609619 + }, + { + "auxiliary_loss_clip": 0.01781165, + "auxiliary_loss_mlp": 0.01380241, + "balance_loss_clip": 1.46974015, + "balance_loss_mlp": 1.33441651, + "epoch": 0.049361190440402825, + "flos": 26261947393920.0, + "grad_norm": 1.6835459718250223, + "language_loss": 0.92528224, + "learning_rate": 3.996069580341966e-06, + "loss": 0.95689625, + "num_input_tokens_seen": 17571485, + "router_z_loss_clip": 3.109375, + "router_z_loss_mlp": 0.45825195, + "step": 821, + "time_per_iteration": 2.9575586318969727 + }, + { + "auxiliary_loss_clip": 0.01794888, + "auxiliary_loss_mlp": 0.01399648, + "balance_loss_clip": 1.48152208, + "balance_loss_mlp": 1.34855485, + "epoch": 0.0494213136930708, + "flos": 21262462652160.0, + "grad_norm": 1.7941132314032007, + "language_loss": 0.91358036, + "learning_rate": 3.996045137951188e-06, + "loss": 0.94552571, + "num_input_tokens_seen": 17591410, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.51074219, + "step": 822, + "time_per_iteration": 2.920083999633789 + }, + { + "auxiliary_loss_clip": 0.01802128, + "auxiliary_loss_mlp": 0.01390564, + "balance_loss_clip": 1.48946881, + "balance_loss_mlp": 1.34135473, + "epoch": 0.04948143694573876, + "flos": 27977282714880.0, + "grad_norm": 1.7603327171710064, + "language_loss": 0.69669139, + "learning_rate": 3.996020619870178e-06, + "loss": 0.72861838, + "num_input_tokens_seen": 17612010, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.49291992, + "step": 823, + "time_per_iteration": 2.91367506980896 + }, + { + "auxiliary_loss_clip": 0.01516922, + "auxiliary_loss_mlp": 0.01401135, + "balance_loss_clip": 1.33240712, + "balance_loss_mlp": 1.38740194, + "epoch": 0.049541560198406734, + "flos": 66206553690240.0, + "grad_norm": 1.3610024248720263, + "language_loss": 0.62392616, + "learning_rate": 3.995996026099866e-06, + "loss": 0.65310669, + "num_input_tokens_seen": 17673430, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.13769531, + "step": 824, + "time_per_iteration": 3.3961522579193115 + }, + { + "auxiliary_loss_clip": 0.01825708, + "auxiliary_loss_mlp": 0.01394966, + "balance_loss_clip": 1.49865556, + "balance_loss_mlp": 1.3435396, + "epoch": 0.049601683451074706, + "flos": 22902818060160.0, + "grad_norm": 1.9881053459045397, + "language_loss": 0.92805862, + "learning_rate": 3.995971356641185e-06, + "loss": 0.96026534, + "num_input_tokens_seen": 17689545, + "router_z_loss_clip": 3.27539062, + "router_z_loss_mlp": 0.51489258, + "step": 825, + "time_per_iteration": 2.879167318344116 + }, + { + "auxiliary_loss_clip": 0.01786378, + "auxiliary_loss_mlp": 0.01380799, + "balance_loss_clip": 1.4775238, + "balance_loss_mlp": 1.33292496, + "epoch": 0.04966180670374267, + "flos": 21443487672960.0, + "grad_norm": 4.2119845320806, + "language_loss": 0.69332087, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.72499263, + "num_input_tokens_seen": 17705965, + "router_z_loss_clip": 3.0859375, + "router_z_loss_mlp": 0.47875977, + "step": 826, + "time_per_iteration": 2.9242491722106934 + }, + { + "auxiliary_loss_clip": 0.01788047, + "auxiliary_loss_mlp": 0.01381852, + "balance_loss_clip": 1.47656679, + "balance_loss_mlp": 1.33695745, + "epoch": 0.04972192995641064, + "flos": 23116446599040.0, + "grad_norm": 1.871044601870244, + "language_loss": 0.80878162, + "learning_rate": 3.995921790662459e-06, + "loss": 0.84048057, + "num_input_tokens_seen": 17724580, + "router_z_loss_clip": 3.11132812, + "router_z_loss_mlp": 0.44873047, + "step": 827, + "time_per_iteration": 2.9052062034606934 + }, + { + "auxiliary_loss_clip": 0.01799373, + "auxiliary_loss_mlp": 0.01387627, + "balance_loss_clip": 1.48233485, + "balance_loss_mlp": 1.33734488, + "epoch": 0.04978205320907861, + "flos": 40420890754560.0, + "grad_norm": 1.7401849402315255, + "language_loss": 0.80930394, + "learning_rate": 3.995896894144294e-06, + "loss": 0.84117401, + "num_input_tokens_seen": 17747755, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.50268555, + "step": 828, + "time_per_iteration": 3.0711886882781982 + }, + { + "auxiliary_loss_clip": 0.01774237, + "auxiliary_loss_mlp": 0.01384949, + "balance_loss_clip": 1.4699049, + "balance_loss_mlp": 1.33800399, + "epoch": 0.04984217646174658, + "flos": 25239465982080.0, + "grad_norm": 1.6343608822225304, + "language_loss": 0.86034715, + "learning_rate": 3.995871921941519e-06, + "loss": 0.8919391, + "num_input_tokens_seen": 17768550, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.46899414, + "step": 829, + "time_per_iteration": 2.9460737705230713 + }, + { + "auxiliary_loss_clip": 0.01802786, + "auxiliary_loss_mlp": 0.01398795, + "balance_loss_clip": 1.48619795, + "balance_loss_mlp": 1.34488869, + "epoch": 0.04990229971441455, + "flos": 15967763717760.0, + "grad_norm": 1.851268271977176, + "language_loss": 0.77943528, + "learning_rate": 3.99584687405508e-06, + "loss": 0.81145108, + "num_input_tokens_seen": 17786080, + "router_z_loss_clip": 3.16210938, + "router_z_loss_mlp": 0.53881836, + "step": 830, + "time_per_iteration": 2.8687191009521484 + }, + { + "auxiliary_loss_clip": 0.01787295, + "auxiliary_loss_mlp": 0.01393857, + "balance_loss_clip": 1.47876096, + "balance_loss_mlp": 1.34262109, + "epoch": 0.04996242296708252, + "flos": 18414121576320.0, + "grad_norm": 1.8460346272955606, + "language_loss": 0.80827886, + "learning_rate": 3.995821750485929e-06, + "loss": 0.84009039, + "num_input_tokens_seen": 17803635, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.51220703, + "step": 831, + "time_per_iteration": 2.877595901489258 + }, + { + "auxiliary_loss_clip": 0.01805221, + "auxiliary_loss_mlp": 0.01409239, + "balance_loss_clip": 1.48660588, + "balance_loss_mlp": 1.35959983, + "epoch": 0.05002254621975049, + "flos": 17866974481920.0, + "grad_norm": 2.8871010573190796, + "language_loss": 0.94391936, + "learning_rate": 3.995796551235016e-06, + "loss": 0.97606391, + "num_input_tokens_seen": 17822190, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.49682617, + "step": 832, + "time_per_iteration": 2.890181303024292 + }, + { + "auxiliary_loss_clip": 0.01788666, + "auxiliary_loss_mlp": 0.01395125, + "balance_loss_clip": 1.48139071, + "balance_loss_mlp": 1.3493011, + "epoch": 0.050082669472418455, + "flos": 45676425674880.0, + "grad_norm": 2.051987689918829, + "language_loss": 0.84519434, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.87703222, + "num_input_tokens_seen": 17846915, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.45874023, + "step": 833, + "time_per_iteration": 3.150804281234741 + }, + { + "auxiliary_loss_clip": 0.01816112, + "auxiliary_loss_mlp": 0.01402476, + "balance_loss_clip": 1.50085545, + "balance_loss_mlp": 1.34718704, + "epoch": 0.05014279272508643, + "flos": 37976297443200.0, + "grad_norm": 2.065844511031575, + "language_loss": 0.84388304, + "learning_rate": 3.995745925691733e-06, + "loss": 0.87606889, + "num_input_tokens_seen": 17867270, + "router_z_loss_clip": 3.14648438, + "router_z_loss_mlp": 0.55273438, + "step": 834, + "time_per_iteration": 3.060631275177002 + }, + { + "auxiliary_loss_clip": 0.01836929, + "auxiliary_loss_mlp": 0.01389722, + "balance_loss_clip": 1.51282406, + "balance_loss_mlp": 1.34432733, + "epoch": 0.0502029159777544, + "flos": 21006005270400.0, + "grad_norm": 15.201290321965066, + "language_loss": 0.94532466, + "learning_rate": 3.995720499401282e-06, + "loss": 0.97759116, + "num_input_tokens_seen": 17884880, + "router_z_loss_clip": 3.23828125, + "router_z_loss_mlp": 0.45361328, + "step": 835, + "time_per_iteration": 2.848585605621338 + }, + { + "auxiliary_loss_clip": 0.0183201, + "auxiliary_loss_mlp": 0.01413752, + "balance_loss_clip": 1.51348972, + "balance_loss_mlp": 1.36449456, + "epoch": 0.050263039230422364, + "flos": 15896539123200.0, + "grad_norm": 2.4706956653806955, + "language_loss": 0.80907446, + "learning_rate": 3.995694997432911e-06, + "loss": 0.84153211, + "num_input_tokens_seen": 17903695, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.4921875, + "step": 836, + "time_per_iteration": 2.8848702907562256 + }, + { + "auxiliary_loss_clip": 0.018032, + "auxiliary_loss_mlp": 0.01392972, + "balance_loss_clip": 1.50184894, + "balance_loss_mlp": 1.34605145, + "epoch": 0.050323162483090336, + "flos": 23743324310400.0, + "grad_norm": 2.1252050587452906, + "language_loss": 0.85682911, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.88879079, + "num_input_tokens_seen": 17920745, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.46948242, + "step": 837, + "time_per_iteration": 2.866690158843994 + }, + { + "auxiliary_loss_clip": 0.01819871, + "auxiliary_loss_mlp": 0.0138325, + "balance_loss_clip": 1.50013995, + "balance_loss_mlp": 1.33888054, + "epoch": 0.0503832857357583, + "flos": 20275797139200.0, + "grad_norm": 2.4298288551517895, + "language_loss": 0.75213742, + "learning_rate": 3.995643766466275e-06, + "loss": 0.78416866, + "num_input_tokens_seen": 17938220, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.44433594, + "step": 838, + "time_per_iteration": 2.90195369720459 + }, + { + "auxiliary_loss_clip": 0.01816102, + "auxiliary_loss_mlp": 0.01389304, + "balance_loss_clip": 1.49762249, + "balance_loss_mlp": 1.33852053, + "epoch": 0.05044340898842627, + "flos": 17793894850560.0, + "grad_norm": 2.267826514107252, + "language_loss": 0.85492152, + "learning_rate": 3.995618037469953e-06, + "loss": 0.88697553, + "num_input_tokens_seen": 17957325, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.5078125, + "step": 839, + "time_per_iteration": 2.8225862979888916 + }, + { + "auxiliary_loss_clip": 0.01823601, + "auxiliary_loss_mlp": 0.01387291, + "balance_loss_clip": 1.51064348, + "balance_loss_mlp": 1.34013176, + "epoch": 0.050503532241094246, + "flos": 22976893077120.0, + "grad_norm": 1.8300055540046678, + "language_loss": 0.87262237, + "learning_rate": 3.995592232799595e-06, + "loss": 0.90473127, + "num_input_tokens_seen": 17975875, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.47192383, + "step": 840, + "time_per_iteration": 2.916903018951416 + }, + { + "auxiliary_loss_clip": 0.0182705, + "auxiliary_loss_mlp": 0.01383786, + "balance_loss_clip": 1.51147962, + "balance_loss_mlp": 1.33567333, + "epoch": 0.05056365549376221, + "flos": 22785461752320.0, + "grad_norm": 2.010340981670334, + "language_loss": 0.96352673, + "learning_rate": 3.99556635245618e-06, + "loss": 0.99563515, + "num_input_tokens_seen": 17994340, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.48120117, + "step": 841, + "time_per_iteration": 3.005314826965332 + }, + { + "auxiliary_loss_clip": 0.01807994, + "auxiliary_loss_mlp": 0.01391783, + "balance_loss_clip": 1.497877, + "balance_loss_mlp": 1.33863902, + "epoch": 0.05062377874643018, + "flos": 30928501762560.0, + "grad_norm": 1.9655716273118453, + "language_loss": 0.79264289, + "learning_rate": 3.995540396440688e-06, + "loss": 0.82464063, + "num_input_tokens_seen": 18015260, + "router_z_loss_clip": 3.09960938, + "router_z_loss_mlp": 0.53100586, + "step": 842, + "time_per_iteration": 3.043440341949463 + }, + { + "auxiliary_loss_clip": 0.01855147, + "auxiliary_loss_mlp": 0.01380734, + "balance_loss_clip": 1.526613, + "balance_loss_mlp": 1.33295536, + "epoch": 0.05068390199909815, + "flos": 19656882512640.0, + "grad_norm": 3.20927757314938, + "language_loss": 0.79720527, + "learning_rate": 3.995514364754105e-06, + "loss": 0.82956409, + "num_input_tokens_seen": 18033960, + "router_z_loss_clip": 3.28125, + "router_z_loss_mlp": 0.47827148, + "step": 843, + "time_per_iteration": 3.000242233276367 + }, + { + "auxiliary_loss_clip": 0.01838416, + "auxiliary_loss_mlp": 0.01382012, + "balance_loss_clip": 1.51292562, + "balance_loss_mlp": 1.33113313, + "epoch": 0.05074402525176612, + "flos": 37976342688000.0, + "grad_norm": 2.0326589152989456, + "language_loss": 0.85111976, + "learning_rate": 3.995488257397417e-06, + "loss": 0.88332403, + "num_input_tokens_seen": 18056700, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.50854492, + "step": 844, + "time_per_iteration": 3.0275068283081055 + }, + { + "auxiliary_loss_clip": 0.01826131, + "auxiliary_loss_mlp": 0.0137729, + "balance_loss_clip": 1.50440192, + "balance_loss_mlp": 1.33349252, + "epoch": 0.05080414850443409, + "flos": 22064800273920.0, + "grad_norm": 2.6589327775192992, + "language_loss": 0.77616584, + "learning_rate": 3.995462074371614e-06, + "loss": 0.80820012, + "num_input_tokens_seen": 18075815, + "router_z_loss_clip": 3.22070312, + "router_z_loss_mlp": 0.43798828, + "step": 845, + "time_per_iteration": 2.9278721809387207 + }, + { + "auxiliary_loss_clip": 0.01816248, + "auxiliary_loss_mlp": 0.01372795, + "balance_loss_clip": 1.50177646, + "balance_loss_mlp": 1.3196516, + "epoch": 0.05086427175710206, + "flos": 20234325640320.0, + "grad_norm": 1.8893195906928064, + "language_loss": 0.89712608, + "learning_rate": 3.99543581567769e-06, + "loss": 0.92901647, + "num_input_tokens_seen": 18095095, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.53100586, + "step": 846, + "time_per_iteration": 2.9648349285125732 + }, + { + "auxiliary_loss_clip": 0.01813592, + "auxiliary_loss_mlp": 0.01369228, + "balance_loss_clip": 1.50278819, + "balance_loss_mlp": 1.32481062, + "epoch": 0.05092439500977003, + "flos": 15167733580800.0, + "grad_norm": 2.524893424300868, + "language_loss": 0.8969394, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.92876762, + "num_input_tokens_seen": 18112675, + "router_z_loss_clip": 3.109375, + "router_z_loss_mlp": 0.4440918, + "step": 847, + "time_per_iteration": 2.8614354133605957 + }, + { + "auxiliary_loss_clip": 0.01806018, + "auxiliary_loss_mlp": 0.01367872, + "balance_loss_clip": 1.49729681, + "balance_loss_mlp": 1.32123733, + "epoch": 0.050984518262437994, + "flos": 22065478945920.0, + "grad_norm": 4.636223784752397, + "language_loss": 0.84552395, + "learning_rate": 3.995383071289462e-06, + "loss": 0.87726289, + "num_input_tokens_seen": 18130745, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.46655273, + "step": 848, + "time_per_iteration": 3.0094499588012695 + }, + { + "auxiliary_loss_clip": 0.01799842, + "auxiliary_loss_mlp": 0.01377914, + "balance_loss_clip": 1.49196172, + "balance_loss_mlp": 1.33433104, + "epoch": 0.05104464151510597, + "flos": 30237095687040.0, + "grad_norm": 1.7011880709186042, + "language_loss": 0.89660692, + "learning_rate": 3.995356585597158e-06, + "loss": 0.92838448, + "num_input_tokens_seen": 18152410, + "router_z_loss_clip": 3.078125, + "router_z_loss_mlp": 0.43579102, + "step": 849, + "time_per_iteration": 2.95316219329834 + }, + { + "auxiliary_loss_clip": 0.01813582, + "auxiliary_loss_mlp": 0.013616, + "balance_loss_clip": 1.50708103, + "balance_loss_mlp": 1.31689668, + "epoch": 0.05110476476777394, + "flos": 18342082575360.0, + "grad_norm": 1.790855200726768, + "language_loss": 0.86346138, + "learning_rate": 3.995330024240732e-06, + "loss": 0.89521313, + "num_input_tokens_seen": 18170870, + "router_z_loss_clip": 3.06640625, + "router_z_loss_mlp": 0.44702148, + "step": 850, + "time_per_iteration": 2.878868818283081 + }, + { + "auxiliary_loss_clip": 0.01817026, + "auxiliary_loss_mlp": 0.01378849, + "balance_loss_clip": 1.50307941, + "balance_loss_mlp": 1.32842302, + "epoch": 0.051164888020441904, + "flos": 38013018238080.0, + "grad_norm": 2.1813917151089823, + "language_loss": 0.67864013, + "learning_rate": 3.995303387221192e-06, + "loss": 0.71059883, + "num_input_tokens_seen": 18191555, + "router_z_loss_clip": 3.13671875, + "router_z_loss_mlp": 0.50439453, + "step": 851, + "time_per_iteration": 4.476234436035156 + }, + { + "auxiliary_loss_clip": 0.01822883, + "auxiliary_loss_mlp": 0.01377187, + "balance_loss_clip": 1.50723672, + "balance_loss_mlp": 1.33050513, + "epoch": 0.051225011273109876, + "flos": 23048796343680.0, + "grad_norm": 2.095090364139327, + "language_loss": 0.85511827, + "learning_rate": 3.995276674539547e-06, + "loss": 0.88711894, + "num_input_tokens_seen": 18208620, + "router_z_loss_clip": 3.15429688, + "router_z_loss_mlp": 0.46679688, + "step": 852, + "time_per_iteration": 2.8926923274993896 + }, + { + "auxiliary_loss_clip": 0.01823062, + "auxiliary_loss_mlp": 0.01375601, + "balance_loss_clip": 1.50540531, + "balance_loss_mlp": 1.33275747, + "epoch": 0.05128513452577785, + "flos": 18268957699200.0, + "grad_norm": 3.4281229407797915, + "language_loss": 0.81977731, + "learning_rate": 3.995249886196811e-06, + "loss": 0.8517639, + "num_input_tokens_seen": 18226370, + "router_z_loss_clip": 3.171875, + "router_z_loss_mlp": 0.42797852, + "step": 853, + "time_per_iteration": 2.8876209259033203 + }, + { + "auxiliary_loss_clip": 0.01804689, + "auxiliary_loss_mlp": 0.01377132, + "balance_loss_clip": 1.49787354, + "balance_loss_mlp": 1.33085454, + "epoch": 0.05134525777844581, + "flos": 27210444278400.0, + "grad_norm": 1.7967464202972343, + "language_loss": 0.79279524, + "learning_rate": 3.995223022193999e-06, + "loss": 0.82461345, + "num_input_tokens_seen": 18247075, + "router_z_loss_clip": 3.06835938, + "router_z_loss_mlp": 0.46289062, + "step": 854, + "time_per_iteration": 7.1201441287994385 + }, + { + "auxiliary_loss_clip": 0.01829336, + "auxiliary_loss_mlp": 0.01362233, + "balance_loss_clip": 1.5116694, + "balance_loss_mlp": 1.3195796, + "epoch": 0.051405381031113785, + "flos": 28372931660160.0, + "grad_norm": 2.066049808287637, + "language_loss": 0.8404395, + "learning_rate": 3.99519608253213e-06, + "loss": 0.87235522, + "num_input_tokens_seen": 18265680, + "router_z_loss_clip": 3.17773438, + "router_z_loss_mlp": 0.42675781, + "step": 855, + "time_per_iteration": 2.9628236293792725 + }, + { + "auxiliary_loss_clip": 0.01576137, + "auxiliary_loss_mlp": 0.01350152, + "balance_loss_clip": 1.39099693, + "balance_loss_mlp": 1.32983899, + "epoch": 0.05146550428378175, + "flos": 65649018274560.0, + "grad_norm": 1.0166848510085653, + "language_loss": 0.65837896, + "learning_rate": 3.995169067212227e-06, + "loss": 0.68764186, + "num_input_tokens_seen": 18327015, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.203125, + "step": 856, + "time_per_iteration": 3.348721504211426 + }, + { + "auxiliary_loss_clip": 0.01808813, + "auxiliary_loss_mlp": 0.01356211, + "balance_loss_clip": 1.50570428, + "balance_loss_mlp": 1.31434441, + "epoch": 0.05152562753644972, + "flos": 22065116987520.0, + "grad_norm": 1.6972895422584147, + "language_loss": 0.78500831, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.81665862, + "num_input_tokens_seen": 18345235, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.41821289, + "step": 857, + "time_per_iteration": 2.9286608695983887 + }, + { + "auxiliary_loss_clip": 0.01817284, + "auxiliary_loss_mlp": 0.01366413, + "balance_loss_clip": 1.50524783, + "balance_loss_mlp": 1.32356966, + "epoch": 0.051585750789117694, + "flos": 18517271016960.0, + "grad_norm": 2.1992065336226343, + "language_loss": 0.8981958, + "learning_rate": 3.995114809602412e-06, + "loss": 0.93003279, + "num_input_tokens_seen": 18362350, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.4284668, + "step": 858, + "time_per_iteration": 2.9300036430358887 + }, + { + "auxiliary_loss_clip": 0.01830388, + "auxiliary_loss_mlp": 0.01362196, + "balance_loss_clip": 1.51691341, + "balance_loss_mlp": 1.31720686, + "epoch": 0.05164587404178566, + "flos": 23740157174400.0, + "grad_norm": 2.003928726043768, + "language_loss": 0.78726923, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.81919509, + "num_input_tokens_seen": 18383390, + "router_z_loss_clip": 3.13671875, + "router_z_loss_mlp": 0.44995117, + "step": 859, + "time_per_iteration": 2.8781325817108154 + }, + { + "auxiliary_loss_clip": 0.01850452, + "auxiliary_loss_mlp": 0.0136897, + "balance_loss_clip": 1.52527332, + "balance_loss_mlp": 1.32390881, + "epoch": 0.05170599729445363, + "flos": 16261439587200.0, + "grad_norm": 2.1278202593354476, + "language_loss": 0.92207611, + "learning_rate": 3.995060249372788e-06, + "loss": 0.9542703, + "num_input_tokens_seen": 18399220, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.45019531, + "step": 860, + "time_per_iteration": 2.8851518630981445 + }, + { + "auxiliary_loss_clip": 0.01828204, + "auxiliary_loss_mlp": 0.01358449, + "balance_loss_clip": 1.51953816, + "balance_loss_mlp": 1.31708312, + "epoch": 0.0517661205471216, + "flos": 23995981128960.0, + "grad_norm": 1.7694978813659668, + "language_loss": 0.83461797, + "learning_rate": 3.99503285577813e-06, + "loss": 0.86648452, + "num_input_tokens_seen": 18419005, + "router_z_loss_clip": 3.0859375, + "router_z_loss_mlp": 0.41381836, + "step": 861, + "time_per_iteration": 2.9634711742401123 + }, + { + "auxiliary_loss_clip": 0.01852806, + "auxiliary_loss_mlp": 0.01369825, + "balance_loss_clip": 1.53507268, + "balance_loss_mlp": 1.32917476, + "epoch": 0.05182624379978957, + "flos": 29289503698560.0, + "grad_norm": 1.5563595125341918, + "language_loss": 0.80394101, + "learning_rate": 3.995005386531627e-06, + "loss": 0.83616734, + "num_input_tokens_seen": 18440550, + "router_z_loss_clip": 3.17578125, + "router_z_loss_mlp": 0.40649414, + "step": 862, + "time_per_iteration": 2.9379584789276123 + }, + { + "auxiliary_loss_clip": 0.01830652, + "auxiliary_loss_mlp": 0.01357801, + "balance_loss_clip": 1.52242565, + "balance_loss_mlp": 1.31397963, + "epoch": 0.05188636705245754, + "flos": 24181575874560.0, + "grad_norm": 2.2707327723332122, + "language_loss": 0.91866994, + "learning_rate": 3.9949778416343195e-06, + "loss": 0.95055443, + "num_input_tokens_seen": 18461950, + "router_z_loss_clip": 3.08398438, + "router_z_loss_mlp": 0.4387207, + "step": 863, + "time_per_iteration": 2.9594500064849854 + }, + { + "auxiliary_loss_clip": 0.018547, + "auxiliary_loss_mlp": 0.01378785, + "balance_loss_clip": 1.5396688, + "balance_loss_mlp": 1.33369994, + "epoch": 0.051946490305125506, + "flos": 26771966490240.0, + "grad_norm": 1.9135553070425162, + "language_loss": 0.77285695, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.80519176, + "num_input_tokens_seen": 18480555, + "router_z_loss_clip": 3.1484375, + "router_z_loss_mlp": 0.45068359, + "step": 864, + "time_per_iteration": 2.916667938232422 + }, + { + "auxiliary_loss_clip": 0.01851128, + "auxiliary_loss_mlp": 0.01362304, + "balance_loss_clip": 1.53160191, + "balance_loss_mlp": 1.32170153, + "epoch": 0.05200661355779348, + "flos": 21511318907520.0, + "grad_norm": 2.3824763250921954, + "language_loss": 0.814924, + "learning_rate": 3.994922524891474e-06, + "loss": 0.84705842, + "num_input_tokens_seen": 18499645, + "router_z_loss_clip": 3.19726562, + "router_z_loss_mlp": 0.40600586, + "step": 865, + "time_per_iteration": 2.9067957401275635 + }, + { + "auxiliary_loss_clip": 0.0182903, + "auxiliary_loss_mlp": 0.01361395, + "balance_loss_clip": 1.52244174, + "balance_loss_mlp": 1.31909978, + "epoch": 0.05206673681046144, + "flos": 18123748577280.0, + "grad_norm": 2.1988326702584735, + "language_loss": 0.88584638, + "learning_rate": 3.994894753048032e-06, + "loss": 0.9177506, + "num_input_tokens_seen": 18516810, + "router_z_loss_clip": 3.06640625, + "router_z_loss_mlp": 0.42333984, + "step": 866, + "time_per_iteration": 2.9035234451293945 + }, + { + "auxiliary_loss_clip": 0.01844, + "auxiliary_loss_mlp": 0.01377901, + "balance_loss_clip": 1.5372839, + "balance_loss_mlp": 1.33779883, + "epoch": 0.052126860063129415, + "flos": 17531600889600.0, + "grad_norm": 2.1165075224252465, + "language_loss": 0.90509367, + "learning_rate": 3.9948669055579815e-06, + "loss": 0.93731272, + "num_input_tokens_seen": 18532510, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.40087891, + "step": 867, + "time_per_iteration": 2.9128005504608154 + }, + { + "auxiliary_loss_clip": 0.01818964, + "auxiliary_loss_mlp": 0.01368096, + "balance_loss_clip": 1.52205837, + "balance_loss_mlp": 1.32732606, + "epoch": 0.05218698331579739, + "flos": 32610735872640.0, + "grad_norm": 1.4607265956456723, + "language_loss": 0.64220965, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.67408025, + "num_input_tokens_seen": 18557380, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.4074707, + "step": 868, + "time_per_iteration": 3.0640904903411865 + }, + { + "auxiliary_loss_clip": 0.01866036, + "auxiliary_loss_mlp": 0.01358639, + "balance_loss_clip": 1.54187369, + "balance_loss_mlp": 1.31546199, + "epoch": 0.05224710656846535, + "flos": 22137427457280.0, + "grad_norm": 1.7261248696424438, + "language_loss": 0.8508594, + "learning_rate": 3.994810983642281e-06, + "loss": 0.88310611, + "num_input_tokens_seen": 18575720, + "router_z_loss_clip": 3.23632812, + "router_z_loss_mlp": 0.43164062, + "step": 869, + "time_per_iteration": 2.917043924331665 + }, + { + "auxiliary_loss_clip": 0.01863737, + "auxiliary_loss_mlp": 0.01381443, + "balance_loss_clip": 1.53866887, + "balance_loss_mlp": 1.33683527, + "epoch": 0.052307229821133325, + "flos": 11152380643200.0, + "grad_norm": 2.0377219942512115, + "language_loss": 0.8958711, + "learning_rate": 3.994782909218751e-06, + "loss": 0.92832291, + "num_input_tokens_seen": 18592185, + "router_z_loss_clip": 3.25, + "router_z_loss_mlp": 0.44580078, + "step": 870, + "time_per_iteration": 2.97171688079834 + }, + { + "auxiliary_loss_clip": 0.01850786, + "auxiliary_loss_mlp": 0.0136775, + "balance_loss_clip": 1.53517175, + "balance_loss_mlp": 1.3231895, + "epoch": 0.05236735307380129, + "flos": 19135506971520.0, + "grad_norm": 2.1034959426379363, + "language_loss": 0.82252818, + "learning_rate": 3.994754759152854e-06, + "loss": 0.85471356, + "num_input_tokens_seen": 18609560, + "router_z_loss_clip": 3.15429688, + "router_z_loss_mlp": 0.44580078, + "step": 871, + "time_per_iteration": 2.9046144485473633 + }, + { + "auxiliary_loss_clip": 0.01819981, + "auxiliary_loss_mlp": 0.01360846, + "balance_loss_clip": 1.52098656, + "balance_loss_mlp": 1.31764436, + "epoch": 0.05242747632646926, + "flos": 20970732309120.0, + "grad_norm": 1.8439899003995517, + "language_loss": 0.82167101, + "learning_rate": 3.994726533445656e-06, + "loss": 0.85347927, + "num_input_tokens_seen": 18629405, + "router_z_loss_clip": 2.99023438, + "router_z_loss_mlp": 0.43212891, + "step": 872, + "time_per_iteration": 2.9418015480041504 + }, + { + "auxiliary_loss_clip": 0.01568037, + "auxiliary_loss_mlp": 0.01384349, + "balance_loss_clip": 1.38824129, + "balance_loss_mlp": 1.36928082, + "epoch": 0.052487599579137234, + "flos": 65047640647680.0, + "grad_norm": 0.9114661984303519, + "language_loss": 0.61750519, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.64702904, + "num_input_tokens_seen": 18681480, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.15039062, + "step": 873, + "time_per_iteration": 3.269615650177002 + }, + { + "auxiliary_loss_clip": 0.01828929, + "auxiliary_loss_mlp": 0.01364362, + "balance_loss_clip": 1.51872659, + "balance_loss_mlp": 1.31875205, + "epoch": 0.0525477228318052, + "flos": 23297743088640.0, + "grad_norm": 2.232765692489517, + "language_loss": 0.91665459, + "learning_rate": 3.994669855111643e-06, + "loss": 0.94858748, + "num_input_tokens_seen": 18700390, + "router_z_loss_clip": 3.1015625, + "router_z_loss_mlp": 0.45629883, + "step": 874, + "time_per_iteration": 2.8996899127960205 + }, + { + "auxiliary_loss_clip": 0.01826651, + "auxiliary_loss_mlp": 0.01357293, + "balance_loss_clip": 1.51545358, + "balance_loss_mlp": 1.31092012, + "epoch": 0.05260784608447317, + "flos": 32242034845440.0, + "grad_norm": 1.8286756194771692, + "language_loss": 0.7659229, + "learning_rate": 3.994641402486977e-06, + "loss": 0.79776227, + "num_input_tokens_seen": 18721280, + "router_z_loss_clip": 3.109375, + "router_z_loss_mlp": 0.46386719, + "step": 875, + "time_per_iteration": 3.078219413757324 + }, + { + "auxiliary_loss_clip": 0.01800907, + "auxiliary_loss_mlp": 0.01358723, + "balance_loss_clip": 1.49698472, + "balance_loss_mlp": 1.31547427, + "epoch": 0.052667969337141136, + "flos": 24474165868800.0, + "grad_norm": 1.8111117713375375, + "language_loss": 0.93941855, + "learning_rate": 3.99461287422531e-06, + "loss": 0.97101486, + "num_input_tokens_seen": 18741545, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.43261719, + "step": 876, + "time_per_iteration": 2.949913263320923 + }, + { + "auxiliary_loss_clip": 0.01541522, + "auxiliary_loss_mlp": 0.01363613, + "balance_loss_clip": 1.36036658, + "balance_loss_mlp": 1.34463537, + "epoch": 0.05272809258980911, + "flos": 57815897016960.0, + "grad_norm": 0.8244072927814181, + "language_loss": 0.62998748, + "learning_rate": 3.994584270327722e-06, + "loss": 0.65903878, + "num_input_tokens_seen": 18801400, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.18945312, + "step": 877, + "time_per_iteration": 3.343492269515991 + }, + { + "auxiliary_loss_clip": 0.01800666, + "auxiliary_loss_mlp": 0.01359869, + "balance_loss_clip": 1.49258852, + "balance_loss_mlp": 1.30992055, + "epoch": 0.05278821584247708, + "flos": 17429763548160.0, + "grad_norm": 4.189960888742742, + "language_loss": 0.87971097, + "learning_rate": 3.994555590795299e-06, + "loss": 0.91131639, + "num_input_tokens_seen": 18819670, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.49951172, + "step": 878, + "time_per_iteration": 2.9486749172210693 + }, + { + "auxiliary_loss_clip": 0.01837989, + "auxiliary_loss_mlp": 0.01363508, + "balance_loss_clip": 1.51474774, + "balance_loss_mlp": 1.31432247, + "epoch": 0.052848339095145046, + "flos": 26147531998080.0, + "grad_norm": 1.9200719947467402, + "language_loss": 0.85137105, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.88338602, + "num_input_tokens_seen": 18840580, + "router_z_loss_clip": 3.23242188, + "router_z_loss_mlp": 0.4921875, + "step": 879, + "time_per_iteration": 2.924668550491333 + }, + { + "auxiliary_loss_clip": 0.01821014, + "auxiliary_loss_mlp": 0.01371327, + "balance_loss_clip": 1.51379347, + "balance_loss_mlp": 1.31467879, + "epoch": 0.05290846234781302, + "flos": 16480768970880.0, + "grad_norm": 1.7029594594745732, + "language_loss": 0.86198425, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.89390779, + "num_input_tokens_seen": 18859295, + "router_z_loss_clip": 3.06835938, + "router_z_loss_mlp": 0.56640625, + "step": 880, + "time_per_iteration": 2.940678834915161 + }, + { + "auxiliary_loss_clip": 0.01849287, + "auxiliary_loss_mlp": 0.01353515, + "balance_loss_clip": 1.53299463, + "balance_loss_mlp": 1.30764341, + "epoch": 0.05296858560048098, + "flos": 19874718817920.0, + "grad_norm": 6.342344532104024, + "language_loss": 0.90035778, + "learning_rate": 3.994469098399906e-06, + "loss": 0.9323858, + "num_input_tokens_seen": 18877485, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.45898438, + "step": 881, + "time_per_iteration": 2.980752468109131 + }, + { + "auxiliary_loss_clip": 0.01827373, + "auxiliary_loss_mlp": 0.01356874, + "balance_loss_clip": 1.51075494, + "balance_loss_mlp": 1.29912949, + "epoch": 0.053028708853148955, + "flos": 24399095466240.0, + "grad_norm": 2.619292917965124, + "language_loss": 0.89445192, + "learning_rate": 3.994440116339046e-06, + "loss": 0.92629439, + "num_input_tokens_seen": 18898275, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.5769043, + "step": 882, + "time_per_iteration": 2.9575963020324707 + }, + { + "auxiliary_loss_clip": 0.01830939, + "auxiliary_loss_mlp": 0.01369397, + "balance_loss_clip": 1.51536751, + "balance_loss_mlp": 1.32364464, + "epoch": 0.05308883210581693, + "flos": 36406985650560.0, + "grad_norm": 2.292568020602893, + "language_loss": 0.72086704, + "learning_rate": 3.994411058648816e-06, + "loss": 0.75287038, + "num_input_tokens_seen": 18920665, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.45776367, + "step": 883, + "time_per_iteration": 3.0957469940185547 + }, + { + "auxiliary_loss_clip": 0.01820832, + "auxiliary_loss_mlp": 0.01350373, + "balance_loss_clip": 1.50926507, + "balance_loss_mlp": 1.29734898, + "epoch": 0.05314895535848489, + "flos": 22864920900480.0, + "grad_norm": 1.8801506760104214, + "language_loss": 0.78126299, + "learning_rate": 3.994381925330319e-06, + "loss": 0.81297505, + "num_input_tokens_seen": 18939835, + "router_z_loss_clip": 3.109375, + "router_z_loss_mlp": 0.53076172, + "step": 884, + "time_per_iteration": 2.891754627227783 + }, + { + "auxiliary_loss_clip": 0.01820893, + "auxiliary_loss_mlp": 0.01340818, + "balance_loss_clip": 1.51645029, + "balance_loss_mlp": 1.28965378, + "epoch": 0.053209078611152864, + "flos": 12868168412160.0, + "grad_norm": 2.0867899253512787, + "language_loss": 0.88063407, + "learning_rate": 3.994352716384659e-06, + "loss": 0.91225117, + "num_input_tokens_seen": 18958405, + "router_z_loss_clip": 3.04492188, + "router_z_loss_mlp": 0.51196289, + "step": 885, + "time_per_iteration": 4.38989782333374 + }, + { + "auxiliary_loss_clip": 0.01840395, + "auxiliary_loss_mlp": 0.01341788, + "balance_loss_clip": 1.5228883, + "balance_loss_mlp": 1.28626013, + "epoch": 0.05326920186382083, + "flos": 12171604429440.0, + "grad_norm": 2.3494068003137394, + "language_loss": 0.89352345, + "learning_rate": 3.994323431812945e-06, + "loss": 0.92534524, + "num_input_tokens_seen": 18975445, + "router_z_loss_clip": 3.17578125, + "router_z_loss_mlp": 0.5559082, + "step": 886, + "time_per_iteration": 2.9353585243225098 + }, + { + "auxiliary_loss_clip": 0.01808147, + "auxiliary_loss_mlp": 0.01352665, + "balance_loss_clip": 1.50146484, + "balance_loss_mlp": 1.29897308, + "epoch": 0.0533293251164888, + "flos": 22713060792960.0, + "grad_norm": 1.9677783655486352, + "language_loss": 0.91324037, + "learning_rate": 3.994294071616286e-06, + "loss": 0.94484842, + "num_input_tokens_seen": 18991930, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.53710938, + "step": 887, + "time_per_iteration": 2.8994715213775635 + }, + { + "auxiliary_loss_clip": 0.01825567, + "auxiliary_loss_mlp": 0.01342842, + "balance_loss_clip": 1.51131713, + "balance_loss_mlp": 1.28819621, + "epoch": 0.053389448369156774, + "flos": 26951679411840.0, + "grad_norm": 2.42149636260106, + "language_loss": 0.77242804, + "learning_rate": 3.994264635795796e-06, + "loss": 0.80411208, + "num_input_tokens_seen": 19009790, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.54711914, + "step": 888, + "time_per_iteration": 2.9645450115203857 + }, + { + "auxiliary_loss_clip": 0.01825061, + "auxiliary_loss_mlp": 0.01358587, + "balance_loss_clip": 1.51172471, + "balance_loss_mlp": 1.30398881, + "epoch": 0.05344957162182474, + "flos": 25567193203200.0, + "grad_norm": 2.0852687813171733, + "language_loss": 0.90151858, + "learning_rate": 3.994235124352592e-06, + "loss": 0.93335509, + "num_input_tokens_seen": 19030170, + "router_z_loss_clip": 3.13085938, + "router_z_loss_mlp": 0.5456543, + "step": 889, + "time_per_iteration": 6.02805233001709 + }, + { + "auxiliary_loss_clip": 0.01819501, + "auxiliary_loss_mlp": 0.01338042, + "balance_loss_clip": 1.51269126, + "balance_loss_mlp": 1.28806901, + "epoch": 0.05350969487449271, + "flos": 19729192982400.0, + "grad_norm": 1.8594039118978922, + "language_loss": 0.90204954, + "learning_rate": 3.994205537287791e-06, + "loss": 0.93362498, + "num_input_tokens_seen": 19048075, + "router_z_loss_clip": 3.06445312, + "router_z_loss_mlp": 0.49926758, + "step": 890, + "time_per_iteration": 4.285792112350464 + }, + { + "auxiliary_loss_clip": 0.01819079, + "auxiliary_loss_mlp": 0.01336548, + "balance_loss_clip": 1.50529587, + "balance_loss_mlp": 1.28884077, + "epoch": 0.053569818127160676, + "flos": 27027745200000.0, + "grad_norm": 2.4204191542134263, + "language_loss": 0.95190752, + "learning_rate": 3.994175874602517e-06, + "loss": 0.98346376, + "num_input_tokens_seen": 19067465, + "router_z_loss_clip": 3.13476562, + "router_z_loss_mlp": 0.47680664, + "step": 891, + "time_per_iteration": 2.991344451904297 + }, + { + "auxiliary_loss_clip": 0.01817344, + "auxiliary_loss_mlp": 0.01334956, + "balance_loss_clip": 1.50421977, + "balance_loss_mlp": 1.28410125, + "epoch": 0.05362994137982865, + "flos": 13196167102080.0, + "grad_norm": 1.7341180593992296, + "language_loss": 0.73255563, + "learning_rate": 3.994146136297893e-06, + "loss": 0.76407862, + "num_input_tokens_seen": 19085505, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.5090332, + "step": 892, + "time_per_iteration": 2.8669216632843018 + }, + { + "auxiliary_loss_clip": 0.01827967, + "auxiliary_loss_mlp": 0.01344879, + "balance_loss_clip": 1.51594377, + "balance_loss_mlp": 1.29760051, + "epoch": 0.05369006463249662, + "flos": 28669774665600.0, + "grad_norm": 1.9776620580344464, + "language_loss": 0.84789604, + "learning_rate": 3.994116322375049e-06, + "loss": 0.87962449, + "num_input_tokens_seen": 19104360, + "router_z_loss_clip": 3.12109375, + "router_z_loss_mlp": 0.47241211, + "step": 893, + "time_per_iteration": 2.973574161529541 + }, + { + "auxiliary_loss_clip": 0.01823167, + "auxiliary_loss_mlp": 0.01336882, + "balance_loss_clip": 1.50769567, + "balance_loss_mlp": 1.28631306, + "epoch": 0.053750187885164585, + "flos": 28924919948160.0, + "grad_norm": 1.774188471381303, + "language_loss": 0.83699286, + "learning_rate": 3.994086432835114e-06, + "loss": 0.86859334, + "num_input_tokens_seen": 19124680, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.50610352, + "step": 894, + "time_per_iteration": 2.9355788230895996 + }, + { + "auxiliary_loss_clip": 0.01815447, + "auxiliary_loss_mlp": 0.01327712, + "balance_loss_clip": 1.50680447, + "balance_loss_mlp": 1.28224516, + "epoch": 0.05381031113783256, + "flos": 15167235888000.0, + "grad_norm": 2.6488484702625748, + "language_loss": 0.77675927, + "learning_rate": 3.994056467679221e-06, + "loss": 0.80819082, + "num_input_tokens_seen": 19142895, + "router_z_loss_clip": 3.0859375, + "router_z_loss_mlp": 0.4543457, + "step": 895, + "time_per_iteration": 2.8974344730377197 + }, + { + "auxiliary_loss_clip": 0.01836184, + "auxiliary_loss_mlp": 0.01345751, + "balance_loss_clip": 1.51685596, + "balance_loss_mlp": 1.29539704, + "epoch": 0.05387043439050053, + "flos": 21845244666240.0, + "grad_norm": 2.047597700048923, + "language_loss": 0.88850421, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.92032349, + "num_input_tokens_seen": 19163125, + "router_z_loss_clip": 3.1953125, + "router_z_loss_mlp": 0.50317383, + "step": 896, + "time_per_iteration": 2.938079833984375 + }, + { + "auxiliary_loss_clip": 0.0183299, + "auxiliary_loss_mlp": 0.01339245, + "balance_loss_clip": 1.5149734, + "balance_loss_mlp": 1.28572059, + "epoch": 0.053930557643168495, + "flos": 17318967736320.0, + "grad_norm": 2.1000402481542433, + "language_loss": 0.89968133, + "learning_rate": 3.9939963105241115e-06, + "loss": 0.93140376, + "num_input_tokens_seen": 19179385, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.53540039, + "step": 897, + "time_per_iteration": 2.8849101066589355 + }, + { + "auxiliary_loss_clip": 0.01792134, + "auxiliary_loss_mlp": 0.01328286, + "balance_loss_clip": 1.49125373, + "balance_loss_mlp": 1.27190042, + "epoch": 0.05399068089583647, + "flos": 17357588812800.0, + "grad_norm": 1.7214176860540233, + "language_loss": 0.92299122, + "learning_rate": 3.993966118527175e-06, + "loss": 0.95419538, + "num_input_tokens_seen": 19198725, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.56420898, + "step": 898, + "time_per_iteration": 2.948834180831909 + }, + { + "auxiliary_loss_clip": 0.01811634, + "auxiliary_loss_mlp": 0.01334077, + "balance_loss_clip": 1.49247384, + "balance_loss_mlp": 1.28267431, + "epoch": 0.05405080414850443, + "flos": 17494156177920.0, + "grad_norm": 3.0181159383087737, + "language_loss": 0.94630206, + "learning_rate": 3.993935850918845e-06, + "loss": 0.97775924, + "num_input_tokens_seen": 19212380, + "router_z_loss_clip": 3.18945312, + "router_z_loss_mlp": 0.51416016, + "step": 899, + "time_per_iteration": 2.8465311527252197 + }, + { + "auxiliary_loss_clip": 0.01809096, + "auxiliary_loss_mlp": 0.0132014, + "balance_loss_clip": 1.50926292, + "balance_loss_mlp": 1.26990473, + "epoch": 0.054110927401172404, + "flos": 24506995610880.0, + "grad_norm": 2.783918634899374, + "language_loss": 0.76743126, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.79872358, + "num_input_tokens_seen": 19232235, + "router_z_loss_clip": 2.99609375, + "router_z_loss_mlp": 0.50317383, + "step": 900, + "time_per_iteration": 2.973247766494751 + }, + { + "auxiliary_loss_clip": 0.01798848, + "auxiliary_loss_mlp": 0.01335497, + "balance_loss_clip": 1.49030232, + "balance_loss_mlp": 1.27434266, + "epoch": 0.054171050653840376, + "flos": 22940398506240.0, + "grad_norm": 2.513470820318757, + "language_loss": 0.77238572, + "learning_rate": 3.993875088872592e-06, + "loss": 0.80372918, + "num_input_tokens_seen": 19251460, + "router_z_loss_clip": 3.08398438, + "router_z_loss_mlp": 0.61206055, + "step": 901, + "time_per_iteration": 2.9066410064697266 + }, + { + "auxiliary_loss_clip": 0.01771224, + "auxiliary_loss_mlp": 0.01312642, + "balance_loss_clip": 1.48025572, + "balance_loss_mlp": 1.26436234, + "epoch": 0.05423117390650834, + "flos": 12941021819520.0, + "grad_norm": 4.020323794041706, + "language_loss": 0.86799669, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.8988353, + "num_input_tokens_seen": 19269060, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.48217773, + "step": 902, + "time_per_iteration": 2.875523805618286 + }, + { + "auxiliary_loss_clip": 0.01794277, + "auxiliary_loss_mlp": 0.01320592, + "balance_loss_clip": 1.48742819, + "balance_loss_mlp": 1.26394343, + "epoch": 0.05429129715917631, + "flos": 19910806185600.0, + "grad_norm": 1.8379763427161284, + "language_loss": 0.88245296, + "learning_rate": 3.993814024394569e-06, + "loss": 0.91360164, + "num_input_tokens_seen": 19288620, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.56689453, + "step": 903, + "time_per_iteration": 2.99001407623291 + }, + { + "auxiliary_loss_clip": 0.01795298, + "auxiliary_loss_mlp": 0.01315418, + "balance_loss_clip": 1.49021804, + "balance_loss_mlp": 1.26439619, + "epoch": 0.05435142041184428, + "flos": 16917075008640.0, + "grad_norm": 3.635522511019434, + "language_loss": 0.77855074, + "learning_rate": 3.993783378746537e-06, + "loss": 0.80965793, + "num_input_tokens_seen": 19306615, + "router_z_loss_clip": 3.04492188, + "router_z_loss_mlp": 0.51000977, + "step": 904, + "time_per_iteration": 2.872283458709717 + }, + { + "auxiliary_loss_clip": 0.01807286, + "auxiliary_loss_mlp": 0.01310812, + "balance_loss_clip": 1.49694157, + "balance_loss_mlp": 1.26112604, + "epoch": 0.05441154366451225, + "flos": 23958083969280.0, + "grad_norm": 2.2450681190330424, + "language_loss": 0.88082826, + "learning_rate": 3.993752657494039e-06, + "loss": 0.91200918, + "num_input_tokens_seen": 19321680, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.49731445, + "step": 905, + "time_per_iteration": 2.928906202316284 + }, + { + "auxiliary_loss_clip": 0.01799434, + "auxiliary_loss_mlp": 0.01317229, + "balance_loss_clip": 1.50462413, + "balance_loss_mlp": 1.26267934, + "epoch": 0.05447166691718022, + "flos": 19985152671360.0, + "grad_norm": 2.8940891997299967, + "language_loss": 0.75633049, + "learning_rate": 3.993721860638241e-06, + "loss": 0.7874971, + "num_input_tokens_seen": 19339760, + "router_z_loss_clip": 2.94726562, + "router_z_loss_mlp": 0.54541016, + "step": 906, + "time_per_iteration": 2.892223358154297 + }, + { + "auxiliary_loss_clip": 0.01809935, + "auxiliary_loss_mlp": 0.0132403, + "balance_loss_clip": 1.50437939, + "balance_loss_mlp": 1.2714107, + "epoch": 0.05453179016984819, + "flos": 24947328435840.0, + "grad_norm": 1.839839549330035, + "language_loss": 0.89762485, + "learning_rate": 3.993690988180309e-06, + "loss": 0.9289645, + "num_input_tokens_seen": 19359585, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.52612305, + "step": 907, + "time_per_iteration": 3.0679714679718018 + }, + { + "auxiliary_loss_clip": 0.01805964, + "auxiliary_loss_mlp": 0.01324462, + "balance_loss_clip": 1.49746776, + "balance_loss_mlp": 1.26342702, + "epoch": 0.05459191342251616, + "flos": 18124517738880.0, + "grad_norm": 3.808097026398685, + "language_loss": 0.88467824, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.91598248, + "num_input_tokens_seen": 19378590, + "router_z_loss_clip": 3.08398438, + "router_z_loss_mlp": 0.6105957, + "step": 908, + "time_per_iteration": 2.9402353763580322 + }, + { + "auxiliary_loss_clip": 0.01819141, + "auxiliary_loss_mlp": 0.01304859, + "balance_loss_clip": 1.51185405, + "balance_loss_mlp": 1.25481462, + "epoch": 0.054652036675184125, + "flos": 19217635562880.0, + "grad_norm": 2.071742558990563, + "language_loss": 0.91760516, + "learning_rate": 3.9936290164627345e-06, + "loss": 0.94884509, + "num_input_tokens_seen": 19397910, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.49975586, + "step": 909, + "time_per_iteration": 2.8678462505340576 + }, + { + "auxiliary_loss_clip": 0.0181916, + "auxiliary_loss_mlp": 0.01314232, + "balance_loss_clip": 1.50630784, + "balance_loss_mlp": 1.26278138, + "epoch": 0.0547121599278521, + "flos": 16334292994560.0, + "grad_norm": 1.9725686797636022, + "language_loss": 0.7395097, + "learning_rate": 3.99359791720544e-06, + "loss": 0.77084363, + "num_input_tokens_seen": 19415950, + "router_z_loss_clip": 3.12890625, + "router_z_loss_mlp": 0.51464844, + "step": 910, + "time_per_iteration": 2.9442813396453857 + }, + { + "auxiliary_loss_clip": 0.01798481, + "auxiliary_loss_mlp": 0.01306952, + "balance_loss_clip": 1.49972177, + "balance_loss_mlp": 1.25969768, + "epoch": 0.05477228318052007, + "flos": 20348560056960.0, + "grad_norm": 4.296121085858535, + "language_loss": 0.85511494, + "learning_rate": 3.993566742350714e-06, + "loss": 0.88616925, + "num_input_tokens_seen": 19435275, + "router_z_loss_clip": 2.98632812, + "router_z_loss_mlp": 0.47241211, + "step": 911, + "time_per_iteration": 2.8902196884155273 + }, + { + "auxiliary_loss_clip": 0.01834938, + "auxiliary_loss_mlp": 0.01311461, + "balance_loss_clip": 1.5218792, + "balance_loss_mlp": 1.25815034, + "epoch": 0.054832406433188034, + "flos": 21981042869760.0, + "grad_norm": 2.4367855726741645, + "language_loss": 0.78793681, + "learning_rate": 3.993535491899736e-06, + "loss": 0.81940079, + "num_input_tokens_seen": 19452090, + "router_z_loss_clip": 3.13085938, + "router_z_loss_mlp": 0.53369141, + "step": 912, + "time_per_iteration": 2.887500286102295 + }, + { + "auxiliary_loss_clip": 0.01809126, + "auxiliary_loss_mlp": 0.01294013, + "balance_loss_clip": 1.50858974, + "balance_loss_mlp": 1.24673426, + "epoch": 0.054892529685856006, + "flos": 16407553605120.0, + "grad_norm": 2.162502011067309, + "language_loss": 0.85739934, + "learning_rate": 3.993504165853694e-06, + "loss": 0.88843071, + "num_input_tokens_seen": 19470865, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.47290039, + "step": 913, + "time_per_iteration": 2.8801772594451904 + }, + { + "auxiliary_loss_clip": 0.01811921, + "auxiliary_loss_mlp": 0.01277972, + "balance_loss_clip": 1.51135182, + "balance_loss_mlp": 1.22747517, + "epoch": 0.05495265293852397, + "flos": 23922222825600.0, + "grad_norm": 3.5432700374361965, + "language_loss": 0.85183299, + "learning_rate": 3.993472764213772e-06, + "loss": 0.88273191, + "num_input_tokens_seen": 19492145, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.50537109, + "step": 914, + "time_per_iteration": 2.9222562313079834 + }, + { + "auxiliary_loss_clip": 0.01822716, + "auxiliary_loss_mlp": 0.01280819, + "balance_loss_clip": 1.51854992, + "balance_loss_mlp": 1.23513794, + "epoch": 0.055012776191191944, + "flos": 23597527006080.0, + "grad_norm": 2.223313911790687, + "language_loss": 0.93764597, + "learning_rate": 3.9934412869811655e-06, + "loss": 0.96868134, + "num_input_tokens_seen": 19511015, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.45678711, + "step": 915, + "time_per_iteration": 2.9515678882598877 + }, + { + "auxiliary_loss_clip": 0.01812385, + "auxiliary_loss_mlp": 0.01283608, + "balance_loss_clip": 1.51045215, + "balance_loss_mlp": 1.23172808, + "epoch": 0.055072899443859916, + "flos": 17537030265600.0, + "grad_norm": 1.6952305176205038, + "language_loss": 0.91252571, + "learning_rate": 3.993409734157064e-06, + "loss": 0.94348562, + "num_input_tokens_seen": 19529040, + "router_z_loss_clip": 3.01953125, + "router_z_loss_mlp": 0.51879883, + "step": 916, + "time_per_iteration": 2.8573005199432373 + }, + { + "auxiliary_loss_clip": 0.01831952, + "auxiliary_loss_mlp": 0.01275272, + "balance_loss_clip": 1.51393878, + "balance_loss_mlp": 1.23054528, + "epoch": 0.05513302269652788, + "flos": 21696596939520.0, + "grad_norm": 2.0728064789163625, + "language_loss": 0.82017642, + "learning_rate": 3.993378105742666e-06, + "loss": 0.85124874, + "num_input_tokens_seen": 19549540, + "router_z_loss_clip": 3.1796875, + "router_z_loss_mlp": 0.44702148, + "step": 917, + "time_per_iteration": 3.0046277046203613 + }, + { + "auxiliary_loss_clip": 0.01842565, + "auxiliary_loss_mlp": 0.01277238, + "balance_loss_clip": 1.52977335, + "balance_loss_mlp": 1.22614539, + "epoch": 0.05519314594919585, + "flos": 21622521922560.0, + "grad_norm": 1.8758102119320275, + "language_loss": 0.82289231, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.85409033, + "num_input_tokens_seen": 19567570, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.51049805, + "step": 918, + "time_per_iteration": 2.9367332458496094 + }, + { + "auxiliary_loss_clip": 0.01841115, + "auxiliary_loss_mlp": 0.01281238, + "balance_loss_clip": 1.52192795, + "balance_loss_mlp": 1.22304058, + "epoch": 0.05525326920186382, + "flos": 21808523871360.0, + "grad_norm": 2.1477621549127073, + "language_loss": 0.90166974, + "learning_rate": 3.99331462214778e-06, + "loss": 0.93289328, + "num_input_tokens_seen": 19585330, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.58154297, + "step": 919, + "time_per_iteration": 2.947847604751587 + }, + { + "auxiliary_loss_clip": 0.0182475, + "auxiliary_loss_mlp": 0.01260962, + "balance_loss_clip": 1.51625562, + "balance_loss_mlp": 1.20703149, + "epoch": 0.05531339245453179, + "flos": 28451485912320.0, + "grad_norm": 2.16259163437597, + "language_loss": 0.9004454, + "learning_rate": 3.993282766969699e-06, + "loss": 0.93130249, + "num_input_tokens_seen": 19604970, + "router_z_loss_clip": 3.08203125, + "router_z_loss_mlp": 0.5390625, + "step": 920, + "time_per_iteration": 4.363829135894775 + }, + { + "auxiliary_loss_clip": 0.01828106, + "auxiliary_loss_mlp": 0.01252203, + "balance_loss_clip": 1.5244658, + "balance_loss_mlp": 1.19309914, + "epoch": 0.05537351570719976, + "flos": 37388402766720.0, + "grad_norm": 2.916465323927769, + "language_loss": 0.6804232, + "learning_rate": 3.993250836206136e-06, + "loss": 0.71122628, + "num_input_tokens_seen": 19626235, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.59106445, + "step": 921, + "time_per_iteration": 3.050734281539917 + }, + { + "auxiliary_loss_clip": 0.01863353, + "auxiliary_loss_mlp": 0.01234188, + "balance_loss_clip": 1.53925312, + "balance_loss_mlp": 1.17453575, + "epoch": 0.05543363895986773, + "flos": 20094183936000.0, + "grad_norm": 1.8621926257525712, + "language_loss": 0.74321294, + "learning_rate": 3.993218829858301e-06, + "loss": 0.77418834, + "num_input_tokens_seen": 19644305, + "router_z_loss_clip": 3.24023438, + "router_z_loss_mlp": 0.59667969, + "step": 922, + "time_per_iteration": 2.9207465648651123 + }, + { + "auxiliary_loss_clip": 0.01836176, + "auxiliary_loss_mlp": 0.01218895, + "balance_loss_clip": 1.52290404, + "balance_loss_mlp": 1.15917158, + "epoch": 0.0554937622125357, + "flos": 24543399692160.0, + "grad_norm": 2.4075152153371104, + "language_loss": 0.84960294, + "learning_rate": 3.993186747927408e-06, + "loss": 0.8801536, + "num_input_tokens_seen": 19662130, + "router_z_loss_clip": 3.13671875, + "router_z_loss_mlp": 0.59692383, + "step": 923, + "time_per_iteration": 2.9464728832244873 + }, + { + "auxiliary_loss_clip": 0.01813052, + "auxiliary_loss_mlp": 0.01242433, + "balance_loss_clip": 1.50640559, + "balance_loss_mlp": 1.17851257, + "epoch": 0.055553885465203665, + "flos": 14328358450560.0, + "grad_norm": 1.9966025039225748, + "language_loss": 0.81055301, + "learning_rate": 3.993154590414675e-06, + "loss": 0.84110785, + "num_input_tokens_seen": 19680715, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.63891602, + "step": 924, + "time_per_iteration": 4.420054197311401 + }, + { + "auxiliary_loss_clip": 0.01802992, + "auxiliary_loss_mlp": 0.01225223, + "balance_loss_clip": 1.50149763, + "balance_loss_mlp": 1.17277074, + "epoch": 0.05561400871787164, + "flos": 27392600419200.0, + "grad_norm": 2.19191681749827, + "language_loss": 1.03813112, + "learning_rate": 3.993122357321319e-06, + "loss": 1.06841326, + "num_input_tokens_seen": 19700535, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.52441406, + "step": 925, + "time_per_iteration": 2.952658176422119 + }, + { + "auxiliary_loss_clip": 0.01818934, + "auxiliary_loss_mlp": 0.01225922, + "balance_loss_clip": 1.50946283, + "balance_loss_mlp": 1.16865385, + "epoch": 0.05567413197053961, + "flos": 23231314442880.0, + "grad_norm": 2.0936737817801654, + "language_loss": 0.82510746, + "learning_rate": 3.993090048648564e-06, + "loss": 0.85555601, + "num_input_tokens_seen": 19718825, + "router_z_loss_clip": 3.09765625, + "router_z_loss_mlp": 0.57324219, + "step": 926, + "time_per_iteration": 2.9741051197052 + }, + { + "auxiliary_loss_clip": 0.01859545, + "auxiliary_loss_mlp": 0.01214459, + "balance_loss_clip": 1.52988219, + "balance_loss_mlp": 1.16358006, + "epoch": 0.055734255223207574, + "flos": 25275870063360.0, + "grad_norm": 2.8047397473570923, + "language_loss": 0.76422632, + "learning_rate": 3.993057664397634e-06, + "loss": 0.79496634, + "num_input_tokens_seen": 19739080, + "router_z_loss_clip": 3.29492188, + "router_z_loss_mlp": 0.50927734, + "step": 927, + "time_per_iteration": 2.9632952213287354 + }, + { + "auxiliary_loss_clip": 0.01524745, + "auxiliary_loss_mlp": 0.01281707, + "balance_loss_clip": 1.34803295, + "balance_loss_mlp": 1.26034474, + "epoch": 0.055794378475875546, + "flos": 66536606378880.0, + "grad_norm": 0.7989656387942049, + "language_loss": 0.60015935, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.6282239, + "num_input_tokens_seen": 19802960, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.21386719, + "step": 928, + "time_per_iteration": 3.373538017272949 + }, + { + "auxiliary_loss_clip": 0.01794104, + "auxiliary_loss_mlp": 0.01201874, + "balance_loss_clip": 1.49049687, + "balance_loss_mlp": 1.13764453, + "epoch": 0.05585450172854351, + "flos": 25348497246720.0, + "grad_norm": 1.993697928502927, + "language_loss": 0.97807825, + "learning_rate": 3.992992669166168e-06, + "loss": 1.00803804, + "num_input_tokens_seen": 19822765, + "router_z_loss_clip": 3.03320312, + "router_z_loss_mlp": 0.64233398, + "step": 929, + "time_per_iteration": 2.953200101852417 + }, + { + "auxiliary_loss_clip": 0.01800258, + "auxiliary_loss_mlp": 0.01194494, + "balance_loss_clip": 1.49608827, + "balance_loss_mlp": 1.13331532, + "epoch": 0.05591462498121148, + "flos": 33924811893120.0, + "grad_norm": 2.205080640536082, + "language_loss": 0.73961151, + "learning_rate": 3.992960058188094e-06, + "loss": 0.76955903, + "num_input_tokens_seen": 19843590, + "router_z_loss_clip": 3.0390625, + "router_z_loss_mlp": 0.61157227, + "step": 930, + "time_per_iteration": 2.9848570823669434 + }, + { + "auxiliary_loss_clip": 0.01805885, + "auxiliary_loss_mlp": 0.01183901, + "balance_loss_clip": 1.49413872, + "balance_loss_mlp": 1.12925589, + "epoch": 0.055974748233879455, + "flos": 17939782644480.0, + "grad_norm": 2.7199080134763496, + "language_loss": 0.88239849, + "learning_rate": 3.992927371636776e-06, + "loss": 0.9122963, + "num_input_tokens_seen": 19860230, + "router_z_loss_clip": 3.11523438, + "router_z_loss_mlp": 0.54663086, + "step": 931, + "time_per_iteration": 2.961127281188965 + }, + { + "auxiliary_loss_clip": 0.01819974, + "auxiliary_loss_mlp": 0.0118285, + "balance_loss_clip": 1.50521255, + "balance_loss_mlp": 1.13387847, + "epoch": 0.05603487148654742, + "flos": 24031932762240.0, + "grad_norm": 1.777550438733614, + "language_loss": 0.85831845, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.88834667, + "num_input_tokens_seen": 19880795, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.48950195, + "step": 932, + "time_per_iteration": 2.921264410018921 + }, + { + "auxiliary_loss_clip": 0.01819874, + "auxiliary_loss_mlp": 0.01199686, + "balance_loss_clip": 1.50642169, + "balance_loss_mlp": 1.14506459, + "epoch": 0.05609499473921539, + "flos": 17315755355520.0, + "grad_norm": 3.7180759795969633, + "language_loss": 0.77384377, + "learning_rate": 3.992861771819365e-06, + "loss": 0.80403942, + "num_input_tokens_seen": 19897960, + "router_z_loss_clip": 3.13671875, + "router_z_loss_mlp": 0.54614258, + "step": 933, + "time_per_iteration": 2.9201443195343018 + }, + { + "auxiliary_loss_clip": 0.01808115, + "auxiliary_loss_mlp": 0.01173153, + "balance_loss_clip": 1.49650502, + "balance_loss_mlp": 1.11752987, + "epoch": 0.05615511799188336, + "flos": 21004195478400.0, + "grad_norm": 2.431947597054783, + "language_loss": 0.8825593, + "learning_rate": 3.99282885855576e-06, + "loss": 0.91237193, + "num_input_tokens_seen": 19913315, + "router_z_loss_clip": 3.11523438, + "router_z_loss_mlp": 0.5559082, + "step": 934, + "time_per_iteration": 2.9288856983184814 + }, + { + "auxiliary_loss_clip": 0.01781886, + "auxiliary_loss_mlp": 0.01183805, + "balance_loss_clip": 1.49008346, + "balance_loss_mlp": 1.1302563, + "epoch": 0.05621524124455133, + "flos": 17282111207040.0, + "grad_norm": 2.244230933642363, + "language_loss": 0.81859976, + "learning_rate": 3.992795869723885e-06, + "loss": 0.84825671, + "num_input_tokens_seen": 19928790, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.53491211, + "step": 935, + "time_per_iteration": 2.852914333343506 + }, + { + "auxiliary_loss_clip": 0.01506819, + "auxiliary_loss_mlp": 0.01246009, + "balance_loss_clip": 1.328192, + "balance_loss_mlp": 1.22703099, + "epoch": 0.0562753644972193, + "flos": 58747653325440.0, + "grad_norm": 0.8373000863460053, + "language_loss": 0.69211292, + "learning_rate": 3.99276280532499e-06, + "loss": 0.71964121, + "num_input_tokens_seen": 19988785, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.18945312, + "step": 936, + "time_per_iteration": 3.2782094478607178 + }, + { + "auxiliary_loss_clip": 0.01834713, + "auxiliary_loss_mlp": 0.011736, + "balance_loss_clip": 1.51889431, + "balance_loss_mlp": 1.11544991, + "epoch": 0.05633548774988727, + "flos": 17465941405440.0, + "grad_norm": 1.8612314170687285, + "language_loss": 0.79343462, + "learning_rate": 3.992729665360331e-06, + "loss": 0.8235178, + "num_input_tokens_seen": 20007685, + "router_z_loss_clip": 3.16015625, + "router_z_loss_mlp": 0.58178711, + "step": 937, + "time_per_iteration": 3.016991376876831 + }, + { + "auxiliary_loss_clip": 0.01498435, + "auxiliary_loss_mlp": 0.01255596, + "balance_loss_clip": 1.3230865, + "balance_loss_mlp": 1.21249008, + "epoch": 0.05639561100255524, + "flos": 70687531296000.0, + "grad_norm": 0.8923820394992804, + "language_loss": 0.64593017, + "learning_rate": 3.992696449831162e-06, + "loss": 0.6734705, + "num_input_tokens_seen": 20072750, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.43164062, + "step": 938, + "time_per_iteration": 3.3356151580810547 + }, + { + "auxiliary_loss_clip": 0.01852264, + "auxiliary_loss_mlp": 0.01148252, + "balance_loss_clip": 1.52615798, + "balance_loss_mlp": 1.08802772, + "epoch": 0.056455734255223204, + "flos": 20495850439680.0, + "grad_norm": 10.89292738387266, + "language_loss": 0.82072914, + "learning_rate": 3.992663158738745e-06, + "loss": 0.85073423, + "num_input_tokens_seen": 20089070, + "router_z_loss_clip": 3.26171875, + "router_z_loss_mlp": 0.6015625, + "step": 939, + "time_per_iteration": 2.966855764389038 + }, + { + "auxiliary_loss_clip": 0.01802884, + "auxiliary_loss_mlp": 0.01156681, + "balance_loss_clip": 1.49764073, + "balance_loss_mlp": 1.09378588, + "epoch": 0.056515857507891176, + "flos": 22063488174720.0, + "grad_norm": 2.5977853870767533, + "language_loss": 0.75260365, + "learning_rate": 3.992629792084341e-06, + "loss": 0.78219926, + "num_input_tokens_seen": 20108790, + "router_z_loss_clip": 3.05273438, + "router_z_loss_mlp": 0.62890625, + "step": 940, + "time_per_iteration": 2.9463765621185303 + }, + { + "auxiliary_loss_clip": 0.01821132, + "auxiliary_loss_mlp": 0.01139553, + "balance_loss_clip": 1.51737189, + "balance_loss_mlp": 1.07696843, + "epoch": 0.05657598076055915, + "flos": 24036140528640.0, + "grad_norm": 1.9101663160291347, + "language_loss": 0.73412901, + "learning_rate": 3.992596349869216e-06, + "loss": 0.76373589, + "num_input_tokens_seen": 20128455, + "router_z_loss_clip": 3.0390625, + "router_z_loss_mlp": 0.62573242, + "step": 941, + "time_per_iteration": 2.972933292388916 + }, + { + "auxiliary_loss_clip": 0.01807581, + "auxiliary_loss_mlp": 0.01128786, + "balance_loss_clip": 1.49848926, + "balance_loss_mlp": 1.0726862, + "epoch": 0.05663610401322711, + "flos": 20489063719680.0, + "grad_norm": 2.154991494335058, + "language_loss": 0.82089889, + "learning_rate": 3.992562832094637e-06, + "loss": 0.85026258, + "num_input_tokens_seen": 20145775, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.56103516, + "step": 942, + "time_per_iteration": 2.9231905937194824 + }, + { + "auxiliary_loss_clip": 0.01805441, + "auxiliary_loss_mlp": 0.01154505, + "balance_loss_clip": 1.49837422, + "balance_loss_mlp": 1.09847653, + "epoch": 0.056696227265895086, + "flos": 21078858677760.0, + "grad_norm": 1.969478680408, + "language_loss": 0.9046042, + "learning_rate": 3.9925292387618755e-06, + "loss": 0.93420362, + "num_input_tokens_seen": 20164315, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.56054688, + "step": 943, + "time_per_iteration": 2.952671766281128 + }, + { + "auxiliary_loss_clip": 0.01811321, + "auxiliary_loss_mlp": 0.01126002, + "balance_loss_clip": 1.50448644, + "balance_loss_mlp": 1.07602906, + "epoch": 0.05675635051856306, + "flos": 17830163197440.0, + "grad_norm": 2.3150109969683355, + "language_loss": 0.7875241, + "learning_rate": 3.992495569872206e-06, + "loss": 0.81689727, + "num_input_tokens_seen": 20182760, + "router_z_loss_clip": 3.06835938, + "router_z_loss_mlp": 0.5, + "step": 944, + "time_per_iteration": 2.9937245845794678 + }, + { + "auxiliary_loss_clip": 0.0182348, + "auxiliary_loss_mlp": 0.01127015, + "balance_loss_clip": 1.5117172, + "balance_loss_mlp": 1.07637489, + "epoch": 0.05681647377123102, + "flos": 23125993251840.0, + "grad_norm": 2.023598600908038, + "language_loss": 0.80577797, + "learning_rate": 3.992461825426906e-06, + "loss": 0.83528292, + "num_input_tokens_seen": 20203830, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.5065918, + "step": 945, + "time_per_iteration": 2.9441916942596436 + }, + { + "auxiliary_loss_clip": 0.01828885, + "auxiliary_loss_mlp": 0.01128285, + "balance_loss_clip": 1.515136, + "balance_loss_mlp": 1.07077813, + "epoch": 0.056876597023898995, + "flos": 16079057222400.0, + "grad_norm": 2.5776856782879594, + "language_loss": 0.8405292, + "learning_rate": 3.992428005427252e-06, + "loss": 0.87010086, + "num_input_tokens_seen": 20220365, + "router_z_loss_clip": 3.13671875, + "router_z_loss_mlp": 0.57519531, + "step": 946, + "time_per_iteration": 2.89601993560791 + }, + { + "auxiliary_loss_clip": 0.01835966, + "auxiliary_loss_mlp": 0.01142674, + "balance_loss_clip": 1.5084995, + "balance_loss_mlp": 1.08299851, + "epoch": 0.05693672027656696, + "flos": 16844085866880.0, + "grad_norm": 1.8355066976118155, + "language_loss": 0.81324726, + "learning_rate": 3.992394109874529e-06, + "loss": 0.84303367, + "num_input_tokens_seen": 20238640, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.59619141, + "step": 947, + "time_per_iteration": 2.948582887649536 + }, + { + "auxiliary_loss_clip": 0.01853746, + "auxiliary_loss_mlp": 0.01139829, + "balance_loss_clip": 1.5245657, + "balance_loss_mlp": 1.09030986, + "epoch": 0.05699684352923493, + "flos": 21396858266880.0, + "grad_norm": 2.8695168211434687, + "language_loss": 0.89341986, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.9233557, + "num_input_tokens_seen": 20251025, + "router_z_loss_clip": 3.2890625, + "router_z_loss_mlp": 0.49536133, + "step": 948, + "time_per_iteration": 2.9198410511016846 + }, + { + "auxiliary_loss_clip": 0.01813829, + "auxiliary_loss_mlp": 0.01126792, + "balance_loss_clip": 1.50337386, + "balance_loss_mlp": 1.07333875, + "epoch": 0.057056966781902904, + "flos": 15568857146880.0, + "grad_norm": 1.843729330524657, + "language_loss": 0.89480019, + "learning_rate": 3.992326092115019e-06, + "loss": 0.92420632, + "num_input_tokens_seen": 20269775, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.53417969, + "step": 949, + "time_per_iteration": 3.0143661499023438 + }, + { + "auxiliary_loss_clip": 0.01815096, + "auxiliary_loss_mlp": 0.01131878, + "balance_loss_clip": 1.50624919, + "balance_loss_mlp": 1.08064151, + "epoch": 0.05711709003457087, + "flos": 19947346001280.0, + "grad_norm": 2.4636524784259946, + "language_loss": 0.8111912, + "learning_rate": 3.992291969910811e-06, + "loss": 0.84066093, + "num_input_tokens_seen": 20287715, + "router_z_loss_clip": 3.08789062, + "router_z_loss_mlp": 0.51245117, + "step": 950, + "time_per_iteration": 3.043057441711426 + }, + { + "auxiliary_loss_clip": 0.01836223, + "auxiliary_loss_mlp": 0.01134918, + "balance_loss_clip": 1.51527572, + "balance_loss_mlp": 1.07328701, + "epoch": 0.05717721328723884, + "flos": 30343955201280.0, + "grad_norm": 2.1464229552790712, + "language_loss": 0.84199166, + "learning_rate": 3.992257772158691e-06, + "loss": 0.87170303, + "num_input_tokens_seen": 20307070, + "router_z_loss_clip": 3.2109375, + "router_z_loss_mlp": 0.61621094, + "step": 951, + "time_per_iteration": 3.026181221008301 + }, + { + "auxiliary_loss_clip": 0.01826237, + "auxiliary_loss_mlp": 0.01119243, + "balance_loss_clip": 1.50702643, + "balance_loss_mlp": 1.06209397, + "epoch": 0.05723733653990681, + "flos": 23663503203840.0, + "grad_norm": 2.347996341450788, + "language_loss": 0.89865255, + "learning_rate": 3.992223498859958e-06, + "loss": 0.92810738, + "num_input_tokens_seen": 20324945, + "router_z_loss_clip": 3.1875, + "router_z_loss_mlp": 0.5715332, + "step": 952, + "time_per_iteration": 2.9891815185546875 + }, + { + "auxiliary_loss_clip": 0.01847534, + "auxiliary_loss_mlp": 0.01166208, + "balance_loss_clip": 1.51565874, + "balance_loss_mlp": 1.10040426, + "epoch": 0.05729745979257478, + "flos": 22066248107520.0, + "grad_norm": 1.8217459827910385, + "language_loss": 0.80831194, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.8384493, + "num_input_tokens_seen": 20346135, + "router_z_loss_clip": 3.31445312, + "router_z_loss_mlp": 0.65820312, + "step": 953, + "time_per_iteration": 3.052690267562866 + }, + { + "auxiliary_loss_clip": 0.01810574, + "auxiliary_loss_mlp": 0.01133222, + "balance_loss_clip": 1.4950248, + "balance_loss_mlp": 1.07366526, + "epoch": 0.05735758304524275, + "flos": 19612424856960.0, + "grad_norm": 1.9342926455715435, + "language_loss": 0.89199835, + "learning_rate": 3.992154725627848e-06, + "loss": 0.92143631, + "num_input_tokens_seen": 20364450, + "router_z_loss_clip": 3.15625, + "router_z_loss_mlp": 0.59594727, + "step": 954, + "time_per_iteration": 3.028079032897949 + }, + { + "auxiliary_loss_clip": 0.01821366, + "auxiliary_loss_mlp": 0.01145391, + "balance_loss_clip": 1.5009191, + "balance_loss_mlp": 1.09186625, + "epoch": 0.057417706297910716, + "flos": 19108378074240.0, + "grad_norm": 2.1771568903413265, + "language_loss": 0.90394211, + "learning_rate": 3.9921202256970804e-06, + "loss": 0.93360972, + "num_input_tokens_seen": 20383500, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.53515625, + "step": 955, + "time_per_iteration": 4.370601177215576 + }, + { + "auxiliary_loss_clip": 0.01803572, + "auxiliary_loss_mlp": 0.01151463, + "balance_loss_clip": 1.49157071, + "balance_loss_mlp": 1.09786606, + "epoch": 0.05747782955057869, + "flos": 16663241825280.0, + "grad_norm": 2.408368784324984, + "language_loss": 0.91776884, + "learning_rate": 3.992085650224914e-06, + "loss": 0.94731927, + "num_input_tokens_seen": 20400295, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.53613281, + "step": 956, + "time_per_iteration": 2.9173810482025146 + }, + { + "auxiliary_loss_clip": 0.01799494, + "auxiliary_loss_mlp": 0.01156998, + "balance_loss_clip": 1.49723983, + "balance_loss_mlp": 1.10168493, + "epoch": 0.05753795280324665, + "flos": 14510288367360.0, + "grad_norm": 1.5854873244984242, + "language_loss": 0.77253294, + "learning_rate": 3.99205099921266e-06, + "loss": 0.80209786, + "num_input_tokens_seen": 20419085, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.55297852, + "step": 957, + "time_per_iteration": 2.9963958263397217 + }, + { + "auxiliary_loss_clip": 0.01796244, + "auxiliary_loss_mlp": 0.01163023, + "balance_loss_clip": 1.48546946, + "balance_loss_mlp": 1.10990334, + "epoch": 0.057598076055914625, + "flos": 18085127500800.0, + "grad_norm": 1.9111101960021382, + "language_loss": 0.81484997, + "learning_rate": 3.992016272661633e-06, + "loss": 0.84444261, + "num_input_tokens_seen": 20437465, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.53149414, + "step": 958, + "time_per_iteration": 2.9076666831970215 + }, + { + "auxiliary_loss_clip": 0.01788126, + "auxiliary_loss_mlp": 0.01144111, + "balance_loss_clip": 1.48001909, + "balance_loss_mlp": 1.09614062, + "epoch": 0.0576581993085826, + "flos": 22134034097280.0, + "grad_norm": 3.4016494791228715, + "language_loss": 0.90427709, + "learning_rate": 3.99198147057315e-06, + "loss": 0.93359947, + "num_input_tokens_seen": 20456235, + "router_z_loss_clip": 3.08007812, + "router_z_loss_mlp": 0.47973633, + "step": 959, + "time_per_iteration": 5.83426570892334 + }, + { + "auxiliary_loss_clip": 0.01789907, + "auxiliary_loss_mlp": 0.01147008, + "balance_loss_clip": 1.48212516, + "balance_loss_mlp": 1.10249555, + "epoch": 0.05771832256125056, + "flos": 33194558517120.0, + "grad_norm": 2.2904403450486686, + "language_loss": 0.80814618, + "learning_rate": 3.991946592948529e-06, + "loss": 0.83751529, + "num_input_tokens_seen": 20476825, + "router_z_loss_clip": 3.07421875, + "router_z_loss_mlp": 0.4453125, + "step": 960, + "time_per_iteration": 3.1046619415283203 + }, + { + "auxiliary_loss_clip": 0.01814349, + "auxiliary_loss_mlp": 0.01170118, + "balance_loss_clip": 1.49410212, + "balance_loss_mlp": 1.12105167, + "epoch": 0.057778445813918534, + "flos": 24180716223360.0, + "grad_norm": 1.7920129052323541, + "language_loss": 0.9497633, + "learning_rate": 3.991911639789094e-06, + "loss": 0.97960794, + "num_input_tokens_seen": 20496965, + "router_z_loss_clip": 3.20117188, + "router_z_loss_mlp": 0.49121094, + "step": 961, + "time_per_iteration": 3.0662448406219482 + }, + { + "auxiliary_loss_clip": 0.01813588, + "auxiliary_loss_mlp": 0.01157318, + "balance_loss_clip": 1.49518657, + "balance_loss_mlp": 1.10550976, + "epoch": 0.0578385690665865, + "flos": 29655037589760.0, + "grad_norm": 3.1496734546212877, + "language_loss": 0.71032959, + "learning_rate": 3.991876611096169e-06, + "loss": 0.74003863, + "num_input_tokens_seen": 20518035, + "router_z_loss_clip": 3.18359375, + "router_z_loss_mlp": 0.51806641, + "step": 962, + "time_per_iteration": 3.0075128078460693 + }, + { + "auxiliary_loss_clip": 0.01801373, + "auxiliary_loss_mlp": 0.01135076, + "balance_loss_clip": 1.48495126, + "balance_loss_mlp": 1.0882988, + "epoch": 0.05789869231925447, + "flos": 20894892744960.0, + "grad_norm": 2.201444155386885, + "language_loss": 0.8997736, + "learning_rate": 3.991841506871084e-06, + "loss": 0.92913806, + "num_input_tokens_seen": 20534740, + "router_z_loss_clip": 3.16601562, + "router_z_loss_mlp": 0.46801758, + "step": 963, + "time_per_iteration": 2.976529359817505 + }, + { + "auxiliary_loss_clip": 0.01817438, + "auxiliary_loss_mlp": 0.01141877, + "balance_loss_clip": 1.50018108, + "balance_loss_mlp": 1.09149933, + "epoch": 0.057958815571922444, + "flos": 26042210807040.0, + "grad_norm": 2.6425702285963646, + "language_loss": 0.88256055, + "learning_rate": 3.99180632711517e-06, + "loss": 0.91215372, + "num_input_tokens_seen": 20553485, + "router_z_loss_clip": 3.17382812, + "router_z_loss_mlp": 0.50390625, + "step": 964, + "time_per_iteration": 2.9432666301727295 + }, + { + "auxiliary_loss_clip": 0.01801345, + "auxiliary_loss_mlp": 0.01142654, + "balance_loss_clip": 1.48766971, + "balance_loss_mlp": 1.09425521, + "epoch": 0.05801893882459041, + "flos": 18086846803200.0, + "grad_norm": 4.5298790846055965, + "language_loss": 0.79566801, + "learning_rate": 3.99177107182976e-06, + "loss": 0.82510799, + "num_input_tokens_seen": 20572155, + "router_z_loss_clip": 3.13671875, + "router_z_loss_mlp": 0.484375, + "step": 965, + "time_per_iteration": 2.9682512283325195 + }, + { + "auxiliary_loss_clip": 0.01787567, + "auxiliary_loss_mlp": 0.01136967, + "balance_loss_clip": 1.48307681, + "balance_loss_mlp": 1.08441949, + "epoch": 0.05807906207725838, + "flos": 17757852727680.0, + "grad_norm": 1.9209339410999229, + "language_loss": 0.8451575, + "learning_rate": 3.99173574101619e-06, + "loss": 0.87440282, + "num_input_tokens_seen": 20590395, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.52539062, + "step": 966, + "time_per_iteration": 3.0391228199005127 + }, + { + "auxiliary_loss_clip": 0.0179777, + "auxiliary_loss_mlp": 0.01134891, + "balance_loss_clip": 1.49362993, + "balance_loss_mlp": 1.0877558, + "epoch": 0.058139185329926346, + "flos": 18049311601920.0, + "grad_norm": 2.1316567157031217, + "language_loss": 0.78331923, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.81264585, + "num_input_tokens_seen": 20608435, + "router_z_loss_clip": 3.04101562, + "router_z_loss_mlp": 0.47143555, + "step": 967, + "time_per_iteration": 2.917583703994751 + }, + { + "auxiliary_loss_clip": 0.01491509, + "auxiliary_loss_mlp": 0.01192346, + "balance_loss_clip": 1.32212186, + "balance_loss_mlp": 1.1654526, + "epoch": 0.05819930858259432, + "flos": 62391319079040.0, + "grad_norm": 0.8075296042625363, + "language_loss": 0.57560199, + "learning_rate": 3.991664852809939e-06, + "loss": 0.6024406, + "num_input_tokens_seen": 20668575, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.26953125, + "step": 968, + "time_per_iteration": 3.3976640701293945 + }, + { + "auxiliary_loss_clip": 0.01771766, + "auxiliary_loss_mlp": 0.01147993, + "balance_loss_clip": 1.47534823, + "balance_loss_mlp": 1.09043932, + "epoch": 0.05825943183526229, + "flos": 19144691665920.0, + "grad_norm": 2.025823885216549, + "language_loss": 0.84289056, + "learning_rate": 3.991629295419945e-06, + "loss": 0.87208813, + "num_input_tokens_seen": 20687355, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.57617188, + "step": 969, + "time_per_iteration": 2.9408795833587646 + }, + { + "auxiliary_loss_clip": 0.01780061, + "auxiliary_loss_mlp": 0.01154017, + "balance_loss_clip": 1.4686842, + "balance_loss_mlp": 1.1040448, + "epoch": 0.058319555087930255, + "flos": 29034222681600.0, + "grad_norm": 2.091273090492494, + "language_loss": 0.78792429, + "learning_rate": 3.991593662507167e-06, + "loss": 0.81726509, + "num_input_tokens_seen": 20705710, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.5, + "step": 970, + "time_per_iteration": 2.9662909507751465 + }, + { + "auxiliary_loss_clip": 0.01778524, + "auxiliary_loss_mlp": 0.01143298, + "balance_loss_clip": 1.47375774, + "balance_loss_mlp": 1.09184694, + "epoch": 0.05837967834059823, + "flos": 18889274914560.0, + "grad_norm": 2.607590734016365, + "language_loss": 0.93898678, + "learning_rate": 3.991557954072958e-06, + "loss": 0.96820498, + "num_input_tokens_seen": 20722405, + "router_z_loss_clip": 3.04882812, + "router_z_loss_mlp": 0.51513672, + "step": 971, + "time_per_iteration": 2.920809745788574 + }, + { + "auxiliary_loss_clip": 0.01756702, + "auxiliary_loss_mlp": 0.01121608, + "balance_loss_clip": 1.45477188, + "balance_loss_mlp": 1.07094443, + "epoch": 0.05843980159326619, + "flos": 25714166872320.0, + "grad_norm": 1.7375957140504457, + "language_loss": 0.87568176, + "learning_rate": 3.991522170118673e-06, + "loss": 0.9044649, + "num_input_tokens_seen": 20741480, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.50634766, + "step": 972, + "time_per_iteration": 2.908933401107788 + }, + { + "auxiliary_loss_clip": 0.01769997, + "auxiliary_loss_mlp": 0.01174221, + "balance_loss_clip": 1.46907532, + "balance_loss_mlp": 1.12155461, + "epoch": 0.058499924845934165, + "flos": 25561899561600.0, + "grad_norm": 2.0940785176596153, + "language_loss": 0.88775885, + "learning_rate": 3.991486310645667e-06, + "loss": 0.91720104, + "num_input_tokens_seen": 20759685, + "router_z_loss_clip": 3.0078125, + "router_z_loss_mlp": 0.52685547, + "step": 973, + "time_per_iteration": 2.970554828643799 + }, + { + "auxiliary_loss_clip": 0.01761172, + "auxiliary_loss_mlp": 0.01160684, + "balance_loss_clip": 1.46474636, + "balance_loss_mlp": 1.10701609, + "epoch": 0.05856004809860214, + "flos": 16444998316800.0, + "grad_norm": 2.0606692034576484, + "language_loss": 0.76586604, + "learning_rate": 3.991450375655301e-06, + "loss": 0.79508466, + "num_input_tokens_seen": 20778180, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.53662109, + "step": 974, + "time_per_iteration": 2.9474990367889404 + }, + { + "auxiliary_loss_clip": 0.01739818, + "auxiliary_loss_mlp": 0.01169788, + "balance_loss_clip": 1.44869041, + "balance_loss_mlp": 1.11318755, + "epoch": 0.0586201713512701, + "flos": 39473660724480.0, + "grad_norm": 1.4271591496739375, + "language_loss": 0.77935803, + "learning_rate": 3.991414365148936e-06, + "loss": 0.8084541, + "num_input_tokens_seen": 20802705, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.56616211, + "step": 975, + "time_per_iteration": 3.0622963905334473 + }, + { + "auxiliary_loss_clip": 0.01762792, + "auxiliary_loss_mlp": 0.01149318, + "balance_loss_clip": 1.45939469, + "balance_loss_mlp": 1.08957052, + "epoch": 0.058680294603938074, + "flos": 23374939996800.0, + "grad_norm": 2.47163363219876, + "language_loss": 0.79049522, + "learning_rate": 3.99137827912794e-06, + "loss": 0.81961632, + "num_input_tokens_seen": 20822540, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.59765625, + "step": 976, + "time_per_iteration": 2.970222234725952 + }, + { + "auxiliary_loss_clip": 0.01746409, + "auxiliary_loss_mlp": 0.01148405, + "balance_loss_clip": 1.45020735, + "balance_loss_mlp": 1.09118426, + "epoch": 0.05874041785660604, + "flos": 32244297085440.0, + "grad_norm": 1.797441877631057, + "language_loss": 0.88619471, + "learning_rate": 3.991342117593679e-06, + "loss": 0.91514283, + "num_input_tokens_seen": 20844175, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.57202148, + "step": 977, + "time_per_iteration": 2.960991621017456 + }, + { + "auxiliary_loss_clip": 0.01758621, + "auxiliary_loss_mlp": 0.01144513, + "balance_loss_clip": 1.45872498, + "balance_loss_mlp": 1.08827043, + "epoch": 0.05880054110927401, + "flos": 22320307514880.0, + "grad_norm": 1.4908016473801688, + "language_loss": 0.81136203, + "learning_rate": 3.991305880547527e-06, + "loss": 0.84039336, + "num_input_tokens_seen": 20864730, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.56176758, + "step": 978, + "time_per_iteration": 2.9335594177246094 + }, + { + "auxiliary_loss_clip": 0.01777462, + "auxiliary_loss_mlp": 0.01131803, + "balance_loss_clip": 1.47058034, + "balance_loss_mlp": 1.07243681, + "epoch": 0.05886066436194198, + "flos": 27391740768000.0, + "grad_norm": 1.9355336357238397, + "language_loss": 0.82057917, + "learning_rate": 3.991269567990855e-06, + "loss": 0.84967184, + "num_input_tokens_seen": 20885200, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.59399414, + "step": 979, + "time_per_iteration": 2.945584297180176 + }, + { + "auxiliary_loss_clip": 0.01473931, + "auxiliary_loss_mlp": 0.0113949, + "balance_loss_clip": 1.30363488, + "balance_loss_mlp": 1.06968153, + "epoch": 0.05892078761460995, + "flos": 59610266300160.0, + "grad_norm": 0.9615661736269862, + "language_loss": 0.59125507, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.61738932, + "num_input_tokens_seen": 20940325, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.69921875, + "step": 980, + "time_per_iteration": 3.2757680416107178 + }, + { + "auxiliary_loss_clip": 0.0174837, + "auxiliary_loss_mlp": 0.01112781, + "balance_loss_clip": 1.45818305, + "balance_loss_mlp": 1.05615628, + "epoch": 0.05898091086727792, + "flos": 15422516904960.0, + "grad_norm": 2.7763120702718154, + "language_loss": 0.88997209, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.91858363, + "num_input_tokens_seen": 20958220, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.56713867, + "step": 981, + "time_per_iteration": 2.8843767642974854 + }, + { + "auxiliary_loss_clip": 0.0178376, + "auxiliary_loss_mlp": 0.01113488, + "balance_loss_clip": 1.48050678, + "balance_loss_mlp": 1.05989099, + "epoch": 0.059041034119945886, + "flos": 23664724813440.0, + "grad_norm": 1.9114560187327032, + "language_loss": 0.81018984, + "learning_rate": 3.991160177271513e-06, + "loss": 0.83916235, + "num_input_tokens_seen": 20978920, + "router_z_loss_clip": 3.02929688, + "router_z_loss_mlp": 0.53564453, + "step": 982, + "time_per_iteration": 2.9167864322662354 + }, + { + "auxiliary_loss_clip": 0.01798443, + "auxiliary_loss_mlp": 0.0111861, + "balance_loss_clip": 1.48551524, + "balance_loss_mlp": 1.06518006, + "epoch": 0.05910115737261386, + "flos": 24764855581440.0, + "grad_norm": 2.2682188939168757, + "language_loss": 0.87547767, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.90464818, + "num_input_tokens_seen": 20999490, + "router_z_loss_clip": 3.13085938, + "router_z_loss_mlp": 0.53442383, + "step": 983, + "time_per_iteration": 2.9428207874298096 + }, + { + "auxiliary_loss_clip": 0.0175733, + "auxiliary_loss_mlp": 0.01118514, + "balance_loss_clip": 1.46027482, + "balance_loss_mlp": 1.06615734, + "epoch": 0.05916128062528183, + "flos": 11736158042880.0, + "grad_norm": 2.303648113199355, + "language_loss": 0.86889964, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.89765811, + "num_input_tokens_seen": 21017865, + "router_z_loss_clip": 2.96875, + "router_z_loss_mlp": 0.52392578, + "step": 984, + "time_per_iteration": 2.8902368545532227 + }, + { + "auxiliary_loss_clip": 0.01741363, + "auxiliary_loss_mlp": 0.01096622, + "balance_loss_clip": 1.44815087, + "balance_loss_mlp": 1.05046427, + "epoch": 0.059221403877949795, + "flos": 21911673312000.0, + "grad_norm": 2.175059402342379, + "language_loss": 0.79157126, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.81995112, + "num_input_tokens_seen": 21035900, + "router_z_loss_clip": 2.93164062, + "router_z_loss_mlp": 0.46166992, + "step": 985, + "time_per_iteration": 2.893165111541748 + }, + { + "auxiliary_loss_clip": 0.01771611, + "auxiliary_loss_mlp": 0.01130213, + "balance_loss_clip": 1.46997595, + "balance_loss_mlp": 1.08338761, + "epoch": 0.05928152713061777, + "flos": 20522391154560.0, + "grad_norm": 2.5719379370295283, + "language_loss": 0.92309588, + "learning_rate": 3.991013265915661e-06, + "loss": 0.95211416, + "num_input_tokens_seen": 21053235, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.46826172, + "step": 986, + "time_per_iteration": 2.8692374229431152 + }, + { + "auxiliary_loss_clip": 0.01772994, + "auxiliary_loss_mlp": 0.01120556, + "balance_loss_clip": 1.46565008, + "balance_loss_mlp": 1.07280087, + "epoch": 0.05934165038328574, + "flos": 24504914350080.0, + "grad_norm": 1.920348175342391, + "language_loss": 0.78143734, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.81037283, + "num_input_tokens_seen": 21073090, + "router_z_loss_clip": 3.07226562, + "router_z_loss_mlp": 0.4777832, + "step": 987, + "time_per_iteration": 2.957395553588867 + }, + { + "auxiliary_loss_clip": 0.01784216, + "auxiliary_loss_mlp": 0.01133454, + "balance_loss_clip": 1.47021174, + "balance_loss_mlp": 1.08212233, + "epoch": 0.059401773635953704, + "flos": 38742411962880.0, + "grad_norm": 2.426251495929481, + "language_loss": 0.73397893, + "learning_rate": 3.990939357235621e-06, + "loss": 0.76315564, + "num_input_tokens_seen": 21094895, + "router_z_loss_clip": 3.14257812, + "router_z_loss_mlp": 0.51318359, + "step": 988, + "time_per_iteration": 3.044674873352051 + }, + { + "auxiliary_loss_clip": 0.01464726, + "auxiliary_loss_mlp": 0.01114485, + "balance_loss_clip": 1.30212998, + "balance_loss_mlp": 1.08759105, + "epoch": 0.059461896888621676, + "flos": 58050998853120.0, + "grad_norm": 0.9488903398449772, + "language_loss": 0.71248472, + "learning_rate": 3.99090228964997e-06, + "loss": 0.73827684, + "num_input_tokens_seen": 21147555, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.26953125, + "step": 989, + "time_per_iteration": 4.650926351547241 + }, + { + "auxiliary_loss_clip": 0.01780717, + "auxiliary_loss_mlp": 0.01124669, + "balance_loss_clip": 1.47023439, + "balance_loss_mlp": 1.07660389, + "epoch": 0.05952202014128964, + "flos": 22138015639680.0, + "grad_norm": 6.264074962202939, + "language_loss": 0.82071888, + "learning_rate": 3.990865146569105e-06, + "loss": 0.84977275, + "num_input_tokens_seen": 21167845, + "router_z_loss_clip": 3.10546875, + "router_z_loss_mlp": 0.48071289, + "step": 990, + "time_per_iteration": 2.8928844928741455 + }, + { + "auxiliary_loss_clip": 0.01749238, + "auxiliary_loss_mlp": 0.01112948, + "balance_loss_clip": 1.450845, + "balance_loss_mlp": 1.06235552, + "epoch": 0.059582143393957614, + "flos": 20454650409600.0, + "grad_norm": 1.946403534793412, + "language_loss": 0.87365746, + "learning_rate": 3.990827927994434e-06, + "loss": 0.90227932, + "num_input_tokens_seen": 21185085, + "router_z_loss_clip": 2.984375, + "router_z_loss_mlp": 0.50585938, + "step": 991, + "time_per_iteration": 2.923243761062622 + }, + { + "auxiliary_loss_clip": 0.01785655, + "auxiliary_loss_mlp": 0.01115336, + "balance_loss_clip": 1.47416401, + "balance_loss_mlp": 1.06584024, + "epoch": 0.059642266646625586, + "flos": 20604610235520.0, + "grad_norm": 4.194706136498649, + "language_loss": 0.79371905, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.82272893, + "num_input_tokens_seen": 21204230, + "router_z_loss_clip": 3.1171875, + "router_z_loss_mlp": 0.49487305, + "step": 992, + "time_per_iteration": 2.866121292114258 + }, + { + "auxiliary_loss_clip": 0.01788712, + "auxiliary_loss_mlp": 0.01116158, + "balance_loss_clip": 1.48039627, + "balance_loss_mlp": 1.06866503, + "epoch": 0.05970238989929355, + "flos": 19361623075200.0, + "grad_norm": 2.6581920028364223, + "language_loss": 0.76972079, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.79876947, + "num_input_tokens_seen": 21222655, + "router_z_loss_clip": 3.08398438, + "router_z_loss_mlp": 0.4753418, + "step": 993, + "time_per_iteration": 4.399053573608398 + }, + { + "auxiliary_loss_clip": 0.01768839, + "auxiliary_loss_mlp": 0.0113828, + "balance_loss_clip": 1.46545231, + "balance_loss_mlp": 1.08744895, + "epoch": 0.05976251315196152, + "flos": 30276395435520.0, + "grad_norm": 2.8629526542588923, + "language_loss": 0.81012928, + "learning_rate": 3.990715819321712e-06, + "loss": 0.83920044, + "num_input_tokens_seen": 21242310, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.50830078, + "step": 994, + "time_per_iteration": 5.7542078495025635 + }, + { + "auxiliary_loss_clip": 0.01764486, + "auxiliary_loss_mlp": 0.01125745, + "balance_loss_clip": 1.45731437, + "balance_loss_mlp": 1.07603514, + "epoch": 0.05982263640462949, + "flos": 23195498544000.0, + "grad_norm": 2.3575456302053572, + "language_loss": 0.8222034, + "learning_rate": 3.99067829878596e-06, + "loss": 0.85110569, + "num_input_tokens_seen": 21261410, + "router_z_loss_clip": 3.0703125, + "router_z_loss_mlp": 0.49682617, + "step": 995, + "time_per_iteration": 2.9191224575042725 + }, + { + "auxiliary_loss_clip": 0.01778943, + "auxiliary_loss_mlp": 0.01115622, + "balance_loss_clip": 1.46941471, + "balance_loss_mlp": 1.06657887, + "epoch": 0.05988275965729746, + "flos": 27861283751040.0, + "grad_norm": 2.3387336930079283, + "language_loss": 0.88894904, + "learning_rate": 3.990640702763487e-06, + "loss": 0.91789472, + "num_input_tokens_seen": 21280080, + "router_z_loss_clip": 3.09375, + "router_z_loss_mlp": 0.49047852, + "step": 996, + "time_per_iteration": 2.9270241260528564 + }, + { + "auxiliary_loss_clip": 0.01768029, + "auxiliary_loss_mlp": 0.01104629, + "balance_loss_clip": 1.46670771, + "balance_loss_mlp": 1.05088913, + "epoch": 0.05994288290996543, + "flos": 24690463850880.0, + "grad_norm": 3.1824842774948836, + "language_loss": 0.89937687, + "learning_rate": 3.990603031255718e-06, + "loss": 0.92810345, + "num_input_tokens_seen": 21296765, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.53735352, + "step": 997, + "time_per_iteration": 2.8931725025177 + }, + { + "auxiliary_loss_clip": 0.01450192, + "auxiliary_loss_mlp": 0.01078129, + "balance_loss_clip": 1.28053331, + "balance_loss_mlp": 1.03044569, + "epoch": 0.0600030061626334, + "flos": 69963114499200.0, + "grad_norm": 1.0614858789100483, + "language_loss": 0.75583935, + "learning_rate": 3.990565284264083e-06, + "loss": 0.78112257, + "num_input_tokens_seen": 21363345, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.4765625, + "step": 998, + "time_per_iteration": 3.4666221141815186 + }, + { + "auxiliary_loss_clip": 0.01743604, + "auxiliary_loss_mlp": 0.01098276, + "balance_loss_clip": 1.45488107, + "balance_loss_mlp": 1.04947162, + "epoch": 0.06006312941530137, + "flos": 26550917804160.0, + "grad_norm": 1.7707557400643377, + "language_loss": 0.77670515, + "learning_rate": 3.990527461790013e-06, + "loss": 0.80512393, + "num_input_tokens_seen": 21385290, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.48828125, + "step": 999, + "time_per_iteration": 2.9835479259490967 + }, + { + "auxiliary_loss_clip": 0.01779867, + "auxiliary_loss_mlp": 0.01111157, + "balance_loss_clip": 1.46700263, + "balance_loss_mlp": 1.05293489, + "epoch": 0.060123252667969335, + "flos": 27355291441920.0, + "grad_norm": 1.7339375132223012, + "language_loss": 0.83587921, + "learning_rate": 3.990489563834943e-06, + "loss": 0.86478943, + "num_input_tokens_seen": 21407625, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.58178711, + "step": 1000, + "time_per_iteration": 3.0643134117126465 + }, + { + "auxiliary_loss_clip": 0.01783456, + "auxiliary_loss_mlp": 0.01118305, + "balance_loss_clip": 1.4779247, + "balance_loss_mlp": 1.05302608, + "epoch": 0.06018337592063731, + "flos": 27028197648000.0, + "grad_norm": 2.0289755495665727, + "language_loss": 0.88062441, + "learning_rate": 3.990451590400309e-06, + "loss": 0.90964204, + "num_input_tokens_seen": 21426835, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.65283203, + "step": 1001, + "time_per_iteration": 3.078782558441162 + }, + { + "auxiliary_loss_clip": 0.01778232, + "auxiliary_loss_mlp": 0.01127018, + "balance_loss_clip": 1.4771452, + "balance_loss_mlp": 1.06932116, + "epoch": 0.06024349917330528, + "flos": 25603868753280.0, + "grad_norm": 3.424867566705001, + "language_loss": 0.75499094, + "learning_rate": 3.990413541487551e-06, + "loss": 0.78404349, + "num_input_tokens_seen": 21444920, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.57714844, + "step": 1002, + "time_per_iteration": 2.8902413845062256 + }, + { + "auxiliary_loss_clip": 0.01783125, + "auxiliary_loss_mlp": 0.01105495, + "balance_loss_clip": 1.47705889, + "balance_loss_mlp": 1.05311489, + "epoch": 0.060303622425973244, + "flos": 26142509825280.0, + "grad_norm": 2.5630817683705303, + "language_loss": 0.7803756, + "learning_rate": 3.990375417098112e-06, + "loss": 0.8092618, + "num_input_tokens_seen": 21463555, + "router_z_loss_clip": 3.0625, + "router_z_loss_mlp": 0.52392578, + "step": 1003, + "time_per_iteration": 2.942902088165283 + }, + { + "auxiliary_loss_clip": 0.01777849, + "auxiliary_loss_mlp": 0.01111834, + "balance_loss_clip": 1.46847713, + "balance_loss_mlp": 1.05761766, + "epoch": 0.060363745678641216, + "flos": 20387181133440.0, + "grad_norm": 2.9149342127984106, + "language_loss": 0.71679455, + "learning_rate": 3.990337217233437e-06, + "loss": 0.74569136, + "num_input_tokens_seen": 21481990, + "router_z_loss_clip": 3.09570312, + "router_z_loss_mlp": 0.54223633, + "step": 1004, + "time_per_iteration": 2.866318464279175 + }, + { + "auxiliary_loss_clip": 0.01800068, + "auxiliary_loss_mlp": 0.01120875, + "balance_loss_clip": 1.48791361, + "balance_loss_mlp": 1.06556225, + "epoch": 0.06042386893130918, + "flos": 17758214686080.0, + "grad_norm": 2.236491318712935, + "language_loss": 0.8563534, + "learning_rate": 3.990298941894976e-06, + "loss": 0.8855629, + "num_input_tokens_seen": 21500385, + "router_z_loss_clip": 3.11914062, + "router_z_loss_mlp": 0.5534668, + "step": 1005, + "time_per_iteration": 2.8661811351776123 + }, + { + "auxiliary_loss_clip": 0.01471201, + "auxiliary_loss_mlp": 0.01248823, + "balance_loss_clip": 1.29227734, + "balance_loss_mlp": 1.16451824, + "epoch": 0.06048399218397715, + "flos": 68570484226560.0, + "grad_norm": 0.9328488173804859, + "language_loss": 0.59108686, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.61828709, + "num_input_tokens_seen": 21561040, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.84375, + "step": 1006, + "time_per_iteration": 3.4758994579315186 + }, + { + "auxiliary_loss_clip": 0.01764285, + "auxiliary_loss_mlp": 0.01119911, + "balance_loss_clip": 1.45837474, + "balance_loss_mlp": 1.06555188, + "epoch": 0.060544115436645125, + "flos": 23268985378560.0, + "grad_norm": 1.9734780088783437, + "language_loss": 0.76431131, + "learning_rate": 3.990222164802503e-06, + "loss": 0.79315323, + "num_input_tokens_seen": 21580655, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.54418945, + "step": 1007, + "time_per_iteration": 2.9659454822540283 + }, + { + "auxiliary_loss_clip": 0.01772858, + "auxiliary_loss_mlp": 0.01122475, + "balance_loss_clip": 1.46641684, + "balance_loss_mlp": 1.07052374, + "epoch": 0.06060423868931309, + "flos": 23888985880320.0, + "grad_norm": 1.8553294946977932, + "language_loss": 0.82669604, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.85564941, + "num_input_tokens_seen": 21599650, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.51977539, + "step": 1008, + "time_per_iteration": 2.9513726234436035 + }, + { + "auxiliary_loss_clip": 0.01772593, + "auxiliary_loss_mlp": 0.01110892, + "balance_loss_clip": 1.4744153, + "balance_loss_mlp": 1.06182516, + "epoch": 0.06066436194198106, + "flos": 18735740749440.0, + "grad_norm": 2.081513422032007, + "language_loss": 0.80450046, + "learning_rate": 3.990145085832335e-06, + "loss": 0.83333534, + "num_input_tokens_seen": 21617550, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.49145508, + "step": 1009, + "time_per_iteration": 2.9590160846710205 + }, + { + "auxiliary_loss_clip": 0.01756764, + "auxiliary_loss_mlp": 0.01092227, + "balance_loss_clip": 1.46559119, + "balance_loss_mlp": 1.04251635, + "epoch": 0.06072448519464903, + "flos": 24650123472000.0, + "grad_norm": 1.7450216860543388, + "language_loss": 0.94533348, + "learning_rate": 3.990106433146769e-06, + "loss": 0.97382343, + "num_input_tokens_seen": 21635865, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.49682617, + "step": 1010, + "time_per_iteration": 2.922691822052002 + }, + { + "auxiliary_loss_clip": 0.01802396, + "auxiliary_loss_mlp": 0.01101886, + "balance_loss_clip": 1.48367071, + "balance_loss_mlp": 1.04719305, + "epoch": 0.060784608447317, + "flos": 17386346522880.0, + "grad_norm": 2.5556733651468937, + "language_loss": 0.74273312, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.77177596, + "num_input_tokens_seen": 21653945, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.54663086, + "step": 1011, + "time_per_iteration": 3.039433717727661 + }, + { + "auxiliary_loss_clip": 0.0177328, + "auxiliary_loss_mlp": 0.01116799, + "balance_loss_clip": 1.47163677, + "balance_loss_mlp": 1.06255841, + "epoch": 0.06084473169998497, + "flos": 23701717077120.0, + "grad_norm": 1.8633959632349348, + "language_loss": 0.88229191, + "learning_rate": 3.990028901381999e-06, + "loss": 0.91119266, + "num_input_tokens_seen": 21671230, + "router_z_loss_clip": 3.01757812, + "router_z_loss_mlp": 0.54174805, + "step": 1012, + "time_per_iteration": 3.075021743774414 + }, + { + "auxiliary_loss_clip": 0.01764553, + "auxiliary_loss_mlp": 0.0111357, + "balance_loss_clip": 1.46178496, + "balance_loss_mlp": 1.05665922, + "epoch": 0.06090485495265294, + "flos": 23555829283200.0, + "grad_norm": 3.4309493970906235, + "language_loss": 0.79051965, + "learning_rate": 3.989990022305734e-06, + "loss": 0.81930089, + "num_input_tokens_seen": 21691155, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.56933594, + "step": 1013, + "time_per_iteration": 3.0801842212677 + }, + { + "auxiliary_loss_clip": 0.01785438, + "auxiliary_loss_mlp": 0.01117131, + "balance_loss_clip": 1.47955787, + "balance_loss_mlp": 1.06262898, + "epoch": 0.06096497820532091, + "flos": 20348922015360.0, + "grad_norm": 3.132082957752681, + "language_loss": 0.87760311, + "learning_rate": 3.98995106776885e-06, + "loss": 0.90662885, + "num_input_tokens_seen": 21707405, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.54541016, + "step": 1014, + "time_per_iteration": 2.954566717147827 + }, + { + "auxiliary_loss_clip": 0.01787772, + "auxiliary_loss_mlp": 0.01097586, + "balance_loss_clip": 1.47414505, + "balance_loss_mlp": 1.04453802, + "epoch": 0.061025101457988874, + "flos": 26949281437440.0, + "grad_norm": 2.8504941829538786, + "language_loss": 0.76601887, + "learning_rate": 3.98991203777282e-06, + "loss": 0.7948724, + "num_input_tokens_seen": 21728090, + "router_z_loss_clip": 3.13085938, + "router_z_loss_mlp": 0.53051758, + "step": 1015, + "time_per_iteration": 2.9903922080993652 + }, + { + "auxiliary_loss_clip": 0.017505, + "auxiliary_loss_mlp": 0.01109162, + "balance_loss_clip": 1.45743346, + "balance_loss_mlp": 1.06145489, + "epoch": 0.061085224710656846, + "flos": 25386168182400.0, + "grad_norm": 1.6699326971956314, + "language_loss": 0.80578256, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.8343792, + "num_input_tokens_seen": 21747950, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.47705078, + "step": 1016, + "time_per_iteration": 3.0523488521575928 + }, + { + "auxiliary_loss_clip": 0.0177542, + "auxiliary_loss_mlp": 0.01090713, + "balance_loss_clip": 1.4750495, + "balance_loss_mlp": 1.03783226, + "epoch": 0.06114534796332482, + "flos": 24834949056000.0, + "grad_norm": 1.580675696355111, + "language_loss": 0.77814472, + "learning_rate": 3.989833751409254e-06, + "loss": 0.80680609, + "num_input_tokens_seen": 21767900, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.52905273, + "step": 1017, + "time_per_iteration": 2.9541842937469482 + }, + { + "auxiliary_loss_clip": 0.01784733, + "auxiliary_loss_mlp": 0.01119936, + "balance_loss_clip": 1.47203934, + "balance_loss_mlp": 1.06767416, + "epoch": 0.061205471215992784, + "flos": 20641195296000.0, + "grad_norm": 2.3501197110012293, + "language_loss": 0.88139933, + "learning_rate": 3.989794495044685e-06, + "loss": 0.91044605, + "num_input_tokens_seen": 21787375, + "router_z_loss_clip": 3.12695312, + "router_z_loss_mlp": 0.5222168, + "step": 1018, + "time_per_iteration": 2.9190120697021484 + }, + { + "auxiliary_loss_clip": 0.01762296, + "auxiliary_loss_mlp": 0.01110537, + "balance_loss_clip": 1.46461296, + "balance_loss_mlp": 1.0611372, + "epoch": 0.061265594468660756, + "flos": 16516992072960.0, + "grad_norm": 3.210226008132924, + "language_loss": 0.81643718, + "learning_rate": 3.989755163226909e-06, + "loss": 0.84516555, + "num_input_tokens_seen": 21806275, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.49389648, + "step": 1019, + "time_per_iteration": 2.9377410411834717 + }, + { + "auxiliary_loss_clip": 0.0175308, + "auxiliary_loss_mlp": 0.01108285, + "balance_loss_clip": 1.45773196, + "balance_loss_mlp": 1.05683446, + "epoch": 0.06132571772132872, + "flos": 26256382283520.0, + "grad_norm": 1.6369066684028653, + "language_loss": 0.85087967, + "learning_rate": 3.989715755957418e-06, + "loss": 0.87949336, + "num_input_tokens_seen": 21826430, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.5144043, + "step": 1020, + "time_per_iteration": 2.9024717807769775 + }, + { + "auxiliary_loss_clip": 0.01758379, + "auxiliary_loss_mlp": 0.01112267, + "balance_loss_clip": 1.4619689, + "balance_loss_mlp": 1.06162727, + "epoch": 0.06138584097399669, + "flos": 37428878880000.0, + "grad_norm": 2.172716191146108, + "language_loss": 0.80054629, + "learning_rate": 3.989676273237705e-06, + "loss": 0.82925272, + "num_input_tokens_seen": 21847800, + "router_z_loss_clip": 2.96484375, + "router_z_loss_mlp": 0.50634766, + "step": 1021, + "time_per_iteration": 3.0067856311798096 + }, + { + "auxiliary_loss_clip": 0.01734307, + "auxiliary_loss_mlp": 0.0110066, + "balance_loss_clip": 1.44287395, + "balance_loss_mlp": 1.0499723, + "epoch": 0.061445964226664665, + "flos": 17429582568960.0, + "grad_norm": 1.9900527885747565, + "language_loss": 0.90149105, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.92984062, + "num_input_tokens_seen": 21863385, + "router_z_loss_clip": 2.91210938, + "router_z_loss_mlp": 0.50732422, + "step": 1022, + "time_per_iteration": 2.8270459175109863 + }, + { + "auxiliary_loss_clip": 0.01727234, + "auxiliary_loss_mlp": 0.01100139, + "balance_loss_clip": 1.43986511, + "balance_loss_mlp": 1.04818761, + "epoch": 0.06150608747933263, + "flos": 22609866107520.0, + "grad_norm": 2.465964295804433, + "language_loss": 0.8456322, + "learning_rate": 3.989597081453611e-06, + "loss": 0.87390596, + "num_input_tokens_seen": 21881880, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.51977539, + "step": 1023, + "time_per_iteration": 2.908308506011963 + }, + { + "auxiliary_loss_clip": 0.01458392, + "auxiliary_loss_mlp": 0.01106745, + "balance_loss_clip": 1.28580499, + "balance_loss_mlp": 1.06325746, + "epoch": 0.0615662107320006, + "flos": 56767264110720.0, + "grad_norm": 0.9412958094999629, + "language_loss": 0.65272641, + "learning_rate": 3.989557372392231e-06, + "loss": 0.67837775, + "num_input_tokens_seen": 21940550, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.43554688, + "step": 1024, + "time_per_iteration": 4.7813379764556885 + }, + { + "auxiliary_loss_clip": 0.01762097, + "auxiliary_loss_mlp": 0.01113161, + "balance_loss_clip": 1.45964742, + "balance_loss_mlp": 1.06311727, + "epoch": 0.06162633398466857, + "flos": 22574955104640.0, + "grad_norm": 2.338938560779928, + "language_loss": 0.89807844, + "learning_rate": 3.989517587886636e-06, + "loss": 0.92683101, + "num_input_tokens_seen": 21958390, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.5, + "step": 1025, + "time_per_iteration": 2.8680503368377686 + }, + { + "auxiliary_loss_clip": 0.0175568, + "auxiliary_loss_mlp": 0.01096299, + "balance_loss_clip": 1.4578923, + "balance_loss_mlp": 1.04809117, + "epoch": 0.06168645723733654, + "flos": 25604049732480.0, + "grad_norm": 2.5534052411207275, + "language_loss": 0.85558999, + "learning_rate": 3.989477727938335e-06, + "loss": 0.88410985, + "num_input_tokens_seen": 21978625, + "router_z_loss_clip": 2.97851562, + "router_z_loss_mlp": 0.48144531, + "step": 1026, + "time_per_iteration": 3.0151915550231934 + }, + { + "auxiliary_loss_clip": 0.01766933, + "auxiliary_loss_mlp": 0.01093902, + "balance_loss_clip": 1.46521711, + "balance_loss_mlp": 1.04605103, + "epoch": 0.06174658049000451, + "flos": 16006520528640.0, + "grad_norm": 4.233212544919618, + "language_loss": 0.83724552, + "learning_rate": 3.989437792548839e-06, + "loss": 0.86585391, + "num_input_tokens_seen": 21996035, + "router_z_loss_clip": 3.015625, + "router_z_loss_mlp": 0.47875977, + "step": 1027, + "time_per_iteration": 2.9180421829223633 + }, + { + "auxiliary_loss_clip": 0.01741351, + "auxiliary_loss_mlp": 0.0108745, + "balance_loss_clip": 1.44877517, + "balance_loss_mlp": 1.04069662, + "epoch": 0.06180670374267248, + "flos": 11291979409920.0, + "grad_norm": 4.3802808550790795, + "language_loss": 0.86246991, + "learning_rate": 3.989397781719663e-06, + "loss": 0.89075798, + "num_input_tokens_seen": 22011625, + "router_z_loss_clip": 2.92578125, + "router_z_loss_mlp": 0.46728516, + "step": 1028, + "time_per_iteration": 2.837451219558716 + }, + { + "auxiliary_loss_clip": 0.01455406, + "auxiliary_loss_mlp": 0.01056385, + "balance_loss_clip": 1.28816664, + "balance_loss_mlp": 1.01575804, + "epoch": 0.06186682699534045, + "flos": 65157305339520.0, + "grad_norm": 1.05428391854155, + "language_loss": 0.60597384, + "learning_rate": 3.989357695452323e-06, + "loss": 0.63109183, + "num_input_tokens_seen": 22066035, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.40625, + "step": 1029, + "time_per_iteration": 7.551038503646851 + }, + { + "auxiliary_loss_clip": 0.0173921, + "auxiliary_loss_mlp": 0.01108819, + "balance_loss_clip": 1.4432013, + "balance_loss_mlp": 1.05977595, + "epoch": 0.061926950248008414, + "flos": 21115624717440.0, + "grad_norm": 1.9647172765041028, + "language_loss": 0.83996975, + "learning_rate": 3.98931753374834e-06, + "loss": 0.86844993, + "num_input_tokens_seen": 22085015, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.49023438, + "step": 1030, + "time_per_iteration": 2.8411941528320312 + }, + { + "auxiliary_loss_clip": 0.01750262, + "auxiliary_loss_mlp": 0.0110731, + "balance_loss_clip": 1.45673215, + "balance_loss_mlp": 1.05929208, + "epoch": 0.061987073500676386, + "flos": 17757309790080.0, + "grad_norm": 2.528812080894756, + "language_loss": 0.82780004, + "learning_rate": 3.989277296609237e-06, + "loss": 0.85637575, + "num_input_tokens_seen": 22102775, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.48071289, + "step": 1031, + "time_per_iteration": 2.912752628326416 + }, + { + "auxiliary_loss_clip": 0.01733133, + "auxiliary_loss_mlp": 0.01099299, + "balance_loss_clip": 1.44583797, + "balance_loss_mlp": 1.05793369, + "epoch": 0.06204719675334436, + "flos": 21846511520640.0, + "grad_norm": 1.5592754345030688, + "language_loss": 0.78595883, + "learning_rate": 3.98923698403654e-06, + "loss": 0.81428319, + "num_input_tokens_seen": 22121680, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.41357422, + "step": 1032, + "time_per_iteration": 2.8701016902923584 + }, + { + "auxiliary_loss_clip": 0.01761686, + "auxiliary_loss_mlp": 0.01121842, + "balance_loss_clip": 1.46452475, + "balance_loss_mlp": 1.07263207, + "epoch": 0.06210732000601232, + "flos": 19362935174400.0, + "grad_norm": 1.8803227839005745, + "language_loss": 0.91664344, + "learning_rate": 3.989196596031776e-06, + "loss": 0.94547874, + "num_input_tokens_seen": 22138155, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.49243164, + "step": 1033, + "time_per_iteration": 2.859041929244995 + }, + { + "auxiliary_loss_clip": 0.01735547, + "auxiliary_loss_mlp": 0.01116456, + "balance_loss_clip": 1.44652939, + "balance_loss_mlp": 1.07521009, + "epoch": 0.062167443258680295, + "flos": 24758928512640.0, + "grad_norm": 1.8834403006290097, + "language_loss": 0.86785245, + "learning_rate": 3.989156132596479e-06, + "loss": 0.89637244, + "num_input_tokens_seen": 22157420, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.41210938, + "step": 1034, + "time_per_iteration": 2.9061217308044434 + }, + { + "auxiliary_loss_clip": 0.01717429, + "auxiliary_loss_mlp": 0.01121075, + "balance_loss_clip": 1.44049525, + "balance_loss_mlp": 1.07911372, + "epoch": 0.06222756651134827, + "flos": 34471551784320.0, + "grad_norm": 2.4027110345740725, + "language_loss": 0.83155692, + "learning_rate": 3.989115593732182e-06, + "loss": 0.85994202, + "num_input_tokens_seen": 22178620, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.41967773, + "step": 1035, + "time_per_iteration": 2.9614744186401367 + }, + { + "auxiliary_loss_clip": 0.01733698, + "auxiliary_loss_mlp": 0.01102233, + "balance_loss_clip": 1.44758916, + "balance_loss_mlp": 1.06108189, + "epoch": 0.06228768976401623, + "flos": 25677536567040.0, + "grad_norm": 2.014183682891057, + "language_loss": 0.80815291, + "learning_rate": 3.989074979440421e-06, + "loss": 0.83651227, + "num_input_tokens_seen": 22197125, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.41137695, + "step": 1036, + "time_per_iteration": 2.9048259258270264 + }, + { + "auxiliary_loss_clip": 0.01727459, + "auxiliary_loss_mlp": 0.01105266, + "balance_loss_clip": 1.44422376, + "balance_loss_mlp": 1.06413913, + "epoch": 0.062347813016684205, + "flos": 25305034976640.0, + "grad_norm": 2.289986129438155, + "language_loss": 0.87771559, + "learning_rate": 3.989034289722739e-06, + "loss": 0.90604287, + "num_input_tokens_seen": 22217575, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.41113281, + "step": 1037, + "time_per_iteration": 2.9057538509368896 + }, + { + "auxiliary_loss_clip": 0.01740674, + "auxiliary_loss_mlp": 0.01104176, + "balance_loss_clip": 1.45432663, + "balance_loss_mlp": 1.06495619, + "epoch": 0.06240793626935217, + "flos": 26918442466560.0, + "grad_norm": 2.17218124273449, + "language_loss": 0.82448542, + "learning_rate": 3.988993524580676e-06, + "loss": 0.85293388, + "num_input_tokens_seen": 22236840, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.39208984, + "step": 1038, + "time_per_iteration": 2.9137020111083984 + }, + { + "auxiliary_loss_clip": 0.01735794, + "auxiliary_loss_mlp": 0.01125473, + "balance_loss_clip": 1.45549679, + "balance_loss_mlp": 1.08396459, + "epoch": 0.06246805952202014, + "flos": 21625598568960.0, + "grad_norm": 1.7732554016872213, + "language_loss": 0.87295985, + "learning_rate": 3.98895268401578e-06, + "loss": 0.90157253, + "num_input_tokens_seen": 22256465, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.4152832, + "step": 1039, + "time_per_iteration": 2.888058662414551 + }, + { + "auxiliary_loss_clip": 0.01757381, + "auxiliary_loss_mlp": 0.01105729, + "balance_loss_clip": 1.46479321, + "balance_loss_mlp": 1.06402981, + "epoch": 0.0625281827746881, + "flos": 19319744373120.0, + "grad_norm": 1.9248844862207504, + "language_loss": 0.82382154, + "learning_rate": 3.9889117680296e-06, + "loss": 0.85245264, + "num_input_tokens_seen": 22274025, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.41674805, + "step": 1040, + "time_per_iteration": 2.921339511871338 + }, + { + "auxiliary_loss_clip": 0.01768295, + "auxiliary_loss_mlp": 0.01120702, + "balance_loss_clip": 1.47491086, + "balance_loss_mlp": 1.07962286, + "epoch": 0.06258830602735609, + "flos": 27757274659200.0, + "grad_norm": 2.3323472699951933, + "language_loss": 0.71318835, + "learning_rate": 3.988870776623685e-06, + "loss": 0.7420783, + "num_input_tokens_seen": 22292245, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.41064453, + "step": 1041, + "time_per_iteration": 2.9496092796325684 + }, + { + "auxiliary_loss_clip": 0.01745431, + "auxiliary_loss_mlp": 0.01103614, + "balance_loss_clip": 1.45403087, + "balance_loss_mlp": 1.06122315, + "epoch": 0.06264842928002405, + "flos": 23233214724480.0, + "grad_norm": 3.9091823612043366, + "language_loss": 0.82761753, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.85610801, + "num_input_tokens_seen": 22311455, + "router_z_loss_clip": 2.91210938, + "router_z_loss_mlp": 0.42407227, + "step": 1042, + "time_per_iteration": 2.8704757690429688 + }, + { + "auxiliary_loss_clip": 0.01760614, + "auxiliary_loss_mlp": 0.01108625, + "balance_loss_clip": 1.47254574, + "balance_loss_mlp": 1.06900001, + "epoch": 0.06270855253269202, + "flos": 38413734600960.0, + "grad_norm": 1.6159776810035342, + "language_loss": 0.79290557, + "learning_rate": 3.988788567558874e-06, + "loss": 0.82159793, + "num_input_tokens_seen": 22333750, + "router_z_loss_clip": 2.8828125, + "router_z_loss_mlp": 0.39624023, + "step": 1043, + "time_per_iteration": 3.192606210708618 + }, + { + "auxiliary_loss_clip": 0.01737305, + "auxiliary_loss_mlp": 0.01084531, + "balance_loss_clip": 1.45415425, + "balance_loss_mlp": 1.04452479, + "epoch": 0.06276867578535998, + "flos": 22463209152000.0, + "grad_norm": 2.1038082376414335, + "language_loss": 0.94099969, + "learning_rate": 3.988747349903097e-06, + "loss": 0.96921802, + "num_input_tokens_seen": 22351940, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.40039062, + "step": 1044, + "time_per_iteration": 2.956242561340332 + }, + { + "auxiliary_loss_clip": 0.01762031, + "auxiliary_loss_mlp": 0.01107552, + "balance_loss_clip": 1.47193527, + "balance_loss_mlp": 1.06342125, + "epoch": 0.06282879903802796, + "flos": 22940896199040.0, + "grad_norm": 3.299193864973366, + "language_loss": 0.86993623, + "learning_rate": 3.988706056833821e-06, + "loss": 0.89863205, + "num_input_tokens_seen": 22372085, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.44140625, + "step": 1045, + "time_per_iteration": 3.019695281982422 + }, + { + "auxiliary_loss_clip": 0.01755911, + "auxiliary_loss_mlp": 0.01104295, + "balance_loss_clip": 1.47076941, + "balance_loss_mlp": 1.06450295, + "epoch": 0.06288892229069593, + "flos": 34831339585920.0, + "grad_norm": 1.8370873263421879, + "language_loss": 0.80741465, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.83601665, + "num_input_tokens_seen": 22392020, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.39819336, + "step": 1046, + "time_per_iteration": 3.028822422027588 + }, + { + "auxiliary_loss_clip": 0.01767714, + "auxiliary_loss_mlp": 0.01108284, + "balance_loss_clip": 1.47247636, + "balance_loss_mlp": 1.06980324, + "epoch": 0.06294904554336389, + "flos": 19436512498560.0, + "grad_norm": 1.9217036854499492, + "language_loss": 0.79487407, + "learning_rate": 3.988623244461039e-06, + "loss": 0.82363403, + "num_input_tokens_seen": 22411180, + "router_z_loss_clip": 2.95507812, + "router_z_loss_mlp": 0.38500977, + "step": 1047, + "time_per_iteration": 2.847055673599243 + }, + { + "auxiliary_loss_clip": 0.01804642, + "auxiliary_loss_mlp": 0.01102612, + "balance_loss_clip": 1.49930358, + "balance_loss_mlp": 1.05929172, + "epoch": 0.06300916879603187, + "flos": 40676759953920.0, + "grad_norm": 2.2964380453478643, + "language_loss": 0.79250073, + "learning_rate": 3.988581725160672e-06, + "loss": 0.82157326, + "num_input_tokens_seen": 22435105, + "router_z_loss_clip": 3.05664062, + "router_z_loss_mlp": 0.43286133, + "step": 1048, + "time_per_iteration": 3.108105421066284 + }, + { + "auxiliary_loss_clip": 0.01776605, + "auxiliary_loss_mlp": 0.01093656, + "balance_loss_clip": 1.48267186, + "balance_loss_mlp": 1.04973984, + "epoch": 0.06306929204869983, + "flos": 23814322680960.0, + "grad_norm": 2.3495273603732727, + "language_loss": 0.79698145, + "learning_rate": 3.988540130453087e-06, + "loss": 0.82568407, + "num_input_tokens_seen": 22452710, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.43896484, + "step": 1049, + "time_per_iteration": 2.916186809539795 + }, + { + "auxiliary_loss_clip": 0.01775529, + "auxiliary_loss_mlp": 0.01084095, + "balance_loss_clip": 1.48018289, + "balance_loss_mlp": 1.0391531, + "epoch": 0.0631294153013678, + "flos": 18924909834240.0, + "grad_norm": 1.9284429166849173, + "language_loss": 0.84785175, + "learning_rate": 3.988498460339862e-06, + "loss": 0.87644798, + "num_input_tokens_seen": 22470175, + "router_z_loss_clip": 2.95507812, + "router_z_loss_mlp": 0.44921875, + "step": 1050, + "time_per_iteration": 2.903005599975586 + }, + { + "auxiliary_loss_clip": 0.01764088, + "auxiliary_loss_mlp": 0.01089241, + "balance_loss_clip": 1.48199725, + "balance_loss_mlp": 1.04933023, + "epoch": 0.06318953855403578, + "flos": 24290923852800.0, + "grad_norm": 2.9277066596991337, + "language_loss": 0.79522878, + "learning_rate": 3.988456714822575e-06, + "loss": 0.82376212, + "num_input_tokens_seen": 22490020, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.39892578, + "step": 1051, + "time_per_iteration": 2.9847066402435303 + }, + { + "auxiliary_loss_clip": 0.01787204, + "auxiliary_loss_mlp": 0.01098907, + "balance_loss_clip": 1.49109316, + "balance_loss_mlp": 1.05754137, + "epoch": 0.06324966180670374, + "flos": 22539229695360.0, + "grad_norm": 2.9056891946360137, + "language_loss": 0.82186389, + "learning_rate": 3.98841489390281e-06, + "loss": 0.85072505, + "num_input_tokens_seen": 22509685, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.41333008, + "step": 1052, + "time_per_iteration": 2.92612361907959 + }, + { + "auxiliary_loss_clip": 0.01771935, + "auxiliary_loss_mlp": 0.01103932, + "balance_loss_clip": 1.4738822, + "balance_loss_mlp": 1.06247079, + "epoch": 0.06330978505937171, + "flos": 15786421983360.0, + "grad_norm": 2.0808861890616317, + "language_loss": 0.80976057, + "learning_rate": 3.988372997582155e-06, + "loss": 0.83851916, + "num_input_tokens_seen": 22527905, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.41479492, + "step": 1053, + "time_per_iteration": 2.9314537048339844 + }, + { + "auxiliary_loss_clip": 0.01755651, + "auxiliary_loss_mlp": 0.01089254, + "balance_loss_clip": 1.46913409, + "balance_loss_mlp": 1.04579055, + "epoch": 0.06336990831203967, + "flos": 21481203853440.0, + "grad_norm": 1.7863040620360378, + "language_loss": 0.86004102, + "learning_rate": 3.988331025862195e-06, + "loss": 0.88849002, + "num_input_tokens_seen": 22546335, + "router_z_loss_clip": 2.86914062, + "router_z_loss_mlp": 0.43457031, + "step": 1054, + "time_per_iteration": 2.9013099670410156 + }, + { + "auxiliary_loss_clip": 0.01757754, + "auxiliary_loss_mlp": 0.01107937, + "balance_loss_clip": 1.46588516, + "balance_loss_mlp": 1.06709647, + "epoch": 0.06343003156470765, + "flos": 18488106103680.0, + "grad_norm": 1.8192064518483613, + "language_loss": 0.86988091, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.89853787, + "num_input_tokens_seen": 22563885, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.40844727, + "step": 1055, + "time_per_iteration": 2.931546211242676 + }, + { + "auxiliary_loss_clip": 0.0178881, + "auxiliary_loss_mlp": 0.01104719, + "balance_loss_clip": 1.4807725, + "balance_loss_mlp": 1.0607065, + "epoch": 0.06349015481737562, + "flos": 25165617189120.0, + "grad_norm": 2.6190387645977924, + "language_loss": 0.83905435, + "learning_rate": 3.988246856230734e-06, + "loss": 0.86798966, + "num_input_tokens_seen": 22583035, + "router_z_loss_clip": 3.08203125, + "router_z_loss_mlp": 0.44018555, + "step": 1056, + "time_per_iteration": 2.962200164794922 + }, + { + "auxiliary_loss_clip": 0.01787566, + "auxiliary_loss_mlp": 0.01111884, + "balance_loss_clip": 1.47861218, + "balance_loss_mlp": 1.06029046, + "epoch": 0.06355027807004358, + "flos": 26883486218880.0, + "grad_norm": 2.153540532384241, + "language_loss": 0.82990301, + "learning_rate": 3.988204658322426e-06, + "loss": 0.85889751, + "num_input_tokens_seen": 22605055, + "router_z_loss_clip": 3.08984375, + "router_z_loss_mlp": 0.51611328, + "step": 1057, + "time_per_iteration": 2.947171926498413 + }, + { + "auxiliary_loss_clip": 0.01722677, + "auxiliary_loss_mlp": 0.01089327, + "balance_loss_clip": 1.44525552, + "balance_loss_mlp": 1.04500484, + "epoch": 0.06361040132271156, + "flos": 21406404919680.0, + "grad_norm": 1.839197615920782, + "language_loss": 0.84954029, + "learning_rate": 3.988162385021196e-06, + "loss": 0.87766027, + "num_input_tokens_seen": 22623760, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.44311523, + "step": 1058, + "time_per_iteration": 2.9516613483428955 + }, + { + "auxiliary_loss_clip": 0.01749457, + "auxiliary_loss_mlp": 0.01117299, + "balance_loss_clip": 1.45961094, + "balance_loss_mlp": 1.07214236, + "epoch": 0.06367052457537953, + "flos": 25743377030400.0, + "grad_norm": 1.855521302230082, + "language_loss": 0.88974363, + "learning_rate": 3.988120036328651e-06, + "loss": 0.9184112, + "num_input_tokens_seen": 22643000, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.45141602, + "step": 1059, + "time_per_iteration": 4.368882656097412 + }, + { + "auxiliary_loss_clip": 0.01752919, + "auxiliary_loss_mlp": 0.01106652, + "balance_loss_clip": 1.46077883, + "balance_loss_mlp": 1.06070864, + "epoch": 0.0637306478280475, + "flos": 17638279424640.0, + "grad_norm": 1.887208650155512, + "language_loss": 0.9380244, + "learning_rate": 3.988077612246394e-06, + "loss": 0.96662015, + "num_input_tokens_seen": 22660460, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.45922852, + "step": 1060, + "time_per_iteration": 2.869223117828369 + }, + { + "auxiliary_loss_clip": 0.01723478, + "auxiliary_loss_mlp": 0.01097943, + "balance_loss_clip": 1.44218194, + "balance_loss_mlp": 1.05357313, + "epoch": 0.06379077108071547, + "flos": 13670551278720.0, + "grad_norm": 2.4857569845775047, + "language_loss": 0.89936805, + "learning_rate": 3.988035112776035e-06, + "loss": 0.92758238, + "num_input_tokens_seen": 22679270, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.44384766, + "step": 1061, + "time_per_iteration": 2.9586708545684814 + }, + { + "auxiliary_loss_clip": 0.01744387, + "auxiliary_loss_mlp": 0.01129532, + "balance_loss_clip": 1.44540977, + "balance_loss_mlp": 1.08041799, + "epoch": 0.06385089433338344, + "flos": 28491419088000.0, + "grad_norm": 2.485194966339801, + "language_loss": 0.79436672, + "learning_rate": 3.987992537919185e-06, + "loss": 0.82310593, + "num_input_tokens_seen": 22699330, + "router_z_loss_clip": 2.98828125, + "router_z_loss_mlp": 0.49145508, + "step": 1062, + "time_per_iteration": 2.979839563369751 + }, + { + "auxiliary_loss_clip": 0.01743715, + "auxiliary_loss_mlp": 0.0110472, + "balance_loss_clip": 1.45320678, + "balance_loss_mlp": 1.06230509, + "epoch": 0.0639110175860514, + "flos": 24320360234880.0, + "grad_norm": 2.225872801233063, + "language_loss": 0.88141608, + "learning_rate": 3.987949887677459e-06, + "loss": 0.90990037, + "num_input_tokens_seen": 22717945, + "router_z_loss_clip": 2.90429688, + "router_z_loss_mlp": 0.42431641, + "step": 1063, + "time_per_iteration": 2.9812557697296143 + }, + { + "auxiliary_loss_clip": 0.01753563, + "auxiliary_loss_mlp": 0.01108147, + "balance_loss_clip": 1.45970643, + "balance_loss_mlp": 1.06196523, + "epoch": 0.06397114083871938, + "flos": 22100887641600.0, + "grad_norm": 1.904324705273974, + "language_loss": 0.8215313, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.85014844, + "num_input_tokens_seen": 22736790, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.46191406, + "step": 1064, + "time_per_iteration": 5.847304582595825 + }, + { + "auxiliary_loss_clip": 0.01728762, + "auxiliary_loss_mlp": 0.01119061, + "balance_loss_clip": 1.44704461, + "balance_loss_mlp": 1.07380879, + "epoch": 0.06403126409138735, + "flos": 19582174068480.0, + "grad_norm": 2.4519472243780625, + "language_loss": 0.86275077, + "learning_rate": 3.987864361045851e-06, + "loss": 0.89122903, + "num_input_tokens_seen": 22754745, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.45263672, + "step": 1065, + "time_per_iteration": 2.972856283187866 + }, + { + "auxiliary_loss_clip": 0.01721832, + "auxiliary_loss_mlp": 0.01119014, + "balance_loss_clip": 1.43451214, + "balance_loss_mlp": 1.0745728, + "epoch": 0.06409138734405531, + "flos": 40822240544640.0, + "grad_norm": 1.4168105153476913, + "language_loss": 0.69800854, + "learning_rate": 3.987821484659211e-06, + "loss": 0.72641706, + "num_input_tokens_seen": 22776780, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.44482422, + "step": 1066, + "time_per_iteration": 3.0889222621917725 + }, + { + "auxiliary_loss_clip": 0.0172932, + "auxiliary_loss_mlp": 0.01110269, + "balance_loss_clip": 1.44399118, + "balance_loss_mlp": 1.06866479, + "epoch": 0.06415151059672328, + "flos": 20449402012800.0, + "grad_norm": 1.9220329987313058, + "language_loss": 0.92754364, + "learning_rate": 3.987778532894181e-06, + "loss": 0.95593953, + "num_input_tokens_seen": 22793915, + "router_z_loss_clip": 2.85351562, + "router_z_loss_mlp": 0.41601562, + "step": 1067, + "time_per_iteration": 2.939772605895996 + }, + { + "auxiliary_loss_clip": 0.01732527, + "auxiliary_loss_mlp": 0.01115119, + "balance_loss_clip": 1.44641566, + "balance_loss_mlp": 1.06960523, + "epoch": 0.06421163384939126, + "flos": 18079743369600.0, + "grad_norm": 2.1195306808710255, + "language_loss": 0.85973728, + "learning_rate": 3.987735505752391e-06, + "loss": 0.88821375, + "num_input_tokens_seen": 22812670, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.45507812, + "step": 1068, + "time_per_iteration": 2.9522650241851807 + }, + { + "auxiliary_loss_clip": 0.01719565, + "auxiliary_loss_mlp": 0.01122733, + "balance_loss_clip": 1.44106495, + "balance_loss_mlp": 1.07297516, + "epoch": 0.06427175710205922, + "flos": 25130434717440.0, + "grad_norm": 2.135140385739313, + "language_loss": 0.91790491, + "learning_rate": 3.987692403235471e-06, + "loss": 0.94632792, + "num_input_tokens_seen": 22832440, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.49780273, + "step": 1069, + "time_per_iteration": 2.981790542602539 + }, + { + "auxiliary_loss_clip": 0.01732515, + "auxiliary_loss_mlp": 0.01124218, + "balance_loss_clip": 1.44390666, + "balance_loss_mlp": 1.07665372, + "epoch": 0.06433188035472719, + "flos": 17388518273280.0, + "grad_norm": 2.454605877266586, + "language_loss": 0.98568225, + "learning_rate": 3.987649225345056e-06, + "loss": 1.01424956, + "num_input_tokens_seen": 22845495, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.47607422, + "step": 1070, + "time_per_iteration": 2.917107582092285 + }, + { + "auxiliary_loss_clip": 0.01733819, + "auxiliary_loss_mlp": 0.01104872, + "balance_loss_clip": 1.44733763, + "balance_loss_mlp": 1.05847549, + "epoch": 0.06439200360739517, + "flos": 23555738793600.0, + "grad_norm": 1.6570412362268654, + "language_loss": 0.89191127, + "learning_rate": 3.987605972082782e-06, + "loss": 0.92029816, + "num_input_tokens_seen": 22865390, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.46386719, + "step": 1071, + "time_per_iteration": 2.9443490505218506 + }, + { + "auxiliary_loss_clip": 0.01704184, + "auxiliary_loss_mlp": 0.01109491, + "balance_loss_clip": 1.42826271, + "balance_loss_mlp": 1.06199837, + "epoch": 0.06445212686006313, + "flos": 21989458402560.0, + "grad_norm": 2.796421678623422, + "language_loss": 0.78996003, + "learning_rate": 3.987562643450292e-06, + "loss": 0.81809676, + "num_input_tokens_seen": 22885495, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.4753418, + "step": 1072, + "time_per_iteration": 2.9573192596435547 + }, + { + "auxiliary_loss_clip": 0.01725489, + "auxiliary_loss_mlp": 0.01118141, + "balance_loss_clip": 1.43980217, + "balance_loss_mlp": 1.07036233, + "epoch": 0.0645122501127311, + "flos": 25932274646400.0, + "grad_norm": 2.5198820194986387, + "language_loss": 0.83350307, + "learning_rate": 3.987519239449226e-06, + "loss": 0.86193937, + "num_input_tokens_seen": 22904845, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.47753906, + "step": 1073, + "time_per_iteration": 2.967038631439209 + }, + { + "auxiliary_loss_clip": 0.01717309, + "auxiliary_loss_mlp": 0.01104622, + "balance_loss_clip": 1.44258928, + "balance_loss_mlp": 1.0587976, + "epoch": 0.06457237336539907, + "flos": 25636065068160.0, + "grad_norm": 2.2892290814939225, + "language_loss": 0.81935132, + "learning_rate": 3.987475760081233e-06, + "loss": 0.84757066, + "num_input_tokens_seen": 22925940, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.45874023, + "step": 1074, + "time_per_iteration": 3.0130085945129395 + }, + { + "auxiliary_loss_clip": 0.0172568, + "auxiliary_loss_mlp": 0.01092348, + "balance_loss_clip": 1.4453876, + "balance_loss_mlp": 1.04909921, + "epoch": 0.06463249661806704, + "flos": 19473459517440.0, + "grad_norm": 4.539720651722752, + "language_loss": 0.82304054, + "learning_rate": 3.987432205347958e-06, + "loss": 0.85122085, + "num_input_tokens_seen": 22944375, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.43237305, + "step": 1075, + "time_per_iteration": 3.0138700008392334 + }, + { + "auxiliary_loss_clip": 0.01710781, + "auxiliary_loss_mlp": 0.01099048, + "balance_loss_clip": 1.43353677, + "balance_loss_mlp": 1.05541778, + "epoch": 0.064692619870735, + "flos": 24508579178880.0, + "grad_norm": 2.281583428462896, + "language_loss": 0.90676308, + "learning_rate": 3.987388575251055e-06, + "loss": 0.93486142, + "num_input_tokens_seen": 22959145, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.43652344, + "step": 1076, + "time_per_iteration": 2.9074480533599854 + }, + { + "auxiliary_loss_clip": 0.01724738, + "auxiliary_loss_mlp": 0.01106654, + "balance_loss_clip": 1.44384992, + "balance_loss_mlp": 1.05951834, + "epoch": 0.06475274312340297, + "flos": 17027327882880.0, + "grad_norm": 1.8316548306322429, + "language_loss": 0.82992005, + "learning_rate": 3.98734486979218e-06, + "loss": 0.85823405, + "num_input_tokens_seen": 22978100, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.47143555, + "step": 1077, + "time_per_iteration": 2.9090092182159424 + }, + { + "auxiliary_loss_clip": 0.01742388, + "auxiliary_loss_mlp": 0.01105175, + "balance_loss_clip": 1.44995594, + "balance_loss_mlp": 1.05629945, + "epoch": 0.06481286637607095, + "flos": 24583287623040.0, + "grad_norm": 2.002357603417955, + "language_loss": 0.94002444, + "learning_rate": 3.987301088972986e-06, + "loss": 0.96850008, + "num_input_tokens_seen": 22997285, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.48901367, + "step": 1078, + "time_per_iteration": 2.9229512214660645 + }, + { + "auxiliary_loss_clip": 0.01760043, + "auxiliary_loss_mlp": 0.0110283, + "balance_loss_clip": 1.46047914, + "balance_loss_mlp": 1.05311978, + "epoch": 0.06487298962873891, + "flos": 21115488983040.0, + "grad_norm": 1.8746992090384071, + "language_loss": 0.80908412, + "learning_rate": 3.987257232795137e-06, + "loss": 0.83771282, + "num_input_tokens_seen": 23016285, + "router_z_loss_clip": 2.9921875, + "router_z_loss_mlp": 0.49707031, + "step": 1079, + "time_per_iteration": 2.9435999393463135 + }, + { + "auxiliary_loss_clip": 0.01732123, + "auxiliary_loss_mlp": 0.01107938, + "balance_loss_clip": 1.44813764, + "balance_loss_mlp": 1.061113, + "epoch": 0.06493311288140688, + "flos": 24618922542720.0, + "grad_norm": 1.8382228937895808, + "language_loss": 0.72456443, + "learning_rate": 3.987213301260294e-06, + "loss": 0.75296497, + "num_input_tokens_seen": 23036420, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.46826172, + "step": 1080, + "time_per_iteration": 2.9569170475006104 + }, + { + "auxiliary_loss_clip": 0.01734279, + "auxiliary_loss_mlp": 0.0108878, + "balance_loss_clip": 1.44639111, + "balance_loss_mlp": 1.03797281, + "epoch": 0.06499323613407486, + "flos": 25348949694720.0, + "grad_norm": 2.603785400984728, + "language_loss": 0.74544013, + "learning_rate": 3.987169294370123e-06, + "loss": 0.77367067, + "num_input_tokens_seen": 23056945, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.50830078, + "step": 1081, + "time_per_iteration": 2.929741382598877 + }, + { + "auxiliary_loss_clip": 0.01716126, + "auxiliary_loss_mlp": 0.01087034, + "balance_loss_clip": 1.4401443, + "balance_loss_mlp": 1.03636992, + "epoch": 0.06505335938674282, + "flos": 20385326096640.0, + "grad_norm": 2.662456643947311, + "language_loss": 0.86840391, + "learning_rate": 3.987125212126294e-06, + "loss": 0.8964355, + "num_input_tokens_seen": 23074940, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.50708008, + "step": 1082, + "time_per_iteration": 2.90867280960083 + }, + { + "auxiliary_loss_clip": 0.01758874, + "auxiliary_loss_mlp": 0.0109391, + "balance_loss_clip": 1.45710778, + "balance_loss_mlp": 1.04562998, + "epoch": 0.06511348263941079, + "flos": 25348859205120.0, + "grad_norm": 2.4265526615676407, + "language_loss": 0.84446776, + "learning_rate": 3.987081054530478e-06, + "loss": 0.87299562, + "num_input_tokens_seen": 23093420, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.4831543, + "step": 1083, + "time_per_iteration": 2.888659954071045 + }, + { + "auxiliary_loss_clip": 0.01732819, + "auxiliary_loss_mlp": 0.01103274, + "balance_loss_clip": 1.44616389, + "balance_loss_mlp": 1.05218053, + "epoch": 0.06517360589207877, + "flos": 20340732706560.0, + "grad_norm": 2.1264269036891643, + "language_loss": 0.81526709, + "learning_rate": 3.987036821584348e-06, + "loss": 0.84362805, + "num_input_tokens_seen": 23111550, + "router_z_loss_clip": 2.86914062, + "router_z_loss_mlp": 0.51098633, + "step": 1084, + "time_per_iteration": 2.9511709213256836 + }, + { + "auxiliary_loss_clip": 0.01725693, + "auxiliary_loss_mlp": 0.01098321, + "balance_loss_clip": 1.44075513, + "balance_loss_mlp": 1.04465258, + "epoch": 0.06523372914474673, + "flos": 31692987469440.0, + "grad_norm": 3.1313932209047604, + "language_loss": 0.68742931, + "learning_rate": 3.986992513289584e-06, + "loss": 0.71566951, + "num_input_tokens_seen": 23130335, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.53637695, + "step": 1085, + "time_per_iteration": 3.1240668296813965 + }, + { + "auxiliary_loss_clip": 0.0173554, + "auxiliary_loss_mlp": 0.01095776, + "balance_loss_clip": 1.45352054, + "balance_loss_mlp": 1.04339552, + "epoch": 0.0652938523974147, + "flos": 20788168965120.0, + "grad_norm": 1.9172528250795644, + "language_loss": 0.78989136, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.81820452, + "num_input_tokens_seen": 23152380, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.52368164, + "step": 1086, + "time_per_iteration": 3.111402750015259 + }, + { + "auxiliary_loss_clip": 0.01732224, + "auxiliary_loss_mlp": 0.01107969, + "balance_loss_clip": 1.45045519, + "balance_loss_mlp": 1.056041, + "epoch": 0.06535397565008266, + "flos": 16699736396160.0, + "grad_norm": 2.327778367103435, + "language_loss": 0.87041205, + "learning_rate": 3.986903670660872e-06, + "loss": 0.89881396, + "num_input_tokens_seen": 23171630, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.51953125, + "step": 1087, + "time_per_iteration": 2.9237046241760254 + }, + { + "auxiliary_loss_clip": 0.0173413, + "auxiliary_loss_mlp": 0.01106527, + "balance_loss_clip": 1.45015228, + "balance_loss_mlp": 1.05824733, + "epoch": 0.06541409890275064, + "flos": 26879006983680.0, + "grad_norm": 2.124317695553559, + "language_loss": 0.80336535, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.83177185, + "num_input_tokens_seen": 23192520, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.4831543, + "step": 1088, + "time_per_iteration": 2.9563703536987305 + }, + { + "auxiliary_loss_clip": 0.01731971, + "auxiliary_loss_mlp": 0.01097546, + "balance_loss_clip": 1.44776869, + "balance_loss_mlp": 1.05119753, + "epoch": 0.06547422215541861, + "flos": 20531259135360.0, + "grad_norm": 1.9834864647016117, + "language_loss": 0.73003018, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.75832546, + "num_input_tokens_seen": 23210710, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.46386719, + "step": 1089, + "time_per_iteration": 3.001741409301758 + }, + { + "auxiliary_loss_clip": 0.01715105, + "auxiliary_loss_mlp": 0.01078199, + "balance_loss_clip": 1.43955708, + "balance_loss_mlp": 1.03490257, + "epoch": 0.06553434540808657, + "flos": 22026405421440.0, + "grad_norm": 1.6022040135244706, + "language_loss": 0.87374407, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.90167713, + "num_input_tokens_seen": 23230305, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.43286133, + "step": 1090, + "time_per_iteration": 3.0122454166412354 + }, + { + "auxiliary_loss_clip": 0.01737361, + "auxiliary_loss_mlp": 0.01099679, + "balance_loss_clip": 1.45306754, + "balance_loss_mlp": 1.05185258, + "epoch": 0.06559446866075455, + "flos": 24619510725120.0, + "grad_norm": 2.796697461268191, + "language_loss": 0.74027598, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.76864642, + "num_input_tokens_seen": 23249015, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.47827148, + "step": 1091, + "time_per_iteration": 2.9875214099884033 + }, + { + "auxiliary_loss_clip": 0.01741994, + "auxiliary_loss_mlp": 0.01094948, + "balance_loss_clip": 1.46008587, + "balance_loss_mlp": 1.04533267, + "epoch": 0.06565459191342252, + "flos": 24284001398400.0, + "grad_norm": 2.0419714628705514, + "language_loss": 0.84173548, + "learning_rate": 3.986680245605936e-06, + "loss": 0.87010491, + "num_input_tokens_seen": 23265105, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.49584961, + "step": 1092, + "time_per_iteration": 2.9963228702545166 + }, + { + "auxiliary_loss_clip": 0.01744972, + "auxiliary_loss_mlp": 0.01091014, + "balance_loss_clip": 1.45675588, + "balance_loss_mlp": 1.04001641, + "epoch": 0.06571471516609048, + "flos": 24797006651520.0, + "grad_norm": 1.7372875513209753, + "language_loss": 0.72635341, + "learning_rate": 3.986635334582814e-06, + "loss": 0.75471324, + "num_input_tokens_seen": 23283950, + "router_z_loss_clip": 2.88085938, + "router_z_loss_mlp": 0.51000977, + "step": 1093, + "time_per_iteration": 3.0057733058929443 + }, + { + "auxiliary_loss_clip": 0.01737242, + "auxiliary_loss_mlp": 0.01096707, + "balance_loss_clip": 1.45472765, + "balance_loss_mlp": 1.04907107, + "epoch": 0.06577483841875846, + "flos": 26225362333440.0, + "grad_norm": 1.7927805142556603, + "language_loss": 0.89127207, + "learning_rate": 3.986590348226282e-06, + "loss": 0.91961157, + "num_input_tokens_seen": 23305005, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.47631836, + "step": 1094, + "time_per_iteration": 4.393615484237671 + }, + { + "auxiliary_loss_clip": 0.01754974, + "auxiliary_loss_mlp": 0.01107473, + "balance_loss_clip": 1.47047555, + "balance_loss_mlp": 1.05683303, + "epoch": 0.06583496167142643, + "flos": 25091225458560.0, + "grad_norm": 1.4772628579604468, + "language_loss": 0.830634, + "learning_rate": 3.986545286538044e-06, + "loss": 0.85925847, + "num_input_tokens_seen": 23323220, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.50585938, + "step": 1095, + "time_per_iteration": 3.063180685043335 + }, + { + "auxiliary_loss_clip": 0.01758167, + "auxiliary_loss_mlp": 0.01088261, + "balance_loss_clip": 1.4688561, + "balance_loss_mlp": 1.04153073, + "epoch": 0.06589508492409439, + "flos": 25640815772160.0, + "grad_norm": 2.3154209125445924, + "language_loss": 0.73786545, + "learning_rate": 3.986500149519811e-06, + "loss": 0.76632971, + "num_input_tokens_seen": 23342235, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.46655273, + "step": 1096, + "time_per_iteration": 3.0325822830200195 + }, + { + "auxiliary_loss_clip": 0.01744085, + "auxiliary_loss_mlp": 0.0109002, + "balance_loss_clip": 1.46101153, + "balance_loss_mlp": 1.04591215, + "epoch": 0.06595520817676236, + "flos": 23631261644160.0, + "grad_norm": 1.6178821112809478, + "language_loss": 0.79414994, + "learning_rate": 3.986454937173292e-06, + "loss": 0.82249093, + "num_input_tokens_seen": 23363680, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.44091797, + "step": 1097, + "time_per_iteration": 3.0436770915985107 + }, + { + "auxiliary_loss_clip": 0.01777051, + "auxiliary_loss_mlp": 0.01099625, + "balance_loss_clip": 1.47507608, + "balance_loss_mlp": 1.05444515, + "epoch": 0.06601533142943034, + "flos": 33814604263680.0, + "grad_norm": 2.00212914439608, + "language_loss": 0.79848421, + "learning_rate": 3.986409649500203e-06, + "loss": 0.82725102, + "num_input_tokens_seen": 23385590, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.45214844, + "step": 1098, + "time_per_iteration": 3.115184783935547 + }, + { + "auxiliary_loss_clip": 0.01769933, + "auxiliary_loss_mlp": 0.01085563, + "balance_loss_clip": 1.48165345, + "balance_loss_mlp": 1.04000151, + "epoch": 0.0660754546820983, + "flos": 20266929158400.0, + "grad_norm": 2.036835422515395, + "language_loss": 0.83505535, + "learning_rate": 3.986364286502261e-06, + "loss": 0.86361033, + "num_input_tokens_seen": 23402945, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.45605469, + "step": 1099, + "time_per_iteration": 7.400240421295166 + }, + { + "auxiliary_loss_clip": 0.01750478, + "auxiliary_loss_mlp": 0.01075247, + "balance_loss_clip": 1.46780109, + "balance_loss_mlp": 1.03013837, + "epoch": 0.06613557793476627, + "flos": 19363523356800.0, + "grad_norm": 1.9774998621546798, + "language_loss": 0.84955657, + "learning_rate": 3.986318848181186e-06, + "loss": 0.87781382, + "num_input_tokens_seen": 23421410, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.45117188, + "step": 1100, + "time_per_iteration": 2.974531650543213 + }, + { + "auxiliary_loss_clip": 0.01761734, + "auxiliary_loss_mlp": 0.01091089, + "balance_loss_clip": 1.47157216, + "balance_loss_mlp": 1.04423952, + "epoch": 0.06619570118743424, + "flos": 13780668418560.0, + "grad_norm": 3.1202256330676112, + "language_loss": 0.7497772, + "learning_rate": 3.986273334538702e-06, + "loss": 0.77830541, + "num_input_tokens_seen": 23438870, + "router_z_loss_clip": 2.8984375, + "router_z_loss_mlp": 0.46850586, + "step": 1101, + "time_per_iteration": 2.9259326457977295 + }, + { + "auxiliary_loss_clip": 0.01767658, + "auxiliary_loss_mlp": 0.01080252, + "balance_loss_clip": 1.47729135, + "balance_loss_mlp": 1.03676498, + "epoch": 0.06625582444010221, + "flos": 17866748257920.0, + "grad_norm": 2.58149305793957, + "language_loss": 0.885665, + "learning_rate": 3.986227745576533e-06, + "loss": 0.9141441, + "num_input_tokens_seen": 23456975, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.43481445, + "step": 1102, + "time_per_iteration": 2.9965481758117676 + }, + { + "auxiliary_loss_clip": 0.01777894, + "auxiliary_loss_mlp": 0.01083197, + "balance_loss_clip": 1.48404551, + "balance_loss_mlp": 1.03513217, + "epoch": 0.06631594769277017, + "flos": 11846275182720.0, + "grad_norm": 1.9666606129193156, + "language_loss": 0.84470856, + "learning_rate": 3.98618208129641e-06, + "loss": 0.87331951, + "num_input_tokens_seen": 23473440, + "router_z_loss_clip": 2.93945312, + "router_z_loss_mlp": 0.48046875, + "step": 1103, + "time_per_iteration": 2.9953227043151855 + }, + { + "auxiliary_loss_clip": 0.01770883, + "auxiliary_loss_mlp": 0.01114489, + "balance_loss_clip": 1.48208094, + "balance_loss_mlp": 1.0660423, + "epoch": 0.06637607094543815, + "flos": 19803177509760.0, + "grad_norm": 1.841229150888138, + "language_loss": 0.83678144, + "learning_rate": 3.986136341700063e-06, + "loss": 0.86563516, + "num_input_tokens_seen": 23493880, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.48388672, + "step": 1104, + "time_per_iteration": 2.9772369861602783 + }, + { + "auxiliary_loss_clip": 0.01769873, + "auxiliary_loss_mlp": 0.01090531, + "balance_loss_clip": 1.48641241, + "balance_loss_mlp": 1.0470438, + "epoch": 0.06643619419810612, + "flos": 25497099728640.0, + "grad_norm": 1.4957116099067367, + "language_loss": 0.80923736, + "learning_rate": 3.986090526789227e-06, + "loss": 0.83784139, + "num_input_tokens_seen": 23514920, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.43481445, + "step": 1105, + "time_per_iteration": 3.0643210411071777 + }, + { + "auxiliary_loss_clip": 0.01767182, + "auxiliary_loss_mlp": 0.01097072, + "balance_loss_clip": 1.48648906, + "balance_loss_mlp": 1.05320334, + "epoch": 0.06649631745077408, + "flos": 16955379371520.0, + "grad_norm": 1.7595968954551977, + "language_loss": 0.97386545, + "learning_rate": 3.986044636565639e-06, + "loss": 1.00250793, + "num_input_tokens_seen": 23531635, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.43920898, + "step": 1106, + "time_per_iteration": 2.998225688934326 + }, + { + "auxiliary_loss_clip": 0.01790624, + "auxiliary_loss_mlp": 0.01079645, + "balance_loss_clip": 1.49478602, + "balance_loss_mlp": 1.03529942, + "epoch": 0.06655644070344206, + "flos": 17867653153920.0, + "grad_norm": 1.6565295343057573, + "language_loss": 0.84513593, + "learning_rate": 3.985998671031039e-06, + "loss": 0.87383866, + "num_input_tokens_seen": 23551020, + "router_z_loss_clip": 2.95898438, + "router_z_loss_mlp": 0.44335938, + "step": 1107, + "time_per_iteration": 3.1502256393432617 + }, + { + "auxiliary_loss_clip": 0.014676, + "auxiliary_loss_mlp": 0.01172473, + "balance_loss_clip": 1.28858137, + "balance_loss_mlp": 1.13337255, + "epoch": 0.06661656395611003, + "flos": 61448342060160.0, + "grad_norm": 0.830976931440527, + "language_loss": 0.56755757, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.59395826, + "num_input_tokens_seen": 23610675, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.390625, + "step": 1108, + "time_per_iteration": 3.35772967338562 + }, + { + "auxiliary_loss_clip": 0.01784933, + "auxiliary_loss_mlp": 0.01082812, + "balance_loss_clip": 1.4914031, + "balance_loss_mlp": 1.03894269, + "epoch": 0.066676687208778, + "flos": 20671310350080.0, + "grad_norm": 2.5541455989432813, + "language_loss": 0.74656141, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.77523875, + "num_input_tokens_seen": 23628710, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.43823242, + "step": 1109, + "time_per_iteration": 2.9016807079315186 + }, + { + "auxiliary_loss_clip": 0.01758358, + "auxiliary_loss_mlp": 0.01079347, + "balance_loss_clip": 1.4752115, + "balance_loss_mlp": 1.03888702, + "epoch": 0.06673681046144596, + "flos": 20933106618240.0, + "grad_norm": 1.620623019912696, + "language_loss": 0.79759663, + "learning_rate": 3.985860322578614e-06, + "loss": 0.82597369, + "num_input_tokens_seen": 23649160, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.40478516, + "step": 1110, + "time_per_iteration": 2.9524264335632324 + }, + { + "auxiliary_loss_clip": 0.01773748, + "auxiliary_loss_mlp": 0.0110024, + "balance_loss_clip": 1.48403859, + "balance_loss_mlp": 1.06128192, + "epoch": 0.06679693371411394, + "flos": 31078144874880.0, + "grad_norm": 3.659221291677558, + "language_loss": 0.72957015, + "learning_rate": 3.985814055817427e-06, + "loss": 0.75831002, + "num_input_tokens_seen": 23671995, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.3894043, + "step": 1111, + "time_per_iteration": 2.977801561355591 + }, + { + "auxiliary_loss_clip": 0.01780874, + "auxiliary_loss_mlp": 0.01107994, + "balance_loss_clip": 1.48701584, + "balance_loss_mlp": 1.0677731, + "epoch": 0.0668570569667819, + "flos": 21736213401600.0, + "grad_norm": 1.9440383483003574, + "language_loss": 0.80538237, + "learning_rate": 3.985767713753971e-06, + "loss": 0.83427101, + "num_input_tokens_seen": 23690705, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.40234375, + "step": 1112, + "time_per_iteration": 2.942483425140381 + }, + { + "auxiliary_loss_clip": 0.01774849, + "auxiliary_loss_mlp": 0.01120721, + "balance_loss_clip": 1.48869443, + "balance_loss_mlp": 1.07763863, + "epoch": 0.06691718021944987, + "flos": 22757473203840.0, + "grad_norm": 1.9991070425184172, + "language_loss": 0.81648648, + "learning_rate": 3.985721296390005e-06, + "loss": 0.84544218, + "num_input_tokens_seen": 23709990, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.4309082, + "step": 1113, + "time_per_iteration": 2.9562251567840576 + }, + { + "auxiliary_loss_clip": 0.01744177, + "auxiliary_loss_mlp": 0.01109111, + "balance_loss_clip": 1.4688673, + "balance_loss_mlp": 1.06943822, + "epoch": 0.06697730347211785, + "flos": 16554482029440.0, + "grad_norm": 2.7514611871893497, + "language_loss": 0.8461982, + "learning_rate": 3.985674803727289e-06, + "loss": 0.87473106, + "num_input_tokens_seen": 23728485, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.3972168, + "step": 1114, + "time_per_iteration": 2.9056577682495117 + }, + { + "auxiliary_loss_clip": 0.01455675, + "auxiliary_loss_mlp": 0.01034185, + "balance_loss_clip": 1.28287292, + "balance_loss_mlp": 1.00996161, + "epoch": 0.06703742672478581, + "flos": 59812149173760.0, + "grad_norm": 0.8451899874782671, + "language_loss": 0.58309799, + "learning_rate": 3.985628235767584e-06, + "loss": 0.60799658, + "num_input_tokens_seen": 23786650, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.2421875, + "step": 1115, + "time_per_iteration": 3.3437414169311523 + }, + { + "auxiliary_loss_clip": 0.01778429, + "auxiliary_loss_mlp": 0.0112172, + "balance_loss_clip": 1.48894501, + "balance_loss_mlp": 1.07644403, + "epoch": 0.06709754997745378, + "flos": 16808903395200.0, + "grad_norm": 2.702988190676563, + "language_loss": 0.93082017, + "learning_rate": 3.985581592512658e-06, + "loss": 0.9598217, + "num_input_tokens_seen": 23802555, + "router_z_loss_clip": 2.89257812, + "router_z_loss_mlp": 0.45239258, + "step": 1116, + "time_per_iteration": 2.8920130729675293 + }, + { + "auxiliary_loss_clip": 0.01780936, + "auxiliary_loss_mlp": 0.01126618, + "balance_loss_clip": 1.48837924, + "balance_loss_mlp": 1.08589661, + "epoch": 0.06715767323012176, + "flos": 22133174446080.0, + "grad_norm": 1.9560883740736086, + "language_loss": 0.88852501, + "learning_rate": 3.985534873964279e-06, + "loss": 0.91760051, + "num_input_tokens_seen": 23822945, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.40722656, + "step": 1117, + "time_per_iteration": 3.0018818378448486 + }, + { + "auxiliary_loss_clip": 0.0144369, + "auxiliary_loss_mlp": 0.01073777, + "balance_loss_clip": 1.27206695, + "balance_loss_mlp": 1.05556166, + "epoch": 0.06721779648278972, + "flos": 66643583644800.0, + "grad_norm": 0.8650562609671947, + "language_loss": 0.59873402, + "learning_rate": 3.985488080124218e-06, + "loss": 0.6239087, + "num_input_tokens_seen": 23874075, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.18261719, + "step": 1118, + "time_per_iteration": 3.24596905708313 + }, + { + "auxiliary_loss_clip": 0.01794294, + "auxiliary_loss_mlp": 0.01092811, + "balance_loss_clip": 1.49612546, + "balance_loss_mlp": 1.05139744, + "epoch": 0.06727791973545769, + "flos": 22392617984640.0, + "grad_norm": 2.344515846792726, + "language_loss": 0.87493664, + "learning_rate": 3.985441210994251e-06, + "loss": 0.9038077, + "num_input_tokens_seen": 23889720, + "router_z_loss_clip": 2.98046875, + "router_z_loss_mlp": 0.41381836, + "step": 1119, + "time_per_iteration": 2.9194021224975586 + }, + { + "auxiliary_loss_clip": 0.01763388, + "auxiliary_loss_mlp": 0.01111295, + "balance_loss_clip": 1.47731256, + "balance_loss_mlp": 1.07102633, + "epoch": 0.06733804298812565, + "flos": 24290969097600.0, + "grad_norm": 1.829720566023223, + "language_loss": 0.86594296, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.8946898, + "num_input_tokens_seen": 23909385, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.40258789, + "step": 1120, + "time_per_iteration": 2.983499765396118 + }, + { + "auxiliary_loss_clip": 0.0179329, + "auxiliary_loss_mlp": 0.01122749, + "balance_loss_clip": 1.49916816, + "balance_loss_mlp": 1.08002412, + "epoch": 0.06739816624079363, + "flos": 15925070609280.0, + "grad_norm": 1.8461268729512665, + "language_loss": 0.79897928, + "learning_rate": 3.985347246871708e-06, + "loss": 0.82813966, + "num_input_tokens_seen": 23926830, + "router_z_loss_clip": 2.94140625, + "router_z_loss_mlp": 0.42749023, + "step": 1121, + "time_per_iteration": 2.881929874420166 + }, + { + "auxiliary_loss_clip": 0.01439313, + "auxiliary_loss_mlp": 0.01022523, + "balance_loss_clip": 1.26538038, + "balance_loss_mlp": 1.00240028, + "epoch": 0.0674582894934616, + "flos": 71434977696000.0, + "grad_norm": 0.7621670857372473, + "language_loss": 0.58532143, + "learning_rate": 3.985300151882694e-06, + "loss": 0.60993981, + "num_input_tokens_seen": 23992640, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.20117188, + "step": 1122, + "time_per_iteration": 3.4584033489227295 + }, + { + "auxiliary_loss_clip": 0.01787508, + "auxiliary_loss_mlp": 0.01110695, + "balance_loss_clip": 1.49315333, + "balance_loss_mlp": 1.06861424, + "epoch": 0.06751841274612956, + "flos": 25275870063360.0, + "grad_norm": 3.5325857879232325, + "language_loss": 0.73612785, + "learning_rate": 3.985252981610901e-06, + "loss": 0.7651099, + "num_input_tokens_seen": 24011135, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.4206543, + "step": 1123, + "time_per_iteration": 3.012976884841919 + }, + { + "auxiliary_loss_clip": 0.01803792, + "auxiliary_loss_mlp": 0.0110326, + "balance_loss_clip": 1.50131488, + "balance_loss_mlp": 1.06017756, + "epoch": 0.06757853599879754, + "flos": 23812558133760.0, + "grad_norm": 1.8359494955447002, + "language_loss": 0.80354464, + "learning_rate": 3.985205736058114e-06, + "loss": 0.83261526, + "num_input_tokens_seen": 24030695, + "router_z_loss_clip": 3.02539062, + "router_z_loss_mlp": 0.4309082, + "step": 1124, + "time_per_iteration": 2.9381868839263916 + }, + { + "auxiliary_loss_clip": 0.01781343, + "auxiliary_loss_mlp": 0.01081859, + "balance_loss_clip": 1.49227417, + "balance_loss_mlp": 1.04273438, + "epoch": 0.0676386592514655, + "flos": 21043857185280.0, + "grad_norm": 1.8885530983194523, + "language_loss": 0.73892158, + "learning_rate": 3.985158415226128e-06, + "loss": 0.76755363, + "num_input_tokens_seen": 24050680, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.39111328, + "step": 1125, + "time_per_iteration": 2.954005241394043 + }, + { + "auxiliary_loss_clip": 0.01784842, + "auxiliary_loss_mlp": 0.01097686, + "balance_loss_clip": 1.49179482, + "balance_loss_mlp": 1.05787003, + "epoch": 0.06769878250413347, + "flos": 25567193203200.0, + "grad_norm": 2.6544142005686964, + "language_loss": 0.83032131, + "learning_rate": 3.985111019116736e-06, + "loss": 0.85914648, + "num_input_tokens_seen": 24067205, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.3984375, + "step": 1126, + "time_per_iteration": 2.961317777633667 + }, + { + "auxiliary_loss_clip": 0.01434716, + "auxiliary_loss_mlp": 0.01014259, + "balance_loss_clip": 1.26352429, + "balance_loss_mlp": 0.99880904, + "epoch": 0.06775890575680145, + "flos": 70687938499200.0, + "grad_norm": 0.7902248914487475, + "language_loss": 0.59876448, + "learning_rate": 3.985063547731735e-06, + "loss": 0.62325418, + "num_input_tokens_seen": 24131320, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.15429688, + "step": 1127, + "time_per_iteration": 3.440735101699829 + }, + { + "auxiliary_loss_clip": 0.01785199, + "auxiliary_loss_mlp": 0.01109218, + "balance_loss_clip": 1.49244273, + "balance_loss_mlp": 1.06832957, + "epoch": 0.06781902900946941, + "flos": 24244113467520.0, + "grad_norm": 1.9840662838060583, + "language_loss": 0.82907218, + "learning_rate": 3.985016001072925e-06, + "loss": 0.85801637, + "num_input_tokens_seen": 24149930, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.40893555, + "step": 1128, + "time_per_iteration": 2.9745333194732666 + }, + { + "auxiliary_loss_clip": 0.01815432, + "auxiliary_loss_mlp": 0.01118336, + "balance_loss_clip": 1.50900817, + "balance_loss_mlp": 1.07458663, + "epoch": 0.06787915226213738, + "flos": 22427302763520.0, + "grad_norm": 2.646893922195106, + "language_loss": 0.78473097, + "learning_rate": 3.984968379142109e-06, + "loss": 0.81406868, + "num_input_tokens_seen": 24169590, + "router_z_loss_clip": 3.06054688, + "router_z_loss_mlp": 0.4375, + "step": 1129, + "time_per_iteration": 4.303941011428833 + }, + { + "auxiliary_loss_clip": 0.017947, + "auxiliary_loss_mlp": 0.01100976, + "balance_loss_clip": 1.49370492, + "balance_loss_mlp": 1.0572021, + "epoch": 0.06793927551480534, + "flos": 37721152160640.0, + "grad_norm": 1.951544639686094, + "language_loss": 0.75001538, + "learning_rate": 3.984920681941094e-06, + "loss": 0.77897215, + "num_input_tokens_seen": 24189965, + "router_z_loss_clip": 3.00976562, + "router_z_loss_mlp": 0.4375, + "step": 1130, + "time_per_iteration": 3.039942741394043 + }, + { + "auxiliary_loss_clip": 0.01775956, + "auxiliary_loss_mlp": 0.01107316, + "balance_loss_clip": 1.48581553, + "balance_loss_mlp": 1.06843042, + "epoch": 0.06799939876747332, + "flos": 20641150051200.0, + "grad_norm": 2.2016438470572655, + "language_loss": 0.81806189, + "learning_rate": 3.984872909471688e-06, + "loss": 0.84689462, + "num_input_tokens_seen": 24208045, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.38867188, + "step": 1131, + "time_per_iteration": 2.9461746215820312 + }, + { + "auxiliary_loss_clip": 0.01763196, + "auxiliary_loss_mlp": 0.01101781, + "balance_loss_clip": 1.47825074, + "balance_loss_mlp": 1.06015277, + "epoch": 0.06805952202014129, + "flos": 14872519388160.0, + "grad_norm": 1.8597213994609005, + "language_loss": 0.82135308, + "learning_rate": 3.984825061735701e-06, + "loss": 0.85000288, + "num_input_tokens_seen": 24223805, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.41601562, + "step": 1132, + "time_per_iteration": 2.884608030319214 + }, + { + "auxiliary_loss_clip": 0.01781808, + "auxiliary_loss_mlp": 0.01113013, + "balance_loss_clip": 1.49054408, + "balance_loss_mlp": 1.07186246, + "epoch": 0.06811964527280925, + "flos": 48926930947200.0, + "grad_norm": 1.4297570721558497, + "language_loss": 0.65490377, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.68385196, + "num_input_tokens_seen": 24249475, + "router_z_loss_clip": 2.91015625, + "router_z_loss_mlp": 0.41162109, + "step": 1133, + "time_per_iteration": 3.280437469482422 + }, + { + "auxiliary_loss_clip": 0.01802387, + "auxiliary_loss_mlp": 0.01105853, + "balance_loss_clip": 1.49957681, + "balance_loss_mlp": 1.06289005, + "epoch": 0.06817976852547723, + "flos": 15385615130880.0, + "grad_norm": 1.9842473187868983, + "language_loss": 0.77088434, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.79996669, + "num_input_tokens_seen": 24267980, + "router_z_loss_clip": 3.02734375, + "router_z_loss_mlp": 0.4296875, + "step": 1134, + "time_per_iteration": 5.790952920913696 + }, + { + "auxiliary_loss_clip": 0.01780266, + "auxiliary_loss_mlp": 0.01101158, + "balance_loss_clip": 1.49274802, + "balance_loss_mlp": 1.06119883, + "epoch": 0.0682398917781452, + "flos": 20164956082560.0, + "grad_norm": 1.75619328365223, + "language_loss": 0.88908952, + "learning_rate": 3.984681066946423e-06, + "loss": 0.91790372, + "num_input_tokens_seen": 24286805, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.39941406, + "step": 1135, + "time_per_iteration": 4.318697214126587 + }, + { + "auxiliary_loss_clip": 0.01792087, + "auxiliary_loss_mlp": 0.01106024, + "balance_loss_clip": 1.49672318, + "balance_loss_mlp": 1.06391895, + "epoch": 0.06830001503081316, + "flos": 23451051029760.0, + "grad_norm": 2.1834946854949626, + "language_loss": 0.79806721, + "learning_rate": 3.984632918162291e-06, + "loss": 0.8270483, + "num_input_tokens_seen": 24305855, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.42114258, + "step": 1136, + "time_per_iteration": 2.961277723312378 + }, + { + "auxiliary_loss_clip": 0.01801486, + "auxiliary_loss_mlp": 0.01103813, + "balance_loss_clip": 1.50816965, + "balance_loss_mlp": 1.06099343, + "epoch": 0.06836013828348114, + "flos": 34363063457280.0, + "grad_norm": 2.028757697892878, + "language_loss": 0.86820656, + "learning_rate": 3.984584694120679e-06, + "loss": 0.89725959, + "num_input_tokens_seen": 24326535, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.4284668, + "step": 1137, + "time_per_iteration": 2.9968631267547607 + }, + { + "auxiliary_loss_clip": 0.01768299, + "auxiliary_loss_mlp": 0.01111839, + "balance_loss_clip": 1.48083735, + "balance_loss_mlp": 1.07414508, + "epoch": 0.06842026153614911, + "flos": 23159230197120.0, + "grad_norm": 2.021102761838922, + "language_loss": 0.80818772, + "learning_rate": 3.984536394823418e-06, + "loss": 0.83698905, + "num_input_tokens_seen": 24345810, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.37719727, + "step": 1138, + "time_per_iteration": 2.9238510131835938 + }, + { + "auxiliary_loss_clip": 0.01779437, + "auxiliary_loss_mlp": 0.01099406, + "balance_loss_clip": 1.48964417, + "balance_loss_mlp": 1.05927992, + "epoch": 0.06848038478881707, + "flos": 24619917928320.0, + "grad_norm": 2.2607119602967467, + "language_loss": 0.86203003, + "learning_rate": 3.984488020272336e-06, + "loss": 0.89081848, + "num_input_tokens_seen": 24366095, + "router_z_loss_clip": 2.89648438, + "router_z_loss_mlp": 0.40112305, + "step": 1139, + "time_per_iteration": 2.9141201972961426 + }, + { + "auxiliary_loss_clip": 0.0178041, + "auxiliary_loss_mlp": 0.01085807, + "balance_loss_clip": 1.49157107, + "balance_loss_mlp": 1.04587162, + "epoch": 0.06854050804148504, + "flos": 40895998848000.0, + "grad_norm": 1.5130640781885838, + "language_loss": 0.76314354, + "learning_rate": 3.984439570469271e-06, + "loss": 0.79180562, + "num_input_tokens_seen": 24388665, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.39941406, + "step": 1140, + "time_per_iteration": 3.161508798599243 + }, + { + "auxiliary_loss_clip": 0.01785595, + "auxiliary_loss_mlp": 0.0110339, + "balance_loss_clip": 1.49298286, + "balance_loss_mlp": 1.0636456, + "epoch": 0.06860063129415302, + "flos": 31698914538240.0, + "grad_norm": 2.04622856788199, + "language_loss": 0.7029835, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.73187333, + "num_input_tokens_seen": 24407705, + "router_z_loss_clip": 2.921875, + "router_z_loss_mlp": 0.39746094, + "step": 1141, + "time_per_iteration": 3.042388677597046 + }, + { + "auxiliary_loss_clip": 0.01782916, + "auxiliary_loss_mlp": 0.01092359, + "balance_loss_clip": 1.48284721, + "balance_loss_mlp": 1.05235219, + "epoch": 0.06866075454682098, + "flos": 26553270533760.0, + "grad_norm": 2.139667974776622, + "language_loss": 0.80406606, + "learning_rate": 3.984342445114538e-06, + "loss": 0.83281875, + "num_input_tokens_seen": 24428390, + "router_z_loss_clip": 2.99804688, + "router_z_loss_mlp": 0.40039062, + "step": 1142, + "time_per_iteration": 3.0027992725372314 + }, + { + "auxiliary_loss_clip": 0.0175986, + "auxiliary_loss_mlp": 0.01078084, + "balance_loss_clip": 1.48119748, + "balance_loss_mlp": 1.03998494, + "epoch": 0.06872087779948895, + "flos": 29802011258880.0, + "grad_norm": 1.7872533265285635, + "language_loss": 0.70513737, + "learning_rate": 3.984293769566553e-06, + "loss": 0.73351681, + "num_input_tokens_seen": 24450810, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.38110352, + "step": 1143, + "time_per_iteration": 3.014019012451172 + }, + { + "auxiliary_loss_clip": 0.01745685, + "auxiliary_loss_mlp": 0.01088501, + "balance_loss_clip": 1.46915126, + "balance_loss_mlp": 1.05030656, + "epoch": 0.06878100105215693, + "flos": 26951905635840.0, + "grad_norm": 1.806922279363485, + "language_loss": 0.75769001, + "learning_rate": 3.98424501877395e-06, + "loss": 0.78603196, + "num_input_tokens_seen": 24469965, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.38208008, + "step": 1144, + "time_per_iteration": 3.0195586681365967 + }, + { + "auxiliary_loss_clip": 0.01783507, + "auxiliary_loss_mlp": 0.01091063, + "balance_loss_clip": 1.48624361, + "balance_loss_mlp": 1.04755163, + "epoch": 0.06884112430482489, + "flos": 10677905976960.0, + "grad_norm": 2.246671589225427, + "language_loss": 0.93436694, + "learning_rate": 3.984196192738577e-06, + "loss": 0.96311259, + "num_input_tokens_seen": 24486370, + "router_z_loss_clip": 2.97070312, + "router_z_loss_mlp": 0.43554688, + "step": 1145, + "time_per_iteration": 2.889540910720825 + }, + { + "auxiliary_loss_clip": 0.0179275, + "auxiliary_loss_mlp": 0.0110185, + "balance_loss_clip": 1.492553, + "balance_loss_mlp": 1.05857706, + "epoch": 0.06890124755749286, + "flos": 20203305690240.0, + "grad_norm": 2.2112595961326162, + "language_loss": 0.84479892, + "learning_rate": 3.984147291462285e-06, + "loss": 0.87374485, + "num_input_tokens_seen": 24503780, + "router_z_loss_clip": 3.00390625, + "router_z_loss_mlp": 0.43310547, + "step": 1146, + "time_per_iteration": 2.977402448654175 + }, + { + "auxiliary_loss_clip": 0.01739956, + "auxiliary_loss_mlp": 0.01078082, + "balance_loss_clip": 1.46196139, + "balance_loss_mlp": 1.03833771, + "epoch": 0.06896137081016084, + "flos": 20458993910400.0, + "grad_norm": 1.7327090292129335, + "language_loss": 0.86641192, + "learning_rate": 3.98409831494693e-06, + "loss": 0.89459223, + "num_input_tokens_seen": 24522320, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.3972168, + "step": 1147, + "time_per_iteration": 2.9276974201202393 + }, + { + "auxiliary_loss_clip": 0.01754834, + "auxiliary_loss_mlp": 0.01083027, + "balance_loss_clip": 1.46990585, + "balance_loss_mlp": 1.04406953, + "epoch": 0.0690214940628288, + "flos": 18377717495040.0, + "grad_norm": 1.765621534859995, + "language_loss": 0.87507212, + "learning_rate": 3.984049263194367e-06, + "loss": 0.90345073, + "num_input_tokens_seen": 24540445, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.38989258, + "step": 1148, + "time_per_iteration": 2.961883783340454 + }, + { + "auxiliary_loss_clip": 0.01736728, + "auxiliary_loss_mlp": 0.01088318, + "balance_loss_clip": 1.45327306, + "balance_loss_mlp": 1.04828787, + "epoch": 0.06908161731549677, + "flos": 20567663216640.0, + "grad_norm": 2.382562797615503, + "language_loss": 0.71986365, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.74811405, + "num_input_tokens_seen": 24557105, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.40039062, + "step": 1149, + "time_per_iteration": 2.9428791999816895 + }, + { + "auxiliary_loss_clip": 0.01760479, + "auxiliary_loss_mlp": 0.01074686, + "balance_loss_clip": 1.47101974, + "balance_loss_mlp": 1.03639627, + "epoch": 0.06914174056816474, + "flos": 27575209008000.0, + "grad_norm": 2.149800710497741, + "language_loss": 0.86074388, + "learning_rate": 3.983950933985064e-06, + "loss": 0.88909554, + "num_input_tokens_seen": 24578240, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.3828125, + "step": 1150, + "time_per_iteration": 3.00156569480896 + }, + { + "auxiliary_loss_clip": 0.0176389, + "auxiliary_loss_mlp": 0.0109314, + "balance_loss_clip": 1.47581637, + "balance_loss_mlp": 1.05334806, + "epoch": 0.06920186382083271, + "flos": 15312264030720.0, + "grad_norm": 3.013811228292107, + "language_loss": 0.84592628, + "learning_rate": 3.983901656532052e-06, + "loss": 0.87449664, + "num_input_tokens_seen": 24593585, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.39794922, + "step": 1151, + "time_per_iteration": 2.9745144844055176 + }, + { + "auxiliary_loss_clip": 0.0176165, + "auxiliary_loss_mlp": 0.01096524, + "balance_loss_clip": 1.46992767, + "balance_loss_mlp": 1.05918789, + "epoch": 0.06926198707350067, + "flos": 25201930780800.0, + "grad_norm": 2.072542761457855, + "language_loss": 0.87080699, + "learning_rate": 3.983852303849291e-06, + "loss": 0.89938873, + "num_input_tokens_seen": 24613110, + "router_z_loss_clip": 2.91210938, + "router_z_loss_mlp": 0.37353516, + "step": 1152, + "time_per_iteration": 2.951582908630371 + }, + { + "auxiliary_loss_clip": 0.01736643, + "auxiliary_loss_mlp": 0.01083812, + "balance_loss_clip": 1.45184994, + "balance_loss_mlp": 1.04480708, + "epoch": 0.06932211032616864, + "flos": 13262640992640.0, + "grad_norm": 2.923065751743557, + "language_loss": 0.92647183, + "learning_rate": 3.983802875938651e-06, + "loss": 0.95467639, + "num_input_tokens_seen": 24628795, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.39013672, + "step": 1153, + "time_per_iteration": 2.955064296722412 + }, + { + "auxiliary_loss_clip": 0.0173797, + "auxiliary_loss_mlp": 0.01084936, + "balance_loss_clip": 1.45431972, + "balance_loss_mlp": 1.04366565, + "epoch": 0.06938223357883662, + "flos": 24838251926400.0, + "grad_norm": 2.1225094967668166, + "language_loss": 0.8310492, + "learning_rate": 3.983753372802008e-06, + "loss": 0.85927826, + "num_input_tokens_seen": 24645480, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.4128418, + "step": 1154, + "time_per_iteration": 2.9111173152923584 + }, + { + "auxiliary_loss_clip": 0.01742665, + "auxiliary_loss_mlp": 0.01081409, + "balance_loss_clip": 1.4587245, + "balance_loss_mlp": 1.04121125, + "epoch": 0.06944235683150458, + "flos": 27278456492160.0, + "grad_norm": 2.4603130437876852, + "language_loss": 0.76968384, + "learning_rate": 3.983703794441237e-06, + "loss": 0.79792452, + "num_input_tokens_seen": 24664630, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.40234375, + "step": 1155, + "time_per_iteration": 2.9915316104888916 + }, + { + "auxiliary_loss_clip": 0.01747401, + "auxiliary_loss_mlp": 0.01083891, + "balance_loss_clip": 1.45977402, + "balance_loss_mlp": 1.04452848, + "epoch": 0.06950248008417255, + "flos": 25818040229760.0, + "grad_norm": 4.871177763113254, + "language_loss": 0.72125125, + "learning_rate": 3.98365414085822e-06, + "loss": 0.74956429, + "num_input_tokens_seen": 24684210, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.39355469, + "step": 1156, + "time_per_iteration": 2.9384782314300537 + }, + { + "auxiliary_loss_clip": 0.01721828, + "auxiliary_loss_mlp": 0.01075663, + "balance_loss_clip": 1.43956709, + "balance_loss_mlp": 1.03656256, + "epoch": 0.06956260333684053, + "flos": 22281279235200.0, + "grad_norm": 2.219946572602928, + "language_loss": 0.76466644, + "learning_rate": 3.98360441205484e-06, + "loss": 0.7926414, + "num_input_tokens_seen": 24702490, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.39086914, + "step": 1157, + "time_per_iteration": 2.8861210346221924 + }, + { + "auxiliary_loss_clip": 0.01745972, + "auxiliary_loss_mlp": 0.01080283, + "balance_loss_clip": 1.4597404, + "balance_loss_mlp": 1.03877389, + "epoch": 0.0696227265895085, + "flos": 29693884890240.0, + "grad_norm": 1.7211163533486489, + "language_loss": 0.73171544, + "learning_rate": 3.983554608032982e-06, + "loss": 0.759978, + "num_input_tokens_seen": 24724340, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.41479492, + "step": 1158, + "time_per_iteration": 3.0217227935791016 + }, + { + "auxiliary_loss_clip": 0.01741903, + "auxiliary_loss_mlp": 0.01079304, + "balance_loss_clip": 1.45792198, + "balance_loss_mlp": 1.03929758, + "epoch": 0.06968284984217646, + "flos": 25535358846720.0, + "grad_norm": 1.7231024398076913, + "language_loss": 0.8121056, + "learning_rate": 3.983504728794533e-06, + "loss": 0.84031773, + "num_input_tokens_seen": 24745550, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.40014648, + "step": 1159, + "time_per_iteration": 2.968329429626465 + }, + { + "auxiliary_loss_clip": 0.01742283, + "auxiliary_loss_mlp": 0.01084784, + "balance_loss_clip": 1.4558394, + "balance_loss_mlp": 1.04220212, + "epoch": 0.06974297309484444, + "flos": 20706357087360.0, + "grad_norm": 2.6560324991410025, + "language_loss": 0.84323162, + "learning_rate": 3.983454774341387e-06, + "loss": 0.87150228, + "num_input_tokens_seen": 24762575, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.42602539, + "step": 1160, + "time_per_iteration": 2.905940294265747 + }, + { + "auxiliary_loss_clip": 0.01728161, + "auxiliary_loss_mlp": 0.01073135, + "balance_loss_clip": 1.44484377, + "balance_loss_mlp": 1.03219819, + "epoch": 0.0698030963475124, + "flos": 26516368759680.0, + "grad_norm": 1.8101908981803763, + "language_loss": 0.7730915, + "learning_rate": 3.983404744675437e-06, + "loss": 0.80110443, + "num_input_tokens_seen": 24782605, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.40869141, + "step": 1161, + "time_per_iteration": 2.952885150909424 + }, + { + "auxiliary_loss_clip": 0.01728234, + "auxiliary_loss_mlp": 0.01068383, + "balance_loss_clip": 1.44422698, + "balance_loss_mlp": 1.02949739, + "epoch": 0.06986321960018037, + "flos": 23051058583680.0, + "grad_norm": 1.633409104216109, + "language_loss": 0.84162581, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.86959195, + "num_input_tokens_seen": 24802910, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.38867188, + "step": 1162, + "time_per_iteration": 2.879686117172241 + }, + { + "auxiliary_loss_clip": 0.01723115, + "auxiliary_loss_mlp": 0.01067573, + "balance_loss_clip": 1.44501972, + "balance_loss_mlp": 1.02966487, + "epoch": 0.06992334285284833, + "flos": 28596514055040.0, + "grad_norm": 11.040833235859107, + "language_loss": 0.80968076, + "learning_rate": 3.983304459712716e-06, + "loss": 0.83758765, + "num_input_tokens_seen": 24823305, + "router_z_loss_clip": 2.7890625, + "router_z_loss_mlp": 0.37915039, + "step": 1163, + "time_per_iteration": 4.371046543121338 + }, + { + "auxiliary_loss_clip": 0.01738412, + "auxiliary_loss_mlp": 0.01077845, + "balance_loss_clip": 1.44815302, + "balance_loss_mlp": 1.03800559, + "epoch": 0.06998346610551631, + "flos": 20605198417920.0, + "grad_norm": 2.7626409823828477, + "language_loss": 0.80418587, + "learning_rate": 3.983254204419749e-06, + "loss": 0.83234847, + "num_input_tokens_seen": 24842155, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.3984375, + "step": 1164, + "time_per_iteration": 2.923943042755127 + }, + { + "auxiliary_loss_clip": 0.01712709, + "auxiliary_loss_mlp": 0.01075891, + "balance_loss_clip": 1.42895484, + "balance_loss_mlp": 1.03826833, + "epoch": 0.07004358935818428, + "flos": 22539093960960.0, + "grad_norm": 1.661469348126522, + "language_loss": 0.74778467, + "learning_rate": 3.983203873921583e-06, + "loss": 0.77567065, + "num_input_tokens_seen": 24862080, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.3762207, + "step": 1165, + "time_per_iteration": 2.893615484237671 + }, + { + "auxiliary_loss_clip": 0.01712109, + "auxiliary_loss_mlp": 0.01074353, + "balance_loss_clip": 1.43215501, + "balance_loss_mlp": 1.03775609, + "epoch": 0.07010371261085224, + "flos": 28961866967040.0, + "grad_norm": 1.7925751006522423, + "language_loss": 0.82505178, + "learning_rate": 3.983153468220128e-06, + "loss": 0.85291642, + "num_input_tokens_seen": 24886165, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.3659668, + "step": 1166, + "time_per_iteration": 2.982210636138916 + }, + { + "auxiliary_loss_clip": 0.01725198, + "auxiliary_loss_mlp": 0.01078402, + "balance_loss_clip": 1.43985724, + "balance_loss_mlp": 1.04101789, + "epoch": 0.07016383586352022, + "flos": 23669566007040.0, + "grad_norm": 1.9654444086813077, + "language_loss": 0.8662982, + "learning_rate": 3.983102987317295e-06, + "loss": 0.89433414, + "num_input_tokens_seen": 24905775, + "router_z_loss_clip": 2.85546875, + "router_z_loss_mlp": 0.3737793, + "step": 1167, + "time_per_iteration": 2.8925750255584717 + }, + { + "auxiliary_loss_clip": 0.01727189, + "auxiliary_loss_mlp": 0.01078335, + "balance_loss_clip": 1.44217777, + "balance_loss_mlp": 1.03935409, + "epoch": 0.07022395911618819, + "flos": 19801639186560.0, + "grad_norm": 2.5703404707862894, + "language_loss": 0.92231417, + "learning_rate": 3.983052431214997e-06, + "loss": 0.95036948, + "num_input_tokens_seen": 24924295, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.38989258, + "step": 1168, + "time_per_iteration": 2.9164843559265137 + }, + { + "auxiliary_loss_clip": 0.01723879, + "auxiliary_loss_mlp": 0.01075371, + "balance_loss_clip": 1.4369669, + "balance_loss_mlp": 1.03698575, + "epoch": 0.07028408236885615, + "flos": 21699085403520.0, + "grad_norm": 2.843384274012963, + "language_loss": 0.90027738, + "learning_rate": 3.983001799915153e-06, + "loss": 0.92826986, + "num_input_tokens_seen": 24943210, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.38378906, + "step": 1169, + "time_per_iteration": 4.388768911361694 + }, + { + "auxiliary_loss_clip": 0.01737596, + "auxiliary_loss_mlp": 0.01080943, + "balance_loss_clip": 1.44595742, + "balance_loss_mlp": 1.04129398, + "epoch": 0.07034420562152413, + "flos": 25641403954560.0, + "grad_norm": 2.1051026856087085, + "language_loss": 0.86896288, + "learning_rate": 3.982951093419681e-06, + "loss": 0.89714825, + "num_input_tokens_seen": 24960360, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.39648438, + "step": 1170, + "time_per_iteration": 4.4324798583984375 + }, + { + "auxiliary_loss_clip": 0.01712682, + "auxiliary_loss_mlp": 0.01091295, + "balance_loss_clip": 1.43891263, + "balance_loss_mlp": 1.05395889, + "epoch": 0.0704043288741921, + "flos": 20819369894400.0, + "grad_norm": 1.8235108999273948, + "language_loss": 0.7695992, + "learning_rate": 3.982900311730506e-06, + "loss": 0.79763901, + "num_input_tokens_seen": 24978290, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.37329102, + "step": 1171, + "time_per_iteration": 2.895761489868164 + }, + { + "auxiliary_loss_clip": 0.01705467, + "auxiliary_loss_mlp": 0.01089557, + "balance_loss_clip": 1.42738473, + "balance_loss_mlp": 1.05431914, + "epoch": 0.07046445212686006, + "flos": 25604140222080.0, + "grad_norm": 1.8031258721489132, + "language_loss": 0.90964723, + "learning_rate": 3.9828494548495514e-06, + "loss": 0.93759745, + "num_input_tokens_seen": 24997055, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.35205078, + "step": 1172, + "time_per_iteration": 3.139467239379883 + }, + { + "auxiliary_loss_clip": 0.01718161, + "auxiliary_loss_mlp": 0.01092915, + "balance_loss_clip": 1.42629385, + "balance_loss_mlp": 1.05386233, + "epoch": 0.07052457537952803, + "flos": 25568007609600.0, + "grad_norm": 1.5733268441001675, + "language_loss": 0.83832705, + "learning_rate": 3.982798522778748e-06, + "loss": 0.86643785, + "num_input_tokens_seen": 25017490, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.390625, + "step": 1173, + "time_per_iteration": 2.964430332183838 + }, + { + "auxiliary_loss_clip": 0.01700662, + "auxiliary_loss_mlp": 0.01081855, + "balance_loss_clip": 1.42357922, + "balance_loss_mlp": 1.04575872, + "epoch": 0.070584698632196, + "flos": 17977725048960.0, + "grad_norm": 1.9552783716035997, + "language_loss": 0.84077764, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.86860275, + "num_input_tokens_seen": 25035660, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.36083984, + "step": 1174, + "time_per_iteration": 2.8872430324554443 + }, + { + "auxiliary_loss_clip": 0.01700675, + "auxiliary_loss_mlp": 0.01077218, + "balance_loss_clip": 1.4258492, + "balance_loss_mlp": 1.04128838, + "epoch": 0.07064482188486397, + "flos": 25380919785600.0, + "grad_norm": 3.674582629124737, + "language_loss": 0.87052703, + "learning_rate": 3.982696433075317e-06, + "loss": 0.89830595, + "num_input_tokens_seen": 25054785, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.359375, + "step": 1175, + "time_per_iteration": 2.9896137714385986 + }, + { + "auxiliary_loss_clip": 0.01714685, + "auxiliary_loss_mlp": 0.01087933, + "balance_loss_clip": 1.43113565, + "balance_loss_mlp": 1.05190849, + "epoch": 0.07070494513753194, + "flos": 24910607640960.0, + "grad_norm": 1.723610517678584, + "language_loss": 0.8547827, + "learning_rate": 3.982645275446563e-06, + "loss": 0.88280892, + "num_input_tokens_seen": 25075180, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.36035156, + "step": 1176, + "time_per_iteration": 2.9168763160705566 + }, + { + "auxiliary_loss_clip": 0.01700325, + "auxiliary_loss_mlp": 0.01077941, + "balance_loss_clip": 1.4231739, + "balance_loss_mlp": 1.0428462, + "epoch": 0.07076506839019991, + "flos": 22346622005760.0, + "grad_norm": 1.9652279590081545, + "language_loss": 0.75990933, + "learning_rate": 3.982594042635701e-06, + "loss": 0.78769201, + "num_input_tokens_seen": 25093035, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.35107422, + "step": 1177, + "time_per_iteration": 2.923025608062744 + }, + { + "auxiliary_loss_clip": 0.01714983, + "auxiliary_loss_mlp": 0.01089819, + "balance_loss_clip": 1.4295876, + "balance_loss_mlp": 1.05315065, + "epoch": 0.07082519164286788, + "flos": 18669945530880.0, + "grad_norm": 1.8068675617996233, + "language_loss": 0.87232077, + "learning_rate": 3.982542734644673e-06, + "loss": 0.90036881, + "num_input_tokens_seen": 25112520, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.36694336, + "step": 1178, + "time_per_iteration": 2.8652961254119873 + }, + { + "auxiliary_loss_clip": 0.01440522, + "auxiliary_loss_mlp": 0.0104938, + "balance_loss_clip": 1.27498198, + "balance_loss_mlp": 1.03211808, + "epoch": 0.07088531489553584, + "flos": 63686048307840.0, + "grad_norm": 0.847612952468103, + "language_loss": 0.63460118, + "learning_rate": 3.982491351475427e-06, + "loss": 0.65950024, + "num_input_tokens_seen": 25177760, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.17285156, + "step": 1179, + "time_per_iteration": 3.4792938232421875 + }, + { + "auxiliary_loss_clip": 0.01721916, + "auxiliary_loss_mlp": 0.01091705, + "balance_loss_clip": 1.43438852, + "balance_loss_mlp": 1.0559423, + "epoch": 0.07094543814820382, + "flos": 21580778954880.0, + "grad_norm": 2.4464394103108047, + "language_loss": 0.86921805, + "learning_rate": 3.98243989312991e-06, + "loss": 0.89735425, + "num_input_tokens_seen": 25195260, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.35742188, + "step": 1180, + "time_per_iteration": 2.8962419033050537 + }, + { + "auxiliary_loss_clip": 0.01699546, + "auxiliary_loss_mlp": 0.01075862, + "balance_loss_clip": 1.41976452, + "balance_loss_mlp": 1.03707147, + "epoch": 0.07100556140087179, + "flos": 22099847011200.0, + "grad_norm": 3.8808583244963386, + "language_loss": 0.90603805, + "learning_rate": 3.982388359610074e-06, + "loss": 0.93379205, + "num_input_tokens_seen": 25212740, + "router_z_loss_clip": 2.796875, + "router_z_loss_mlp": 0.38769531, + "step": 1181, + "time_per_iteration": 2.88250470161438 + }, + { + "auxiliary_loss_clip": 0.01688241, + "auxiliary_loss_mlp": 0.01061109, + "balance_loss_clip": 1.41726637, + "balance_loss_mlp": 1.0276829, + "epoch": 0.07106568465353975, + "flos": 47938998579840.0, + "grad_norm": 1.885771899897693, + "language_loss": 0.85516238, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.88265586, + "num_input_tokens_seen": 25236420, + "router_z_loss_clip": 2.70898438, + "router_z_loss_mlp": 0.33422852, + "step": 1182, + "time_per_iteration": 3.1554551124572754 + }, + { + "auxiliary_loss_clip": 0.01709463, + "auxiliary_loss_mlp": 0.01070045, + "balance_loss_clip": 1.42889595, + "balance_loss_mlp": 1.03340054, + "epoch": 0.07112580790620772, + "flos": 23451277253760.0, + "grad_norm": 2.165353215465474, + "language_loss": 0.81728697, + "learning_rate": 3.982285067055262e-06, + "loss": 0.84508198, + "num_input_tokens_seen": 25255120, + "router_z_loss_clip": 2.80859375, + "router_z_loss_mlp": 0.36645508, + "step": 1183, + "time_per_iteration": 2.9735240936279297 + }, + { + "auxiliary_loss_clip": 0.01723271, + "auxiliary_loss_mlp": 0.01068223, + "balance_loss_clip": 1.43319452, + "balance_loss_mlp": 1.03281784, + "epoch": 0.0711859311588757, + "flos": 31881523127040.0, + "grad_norm": 2.063488622546912, + "language_loss": 0.81384939, + "learning_rate": 3.982233308024204e-06, + "loss": 0.84176433, + "num_input_tokens_seen": 25275150, + "router_z_loss_clip": 2.90234375, + "router_z_loss_mlp": 0.35400391, + "step": 1184, + "time_per_iteration": 2.956021547317505 + }, + { + "auxiliary_loss_clip": 0.01694805, + "auxiliary_loss_mlp": 0.01071256, + "balance_loss_clip": 1.41919231, + "balance_loss_mlp": 1.03463459, + "epoch": 0.07124605441154366, + "flos": 19619935493760.0, + "grad_norm": 1.7983909342733337, + "language_loss": 0.78965789, + "learning_rate": 3.98218147382666e-06, + "loss": 0.8173185, + "num_input_tokens_seen": 25293680, + "router_z_loss_clip": 2.75585938, + "router_z_loss_mlp": 0.36621094, + "step": 1185, + "time_per_iteration": 2.9469523429870605 + }, + { + "auxiliary_loss_clip": 0.01713783, + "auxiliary_loss_mlp": 0.01082376, + "balance_loss_clip": 1.43160462, + "balance_loss_mlp": 1.04782891, + "epoch": 0.07130617766421163, + "flos": 14692715976960.0, + "grad_norm": 2.4884418341115904, + "language_loss": 0.68930507, + "learning_rate": 3.982129564464596e-06, + "loss": 0.71726662, + "num_input_tokens_seen": 25310050, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.34521484, + "step": 1186, + "time_per_iteration": 2.8512160778045654 + }, + { + "auxiliary_loss_clip": 0.01691548, + "auxiliary_loss_mlp": 0.01069641, + "balance_loss_clip": 1.41857743, + "balance_loss_mlp": 1.02934873, + "epoch": 0.07136630091687961, + "flos": 26079112581120.0, + "grad_norm": 1.8787388922612374, + "language_loss": 0.71592003, + "learning_rate": 3.98207757993998e-06, + "loss": 0.74353194, + "num_input_tokens_seen": 25331020, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.40332031, + "step": 1187, + "time_per_iteration": 3.004605770111084 + }, + { + "auxiliary_loss_clip": 0.01719162, + "auxiliary_loss_mlp": 0.01071555, + "balance_loss_clip": 1.44113469, + "balance_loss_mlp": 1.03860593, + "epoch": 0.07142642416954757, + "flos": 15677616942720.0, + "grad_norm": 2.409841600979247, + "language_loss": 0.80428207, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.83218926, + "num_input_tokens_seen": 25347875, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.32910156, + "step": 1188, + "time_per_iteration": 2.9323060512542725 + }, + { + "auxiliary_loss_clip": 0.01712766, + "auxiliary_loss_mlp": 0.01071645, + "balance_loss_clip": 1.43560302, + "balance_loss_mlp": 1.03690767, + "epoch": 0.07148654742221554, + "flos": 19764737412480.0, + "grad_norm": 1.9420417860906642, + "language_loss": 0.86858535, + "learning_rate": 3.981973385410981e-06, + "loss": 0.89642942, + "num_input_tokens_seen": 25366715, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.34741211, + "step": 1189, + "time_per_iteration": 2.942032814025879 + }, + { + "auxiliary_loss_clip": 0.0171633, + "auxiliary_loss_mlp": 0.01069603, + "balance_loss_clip": 1.43951368, + "balance_loss_mlp": 1.03350639, + "epoch": 0.07154667067488352, + "flos": 23480894615040.0, + "grad_norm": 2.8304067860425843, + "language_loss": 0.78945482, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.81731415, + "num_input_tokens_seen": 25385450, + "router_z_loss_clip": 2.76953125, + "router_z_loss_mlp": 0.36108398, + "step": 1190, + "time_per_iteration": 2.926969289779663 + }, + { + "auxiliary_loss_clip": 0.0173779, + "auxiliary_loss_mlp": 0.01074461, + "balance_loss_clip": 1.44896078, + "balance_loss_mlp": 1.03507435, + "epoch": 0.07160679392755148, + "flos": 18342127820160.0, + "grad_norm": 2.0474937790828425, + "language_loss": 0.77704114, + "learning_rate": 3.981868890255468e-06, + "loss": 0.80516362, + "num_input_tokens_seen": 25403940, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.39404297, + "step": 1191, + "time_per_iteration": 2.903620719909668 + }, + { + "auxiliary_loss_clip": 0.0172124, + "auxiliary_loss_mlp": 0.01080661, + "balance_loss_clip": 1.43466735, + "balance_loss_mlp": 1.04582763, + "epoch": 0.07166691718021945, + "flos": 17755499998080.0, + "grad_norm": 3.31795051111562, + "language_loss": 0.76172864, + "learning_rate": 3.981816529947719e-06, + "loss": 0.78974771, + "num_input_tokens_seen": 25420410, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.34814453, + "step": 1192, + "time_per_iteration": 2.9529917240142822 + }, + { + "auxiliary_loss_clip": 0.01727446, + "auxiliary_loss_mlp": 0.01078597, + "balance_loss_clip": 1.44257212, + "balance_loss_mlp": 1.04450274, + "epoch": 0.07172704043288743, + "flos": 22461173136000.0, + "grad_norm": 1.9096970835370717, + "language_loss": 0.79305339, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.82111388, + "num_input_tokens_seen": 25439415, + "router_z_loss_clip": 2.84960938, + "router_z_loss_mlp": 0.34082031, + "step": 1193, + "time_per_iteration": 2.911536455154419 + }, + { + "auxiliary_loss_clip": 0.01737895, + "auxiliary_loss_mlp": 0.01079276, + "balance_loss_clip": 1.45515561, + "balance_loss_mlp": 1.04227328, + "epoch": 0.07178716368555539, + "flos": 23232717031680.0, + "grad_norm": 1.837871044667439, + "language_loss": 0.87734944, + "learning_rate": 3.981711583882166e-06, + "loss": 0.90552115, + "num_input_tokens_seen": 25458715, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.37011719, + "step": 1194, + "time_per_iteration": 2.9038984775543213 + }, + { + "auxiliary_loss_clip": 0.01741711, + "auxiliary_loss_mlp": 0.01082663, + "balance_loss_clip": 1.45766115, + "balance_loss_mlp": 1.04728198, + "epoch": 0.07184728693822336, + "flos": 25160504526720.0, + "grad_norm": 1.8814635220373335, + "language_loss": 0.82097638, + "learning_rate": 3.981658998128341e-06, + "loss": 0.84922016, + "num_input_tokens_seen": 25477985, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.35375977, + "step": 1195, + "time_per_iteration": 2.9334497451782227 + }, + { + "auxiliary_loss_clip": 0.01745805, + "auxiliary_loss_mlp": 0.01077178, + "balance_loss_clip": 1.46009898, + "balance_loss_mlp": 1.04487276, + "epoch": 0.07190741019089132, + "flos": 22721566815360.0, + "grad_norm": 1.867701780210213, + "language_loss": 0.80789912, + "learning_rate": 3.981606337229808e-06, + "loss": 0.83612895, + "num_input_tokens_seen": 25497110, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.32299805, + "step": 1196, + "time_per_iteration": 2.9574496746063232 + }, + { + "auxiliary_loss_clip": 0.01728089, + "auxiliary_loss_mlp": 0.01082063, + "balance_loss_clip": 1.44284189, + "balance_loss_mlp": 1.04713452, + "epoch": 0.0719675334435593, + "flos": 29361361720320.0, + "grad_norm": 2.4457021167157698, + "language_loss": 0.73589826, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.7639997, + "num_input_tokens_seen": 25516555, + "router_z_loss_clip": 2.85351562, + "router_z_loss_mlp": 0.34936523, + "step": 1197, + "time_per_iteration": 3.030885934829712 + }, + { + "auxiliary_loss_clip": 0.01739424, + "auxiliary_loss_mlp": 0.01078717, + "balance_loss_clip": 1.45779407, + "balance_loss_mlp": 1.04459929, + "epoch": 0.07202765669622727, + "flos": 17648821463040.0, + "grad_norm": 2.001110647556366, + "language_loss": 0.87090337, + "learning_rate": 3.98150079000661e-06, + "loss": 0.89908481, + "num_input_tokens_seen": 25533895, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.34106445, + "step": 1198, + "time_per_iteration": 4.424349784851074 + }, + { + "auxiliary_loss_clip": 0.01751791, + "auxiliary_loss_mlp": 0.01092928, + "balance_loss_clip": 1.46431303, + "balance_loss_mlp": 1.05928731, + "epoch": 0.07208777994889523, + "flos": 21443985365760.0, + "grad_norm": 2.0261192201506373, + "language_loss": 0.84525955, + "learning_rate": 3.981447903685947e-06, + "loss": 0.8737067, + "num_input_tokens_seen": 25554195, + "router_z_loss_clip": 2.87109375, + "router_z_loss_mlp": 0.33642578, + "step": 1199, + "time_per_iteration": 2.9546959400177 + }, + { + "auxiliary_loss_clip": 0.0174724, + "auxiliary_loss_mlp": 0.01086107, + "balance_loss_clip": 1.46095288, + "balance_loss_mlp": 1.05449295, + "epoch": 0.07214790320156321, + "flos": 26951588922240.0, + "grad_norm": 2.162753263759689, + "language_loss": 0.77799565, + "learning_rate": 3.981394942228581e-06, + "loss": 0.80632913, + "num_input_tokens_seen": 25574155, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.31591797, + "step": 1200, + "time_per_iteration": 3.147609233856201 + }, + { + "auxiliary_loss_clip": 0.01758858, + "auxiliary_loss_mlp": 0.01094145, + "balance_loss_clip": 1.47000027, + "balance_loss_mlp": 1.05835867, + "epoch": 0.07220802645423118, + "flos": 23890886161920.0, + "grad_norm": 2.3226840472419927, + "language_loss": 0.84240931, + "learning_rate": 3.98134190563652e-06, + "loss": 0.87093937, + "num_input_tokens_seen": 25592735, + "router_z_loss_clip": 2.88671875, + "router_z_loss_mlp": 0.3581543, + "step": 1201, + "time_per_iteration": 2.89512038230896 + }, + { + "auxiliary_loss_clip": 0.01743841, + "auxiliary_loss_mlp": 0.01088443, + "balance_loss_clip": 1.45149279, + "balance_loss_mlp": 1.04929507, + "epoch": 0.07226814970689914, + "flos": 19252682300160.0, + "grad_norm": 1.917896726347435, + "language_loss": 0.70590383, + "learning_rate": 3.981288793911775e-06, + "loss": 0.7342267, + "num_input_tokens_seen": 25611510, + "router_z_loss_clip": 2.92773438, + "router_z_loss_mlp": 0.39135742, + "step": 1202, + "time_per_iteration": 2.9453768730163574 + }, + { + "auxiliary_loss_clip": 0.01739272, + "auxiliary_loss_mlp": 0.01071937, + "balance_loss_clip": 1.45604241, + "balance_loss_mlp": 1.0371753, + "epoch": 0.07232827295956712, + "flos": 19181412460800.0, + "grad_norm": 2.0618424164882208, + "language_loss": 0.89387202, + "learning_rate": 3.98123560705636e-06, + "loss": 0.92198414, + "num_input_tokens_seen": 25629560, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.34790039, + "step": 1203, + "time_per_iteration": 4.4452478885650635 + }, + { + "auxiliary_loss_clip": 0.01767333, + "auxiliary_loss_mlp": 0.01087884, + "balance_loss_clip": 1.47615254, + "balance_loss_mlp": 1.05254996, + "epoch": 0.07238839621223508, + "flos": 17648776218240.0, + "grad_norm": 1.686556145630741, + "language_loss": 0.80710328, + "learning_rate": 3.981182345072293e-06, + "loss": 0.83565545, + "num_input_tokens_seen": 25648330, + "router_z_loss_clip": 2.91210938, + "router_z_loss_mlp": 0.35375977, + "step": 1204, + "time_per_iteration": 4.305245637893677 + }, + { + "auxiliary_loss_clip": 0.01737967, + "auxiliary_loss_mlp": 0.01097466, + "balance_loss_clip": 1.45582771, + "balance_loss_mlp": 1.0603447, + "epoch": 0.07244851946490305, + "flos": 28303878816000.0, + "grad_norm": 1.605170742336957, + "language_loss": 0.83086634, + "learning_rate": 3.981129007961593e-06, + "loss": 0.85922068, + "num_input_tokens_seen": 25669470, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.37109375, + "step": 1205, + "time_per_iteration": 4.344234943389893 + }, + { + "auxiliary_loss_clip": 0.0175286, + "auxiliary_loss_mlp": 0.01081941, + "balance_loss_clip": 1.46489418, + "balance_loss_mlp": 1.04536748, + "epoch": 0.07250864271757101, + "flos": 22575000349440.0, + "grad_norm": 1.8510455252659823, + "language_loss": 0.78052974, + "learning_rate": 3.981075595726283e-06, + "loss": 0.80887783, + "num_input_tokens_seen": 25690470, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.36572266, + "step": 1206, + "time_per_iteration": 2.927222728729248 + }, + { + "auxiliary_loss_clip": 0.0173858, + "auxiliary_loss_mlp": 0.01071468, + "balance_loss_clip": 1.45540977, + "balance_loss_mlp": 1.03475118, + "epoch": 0.072568765970239, + "flos": 21772436503680.0, + "grad_norm": 2.0375811785618083, + "language_loss": 0.79009116, + "learning_rate": 3.981022108368387e-06, + "loss": 0.81819159, + "num_input_tokens_seen": 25709205, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.3671875, + "step": 1207, + "time_per_iteration": 2.98776912689209 + }, + { + "auxiliary_loss_clip": 0.01725553, + "auxiliary_loss_mlp": 0.01074397, + "balance_loss_clip": 1.44766903, + "balance_loss_mlp": 1.03863418, + "epoch": 0.07262888922290696, + "flos": 25530517653120.0, + "grad_norm": 1.6976473278794113, + "language_loss": 0.81308889, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.84108835, + "num_input_tokens_seen": 25728485, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.35766602, + "step": 1208, + "time_per_iteration": 2.9281349182128906 + }, + { + "auxiliary_loss_clip": 0.01737577, + "auxiliary_loss_mlp": 0.01074648, + "balance_loss_clip": 1.45709348, + "balance_loss_mlp": 1.04129362, + "epoch": 0.07268901247557492, + "flos": 21255132994560.0, + "grad_norm": 1.810228870763851, + "language_loss": 0.79778206, + "learning_rate": 3.980914908292955e-06, + "loss": 0.82590431, + "num_input_tokens_seen": 25747730, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.33374023, + "step": 1209, + "time_per_iteration": 2.9338574409484863 + }, + { + "auxiliary_loss_clip": 0.01752962, + "auxiliary_loss_mlp": 0.01065357, + "balance_loss_clip": 1.46704757, + "balance_loss_mlp": 1.0298562, + "epoch": 0.0727491357282429, + "flos": 25489091399040.0, + "grad_norm": 2.3990415635268616, + "language_loss": 0.83185416, + "learning_rate": 3.980861195579486e-06, + "loss": 0.86003739, + "num_input_tokens_seen": 25768050, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.35498047, + "step": 1210, + "time_per_iteration": 3.1589083671569824 + }, + { + "auxiliary_loss_clip": 0.01739507, + "auxiliary_loss_mlp": 0.01082434, + "balance_loss_clip": 1.45913279, + "balance_loss_mlp": 1.0426662, + "epoch": 0.07280925898091087, + "flos": 24472853769600.0, + "grad_norm": 1.6588875731584751, + "language_loss": 0.86237931, + "learning_rate": 3.98080740775156e-06, + "loss": 0.89059877, + "num_input_tokens_seen": 25787985, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.3972168, + "step": 1211, + "time_per_iteration": 2.910682201385498 + }, + { + "auxiliary_loss_clip": 0.01737098, + "auxiliary_loss_mlp": 0.01066662, + "balance_loss_clip": 1.45601916, + "balance_loss_mlp": 1.02939737, + "epoch": 0.07286938223357883, + "flos": 18295226945280.0, + "grad_norm": 2.1028432800332655, + "language_loss": 0.93129295, + "learning_rate": 3.98075354481122e-06, + "loss": 0.95933056, + "num_input_tokens_seen": 25803620, + "router_z_loss_clip": 2.80859375, + "router_z_loss_mlp": 0.37280273, + "step": 1212, + "time_per_iteration": 2.9579455852508545 + }, + { + "auxiliary_loss_clip": 0.0173523, + "auxiliary_loss_mlp": 0.01061275, + "balance_loss_clip": 1.45369434, + "balance_loss_mlp": 1.02501178, + "epoch": 0.07292950548624681, + "flos": 21224610737280.0, + "grad_norm": 1.7221741431888318, + "language_loss": 0.74217641, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.77014148, + "num_input_tokens_seen": 25823315, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.36206055, + "step": 1213, + "time_per_iteration": 2.8672401905059814 + }, + { + "auxiliary_loss_clip": 0.0174543, + "auxiliary_loss_mlp": 0.01070239, + "balance_loss_clip": 1.4595561, + "balance_loss_mlp": 1.03171074, + "epoch": 0.07298962873891478, + "flos": 24652068998400.0, + "grad_norm": 1.6503147979080937, + "language_loss": 0.86349249, + "learning_rate": 3.980645593601465e-06, + "loss": 0.89164913, + "num_input_tokens_seen": 25842605, + "router_z_loss_clip": 2.85742188, + "router_z_loss_mlp": 0.38525391, + "step": 1214, + "time_per_iteration": 2.940476417541504 + }, + { + "auxiliary_loss_clip": 0.0174297, + "auxiliary_loss_mlp": 0.0107418, + "balance_loss_clip": 1.46085334, + "balance_loss_mlp": 1.0365572, + "epoch": 0.07304975199158274, + "flos": 27064149281280.0, + "grad_norm": 2.9535399495248567, + "language_loss": 0.86468691, + "learning_rate": 3.980591505336144e-06, + "loss": 0.89285845, + "num_input_tokens_seen": 25863030, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.3762207, + "step": 1215, + "time_per_iteration": 2.998314619064331 + }, + { + "auxiliary_loss_clip": 0.01734469, + "auxiliary_loss_mlp": 0.0106546, + "balance_loss_clip": 1.44905901, + "balance_loss_mlp": 1.02716994, + "epoch": 0.07310987524425071, + "flos": 33563214299520.0, + "grad_norm": 1.6807280439957868, + "language_loss": 0.83393174, + "learning_rate": 3.980537341966595e-06, + "loss": 0.86193103, + "num_input_tokens_seen": 25888015, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.38330078, + "step": 1216, + "time_per_iteration": 2.9751698970794678 + }, + { + "auxiliary_loss_clip": 0.01765809, + "auxiliary_loss_mlp": 0.01065848, + "balance_loss_clip": 1.47734332, + "balance_loss_mlp": 1.02829695, + "epoch": 0.07316999849691869, + "flos": 28122627571200.0, + "grad_norm": 1.9834005684413027, + "language_loss": 0.78346801, + "learning_rate": 3.980483103494872e-06, + "loss": 0.81178463, + "num_input_tokens_seen": 25908660, + "router_z_loss_clip": 2.88085938, + "router_z_loss_mlp": 0.37573242, + "step": 1217, + "time_per_iteration": 2.966101884841919 + }, + { + "auxiliary_loss_clip": 0.01742526, + "auxiliary_loss_mlp": 0.01074209, + "balance_loss_clip": 1.46050537, + "balance_loss_mlp": 1.03706324, + "epoch": 0.07323012174958665, + "flos": 14400804654720.0, + "grad_norm": 1.8600213996681079, + "language_loss": 0.87973988, + "learning_rate": 3.98042878992303e-06, + "loss": 0.90790719, + "num_input_tokens_seen": 25927215, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.37133789, + "step": 1218, + "time_per_iteration": 2.865882396697998 + }, + { + "auxiliary_loss_clip": 0.01733777, + "auxiliary_loss_mlp": 0.01067971, + "balance_loss_clip": 1.45329738, + "balance_loss_mlp": 1.02960896, + "epoch": 0.07329024500225462, + "flos": 21626412975360.0, + "grad_norm": 1.9651097117463758, + "language_loss": 0.88450789, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.9125253, + "num_input_tokens_seen": 25945500, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.3840332, + "step": 1219, + "time_per_iteration": 2.907116651535034 + }, + { + "auxiliary_loss_clip": 0.01736842, + "auxiliary_loss_mlp": 0.0107386, + "balance_loss_clip": 1.4583354, + "balance_loss_mlp": 1.03564191, + "epoch": 0.0733503682549226, + "flos": 13231802021760.0, + "grad_norm": 1.9035129675019191, + "language_loss": 0.87007231, + "learning_rate": 3.980319937487235e-06, + "loss": 0.89817929, + "num_input_tokens_seen": 25963105, + "router_z_loss_clip": 2.7890625, + "router_z_loss_mlp": 0.38232422, + "step": 1220, + "time_per_iteration": 2.8809688091278076 + }, + { + "auxiliary_loss_clip": 0.01744602, + "auxiliary_loss_mlp": 0.01074062, + "balance_loss_clip": 1.46072721, + "balance_loss_mlp": 1.0339129, + "epoch": 0.07341049150759056, + "flos": 20896838271360.0, + "grad_norm": 2.0663208744488446, + "language_loss": 0.8005898, + "learning_rate": 3.98026539862741e-06, + "loss": 0.82877648, + "num_input_tokens_seen": 25981690, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.40112305, + "step": 1221, + "time_per_iteration": 2.8785417079925537 + }, + { + "auxiliary_loss_clip": 0.01738903, + "auxiliary_loss_mlp": 0.0106669, + "balance_loss_clip": 1.45837414, + "balance_loss_mlp": 1.02961588, + "epoch": 0.07347061476025853, + "flos": 15421928722560.0, + "grad_norm": 1.8746087726504528, + "language_loss": 0.93722951, + "learning_rate": 3.980210784675722e-06, + "loss": 0.96528542, + "num_input_tokens_seen": 25999890, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.37109375, + "step": 1222, + "time_per_iteration": 2.96284556388855 + }, + { + "auxiliary_loss_clip": 0.01739344, + "auxiliary_loss_mlp": 0.01069028, + "balance_loss_clip": 1.45512319, + "balance_loss_mlp": 1.02928352, + "epoch": 0.0735307380129265, + "flos": 11116202785920.0, + "grad_norm": 2.9634330438431626, + "language_loss": 0.9308179, + "learning_rate": 3.980156095634242e-06, + "loss": 0.95890158, + "num_input_tokens_seen": 26016445, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.3972168, + "step": 1223, + "time_per_iteration": 2.846430540084839 + }, + { + "auxiliary_loss_clip": 0.01750781, + "auxiliary_loss_mlp": 0.01075169, + "balance_loss_clip": 1.46650648, + "balance_loss_mlp": 1.03673625, + "epoch": 0.07359086126559447, + "flos": 23742871862400.0, + "grad_norm": 2.413643272568145, + "language_loss": 0.83740896, + "learning_rate": 3.980101331505045e-06, + "loss": 0.86566842, + "num_input_tokens_seen": 26036080, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.3840332, + "step": 1224, + "time_per_iteration": 3.008087396621704 + }, + { + "auxiliary_loss_clip": 0.01734798, + "auxiliary_loss_mlp": 0.01075699, + "balance_loss_clip": 1.45144153, + "balance_loss_mlp": 1.03512013, + "epoch": 0.07365098451826244, + "flos": 21002340441600.0, + "grad_norm": 2.083858182567427, + "language_loss": 0.85000324, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.87810826, + "num_input_tokens_seen": 26055805, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.40600586, + "step": 1225, + "time_per_iteration": 3.069096326828003 + }, + { + "auxiliary_loss_clip": 0.01737809, + "auxiliary_loss_mlp": 0.01069145, + "balance_loss_clip": 1.45951688, + "balance_loss_mlp": 1.03021157, + "epoch": 0.0737111077709304, + "flos": 19941961870080.0, + "grad_norm": 2.0975455494419983, + "language_loss": 0.92384607, + "learning_rate": 3.979991577991808e-06, + "loss": 0.95191562, + "num_input_tokens_seen": 26073905, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.38916016, + "step": 1226, + "time_per_iteration": 2.9532361030578613 + }, + { + "auxiliary_loss_clip": 0.01782014, + "auxiliary_loss_mlp": 0.01073972, + "balance_loss_clip": 1.48533571, + "balance_loss_mlp": 1.03451407, + "epoch": 0.07377123102359838, + "flos": 16590252683520.0, + "grad_norm": 2.5088905806470616, + "language_loss": 0.79429889, + "learning_rate": 3.97993658861193e-06, + "loss": 0.82285869, + "num_input_tokens_seen": 26091700, + "router_z_loss_clip": 2.96289062, + "router_z_loss_mlp": 0.39453125, + "step": 1227, + "time_per_iteration": 3.056983470916748 + }, + { + "auxiliary_loss_clip": 0.01740669, + "auxiliary_loss_mlp": 0.01070421, + "balance_loss_clip": 1.46197307, + "balance_loss_mlp": 1.03000879, + "epoch": 0.07383135427626634, + "flos": 28339966183680.0, + "grad_norm": 2.169365393380425, + "language_loss": 0.86905336, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.89716423, + "num_input_tokens_seen": 26114105, + "router_z_loss_clip": 2.7890625, + "router_z_loss_mlp": 0.40332031, + "step": 1228, + "time_per_iteration": 2.9955191612243652 + }, + { + "auxiliary_loss_clip": 0.01748338, + "auxiliary_loss_mlp": 0.01073462, + "balance_loss_clip": 1.46644998, + "balance_loss_mlp": 1.03042769, + "epoch": 0.07389147752893431, + "flos": 20056648734720.0, + "grad_norm": 4.268344981787953, + "language_loss": 0.81027192, + "learning_rate": 3.97982638461608e-06, + "loss": 0.83848989, + "num_input_tokens_seen": 26131165, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.4296875, + "step": 1229, + "time_per_iteration": 2.953364133834839 + }, + { + "auxiliary_loss_clip": 0.01758347, + "auxiliary_loss_mlp": 0.01074346, + "balance_loss_clip": 1.47193432, + "balance_loss_mlp": 1.03350449, + "epoch": 0.07395160078160229, + "flos": 18123296129280.0, + "grad_norm": 1.9638302560810479, + "language_loss": 0.80922222, + "learning_rate": 3.979771170004287e-06, + "loss": 0.83754921, + "num_input_tokens_seen": 26150040, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.40844727, + "step": 1230, + "time_per_iteration": 2.9035606384277344 + }, + { + "auxiliary_loss_clip": 0.01734878, + "auxiliary_loss_mlp": 0.01070906, + "balance_loss_clip": 1.457165, + "balance_loss_mlp": 1.03197217, + "epoch": 0.07401172403427025, + "flos": 23597481761280.0, + "grad_norm": 2.239313452335804, + "language_loss": 0.83430797, + "learning_rate": 3.979715880319372e-06, + "loss": 0.86236578, + "num_input_tokens_seen": 26169380, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.3894043, + "step": 1231, + "time_per_iteration": 2.9736645221710205 + }, + { + "auxiliary_loss_clip": 0.01749904, + "auxiliary_loss_mlp": 0.01075794, + "balance_loss_clip": 1.46252811, + "balance_loss_mlp": 1.0319488, + "epoch": 0.07407184728693822, + "flos": 26371204882560.0, + "grad_norm": 2.121473087369066, + "language_loss": 0.97927123, + "learning_rate": 3.979660515563434e-06, + "loss": 1.00752819, + "num_input_tokens_seen": 26189420, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.43823242, + "step": 1232, + "time_per_iteration": 3.0363388061523438 + }, + { + "auxiliary_loss_clip": 0.01740952, + "auxiliary_loss_mlp": 0.01071942, + "balance_loss_clip": 1.46368349, + "balance_loss_mlp": 1.03391445, + "epoch": 0.0741319705396062, + "flos": 22210642823040.0, + "grad_norm": 2.9429534614298607, + "language_loss": 0.83075738, + "learning_rate": 3.979605075738569e-06, + "loss": 0.85888624, + "num_input_tokens_seen": 26209300, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.37988281, + "step": 1233, + "time_per_iteration": 4.419411897659302 + }, + { + "auxiliary_loss_clip": 0.01753128, + "auxiliary_loss_mlp": 0.01077026, + "balance_loss_clip": 1.46535671, + "balance_loss_mlp": 1.03539777, + "epoch": 0.07419209379227416, + "flos": 39214488654720.0, + "grad_norm": 2.1884629755441427, + "language_loss": 0.72009736, + "learning_rate": 3.979549560846883e-06, + "loss": 0.7483989, + "num_input_tokens_seen": 26228110, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.41625977, + "step": 1234, + "time_per_iteration": 3.118805170059204 + }, + { + "auxiliary_loss_clip": 0.01753646, + "auxiliary_loss_mlp": 0.01077283, + "balance_loss_clip": 1.47166014, + "balance_loss_mlp": 1.03815866, + "epoch": 0.07425221704494213, + "flos": 22791343576320.0, + "grad_norm": 2.196036768049439, + "language_loss": 0.78298187, + "learning_rate": 3.979493970890478e-06, + "loss": 0.81129116, + "num_input_tokens_seen": 26247020, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.39135742, + "step": 1235, + "time_per_iteration": 2.968195915222168 + }, + { + "auxiliary_loss_clip": 0.01744384, + "auxiliary_loss_mlp": 0.01062963, + "balance_loss_clip": 1.4673667, + "balance_loss_mlp": 1.02541161, + "epoch": 0.0743123402976101, + "flos": 22283089027200.0, + "grad_norm": 2.043937499821166, + "language_loss": 0.84789968, + "learning_rate": 3.979438305871464e-06, + "loss": 0.87597317, + "num_input_tokens_seen": 26265750, + "router_z_loss_clip": 2.76757812, + "router_z_loss_mlp": 0.37548828, + "step": 1236, + "time_per_iteration": 2.902275562286377 + }, + { + "auxiliary_loss_clip": 0.01759652, + "auxiliary_loss_mlp": 0.01068365, + "balance_loss_clip": 1.47450292, + "balance_loss_mlp": 1.02890694, + "epoch": 0.07437246355027807, + "flos": 29327038899840.0, + "grad_norm": 2.538933912720982, + "language_loss": 0.77946121, + "learning_rate": 3.979382565791951e-06, + "loss": 0.8077414, + "num_input_tokens_seen": 26287905, + "router_z_loss_clip": 2.85351562, + "router_z_loss_mlp": 0.39453125, + "step": 1237, + "time_per_iteration": 2.94968318939209 + }, + { + "auxiliary_loss_clip": 0.01752334, + "auxiliary_loss_mlp": 0.0106996, + "balance_loss_clip": 1.47003424, + "balance_loss_mlp": 1.032552, + "epoch": 0.07443258680294604, + "flos": 31958131852800.0, + "grad_norm": 1.7857787820615114, + "language_loss": 0.78460157, + "learning_rate": 3.979326750654053e-06, + "loss": 0.81282449, + "num_input_tokens_seen": 26311795, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.37402344, + "step": 1238, + "time_per_iteration": 4.414613962173462 + }, + { + "auxiliary_loss_clip": 0.01770746, + "auxiliary_loss_mlp": 0.01069184, + "balance_loss_clip": 1.48345912, + "balance_loss_mlp": 1.02784216, + "epoch": 0.074492710055614, + "flos": 22685750916480.0, + "grad_norm": 2.0565679634515157, + "language_loss": 0.88199615, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.9103955, + "num_input_tokens_seen": 26330330, + "router_z_loss_clip": 2.87304688, + "router_z_loss_mlp": 0.41308594, + "step": 1239, + "time_per_iteration": 5.773285627365112 + }, + { + "auxiliary_loss_clip": 0.0176868, + "auxiliary_loss_mlp": 0.01063593, + "balance_loss_clip": 1.48096228, + "balance_loss_mlp": 1.0236814, + "epoch": 0.07455283330828198, + "flos": 21293980295040.0, + "grad_norm": 4.184757193333043, + "language_loss": 0.90643001, + "learning_rate": 3.979214895211569e-06, + "loss": 0.9347527, + "num_input_tokens_seen": 26348865, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.39868164, + "step": 1240, + "time_per_iteration": 2.9377450942993164 + }, + { + "auxiliary_loss_clip": 0.01760012, + "auxiliary_loss_mlp": 0.01071419, + "balance_loss_clip": 1.47619224, + "balance_loss_mlp": 1.03012514, + "epoch": 0.07461295656094995, + "flos": 24398869242240.0, + "grad_norm": 2.3221089535548267, + "language_loss": 0.90357721, + "learning_rate": 3.979158854911225e-06, + "loss": 0.93189156, + "num_input_tokens_seen": 26368210, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.4128418, + "step": 1241, + "time_per_iteration": 2.914361000061035 + }, + { + "auxiliary_loss_clip": 0.01471498, + "auxiliary_loss_mlp": 0.01032675, + "balance_loss_clip": 1.29809165, + "balance_loss_mlp": 1.00883317, + "epoch": 0.07467307981361791, + "flos": 62138074078080.0, + "grad_norm": 0.9091378248313787, + "language_loss": 0.6314882, + "learning_rate": 3.979102739560979e-06, + "loss": 0.6565299, + "num_input_tokens_seen": 26424890, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.23828125, + "step": 1242, + "time_per_iteration": 3.4046709537506104 + }, + { + "auxiliary_loss_clip": 0.01804198, + "auxiliary_loss_mlp": 0.01076023, + "balance_loss_clip": 1.5040946, + "balance_loss_mlp": 1.03546774, + "epoch": 0.07473320306628589, + "flos": 24873841601280.0, + "grad_norm": 2.2085478184233978, + "language_loss": 0.64503086, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.67383301, + "num_input_tokens_seen": 26446405, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.4050293, + "step": 1243, + "time_per_iteration": 2.9248616695404053 + }, + { + "auxiliary_loss_clip": 0.0176037, + "auxiliary_loss_mlp": 0.01082033, + "balance_loss_clip": 1.47819281, + "balance_loss_mlp": 1.0439105, + "epoch": 0.07479332631895386, + "flos": 24907576239360.0, + "grad_norm": 1.848427621321491, + "language_loss": 0.78092623, + "learning_rate": 3.978990283719296e-06, + "loss": 0.80935025, + "num_input_tokens_seen": 26466070, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.38110352, + "step": 1244, + "time_per_iteration": 2.9813613891601562 + }, + { + "auxiliary_loss_clip": 0.01761704, + "auxiliary_loss_mlp": 0.01068048, + "balance_loss_clip": 1.47839069, + "balance_loss_mlp": 1.03083086, + "epoch": 0.07485344957162182, + "flos": 17822516826240.0, + "grad_norm": 2.768903890669495, + "language_loss": 0.71654117, + "learning_rate": 3.978933943232123e-06, + "loss": 0.74483871, + "num_input_tokens_seen": 26479350, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.37207031, + "step": 1245, + "time_per_iteration": 2.8912391662597656 + }, + { + "auxiliary_loss_clip": 0.01757208, + "auxiliary_loss_mlp": 0.01073239, + "balance_loss_clip": 1.47419667, + "balance_loss_mlp": 1.036165, + "epoch": 0.0749135728242898, + "flos": 25021222473600.0, + "grad_norm": 2.106114171054669, + "language_loss": 0.89950359, + "learning_rate": 3.978877527703576e-06, + "loss": 0.92780805, + "num_input_tokens_seen": 26498255, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.37109375, + "step": 1246, + "time_per_iteration": 2.88694429397583 + }, + { + "auxiliary_loss_clip": 0.01799153, + "auxiliary_loss_mlp": 0.01087021, + "balance_loss_clip": 1.49760079, + "balance_loss_mlp": 1.04429698, + "epoch": 0.07497369607695777, + "flos": 17831113338240.0, + "grad_norm": 2.5269229169624863, + "language_loss": 0.91223234, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.94109404, + "num_input_tokens_seen": 26515375, + "router_z_loss_clip": 3.01367188, + "router_z_loss_mlp": 0.42700195, + "step": 1247, + "time_per_iteration": 2.894378185272217 + }, + { + "auxiliary_loss_clip": 0.01755564, + "auxiliary_loss_mlp": 0.01080991, + "balance_loss_clip": 1.47535264, + "balance_loss_mlp": 1.04398882, + "epoch": 0.07503381932962573, + "flos": 15129157749120.0, + "grad_norm": 2.722098938491771, + "language_loss": 0.66454566, + "learning_rate": 3.978764471530921e-06, + "loss": 0.69291121, + "num_input_tokens_seen": 26533595, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.36987305, + "step": 1248, + "time_per_iteration": 2.887066125869751 + }, + { + "auxiliary_loss_clip": 0.01729293, + "auxiliary_loss_mlp": 0.01079229, + "balance_loss_clip": 1.45595884, + "balance_loss_mlp": 1.04189301, + "epoch": 0.0750939425822937, + "flos": 12822715370880.0, + "grad_norm": 2.2795683105930493, + "language_loss": 0.75825602, + "learning_rate": 3.978707830891102e-06, + "loss": 0.78634125, + "num_input_tokens_seen": 26549405, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.37353516, + "step": 1249, + "time_per_iteration": 2.9188787937164307 + }, + { + "auxiliary_loss_clip": 0.01769071, + "auxiliary_loss_mlp": 0.01087395, + "balance_loss_clip": 1.47671127, + "balance_loss_mlp": 1.05127466, + "epoch": 0.07515406583496168, + "flos": 24217482263040.0, + "grad_norm": 2.630988987744165, + "language_loss": 0.84280241, + "learning_rate": 3.978651115218482e-06, + "loss": 0.87136704, + "num_input_tokens_seen": 26567200, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.36132812, + "step": 1250, + "time_per_iteration": 2.8844637870788574 + }, + { + "auxiliary_loss_clip": 0.01753261, + "auxiliary_loss_mlp": 0.01078826, + "balance_loss_clip": 1.47415257, + "balance_loss_mlp": 1.04094076, + "epoch": 0.07521418908762964, + "flos": 26699520286080.0, + "grad_norm": 2.229562137137637, + "language_loss": 0.69622493, + "learning_rate": 3.978594324515215e-06, + "loss": 0.72454578, + "num_input_tokens_seen": 26586190, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.37866211, + "step": 1251, + "time_per_iteration": 3.007110834121704 + }, + { + "auxiliary_loss_clip": 0.01461099, + "auxiliary_loss_mlp": 0.01108392, + "balance_loss_clip": 1.29167378, + "balance_loss_mlp": 1.05956399, + "epoch": 0.0752743123402976, + "flos": 59126353453440.0, + "grad_norm": 0.9354610690679419, + "language_loss": 0.70639718, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.73209214, + "num_input_tokens_seen": 26650710, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.48828125, + "step": 1252, + "time_per_iteration": 3.478961944580078 + }, + { + "auxiliary_loss_clip": 0.01749889, + "auxiliary_loss_mlp": 0.01084817, + "balance_loss_clip": 1.46743941, + "balance_loss_mlp": 1.0469805, + "epoch": 0.07533443559296558, + "flos": 23487455111040.0, + "grad_norm": 2.2850336638375666, + "language_loss": 0.8139683, + "learning_rate": 3.97848051802535e-06, + "loss": 0.84231532, + "num_input_tokens_seen": 26669000, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.37841797, + "step": 1253, + "time_per_iteration": 2.8962042331695557 + }, + { + "auxiliary_loss_clip": 0.01742657, + "auxiliary_loss_mlp": 0.01069823, + "balance_loss_clip": 1.4563272, + "balance_loss_mlp": 1.03594351, + "epoch": 0.07539455884563355, + "flos": 20886929660160.0, + "grad_norm": 2.3159823642241304, + "language_loss": 0.96008432, + "learning_rate": 3.978423502243069e-06, + "loss": 0.98820925, + "num_input_tokens_seen": 26683075, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.33911133, + "step": 1254, + "time_per_iteration": 2.885986566543579 + }, + { + "auxiliary_loss_clip": 0.01736942, + "auxiliary_loss_mlp": 0.01074947, + "balance_loss_clip": 1.4607358, + "balance_loss_mlp": 1.03713417, + "epoch": 0.07545468209830151, + "flos": 27684421251840.0, + "grad_norm": 1.9277262708035223, + "language_loss": 0.8913976, + "learning_rate": 3.97836641143877e-06, + "loss": 0.91951644, + "num_input_tokens_seen": 26701875, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.37792969, + "step": 1255, + "time_per_iteration": 2.951539993286133 + }, + { + "auxiliary_loss_clip": 0.01739936, + "auxiliary_loss_mlp": 0.0107543, + "balance_loss_clip": 1.4618721, + "balance_loss_mlp": 1.03821325, + "epoch": 0.0755148053509695, + "flos": 14145071189760.0, + "grad_norm": 1.7257814108298783, + "language_loss": 0.80488253, + "learning_rate": 3.978309245614618e-06, + "loss": 0.83303618, + "num_input_tokens_seen": 26719050, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.37182617, + "step": 1256, + "time_per_iteration": 2.865190267562866 + }, + { + "auxiliary_loss_clip": 0.01459123, + "auxiliary_loss_mlp": 0.01046295, + "balance_loss_clip": 1.2852397, + "balance_loss_mlp": 1.00299823, + "epoch": 0.07557492860363746, + "flos": 58260872073600.0, + "grad_norm": 0.7844914671057381, + "language_loss": 0.58185482, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.60690892, + "num_input_tokens_seen": 26780650, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.43359375, + "step": 1257, + "time_per_iteration": 3.495898485183716 + }, + { + "auxiliary_loss_clip": 0.01757087, + "auxiliary_loss_mlp": 0.01084338, + "balance_loss_clip": 1.47047102, + "balance_loss_mlp": 1.04228067, + "epoch": 0.07563505185630542, + "flos": 24655055155200.0, + "grad_norm": 2.47743322287645, + "language_loss": 0.91487241, + "learning_rate": 3.978194688915432e-06, + "loss": 0.94328666, + "num_input_tokens_seen": 26798725, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.4206543, + "step": 1258, + "time_per_iteration": 2.9163155555725098 + }, + { + "auxiliary_loss_clip": 0.01718934, + "auxiliary_loss_mlp": 0.01070764, + "balance_loss_clip": 1.45128655, + "balance_loss_mlp": 1.03433347, + "epoch": 0.07569517510897339, + "flos": 15531321945600.0, + "grad_norm": 1.9483982590610465, + "language_loss": 0.82698214, + "learning_rate": 3.978137298044741e-06, + "loss": 0.85487908, + "num_input_tokens_seen": 26817005, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.36425781, + "step": 1259, + "time_per_iteration": 2.9140992164611816 + }, + { + "auxiliary_loss_clip": 0.01746552, + "auxiliary_loss_mlp": 0.01069834, + "balance_loss_clip": 1.46700418, + "balance_loss_mlp": 1.03516769, + "epoch": 0.07575529836164137, + "flos": 22938498224640.0, + "grad_norm": 1.7480143636119096, + "language_loss": 0.77179372, + "learning_rate": 3.978079832162885e-06, + "loss": 0.79995763, + "num_input_tokens_seen": 26836655, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.34667969, + "step": 1260, + "time_per_iteration": 2.9436309337615967 + }, + { + "auxiliary_loss_clip": 0.01740319, + "auxiliary_loss_mlp": 0.0107198, + "balance_loss_clip": 1.46633804, + "balance_loss_mlp": 1.03478646, + "epoch": 0.07581542161430933, + "flos": 19509908843520.0, + "grad_norm": 1.7031594956235663, + "language_loss": 0.86320269, + "learning_rate": 3.978022291272044e-06, + "loss": 0.89132571, + "num_input_tokens_seen": 26854925, + "router_z_loss_clip": 2.74023438, + "router_z_loss_mlp": 0.37182617, + "step": 1261, + "time_per_iteration": 2.888699769973755 + }, + { + "auxiliary_loss_clip": 0.01733815, + "auxiliary_loss_mlp": 0.01070146, + "balance_loss_clip": 1.45722914, + "balance_loss_mlp": 1.03543234, + "epoch": 0.0758755448669773, + "flos": 24984411189120.0, + "grad_norm": 2.0589400484844913, + "language_loss": 0.84320378, + "learning_rate": 3.977964675374399e-06, + "loss": 0.87124348, + "num_input_tokens_seen": 26876170, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.34716797, + "step": 1262, + "time_per_iteration": 2.9670019149780273 + }, + { + "auxiliary_loss_clip": 0.01732578, + "auxiliary_loss_mlp": 0.01080833, + "balance_loss_clip": 1.45250809, + "balance_loss_mlp": 1.04268634, + "epoch": 0.07593566811964528, + "flos": 22758378099840.0, + "grad_norm": 2.6645614476911392, + "language_loss": 0.84352511, + "learning_rate": 3.977906984472136e-06, + "loss": 0.87165928, + "num_input_tokens_seen": 26895005, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.3815918, + "step": 1263, + "time_per_iteration": 2.9595067501068115 + }, + { + "auxiliary_loss_clip": 0.01722591, + "auxiliary_loss_mlp": 0.01065261, + "balance_loss_clip": 1.44611859, + "balance_loss_mlp": 1.03073788, + "epoch": 0.07599579137231324, + "flos": 23122780871040.0, + "grad_norm": 2.238079142121044, + "language_loss": 0.77217853, + "learning_rate": 3.977849218567442e-06, + "loss": 0.80005705, + "num_input_tokens_seen": 26913930, + "router_z_loss_clip": 2.76367188, + "router_z_loss_mlp": 0.34521484, + "step": 1264, + "time_per_iteration": 2.996696710586548 + }, + { + "auxiliary_loss_clip": 0.01730685, + "auxiliary_loss_mlp": 0.01065692, + "balance_loss_clip": 1.45036972, + "balance_loss_mlp": 1.02818918, + "epoch": 0.07605591462498121, + "flos": 14510288367360.0, + "grad_norm": 2.343877725059831, + "language_loss": 0.8303113, + "learning_rate": 3.977791377662507e-06, + "loss": 0.85827506, + "num_input_tokens_seen": 26931485, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.375, + "step": 1265, + "time_per_iteration": 2.968015670776367 + }, + { + "auxiliary_loss_clip": 0.01741248, + "auxiliary_loss_mlp": 0.01071394, + "balance_loss_clip": 1.45523226, + "balance_loss_mlp": 1.03415275, + "epoch": 0.07611603787764919, + "flos": 23524809333120.0, + "grad_norm": 1.8358103372153176, + "language_loss": 0.66649503, + "learning_rate": 3.977733461759524e-06, + "loss": 0.69462144, + "num_input_tokens_seen": 26951670, + "router_z_loss_clip": 2.86328125, + "router_z_loss_mlp": 0.37207031, + "step": 1266, + "time_per_iteration": 3.065746307373047 + }, + { + "auxiliary_loss_clip": 0.0173546, + "auxiliary_loss_mlp": 0.01068197, + "balance_loss_clip": 1.45270658, + "balance_loss_mlp": 1.03396034, + "epoch": 0.07617616113031715, + "flos": 21517200731520.0, + "grad_norm": 2.486078464385837, + "language_loss": 0.82681119, + "learning_rate": 3.977675470860691e-06, + "loss": 0.85484773, + "num_input_tokens_seen": 26970335, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.34228516, + "step": 1267, + "time_per_iteration": 3.2914657592773438 + }, + { + "auxiliary_loss_clip": 0.01718345, + "auxiliary_loss_mlp": 0.01060197, + "balance_loss_clip": 1.44293714, + "balance_loss_mlp": 1.02443433, + "epoch": 0.07623628438298512, + "flos": 14580924779520.0, + "grad_norm": 2.053217755541146, + "language_loss": 0.7529521, + "learning_rate": 3.977617404968205e-06, + "loss": 0.78073752, + "num_input_tokens_seen": 26986025, + "router_z_loss_clip": 2.75585938, + "router_z_loss_mlp": 0.35766602, + "step": 1268, + "time_per_iteration": 4.371596097946167 + }, + { + "auxiliary_loss_clip": 0.01716627, + "auxiliary_loss_mlp": 0.01059661, + "balance_loss_clip": 1.44232655, + "balance_loss_mlp": 1.02416086, + "epoch": 0.07629640763565308, + "flos": 14728531875840.0, + "grad_norm": 1.9508578651369524, + "language_loss": 0.84397286, + "learning_rate": 3.977559264084269e-06, + "loss": 0.87173569, + "num_input_tokens_seen": 27004045, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.35498047, + "step": 1269, + "time_per_iteration": 2.8916733264923096 + }, + { + "auxiliary_loss_clip": 0.01723385, + "auxiliary_loss_mlp": 0.01065655, + "balance_loss_clip": 1.4477725, + "balance_loss_mlp": 1.02815223, + "epoch": 0.07635653088832106, + "flos": 14911140464640.0, + "grad_norm": 2.1987140659948587, + "language_loss": 0.91458678, + "learning_rate": 3.977501048211088e-06, + "loss": 0.94247723, + "num_input_tokens_seen": 27022070, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.375, + "step": 1270, + "time_per_iteration": 3.068037509918213 + }, + { + "auxiliary_loss_clip": 0.01742019, + "auxiliary_loss_mlp": 0.01070811, + "balance_loss_clip": 1.46395993, + "balance_loss_mlp": 1.03037524, + "epoch": 0.07641665414098903, + "flos": 26662889980800.0, + "grad_norm": 4.5670719508048805, + "language_loss": 0.73055732, + "learning_rate": 3.977442757350869e-06, + "loss": 0.75868565, + "num_input_tokens_seen": 27041755, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.40454102, + "step": 1271, + "time_per_iteration": 3.007747173309326 + }, + { + "auxiliary_loss_clip": 0.01715276, + "auxiliary_loss_mlp": 0.01068738, + "balance_loss_clip": 1.45187306, + "balance_loss_mlp": 1.03297532, + "epoch": 0.07647677739365699, + "flos": 25203921552000.0, + "grad_norm": 1.5208492813461223, + "language_loss": 0.84197932, + "learning_rate": 3.977384391505823e-06, + "loss": 0.86981952, + "num_input_tokens_seen": 27061540, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.35717773, + "step": 1272, + "time_per_iteration": 2.9786059856414795 + }, + { + "auxiliary_loss_clip": 0.01721079, + "auxiliary_loss_mlp": 0.01068862, + "balance_loss_clip": 1.44679165, + "balance_loss_mlp": 1.03386235, + "epoch": 0.07653690064632497, + "flos": 20567798951040.0, + "grad_norm": 1.5991624553566999, + "language_loss": 0.82105732, + "learning_rate": 3.977325950678162e-06, + "loss": 0.8489567, + "num_input_tokens_seen": 27081395, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.35009766, + "step": 1273, + "time_per_iteration": 4.382061243057251 + }, + { + "auxiliary_loss_clip": 0.0174976, + "auxiliary_loss_mlp": 0.01075948, + "balance_loss_clip": 1.46820331, + "balance_loss_mlp": 1.04009008, + "epoch": 0.07659702389899294, + "flos": 22278474057600.0, + "grad_norm": 1.6334320278034284, + "language_loss": 0.8217749, + "learning_rate": 3.977267434870103e-06, + "loss": 0.85003197, + "num_input_tokens_seen": 27101175, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.35864258, + "step": 1274, + "time_per_iteration": 5.8802330493927 + }, + { + "auxiliary_loss_clip": 0.01727434, + "auxiliary_loss_mlp": 0.01082611, + "balance_loss_clip": 1.45567441, + "balance_loss_mlp": 1.0445354, + "epoch": 0.0766571471516609, + "flos": 32649221214720.0, + "grad_norm": 1.6915330171567986, + "language_loss": 0.73476589, + "learning_rate": 3.977208844083865e-06, + "loss": 0.76286638, + "num_input_tokens_seen": 27124505, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.38061523, + "step": 1275, + "time_per_iteration": 3.0718259811401367 + }, + { + "auxiliary_loss_clip": 0.01724252, + "auxiliary_loss_mlp": 0.01073045, + "balance_loss_clip": 1.44810486, + "balance_loss_mlp": 1.03759253, + "epoch": 0.07671727040432888, + "flos": 15275588480640.0, + "grad_norm": 1.9022017864873937, + "language_loss": 0.81663787, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.84461081, + "num_input_tokens_seen": 27140960, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.35473633, + "step": 1276, + "time_per_iteration": 2.8892822265625 + }, + { + "auxiliary_loss_clip": 0.0174845, + "auxiliary_loss_mlp": 0.01076411, + "balance_loss_clip": 1.46690893, + "balance_loss_mlp": 1.03938472, + "epoch": 0.07677739365699685, + "flos": 28195842936960.0, + "grad_norm": 2.2304626521020494, + "language_loss": 0.61748576, + "learning_rate": 3.97709143758574e-06, + "loss": 0.64573443, + "num_input_tokens_seen": 27160985, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.37011719, + "step": 1277, + "time_per_iteration": 3.034935712814331 + }, + { + "auxiliary_loss_clip": 0.01750936, + "auxiliary_loss_mlp": 0.01069643, + "balance_loss_clip": 1.4696182, + "balance_loss_mlp": 1.03166318, + "epoch": 0.07683751690966481, + "flos": 18305226046080.0, + "grad_norm": 2.576421489293144, + "language_loss": 0.77631617, + "learning_rate": 3.977032621878305e-06, + "loss": 0.80452204, + "num_input_tokens_seen": 27178390, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.37963867, + "step": 1278, + "time_per_iteration": 2.8928890228271484 + }, + { + "auxiliary_loss_clip": 0.01728835, + "auxiliary_loss_mlp": 0.01073396, + "balance_loss_clip": 1.45895433, + "balance_loss_mlp": 1.02938437, + "epoch": 0.07689764016233278, + "flos": 21991132460160.0, + "grad_norm": 2.0281555783937866, + "language_loss": 0.89704406, + "learning_rate": 3.976973731201596e-06, + "loss": 0.92506635, + "num_input_tokens_seen": 27197505, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.43994141, + "step": 1279, + "time_per_iteration": 2.9805259704589844 + }, + { + "auxiliary_loss_clip": 0.01725253, + "auxiliary_loss_mlp": 0.0106869, + "balance_loss_clip": 1.45405555, + "balance_loss_mlp": 1.033499, + "epoch": 0.07695776341500075, + "flos": 22245961029120.0, + "grad_norm": 2.470507607785178, + "language_loss": 0.84064806, + "learning_rate": 3.976914765557845e-06, + "loss": 0.86858743, + "num_input_tokens_seen": 27214260, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.35180664, + "step": 1280, + "time_per_iteration": 2.976846694946289 + }, + { + "auxiliary_loss_clip": 0.01726636, + "auxiliary_loss_mlp": 0.0107868, + "balance_loss_clip": 1.45712566, + "balance_loss_mlp": 1.04196382, + "epoch": 0.07701788666766872, + "flos": 16152363077760.0, + "grad_norm": 2.0441251798628914, + "language_loss": 0.77052355, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.79857671, + "num_input_tokens_seen": 27232525, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.36694336, + "step": 1281, + "time_per_iteration": 2.889867067337036 + }, + { + "auxiliary_loss_clip": 0.01746481, + "auxiliary_loss_mlp": 0.01088412, + "balance_loss_clip": 1.4643898, + "balance_loss_mlp": 1.05031228, + "epoch": 0.07707800992033668, + "flos": 19472283152640.0, + "grad_norm": 1.8313120072104785, + "language_loss": 0.7698704, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.79821932, + "num_input_tokens_seen": 27249800, + "router_z_loss_clip": 2.82617188, + "router_z_loss_mlp": 0.38134766, + "step": 1282, + "time_per_iteration": 2.919570207595825 + }, + { + "auxiliary_loss_clip": 0.01738537, + "auxiliary_loss_mlp": 0.01085777, + "balance_loss_clip": 1.4654814, + "balance_loss_mlp": 1.04734433, + "epoch": 0.07713813317300466, + "flos": 18999799257600.0, + "grad_norm": 2.1621741440844873, + "language_loss": 0.84758776, + "learning_rate": 3.976737418846713e-06, + "loss": 0.87583089, + "num_input_tokens_seen": 27268895, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.38452148, + "step": 1283, + "time_per_iteration": 2.8886008262634277 + }, + { + "auxiliary_loss_clip": 0.01754871, + "auxiliary_loss_mlp": 0.01075871, + "balance_loss_clip": 1.47956944, + "balance_loss_mlp": 1.03710413, + "epoch": 0.07719825642567263, + "flos": 18123069905280.0, + "grad_norm": 2.215603331025433, + "language_loss": 0.77059877, + "learning_rate": 3.976678153357181e-06, + "loss": 0.79890621, + "num_input_tokens_seen": 27288180, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.38793945, + "step": 1284, + "time_per_iteration": 2.924194574356079 + }, + { + "auxiliary_loss_clip": 0.01756304, + "auxiliary_loss_mlp": 0.01072524, + "balance_loss_clip": 1.48091066, + "balance_loss_mlp": 1.03425789, + "epoch": 0.0772583796783406, + "flos": 42209758154880.0, + "grad_norm": 2.898738287219596, + "language_loss": 0.77238631, + "learning_rate": 3.976618812911817e-06, + "loss": 0.80067456, + "num_input_tokens_seen": 27311815, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.3828125, + "step": 1285, + "time_per_iteration": 3.0994951725006104 + }, + { + "auxiliary_loss_clip": 0.01747339, + "auxiliary_loss_mlp": 0.01068938, + "balance_loss_clip": 1.47416735, + "balance_loss_mlp": 1.03029013, + "epoch": 0.07731850293100857, + "flos": 24764357888640.0, + "grad_norm": 1.951009346643708, + "language_loss": 0.85257119, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.88073397, + "num_input_tokens_seen": 27331890, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.38671875, + "step": 1286, + "time_per_iteration": 2.925128698348999 + }, + { + "auxiliary_loss_clip": 0.0178301, + "auxiliary_loss_mlp": 0.01066222, + "balance_loss_clip": 1.49531364, + "balance_loss_mlp": 1.02797949, + "epoch": 0.07737862618367654, + "flos": 17574655956480.0, + "grad_norm": 2.3008927117564912, + "language_loss": 0.79816276, + "learning_rate": 3.97649990716259e-06, + "loss": 0.82665509, + "num_input_tokens_seen": 27348320, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.38232422, + "step": 1287, + "time_per_iteration": 2.9392285346984863 + }, + { + "auxiliary_loss_clip": 0.01755652, + "auxiliary_loss_mlp": 0.0106919, + "balance_loss_clip": 1.48160517, + "balance_loss_mlp": 1.03070974, + "epoch": 0.0774387494363445, + "flos": 25637467656960.0, + "grad_norm": 1.5895200849943512, + "language_loss": 0.85740888, + "learning_rate": 3.976440341863237e-06, + "loss": 0.88565731, + "num_input_tokens_seen": 27367670, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.38476562, + "step": 1288, + "time_per_iteration": 2.9329679012298584 + }, + { + "auxiliary_loss_clip": 0.01807792, + "auxiliary_loss_mlp": 0.01070105, + "balance_loss_clip": 1.51997399, + "balance_loss_mlp": 1.02995539, + "epoch": 0.07749887268901248, + "flos": 12247489238400.0, + "grad_norm": 2.1979636540319674, + "language_loss": 0.88373482, + "learning_rate": 3.976380701617068e-06, + "loss": 0.91251385, + "num_input_tokens_seen": 27385485, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.40185547, + "step": 1289, + "time_per_iteration": 2.9131546020507812 + }, + { + "auxiliary_loss_clip": 0.01794165, + "auxiliary_loss_mlp": 0.01062036, + "balance_loss_clip": 1.51546896, + "balance_loss_mlp": 1.02336454, + "epoch": 0.07755899594168045, + "flos": 25092401823360.0, + "grad_norm": 1.608742435670239, + "language_loss": 0.86622036, + "learning_rate": 3.976320986426344e-06, + "loss": 0.8947823, + "num_input_tokens_seen": 27405110, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.38671875, + "step": 1290, + "time_per_iteration": 3.034775972366333 + }, + { + "auxiliary_loss_clip": 0.01793012, + "auxiliary_loss_mlp": 0.01079741, + "balance_loss_clip": 1.51886368, + "balance_loss_mlp": 1.04004455, + "epoch": 0.07761911919434841, + "flos": 14254509657600.0, + "grad_norm": 2.004659572921241, + "language_loss": 0.93054682, + "learning_rate": 3.9762611962933315e-06, + "loss": 0.95927441, + "num_input_tokens_seen": 27422855, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.39672852, + "step": 1291, + "time_per_iteration": 2.8811709880828857 + }, + { + "auxiliary_loss_clip": 0.01611301, + "auxiliary_loss_mlp": 0.01080905, + "balance_loss_clip": 1.43155122, + "balance_loss_mlp": 1.05744457, + "epoch": 0.07767924244701638, + "flos": 67272017662080.0, + "grad_norm": 0.9301485934613928, + "language_loss": 0.65147805, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.67840004, + "num_input_tokens_seen": 27487190, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.234375, + "step": 1292, + "time_per_iteration": 3.4918689727783203 + }, + { + "auxiliary_loss_clip": 0.01804499, + "auxiliary_loss_mlp": 0.01059589, + "balance_loss_clip": 1.52583015, + "balance_loss_mlp": 1.0242312, + "epoch": 0.07773936569968436, + "flos": 28562915151360.0, + "grad_norm": 1.8499176256455119, + "language_loss": 0.89070344, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.91934431, + "num_input_tokens_seen": 27510465, + "router_z_loss_clip": 2.7890625, + "router_z_loss_mlp": 0.35351562, + "step": 1293, + "time_per_iteration": 2.977987289428711 + }, + { + "auxiliary_loss_clip": 0.01828741, + "auxiliary_loss_mlp": 0.01067464, + "balance_loss_clip": 1.54834819, + "balance_loss_mlp": 1.02862573, + "epoch": 0.07779948895235232, + "flos": 27501676928640.0, + "grad_norm": 1.9703196581961127, + "language_loss": 0.85988283, + "learning_rate": 3.976081376263239e-06, + "loss": 0.88884485, + "num_input_tokens_seen": 27528645, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.38867188, + "step": 1294, + "time_per_iteration": 3.0797393321990967 + }, + { + "auxiliary_loss_clip": 0.01870879, + "auxiliary_loss_mlp": 0.01065827, + "balance_loss_clip": 1.57699037, + "balance_loss_mlp": 1.02608252, + "epoch": 0.07785961220502029, + "flos": 18232417883520.0, + "grad_norm": 2.1861493025714136, + "language_loss": 0.82153273, + "learning_rate": 3.976021286383768e-06, + "loss": 0.85089982, + "num_input_tokens_seen": 27546165, + "router_z_loss_clip": 2.94140625, + "router_z_loss_mlp": 0.3972168, + "step": 1295, + "time_per_iteration": 2.927900552749634 + }, + { + "auxiliary_loss_clip": 0.01852767, + "auxiliary_loss_mlp": 0.0107366, + "balance_loss_clip": 1.56830549, + "balance_loss_mlp": 1.0308876, + "epoch": 0.07791973545768827, + "flos": 24618967787520.0, + "grad_norm": 1.988307150976017, + "language_loss": 0.89092273, + "learning_rate": 3.975961121573371e-06, + "loss": 0.920187, + "num_input_tokens_seen": 27566520, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.42773438, + "step": 1296, + "time_per_iteration": 2.9144833087921143 + }, + { + "auxiliary_loss_clip": 0.01862245, + "auxiliary_loss_mlp": 0.01073393, + "balance_loss_clip": 1.57390916, + "balance_loss_mlp": 1.03493595, + "epoch": 0.07797985871035623, + "flos": 14289963598080.0, + "grad_norm": 2.438153300308794, + "language_loss": 0.98303407, + "learning_rate": 3.9759008818343305e-06, + "loss": 1.01239049, + "num_input_tokens_seen": 27581960, + "router_z_loss_clip": 2.88085938, + "router_z_loss_mlp": 0.38452148, + "step": 1297, + "time_per_iteration": 2.884141445159912 + }, + { + "auxiliary_loss_clip": 0.01848178, + "auxiliary_loss_mlp": 0.01076101, + "balance_loss_clip": 1.55788994, + "balance_loss_mlp": 1.04153049, + "epoch": 0.0780399819630242, + "flos": 26620558830720.0, + "grad_norm": 2.122082091738837, + "language_loss": 0.77846909, + "learning_rate": 3.97584056716893e-06, + "loss": 0.80771184, + "num_input_tokens_seen": 27601415, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.34570312, + "step": 1298, + "time_per_iteration": 2.965386390686035 + }, + { + "auxiliary_loss_clip": 0.01836773, + "auxiliary_loss_mlp": 0.01084868, + "balance_loss_clip": 1.55378401, + "balance_loss_mlp": 1.04886699, + "epoch": 0.07810010521569218, + "flos": 21844339770240.0, + "grad_norm": 1.5520113538525946, + "language_loss": 0.82647514, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.85569155, + "num_input_tokens_seen": 27621490, + "router_z_loss_clip": 2.83007812, + "router_z_loss_mlp": 0.35986328, + "step": 1299, + "time_per_iteration": 3.009385824203491 + }, + { + "auxiliary_loss_clip": 0.01838558, + "auxiliary_loss_mlp": 0.01086957, + "balance_loss_clip": 1.56167519, + "balance_loss_mlp": 1.04537666, + "epoch": 0.07816022846836014, + "flos": 25091768396160.0, + "grad_norm": 1.7672952857701527, + "language_loss": 0.88196242, + "learning_rate": 3.975719713068202e-06, + "loss": 0.91121763, + "num_input_tokens_seen": 27640600, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.41577148, + "step": 1300, + "time_per_iteration": 3.0674591064453125 + }, + { + "auxiliary_loss_clip": 0.01844526, + "auxiliary_loss_mlp": 0.01088812, + "balance_loss_clip": 1.55501914, + "balance_loss_mlp": 1.05056965, + "epoch": 0.0782203517210281, + "flos": 40932538663680.0, + "grad_norm": 1.7634050669601604, + "language_loss": 0.73660433, + "learning_rate": 3.975659173637458e-06, + "loss": 0.76593769, + "num_input_tokens_seen": 27663070, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.38232422, + "step": 1301, + "time_per_iteration": 3.1398513317108154 + }, + { + "auxiliary_loss_clip": 0.01852726, + "auxiliary_loss_mlp": 0.01120414, + "balance_loss_clip": 1.56486654, + "balance_loss_mlp": 1.08221889, + "epoch": 0.07828047497369607, + "flos": 41186869539840.0, + "grad_norm": 1.5190403635096623, + "language_loss": 0.72084296, + "learning_rate": 3.97559855928952e-06, + "loss": 0.75057429, + "num_input_tokens_seen": 27686425, + "router_z_loss_clip": 2.87890625, + "router_z_loss_mlp": 0.38232422, + "step": 1302, + "time_per_iteration": 3.193338632583618 + }, + { + "auxiliary_loss_clip": 0.01838205, + "auxiliary_loss_mlp": 0.01100887, + "balance_loss_clip": 1.55639303, + "balance_loss_mlp": 1.06374145, + "epoch": 0.07834059822636405, + "flos": 23517162961920.0, + "grad_norm": 2.1747333138552456, + "language_loss": 0.83594853, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.8653394, + "num_input_tokens_seen": 27704900, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.37158203, + "step": 1303, + "time_per_iteration": 4.4202141761779785 + }, + { + "auxiliary_loss_clip": 0.01840331, + "auxiliary_loss_mlp": 0.01100586, + "balance_loss_clip": 1.55767512, + "balance_loss_mlp": 1.06365538, + "epoch": 0.07840072147903202, + "flos": 20203622403840.0, + "grad_norm": 1.7329407836014536, + "language_loss": 0.76395625, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.79336542, + "num_input_tokens_seen": 27724890, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.36962891, + "step": 1304, + "time_per_iteration": 3.007347345352173 + }, + { + "auxiliary_loss_clip": 0.01830605, + "auxiliary_loss_mlp": 0.01101989, + "balance_loss_clip": 1.5467788, + "balance_loss_mlp": 1.06706071, + "epoch": 0.07846084473169998, + "flos": 21370679510400.0, + "grad_norm": 1.729353113173296, + "language_loss": 0.77417636, + "learning_rate": 3.975416266765542e-06, + "loss": 0.80350232, + "num_input_tokens_seen": 27743115, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.34936523, + "step": 1305, + "time_per_iteration": 2.9907658100128174 + }, + { + "auxiliary_loss_clip": 0.01822183, + "auxiliary_loss_mlp": 0.01115397, + "balance_loss_clip": 1.53483343, + "balance_loss_mlp": 1.07739258, + "epoch": 0.07852096798436796, + "flos": 25421938836480.0, + "grad_norm": 2.0256731478959606, + "language_loss": 0.86735284, + "learning_rate": 3.975355352771841e-06, + "loss": 0.89672863, + "num_input_tokens_seen": 27763570, + "router_z_loss_clip": 2.87695312, + "router_z_loss_mlp": 0.38012695, + "step": 1306, + "time_per_iteration": 3.035352945327759 + }, + { + "auxiliary_loss_clip": 0.01814415, + "auxiliary_loss_mlp": 0.0110271, + "balance_loss_clip": 1.53002036, + "balance_loss_mlp": 1.06506407, + "epoch": 0.07858109123703592, + "flos": 24582156503040.0, + "grad_norm": 3.2890982257210255, + "language_loss": 0.91390014, + "learning_rate": 3.975294363872468e-06, + "loss": 0.94307142, + "num_input_tokens_seen": 27780030, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.3762207, + "step": 1307, + "time_per_iteration": 2.9242122173309326 + }, + { + "auxiliary_loss_clip": 0.01828437, + "auxiliary_loss_mlp": 0.01091755, + "balance_loss_clip": 1.54675901, + "balance_loss_mlp": 1.05506253, + "epoch": 0.07864121448970389, + "flos": 20707216738560.0, + "grad_norm": 6.61180532298452, + "language_loss": 0.83968723, + "learning_rate": 3.975233300069735e-06, + "loss": 0.86888915, + "num_input_tokens_seen": 27796225, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.3671875, + "step": 1308, + "time_per_iteration": 4.563499927520752 + }, + { + "auxiliary_loss_clip": 0.01808705, + "auxiliary_loss_mlp": 0.01093618, + "balance_loss_clip": 1.53403997, + "balance_loss_mlp": 1.05885673, + "epoch": 0.07870133774237187, + "flos": 22976893077120.0, + "grad_norm": 1.4215884598710988, + "language_loss": 0.78695154, + "learning_rate": 3.975172161365958e-06, + "loss": 0.81597477, + "num_input_tokens_seen": 27815975, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.34765625, + "step": 1309, + "time_per_iteration": 5.931636095046997 + }, + { + "auxiliary_loss_clip": 0.01828399, + "auxiliary_loss_mlp": 0.01096926, + "balance_loss_clip": 1.54174781, + "balance_loss_mlp": 1.06049585, + "epoch": 0.07876146099503983, + "flos": 18851920692480.0, + "grad_norm": 1.9012223399748578, + "language_loss": 0.81805426, + "learning_rate": 3.975110947763453e-06, + "loss": 0.8473075, + "num_input_tokens_seen": 27832255, + "router_z_loss_clip": 2.86523438, + "router_z_loss_mlp": 0.36425781, + "step": 1310, + "time_per_iteration": 3.044184684753418 + }, + { + "auxiliary_loss_clip": 0.01795535, + "auxiliary_loss_mlp": 0.01078981, + "balance_loss_clip": 1.52653968, + "balance_loss_mlp": 1.04264569, + "epoch": 0.0788215842477078, + "flos": 23816403941760.0, + "grad_norm": 1.7436961302633238, + "language_loss": 0.7428003, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.77154547, + "num_input_tokens_seen": 27852180, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.36328125, + "step": 1311, + "time_per_iteration": 2.978878974914551 + }, + { + "auxiliary_loss_clip": 0.01815468, + "auxiliary_loss_mlp": 0.01078543, + "balance_loss_clip": 1.54011953, + "balance_loss_mlp": 1.04320908, + "epoch": 0.07888170750037576, + "flos": 21589873159680.0, + "grad_norm": 1.7447601827914365, + "language_loss": 0.8759079, + "learning_rate": 3.974988295871553e-06, + "loss": 0.90484798, + "num_input_tokens_seen": 27871435, + "router_z_loss_clip": 2.75585938, + "router_z_loss_mlp": 0.35327148, + "step": 1312, + "time_per_iteration": 2.9277594089508057 + }, + { + "auxiliary_loss_clip": 0.01803052, + "auxiliary_loss_mlp": 0.01072181, + "balance_loss_clip": 1.53015387, + "balance_loss_mlp": 1.03515506, + "epoch": 0.07894183075304374, + "flos": 19874311614720.0, + "grad_norm": 1.6019000632817448, + "language_loss": 0.83263129, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.86138368, + "num_input_tokens_seen": 27890625, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.37036133, + "step": 1313, + "time_per_iteration": 2.9565110206604004 + }, + { + "auxiliary_loss_clip": 0.018455, + "auxiliary_loss_mlp": 0.0107586, + "balance_loss_clip": 1.55665481, + "balance_loss_mlp": 1.03511381, + "epoch": 0.07900195400571171, + "flos": 16152001119360.0, + "grad_norm": 2.5188212163567454, + "language_loss": 0.75324893, + "learning_rate": 3.97486534441264e-06, + "loss": 0.78246248, + "num_input_tokens_seen": 27906530, + "router_z_loss_clip": 2.88867188, + "router_z_loss_mlp": 0.4074707, + "step": 1314, + "time_per_iteration": 2.9270973205566406 + }, + { + "auxiliary_loss_clip": 0.01820626, + "auxiliary_loss_mlp": 0.01066646, + "balance_loss_clip": 1.54203343, + "balance_loss_mlp": 1.03069305, + "epoch": 0.07906207725837967, + "flos": 23740383398400.0, + "grad_norm": 1.4776185339329515, + "language_loss": 0.80779386, + "learning_rate": 3.974803756351379e-06, + "loss": 0.83666658, + "num_input_tokens_seen": 27926725, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.35986328, + "step": 1315, + "time_per_iteration": 3.0160586833953857 + }, + { + "auxiliary_loss_clip": 0.01834576, + "auxiliary_loss_mlp": 0.01068368, + "balance_loss_clip": 1.55009842, + "balance_loss_mlp": 1.0306741, + "epoch": 0.07912220051104765, + "flos": 24326468282880.0, + "grad_norm": 1.7272118719863356, + "language_loss": 0.74823552, + "learning_rate": 3.974742093405362e-06, + "loss": 0.77726495, + "num_input_tokens_seen": 27947875, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.37670898, + "step": 1316, + "time_per_iteration": 2.9685840606689453 + }, + { + "auxiliary_loss_clip": 0.01850993, + "auxiliary_loss_mlp": 0.01071581, + "balance_loss_clip": 1.55968845, + "balance_loss_mlp": 1.03472137, + "epoch": 0.07918232376371562, + "flos": 18889320159360.0, + "grad_norm": 3.380070988256466, + "language_loss": 0.66482151, + "learning_rate": 3.974680355576927e-06, + "loss": 0.69404721, + "num_input_tokens_seen": 27965040, + "router_z_loss_clip": 2.9140625, + "router_z_loss_mlp": 0.36889648, + "step": 1317, + "time_per_iteration": 2.975184440612793 + }, + { + "auxiliary_loss_clip": 0.01868064, + "auxiliary_loss_mlp": 0.01080751, + "balance_loss_clip": 1.57240129, + "balance_loss_mlp": 1.04262805, + "epoch": 0.07924244701638358, + "flos": 27386492371200.0, + "grad_norm": 2.274152817975685, + "language_loss": 0.76550329, + "learning_rate": 3.974618542868415e-06, + "loss": 0.79499149, + "num_input_tokens_seen": 27985330, + "router_z_loss_clip": 2.95703125, + "router_z_loss_mlp": 0.38110352, + "step": 1318, + "time_per_iteration": 3.042052984237671 + }, + { + "auxiliary_loss_clip": 0.01816779, + "auxiliary_loss_mlp": 0.01064739, + "balance_loss_clip": 1.5398705, + "balance_loss_mlp": 1.02866662, + "epoch": 0.07930257026905156, + "flos": 25131565837440.0, + "grad_norm": 2.088255838787028, + "language_loss": 0.91273797, + "learning_rate": 3.97455665528217e-06, + "loss": 0.94155312, + "num_input_tokens_seen": 28007615, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.36108398, + "step": 1319, + "time_per_iteration": 3.022975206375122 + }, + { + "auxiliary_loss_clip": 0.01832011, + "auxiliary_loss_mlp": 0.01072378, + "balance_loss_clip": 1.55288577, + "balance_loss_mlp": 1.03492236, + "epoch": 0.07936269352171953, + "flos": 21844294525440.0, + "grad_norm": 2.0802619639306927, + "language_loss": 0.81525147, + "learning_rate": 3.974494692820539e-06, + "loss": 0.84429538, + "num_input_tokens_seen": 28027765, + "router_z_loss_clip": 2.7890625, + "router_z_loss_mlp": 0.37451172, + "step": 1320, + "time_per_iteration": 3.0054545402526855 + }, + { + "auxiliary_loss_clip": 0.0184322, + "auxiliary_loss_mlp": 0.01073694, + "balance_loss_clip": 1.55642295, + "balance_loss_mlp": 1.03447437, + "epoch": 0.07942281677438749, + "flos": 16947009083520.0, + "grad_norm": 2.160568726827266, + "language_loss": 0.70156789, + "learning_rate": 3.974432655485872e-06, + "loss": 0.73073703, + "num_input_tokens_seen": 28044225, + "router_z_loss_clip": 2.86914062, + "router_z_loss_mlp": 0.39208984, + "step": 1321, + "time_per_iteration": 2.9728448390960693 + }, + { + "auxiliary_loss_clip": 0.01807986, + "auxiliary_loss_mlp": 0.01072265, + "balance_loss_clip": 1.53165674, + "balance_loss_mlp": 1.03416538, + "epoch": 0.07948294002705546, + "flos": 18995591491200.0, + "grad_norm": 6.318990862507566, + "language_loss": 0.85557503, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.88437754, + "num_input_tokens_seen": 28062915, + "router_z_loss_clip": 2.76367188, + "router_z_loss_mlp": 0.38110352, + "step": 1322, + "time_per_iteration": 3.0972354412078857 + }, + { + "auxiliary_loss_clip": 0.01813729, + "auxiliary_loss_mlp": 0.01065655, + "balance_loss_clip": 1.53501034, + "balance_loss_mlp": 1.02867591, + "epoch": 0.07954306327972344, + "flos": 21663586218240.0, + "grad_norm": 1.996142720968327, + "language_loss": 0.91959667, + "learning_rate": 3.974308356206838e-06, + "loss": 0.94839048, + "num_input_tokens_seen": 28082175, + "router_z_loss_clip": 2.7890625, + "router_z_loss_mlp": 0.36987305, + "step": 1323, + "time_per_iteration": 3.0631399154663086 + }, + { + "auxiliary_loss_clip": 0.01809874, + "auxiliary_loss_mlp": 0.01069558, + "balance_loss_clip": 1.53496861, + "balance_loss_mlp": 1.03195977, + "epoch": 0.0796031865323914, + "flos": 23230273812480.0, + "grad_norm": 1.952184094605342, + "language_loss": 0.83554459, + "learning_rate": 3.974246094267187e-06, + "loss": 0.86433887, + "num_input_tokens_seen": 28102645, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.37597656, + "step": 1324, + "time_per_iteration": 2.945810079574585 + }, + { + "auxiliary_loss_clip": 0.01809817, + "auxiliary_loss_mlp": 0.01067467, + "balance_loss_clip": 1.53293335, + "balance_loss_mlp": 1.02762771, + "epoch": 0.07966330978505937, + "flos": 23304891767040.0, + "grad_norm": 3.7263146673144742, + "language_loss": 0.81387931, + "learning_rate": 3.974183757463925e-06, + "loss": 0.84265214, + "num_input_tokens_seen": 28122805, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.3984375, + "step": 1325, + "time_per_iteration": 2.9765818119049072 + }, + { + "auxiliary_loss_clip": 0.01809257, + "auxiliary_loss_mlp": 0.01073367, + "balance_loss_clip": 1.53311133, + "balance_loss_mlp": 1.0322876, + "epoch": 0.07972343303772735, + "flos": 18371518957440.0, + "grad_norm": 2.6816626711017433, + "language_loss": 0.89573443, + "learning_rate": 3.974121345799418e-06, + "loss": 0.92456067, + "num_input_tokens_seen": 28140530, + "router_z_loss_clip": 2.76171875, + "router_z_loss_mlp": 0.41113281, + "step": 1326, + "time_per_iteration": 2.8975560665130615 + }, + { + "auxiliary_loss_clip": 0.01813299, + "auxiliary_loss_mlp": 0.01074996, + "balance_loss_clip": 1.53555131, + "balance_loss_mlp": 1.03494191, + "epoch": 0.07978355629039531, + "flos": 21772572238080.0, + "grad_norm": 1.704023197078873, + "language_loss": 0.83882892, + "learning_rate": 3.974058859276032e-06, + "loss": 0.86771184, + "num_input_tokens_seen": 28159640, + "router_z_loss_clip": 2.77734375, + "router_z_loss_mlp": 0.40063477, + "step": 1327, + "time_per_iteration": 2.9447286128997803 + }, + { + "auxiliary_loss_clip": 0.01826141, + "auxiliary_loss_mlp": 0.01062367, + "balance_loss_clip": 1.53515124, + "balance_loss_mlp": 1.02367198, + "epoch": 0.07984367954306328, + "flos": 18560416573440.0, + "grad_norm": 2.348848492272165, + "language_loss": 0.81226152, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.84114659, + "num_input_tokens_seen": 28177050, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.38671875, + "step": 1328, + "time_per_iteration": 2.9278526306152344 + }, + { + "auxiliary_loss_clip": 0.01829402, + "auxiliary_loss_mlp": 0.0106716, + "balance_loss_clip": 1.54503036, + "balance_loss_mlp": 1.02791643, + "epoch": 0.07990380279573125, + "flos": 16911328919040.0, + "grad_norm": 2.4852079952311295, + "language_loss": 0.75839508, + "learning_rate": 3.973933661662101e-06, + "loss": 0.78736067, + "num_input_tokens_seen": 28193245, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.39233398, + "step": 1329, + "time_per_iteration": 2.9189910888671875 + }, + { + "auxiliary_loss_clip": 0.01809689, + "auxiliary_loss_mlp": 0.01060681, + "balance_loss_clip": 1.53234291, + "balance_loss_mlp": 1.02348745, + "epoch": 0.07996392604839922, + "flos": 24109220160000.0, + "grad_norm": 1.5476744365566868, + "language_loss": 0.82471859, + "learning_rate": 3.973870950576305e-06, + "loss": 0.85342228, + "num_input_tokens_seen": 28213570, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.37182617, + "step": 1330, + "time_per_iteration": 3.0812740325927734 + }, + { + "auxiliary_loss_clip": 0.01817732, + "auxiliary_loss_mlp": 0.01063353, + "balance_loss_clip": 1.53360605, + "balance_loss_mlp": 1.02680373, + "epoch": 0.08002404930106718, + "flos": 14285755831680.0, + "grad_norm": 1.744532148731593, + "language_loss": 0.89795971, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.92677051, + "num_input_tokens_seen": 28229980, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.36572266, + "step": 1331, + "time_per_iteration": 2.8350930213928223 + }, + { + "auxiliary_loss_clip": 0.01866023, + "auxiliary_loss_mlp": 0.01066328, + "balance_loss_clip": 1.57110143, + "balance_loss_mlp": 1.02779925, + "epoch": 0.08008417255373516, + "flos": 40420981244160.0, + "grad_norm": 1.8018324982293628, + "language_loss": 0.74494582, + "learning_rate": 3.973745303858942e-06, + "loss": 0.77426928, + "num_input_tokens_seen": 28253840, + "router_z_loss_clip": 2.94921875, + "router_z_loss_mlp": 0.38549805, + "step": 1332, + "time_per_iteration": 3.0982115268707275 + }, + { + "auxiliary_loss_clip": 0.01831914, + "auxiliary_loss_mlp": 0.01067039, + "balance_loss_clip": 1.55305088, + "balance_loss_mlp": 1.0297029, + "epoch": 0.08014429580640313, + "flos": 18488151348480.0, + "grad_norm": 1.676917693243041, + "language_loss": 0.83455777, + "learning_rate": 3.973682368232138e-06, + "loss": 0.86354727, + "num_input_tokens_seen": 28271675, + "router_z_loss_clip": 2.78515625, + "router_z_loss_mlp": 0.37353516, + "step": 1333, + "time_per_iteration": 2.974811315536499 + }, + { + "auxiliary_loss_clip": 0.01832487, + "auxiliary_loss_mlp": 0.01069712, + "balance_loss_clip": 1.54960668, + "balance_loss_mlp": 1.0328052, + "epoch": 0.0802044190590711, + "flos": 22063352440320.0, + "grad_norm": 2.05187930577218, + "language_loss": 0.76293468, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.79195672, + "num_input_tokens_seen": 28291850, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.36889648, + "step": 1334, + "time_per_iteration": 2.9213056564331055 + }, + { + "auxiliary_loss_clip": 0.01838097, + "auxiliary_loss_mlp": 0.01064677, + "balance_loss_clip": 1.55713403, + "balance_loss_mlp": 1.02707887, + "epoch": 0.08026454231173906, + "flos": 24582970909440.0, + "grad_norm": 1.8338618379950018, + "language_loss": 0.81460565, + "learning_rate": 3.973556272454221e-06, + "loss": 0.84363341, + "num_input_tokens_seen": 28310780, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.37573242, + "step": 1335, + "time_per_iteration": 2.9960389137268066 + }, + { + "auxiliary_loss_clip": 0.01596772, + "auxiliary_loss_mlp": 0.01080933, + "balance_loss_clip": 1.41260099, + "balance_loss_mlp": 1.0532763, + "epoch": 0.08032466556440704, + "flos": 52606475827200.0, + "grad_norm": 0.7541536853144227, + "language_loss": 0.56093943, + "learning_rate": 3.973493112307889e-06, + "loss": 0.58771646, + "num_input_tokens_seen": 28369985, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.27734375, + "step": 1336, + "time_per_iteration": 3.4885671138763428 + }, + { + "auxiliary_loss_clip": 0.01833512, + "auxiliary_loss_mlp": 0.0106685, + "balance_loss_clip": 1.55233312, + "balance_loss_mlp": 1.03058612, + "epoch": 0.080384788817075, + "flos": 23853305715840.0, + "grad_norm": 1.915288157270104, + "language_loss": 0.6937685, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.72277212, + "num_input_tokens_seen": 28388670, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.36254883, + "step": 1337, + "time_per_iteration": 3.038100004196167 + }, + { + "auxiliary_loss_clip": 0.01815339, + "auxiliary_loss_mlp": 0.01072931, + "balance_loss_clip": 1.53570533, + "balance_loss_mlp": 1.03361607, + "epoch": 0.08044491206974297, + "flos": 25311459738240.0, + "grad_norm": 2.0108154169936183, + "language_loss": 0.88440186, + "learning_rate": 3.973366567512453e-06, + "loss": 0.91328454, + "num_input_tokens_seen": 28411845, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.39282227, + "step": 1338, + "time_per_iteration": 4.519045829772949 + }, + { + "auxiliary_loss_clip": 0.01834899, + "auxiliary_loss_mlp": 0.01072904, + "balance_loss_clip": 1.54483795, + "balance_loss_mlp": 1.03687918, + "epoch": 0.08050503532241095, + "flos": 22385152592640.0, + "grad_norm": 2.0399270521239314, + "language_loss": 0.88304138, + "learning_rate": 3.973303182868147e-06, + "loss": 0.91211939, + "num_input_tokens_seen": 28427875, + "router_z_loss_clip": 2.90039062, + "router_z_loss_mlp": 0.3605957, + "step": 1339, + "time_per_iteration": 2.953521251678467 + }, + { + "auxiliary_loss_clip": 0.01807168, + "auxiliary_loss_mlp": 0.01063051, + "balance_loss_clip": 1.5333786, + "balance_loss_mlp": 1.02893305, + "epoch": 0.08056515857507891, + "flos": 18378622391040.0, + "grad_norm": 1.9466064579878857, + "language_loss": 0.90142989, + "learning_rate": 3.973239723395988e-06, + "loss": 0.93013203, + "num_input_tokens_seen": 28446615, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.34155273, + "step": 1340, + "time_per_iteration": 2.8782846927642822 + }, + { + "auxiliary_loss_clip": 0.01550971, + "auxiliary_loss_mlp": 0.01030677, + "balance_loss_clip": 1.37379944, + "balance_loss_mlp": 1.00473714, + "epoch": 0.08062528182774688, + "flos": 51374211684480.0, + "grad_norm": 0.8885486945481568, + "language_loss": 0.64754367, + "learning_rate": 3.97317618909838e-06, + "loss": 0.67336023, + "num_input_tokens_seen": 28505290, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.25976562, + "step": 1341, + "time_per_iteration": 3.3494129180908203 + }, + { + "auxiliary_loss_clip": 0.01848257, + "auxiliary_loss_mlp": 0.01079674, + "balance_loss_clip": 1.55610335, + "balance_loss_mlp": 1.04138398, + "epoch": 0.08068540508041486, + "flos": 17607802412160.0, + "grad_norm": 1.8772685402554194, + "language_loss": 0.90250564, + "learning_rate": 3.973112579977733e-06, + "loss": 0.93178499, + "num_input_tokens_seen": 28522735, + "router_z_loss_clip": 2.92382812, + "router_z_loss_mlp": 0.38305664, + "step": 1342, + "time_per_iteration": 4.302040100097656 + }, + { + "auxiliary_loss_clip": 0.01827484, + "auxiliary_loss_mlp": 0.01073211, + "balance_loss_clip": 1.54308319, + "balance_loss_mlp": 1.03210771, + "epoch": 0.08074552833308282, + "flos": 10568286529920.0, + "grad_norm": 2.3416060945046615, + "language_loss": 0.77433646, + "learning_rate": 3.973048896036459e-06, + "loss": 0.80334336, + "num_input_tokens_seen": 28539460, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.41064453, + "step": 1343, + "time_per_iteration": 2.866652250289917 + }, + { + "auxiliary_loss_clip": 0.01515219, + "auxiliary_loss_mlp": 0.01064404, + "balance_loss_clip": 1.33694005, + "balance_loss_mlp": 1.03751075, + "epoch": 0.08080565158575079, + "flos": 60870310750080.0, + "grad_norm": 0.8203614794927442, + "language_loss": 0.57650089, + "learning_rate": 3.972985137276974e-06, + "loss": 0.60229707, + "num_input_tokens_seen": 28599855, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.26953125, + "step": 1344, + "time_per_iteration": 6.151159048080444 + }, + { + "auxiliary_loss_clip": 0.01828071, + "auxiliary_loss_mlp": 0.01065221, + "balance_loss_clip": 1.53947306, + "balance_loss_mlp": 1.02900493, + "epoch": 0.08086577483841875, + "flos": 18341403903360.0, + "grad_norm": 2.6421300002750074, + "language_loss": 0.87862444, + "learning_rate": 3.972921303701695e-06, + "loss": 0.90755737, + "num_input_tokens_seen": 28617585, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 0.36206055, + "step": 1345, + "time_per_iteration": 2.922560930252075 + }, + { + "auxiliary_loss_clip": 0.01797414, + "auxiliary_loss_mlp": 0.01068122, + "balance_loss_clip": 1.51893413, + "balance_loss_mlp": 1.03269327, + "epoch": 0.08092589809108673, + "flos": 21553604812800.0, + "grad_norm": 1.6174773876039161, + "language_loss": 0.89026421, + "learning_rate": 3.972857395313042e-06, + "loss": 0.91891956, + "num_input_tokens_seen": 28636355, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.35424805, + "step": 1346, + "time_per_iteration": 2.9248733520507812 + }, + { + "auxiliary_loss_clip": 0.01786787, + "auxiliary_loss_mlp": 0.01079401, + "balance_loss_clip": 1.5100323, + "balance_loss_mlp": 1.04013336, + "epoch": 0.0809860213437547, + "flos": 22138332353280.0, + "grad_norm": 1.5649249669384577, + "language_loss": 0.94058985, + "learning_rate": 3.972793412113439e-06, + "loss": 0.96925175, + "num_input_tokens_seen": 28656260, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.39282227, + "step": 1347, + "time_per_iteration": 2.885265588760376 + }, + { + "auxiliary_loss_clip": 0.01781129, + "auxiliary_loss_mlp": 0.01077046, + "balance_loss_clip": 1.50633836, + "balance_loss_mlp": 1.03939986, + "epoch": 0.08104614459642266, + "flos": 21735398995200.0, + "grad_norm": 1.724542344059585, + "language_loss": 0.90338033, + "learning_rate": 3.972729354105312e-06, + "loss": 0.93196201, + "num_input_tokens_seen": 28675865, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.37646484, + "step": 1348, + "time_per_iteration": 3.044618606567383 + }, + { + "auxiliary_loss_clip": 0.0176168, + "auxiliary_loss_mlp": 0.01075224, + "balance_loss_clip": 1.48762763, + "balance_loss_mlp": 1.0381496, + "epoch": 0.08110626784909064, + "flos": 23962563204480.0, + "grad_norm": 1.5682890232437847, + "language_loss": 0.77514565, + "learning_rate": 3.97266522129109e-06, + "loss": 0.80351472, + "num_input_tokens_seen": 28696255, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.37109375, + "step": 1349, + "time_per_iteration": 3.0364632606506348 + }, + { + "auxiliary_loss_clip": 0.01791197, + "auxiliary_loss_mlp": 0.01079724, + "balance_loss_clip": 1.50715661, + "balance_loss_mlp": 1.04279256, + "epoch": 0.0811663911017586, + "flos": 19034936484480.0, + "grad_norm": 1.8342501759779175, + "language_loss": 0.90341944, + "learning_rate": 3.972601013673205e-06, + "loss": 0.93212867, + "num_input_tokens_seen": 28713905, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 0.36938477, + "step": 1350, + "time_per_iteration": 2.8622543811798096 + }, + { + "auxiliary_loss_clip": 0.01762367, + "auxiliary_loss_mlp": 0.01072814, + "balance_loss_clip": 1.48724651, + "balance_loss_mlp": 1.03571606, + "epoch": 0.08122651435442657, + "flos": 15349753987200.0, + "grad_norm": 1.9673824673969504, + "language_loss": 0.83422673, + "learning_rate": 3.972536731254092e-06, + "loss": 0.86257851, + "num_input_tokens_seen": 28732075, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.37133789, + "step": 1351, + "time_per_iteration": 2.8935141563415527 + }, + { + "auxiliary_loss_clip": 0.01763052, + "auxiliary_loss_mlp": 0.01075159, + "balance_loss_clip": 1.48243308, + "balance_loss_mlp": 1.03565347, + "epoch": 0.08128663760709455, + "flos": 23231902625280.0, + "grad_norm": 1.7312084793301712, + "language_loss": 0.76381403, + "learning_rate": 3.972472374036189e-06, + "loss": 0.79219615, + "num_input_tokens_seen": 28751150, + "router_z_loss_clip": 2.8046875, + "router_z_loss_mlp": 0.39526367, + "step": 1352, + "time_per_iteration": 2.8623712062835693 + }, + { + "auxiliary_loss_clip": 0.01781567, + "auxiliary_loss_mlp": 0.01081858, + "balance_loss_clip": 1.49975169, + "balance_loss_mlp": 1.04130328, + "epoch": 0.08134676085976252, + "flos": 22975761957120.0, + "grad_norm": 1.8115788041420133, + "language_loss": 0.84688938, + "learning_rate": 3.972407942021935e-06, + "loss": 0.87552357, + "num_input_tokens_seen": 28773360, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.40551758, + "step": 1353, + "time_per_iteration": 2.95131516456604 + }, + { + "auxiliary_loss_clip": 0.01450852, + "auxiliary_loss_mlp": 0.01142826, + "balance_loss_clip": 1.27231765, + "balance_loss_mlp": 1.03219974, + "epoch": 0.08140688411243048, + "flos": 64347683304960.0, + "grad_norm": 0.8724739369772011, + "language_loss": 0.59869313, + "learning_rate": 3.972343435213775e-06, + "loss": 0.62462991, + "num_input_tokens_seen": 28833390, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 1.109375, + "step": 1354, + "time_per_iteration": 3.3701465129852295 + }, + { + "auxiliary_loss_clip": 0.01750776, + "auxiliary_loss_mlp": 0.01068408, + "balance_loss_clip": 1.47799754, + "balance_loss_mlp": 1.03009415, + "epoch": 0.08146700736509845, + "flos": 22502011207680.0, + "grad_norm": 1.6148054200530948, + "language_loss": 0.84205008, + "learning_rate": 3.972278853614154e-06, + "loss": 0.87024194, + "num_input_tokens_seen": 28852430, + "router_z_loss_clip": 2.72460938, + "router_z_loss_mlp": 0.3828125, + "step": 1355, + "time_per_iteration": 2.9188740253448486 + }, + { + "auxiliary_loss_clip": 0.01747928, + "auxiliary_loss_mlp": 0.01066937, + "balance_loss_clip": 1.4762485, + "balance_loss_mlp": 1.02888584, + "epoch": 0.08152713061776642, + "flos": 20457229363200.0, + "grad_norm": 1.9619286660003301, + "language_loss": 0.73109454, + "learning_rate": 3.972214197225521e-06, + "loss": 0.75924325, + "num_input_tokens_seen": 28870685, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.38037109, + "step": 1356, + "time_per_iteration": 2.938732624053955 + }, + { + "auxiliary_loss_clip": 0.01764057, + "auxiliary_loss_mlp": 0.01063197, + "balance_loss_clip": 1.48111618, + "balance_loss_mlp": 1.02524078, + "epoch": 0.08158725387043439, + "flos": 23560715721600.0, + "grad_norm": 2.0536022232714215, + "language_loss": 0.72369069, + "learning_rate": 3.972149466050329e-06, + "loss": 0.75196326, + "num_input_tokens_seen": 28889860, + "router_z_loss_clip": 2.83203125, + "router_z_loss_mlp": 0.37963867, + "step": 1357, + "time_per_iteration": 2.907543659210205 + }, + { + "auxiliary_loss_clip": 0.01770899, + "auxiliary_loss_mlp": 0.01072242, + "balance_loss_clip": 1.487957, + "balance_loss_mlp": 1.03492987, + "epoch": 0.08164737712310235, + "flos": 22027310317440.0, + "grad_norm": 2.3552324530617623, + "language_loss": 0.85755682, + "learning_rate": 3.97208466009103e-06, + "loss": 0.88598818, + "num_input_tokens_seen": 28905865, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.37329102, + "step": 1358, + "time_per_iteration": 2.989595651626587 + }, + { + "auxiliary_loss_clip": 0.01766915, + "auxiliary_loss_mlp": 0.0107078, + "balance_loss_clip": 1.48600328, + "balance_loss_mlp": 1.0331099, + "epoch": 0.08170750037577033, + "flos": 23378288112000.0, + "grad_norm": 1.7158010584416945, + "language_loss": 1.03114879, + "learning_rate": 3.972019779350084e-06, + "loss": 1.05952573, + "num_input_tokens_seen": 28925250, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.37646484, + "step": 1359, + "time_per_iteration": 2.9303019046783447 + }, + { + "auxiliary_loss_clip": 0.0178615, + "auxiliary_loss_mlp": 0.01076286, + "balance_loss_clip": 1.49834001, + "balance_loss_mlp": 1.03914046, + "epoch": 0.0817676236284383, + "flos": 28408476090240.0, + "grad_norm": 2.1175514656720993, + "language_loss": 0.84717053, + "learning_rate": 3.971954823829951e-06, + "loss": 0.87579489, + "num_input_tokens_seen": 28943445, + "router_z_loss_clip": 2.88085938, + "router_z_loss_mlp": 0.37158203, + "step": 1360, + "time_per_iteration": 2.977163076400757 + }, + { + "auxiliary_loss_clip": 0.01790942, + "auxiliary_loss_mlp": 0.01084686, + "balance_loss_clip": 1.50219238, + "balance_loss_mlp": 1.04494166, + "epoch": 0.08182774688110626, + "flos": 19218404724480.0, + "grad_norm": 1.9718761399000964, + "language_loss": 0.73906976, + "learning_rate": 3.971889793533093e-06, + "loss": 0.76782608, + "num_input_tokens_seen": 28962695, + "router_z_loss_clip": 2.88476562, + "router_z_loss_mlp": 0.39746094, + "step": 1361, + "time_per_iteration": 2.9748871326446533 + }, + { + "auxiliary_loss_clip": 0.01752836, + "auxiliary_loss_mlp": 0.01088585, + "balance_loss_clip": 1.47250462, + "balance_loss_mlp": 1.05139184, + "epoch": 0.08188787013377424, + "flos": 22794420222720.0, + "grad_norm": 1.9527998624621776, + "language_loss": 0.78631604, + "learning_rate": 3.971824688461976e-06, + "loss": 0.81473023, + "num_input_tokens_seen": 28982120, + "router_z_loss_clip": 2.80664062, + "router_z_loss_mlp": 0.37207031, + "step": 1362, + "time_per_iteration": 2.9789035320281982 + }, + { + "auxiliary_loss_clip": 0.01766387, + "auxiliary_loss_mlp": 0.01075896, + "balance_loss_clip": 1.48303092, + "balance_loss_mlp": 1.03879786, + "epoch": 0.08194799338644221, + "flos": 16475746798080.0, + "grad_norm": 2.0649014050132073, + "language_loss": 0.74827838, + "learning_rate": 3.971759508619069e-06, + "loss": 0.77670121, + "num_input_tokens_seen": 28998100, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.37109375, + "step": 1363, + "time_per_iteration": 2.9582254886627197 + }, + { + "auxiliary_loss_clip": 0.01772766, + "auxiliary_loss_mlp": 0.01083466, + "balance_loss_clip": 1.48922491, + "balance_loss_mlp": 1.04438937, + "epoch": 0.08200811663911017, + "flos": 23923851638400.0, + "grad_norm": 2.016765264038403, + "language_loss": 0.79393047, + "learning_rate": 3.971694254006844e-06, + "loss": 0.82249284, + "num_input_tokens_seen": 29017095, + "router_z_loss_clip": 2.8359375, + "router_z_loss_mlp": 0.39038086, + "step": 1364, + "time_per_iteration": 2.923550844192505 + }, + { + "auxiliary_loss_clip": 0.01770282, + "auxiliary_loss_mlp": 0.01076344, + "balance_loss_clip": 1.48800421, + "balance_loss_mlp": 1.03879356, + "epoch": 0.08206823989177814, + "flos": 17906002761600.0, + "grad_norm": 1.5447050351815912, + "language_loss": 0.83267689, + "learning_rate": 3.971628924627776e-06, + "loss": 0.86114311, + "num_input_tokens_seen": 29037240, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.37597656, + "step": 1365, + "time_per_iteration": 3.0659120082855225 + }, + { + "auxiliary_loss_clip": 0.01747926, + "auxiliary_loss_mlp": 0.01083482, + "balance_loss_clip": 1.47391558, + "balance_loss_mlp": 1.04941177, + "epoch": 0.08212836314444612, + "flos": 22097630016000.0, + "grad_norm": 1.6112964124402194, + "language_loss": 0.82764316, + "learning_rate": 3.97156352048434e-06, + "loss": 0.85595721, + "num_input_tokens_seen": 29056250, + "router_z_loss_clip": 2.74023438, + "router_z_loss_mlp": 0.34057617, + "step": 1366, + "time_per_iteration": 3.0514895915985107 + }, + { + "auxiliary_loss_clip": 0.01766795, + "auxiliary_loss_mlp": 0.01090009, + "balance_loss_clip": 1.48255849, + "balance_loss_mlp": 1.05133736, + "epoch": 0.08218848639711408, + "flos": 17604997234560.0, + "grad_norm": 1.632354579763871, + "language_loss": 0.8303234, + "learning_rate": 3.97149804157902e-06, + "loss": 0.85889143, + "num_input_tokens_seen": 29073380, + "router_z_loss_clip": 2.83984375, + "router_z_loss_mlp": 0.38696289, + "step": 1367, + "time_per_iteration": 3.0351409912109375 + }, + { + "auxiliary_loss_clip": 0.01794004, + "auxiliary_loss_mlp": 0.01087776, + "balance_loss_clip": 1.50257969, + "balance_loss_mlp": 1.04993868, + "epoch": 0.08224860964978205, + "flos": 17866703013120.0, + "grad_norm": 2.333098625595346, + "language_loss": 0.85312641, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.88194418, + "num_input_tokens_seen": 29091330, + "router_z_loss_clip": 2.91601562, + "router_z_loss_mlp": 0.37817383, + "step": 1368, + "time_per_iteration": 2.9744081497192383 + }, + { + "auxiliary_loss_clip": 0.01744689, + "auxiliary_loss_mlp": 0.01074284, + "balance_loss_clip": 1.47836792, + "balance_loss_mlp": 1.03899765, + "epoch": 0.08230873290245003, + "flos": 25238199127680.0, + "grad_norm": 1.752088942775731, + "language_loss": 0.82448447, + "learning_rate": 3.971366859492653e-06, + "loss": 0.85267419, + "num_input_tokens_seen": 29110375, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.35302734, + "step": 1369, + "time_per_iteration": 2.9599194526672363 + }, + { + "auxiliary_loss_clip": 0.01770671, + "auxiliary_loss_mlp": 0.0107247, + "balance_loss_clip": 1.49050546, + "balance_loss_mlp": 1.0364449, + "epoch": 0.08236885615511799, + "flos": 31772672841600.0, + "grad_norm": 2.0336973536431557, + "language_loss": 0.75926411, + "learning_rate": 3.971301156316582e-06, + "loss": 0.78769547, + "num_input_tokens_seen": 29129395, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.36010742, + "step": 1370, + "time_per_iteration": 2.989975690841675 + }, + { + "auxiliary_loss_clip": 0.01768857, + "auxiliary_loss_mlp": 0.01076746, + "balance_loss_clip": 1.48650742, + "balance_loss_mlp": 1.03905153, + "epoch": 0.08242897940778596, + "flos": 23196358195200.0, + "grad_norm": 5.136535752383083, + "language_loss": 0.7565378, + "learning_rate": 3.971235378388573e-06, + "loss": 0.78499377, + "num_input_tokens_seen": 29148650, + "router_z_loss_clip": 2.82421875, + "router_z_loss_mlp": 0.37695312, + "step": 1371, + "time_per_iteration": 2.9811534881591797 + }, + { + "auxiliary_loss_clip": 0.01751356, + "auxiliary_loss_mlp": 0.0105912, + "balance_loss_clip": 1.47758842, + "balance_loss_mlp": 1.02323794, + "epoch": 0.08248910266045394, + "flos": 34503657609600.0, + "grad_norm": 1.8855956281567816, + "language_loss": 0.72162026, + "learning_rate": 3.971169525711122e-06, + "loss": 0.74972498, + "num_input_tokens_seen": 29170785, + "router_z_loss_clip": 2.74023438, + "router_z_loss_mlp": 0.35913086, + "step": 1372, + "time_per_iteration": 2.985748291015625 + }, + { + "auxiliary_loss_clip": 0.01787272, + "auxiliary_loss_mlp": 0.01065605, + "balance_loss_clip": 1.50027013, + "balance_loss_mlp": 1.02848351, + "epoch": 0.0825492259131219, + "flos": 13442172935040.0, + "grad_norm": 2.416716014365081, + "language_loss": 0.89157993, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.92010874, + "num_input_tokens_seen": 29185210, + "router_z_loss_clip": 2.8671875, + "router_z_loss_mlp": 0.37109375, + "step": 1373, + "time_per_iteration": 4.2509777545928955 + }, + { + "auxiliary_loss_clip": 0.01759165, + "auxiliary_loss_mlp": 0.01062396, + "balance_loss_clip": 1.47830153, + "balance_loss_mlp": 1.02627516, + "epoch": 0.08260934916578987, + "flos": 25823560095360.0, + "grad_norm": 1.8040252017321134, + "language_loss": 0.8421368, + "learning_rate": 3.971037596117882e-06, + "loss": 0.87035245, + "num_input_tokens_seen": 29205210, + "router_z_loss_clip": 2.8125, + "router_z_loss_mlp": 0.36132812, + "step": 1374, + "time_per_iteration": 2.9278531074523926 + }, + { + "auxiliary_loss_clip": 0.01465793, + "auxiliary_loss_mlp": 0.01029962, + "balance_loss_clip": 1.30388033, + "balance_loss_mlp": 1.01594353, + "epoch": 0.08266947241845783, + "flos": 63488870893440.0, + "grad_norm": 0.8241947101525418, + "language_loss": 0.60740304, + "learning_rate": 3.970971519207095e-06, + "loss": 0.63236058, + "num_input_tokens_seen": 29265350, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.140625, + "step": 1375, + "time_per_iteration": 3.3244426250457764 + }, + { + "auxiliary_loss_clip": 0.01461756, + "auxiliary_loss_mlp": 0.01017944, + "balance_loss_clip": 1.29773855, + "balance_loss_mlp": 1.00316179, + "epoch": 0.08272959567112581, + "flos": 70024973420160.0, + "grad_norm": 0.9067559758625536, + "language_loss": 0.62412393, + "learning_rate": 3.970905367556871e-06, + "loss": 0.64892095, + "num_input_tokens_seen": 29321475, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.14746094, + "step": 1376, + "time_per_iteration": 3.228940963745117 + }, + { + "auxiliary_loss_clip": 0.01774484, + "auxiliary_loss_mlp": 0.01069032, + "balance_loss_clip": 1.49547434, + "balance_loss_mlp": 1.03338885, + "epoch": 0.08278971892379378, + "flos": 20423087521920.0, + "grad_norm": 1.678029201777842, + "language_loss": 0.83895528, + "learning_rate": 3.970839141169718e-06, + "loss": 0.86739051, + "num_input_tokens_seen": 29341405, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.35644531, + "step": 1377, + "time_per_iteration": 3.092580556869507 + }, + { + "auxiliary_loss_clip": 0.01756542, + "auxiliary_loss_mlp": 0.01062618, + "balance_loss_clip": 1.48295379, + "balance_loss_mlp": 1.02716553, + "epoch": 0.08284984217646174, + "flos": 26261449701120.0, + "grad_norm": 1.7709881064104032, + "language_loss": 0.85894746, + "learning_rate": 3.970772840048147e-06, + "loss": 0.88713908, + "num_input_tokens_seen": 29361955, + "router_z_loss_clip": 2.74023438, + "router_z_loss_mlp": 0.35449219, + "step": 1378, + "time_per_iteration": 4.38018798828125 + }, + { + "auxiliary_loss_clip": 0.01758226, + "auxiliary_loss_mlp": 0.0106796, + "balance_loss_clip": 1.47700381, + "balance_loss_mlp": 1.03143442, + "epoch": 0.08290996542912972, + "flos": 27205060147200.0, + "grad_norm": 2.0660573269683056, + "language_loss": 0.89093399, + "learning_rate": 3.970706464194672e-06, + "loss": 0.91919583, + "num_input_tokens_seen": 29382395, + "router_z_loss_clip": 2.80859375, + "router_z_loss_mlp": 0.36572266, + "step": 1379, + "time_per_iteration": 4.387006044387817 + }, + { + "auxiliary_loss_clip": 0.01750377, + "auxiliary_loss_mlp": 0.01059168, + "balance_loss_clip": 1.48139834, + "balance_loss_mlp": 1.02538407, + "epoch": 0.08297008868179769, + "flos": 38632430557440.0, + "grad_norm": 1.7566636061174814, + "language_loss": 0.80076408, + "learning_rate": 3.970640013611812e-06, + "loss": 0.82885951, + "num_input_tokens_seen": 29404460, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.33789062, + "step": 1380, + "time_per_iteration": 4.435056924819946 + }, + { + "auxiliary_loss_clip": 0.01739797, + "auxiliary_loss_mlp": 0.01067806, + "balance_loss_clip": 1.47371936, + "balance_loss_mlp": 1.03278244, + "epoch": 0.08303021193446565, + "flos": 19984338264960.0, + "grad_norm": 2.215079791918043, + "language_loss": 0.87531769, + "learning_rate": 3.970573488302083e-06, + "loss": 0.90339375, + "num_input_tokens_seen": 29422675, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.3503418, + "step": 1381, + "time_per_iteration": 2.961040496826172 + }, + { + "auxiliary_loss_clip": 0.01780845, + "auxiliary_loss_mlp": 0.01060262, + "balance_loss_clip": 1.4971261, + "balance_loss_mlp": 1.02423692, + "epoch": 0.08309033518713363, + "flos": 13670596523520.0, + "grad_norm": 2.363480393063436, + "language_loss": 0.9038586, + "learning_rate": 3.970506888268011e-06, + "loss": 0.93226969, + "num_input_tokens_seen": 29439840, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.35986328, + "step": 1382, + "time_per_iteration": 2.8522253036499023 + }, + { + "auxiliary_loss_clip": 0.01776369, + "auxiliary_loss_mlp": 0.01061513, + "balance_loss_clip": 1.49793625, + "balance_loss_mlp": 1.02794385, + "epoch": 0.0831504584398016, + "flos": 17977770293760.0, + "grad_norm": 1.8115347387199894, + "language_loss": 0.78539979, + "learning_rate": 3.970440213512121e-06, + "loss": 0.81377864, + "num_input_tokens_seen": 29457360, + "router_z_loss_clip": 2.78320312, + "router_z_loss_mlp": 0.33569336, + "step": 1383, + "time_per_iteration": 2.922457695007324 + }, + { + "auxiliary_loss_clip": 0.01775112, + "auxiliary_loss_mlp": 0.01067079, + "balance_loss_clip": 1.49554121, + "balance_loss_mlp": 1.03336668, + "epoch": 0.08321058169246956, + "flos": 22611585409920.0, + "grad_norm": 1.8434127222020462, + "language_loss": 0.84890991, + "learning_rate": 3.97037346403694e-06, + "loss": 0.87733179, + "num_input_tokens_seen": 29477040, + "router_z_loss_clip": 2.80078125, + "router_z_loss_mlp": 0.3371582, + "step": 1384, + "time_per_iteration": 2.91709041595459 + }, + { + "auxiliary_loss_clip": 0.01799467, + "auxiliary_loss_mlp": 0.01069352, + "balance_loss_clip": 1.50863647, + "balance_loss_mlp": 1.03375578, + "epoch": 0.08327070494513754, + "flos": 22859039076480.0, + "grad_norm": 2.4367860614338164, + "language_loss": 0.86898071, + "learning_rate": 3.970306639845e-06, + "loss": 0.89766884, + "num_input_tokens_seen": 29492010, + "router_z_loss_clip": 2.90625, + "router_z_loss_mlp": 0.35620117, + "step": 1385, + "time_per_iteration": 2.870175361633301 + }, + { + "auxiliary_loss_clip": 0.01771334, + "auxiliary_loss_mlp": 0.01067914, + "balance_loss_clip": 1.48844039, + "balance_loss_mlp": 1.02979064, + "epoch": 0.0833308281978055, + "flos": 22793017633920.0, + "grad_norm": 1.8684560632085678, + "language_loss": 0.71067047, + "learning_rate": 3.970239740938835e-06, + "loss": 0.73906296, + "num_input_tokens_seen": 29511850, + "router_z_loss_clip": 2.828125, + "router_z_loss_mlp": 0.38134766, + "step": 1386, + "time_per_iteration": 2.8969640731811523 + }, + { + "auxiliary_loss_clip": 0.01749167, + "auxiliary_loss_mlp": 0.01069108, + "balance_loss_clip": 1.47469711, + "balance_loss_mlp": 1.03320265, + "epoch": 0.08339095145047347, + "flos": 20822084582400.0, + "grad_norm": 1.539133203196264, + "language_loss": 0.83382404, + "learning_rate": 3.97017276732098e-06, + "loss": 0.86200678, + "num_input_tokens_seen": 29531415, + "router_z_loss_clip": 2.74414062, + "router_z_loss_mlp": 0.35888672, + "step": 1387, + "time_per_iteration": 2.85795521736145 + }, + { + "auxiliary_loss_clip": 0.017588, + "auxiliary_loss_mlp": 0.01074243, + "balance_loss_clip": 1.47759008, + "balance_loss_mlp": 1.03714466, + "epoch": 0.08345107470314143, + "flos": 18524555429760.0, + "grad_norm": 1.8857309010351542, + "language_loss": 0.79107267, + "learning_rate": 3.970105718993978e-06, + "loss": 0.81940311, + "num_input_tokens_seen": 29549525, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.37109375, + "step": 1388, + "time_per_iteration": 2.8621129989624023 + }, + { + "auxiliary_loss_clip": 0.01734578, + "auxiliary_loss_mlp": 0.01078869, + "balance_loss_clip": 1.46533728, + "balance_loss_mlp": 1.04165232, + "epoch": 0.08351119795580941, + "flos": 18816964444800.0, + "grad_norm": 2.4525114574714437, + "language_loss": 0.81249988, + "learning_rate": 3.970038595960369e-06, + "loss": 0.84063441, + "num_input_tokens_seen": 29568705, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.37231445, + "step": 1389, + "time_per_iteration": 2.837688684463501 + }, + { + "auxiliary_loss_clip": 0.01774585, + "auxiliary_loss_mlp": 0.01070846, + "balance_loss_clip": 1.49297249, + "balance_loss_mlp": 1.03520286, + "epoch": 0.08357132120847738, + "flos": 18450887616000.0, + "grad_norm": 5.384402542377428, + "language_loss": 0.89111555, + "learning_rate": 3.969971398222699e-06, + "loss": 0.91956991, + "num_input_tokens_seen": 29585855, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.35620117, + "step": 1390, + "time_per_iteration": 2.844578742980957 + }, + { + "auxiliary_loss_clip": 0.01749576, + "auxiliary_loss_mlp": 0.01076883, + "balance_loss_clip": 1.47308815, + "balance_loss_mlp": 1.03928483, + "epoch": 0.08363144446114534, + "flos": 25933586745600.0, + "grad_norm": 2.347095529463442, + "language_loss": 0.88276225, + "learning_rate": 3.969904125783517e-06, + "loss": 0.9110269, + "num_input_tokens_seen": 29607280, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 0.3762207, + "step": 1391, + "time_per_iteration": 2.9085569381713867 + }, + { + "auxiliary_loss_clip": 0.01767857, + "auxiliary_loss_mlp": 0.01075147, + "balance_loss_clip": 1.47996712, + "balance_loss_mlp": 1.0375011, + "epoch": 0.08369156771381332, + "flos": 18050487966720.0, + "grad_norm": 1.9889020763210228, + "language_loss": 0.89444786, + "learning_rate": 3.969836778645371e-06, + "loss": 0.92287785, + "num_input_tokens_seen": 29624130, + "router_z_loss_clip": 2.875, + "router_z_loss_mlp": 0.37646484, + "step": 1392, + "time_per_iteration": 2.8006601333618164 + }, + { + "auxiliary_loss_clip": 0.01743371, + "auxiliary_loss_mlp": 0.01078581, + "balance_loss_clip": 1.46625674, + "balance_loss_mlp": 1.04336667, + "epoch": 0.08375169096648129, + "flos": 22685524692480.0, + "grad_norm": 2.55870208267229, + "language_loss": 0.82534647, + "learning_rate": 3.969769356810819e-06, + "loss": 0.85356599, + "num_input_tokens_seen": 29643210, + "router_z_loss_clip": 2.77148438, + "router_z_loss_mlp": 0.35229492, + "step": 1393, + "time_per_iteration": 2.906816005706787 + }, + { + "auxiliary_loss_clip": 0.01727602, + "auxiliary_loss_mlp": 0.01081254, + "balance_loss_clip": 1.46002972, + "balance_loss_mlp": 1.04544353, + "epoch": 0.08381181421914925, + "flos": 26114114073600.0, + "grad_norm": 1.730721656081755, + "language_loss": 0.86437201, + "learning_rate": 3.969701860282415e-06, + "loss": 0.89246058, + "num_input_tokens_seen": 29663920, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.3581543, + "step": 1394, + "time_per_iteration": 2.8907511234283447 + }, + { + "auxiliary_loss_clip": 0.01735164, + "auxiliary_loss_mlp": 0.01079019, + "balance_loss_clip": 1.45972848, + "balance_loss_mlp": 1.04397166, + "epoch": 0.08387193747181723, + "flos": 20639114035200.0, + "grad_norm": 1.7258958866692262, + "language_loss": 0.84163547, + "learning_rate": 3.969634289062719e-06, + "loss": 0.86977726, + "num_input_tokens_seen": 29683825, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.35058594, + "step": 1395, + "time_per_iteration": 2.9512531757354736 + }, + { + "auxiliary_loss_clip": 0.01732564, + "auxiliary_loss_mlp": 0.01076692, + "balance_loss_clip": 1.45518935, + "balance_loss_mlp": 1.0392369, + "epoch": 0.0839320607244852, + "flos": 13450950426240.0, + "grad_norm": 3.2617413051056214, + "language_loss": 0.84993708, + "learning_rate": 3.969566643154293e-06, + "loss": 0.87802964, + "num_input_tokens_seen": 29698775, + "router_z_loss_clip": 2.7734375, + "router_z_loss_mlp": 0.375, + "step": 1396, + "time_per_iteration": 2.8676838874816895 + }, + { + "auxiliary_loss_clip": 0.01720197, + "auxiliary_loss_mlp": 0.01070898, + "balance_loss_clip": 1.45195341, + "balance_loss_mlp": 1.03279829, + "epoch": 0.08399218397715316, + "flos": 23487726579840.0, + "grad_norm": 1.7894836092889945, + "language_loss": 0.78699738, + "learning_rate": 3.969498922559703e-06, + "loss": 0.81490833, + "num_input_tokens_seen": 29719430, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.38085938, + "step": 1397, + "time_per_iteration": 2.9442577362060547 + }, + { + "auxiliary_loss_clip": 0.01732432, + "auxiliary_loss_mlp": 0.01068522, + "balance_loss_clip": 1.45987952, + "balance_loss_mlp": 1.03216338, + "epoch": 0.08405230722982113, + "flos": 25931550729600.0, + "grad_norm": 1.7699328362967883, + "language_loss": 0.79700577, + "learning_rate": 3.969431127281516e-06, + "loss": 0.82501525, + "num_input_tokens_seen": 29739685, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.36352539, + "step": 1398, + "time_per_iteration": 2.939739227294922 + }, + { + "auxiliary_loss_clip": 0.01710663, + "auxiliary_loss_mlp": 0.01069924, + "balance_loss_clip": 1.44544983, + "balance_loss_mlp": 1.03256428, + "epoch": 0.0841124304824891, + "flos": 17976367704960.0, + "grad_norm": 2.601761082044779, + "language_loss": 0.96254277, + "learning_rate": 3.969363257322304e-06, + "loss": 0.99034858, + "num_input_tokens_seen": 29756165, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.3737793, + "step": 1399, + "time_per_iteration": 2.8060302734375 + }, + { + "auxiliary_loss_clip": 0.01719427, + "auxiliary_loss_mlp": 0.01058339, + "balance_loss_clip": 1.43792915, + "balance_loss_mlp": 1.02150297, + "epoch": 0.08417255373515707, + "flos": 25640634792960.0, + "grad_norm": 2.4764195377250817, + "language_loss": 0.83375692, + "learning_rate": 3.96929531268464e-06, + "loss": 0.8615346, + "num_input_tokens_seen": 29776425, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.36816406, + "step": 1400, + "time_per_iteration": 2.9495186805725098 + }, + { + "auxiliary_loss_clip": 0.01705879, + "auxiliary_loss_mlp": 0.01062138, + "balance_loss_clip": 1.43512976, + "balance_loss_mlp": 1.02596998, + "epoch": 0.08423267698782504, + "flos": 26260816273920.0, + "grad_norm": 1.6750564853355931, + "language_loss": 0.88053596, + "learning_rate": 3.969227293371099e-06, + "loss": 0.90821612, + "num_input_tokens_seen": 29796440, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.36157227, + "step": 1401, + "time_per_iteration": 2.911494016647339 + }, + { + "auxiliary_loss_clip": 0.0170274, + "auxiliary_loss_mlp": 0.01065486, + "balance_loss_clip": 1.43108153, + "balance_loss_mlp": 1.02750587, + "epoch": 0.08429280024049302, + "flos": 20129094938880.0, + "grad_norm": 1.7529957645445644, + "language_loss": 0.8862474, + "learning_rate": 3.969159199384263e-06, + "loss": 0.9139297, + "num_input_tokens_seen": 29814755, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.37988281, + "step": 1402, + "time_per_iteration": 2.9010045528411865 + }, + { + "auxiliary_loss_clip": 0.0167554, + "auxiliary_loss_mlp": 0.01060664, + "balance_loss_clip": 1.41144276, + "balance_loss_mlp": 1.02428102, + "epoch": 0.08435292349316098, + "flos": 42940147265280.0, + "grad_norm": 2.0878641968795275, + "language_loss": 0.89897627, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.92633832, + "num_input_tokens_seen": 29834785, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.36352539, + "step": 1403, + "time_per_iteration": 3.0818636417388916 + }, + { + "auxiliary_loss_clip": 0.01702916, + "auxiliary_loss_mlp": 0.01062048, + "balance_loss_clip": 1.42794204, + "balance_loss_mlp": 1.02397251, + "epoch": 0.08441304674582895, + "flos": 22867680833280.0, + "grad_norm": 2.5976180259762147, + "language_loss": 0.81951451, + "learning_rate": 3.969022787401033e-06, + "loss": 0.84716409, + "num_input_tokens_seen": 29854695, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 0.38085938, + "step": 1404, + "time_per_iteration": 2.874556064605713 + }, + { + "auxiliary_loss_clip": 0.01714956, + "auxiliary_loss_mlp": 0.01065933, + "balance_loss_clip": 1.43481994, + "balance_loss_mlp": 1.0280478, + "epoch": 0.08447316999849692, + "flos": 18706711570560.0, + "grad_norm": 1.8771345486199051, + "language_loss": 0.85080957, + "learning_rate": 3.968954469409811e-06, + "loss": 0.87861848, + "num_input_tokens_seen": 29872180, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.37915039, + "step": 1405, + "time_per_iteration": 2.836319923400879 + }, + { + "auxiliary_loss_clip": 0.01693822, + "auxiliary_loss_mlp": 0.01059742, + "balance_loss_clip": 1.42305446, + "balance_loss_mlp": 1.02083218, + "epoch": 0.08453329325116489, + "flos": 25494973223040.0, + "grad_norm": 1.7005052167643189, + "language_loss": 0.811131, + "learning_rate": 3.968886076755639e-06, + "loss": 0.83866668, + "num_input_tokens_seen": 29893205, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.38916016, + "step": 1406, + "time_per_iteration": 2.8876311779022217 + }, + { + "auxiliary_loss_clip": 0.01706433, + "auxiliary_loss_mlp": 0.01071811, + "balance_loss_clip": 1.43202448, + "balance_loss_mlp": 1.03278136, + "epoch": 0.08459341650383286, + "flos": 20929034586240.0, + "grad_norm": 1.9097070286358913, + "language_loss": 0.80653954, + "learning_rate": 3.96881760944111e-06, + "loss": 0.83432198, + "num_input_tokens_seen": 29911970, + "router_z_loss_clip": 2.74609375, + "router_z_loss_mlp": 0.39038086, + "step": 1407, + "time_per_iteration": 2.892956256866455 + }, + { + "auxiliary_loss_clip": 0.0170535, + "auxiliary_loss_mlp": 0.0107212, + "balance_loss_clip": 1.43234146, + "balance_loss_mlp": 1.03330588, + "epoch": 0.08465353975650082, + "flos": 13051048469760.0, + "grad_norm": 2.0263555832229065, + "language_loss": 0.92362094, + "learning_rate": 3.968749067468819e-06, + "loss": 0.95139563, + "num_input_tokens_seen": 29929925, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.38842773, + "step": 1408, + "time_per_iteration": 4.275665521621704 + }, + { + "auxiliary_loss_clip": 0.01451762, + "auxiliary_loss_mlp": 0.01085111, + "balance_loss_clip": 1.29224741, + "balance_loss_mlp": 1.05707347, + "epoch": 0.0847136630091688, + "flos": 60907664972160.0, + "grad_norm": 0.9089391392015953, + "language_loss": 0.61906993, + "learning_rate": 3.968680450841368e-06, + "loss": 0.64443862, + "num_input_tokens_seen": 29985950, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.28125, + "step": 1409, + "time_per_iteration": 3.3845105171203613 + }, + { + "auxiliary_loss_clip": 0.01674573, + "auxiliary_loss_mlp": 0.01068142, + "balance_loss_clip": 1.41418517, + "balance_loss_mlp": 1.03006649, + "epoch": 0.08477378626183676, + "flos": 22054620193920.0, + "grad_norm": 1.7096289229701715, + "language_loss": 0.88204515, + "learning_rate": 3.968611759561355e-06, + "loss": 0.90947223, + "num_input_tokens_seen": 30004330, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.38085938, + "step": 1410, + "time_per_iteration": 2.8872504234313965 + }, + { + "auxiliary_loss_clip": 0.01714445, + "auxiliary_loss_mlp": 0.0107726, + "balance_loss_clip": 1.44080353, + "balance_loss_mlp": 1.03646648, + "epoch": 0.08483390951450473, + "flos": 16698560031360.0, + "grad_norm": 2.255376832042188, + "language_loss": 0.75654721, + "learning_rate": 3.968542993631388e-06, + "loss": 0.78446418, + "num_input_tokens_seen": 30022555, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.40771484, + "step": 1411, + "time_per_iteration": 2.8400719165802 + }, + { + "auxiliary_loss_clip": 0.01444405, + "auxiliary_loss_mlp": 0.01089337, + "balance_loss_clip": 1.28138685, + "balance_loss_mlp": 1.06301582, + "epoch": 0.08489403276717271, + "flos": 51615349061760.0, + "grad_norm": 1.0261755756644533, + "language_loss": 0.56826866, + "learning_rate": 3.968474153054073e-06, + "loss": 0.59360611, + "num_input_tokens_seen": 30077220, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.26367188, + "step": 1412, + "time_per_iteration": 4.642136812210083 + }, + { + "auxiliary_loss_clip": 0.01705887, + "auxiliary_loss_mlp": 0.01076071, + "balance_loss_clip": 1.43474674, + "balance_loss_mlp": 1.03542089, + "epoch": 0.08495415601984067, + "flos": 17101176675840.0, + "grad_norm": 2.0336288613133005, + "language_loss": 0.91326344, + "learning_rate": 3.96840523783202e-06, + "loss": 0.94108301, + "num_input_tokens_seen": 30094600, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.40649414, + "step": 1413, + "time_per_iteration": 2.8639779090881348 + }, + { + "auxiliary_loss_clip": 0.01695898, + "auxiliary_loss_mlp": 0.01063387, + "balance_loss_clip": 1.4273752, + "balance_loss_mlp": 1.02547848, + "epoch": 0.08501427927250864, + "flos": 23158687259520.0, + "grad_norm": 1.710128651009867, + "language_loss": 0.89320958, + "learning_rate": 3.968336247967844e-06, + "loss": 0.92080241, + "num_input_tokens_seen": 30114475, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.37890625, + "step": 1414, + "time_per_iteration": 4.273478031158447 + }, + { + "auxiliary_loss_clip": 0.0170861, + "auxiliary_loss_mlp": 0.01065077, + "balance_loss_clip": 1.43735564, + "balance_loss_mlp": 1.02814555, + "epoch": 0.08507440252517662, + "flos": 19073059868160.0, + "grad_norm": 1.614113913989463, + "language_loss": 0.78804165, + "learning_rate": 3.96826718346416e-06, + "loss": 0.81577849, + "num_input_tokens_seen": 30133350, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.36938477, + "step": 1415, + "time_per_iteration": 4.234161615371704 + }, + { + "auxiliary_loss_clip": 0.01699921, + "auxiliary_loss_mlp": 0.01064141, + "balance_loss_clip": 1.43137252, + "balance_loss_mlp": 1.02599382, + "epoch": 0.08513452577784458, + "flos": 60202396005120.0, + "grad_norm": 1.838000390275932, + "language_loss": 0.72164333, + "learning_rate": 3.968198044323587e-06, + "loss": 0.74928391, + "num_input_tokens_seen": 30159005, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.38134766, + "step": 1416, + "time_per_iteration": 3.2811455726623535 + }, + { + "auxiliary_loss_clip": 0.0171938, + "auxiliary_loss_mlp": 0.01063482, + "balance_loss_clip": 1.43907213, + "balance_loss_mlp": 1.0254786, + "epoch": 0.08519464903051255, + "flos": 27320018480640.0, + "grad_norm": 2.0694159304260125, + "language_loss": 0.76324677, + "learning_rate": 3.968128830548748e-06, + "loss": 0.79107541, + "num_input_tokens_seen": 30179450, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.37988281, + "step": 1417, + "time_per_iteration": 2.9654271602630615 + }, + { + "auxiliary_loss_clip": 0.01698695, + "auxiliary_loss_mlp": 0.0105679, + "balance_loss_clip": 1.42641354, + "balance_loss_mlp": 1.01850009, + "epoch": 0.08525477228318051, + "flos": 20276068608000.0, + "grad_norm": 2.7134467086714196, + "language_loss": 0.83940411, + "learning_rate": 3.968059542142265e-06, + "loss": 0.86695892, + "num_input_tokens_seen": 30197235, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.38305664, + "step": 1418, + "time_per_iteration": 3.080857992172241 + }, + { + "auxiliary_loss_clip": 0.01428271, + "auxiliary_loss_mlp": 0.01059974, + "balance_loss_clip": 1.26694417, + "balance_loss_mlp": 1.03670406, + "epoch": 0.08531489553584849, + "flos": 67643822597760.0, + "grad_norm": 0.8750690451173324, + "language_loss": 0.56656063, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.59144306, + "num_input_tokens_seen": 30257410, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.23242188, + "step": 1419, + "time_per_iteration": 3.3670828342437744 + }, + { + "auxiliary_loss_clip": 0.01689507, + "auxiliary_loss_mlp": 0.01062137, + "balance_loss_clip": 1.41599929, + "balance_loss_mlp": 1.02675605, + "epoch": 0.08537501878851646, + "flos": 27538442968320.0, + "grad_norm": 2.3764680350747738, + "language_loss": 0.71989983, + "learning_rate": 3.967920741444886e-06, + "loss": 0.74741632, + "num_input_tokens_seen": 30277865, + "router_z_loss_clip": 2.734375, + "router_z_loss_mlp": 0.35375977, + "step": 1420, + "time_per_iteration": 3.0056228637695312 + }, + { + "auxiliary_loss_clip": 0.01692804, + "auxiliary_loss_mlp": 0.01059803, + "balance_loss_clip": 1.42318439, + "balance_loss_mlp": 1.02146566, + "epoch": 0.08543514204118442, + "flos": 22794601201920.0, + "grad_norm": 2.4690451007342156, + "language_loss": 0.89526373, + "learning_rate": 3.967851229159252e-06, + "loss": 0.92278981, + "num_input_tokens_seen": 30298545, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.38354492, + "step": 1421, + "time_per_iteration": 3.0284202098846436 + }, + { + "auxiliary_loss_clip": 0.01431016, + "auxiliary_loss_mlp": 0.01054971, + "balance_loss_clip": 1.2702651, + "balance_loss_mlp": 1.02101982, + "epoch": 0.0854952652938524, + "flos": 61021989878400.0, + "grad_norm": 0.8036311751223029, + "language_loss": 0.635903, + "learning_rate": 3.967781642252502e-06, + "loss": 0.66076291, + "num_input_tokens_seen": 30361725, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.33984375, + "step": 1422, + "time_per_iteration": 3.3928236961364746 + }, + { + "auxiliary_loss_clip": 0.01684407, + "auxiliary_loss_mlp": 0.01066852, + "balance_loss_clip": 1.41631794, + "balance_loss_mlp": 1.02827597, + "epoch": 0.08555538854652037, + "flos": 28049728919040.0, + "grad_norm": 1.8219653336092325, + "language_loss": 0.84232134, + "learning_rate": 3.967711980727276e-06, + "loss": 0.86983395, + "num_input_tokens_seen": 30382180, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.38574219, + "step": 1423, + "time_per_iteration": 3.0131216049194336 + }, + { + "auxiliary_loss_clip": 0.01696909, + "auxiliary_loss_mlp": 0.01066573, + "balance_loss_clip": 1.42668986, + "balance_loss_mlp": 1.03128731, + "epoch": 0.08561551179918833, + "flos": 23518610795520.0, + "grad_norm": 1.7632183728226252, + "language_loss": 0.75618935, + "learning_rate": 3.967642244586213e-06, + "loss": 0.78382409, + "num_input_tokens_seen": 30402980, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.35302734, + "step": 1424, + "time_per_iteration": 2.926640748977661 + }, + { + "auxiliary_loss_clip": 0.01689533, + "auxiliary_loss_mlp": 0.01060066, + "balance_loss_clip": 1.41794598, + "balance_loss_mlp": 1.02308726, + "epoch": 0.08567563505185631, + "flos": 17934805716480.0, + "grad_norm": 1.9034944689676325, + "language_loss": 0.77651429, + "learning_rate": 3.96757243383196e-06, + "loss": 0.80401027, + "num_input_tokens_seen": 30420800, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.36938477, + "step": 1425, + "time_per_iteration": 2.899348735809326 + }, + { + "auxiliary_loss_clip": 0.01692776, + "auxiliary_loss_mlp": 0.01065587, + "balance_loss_clip": 1.42230022, + "balance_loss_mlp": 1.02634311, + "epoch": 0.08573575830452428, + "flos": 19729102492800.0, + "grad_norm": 1.943892824896567, + "language_loss": 0.94842446, + "learning_rate": 3.9675025484671624e-06, + "loss": 0.97600806, + "num_input_tokens_seen": 30439620, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.39233398, + "step": 1426, + "time_per_iteration": 2.9380276203155518 + }, + { + "auxiliary_loss_clip": 0.01704396, + "auxiliary_loss_mlp": 0.01062891, + "balance_loss_clip": 1.42579079, + "balance_loss_mlp": 1.02457714, + "epoch": 0.08579588155719224, + "flos": 17940642295680.0, + "grad_norm": 2.1700824985926315, + "language_loss": 0.78074801, + "learning_rate": 3.967432588494471e-06, + "loss": 0.80842084, + "num_input_tokens_seen": 30457300, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 0.38354492, + "step": 1427, + "time_per_iteration": 2.9521608352661133 + }, + { + "auxiliary_loss_clip": 0.01696848, + "auxiliary_loss_mlp": 0.0106418, + "balance_loss_clip": 1.42563224, + "balance_loss_mlp": 1.02810717, + "epoch": 0.08585600480986022, + "flos": 16041657755520.0, + "grad_norm": 2.312776955225836, + "language_loss": 0.83301485, + "learning_rate": 3.96736255391654e-06, + "loss": 0.86062503, + "num_input_tokens_seen": 30471580, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.36083984, + "step": 1428, + "time_per_iteration": 2.981518507003784 + }, + { + "auxiliary_loss_clip": 0.01704344, + "auxiliary_loss_mlp": 0.0106686, + "balance_loss_clip": 1.42512119, + "balance_loss_mlp": 1.03133535, + "epoch": 0.08591612806252819, + "flos": 28669050748800.0, + "grad_norm": 1.838207722422203, + "language_loss": 0.82079577, + "learning_rate": 3.967292444736023e-06, + "loss": 0.84850776, + "num_input_tokens_seen": 30492720, + "router_z_loss_clip": 2.79296875, + "router_z_loss_mlp": 0.35498047, + "step": 1429, + "time_per_iteration": 2.9386069774627686 + }, + { + "auxiliary_loss_clip": 0.01702808, + "auxiliary_loss_mlp": 0.01069937, + "balance_loss_clip": 1.43006253, + "balance_loss_mlp": 1.03274357, + "epoch": 0.08597625131519615, + "flos": 20968786782720.0, + "grad_norm": 2.388530811603044, + "language_loss": 0.88910341, + "learning_rate": 3.967222260955578e-06, + "loss": 0.9168309, + "num_input_tokens_seen": 30509535, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.37207031, + "step": 1430, + "time_per_iteration": 2.874907970428467 + }, + { + "auxiliary_loss_clip": 0.01681406, + "auxiliary_loss_mlp": 0.01065273, + "balance_loss_clip": 1.41620493, + "balance_loss_mlp": 1.0326817, + "epoch": 0.08603637456786412, + "flos": 23266270690560.0, + "grad_norm": 1.5623032415037705, + "language_loss": 0.82888043, + "learning_rate": 3.96715200257787e-06, + "loss": 0.8563472, + "num_input_tokens_seen": 30529490, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.32568359, + "step": 1431, + "time_per_iteration": 2.861708164215088 + }, + { + "auxiliary_loss_clip": 0.01700497, + "auxiliary_loss_mlp": 0.01081992, + "balance_loss_clip": 1.42888129, + "balance_loss_mlp": 1.04611015, + "epoch": 0.0860964978205321, + "flos": 28705862033280.0, + "grad_norm": 2.435225587410007, + "language_loss": 0.7889396, + "learning_rate": 3.967081669605559e-06, + "loss": 0.81676447, + "num_input_tokens_seen": 30550205, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.35888672, + "step": 1432, + "time_per_iteration": 2.9523355960845947 + }, + { + "auxiliary_loss_clip": 0.0169521, + "auxiliary_loss_mlp": 0.01074067, + "balance_loss_clip": 1.42399538, + "balance_loss_mlp": 1.03792238, + "epoch": 0.08615662107320006, + "flos": 19327933681920.0, + "grad_norm": 1.8134356160512286, + "language_loss": 0.75193352, + "learning_rate": 3.967011262041315e-06, + "loss": 0.77962625, + "num_input_tokens_seen": 30568830, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.36132812, + "step": 1433, + "time_per_iteration": 3.0045876502990723 + }, + { + "auxiliary_loss_clip": 0.01713174, + "auxiliary_loss_mlp": 0.01069437, + "balance_loss_clip": 1.43466425, + "balance_loss_mlp": 1.03355479, + "epoch": 0.08621674432586802, + "flos": 15860180286720.0, + "grad_norm": 2.816150325384874, + "language_loss": 0.87046254, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.89828873, + "num_input_tokens_seen": 30585730, + "router_z_loss_clip": 2.7890625, + "router_z_loss_mlp": 0.35913086, + "step": 1434, + "time_per_iteration": 2.818168878555298 + }, + { + "auxiliary_loss_clip": 0.01694877, + "auxiliary_loss_mlp": 0.01076575, + "balance_loss_clip": 1.42270827, + "balance_loss_mlp": 1.04243362, + "epoch": 0.086276867578536, + "flos": 14108757598080.0, + "grad_norm": 2.0602233215660486, + "language_loss": 0.80409151, + "learning_rate": 3.966870223147707e-06, + "loss": 0.83180606, + "num_input_tokens_seen": 30603180, + "router_z_loss_clip": 2.72265625, + "router_z_loss_mlp": 0.34130859, + "step": 1435, + "time_per_iteration": 2.8941714763641357 + }, + { + "auxiliary_loss_clip": 0.01445746, + "auxiliary_loss_mlp": 0.01028285, + "balance_loss_clip": 1.28814626, + "balance_loss_mlp": 1.00978422, + "epoch": 0.08633699083120397, + "flos": 70218440760960.0, + "grad_norm": 0.8988382165235212, + "language_loss": 0.58007669, + "learning_rate": 3.96679959182369e-06, + "loss": 0.60481703, + "num_input_tokens_seen": 30668895, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18457031, + "step": 1436, + "time_per_iteration": 3.4676103591918945 + }, + { + "auxiliary_loss_clip": 0.01705789, + "auxiliary_loss_mlp": 0.01074657, + "balance_loss_clip": 1.43048823, + "balance_loss_mlp": 1.03913283, + "epoch": 0.08639711408387193, + "flos": 30310808745600.0, + "grad_norm": 2.597100309015104, + "language_loss": 0.7176888, + "learning_rate": 3.966728885918437e-06, + "loss": 0.74549323, + "num_input_tokens_seen": 30688955, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.35522461, + "step": 1437, + "time_per_iteration": 3.0359976291656494 + }, + { + "auxiliary_loss_clip": 0.01696379, + "auxiliary_loss_mlp": 0.01075258, + "balance_loss_clip": 1.42335057, + "balance_loss_mlp": 1.04102135, + "epoch": 0.08645723733653991, + "flos": 20305957438080.0, + "grad_norm": 2.006618996803624, + "language_loss": 0.7437768, + "learning_rate": 3.966658105434627e-06, + "loss": 0.7714932, + "num_input_tokens_seen": 30706095, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.34228516, + "step": 1438, + "time_per_iteration": 2.883685827255249 + }, + { + "auxiliary_loss_clip": 0.01686215, + "auxiliary_loss_mlp": 0.01072271, + "balance_loss_clip": 1.41788769, + "balance_loss_mlp": 1.03769994, + "epoch": 0.08651736058920788, + "flos": 32903506846080.0, + "grad_norm": 1.6065783654927681, + "language_loss": 0.6571846, + "learning_rate": 3.966587250374945e-06, + "loss": 0.68476951, + "num_input_tokens_seen": 30729025, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.34570312, + "step": 1439, + "time_per_iteration": 3.0110933780670166 + }, + { + "auxiliary_loss_clip": 0.01690549, + "auxiliary_loss_mlp": 0.01077003, + "balance_loss_clip": 1.42228723, + "balance_loss_mlp": 1.04226589, + "epoch": 0.08657748384187584, + "flos": 22647446553600.0, + "grad_norm": 1.9724974421113723, + "language_loss": 0.89311683, + "learning_rate": 3.966516320742077e-06, + "loss": 0.92079234, + "num_input_tokens_seen": 30746155, + "router_z_loss_clip": 2.68554688, + "router_z_loss_mlp": 0.34741211, + "step": 1440, + "time_per_iteration": 2.935922384262085 + }, + { + "auxiliary_loss_clip": 0.01728377, + "auxiliary_loss_mlp": 0.01079676, + "balance_loss_clip": 1.44433522, + "balance_loss_mlp": 1.04427075, + "epoch": 0.08663760709454381, + "flos": 23668570621440.0, + "grad_norm": 2.428064854464057, + "language_loss": 0.85249579, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.88057625, + "num_input_tokens_seen": 30761410, + "router_z_loss_clip": 2.84179688, + "router_z_loss_mlp": 0.35375977, + "step": 1441, + "time_per_iteration": 2.883333206176758 + }, + { + "auxiliary_loss_clip": 0.01469998, + "auxiliary_loss_mlp": 0.01019698, + "balance_loss_clip": 1.31034112, + "balance_loss_mlp": 0.99738199, + "epoch": 0.08669773034721179, + "flos": 62713345455360.0, + "grad_norm": 0.8521165574315908, + "language_loss": 0.60621727, + "learning_rate": 3.966374237767545e-06, + "loss": 0.63111424, + "num_input_tokens_seen": 30823010, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.22363281, + "step": 1442, + "time_per_iteration": 3.5239264965057373 + }, + { + "auxiliary_loss_clip": 0.01709005, + "auxiliary_loss_mlp": 0.01084022, + "balance_loss_clip": 1.42988181, + "balance_loss_mlp": 1.04895055, + "epoch": 0.08675785359987975, + "flos": 20677146929280.0, + "grad_norm": 1.9882040729084187, + "language_loss": 0.81546283, + "learning_rate": 3.96630308443127e-06, + "loss": 0.84339309, + "num_input_tokens_seen": 30841980, + "router_z_loss_clip": 2.79101562, + "router_z_loss_mlp": 0.35058594, + "step": 1443, + "time_per_iteration": 4.283212661743164 + }, + { + "auxiliary_loss_clip": 0.01707026, + "auxiliary_loss_mlp": 0.01079233, + "balance_loss_clip": 1.43125248, + "balance_loss_mlp": 1.04413795, + "epoch": 0.08681797685254772, + "flos": 26951905635840.0, + "grad_norm": 1.5589618569217094, + "language_loss": 0.8396157, + "learning_rate": 3.966231856532584e-06, + "loss": 0.86747831, + "num_input_tokens_seen": 30863280, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.35107422, + "step": 1444, + "time_per_iteration": 2.954648971557617 + }, + { + "auxiliary_loss_clip": 0.01723422, + "auxiliary_loss_mlp": 0.01080446, + "balance_loss_clip": 1.44141483, + "balance_loss_mlp": 1.04656696, + "epoch": 0.0868781001052157, + "flos": 17721901094400.0, + "grad_norm": 2.004602500040005, + "language_loss": 0.88611609, + "learning_rate": 3.966160554074189e-06, + "loss": 0.91415477, + "num_input_tokens_seen": 30881710, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.33862305, + "step": 1445, + "time_per_iteration": 2.815340280532837 + }, + { + "auxiliary_loss_clip": 0.0170191, + "auxiliary_loss_mlp": 0.01081163, + "balance_loss_clip": 1.42954755, + "balance_loss_mlp": 1.0482372, + "epoch": 0.08693822335788366, + "flos": 19904743382400.0, + "grad_norm": 1.8989968519371574, + "language_loss": 0.83322692, + "learning_rate": 3.96608917705879e-06, + "loss": 0.8610577, + "num_input_tokens_seen": 30900225, + "router_z_loss_clip": 2.72070312, + "router_z_loss_mlp": 0.3293457, + "step": 1446, + "time_per_iteration": 2.8846256732940674 + }, + { + "auxiliary_loss_clip": 0.01470507, + "auxiliary_loss_mlp": 0.01034691, + "balance_loss_clip": 1.30922985, + "balance_loss_mlp": 1.01409173, + "epoch": 0.08699834661055163, + "flos": 67050453300480.0, + "grad_norm": 0.7325457547512711, + "language_loss": 0.54894996, + "learning_rate": 3.966017725489091e-06, + "loss": 0.57400191, + "num_input_tokens_seen": 30959580, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.20605469, + "step": 1447, + "time_per_iteration": 4.8357110023498535 + }, + { + "auxiliary_loss_clip": 0.01669095, + "auxiliary_loss_mlp": 0.01074486, + "balance_loss_clip": 1.40504169, + "balance_loss_mlp": 1.04079723, + "epoch": 0.0870584698632196, + "flos": 13488485627520.0, + "grad_norm": 2.174017014775898, + "language_loss": 0.86280084, + "learning_rate": 3.965946199367804e-06, + "loss": 0.89023668, + "num_input_tokens_seen": 30976775, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.33666992, + "step": 1448, + "time_per_iteration": 2.8145554065704346 + }, + { + "auxiliary_loss_clip": 0.01719913, + "auxiliary_loss_mlp": 0.01077923, + "balance_loss_clip": 1.4381783, + "balance_loss_mlp": 1.04330468, + "epoch": 0.08711859311588757, + "flos": 16115551793280.0, + "grad_norm": 2.644616822273634, + "language_loss": 0.82496297, + "learning_rate": 3.965874598697638e-06, + "loss": 0.85294127, + "num_input_tokens_seen": 30990495, + "router_z_loss_clip": 2.8203125, + "router_z_loss_mlp": 0.34594727, + "step": 1449, + "time_per_iteration": 2.8412206172943115 + }, + { + "auxiliary_loss_clip": 0.01693473, + "auxiliary_loss_mlp": 0.01077764, + "balance_loss_clip": 1.42366874, + "balance_loss_mlp": 1.0444572, + "epoch": 0.08717871636855554, + "flos": 38486588008320.0, + "grad_norm": 1.7273493153539616, + "language_loss": 0.72537041, + "learning_rate": 3.965802923481313e-06, + "loss": 0.75308269, + "num_input_tokens_seen": 31014080, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.33300781, + "step": 1450, + "time_per_iteration": 5.849255084991455 + }, + { + "auxiliary_loss_clip": 0.01687652, + "auxiliary_loss_mlp": 0.01079067, + "balance_loss_clip": 1.41450989, + "balance_loss_mlp": 1.04237473, + "epoch": 0.0872388396212235, + "flos": 17608888287360.0, + "grad_norm": 1.9194261179324634, + "language_loss": 0.84253937, + "learning_rate": 3.965731173721542e-06, + "loss": 0.87020653, + "num_input_tokens_seen": 31031210, + "router_z_loss_clip": 2.73242188, + "router_z_loss_mlp": 0.3671875, + "step": 1451, + "time_per_iteration": 2.875861406326294 + }, + { + "auxiliary_loss_clip": 0.01679399, + "auxiliary_loss_mlp": 0.01095182, + "balance_loss_clip": 1.41394186, + "balance_loss_mlp": 1.05956221, + "epoch": 0.08729896287389148, + "flos": 25268857119360.0, + "grad_norm": 1.7061062845768897, + "language_loss": 0.75552243, + "learning_rate": 3.965659349421049e-06, + "loss": 0.78326821, + "num_input_tokens_seen": 31049710, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.35620117, + "step": 1452, + "time_per_iteration": 2.885282516479492 + }, + { + "auxiliary_loss_clip": 0.01701231, + "auxiliary_loss_mlp": 0.01079702, + "balance_loss_clip": 1.42550969, + "balance_loss_mlp": 1.04439187, + "epoch": 0.08735908612655945, + "flos": 15640805658240.0, + "grad_norm": 2.998545555636555, + "language_loss": 0.83116233, + "learning_rate": 3.965587450582556e-06, + "loss": 0.85897171, + "num_input_tokens_seen": 31066160, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.35302734, + "step": 1453, + "time_per_iteration": 2.867849588394165 + }, + { + "auxiliary_loss_clip": 0.0167399, + "auxiliary_loss_mlp": 0.01090298, + "balance_loss_clip": 1.40713286, + "balance_loss_mlp": 1.05372465, + "epoch": 0.08741920937922741, + "flos": 20349374463360.0, + "grad_norm": 1.9683129212356871, + "language_loss": 0.73230976, + "learning_rate": 3.96551547720879e-06, + "loss": 0.75995266, + "num_input_tokens_seen": 31085270, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.36572266, + "step": 1454, + "time_per_iteration": 2.8371164798736572 + }, + { + "auxiliary_loss_clip": 0.01473424, + "auxiliary_loss_mlp": 0.01046824, + "balance_loss_clip": 1.31099701, + "balance_loss_mlp": 1.02450836, + "epoch": 0.08747933263189539, + "flos": 62852781225600.0, + "grad_norm": 0.7776465392612809, + "language_loss": 0.58726376, + "learning_rate": 3.96544342930248e-06, + "loss": 0.61246622, + "num_input_tokens_seen": 31148445, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.22363281, + "step": 1455, + "time_per_iteration": 3.3931353092193604 + }, + { + "auxiliary_loss_clip": 0.01685266, + "auxiliary_loss_mlp": 0.01104446, + "balance_loss_clip": 1.41374278, + "balance_loss_mlp": 1.06827807, + "epoch": 0.08753945588456336, + "flos": 33048670723200.0, + "grad_norm": 1.7511202281446623, + "language_loss": 0.79132116, + "learning_rate": 3.965371306866359e-06, + "loss": 0.81921828, + "num_input_tokens_seen": 31168770, + "router_z_loss_clip": 2.71484375, + "router_z_loss_mlp": 0.36157227, + "step": 1456, + "time_per_iteration": 2.9769954681396484 + }, + { + "auxiliary_loss_clip": 0.01694042, + "auxiliary_loss_mlp": 0.0109419, + "balance_loss_clip": 1.42124677, + "balance_loss_mlp": 1.05978656, + "epoch": 0.08759957913723132, + "flos": 35559014008320.0, + "grad_norm": 1.9199576013135244, + "language_loss": 0.73496616, + "learning_rate": 3.96529910990316e-06, + "loss": 0.7628485, + "num_input_tokens_seen": 31189270, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.34423828, + "step": 1457, + "time_per_iteration": 2.9944584369659424 + }, + { + "auxiliary_loss_clip": 0.01671141, + "auxiliary_loss_mlp": 0.01084023, + "balance_loss_clip": 1.40630615, + "balance_loss_mlp": 1.04821217, + "epoch": 0.0876597023898993, + "flos": 23920684502400.0, + "grad_norm": 2.1024040208512083, + "language_loss": 0.87908292, + "learning_rate": 3.965226838415622e-06, + "loss": 0.90663457, + "num_input_tokens_seen": 31210385, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.35839844, + "step": 1458, + "time_per_iteration": 2.94559907913208 + }, + { + "auxiliary_loss_clip": 0.01694166, + "auxiliary_loss_mlp": 0.01088486, + "balance_loss_clip": 1.42233491, + "balance_loss_mlp": 1.05341446, + "epoch": 0.08771982564256726, + "flos": 18123160394880.0, + "grad_norm": 1.6049704396692936, + "language_loss": 0.81521058, + "learning_rate": 3.965154492406486e-06, + "loss": 0.84303707, + "num_input_tokens_seen": 31229745, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.35083008, + "step": 1459, + "time_per_iteration": 2.846565008163452 + }, + { + "auxiliary_loss_clip": 0.01694209, + "auxiliary_loss_mlp": 0.0109419, + "balance_loss_clip": 1.41897845, + "balance_loss_mlp": 1.05711603, + "epoch": 0.08777994889523523, + "flos": 17720860464000.0, + "grad_norm": 2.120039522535448, + "language_loss": 0.85368615, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.88157016, + "num_input_tokens_seen": 31248280, + "router_z_loss_clip": 2.74804688, + "router_z_loss_mlp": 0.37060547, + "step": 1460, + "time_per_iteration": 3.07643461227417 + }, + { + "auxiliary_loss_clip": 0.01693482, + "auxiliary_loss_mlp": 0.01091236, + "balance_loss_clip": 1.4209913, + "balance_loss_mlp": 1.05406654, + "epoch": 0.0878400721479032, + "flos": 12826742158080.0, + "grad_norm": 3.6312260092922144, + "language_loss": 0.82830882, + "learning_rate": 3.965009576834394e-06, + "loss": 0.85615599, + "num_input_tokens_seen": 31262190, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.37182617, + "step": 1461, + "time_per_iteration": 2.918724536895752 + }, + { + "auxiliary_loss_clip": 0.01691393, + "auxiliary_loss_mlp": 0.0108847, + "balance_loss_clip": 1.42056823, + "balance_loss_mlp": 1.05006099, + "epoch": 0.08790019540057117, + "flos": 26403039239040.0, + "grad_norm": 2.4215007321019977, + "language_loss": 0.77273512, + "learning_rate": 3.964937007276932e-06, + "loss": 0.80053377, + "num_input_tokens_seen": 31283690, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.38427734, + "step": 1462, + "time_per_iteration": 3.0084195137023926 + }, + { + "auxiliary_loss_clip": 0.01712235, + "auxiliary_loss_mlp": 0.01092823, + "balance_loss_clip": 1.43169188, + "balance_loss_mlp": 1.05572486, + "epoch": 0.08796031865323914, + "flos": 19143017608320.0, + "grad_norm": 2.1269486431456897, + "language_loss": 0.76087791, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.78892845, + "num_input_tokens_seen": 31302505, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.37084961, + "step": 1463, + "time_per_iteration": 2.9102320671081543 + }, + { + "auxiliary_loss_clip": 0.01705008, + "auxiliary_loss_mlp": 0.01082479, + "balance_loss_clip": 1.42527044, + "balance_loss_mlp": 1.04359317, + "epoch": 0.0880204419059071, + "flos": 26074542856320.0, + "grad_norm": 1.7738267130447192, + "language_loss": 0.84438133, + "learning_rate": 3.964791644632941e-06, + "loss": 0.87225622, + "num_input_tokens_seen": 31323070, + "router_z_loss_clip": 2.79882812, + "router_z_loss_mlp": 0.38891602, + "step": 1464, + "time_per_iteration": 2.892742395401001 + }, + { + "auxiliary_loss_clip": 0.01694478, + "auxiliary_loss_mlp": 0.01079096, + "balance_loss_clip": 1.41953063, + "balance_loss_mlp": 1.04056787, + "epoch": 0.08808056515857508, + "flos": 22386781405440.0, + "grad_norm": 1.8455244796768437, + "language_loss": 0.79969823, + "learning_rate": 3.964718851551923e-06, + "loss": 0.82743394, + "num_input_tokens_seen": 31341880, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.38476562, + "step": 1465, + "time_per_iteration": 2.916489839553833 + }, + { + "auxiliary_loss_clip": 0.01707106, + "auxiliary_loss_mlp": 0.01088937, + "balance_loss_clip": 1.43181133, + "balance_loss_mlp": 1.04897857, + "epoch": 0.08814068841124305, + "flos": 23195679523200.0, + "grad_norm": 2.180820854670081, + "language_loss": 0.86923927, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.89719969, + "num_input_tokens_seen": 31361995, + "router_z_loss_clip": 2.75195312, + "router_z_loss_mlp": 0.39941406, + "step": 1466, + "time_per_iteration": 2.8658385276794434 + }, + { + "auxiliary_loss_clip": 0.01692679, + "auxiliary_loss_mlp": 0.01070206, + "balance_loss_clip": 1.41867268, + "balance_loss_mlp": 1.0314635, + "epoch": 0.08820081166391101, + "flos": 25166114881920.0, + "grad_norm": 2.552061330735026, + "language_loss": 0.85616136, + "learning_rate": 3.964573041885641e-06, + "loss": 0.88379014, + "num_input_tokens_seen": 31381515, + "router_z_loss_clip": 2.74023438, + "router_z_loss_mlp": 0.38720703, + "step": 1467, + "time_per_iteration": 2.935750722885132 + }, + { + "auxiliary_loss_clip": 0.01687484, + "auxiliary_loss_mlp": 0.01085464, + "balance_loss_clip": 1.41671693, + "balance_loss_mlp": 1.0474124, + "epoch": 0.08826093491657899, + "flos": 22240531653120.0, + "grad_norm": 1.6075343445833274, + "language_loss": 0.76493466, + "learning_rate": 3.964500025305907e-06, + "loss": 0.79266417, + "num_input_tokens_seen": 31400345, + "router_z_loss_clip": 2.70703125, + "router_z_loss_mlp": 0.38037109, + "step": 1468, + "time_per_iteration": 2.882694721221924 + }, + { + "auxiliary_loss_clip": 0.01688703, + "auxiliary_loss_mlp": 0.01069531, + "balance_loss_clip": 1.42107892, + "balance_loss_mlp": 1.03004849, + "epoch": 0.08832105816924696, + "flos": 22136839274880.0, + "grad_norm": 1.7240328404565577, + "language_loss": 0.81352949, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.84111184, + "num_input_tokens_seen": 31419620, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.39501953, + "step": 1469, + "time_per_iteration": 2.873887300491333 + }, + { + "auxiliary_loss_clip": 0.01684523, + "auxiliary_loss_mlp": 0.01073465, + "balance_loss_clip": 1.41281271, + "balance_loss_mlp": 1.03415012, + "epoch": 0.08838118142191492, + "flos": 17575425118080.0, + "grad_norm": 1.9588436549928667, + "language_loss": 0.79851991, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.82609981, + "num_input_tokens_seen": 31437970, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.39331055, + "step": 1470, + "time_per_iteration": 2.901948928833008 + }, + { + "auxiliary_loss_clip": 0.0168469, + "auxiliary_loss_mlp": 0.01063463, + "balance_loss_clip": 1.41662407, + "balance_loss_mlp": 1.0249579, + "epoch": 0.0884413046745829, + "flos": 20786947355520.0, + "grad_norm": 1.7079328903955453, + "language_loss": 0.85606629, + "learning_rate": 3.964280528613569e-06, + "loss": 0.88354778, + "num_input_tokens_seen": 31457040, + "router_z_loss_clip": 2.68164062, + "router_z_loss_mlp": 0.38476562, + "step": 1471, + "time_per_iteration": 2.882233142852783 + }, + { + "auxiliary_loss_clip": 0.01648562, + "auxiliary_loss_mlp": 0.01074955, + "balance_loss_clip": 1.39171231, + "balance_loss_mlp": 1.03392363, + "epoch": 0.08850142792725087, + "flos": 22135119972480.0, + "grad_norm": 1.6477712455437603, + "language_loss": 0.84121907, + "learning_rate": 3.964207214074324e-06, + "loss": 0.86845422, + "num_input_tokens_seen": 31477520, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.41015625, + "step": 1472, + "time_per_iteration": 2.9077906608581543 + }, + { + "auxiliary_loss_clip": 0.01675809, + "auxiliary_loss_mlp": 0.01071689, + "balance_loss_clip": 1.41121078, + "balance_loss_mlp": 1.03151584, + "epoch": 0.08856155117991883, + "flos": 22428841086720.0, + "grad_norm": 2.634937316587977, + "language_loss": 0.85639405, + "learning_rate": 3.964133825052146e-06, + "loss": 0.88386905, + "num_input_tokens_seen": 31495575, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.40161133, + "step": 1473, + "time_per_iteration": 2.8921308517456055 + }, + { + "auxiliary_loss_clip": 0.01671408, + "auxiliary_loss_mlp": 0.0107022, + "balance_loss_clip": 1.40414953, + "balance_loss_mlp": 1.03023684, + "epoch": 0.0886216744325868, + "flos": 29949618355200.0, + "grad_norm": 1.426364472899277, + "language_loss": 0.79852486, + "learning_rate": 3.964060361549816e-06, + "loss": 0.82594109, + "num_input_tokens_seen": 31520020, + "router_z_loss_clip": 2.67578125, + "router_z_loss_mlp": 0.3996582, + "step": 1474, + "time_per_iteration": 2.9917385578155518 + }, + { + "auxiliary_loss_clip": 0.01656355, + "auxiliary_loss_mlp": 0.01065168, + "balance_loss_clip": 1.39996362, + "balance_loss_mlp": 1.0256387, + "epoch": 0.08868179768525478, + "flos": 23992090076160.0, + "grad_norm": 1.682659188555523, + "language_loss": 0.80450338, + "learning_rate": 3.963986823570121e-06, + "loss": 0.83171862, + "num_input_tokens_seen": 31539265, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.39526367, + "step": 1475, + "time_per_iteration": 3.0308282375335693 + }, + { + "auxiliary_loss_clip": 0.01674579, + "auxiliary_loss_mlp": 0.0106288, + "balance_loss_clip": 1.40535843, + "balance_loss_mlp": 1.0216341, + "epoch": 0.08874192093792274, + "flos": 43192623104640.0, + "grad_norm": 1.4762002362913769, + "language_loss": 0.75783765, + "learning_rate": 3.963913211115848e-06, + "loss": 0.78521222, + "num_input_tokens_seen": 31563425, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.41259766, + "step": 1476, + "time_per_iteration": 3.0930070877075195 + }, + { + "auxiliary_loss_clip": 0.01663867, + "auxiliary_loss_mlp": 0.0107097, + "balance_loss_clip": 1.39975309, + "balance_loss_mlp": 1.03146422, + "epoch": 0.0888020441905907, + "flos": 32864931014400.0, + "grad_norm": 1.4055656188485488, + "language_loss": 0.75966769, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.78701603, + "num_input_tokens_seen": 31584525, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.39501953, + "step": 1477, + "time_per_iteration": 3.0115277767181396 + }, + { + "auxiliary_loss_clip": 0.01659556, + "auxiliary_loss_mlp": 0.01060351, + "balance_loss_clip": 1.39239049, + "balance_loss_mlp": 1.01874661, + "epoch": 0.08886216744325869, + "flos": 23159863624320.0, + "grad_norm": 1.7128579526688374, + "language_loss": 0.88039815, + "learning_rate": 3.963765762794739e-06, + "loss": 0.90759724, + "num_input_tokens_seen": 31603325, + "router_z_loss_clip": 2.67382812, + "router_z_loss_mlp": 0.41601562, + "step": 1478, + "time_per_iteration": 4.299095153808594 + }, + { + "auxiliary_loss_clip": 0.0165666, + "auxiliary_loss_mlp": 0.01067768, + "balance_loss_clip": 1.39166903, + "balance_loss_mlp": 1.02892947, + "epoch": 0.08892229069592665, + "flos": 23342110254720.0, + "grad_norm": 1.5493895659949921, + "language_loss": 0.78541017, + "learning_rate": 3.963691926933495e-06, + "loss": 0.81265444, + "num_input_tokens_seen": 31624820, + "router_z_loss_clip": 2.65039062, + "router_z_loss_mlp": 0.38842773, + "step": 1479, + "time_per_iteration": 2.929715156555176 + }, + { + "auxiliary_loss_clip": 0.01656016, + "auxiliary_loss_mlp": 0.01062727, + "balance_loss_clip": 1.39300883, + "balance_loss_mlp": 1.0217905, + "epoch": 0.08898241394859462, + "flos": 26225000375040.0, + "grad_norm": 2.4639212824896823, + "language_loss": 0.79197025, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.81915772, + "num_input_tokens_seen": 31646080, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.40942383, + "step": 1480, + "time_per_iteration": 2.9235153198242188 + }, + { + "auxiliary_loss_clip": 0.01693831, + "auxiliary_loss_mlp": 0.01070554, + "balance_loss_clip": 1.41809785, + "balance_loss_mlp": 1.02868724, + "epoch": 0.0890425372012626, + "flos": 23561213414400.0, + "grad_norm": 10.443373973339584, + "language_loss": 0.68605328, + "learning_rate": 3.963544031823624e-06, + "loss": 0.71369708, + "num_input_tokens_seen": 31665770, + "router_z_loss_clip": 2.75585938, + "router_z_loss_mlp": 0.41870117, + "step": 1481, + "time_per_iteration": 2.9531354904174805 + }, + { + "auxiliary_loss_clip": 0.01658667, + "auxiliary_loss_mlp": 0.01071292, + "balance_loss_clip": 1.39738476, + "balance_loss_mlp": 1.02937758, + "epoch": 0.08910266045393056, + "flos": 23013116179200.0, + "grad_norm": 1.9665640994860454, + "language_loss": 0.97791833, + "learning_rate": 3.9634699725806065e-06, + "loss": 1.00521791, + "num_input_tokens_seen": 31683805, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.41894531, + "step": 1482, + "time_per_iteration": 4.270545482635498 + }, + { + "auxiliary_loss_clip": 0.01676101, + "auxiliary_loss_mlp": 0.01078856, + "balance_loss_clip": 1.40607762, + "balance_loss_mlp": 1.03987479, + "epoch": 0.08916278370659853, + "flos": 31947770793600.0, + "grad_norm": 1.9059355623553798, + "language_loss": 0.79606533, + "learning_rate": 3.96339583888261e-06, + "loss": 0.82361495, + "num_input_tokens_seen": 31704630, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 0.3894043, + "step": 1483, + "time_per_iteration": 3.0013105869293213 + }, + { + "auxiliary_loss_clip": 0.01680897, + "auxiliary_loss_mlp": 0.01077246, + "balance_loss_clip": 1.41390324, + "balance_loss_mlp": 1.03802633, + "epoch": 0.08922290695926649, + "flos": 17539428240000.0, + "grad_norm": 1.9878844214004985, + "language_loss": 0.87401807, + "learning_rate": 3.963321630732448e-06, + "loss": 0.90159953, + "num_input_tokens_seen": 31723255, + "router_z_loss_clip": 2.66796875, + "router_z_loss_mlp": 0.3918457, + "step": 1484, + "time_per_iteration": 2.878570318222046 + }, + { + "auxiliary_loss_clip": 0.01682102, + "auxiliary_loss_mlp": 0.01074906, + "balance_loss_clip": 1.4093945, + "balance_loss_mlp": 1.03423142, + "epoch": 0.08928303021193447, + "flos": 32137980508800.0, + "grad_norm": 1.9650926117134675, + "language_loss": 0.81299567, + "learning_rate": 3.963247348132932e-06, + "loss": 0.84056574, + "num_input_tokens_seen": 31747045, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.40649414, + "step": 1485, + "time_per_iteration": 4.396559000015259 + }, + { + "auxiliary_loss_clip": 0.01660476, + "auxiliary_loss_mlp": 0.01069478, + "balance_loss_clip": 1.39631033, + "balance_loss_mlp": 1.02997208, + "epoch": 0.08934315346460243, + "flos": 22134803258880.0, + "grad_norm": 1.595647567002402, + "language_loss": 0.84274656, + "learning_rate": 3.96317299108688e-06, + "loss": 0.87004602, + "num_input_tokens_seen": 31766615, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.39501953, + "step": 1486, + "time_per_iteration": 4.358829498291016 + }, + { + "auxiliary_loss_clip": 0.01652827, + "auxiliary_loss_mlp": 0.01074429, + "balance_loss_clip": 1.39119887, + "balance_loss_mlp": 1.03358781, + "epoch": 0.0894032767172704, + "flos": 22575950490240.0, + "grad_norm": 2.244217157801428, + "language_loss": 0.78194451, + "learning_rate": 3.963098559597111e-06, + "loss": 0.8092171, + "num_input_tokens_seen": 31785855, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.40844727, + "step": 1487, + "time_per_iteration": 2.9381394386291504 + }, + { + "auxiliary_loss_clip": 0.01635197, + "auxiliary_loss_mlp": 0.01057849, + "balance_loss_clip": 1.37701392, + "balance_loss_mlp": 1.01884341, + "epoch": 0.08946339996993838, + "flos": 20202988976640.0, + "grad_norm": 2.436894735809548, + "language_loss": 0.8501749, + "learning_rate": 3.963024053666449e-06, + "loss": 0.87710536, + "num_input_tokens_seen": 31804210, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.39013672, + "step": 1488, + "time_per_iteration": 2.884742498397827 + }, + { + "auxiliary_loss_clip": 0.01644925, + "auxiliary_loss_mlp": 0.0105712, + "balance_loss_clip": 1.38624668, + "balance_loss_mlp": 1.01995039, + "epoch": 0.08952352322260634, + "flos": 48377023920000.0, + "grad_norm": 2.3289671024855068, + "language_loss": 0.74075156, + "learning_rate": 3.962949473297718e-06, + "loss": 0.76777196, + "num_input_tokens_seen": 31826150, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.37158203, + "step": 1489, + "time_per_iteration": 3.1257054805755615 + }, + { + "auxiliary_loss_clip": 0.01633045, + "auxiliary_loss_mlp": 0.01062526, + "balance_loss_clip": 1.37348235, + "balance_loss_mlp": 1.02361655, + "epoch": 0.08958364647527431, + "flos": 31804959646080.0, + "grad_norm": 2.674069010172623, + "language_loss": 0.9119972, + "learning_rate": 3.962874818493745e-06, + "loss": 0.93895292, + "num_input_tokens_seen": 31848060, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.38891602, + "step": 1490, + "time_per_iteration": 3.0113754272460938 + }, + { + "auxiliary_loss_clip": 0.01655651, + "auxiliary_loss_mlp": 0.01065339, + "balance_loss_clip": 1.3933835, + "balance_loss_mlp": 1.02788305, + "epoch": 0.08964376972794229, + "flos": 23378740560000.0, + "grad_norm": 2.4490251042082325, + "language_loss": 0.7669338, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.79414368, + "num_input_tokens_seen": 31870040, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.37426758, + "step": 1491, + "time_per_iteration": 2.9008941650390625 + }, + { + "auxiliary_loss_clip": 0.01647906, + "auxiliary_loss_mlp": 0.01066715, + "balance_loss_clip": 1.38630414, + "balance_loss_mlp": 1.02995062, + "epoch": 0.08970389298061025, + "flos": 23305208480640.0, + "grad_norm": 1.6066820056404474, + "language_loss": 0.78682804, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.81397426, + "num_input_tokens_seen": 31890400, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.36791992, + "step": 1492, + "time_per_iteration": 2.936413526535034 + }, + { + "auxiliary_loss_clip": 0.01638439, + "auxiliary_loss_mlp": 0.01081234, + "balance_loss_clip": 1.37827098, + "balance_loss_mlp": 1.03166699, + "epoch": 0.08976401623327822, + "flos": 33773675702400.0, + "grad_norm": 1.8486330335418228, + "language_loss": 0.72137928, + "learning_rate": 3.962650407498707e-06, + "loss": 0.74857593, + "num_input_tokens_seen": 31913435, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.49560547, + "step": 1493, + "time_per_iteration": 3.029247999191284 + }, + { + "auxiliary_loss_clip": 0.01652156, + "auxiliary_loss_mlp": 0.01066631, + "balance_loss_clip": 1.38856578, + "balance_loss_mlp": 1.03027236, + "epoch": 0.08982413948594618, + "flos": 23920955971200.0, + "grad_norm": 2.0390692176178558, + "language_loss": 0.88118517, + "learning_rate": 3.962575454982109e-06, + "loss": 0.90837306, + "num_input_tokens_seen": 31932435, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.36376953, + "step": 1494, + "time_per_iteration": 2.9170124530792236 + }, + { + "auxiliary_loss_clip": 0.01632643, + "auxiliary_loss_mlp": 0.0106423, + "balance_loss_clip": 1.37449408, + "balance_loss_mlp": 1.02868187, + "epoch": 0.08988426273861416, + "flos": 16846483841280.0, + "grad_norm": 1.8717642691748706, + "language_loss": 0.84031439, + "learning_rate": 3.962500428044454e-06, + "loss": 0.86728311, + "num_input_tokens_seen": 31950125, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.35546875, + "step": 1495, + "time_per_iteration": 2.90978741645813 + }, + { + "auxiliary_loss_clip": 0.01669186, + "auxiliary_loss_mlp": 0.01069894, + "balance_loss_clip": 1.40159094, + "balance_loss_mlp": 1.0360384, + "epoch": 0.08994438599128213, + "flos": 14801521017600.0, + "grad_norm": 3.393541058830151, + "language_loss": 0.72491241, + "learning_rate": 3.962425326688585e-06, + "loss": 0.75230318, + "num_input_tokens_seen": 31968050, + "router_z_loss_clip": 2.67773438, + "router_z_loss_mlp": 0.33862305, + "step": 1496, + "time_per_iteration": 2.8464479446411133 + }, + { + "auxiliary_loss_clip": 0.01644599, + "auxiliary_loss_mlp": 0.01070496, + "balance_loss_clip": 1.38588929, + "balance_loss_mlp": 1.03325534, + "epoch": 0.09000450924395009, + "flos": 17393268977280.0, + "grad_norm": 1.5496575400627126, + "language_loss": 0.81716537, + "learning_rate": 3.962350150917351e-06, + "loss": 0.8443163, + "num_input_tokens_seen": 31985675, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.37231445, + "step": 1497, + "time_per_iteration": 2.8943357467651367 + }, + { + "auxiliary_loss_clip": 0.01658293, + "auxiliary_loss_mlp": 0.01066662, + "balance_loss_clip": 1.39332759, + "balance_loss_mlp": 1.03070867, + "epoch": 0.09006463249661807, + "flos": 24291104832000.0, + "grad_norm": 3.3952271450621967, + "language_loss": 0.84967268, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.87692219, + "num_input_tokens_seen": 32005180, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.35961914, + "step": 1498, + "time_per_iteration": 2.891442060470581 + }, + { + "auxiliary_loss_clip": 0.01665443, + "auxiliary_loss_mlp": 0.01067398, + "balance_loss_clip": 1.39854503, + "balance_loss_mlp": 1.03189778, + "epoch": 0.09012475574928604, + "flos": 13669691627520.0, + "grad_norm": 2.015433740759597, + "language_loss": 0.8095361, + "learning_rate": 3.962199576140195e-06, + "loss": 0.83686447, + "num_input_tokens_seen": 32022970, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.35498047, + "step": 1499, + "time_per_iteration": 2.927785873413086 + }, + { + "auxiliary_loss_clip": 0.01644552, + "auxiliary_loss_mlp": 0.01070456, + "balance_loss_clip": 1.38787007, + "balance_loss_mlp": 1.03500283, + "epoch": 0.090184879001954, + "flos": 23337269061120.0, + "grad_norm": 1.5462937322027968, + "language_loss": 0.93682665, + "learning_rate": 3.962124177139981e-06, + "loss": 0.96397674, + "num_input_tokens_seen": 32043055, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.35473633, + "step": 1500, + "time_per_iteration": 2.947924852371216 + }, + { + "auxiliary_loss_clip": 0.01671828, + "auxiliary_loss_mlp": 0.01072001, + "balance_loss_clip": 1.40261638, + "balance_loss_mlp": 1.03468895, + "epoch": 0.09024500225462198, + "flos": 23012980444800.0, + "grad_norm": 2.5774026193694057, + "language_loss": 0.76314932, + "learning_rate": 3.962048703735822e-06, + "loss": 0.79058754, + "num_input_tokens_seen": 32061900, + "router_z_loss_clip": 2.69140625, + "router_z_loss_mlp": 0.37329102, + "step": 1501, + "time_per_iteration": 2.8911335468292236 + }, + { + "auxiliary_loss_clip": 0.01476009, + "auxiliary_loss_mlp": 0.01057652, + "balance_loss_clip": 1.31001985, + "balance_loss_mlp": 1.03419137, + "epoch": 0.09030512550728995, + "flos": 62219343018240.0, + "grad_norm": 0.739700095691954, + "language_loss": 0.58482862, + "learning_rate": 3.96197315593058e-06, + "loss": 0.61016524, + "num_input_tokens_seen": 32122745, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.234375, + "step": 1502, + "time_per_iteration": 3.403590679168701 + }, + { + "auxiliary_loss_clip": 0.01647799, + "auxiliary_loss_mlp": 0.0106847, + "balance_loss_clip": 1.38411653, + "balance_loss_mlp": 1.03196836, + "epoch": 0.09036524875995791, + "flos": 38814993901440.0, + "grad_norm": 2.05866843974999, + "language_loss": 0.72498345, + "learning_rate": 3.961897533727119e-06, + "loss": 0.75214612, + "num_input_tokens_seen": 32145125, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.36499023, + "step": 1503, + "time_per_iteration": 3.0312466621398926 + }, + { + "auxiliary_loss_clip": 0.01665307, + "auxiliary_loss_mlp": 0.01085847, + "balance_loss_clip": 1.39682531, + "balance_loss_mlp": 1.04901123, + "epoch": 0.09042537201262588, + "flos": 21700261768320.0, + "grad_norm": 1.9087514985889942, + "language_loss": 0.87092113, + "learning_rate": 3.961821837128306e-06, + "loss": 0.89843261, + "num_input_tokens_seen": 32166255, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.3684082, + "step": 1504, + "time_per_iteration": 2.9238553047180176 + }, + { + "auxiliary_loss_clip": 0.01670948, + "auxiliary_loss_mlp": 0.01072346, + "balance_loss_clip": 1.39870358, + "balance_loss_mlp": 1.03653526, + "epoch": 0.09048549526529386, + "flos": 22275578390400.0, + "grad_norm": 2.5746394782277813, + "language_loss": 0.74122477, + "learning_rate": 3.961746066137014e-06, + "loss": 0.76865768, + "num_input_tokens_seen": 32184010, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.35791016, + "step": 1505, + "time_per_iteration": 2.953392744064331 + }, + { + "auxiliary_loss_clip": 0.01650124, + "auxiliary_loss_mlp": 0.01073541, + "balance_loss_clip": 1.38625789, + "balance_loss_mlp": 1.03880346, + "epoch": 0.09054561851796182, + "flos": 14619048163200.0, + "grad_norm": 2.112731160702036, + "language_loss": 0.83871639, + "learning_rate": 3.961670220756114e-06, + "loss": 0.86595297, + "num_input_tokens_seen": 32201635, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.34741211, + "step": 1506, + "time_per_iteration": 2.8467507362365723 + }, + { + "auxiliary_loss_clip": 0.01639445, + "auxiliary_loss_mlp": 0.01070387, + "balance_loss_clip": 1.38040805, + "balance_loss_mlp": 1.03493464, + "epoch": 0.09060574177062979, + "flos": 27647609967360.0, + "grad_norm": 1.6353697396254228, + "language_loss": 0.77677554, + "learning_rate": 3.961594300988482e-06, + "loss": 0.8038739, + "num_input_tokens_seen": 32221940, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.35449219, + "step": 1507, + "time_per_iteration": 2.971989154815674 + }, + { + "auxiliary_loss_clip": 0.01457405, + "auxiliary_loss_mlp": 0.01026021, + "balance_loss_clip": 1.29464793, + "balance_loss_mlp": 1.00027204, + "epoch": 0.09066586502329776, + "flos": 66115032163200.0, + "grad_norm": 0.8046106051236679, + "language_loss": 0.57799071, + "learning_rate": 3.961518306836998e-06, + "loss": 0.60282499, + "num_input_tokens_seen": 32276495, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.2578125, + "step": 1508, + "time_per_iteration": 3.2084362506866455 + }, + { + "auxiliary_loss_clip": 0.01655207, + "auxiliary_loss_mlp": 0.0106778, + "balance_loss_clip": 1.39106107, + "balance_loss_mlp": 1.03235114, + "epoch": 0.09072598827596573, + "flos": 18925724240640.0, + "grad_norm": 1.7674853944206754, + "language_loss": 0.86488581, + "learning_rate": 3.961442238304543e-06, + "loss": 0.89211559, + "num_input_tokens_seen": 32294130, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.35400391, + "step": 1509, + "time_per_iteration": 2.8927559852600098 + }, + { + "auxiliary_loss_clip": 0.01673208, + "auxiliary_loss_mlp": 0.01094452, + "balance_loss_clip": 1.39916849, + "balance_loss_mlp": 1.05637646, + "epoch": 0.0907861115286337, + "flos": 24831148492800.0, + "grad_norm": 4.48337319120008, + "language_loss": 0.86248207, + "learning_rate": 3.961366095394002e-06, + "loss": 0.89015865, + "num_input_tokens_seen": 32313555, + "router_z_loss_clip": 2.73828125, + "router_z_loss_mlp": 0.38085938, + "step": 1510, + "time_per_iteration": 2.947505235671997 + }, + { + "auxiliary_loss_clip": 0.01662225, + "auxiliary_loss_mlp": 0.01074659, + "balance_loss_clip": 1.39516759, + "balance_loss_mlp": 1.03889644, + "epoch": 0.09084623478130167, + "flos": 21662817056640.0, + "grad_norm": 2.026678293487158, + "language_loss": 0.88278031, + "learning_rate": 3.961289878108262e-06, + "loss": 0.91014922, + "num_input_tokens_seen": 32331430, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.35791016, + "step": 1511, + "time_per_iteration": 2.859786033630371 + }, + { + "auxiliary_loss_clip": 0.01645444, + "auxiliary_loss_mlp": 0.01069385, + "balance_loss_clip": 1.38609385, + "balance_loss_mlp": 1.03583968, + "epoch": 0.09090635803396964, + "flos": 27651501020160.0, + "grad_norm": 2.0124371127310314, + "language_loss": 0.85914028, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.88628858, + "num_input_tokens_seen": 32353705, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.33520508, + "step": 1512, + "time_per_iteration": 2.994142770767212 + }, + { + "auxiliary_loss_clip": 0.01658503, + "auxiliary_loss_mlp": 0.01073684, + "balance_loss_clip": 1.39797199, + "balance_loss_mlp": 1.03980482, + "epoch": 0.0909664812866376, + "flos": 17676312318720.0, + "grad_norm": 2.1469279417870557, + "language_loss": 0.89224726, + "learning_rate": 3.961137220422749e-06, + "loss": 0.91956913, + "num_input_tokens_seen": 32370520, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.33886719, + "step": 1513, + "time_per_iteration": 4.289369106292725 + }, + { + "auxiliary_loss_clip": 0.0165105, + "auxiliary_loss_mlp": 0.01065944, + "balance_loss_clip": 1.38828921, + "balance_loss_mlp": 1.02722454, + "epoch": 0.09102660453930557, + "flos": 23962020266880.0, + "grad_norm": 1.6461351785800569, + "language_loss": 0.87990695, + "learning_rate": 3.961060780028764e-06, + "loss": 0.90707684, + "num_input_tokens_seen": 32389105, + "router_z_loss_clip": 2.63085938, + "router_z_loss_mlp": 0.38696289, + "step": 1514, + "time_per_iteration": 2.90616512298584 + }, + { + "auxiliary_loss_clip": 0.01636009, + "auxiliary_loss_mlp": 0.01075415, + "balance_loss_clip": 1.37680578, + "balance_loss_mlp": 1.03896117, + "epoch": 0.09108672779197355, + "flos": 25824193522560.0, + "grad_norm": 1.683506148467793, + "language_loss": 0.91180718, + "learning_rate": 3.960984265271159e-06, + "loss": 0.93892145, + "num_input_tokens_seen": 32408065, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.36425781, + "step": 1515, + "time_per_iteration": 2.998619318008423 + }, + { + "auxiliary_loss_clip": 0.01655958, + "auxiliary_loss_mlp": 0.01071523, + "balance_loss_clip": 1.39173818, + "balance_loss_mlp": 1.03745317, + "epoch": 0.09114685104464151, + "flos": 29650558354560.0, + "grad_norm": 1.7634488736913074, + "language_loss": 0.86503738, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.89231217, + "num_input_tokens_seen": 32427225, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.34082031, + "step": 1516, + "time_per_iteration": 2.938854694366455 + }, + { + "auxiliary_loss_clip": 0.0166739, + "auxiliary_loss_mlp": 0.01067818, + "balance_loss_clip": 1.39876509, + "balance_loss_mlp": 1.03296149, + "epoch": 0.09120697429730948, + "flos": 33742746241920.0, + "grad_norm": 1.6036059964623912, + "language_loss": 0.81596935, + "learning_rate": 3.960831012676692e-06, + "loss": 0.8433215, + "num_input_tokens_seen": 32450510, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.34863281, + "step": 1517, + "time_per_iteration": 4.508835077285767 + }, + { + "auxiliary_loss_clip": 0.01663062, + "auxiliary_loss_mlp": 0.01064118, + "balance_loss_clip": 1.39769816, + "balance_loss_mlp": 1.02897525, + "epoch": 0.09126709754997746, + "flos": 18409416117120.0, + "grad_norm": 1.5995405068390827, + "language_loss": 0.78774977, + "learning_rate": 3.960754274845642e-06, + "loss": 0.81502151, + "num_input_tokens_seen": 32468425, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.3515625, + "step": 1518, + "time_per_iteration": 2.919077157974243 + }, + { + "auxiliary_loss_clip": 0.01655945, + "auxiliary_loss_mlp": 0.01056672, + "balance_loss_clip": 1.39179683, + "balance_loss_mlp": 1.02338874, + "epoch": 0.09132722080264542, + "flos": 22102380720000.0, + "grad_norm": 1.8266928893058352, + "language_loss": 0.88356459, + "learning_rate": 3.960677462662594e-06, + "loss": 0.91069078, + "num_input_tokens_seen": 32487510, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.33251953, + "step": 1519, + "time_per_iteration": 2.8690521717071533 + }, + { + "auxiliary_loss_clip": 0.01662544, + "auxiliary_loss_mlp": 0.01063968, + "balance_loss_clip": 1.39687383, + "balance_loss_mlp": 1.02844381, + "epoch": 0.09138734405531339, + "flos": 21043088023680.0, + "grad_norm": 2.250584127886328, + "language_loss": 0.74989736, + "learning_rate": 3.96060057613046e-06, + "loss": 0.77716255, + "num_input_tokens_seen": 32507250, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.35522461, + "step": 1520, + "time_per_iteration": 4.306478977203369 + }, + { + "auxiliary_loss_clip": 0.01666095, + "auxiliary_loss_mlp": 0.01058503, + "balance_loss_clip": 1.39719105, + "balance_loss_mlp": 1.02293086, + "epoch": 0.09144746730798137, + "flos": 20093505264000.0, + "grad_norm": 2.43809225579792, + "language_loss": 0.87891495, + "learning_rate": 3.960523615252156e-06, + "loss": 0.90616095, + "num_input_tokens_seen": 32526045, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.35546875, + "step": 1521, + "time_per_iteration": 4.307372808456421 + }, + { + "auxiliary_loss_clip": 0.01669929, + "auxiliary_loss_mlp": 0.01059273, + "balance_loss_clip": 1.40267062, + "balance_loss_mlp": 1.02589464, + "epoch": 0.09150759056064933, + "flos": 22787000075520.0, + "grad_norm": 1.807534738591785, + "language_loss": 0.85570478, + "learning_rate": 3.960446580030599e-06, + "loss": 0.8829968, + "num_input_tokens_seen": 32546575, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.33374023, + "step": 1522, + "time_per_iteration": 2.9227657318115234 + }, + { + "auxiliary_loss_clip": 0.01639962, + "auxiliary_loss_mlp": 0.01057061, + "balance_loss_clip": 1.3840183, + "balance_loss_mlp": 1.02284765, + "epoch": 0.0915677138133173, + "flos": 27575887680000.0, + "grad_norm": 1.6233727391182309, + "language_loss": 0.82577395, + "learning_rate": 3.960369470468711e-06, + "loss": 0.85274422, + "num_input_tokens_seen": 32568795, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.3425293, + "step": 1523, + "time_per_iteration": 2.915952682495117 + }, + { + "auxiliary_loss_clip": 0.01667642, + "auxiliary_loss_mlp": 0.01055244, + "balance_loss_clip": 1.39966583, + "balance_loss_mlp": 1.02129316, + "epoch": 0.09162783706598528, + "flos": 17683415752320.0, + "grad_norm": 2.0379907384696585, + "language_loss": 0.76017284, + "learning_rate": 3.960292286569418e-06, + "loss": 0.78740174, + "num_input_tokens_seen": 32587010, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.33959961, + "step": 1524, + "time_per_iteration": 2.8934710025787354 + }, + { + "auxiliary_loss_clip": 0.01664751, + "auxiliary_loss_mlp": 0.01058879, + "balance_loss_clip": 1.40006042, + "balance_loss_mlp": 1.02435625, + "epoch": 0.09168796031865324, + "flos": 18487246452480.0, + "grad_norm": 1.8437103088332405, + "language_loss": 0.87414771, + "learning_rate": 3.960215028335644e-06, + "loss": 0.901384, + "num_input_tokens_seen": 32602375, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.3449707, + "step": 1525, + "time_per_iteration": 2.8521816730499268 + }, + { + "auxiliary_loss_clip": 0.01675324, + "auxiliary_loss_mlp": 0.01057867, + "balance_loss_clip": 1.40696514, + "balance_loss_mlp": 1.02148473, + "epoch": 0.0917480835713212, + "flos": 29399530348800.0, + "grad_norm": 2.5446778005448127, + "language_loss": 0.75899804, + "learning_rate": 3.96013769577032e-06, + "loss": 0.78632998, + "num_input_tokens_seen": 32621460, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.36376953, + "step": 1526, + "time_per_iteration": 2.963404417037964 + }, + { + "auxiliary_loss_clip": 0.01656664, + "auxiliary_loss_mlp": 0.01060591, + "balance_loss_clip": 1.39404547, + "balance_loss_mlp": 1.02344561, + "epoch": 0.09180820682398917, + "flos": 19838948163840.0, + "grad_norm": 1.7267260946179757, + "language_loss": 0.78369761, + "learning_rate": 3.960060288876378e-06, + "loss": 0.81087017, + "num_input_tokens_seen": 32640440, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.37158203, + "step": 1527, + "time_per_iteration": 2.906580924987793 + }, + { + "auxiliary_loss_clip": 0.01667804, + "auxiliary_loss_mlp": 0.01054095, + "balance_loss_clip": 1.40304434, + "balance_loss_mlp": 1.01923871, + "epoch": 0.09186833007665715, + "flos": 23852355575040.0, + "grad_norm": 1.9932991185033384, + "language_loss": 0.80954593, + "learning_rate": 3.959982807656753e-06, + "loss": 0.83676493, + "num_input_tokens_seen": 32660020, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.34863281, + "step": 1528, + "time_per_iteration": 2.8752079010009766 + }, + { + "auxiliary_loss_clip": 0.01684812, + "auxiliary_loss_mlp": 0.01052023, + "balance_loss_clip": 1.41251504, + "balance_loss_mlp": 1.01952672, + "epoch": 0.09192845332932512, + "flos": 12940433637120.0, + "grad_norm": 4.031828383907446, + "language_loss": 0.79142755, + "learning_rate": 3.959905252114384e-06, + "loss": 0.81879586, + "num_input_tokens_seen": 32678170, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.32495117, + "step": 1529, + "time_per_iteration": 2.8646273612976074 + }, + { + "auxiliary_loss_clip": 0.01686021, + "auxiliary_loss_mlp": 0.01061238, + "balance_loss_clip": 1.41511226, + "balance_loss_mlp": 1.02719188, + "epoch": 0.09198857658199308, + "flos": 24577993981440.0, + "grad_norm": 1.8772750235243316, + "language_loss": 0.83580929, + "learning_rate": 3.959827622252211e-06, + "loss": 0.86328197, + "num_input_tokens_seen": 32697540, + "router_z_loss_clip": 2.7109375, + "router_z_loss_mlp": 0.34033203, + "step": 1530, + "time_per_iteration": 2.8885927200317383 + }, + { + "auxiliary_loss_clip": 0.01676379, + "auxiliary_loss_mlp": 0.01057787, + "balance_loss_clip": 1.41199255, + "balance_loss_mlp": 1.02335894, + "epoch": 0.09204869983466106, + "flos": 20276702035200.0, + "grad_norm": 1.8271986087380496, + "language_loss": 0.85206437, + "learning_rate": 3.959749918073179e-06, + "loss": 0.87940598, + "num_input_tokens_seen": 32716805, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.34423828, + "step": 1531, + "time_per_iteration": 2.898883104324341 + }, + { + "auxiliary_loss_clip": 0.01674737, + "auxiliary_loss_mlp": 0.01063659, + "balance_loss_clip": 1.40775132, + "balance_loss_mlp": 1.02963638, + "epoch": 0.09210882308732903, + "flos": 20895164213760.0, + "grad_norm": 2.532466387693168, + "language_loss": 0.82364744, + "learning_rate": 3.959672139580233e-06, + "loss": 0.85103136, + "num_input_tokens_seen": 32736385, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.34008789, + "step": 1532, + "time_per_iteration": 2.896074056625366 + }, + { + "auxiliary_loss_clip": 0.0167546, + "auxiliary_loss_mlp": 0.01058557, + "balance_loss_clip": 1.40920353, + "balance_loss_mlp": 1.02300894, + "epoch": 0.09216894633999699, + "flos": 30968615917440.0, + "grad_norm": 1.806584452524252, + "language_loss": 0.84684324, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.87418342, + "num_input_tokens_seen": 32757140, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.35571289, + "step": 1533, + "time_per_iteration": 2.9397881031036377 + }, + { + "auxiliary_loss_clip": 0.01676852, + "auxiliary_loss_mlp": 0.0106575, + "balance_loss_clip": 1.41056919, + "balance_loss_mlp": 1.03396904, + "epoch": 0.09222906959266497, + "flos": 13159310572800.0, + "grad_norm": 3.123647290685528, + "language_loss": 0.91731352, + "learning_rate": 3.959516359664402e-06, + "loss": 0.94473958, + "num_input_tokens_seen": 32774860, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.31762695, + "step": 1534, + "time_per_iteration": 2.9093289375305176 + }, + { + "auxiliary_loss_clip": 0.01689488, + "auxiliary_loss_mlp": 0.01075286, + "balance_loss_clip": 1.42022121, + "balance_loss_mlp": 1.041574, + "epoch": 0.09228919284533293, + "flos": 26005535256960.0, + "grad_norm": 2.1953025632176155, + "language_loss": 0.78038126, + "learning_rate": 3.959438358247424e-06, + "loss": 0.808029, + "num_input_tokens_seen": 32795250, + "router_z_loss_clip": 2.69335938, + "router_z_loss_mlp": 0.33691406, + "step": 1535, + "time_per_iteration": 2.9374282360076904 + }, + { + "auxiliary_loss_clip": 0.01664848, + "auxiliary_loss_mlp": 0.01077637, + "balance_loss_clip": 1.40434861, + "balance_loss_mlp": 1.0433526, + "epoch": 0.0923493160980009, + "flos": 18669945530880.0, + "grad_norm": 2.442655396544495, + "language_loss": 0.83426291, + "learning_rate": 3.959360282528346e-06, + "loss": 0.86168778, + "num_input_tokens_seen": 32813805, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.3425293, + "step": 1536, + "time_per_iteration": 2.895871639251709 + }, + { + "auxiliary_loss_clip": 0.01675834, + "auxiliary_loss_mlp": 0.01070177, + "balance_loss_clip": 1.41351306, + "balance_loss_mlp": 1.03887272, + "epoch": 0.09240943935066886, + "flos": 21150038027520.0, + "grad_norm": 1.8130106426531971, + "language_loss": 0.90937197, + "learning_rate": 3.959282132510131e-06, + "loss": 0.93683207, + "num_input_tokens_seen": 32830960, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.31298828, + "step": 1537, + "time_per_iteration": 2.8687620162963867 + }, + { + "auxiliary_loss_clip": 0.01691744, + "auxiliary_loss_mlp": 0.01079056, + "balance_loss_clip": 1.4236424, + "balance_loss_mlp": 1.04467583, + "epoch": 0.09246956260333684, + "flos": 20601533589120.0, + "grad_norm": 2.3139860813727875, + "language_loss": 0.83089095, + "learning_rate": 3.959203908195741e-06, + "loss": 0.85859901, + "num_input_tokens_seen": 32848275, + "router_z_loss_clip": 2.68359375, + "router_z_loss_mlp": 0.34375, + "step": 1538, + "time_per_iteration": 2.8745205402374268 + }, + { + "auxiliary_loss_clip": 0.01485637, + "auxiliary_loss_mlp": 0.01053187, + "balance_loss_clip": 1.32329369, + "balance_loss_mlp": 1.03125274, + "epoch": 0.09252968585600481, + "flos": 67591827043200.0, + "grad_norm": 0.8730405108189423, + "language_loss": 0.57457119, + "learning_rate": 3.959125609588142e-06, + "loss": 0.59995943, + "num_input_tokens_seen": 32917730, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.21972656, + "step": 1539, + "time_per_iteration": 3.492041826248169 + }, + { + "auxiliary_loss_clip": 0.01689132, + "auxiliary_loss_mlp": 0.01070051, + "balance_loss_clip": 1.419572, + "balance_loss_mlp": 1.03555179, + "epoch": 0.09258980910867277, + "flos": 17392680794880.0, + "grad_norm": 2.4811633319148427, + "language_loss": 0.69509053, + "learning_rate": 3.959047236690304e-06, + "loss": 0.72268236, + "num_input_tokens_seen": 32934910, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.34521484, + "step": 1540, + "time_per_iteration": 2.8701462745666504 + }, + { + "auxiliary_loss_clip": 0.01673523, + "auxiliary_loss_mlp": 0.01064862, + "balance_loss_clip": 1.41102493, + "balance_loss_mlp": 1.03215146, + "epoch": 0.09264993236134075, + "flos": 19875668958720.0, + "grad_norm": 1.6297506297394941, + "language_loss": 0.84615725, + "learning_rate": 3.958968789505198e-06, + "loss": 0.87354112, + "num_input_tokens_seen": 32953840, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.32739258, + "step": 1541, + "time_per_iteration": 2.947115659713745 + }, + { + "auxiliary_loss_clip": 0.01475852, + "auxiliary_loss_mlp": 0.01046972, + "balance_loss_clip": 1.31257975, + "balance_loss_mlp": 1.02284372, + "epoch": 0.09271005561400872, + "flos": 62310864545280.0, + "grad_norm": 0.9311058150022005, + "language_loss": 0.61956948, + "learning_rate": 3.9588902680358e-06, + "loss": 0.64479774, + "num_input_tokens_seen": 33011410, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.24121094, + "step": 1542, + "time_per_iteration": 3.4006619453430176 + }, + { + "auxiliary_loss_clip": 0.01691138, + "auxiliary_loss_mlp": 0.01073376, + "balance_loss_clip": 1.42473745, + "balance_loss_mlp": 1.03725576, + "epoch": 0.09277017886667668, + "flos": 23339938504320.0, + "grad_norm": 1.9644571836147888, + "language_loss": 0.8400231, + "learning_rate": 3.958811672285086e-06, + "loss": 0.86766827, + "num_input_tokens_seen": 33031675, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.36108398, + "step": 1543, + "time_per_iteration": 2.964691162109375 + }, + { + "auxiliary_loss_clip": 0.01663198, + "auxiliary_loss_mlp": 0.01069577, + "balance_loss_clip": 1.40055931, + "balance_loss_mlp": 1.03681827, + "epoch": 0.09283030211934466, + "flos": 54763664313600.0, + "grad_norm": 1.8938736090684354, + "language_loss": 0.74119055, + "learning_rate": 3.958733002256038e-06, + "loss": 0.76851833, + "num_input_tokens_seen": 33056355, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.32788086, + "step": 1544, + "time_per_iteration": 3.303894519805908 + }, + { + "auxiliary_loss_clip": 0.01709023, + "auxiliary_loss_mlp": 0.01080336, + "balance_loss_clip": 1.43616796, + "balance_loss_mlp": 1.04731512, + "epoch": 0.09289042537201263, + "flos": 30346579399680.0, + "grad_norm": 1.697534650509015, + "language_loss": 0.78734583, + "learning_rate": 3.958654257951637e-06, + "loss": 0.81523943, + "num_input_tokens_seen": 33079520, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.33007812, + "step": 1545, + "time_per_iteration": 3.1133921146392822 + }, + { + "auxiliary_loss_clip": 0.01685278, + "auxiliary_loss_mlp": 0.01061896, + "balance_loss_clip": 1.41969967, + "balance_loss_mlp": 1.02911353, + "epoch": 0.09295054862468059, + "flos": 17755273774080.0, + "grad_norm": 2.6820716914623084, + "language_loss": 0.76525199, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.79272377, + "num_input_tokens_seen": 33096135, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.328125, + "step": 1546, + "time_per_iteration": 2.9021050930023193 + }, + { + "auxiliary_loss_clip": 0.01697143, + "auxiliary_loss_mlp": 0.01068672, + "balance_loss_clip": 1.42737544, + "balance_loss_mlp": 1.03581762, + "epoch": 0.09301067187734856, + "flos": 23668118173440.0, + "grad_norm": 1.8074082677916805, + "language_loss": 0.85311288, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.88077104, + "num_input_tokens_seen": 33115245, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.32861328, + "step": 1547, + "time_per_iteration": 4.337569952011108 + }, + { + "auxiliary_loss_clip": 0.01688189, + "auxiliary_loss_mlp": 0.01077346, + "balance_loss_clip": 1.41800451, + "balance_loss_mlp": 1.04461133, + "epoch": 0.09307079513001654, + "flos": 27539393109120.0, + "grad_norm": 2.0606188619544596, + "language_loss": 0.6944418, + "learning_rate": 3.958417579416199e-06, + "loss": 0.72209716, + "num_input_tokens_seen": 33136640, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.32714844, + "step": 1548, + "time_per_iteration": 3.04360294342041 + }, + { + "auxiliary_loss_clip": 0.01701575, + "auxiliary_loss_mlp": 0.01069372, + "balance_loss_clip": 1.42902946, + "balance_loss_mlp": 1.03482497, + "epoch": 0.0931309183826845, + "flos": 20636037388800.0, + "grad_norm": 1.9982047661161284, + "language_loss": 0.85074699, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.87845647, + "num_input_tokens_seen": 33155060, + "router_z_loss_clip": 2.7265625, + "router_z_loss_mlp": 0.34545898, + "step": 1549, + "time_per_iteration": 2.8929972648620605 + }, + { + "auxiliary_loss_clip": 0.01691639, + "auxiliary_loss_mlp": 0.01061956, + "balance_loss_clip": 1.42602694, + "balance_loss_mlp": 1.02886355, + "epoch": 0.09319104163535247, + "flos": 29032277155200.0, + "grad_norm": 1.6455810917990779, + "language_loss": 0.77012366, + "learning_rate": 3.958259422403966e-06, + "loss": 0.79765964, + "num_input_tokens_seen": 33175420, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.33105469, + "step": 1550, + "time_per_iteration": 2.9863178730010986 + }, + { + "auxiliary_loss_clip": 0.01698337, + "auxiliary_loss_mlp": 0.01061705, + "balance_loss_clip": 1.42893028, + "balance_loss_mlp": 1.02801621, + "epoch": 0.09325116488802045, + "flos": 25312500368640.0, + "grad_norm": 2.071664797882892, + "language_loss": 0.84583366, + "learning_rate": 3.95818023251026e-06, + "loss": 0.87343413, + "num_input_tokens_seen": 33194120, + "router_z_loss_clip": 2.6953125, + "router_z_loss_mlp": 0.33666992, + "step": 1551, + "time_per_iteration": 2.893467664718628 + }, + { + "auxiliary_loss_clip": 0.01472754, + "auxiliary_loss_mlp": 0.01020382, + "balance_loss_clip": 1.31332803, + "balance_loss_mlp": 1.00216663, + "epoch": 0.09331128814068841, + "flos": 61567146201600.0, + "grad_norm": 0.7624139530838739, + "language_loss": 0.61921847, + "learning_rate": 3.958100968362163e-06, + "loss": 0.64414984, + "num_input_tokens_seen": 33261080, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18261719, + "step": 1552, + "time_per_iteration": 4.899768829345703 + }, + { + "auxiliary_loss_clip": 0.01462663, + "auxiliary_loss_mlp": 0.01024386, + "balance_loss_clip": 1.30172515, + "balance_loss_mlp": 1.0039773, + "epoch": 0.09337141139335638, + "flos": 53323354725120.0, + "grad_norm": 0.8340954966249549, + "language_loss": 0.59015512, + "learning_rate": 3.958021629962681e-06, + "loss": 0.61502558, + "num_input_tokens_seen": 33330235, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.20410156, + "step": 1553, + "time_per_iteration": 3.505605459213257 + }, + { + "auxiliary_loss_clip": 0.0169823, + "auxiliary_loss_mlp": 0.0106175, + "balance_loss_clip": 1.42470193, + "balance_loss_mlp": 1.02667832, + "epoch": 0.09343153464602436, + "flos": 23487228887040.0, + "grad_norm": 1.857410680123229, + "language_loss": 0.88717604, + "learning_rate": 3.957942217314823e-06, + "loss": 0.91477579, + "num_input_tokens_seen": 33349035, + "router_z_loss_clip": 2.73632812, + "router_z_loss_mlp": 0.35083008, + "step": 1554, + "time_per_iteration": 2.9784204959869385 + }, + { + "auxiliary_loss_clip": 0.01668264, + "auxiliary_loss_mlp": 0.01060271, + "balance_loss_clip": 1.40916312, + "balance_loss_mlp": 1.02694035, + "epoch": 0.09349165789869232, + "flos": 19362663705600.0, + "grad_norm": 3.2075799729611316, + "language_loss": 0.82897961, + "learning_rate": 3.957862730421599e-06, + "loss": 0.85626507, + "num_input_tokens_seen": 33368060, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.33325195, + "step": 1555, + "time_per_iteration": 4.3635475635528564 + }, + { + "auxiliary_loss_clip": 0.01465751, + "auxiliary_loss_mlp": 0.01027111, + "balance_loss_clip": 1.30729675, + "balance_loss_mlp": 1.00908649, + "epoch": 0.09355178115136029, + "flos": 67530583566720.0, + "grad_norm": 0.9039178008312055, + "language_loss": 0.59732431, + "learning_rate": 3.957783169286024e-06, + "loss": 0.62225294, + "num_input_tokens_seen": 33430825, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18066406, + "step": 1556, + "time_per_iteration": 4.861315488815308 + }, + { + "auxiliary_loss_clip": 0.01681581, + "auxiliary_loss_mlp": 0.01065203, + "balance_loss_clip": 1.4190774, + "balance_loss_mlp": 1.03284991, + "epoch": 0.09361190440402825, + "flos": 37355663514240.0, + "grad_norm": 1.610525982407241, + "language_loss": 0.85251623, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.87998402, + "num_input_tokens_seen": 33454855, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.32348633, + "step": 1557, + "time_per_iteration": 3.1293222904205322 + }, + { + "auxiliary_loss_clip": 0.01670151, + "auxiliary_loss_mlp": 0.0106736, + "balance_loss_clip": 1.40632057, + "balance_loss_mlp": 1.03307509, + "epoch": 0.09367202765669623, + "flos": 24910426661760.0, + "grad_norm": 1.5471847831115995, + "language_loss": 0.79123926, + "learning_rate": 3.957623824299893e-06, + "loss": 0.81861436, + "num_input_tokens_seen": 33476000, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.34277344, + "step": 1558, + "time_per_iteration": 2.983290910720825 + }, + { + "auxiliary_loss_clip": 0.01696423, + "auxiliary_loss_mlp": 0.01072729, + "balance_loss_clip": 1.42462504, + "balance_loss_mlp": 1.03832579, + "epoch": 0.0937321509093642, + "flos": 15713975779200.0, + "grad_norm": 1.8205707476344841, + "language_loss": 0.81291062, + "learning_rate": 3.957544040455379e-06, + "loss": 0.84060216, + "num_input_tokens_seen": 33493845, + "router_z_loss_clip": 2.71875, + "router_z_loss_mlp": 0.34423828, + "step": 1559, + "time_per_iteration": 2.8640787601470947 + }, + { + "auxiliary_loss_clip": 0.01679526, + "auxiliary_loss_mlp": 0.01065656, + "balance_loss_clip": 1.4155767, + "balance_loss_mlp": 1.03339767, + "epoch": 0.09379227416203216, + "flos": 20492954772480.0, + "grad_norm": 2.178271087878595, + "language_loss": 0.77273452, + "learning_rate": 3.957464182380599e-06, + "loss": 0.80018634, + "num_input_tokens_seen": 33510850, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.32250977, + "step": 1560, + "time_per_iteration": 2.8942768573760986 + }, + { + "auxiliary_loss_clip": 0.01701621, + "auxiliary_loss_mlp": 0.01076726, + "balance_loss_clip": 1.43133783, + "balance_loss_mlp": 1.04270339, + "epoch": 0.09385239741470014, + "flos": 24363098588160.0, + "grad_norm": 1.6092176782861025, + "language_loss": 0.81653351, + "learning_rate": 3.95738425007858e-06, + "loss": 0.84431696, + "num_input_tokens_seen": 33530430, + "router_z_loss_clip": 2.70507812, + "router_z_loss_mlp": 0.34033203, + "step": 1561, + "time_per_iteration": 2.9103035926818848 + }, + { + "auxiliary_loss_clip": 0.01680573, + "auxiliary_loss_mlp": 0.01066424, + "balance_loss_clip": 1.41299379, + "balance_loss_mlp": 1.03440452, + "epoch": 0.0939125206673681, + "flos": 33304404188160.0, + "grad_norm": 1.9364840289644394, + "language_loss": 0.63734114, + "learning_rate": 3.957304243552354e-06, + "loss": 0.66481113, + "num_input_tokens_seen": 33551975, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.3203125, + "step": 1562, + "time_per_iteration": 2.968560218811035 + }, + { + "auxiliary_loss_clip": 0.01658543, + "auxiliary_loss_mlp": 0.01071725, + "balance_loss_clip": 1.40328753, + "balance_loss_mlp": 1.0400629, + "epoch": 0.09397264392003607, + "flos": 19254311112960.0, + "grad_norm": 1.953984097138558, + "language_loss": 0.86656106, + "learning_rate": 3.957224162804956e-06, + "loss": 0.89386374, + "num_input_tokens_seen": 33569850, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.31616211, + "step": 1563, + "time_per_iteration": 2.9199600219726562 + }, + { + "auxiliary_loss_clip": 0.01661315, + "auxiliary_loss_mlp": 0.01067212, + "balance_loss_clip": 1.40234923, + "balance_loss_mlp": 1.03545451, + "epoch": 0.09403276717270405, + "flos": 19327345499520.0, + "grad_norm": 1.9685165478772086, + "language_loss": 0.77886653, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.80615175, + "num_input_tokens_seen": 33590510, + "router_z_loss_clip": 2.59375, + "router_z_loss_mlp": 0.31738281, + "step": 1564, + "time_per_iteration": 2.9036755561828613 + }, + { + "auxiliary_loss_clip": 0.01661603, + "auxiliary_loss_mlp": 0.01068242, + "balance_loss_clip": 1.40166974, + "balance_loss_mlp": 1.03565001, + "epoch": 0.09409289042537201, + "flos": 23593138260480.0, + "grad_norm": 1.859278118064319, + "language_loss": 0.81393957, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.84123808, + "num_input_tokens_seen": 33608810, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.32592773, + "step": 1565, + "time_per_iteration": 2.9754257202148438 + }, + { + "auxiliary_loss_clip": 0.01665799, + "auxiliary_loss_mlp": 0.01067583, + "balance_loss_clip": 1.40236998, + "balance_loss_mlp": 1.03451395, + "epoch": 0.09415301367803998, + "flos": 20086582809600.0, + "grad_norm": 1.780377154388199, + "language_loss": 0.77078539, + "learning_rate": 3.956983475266103e-06, + "loss": 0.79811919, + "num_input_tokens_seen": 33627265, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.33081055, + "step": 1566, + "time_per_iteration": 2.908154010772705 + }, + { + "auxiliary_loss_clip": 0.01671211, + "auxiliary_loss_mlp": 0.01062458, + "balance_loss_clip": 1.40950513, + "balance_loss_mlp": 1.03174937, + "epoch": 0.09421313693070796, + "flos": 21070081186560.0, + "grad_norm": 1.7857000530051752, + "language_loss": 0.79621357, + "learning_rate": 3.956903097664407e-06, + "loss": 0.82355034, + "num_input_tokens_seen": 33644810, + "router_z_loss_clip": 2.61914062, + "router_z_loss_mlp": 0.30688477, + "step": 1567, + "time_per_iteration": 2.895137310028076 + }, + { + "auxiliary_loss_clip": 0.01669168, + "auxiliary_loss_mlp": 0.01057116, + "balance_loss_clip": 1.40814734, + "balance_loss_mlp": 1.02519202, + "epoch": 0.09427326018337592, + "flos": 24326784996480.0, + "grad_norm": 1.7393687082286562, + "language_loss": 0.83758318, + "learning_rate": 3.956822645856749e-06, + "loss": 0.86484605, + "num_input_tokens_seen": 33665665, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.3190918, + "step": 1568, + "time_per_iteration": 2.899127721786499 + }, + { + "auxiliary_loss_clip": 0.01674193, + "auxiliary_loss_mlp": 0.01052026, + "balance_loss_clip": 1.40871382, + "balance_loss_mlp": 1.02014935, + "epoch": 0.09433338343604389, + "flos": 20272856227200.0, + "grad_norm": 1.835052947558203, + "language_loss": 0.78092921, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.80819142, + "num_input_tokens_seen": 33684760, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.31860352, + "step": 1569, + "time_per_iteration": 2.9549431800842285 + }, + { + "auxiliary_loss_clip": 0.01642971, + "auxiliary_loss_mlp": 0.01050353, + "balance_loss_clip": 1.3887912, + "balance_loss_mlp": 1.01938248, + "epoch": 0.09439350668871185, + "flos": 12748730843520.0, + "grad_norm": 24.49398075370472, + "language_loss": 0.86881441, + "learning_rate": 3.956661519635756e-06, + "loss": 0.89574766, + "num_input_tokens_seen": 33700750, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.30932617, + "step": 1570, + "time_per_iteration": 2.929267168045044 + }, + { + "auxiliary_loss_clip": 0.01653267, + "auxiliary_loss_mlp": 0.01050483, + "balance_loss_clip": 1.39278412, + "balance_loss_mlp": 1.01779592, + "epoch": 0.09445362994137983, + "flos": 25974424817280.0, + "grad_norm": 1.8294101879097515, + "language_loss": 0.77577591, + "learning_rate": 3.95658084522853e-06, + "loss": 0.80281341, + "num_input_tokens_seen": 33724430, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.3269043, + "step": 1571, + "time_per_iteration": 2.93245530128479 + }, + { + "auxiliary_loss_clip": 0.01631565, + "auxiliary_loss_mlp": 0.01052312, + "balance_loss_clip": 1.38330686, + "balance_loss_mlp": 1.02022123, + "epoch": 0.0945137531940478, + "flos": 19723854096000.0, + "grad_norm": 2.0658382312666177, + "language_loss": 0.80485189, + "learning_rate": 3.956500096627561e-06, + "loss": 0.83169067, + "num_input_tokens_seen": 33743455, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.32055664, + "step": 1572, + "time_per_iteration": 2.929464340209961 + }, + { + "auxiliary_loss_clip": 0.016437, + "auxiliary_loss_mlp": 0.01058175, + "balance_loss_clip": 1.38848889, + "balance_loss_mlp": 1.02510595, + "epoch": 0.09457387644671576, + "flos": 23625968002560.0, + "grad_norm": 1.8566483459501812, + "language_loss": 0.88564354, + "learning_rate": 3.956419273835913e-06, + "loss": 0.91266227, + "num_input_tokens_seen": 33763435, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.33056641, + "step": 1573, + "time_per_iteration": 2.9060301780700684 + }, + { + "auxiliary_loss_clip": 0.01673609, + "auxiliary_loss_mlp": 0.01055963, + "balance_loss_clip": 1.41033125, + "balance_loss_mlp": 1.02239418, + "epoch": 0.09463399969938374, + "flos": 26918759180160.0, + "grad_norm": 1.8515859176987535, + "language_loss": 0.83219522, + "learning_rate": 3.95633837685665e-06, + "loss": 0.85949099, + "num_input_tokens_seen": 33784325, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.33544922, + "step": 1574, + "time_per_iteration": 2.965719699859619 + }, + { + "auxiliary_loss_clip": 0.01648739, + "auxiliary_loss_mlp": 0.01051738, + "balance_loss_clip": 1.39284444, + "balance_loss_mlp": 1.02098179, + "epoch": 0.0946941229520517, + "flos": 23670063699840.0, + "grad_norm": 1.6742444890038077, + "language_loss": 0.82612801, + "learning_rate": 3.95625740569284e-06, + "loss": 0.85313272, + "num_input_tokens_seen": 33802510, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.30761719, + "step": 1575, + "time_per_iteration": 2.936870813369751 + }, + { + "auxiliary_loss_clip": 0.01640362, + "auxiliary_loss_mlp": 0.01059645, + "balance_loss_clip": 1.38622344, + "balance_loss_mlp": 1.02731538, + "epoch": 0.09475424620471967, + "flos": 24144719345280.0, + "grad_norm": 1.9125987984562414, + "language_loss": 0.88512975, + "learning_rate": 3.956176360347553e-06, + "loss": 0.91212988, + "num_input_tokens_seen": 33819980, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.32324219, + "step": 1576, + "time_per_iteration": 2.870584726333618 + }, + { + "auxiliary_loss_clip": 0.01451173, + "auxiliary_loss_mlp": 0.01080725, + "balance_loss_clip": 1.29460096, + "balance_loss_mlp": 1.05611992, + "epoch": 0.09481436945738765, + "flos": 68457380929920.0, + "grad_norm": 0.9971887942456493, + "language_loss": 0.6594578, + "learning_rate": 3.956095240823862e-06, + "loss": 0.68477678, + "num_input_tokens_seen": 33878925, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.24609375, + "step": 1577, + "time_per_iteration": 3.36885929107666 + }, + { + "auxiliary_loss_clip": 0.01647953, + "auxiliary_loss_mlp": 0.01052684, + "balance_loss_clip": 1.39241052, + "balance_loss_mlp": 1.02183282, + "epoch": 0.09487449271005562, + "flos": 16662698887680.0, + "grad_norm": 2.298275101277943, + "language_loss": 0.8170172, + "learning_rate": 3.956014047124844e-06, + "loss": 0.84402359, + "num_input_tokens_seen": 33897600, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.30834961, + "step": 1578, + "time_per_iteration": 2.862560749053955 + }, + { + "auxiliary_loss_clip": 0.01659374, + "auxiliary_loss_mlp": 0.01065341, + "balance_loss_clip": 1.40223265, + "balance_loss_mlp": 1.03367901, + "epoch": 0.09493461596272358, + "flos": 24285268252800.0, + "grad_norm": 1.8945186008709878, + "language_loss": 0.78810775, + "learning_rate": 3.955932779253578e-06, + "loss": 0.81535488, + "num_input_tokens_seen": 33917365, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.31665039, + "step": 1579, + "time_per_iteration": 2.9497861862182617 + }, + { + "auxiliary_loss_clip": 0.01654801, + "auxiliary_loss_mlp": 0.01065365, + "balance_loss_clip": 1.39822519, + "balance_loss_mlp": 1.03253508, + "epoch": 0.09499473921539155, + "flos": 21879793710720.0, + "grad_norm": 1.9438024873245814, + "language_loss": 0.75129163, + "learning_rate": 3.955851437213144e-06, + "loss": 0.77849334, + "num_input_tokens_seen": 33936680, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.328125, + "step": 1580, + "time_per_iteration": 2.956225872039795 + }, + { + "auxiliary_loss_clip": 0.01641227, + "auxiliary_loss_mlp": 0.01068635, + "balance_loss_clip": 1.38873696, + "balance_loss_mlp": 1.03837955, + "epoch": 0.09505486246805953, + "flos": 33560182897920.0, + "grad_norm": 1.77094123245073, + "language_loss": 0.78561193, + "learning_rate": 3.955770021006627e-06, + "loss": 0.81271052, + "num_input_tokens_seen": 33960685, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.30273438, + "step": 1581, + "time_per_iteration": 2.9628002643585205 + }, + { + "auxiliary_loss_clip": 0.01657623, + "auxiliary_loss_mlp": 0.01067777, + "balance_loss_clip": 1.40200663, + "balance_loss_mlp": 1.03785503, + "epoch": 0.09511498572072749, + "flos": 21225289409280.0, + "grad_norm": 1.9941683221572173, + "language_loss": 0.89070201, + "learning_rate": 3.955688530637116e-06, + "loss": 0.91795605, + "num_input_tokens_seen": 33980015, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.29956055, + "step": 1582, + "time_per_iteration": 4.397181987762451 + }, + { + "auxiliary_loss_clip": 0.01667634, + "auxiliary_loss_mlp": 0.01076367, + "balance_loss_clip": 1.40494275, + "balance_loss_mlp": 1.04351306, + "epoch": 0.09517510897339546, + "flos": 14619455366400.0, + "grad_norm": 2.0892438557132276, + "language_loss": 0.68198711, + "learning_rate": 3.955606966107699e-06, + "loss": 0.70942712, + "num_input_tokens_seen": 33997705, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.32836914, + "step": 1583, + "time_per_iteration": 2.854550838470459 + }, + { + "auxiliary_loss_clip": 0.01675859, + "auxiliary_loss_mlp": 0.01076224, + "balance_loss_clip": 1.41326928, + "balance_loss_mlp": 1.04379928, + "epoch": 0.09523523222606343, + "flos": 27828499253760.0, + "grad_norm": 1.7913228974781277, + "language_loss": 0.721699, + "learning_rate": 3.95552532742147e-06, + "loss": 0.74921989, + "num_input_tokens_seen": 34017465, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.32446289, + "step": 1584, + "time_per_iteration": 2.9217662811279297 + }, + { + "auxiliary_loss_clip": 0.01669726, + "auxiliary_loss_mlp": 0.01077647, + "balance_loss_clip": 1.41111684, + "balance_loss_mlp": 1.04672396, + "epoch": 0.0952953554787314, + "flos": 20716537167360.0, + "grad_norm": 1.8652426024296636, + "language_loss": 0.8206653, + "learning_rate": 3.955443614581525e-06, + "loss": 0.84813905, + "num_input_tokens_seen": 34038550, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.30908203, + "step": 1585, + "time_per_iteration": 2.917325735092163 + }, + { + "auxiliary_loss_clip": 0.01678077, + "auxiliary_loss_mlp": 0.01086265, + "balance_loss_clip": 1.40857971, + "balance_loss_mlp": 1.05341101, + "epoch": 0.09535547873139937, + "flos": 24797640078720.0, + "grad_norm": 1.590124253390054, + "language_loss": 0.73552573, + "learning_rate": 3.955361827590961e-06, + "loss": 0.76316917, + "num_input_tokens_seen": 34058665, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.32836914, + "step": 1586, + "time_per_iteration": 3.0434961318969727 + }, + { + "auxiliary_loss_clip": 0.01446017, + "auxiliary_loss_mlp": 0.01074623, + "balance_loss_clip": 1.28522432, + "balance_loss_mlp": 1.04887414, + "epoch": 0.09541560198406734, + "flos": 71940001881600.0, + "grad_norm": 0.8539689430566186, + "language_loss": 0.55537951, + "learning_rate": 3.955279966452883e-06, + "loss": 0.5805859, + "num_input_tokens_seen": 34109655, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.2578125, + "step": 1587, + "time_per_iteration": 4.681636810302734 + }, + { + "auxiliary_loss_clip": 0.01662544, + "auxiliary_loss_mlp": 0.01073677, + "balance_loss_clip": 1.39844894, + "balance_loss_mlp": 1.04184794, + "epoch": 0.09547572523673531, + "flos": 28993475099520.0, + "grad_norm": 1.8439018553298865, + "language_loss": 0.82488841, + "learning_rate": 3.955198031170391e-06, + "loss": 0.85225058, + "num_input_tokens_seen": 34131115, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.31835938, + "step": 1588, + "time_per_iteration": 2.9555037021636963 + }, + { + "auxiliary_loss_clip": 0.0166501, + "auxiliary_loss_mlp": 0.01066182, + "balance_loss_clip": 1.40398562, + "balance_loss_mlp": 1.03604615, + "epoch": 0.09553584848940327, + "flos": 24144538366080.0, + "grad_norm": 1.4001707759982354, + "language_loss": 0.82546937, + "learning_rate": 3.955116021746594e-06, + "loss": 0.8527813, + "num_input_tokens_seen": 34151925, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.30151367, + "step": 1589, + "time_per_iteration": 2.9987950325012207 + }, + { + "auxiliary_loss_clip": 0.01651515, + "auxiliary_loss_mlp": 0.01067807, + "balance_loss_clip": 1.39294243, + "balance_loss_mlp": 1.03330779, + "epoch": 0.09559597174207124, + "flos": 42866931899520.0, + "grad_norm": 1.5258445915739465, + "language_loss": 0.66045588, + "learning_rate": 3.955033938184601e-06, + "loss": 0.68764913, + "num_input_tokens_seen": 34175395, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.3449707, + "step": 1590, + "time_per_iteration": 4.590895414352417 + }, + { + "auxiliary_loss_clip": 0.01651451, + "auxiliary_loss_mlp": 0.01064292, + "balance_loss_clip": 1.39274037, + "balance_loss_mlp": 1.03112757, + "epoch": 0.09565609499473922, + "flos": 32683498790400.0, + "grad_norm": 1.752281569722935, + "language_loss": 0.83885121, + "learning_rate": 3.954951780487526e-06, + "loss": 0.86600858, + "num_input_tokens_seen": 34197760, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.33178711, + "step": 1591, + "time_per_iteration": 4.405884742736816 + }, + { + "auxiliary_loss_clip": 0.01679153, + "auxiliary_loss_mlp": 0.01055766, + "balance_loss_clip": 1.41313875, + "balance_loss_mlp": 1.02467656, + "epoch": 0.09571621824740718, + "flos": 18487517921280.0, + "grad_norm": 3.2507097693941525, + "language_loss": 0.76540911, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.79275835, + "num_input_tokens_seen": 34215330, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.31103516, + "step": 1592, + "time_per_iteration": 2.9665539264678955 + }, + { + "auxiliary_loss_clip": 0.01644039, + "auxiliary_loss_mlp": 0.01055466, + "balance_loss_clip": 1.38541389, + "balance_loss_mlp": 1.01958418, + "epoch": 0.09577634150007515, + "flos": 29399394614400.0, + "grad_norm": 2.2332493747524635, + "language_loss": 0.75289357, + "learning_rate": 3.954787242700592e-06, + "loss": 0.77988863, + "num_input_tokens_seen": 34237745, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.35913086, + "step": 1593, + "time_per_iteration": 3.0665364265441895 + }, + { + "auxiliary_loss_clip": 0.01644639, + "auxiliary_loss_mlp": 0.01058236, + "balance_loss_clip": 1.38794374, + "balance_loss_mlp": 1.02402329, + "epoch": 0.09583646475274313, + "flos": 22758197120640.0, + "grad_norm": 1.9911825847226712, + "language_loss": 0.70502043, + "learning_rate": 3.954704862616971e-06, + "loss": 0.73204923, + "num_input_tokens_seen": 34256565, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.34228516, + "step": 1594, + "time_per_iteration": 2.925469398498535 + }, + { + "auxiliary_loss_clip": 0.0165673, + "auxiliary_loss_mlp": 0.01053609, + "balance_loss_clip": 1.39399767, + "balance_loss_mlp": 1.01965797, + "epoch": 0.0958965880054111, + "flos": 23227875838080.0, + "grad_norm": 2.2906333336460225, + "language_loss": 0.83821642, + "learning_rate": 3.954622408410747e-06, + "loss": 0.86531979, + "num_input_tokens_seen": 34275970, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.33959961, + "step": 1595, + "time_per_iteration": 2.9478964805603027 + }, + { + "auxiliary_loss_clip": 0.01641882, + "auxiliary_loss_mlp": 0.01054621, + "balance_loss_clip": 1.38106644, + "balance_loss_mlp": 1.02095592, + "epoch": 0.09595671125807906, + "flos": 21334320673920.0, + "grad_norm": 1.9264892444302049, + "language_loss": 0.86206651, + "learning_rate": 3.954539880085045e-06, + "loss": 0.88903147, + "num_input_tokens_seen": 34295490, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.33666992, + "step": 1596, + "time_per_iteration": 2.871415615081787 + }, + { + "auxiliary_loss_clip": 0.01662945, + "auxiliary_loss_mlp": 0.01057168, + "balance_loss_clip": 1.4018234, + "balance_loss_mlp": 1.02283525, + "epoch": 0.09601683451074704, + "flos": 39618010195200.0, + "grad_norm": 1.729284294176226, + "language_loss": 0.70215195, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.72935307, + "num_input_tokens_seen": 34319990, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.34301758, + "step": 1597, + "time_per_iteration": 3.082613945007324 + }, + { + "auxiliary_loss_clip": 0.01657317, + "auxiliary_loss_mlp": 0.01059487, + "balance_loss_clip": 1.39205611, + "balance_loss_mlp": 1.0250361, + "epoch": 0.096076957763415, + "flos": 23743279065600.0, + "grad_norm": 2.0382577318238027, + "language_loss": 0.76589233, + "learning_rate": 3.954374601087729e-06, + "loss": 0.79306042, + "num_input_tokens_seen": 34339225, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.34423828, + "step": 1598, + "time_per_iteration": 2.958341360092163 + }, + { + "auxiliary_loss_clip": 0.01670949, + "auxiliary_loss_mlp": 0.01054762, + "balance_loss_clip": 1.40473258, + "balance_loss_mlp": 1.01954758, + "epoch": 0.09613708101608297, + "flos": 34691424105600.0, + "grad_norm": 1.8705984064355141, + "language_loss": 0.70483571, + "learning_rate": 3.954291850422382e-06, + "loss": 0.7320928, + "num_input_tokens_seen": 34361020, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.35229492, + "step": 1599, + "time_per_iteration": 3.058208703994751 + }, + { + "auxiliary_loss_clip": 0.01650815, + "auxiliary_loss_mlp": 0.01052439, + "balance_loss_clip": 1.39288926, + "balance_loss_mlp": 1.01736784, + "epoch": 0.09619720426875093, + "flos": 20749593133440.0, + "grad_norm": 1.9397325138855626, + "language_loss": 0.85779208, + "learning_rate": 3.954209025650093e-06, + "loss": 0.88482463, + "num_input_tokens_seen": 34378630, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.35107422, + "step": 1600, + "time_per_iteration": 2.936253070831299 + }, + { + "auxiliary_loss_clip": 0.01660547, + "auxiliary_loss_mlp": 0.01056365, + "balance_loss_clip": 1.39506805, + "balance_loss_mlp": 1.020769, + "epoch": 0.09625732752141891, + "flos": 13050596021760.0, + "grad_norm": 2.1241624338002065, + "language_loss": 0.82352507, + "learning_rate": 3.954126126774001e-06, + "loss": 0.85069418, + "num_input_tokens_seen": 34397110, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.35571289, + "step": 1601, + "time_per_iteration": 2.853712797164917 + }, + { + "auxiliary_loss_clip": 0.01658122, + "auxiliary_loss_mlp": 0.01056592, + "balance_loss_clip": 1.39030623, + "balance_loss_mlp": 1.022331, + "epoch": 0.09631745077408688, + "flos": 22283903433600.0, + "grad_norm": 2.266216223412545, + "language_loss": 0.83876359, + "learning_rate": 3.954043153797251e-06, + "loss": 0.86591077, + "num_input_tokens_seen": 34414165, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.34277344, + "step": 1602, + "time_per_iteration": 2.93969464302063 + }, + { + "auxiliary_loss_clip": 0.01638309, + "auxiliary_loss_mlp": 0.01053401, + "balance_loss_clip": 1.38091302, + "balance_loss_mlp": 1.01828206, + "epoch": 0.09637757402675484, + "flos": 24765081805440.0, + "grad_norm": 2.1491817393407646, + "language_loss": 0.64454544, + "learning_rate": 3.953960106722989e-06, + "loss": 0.67146254, + "num_input_tokens_seen": 34434445, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.35131836, + "step": 1603, + "time_per_iteration": 2.9422457218170166 + }, + { + "auxiliary_loss_clip": 0.01664992, + "auxiliary_loss_mlp": 0.01052994, + "balance_loss_clip": 1.39955497, + "balance_loss_mlp": 1.01758862, + "epoch": 0.09643769727942282, + "flos": 22535248152960.0, + "grad_norm": 2.5334867137647166, + "language_loss": 0.72566396, + "learning_rate": 3.953876985554364e-06, + "loss": 0.7528438, + "num_input_tokens_seen": 34453095, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.35400391, + "step": 1604, + "time_per_iteration": 3.066404104232788 + }, + { + "auxiliary_loss_clip": 0.01641104, + "auxiliary_loss_mlp": 0.01053797, + "balance_loss_clip": 1.38544428, + "balance_loss_mlp": 1.02008462, + "epoch": 0.09649782053209079, + "flos": 30933614424960.0, + "grad_norm": 2.11611164836917, + "language_loss": 0.80490959, + "learning_rate": 3.953793790294527e-06, + "loss": 0.83185863, + "num_input_tokens_seen": 34473680, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.33666992, + "step": 1605, + "time_per_iteration": 3.0610921382904053 + }, + { + "auxiliary_loss_clip": 0.0165535, + "auxiliary_loss_mlp": 0.0104942, + "balance_loss_clip": 1.38818598, + "balance_loss_mlp": 1.01625609, + "epoch": 0.09655794378475875, + "flos": 25348497246720.0, + "grad_norm": 1.8970586376790766, + "language_loss": 0.76332021, + "learning_rate": 3.953710520946634e-06, + "loss": 0.79036784, + "num_input_tokens_seen": 34492610, + "router_z_loss_clip": 2.66992188, + "router_z_loss_mlp": 0.33154297, + "step": 1606, + "time_per_iteration": 2.9301247596740723 + }, + { + "auxiliary_loss_clip": 0.0164322, + "auxiliary_loss_mlp": 0.01052336, + "balance_loss_clip": 1.38363028, + "balance_loss_mlp": 1.01931465, + "epoch": 0.09661806703742673, + "flos": 22356213903360.0, + "grad_norm": 1.8953103984078128, + "language_loss": 0.76853716, + "learning_rate": 3.953627177513843e-06, + "loss": 0.79549265, + "num_input_tokens_seen": 34511855, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.33032227, + "step": 1607, + "time_per_iteration": 2.916126012802124 + }, + { + "auxiliary_loss_clip": 0.01650555, + "auxiliary_loss_mlp": 0.0104872, + "balance_loss_clip": 1.38808537, + "balance_loss_mlp": 1.0155561, + "epoch": 0.0966781902900947, + "flos": 17466620077440.0, + "grad_norm": 2.167468443843805, + "language_loss": 0.87912583, + "learning_rate": 3.953543759999312e-06, + "loss": 0.90611857, + "num_input_tokens_seen": 34528905, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 0.33178711, + "step": 1608, + "time_per_iteration": 2.831538200378418 + }, + { + "auxiliary_loss_clip": 0.01675008, + "auxiliary_loss_mlp": 0.01058749, + "balance_loss_clip": 1.40776515, + "balance_loss_mlp": 1.02320087, + "epoch": 0.09673831354276266, + "flos": 36917954887680.0, + "grad_norm": 2.077952711627616, + "language_loss": 0.72956705, + "learning_rate": 3.953460268406207e-06, + "loss": 0.7569046, + "num_input_tokens_seen": 34548480, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.35546875, + "step": 1609, + "time_per_iteration": 2.9994795322418213 + }, + { + "auxiliary_loss_clip": 0.01639346, + "auxiliary_loss_mlp": 0.01052875, + "balance_loss_clip": 1.37953639, + "balance_loss_mlp": 1.01990163, + "epoch": 0.09679843679543064, + "flos": 20710383874560.0, + "grad_norm": 2.0216202889848915, + "language_loss": 0.86594737, + "learning_rate": 3.953376702737693e-06, + "loss": 0.89286953, + "num_input_tokens_seen": 34565410, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.32983398, + "step": 1610, + "time_per_iteration": 2.8739261627197266 + }, + { + "auxiliary_loss_clip": 0.01642223, + "auxiliary_loss_mlp": 0.01052911, + "balance_loss_clip": 1.38707066, + "balance_loss_mlp": 1.02077258, + "epoch": 0.0968585600480986, + "flos": 23525080801920.0, + "grad_norm": 2.469735117942462, + "language_loss": 0.68302494, + "learning_rate": 3.953293062996939e-06, + "loss": 0.70997632, + "num_input_tokens_seen": 34584840, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.32128906, + "step": 1611, + "time_per_iteration": 2.854700803756714 + }, + { + "auxiliary_loss_clip": 0.01644342, + "auxiliary_loss_mlp": 0.01059204, + "balance_loss_clip": 1.38468432, + "balance_loss_mlp": 1.02475262, + "epoch": 0.09691868330076657, + "flos": 20130814241280.0, + "grad_norm": 2.1950074591485476, + "language_loss": 0.82920468, + "learning_rate": 3.953209349187115e-06, + "loss": 0.85624015, + "num_input_tokens_seen": 34603360, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.34472656, + "step": 1612, + "time_per_iteration": 2.889291763305664 + }, + { + "auxiliary_loss_clip": 0.01659098, + "auxiliary_loss_mlp": 0.01058563, + "balance_loss_clip": 1.3990407, + "balance_loss_mlp": 1.02601886, + "epoch": 0.09697880655343454, + "flos": 16553305664640.0, + "grad_norm": 2.332418049687465, + "language_loss": 0.82654285, + "learning_rate": 3.953125561311398e-06, + "loss": 0.85371947, + "num_input_tokens_seen": 34620760, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.32543945, + "step": 1613, + "time_per_iteration": 2.868716239929199 + }, + { + "auxiliary_loss_clip": 0.01643933, + "auxiliary_loss_mlp": 0.01055775, + "balance_loss_clip": 1.38937831, + "balance_loss_mlp": 1.02149057, + "epoch": 0.09703892980610251, + "flos": 26115335683200.0, + "grad_norm": 2.304378112165568, + "language_loss": 0.86120695, + "learning_rate": 3.953041699372964e-06, + "loss": 0.88820404, + "num_input_tokens_seen": 34640695, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.34301758, + "step": 1614, + "time_per_iteration": 2.9557573795318604 + }, + { + "auxiliary_loss_clip": 0.01455909, + "auxiliary_loss_mlp": 0.01202433, + "balance_loss_clip": 1.29597092, + "balance_loss_mlp": 1.07120788, + "epoch": 0.09709905305877048, + "flos": 60474797539200.0, + "grad_norm": 0.8811916472240193, + "language_loss": 0.54673225, + "learning_rate": 3.952957763374992e-06, + "loss": 0.57331568, + "num_input_tokens_seen": 34702395, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 1.3125, + "step": 1615, + "time_per_iteration": 3.3516602516174316 + }, + { + "auxiliary_loss_clip": 0.01449281, + "auxiliary_loss_mlp": 0.01033731, + "balance_loss_clip": 1.29068542, + "balance_loss_mlp": 1.01389444, + "epoch": 0.09715917631143844, + "flos": 57671864259840.0, + "grad_norm": 0.9169516296606716, + "language_loss": 0.58343858, + "learning_rate": 3.952873753320666e-06, + "loss": 0.60826868, + "num_input_tokens_seen": 34768910, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.19824219, + "step": 1616, + "time_per_iteration": 3.454557180404663 + }, + { + "auxiliary_loss_clip": 0.01652103, + "auxiliary_loss_mlp": 0.01056017, + "balance_loss_clip": 1.39345479, + "balance_loss_mlp": 1.02399743, + "epoch": 0.09721929956410642, + "flos": 20567889440640.0, + "grad_norm": 1.739163347547808, + "language_loss": 0.69494361, + "learning_rate": 3.952789669213172e-06, + "loss": 0.7220248, + "num_input_tokens_seen": 34787680, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.32006836, + "step": 1617, + "time_per_iteration": 4.305253505706787 + }, + { + "auxiliary_loss_clip": 0.0164957, + "auxiliary_loss_mlp": 0.01056067, + "balance_loss_clip": 1.39020705, + "balance_loss_mlp": 1.02252185, + "epoch": 0.09727942281677439, + "flos": 27355743889920.0, + "grad_norm": 1.637223716991624, + "language_loss": 0.81472832, + "learning_rate": 3.952705511055698e-06, + "loss": 0.84178472, + "num_input_tokens_seen": 34808330, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.33544922, + "step": 1618, + "time_per_iteration": 2.968405246734619 + }, + { + "auxiliary_loss_clip": 0.01647255, + "auxiliary_loss_mlp": 0.01055738, + "balance_loss_clip": 1.39423156, + "balance_loss_mlp": 1.0230267, + "epoch": 0.09733954606944235, + "flos": 24910833864960.0, + "grad_norm": 1.5403005117357864, + "language_loss": 0.93926758, + "learning_rate": 3.952621278851435e-06, + "loss": 0.96629751, + "num_input_tokens_seen": 34830020, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.3269043, + "step": 1619, + "time_per_iteration": 2.92160701751709 + }, + { + "auxiliary_loss_clip": 0.01645746, + "auxiliary_loss_mlp": 0.01051729, + "balance_loss_clip": 1.39275169, + "balance_loss_mlp": 1.01968527, + "epoch": 0.09739966932211033, + "flos": 31516441683840.0, + "grad_norm": 1.876683740818155, + "language_loss": 0.90099275, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.92796749, + "num_input_tokens_seen": 34850330, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.32055664, + "step": 1620, + "time_per_iteration": 2.9349992275238037 + }, + { + "auxiliary_loss_clip": 0.01673294, + "auxiliary_loss_mlp": 0.01059554, + "balance_loss_clip": 1.41336679, + "balance_loss_mlp": 1.02329063, + "epoch": 0.0974597925747783, + "flos": 23889393083520.0, + "grad_norm": 2.5199832440775207, + "language_loss": 0.78703415, + "learning_rate": 3.952452592315324e-06, + "loss": 0.81436259, + "num_input_tokens_seen": 34871640, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.36279297, + "step": 1621, + "time_per_iteration": 2.8896656036376953 + }, + { + "auxiliary_loss_clip": 0.01659957, + "auxiliary_loss_mlp": 0.01052864, + "balance_loss_clip": 1.40289664, + "balance_loss_mlp": 1.02010584, + "epoch": 0.09751991582744626, + "flos": 17028459002880.0, + "grad_norm": 1.921034322446758, + "language_loss": 0.79299259, + "learning_rate": 3.952368137989871e-06, + "loss": 0.82012081, + "num_input_tokens_seen": 34888100, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.32739258, + "step": 1622, + "time_per_iteration": 4.305184841156006 + }, + { + "auxiliary_loss_clip": 0.01678574, + "auxiliary_loss_mlp": 0.01053947, + "balance_loss_clip": 1.41343451, + "balance_loss_mlp": 1.02159357, + "epoch": 0.09758003908011423, + "flos": 28414584138240.0, + "grad_norm": 2.849900325175906, + "language_loss": 0.86250341, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.88982868, + "num_input_tokens_seen": 34910485, + "router_z_loss_clip": 2.65234375, + "router_z_loss_mlp": 0.32324219, + "step": 1623, + "time_per_iteration": 2.9492335319519043 + }, + { + "auxiliary_loss_clip": 0.01660984, + "auxiliary_loss_mlp": 0.01050031, + "balance_loss_clip": 1.40490985, + "balance_loss_mlp": 1.01867926, + "epoch": 0.09764016233278221, + "flos": 18151827615360.0, + "grad_norm": 2.0722409469359526, + "language_loss": 0.81531084, + "learning_rate": 3.952199007240184e-06, + "loss": 0.842421, + "num_input_tokens_seen": 34928615, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.31347656, + "step": 1624, + "time_per_iteration": 2.8749661445617676 + }, + { + "auxiliary_loss_clip": 0.01671576, + "auxiliary_loss_mlp": 0.01050663, + "balance_loss_clip": 1.41634953, + "balance_loss_mlp": 1.01881027, + "epoch": 0.09770028558545017, + "flos": 15273642954240.0, + "grad_norm": 2.2243270137073035, + "language_loss": 0.86989129, + "learning_rate": 3.952114330822364e-06, + "loss": 0.89711374, + "num_input_tokens_seen": 34946045, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.31860352, + "step": 1625, + "time_per_iteration": 4.385685682296753 + }, + { + "auxiliary_loss_clip": 0.01693149, + "auxiliary_loss_mlp": 0.01058917, + "balance_loss_clip": 1.42737448, + "balance_loss_mlp": 1.02205789, + "epoch": 0.09776040883811814, + "flos": 23481663776640.0, + "grad_norm": 2.0026177819574533, + "language_loss": 0.86380178, + "learning_rate": 3.952029580380172e-06, + "loss": 0.89132249, + "num_input_tokens_seen": 34962865, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.36865234, + "step": 1626, + "time_per_iteration": 4.45385479927063 + }, + { + "auxiliary_loss_clip": 0.01688389, + "auxiliary_loss_mlp": 0.01055092, + "balance_loss_clip": 1.42240798, + "balance_loss_mlp": 1.02164221, + "epoch": 0.09782053209078612, + "flos": 24510388970880.0, + "grad_norm": 1.8367417036470772, + "language_loss": 0.83688557, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.8643204, + "num_input_tokens_seen": 34983505, + "router_z_loss_clip": 2.66210938, + "router_z_loss_mlp": 0.33447266, + "step": 1627, + "time_per_iteration": 2.9997692108154297 + }, + { + "auxiliary_loss_clip": 0.01685794, + "auxiliary_loss_mlp": 0.01060409, + "balance_loss_clip": 1.42672801, + "balance_loss_mlp": 1.02631545, + "epoch": 0.09788065534345408, + "flos": 21590189873280.0, + "grad_norm": 1.8147677850618489, + "language_loss": 0.85667801, + "learning_rate": 3.951859857435534e-06, + "loss": 0.88414001, + "num_input_tokens_seen": 35001825, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.34130859, + "step": 1628, + "time_per_iteration": 2.880930185317993 + }, + { + "auxiliary_loss_clip": 0.01671965, + "auxiliary_loss_mlp": 0.01054103, + "balance_loss_clip": 1.41638947, + "balance_loss_mlp": 1.02267981, + "epoch": 0.09794077859612205, + "flos": 23853124736640.0, + "grad_norm": 1.6318197992984442, + "language_loss": 0.77119827, + "learning_rate": 3.951774884939523e-06, + "loss": 0.79845893, + "num_input_tokens_seen": 35023075, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.31420898, + "step": 1629, + "time_per_iteration": 2.9591474533081055 + }, + { + "auxiliary_loss_clip": 0.01675769, + "auxiliary_loss_mlp": 0.0105316, + "balance_loss_clip": 1.41675663, + "balance_loss_mlp": 1.02009177, + "epoch": 0.09800090184879003, + "flos": 23670470903040.0, + "grad_norm": 2.5756197553458944, + "language_loss": 0.79707205, + "learning_rate": 3.951689838432013e-06, + "loss": 0.82436126, + "num_input_tokens_seen": 35043480, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.33056641, + "step": 1630, + "time_per_iteration": 2.9287426471710205 + }, + { + "auxiliary_loss_clip": 0.01686935, + "auxiliary_loss_mlp": 0.0105417, + "balance_loss_clip": 1.42304778, + "balance_loss_mlp": 1.02076793, + "epoch": 0.09806102510145799, + "flos": 17064591615360.0, + "grad_norm": 3.2334987700011717, + "language_loss": 0.87786329, + "learning_rate": 3.951604717916228e-06, + "loss": 0.90527433, + "num_input_tokens_seen": 35061490, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.33374023, + "step": 1631, + "time_per_iteration": 2.830043077468872 + }, + { + "auxiliary_loss_clip": 0.01667944, + "auxiliary_loss_mlp": 0.0105587, + "balance_loss_clip": 1.41184628, + "balance_loss_mlp": 1.02163315, + "epoch": 0.09812114835412596, + "flos": 23889031125120.0, + "grad_norm": 1.845748022879846, + "language_loss": 0.83683145, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.86406958, + "num_input_tokens_seen": 35079670, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.3425293, + "step": 1632, + "time_per_iteration": 3.027871608734131 + }, + { + "auxiliary_loss_clip": 0.01674796, + "auxiliary_loss_mlp": 0.01060629, + "balance_loss_clip": 1.41676235, + "balance_loss_mlp": 1.02710688, + "epoch": 0.09818127160679392, + "flos": 20605379397120.0, + "grad_norm": 1.4980867869267873, + "language_loss": 0.79750866, + "learning_rate": 3.951434254872751e-06, + "loss": 0.82486284, + "num_input_tokens_seen": 35099205, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.3347168, + "step": 1633, + "time_per_iteration": 2.857276678085327 + }, + { + "auxiliary_loss_clip": 0.01661805, + "auxiliary_loss_mlp": 0.01054102, + "balance_loss_clip": 1.40637767, + "balance_loss_mlp": 1.0222255, + "epoch": 0.0982413948594619, + "flos": 15495868005120.0, + "grad_norm": 5.829786364016594, + "language_loss": 0.74253768, + "learning_rate": 3.951348912351521e-06, + "loss": 0.76969671, + "num_input_tokens_seen": 35115270, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.31860352, + "step": 1634, + "time_per_iteration": 2.8926961421966553 + }, + { + "auxiliary_loss_clip": 0.01685673, + "auxiliary_loss_mlp": 0.01067148, + "balance_loss_clip": 1.4189111, + "balance_loss_mlp": 1.030527, + "epoch": 0.09830151811212987, + "flos": 24218884851840.0, + "grad_norm": 2.7996077708593967, + "language_loss": 0.74234295, + "learning_rate": 3.951263495834947e-06, + "loss": 0.76987118, + "num_input_tokens_seen": 35134065, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.36669922, + "step": 1635, + "time_per_iteration": 2.945892333984375 + }, + { + "auxiliary_loss_clip": 0.0169307, + "auxiliary_loss_mlp": 0.01067713, + "balance_loss_clip": 1.42735028, + "balance_loss_mlp": 1.03216505, + "epoch": 0.09836164136479783, + "flos": 20604248277120.0, + "grad_norm": 1.6861794853413723, + "language_loss": 0.78761971, + "learning_rate": 3.951178005326264e-06, + "loss": 0.81522757, + "num_input_tokens_seen": 35154870, + "router_z_loss_clip": 2.65820312, + "router_z_loss_mlp": 0.35546875, + "step": 1636, + "time_per_iteration": 2.940061569213867 + }, + { + "auxiliary_loss_clip": 0.0168376, + "auxiliary_loss_mlp": 0.01060349, + "balance_loss_clip": 1.42006958, + "balance_loss_mlp": 1.02575421, + "epoch": 0.09842176461746581, + "flos": 19942685786880.0, + "grad_norm": 2.08279676563933, + "language_loss": 0.7103622, + "learning_rate": 3.951092440828715e-06, + "loss": 0.73780322, + "num_input_tokens_seen": 35171850, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.34594727, + "step": 1637, + "time_per_iteration": 2.893254518508911 + }, + { + "auxiliary_loss_clip": 0.01675004, + "auxiliary_loss_mlp": 0.01068969, + "balance_loss_clip": 1.41543412, + "balance_loss_mlp": 1.03492332, + "epoch": 0.09848188787013377, + "flos": 21224429758080.0, + "grad_norm": 2.000875303067703, + "language_loss": 0.7912671, + "learning_rate": 3.951006802345545e-06, + "loss": 0.81870681, + "num_input_tokens_seen": 35188795, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.34033203, + "step": 1638, + "time_per_iteration": 2.841355323791504 + }, + { + "auxiliary_loss_clip": 0.01652974, + "auxiliary_loss_mlp": 0.01054674, + "balance_loss_clip": 1.40154743, + "balance_loss_mlp": 1.02120054, + "epoch": 0.09854201112280174, + "flos": 30166459274880.0, + "grad_norm": 1.454420564234614, + "language_loss": 0.73573679, + "learning_rate": 3.950921089880003e-06, + "loss": 0.76281333, + "num_input_tokens_seen": 35212100, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.33496094, + "step": 1639, + "time_per_iteration": 2.96954083442688 + }, + { + "auxiliary_loss_clip": 0.01683684, + "auxiliary_loss_mlp": 0.01058141, + "balance_loss_clip": 1.42171574, + "balance_loss_mlp": 1.0230937, + "epoch": 0.09860213437546972, + "flos": 21804949532160.0, + "grad_norm": 2.0919120124570245, + "language_loss": 0.89702493, + "learning_rate": 3.950835303435337e-06, + "loss": 0.92444324, + "num_input_tokens_seen": 35230390, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.35058594, + "step": 1640, + "time_per_iteration": 3.0248308181762695 + }, + { + "auxiliary_loss_clip": 0.01674831, + "auxiliary_loss_mlp": 0.01056629, + "balance_loss_clip": 1.41494918, + "balance_loss_mlp": 1.0232029, + "epoch": 0.09866225762813768, + "flos": 21845651869440.0, + "grad_norm": 2.0434595825272694, + "language_loss": 0.83053005, + "learning_rate": 3.950749443014801e-06, + "loss": 0.85784465, + "num_input_tokens_seen": 35250405, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.33398438, + "step": 1641, + "time_per_iteration": 2.9520416259765625 + }, + { + "auxiliary_loss_clip": 0.01673711, + "auxiliary_loss_mlp": 0.01055096, + "balance_loss_clip": 1.41183388, + "balance_loss_mlp": 1.02181244, + "epoch": 0.09872238088080565, + "flos": 17607983391360.0, + "grad_norm": 3.7281091732192246, + "language_loss": 0.88705671, + "learning_rate": 3.95066350862165e-06, + "loss": 0.91434479, + "num_input_tokens_seen": 35262820, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.33251953, + "step": 1642, + "time_per_iteration": 3.0409581661224365 + }, + { + "auxiliary_loss_clip": 0.0167975, + "auxiliary_loss_mlp": 0.01062298, + "balance_loss_clip": 1.42301345, + "balance_loss_mlp": 1.02894366, + "epoch": 0.09878250413347361, + "flos": 27647564722560.0, + "grad_norm": 1.6246653175310566, + "language_loss": 0.8221935, + "learning_rate": 3.950577500259144e-06, + "loss": 0.84961402, + "num_input_tokens_seen": 35284490, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.33349609, + "step": 1643, + "time_per_iteration": 3.002387046813965 + }, + { + "auxiliary_loss_clip": 0.01687656, + "auxiliary_loss_mlp": 0.01066427, + "balance_loss_clip": 1.4263258, + "balance_loss_mlp": 1.03347778, + "epoch": 0.0988426273861416, + "flos": 16553350909440.0, + "grad_norm": 1.8461821839513446, + "language_loss": 0.83488715, + "learning_rate": 3.950491417930543e-06, + "loss": 0.86242801, + "num_input_tokens_seen": 35302815, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.3293457, + "step": 1644, + "time_per_iteration": 2.945784330368042 + }, + { + "auxiliary_loss_clip": 0.01672817, + "auxiliary_loss_mlp": 0.010608, + "balance_loss_clip": 1.41988802, + "balance_loss_mlp": 1.02670634, + "epoch": 0.09890275063880956, + "flos": 21225198919680.0, + "grad_norm": 1.6729221350538515, + "language_loss": 0.69492525, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.72226143, + "num_input_tokens_seen": 35321175, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.34106445, + "step": 1645, + "time_per_iteration": 2.962468385696411 + }, + { + "auxiliary_loss_clip": 0.01433318, + "auxiliary_loss_mlp": 0.01082604, + "balance_loss_clip": 1.27307642, + "balance_loss_mlp": 1.0465548, + "epoch": 0.09896287389147752, + "flos": 59408672878080.0, + "grad_norm": 0.8576296353662578, + "language_loss": 0.60905313, + "learning_rate": 3.950319031388119e-06, + "loss": 0.63421237, + "num_input_tokens_seen": 35381740, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.36132812, + "step": 1646, + "time_per_iteration": 3.3440983295440674 + }, + { + "auxiliary_loss_clip": 0.01659233, + "auxiliary_loss_mlp": 0.01055604, + "balance_loss_clip": 1.40046811, + "balance_loss_mlp": 1.02182043, + "epoch": 0.0990229971441455, + "flos": 29654585141760.0, + "grad_norm": 1.5661887617310395, + "language_loss": 0.73899096, + "learning_rate": 3.950232727180833e-06, + "loss": 0.76613927, + "num_input_tokens_seen": 35403760, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.33764648, + "step": 1647, + "time_per_iteration": 2.991462230682373 + }, + { + "auxiliary_loss_clip": 0.0167897, + "auxiliary_loss_mlp": 0.0105975, + "balance_loss_clip": 1.41851616, + "balance_loss_mlp": 1.02901816, + "epoch": 0.09908312039681347, + "flos": 21844792218240.0, + "grad_norm": 2.6522101328662533, + "language_loss": 0.86134726, + "learning_rate": 3.950146349020525e-06, + "loss": 0.88873446, + "num_input_tokens_seen": 35424050, + "router_z_loss_clip": 2.60742188, + "router_z_loss_mlp": 0.30712891, + "step": 1648, + "time_per_iteration": 2.9638943672180176 + }, + { + "auxiliary_loss_clip": 0.01418548, + "auxiliary_loss_mlp": 0.01028149, + "balance_loss_clip": 1.2630887, + "balance_loss_mlp": 1.00926626, + "epoch": 0.09914324364948143, + "flos": 57595527002880.0, + "grad_norm": 0.7412887852309825, + "language_loss": 0.55764318, + "learning_rate": 3.950059896910473e-06, + "loss": 0.58211017, + "num_input_tokens_seen": 35481690, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.18847656, + "step": 1649, + "time_per_iteration": 3.294149398803711 + }, + { + "auxiliary_loss_clip": 0.01662693, + "auxiliary_loss_mlp": 0.0107432, + "balance_loss_clip": 1.40591872, + "balance_loss_mlp": 1.04394603, + "epoch": 0.09920336690214941, + "flos": 34135318540800.0, + "grad_norm": 2.146224760986189, + "language_loss": 0.91636431, + "learning_rate": 3.949973370853954e-06, + "loss": 0.94373447, + "num_input_tokens_seen": 35498635, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.30395508, + "step": 1650, + "time_per_iteration": 3.152336597442627 + }, + { + "auxiliary_loss_clip": 0.01418269, + "auxiliary_loss_mlp": 0.01026852, + "balance_loss_clip": 1.26254106, + "balance_loss_mlp": 1.00987685, + "epoch": 0.09926349015481738, + "flos": 71252866800000.0, + "grad_norm": 0.8941358468866922, + "language_loss": 0.63933432, + "learning_rate": 3.94988677085425e-06, + "loss": 0.66378552, + "num_input_tokens_seen": 35565720, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.16992188, + "step": 1651, + "time_per_iteration": 3.469982624053955 + }, + { + "auxiliary_loss_clip": 0.01656066, + "auxiliary_loss_mlp": 0.01091817, + "balance_loss_clip": 1.4001925, + "balance_loss_mlp": 1.06053674, + "epoch": 0.09932361340748534, + "flos": 23159049217920.0, + "grad_norm": 1.8632920635116326, + "language_loss": 0.8870306, + "learning_rate": 3.949800096914643e-06, + "loss": 0.91450953, + "num_input_tokens_seen": 35586000, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.31298828, + "step": 1652, + "time_per_iteration": 4.444247484207153 + }, + { + "auxiliary_loss_clip": 0.01656101, + "auxiliary_loss_mlp": 0.01099525, + "balance_loss_clip": 1.40004015, + "balance_loss_mlp": 1.06869769, + "epoch": 0.09938373666015332, + "flos": 19838179002240.0, + "grad_norm": 2.0649214290469775, + "language_loss": 0.83022273, + "learning_rate": 3.949713349038422e-06, + "loss": 0.85777891, + "num_input_tokens_seen": 35604355, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.30834961, + "step": 1653, + "time_per_iteration": 2.9593770503997803 + }, + { + "auxiliary_loss_clip": 0.01665012, + "auxiliary_loss_mlp": 0.01109709, + "balance_loss_clip": 1.40404069, + "balance_loss_mlp": 1.07859564, + "epoch": 0.09944385991282129, + "flos": 22100751907200.0, + "grad_norm": 1.6242681281374058, + "language_loss": 0.81298214, + "learning_rate": 3.949626527228875e-06, + "loss": 0.84072936, + "num_input_tokens_seen": 35625495, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.3112793, + "step": 1654, + "time_per_iteration": 2.939091682434082 + }, + { + "auxiliary_loss_clip": 0.01642086, + "auxiliary_loss_mlp": 0.01111678, + "balance_loss_clip": 1.39667869, + "balance_loss_mlp": 1.08092165, + "epoch": 0.09950398316548925, + "flos": 19838450471040.0, + "grad_norm": 1.4842317120426711, + "language_loss": 0.82624382, + "learning_rate": 3.949539631489295e-06, + "loss": 0.85378146, + "num_input_tokens_seen": 35645030, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.30761719, + "step": 1655, + "time_per_iteration": 2.891676187515259 + }, + { + "auxiliary_loss_clip": 0.01651685, + "auxiliary_loss_mlp": 0.01112978, + "balance_loss_clip": 1.39562845, + "balance_loss_mlp": 1.08155417, + "epoch": 0.09956410641815722, + "flos": 25013033164800.0, + "grad_norm": 1.8225771329591445, + "language_loss": 0.82350165, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.85114831, + "num_input_tokens_seen": 35664305, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.31420898, + "step": 1656, + "time_per_iteration": 2.968209743499756 + }, + { + "auxiliary_loss_clip": 0.01669599, + "auxiliary_loss_mlp": 0.01116259, + "balance_loss_clip": 1.41334581, + "balance_loss_mlp": 1.08590841, + "epoch": 0.0996242296708252, + "flos": 19327074030720.0, + "grad_norm": 1.5325067508829209, + "language_loss": 0.89919537, + "learning_rate": 3.949365618233217e-06, + "loss": 0.92705393, + "num_input_tokens_seen": 35684060, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.30371094, + "step": 1657, + "time_per_iteration": 4.365803241729736 + }, + { + "auxiliary_loss_clip": 0.01676938, + "auxiliary_loss_mlp": 0.01121521, + "balance_loss_clip": 1.41021478, + "balance_loss_mlp": 1.08964479, + "epoch": 0.09968435292349316, + "flos": 21881603502720.0, + "grad_norm": 2.3752628990889897, + "language_loss": 0.86534941, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.89333403, + "num_input_tokens_seen": 35703250, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.31884766, + "step": 1658, + "time_per_iteration": 2.9380922317504883 + }, + { + "auxiliary_loss_clip": 0.01406291, + "auxiliary_loss_mlp": 0.01054549, + "balance_loss_clip": 1.25232887, + "balance_loss_mlp": 1.03280509, + "epoch": 0.09974447617616113, + "flos": 65411500481280.0, + "grad_norm": 0.9029586435383485, + "language_loss": 0.60973328, + "learning_rate": 3.949191309296585e-06, + "loss": 0.63434166, + "num_input_tokens_seen": 35762165, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.21777344, + "step": 1659, + "time_per_iteration": 3.3646867275238037 + }, + { + "auxiliary_loss_clip": 0.01654625, + "auxiliary_loss_mlp": 0.01104976, + "balance_loss_clip": 1.40116262, + "balance_loss_mlp": 1.07321835, + "epoch": 0.0998045994288291, + "flos": 23670380413440.0, + "grad_norm": 2.3456588179276086, + "language_loss": 0.86984038, + "learning_rate": 3.949104043956321e-06, + "loss": 0.89743626, + "num_input_tokens_seen": 35781520, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.31762695, + "step": 1660, + "time_per_iteration": 2.888805389404297 + }, + { + "auxiliary_loss_clip": 0.01662228, + "auxiliary_loss_mlp": 0.01088557, + "balance_loss_clip": 1.40853071, + "balance_loss_mlp": 1.05756259, + "epoch": 0.09986472268149707, + "flos": 19619166332160.0, + "grad_norm": 2.079259343004954, + "language_loss": 0.80868685, + "learning_rate": 3.949016704705836e-06, + "loss": 0.83619469, + "num_input_tokens_seen": 35799565, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.31005859, + "step": 1661, + "time_per_iteration": 5.724136829376221 + }, + { + "auxiliary_loss_clip": 0.01678603, + "auxiliary_loss_mlp": 0.01090577, + "balance_loss_clip": 1.41135836, + "balance_loss_mlp": 1.05848575, + "epoch": 0.09992484593416504, + "flos": 26224412192640.0, + "grad_norm": 1.8929103029583056, + "language_loss": 0.84489989, + "learning_rate": 3.948929291548443e-06, + "loss": 0.87259167, + "num_input_tokens_seen": 35821085, + "router_z_loss_clip": 2.671875, + "router_z_loss_mlp": 0.32104492, + "step": 1662, + "time_per_iteration": 2.992192268371582 + }, + { + "auxiliary_loss_clip": 0.01645088, + "auxiliary_loss_mlp": 0.01084047, + "balance_loss_clip": 1.38852763, + "balance_loss_mlp": 1.05019176, + "epoch": 0.09998496918683301, + "flos": 17502571710720.0, + "grad_norm": 1.875665560280599, + "language_loss": 0.90547681, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.93276817, + "num_input_tokens_seen": 35839840, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.33837891, + "step": 1663, + "time_per_iteration": 2.893289804458618 + }, + { + "auxiliary_loss_clip": 0.0166504, + "auxiliary_loss_mlp": 0.01073249, + "balance_loss_clip": 1.40243971, + "balance_loss_mlp": 1.04058564, + "epoch": 0.10004509243950098, + "flos": 22795460853120.0, + "grad_norm": 1.6503906532829125, + "language_loss": 0.7170831, + "learning_rate": 3.948754243526191e-06, + "loss": 0.74446595, + "num_input_tokens_seen": 35861545, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.32666016, + "step": 1664, + "time_per_iteration": 2.9072556495666504 + }, + { + "auxiliary_loss_clip": 0.01640771, + "auxiliary_loss_mlp": 0.01066298, + "balance_loss_clip": 1.38310039, + "balance_loss_mlp": 1.03244233, + "epoch": 0.10010521569216894, + "flos": 16262073014400.0, + "grad_norm": 2.6469106304180334, + "language_loss": 0.80001211, + "learning_rate": 3.94866660866797e-06, + "loss": 0.82708281, + "num_input_tokens_seen": 35878295, + "router_z_loss_clip": 2.57617188, + "router_z_loss_mlp": 0.33862305, + "step": 1665, + "time_per_iteration": 2.8685007095336914 + }, + { + "auxiliary_loss_clip": 0.01654298, + "auxiliary_loss_mlp": 0.01072739, + "balance_loss_clip": 1.39549804, + "balance_loss_mlp": 1.03795362, + "epoch": 0.10016533894483691, + "flos": 23412610932480.0, + "grad_norm": 1.716968963228285, + "language_loss": 0.70704883, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.73431921, + "num_input_tokens_seen": 35898990, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.34790039, + "step": 1666, + "time_per_iteration": 2.902575969696045 + }, + { + "auxiliary_loss_clip": 0.0166278, + "auxiliary_loss_mlp": 0.01072177, + "balance_loss_clip": 1.39978373, + "balance_loss_mlp": 1.03636634, + "epoch": 0.10022546219750489, + "flos": 19363523356800.0, + "grad_norm": 1.9047857404542026, + "language_loss": 0.80325389, + "learning_rate": 3.948491117273956e-06, + "loss": 0.83060348, + "num_input_tokens_seen": 35916225, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.35791016, + "step": 1667, + "time_per_iteration": 2.854438543319702 + }, + { + "auxiliary_loss_clip": 0.0163957, + "auxiliary_loss_mlp": 0.01063807, + "balance_loss_clip": 1.38292968, + "balance_loss_mlp": 1.02845001, + "epoch": 0.10028558545017285, + "flos": 27096255106560.0, + "grad_norm": 2.455144623915397, + "language_loss": 0.79675514, + "learning_rate": 3.948403260744817e-06, + "loss": 0.82378888, + "num_input_tokens_seen": 35934630, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.35327148, + "step": 1668, + "time_per_iteration": 2.950324058532715 + }, + { + "auxiliary_loss_clip": 0.01629477, + "auxiliary_loss_mlp": 0.01059295, + "balance_loss_clip": 1.37405825, + "balance_loss_mlp": 1.02222097, + "epoch": 0.10034570870284082, + "flos": 25857792426240.0, + "grad_norm": 2.0955813013551277, + "language_loss": 0.79053175, + "learning_rate": 3.948315330332031e-06, + "loss": 0.81741947, + "num_input_tokens_seen": 35953855, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.37060547, + "step": 1669, + "time_per_iteration": 2.970571279525757 + }, + { + "auxiliary_loss_clip": 0.01652121, + "auxiliary_loss_mlp": 0.01072164, + "balance_loss_clip": 1.38983178, + "balance_loss_mlp": 1.03611541, + "epoch": 0.1004058319555088, + "flos": 26260635294720.0, + "grad_norm": 2.169091601472982, + "language_loss": 0.86633027, + "learning_rate": 3.948227326038933e-06, + "loss": 0.89357316, + "num_input_tokens_seen": 35974555, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.3605957, + "step": 1670, + "time_per_iteration": 2.9880945682525635 + }, + { + "auxiliary_loss_clip": 0.0162786, + "auxiliary_loss_mlp": 0.0106165, + "balance_loss_clip": 1.38059139, + "balance_loss_mlp": 1.02681756, + "epoch": 0.10046595520817676, + "flos": 25385444265600.0, + "grad_norm": 1.6463926918416225, + "language_loss": 0.77591807, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.80281317, + "num_input_tokens_seen": 35996830, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.34838867, + "step": 1671, + "time_per_iteration": 2.9631965160369873 + }, + { + "auxiliary_loss_clip": 0.01390151, + "auxiliary_loss_mlp": 0.01055989, + "balance_loss_clip": 1.23043489, + "balance_loss_mlp": 1.02909517, + "epoch": 0.10052607846084473, + "flos": 67488704864640.0, + "grad_norm": 0.7857511421673885, + "language_loss": 0.60782909, + "learning_rate": 3.948051095825149e-06, + "loss": 0.63229048, + "num_input_tokens_seen": 36054465, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.26953125, + "step": 1672, + "time_per_iteration": 3.3744518756866455 + }, + { + "auxiliary_loss_clip": 0.01650999, + "auxiliary_loss_mlp": 0.01065444, + "balance_loss_clip": 1.39364076, + "balance_loss_mlp": 1.02667689, + "epoch": 0.10058620171351271, + "flos": 21370272307200.0, + "grad_norm": 1.9233180959994893, + "language_loss": 0.7783345, + "learning_rate": 3.947962869911147e-06, + "loss": 0.80549896, + "num_input_tokens_seen": 36073480, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.38793945, + "step": 1673, + "time_per_iteration": 2.835486650466919 + }, + { + "auxiliary_loss_clip": 0.01647363, + "auxiliary_loss_mlp": 0.01061035, + "balance_loss_clip": 1.39086044, + "balance_loss_mlp": 1.02484345, + "epoch": 0.10064632496618067, + "flos": 16808812905600.0, + "grad_norm": 2.341539480492534, + "language_loss": 0.75925916, + "learning_rate": 3.947874570130197e-06, + "loss": 0.78634322, + "num_input_tokens_seen": 36091830, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.36181641, + "step": 1674, + "time_per_iteration": 2.829949140548706 + }, + { + "auxiliary_loss_clip": 0.01639911, + "auxiliary_loss_mlp": 0.01071286, + "balance_loss_clip": 1.3825438, + "balance_loss_mlp": 1.03685784, + "epoch": 0.10070644821884864, + "flos": 23635288431360.0, + "grad_norm": 4.7178875817820485, + "language_loss": 0.80480331, + "learning_rate": 3.947786196485649e-06, + "loss": 0.83191532, + "num_input_tokens_seen": 36111400, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.34448242, + "step": 1675, + "time_per_iteration": 2.8814475536346436 + }, + { + "auxiliary_loss_clip": 0.01629417, + "auxiliary_loss_mlp": 0.01059034, + "balance_loss_clip": 1.37404251, + "balance_loss_mlp": 1.02398646, + "epoch": 0.1007665714715166, + "flos": 24473351462400.0, + "grad_norm": 2.176203324708197, + "language_loss": 0.82577938, + "learning_rate": 3.947697748980853e-06, + "loss": 0.85266387, + "num_input_tokens_seen": 36129345, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.3503418, + "step": 1676, + "time_per_iteration": 2.866112470626831 + }, + { + "auxiliary_loss_clip": 0.01655655, + "auxiliary_loss_mlp": 0.01065716, + "balance_loss_clip": 1.39657378, + "balance_loss_mlp": 1.0286901, + "epoch": 0.10082669472418458, + "flos": 16807546051200.0, + "grad_norm": 1.8931664892652669, + "language_loss": 0.87307936, + "learning_rate": 3.947609227619163e-06, + "loss": 0.90029299, + "num_input_tokens_seen": 36146255, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.37036133, + "step": 1677, + "time_per_iteration": 2.829728603363037 + }, + { + "auxiliary_loss_clip": 0.01642844, + "auxiliary_loss_mlp": 0.01060559, + "balance_loss_clip": 1.38630915, + "balance_loss_mlp": 1.02663219, + "epoch": 0.10088681797685255, + "flos": 13561520014080.0, + "grad_norm": 1.8140118150399882, + "language_loss": 0.87695408, + "learning_rate": 3.947520632403936e-06, + "loss": 0.90398806, + "num_input_tokens_seen": 36164050, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.33935547, + "step": 1678, + "time_per_iteration": 2.863939046859741 + }, + { + "auxiliary_loss_clip": 0.01643288, + "auxiliary_loss_mlp": 0.01064866, + "balance_loss_clip": 1.38737714, + "balance_loss_mlp": 1.03093934, + "epoch": 0.10094694122952051, + "flos": 25276820204160.0, + "grad_norm": 1.9409867713278317, + "language_loss": 0.91164911, + "learning_rate": 3.947431963338532e-06, + "loss": 0.93873066, + "num_input_tokens_seen": 36183530, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.33911133, + "step": 1679, + "time_per_iteration": 2.873224973678589 + }, + { + "auxiliary_loss_clip": 0.01397275, + "auxiliary_loss_mlp": 0.0104121, + "balance_loss_clip": 1.24328172, + "balance_loss_mlp": 1.02404392, + "epoch": 0.10100706448218849, + "flos": 69887229690240.0, + "grad_norm": 0.7849845222796132, + "language_loss": 0.53100234, + "learning_rate": 3.947343220426312e-06, + "loss": 0.5553872, + "num_input_tokens_seen": 36248550, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.171875, + "step": 1680, + "time_per_iteration": 3.4003121852874756 + }, + { + "auxiliary_loss_clip": 0.01636305, + "auxiliary_loss_mlp": 0.01059589, + "balance_loss_clip": 1.38392699, + "balance_loss_mlp": 1.02442181, + "epoch": 0.10106718773485646, + "flos": 20015720173440.0, + "grad_norm": 1.8279466723189983, + "language_loss": 0.77787495, + "learning_rate": 3.947254403670641e-06, + "loss": 0.80483389, + "num_input_tokens_seen": 36266065, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.3515625, + "step": 1681, + "time_per_iteration": 2.8465073108673096 + }, + { + "auxiliary_loss_clip": 0.01657775, + "auxiliary_loss_mlp": 0.01060418, + "balance_loss_clip": 1.39468622, + "balance_loss_mlp": 1.02527499, + "epoch": 0.10112731098752442, + "flos": 13487625976320.0, + "grad_norm": 2.3866911160155375, + "language_loss": 0.9551121, + "learning_rate": 3.947165513074889e-06, + "loss": 0.98229402, + "num_input_tokens_seen": 36280960, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.35180664, + "step": 1682, + "time_per_iteration": 2.854020595550537 + }, + { + "auxiliary_loss_clip": 0.01636086, + "auxiliary_loss_mlp": 0.01060827, + "balance_loss_clip": 1.3780334, + "balance_loss_mlp": 1.02730572, + "epoch": 0.1011874342401924, + "flos": 18525279346560.0, + "grad_norm": 1.908998024014615, + "language_loss": 0.88646305, + "learning_rate": 3.947076548642425e-06, + "loss": 0.91343218, + "num_input_tokens_seen": 36299010, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.33544922, + "step": 1683, + "time_per_iteration": 2.8425066471099854 + }, + { + "auxiliary_loss_clip": 0.01633548, + "auxiliary_loss_mlp": 0.01054732, + "balance_loss_clip": 1.3801465, + "balance_loss_mlp": 1.02216423, + "epoch": 0.10124755749286037, + "flos": 20712238911360.0, + "grad_norm": 1.952470283015156, + "language_loss": 0.76025844, + "learning_rate": 3.946987510376624e-06, + "loss": 0.7871412, + "num_input_tokens_seen": 36318400, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.32568359, + "step": 1684, + "time_per_iteration": 2.9909896850585938 + }, + { + "auxiliary_loss_clip": 0.01411753, + "auxiliary_loss_mlp": 0.01035972, + "balance_loss_clip": 1.25598288, + "balance_loss_mlp": 1.0148958, + "epoch": 0.10130768074552833, + "flos": 56141490257280.0, + "grad_norm": 0.7616883329885786, + "language_loss": 0.61197436, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.6364516, + "num_input_tokens_seen": 36381815, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.2109375, + "step": 1685, + "time_per_iteration": 3.4471018314361572 + }, + { + "auxiliary_loss_clip": 0.01631877, + "auxiliary_loss_mlp": 0.01062662, + "balance_loss_clip": 1.37613583, + "balance_loss_mlp": 1.0272094, + "epoch": 0.1013678039981963, + "flos": 33415652448000.0, + "grad_norm": 2.460523872673505, + "language_loss": 0.62732834, + "learning_rate": 3.946809212358516e-06, + "loss": 0.65427375, + "num_input_tokens_seen": 36404320, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.35449219, + "step": 1686, + "time_per_iteration": 2.977872133255005 + }, + { + "auxiliary_loss_clip": 0.01626053, + "auxiliary_loss_mlp": 0.01062362, + "balance_loss_clip": 1.37669873, + "balance_loss_mlp": 1.02617049, + "epoch": 0.10142792725086427, + "flos": 31917972453120.0, + "grad_norm": 2.324665368725329, + "language_loss": 0.81534594, + "learning_rate": 3.946719952612972e-06, + "loss": 0.84223008, + "num_input_tokens_seen": 36427510, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.36181641, + "step": 1687, + "time_per_iteration": 3.0080950260162354 + }, + { + "auxiliary_loss_clip": 0.01652481, + "auxiliary_loss_mlp": 0.01063124, + "balance_loss_clip": 1.39276397, + "balance_loss_mlp": 1.02314115, + "epoch": 0.10148805050353224, + "flos": 28487709014400.0, + "grad_norm": 1.6084524842911123, + "language_loss": 0.73304212, + "learning_rate": 3.94663061904761e-06, + "loss": 0.76019812, + "num_input_tokens_seen": 36448230, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.40014648, + "step": 1688, + "time_per_iteration": 4.3482506275177 + }, + { + "auxiliary_loss_clip": 0.01622541, + "auxiliary_loss_mlp": 0.01056848, + "balance_loss_clip": 1.37256205, + "balance_loss_mlp": 1.02299201, + "epoch": 0.1015481737562002, + "flos": 25158061307520.0, + "grad_norm": 2.0632919782674644, + "language_loss": 0.87715614, + "learning_rate": 3.94654121166582e-06, + "loss": 0.90394998, + "num_input_tokens_seen": 36464395, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.33862305, + "step": 1689, + "time_per_iteration": 2.8477039337158203 + }, + { + "auxiliary_loss_clip": 0.01601391, + "auxiliary_loss_mlp": 0.01055077, + "balance_loss_clip": 1.35162401, + "balance_loss_mlp": 1.02331924, + "epoch": 0.10160829700886818, + "flos": 30894902858880.0, + "grad_norm": 1.7257431313037477, + "language_loss": 0.90173066, + "learning_rate": 3.946451730470993e-06, + "loss": 0.92829537, + "num_input_tokens_seen": 36486475, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.31713867, + "step": 1690, + "time_per_iteration": 3.057968854904175 + }, + { + "auxiliary_loss_clip": 0.01634154, + "auxiliary_loss_mlp": 0.0105823, + "balance_loss_clip": 1.37967944, + "balance_loss_mlp": 1.02396917, + "epoch": 0.10166842026153615, + "flos": 20421594443520.0, + "grad_norm": 2.349401431963509, + "language_loss": 0.84699166, + "learning_rate": 3.946362175466521e-06, + "loss": 0.87391555, + "num_input_tokens_seen": 36505310, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.34228516, + "step": 1691, + "time_per_iteration": 3.0034735202789307 + }, + { + "auxiliary_loss_clip": 0.01631343, + "auxiliary_loss_mlp": 0.01060545, + "balance_loss_clip": 1.37709665, + "balance_loss_mlp": 1.02711892, + "epoch": 0.10172854351420411, + "flos": 33490722850560.0, + "grad_norm": 1.7145380462601965, + "language_loss": 0.68260574, + "learning_rate": 3.946272546655801e-06, + "loss": 0.70952463, + "num_input_tokens_seen": 36529820, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.33447266, + "step": 1692, + "time_per_iteration": 4.420594215393066 + }, + { + "auxiliary_loss_clip": 0.01621381, + "auxiliary_loss_mlp": 0.01062277, + "balance_loss_clip": 1.37000465, + "balance_loss_mlp": 1.02975702, + "epoch": 0.1017886667668721, + "flos": 23560851456000.0, + "grad_norm": 1.8693052396234329, + "language_loss": 0.77762872, + "learning_rate": 3.94618284404223e-06, + "loss": 0.80446529, + "num_input_tokens_seen": 36549000, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.32519531, + "step": 1693, + "time_per_iteration": 2.916638135910034 + }, + { + "auxiliary_loss_clip": 0.01624915, + "auxiliary_loss_mlp": 0.01063789, + "balance_loss_clip": 1.36832333, + "balance_loss_mlp": 1.02940929, + "epoch": 0.10184879001954006, + "flos": 23306837293440.0, + "grad_norm": 1.7251910500257541, + "language_loss": 0.88054377, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.90743083, + "num_input_tokens_seen": 36567515, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.34399414, + "step": 1694, + "time_per_iteration": 2.899407148361206 + }, + { + "auxiliary_loss_clip": 0.01634678, + "auxiliary_loss_mlp": 0.01071615, + "balance_loss_clip": 1.37615085, + "balance_loss_mlp": 1.03501797, + "epoch": 0.10190891327220802, + "flos": 18342354044160.0, + "grad_norm": 1.9353100494807025, + "language_loss": 0.81407481, + "learning_rate": 3.946003217420147e-06, + "loss": 0.84113777, + "num_input_tokens_seen": 36586190, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.36547852, + "step": 1695, + "time_per_iteration": 2.8439905643463135 + }, + { + "auxiliary_loss_clip": 0.01634777, + "auxiliary_loss_mlp": 0.01065726, + "balance_loss_clip": 1.3777113, + "balance_loss_mlp": 1.03020167, + "epoch": 0.10196903652487599, + "flos": 26475666422400.0, + "grad_norm": 1.8203360153226271, + "language_loss": 0.87410241, + "learning_rate": 3.945913293418447e-06, + "loss": 0.90110743, + "num_input_tokens_seen": 36607495, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.35522461, + "step": 1696, + "time_per_iteration": 5.84446120262146 + }, + { + "auxiliary_loss_clip": 0.01610557, + "auxiliary_loss_mlp": 0.01060179, + "balance_loss_clip": 1.36300743, + "balance_loss_mlp": 1.02577507, + "epoch": 0.10202915977754397, + "flos": 21878979304320.0, + "grad_norm": 1.7463107283667392, + "language_loss": 0.82610399, + "learning_rate": 3.945823295627519e-06, + "loss": 0.85281134, + "num_input_tokens_seen": 36628555, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.34375, + "step": 1697, + "time_per_iteration": 2.9421753883361816 + }, + { + "auxiliary_loss_clip": 0.0164112, + "auxiliary_loss_mlp": 0.01061551, + "balance_loss_clip": 1.38431454, + "balance_loss_mlp": 1.02831578, + "epoch": 0.10208928303021193, + "flos": 22319945556480.0, + "grad_norm": 2.0228655405503746, + "language_loss": 0.826581, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.85360771, + "num_input_tokens_seen": 36646250, + "router_z_loss_clip": 2.56835938, + "router_z_loss_mlp": 0.33227539, + "step": 1698, + "time_per_iteration": 2.8640811443328857 + }, + { + "auxiliary_loss_clip": 0.01640887, + "auxiliary_loss_mlp": 0.01056985, + "balance_loss_clip": 1.38619721, + "balance_loss_mlp": 1.02467918, + "epoch": 0.1021494062828799, + "flos": 22135255706880.0, + "grad_norm": 4.768320934070104, + "language_loss": 0.77478015, + "learning_rate": 3.945643078691637e-06, + "loss": 0.80175883, + "num_input_tokens_seen": 36666675, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.32250977, + "step": 1699, + "time_per_iteration": 2.9023337364196777 + }, + { + "auxiliary_loss_clip": 0.01620192, + "auxiliary_loss_mlp": 0.01057445, + "balance_loss_clip": 1.3701216, + "balance_loss_mlp": 1.0254494, + "epoch": 0.10220952953554788, + "flos": 19656339575040.0, + "grad_norm": 1.9656606228135034, + "language_loss": 0.80870491, + "learning_rate": 3.945552859553516e-06, + "loss": 0.83548129, + "num_input_tokens_seen": 36685225, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.31982422, + "step": 1700, + "time_per_iteration": 2.926096200942993 + }, + { + "auxiliary_loss_clip": 0.01646348, + "auxiliary_loss_mlp": 0.01060103, + "balance_loss_clip": 1.39144063, + "balance_loss_mlp": 1.02844143, + "epoch": 0.10226965278821584, + "flos": 29798889367680.0, + "grad_norm": 1.7342292760684124, + "language_loss": 0.78124261, + "learning_rate": 3.945462566639836e-06, + "loss": 0.80830705, + "num_input_tokens_seen": 36705985, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.31665039, + "step": 1701, + "time_per_iteration": 2.9173777103424072 + }, + { + "auxiliary_loss_clip": 0.01649863, + "auxiliary_loss_mlp": 0.01064524, + "balance_loss_clip": 1.38818121, + "balance_loss_mlp": 1.03271842, + "epoch": 0.10232977604088381, + "flos": 27028333382400.0, + "grad_norm": 1.782857266774266, + "language_loss": 0.78808391, + "learning_rate": 3.945372199954019e-06, + "loss": 0.81522781, + "num_input_tokens_seen": 36725815, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.31811523, + "step": 1702, + "time_per_iteration": 2.984719753265381 + }, + { + "auxiliary_loss_clip": 0.01615345, + "auxiliary_loss_mlp": 0.01057498, + "balance_loss_clip": 1.36606944, + "balance_loss_mlp": 1.0261699, + "epoch": 0.10238989929355179, + "flos": 20787354558720.0, + "grad_norm": 1.8807014098569284, + "language_loss": 0.96009856, + "learning_rate": 3.945281759499494e-06, + "loss": 0.98682702, + "num_input_tokens_seen": 36742345, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.31347656, + "step": 1703, + "time_per_iteration": 2.875232458114624 + }, + { + "auxiliary_loss_clip": 0.01399946, + "auxiliary_loss_mlp": 0.01028909, + "balance_loss_clip": 1.24483776, + "balance_loss_mlp": 1.00802314, + "epoch": 0.10245002254621975, + "flos": 57726484012800.0, + "grad_norm": 0.8912655234827841, + "language_loss": 0.55162191, + "learning_rate": 3.94519124527969e-06, + "loss": 0.57591045, + "num_input_tokens_seen": 36798775, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.20898438, + "step": 1704, + "time_per_iteration": 3.2951996326446533 + }, + { + "auxiliary_loss_clip": 0.01629984, + "auxiliary_loss_mlp": 0.01061435, + "balance_loss_clip": 1.37667418, + "balance_loss_mlp": 1.02791381, + "epoch": 0.10251014579888772, + "flos": 16808812905600.0, + "grad_norm": 2.783030814298188, + "language_loss": 0.85636693, + "learning_rate": 3.945100657298039e-06, + "loss": 0.88328117, + "num_input_tokens_seen": 36816295, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.33544922, + "step": 1705, + "time_per_iteration": 2.8939990997314453 + }, + { + "auxiliary_loss_clip": 0.01395563, + "auxiliary_loss_mlp": 0.01027179, + "balance_loss_clip": 1.24225283, + "balance_loss_mlp": 1.01001251, + "epoch": 0.1025702690515557, + "flos": 68595034170240.0, + "grad_norm": 0.771639341507712, + "language_loss": 0.60469592, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.62892336, + "num_input_tokens_seen": 36882030, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.171875, + "step": 1706, + "time_per_iteration": 3.397503137588501 + }, + { + "auxiliary_loss_clip": 0.01636447, + "auxiliary_loss_mlp": 0.01051307, + "balance_loss_clip": 1.38145471, + "balance_loss_mlp": 1.0201931, + "epoch": 0.10263039230422366, + "flos": 14874872117760.0, + "grad_norm": 2.0092338966768484, + "language_loss": 0.87398982, + "learning_rate": 3.94491926006294e-06, + "loss": 0.90086734, + "num_input_tokens_seen": 36899245, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.31103516, + "step": 1707, + "time_per_iteration": 2.943096876144409 + }, + { + "auxiliary_loss_clip": 0.01620514, + "auxiliary_loss_mlp": 0.01054021, + "balance_loss_clip": 1.37343407, + "balance_loss_mlp": 1.02283573, + "epoch": 0.10269051555689163, + "flos": 25348452001920.0, + "grad_norm": 1.4620615314260867, + "language_loss": 0.73841333, + "learning_rate": 3.944828450816369e-06, + "loss": 0.76515871, + "num_input_tokens_seen": 36920950, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.31176758, + "step": 1708, + "time_per_iteration": 2.9941771030426025 + }, + { + "auxiliary_loss_clip": 0.01630087, + "auxiliary_loss_mlp": 0.01061027, + "balance_loss_clip": 1.37949216, + "balance_loss_mlp": 1.0273385, + "epoch": 0.10275063880955959, + "flos": 21078587208960.0, + "grad_norm": 1.5984931920141832, + "language_loss": 0.91603184, + "learning_rate": 3.944737567821709e-06, + "loss": 0.94294298, + "num_input_tokens_seen": 36938900, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.33691406, + "step": 1709, + "time_per_iteration": 2.946044445037842 + }, + { + "auxiliary_loss_clip": 0.01621395, + "auxiliary_loss_mlp": 0.01057118, + "balance_loss_clip": 1.37321138, + "balance_loss_mlp": 1.02433586, + "epoch": 0.10281076206222757, + "flos": 30377418370560.0, + "grad_norm": 1.9167532283037152, + "language_loss": 0.89144075, + "learning_rate": 3.944646611082406e-06, + "loss": 0.91822582, + "num_input_tokens_seen": 36957010, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.32763672, + "step": 1710, + "time_per_iteration": 3.063547134399414 + }, + { + "auxiliary_loss_clip": 0.01615704, + "auxiliary_loss_mlp": 0.01051974, + "balance_loss_clip": 1.36817312, + "balance_loss_mlp": 1.02124166, + "epoch": 0.10287088531489554, + "flos": 22428298149120.0, + "grad_norm": 1.7118212119594622, + "language_loss": 0.80330074, + "learning_rate": 3.944555580601908e-06, + "loss": 0.82997751, + "num_input_tokens_seen": 36977690, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.30688477, + "step": 1711, + "time_per_iteration": 3.0635430812835693 + }, + { + "auxiliary_loss_clip": 0.01626165, + "auxiliary_loss_mlp": 0.01051418, + "balance_loss_clip": 1.37362933, + "balance_loss_mlp": 1.01908851, + "epoch": 0.1029310085675635, + "flos": 25126091216640.0, + "grad_norm": 1.814239451675464, + "language_loss": 0.74798751, + "learning_rate": 3.944464476383668e-06, + "loss": 0.77476335, + "num_input_tokens_seen": 36997300, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.32324219, + "step": 1712, + "time_per_iteration": 2.9420578479766846 + }, + { + "auxiliary_loss_clip": 0.01615244, + "auxiliary_loss_mlp": 0.01060991, + "balance_loss_clip": 1.3730433, + "balance_loss_mlp": 1.02918637, + "epoch": 0.10299113182023148, + "flos": 19875307000320.0, + "grad_norm": 1.9701052394331993, + "language_loss": 0.87746233, + "learning_rate": 3.94437329843114e-06, + "loss": 0.90422463, + "num_input_tokens_seen": 37016110, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.31811523, + "step": 1713, + "time_per_iteration": 2.9421658515930176 + }, + { + "auxiliary_loss_clip": 0.01620043, + "auxiliary_loss_mlp": 0.01058478, + "balance_loss_clip": 1.37233698, + "balance_loss_mlp": 1.02726865, + "epoch": 0.10305125507289944, + "flos": 20456957894400.0, + "grad_norm": 4.83932868525954, + "language_loss": 0.73517847, + "learning_rate": 3.944282046747782e-06, + "loss": 0.76196373, + "num_input_tokens_seen": 37036405, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.31176758, + "step": 1714, + "time_per_iteration": 2.9591064453125 + }, + { + "auxiliary_loss_clip": 0.01629074, + "auxiliary_loss_mlp": 0.01055526, + "balance_loss_clip": 1.37502635, + "balance_loss_mlp": 1.02415037, + "epoch": 0.10311137832556741, + "flos": 26262128373120.0, + "grad_norm": 2.6008867428651765, + "language_loss": 0.91686141, + "learning_rate": 3.944190721337053e-06, + "loss": 0.94370747, + "num_input_tokens_seen": 37057580, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.3137207, + "step": 1715, + "time_per_iteration": 3.167482852935791 + }, + { + "auxiliary_loss_clip": 0.0161421, + "auxiliary_loss_mlp": 0.01057665, + "balance_loss_clip": 1.36636162, + "balance_loss_mlp": 1.02190268, + "epoch": 0.10317150157823539, + "flos": 35312555727360.0, + "grad_norm": 1.755137452651067, + "language_loss": 0.76998538, + "learning_rate": 3.944099322202418e-06, + "loss": 0.79670411, + "num_input_tokens_seen": 37079120, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.35766602, + "step": 1716, + "time_per_iteration": 3.0549886226654053 + }, + { + "auxiliary_loss_clip": 0.01626232, + "auxiliary_loss_mlp": 0.0106336, + "balance_loss_clip": 1.37607586, + "balance_loss_mlp": 1.03255641, + "epoch": 0.10323162483090335, + "flos": 25751159136000.0, + "grad_norm": 1.9021524887456134, + "language_loss": 0.86877179, + "learning_rate": 3.944007849347342e-06, + "loss": 0.89566767, + "num_input_tokens_seen": 37099710, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.30786133, + "step": 1717, + "time_per_iteration": 2.9758265018463135 + }, + { + "auxiliary_loss_clip": 0.01620729, + "auxiliary_loss_mlp": 0.01069751, + "balance_loss_clip": 1.36914396, + "balance_loss_mlp": 1.03849483, + "epoch": 0.10329174808357132, + "flos": 16298567585280.0, + "grad_norm": 1.8529052019820862, + "language_loss": 0.83917761, + "learning_rate": 3.943916302775292e-06, + "loss": 0.86608243, + "num_input_tokens_seen": 37117775, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.3125, + "step": 1718, + "time_per_iteration": 2.8774306774139404 + }, + { + "auxiliary_loss_clip": 0.01623261, + "auxiliary_loss_mlp": 0.01063341, + "balance_loss_clip": 1.37648225, + "balance_loss_mlp": 1.03175032, + "epoch": 0.10335187133623928, + "flos": 36701611660800.0, + "grad_norm": 1.8804761346699725, + "language_loss": 0.74158484, + "learning_rate": 3.943824682489742e-06, + "loss": 0.7684508, + "num_input_tokens_seen": 37140280, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.31567383, + "step": 1719, + "time_per_iteration": 3.0115675926208496 + }, + { + "auxiliary_loss_clip": 0.01613678, + "auxiliary_loss_mlp": 0.01062561, + "balance_loss_clip": 1.36660457, + "balance_loss_mlp": 1.03108954, + "epoch": 0.10341199458890726, + "flos": 14983812892800.0, + "grad_norm": 1.8839162917606667, + "language_loss": 0.94453859, + "learning_rate": 3.9437329884941665e-06, + "loss": 0.97130096, + "num_input_tokens_seen": 37158350, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.31494141, + "step": 1720, + "time_per_iteration": 2.893920660018921 + }, + { + "auxiliary_loss_clip": 0.01621202, + "auxiliary_loss_mlp": 0.01070525, + "balance_loss_clip": 1.37060785, + "balance_loss_mlp": 1.03626406, + "epoch": 0.10347211784157523, + "flos": 21041142497280.0, + "grad_norm": 2.260984925709028, + "language_loss": 0.80841869, + "learning_rate": 3.943641220792039e-06, + "loss": 0.83533597, + "num_input_tokens_seen": 37177120, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.34228516, + "step": 1721, + "time_per_iteration": 2.898102283477783 + }, + { + "auxiliary_loss_clip": 0.01631119, + "auxiliary_loss_mlp": 0.01074171, + "balance_loss_clip": 1.37593186, + "balance_loss_mlp": 1.04272306, + "epoch": 0.1035322410942432, + "flos": 19801593941760.0, + "grad_norm": 1.7245980568394772, + "language_loss": 0.81792909, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.84498203, + "num_input_tokens_seen": 37195895, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.31420898, + "step": 1722, + "time_per_iteration": 2.916135311126709 + }, + { + "auxiliary_loss_clip": 0.013979, + "auxiliary_loss_mlp": 0.01050343, + "balance_loss_clip": 1.24428964, + "balance_loss_mlp": 1.02869463, + "epoch": 0.10359236434691117, + "flos": 52725506192640.0, + "grad_norm": 0.9778290645399514, + "language_loss": 0.67279178, + "learning_rate": 3.943457464282059e-06, + "loss": 0.69727421, + "num_input_tokens_seen": 37247270, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.21679688, + "step": 1723, + "time_per_iteration": 4.5575971603393555 + }, + { + "auxiliary_loss_clip": 0.01625279, + "auxiliary_loss_mlp": 0.01064348, + "balance_loss_clip": 1.37117147, + "balance_loss_mlp": 1.03204262, + "epoch": 0.10365248759957914, + "flos": 18414483534720.0, + "grad_norm": 2.6174777984540283, + "language_loss": 0.80086613, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.82776248, + "num_input_tokens_seen": 37265595, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.32324219, + "step": 1724, + "time_per_iteration": 2.8233587741851807 + }, + { + "auxiliary_loss_clip": 0.01626068, + "auxiliary_loss_mlp": 0.0105828, + "balance_loss_clip": 1.37080312, + "balance_loss_mlp": 1.02699983, + "epoch": 0.1037126108522471, + "flos": 47569166432640.0, + "grad_norm": 1.8249106344009134, + "language_loss": 0.75916672, + "learning_rate": 3.943273412987676e-06, + "loss": 0.78601015, + "num_input_tokens_seen": 37286660, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.31274414, + "step": 1725, + "time_per_iteration": 3.142331838607788 + }, + { + "auxiliary_loss_clip": 0.01621412, + "auxiliary_loss_mlp": 0.01057339, + "balance_loss_clip": 1.37257409, + "balance_loss_mlp": 1.02603436, + "epoch": 0.10377273410491508, + "flos": 22825983110400.0, + "grad_norm": 2.0201606291783634, + "language_loss": 0.76212686, + "learning_rate": 3.943181276805054e-06, + "loss": 0.78891438, + "num_input_tokens_seen": 37304915, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.31298828, + "step": 1726, + "time_per_iteration": 2.8910768032073975 + }, + { + "auxiliary_loss_clip": 0.01627055, + "auxiliary_loss_mlp": 0.01067697, + "balance_loss_clip": 1.3719933, + "balance_loss_mlp": 1.0334363, + "epoch": 0.10383285735758305, + "flos": 26149341790080.0, + "grad_norm": 2.067950945008, + "language_loss": 0.76024151, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.78718907, + "num_input_tokens_seen": 37325265, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.34228516, + "step": 1727, + "time_per_iteration": 2.9034175872802734 + }, + { + "auxiliary_loss_clip": 0.01615915, + "auxiliary_loss_mlp": 0.01059528, + "balance_loss_clip": 1.36618042, + "balance_loss_mlp": 1.02741313, + "epoch": 0.10389298061025101, + "flos": 17100498003840.0, + "grad_norm": 1.9939065911736176, + "language_loss": 0.86376309, + "learning_rate": 3.942996783386422e-06, + "loss": 0.89051759, + "num_input_tokens_seen": 37341650, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.32104492, + "step": 1728, + "time_per_iteration": 4.333642482757568 + }, + { + "auxiliary_loss_clip": 0.01604985, + "auxiliary_loss_mlp": 0.01056937, + "balance_loss_clip": 1.35644317, + "balance_loss_mlp": 1.02372575, + "epoch": 0.10395310386291898, + "flos": 20786087704320.0, + "grad_norm": 1.8805302763898075, + "language_loss": 0.71632564, + "learning_rate": 3.942904426157406e-06, + "loss": 0.74294484, + "num_input_tokens_seen": 37360270, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.33227539, + "step": 1729, + "time_per_iteration": 2.8833014965057373 + }, + { + "auxiliary_loss_clip": 0.01635032, + "auxiliary_loss_mlp": 0.01054956, + "balance_loss_clip": 1.37983799, + "balance_loss_mlp": 1.02126718, + "epoch": 0.10401322711558696, + "flos": 12827511319680.0, + "grad_norm": 3.6652999697063984, + "language_loss": 0.83303535, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.85993516, + "num_input_tokens_seen": 37375225, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.33666992, + "step": 1730, + "time_per_iteration": 2.815133571624756 + }, + { + "auxiliary_loss_clip": 0.01614463, + "auxiliary_loss_mlp": 0.01062061, + "balance_loss_clip": 1.36434507, + "balance_loss_mlp": 1.03044677, + "epoch": 0.10407335036825492, + "flos": 23194910361600.0, + "grad_norm": 1.9724207556201654, + "language_loss": 0.7723, + "learning_rate": 3.942719490677489e-06, + "loss": 0.79906535, + "num_input_tokens_seen": 37395165, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.31616211, + "step": 1731, + "time_per_iteration": 4.340686321258545 + }, + { + "auxiliary_loss_clip": 0.01610036, + "auxiliary_loss_mlp": 0.01055096, + "balance_loss_clip": 1.36427915, + "balance_loss_mlp": 1.02352917, + "epoch": 0.10413347362092289, + "flos": 26115335683200.0, + "grad_norm": 1.8204582053592702, + "language_loss": 0.84059095, + "learning_rate": 3.9426269124336e-06, + "loss": 0.86724234, + "num_input_tokens_seen": 37414845, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.31518555, + "step": 1732, + "time_per_iteration": 2.9108691215515137 + }, + { + "auxiliary_loss_clip": 0.01634259, + "auxiliary_loss_mlp": 0.01062629, + "balance_loss_clip": 1.38303089, + "balance_loss_mlp": 1.02903569, + "epoch": 0.10419359687359087, + "flos": 12648748538880.0, + "grad_norm": 1.9057474517808413, + "language_loss": 0.85419464, + "learning_rate": 3.942534260525104e-06, + "loss": 0.88116348, + "num_input_tokens_seen": 37432490, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.3359375, + "step": 1733, + "time_per_iteration": 2.8295109272003174 + }, + { + "auxiliary_loss_clip": 0.01638901, + "auxiliary_loss_mlp": 0.01059647, + "balance_loss_clip": 1.38396883, + "balance_loss_mlp": 1.02433693, + "epoch": 0.10425372012625883, + "flos": 12131309295360.0, + "grad_norm": 2.210485712887533, + "language_loss": 0.77781367, + "learning_rate": 3.942441534955514e-06, + "loss": 0.8047992, + "num_input_tokens_seen": 37449435, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.35327148, + "step": 1734, + "time_per_iteration": 2.8586843013763428 + }, + { + "auxiliary_loss_clip": 0.01620967, + "auxiliary_loss_mlp": 0.01057149, + "balance_loss_clip": 1.37493849, + "balance_loss_mlp": 1.02574956, + "epoch": 0.1043138433789268, + "flos": 25348406757120.0, + "grad_norm": 1.6602544486123059, + "language_loss": 0.76127332, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.78805453, + "num_input_tokens_seen": 37469105, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.3137207, + "step": 1735, + "time_per_iteration": 2.87495493888855 + }, + { + "auxiliary_loss_clip": 0.01642019, + "auxiliary_loss_mlp": 0.01054354, + "balance_loss_clip": 1.38703966, + "balance_loss_mlp": 1.02204859, + "epoch": 0.10437396663159478, + "flos": 29178617397120.0, + "grad_norm": 1.7367010196584898, + "language_loss": 0.79951179, + "learning_rate": 3.94225586284712e-06, + "loss": 0.8264755, + "num_input_tokens_seen": 37490540, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.32299805, + "step": 1736, + "time_per_iteration": 2.9355287551879883 + }, + { + "auxiliary_loss_clip": 0.01626774, + "auxiliary_loss_mlp": 0.01064528, + "balance_loss_clip": 1.37747741, + "balance_loss_mlp": 1.02852678, + "epoch": 0.10443408988426274, + "flos": 25091406437760.0, + "grad_norm": 3.5098811020067897, + "language_loss": 0.72250807, + "learning_rate": 3.942162916315356e-06, + "loss": 0.74942112, + "num_input_tokens_seen": 37511905, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.35986328, + "step": 1737, + "time_per_iteration": 2.8991878032684326 + }, + { + "auxiliary_loss_clip": 0.01649648, + "auxiliary_loss_mlp": 0.01062488, + "balance_loss_clip": 1.3893292, + "balance_loss_mlp": 1.02689159, + "epoch": 0.1044942131369307, + "flos": 26770654391040.0, + "grad_norm": 2.716988155793134, + "language_loss": 0.83254182, + "learning_rate": 3.942069896136581e-06, + "loss": 0.85966313, + "num_input_tokens_seen": 37533635, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.35620117, + "step": 1738, + "time_per_iteration": 2.9242448806762695 + }, + { + "auxiliary_loss_clip": 0.01645684, + "auxiliary_loss_mlp": 0.01062972, + "balance_loss_clip": 1.38940787, + "balance_loss_mlp": 1.02892566, + "epoch": 0.10455433638959867, + "flos": 18451837756800.0, + "grad_norm": 2.2672206113952726, + "language_loss": 0.76530719, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.79239368, + "num_input_tokens_seen": 37552035, + "router_z_loss_clip": 2.56445312, + "router_z_loss_mlp": 0.34057617, + "step": 1739, + "time_per_iteration": 2.8779656887054443 + }, + { + "auxiliary_loss_clip": 0.01637994, + "auxiliary_loss_mlp": 0.01051227, + "balance_loss_clip": 1.38523638, + "balance_loss_mlp": 1.01844454, + "epoch": 0.10461445964226665, + "flos": 23228644999680.0, + "grad_norm": 2.064501779280056, + "language_loss": 0.78779864, + "learning_rate": 3.941883634852104e-06, + "loss": 0.81469095, + "num_input_tokens_seen": 37571540, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.32788086, + "step": 1740, + "time_per_iteration": 2.9048547744750977 + }, + { + "auxiliary_loss_clip": 0.01647777, + "auxiliary_loss_mlp": 0.01055398, + "balance_loss_clip": 1.39894009, + "balance_loss_mlp": 1.02061248, + "epoch": 0.10467458289493461, + "flos": 24353959138560.0, + "grad_norm": 1.9460175288987018, + "language_loss": 0.87532246, + "learning_rate": 3.941790393753467e-06, + "loss": 0.90235424, + "num_input_tokens_seen": 37588265, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.34790039, + "step": 1741, + "time_per_iteration": 2.930488348007202 + }, + { + "auxiliary_loss_clip": 0.01672172, + "auxiliary_loss_mlp": 0.01055087, + "balance_loss_clip": 1.41378522, + "balance_loss_mlp": 1.02177966, + "epoch": 0.10473470614760258, + "flos": 21297826103040.0, + "grad_norm": 2.2914045243327057, + "language_loss": 0.77325141, + "learning_rate": 3.941697079021942e-06, + "loss": 0.800524, + "num_input_tokens_seen": 37606860, + "router_z_loss_clip": 2.58398438, + "router_z_loss_mlp": 0.33276367, + "step": 1742, + "time_per_iteration": 2.8465445041656494 + }, + { + "auxiliary_loss_clip": 0.01643238, + "auxiliary_loss_mlp": 0.01059224, + "balance_loss_clip": 1.39175677, + "balance_loss_mlp": 1.02641714, + "epoch": 0.10479482940027056, + "flos": 21696642184320.0, + "grad_norm": 2.2064236408363382, + "language_loss": 0.88134134, + "learning_rate": 3.94160369066107e-06, + "loss": 0.90836596, + "num_input_tokens_seen": 37625210, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.328125, + "step": 1743, + "time_per_iteration": 2.9496910572052 + }, + { + "auxiliary_loss_clip": 0.01626785, + "auxiliary_loss_mlp": 0.01057983, + "balance_loss_clip": 1.37817609, + "balance_loss_mlp": 1.0234834, + "epoch": 0.10485495265293852, + "flos": 21582045809280.0, + "grad_norm": 2.28091701033822, + "language_loss": 0.77172786, + "learning_rate": 3.941510228674391e-06, + "loss": 0.79857552, + "num_input_tokens_seen": 37644110, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.3449707, + "step": 1744, + "time_per_iteration": 2.9551281929016113 + }, + { + "auxiliary_loss_clip": 0.01646057, + "auxiliary_loss_mlp": 0.01052004, + "balance_loss_clip": 1.3967216, + "balance_loss_mlp": 1.0191735, + "epoch": 0.10491507590560649, + "flos": 37976116464000.0, + "grad_norm": 1.9689631735422777, + "language_loss": 0.81944823, + "learning_rate": 3.941416693065451e-06, + "loss": 0.84642887, + "num_input_tokens_seen": 37665800, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.32836914, + "step": 1745, + "time_per_iteration": 3.014305353164673 + }, + { + "auxiliary_loss_clip": 0.01635324, + "auxiliary_loss_mlp": 0.01054086, + "balance_loss_clip": 1.38230014, + "balance_loss_mlp": 1.02321076, + "epoch": 0.10497519915827447, + "flos": 26407563719040.0, + "grad_norm": 2.0253298486749847, + "language_loss": 0.84448278, + "learning_rate": 3.941323083837794e-06, + "loss": 0.87137681, + "num_input_tokens_seen": 37685095, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.30834961, + "step": 1746, + "time_per_iteration": 2.9951868057250977 + }, + { + "auxiliary_loss_clip": 0.01636734, + "auxiliary_loss_mlp": 0.01060982, + "balance_loss_clip": 1.38903379, + "balance_loss_mlp": 1.02917719, + "epoch": 0.10503532241094243, + "flos": 40677167157120.0, + "grad_norm": 1.5152703881523422, + "language_loss": 0.71764612, + "learning_rate": 3.941229400994971e-06, + "loss": 0.7446233, + "num_input_tokens_seen": 37707445, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.31811523, + "step": 1747, + "time_per_iteration": 3.1420838832855225 + }, + { + "auxiliary_loss_clip": 0.01679089, + "auxiliary_loss_mlp": 0.01060797, + "balance_loss_clip": 1.41551018, + "balance_loss_mlp": 1.02572584, + "epoch": 0.1050954456636104, + "flos": 29801558810880.0, + "grad_norm": 2.1219481368241087, + "language_loss": 0.85959506, + "learning_rate": 3.941135644540535e-06, + "loss": 0.88699389, + "num_input_tokens_seen": 37728325, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.35058594, + "step": 1748, + "time_per_iteration": 2.95402193069458 + }, + { + "auxiliary_loss_clip": 0.01636631, + "auxiliary_loss_mlp": 0.01058852, + "balance_loss_clip": 1.38253999, + "balance_loss_mlp": 1.02342248, + "epoch": 0.10515556891627838, + "flos": 23958626906880.0, + "grad_norm": 1.7758675812088518, + "language_loss": 0.73059434, + "learning_rate": 3.941041814478041e-06, + "loss": 0.75754917, + "num_input_tokens_seen": 37748910, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.35400391, + "step": 1749, + "time_per_iteration": 2.9274566173553467 + }, + { + "auxiliary_loss_clip": 0.0162871, + "auxiliary_loss_mlp": 0.01053428, + "balance_loss_clip": 1.38015485, + "balance_loss_mlp": 1.01985884, + "epoch": 0.10521569216894634, + "flos": 18268686230400.0, + "grad_norm": 2.0357412186842625, + "language_loss": 0.83720255, + "learning_rate": 3.940947910811047e-06, + "loss": 0.86402392, + "num_input_tokens_seen": 37765745, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.33569336, + "step": 1750, + "time_per_iteration": 3.0262739658355713 + }, + { + "auxiliary_loss_clip": 0.01641418, + "auxiliary_loss_mlp": 0.01059729, + "balance_loss_clip": 1.38756061, + "balance_loss_mlp": 1.02565897, + "epoch": 0.10527581542161431, + "flos": 15638860131840.0, + "grad_norm": 2.465969131162752, + "language_loss": 0.94313425, + "learning_rate": 3.940853933543114e-06, + "loss": 0.9701457, + "num_input_tokens_seen": 37780520, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.34082031, + "step": 1751, + "time_per_iteration": 2.8915250301361084 + }, + { + "auxiliary_loss_clip": 0.01645281, + "auxiliary_loss_mlp": 0.01058645, + "balance_loss_clip": 1.3952477, + "balance_loss_mlp": 1.02650583, + "epoch": 0.10533593867428227, + "flos": 18305768983680.0, + "grad_norm": 1.9159637713035294, + "language_loss": 0.79866433, + "learning_rate": 3.940759882677805e-06, + "loss": 0.82570356, + "num_input_tokens_seen": 37799515, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.32128906, + "step": 1752, + "time_per_iteration": 2.854999303817749 + }, + { + "auxiliary_loss_clip": 0.01636921, + "auxiliary_loss_mlp": 0.01063659, + "balance_loss_clip": 1.38860869, + "balance_loss_mlp": 1.03085268, + "epoch": 0.10539606192695025, + "flos": 29035172822400.0, + "grad_norm": 1.7823232322914189, + "language_loss": 0.77164829, + "learning_rate": 3.940665758218686e-06, + "loss": 0.79865408, + "num_input_tokens_seen": 37818695, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.328125, + "step": 1753, + "time_per_iteration": 2.9355738162994385 + }, + { + "auxiliary_loss_clip": 0.01663179, + "auxiliary_loss_mlp": 0.01068393, + "balance_loss_clip": 1.4014132, + "balance_loss_mlp": 1.03327417, + "epoch": 0.10545618517961822, + "flos": 19977506300160.0, + "grad_norm": 1.7554161544906135, + "language_loss": 0.85043174, + "learning_rate": 3.940571560169328e-06, + "loss": 0.87774748, + "num_input_tokens_seen": 37837860, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.35131836, + "step": 1754, + "time_per_iteration": 2.8851380348205566 + }, + { + "auxiliary_loss_clip": 0.01666402, + "auxiliary_loss_mlp": 0.01064036, + "balance_loss_clip": 1.41022062, + "balance_loss_mlp": 1.03010941, + "epoch": 0.10551630843228618, + "flos": 16151955874560.0, + "grad_norm": 2.190111071812958, + "language_loss": 0.70758605, + "learning_rate": 3.940477288533302e-06, + "loss": 0.7348904, + "num_input_tokens_seen": 37856260, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.33935547, + "step": 1755, + "time_per_iteration": 2.8315024375915527 + }, + { + "auxiliary_loss_clip": 0.01665246, + "auxiliary_loss_mlp": 0.01068477, + "balance_loss_clip": 1.40214801, + "balance_loss_mlp": 1.03333426, + "epoch": 0.10557643168495416, + "flos": 23450417602560.0, + "grad_norm": 2.1960360692476946, + "language_loss": 0.78195035, + "learning_rate": 3.940382943314182e-06, + "loss": 0.80928761, + "num_input_tokens_seen": 37876960, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.3515625, + "step": 1756, + "time_per_iteration": 2.9532480239868164 + }, + { + "auxiliary_loss_clip": 0.0166622, + "auxiliary_loss_mlp": 0.01078873, + "balance_loss_clip": 1.40685081, + "balance_loss_mlp": 1.04499412, + "epoch": 0.10563655493762213, + "flos": 21809112053760.0, + "grad_norm": 1.7558333036665354, + "language_loss": 0.81170762, + "learning_rate": 3.940288524515547e-06, + "loss": 0.83915854, + "num_input_tokens_seen": 37897070, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.33837891, + "step": 1757, + "time_per_iteration": 2.820737838745117 + }, + { + "auxiliary_loss_clip": 0.01657761, + "auxiliary_loss_mlp": 0.01060706, + "balance_loss_clip": 1.39970267, + "balance_loss_mlp": 1.02758968, + "epoch": 0.10569667819029009, + "flos": 53820234846720.0, + "grad_norm": 1.5591721674401517, + "language_loss": 0.79811817, + "learning_rate": 3.940194032140976e-06, + "loss": 0.82530284, + "num_input_tokens_seen": 37923635, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.33105469, + "step": 1758, + "time_per_iteration": 4.524232864379883 + }, + { + "auxiliary_loss_clip": 0.01668243, + "auxiliary_loss_mlp": 0.01062286, + "balance_loss_clip": 1.40790796, + "balance_loss_mlp": 1.02955079, + "epoch": 0.10575680144295807, + "flos": 22934697661440.0, + "grad_norm": 1.83881742724694, + "language_loss": 0.92614478, + "learning_rate": 3.940099466194054e-06, + "loss": 0.95345008, + "num_input_tokens_seen": 37942650, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.32714844, + "step": 1759, + "time_per_iteration": 2.896393060684204 + }, + { + "auxiliary_loss_clip": 0.01651909, + "auxiliary_loss_mlp": 0.01063633, + "balance_loss_clip": 1.39297485, + "balance_loss_mlp": 1.02925253, + "epoch": 0.10581692469562604, + "flos": 14144663986560.0, + "grad_norm": 2.984907278519586, + "language_loss": 0.77870953, + "learning_rate": 3.940004826678365e-06, + "loss": 0.80586493, + "num_input_tokens_seen": 37960660, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.34375, + "step": 1760, + "time_per_iteration": 2.832669496536255 + }, + { + "auxiliary_loss_clip": 0.01659411, + "auxiliary_loss_mlp": 0.01063973, + "balance_loss_clip": 1.39653373, + "balance_loss_mlp": 1.02704191, + "epoch": 0.105877047948294, + "flos": 25969900337280.0, + "grad_norm": 2.6940677493604657, + "language_loss": 0.91170448, + "learning_rate": 3.939910113597498e-06, + "loss": 0.93893838, + "num_input_tokens_seen": 37978625, + "router_z_loss_clip": 2.62890625, + "router_z_loss_mlp": 0.36914062, + "step": 1761, + "time_per_iteration": 2.9727752208709717 + }, + { + "auxiliary_loss_clip": 0.0165895, + "auxiliary_loss_mlp": 0.01059936, + "balance_loss_clip": 1.40139043, + "balance_loss_mlp": 1.02612793, + "epoch": 0.10593717120096197, + "flos": 30677066553600.0, + "grad_norm": 1.9657485396386047, + "language_loss": 0.79839504, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.82558388, + "num_input_tokens_seen": 38000005, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.33789062, + "step": 1762, + "time_per_iteration": 2.939302444458008 + }, + { + "auxiliary_loss_clip": 0.01369406, + "auxiliary_loss_mlp": 0.01050123, + "balance_loss_clip": 1.21870828, + "balance_loss_mlp": 1.03038204, + "epoch": 0.10599729445362994, + "flos": 66471851790720.0, + "grad_norm": 0.770817293889527, + "language_loss": 0.60645485, + "learning_rate": 3.939720466754602e-06, + "loss": 0.63065016, + "num_input_tokens_seen": 38066165, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.19726562, + "step": 1763, + "time_per_iteration": 4.973220586776733 + }, + { + "auxiliary_loss_clip": 0.01664396, + "auxiliary_loss_mlp": 0.01060981, + "balance_loss_clip": 1.40468216, + "balance_loss_mlp": 1.02443099, + "epoch": 0.10605741770629791, + "flos": 23957902990080.0, + "grad_norm": 1.8800996476986331, + "language_loss": 0.81792796, + "learning_rate": 3.939625532999763e-06, + "loss": 0.8451817, + "num_input_tokens_seen": 38086150, + "router_z_loss_clip": 2.59765625, + "router_z_loss_mlp": 0.36547852, + "step": 1764, + "time_per_iteration": 3.015533685684204 + }, + { + "auxiliary_loss_clip": 0.01650475, + "auxiliary_loss_mlp": 0.01059442, + "balance_loss_clip": 1.39384472, + "balance_loss_mlp": 1.02358377, + "epoch": 0.10611754095896588, + "flos": 19396081630080.0, + "grad_norm": 2.54635969385903, + "language_loss": 0.81169343, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.83879262, + "num_input_tokens_seen": 38104205, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.3581543, + "step": 1765, + "time_per_iteration": 3.051846981048584 + }, + { + "auxiliary_loss_clip": 0.0164507, + "auxiliary_loss_mlp": 0.01061545, + "balance_loss_clip": 1.39066982, + "balance_loss_mlp": 1.02914357, + "epoch": 0.10617766421163385, + "flos": 22248178024320.0, + "grad_norm": 1.9517843127258139, + "language_loss": 0.77885103, + "learning_rate": 3.939435444841306e-06, + "loss": 0.80591714, + "num_input_tokens_seen": 38122005, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.32373047, + "step": 1766, + "time_per_iteration": 5.675248861312866 + }, + { + "auxiliary_loss_clip": 0.01656229, + "auxiliary_loss_mlp": 0.01060391, + "balance_loss_clip": 1.40301871, + "balance_loss_mlp": 1.026178, + "epoch": 0.10623778746430182, + "flos": 28416574909440.0, + "grad_norm": 9.768060537263501, + "language_loss": 0.78029341, + "learning_rate": 3.939340290444895e-06, + "loss": 0.80745959, + "num_input_tokens_seen": 38143365, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.34228516, + "step": 1767, + "time_per_iteration": 2.9640796184539795 + }, + { + "auxiliary_loss_clip": 0.01373993, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.22112584, + "balance_loss_mlp": 1.00915802, + "epoch": 0.10629791071696978, + "flos": 64265889409920.0, + "grad_norm": 0.6946962106775932, + "language_loss": 0.58051407, + "learning_rate": 3.939245062508506e-06, + "loss": 0.60461068, + "num_input_tokens_seen": 38210035, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.265625, + "step": 1768, + "time_per_iteration": 3.4650774002075195 + }, + { + "auxiliary_loss_clip": 0.01660368, + "auxiliary_loss_mlp": 0.01059397, + "balance_loss_clip": 1.40369678, + "balance_loss_mlp": 1.02675796, + "epoch": 0.10635803396963776, + "flos": 22757880407040.0, + "grad_norm": 1.4344345373321545, + "language_loss": 0.87348735, + "learning_rate": 3.939149761035749e-06, + "loss": 0.90068501, + "num_input_tokens_seen": 38231230, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.32641602, + "step": 1769, + "time_per_iteration": 3.1858532428741455 + }, + { + "auxiliary_loss_clip": 0.01664356, + "auxiliary_loss_mlp": 0.01071507, + "balance_loss_clip": 1.40534699, + "balance_loss_mlp": 1.0374136, + "epoch": 0.10641815722230573, + "flos": 31408360560000.0, + "grad_norm": 2.071546057868812, + "language_loss": 0.62956864, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.65692729, + "num_input_tokens_seen": 38253890, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.34082031, + "step": 1770, + "time_per_iteration": 2.957024097442627 + }, + { + "auxiliary_loss_clip": 0.01387368, + "auxiliary_loss_mlp": 0.01028644, + "balance_loss_clip": 1.22880912, + "balance_loss_mlp": 1.0023222, + "epoch": 0.1064782804749737, + "flos": 58575342568320.0, + "grad_norm": 0.9005741925445382, + "language_loss": 0.57217085, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.596331, + "num_input_tokens_seen": 38304290, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.26367188, + "step": 1771, + "time_per_iteration": 3.247133255004883 + }, + { + "auxiliary_loss_clip": 0.01669666, + "auxiliary_loss_mlp": 0.01081616, + "balance_loss_clip": 1.41204107, + "balance_loss_mlp": 1.0469501, + "epoch": 0.10653840372764166, + "flos": 23998379103360.0, + "grad_norm": 1.9908329289812166, + "language_loss": 0.89470625, + "learning_rate": 3.938863415435429e-06, + "loss": 0.92221904, + "num_input_tokens_seen": 38324725, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 0.34692383, + "step": 1772, + "time_per_iteration": 2.8781545162200928 + }, + { + "auxiliary_loss_clip": 0.01671037, + "auxiliary_loss_mlp": 0.01081071, + "balance_loss_clip": 1.40657508, + "balance_loss_mlp": 1.04149318, + "epoch": 0.10659852698030964, + "flos": 18303371009280.0, + "grad_norm": 2.7256897885723337, + "language_loss": 0.78747731, + "learning_rate": 3.93876781985337e-06, + "loss": 0.81499839, + "num_input_tokens_seen": 38340735, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.39575195, + "step": 1773, + "time_per_iteration": 2.8259241580963135 + }, + { + "auxiliary_loss_clip": 0.01671059, + "auxiliary_loss_mlp": 0.0107467, + "balance_loss_clip": 1.41134799, + "balance_loss_mlp": 1.04248369, + "epoch": 0.1066586502329776, + "flos": 32173253470080.0, + "grad_norm": 2.075994853031606, + "language_loss": 0.85227686, + "learning_rate": 3.938672150753041e-06, + "loss": 0.87973416, + "num_input_tokens_seen": 38361315, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.32202148, + "step": 1774, + "time_per_iteration": 2.981714963912964 + }, + { + "auxiliary_loss_clip": 0.0167782, + "auxiliary_loss_mlp": 0.0108537, + "balance_loss_clip": 1.41643429, + "balance_loss_mlp": 1.05010796, + "epoch": 0.10671877348564557, + "flos": 17794075829760.0, + "grad_norm": 2.7574443853371466, + "language_loss": 0.78517091, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.81280285, + "num_input_tokens_seen": 38377425, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.35253906, + "step": 1775, + "time_per_iteration": 2.8423469066619873 + }, + { + "auxiliary_loss_clip": 0.01375899, + "auxiliary_loss_mlp": 0.01050615, + "balance_loss_clip": 1.21844709, + "balance_loss_mlp": 1.03058755, + "epoch": 0.10677889673831355, + "flos": 63542603733120.0, + "grad_norm": 0.8255468392421951, + "language_loss": 0.57552344, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.59978855, + "num_input_tokens_seen": 38440275, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.20019531, + "step": 1776, + "time_per_iteration": 3.4358584880828857 + }, + { + "auxiliary_loss_clip": 0.01651181, + "auxiliary_loss_mlp": 0.01076217, + "balance_loss_clip": 1.39633441, + "balance_loss_mlp": 1.04064488, + "epoch": 0.10683901999098151, + "flos": 22027762765440.0, + "grad_norm": 1.5743561494173086, + "language_loss": 0.84229231, + "learning_rate": 3.938384702378727e-06, + "loss": 0.86956632, + "num_input_tokens_seen": 38461820, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.35571289, + "step": 1777, + "time_per_iteration": 2.8997440338134766 + }, + { + "auxiliary_loss_clip": 0.0164965, + "auxiliary_loss_mlp": 0.01070178, + "balance_loss_clip": 1.39817154, + "balance_loss_mlp": 1.03665638, + "epoch": 0.10689914324364948, + "flos": 25053011585280.0, + "grad_norm": 2.0149006574604034, + "language_loss": 0.88569981, + "learning_rate": 3.938288739241625e-06, + "loss": 0.91289806, + "num_input_tokens_seen": 38482235, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.33520508, + "step": 1778, + "time_per_iteration": 2.9007174968719482 + }, + { + "auxiliary_loss_clip": 0.01661561, + "auxiliary_loss_mlp": 0.01069652, + "balance_loss_clip": 1.40730691, + "balance_loss_mlp": 1.03412795, + "epoch": 0.10695926649631746, + "flos": 16443686217600.0, + "grad_norm": 2.0009593711947335, + "language_loss": 0.85251212, + "learning_rate": 3.938192702604417e-06, + "loss": 0.87982428, + "num_input_tokens_seen": 38500690, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.35522461, + "step": 1779, + "time_per_iteration": 2.9164516925811768 + }, + { + "auxiliary_loss_clip": 0.01647789, + "auxiliary_loss_mlp": 0.01061059, + "balance_loss_clip": 1.39398301, + "balance_loss_mlp": 1.02582026, + "epoch": 0.10701938974898542, + "flos": 16987077993600.0, + "grad_norm": 2.2929798521459848, + "language_loss": 0.68710697, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.71419543, + "num_input_tokens_seen": 38518405, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.35229492, + "step": 1780, + "time_per_iteration": 2.84983229637146 + }, + { + "auxiliary_loss_clip": 0.016565, + "auxiliary_loss_mlp": 0.01064065, + "balance_loss_clip": 1.40190649, + "balance_loss_mlp": 1.02939916, + "epoch": 0.10707951300165339, + "flos": 15896584368000.0, + "grad_norm": 2.4682664356387902, + "language_loss": 0.92984635, + "learning_rate": 3.938000408844265e-06, + "loss": 0.95705199, + "num_input_tokens_seen": 38535060, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.34643555, + "step": 1781, + "time_per_iteration": 2.8994290828704834 + }, + { + "auxiliary_loss_clip": 0.01656836, + "auxiliary_loss_mlp": 0.01065303, + "balance_loss_clip": 1.4015193, + "balance_loss_mlp": 1.03097105, + "epoch": 0.10713963625432135, + "flos": 14255459798400.0, + "grad_norm": 1.798870937743681, + "language_loss": 0.80067283, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.82789421, + "num_input_tokens_seen": 38552855, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.34326172, + "step": 1782, + "time_per_iteration": 2.835123300552368 + }, + { + "auxiliary_loss_clip": 0.01675397, + "auxiliary_loss_mlp": 0.01061353, + "balance_loss_clip": 1.41335559, + "balance_loss_mlp": 1.02618575, + "epoch": 0.10719975950698933, + "flos": 16763676577920.0, + "grad_norm": 1.77047306521287, + "language_loss": 0.8050667, + "learning_rate": 3.937807821127436e-06, + "loss": 0.83243418, + "num_input_tokens_seen": 38570075, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.3515625, + "step": 1783, + "time_per_iteration": 2.858966112136841 + }, + { + "auxiliary_loss_clip": 0.01667612, + "auxiliary_loss_mlp": 0.0106038, + "balance_loss_clip": 1.40603566, + "balance_loss_mlp": 1.02516556, + "epoch": 0.1072598827596573, + "flos": 22720797653760.0, + "grad_norm": 1.8483117221906926, + "language_loss": 0.88283324, + "learning_rate": 3.937711417044395e-06, + "loss": 0.9101131, + "num_input_tokens_seen": 38587970, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.35229492, + "step": 1784, + "time_per_iteration": 2.864152193069458 + }, + { + "auxiliary_loss_clip": 0.01658466, + "auxiliary_loss_mlp": 0.01057425, + "balance_loss_clip": 1.39988995, + "balance_loss_mlp": 1.02218699, + "epoch": 0.10732000601232526, + "flos": 23268668664960.0, + "grad_norm": 3.197251053794145, + "language_loss": 1.03224194, + "learning_rate": 3.937614939483143e-06, + "loss": 1.0594008, + "num_input_tokens_seen": 38605840, + "router_z_loss_clip": 2.58789062, + "router_z_loss_mlp": 0.35253906, + "step": 1785, + "time_per_iteration": 2.920448064804077 + }, + { + "auxiliary_loss_clip": 0.01637775, + "auxiliary_loss_mlp": 0.01061951, + "balance_loss_clip": 1.38972759, + "balance_loss_mlp": 1.02418578, + "epoch": 0.10738012926499324, + "flos": 24217753731840.0, + "grad_norm": 1.3405170509890478, + "language_loss": 0.86092997, + "learning_rate": 3.937518388447339e-06, + "loss": 0.88792717, + "num_input_tokens_seen": 38627070, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.37744141, + "step": 1786, + "time_per_iteration": 2.94661021232605 + }, + { + "auxiliary_loss_clip": 0.01677895, + "auxiliary_loss_mlp": 0.01056498, + "balance_loss_clip": 1.41478801, + "balance_loss_mlp": 1.01827919, + "epoch": 0.1074402525176612, + "flos": 20932835149440.0, + "grad_norm": 1.6708025748495405, + "language_loss": 0.79802972, + "learning_rate": 3.937421763940642e-06, + "loss": 0.82537365, + "num_input_tokens_seen": 38645840, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.38256836, + "step": 1787, + "time_per_iteration": 2.86024808883667 + }, + { + "auxiliary_loss_clip": 0.01668039, + "auxiliary_loss_mlp": 0.01055538, + "balance_loss_clip": 1.40639114, + "balance_loss_mlp": 1.02106249, + "epoch": 0.10750037577032917, + "flos": 16955877064320.0, + "grad_norm": 1.7657097803914457, + "language_loss": 0.8471064, + "learning_rate": 3.937325065966719e-06, + "loss": 0.87434214, + "num_input_tokens_seen": 38664770, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.34472656, + "step": 1788, + "time_per_iteration": 2.9009547233581543 + }, + { + "auxiliary_loss_clip": 0.01664713, + "auxiliary_loss_mlp": 0.01061717, + "balance_loss_clip": 1.40869713, + "balance_loss_mlp": 1.02695549, + "epoch": 0.10756049902299715, + "flos": 20276204342400.0, + "grad_norm": 1.801512337769345, + "language_loss": 0.79350221, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.82076651, + "num_input_tokens_seen": 38683865, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.34765625, + "step": 1789, + "time_per_iteration": 2.893228769302368 + }, + { + "auxiliary_loss_clip": 0.01652635, + "auxiliary_loss_mlp": 0.01056626, + "balance_loss_clip": 1.39461446, + "balance_loss_mlp": 1.02043366, + "epoch": 0.10762062227566511, + "flos": 23596984068480.0, + "grad_norm": 2.6600240657684013, + "language_loss": 0.76599503, + "learning_rate": 3.937131449631859e-06, + "loss": 0.7930876, + "num_input_tokens_seen": 38702485, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.36206055, + "step": 1790, + "time_per_iteration": 2.9434897899627686 + }, + { + "auxiliary_loss_clip": 0.01667256, + "auxiliary_loss_mlp": 0.01067609, + "balance_loss_clip": 1.40683675, + "balance_loss_mlp": 1.0310353, + "epoch": 0.10768074552833308, + "flos": 24319817297280.0, + "grad_norm": 32.03016098518442, + "language_loss": 0.80186504, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.82921368, + "num_input_tokens_seen": 38722475, + "router_z_loss_clip": 2.60351562, + "router_z_loss_mlp": 0.36547852, + "step": 1791, + "time_per_iteration": 2.9553351402282715 + }, + { + "auxiliary_loss_clip": 0.01642211, + "auxiliary_loss_mlp": 0.01058843, + "balance_loss_clip": 1.39141226, + "balance_loss_mlp": 1.02436757, + "epoch": 0.10774086878100106, + "flos": 25310509597440.0, + "grad_norm": 2.4051165939131907, + "language_loss": 0.72505844, + "learning_rate": 3.936937539472126e-06, + "loss": 0.752069, + "num_input_tokens_seen": 38743285, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.34448242, + "step": 1792, + "time_per_iteration": 2.8725008964538574 + }, + { + "auxiliary_loss_clip": 0.0164798, + "auxiliary_loss_mlp": 0.01051931, + "balance_loss_clip": 1.39233112, + "balance_loss_mlp": 1.0179565, + "epoch": 0.10780099203366902, + "flos": 22064076357120.0, + "grad_norm": 1.7055465024459227, + "language_loss": 0.78020853, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.8072077, + "num_input_tokens_seen": 38763035, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.34008789, + "step": 1793, + "time_per_iteration": 4.266247510910034 + }, + { + "auxiliary_loss_clip": 0.01653715, + "auxiliary_loss_mlp": 0.0106525, + "balance_loss_clip": 1.40382361, + "balance_loss_mlp": 1.02820039, + "epoch": 0.10786111528633699, + "flos": 22757744672640.0, + "grad_norm": 1.4613807417286013, + "language_loss": 0.85996896, + "learning_rate": 3.936743335516936e-06, + "loss": 0.88715863, + "num_input_tokens_seen": 38784900, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.37011719, + "step": 1794, + "time_per_iteration": 2.9946212768554688 + }, + { + "auxiliary_loss_clip": 0.01683414, + "auxiliary_loss_mlp": 0.01062161, + "balance_loss_clip": 1.41548634, + "balance_loss_mlp": 1.0241096, + "epoch": 0.10792123853900495, + "flos": 20861067617280.0, + "grad_norm": 1.583120607367775, + "language_loss": 0.76802135, + "learning_rate": 3.936646123375246e-06, + "loss": 0.79547715, + "num_input_tokens_seen": 38804695, + "router_z_loss_clip": 2.6796875, + "router_z_loss_mlp": 0.38037109, + "step": 1795, + "time_per_iteration": 2.9647886753082275 + }, + { + "auxiliary_loss_clip": 0.01662046, + "auxiliary_loss_mlp": 0.01061752, + "balance_loss_clip": 1.40165424, + "balance_loss_mlp": 1.02732396, + "epoch": 0.10798136179167293, + "flos": 17757626503680.0, + "grad_norm": 2.29474512237384, + "language_loss": 0.83238059, + "learning_rate": 3.936548837795741e-06, + "loss": 0.85961854, + "num_input_tokens_seen": 38822395, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.34399414, + "step": 1796, + "time_per_iteration": 3.0808980464935303 + }, + { + "auxiliary_loss_clip": 0.01690711, + "auxiliary_loss_mlp": 0.01073733, + "balance_loss_clip": 1.42690337, + "balance_loss_mlp": 1.03873348, + "epoch": 0.1080414850443409, + "flos": 13597562136960.0, + "grad_norm": 2.340393849571511, + "language_loss": 0.76884443, + "learning_rate": 3.936451478782111e-06, + "loss": 0.79648888, + "num_input_tokens_seen": 38839865, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.35009766, + "step": 1797, + "time_per_iteration": 4.293665170669556 + }, + { + "auxiliary_loss_clip": 0.01640993, + "auxiliary_loss_mlp": 0.01058211, + "balance_loss_clip": 1.39056301, + "balance_loss_mlp": 1.0243082, + "epoch": 0.10810160829700886, + "flos": 16262118259200.0, + "grad_norm": 1.8488051374576389, + "language_loss": 0.82786107, + "learning_rate": 3.936354046338046e-06, + "loss": 0.85485315, + "num_input_tokens_seen": 38857300, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.33911133, + "step": 1798, + "time_per_iteration": 2.939959764480591 + }, + { + "auxiliary_loss_clip": 0.01658255, + "auxiliary_loss_mlp": 0.01056321, + "balance_loss_clip": 1.40425158, + "balance_loss_mlp": 1.02308583, + "epoch": 0.10816173154967684, + "flos": 15166195257600.0, + "grad_norm": 2.3986012796790797, + "language_loss": 0.87432593, + "learning_rate": 3.936256540467242e-06, + "loss": 0.90147167, + "num_input_tokens_seen": 38874960, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.33227539, + "step": 1799, + "time_per_iteration": 2.9105916023254395 + }, + { + "auxiliary_loss_clip": 0.01655034, + "auxiliary_loss_mlp": 0.01065702, + "balance_loss_clip": 1.40304112, + "balance_loss_mlp": 1.03270507, + "epoch": 0.10822185480234481, + "flos": 17794302053760.0, + "grad_norm": 1.924142278656181, + "language_loss": 0.79055649, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.81776381, + "num_input_tokens_seen": 38893610, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.33007812, + "step": 1800, + "time_per_iteration": 5.824314594268799 + }, + { + "auxiliary_loss_clip": 0.01652182, + "auxiliary_loss_mlp": 0.01059406, + "balance_loss_clip": 1.39958096, + "balance_loss_mlp": 1.02693391, + "epoch": 0.10828197805501277, + "flos": 25567600406400.0, + "grad_norm": 1.5844643017722237, + "language_loss": 0.74124634, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.76836228, + "num_input_tokens_seen": 38913485, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.32495117, + "step": 1801, + "time_per_iteration": 3.055722713470459 + }, + { + "auxiliary_loss_clip": 0.01666609, + "auxiliary_loss_mlp": 0.01054197, + "balance_loss_clip": 1.40785897, + "balance_loss_mlp": 1.02201045, + "epoch": 0.10834210130768075, + "flos": 28995782584320.0, + "grad_norm": 1.8227728666059286, + "language_loss": 0.67621613, + "learning_rate": 3.935963582331381e-06, + "loss": 0.70342416, + "num_input_tokens_seen": 38935650, + "router_z_loss_clip": 2.58984375, + "router_z_loss_mlp": 0.32202148, + "step": 1802, + "time_per_iteration": 2.932758331298828 + }, + { + "auxiliary_loss_clip": 0.01666838, + "auxiliary_loss_mlp": 0.01066298, + "balance_loss_clip": 1.41338158, + "balance_loss_mlp": 1.03201389, + "epoch": 0.10840222456034872, + "flos": 20273263430400.0, + "grad_norm": 2.104081932561071, + "language_loss": 0.83130378, + "learning_rate": 3.935865782790621e-06, + "loss": 0.85863513, + "num_input_tokens_seen": 38954130, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.34277344, + "step": 1803, + "time_per_iteration": 2.999124526977539 + }, + { + "auxiliary_loss_clip": 0.01649802, + "auxiliary_loss_mlp": 0.01060713, + "balance_loss_clip": 1.39870381, + "balance_loss_mlp": 1.021088, + "epoch": 0.10846234781301668, + "flos": 19872049374720.0, + "grad_norm": 4.261571665983494, + "language_loss": 0.92204106, + "learning_rate": 3.9357679098416365e-06, + "loss": 0.94914615, + "num_input_tokens_seen": 38972905, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.39648438, + "step": 1804, + "time_per_iteration": 2.9025673866271973 + }, + { + "auxiliary_loss_clip": 0.01669299, + "auxiliary_loss_mlp": 0.01054654, + "balance_loss_clip": 1.41388154, + "balance_loss_mlp": 1.02165675, + "epoch": 0.10852247106568465, + "flos": 26480281392000.0, + "grad_norm": 2.080530908020713, + "language_loss": 0.77698398, + "learning_rate": 3.935669963488139e-06, + "loss": 0.80422354, + "num_input_tokens_seen": 38993255, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.32983398, + "step": 1805, + "time_per_iteration": 2.936420202255249 + }, + { + "auxiliary_loss_clip": 0.01653807, + "auxiliary_loss_mlp": 0.01055493, + "balance_loss_clip": 1.40541828, + "balance_loss_mlp": 1.02373552, + "epoch": 0.10858259431835263, + "flos": 30093786846720.0, + "grad_norm": 1.6889860810304063, + "language_loss": 0.87087715, + "learning_rate": 3.935571943733843e-06, + "loss": 0.89797008, + "num_input_tokens_seen": 39012610, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.31738281, + "step": 1806, + "time_per_iteration": 2.9881298542022705 + }, + { + "auxiliary_loss_clip": 0.01675394, + "auxiliary_loss_mlp": 0.01060531, + "balance_loss_clip": 1.42206168, + "balance_loss_mlp": 1.02462554, + "epoch": 0.10864271757102059, + "flos": 19072697909760.0, + "grad_norm": 2.0395093996297127, + "language_loss": 0.81998384, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.84734309, + "num_input_tokens_seen": 39030120, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.359375, + "step": 1807, + "time_per_iteration": 2.8164284229278564 + }, + { + "auxiliary_loss_clip": 0.01671712, + "auxiliary_loss_mlp": 0.01061213, + "balance_loss_clip": 1.41857851, + "balance_loss_mlp": 1.0281918, + "epoch": 0.10870284082368856, + "flos": 24725148629760.0, + "grad_norm": 1.8362949788031764, + "language_loss": 0.79798311, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.82531238, + "num_input_tokens_seen": 39049875, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.33007812, + "step": 1808, + "time_per_iteration": 2.9492576122283936 + }, + { + "auxiliary_loss_clip": 0.01673934, + "auxiliary_loss_mlp": 0.01054853, + "balance_loss_clip": 1.41644251, + "balance_loss_mlp": 1.02078295, + "epoch": 0.10876296407635654, + "flos": 20636444592000.0, + "grad_norm": 1.7266126194675402, + "language_loss": 0.7990272, + "learning_rate": 3.935277444103342e-06, + "loss": 0.82631505, + "num_input_tokens_seen": 39068935, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.34082031, + "step": 1809, + "time_per_iteration": 3.002049207687378 + }, + { + "auxiliary_loss_clip": 0.01661325, + "auxiliary_loss_mlp": 0.01054423, + "balance_loss_clip": 1.40977192, + "balance_loss_mlp": 1.02149773, + "epoch": 0.1088230873290245, + "flos": 21589782670080.0, + "grad_norm": 2.4738526607636366, + "language_loss": 0.8653999, + "learning_rate": 3.935179130783046e-06, + "loss": 0.89255738, + "num_input_tokens_seen": 39087370, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.32885742, + "step": 1810, + "time_per_iteration": 2.904573440551758 + }, + { + "auxiliary_loss_clip": 0.01699167, + "auxiliary_loss_mlp": 0.01065831, + "balance_loss_clip": 1.43554401, + "balance_loss_mlp": 1.02975821, + "epoch": 0.10888321058169247, + "flos": 26480281392000.0, + "grad_norm": 1.6441629543434517, + "language_loss": 0.65335464, + "learning_rate": 3.935080744080564e-06, + "loss": 0.68100464, + "num_input_tokens_seen": 39106635, + "router_z_loss_clip": 2.640625, + "router_z_loss_mlp": 0.3605957, + "step": 1811, + "time_per_iteration": 2.8799867630004883 + }, + { + "auxiliary_loss_clip": 0.01672289, + "auxiliary_loss_mlp": 0.01059761, + "balance_loss_clip": 1.41412318, + "balance_loss_mlp": 1.02778959, + "epoch": 0.10894333383436045, + "flos": 25859647463040.0, + "grad_norm": 1.9922991203738685, + "language_loss": 0.75363183, + "learning_rate": 3.934982283999626e-06, + "loss": 0.78095222, + "num_input_tokens_seen": 39126335, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.31933594, + "step": 1812, + "time_per_iteration": 2.859090805053711 + }, + { + "auxiliary_loss_clip": 0.01661227, + "auxiliary_loss_mlp": 0.01062798, + "balance_loss_clip": 1.4074558, + "balance_loss_mlp": 1.0289191, + "epoch": 0.10900345708702841, + "flos": 19546629638400.0, + "grad_norm": 1.5626948189007186, + "language_loss": 0.74263477, + "learning_rate": 3.934883750543966e-06, + "loss": 0.76987505, + "num_input_tokens_seen": 39144820, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.33837891, + "step": 1813, + "time_per_iteration": 2.870229482650757 + }, + { + "auxiliary_loss_clip": 0.01656631, + "auxiliary_loss_mlp": 0.01071631, + "balance_loss_clip": 1.40733194, + "balance_loss_mlp": 1.0380137, + "epoch": 0.10906358033969638, + "flos": 23633840597760.0, + "grad_norm": 1.6660099693804868, + "language_loss": 0.83760172, + "learning_rate": 3.93478514371732e-06, + "loss": 0.86488432, + "num_input_tokens_seen": 39165945, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.33642578, + "step": 1814, + "time_per_iteration": 2.8601434230804443 + }, + { + "auxiliary_loss_clip": 0.01676615, + "auxiliary_loss_mlp": 0.01061646, + "balance_loss_clip": 1.41923046, + "balance_loss_mlp": 1.02790976, + "epoch": 0.10912370359236434, + "flos": 21224520247680.0, + "grad_norm": 2.0016969463096252, + "language_loss": 0.85410118, + "learning_rate": 3.934686463523429e-06, + "loss": 0.88148385, + "num_input_tokens_seen": 39183520, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.33740234, + "step": 1815, + "time_per_iteration": 2.8639605045318604 + }, + { + "auxiliary_loss_clip": 0.01654458, + "auxiliary_loss_mlp": 0.0106251, + "balance_loss_clip": 1.40570319, + "balance_loss_mlp": 1.02877355, + "epoch": 0.10918382684503232, + "flos": 13561112810880.0, + "grad_norm": 2.5928474257956524, + "language_loss": 0.73184514, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.75901484, + "num_input_tokens_seen": 39201190, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.33764648, + "step": 1816, + "time_per_iteration": 2.845823287963867 + }, + { + "auxiliary_loss_clip": 0.01666526, + "auxiliary_loss_mlp": 0.01060952, + "balance_loss_clip": 1.41068316, + "balance_loss_mlp": 1.02514195, + "epoch": 0.10924395009770028, + "flos": 27975789636480.0, + "grad_norm": 2.414422067454588, + "language_loss": 0.75282502, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.78009975, + "num_input_tokens_seen": 39221210, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.35791016, + "step": 1817, + "time_per_iteration": 2.9115588665008545 + }, + { + "auxiliary_loss_clip": 0.01662003, + "auxiliary_loss_mlp": 0.01065791, + "balance_loss_clip": 1.41061044, + "balance_loss_mlp": 1.03219819, + "epoch": 0.10930407335036825, + "flos": 25604864138880.0, + "grad_norm": 1.719690892501037, + "language_loss": 0.68458045, + "learning_rate": 3.934389982775706e-06, + "loss": 0.71185839, + "num_input_tokens_seen": 39242025, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.33618164, + "step": 1818, + "time_per_iteration": 2.923327684402466 + }, + { + "auxiliary_loss_clip": 0.01682328, + "auxiliary_loss_mlp": 0.01071389, + "balance_loss_clip": 1.42408514, + "balance_loss_mlp": 1.03829646, + "epoch": 0.10936419660303623, + "flos": 18415478920320.0, + "grad_norm": 2.1591273767934656, + "language_loss": 0.74901229, + "learning_rate": 3.934291009150275e-06, + "loss": 0.77654946, + "num_input_tokens_seen": 39259870, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.33105469, + "step": 1819, + "time_per_iteration": 2.8560845851898193 + }, + { + "auxiliary_loss_clip": 0.01671865, + "auxiliary_loss_mlp": 0.01072477, + "balance_loss_clip": 1.41970539, + "balance_loss_mlp": 1.03735828, + "epoch": 0.1094243198557042, + "flos": 23850093335040.0, + "grad_norm": 2.3534294407903094, + "language_loss": 0.74171811, + "learning_rate": 3.934191962176335e-06, + "loss": 0.76916158, + "num_input_tokens_seen": 39278500, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.35131836, + "step": 1820, + "time_per_iteration": 2.9484684467315674 + }, + { + "auxiliary_loss_clip": 0.01642415, + "auxiliary_loss_mlp": 0.01071342, + "balance_loss_clip": 1.39250827, + "balance_loss_mlp": 1.0318594, + "epoch": 0.10948444310837216, + "flos": 14651153988480.0, + "grad_norm": 2.1726801686114814, + "language_loss": 0.8347466, + "learning_rate": 3.934092841857642e-06, + "loss": 0.86188424, + "num_input_tokens_seen": 39294800, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.39526367, + "step": 1821, + "time_per_iteration": 2.88749623298645 + }, + { + "auxiliary_loss_clip": 0.01642364, + "auxiliary_loss_mlp": 0.01058136, + "balance_loss_clip": 1.3934958, + "balance_loss_mlp": 1.02404237, + "epoch": 0.10954456636104014, + "flos": 27830354290560.0, + "grad_norm": 1.9666655539870395, + "language_loss": 0.78304422, + "learning_rate": 3.933993648197955e-06, + "loss": 0.81004924, + "num_input_tokens_seen": 39314625, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.34057617, + "step": 1822, + "time_per_iteration": 2.9416422843933105 + }, + { + "auxiliary_loss_clip": 0.01637683, + "auxiliary_loss_mlp": 0.0106702, + "balance_loss_clip": 1.39066935, + "balance_loss_mlp": 1.03185368, + "epoch": 0.1096046896137081, + "flos": 33634212670080.0, + "grad_norm": 2.7914516961817997, + "language_loss": 0.81274152, + "learning_rate": 3.933894381201034e-06, + "loss": 0.83978844, + "num_input_tokens_seen": 39336465, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.3515625, + "step": 1823, + "time_per_iteration": 2.9525787830352783 + }, + { + "auxiliary_loss_clip": 0.01649119, + "auxiliary_loss_mlp": 0.01069381, + "balance_loss_clip": 1.40302587, + "balance_loss_mlp": 1.03409517, + "epoch": 0.10966481286637607, + "flos": 26991431608320.0, + "grad_norm": 1.4705873957146391, + "language_loss": 0.80902714, + "learning_rate": 3.933795040870645e-06, + "loss": 0.83621216, + "num_input_tokens_seen": 39357930, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.35302734, + "step": 1824, + "time_per_iteration": 2.915361166000366 + }, + { + "auxiliary_loss_clip": 0.01649849, + "auxiliary_loss_mlp": 0.01065102, + "balance_loss_clip": 1.40107918, + "balance_loss_mlp": 1.03141308, + "epoch": 0.10972493611904403, + "flos": 23046172145280.0, + "grad_norm": 1.806577612849813, + "language_loss": 0.89297706, + "learning_rate": 3.933695627210554e-06, + "loss": 0.92012656, + "num_input_tokens_seen": 39376380, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.33666992, + "step": 1825, + "time_per_iteration": 2.9158551692962646 + }, + { + "auxiliary_loss_clip": 0.01634706, + "auxiliary_loss_mlp": 0.01077994, + "balance_loss_clip": 1.38566613, + "balance_loss_mlp": 1.04106259, + "epoch": 0.10978505937171201, + "flos": 38118022715520.0, + "grad_norm": 2.537309949730699, + "language_loss": 0.77586746, + "learning_rate": 3.933596140224532e-06, + "loss": 0.80299449, + "num_input_tokens_seen": 39399935, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.36914062, + "step": 1826, + "time_per_iteration": 2.970384120941162 + }, + { + "auxiliary_loss_clip": 0.01407482, + "auxiliary_loss_mlp": 0.01033529, + "balance_loss_clip": 1.25193715, + "balance_loss_mlp": 1.01540875, + "epoch": 0.10984518262437998, + "flos": 59878080126720.0, + "grad_norm": 0.8461556239851626, + "language_loss": 0.550116, + "learning_rate": 3.93349657991635e-06, + "loss": 0.57452613, + "num_input_tokens_seen": 39460685, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.18164062, + "step": 1827, + "time_per_iteration": 4.752614259719849 + }, + { + "auxiliary_loss_clip": 0.01407866, + "auxiliary_loss_mlp": 0.01020111, + "balance_loss_clip": 1.25666249, + "balance_loss_mlp": 1.00065601, + "epoch": 0.10990530587704794, + "flos": 66752813871360.0, + "grad_norm": 0.7537303323695871, + "language_loss": 0.55401498, + "learning_rate": 3.933396946289784e-06, + "loss": 0.57829475, + "num_input_tokens_seen": 39524765, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.19433594, + "step": 1828, + "time_per_iteration": 3.372375249862671 + }, + { + "auxiliary_loss_clip": 0.01684989, + "auxiliary_loss_mlp": 0.01069171, + "balance_loss_clip": 1.42781055, + "balance_loss_mlp": 1.03219247, + "epoch": 0.10996542912971592, + "flos": 25458116693760.0, + "grad_norm": 2.565385374163624, + "language_loss": 0.85528207, + "learning_rate": 3.933297239348612e-06, + "loss": 0.88282359, + "num_input_tokens_seen": 39543640, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.37011719, + "step": 1829, + "time_per_iteration": 2.9499361515045166 + }, + { + "auxiliary_loss_clip": 0.01665615, + "auxiliary_loss_mlp": 0.01074324, + "balance_loss_clip": 1.4108212, + "balance_loss_mlp": 1.039276, + "epoch": 0.11002555238238389, + "flos": 44033853271680.0, + "grad_norm": 1.8824245693733597, + "language_loss": 0.90118903, + "learning_rate": 3.933197459096614e-06, + "loss": 0.92858845, + "num_input_tokens_seen": 39567525, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.35058594, + "step": 1830, + "time_per_iteration": 3.107515811920166 + }, + { + "auxiliary_loss_clip": 0.01396148, + "auxiliary_loss_mlp": 0.01082121, + "balance_loss_clip": 1.24750733, + "balance_loss_mlp": 1.06543183, + "epoch": 0.11008567563505185, + "flos": 54096346454400.0, + "grad_norm": 0.6984388693805477, + "language_loss": 0.55562323, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.58040595, + "num_input_tokens_seen": 39628470, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.16699219, + "step": 1831, + "time_per_iteration": 3.287799119949341 + }, + { + "auxiliary_loss_clip": 0.01671055, + "auxiliary_loss_mlp": 0.01087214, + "balance_loss_clip": 1.4143225, + "balance_loss_mlp": 1.04944861, + "epoch": 0.11014579888771983, + "flos": 24253750609920.0, + "grad_norm": 2.230561217715295, + "language_loss": 0.92645419, + "learning_rate": 3.932997678675282e-06, + "loss": 0.95403683, + "num_input_tokens_seen": 39646670, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.37768555, + "step": 1832, + "time_per_iteration": 2.8702781200408936 + }, + { + "auxiliary_loss_clip": 0.01388233, + "auxiliary_loss_mlp": 0.01065625, + "balance_loss_clip": 1.23508716, + "balance_loss_mlp": 1.04864979, + "epoch": 0.1102059221403878, + "flos": 57774516007680.0, + "grad_norm": 0.71940886324166, + "language_loss": 0.60084367, + "learning_rate": 3.932897678513523e-06, + "loss": 0.62538224, + "num_input_tokens_seen": 39712915, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.16992188, + "step": 1833, + "time_per_iteration": 4.771799325942993 + }, + { + "auxiliary_loss_clip": 0.01648775, + "auxiliary_loss_mlp": 0.01071218, + "balance_loss_clip": 1.3947643, + "balance_loss_mlp": 1.03619456, + "epoch": 0.11026604539305576, + "flos": 16803519264000.0, + "grad_norm": 2.3018917287816927, + "language_loss": 0.81952155, + "learning_rate": 3.93279760505609e-06, + "loss": 0.84672153, + "num_input_tokens_seen": 39730650, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.34985352, + "step": 1834, + "time_per_iteration": 2.9032599925994873 + }, + { + "auxiliary_loss_clip": 0.01654474, + "auxiliary_loss_mlp": 0.01069033, + "balance_loss_clip": 1.40252173, + "balance_loss_mlp": 1.03191125, + "epoch": 0.11032616864572373, + "flos": 23998695816960.0, + "grad_norm": 2.149595243144503, + "language_loss": 0.91308945, + "learning_rate": 3.932697458306779e-06, + "loss": 0.94032449, + "num_input_tokens_seen": 39751065, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.37109375, + "step": 1835, + "time_per_iteration": 5.702774524688721 + }, + { + "auxiliary_loss_clip": 0.01638956, + "auxiliary_loss_mlp": 0.01065215, + "balance_loss_clip": 1.38836253, + "balance_loss_mlp": 1.03097773, + "epoch": 0.1103862918983917, + "flos": 19692743656320.0, + "grad_norm": 1.9907897611475591, + "language_loss": 0.65434349, + "learning_rate": 3.932597238269386e-06, + "loss": 0.68138516, + "num_input_tokens_seen": 39769245, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.34228516, + "step": 1836, + "time_per_iteration": 2.986696243286133 + }, + { + "auxiliary_loss_clip": 0.01627838, + "auxiliary_loss_mlp": 0.01061398, + "balance_loss_clip": 1.38021564, + "balance_loss_mlp": 1.02499139, + "epoch": 0.11044641515105967, + "flos": 32173615428480.0, + "grad_norm": 1.7085426176234162, + "language_loss": 0.7456125, + "learning_rate": 3.932496944947711e-06, + "loss": 0.77250493, + "num_input_tokens_seen": 39790830, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.36425781, + "step": 1837, + "time_per_iteration": 2.9840283393859863 + }, + { + "auxiliary_loss_clip": 0.01637966, + "auxiliary_loss_mlp": 0.01058195, + "balance_loss_clip": 1.38720691, + "balance_loss_mlp": 1.02269495, + "epoch": 0.11050653840372764, + "flos": 16697519400960.0, + "grad_norm": 6.5514250287017, + "language_loss": 0.7942645, + "learning_rate": 3.93239657834556e-06, + "loss": 0.82122612, + "num_input_tokens_seen": 39809475, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.35498047, + "step": 1838, + "time_per_iteration": 2.985825300216675 + }, + { + "auxiliary_loss_clip": 0.0162949, + "auxiliary_loss_mlp": 0.01058516, + "balance_loss_clip": 1.38025939, + "balance_loss_mlp": 1.02256274, + "epoch": 0.11056666165639562, + "flos": 21217145345280.0, + "grad_norm": 2.031016457326122, + "language_loss": 0.72288847, + "learning_rate": 3.932296138466736e-06, + "loss": 0.74976856, + "num_input_tokens_seen": 39826355, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.359375, + "step": 1839, + "time_per_iteration": 2.9799442291259766 + }, + { + "auxiliary_loss_clip": 0.01644812, + "auxiliary_loss_mlp": 0.01053144, + "balance_loss_clip": 1.39069057, + "balance_loss_mlp": 1.0169282, + "epoch": 0.11062678490906358, + "flos": 19173177907200.0, + "grad_norm": 2.214835780168386, + "language_loss": 0.80713892, + "learning_rate": 3.93219562531505e-06, + "loss": 0.83411849, + "num_input_tokens_seen": 39845335, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.36230469, + "step": 1840, + "time_per_iteration": 3.0268166065216064 + }, + { + "auxiliary_loss_clip": 0.01628844, + "auxiliary_loss_mlp": 0.01052862, + "balance_loss_clip": 1.38154745, + "balance_loss_mlp": 1.01879215, + "epoch": 0.11068690816173155, + "flos": 24904861551360.0, + "grad_norm": 2.0525940004975842, + "language_loss": 0.89295304, + "learning_rate": 3.932095038894311e-06, + "loss": 0.91977006, + "num_input_tokens_seen": 39865065, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.34057617, + "step": 1841, + "time_per_iteration": 2.9861416816711426 + }, + { + "auxiliary_loss_clip": 0.01624435, + "auxiliary_loss_mlp": 0.01060254, + "balance_loss_clip": 1.37614453, + "balance_loss_mlp": 1.02380013, + "epoch": 0.11074703141439952, + "flos": 16481130929280.0, + "grad_norm": 2.140971247823854, + "language_loss": 0.9160136, + "learning_rate": 3.931994379208334e-06, + "loss": 0.94286048, + "num_input_tokens_seen": 39882780, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.36450195, + "step": 1842, + "time_per_iteration": 2.8882813453674316 + }, + { + "auxiliary_loss_clip": 0.01635521, + "auxiliary_loss_mlp": 0.01065127, + "balance_loss_clip": 1.38401675, + "balance_loss_mlp": 1.02953124, + "epoch": 0.11080715466706749, + "flos": 19181910153600.0, + "grad_norm": 1.924028775228275, + "language_loss": 0.87571752, + "learning_rate": 3.931893646260937e-06, + "loss": 0.90272403, + "num_input_tokens_seen": 39900295, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.35571289, + "step": 1843, + "time_per_iteration": 2.9157257080078125 + }, + { + "auxiliary_loss_clip": 0.01636513, + "auxiliary_loss_mlp": 0.0105775, + "balance_loss_clip": 1.38599646, + "balance_loss_mlp": 1.02275038, + "epoch": 0.11086727791973545, + "flos": 27714988753920.0, + "grad_norm": 4.0143476727091745, + "language_loss": 0.75696582, + "learning_rate": 3.931792840055941e-06, + "loss": 0.78390843, + "num_input_tokens_seen": 39922075, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.34960938, + "step": 1844, + "time_per_iteration": 2.8911240100860596 + }, + { + "auxiliary_loss_clip": 0.0164876, + "auxiliary_loss_mlp": 0.01057856, + "balance_loss_clip": 1.39517415, + "balance_loss_mlp": 1.02130687, + "epoch": 0.11092740117240343, + "flos": 18524102981760.0, + "grad_norm": 2.1517356881508576, + "language_loss": 0.76697141, + "learning_rate": 3.931691960597165e-06, + "loss": 0.79403758, + "num_input_tokens_seen": 39940115, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.36572266, + "step": 1845, + "time_per_iteration": 2.8920705318450928 + }, + { + "auxiliary_loss_clip": 0.01635894, + "auxiliary_loss_mlp": 0.01057874, + "balance_loss_clip": 1.38683081, + "balance_loss_mlp": 1.02194476, + "epoch": 0.1109875244250714, + "flos": 20532299765760.0, + "grad_norm": 1.5240346941220115, + "language_loss": 0.77778852, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.80472624, + "num_input_tokens_seen": 39959920, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.35961914, + "step": 1846, + "time_per_iteration": 2.8380441665649414 + }, + { + "auxiliary_loss_clip": 0.01665076, + "auxiliary_loss_mlp": 0.01057006, + "balance_loss_clip": 1.40710676, + "balance_loss_mlp": 1.02067077, + "epoch": 0.11104764767773936, + "flos": 14106178644480.0, + "grad_norm": 2.356262715001083, + "language_loss": 0.87483555, + "learning_rate": 3.931489981933584e-06, + "loss": 0.9020564, + "num_input_tokens_seen": 39974755, + "router_z_loss_clip": 2.58007812, + "router_z_loss_mlp": 0.36328125, + "step": 1847, + "time_per_iteration": 2.8154752254486084 + }, + { + "auxiliary_loss_clip": 0.0164584, + "auxiliary_loss_mlp": 0.0105266, + "balance_loss_clip": 1.39213598, + "balance_loss_mlp": 1.01859057, + "epoch": 0.11110777093040733, + "flos": 20604157787520.0, + "grad_norm": 2.0268622062555783, + "language_loss": 0.78274798, + "learning_rate": 3.931388882736438e-06, + "loss": 0.80973303, + "num_input_tokens_seen": 39993355, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.34057617, + "step": 1848, + "time_per_iteration": 2.927241802215576 + }, + { + "auxiliary_loss_clip": 0.01627519, + "auxiliary_loss_mlp": 0.01055639, + "balance_loss_clip": 1.38274431, + "balance_loss_mlp": 1.02102089, + "epoch": 0.11116789418307531, + "flos": 21879657976320.0, + "grad_norm": 2.0356482882871734, + "language_loss": 0.78713334, + "learning_rate": 3.931287710300832e-06, + "loss": 0.81396496, + "num_input_tokens_seen": 40012410, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.34619141, + "step": 1849, + "time_per_iteration": 2.8230950832366943 + }, + { + "auxiliary_loss_clip": 0.01642746, + "auxiliary_loss_mlp": 0.01052853, + "balance_loss_clip": 1.38677943, + "balance_loss_mlp": 1.0173285, + "epoch": 0.11122801743574327, + "flos": 15531457680000.0, + "grad_norm": 3.548742938997845, + "language_loss": 0.73455089, + "learning_rate": 3.931186464630601e-06, + "loss": 0.76150686, + "num_input_tokens_seen": 40029315, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.35522461, + "step": 1850, + "time_per_iteration": 2.822286367416382 + }, + { + "auxiliary_loss_clip": 0.01646059, + "auxiliary_loss_mlp": 0.01055789, + "balance_loss_clip": 1.39195824, + "balance_loss_mlp": 1.02031267, + "epoch": 0.11128814068841124, + "flos": 14400397451520.0, + "grad_norm": 2.418397696347503, + "language_loss": 0.83094382, + "learning_rate": 3.931085145729588e-06, + "loss": 0.85796225, + "num_input_tokens_seen": 40045765, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.35473633, + "step": 1851, + "time_per_iteration": 2.777526617050171 + }, + { + "auxiliary_loss_clip": 0.01641197, + "auxiliary_loss_mlp": 0.01060903, + "balance_loss_clip": 1.39177895, + "balance_loss_mlp": 1.02766788, + "epoch": 0.11134826394107922, + "flos": 16662517908480.0, + "grad_norm": 2.6622498475088943, + "language_loss": 0.90035301, + "learning_rate": 3.930983753601631e-06, + "loss": 0.92737401, + "num_input_tokens_seen": 40061660, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.33227539, + "step": 1852, + "time_per_iteration": 2.870260715484619 + }, + { + "auxiliary_loss_clip": 0.01638331, + "auxiliary_loss_mlp": 0.01054477, + "balance_loss_clip": 1.38930941, + "balance_loss_mlp": 1.02109826, + "epoch": 0.11140838719374718, + "flos": 16699736396160.0, + "grad_norm": 1.7902848803385545, + "language_loss": 0.72899139, + "learning_rate": 3.930882288250578e-06, + "loss": 0.75591946, + "num_input_tokens_seen": 40080180, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.33374023, + "step": 1853, + "time_per_iteration": 2.809765338897705 + }, + { + "auxiliary_loss_clip": 0.01354151, + "auxiliary_loss_mlp": 0.01069483, + "balance_loss_clip": 1.20754099, + "balance_loss_mlp": 1.05002809, + "epoch": 0.11146851044641515, + "flos": 61003891958400.0, + "grad_norm": 0.8057884260550521, + "language_loss": 0.53891981, + "learning_rate": 3.930780749680273e-06, + "loss": 0.56315613, + "num_input_tokens_seen": 40138910, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.19433594, + "step": 1854, + "time_per_iteration": 3.330568313598633 + }, + { + "auxiliary_loss_clip": 0.01660958, + "auxiliary_loss_mlp": 0.01057536, + "balance_loss_clip": 1.39677119, + "balance_loss_mlp": 1.02208328, + "epoch": 0.11152863369908313, + "flos": 22202317779840.0, + "grad_norm": 5.96628180795442, + "language_loss": 0.86222744, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.88941234, + "num_input_tokens_seen": 40157745, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.35449219, + "step": 1855, + "time_per_iteration": 2.939340353012085 + }, + { + "auxiliary_loss_clip": 0.01637186, + "auxiliary_loss_mlp": 0.01054759, + "balance_loss_clip": 1.38427782, + "balance_loss_mlp": 1.02149975, + "epoch": 0.11158875695175109, + "flos": 19547172576000.0, + "grad_norm": 2.450912124315484, + "language_loss": 0.83289385, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.85981321, + "num_input_tokens_seen": 40175375, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.33251953, + "step": 1856, + "time_per_iteration": 2.8480355739593506 + }, + { + "auxiliary_loss_clip": 0.01625019, + "auxiliary_loss_mlp": 0.0105757, + "balance_loss_clip": 1.38110185, + "balance_loss_mlp": 1.02302301, + "epoch": 0.11164888020441906, + "flos": 25452461093760.0, + "grad_norm": 1.887611352929275, + "language_loss": 0.83520371, + "learning_rate": 3.93047569469238e-06, + "loss": 0.86202955, + "num_input_tokens_seen": 40195715, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.34521484, + "step": 1857, + "time_per_iteration": 2.9490907192230225 + }, + { + "auxiliary_loss_clip": 0.01645625, + "auxiliary_loss_mlp": 0.01058231, + "balance_loss_clip": 1.38873518, + "balance_loss_mlp": 1.02311182, + "epoch": 0.11170900345708702, + "flos": 15641167616640.0, + "grad_norm": 3.2015901476721487, + "language_loss": 0.84344876, + "learning_rate": 3.930373863283608e-06, + "loss": 0.87048727, + "num_input_tokens_seen": 40213975, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.35107422, + "step": 1858, + "time_per_iteration": 2.819706916809082 + }, + { + "auxiliary_loss_clip": 0.01638557, + "auxiliary_loss_mlp": 0.01064856, + "balance_loss_clip": 1.38975668, + "balance_loss_mlp": 1.03147697, + "epoch": 0.111769126709755, + "flos": 23049791729280.0, + "grad_norm": 1.9558637591306052, + "language_loss": 0.92822933, + "learning_rate": 3.930271958674866e-06, + "loss": 0.95526338, + "num_input_tokens_seen": 40233905, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.33349609, + "step": 1859, + "time_per_iteration": 2.9462645053863525 + }, + { + "auxiliary_loss_clip": 0.01647078, + "auxiliary_loss_mlp": 0.01079069, + "balance_loss_clip": 1.38994169, + "balance_loss_mlp": 1.04566681, + "epoch": 0.11182924996242297, + "flos": 20860615169280.0, + "grad_norm": 2.783770588737898, + "language_loss": 0.83969688, + "learning_rate": 3.930169980870018e-06, + "loss": 0.86695838, + "num_input_tokens_seen": 40252810, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.33422852, + "step": 1860, + "time_per_iteration": 2.9032232761383057 + }, + { + "auxiliary_loss_clip": 0.0163233, + "auxiliary_loss_mlp": 0.0107674, + "balance_loss_clip": 1.3847363, + "balance_loss_mlp": 1.04326606, + "epoch": 0.11188937321509093, + "flos": 17463769655040.0, + "grad_norm": 2.0594554813296497, + "language_loss": 0.7726326, + "learning_rate": 3.930067929872931e-06, + "loss": 0.79972327, + "num_input_tokens_seen": 40272000, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.3347168, + "step": 1861, + "time_per_iteration": 2.831134557723999 + }, + { + "auxiliary_loss_clip": 0.01627568, + "auxiliary_loss_mlp": 0.010744, + "balance_loss_clip": 1.38165188, + "balance_loss_mlp": 1.04216599, + "epoch": 0.11194949646775891, + "flos": 24106143513600.0, + "grad_norm": 3.065421765526497, + "language_loss": 0.90322745, + "learning_rate": 3.929965805687474e-06, + "loss": 0.93024719, + "num_input_tokens_seen": 40290660, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.32226562, + "step": 1862, + "time_per_iteration": 4.2993481159210205 + }, + { + "auxiliary_loss_clip": 0.01634528, + "auxiliary_loss_mlp": 0.01093217, + "balance_loss_clip": 1.38588977, + "balance_loss_mlp": 1.0622946, + "epoch": 0.11200961972042688, + "flos": 25164350334720.0, + "grad_norm": 2.2418783495232546, + "language_loss": 0.88038135, + "learning_rate": 3.92986360831752e-06, + "loss": 0.90765882, + "num_input_tokens_seen": 40307820, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.30908203, + "step": 1863, + "time_per_iteration": 2.8739309310913086 + }, + { + "auxiliary_loss_clip": 0.01644105, + "auxiliary_loss_mlp": 0.01101404, + "balance_loss_clip": 1.39374948, + "balance_loss_mlp": 1.06709528, + "epoch": 0.11206974297309484, + "flos": 21298278551040.0, + "grad_norm": 1.720347646684224, + "language_loss": 0.66049069, + "learning_rate": 3.929761337766945e-06, + "loss": 0.68794572, + "num_input_tokens_seen": 40327430, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.34326172, + "step": 1864, + "time_per_iteration": 2.837022304534912 + }, + { + "auxiliary_loss_clip": 0.01642712, + "auxiliary_loss_mlp": 0.01083449, + "balance_loss_clip": 1.39269328, + "balance_loss_mlp": 1.05402851, + "epoch": 0.11212986622576282, + "flos": 18925136058240.0, + "grad_norm": 2.0897821394719083, + "language_loss": 0.7522462, + "learning_rate": 3.929658994039627e-06, + "loss": 0.77950782, + "num_input_tokens_seen": 40344545, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.29418945, + "step": 1865, + "time_per_iteration": 2.86236834526062 + }, + { + "auxiliary_loss_clip": 0.01661242, + "auxiliary_loss_mlp": 0.01084772, + "balance_loss_clip": 1.40540898, + "balance_loss_mlp": 1.05280066, + "epoch": 0.11218998947843078, + "flos": 22064981253120.0, + "grad_norm": 2.1544729068850437, + "language_loss": 0.85864747, + "learning_rate": 3.929556577139446e-06, + "loss": 0.88610768, + "num_input_tokens_seen": 40362300, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.31982422, + "step": 1866, + "time_per_iteration": 2.8472747802734375 + }, + { + "auxiliary_loss_clip": 0.01655986, + "auxiliary_loss_mlp": 0.01085421, + "balance_loss_clip": 1.40061617, + "balance_loss_mlp": 1.05201888, + "epoch": 0.11225011273109875, + "flos": 24582427971840.0, + "grad_norm": 2.0437837376314665, + "language_loss": 0.82286239, + "learning_rate": 3.929454087070286e-06, + "loss": 0.85027647, + "num_input_tokens_seen": 40384720, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.33374023, + "step": 1867, + "time_per_iteration": 2.944963216781616 + }, + { + "auxiliary_loss_clip": 0.01653831, + "auxiliary_loss_mlp": 0.01093001, + "balance_loss_clip": 1.40238571, + "balance_loss_mlp": 1.0610528, + "epoch": 0.11231023598376672, + "flos": 28450264302720.0, + "grad_norm": 2.2563361566156224, + "language_loss": 0.87852168, + "learning_rate": 3.929351523836035e-06, + "loss": 0.90599, + "num_input_tokens_seen": 40404000, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.31933594, + "step": 1868, + "time_per_iteration": 4.2724609375 + }, + { + "auxiliary_loss_clip": 0.01638305, + "auxiliary_loss_mlp": 0.01092106, + "balance_loss_clip": 1.39220071, + "balance_loss_mlp": 1.05948997, + "epoch": 0.1123703592364347, + "flos": 14433996355200.0, + "grad_norm": 2.5509331863416707, + "language_loss": 0.70306021, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.73036432, + "num_input_tokens_seen": 40418665, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.32617188, + "step": 1869, + "time_per_iteration": 2.8070859909057617 + }, + { + "auxiliary_loss_clip": 0.01664939, + "auxiliary_loss_mlp": 0.0108573, + "balance_loss_clip": 1.40906429, + "balance_loss_mlp": 1.05535507, + "epoch": 0.11243048248910266, + "flos": 22246322987520.0, + "grad_norm": 2.877655991592143, + "language_loss": 0.78348231, + "learning_rate": 3.929146177887814e-06, + "loss": 0.81098896, + "num_input_tokens_seen": 40437870, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.30371094, + "step": 1870, + "time_per_iteration": 4.329606056213379 + }, + { + "auxiliary_loss_clip": 0.0166453, + "auxiliary_loss_mlp": 0.01083295, + "balance_loss_clip": 1.40317988, + "balance_loss_mlp": 1.05094171, + "epoch": 0.11249060574177062, + "flos": 18592341419520.0, + "grad_norm": 1.7959799112918144, + "language_loss": 0.76857561, + "learning_rate": 3.929043395181631e-06, + "loss": 0.79605389, + "num_input_tokens_seen": 40455570, + "router_z_loss_clip": 2.61328125, + "router_z_loss_mlp": 0.32348633, + "step": 1871, + "time_per_iteration": 4.254576206207275 + }, + { + "auxiliary_loss_clip": 0.01654372, + "auxiliary_loss_mlp": 0.01071572, + "balance_loss_clip": 1.39920235, + "balance_loss_mlp": 1.03964758, + "epoch": 0.1125507289944386, + "flos": 22866775937280.0, + "grad_norm": 1.876618550287375, + "language_loss": 0.83641607, + "learning_rate": 3.928940539325929e-06, + "loss": 0.86367553, + "num_input_tokens_seen": 40473600, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.3190918, + "step": 1872, + "time_per_iteration": 2.87638783454895 + }, + { + "auxiliary_loss_clip": 0.01657465, + "auxiliary_loss_mlp": 0.01065317, + "balance_loss_clip": 1.40300024, + "balance_loss_mlp": 1.03272533, + "epoch": 0.11261085224710657, + "flos": 19684237633920.0, + "grad_norm": 2.1589935520138863, + "language_loss": 0.84590745, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.87313533, + "num_input_tokens_seen": 40490025, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.32568359, + "step": 1873, + "time_per_iteration": 2.8316445350646973 + }, + { + "auxiliary_loss_clip": 0.01672092, + "auxiliary_loss_mlp": 0.01067576, + "balance_loss_clip": 1.41052425, + "balance_loss_mlp": 1.03331518, + "epoch": 0.11267097549977453, + "flos": 26073230757120.0, + "grad_norm": 1.9032233603675803, + "language_loss": 0.93002272, + "learning_rate": 3.928734608181575e-06, + "loss": 0.9574194, + "num_input_tokens_seen": 40511580, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.3425293, + "step": 1874, + "time_per_iteration": 2.930551528930664 + }, + { + "auxiliary_loss_clip": 0.01649259, + "auxiliary_loss_mlp": 0.0106707, + "balance_loss_clip": 1.3978374, + "balance_loss_mlp": 1.03435922, + "epoch": 0.11273109875244251, + "flos": 21077591823360.0, + "grad_norm": 1.7873619829906509, + "language_loss": 0.75292051, + "learning_rate": 3.928631532900729e-06, + "loss": 0.78008378, + "num_input_tokens_seen": 40530155, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.32714844, + "step": 1875, + "time_per_iteration": 2.837376117706299 + }, + { + "auxiliary_loss_clip": 0.01656105, + "auxiliary_loss_mlp": 0.01062662, + "balance_loss_clip": 1.40711713, + "balance_loss_mlp": 1.03195357, + "epoch": 0.11279122200511048, + "flos": 27100327138560.0, + "grad_norm": 1.84118064224662, + "language_loss": 0.72393966, + "learning_rate": 3.928528384485984e-06, + "loss": 0.7511273, + "num_input_tokens_seen": 40549500, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.30737305, + "step": 1876, + "time_per_iteration": 2.896716594696045 + }, + { + "auxiliary_loss_clip": 0.0164538, + "auxiliary_loss_mlp": 0.01060814, + "balance_loss_clip": 1.3978101, + "balance_loss_mlp": 1.02831769, + "epoch": 0.11285134525777844, + "flos": 20196066522240.0, + "grad_norm": 1.8343960209518386, + "language_loss": 0.77695894, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.80402088, + "num_input_tokens_seen": 40567475, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.32495117, + "step": 1877, + "time_per_iteration": 2.83687162399292 + }, + { + "auxiliary_loss_clip": 0.01662992, + "auxiliary_loss_mlp": 0.01058239, + "balance_loss_clip": 1.41034865, + "balance_loss_mlp": 1.02581358, + "epoch": 0.11291146851044641, + "flos": 12466049460480.0, + "grad_norm": 2.383874108514402, + "language_loss": 0.89382386, + "learning_rate": 3.928321868270436e-06, + "loss": 0.92103618, + "num_input_tokens_seen": 40583280, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.32421875, + "step": 1878, + "time_per_iteration": 2.8291823863983154 + }, + { + "auxiliary_loss_clip": 0.01661658, + "auxiliary_loss_mlp": 0.01061327, + "balance_loss_clip": 1.4077487, + "balance_loss_mlp": 1.02706623, + "epoch": 0.11297159176311439, + "flos": 23852943757440.0, + "grad_norm": 2.1440936834894946, + "language_loss": 0.83088887, + "learning_rate": 3.928218500477466e-06, + "loss": 0.85811865, + "num_input_tokens_seen": 40603080, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.34277344, + "step": 1879, + "time_per_iteration": 2.9513301849365234 + }, + { + "auxiliary_loss_clip": 0.01665619, + "auxiliary_loss_mlp": 0.01063175, + "balance_loss_clip": 1.41115439, + "balance_loss_mlp": 1.03003442, + "epoch": 0.11303171501578235, + "flos": 29941474291200.0, + "grad_norm": 1.8970372825853974, + "language_loss": 0.7156176, + "learning_rate": 3.928115059566259e-06, + "loss": 0.74290556, + "num_input_tokens_seen": 40623255, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.33129883, + "step": 1880, + "time_per_iteration": 2.91438889503479 + }, + { + "auxiliary_loss_clip": 0.01652282, + "auxiliary_loss_mlp": 0.01055855, + "balance_loss_clip": 1.40439749, + "balance_loss_mlp": 1.02381158, + "epoch": 0.11309183826845032, + "flos": 16189264851840.0, + "grad_norm": 1.5438667018631862, + "language_loss": 0.73862588, + "learning_rate": 3.928011545540734e-06, + "loss": 0.76570719, + "num_input_tokens_seen": 40641570, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.32055664, + "step": 1881, + "time_per_iteration": 2.8669984340667725 + }, + { + "auxiliary_loss_clip": 0.01660831, + "auxiliary_loss_mlp": 0.01053747, + "balance_loss_clip": 1.40438342, + "balance_loss_mlp": 1.02039194, + "epoch": 0.1131519615211183, + "flos": 12028205099520.0, + "grad_norm": 3.4303585633456346, + "language_loss": 0.75630862, + "learning_rate": 3.927907958404819e-06, + "loss": 0.78345442, + "num_input_tokens_seen": 40658775, + "router_z_loss_clip": 2.56640625, + "router_z_loss_mlp": 0.33374023, + "step": 1882, + "time_per_iteration": 2.810260772705078 + }, + { + "auxiliary_loss_clip": 0.01676587, + "auxiliary_loss_mlp": 0.01056964, + "balance_loss_clip": 1.41819596, + "balance_loss_mlp": 1.0238955, + "epoch": 0.11321208477378626, + "flos": 26261313966720.0, + "grad_norm": 2.3838427647283993, + "language_loss": 0.80935657, + "learning_rate": 3.92780429816244e-06, + "loss": 0.83669209, + "num_input_tokens_seen": 40679555, + "router_z_loss_clip": 2.58203125, + "router_z_loss_mlp": 0.33081055, + "step": 1883, + "time_per_iteration": 2.9002041816711426 + }, + { + "auxiliary_loss_clip": 0.01659707, + "auxiliary_loss_mlp": 0.01055521, + "balance_loss_clip": 1.4043895, + "balance_loss_mlp": 1.02116513, + "epoch": 0.11327220802645423, + "flos": 13634373421440.0, + "grad_norm": 2.513887922149616, + "language_loss": 0.78301257, + "learning_rate": 3.927700564817529e-06, + "loss": 0.81016481, + "num_input_tokens_seen": 40697295, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.34326172, + "step": 1884, + "time_per_iteration": 2.814112424850464 + }, + { + "auxiliary_loss_clip": 0.01382589, + "auxiliary_loss_mlp": 0.01022332, + "balance_loss_clip": 1.23395729, + "balance_loss_mlp": 1.01031542, + "epoch": 0.1133323312791222, + "flos": 57220944151680.0, + "grad_norm": 0.8340940054542284, + "language_loss": 0.55358803, + "learning_rate": 3.927596758374019e-06, + "loss": 0.57763726, + "num_input_tokens_seen": 40758095, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.12011719, + "step": 1885, + "time_per_iteration": 3.2482433319091797 + }, + { + "auxiliary_loss_clip": 0.01639852, + "auxiliary_loss_mlp": 0.01059332, + "balance_loss_clip": 1.39255214, + "balance_loss_mlp": 1.02619219, + "epoch": 0.11339245453179017, + "flos": 24362103202560.0, + "grad_norm": 1.9742138793123498, + "language_loss": 0.91773832, + "learning_rate": 3.927492878835848e-06, + "loss": 0.94473016, + "num_input_tokens_seen": 40777140, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.33129883, + "step": 1886, + "time_per_iteration": 2.904172420501709 + }, + { + "auxiliary_loss_clip": 0.01675758, + "auxiliary_loss_mlp": 0.01062019, + "balance_loss_clip": 1.42271233, + "balance_loss_mlp": 1.0295465, + "epoch": 0.11345257778445814, + "flos": 22679959582080.0, + "grad_norm": 1.8128975459980492, + "language_loss": 0.86057675, + "learning_rate": 3.927388926206953e-06, + "loss": 0.88795447, + "num_input_tokens_seen": 40797505, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.32495117, + "step": 1887, + "time_per_iteration": 2.8360402584075928 + }, + { + "auxiliary_loss_clip": 0.01657521, + "auxiliary_loss_mlp": 0.01059757, + "balance_loss_clip": 1.40727174, + "balance_loss_mlp": 1.02881062, + "epoch": 0.11351270103712612, + "flos": 20996006169600.0, + "grad_norm": 2.637537510521018, + "language_loss": 0.79017317, + "learning_rate": 3.927284900491277e-06, + "loss": 0.81734592, + "num_input_tokens_seen": 40812970, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.30932617, + "step": 1888, + "time_per_iteration": 2.8814618587493896 + }, + { + "auxiliary_loss_clip": 0.01693333, + "auxiliary_loss_mlp": 0.01071447, + "balance_loss_clip": 1.43244219, + "balance_loss_mlp": 1.03630376, + "epoch": 0.11357282428979408, + "flos": 37363581354240.0, + "grad_norm": 1.652953250774899, + "language_loss": 0.69104517, + "learning_rate": 3.927180801692764e-06, + "loss": 0.71869296, + "num_input_tokens_seen": 40837745, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.35131836, + "step": 1889, + "time_per_iteration": 3.017458438873291 + }, + { + "auxiliary_loss_clip": 0.01669315, + "auxiliary_loss_mlp": 0.0106626, + "balance_loss_clip": 1.4172138, + "balance_loss_mlp": 1.03323925, + "epoch": 0.11363294754246205, + "flos": 21765649783680.0, + "grad_norm": 2.027506146688059, + "language_loss": 0.84657806, + "learning_rate": 3.927076629815362e-06, + "loss": 0.87393385, + "num_input_tokens_seen": 40856490, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.33032227, + "step": 1890, + "time_per_iteration": 2.897216558456421 + }, + { + "auxiliary_loss_clip": 0.01665401, + "auxiliary_loss_mlp": 0.01067324, + "balance_loss_clip": 1.41462159, + "balance_loss_mlp": 1.03418422, + "epoch": 0.11369307079513001, + "flos": 22611902123520.0, + "grad_norm": 7.54288467608163, + "language_loss": 0.6680755, + "learning_rate": 3.926972384863022e-06, + "loss": 0.69540274, + "num_input_tokens_seen": 40874070, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.33105469, + "step": 1891, + "time_per_iteration": 2.870441198348999 + }, + { + "auxiliary_loss_clip": 0.01697029, + "auxiliary_loss_mlp": 0.01064271, + "balance_loss_clip": 1.43371105, + "balance_loss_mlp": 1.03120255, + "epoch": 0.11375319404779799, + "flos": 21954185441280.0, + "grad_norm": 1.840886672137919, + "language_loss": 0.88987601, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.91748911, + "num_input_tokens_seen": 40892425, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.33081055, + "step": 1892, + "time_per_iteration": 2.819755792617798 + }, + { + "auxiliary_loss_clip": 0.0169679, + "auxiliary_loss_mlp": 0.01064905, + "balance_loss_clip": 1.43439317, + "balance_loss_mlp": 1.03317165, + "epoch": 0.11381331730046595, + "flos": 26406523088640.0, + "grad_norm": 2.385998203777028, + "language_loss": 0.74447221, + "learning_rate": 3.926763675749339e-06, + "loss": 0.77208912, + "num_input_tokens_seen": 40912190, + "router_z_loss_clip": 2.62109375, + "router_z_loss_mlp": 0.31689453, + "step": 1893, + "time_per_iteration": 2.9209349155426025 + }, + { + "auxiliary_loss_clip": 0.01673365, + "auxiliary_loss_mlp": 0.01064614, + "balance_loss_clip": 1.41975689, + "balance_loss_mlp": 1.03338099, + "epoch": 0.11387344055313392, + "flos": 23815046597760.0, + "grad_norm": 5.635954085318265, + "language_loss": 0.80806255, + "learning_rate": 3.92665921159591e-06, + "loss": 0.83544236, + "num_input_tokens_seen": 40928395, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.31201172, + "step": 1894, + "time_per_iteration": 2.8243119716644287 + }, + { + "auxiliary_loss_clip": 0.01698643, + "auxiliary_loss_mlp": 0.0106767, + "balance_loss_clip": 1.4351685, + "balance_loss_mlp": 1.03479242, + "epoch": 0.1139335638058019, + "flos": 34533727418880.0, + "grad_norm": 3.2410356574812047, + "language_loss": 0.81184793, + "learning_rate": 3.926554674383371e-06, + "loss": 0.83951104, + "num_input_tokens_seen": 40946555, + "router_z_loss_clip": 2.63476562, + "router_z_loss_mlp": 0.32861328, + "step": 1895, + "time_per_iteration": 2.9811582565307617 + }, + { + "auxiliary_loss_clip": 0.01381629, + "auxiliary_loss_mlp": 0.01048075, + "balance_loss_clip": 1.23468375, + "balance_loss_mlp": 1.03443718, + "epoch": 0.11399368705846986, + "flos": 70621917056640.0, + "grad_norm": 0.8205756428737664, + "language_loss": 0.63462269, + "learning_rate": 3.926450064115686e-06, + "loss": 0.65891969, + "num_input_tokens_seen": 41004910, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.13671875, + "step": 1896, + "time_per_iteration": 3.414736270904541 + }, + { + "auxiliary_loss_clip": 0.01664781, + "auxiliary_loss_mlp": 0.01067746, + "balance_loss_clip": 1.41420054, + "balance_loss_mlp": 1.03246033, + "epoch": 0.11405381031113783, + "flos": 21334094449920.0, + "grad_norm": 1.770385539491751, + "language_loss": 0.85732973, + "learning_rate": 3.926345380796821e-06, + "loss": 0.88465506, + "num_input_tokens_seen": 41026385, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.35302734, + "step": 1897, + "time_per_iteration": 4.300881624221802 + }, + { + "auxiliary_loss_clip": 0.0167727, + "auxiliary_loss_mlp": 0.01055738, + "balance_loss_clip": 1.42179346, + "balance_loss_mlp": 1.02512527, + "epoch": 0.11411393356380581, + "flos": 19729238227200.0, + "grad_norm": 2.4746044532681637, + "language_loss": 0.81371707, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.84104711, + "num_input_tokens_seen": 41045315, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.30615234, + "step": 1898, + "time_per_iteration": 2.94209623336792 + }, + { + "auxiliary_loss_clip": 0.01690943, + "auxiliary_loss_mlp": 0.01059817, + "balance_loss_clip": 1.42956042, + "balance_loss_mlp": 1.0278213, + "epoch": 0.11417405681647377, + "flos": 17539473484800.0, + "grad_norm": 1.891187306031339, + "language_loss": 0.74964106, + "learning_rate": 3.926135795021435e-06, + "loss": 0.77714866, + "num_input_tokens_seen": 41063390, + "router_z_loss_clip": 2.6171875, + "router_z_loss_mlp": 0.32006836, + "step": 1899, + "time_per_iteration": 2.9555864334106445 + }, + { + "auxiliary_loss_clip": 0.01385252, + "auxiliary_loss_mlp": 0.01028289, + "balance_loss_clip": 1.23745561, + "balance_loss_mlp": 1.01140928, + "epoch": 0.11423418006914174, + "flos": 59704927701120.0, + "grad_norm": 0.9260692499633738, + "language_loss": 0.63511467, + "learning_rate": 3.92603089257286e-06, + "loss": 0.65925014, + "num_input_tokens_seen": 41124180, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.16894531, + "step": 1900, + "time_per_iteration": 3.2517733573913574 + }, + { + "auxiliary_loss_clip": 0.0166306, + "auxiliary_loss_mlp": 0.01062834, + "balance_loss_clip": 1.41216469, + "balance_loss_mlp": 1.03074265, + "epoch": 0.1142943033218097, + "flos": 22972594821120.0, + "grad_norm": 1.6641204532924387, + "language_loss": 0.79010677, + "learning_rate": 3.925925917089001e-06, + "loss": 0.81736565, + "num_input_tokens_seen": 41143485, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.32080078, + "step": 1901, + "time_per_iteration": 2.8988401889801025 + }, + { + "auxiliary_loss_clip": 0.01672062, + "auxiliary_loss_mlp": 0.01056433, + "balance_loss_clip": 1.4176439, + "balance_loss_mlp": 1.02529562, + "epoch": 0.11435442657447768, + "flos": 18264478464000.0, + "grad_norm": 1.9676991721351533, + "language_loss": 0.85125136, + "learning_rate": 3.925820868573839e-06, + "loss": 0.87853628, + "num_input_tokens_seen": 41161695, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.31103516, + "step": 1902, + "time_per_iteration": 2.7864139080047607 + }, + { + "auxiliary_loss_clip": 0.01654376, + "auxiliary_loss_mlp": 0.01053938, + "balance_loss_clip": 1.39953578, + "balance_loss_mlp": 1.02363455, + "epoch": 0.11441454982714565, + "flos": 24071865937920.0, + "grad_norm": 4.265625057928174, + "language_loss": 0.78950047, + "learning_rate": 3.925715747031356e-06, + "loss": 0.81658363, + "num_input_tokens_seen": 41181715, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.30322266, + "step": 1903, + "time_per_iteration": 4.1965553760528564 + }, + { + "auxiliary_loss_clip": 0.01660706, + "auxiliary_loss_mlp": 0.01049136, + "balance_loss_clip": 1.40800953, + "balance_loss_mlp": 1.01776052, + "epoch": 0.11447467307981361, + "flos": 25348587736320.0, + "grad_norm": 2.089030685452198, + "language_loss": 0.76493818, + "learning_rate": 3.925610552465539e-06, + "loss": 0.79203659, + "num_input_tokens_seen": 41201770, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.31396484, + "step": 1904, + "time_per_iteration": 2.8917083740234375 + }, + { + "auxiliary_loss_clip": 0.0164803, + "auxiliary_loss_mlp": 0.01058796, + "balance_loss_clip": 1.39880157, + "balance_loss_mlp": 1.02670527, + "epoch": 0.11453479633248159, + "flos": 21736122912000.0, + "grad_norm": 2.289761968153971, + "language_loss": 0.93618685, + "learning_rate": 3.9255052848803764e-06, + "loss": 0.96325505, + "num_input_tokens_seen": 41220590, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.32104492, + "step": 1905, + "time_per_iteration": 4.213242053985596 + }, + { + "auxiliary_loss_clip": 0.01686888, + "auxiliary_loss_mlp": 0.01053888, + "balance_loss_clip": 1.42166448, + "balance_loss_mlp": 1.02213049, + "epoch": 0.11459491958514956, + "flos": 12977561635200.0, + "grad_norm": 2.827277672448864, + "language_loss": 0.7868185, + "learning_rate": 3.925399944279861e-06, + "loss": 0.81422627, + "num_input_tokens_seen": 41237250, + "router_z_loss_clip": 2.65429688, + "router_z_loss_mlp": 0.31762695, + "step": 1906, + "time_per_iteration": 4.239685535430908 + }, + { + "auxiliary_loss_clip": 0.01641635, + "auxiliary_loss_mlp": 0.01062565, + "balance_loss_clip": 1.39188266, + "balance_loss_mlp": 1.02911472, + "epoch": 0.11465504283781752, + "flos": 22721612060160.0, + "grad_norm": 2.5467313309445743, + "language_loss": 0.83686471, + "learning_rate": 3.925294530667986e-06, + "loss": 0.86390668, + "num_input_tokens_seen": 41256680, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.33422852, + "step": 1907, + "time_per_iteration": 2.820119619369507 + }, + { + "auxiliary_loss_clip": 0.01638303, + "auxiliary_loss_mlp": 0.01054475, + "balance_loss_clip": 1.38808608, + "balance_loss_mlp": 1.02393377, + "epoch": 0.1147151660904855, + "flos": 23407317290880.0, + "grad_norm": 2.117392929735174, + "language_loss": 0.86423796, + "learning_rate": 3.92518904404875e-06, + "loss": 0.89116573, + "num_input_tokens_seen": 41270955, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.30517578, + "step": 1908, + "time_per_iteration": 2.87188982963562 + }, + { + "auxiliary_loss_clip": 0.01374896, + "auxiliary_loss_mlp": 0.01014223, + "balance_loss_clip": 1.22866607, + "balance_loss_mlp": 0.99534017, + "epoch": 0.11477528934315347, + "flos": 63037905540480.0, + "grad_norm": 0.9217337795532026, + "language_loss": 0.61163139, + "learning_rate": 3.925083484426153e-06, + "loss": 0.6355226, + "num_input_tokens_seen": 41319180, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.18847656, + "step": 1909, + "time_per_iteration": 3.0713250637054443 + }, + { + "auxiliary_loss_clip": 0.01634338, + "auxiliary_loss_mlp": 0.01050878, + "balance_loss_clip": 1.38582718, + "balance_loss_mlp": 1.01866758, + "epoch": 0.11483541259582143, + "flos": 16334112015360.0, + "grad_norm": 1.8785937704739248, + "language_loss": 0.80510473, + "learning_rate": 3.924977851804197e-06, + "loss": 0.83195698, + "num_input_tokens_seen": 41337480, + "router_z_loss_clip": 2.48828125, + "router_z_loss_mlp": 0.32177734, + "step": 1910, + "time_per_iteration": 2.843212604522705 + }, + { + "auxiliary_loss_clip": 0.01645012, + "auxiliary_loss_mlp": 0.01056312, + "balance_loss_clip": 1.39531612, + "balance_loss_mlp": 1.02298164, + "epoch": 0.1148955358484894, + "flos": 21590506586880.0, + "grad_norm": 2.0611640541525524, + "language_loss": 0.77724105, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.80425429, + "num_input_tokens_seen": 41354650, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.33325195, + "step": 1911, + "time_per_iteration": 2.8195383548736572 + }, + { + "auxiliary_loss_clip": 0.01614505, + "auxiliary_loss_mlp": 0.01046438, + "balance_loss_clip": 1.37500489, + "balance_loss_mlp": 1.01398945, + "epoch": 0.11495565910115738, + "flos": 27685416637440.0, + "grad_norm": 1.6856586141314842, + "language_loss": 0.80012608, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.8267355, + "num_input_tokens_seen": 41376935, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.32470703, + "step": 1912, + "time_per_iteration": 2.9688098430633545 + }, + { + "auxiliary_loss_clip": 0.01631711, + "auxiliary_loss_mlp": 0.01052105, + "balance_loss_clip": 1.38720512, + "balance_loss_mlp": 1.02118254, + "epoch": 0.11501578235382534, + "flos": 20642371660800.0, + "grad_norm": 1.870224539798296, + "language_loss": 0.79869175, + "learning_rate": 3.924660515982246e-06, + "loss": 0.82552993, + "num_input_tokens_seen": 41396105, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.30957031, + "step": 1913, + "time_per_iteration": 2.8613626956939697 + }, + { + "auxiliary_loss_clip": 0.01638379, + "auxiliary_loss_mlp": 0.01050411, + "balance_loss_clip": 1.39150214, + "balance_loss_mlp": 1.01977396, + "epoch": 0.1150759056064933, + "flos": 19838676695040.0, + "grad_norm": 1.7774886424880454, + "language_loss": 0.71086192, + "learning_rate": 3.924554591402939e-06, + "loss": 0.73774987, + "num_input_tokens_seen": 41415600, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.3059082, + "step": 1914, + "time_per_iteration": 2.8420538902282715 + }, + { + "auxiliary_loss_clip": 0.01392863, + "auxiliary_loss_mlp": 0.0105075, + "balance_loss_clip": 1.24118495, + "balance_loss_mlp": 1.03139079, + "epoch": 0.11513602885916129, + "flos": 70081104234240.0, + "grad_norm": 0.7720932567460119, + "language_loss": 0.61094427, + "learning_rate": 3.92444859384433e-06, + "loss": 0.63538039, + "num_input_tokens_seen": 41478760, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.19335938, + "step": 1915, + "time_per_iteration": 3.4700827598571777 + }, + { + "auxiliary_loss_clip": 0.01636639, + "auxiliary_loss_mlp": 0.01056967, + "balance_loss_clip": 1.39040875, + "balance_loss_mlp": 1.02540016, + "epoch": 0.11519615211182925, + "flos": 15750063146880.0, + "grad_norm": 2.4507270409296984, + "language_loss": 0.95198739, + "learning_rate": 3.924342523310436e-06, + "loss": 0.97892344, + "num_input_tokens_seen": 41495720, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.31518555, + "step": 1916, + "time_per_iteration": 2.8086276054382324 + }, + { + "auxiliary_loss_clip": 0.01630103, + "auxiliary_loss_mlp": 0.01059519, + "balance_loss_clip": 1.38365626, + "balance_loss_mlp": 1.02974057, + "epoch": 0.11525627536449722, + "flos": 20677327908480.0, + "grad_norm": 1.958313030732003, + "language_loss": 0.73675263, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.76364887, + "num_input_tokens_seen": 41513585, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.29760742, + "step": 1917, + "time_per_iteration": 2.8850314617156982 + }, + { + "auxiliary_loss_clip": 0.01623877, + "auxiliary_loss_mlp": 0.01051912, + "balance_loss_clip": 1.38295829, + "balance_loss_mlp": 1.01989198, + "epoch": 0.1153163986171652, + "flos": 20313015626880.0, + "grad_norm": 2.1549398954251595, + "language_loss": 0.76779997, + "learning_rate": 3.92413016333289e-06, + "loss": 0.79455787, + "num_input_tokens_seen": 41533390, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.3203125, + "step": 1918, + "time_per_iteration": 2.9272842407226562 + }, + { + "auxiliary_loss_clip": 0.01633424, + "auxiliary_loss_mlp": 0.01058415, + "balance_loss_clip": 1.38499808, + "balance_loss_mlp": 1.0262289, + "epoch": 0.11537652186983316, + "flos": 17648233280640.0, + "grad_norm": 2.1126943363991524, + "language_loss": 0.8881954, + "learning_rate": 3.92402387389729e-06, + "loss": 0.91511381, + "num_input_tokens_seen": 41551015, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.3215332, + "step": 1919, + "time_per_iteration": 2.814169406890869 + }, + { + "auxiliary_loss_clip": 0.01626699, + "auxiliary_loss_mlp": 0.01060252, + "balance_loss_clip": 1.38417029, + "balance_loss_mlp": 1.02820837, + "epoch": 0.11543664512250112, + "flos": 21079311125760.0, + "grad_norm": 1.8369416067190785, + "language_loss": 0.87410855, + "learning_rate": 3.923917511502512e-06, + "loss": 0.90097803, + "num_input_tokens_seen": 41568055, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.3203125, + "step": 1920, + "time_per_iteration": 2.8678860664367676 + }, + { + "auxiliary_loss_clip": 0.0161576, + "auxiliary_loss_mlp": 0.01051158, + "balance_loss_clip": 1.37714303, + "balance_loss_mlp": 1.0200212, + "epoch": 0.11549676837516909, + "flos": 22757835162240.0, + "grad_norm": 1.9120480231546784, + "language_loss": 0.81114936, + "learning_rate": 3.923811076152589e-06, + "loss": 0.8378185, + "num_input_tokens_seen": 41587435, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.3112793, + "step": 1921, + "time_per_iteration": 2.800992012023926 + }, + { + "auxiliary_loss_clip": 0.01635643, + "auxiliary_loss_mlp": 0.01076258, + "balance_loss_clip": 1.38180542, + "balance_loss_mlp": 1.0415678, + "epoch": 0.11555689162783707, + "flos": 19177566652800.0, + "grad_norm": 1.7523872475809967, + "language_loss": 0.7932651, + "learning_rate": 3.923704567851557e-06, + "loss": 0.82038414, + "num_input_tokens_seen": 41604975, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.34692383, + "step": 1922, + "time_per_iteration": 2.879377841949463 + }, + { + "auxiliary_loss_clip": 0.01618249, + "auxiliary_loss_mlp": 0.01067342, + "balance_loss_clip": 1.37285495, + "balance_loss_mlp": 1.03606153, + "epoch": 0.11561701488050503, + "flos": 24582880419840.0, + "grad_norm": 1.7946493062269744, + "language_loss": 0.85143238, + "learning_rate": 3.923597986603456e-06, + "loss": 0.87828827, + "num_input_tokens_seen": 41626155, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.31298828, + "step": 1923, + "time_per_iteration": 2.903189182281494 + }, + { + "auxiliary_loss_clip": 0.0162974, + "auxiliary_loss_mlp": 0.01065785, + "balance_loss_clip": 1.38401794, + "balance_loss_mlp": 1.03254962, + "epoch": 0.115677138133173, + "flos": 17101448144640.0, + "grad_norm": 1.971587017038377, + "language_loss": 0.82354975, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.85050499, + "num_input_tokens_seen": 41644805, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.33227539, + "step": 1924, + "time_per_iteration": 2.8708131313323975 + }, + { + "auxiliary_loss_clip": 0.01401595, + "auxiliary_loss_mlp": 0.01050279, + "balance_loss_clip": 1.2491293, + "balance_loss_mlp": 1.02805841, + "epoch": 0.11573726138584098, + "flos": 62735153448960.0, + "grad_norm": 0.969200324207284, + "language_loss": 0.61336768, + "learning_rate": 3.923384605282212e-06, + "loss": 0.63788646, + "num_input_tokens_seen": 41709345, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.22265625, + "step": 1925, + "time_per_iteration": 3.368759870529175 + }, + { + "auxiliary_loss_clip": 0.01616445, + "auxiliary_loss_mlp": 0.01070957, + "balance_loss_clip": 1.37339354, + "balance_loss_mlp": 1.03920007, + "epoch": 0.11579738463850894, + "flos": 22611268696320.0, + "grad_norm": 1.7578155488573342, + "language_loss": 0.76553285, + "learning_rate": 3.923277805217161e-06, + "loss": 0.7924068, + "num_input_tokens_seen": 41730210, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.31738281, + "step": 1926, + "time_per_iteration": 2.8073651790618896 + }, + { + "auxiliary_loss_clip": 0.01635733, + "auxiliary_loss_mlp": 0.01061601, + "balance_loss_clip": 1.38034832, + "balance_loss_mlp": 1.02970099, + "epoch": 0.11585750789117691, + "flos": 21736213401600.0, + "grad_norm": 3.1362773429820536, + "language_loss": 0.74668348, + "learning_rate": 3.923170932221222e-06, + "loss": 0.77365685, + "num_input_tokens_seen": 41750270, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.31884766, + "step": 1927, + "time_per_iteration": 2.842369556427002 + }, + { + "auxiliary_loss_clip": 0.01620441, + "auxiliary_loss_mlp": 0.01056061, + "balance_loss_clip": 1.3727622, + "balance_loss_mlp": 1.02516246, + "epoch": 0.11591763114384489, + "flos": 26298984902400.0, + "grad_norm": 2.1250443331058584, + "language_loss": 0.87823772, + "learning_rate": 3.92306398629845e-06, + "loss": 0.90500271, + "num_input_tokens_seen": 41772975, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.30883789, + "step": 1928, + "time_per_iteration": 2.8482730388641357 + }, + { + "auxiliary_loss_clip": 0.0162706, + "auxiliary_loss_mlp": 0.01055206, + "balance_loss_clip": 1.37674904, + "balance_loss_mlp": 1.02561784, + "epoch": 0.11597775439651285, + "flos": 23010175267200.0, + "grad_norm": 1.6132691148975156, + "language_loss": 0.79100442, + "learning_rate": 3.922956967452898e-06, + "loss": 0.81782705, + "num_input_tokens_seen": 41791765, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.29589844, + "step": 1929, + "time_per_iteration": 2.8569881916046143 + }, + { + "auxiliary_loss_clip": 0.01618724, + "auxiliary_loss_mlp": 0.010582, + "balance_loss_clip": 1.37592947, + "balance_loss_mlp": 1.02794468, + "epoch": 0.11603787764918082, + "flos": 31954331289600.0, + "grad_norm": 1.7771524798174432, + "language_loss": 0.78176236, + "learning_rate": 3.922849875688626e-06, + "loss": 0.80853164, + "num_input_tokens_seen": 41815615, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.30249023, + "step": 1930, + "time_per_iteration": 2.947993278503418 + }, + { + "auxiliary_loss_clip": 0.01614923, + "auxiliary_loss_mlp": 0.0105988, + "balance_loss_clip": 1.37149298, + "balance_loss_mlp": 1.02933824, + "epoch": 0.1160980009018488, + "flos": 22281414969600.0, + "grad_norm": 5.600405423930789, + "language_loss": 0.73321497, + "learning_rate": 3.922742711009693e-06, + "loss": 0.75996298, + "num_input_tokens_seen": 41834810, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.30517578, + "step": 1931, + "time_per_iteration": 2.851008653640747 + }, + { + "auxiliary_loss_clip": 0.01626813, + "auxiliary_loss_mlp": 0.01057477, + "balance_loss_clip": 1.38073719, + "balance_loss_mlp": 1.02593446, + "epoch": 0.11615812415451676, + "flos": 22793832040320.0, + "grad_norm": 1.5306770750771606, + "language_loss": 0.83253825, + "learning_rate": 3.922635473420164e-06, + "loss": 0.85938114, + "num_input_tokens_seen": 41854975, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.31542969, + "step": 1932, + "time_per_iteration": 4.297504425048828 + }, + { + "auxiliary_loss_clip": 0.01399413, + "auxiliary_loss_mlp": 0.0102507, + "balance_loss_clip": 1.2548461, + "balance_loss_mlp": 1.00284922, + "epoch": 0.11621824740718473, + "flos": 67176179896320.0, + "grad_norm": 0.7819516798766193, + "language_loss": 0.6104387, + "learning_rate": 3.922528162924105e-06, + "loss": 0.63468349, + "num_input_tokens_seen": 41911105, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.22265625, + "step": 1933, + "time_per_iteration": 3.234625816345215 + }, + { + "auxiliary_loss_clip": 0.01630379, + "auxiliary_loss_mlp": 0.01053026, + "balance_loss_clip": 1.37848091, + "balance_loss_mlp": 1.02162623, + "epoch": 0.11627837065985269, + "flos": 20385597565440.0, + "grad_norm": 2.0536543416230963, + "language_loss": 0.87242043, + "learning_rate": 3.922420779525586e-06, + "loss": 0.89925456, + "num_input_tokens_seen": 41931750, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.3137207, + "step": 1934, + "time_per_iteration": 2.831000804901123 + }, + { + "auxiliary_loss_clip": 0.01649902, + "auxiliary_loss_mlp": 0.01057245, + "balance_loss_clip": 1.39246106, + "balance_loss_mlp": 1.02400959, + "epoch": 0.11633849391252067, + "flos": 21735806198400.0, + "grad_norm": 7.0849099866476175, + "language_loss": 0.68154275, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.70861423, + "num_input_tokens_seen": 41949400, + "router_z_loss_clip": 2.57421875, + "router_z_loss_mlp": 0.33251953, + "step": 1935, + "time_per_iteration": 2.8797683715820312 + }, + { + "auxiliary_loss_clip": 0.01628501, + "auxiliary_loss_mlp": 0.01054739, + "balance_loss_clip": 1.38033128, + "balance_loss_mlp": 1.0234828, + "epoch": 0.11639861716518864, + "flos": 18814159267200.0, + "grad_norm": 2.3927861703147424, + "language_loss": 0.77390611, + "learning_rate": 3.922205794037456e-06, + "loss": 0.80073851, + "num_input_tokens_seen": 41968100, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.3125, + "step": 1936, + "time_per_iteration": 2.849095582962036 + }, + { + "auxiliary_loss_clip": 0.01638754, + "auxiliary_loss_mlp": 0.01051576, + "balance_loss_clip": 1.38656974, + "balance_loss_mlp": 1.01691031, + "epoch": 0.1164587404178566, + "flos": 21224836961280.0, + "grad_norm": 1.9258805335079625, + "language_loss": 0.85200059, + "learning_rate": 3.922098191955998e-06, + "loss": 0.87890387, + "num_input_tokens_seen": 41986375, + "router_z_loss_clip": 2.52148438, + "router_z_loss_mlp": 0.34667969, + "step": 1937, + "time_per_iteration": 4.372204065322876 + }, + { + "auxiliary_loss_clip": 0.01614733, + "auxiliary_loss_mlp": 0.01049373, + "balance_loss_clip": 1.37230146, + "balance_loss_mlp": 1.01771057, + "epoch": 0.11651886367052458, + "flos": 27830128066560.0, + "grad_norm": 1.7918743152486405, + "language_loss": 0.77388954, + "learning_rate": 3.921990516988384e-06, + "loss": 0.80053061, + "num_input_tokens_seen": 42006055, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.31665039, + "step": 1938, + "time_per_iteration": 2.958155632019043 + }, + { + "auxiliary_loss_clip": 0.01636528, + "auxiliary_loss_mlp": 0.01051672, + "balance_loss_clip": 1.38695073, + "balance_loss_mlp": 1.01886511, + "epoch": 0.11657898692319255, + "flos": 22898881762560.0, + "grad_norm": 1.6745859729483639, + "language_loss": 0.80420578, + "learning_rate": 3.921882769138696e-06, + "loss": 0.83108783, + "num_input_tokens_seen": 42024995, + "router_z_loss_clip": 2.49609375, + "router_z_loss_mlp": 0.328125, + "step": 1939, + "time_per_iteration": 2.8416757583618164 + }, + { + "auxiliary_loss_clip": 0.01630732, + "auxiliary_loss_mlp": 0.010587, + "balance_loss_clip": 1.38280058, + "balance_loss_mlp": 1.02510643, + "epoch": 0.11663911017586051, + "flos": 24326468282880.0, + "grad_norm": 2.9626539954795392, + "language_loss": 0.87410265, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.90099698, + "num_input_tokens_seen": 42042640, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.3359375, + "step": 1940, + "time_per_iteration": 4.378810882568359 + }, + { + "auxiliary_loss_clip": 0.01611851, + "auxiliary_loss_mlp": 0.0104899, + "balance_loss_clip": 1.37223339, + "balance_loss_mlp": 1.01639867, + "epoch": 0.11669923342852849, + "flos": 42355464969600.0, + "grad_norm": 6.858556919654926, + "language_loss": 0.76876587, + "learning_rate": 3.921667054809449e-06, + "loss": 0.79537427, + "num_input_tokens_seen": 42067005, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.32592773, + "step": 1941, + "time_per_iteration": 4.345696210861206 + }, + { + "auxiliary_loss_clip": 0.01628025, + "auxiliary_loss_mlp": 0.01063419, + "balance_loss_clip": 1.38198423, + "balance_loss_mlp": 1.02791858, + "epoch": 0.11675935668119646, + "flos": 14649072727680.0, + "grad_norm": 2.100945004161073, + "language_loss": 0.90263677, + "learning_rate": 3.921559088338068e-06, + "loss": 0.92955124, + "num_input_tokens_seen": 42082295, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.35498047, + "step": 1942, + "time_per_iteration": 2.85488224029541 + }, + { + "auxiliary_loss_clip": 0.01616599, + "auxiliary_loss_mlp": 0.01051221, + "balance_loss_clip": 1.37515855, + "balance_loss_mlp": 1.01943994, + "epoch": 0.11681947993386442, + "flos": 35130037628160.0, + "grad_norm": 1.8343489287506176, + "language_loss": 0.69120634, + "learning_rate": 3.921451049000975e-06, + "loss": 0.71788454, + "num_input_tokens_seen": 42105295, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.31787109, + "step": 1943, + "time_per_iteration": 2.9278244972229004 + }, + { + "auxiliary_loss_clip": 0.01618483, + "auxiliary_loss_mlp": 0.01050751, + "balance_loss_clip": 1.37522507, + "balance_loss_mlp": 1.01732457, + "epoch": 0.11687960318653239, + "flos": 38997647735040.0, + "grad_norm": 1.8250316012368597, + "language_loss": 0.70805049, + "learning_rate": 3.921342936802265e-06, + "loss": 0.73474282, + "num_input_tokens_seen": 42125520, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.33447266, + "step": 1944, + "time_per_iteration": 2.977511167526245 + }, + { + "auxiliary_loss_clip": 0.01622775, + "auxiliary_loss_mlp": 0.01054882, + "balance_loss_clip": 1.38199544, + "balance_loss_mlp": 1.02119303, + "epoch": 0.11693972643920036, + "flos": 26006168684160.0, + "grad_norm": 1.6095932837683644, + "language_loss": 0.8338933, + "learning_rate": 3.921234751746038e-06, + "loss": 0.86066985, + "num_input_tokens_seen": 42146335, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.33691406, + "step": 1945, + "time_per_iteration": 2.8427557945251465 + }, + { + "auxiliary_loss_clip": 0.01614885, + "auxiliary_loss_mlp": 0.01051464, + "balance_loss_clip": 1.37029243, + "balance_loss_mlp": 1.01841879, + "epoch": 0.11699984969186833, + "flos": 27283659644160.0, + "grad_norm": 2.017841983843651, + "language_loss": 0.77471745, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.80138093, + "num_input_tokens_seen": 42165320, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.33056641, + "step": 1946, + "time_per_iteration": 2.902353525161743 + }, + { + "auxiliary_loss_clip": 0.01604489, + "auxiliary_loss_mlp": 0.01050677, + "balance_loss_clip": 1.36244774, + "balance_loss_mlp": 1.01801372, + "epoch": 0.1170599729445363, + "flos": 15276176663040.0, + "grad_norm": 2.0398332160398787, + "language_loss": 0.69937718, + "learning_rate": 3.921018163077448e-06, + "loss": 0.72592884, + "num_input_tokens_seen": 42182955, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.32666016, + "step": 1947, + "time_per_iteration": 2.84840726852417 + }, + { + "auxiliary_loss_clip": 0.01620663, + "auxiliary_loss_mlp": 0.01051643, + "balance_loss_clip": 1.37659955, + "balance_loss_mlp": 1.01897943, + "epoch": 0.11712009619720427, + "flos": 17173079942400.0, + "grad_norm": 3.0739033906511493, + "language_loss": 0.85927612, + "learning_rate": 3.920909759473295e-06, + "loss": 0.8859992, + "num_input_tokens_seen": 42200760, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.3269043, + "step": 1948, + "time_per_iteration": 2.806398391723633 + }, + { + "auxiliary_loss_clip": 0.01401078, + "auxiliary_loss_mlp": 0.01054135, + "balance_loss_clip": 1.25390148, + "balance_loss_mlp": 1.02800405, + "epoch": 0.11718021944987224, + "flos": 70972158205440.0, + "grad_norm": 0.8857654417852417, + "language_loss": 0.65239882, + "learning_rate": 3.920801283028054e-06, + "loss": 0.67695093, + "num_input_tokens_seen": 42265745, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.26171875, + "step": 1949, + "time_per_iteration": 3.400667190551758 + }, + { + "auxiliary_loss_clip": 0.01625736, + "auxiliary_loss_mlp": 0.01059015, + "balance_loss_clip": 1.3822639, + "balance_loss_mlp": 1.02575517, + "epoch": 0.1172403427025402, + "flos": 27464775154560.0, + "grad_norm": 1.5962900295590228, + "language_loss": 0.7364347, + "learning_rate": 3.920692733745835e-06, + "loss": 0.76328224, + "num_input_tokens_seen": 42286245, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.33251953, + "step": 1950, + "time_per_iteration": 2.89705491065979 + }, + { + "auxiliary_loss_clip": 0.01622045, + "auxiliary_loss_mlp": 0.01052963, + "balance_loss_clip": 1.373564, + "balance_loss_mlp": 1.01953697, + "epoch": 0.11730046595520818, + "flos": 15677028760320.0, + "grad_norm": 8.98494982383053, + "language_loss": 0.78145981, + "learning_rate": 3.920584111630755e-06, + "loss": 0.8082099, + "num_input_tokens_seen": 42302710, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.33447266, + "step": 1951, + "time_per_iteration": 2.841495990753174 + }, + { + "auxiliary_loss_clip": 0.01635878, + "auxiliary_loss_mlp": 0.01053754, + "balance_loss_clip": 1.38794291, + "balance_loss_mlp": 1.02218747, + "epoch": 0.11736058920787615, + "flos": 25641268220160.0, + "grad_norm": 4.680276844814568, + "language_loss": 0.77217746, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.79907382, + "num_input_tokens_seen": 42324115, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.31518555, + "step": 1952, + "time_per_iteration": 2.912318468093872 + }, + { + "auxiliary_loss_clip": 0.01638673, + "auxiliary_loss_mlp": 0.0105861, + "balance_loss_clip": 1.38808548, + "balance_loss_mlp": 1.02659059, + "epoch": 0.11742071246054411, + "flos": 21444302079360.0, + "grad_norm": 2.1966172253268934, + "language_loss": 0.73236883, + "learning_rate": 3.920366648918491e-06, + "loss": 0.7593416, + "num_input_tokens_seen": 42342505, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.3203125, + "step": 1953, + "time_per_iteration": 2.817875385284424 + }, + { + "auxiliary_loss_clip": 0.01636054, + "auxiliary_loss_mlp": 0.01056592, + "balance_loss_clip": 1.38251925, + "balance_loss_mlp": 1.02237892, + "epoch": 0.11748083571321208, + "flos": 16006430039040.0, + "grad_norm": 2.9283380922772073, + "language_loss": 0.81883478, + "learning_rate": 3.920257808329552e-06, + "loss": 0.84576124, + "num_input_tokens_seen": 42360525, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.3425293, + "step": 1954, + "time_per_iteration": 2.8646225929260254 + }, + { + "auxiliary_loss_clip": 0.01637084, + "auxiliary_loss_mlp": 0.0105813, + "balance_loss_clip": 1.38594925, + "balance_loss_mlp": 1.02401268, + "epoch": 0.11754095896588006, + "flos": 16188631424640.0, + "grad_norm": 1.8589066641785723, + "language_loss": 0.86869931, + "learning_rate": 3.920148894924246e-06, + "loss": 0.89565146, + "num_input_tokens_seen": 42377045, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.34106445, + "step": 1955, + "time_per_iteration": 2.7937235832214355 + }, + { + "auxiliary_loss_clip": 0.0164272, + "auxiliary_loss_mlp": 0.01057672, + "balance_loss_clip": 1.39204121, + "balance_loss_mlp": 1.02467513, + "epoch": 0.11760108221854802, + "flos": 13269518202240.0, + "grad_norm": 2.2919183555517275, + "language_loss": 0.78554517, + "learning_rate": 3.920039908706701e-06, + "loss": 0.81254905, + "num_input_tokens_seen": 42393960, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.32983398, + "step": 1956, + "time_per_iteration": 2.823009729385376 + }, + { + "auxiliary_loss_clip": 0.01612334, + "auxiliary_loss_mlp": 0.01057773, + "balance_loss_clip": 1.36899126, + "balance_loss_mlp": 1.02441812, + "epoch": 0.11766120547121599, + "flos": 24509076871680.0, + "grad_norm": 2.379460927913242, + "language_loss": 0.81788468, + "learning_rate": 3.91993084968105e-06, + "loss": 0.84458572, + "num_input_tokens_seen": 42413160, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.33325195, + "step": 1957, + "time_per_iteration": 2.8674838542938232 + }, + { + "auxiliary_loss_clip": 0.01639825, + "auxiliary_loss_mlp": 0.01055665, + "balance_loss_clip": 1.38950109, + "balance_loss_mlp": 1.02457559, + "epoch": 0.11772132872388397, + "flos": 17792627996160.0, + "grad_norm": 2.0853427656977224, + "language_loss": 0.79771483, + "learning_rate": 3.919821717851428e-06, + "loss": 0.82466972, + "num_input_tokens_seen": 42432590, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.31079102, + "step": 1958, + "time_per_iteration": 2.8826892375946045 + }, + { + "auxiliary_loss_clip": 0.0163205, + "auxiliary_loss_mlp": 0.01060318, + "balance_loss_clip": 1.38215148, + "balance_loss_mlp": 1.02393508, + "epoch": 0.11778145197655193, + "flos": 13222843551360.0, + "grad_norm": 1.6573814230741817, + "language_loss": 0.78356433, + "learning_rate": 3.919712513221976e-06, + "loss": 0.81048799, + "num_input_tokens_seen": 42450135, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.36425781, + "step": 1959, + "time_per_iteration": 2.8553364276885986 + }, + { + "auxiliary_loss_clip": 0.01634177, + "auxiliary_loss_mlp": 0.01052799, + "balance_loss_clip": 1.38609338, + "balance_loss_mlp": 1.0194447, + "epoch": 0.1178415752292199, + "flos": 20239890750720.0, + "grad_norm": 1.9989154594490788, + "language_loss": 0.71348727, + "learning_rate": 3.919603235796832e-06, + "loss": 0.74035698, + "num_input_tokens_seen": 42470050, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.33325195, + "step": 1960, + "time_per_iteration": 2.8179805278778076 + }, + { + "auxiliary_loss_clip": 0.01651076, + "auxiliary_loss_mlp": 0.01060945, + "balance_loss_clip": 1.39600825, + "balance_loss_mlp": 1.02911603, + "epoch": 0.11790169848188788, + "flos": 13047338396160.0, + "grad_norm": 2.307705689389334, + "language_loss": 0.82922328, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.85634351, + "num_input_tokens_seen": 42484335, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.31835938, + "step": 1961, + "time_per_iteration": 2.8357348442077637 + }, + { + "auxiliary_loss_clip": 0.01605067, + "auxiliary_loss_mlp": 0.01052452, + "balance_loss_clip": 1.36577058, + "balance_loss_mlp": 1.02055192, + "epoch": 0.11796182173455584, + "flos": 22273949577600.0, + "grad_norm": 1.8368248602544195, + "language_loss": 0.93396556, + "learning_rate": 3.919384462576049e-06, + "loss": 0.96054077, + "num_input_tokens_seen": 42502720, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.3190918, + "step": 1962, + "time_per_iteration": 2.8189504146575928 + }, + { + "auxiliary_loss_clip": 0.01638184, + "auxiliary_loss_mlp": 0.01056928, + "balance_loss_clip": 1.38833714, + "balance_loss_mlp": 1.02347755, + "epoch": 0.1180219449872238, + "flos": 10642361546880.0, + "grad_norm": 2.2350262647410433, + "language_loss": 0.8869524, + "learning_rate": 3.919274966788707e-06, + "loss": 0.91390359, + "num_input_tokens_seen": 42519460, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.33422852, + "step": 1963, + "time_per_iteration": 2.824310302734375 + }, + { + "auxiliary_loss_clip": 0.01649931, + "auxiliary_loss_mlp": 0.01056923, + "balance_loss_clip": 1.39766192, + "balance_loss_mlp": 1.02564216, + "epoch": 0.11808206823989177, + "flos": 20933423331840.0, + "grad_norm": 1.91263783886209, + "language_loss": 0.85249692, + "learning_rate": 3.919165398222265e-06, + "loss": 0.87956548, + "num_input_tokens_seen": 42539420, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.3125, + "step": 1964, + "time_per_iteration": 2.81848406791687 + }, + { + "auxiliary_loss_clip": 0.01646597, + "auxiliary_loss_mlp": 0.01054888, + "balance_loss_clip": 1.39873147, + "balance_loss_mlp": 1.02358377, + "epoch": 0.11814219149255975, + "flos": 20787671272320.0, + "grad_norm": 1.7930537874067154, + "language_loss": 0.84301108, + "learning_rate": 3.919055756880879e-06, + "loss": 0.87002587, + "num_input_tokens_seen": 42558225, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.31323242, + "step": 1965, + "time_per_iteration": 2.8481717109680176 + }, + { + "auxiliary_loss_clip": 0.01633826, + "auxiliary_loss_mlp": 0.01059943, + "balance_loss_clip": 1.38282156, + "balance_loss_mlp": 1.02799535, + "epoch": 0.11820231474522772, + "flos": 48776473428480.0, + "grad_norm": 1.8864015839202002, + "language_loss": 0.75510311, + "learning_rate": 3.918946042768707e-06, + "loss": 0.78204083, + "num_input_tokens_seen": 42580790, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.31933594, + "step": 1966, + "time_per_iteration": 3.1544883251190186 + }, + { + "auxiliary_loss_clip": 0.01639138, + "auxiliary_loss_mlp": 0.01060867, + "balance_loss_clip": 1.38861752, + "balance_loss_mlp": 1.02856112, + "epoch": 0.11826243799789568, + "flos": 16699238703360.0, + "grad_norm": 2.4480582315859434, + "language_loss": 0.74822688, + "learning_rate": 3.918836255889908e-06, + "loss": 0.77522695, + "num_input_tokens_seen": 42597355, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.32324219, + "step": 1967, + "time_per_iteration": 4.176625728607178 + }, + { + "auxiliary_loss_clip": 0.01621732, + "auxiliary_loss_mlp": 0.01050274, + "balance_loss_clip": 1.37471867, + "balance_loss_mlp": 1.01853991, + "epoch": 0.11832256125056366, + "flos": 16918658576640.0, + "grad_norm": 2.718382355311837, + "language_loss": 0.90111315, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.9278332, + "num_input_tokens_seen": 42616060, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.31713867, + "step": 1968, + "time_per_iteration": 2.834907054901123 + }, + { + "auxiliary_loss_clip": 0.01623874, + "auxiliary_loss_mlp": 0.01050778, + "balance_loss_clip": 1.37882495, + "balance_loss_mlp": 1.01961637, + "epoch": 0.11838268450323162, + "flos": 22830688569600.0, + "grad_norm": 1.8599788077303476, + "language_loss": 0.6822868, + "learning_rate": 3.918616463849087e-06, + "loss": 0.70903331, + "num_input_tokens_seen": 42636285, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.31201172, + "step": 1969, + "time_per_iteration": 2.882693290710449 + }, + { + "auxiliary_loss_clip": 0.01630633, + "auxiliary_loss_mlp": 0.01053524, + "balance_loss_clip": 1.38541651, + "balance_loss_mlp": 1.02119482, + "epoch": 0.11844280775589959, + "flos": 33558554085120.0, + "grad_norm": 2.9259227076254173, + "language_loss": 0.82404208, + "learning_rate": 3.918506458695399e-06, + "loss": 0.8508836, + "num_input_tokens_seen": 42658320, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.32299805, + "step": 1970, + "time_per_iteration": 2.9069132804870605 + }, + { + "auxiliary_loss_clip": 0.01391961, + "auxiliary_loss_mlp": 0.01124374, + "balance_loss_clip": 1.2440691, + "balance_loss_mlp": 1.10377443, + "epoch": 0.11850293100856757, + "flos": 66382845989760.0, + "grad_norm": 0.8170064597394192, + "language_loss": 0.66161942, + "learning_rate": 3.918396380791754e-06, + "loss": 0.68678284, + "num_input_tokens_seen": 42721500, + "router_z_loss_clip": 1.4765625, + "router_z_loss_mlp": 0.20605469, + "step": 1971, + "time_per_iteration": 3.4016199111938477 + }, + { + "auxiliary_loss_clip": 0.01647161, + "auxiliary_loss_mlp": 0.01054808, + "balance_loss_clip": 1.39633191, + "balance_loss_mlp": 1.0224545, + "epoch": 0.11856305426123553, + "flos": 24691775950080.0, + "grad_norm": 3.2765403030292495, + "language_loss": 0.80649674, + "learning_rate": 3.918286230142327e-06, + "loss": 0.83351636, + "num_input_tokens_seen": 42739825, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.32373047, + "step": 1972, + "time_per_iteration": 2.84053897857666 + }, + { + "auxiliary_loss_clip": 0.01625499, + "auxiliary_loss_mlp": 0.01054747, + "balance_loss_clip": 1.38142276, + "balance_loss_mlp": 1.02289391, + "epoch": 0.1186231775139035, + "flos": 24290833363200.0, + "grad_norm": 5.542512978830529, + "language_loss": 0.74306273, + "learning_rate": 3.918176006751292e-06, + "loss": 0.76986516, + "num_input_tokens_seen": 42758695, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.31835938, + "step": 1973, + "time_per_iteration": 4.197857618331909 + }, + { + "auxiliary_loss_clip": 0.0162081, + "auxiliary_loss_mlp": 0.01045268, + "balance_loss_clip": 1.37907052, + "balance_loss_mlp": 1.01583564, + "epoch": 0.11868330076657148, + "flos": 21766147476480.0, + "grad_norm": 1.6757297008400915, + "language_loss": 0.73398846, + "learning_rate": 3.918065710622832e-06, + "loss": 0.76064926, + "num_input_tokens_seen": 42778510, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.29431152, + "step": 1974, + "time_per_iteration": 2.897937774658203 + }, + { + "auxiliary_loss_clip": 0.01629611, + "auxiliary_loss_mlp": 0.01048469, + "balance_loss_clip": 1.3839463, + "balance_loss_mlp": 1.01809418, + "epoch": 0.11874342401923944, + "flos": 17199937370880.0, + "grad_norm": 5.865056289949529, + "language_loss": 0.79432863, + "learning_rate": 3.917955341761128e-06, + "loss": 0.82110941, + "num_input_tokens_seen": 42793995, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.3034668, + "step": 1975, + "time_per_iteration": 5.693732500076294 + }, + { + "auxiliary_loss_clip": 0.01622737, + "auxiliary_loss_mlp": 0.01052649, + "balance_loss_clip": 1.38310981, + "balance_loss_mlp": 1.02139246, + "epoch": 0.11880354727190741, + "flos": 15237419852160.0, + "grad_norm": 7.114599470975693, + "language_loss": 0.76920015, + "learning_rate": 3.917844900170364e-06, + "loss": 0.79595399, + "num_input_tokens_seen": 42809000, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.31225586, + "step": 1976, + "time_per_iteration": 2.8625831604003906 + }, + { + "auxiliary_loss_clip": 0.01632616, + "auxiliary_loss_mlp": 0.01048267, + "balance_loss_clip": 1.38764226, + "balance_loss_mlp": 1.01648545, + "epoch": 0.11886367052457537, + "flos": 27321556803840.0, + "grad_norm": 1.7946953498693983, + "language_loss": 0.76112443, + "learning_rate": 3.91773438585473e-06, + "loss": 0.78793323, + "num_input_tokens_seen": 42831585, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.31762695, + "step": 1977, + "time_per_iteration": 2.897394895553589 + }, + { + "auxiliary_loss_clip": 0.01643081, + "auxiliary_loss_mlp": 0.01056974, + "balance_loss_clip": 1.39340413, + "balance_loss_mlp": 1.02648067, + "epoch": 0.11892379377724335, + "flos": 21808161912960.0, + "grad_norm": 2.2135717929174605, + "language_loss": 0.75168341, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.77868396, + "num_input_tokens_seen": 42848420, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.30517578, + "step": 1978, + "time_per_iteration": 2.8633077144622803 + }, + { + "auxiliary_loss_clip": 0.01631046, + "auxiliary_loss_mlp": 0.01054735, + "balance_loss_clip": 1.38814652, + "balance_loss_mlp": 1.02405024, + "epoch": 0.11898391702991132, + "flos": 13999047661440.0, + "grad_norm": 1.9443553269737481, + "language_loss": 0.74152172, + "learning_rate": 3.917513139065616e-06, + "loss": 0.76837951, + "num_input_tokens_seen": 42866645, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.30688477, + "step": 1979, + "time_per_iteration": 2.8243184089660645 + }, + { + "auxiliary_loss_clip": 0.01634357, + "auxiliary_loss_mlp": 0.01058523, + "balance_loss_clip": 1.38964629, + "balance_loss_mlp": 1.02779078, + "epoch": 0.11904404028257928, + "flos": 32247509466240.0, + "grad_norm": 1.6316963206858905, + "language_loss": 0.99888003, + "learning_rate": 3.917402406600525e-06, + "loss": 1.02580893, + "num_input_tokens_seen": 42888515, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.30688477, + "step": 1980, + "time_per_iteration": 2.959196090698242 + }, + { + "auxiliary_loss_clip": 0.01649781, + "auxiliary_loss_mlp": 0.01057721, + "balance_loss_clip": 1.39881849, + "balance_loss_mlp": 1.0243659, + "epoch": 0.11910416353524726, + "flos": 23596667354880.0, + "grad_norm": 1.649537440218896, + "language_loss": 0.86876345, + "learning_rate": 3.917291601427342e-06, + "loss": 0.89583844, + "num_input_tokens_seen": 42909035, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.33349609, + "step": 1981, + "time_per_iteration": 2.8342201709747314 + }, + { + "auxiliary_loss_clip": 0.0163372, + "auxiliary_loss_mlp": 0.01054004, + "balance_loss_clip": 1.38758159, + "balance_loss_mlp": 1.02191246, + "epoch": 0.11916428678791523, + "flos": 25342705912320.0, + "grad_norm": 1.7026565616149083, + "language_loss": 0.86245465, + "learning_rate": 3.91718072355027e-06, + "loss": 0.88933194, + "num_input_tokens_seen": 42927555, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.32055664, + "step": 1982, + "time_per_iteration": 2.9098153114318848 + }, + { + "auxiliary_loss_clip": 0.01635011, + "auxiliary_loss_mlp": 0.0104871, + "balance_loss_clip": 1.39207435, + "balance_loss_mlp": 1.01854992, + "epoch": 0.11922441004058319, + "flos": 19796978972160.0, + "grad_norm": 1.8571862911600066, + "language_loss": 0.8615942, + "learning_rate": 3.917069772973513e-06, + "loss": 0.88843137, + "num_input_tokens_seen": 42945300, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.30126953, + "step": 1983, + "time_per_iteration": 2.839355707168579 + }, + { + "auxiliary_loss_clip": 0.016441, + "auxiliary_loss_mlp": 0.01051902, + "balance_loss_clip": 1.39477718, + "balance_loss_mlp": 1.0200491, + "epoch": 0.11928453329325117, + "flos": 21544465363200.0, + "grad_norm": 2.546085826451954, + "language_loss": 0.78706074, + "learning_rate": 3.916958749701277e-06, + "loss": 0.81402075, + "num_input_tokens_seen": 42961295, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.31860352, + "step": 1984, + "time_per_iteration": 2.7857542037963867 + }, + { + "auxiliary_loss_clip": 0.01646715, + "auxiliary_loss_mlp": 0.01052873, + "balance_loss_clip": 1.3984108, + "balance_loss_mlp": 1.02445304, + "epoch": 0.11934465654591914, + "flos": 20825115984000.0, + "grad_norm": 1.7074737498949688, + "language_loss": 0.84247816, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.86947405, + "num_input_tokens_seen": 42980330, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.28442383, + "step": 1985, + "time_per_iteration": 2.861555337905884 + }, + { + "auxiliary_loss_clip": 0.01629966, + "auxiliary_loss_mlp": 0.01054774, + "balance_loss_clip": 1.38519251, + "balance_loss_mlp": 1.0246141, + "epoch": 0.1194047797985871, + "flos": 19069304549760.0, + "grad_norm": 2.000799394102718, + "language_loss": 0.75199908, + "learning_rate": 3.916736485087216e-06, + "loss": 0.77884638, + "num_input_tokens_seen": 42996125, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.30151367, + "step": 1986, + "time_per_iteration": 2.7705626487731934 + }, + { + "auxiliary_loss_clip": 0.01641384, + "auxiliary_loss_mlp": 0.01057417, + "balance_loss_clip": 1.39512956, + "balance_loss_mlp": 1.02692342, + "epoch": 0.11946490305125507, + "flos": 27200173708800.0, + "grad_norm": 1.783154624070136, + "language_loss": 0.73950398, + "learning_rate": 3.916625243753819e-06, + "loss": 0.76649207, + "num_input_tokens_seen": 43014180, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.3046875, + "step": 1987, + "time_per_iteration": 2.9006142616271973 + }, + { + "auxiliary_loss_clip": 0.0164535, + "auxiliary_loss_mlp": 0.01056965, + "balance_loss_clip": 1.39704418, + "balance_loss_mlp": 1.02587557, + "epoch": 0.11952502630392305, + "flos": 21150309496320.0, + "grad_norm": 3.21165335386776, + "language_loss": 0.73814934, + "learning_rate": 3.916513929741799e-06, + "loss": 0.76517254, + "num_input_tokens_seen": 43032120, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.31103516, + "step": 1988, + "time_per_iteration": 2.8191041946411133 + }, + { + "auxiliary_loss_clip": 0.01634008, + "auxiliary_loss_mlp": 0.01057791, + "balance_loss_clip": 1.39064336, + "balance_loss_mlp": 1.02710629, + "epoch": 0.11958514955659101, + "flos": 22133581649280.0, + "grad_norm": 1.7524285138265456, + "language_loss": 0.81368124, + "learning_rate": 3.91640254305538e-06, + "loss": 0.84059918, + "num_input_tokens_seen": 43052215, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.30664062, + "step": 1989, + "time_per_iteration": 2.9077816009521484 + }, + { + "auxiliary_loss_clip": 0.01641137, + "auxiliary_loss_mlp": 0.01057548, + "balance_loss_clip": 1.39341569, + "balance_loss_mlp": 1.02679229, + "epoch": 0.11964527280925898, + "flos": 17430577954560.0, + "grad_norm": 2.18717446910425, + "language_loss": 0.77796072, + "learning_rate": 3.916291083698784e-06, + "loss": 0.80494756, + "num_input_tokens_seen": 43069720, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.30810547, + "step": 1990, + "time_per_iteration": 2.853093147277832 + }, + { + "auxiliary_loss_clip": 0.01415727, + "auxiliary_loss_mlp": 0.01046463, + "balance_loss_clip": 1.26876545, + "balance_loss_mlp": 1.02462375, + "epoch": 0.11970539606192696, + "flos": 70709321306880.0, + "grad_norm": 0.8554876932194154, + "language_loss": 0.55336595, + "learning_rate": 3.916179551676238e-06, + "loss": 0.57798779, + "num_input_tokens_seen": 43123130, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.21875, + "step": 1991, + "time_per_iteration": 3.3667848110198975 + }, + { + "auxiliary_loss_clip": 0.01624975, + "auxiliary_loss_mlp": 0.01054974, + "balance_loss_clip": 1.38706458, + "balance_loss_mlp": 1.02507663, + "epoch": 0.11976551931459492, + "flos": 21224882206080.0, + "grad_norm": 2.15023502285844, + "language_loss": 0.79853141, + "learning_rate": 3.916067946991971e-06, + "loss": 0.82533091, + "num_input_tokens_seen": 43140015, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.29882812, + "step": 1992, + "time_per_iteration": 2.8614344596862793 + }, + { + "auxiliary_loss_clip": 0.01641222, + "auxiliary_loss_mlp": 0.01055345, + "balance_loss_clip": 1.39407158, + "balance_loss_mlp": 1.02504253, + "epoch": 0.11982564256726289, + "flos": 25999020005760.0, + "grad_norm": 1.6427895646402095, + "language_loss": 0.79822439, + "learning_rate": 3.915956269650216e-06, + "loss": 0.82519007, + "num_input_tokens_seen": 43160105, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.30273438, + "step": 1993, + "time_per_iteration": 2.896195888519287 + }, + { + "auxiliary_loss_clip": 0.01634601, + "auxiliary_loss_mlp": 0.01050249, + "balance_loss_clip": 1.39066386, + "balance_loss_mlp": 1.02116179, + "epoch": 0.11988576581993086, + "flos": 21660328592640.0, + "grad_norm": 1.9588540730003674, + "language_loss": 0.83678639, + "learning_rate": 3.915844519655208e-06, + "loss": 0.86363494, + "num_input_tokens_seen": 43179835, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.29101562, + "step": 1994, + "time_per_iteration": 2.894418478012085 + }, + { + "auxiliary_loss_clip": 0.0163142, + "auxiliary_loss_mlp": 0.01056538, + "balance_loss_clip": 1.3920325, + "balance_loss_mlp": 1.02547216, + "epoch": 0.11994588907259883, + "flos": 17866160075520.0, + "grad_norm": 2.722007902691198, + "language_loss": 0.90140629, + "learning_rate": 3.915732697011183e-06, + "loss": 0.92828584, + "num_input_tokens_seen": 43197210, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.31054688, + "step": 1995, + "time_per_iteration": 2.8299598693847656 + }, + { + "auxiliary_loss_clip": 0.01631871, + "auxiliary_loss_mlp": 0.01049652, + "balance_loss_clip": 1.38650286, + "balance_loss_mlp": 1.01899135, + "epoch": 0.1200060123252668, + "flos": 24473577686400.0, + "grad_norm": 2.3139917583602885, + "language_loss": 0.74968302, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.7764982, + "num_input_tokens_seen": 43215050, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.30664062, + "step": 1996, + "time_per_iteration": 2.850292444229126 + }, + { + "auxiliary_loss_clip": 0.01628128, + "auxiliary_loss_mlp": 0.01047788, + "balance_loss_clip": 1.38765502, + "balance_loss_mlp": 1.01667428, + "epoch": 0.12006613557793476, + "flos": 18740943901440.0, + "grad_norm": 1.7770349507597787, + "language_loss": 0.88840735, + "learning_rate": 3.915508833793048e-06, + "loss": 0.91516656, + "num_input_tokens_seen": 43233900, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.31103516, + "step": 1997, + "time_per_iteration": 2.813540458679199 + }, + { + "auxiliary_loss_clip": 0.0161829, + "auxiliary_loss_mlp": 0.01051624, + "balance_loss_clip": 1.37703943, + "balance_loss_mlp": 1.0202477, + "epoch": 0.12012625883060274, + "flos": 22276483286400.0, + "grad_norm": 1.8488331638574855, + "language_loss": 0.79832798, + "learning_rate": 3.915396793227428e-06, + "loss": 0.82502711, + "num_input_tokens_seen": 43252105, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.31347656, + "step": 1998, + "time_per_iteration": 2.817448616027832 + }, + { + "auxiliary_loss_clip": 0.01618624, + "auxiliary_loss_mlp": 0.01049176, + "balance_loss_clip": 1.38106346, + "balance_loss_mlp": 1.01882505, + "epoch": 0.1201863820832707, + "flos": 21768002513280.0, + "grad_norm": 1.7128665612678604, + "language_loss": 0.74748743, + "learning_rate": 3.915284680029769e-06, + "loss": 0.77416539, + "num_input_tokens_seen": 43270315, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.30322266, + "step": 1999, + "time_per_iteration": 2.8867669105529785 + }, + { + "auxiliary_loss_clip": 0.01630135, + "auxiliary_loss_mlp": 0.01050208, + "balance_loss_clip": 1.38606238, + "balance_loss_mlp": 1.02069163, + "epoch": 0.12024650533593867, + "flos": 21917962339200.0, + "grad_norm": 3.608338786650729, + "language_loss": 0.77667332, + "learning_rate": 3.915172494204323e-06, + "loss": 0.80347681, + "num_input_tokens_seen": 43289935, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.29516602, + "step": 2000, + "time_per_iteration": 2.8472232818603516 + }, + { + "auxiliary_loss_clip": 0.01617985, + "auxiliary_loss_mlp": 0.01049975, + "balance_loss_clip": 1.37765408, + "balance_loss_mlp": 1.01931405, + "epoch": 0.12030662858860665, + "flos": 21699175893120.0, + "grad_norm": 1.4814596738643855, + "language_loss": 0.85903108, + "learning_rate": 3.915060235755344e-06, + "loss": 0.88571066, + "num_input_tokens_seen": 43309325, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.30664062, + "step": 2001, + "time_per_iteration": 4.317075967788696 + }, + { + "auxiliary_loss_clip": 0.01617357, + "auxiliary_loss_mlp": 0.0104683, + "balance_loss_clip": 1.37809515, + "balance_loss_mlp": 1.01702738, + "epoch": 0.12036675184127461, + "flos": 12941067064320.0, + "grad_norm": 3.7620045920970493, + "language_loss": 0.75965548, + "learning_rate": 3.91494790468709e-06, + "loss": 0.78629732, + "num_input_tokens_seen": 43327010, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.29785156, + "step": 2002, + "time_per_iteration": 2.8462488651275635 + }, + { + "auxiliary_loss_clip": 0.01637492, + "auxiliary_loss_mlp": 0.0105935, + "balance_loss_clip": 1.3893249, + "balance_loss_mlp": 1.02804542, + "epoch": 0.12042687509394258, + "flos": 20861022372480.0, + "grad_norm": 2.3809941058326247, + "language_loss": 0.80095649, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.82792497, + "num_input_tokens_seen": 43345650, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.31298828, + "step": 2003, + "time_per_iteration": 2.826965093612671 + }, + { + "auxiliary_loss_clip": 0.01614765, + "auxiliary_loss_mlp": 0.01056053, + "balance_loss_clip": 1.37722647, + "balance_loss_mlp": 1.02412868, + "epoch": 0.12048699834661056, + "flos": 23889076369920.0, + "grad_norm": 1.8262103782388928, + "language_loss": 0.73449266, + "learning_rate": 3.914723024709793e-06, + "loss": 0.7612009, + "num_input_tokens_seen": 43365555, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.3190918, + "step": 2004, + "time_per_iteration": 2.8599326610565186 + }, + { + "auxiliary_loss_clip": 0.01637994, + "auxiliary_loss_mlp": 0.01060316, + "balance_loss_clip": 1.39074266, + "balance_loss_mlp": 1.02805781, + "epoch": 0.12054712159927852, + "flos": 19766049511680.0, + "grad_norm": 1.5702345220040477, + "language_loss": 0.7960068, + "learning_rate": 3.914610475809279e-06, + "loss": 0.82298988, + "num_input_tokens_seen": 43384990, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.32250977, + "step": 2005, + "time_per_iteration": 2.8613545894622803 + }, + { + "auxiliary_loss_clip": 0.01410898, + "auxiliary_loss_mlp": 0.01107105, + "balance_loss_clip": 1.2707485, + "balance_loss_mlp": 1.08393121, + "epoch": 0.12060724485194649, + "flos": 51697957363200.0, + "grad_norm": 1.044777046042512, + "language_loss": 0.58150733, + "learning_rate": 3.914497854306543e-06, + "loss": 0.60668731, + "num_input_tokens_seen": 43436335, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.23144531, + "step": 2006, + "time_per_iteration": 3.1565887928009033 + }, + { + "auxiliary_loss_clip": 0.01612811, + "auxiliary_loss_mlp": 0.01057133, + "balance_loss_clip": 1.37726843, + "balance_loss_mlp": 1.022753, + "epoch": 0.12066736810461445, + "flos": 18999980236800.0, + "grad_norm": 1.6405931835173404, + "language_loss": 0.77916747, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.8058669, + "num_input_tokens_seen": 43456495, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.34375, + "step": 2007, + "time_per_iteration": 2.8739397525787354 + }, + { + "auxiliary_loss_clip": 0.01617659, + "auxiliary_loss_mlp": 0.01054622, + "balance_loss_clip": 1.37417841, + "balance_loss_mlp": 1.02279305, + "epoch": 0.12072749135728243, + "flos": 16480633236480.0, + "grad_norm": 5.135065103620375, + "language_loss": 0.85593915, + "learning_rate": 3.914272393511494e-06, + "loss": 0.88266194, + "num_input_tokens_seen": 43473085, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.31811523, + "step": 2008, + "time_per_iteration": 4.412937164306641 + }, + { + "auxiliary_loss_clip": 0.01607496, + "auxiliary_loss_mlp": 0.01061741, + "balance_loss_clip": 1.36626005, + "balance_loss_mlp": 1.02817178, + "epoch": 0.1207876146099504, + "flos": 18086122886400.0, + "grad_norm": 1.982377361588975, + "language_loss": 0.8492341, + "learning_rate": 3.91415955422773e-06, + "loss": 0.87592643, + "num_input_tokens_seen": 43491135, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.33544922, + "step": 2009, + "time_per_iteration": 2.8293211460113525 + }, + { + "auxiliary_loss_clip": 0.01594824, + "auxiliary_loss_mlp": 0.01055255, + "balance_loss_clip": 1.35961223, + "balance_loss_mlp": 1.02440405, + "epoch": 0.12084773786261836, + "flos": 21881558257920.0, + "grad_norm": 1.8519385814977776, + "language_loss": 0.85227579, + "learning_rate": 3.914046642358844e-06, + "loss": 0.87877661, + "num_input_tokens_seen": 43510440, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.30859375, + "step": 2010, + "time_per_iteration": 5.669464826583862 + }, + { + "auxiliary_loss_clip": 0.01608427, + "auxiliary_loss_mlp": 0.01059629, + "balance_loss_clip": 1.36823368, + "balance_loss_mlp": 1.02715611, + "epoch": 0.12090786111528634, + "flos": 18342263554560.0, + "grad_norm": 3.3810644587327103, + "language_loss": 0.85478783, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.88146842, + "num_input_tokens_seen": 43530145, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.32446289, + "step": 2011, + "time_per_iteration": 2.8427391052246094 + }, + { + "auxiliary_loss_clip": 0.01613522, + "auxiliary_loss_mlp": 0.0105387, + "balance_loss_clip": 1.3719672, + "balance_loss_mlp": 1.02299452, + "epoch": 0.1209679843679543, + "flos": 21115760451840.0, + "grad_norm": 2.5393634901421636, + "language_loss": 0.97459733, + "learning_rate": 3.913820600882834e-06, + "loss": 1.00127125, + "num_input_tokens_seen": 43549315, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.30834961, + "step": 2012, + "time_per_iteration": 2.8172810077667236 + }, + { + "auxiliary_loss_clip": 0.01595965, + "auxiliary_loss_mlp": 0.01050823, + "balance_loss_clip": 1.36152864, + "balance_loss_mlp": 1.01877987, + "epoch": 0.12102810762062227, + "flos": 29252782903680.0, + "grad_norm": 1.8472118826441335, + "language_loss": 0.8209306, + "learning_rate": 3.913707471284283e-06, + "loss": 0.84739846, + "num_input_tokens_seen": 43569240, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.32006836, + "step": 2013, + "time_per_iteration": 2.943225860595703 + }, + { + "auxiliary_loss_clip": 0.01616816, + "auxiliary_loss_mlp": 0.01057815, + "balance_loss_clip": 1.37174797, + "balance_loss_mlp": 1.02639174, + "epoch": 0.12108823087329025, + "flos": 17939556420480.0, + "grad_norm": 3.2206260927798245, + "language_loss": 0.78378904, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.81053543, + "num_input_tokens_seen": 43587710, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.31396484, + "step": 2014, + "time_per_iteration": 2.8425710201263428 + }, + { + "auxiliary_loss_clip": 0.01613316, + "auxiliary_loss_mlp": 0.01060558, + "balance_loss_clip": 1.37494349, + "balance_loss_mlp": 1.03015924, + "epoch": 0.12114835412595822, + "flos": 22102244985600.0, + "grad_norm": 2.7961281310746084, + "language_loss": 0.88378966, + "learning_rate": 3.913480994387535e-06, + "loss": 0.9105283, + "num_input_tokens_seen": 43606000, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.3034668, + "step": 2015, + "time_per_iteration": 2.8273189067840576 + }, + { + "auxiliary_loss_clip": 0.01594537, + "auxiliary_loss_mlp": 0.01056396, + "balance_loss_clip": 1.35861802, + "balance_loss_mlp": 1.02502012, + "epoch": 0.12120847737862618, + "flos": 20422001646720.0, + "grad_norm": 1.9284370533913628, + "language_loss": 0.70881188, + "learning_rate": 3.913367647097926e-06, + "loss": 0.73532116, + "num_input_tokens_seen": 43624815, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.3137207, + "step": 2016, + "time_per_iteration": 2.8837335109710693 + }, + { + "auxiliary_loss_clip": 0.01620071, + "auxiliary_loss_mlp": 0.01062108, + "balance_loss_clip": 1.37646377, + "balance_loss_mlp": 1.03068495, + "epoch": 0.12126860063129415, + "flos": 22319040660480.0, + "grad_norm": 2.372933768031466, + "language_loss": 0.82272518, + "learning_rate": 3.913254227253225e-06, + "loss": 0.84954691, + "num_input_tokens_seen": 43643960, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.3145752, + "step": 2017, + "time_per_iteration": 2.824951410293579 + }, + { + "auxiliary_loss_clip": 0.0161418, + "auxiliary_loss_mlp": 0.01066063, + "balance_loss_clip": 1.37149048, + "balance_loss_mlp": 1.03597403, + "epoch": 0.12132872388396213, + "flos": 13707272073600.0, + "grad_norm": 2.382223348942275, + "language_loss": 0.71578526, + "learning_rate": 3.913140734857731e-06, + "loss": 0.74258775, + "num_input_tokens_seen": 43662650, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.30065918, + "step": 2018, + "time_per_iteration": 2.850609064102173 + }, + { + "auxiliary_loss_clip": 0.01607876, + "auxiliary_loss_mlp": 0.01059474, + "balance_loss_clip": 1.36720335, + "balance_loss_mlp": 1.02955282, + "epoch": 0.12138884713663009, + "flos": 26477657193600.0, + "grad_norm": 1.6655361279672307, + "language_loss": 0.73499447, + "learning_rate": 3.91302716991575e-06, + "loss": 0.76166797, + "num_input_tokens_seen": 43684205, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.29931641, + "step": 2019, + "time_per_iteration": 2.867161989212036 + }, + { + "auxiliary_loss_clip": 0.01609412, + "auxiliary_loss_mlp": 0.01063381, + "balance_loss_clip": 1.36775088, + "balance_loss_mlp": 1.0324347, + "epoch": 0.12144897038929806, + "flos": 26153006618880.0, + "grad_norm": 2.16267773991476, + "language_loss": 0.93590027, + "learning_rate": 3.912913532431586e-06, + "loss": 0.96262825, + "num_input_tokens_seen": 43706320, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.30957031, + "step": 2020, + "time_per_iteration": 2.9060051441192627 + }, + { + "auxiliary_loss_clip": 0.01619783, + "auxiliary_loss_mlp": 0.01056743, + "balance_loss_clip": 1.3786931, + "balance_loss_mlp": 1.02694094, + "epoch": 0.12150909364196603, + "flos": 24728270520960.0, + "grad_norm": 1.8179579846040788, + "language_loss": 0.7910918, + "learning_rate": 3.912799822409549e-06, + "loss": 0.81785703, + "num_input_tokens_seen": 43724805, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.29785156, + "step": 2021, + "time_per_iteration": 2.890885353088379 + }, + { + "auxiliary_loss_clip": 0.0160521, + "auxiliary_loss_mlp": 0.01059462, + "balance_loss_clip": 1.36697638, + "balance_loss_mlp": 1.02880132, + "epoch": 0.121569216894634, + "flos": 25196591894400.0, + "grad_norm": 1.9627962233383696, + "language_loss": 0.81516278, + "learning_rate": 3.912686039853952e-06, + "loss": 0.84180951, + "num_input_tokens_seen": 43742320, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.30664062, + "step": 2022, + "time_per_iteration": 2.8331868648529053 + }, + { + "auxiliary_loss_clip": 0.01616074, + "auxiliary_loss_mlp": 0.01061996, + "balance_loss_clip": 1.37334764, + "balance_loss_mlp": 1.03164577, + "epoch": 0.12162934014730196, + "flos": 13452081546240.0, + "grad_norm": 1.6948097649401113, + "language_loss": 0.85581809, + "learning_rate": 3.912572184769108e-06, + "loss": 0.88259876, + "num_input_tokens_seen": 43760665, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.3034668, + "step": 2023, + "time_per_iteration": 2.9060311317443848 + }, + { + "auxiliary_loss_clip": 0.01623903, + "auxiliary_loss_mlp": 0.01060799, + "balance_loss_clip": 1.37965965, + "balance_loss_mlp": 1.03093696, + "epoch": 0.12168946339996994, + "flos": 16954881678720.0, + "grad_norm": 2.8929058913279957, + "language_loss": 0.88144171, + "learning_rate": 3.912458257159335e-06, + "loss": 0.90828872, + "num_input_tokens_seen": 43779020, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.29846191, + "step": 2024, + "time_per_iteration": 2.8102779388427734 + }, + { + "auxiliary_loss_clip": 0.0160577, + "auxiliary_loss_mlp": 0.01058922, + "balance_loss_clip": 1.36631131, + "balance_loss_mlp": 1.02854729, + "epoch": 0.12174958665263791, + "flos": 29832624005760.0, + "grad_norm": 3.009166264133383, + "language_loss": 0.73268396, + "learning_rate": 3.912344257028954e-06, + "loss": 0.75933087, + "num_input_tokens_seen": 43798850, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.3034668, + "step": 2025, + "time_per_iteration": 2.883385419845581 + }, + { + "auxiliary_loss_clip": 0.01610539, + "auxiliary_loss_mlp": 0.01063539, + "balance_loss_clip": 1.37072647, + "balance_loss_mlp": 1.0303514, + "epoch": 0.12180970990530587, + "flos": 24651933264000.0, + "grad_norm": 1.583315508506995, + "language_loss": 0.7689324, + "learning_rate": 3.912230184382286e-06, + "loss": 0.79567319, + "num_input_tokens_seen": 43820130, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.33178711, + "step": 2026, + "time_per_iteration": 2.904548168182373 + }, + { + "auxiliary_loss_clip": 0.01614673, + "auxiliary_loss_mlp": 0.01060934, + "balance_loss_clip": 1.37188292, + "balance_loss_mlp": 1.02951002, + "epoch": 0.12186983315797385, + "flos": 20531349624960.0, + "grad_norm": 2.494158278865927, + "language_loss": 0.9006424, + "learning_rate": 3.912116039223659e-06, + "loss": 0.92739856, + "num_input_tokens_seen": 43838485, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.31396484, + "step": 2027, + "time_per_iteration": 2.8451380729675293 + }, + { + "auxiliary_loss_clip": 0.01594947, + "auxiliary_loss_mlp": 0.01062326, + "balance_loss_clip": 1.35983729, + "balance_loss_mlp": 1.03156948, + "epoch": 0.12192995641064182, + "flos": 27829766108160.0, + "grad_norm": 1.5757892421732966, + "language_loss": 0.76913273, + "learning_rate": 3.912001821557399e-06, + "loss": 0.79570544, + "num_input_tokens_seen": 43859080, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.30761719, + "step": 2028, + "time_per_iteration": 2.9301092624664307 + }, + { + "auxiliary_loss_clip": 0.01603348, + "auxiliary_loss_mlp": 0.01068738, + "balance_loss_clip": 1.36393785, + "balance_loss_mlp": 1.03657544, + "epoch": 0.12199007966330978, + "flos": 22027038848640.0, + "grad_norm": 1.8307123857150265, + "language_loss": 0.7841711, + "learning_rate": 3.911887531387839e-06, + "loss": 0.81089199, + "num_input_tokens_seen": 43879030, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.3215332, + "step": 2029, + "time_per_iteration": 2.880915403366089 + }, + { + "auxiliary_loss_clip": 0.01611125, + "auxiliary_loss_mlp": 0.01065045, + "balance_loss_clip": 1.37028217, + "balance_loss_mlp": 1.03281116, + "epoch": 0.12205020291597775, + "flos": 23305751418240.0, + "grad_norm": 1.905353357031078, + "language_loss": 0.80856109, + "learning_rate": 3.911773168719313e-06, + "loss": 0.83532274, + "num_input_tokens_seen": 43898505, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.32250977, + "step": 2030, + "time_per_iteration": 2.9699089527130127 + }, + { + "auxiliary_loss_clip": 0.01590415, + "auxiliary_loss_mlp": 0.01062462, + "balance_loss_clip": 1.35463321, + "balance_loss_mlp": 1.02953613, + "epoch": 0.12211032616864573, + "flos": 26042844234240.0, + "grad_norm": 1.938281287557282, + "language_loss": 0.75926119, + "learning_rate": 3.911658733556155e-06, + "loss": 0.78578997, + "num_input_tokens_seen": 43917945, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.32910156, + "step": 2031, + "time_per_iteration": 2.8645670413970947 + }, + { + "auxiliary_loss_clip": 0.01611133, + "auxiliary_loss_mlp": 0.01057899, + "balance_loss_clip": 1.37108469, + "balance_loss_mlp": 1.02797771, + "epoch": 0.12217044942131369, + "flos": 20420237099520.0, + "grad_norm": 2.117690868002167, + "language_loss": 0.76141101, + "learning_rate": 3.911544225902707e-06, + "loss": 0.78810132, + "num_input_tokens_seen": 43937385, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.29882812, + "step": 2032, + "time_per_iteration": 2.869957685470581 + }, + { + "auxiliary_loss_clip": 0.01581308, + "auxiliary_loss_mlp": 0.01058047, + "balance_loss_clip": 1.34976387, + "balance_loss_mlp": 1.02748203, + "epoch": 0.12223057267398166, + "flos": 22867499854080.0, + "grad_norm": 1.8030644261618107, + "language_loss": 0.90073138, + "learning_rate": 3.911429645763311e-06, + "loss": 0.92712498, + "num_input_tokens_seen": 43958130, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.30566406, + "step": 2033, + "time_per_iteration": 2.9194748401641846 + }, + { + "auxiliary_loss_clip": 0.01613895, + "auxiliary_loss_mlp": 0.01057799, + "balance_loss_clip": 1.36996901, + "balance_loss_mlp": 1.02704275, + "epoch": 0.12229069592664964, + "flos": 20057146427520.0, + "grad_norm": 2.1885423244281026, + "language_loss": 0.67713737, + "learning_rate": 3.911314993142311e-06, + "loss": 0.70385432, + "num_input_tokens_seen": 43976800, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.30786133, + "step": 2034, + "time_per_iteration": 2.859017848968506 + }, + { + "auxiliary_loss_clip": 0.01592919, + "auxiliary_loss_mlp": 0.01060032, + "balance_loss_clip": 1.35861027, + "balance_loss_mlp": 1.02670121, + "epoch": 0.1223508191793176, + "flos": 22284672595200.0, + "grad_norm": 1.6189858144593374, + "language_loss": 0.77124035, + "learning_rate": 3.911200268044055e-06, + "loss": 0.79776996, + "num_input_tokens_seen": 43996620, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.33325195, + "step": 2035, + "time_per_iteration": 2.859151601791382 + }, + { + "auxiliary_loss_clip": 0.01619147, + "auxiliary_loss_mlp": 0.01061547, + "balance_loss_clip": 1.37405109, + "balance_loss_mlp": 1.03107738, + "epoch": 0.12241094243198557, + "flos": 21295337639040.0, + "grad_norm": 1.8193557370409685, + "language_loss": 0.72322679, + "learning_rate": 3.911085470472892e-06, + "loss": 0.75003374, + "num_input_tokens_seen": 44016175, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.30444336, + "step": 2036, + "time_per_iteration": 2.8182883262634277 + }, + { + "auxiliary_loss_clip": 0.015945, + "auxiliary_loss_mlp": 0.01065439, + "balance_loss_clip": 1.35581923, + "balance_loss_mlp": 1.03277564, + "epoch": 0.12247106568465355, + "flos": 17390825758080.0, + "grad_norm": 1.8572462155147782, + "language_loss": 0.83913732, + "learning_rate": 3.910970600433178e-06, + "loss": 0.86573666, + "num_input_tokens_seen": 44035060, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.32641602, + "step": 2037, + "time_per_iteration": 4.244171619415283 + }, + { + "auxiliary_loss_clip": 0.01611316, + "auxiliary_loss_mlp": 0.01055251, + "balance_loss_clip": 1.37069726, + "balance_loss_mlp": 1.02428031, + "epoch": 0.12253118893732151, + "flos": 27054919342080.0, + "grad_norm": 2.312552818845248, + "language_loss": 0.80960524, + "learning_rate": 3.910855657929267e-06, + "loss": 0.83627093, + "num_input_tokens_seen": 44053330, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.30957031, + "step": 2038, + "time_per_iteration": 2.900172710418701 + }, + { + "auxiliary_loss_clip": 0.01413415, + "auxiliary_loss_mlp": 0.01032685, + "balance_loss_clip": 1.26804149, + "balance_loss_mlp": 1.00178552, + "epoch": 0.12259131218998948, + "flos": 53887179168000.0, + "grad_norm": 0.8594070717713321, + "language_loss": 0.58758175, + "learning_rate": 3.910740642965518e-06, + "loss": 0.61204278, + "num_input_tokens_seen": 44107575, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.30859375, + "step": 2039, + "time_per_iteration": 3.2369508743286133 + }, + { + "auxiliary_loss_clip": 0.01602628, + "auxiliary_loss_mlp": 0.01064672, + "balance_loss_clip": 1.36275828, + "balance_loss_mlp": 1.0319134, + "epoch": 0.12265143544265744, + "flos": 17900166182400.0, + "grad_norm": 2.629760277663384, + "language_loss": 0.82376063, + "learning_rate": 3.910625555546292e-06, + "loss": 0.85043359, + "num_input_tokens_seen": 44126075, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.32788086, + "step": 2040, + "time_per_iteration": 2.848524570465088 + }, + { + "auxiliary_loss_clip": 0.01590687, + "auxiliary_loss_mlp": 0.01053723, + "balance_loss_clip": 1.35659456, + "balance_loss_mlp": 1.02280021, + "epoch": 0.12271155869532542, + "flos": 21810243173760.0, + "grad_norm": 2.068668669688981, + "language_loss": 0.84131414, + "learning_rate": 3.910510395675953e-06, + "loss": 0.86775827, + "num_input_tokens_seen": 44145605, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.30932617, + "step": 2041, + "time_per_iteration": 2.849118709564209 + }, + { + "auxiliary_loss_clip": 0.01633911, + "auxiliary_loss_mlp": 0.01054426, + "balance_loss_clip": 1.38674068, + "balance_loss_mlp": 1.02378941, + "epoch": 0.12277168194799339, + "flos": 19838314736640.0, + "grad_norm": 1.724008152216135, + "language_loss": 0.68690473, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.71378809, + "num_input_tokens_seen": 44164770, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.30615234, + "step": 2042, + "time_per_iteration": 2.8713154792785645 + }, + { + "auxiliary_loss_clip": 0.01602858, + "auxiliary_loss_mlp": 0.01057487, + "balance_loss_clip": 1.36425996, + "balance_loss_mlp": 1.02563477, + "epoch": 0.12283180520066135, + "flos": 23231223953280.0, + "grad_norm": 1.7881255729665175, + "language_loss": 0.82108468, + "learning_rate": 3.910279858599409e-06, + "loss": 0.84768808, + "num_input_tokens_seen": 44184025, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.31860352, + "step": 2043, + "time_per_iteration": 4.252284526824951 + }, + { + "auxiliary_loss_clip": 0.01612123, + "auxiliary_loss_mlp": 0.01052437, + "balance_loss_clip": 1.3688798, + "balance_loss_mlp": 1.02137101, + "epoch": 0.12289192845332933, + "flos": 18597770795520.0, + "grad_norm": 1.8349287941531383, + "language_loss": 0.81947672, + "learning_rate": 3.910164481401946e-06, + "loss": 0.84612226, + "num_input_tokens_seen": 44202950, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.31079102, + "step": 2044, + "time_per_iteration": 2.839778423309326 + }, + { + "auxiliary_loss_clip": 0.01598773, + "auxiliary_loss_mlp": 0.01056047, + "balance_loss_clip": 1.35961199, + "balance_loss_mlp": 1.02555346, + "epoch": 0.1229520517059973, + "flos": 25778966705280.0, + "grad_norm": 1.9783434856162228, + "language_loss": 0.7890442, + "learning_rate": 3.910049031770853e-06, + "loss": 0.81559241, + "num_input_tokens_seen": 44221115, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.3046875, + "step": 2045, + "time_per_iteration": 4.430351495742798 + }, + { + "auxiliary_loss_clip": 0.01603623, + "auxiliary_loss_mlp": 0.01055205, + "balance_loss_clip": 1.36371553, + "balance_loss_mlp": 1.02232718, + "epoch": 0.12301217495866526, + "flos": 20897064495360.0, + "grad_norm": 2.1510176839202138, + "language_loss": 0.69105494, + "learning_rate": 3.90993350971051e-06, + "loss": 0.7176432, + "num_input_tokens_seen": 44240575, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.32861328, + "step": 2046, + "time_per_iteration": 2.842721700668335 + }, + { + "auxiliary_loss_clip": 0.01603081, + "auxiliary_loss_mlp": 0.01054605, + "balance_loss_clip": 1.36424696, + "balance_loss_mlp": 1.02301407, + "epoch": 0.12307229821133324, + "flos": 22388003015040.0, + "grad_norm": 2.3817370627726704, + "language_loss": 0.74176931, + "learning_rate": 3.909817915225297e-06, + "loss": 0.76834619, + "num_input_tokens_seen": 44257145, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.31616211, + "step": 2047, + "time_per_iteration": 2.860633611679077 + }, + { + "auxiliary_loss_clip": 0.01618564, + "auxiliary_loss_mlp": 0.01057989, + "balance_loss_clip": 1.37717104, + "balance_loss_mlp": 1.02615976, + "epoch": 0.1231324214640012, + "flos": 23377745174400.0, + "grad_norm": 2.4750048746598057, + "language_loss": 0.77941227, + "learning_rate": 3.909702248319597e-06, + "loss": 0.80617785, + "num_input_tokens_seen": 44278035, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.31835938, + "step": 2048, + "time_per_iteration": 2.8816163539886475 + }, + { + "auxiliary_loss_clip": 0.01598136, + "auxiliary_loss_mlp": 0.01057161, + "balance_loss_clip": 1.3599174, + "balance_loss_mlp": 1.02738237, + "epoch": 0.12319254471666917, + "flos": 23777194682880.0, + "grad_norm": 1.9893181066791714, + "language_loss": 0.86376965, + "learning_rate": 3.909586508997797e-06, + "loss": 0.89032257, + "num_input_tokens_seen": 44296980, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.29785156, + "step": 2049, + "time_per_iteration": 2.9355108737945557 + }, + { + "auxiliary_loss_clip": 0.01614375, + "auxiliary_loss_mlp": 0.01052964, + "balance_loss_clip": 1.37123537, + "balance_loss_mlp": 1.02166009, + "epoch": 0.12325266796933713, + "flos": 23560625232000.0, + "grad_norm": 1.9791180563582706, + "language_loss": 0.77364701, + "learning_rate": 3.909470697264285e-06, + "loss": 0.80032045, + "num_input_tokens_seen": 44318005, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.31298828, + "step": 2050, + "time_per_iteration": 2.8553547859191895 + }, + { + "auxiliary_loss_clip": 0.01606519, + "auxiliary_loss_mlp": 0.01051928, + "balance_loss_clip": 1.36376858, + "balance_loss_mlp": 1.02026641, + "epoch": 0.12331279122200511, + "flos": 24434232693120.0, + "grad_norm": 2.548830750245577, + "language_loss": 0.82759798, + "learning_rate": 3.909354813123452e-06, + "loss": 0.85418248, + "num_input_tokens_seen": 44335260, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.31640625, + "step": 2051, + "time_per_iteration": 2.9713497161865234 + }, + { + "auxiliary_loss_clip": 0.01598831, + "auxiliary_loss_mlp": 0.01054544, + "balance_loss_clip": 1.36317182, + "balance_loss_mlp": 1.02414525, + "epoch": 0.12337291447467308, + "flos": 25495380426240.0, + "grad_norm": 1.7358271338958777, + "language_loss": 0.80907071, + "learning_rate": 3.909238856579693e-06, + "loss": 0.83560443, + "num_input_tokens_seen": 44355315, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.30444336, + "step": 2052, + "time_per_iteration": 2.888572931289673 + }, + { + "auxiliary_loss_clip": 0.01628642, + "auxiliary_loss_mlp": 0.01051434, + "balance_loss_clip": 1.3838551, + "balance_loss_mlp": 1.02067828, + "epoch": 0.12343303772734104, + "flos": 23560444252800.0, + "grad_norm": 2.2721944820748092, + "language_loss": 0.7566151, + "learning_rate": 3.909122827637406e-06, + "loss": 0.78341585, + "num_input_tokens_seen": 44373020, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.30712891, + "step": 2053, + "time_per_iteration": 2.924380302429199 + }, + { + "auxiliary_loss_clip": 0.01616202, + "auxiliary_loss_mlp": 0.01058773, + "balance_loss_clip": 1.37108541, + "balance_loss_mlp": 1.02548957, + "epoch": 0.12349316098000902, + "flos": 47573600423040.0, + "grad_norm": 2.0957633771191277, + "language_loss": 0.75022447, + "learning_rate": 3.909006726300991e-06, + "loss": 0.7769742, + "num_input_tokens_seen": 44397525, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.33251953, + "step": 2054, + "time_per_iteration": 3.070680856704712 + }, + { + "auxiliary_loss_clip": 0.01585666, + "auxiliary_loss_mlp": 0.01052613, + "balance_loss_clip": 1.35108411, + "balance_loss_mlp": 1.02080774, + "epoch": 0.12355328423267699, + "flos": 25056812148480.0, + "grad_norm": 1.9862088767527892, + "language_loss": 0.85733199, + "learning_rate": 3.908890552574849e-06, + "loss": 0.8837148, + "num_input_tokens_seen": 44415890, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.31787109, + "step": 2055, + "time_per_iteration": 2.8612966537475586 + }, + { + "auxiliary_loss_clip": 0.01613414, + "auxiliary_loss_mlp": 0.01059438, + "balance_loss_clip": 1.37337685, + "balance_loss_mlp": 1.0290401, + "epoch": 0.12361340748534495, + "flos": 27720599109120.0, + "grad_norm": 2.0199117849927846, + "language_loss": 0.80013359, + "learning_rate": 3.908774306463384e-06, + "loss": 0.8268621, + "num_input_tokens_seen": 44436625, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.30419922, + "step": 2056, + "time_per_iteration": 2.9823648929595947 + }, + { + "auxiliary_loss_clip": 0.01607443, + "auxiliary_loss_mlp": 0.01060568, + "balance_loss_clip": 1.36569667, + "balance_loss_mlp": 1.02904892, + "epoch": 0.12367353073801293, + "flos": 26151739764480.0, + "grad_norm": 2.35527119978301, + "language_loss": 0.84369427, + "learning_rate": 3.908657987971009e-06, + "loss": 0.87037444, + "num_input_tokens_seen": 44455265, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.31542969, + "step": 2057, + "time_per_iteration": 2.8494343757629395 + }, + { + "auxiliary_loss_clip": 0.01622938, + "auxiliary_loss_mlp": 0.01060716, + "balance_loss_clip": 1.37803745, + "balance_loss_mlp": 1.02745605, + "epoch": 0.1237336539906809, + "flos": 25167065022720.0, + "grad_norm": 1.494978850318772, + "language_loss": 0.79206991, + "learning_rate": 3.90854159710213e-06, + "loss": 0.81890643, + "num_input_tokens_seen": 44475815, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.33251953, + "step": 2058, + "time_per_iteration": 2.8970115184783936 + }, + { + "auxiliary_loss_clip": 0.01631329, + "auxiliary_loss_mlp": 0.01057908, + "balance_loss_clip": 1.3837471, + "balance_loss_mlp": 1.02455389, + "epoch": 0.12379377724334886, + "flos": 15312987947520.0, + "grad_norm": 3.4367903217694824, + "language_loss": 0.8438639, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.87075627, + "num_input_tokens_seen": 44494045, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.33349609, + "step": 2059, + "time_per_iteration": 2.8305108547210693 + }, + { + "auxiliary_loss_clip": 0.01613191, + "auxiliary_loss_mlp": 0.01055014, + "balance_loss_clip": 1.36724842, + "balance_loss_mlp": 1.02266085, + "epoch": 0.12385390049601683, + "flos": 21324593041920.0, + "grad_norm": 2.2545793040938076, + "language_loss": 0.82500726, + "learning_rate": 3.908308598252523e-06, + "loss": 0.85168928, + "num_input_tokens_seen": 44509120, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.32324219, + "step": 2060, + "time_per_iteration": 2.8443989753723145 + }, + { + "auxiliary_loss_clip": 0.01617736, + "auxiliary_loss_mlp": 0.01057003, + "balance_loss_clip": 1.37302625, + "balance_loss_mlp": 1.02300465, + "epoch": 0.1239140237486848, + "flos": 15123140190720.0, + "grad_norm": 1.8555092895554524, + "language_loss": 0.87633944, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.90308684, + "num_input_tokens_seen": 44525780, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.33984375, + "step": 2061, + "time_per_iteration": 2.8386049270629883 + }, + { + "auxiliary_loss_clip": 0.01601384, + "auxiliary_loss_mlp": 0.0105633, + "balance_loss_clip": 1.36383367, + "balance_loss_mlp": 1.02361846, + "epoch": 0.12397414700135277, + "flos": 21985703084160.0, + "grad_norm": 1.8263396253836652, + "language_loss": 0.86061519, + "learning_rate": 3.908075309949906e-06, + "loss": 0.88719231, + "num_input_tokens_seen": 44543125, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.32714844, + "step": 2062, + "time_per_iteration": 2.7986247539520264 + }, + { + "auxiliary_loss_clip": 0.01609011, + "auxiliary_loss_mlp": 0.01055532, + "balance_loss_clip": 1.36881161, + "balance_loss_mlp": 1.02262974, + "epoch": 0.12403427025402074, + "flos": 13407126197760.0, + "grad_norm": 1.8372462898799482, + "language_loss": 0.80020833, + "learning_rate": 3.907958557264774e-06, + "loss": 0.82685375, + "num_input_tokens_seen": 44560275, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.32885742, + "step": 2063, + "time_per_iteration": 2.851421594619751 + }, + { + "auxiliary_loss_clip": 0.0161179, + "auxiliary_loss_mlp": 0.01058558, + "balance_loss_clip": 1.37013507, + "balance_loss_mlp": 1.02420235, + "epoch": 0.12409439350668872, + "flos": 15312716478720.0, + "grad_norm": 1.9889517159580623, + "language_loss": 0.80543524, + "learning_rate": 3.907841732229663e-06, + "loss": 0.83213866, + "num_input_tokens_seen": 44577640, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.34350586, + "step": 2064, + "time_per_iteration": 2.8137762546539307 + }, + { + "auxiliary_loss_clip": 0.01607451, + "auxiliary_loss_mlp": 0.01059783, + "balance_loss_clip": 1.36713505, + "balance_loss_mlp": 1.02511668, + "epoch": 0.12415451675935668, + "flos": 25020181843200.0, + "grad_norm": 2.109343347154861, + "language_loss": 0.93825656, + "learning_rate": 3.907724834849002e-06, + "loss": 0.96492887, + "num_input_tokens_seen": 44594860, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.34667969, + "step": 2065, + "time_per_iteration": 2.892699956893921 + }, + { + "auxiliary_loss_clip": 0.01604085, + "auxiliary_loss_mlp": 0.01049644, + "balance_loss_clip": 1.36300373, + "balance_loss_mlp": 1.01810098, + "epoch": 0.12421464001202465, + "flos": 23670199434240.0, + "grad_norm": 2.0104308233958657, + "language_loss": 0.82014525, + "learning_rate": 3.907607865127225e-06, + "loss": 0.84668249, + "num_input_tokens_seen": 44614780, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.31518555, + "step": 2066, + "time_per_iteration": 2.941105604171753 + }, + { + "auxiliary_loss_clip": 0.01421239, + "auxiliary_loss_mlp": 0.01035292, + "balance_loss_clip": 1.2773118, + "balance_loss_mlp": 1.00878036, + "epoch": 0.12427476326469263, + "flos": 65765152972800.0, + "grad_norm": 0.886175785832063, + "language_loss": 0.63356197, + "learning_rate": 3.907490823068766e-06, + "loss": 0.65812725, + "num_input_tokens_seen": 44671240, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.265625, + "step": 2067, + "time_per_iteration": 3.3501780033111572 + }, + { + "auxiliary_loss_clip": 0.01616355, + "auxiliary_loss_mlp": 0.01059055, + "balance_loss_clip": 1.37255049, + "balance_loss_mlp": 1.02653444, + "epoch": 0.12433488651736059, + "flos": 24546204869760.0, + "grad_norm": 10.83668345769654, + "language_loss": 0.94702333, + "learning_rate": 3.907373708678063e-06, + "loss": 0.97377741, + "num_input_tokens_seen": 44691050, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.32543945, + "step": 2068, + "time_per_iteration": 2.8932948112487793 + }, + { + "auxiliary_loss_clip": 0.01608624, + "auxiliary_loss_mlp": 0.01054359, + "balance_loss_clip": 1.36742496, + "balance_loss_mlp": 1.02374649, + "epoch": 0.12439500977002856, + "flos": 21041232986880.0, + "grad_norm": 1.8852381052852825, + "language_loss": 0.82342184, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.8500517, + "num_input_tokens_seen": 44709850, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.30615234, + "step": 2069, + "time_per_iteration": 2.8271608352661133 + }, + { + "auxiliary_loss_clip": 0.01618944, + "auxiliary_loss_mlp": 0.01062394, + "balance_loss_clip": 1.3759048, + "balance_loss_mlp": 1.02984965, + "epoch": 0.12445513302269653, + "flos": 26841109824000.0, + "grad_norm": 1.4983882398942479, + "language_loss": 0.77530909, + "learning_rate": 3.907139262917696e-06, + "loss": 0.80212247, + "num_input_tokens_seen": 44731475, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.32519531, + "step": 2070, + "time_per_iteration": 2.9832003116607666 + }, + { + "auxiliary_loss_clip": 0.01621126, + "auxiliary_loss_mlp": 0.01054236, + "balance_loss_clip": 1.37846744, + "balance_loss_mlp": 1.02173924, + "epoch": 0.1245152562753645, + "flos": 18377988963840.0, + "grad_norm": 2.524178346039938, + "language_loss": 0.82529175, + "learning_rate": 3.907021931556922e-06, + "loss": 0.85204536, + "num_input_tokens_seen": 44749685, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.32470703, + "step": 2071, + "time_per_iteration": 4.245940685272217 + }, + { + "auxiliary_loss_clip": 0.01602707, + "auxiliary_loss_mlp": 0.01062166, + "balance_loss_clip": 1.36543334, + "balance_loss_mlp": 1.02966928, + "epoch": 0.12457537952803246, + "flos": 33120573989760.0, + "grad_norm": 1.7099306617010328, + "language_loss": 0.79223078, + "learning_rate": 3.906904527881684e-06, + "loss": 0.81887954, + "num_input_tokens_seen": 44772165, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.32470703, + "step": 2072, + "time_per_iteration": 2.945786714553833 + }, + { + "auxiliary_loss_clip": 0.01621866, + "auxiliary_loss_mlp": 0.01060032, + "balance_loss_clip": 1.37816036, + "balance_loss_mlp": 1.02844179, + "epoch": 0.12463550278070043, + "flos": 22279605177600.0, + "grad_norm": 5.450154722698608, + "language_loss": 0.76755404, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.79437298, + "num_input_tokens_seen": 44790580, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.31591797, + "step": 2073, + "time_per_iteration": 2.8799378871917725 + }, + { + "auxiliary_loss_clip": 0.01597206, + "auxiliary_loss_mlp": 0.01063244, + "balance_loss_clip": 1.36097407, + "balance_loss_mlp": 1.03015184, + "epoch": 0.12469562603336841, + "flos": 14685884012160.0, + "grad_norm": 1.8862269880444142, + "language_loss": 0.91217256, + "learning_rate": 3.906669503605631e-06, + "loss": 0.93877709, + "num_input_tokens_seen": 44806730, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.33081055, + "step": 2074, + "time_per_iteration": 2.8054869174957275 + }, + { + "auxiliary_loss_clip": 0.01615172, + "auxiliary_loss_mlp": 0.01054364, + "balance_loss_clip": 1.36802602, + "balance_loss_mlp": 1.02227283, + "epoch": 0.12475574928603637, + "flos": 24655552848000.0, + "grad_norm": 2.3030832148173905, + "language_loss": 0.84889865, + "learning_rate": 3.906551883013728e-06, + "loss": 0.87559402, + "num_input_tokens_seen": 44825550, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.32104492, + "step": 2075, + "time_per_iteration": 2.9112493991851807 + }, + { + "auxiliary_loss_clip": 0.01623921, + "auxiliary_loss_mlp": 0.0106625, + "balance_loss_clip": 1.37806177, + "balance_loss_mlp": 1.03139365, + "epoch": 0.12481587253870434, + "flos": 21773160420480.0, + "grad_norm": 3.7049025124925286, + "language_loss": 0.74916637, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.77606809, + "num_input_tokens_seen": 44844155, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.34863281, + "step": 2076, + "time_per_iteration": 2.877589225769043 + }, + { + "auxiliary_loss_clip": 0.01607246, + "auxiliary_loss_mlp": 0.01051452, + "balance_loss_clip": 1.36818254, + "balance_loss_mlp": 1.02069616, + "epoch": 0.12487599579137232, + "flos": 21442085084160.0, + "grad_norm": 1.8313329465103614, + "language_loss": 0.76940942, + "learning_rate": 3.906316424944469e-06, + "loss": 0.79599637, + "num_input_tokens_seen": 44863780, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.30786133, + "step": 2077, + "time_per_iteration": 2.827270984649658 + }, + { + "auxiliary_loss_clip": 0.01614529, + "auxiliary_loss_mlp": 0.01060402, + "balance_loss_clip": 1.37228096, + "balance_loss_mlp": 1.02533042, + "epoch": 0.12493611904404028, + "flos": 16116320954880.0, + "grad_norm": 3.3446208754129887, + "language_loss": 0.83976728, + "learning_rate": 3.906198587476043e-06, + "loss": 0.86651659, + "num_input_tokens_seen": 44881480, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.35058594, + "step": 2078, + "time_per_iteration": 4.203848123550415 + }, + { + "auxiliary_loss_clip": 0.01612547, + "auxiliary_loss_mlp": 0.0105952, + "balance_loss_clip": 1.36819601, + "balance_loss_mlp": 1.02611768, + "epoch": 0.12499624229670825, + "flos": 21590325607680.0, + "grad_norm": 1.7074101573953455, + "language_loss": 0.76709235, + "learning_rate": 3.906080677724374e-06, + "loss": 0.79381305, + "num_input_tokens_seen": 44900390, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.33374023, + "step": 2079, + "time_per_iteration": 2.8585753440856934 + }, + { + "auxiliary_loss_clip": 0.01635932, + "auxiliary_loss_mlp": 0.01064232, + "balance_loss_clip": 1.38797569, + "balance_loss_mlp": 1.032022, + "epoch": 0.1250563655493762, + "flos": 25709099454720.0, + "grad_norm": 2.0218307805584326, + "language_loss": 0.85496271, + "learning_rate": 3.905962695693935e-06, + "loss": 0.88196439, + "num_input_tokens_seen": 44920375, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.32177734, + "step": 2080, + "time_per_iteration": 5.707514524459839 + }, + { + "auxiliary_loss_clip": 0.01610978, + "auxiliary_loss_mlp": 0.01050698, + "balance_loss_clip": 1.36820436, + "balance_loss_mlp": 1.01877403, + "epoch": 0.12511648880204418, + "flos": 16918206128640.0, + "grad_norm": 2.083910855234556, + "language_loss": 0.8624317, + "learning_rate": 3.9058446413892e-06, + "loss": 0.88904852, + "num_input_tokens_seen": 44938415, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.3190918, + "step": 2081, + "time_per_iteration": 2.7983481884002686 + }, + { + "auxiliary_loss_clip": 0.01612807, + "auxiliary_loss_mlp": 0.01056378, + "balance_loss_clip": 1.37084818, + "balance_loss_mlp": 1.0238812, + "epoch": 0.12517661205471217, + "flos": 17576691972480.0, + "grad_norm": 1.6667009268914932, + "language_loss": 0.77643979, + "learning_rate": 3.905726514814646e-06, + "loss": 0.80313158, + "num_input_tokens_seen": 44957135, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.32519531, + "step": 2082, + "time_per_iteration": 2.8548994064331055 + }, + { + "auxiliary_loss_clip": 0.01648419, + "auxiliary_loss_mlp": 0.01064546, + "balance_loss_clip": 1.39516556, + "balance_loss_mlp": 1.0305717, + "epoch": 0.12523673530738014, + "flos": 16042245937920.0, + "grad_norm": 2.4157821951766203, + "language_loss": 0.81551063, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.84264028, + "num_input_tokens_seen": 44974480, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.33984375, + "step": 2083, + "time_per_iteration": 2.811779499053955 + }, + { + "auxiliary_loss_clip": 0.01631434, + "auxiliary_loss_mlp": 0.01058664, + "balance_loss_clip": 1.38342071, + "balance_loss_mlp": 1.02604818, + "epoch": 0.1252968585600481, + "flos": 18817190668800.0, + "grad_norm": 2.2044581870548585, + "language_loss": 0.91196978, + "learning_rate": 3.9054900448739966e-06, + "loss": 0.93887079, + "num_input_tokens_seen": 44990310, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.32617188, + "step": 2084, + "time_per_iteration": 2.8120594024658203 + }, + { + "auxiliary_loss_clip": 0.01626564, + "auxiliary_loss_mlp": 0.01057248, + "balance_loss_clip": 1.38315868, + "balance_loss_mlp": 1.02613425, + "epoch": 0.12535698181271607, + "flos": 27282980972160.0, + "grad_norm": 1.7801604830099798, + "language_loss": 0.81407249, + "learning_rate": 3.905371701516869e-06, + "loss": 0.84091055, + "num_input_tokens_seen": 45010720, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.31091309, + "step": 2085, + "time_per_iteration": 2.9167349338531494 + }, + { + "auxiliary_loss_clip": 0.01608488, + "auxiliary_loss_mlp": 0.01055148, + "balance_loss_clip": 1.37053466, + "balance_loss_mlp": 1.02479708, + "epoch": 0.12541710506538403, + "flos": 22064166846720.0, + "grad_norm": 2.2382760727042843, + "language_loss": 0.8869642, + "learning_rate": 3.905253285907856e-06, + "loss": 0.91360056, + "num_input_tokens_seen": 45030360, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.3034668, + "step": 2086, + "time_per_iteration": 2.8384876251220703 + }, + { + "auxiliary_loss_clip": 0.01613716, + "auxiliary_loss_mlp": 0.01056868, + "balance_loss_clip": 1.3768326, + "balance_loss_mlp": 1.02496779, + "epoch": 0.125477228318052, + "flos": 12610489420800.0, + "grad_norm": 1.9929518909390866, + "language_loss": 0.8774333, + "learning_rate": 3.905134798051447e-06, + "loss": 0.90413922, + "num_input_tokens_seen": 45045085, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.31884766, + "step": 2087, + "time_per_iteration": 2.861651659011841 + }, + { + "auxiliary_loss_clip": 0.01637354, + "auxiliary_loss_mlp": 0.01064078, + "balance_loss_clip": 1.39350486, + "balance_loss_mlp": 1.03210652, + "epoch": 0.12553735157071996, + "flos": 23889121614720.0, + "grad_norm": 8.259835630822895, + "language_loss": 0.75077236, + "learning_rate": 3.905016237952136e-06, + "loss": 0.77778673, + "num_input_tokens_seen": 45065145, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.31982422, + "step": 2088, + "time_per_iteration": 2.8551533222198486 + }, + { + "auxiliary_loss_clip": 0.01404792, + "auxiliary_loss_mlp": 0.01034445, + "balance_loss_clip": 1.26363981, + "balance_loss_mlp": 1.01079392, + "epoch": 0.12559747482338796, + "flos": 69952120012800.0, + "grad_norm": 0.7616980179909959, + "language_loss": 0.61779481, + "learning_rate": 3.904897605614418e-06, + "loss": 0.64218718, + "num_input_tokens_seen": 45126230, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.23632812, + "step": 2089, + "time_per_iteration": 3.3417320251464844 + }, + { + "auxiliary_loss_clip": 0.01618893, + "auxiliary_loss_mlp": 0.010514, + "balance_loss_clip": 1.37842083, + "balance_loss_mlp": 1.02071595, + "epoch": 0.12565759807605592, + "flos": 24290154691200.0, + "grad_norm": 1.862208469107207, + "language_loss": 0.79005522, + "learning_rate": 3.904778901042793e-06, + "loss": 0.81675816, + "num_input_tokens_seen": 45145545, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.30664062, + "step": 2090, + "time_per_iteration": 2.9206671714782715 + }, + { + "auxiliary_loss_clip": 0.01396923, + "auxiliary_loss_mlp": 0.01031082, + "balance_loss_clip": 1.25551224, + "balance_loss_mlp": 1.01258039, + "epoch": 0.12571772132872389, + "flos": 56477298314880.0, + "grad_norm": 0.768343626443453, + "language_loss": 0.59628248, + "learning_rate": 3.90466012424176e-06, + "loss": 0.62056255, + "num_input_tokens_seen": 45206845, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.18457031, + "step": 2091, + "time_per_iteration": 3.2083234786987305 + }, + { + "auxiliary_loss_clip": 0.01618453, + "auxiliary_loss_mlp": 0.01057602, + "balance_loss_clip": 1.37841034, + "balance_loss_mlp": 1.02613103, + "epoch": 0.12577784458139185, + "flos": 41260944556800.0, + "grad_norm": 1.8650488554404532, + "language_loss": 0.64780772, + "learning_rate": 3.904541275215825e-06, + "loss": 0.6745683, + "num_input_tokens_seen": 45228495, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.31469727, + "step": 2092, + "time_per_iteration": 3.0515575408935547 + }, + { + "auxiliary_loss_clip": 0.01642772, + "auxiliary_loss_mlp": 0.01056566, + "balance_loss_clip": 1.39556539, + "balance_loss_mlp": 1.02516615, + "epoch": 0.12583796783405982, + "flos": 19764963636480.0, + "grad_norm": 1.904035273090423, + "language_loss": 0.81753719, + "learning_rate": 3.904422353969493e-06, + "loss": 0.84453058, + "num_input_tokens_seen": 45245720, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.3137207, + "step": 2093, + "time_per_iteration": 2.8275680541992188 + }, + { + "auxiliary_loss_clip": 0.01623204, + "auxiliary_loss_mlp": 0.01056547, + "balance_loss_clip": 1.38554382, + "balance_loss_mlp": 1.0238831, + "epoch": 0.12589809108672778, + "flos": 22612490305920.0, + "grad_norm": 1.7726798261647871, + "language_loss": 0.77535284, + "learning_rate": 3.904303360507276e-06, + "loss": 0.80215031, + "num_input_tokens_seen": 45265650, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.3269043, + "step": 2094, + "time_per_iteration": 2.926093101501465 + }, + { + "auxiliary_loss_clip": 0.01613822, + "auxiliary_loss_mlp": 0.01058494, + "balance_loss_clip": 1.37693071, + "balance_loss_mlp": 1.02645016, + "epoch": 0.12595821433939577, + "flos": 45238988517120.0, + "grad_norm": 1.6652711529961535, + "language_loss": 0.77948809, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.80621123, + "num_input_tokens_seen": 45287790, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.32055664, + "step": 2095, + "time_per_iteration": 3.127448320388794 + }, + { + "auxiliary_loss_clip": 0.01625804, + "auxiliary_loss_mlp": 0.01049803, + "balance_loss_clip": 1.38095188, + "balance_loss_mlp": 1.01966643, + "epoch": 0.12601833759206374, + "flos": 14328041736960.0, + "grad_norm": 2.3084912788328595, + "language_loss": 0.84641945, + "learning_rate": 3.904065156953232e-06, + "loss": 0.8731755, + "num_input_tokens_seen": 45305720, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.30126953, + "step": 2096, + "time_per_iteration": 2.8092355728149414 + }, + { + "auxiliary_loss_clip": 0.01637452, + "auxiliary_loss_mlp": 0.01054167, + "balance_loss_clip": 1.39316106, + "balance_loss_mlp": 1.0242933, + "epoch": 0.1260784608447317, + "flos": 21298188061440.0, + "grad_norm": 1.7994757369750805, + "language_loss": 0.76743579, + "learning_rate": 3.903945946870439e-06, + "loss": 0.79435194, + "num_input_tokens_seen": 45325290, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.29833984, + "step": 2097, + "time_per_iteration": 2.9253997802734375 + }, + { + "auxiliary_loss_clip": 0.01632158, + "auxiliary_loss_mlp": 0.01059634, + "balance_loss_clip": 1.38937211, + "balance_loss_mlp": 1.02933121, + "epoch": 0.12613858409739967, + "flos": 26262807045120.0, + "grad_norm": 2.1447727644620973, + "language_loss": 0.88983321, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.91675115, + "num_input_tokens_seen": 45344465, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.30297852, + "step": 2098, + "time_per_iteration": 2.8533358573913574 + }, + { + "auxiliary_loss_clip": 0.01650087, + "auxiliary_loss_mlp": 0.01062637, + "balance_loss_clip": 1.39910543, + "balance_loss_mlp": 1.03009272, + "epoch": 0.12619870735006763, + "flos": 21589963649280.0, + "grad_norm": 1.733819333643832, + "language_loss": 0.70768237, + "learning_rate": 3.903707310115912e-06, + "loss": 0.73480964, + "num_input_tokens_seen": 45362465, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.32519531, + "step": 2099, + "time_per_iteration": 2.911386489868164 + }, + { + "auxiliary_loss_clip": 0.01636305, + "auxiliary_loss_mlp": 0.01060933, + "balance_loss_clip": 1.39084482, + "balance_loss_mlp": 1.02853239, + "epoch": 0.1262588306027356, + "flos": 23377292726400.0, + "grad_norm": 2.12675627010851, + "language_loss": 0.83638668, + "learning_rate": 3.903587883453228e-06, + "loss": 0.86335909, + "num_input_tokens_seen": 45382700, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.32421875, + "step": 2100, + "time_per_iteration": 2.8673312664031982 + }, + { + "auxiliary_loss_clip": 0.01645208, + "auxiliary_loss_mlp": 0.01058839, + "balance_loss_clip": 1.39914966, + "balance_loss_mlp": 1.02724826, + "epoch": 0.12631895385540357, + "flos": 23959169844480.0, + "grad_norm": 1.945000114730573, + "language_loss": 0.81657124, + "learning_rate": 3.903468384606302e-06, + "loss": 0.84361172, + "num_input_tokens_seen": 45401005, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.31591797, + "step": 2101, + "time_per_iteration": 2.887495756149292 + }, + { + "auxiliary_loss_clip": 0.01400923, + "auxiliary_loss_mlp": 0.0106181, + "balance_loss_clip": 1.25897551, + "balance_loss_mlp": 1.04540658, + "epoch": 0.12637907710807156, + "flos": 70312586486400.0, + "grad_norm": 0.717195269222542, + "language_loss": 0.57042456, + "learning_rate": 3.903348813579662e-06, + "loss": 0.59505188, + "num_input_tokens_seen": 45466555, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.1640625, + "step": 2102, + "time_per_iteration": 3.414055585861206 + }, + { + "auxiliary_loss_clip": 0.01629495, + "auxiliary_loss_mlp": 0.01050772, + "balance_loss_clip": 1.38639522, + "balance_loss_mlp": 1.0211606, + "epoch": 0.12643920036073952, + "flos": 18924004938240.0, + "grad_norm": 1.9850835855043087, + "language_loss": 0.94781792, + "learning_rate": 3.903229170377845e-06, + "loss": 0.97462058, + "num_input_tokens_seen": 45485165, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.29602051, + "step": 2103, + "time_per_iteration": 2.8365235328674316 + }, + { + "auxiliary_loss_clip": 0.01618084, + "auxiliary_loss_mlp": 0.0104901, + "balance_loss_clip": 1.38236177, + "balance_loss_mlp": 1.02070975, + "epoch": 0.1264993236134075, + "flos": 27794040698880.0, + "grad_norm": 1.5537164557002243, + "language_loss": 0.78657699, + "learning_rate": 3.903109455005387e-06, + "loss": 0.81324792, + "num_input_tokens_seen": 45504630, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.28295898, + "step": 2104, + "time_per_iteration": 2.9261186122894287 + }, + { + "auxiliary_loss_clip": 0.01651426, + "auxiliary_loss_mlp": 0.01058029, + "balance_loss_clip": 1.40574932, + "balance_loss_mlp": 1.02944279, + "epoch": 0.12655944686607545, + "flos": 24765624743040.0, + "grad_norm": 1.6743515165320075, + "language_loss": 0.8253082, + "learning_rate": 3.902989667466828e-06, + "loss": 0.85240281, + "num_input_tokens_seen": 45524885, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.28564453, + "step": 2105, + "time_per_iteration": 2.8930609226226807 + }, + { + "auxiliary_loss_clip": 0.01656266, + "auxiliary_loss_mlp": 0.01063911, + "balance_loss_clip": 1.40566373, + "balance_loss_mlp": 1.03396547, + "epoch": 0.12661957011874342, + "flos": 24143452490880.0, + "grad_norm": 2.517018551895882, + "language_loss": 0.84028685, + "learning_rate": 3.90286980776671e-06, + "loss": 0.86748862, + "num_input_tokens_seen": 45545000, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.29931641, + "step": 2106, + "time_per_iteration": 2.918806791305542 + }, + { + "auxiliary_loss_clip": 0.01632457, + "auxiliary_loss_mlp": 0.01060376, + "balance_loss_clip": 1.39177752, + "balance_loss_mlp": 1.03043044, + "epoch": 0.12667969337141138, + "flos": 24579803773440.0, + "grad_norm": 1.7240629244913295, + "language_loss": 0.7427426, + "learning_rate": 3.902749875909578e-06, + "loss": 0.7696709, + "num_input_tokens_seen": 45564210, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.29931641, + "step": 2107, + "time_per_iteration": 4.348583698272705 + }, + { + "auxiliary_loss_clip": 0.01628995, + "auxiliary_loss_mlp": 0.01063398, + "balance_loss_clip": 1.38919306, + "balance_loss_mlp": 1.03428698, + "epoch": 0.12673981662407935, + "flos": 22970785029120.0, + "grad_norm": 5.618936462235351, + "language_loss": 0.80267894, + "learning_rate": 3.90262987189998e-06, + "loss": 0.82960296, + "num_input_tokens_seen": 45583030, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.29125977, + "step": 2108, + "time_per_iteration": 2.8623218536376953 + }, + { + "auxiliary_loss_clip": 0.01642652, + "auxiliary_loss_mlp": 0.01064101, + "balance_loss_clip": 1.39747572, + "balance_loss_mlp": 1.03270102, + "epoch": 0.12679993987674734, + "flos": 17283785264640.0, + "grad_norm": 2.467222786626317, + "language_loss": 0.77198344, + "learning_rate": 3.902509795742467e-06, + "loss": 0.79905093, + "num_input_tokens_seen": 45602265, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.31396484, + "step": 2109, + "time_per_iteration": 2.9067482948303223 + }, + { + "auxiliary_loss_clip": 0.01622299, + "auxiliary_loss_mlp": 0.01062975, + "balance_loss_clip": 1.38525987, + "balance_loss_mlp": 1.03264844, + "epoch": 0.1268600631294153, + "flos": 17284056733440.0, + "grad_norm": 1.980992737430057, + "language_loss": 0.83986288, + "learning_rate": 3.902389647441592e-06, + "loss": 0.86671567, + "num_input_tokens_seen": 45620595, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.30297852, + "step": 2110, + "time_per_iteration": 2.9029057025909424 + }, + { + "auxiliary_loss_clip": 0.01637493, + "auxiliary_loss_mlp": 0.01065234, + "balance_loss_clip": 1.39228368, + "balance_loss_mlp": 1.03675449, + "epoch": 0.12692018638208327, + "flos": 24071684958720.0, + "grad_norm": 1.6025071207210315, + "language_loss": 0.79657531, + "learning_rate": 3.90226942700191e-06, + "loss": 0.82360256, + "num_input_tokens_seen": 45641140, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.28503418, + "step": 2111, + "time_per_iteration": 2.860950231552124 + }, + { + "auxiliary_loss_clip": 0.01654859, + "auxiliary_loss_mlp": 0.01066714, + "balance_loss_clip": 1.40322351, + "balance_loss_mlp": 1.03622055, + "epoch": 0.12698030963475124, + "flos": 31844395128960.0, + "grad_norm": 2.043379690302293, + "language_loss": 0.78562099, + "learning_rate": 3.902149134427982e-06, + "loss": 0.81283671, + "num_input_tokens_seen": 45662315, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.30493164, + "step": 2112, + "time_per_iteration": 2.973675489425659 + }, + { + "auxiliary_loss_clip": 0.01631738, + "auxiliary_loss_mlp": 0.01069191, + "balance_loss_clip": 1.39040613, + "balance_loss_mlp": 1.0371474, + "epoch": 0.1270404328874192, + "flos": 25197542035200.0, + "grad_norm": 2.2602935370931867, + "language_loss": 0.86211932, + "learning_rate": 3.902028769724367e-06, + "loss": 0.88912857, + "num_input_tokens_seen": 45680335, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.32006836, + "step": 2113, + "time_per_iteration": 4.274584054946899 + }, + { + "auxiliary_loss_clip": 0.01626175, + "auxiliary_loss_mlp": 0.01065481, + "balance_loss_clip": 1.38486576, + "balance_loss_mlp": 1.03491616, + "epoch": 0.12710055614008717, + "flos": 16005796611840.0, + "grad_norm": 2.451687823207901, + "language_loss": 0.75314891, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.78006542, + "num_input_tokens_seen": 45696240, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.30566406, + "step": 2114, + "time_per_iteration": 2.7744691371917725 + }, + { + "auxiliary_loss_clip": 0.0163325, + "auxiliary_loss_mlp": 0.0106051, + "balance_loss_clip": 1.39299345, + "balance_loss_mlp": 1.02889562, + "epoch": 0.12716067939275516, + "flos": 15093975277440.0, + "grad_norm": 2.219464555787657, + "language_loss": 0.85105813, + "learning_rate": 3.901787823946341e-06, + "loss": 0.87799573, + "num_input_tokens_seen": 45713695, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.31640625, + "step": 2115, + "time_per_iteration": 5.621707916259766 + }, + { + "auxiliary_loss_clip": 0.01636374, + "auxiliary_loss_mlp": 0.01059506, + "balance_loss_clip": 1.39444315, + "balance_loss_mlp": 1.0302999, + "epoch": 0.12722080264542313, + "flos": 28378587260160.0, + "grad_norm": 1.544136934676335, + "language_loss": 0.8793608, + "learning_rate": 3.901667242881065e-06, + "loss": 0.90631956, + "num_input_tokens_seen": 45736655, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.29223633, + "step": 2116, + "time_per_iteration": 2.884709596633911 + }, + { + "auxiliary_loss_clip": 0.01627702, + "auxiliary_loss_mlp": 0.01054114, + "balance_loss_clip": 1.3887639, + "balance_loss_mlp": 1.02505064, + "epoch": 0.1272809258980911, + "flos": 32392718588160.0, + "grad_norm": 1.862791447243837, + "language_loss": 0.7174809, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.74429905, + "num_input_tokens_seen": 45758195, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.29064941, + "step": 2117, + "time_per_iteration": 2.9759459495544434 + }, + { + "auxiliary_loss_clip": 0.0163415, + "auxiliary_loss_mlp": 0.01059417, + "balance_loss_clip": 1.39170349, + "balance_loss_mlp": 1.02863693, + "epoch": 0.12734104915075906, + "flos": 16043015099520.0, + "grad_norm": 2.3986544726083583, + "language_loss": 0.87461388, + "learning_rate": 3.901425864420852e-06, + "loss": 0.90154952, + "num_input_tokens_seen": 45774280, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.30786133, + "step": 2118, + "time_per_iteration": 2.7875070571899414 + }, + { + "auxiliary_loss_clip": 0.0162712, + "auxiliary_loss_mlp": 0.01050362, + "balance_loss_clip": 1.38721049, + "balance_loss_mlp": 1.02127552, + "epoch": 0.12740117240342702, + "flos": 18269591126400.0, + "grad_norm": 2.0644264972240913, + "language_loss": 0.88818753, + "learning_rate": 3.901305067035068e-06, + "loss": 0.91496235, + "num_input_tokens_seen": 45792760, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.29125977, + "step": 2119, + "time_per_iteration": 2.9074556827545166 + }, + { + "auxiliary_loss_clip": 0.01644144, + "auxiliary_loss_mlp": 0.01055681, + "balance_loss_clip": 1.40146899, + "balance_loss_mlp": 1.02519894, + "epoch": 0.127461295656095, + "flos": 12127146773760.0, + "grad_norm": 2.328997574418648, + "language_loss": 0.89291978, + "learning_rate": 3.901184197551605e-06, + "loss": 0.91991794, + "num_input_tokens_seen": 45804300, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.30456543, + "step": 2120, + "time_per_iteration": 2.7332515716552734 + }, + { + "auxiliary_loss_clip": 0.0162673, + "auxiliary_loss_mlp": 0.0104853, + "balance_loss_clip": 1.38562417, + "balance_loss_mlp": 1.01872814, + "epoch": 0.12752141890876295, + "flos": 23159546910720.0, + "grad_norm": 2.5059785140602795, + "language_loss": 0.77517253, + "learning_rate": 3.901063255975046e-06, + "loss": 0.80192512, + "num_input_tokens_seen": 45823780, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.2980957, + "step": 2121, + "time_per_iteration": 2.8772099018096924 + }, + { + "auxiliary_loss_clip": 0.01634002, + "auxiliary_loss_mlp": 0.01056302, + "balance_loss_clip": 1.39123988, + "balance_loss_mlp": 1.02502179, + "epoch": 0.12758154216143094, + "flos": 21625870037760.0, + "grad_norm": 2.1846583325754696, + "language_loss": 0.84396738, + "learning_rate": 3.900942242309978e-06, + "loss": 0.87087047, + "num_input_tokens_seen": 45840495, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.31298828, + "step": 2122, + "time_per_iteration": 2.813204526901245 + }, + { + "auxiliary_loss_clip": 0.01658908, + "auxiliary_loss_mlp": 0.01059865, + "balance_loss_clip": 1.41334701, + "balance_loss_mlp": 1.02820265, + "epoch": 0.1276416654140989, + "flos": 15933440897280.0, + "grad_norm": 2.3287792807675896, + "language_loss": 0.80762899, + "learning_rate": 3.90082115656099e-06, + "loss": 0.83481669, + "num_input_tokens_seen": 45857735, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.31640625, + "step": 2123, + "time_per_iteration": 2.8450820446014404 + }, + { + "auxiliary_loss_clip": 0.01647326, + "auxiliary_loss_mlp": 0.010608, + "balance_loss_clip": 1.40568304, + "balance_loss_mlp": 1.02877998, + "epoch": 0.12770178866676687, + "flos": 22392889453440.0, + "grad_norm": 1.6073357798633612, + "language_loss": 0.80703634, + "learning_rate": 3.900699998732673e-06, + "loss": 0.83411753, + "num_input_tokens_seen": 45876485, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.32006836, + "step": 2124, + "time_per_iteration": 2.8448784351348877 + }, + { + "auxiliary_loss_clip": 0.01643872, + "auxiliary_loss_mlp": 0.01053042, + "balance_loss_clip": 1.39997065, + "balance_loss_mlp": 1.02314484, + "epoch": 0.12776191191943484, + "flos": 21662364608640.0, + "grad_norm": 2.05156327563068, + "language_loss": 0.77346551, + "learning_rate": 3.900578768829623e-06, + "loss": 0.80043471, + "num_input_tokens_seen": 45894645, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.29882812, + "step": 2125, + "time_per_iteration": 2.87032413482666 + }, + { + "auxiliary_loss_clip": 0.01644311, + "auxiliary_loss_mlp": 0.01050621, + "balance_loss_clip": 1.40218139, + "balance_loss_mlp": 1.01996028, + "epoch": 0.1278220351721028, + "flos": 25745639270400.0, + "grad_norm": 1.9608092027795136, + "language_loss": 0.79620743, + "learning_rate": 3.900457466856434e-06, + "loss": 0.82315677, + "num_input_tokens_seen": 45913755, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.30639648, + "step": 2126, + "time_per_iteration": 2.937234878540039 + }, + { + "auxiliary_loss_clip": 0.01656038, + "auxiliary_loss_mlp": 0.01048206, + "balance_loss_clip": 1.41027021, + "balance_loss_mlp": 1.01938105, + "epoch": 0.12788215842477077, + "flos": 41257641686400.0, + "grad_norm": 2.1356998171544808, + "language_loss": 0.70240831, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.7294507, + "num_input_tokens_seen": 45936095, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.28833008, + "step": 2127, + "time_per_iteration": 2.9747846126556396 + }, + { + "auxiliary_loss_clip": 0.01400837, + "auxiliary_loss_mlp": 0.01030895, + "balance_loss_clip": 1.26420188, + "balance_loss_mlp": 1.0126797, + "epoch": 0.12794228167743876, + "flos": 70911023201280.0, + "grad_norm": 0.8779028274893623, + "language_loss": 0.62926447, + "learning_rate": 3.900214646718047e-06, + "loss": 0.65358174, + "num_input_tokens_seen": 46004655, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.18261719, + "step": 2128, + "time_per_iteration": 3.4157347679138184 + }, + { + "auxiliary_loss_clip": 0.01661456, + "auxiliary_loss_mlp": 0.01055175, + "balance_loss_clip": 1.41325331, + "balance_loss_mlp": 1.02425194, + "epoch": 0.12800240493010673, + "flos": 16298386606080.0, + "grad_norm": 2.2614201913971264, + "language_loss": 0.79185015, + "learning_rate": 3.900093128562056e-06, + "loss": 0.81901646, + "num_input_tokens_seen": 46023610, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.30908203, + "step": 2129, + "time_per_iteration": 2.87589168548584 + }, + { + "auxiliary_loss_clip": 0.01677079, + "auxiliary_loss_mlp": 0.01063254, + "balance_loss_clip": 1.42477942, + "balance_loss_mlp": 1.03168774, + "epoch": 0.1280625281827747, + "flos": 20641331030400.0, + "grad_norm": 2.1950831985251407, + "language_loss": 0.80688375, + "learning_rate": 3.899971538354343e-06, + "loss": 0.83428705, + "num_input_tokens_seen": 46041725, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.31542969, + "step": 2130, + "time_per_iteration": 2.8276052474975586 + }, + { + "auxiliary_loss_clip": 0.01663917, + "auxiliary_loss_mlp": 0.01049147, + "balance_loss_clip": 1.41508496, + "balance_loss_mlp": 1.01905894, + "epoch": 0.12812265143544266, + "flos": 22648532428800.0, + "grad_norm": 3.473482200945053, + "language_loss": 0.72421002, + "learning_rate": 3.899849876099518e-06, + "loss": 0.75134063, + "num_input_tokens_seen": 46061095, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.30053711, + "step": 2131, + "time_per_iteration": 2.903099775314331 + }, + { + "auxiliary_loss_clip": 0.01645776, + "auxiliary_loss_mlp": 0.0105358, + "balance_loss_clip": 1.40505612, + "balance_loss_mlp": 1.02425408, + "epoch": 0.12818277468811062, + "flos": 34728054410880.0, + "grad_norm": 2.0677663901682437, + "language_loss": 0.73489857, + "learning_rate": 3.899728141802197e-06, + "loss": 0.76189214, + "num_input_tokens_seen": 46082670, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.29296875, + "step": 2132, + "time_per_iteration": 2.9577109813690186 + }, + { + "auxiliary_loss_clip": 0.01634018, + "auxiliary_loss_mlp": 0.01050213, + "balance_loss_clip": 1.39928627, + "balance_loss_mlp": 1.02101898, + "epoch": 0.1282428979407786, + "flos": 23122418912640.0, + "grad_norm": 20.025020147764657, + "language_loss": 0.82535523, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.85219759, + "num_input_tokens_seen": 46102410, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.29162598, + "step": 2133, + "time_per_iteration": 2.9344592094421387 + }, + { + "auxiliary_loss_clip": 0.01660342, + "auxiliary_loss_mlp": 0.0105712, + "balance_loss_clip": 1.41056514, + "balance_loss_mlp": 1.02610135, + "epoch": 0.12830302119344655, + "flos": 20896250088960.0, + "grad_norm": 2.3522356312797217, + "language_loss": 0.82245314, + "learning_rate": 3.899484457098528e-06, + "loss": 0.84962779, + "num_input_tokens_seen": 46121145, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.31030273, + "step": 2134, + "time_per_iteration": 2.827894926071167 + }, + { + "auxiliary_loss_clip": 0.0166065, + "auxiliary_loss_mlp": 0.01053011, + "balance_loss_clip": 1.41603231, + "balance_loss_mlp": 1.02182615, + "epoch": 0.12836314444611455, + "flos": 21407762263680.0, + "grad_norm": 1.8214021585310218, + "language_loss": 0.84393668, + "learning_rate": 3.899362506701421e-06, + "loss": 0.87107331, + "num_input_tokens_seen": 46140740, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.31201172, + "step": 2135, + "time_per_iteration": 2.9015562534332275 + }, + { + "auxiliary_loss_clip": 0.01652724, + "auxiliary_loss_mlp": 0.01056578, + "balance_loss_clip": 1.4110781, + "balance_loss_mlp": 1.02555943, + "epoch": 0.1284232676987825, + "flos": 13670370299520.0, + "grad_norm": 2.205625632692994, + "language_loss": 0.78079748, + "learning_rate": 3.899240484280298e-06, + "loss": 0.80789047, + "num_input_tokens_seen": 46156805, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.31005859, + "step": 2136, + "time_per_iteration": 2.8503968715667725 + }, + { + "auxiliary_loss_clip": 0.01413471, + "auxiliary_loss_mlp": 0.0103998, + "balance_loss_clip": 1.2771771, + "balance_loss_mlp": 1.02443528, + "epoch": 0.12848339095145048, + "flos": 60022927290240.0, + "grad_norm": 0.8938063797840368, + "language_loss": 0.59234917, + "learning_rate": 3.899118389839785e-06, + "loss": 0.6168837, + "num_input_tokens_seen": 46222085, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.15527344, + "step": 2137, + "time_per_iteration": 3.552185297012329 + }, + { + "auxiliary_loss_clip": 0.01661881, + "auxiliary_loss_mlp": 0.01056813, + "balance_loss_clip": 1.41612601, + "balance_loss_mlp": 1.02529454, + "epoch": 0.12854351420411844, + "flos": 13889337724800.0, + "grad_norm": 3.312919840058935, + "language_loss": 0.84773469, + "learning_rate": 3.898996223384512e-06, + "loss": 0.87492168, + "num_input_tokens_seen": 46239970, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.31518555, + "step": 2138, + "time_per_iteration": 2.888374090194702 + }, + { + "auxiliary_loss_clip": 0.01670092, + "auxiliary_loss_mlp": 0.01054141, + "balance_loss_clip": 1.42143655, + "balance_loss_mlp": 1.02314615, + "epoch": 0.1286036374567864, + "flos": 22648170470400.0, + "grad_norm": 2.2801957617192827, + "language_loss": 0.79686952, + "learning_rate": 3.898873984919113e-06, + "loss": 0.82411182, + "num_input_tokens_seen": 46257740, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.30981445, + "step": 2139, + "time_per_iteration": 2.861192464828491 + }, + { + "auxiliary_loss_clip": 0.01653429, + "auxiliary_loss_mlp": 0.01062905, + "balance_loss_clip": 1.40953338, + "balance_loss_mlp": 1.03179157, + "epoch": 0.12866376070945437, + "flos": 16333071384960.0, + "grad_norm": 2.2597444541968983, + "language_loss": 0.85780025, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.88496351, + "num_input_tokens_seen": 46275445, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.31103516, + "step": 2140, + "time_per_iteration": 2.907869338989258 + }, + { + "auxiliary_loss_clip": 0.01646678, + "auxiliary_loss_mlp": 0.0104885, + "balance_loss_clip": 1.40486908, + "balance_loss_mlp": 1.02164674, + "epoch": 0.12872388396212234, + "flos": 11881955347200.0, + "grad_norm": 1.877366993386417, + "language_loss": 0.86969358, + "learning_rate": 3.898629291976476e-06, + "loss": 0.89664888, + "num_input_tokens_seen": 46291710, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.2722168, + "step": 2141, + "time_per_iteration": 2.818969488143921 + }, + { + "auxiliary_loss_clip": 0.0168116, + "auxiliary_loss_mlp": 0.01062141, + "balance_loss_clip": 1.43166411, + "balance_loss_mlp": 1.02981114, + "epoch": 0.12878400721479033, + "flos": 28378542015360.0, + "grad_norm": 1.8534255426398376, + "language_loss": 0.7010262, + "learning_rate": 3.898506837508518e-06, + "loss": 0.72845924, + "num_input_tokens_seen": 46311335, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.32348633, + "step": 2142, + "time_per_iteration": 4.313765525817871 + }, + { + "auxiliary_loss_clip": 0.01679738, + "auxiliary_loss_mlp": 0.01054939, + "balance_loss_clip": 1.42919934, + "balance_loss_mlp": 1.02361059, + "epoch": 0.1288441304674583, + "flos": 25896820705920.0, + "grad_norm": 1.8872673997452314, + "language_loss": 0.84482837, + "learning_rate": 3.89838431104899e-06, + "loss": 0.87217516, + "num_input_tokens_seen": 46330985, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.31323242, + "step": 2143, + "time_per_iteration": 2.8924560546875 + }, + { + "auxiliary_loss_clip": 0.01683676, + "auxiliary_loss_mlp": 0.01058418, + "balance_loss_clip": 1.43367457, + "balance_loss_mlp": 1.02832925, + "epoch": 0.12890425372012626, + "flos": 20823577660800.0, + "grad_norm": 2.1884324994578424, + "language_loss": 0.82754421, + "learning_rate": 3.898261712602539e-06, + "loss": 0.85496509, + "num_input_tokens_seen": 46351295, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.30078125, + "step": 2144, + "time_per_iteration": 2.905400276184082 + }, + { + "auxiliary_loss_clip": 0.01663716, + "auxiliary_loss_mlp": 0.01057458, + "balance_loss_clip": 1.41558099, + "balance_loss_mlp": 1.02496171, + "epoch": 0.12896437697279423, + "flos": 22576312448640.0, + "grad_norm": 2.0666111985358246, + "language_loss": 0.80337083, + "learning_rate": 3.898139042173813e-06, + "loss": 0.83058262, + "num_input_tokens_seen": 46368600, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.32470703, + "step": 2145, + "time_per_iteration": 2.8358120918273926 + }, + { + "auxiliary_loss_clip": 0.01663341, + "auxiliary_loss_mlp": 0.01057656, + "balance_loss_clip": 1.41845322, + "balance_loss_mlp": 1.0260179, + "epoch": 0.1290245002254622, + "flos": 17502752689920.0, + "grad_norm": 2.0587666839129812, + "language_loss": 0.84065914, + "learning_rate": 3.898016299767465e-06, + "loss": 0.86786908, + "num_input_tokens_seen": 46387370, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.31640625, + "step": 2146, + "time_per_iteration": 2.9032530784606934 + }, + { + "auxiliary_loss_clip": 0.01675224, + "auxiliary_loss_mlp": 0.01063984, + "balance_loss_clip": 1.42948318, + "balance_loss_mlp": 1.03179729, + "epoch": 0.12908462347813016, + "flos": 36328657622400.0, + "grad_norm": 1.9352277580541777, + "language_loss": 0.71953797, + "learning_rate": 3.897893485388149e-06, + "loss": 0.74693, + "num_input_tokens_seen": 46409570, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.32177734, + "step": 2147, + "time_per_iteration": 2.953518867492676 + }, + { + "auxiliary_loss_clip": 0.01663753, + "auxiliary_loss_mlp": 0.01057842, + "balance_loss_clip": 1.41687703, + "balance_loss_mlp": 1.02694261, + "epoch": 0.12914474673079815, + "flos": 22538958226560.0, + "grad_norm": 2.070826211581388, + "language_loss": 0.72703642, + "learning_rate": 3.897770599040521e-06, + "loss": 0.75425231, + "num_input_tokens_seen": 46429320, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.30859375, + "step": 2148, + "time_per_iteration": 4.31352972984314 + }, + { + "auxiliary_loss_clip": 0.0165272, + "auxiliary_loss_mlp": 0.01053122, + "balance_loss_clip": 1.40921736, + "balance_loss_mlp": 1.02467871, + "epoch": 0.12920486998346611, + "flos": 21482199239040.0, + "grad_norm": 1.7283897742699852, + "language_loss": 0.79671329, + "learning_rate": 3.897647640729242e-06, + "loss": 0.82377172, + "num_input_tokens_seen": 46450155, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.28466797, + "step": 2149, + "time_per_iteration": 2.9508273601531982 + }, + { + "auxiliary_loss_clip": 0.0166817, + "auxiliary_loss_mlp": 0.01057679, + "balance_loss_clip": 1.41966498, + "balance_loss_mlp": 1.02635098, + "epoch": 0.12926499323613408, + "flos": 27320108970240.0, + "grad_norm": 2.773146694881116, + "language_loss": 0.77161229, + "learning_rate": 3.897524610458975e-06, + "loss": 0.7988708, + "num_input_tokens_seen": 46470280, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.31347656, + "step": 2150, + "time_per_iteration": 5.824554681777954 + }, + { + "auxiliary_loss_clip": 0.01669165, + "auxiliary_loss_mlp": 0.01052803, + "balance_loss_clip": 1.4202714, + "balance_loss_mlp": 1.02394247, + "epoch": 0.12932511648880204, + "flos": 22101340089600.0, + "grad_norm": 2.2231695450853355, + "language_loss": 0.72308683, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.75030649, + "num_input_tokens_seen": 46487605, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.28845215, + "step": 2151, + "time_per_iteration": 2.897841215133667 + }, + { + "auxiliary_loss_clip": 0.01677673, + "auxiliary_loss_mlp": 0.01052071, + "balance_loss_clip": 1.43147624, + "balance_loss_mlp": 1.02232814, + "epoch": 0.12938523974147, + "flos": 20312382199680.0, + "grad_norm": 2.16190823645622, + "language_loss": 0.85344219, + "learning_rate": 3.897278334060137e-06, + "loss": 0.88073957, + "num_input_tokens_seen": 46505100, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.29724121, + "step": 2152, + "time_per_iteration": 2.818711757659912 + }, + { + "auxiliary_loss_clip": 0.01676267, + "auxiliary_loss_mlp": 0.01056459, + "balance_loss_clip": 1.43103302, + "balance_loss_mlp": 1.02707398, + "epoch": 0.12944536299413797, + "flos": 19509320661120.0, + "grad_norm": 6.404591517578225, + "language_loss": 0.80300915, + "learning_rate": 3.897155087940906e-06, + "loss": 0.83033645, + "num_input_tokens_seen": 46524020, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.29382324, + "step": 2153, + "time_per_iteration": 2.906743288040161 + }, + { + "auxiliary_loss_clip": 0.01666458, + "auxiliary_loss_mlp": 0.01057693, + "balance_loss_clip": 1.41959655, + "balance_loss_mlp": 1.02653217, + "epoch": 0.12950548624680594, + "flos": 27719060785920.0, + "grad_norm": 1.607278894727942, + "language_loss": 0.81345147, + "learning_rate": 3.897031769881364e-06, + "loss": 0.840693, + "num_input_tokens_seen": 46544640, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.31152344, + "step": 2154, + "time_per_iteration": 2.86448335647583 + }, + { + "auxiliary_loss_clip": 0.0166299, + "auxiliary_loss_mlp": 0.01048907, + "balance_loss_clip": 1.41845918, + "balance_loss_mlp": 1.01826978, + "epoch": 0.12956560949947393, + "flos": 17574474977280.0, + "grad_norm": 1.8849033281997272, + "language_loss": 0.84774804, + "learning_rate": 3.896908379886188e-06, + "loss": 0.87486702, + "num_input_tokens_seen": 46561395, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.30639648, + "step": 2155, + "time_per_iteration": 2.8518054485321045 + }, + { + "auxiliary_loss_clip": 0.01671121, + "auxiliary_loss_mlp": 0.0105642, + "balance_loss_clip": 1.42017663, + "balance_loss_mlp": 1.02647495, + "epoch": 0.1296257327521419, + "flos": 20750452784640.0, + "grad_norm": 2.4378881257668628, + "language_loss": 0.77400887, + "learning_rate": 3.896784917960055e-06, + "loss": 0.80128425, + "num_input_tokens_seen": 46579395, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.29980469, + "step": 2156, + "time_per_iteration": 2.8601319789886475 + }, + { + "auxiliary_loss_clip": 0.01666537, + "auxiliary_loss_mlp": 0.01052203, + "balance_loss_clip": 1.4217602, + "balance_loss_mlp": 1.02344966, + "epoch": 0.12968585600480986, + "flos": 16403526817920.0, + "grad_norm": 1.6750137313956903, + "language_loss": 0.87515533, + "learning_rate": 3.896661384107648e-06, + "loss": 0.9023428, + "num_input_tokens_seen": 46597090, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.28808594, + "step": 2157, + "time_per_iteration": 2.8645756244659424 + }, + { + "auxiliary_loss_clip": 0.01671601, + "auxiliary_loss_mlp": 0.01056503, + "balance_loss_clip": 1.41953635, + "balance_loss_mlp": 1.02560413, + "epoch": 0.12974597925747783, + "flos": 28341187793280.0, + "grad_norm": 4.010860267270154, + "language_loss": 0.81924343, + "learning_rate": 3.896537778333651e-06, + "loss": 0.84652448, + "num_input_tokens_seen": 46617355, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.30908203, + "step": 2158, + "time_per_iteration": 2.92305588722229 + }, + { + "auxiliary_loss_clip": 0.01679802, + "auxiliary_loss_mlp": 0.01060013, + "balance_loss_clip": 1.42805564, + "balance_loss_mlp": 1.02899432, + "epoch": 0.1298061025101458, + "flos": 9689249692800.0, + "grad_norm": 2.1952580099865515, + "language_loss": 0.76318783, + "learning_rate": 3.896414100642752e-06, + "loss": 0.79058599, + "num_input_tokens_seen": 46633130, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.31005859, + "step": 2159, + "time_per_iteration": 2.8082902431488037 + }, + { + "auxiliary_loss_clip": 0.01654883, + "auxiliary_loss_mlp": 0.01049785, + "balance_loss_clip": 1.41231084, + "balance_loss_mlp": 1.0197444, + "epoch": 0.12986622576281376, + "flos": 27721096801920.0, + "grad_norm": 1.8247818214995024, + "language_loss": 0.83896816, + "learning_rate": 3.89629035103964e-06, + "loss": 0.86601484, + "num_input_tokens_seen": 46650575, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.30029297, + "step": 2160, + "time_per_iteration": 2.9445321559906006 + }, + { + "auxiliary_loss_clip": 0.01646985, + "auxiliary_loss_mlp": 0.01054104, + "balance_loss_clip": 1.40744376, + "balance_loss_mlp": 1.02413523, + "epoch": 0.12992634901548175, + "flos": 18810856396800.0, + "grad_norm": 1.488998792657995, + "language_loss": 0.82969612, + "learning_rate": 3.896166529529008e-06, + "loss": 0.85670698, + "num_input_tokens_seen": 46668780, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.29980469, + "step": 2161, + "time_per_iteration": 2.8418636322021484 + }, + { + "auxiliary_loss_clip": 0.0166843, + "auxiliary_loss_mlp": 0.01053073, + "balance_loss_clip": 1.4221139, + "balance_loss_mlp": 1.02224565, + "epoch": 0.12998647226814972, + "flos": 29138231773440.0, + "grad_norm": 2.2169673703595696, + "language_loss": 0.83763087, + "learning_rate": 3.896042636115551e-06, + "loss": 0.86484593, + "num_input_tokens_seen": 46687550, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.30810547, + "step": 2162, + "time_per_iteration": 2.91082501411438 + }, + { + "auxiliary_loss_clip": 0.01661861, + "auxiliary_loss_mlp": 0.01061619, + "balance_loss_clip": 1.41247511, + "balance_loss_mlp": 1.03060031, + "epoch": 0.13004659552081768, + "flos": 19583259943680.0, + "grad_norm": 2.940988713033312, + "language_loss": 0.74394357, + "learning_rate": 3.895918670803968e-06, + "loss": 0.77117836, + "num_input_tokens_seen": 46706730, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.31030273, + "step": 2163, + "time_per_iteration": 2.8295228481292725 + }, + { + "auxiliary_loss_clip": 0.01671608, + "auxiliary_loss_mlp": 0.01059392, + "balance_loss_clip": 1.42239141, + "balance_loss_mlp": 1.02768254, + "epoch": 0.13010671877348565, + "flos": 22500563374080.0, + "grad_norm": 2.783218457135964, + "language_loss": 0.82418096, + "learning_rate": 3.895794633598958e-06, + "loss": 0.85149097, + "num_input_tokens_seen": 46724250, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.31665039, + "step": 2164, + "time_per_iteration": 2.869309186935425 + }, + { + "auxiliary_loss_clip": 0.01671625, + "auxiliary_loss_mlp": 0.01051001, + "balance_loss_clip": 1.42148745, + "balance_loss_mlp": 1.02112758, + "epoch": 0.1301668420261536, + "flos": 23888714411520.0, + "grad_norm": 2.0251328738122054, + "language_loss": 0.73176056, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.75898683, + "num_input_tokens_seen": 46744105, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.29907227, + "step": 2165, + "time_per_iteration": 2.9003772735595703 + }, + { + "auxiliary_loss_clip": 0.0169594, + "auxiliary_loss_mlp": 0.01054281, + "balance_loss_clip": 1.44019222, + "balance_loss_mlp": 1.02235699, + "epoch": 0.13022696527882158, + "flos": 23160768520320.0, + "grad_norm": 2.1938649059543223, + "language_loss": 0.76345921, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.79096144, + "num_input_tokens_seen": 46764250, + "router_z_loss_clip": 2.55664062, + "router_z_loss_mlp": 0.31933594, + "step": 2166, + "time_per_iteration": 2.8436427116394043 + }, + { + "auxiliary_loss_clip": 0.01674011, + "auxiliary_loss_mlp": 0.01062571, + "balance_loss_clip": 1.42444944, + "balance_loss_mlp": 1.0335083, + "epoch": 0.13028708853148954, + "flos": 26920342748160.0, + "grad_norm": 1.7274862966768487, + "language_loss": 0.8406989, + "learning_rate": 3.895422090670421e-06, + "loss": 0.86806476, + "num_input_tokens_seen": 46786865, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.29052734, + "step": 2167, + "time_per_iteration": 2.9636118412017822 + }, + { + "auxiliary_loss_clip": 0.01687372, + "auxiliary_loss_mlp": 0.0106593, + "balance_loss_clip": 1.43953013, + "balance_loss_mlp": 1.03350472, + "epoch": 0.13034721178415754, + "flos": 21261331532160.0, + "grad_norm": 12.55163550913738, + "language_loss": 0.84672737, + "learning_rate": 3.89529776593877e-06, + "loss": 0.87426037, + "num_input_tokens_seen": 46807030, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.32421875, + "step": 2168, + "time_per_iteration": 2.8446903228759766 + }, + { + "auxiliary_loss_clip": 0.01684256, + "auxiliary_loss_mlp": 0.01062946, + "balance_loss_clip": 1.43317986, + "balance_loss_mlp": 1.03114057, + "epoch": 0.1304073350368255, + "flos": 18775628680320.0, + "grad_norm": 2.077894100124159, + "language_loss": 0.81089401, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.83836603, + "num_input_tokens_seen": 46826280, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.31787109, + "step": 2169, + "time_per_iteration": 2.9294979572296143 + }, + { + "auxiliary_loss_clip": 0.01696296, + "auxiliary_loss_mlp": 0.01052024, + "balance_loss_clip": 1.44444036, + "balance_loss_mlp": 1.02183986, + "epoch": 0.13046745828949347, + "flos": 28375963061760.0, + "grad_norm": 2.284785154246083, + "language_loss": 0.68972051, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.71720374, + "num_input_tokens_seen": 46846505, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.30200195, + "step": 2170, + "time_per_iteration": 2.9302124977111816 + }, + { + "auxiliary_loss_clip": 0.01678192, + "auxiliary_loss_mlp": 0.01051462, + "balance_loss_clip": 1.42814541, + "balance_loss_mlp": 1.02146876, + "epoch": 0.13052758154216143, + "flos": 29616642737280.0, + "grad_norm": 1.6425558998549556, + "language_loss": 0.68182331, + "learning_rate": 3.8949243605434e-06, + "loss": 0.70911986, + "num_input_tokens_seen": 46867380, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.29980469, + "step": 2171, + "time_per_iteration": 2.923474073410034 + }, + { + "auxiliary_loss_clip": 0.01680082, + "auxiliary_loss_mlp": 0.01058753, + "balance_loss_clip": 1.42955065, + "balance_loss_mlp": 1.02798557, + "epoch": 0.1305877047948294, + "flos": 19400289396480.0, + "grad_norm": 1.867756000167489, + "language_loss": 0.73864174, + "learning_rate": 3.894799748360537e-06, + "loss": 0.76603007, + "num_input_tokens_seen": 46886810, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.30749512, + "step": 2172, + "time_per_iteration": 2.8557705879211426 + }, + { + "auxiliary_loss_clip": 0.01666457, + "auxiliary_loss_mlp": 0.01058209, + "balance_loss_clip": 1.42362571, + "balance_loss_mlp": 1.0287168, + "epoch": 0.13064782804749736, + "flos": 16882344984960.0, + "grad_norm": 1.8194503484672866, + "language_loss": 0.7670002, + "learning_rate": 3.894675064326678e-06, + "loss": 0.79424685, + "num_input_tokens_seen": 46905620, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.29455566, + "step": 2173, + "time_per_iteration": 2.820206880569458 + }, + { + "auxiliary_loss_clip": 0.01680233, + "auxiliary_loss_mlp": 0.0105648, + "balance_loss_clip": 1.42724478, + "balance_loss_mlp": 1.02493751, + "epoch": 0.13070795130016533, + "flos": 24509891278080.0, + "grad_norm": 3.5695277333585893, + "language_loss": 0.72582513, + "learning_rate": 3.894550308446551e-06, + "loss": 0.75319231, + "num_input_tokens_seen": 46925120, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.31494141, + "step": 2174, + "time_per_iteration": 2.908653736114502 + }, + { + "auxiliary_loss_clip": 0.01425119, + "auxiliary_loss_mlp": 0.01033162, + "balance_loss_clip": 1.28536522, + "balance_loss_mlp": 1.0155195, + "epoch": 0.13076807455283332, + "flos": 71086754580480.0, + "grad_norm": 0.8526190183813145, + "language_loss": 0.59141231, + "learning_rate": 3.894425480724886e-06, + "loss": 0.61599517, + "num_input_tokens_seen": 46988195, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.17675781, + "step": 2175, + "time_per_iteration": 3.5488102436065674 + }, + { + "auxiliary_loss_clip": 0.01673109, + "auxiliary_loss_mlp": 0.01062642, + "balance_loss_clip": 1.42248654, + "balance_loss_mlp": 1.03291154, + "epoch": 0.13082819780550128, + "flos": 20273625388800.0, + "grad_norm": 5.281326316603206, + "language_loss": 0.81397521, + "learning_rate": 3.894300581166417e-06, + "loss": 0.84133267, + "num_input_tokens_seen": 47004720, + "router_z_loss_clip": 2.5078125, + "router_z_loss_mlp": 0.296875, + "step": 2176, + "time_per_iteration": 4.290143966674805 + }, + { + "auxiliary_loss_clip": 0.01670321, + "auxiliary_loss_mlp": 0.01069409, + "balance_loss_clip": 1.420578, + "balance_loss_mlp": 1.03383732, + "epoch": 0.13088832105816925, + "flos": 34216542236160.0, + "grad_norm": 5.963752286514603, + "language_loss": 0.75450838, + "learning_rate": 3.894175609775881e-06, + "loss": 0.78190571, + "num_input_tokens_seen": 47024255, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.35546875, + "step": 2177, + "time_per_iteration": 3.013387441635132 + }, + { + "auxiliary_loss_clip": 0.01677081, + "auxiliary_loss_mlp": 0.01070559, + "balance_loss_clip": 1.43039322, + "balance_loss_mlp": 1.03877783, + "epoch": 0.13094844431083721, + "flos": 17904011990400.0, + "grad_norm": 1.9404430617681954, + "language_loss": 0.83576047, + "learning_rate": 3.894050566558015e-06, + "loss": 0.86323684, + "num_input_tokens_seen": 47042465, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.31762695, + "step": 2178, + "time_per_iteration": 3.0000176429748535 + }, + { + "auxiliary_loss_clip": 0.01671013, + "auxiliary_loss_mlp": 0.01071914, + "balance_loss_clip": 1.42280257, + "balance_loss_mlp": 1.04282701, + "epoch": 0.13100856756350518, + "flos": 17320234590720.0, + "grad_norm": 2.2237775612098654, + "language_loss": 0.76001209, + "learning_rate": 3.893925451517562e-06, + "loss": 0.78744137, + "num_input_tokens_seen": 47060370, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.29052734, + "step": 2179, + "time_per_iteration": 2.962258815765381 + }, + { + "auxiliary_loss_clip": 0.0167029, + "auxiliary_loss_mlp": 0.01074989, + "balance_loss_clip": 1.42477083, + "balance_loss_mlp": 1.04511487, + "epoch": 0.13106869081617314, + "flos": 22210778557440.0, + "grad_norm": 2.143252182243673, + "language_loss": 0.85702217, + "learning_rate": 3.893800264659266e-06, + "loss": 0.88447499, + "num_input_tokens_seen": 47081415, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.29882812, + "step": 2180, + "time_per_iteration": 3.087979793548584 + }, + { + "auxiliary_loss_clip": 0.01668468, + "auxiliary_loss_mlp": 0.01083586, + "balance_loss_clip": 1.42126429, + "balance_loss_mlp": 1.05303311, + "epoch": 0.13112881406884114, + "flos": 21773341399680.0, + "grad_norm": 2.1246501485878273, + "language_loss": 0.90869927, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.93621981, + "num_input_tokens_seen": 47099860, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.30578613, + "step": 2181, + "time_per_iteration": 2.83451509475708 + }, + { + "auxiliary_loss_clip": 0.0168031, + "auxiliary_loss_mlp": 0.01077112, + "balance_loss_clip": 1.4284761, + "balance_loss_mlp": 1.04592705, + "epoch": 0.1311889373215091, + "flos": 23341476827520.0, + "grad_norm": 1.8219470839792329, + "language_loss": 0.7020452, + "learning_rate": 3.893549675508137e-06, + "loss": 0.72961938, + "num_input_tokens_seen": 47118540, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.31152344, + "step": 2182, + "time_per_iteration": 2.8762240409851074 + }, + { + "auxiliary_loss_clip": 0.01692515, + "auxiliary_loss_mlp": 0.01076729, + "balance_loss_clip": 1.43779349, + "balance_loss_mlp": 1.04411364, + "epoch": 0.13124906057417707, + "flos": 21476498394240.0, + "grad_norm": 5.007251636202205, + "language_loss": 0.79953742, + "learning_rate": 3.893424273224806e-06, + "loss": 0.8272298, + "num_input_tokens_seen": 47136710, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.32641602, + "step": 2183, + "time_per_iteration": 4.395808696746826 + }, + { + "auxiliary_loss_clip": 0.01670554, + "auxiliary_loss_mlp": 0.01070114, + "balance_loss_clip": 1.42289698, + "balance_loss_mlp": 1.03878617, + "epoch": 0.13130918382684503, + "flos": 23264958591360.0, + "grad_norm": 1.65536070981637, + "language_loss": 0.86277682, + "learning_rate": 3.893298799142636e-06, + "loss": 0.89018351, + "num_input_tokens_seen": 47157155, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.31323242, + "step": 2184, + "time_per_iteration": 2.840151071548462 + }, + { + "auxiliary_loss_clip": 0.01686953, + "auxiliary_loss_mlp": 0.0107039, + "balance_loss_clip": 1.4372921, + "balance_loss_mlp": 1.03934765, + "epoch": 0.131369307079513, + "flos": 20860072231680.0, + "grad_norm": 1.9195277516365847, + "language_loss": 0.83438444, + "learning_rate": 3.893173253266387e-06, + "loss": 0.86195791, + "num_input_tokens_seen": 47176820, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.31054688, + "step": 2185, + "time_per_iteration": 5.720970392227173 + }, + { + "auxiliary_loss_clip": 0.01679977, + "auxiliary_loss_mlp": 0.01079559, + "balance_loss_clip": 1.42787266, + "balance_loss_mlp": 1.04443979, + "epoch": 0.13142943033218096, + "flos": 17867336440320.0, + "grad_norm": 1.7468460621663786, + "language_loss": 0.74012113, + "learning_rate": 3.893047635600818e-06, + "loss": 0.76771653, + "num_input_tokens_seen": 47195855, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.35131836, + "step": 2186, + "time_per_iteration": 2.8482980728149414 + }, + { + "auxiliary_loss_clip": 0.01672699, + "auxiliary_loss_mlp": 0.01067711, + "balance_loss_clip": 1.42355955, + "balance_loss_mlp": 1.03440392, + "epoch": 0.13148955358484893, + "flos": 21005824291200.0, + "grad_norm": 2.3476334813928776, + "language_loss": 0.81390864, + "learning_rate": 3.892921946150693e-06, + "loss": 0.84131277, + "num_input_tokens_seen": 47214535, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.33325195, + "step": 2187, + "time_per_iteration": 2.939473867416382 + }, + { + "auxiliary_loss_clip": 0.0139705, + "auxiliary_loss_mlp": 0.01042524, + "balance_loss_clip": 1.25633216, + "balance_loss_mlp": 1.02583456, + "epoch": 0.13154967683751692, + "flos": 70202831304960.0, + "grad_norm": 0.8398198373172424, + "language_loss": 0.59166908, + "learning_rate": 3.892796184920778e-06, + "loss": 0.61606479, + "num_input_tokens_seen": 47270300, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.16699219, + "step": 2188, + "time_per_iteration": 3.348147392272949 + }, + { + "auxiliary_loss_clip": 0.01681324, + "auxiliary_loss_mlp": 0.01066113, + "balance_loss_clip": 1.43437457, + "balance_loss_mlp": 1.0346421, + "epoch": 0.1316098000901849, + "flos": 20385869034240.0, + "grad_norm": 1.718416462691601, + "language_loss": 0.75070918, + "learning_rate": 3.892670351915842e-06, + "loss": 0.77818358, + "num_input_tokens_seen": 47290720, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.31445312, + "step": 2189, + "time_per_iteration": 2.8964555263519287 + }, + { + "auxiliary_loss_clip": 0.01682458, + "auxiliary_loss_mlp": 0.0105622, + "balance_loss_clip": 1.43206394, + "balance_loss_mlp": 1.02498674, + "epoch": 0.13166992334285285, + "flos": 23231359687680.0, + "grad_norm": 2.0466837355810723, + "language_loss": 0.73773986, + "learning_rate": 3.892544447140657e-06, + "loss": 0.76512665, + "num_input_tokens_seen": 47311820, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.31225586, + "step": 2190, + "time_per_iteration": 2.8651578426361084 + }, + { + "auxiliary_loss_clip": 0.01669767, + "auxiliary_loss_mlp": 0.01065176, + "balance_loss_clip": 1.42175555, + "balance_loss_mlp": 1.03456295, + "epoch": 0.13173004659552082, + "flos": 23341295848320.0, + "grad_norm": 2.2119454856833625, + "language_loss": 0.75608462, + "learning_rate": 3.892418470599996e-06, + "loss": 0.78343403, + "num_input_tokens_seen": 47331605, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.30566406, + "step": 2191, + "time_per_iteration": 2.8664255142211914 + }, + { + "auxiliary_loss_clip": 0.01678812, + "auxiliary_loss_mlp": 0.01059315, + "balance_loss_clip": 1.42433667, + "balance_loss_mlp": 1.02734351, + "epoch": 0.13179016984818878, + "flos": 21261376776960.0, + "grad_norm": 2.046379818784184, + "language_loss": 0.79937422, + "learning_rate": 3.892292422298637e-06, + "loss": 0.82675552, + "num_input_tokens_seen": 47350455, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.31958008, + "step": 2192, + "time_per_iteration": 2.877703905105591 + }, + { + "auxiliary_loss_clip": 0.01690361, + "auxiliary_loss_mlp": 0.01060155, + "balance_loss_clip": 1.43742418, + "balance_loss_mlp": 1.02618027, + "epoch": 0.13185029310085675, + "flos": 17785931765760.0, + "grad_norm": 1.9960683765863096, + "language_loss": 0.86217225, + "learning_rate": 3.892166302241361e-06, + "loss": 0.88967735, + "num_input_tokens_seen": 47368225, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.33959961, + "step": 2193, + "time_per_iteration": 2.8532724380493164 + }, + { + "auxiliary_loss_clip": 0.01398099, + "auxiliary_loss_mlp": 0.0103231, + "balance_loss_clip": 1.25543666, + "balance_loss_mlp": 1.01695621, + "epoch": 0.1319104163535247, + "flos": 69884062554240.0, + "grad_norm": 0.8143482546912102, + "language_loss": 0.54079807, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.5651021, + "num_input_tokens_seen": 47427125, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.15332031, + "step": 2194, + "time_per_iteration": 3.383409023284912 + }, + { + "auxiliary_loss_clip": 0.01681369, + "auxiliary_loss_mlp": 0.01061548, + "balance_loss_clip": 1.43191123, + "balance_loss_mlp": 1.02857459, + "epoch": 0.1319705396061927, + "flos": 25204283510400.0, + "grad_norm": 1.642422156053264, + "language_loss": 0.73202914, + "learning_rate": 3.891913846878185e-06, + "loss": 0.7594583, + "num_input_tokens_seen": 47450275, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.3293457, + "step": 2195, + "time_per_iteration": 2.8602216243743896 + }, + { + "auxiliary_loss_clip": 0.01690039, + "auxiliary_loss_mlp": 0.01058075, + "balance_loss_clip": 1.43334174, + "balance_loss_mlp": 1.02600741, + "epoch": 0.13203066285886067, + "flos": 20750136071040.0, + "grad_norm": 1.5969099195726997, + "language_loss": 0.79383564, + "learning_rate": 3.891787511581859e-06, + "loss": 0.82131672, + "num_input_tokens_seen": 47469155, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.32080078, + "step": 2196, + "time_per_iteration": 2.8809094429016113 + }, + { + "auxiliary_loss_clip": 0.01690226, + "auxiliary_loss_mlp": 0.01065472, + "balance_loss_clip": 1.43266702, + "balance_loss_mlp": 1.03378665, + "epoch": 0.13209078611152864, + "flos": 22064528805120.0, + "grad_norm": 1.9168544986567346, + "language_loss": 0.76285124, + "learning_rate": 3.89166110454876e-06, + "loss": 0.79040825, + "num_input_tokens_seen": 47488405, + "router_z_loss_clip": 2.57226562, + "router_z_loss_mlp": 0.31677246, + "step": 2197, + "time_per_iteration": 2.877366781234741 + }, + { + "auxiliary_loss_clip": 0.01710094, + "auxiliary_loss_mlp": 0.01062606, + "balance_loss_clip": 1.4497025, + "balance_loss_mlp": 1.02872682, + "epoch": 0.1321509093641966, + "flos": 16292369047680.0, + "grad_norm": 2.121745585866298, + "language_loss": 0.81095088, + "learning_rate": 3.891534625783685e-06, + "loss": 0.83867788, + "num_input_tokens_seen": 47505650, + "router_z_loss_clip": 2.60546875, + "router_z_loss_mlp": 0.33886719, + "step": 2198, + "time_per_iteration": 2.793059825897217 + }, + { + "auxiliary_loss_clip": 0.01675411, + "auxiliary_loss_mlp": 0.01067863, + "balance_loss_clip": 1.42110932, + "balance_loss_mlp": 1.03386486, + "epoch": 0.13221103261686457, + "flos": 16991873942400.0, + "grad_norm": 2.2277847034763516, + "language_loss": 0.84749401, + "learning_rate": 3.891408075291425e-06, + "loss": 0.87492681, + "num_input_tokens_seen": 47521540, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.34008789, + "step": 2199, + "time_per_iteration": 2.83859920501709 + }, + { + "auxiliary_loss_clip": 0.01676103, + "auxiliary_loss_mlp": 0.0106285, + "balance_loss_clip": 1.42487001, + "balance_loss_mlp": 1.0287323, + "epoch": 0.13227115586953253, + "flos": 34245073722240.0, + "grad_norm": 1.639017068424933, + "language_loss": 0.70688629, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.73427582, + "num_input_tokens_seen": 47543625, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.34106445, + "step": 2200, + "time_per_iteration": 2.9393491744995117 + }, + { + "auxiliary_loss_clip": 0.01662444, + "auxiliary_loss_mlp": 0.01062167, + "balance_loss_clip": 1.41594625, + "balance_loss_mlp": 1.02847874, + "epoch": 0.13233127912220052, + "flos": 20714501151360.0, + "grad_norm": 2.7658445640170486, + "language_loss": 0.86538011, + "learning_rate": 3.891154759144557e-06, + "loss": 0.89262617, + "num_input_tokens_seen": 47563740, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.33666992, + "step": 2201, + "time_per_iteration": 2.8718395233154297 + }, + { + "auxiliary_loss_clip": 0.01684257, + "auxiliary_loss_mlp": 0.01056494, + "balance_loss_clip": 1.42944264, + "balance_loss_mlp": 1.02385485, + "epoch": 0.1323914023748685, + "flos": 25814239666560.0, + "grad_norm": 1.816227252080514, + "language_loss": 0.87703508, + "learning_rate": 3.891027993499554e-06, + "loss": 0.90444261, + "num_input_tokens_seen": 47582655, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.32641602, + "step": 2202, + "time_per_iteration": 2.9304707050323486 + }, + { + "auxiliary_loss_clip": 0.01699439, + "auxiliary_loss_mlp": 0.01061578, + "balance_loss_clip": 1.44528806, + "balance_loss_mlp": 1.02967763, + "epoch": 0.13245152562753645, + "flos": 21261422021760.0, + "grad_norm": 1.9334356759875198, + "language_loss": 0.73344547, + "learning_rate": 3.89090115614658e-06, + "loss": 0.76105565, + "num_input_tokens_seen": 47600875, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.31860352, + "step": 2203, + "time_per_iteration": 2.86452317237854 + }, + { + "auxiliary_loss_clip": 0.01682608, + "auxiliary_loss_mlp": 0.01062988, + "balance_loss_clip": 1.43185163, + "balance_loss_mlp": 1.02929997, + "epoch": 0.13251164888020442, + "flos": 26621192257920.0, + "grad_norm": 2.036001192286987, + "language_loss": 0.75164723, + "learning_rate": 3.890774247090444e-06, + "loss": 0.77910316, + "num_input_tokens_seen": 47619250, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.33666992, + "step": 2204, + "time_per_iteration": 2.9218883514404297 + }, + { + "auxiliary_loss_clip": 0.01684001, + "auxiliary_loss_mlp": 0.01062396, + "balance_loss_clip": 1.43226075, + "balance_loss_mlp": 1.02949381, + "epoch": 0.13257177213287238, + "flos": 29838867788160.0, + "grad_norm": 2.238147785848352, + "language_loss": 0.79478085, + "learning_rate": 3.89064726633596e-06, + "loss": 0.82224488, + "num_input_tokens_seen": 47639445, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.32910156, + "step": 2205, + "time_per_iteration": 2.8755714893341064 + }, + { + "auxiliary_loss_clip": 0.01668532, + "auxiliary_loss_mlp": 0.01057353, + "balance_loss_clip": 1.4202466, + "balance_loss_mlp": 1.02535689, + "epoch": 0.13263189538554035, + "flos": 21298459530240.0, + "grad_norm": 1.9028270300135417, + "language_loss": 0.80102271, + "learning_rate": 3.890520213887941e-06, + "loss": 0.82828152, + "num_input_tokens_seen": 47658740, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.31982422, + "step": 2206, + "time_per_iteration": 2.8843042850494385 + }, + { + "auxiliary_loss_clip": 0.01680659, + "auxiliary_loss_mlp": 0.01061332, + "balance_loss_clip": 1.42821443, + "balance_loss_mlp": 1.02978921, + "epoch": 0.13269201863820831, + "flos": 16883068901760.0, + "grad_norm": 1.9869978765794452, + "language_loss": 0.75133812, + "learning_rate": 3.890393089751208e-06, + "loss": 0.77875799, + "num_input_tokens_seen": 47676880, + "router_z_loss_clip": 2.5234375, + "router_z_loss_mlp": 0.31518555, + "step": 2207, + "time_per_iteration": 2.794328451156616 + }, + { + "auxiliary_loss_clip": 0.01659297, + "auxiliary_loss_mlp": 0.01052658, + "balance_loss_clip": 1.41533446, + "balance_loss_mlp": 1.02037621, + "epoch": 0.1327521418908763, + "flos": 23779411678080.0, + "grad_norm": 2.876504972094219, + "language_loss": 0.85398322, + "learning_rate": 3.890265893930578e-06, + "loss": 0.8811028, + "num_input_tokens_seen": 47696635, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.32275391, + "step": 2208, + "time_per_iteration": 2.8406734466552734 + }, + { + "auxiliary_loss_clip": 0.01652188, + "auxiliary_loss_mlp": 0.01054228, + "balance_loss_clip": 1.41390061, + "balance_loss_mlp": 1.0228281, + "epoch": 0.13281226514354427, + "flos": 26516866452480.0, + "grad_norm": 1.5230906910859807, + "language_loss": 0.86054933, + "learning_rate": 3.890138626430876e-06, + "loss": 0.88761348, + "num_input_tokens_seen": 47717760, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.31420898, + "step": 2209, + "time_per_iteration": 2.881795883178711 + }, + { + "auxiliary_loss_clip": 0.01677194, + "auxiliary_loss_mlp": 0.01060504, + "balance_loss_clip": 1.42701781, + "balance_loss_mlp": 1.02450287, + "epoch": 0.13287238839621224, + "flos": 24509484074880.0, + "grad_norm": 2.1293400123410406, + "language_loss": 0.83495939, + "learning_rate": 3.890011287256929e-06, + "loss": 0.8623364, + "num_input_tokens_seen": 47737685, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.36010742, + "step": 2210, + "time_per_iteration": 2.8589446544647217 + }, + { + "auxiliary_loss_clip": 0.01410452, + "auxiliary_loss_mlp": 0.01032948, + "balance_loss_clip": 1.26312733, + "balance_loss_mlp": 1.01311207, + "epoch": 0.1329325116488802, + "flos": 67724820069120.0, + "grad_norm": 0.7676414389135011, + "language_loss": 0.58099425, + "learning_rate": 3.889883876413563e-06, + "loss": 0.60542834, + "num_input_tokens_seen": 47802415, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.19824219, + "step": 2211, + "time_per_iteration": 4.889142274856567 + }, + { + "auxiliary_loss_clip": 0.01414147, + "auxiliary_loss_mlp": 0.01028562, + "balance_loss_clip": 1.26469052, + "balance_loss_mlp": 1.00319469, + "epoch": 0.13299263490154817, + "flos": 72295328430720.0, + "grad_norm": 0.7945020295737409, + "language_loss": 0.55375969, + "learning_rate": 3.889756393905611e-06, + "loss": 0.57818681, + "num_input_tokens_seen": 47871485, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.25390625, + "step": 2212, + "time_per_iteration": 3.3749260902404785 + }, + { + "auxiliary_loss_clip": 0.01676897, + "auxiliary_loss_mlp": 0.01061147, + "balance_loss_clip": 1.42343068, + "balance_loss_mlp": 1.02726805, + "epoch": 0.13305275815421613, + "flos": 17940280337280.0, + "grad_norm": 2.8677845111011604, + "language_loss": 0.76297772, + "learning_rate": 3.889628839737908e-06, + "loss": 0.79035819, + "num_input_tokens_seen": 47888315, + "router_z_loss_clip": 2.53515625, + "router_z_loss_mlp": 0.33886719, + "step": 2213, + "time_per_iteration": 2.8619086742401123 + }, + { + "auxiliary_loss_clip": 0.0165671, + "auxiliary_loss_mlp": 0.0105635, + "balance_loss_clip": 1.41372371, + "balance_loss_mlp": 1.02530766, + "epoch": 0.13311288140688413, + "flos": 22349698652160.0, + "grad_norm": 1.8675936190606444, + "language_loss": 0.80350614, + "learning_rate": 3.889501213915291e-06, + "loss": 0.83063668, + "num_input_tokens_seen": 47906600, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.31054688, + "step": 2214, + "time_per_iteration": 2.9495420455932617 + }, + { + "auxiliary_loss_clip": 0.01664263, + "auxiliary_loss_mlp": 0.01057818, + "balance_loss_clip": 1.41579175, + "balance_loss_mlp": 1.02548814, + "epoch": 0.1331730046595521, + "flos": 31881523127040.0, + "grad_norm": 2.1190203223969144, + "language_loss": 0.70033765, + "learning_rate": 3.889373516442597e-06, + "loss": 0.72755849, + "num_input_tokens_seen": 47927630, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.32348633, + "step": 2215, + "time_per_iteration": 2.9240376949310303 + }, + { + "auxiliary_loss_clip": 0.01663946, + "auxiliary_loss_mlp": 0.01052852, + "balance_loss_clip": 1.41442728, + "balance_loss_mlp": 1.02130914, + "epoch": 0.13323312791222006, + "flos": 22577217344640.0, + "grad_norm": 1.6538586919457354, + "language_loss": 0.81904364, + "learning_rate": 3.889245747324671e-06, + "loss": 0.84621167, + "num_input_tokens_seen": 47947935, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.31542969, + "step": 2216, + "time_per_iteration": 2.945469856262207 + }, + { + "auxiliary_loss_clip": 0.01669149, + "auxiliary_loss_mlp": 0.01061474, + "balance_loss_clip": 1.4211359, + "balance_loss_mlp": 1.02809513, + "epoch": 0.13329325116488802, + "flos": 15093568074240.0, + "grad_norm": 3.0655652030881173, + "language_loss": 0.89233387, + "learning_rate": 3.889117906566356e-06, + "loss": 0.91964006, + "num_input_tokens_seen": 47965515, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.33349609, + "step": 2217, + "time_per_iteration": 2.9750473499298096 + }, + { + "auxiliary_loss_clip": 0.01650362, + "auxiliary_loss_mlp": 0.01062849, + "balance_loss_clip": 1.40606153, + "balance_loss_mlp": 1.02541733, + "epoch": 0.133353374417556, + "flos": 27465046623360.0, + "grad_norm": 3.6468459193249894, + "language_loss": 0.75821477, + "learning_rate": 3.888989994172501e-06, + "loss": 0.78534687, + "num_input_tokens_seen": 47985675, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.37451172, + "step": 2218, + "time_per_iteration": 4.425408363342285 + }, + { + "auxiliary_loss_clip": 0.01667583, + "auxiliary_loss_mlp": 0.01052221, + "balance_loss_clip": 1.41787231, + "balance_loss_mlp": 1.01862812, + "epoch": 0.13341349767022395, + "flos": 24104605190400.0, + "grad_norm": 1.9672089722879005, + "language_loss": 0.88079703, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.9079951, + "num_input_tokens_seen": 48004985, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.3359375, + "step": 2219, + "time_per_iteration": 2.887049674987793 + }, + { + "auxiliary_loss_clip": 0.01691852, + "auxiliary_loss_mlp": 0.01068274, + "balance_loss_clip": 1.44059873, + "balance_loss_mlp": 1.03456175, + "epoch": 0.13347362092289192, + "flos": 24143814449280.0, + "grad_norm": 1.4661998144325583, + "language_loss": 0.78493011, + "learning_rate": 3.888733954497574e-06, + "loss": 0.81253147, + "num_input_tokens_seen": 48024965, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.33740234, + "step": 2220, + "time_per_iteration": 5.682001352310181 + }, + { + "auxiliary_loss_clip": 0.01660942, + "auxiliary_loss_mlp": 0.01060413, + "balance_loss_clip": 1.41361594, + "balance_loss_mlp": 1.02760613, + "epoch": 0.1335337441755599, + "flos": 18444236630400.0, + "grad_norm": 2.721538810129121, + "language_loss": 0.80234587, + "learning_rate": 3.888605827226212e-06, + "loss": 0.82955945, + "num_input_tokens_seen": 48040890, + "router_z_loss_clip": 2.47460938, + "router_z_loss_mlp": 0.328125, + "step": 2221, + "time_per_iteration": 3.0044450759887695 + }, + { + "auxiliary_loss_clip": 0.01372753, + "auxiliary_loss_mlp": 0.01026743, + "balance_loss_clip": 1.22806633, + "balance_loss_mlp": 1.00461757, + "epoch": 0.13359386742822787, + "flos": 50636221447680.0, + "grad_norm": 0.9701914452700214, + "language_loss": 0.69067019, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.71466517, + "num_input_tokens_seen": 48091855, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.22167969, + "step": 2222, + "time_per_iteration": 3.234295129776001 + }, + { + "auxiliary_loss_clip": 0.01668832, + "auxiliary_loss_mlp": 0.01064124, + "balance_loss_clip": 1.42110586, + "balance_loss_mlp": 1.03217626, + "epoch": 0.13365399068089584, + "flos": 22787452523520.0, + "grad_norm": 1.8615036882885343, + "language_loss": 0.67828512, + "learning_rate": 3.888349357839982e-06, + "loss": 0.70561469, + "num_input_tokens_seen": 48111350, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.31933594, + "step": 2223, + "time_per_iteration": 2.870704412460327 + }, + { + "auxiliary_loss_clip": 0.01674499, + "auxiliary_loss_mlp": 0.01065433, + "balance_loss_clip": 1.4228878, + "balance_loss_mlp": 1.02983713, + "epoch": 0.1337141139335638, + "flos": 12539264826240.0, + "grad_norm": 1.8214662679262708, + "language_loss": 0.83334225, + "learning_rate": 3.88822101573484e-06, + "loss": 0.86074162, + "num_input_tokens_seen": 48129840, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.35595703, + "step": 2224, + "time_per_iteration": 2.8814492225646973 + }, + { + "auxiliary_loss_clip": 0.01668246, + "auxiliary_loss_mlp": 0.01063721, + "balance_loss_clip": 1.41306114, + "balance_loss_mlp": 1.03031802, + "epoch": 0.13377423718623177, + "flos": 23049203546880.0, + "grad_norm": 2.3635478795185145, + "language_loss": 0.6802423, + "learning_rate": 3.888092602028167e-06, + "loss": 0.70756197, + "num_input_tokens_seen": 48149240, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.33398438, + "step": 2225, + "time_per_iteration": 2.8472397327423096 + }, + { + "auxiliary_loss_clip": 0.01668403, + "auxiliary_loss_mlp": 0.01065517, + "balance_loss_clip": 1.41924739, + "balance_loss_mlp": 1.03232932, + "epoch": 0.13383436043889974, + "flos": 16224945016320.0, + "grad_norm": 2.2150772197248263, + "language_loss": 0.91311574, + "learning_rate": 3.887964116724835e-06, + "loss": 0.94045496, + "num_input_tokens_seen": 48166330, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.33203125, + "step": 2226, + "time_per_iteration": 2.8394641876220703 + }, + { + "auxiliary_loss_clip": 0.01662154, + "auxiliary_loss_mlp": 0.01066676, + "balance_loss_clip": 1.41052139, + "balance_loss_mlp": 1.03239179, + "epoch": 0.1338944836915677, + "flos": 24290290425600.0, + "grad_norm": 17.462183445264127, + "language_loss": 0.75271404, + "learning_rate": 3.887835559829712e-06, + "loss": 0.78000236, + "num_input_tokens_seen": 48187600, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.34301758, + "step": 2227, + "time_per_iteration": 2.850189447402954 + }, + { + "auxiliary_loss_clip": 0.01663371, + "auxiliary_loss_mlp": 0.01056091, + "balance_loss_clip": 1.41227126, + "balance_loss_mlp": 1.02378511, + "epoch": 0.1339546069442357, + "flos": 17606580802560.0, + "grad_norm": 1.916135448850988, + "language_loss": 0.86129403, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.88848865, + "num_input_tokens_seen": 48204400, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.32275391, + "step": 2228, + "time_per_iteration": 2.844377040863037 + }, + { + "auxiliary_loss_clip": 0.0164922, + "auxiliary_loss_mlp": 0.01055874, + "balance_loss_clip": 1.40495801, + "balance_loss_mlp": 1.02342474, + "epoch": 0.13401473019690366, + "flos": 18999527788800.0, + "grad_norm": 1.7417318010054608, + "language_loss": 0.82190472, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.84895563, + "num_input_tokens_seen": 48222180, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.32446289, + "step": 2229, + "time_per_iteration": 2.8503780364990234 + }, + { + "auxiliary_loss_clip": 0.01669237, + "auxiliary_loss_mlp": 0.01066364, + "balance_loss_clip": 1.41879451, + "balance_loss_mlp": 1.03336644, + "epoch": 0.13407485344957162, + "flos": 26955298995840.0, + "grad_norm": 2.6417757035260596, + "language_loss": 0.75545585, + "learning_rate": 3.887449459642378e-06, + "loss": 0.78281188, + "num_input_tokens_seen": 48243245, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.33007812, + "step": 2230, + "time_per_iteration": 2.907579183578491 + }, + { + "auxiliary_loss_clip": 0.01656785, + "auxiliary_loss_mlp": 0.01057601, + "balance_loss_clip": 1.40785241, + "balance_loss_mlp": 1.02498519, + "epoch": 0.1341349767022396, + "flos": 20349012504960.0, + "grad_norm": 1.9170354954648896, + "language_loss": 0.81355876, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.84070265, + "num_input_tokens_seen": 48262600, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.32568359, + "step": 2231, + "time_per_iteration": 2.8563761711120605 + }, + { + "auxiliary_loss_clip": 0.01679295, + "auxiliary_loss_mlp": 0.01069893, + "balance_loss_clip": 1.42350769, + "balance_loss_mlp": 1.03482127, + "epoch": 0.13419509995490755, + "flos": 29874095504640.0, + "grad_norm": 1.4958494686436283, + "language_loss": 0.73531151, + "learning_rate": 3.887191701647992e-06, + "loss": 0.76280344, + "num_input_tokens_seen": 48285075, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.35058594, + "step": 2232, + "time_per_iteration": 2.955695390701294 + }, + { + "auxiliary_loss_clip": 0.01679415, + "auxiliary_loss_mlp": 0.01059809, + "balance_loss_clip": 1.42565048, + "balance_loss_mlp": 1.02671647, + "epoch": 0.13425522320757552, + "flos": 26954258365440.0, + "grad_norm": 2.6218884210703304, + "language_loss": 0.67043585, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.69782805, + "num_input_tokens_seen": 48301285, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.33081055, + "step": 2233, + "time_per_iteration": 2.9068074226379395 + }, + { + "auxiliary_loss_clip": 0.0165973, + "auxiliary_loss_mlp": 0.01064774, + "balance_loss_clip": 1.40951562, + "balance_loss_mlp": 1.02986979, + "epoch": 0.1343153464602435, + "flos": 15785381352960.0, + "grad_norm": 3.2636277228708157, + "language_loss": 0.83347625, + "learning_rate": 3.886933657403615e-06, + "loss": 0.86072129, + "num_input_tokens_seen": 48317835, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.34887695, + "step": 2234, + "time_per_iteration": 2.8156683444976807 + }, + { + "auxiliary_loss_clip": 0.01674627, + "auxiliary_loss_mlp": 0.01060823, + "balance_loss_clip": 1.42338347, + "balance_loss_mlp": 1.02784967, + "epoch": 0.13437546971291148, + "flos": 24325020449280.0, + "grad_norm": 2.543189045103409, + "language_loss": 0.82392395, + "learning_rate": 3.886804527949909e-06, + "loss": 0.85127842, + "num_input_tokens_seen": 48335670, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.32983398, + "step": 2235, + "time_per_iteration": 2.8894009590148926 + }, + { + "auxiliary_loss_clip": 0.01674732, + "auxiliary_loss_mlp": 0.01065244, + "balance_loss_clip": 1.42466021, + "balance_loss_mlp": 1.03115058, + "epoch": 0.13443559296557944, + "flos": 26661758860800.0, + "grad_norm": 2.854772423550955, + "language_loss": 0.87054396, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.89794374, + "num_input_tokens_seen": 48357805, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.34106445, + "step": 2236, + "time_per_iteration": 3.0925490856170654 + }, + { + "auxiliary_loss_clip": 0.01676027, + "auxiliary_loss_mlp": 0.01069359, + "balance_loss_clip": 1.42150438, + "balance_loss_mlp": 1.03273773, + "epoch": 0.1344957162182474, + "flos": 21805537714560.0, + "grad_norm": 1.640106243547918, + "language_loss": 0.78105074, + "learning_rate": 3.886546054403946e-06, + "loss": 0.80850458, + "num_input_tokens_seen": 48377845, + "router_z_loss_clip": 2.54296875, + "router_z_loss_mlp": 0.36572266, + "step": 2237, + "time_per_iteration": 2.886073589324951 + }, + { + "auxiliary_loss_clip": 0.01680352, + "auxiliary_loss_mlp": 0.01065802, + "balance_loss_clip": 1.4279809, + "balance_loss_mlp": 1.03230405, + "epoch": 0.13455583947091537, + "flos": 19875216510720.0, + "grad_norm": 1.9295275703255441, + "language_loss": 0.80783224, + "learning_rate": 3.886416710321491e-06, + "loss": 0.83529377, + "num_input_tokens_seen": 48394735, + "router_z_loss_clip": 2.52539062, + "router_z_loss_mlp": 0.33496094, + "step": 2238, + "time_per_iteration": 2.8664093017578125 + }, + { + "auxiliary_loss_clip": 0.0165413, + "auxiliary_loss_mlp": 0.01055559, + "balance_loss_clip": 1.40627813, + "balance_loss_mlp": 1.02227569, + "epoch": 0.13461596272358334, + "flos": 30859086960000.0, + "grad_norm": 2.5617866330023746, + "language_loss": 0.68794793, + "learning_rate": 3.886287294705924e-06, + "loss": 0.71504474, + "num_input_tokens_seen": 48414200, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.33276367, + "step": 2239, + "time_per_iteration": 2.922264337539673 + }, + { + "auxiliary_loss_clip": 0.01667044, + "auxiliary_loss_mlp": 0.01056813, + "balance_loss_clip": 1.4162885, + "balance_loss_mlp": 1.02233815, + "epoch": 0.1346760859762513, + "flos": 12501910604160.0, + "grad_norm": 2.2735017952947607, + "language_loss": 0.83779114, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.86502969, + "num_input_tokens_seen": 48431065, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.34472656, + "step": 2240, + "time_per_iteration": 2.898653984069824 + }, + { + "auxiliary_loss_clip": 0.01655942, + "auxiliary_loss_mlp": 0.01052454, + "balance_loss_clip": 1.40491486, + "balance_loss_mlp": 1.01776409, + "epoch": 0.1347362092289193, + "flos": 21846240051840.0, + "grad_norm": 1.6682507039330976, + "language_loss": 0.79314375, + "learning_rate": 3.886028248895093e-06, + "loss": 0.82022774, + "num_input_tokens_seen": 48450335, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.34692383, + "step": 2241, + "time_per_iteration": 2.8611223697662354 + }, + { + "auxiliary_loss_clip": 0.01658151, + "auxiliary_loss_mlp": 0.0105429, + "balance_loss_clip": 1.41353095, + "balance_loss_mlp": 1.02193689, + "epoch": 0.13479633248158726, + "flos": 23518837019520.0, + "grad_norm": 4.032841671550493, + "language_loss": 0.84832186, + "learning_rate": 3.88589861870965e-06, + "loss": 0.87544626, + "num_input_tokens_seen": 48468555, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.32324219, + "step": 2242, + "time_per_iteration": 2.873443603515625 + }, + { + "auxiliary_loss_clip": 0.01661343, + "auxiliary_loss_mlp": 0.01063612, + "balance_loss_clip": 1.41140091, + "balance_loss_mlp": 1.02875507, + "epoch": 0.13485645573425523, + "flos": 29354484510720.0, + "grad_norm": 2.7324749209159416, + "language_loss": 0.66733652, + "learning_rate": 3.885768917010744e-06, + "loss": 0.69458604, + "num_input_tokens_seen": 48488515, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.34887695, + "step": 2243, + "time_per_iteration": 2.946354866027832 + }, + { + "auxiliary_loss_clip": 0.01633454, + "auxiliary_loss_mlp": 0.0105853, + "balance_loss_clip": 1.39133239, + "balance_loss_mlp": 1.02646255, + "epoch": 0.1349165789869232, + "flos": 28048552554240.0, + "grad_norm": 1.3540645627905992, + "language_loss": 0.73497415, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.76189399, + "num_input_tokens_seen": 48510515, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.32055664, + "step": 2244, + "time_per_iteration": 2.9020535945892334 + }, + { + "auxiliary_loss_clip": 0.01658021, + "auxiliary_loss_mlp": 0.01057733, + "balance_loss_clip": 1.41167855, + "balance_loss_mlp": 1.02530789, + "epoch": 0.13497670223959116, + "flos": 22863201598080.0, + "grad_norm": 1.6189596213389545, + "language_loss": 0.87018055, + "learning_rate": 3.88550929909221e-06, + "loss": 0.89733815, + "num_input_tokens_seen": 48529940, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.32446289, + "step": 2245, + "time_per_iteration": 2.9105844497680664 + }, + { + "auxiliary_loss_clip": 0.01651272, + "auxiliary_loss_mlp": 0.01059619, + "balance_loss_clip": 1.41018355, + "balance_loss_mlp": 1.02504802, + "epoch": 0.13503682549225912, + "flos": 16512241368960.0, + "grad_norm": 1.5945270584582532, + "language_loss": 0.7970829, + "learning_rate": 3.88537938288243e-06, + "loss": 0.82419181, + "num_input_tokens_seen": 48548190, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.34570312, + "step": 2246, + "time_per_iteration": 4.309743881225586 + }, + { + "auxiliary_loss_clip": 0.01393707, + "auxiliary_loss_mlp": 0.01027418, + "balance_loss_clip": 1.24254274, + "balance_loss_mlp": 1.00510204, + "epoch": 0.1350969487449271, + "flos": 70786834928640.0, + "grad_norm": 0.7861586810762631, + "language_loss": 0.6058532, + "learning_rate": 3.885249395178874e-06, + "loss": 0.63006449, + "num_input_tokens_seen": 48613165, + "router_z_loss_clip": 1.515625, + "router_z_loss_mlp": 0.22363281, + "step": 2247, + "time_per_iteration": 3.4398880004882812 + }, + { + "auxiliary_loss_clip": 0.01692989, + "auxiliary_loss_mlp": 0.01064746, + "balance_loss_clip": 1.43436265, + "balance_loss_mlp": 1.02955556, + "epoch": 0.13515707199759508, + "flos": 23086376789760.0, + "grad_norm": 2.1930484931520824, + "language_loss": 0.82009423, + "learning_rate": 3.885119335986473e-06, + "loss": 0.84767151, + "num_input_tokens_seen": 48631705, + "router_z_loss_clip": 2.5859375, + "router_z_loss_mlp": 0.3515625, + "step": 2248, + "time_per_iteration": 2.9154210090637207 + }, + { + "auxiliary_loss_clip": 0.01652188, + "auxiliary_loss_mlp": 0.01051647, + "balance_loss_clip": 1.40858138, + "balance_loss_mlp": 1.0197227, + "epoch": 0.13521719525026304, + "flos": 23197036867200.0, + "grad_norm": 2.3099519486092364, + "language_loss": 0.77784169, + "learning_rate": 3.884989205310157e-06, + "loss": 0.80488002, + "num_input_tokens_seen": 48649740, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.3190918, + "step": 2249, + "time_per_iteration": 2.9171531200408936 + }, + { + "auxiliary_loss_clip": 0.01650041, + "auxiliary_loss_mlp": 0.01059806, + "balance_loss_clip": 1.40473747, + "balance_loss_mlp": 1.02699912, + "epoch": 0.135277318502931, + "flos": 24801802600320.0, + "grad_norm": 1.4402641800808507, + "language_loss": 0.85612535, + "learning_rate": 3.884859003154862e-06, + "loss": 0.88322377, + "num_input_tokens_seen": 48671565, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.32788086, + "step": 2250, + "time_per_iteration": 2.8820340633392334 + }, + { + "auxiliary_loss_clip": 0.01665566, + "auxiliary_loss_mlp": 0.01066516, + "balance_loss_clip": 1.41424727, + "balance_loss_mlp": 1.03046703, + "epoch": 0.13533744175559898, + "flos": 21918641011200.0, + "grad_norm": 5.11872802917947, + "language_loss": 0.83319372, + "learning_rate": 3.884728729525524e-06, + "loss": 0.86051452, + "num_input_tokens_seen": 48690425, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.36035156, + "step": 2251, + "time_per_iteration": 2.8914918899536133 + }, + { + "auxiliary_loss_clip": 0.01660265, + "auxiliary_loss_mlp": 0.0106473, + "balance_loss_clip": 1.41006017, + "balance_loss_mlp": 1.03261459, + "epoch": 0.13539756500826694, + "flos": 21220629194880.0, + "grad_norm": 1.7919121606973332, + "language_loss": 0.86674696, + "learning_rate": 3.884598384427084e-06, + "loss": 0.89399695, + "num_input_tokens_seen": 48707505, + "router_z_loss_clip": 2.50390625, + "router_z_loss_mlp": 0.32080078, + "step": 2252, + "time_per_iteration": 2.802342176437378 + }, + { + "auxiliary_loss_clip": 0.01397023, + "auxiliary_loss_mlp": 0.01029492, + "balance_loss_clip": 1.24758983, + "balance_loss_mlp": 1.00765324, + "epoch": 0.1354576882609349, + "flos": 63271170322560.0, + "grad_norm": 0.7940009485906834, + "language_loss": 0.61923593, + "learning_rate": 3.884467967864485e-06, + "loss": 0.6435011, + "num_input_tokens_seen": 48775895, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.21875, + "step": 2253, + "time_per_iteration": 4.861313581466675 + }, + { + "auxiliary_loss_clip": 0.01673203, + "auxiliary_loss_mlp": 0.01074073, + "balance_loss_clip": 1.42429209, + "balance_loss_mlp": 1.04202974, + "epoch": 0.1355178115136029, + "flos": 25493796858240.0, + "grad_norm": 1.7040865700206325, + "language_loss": 0.90616304, + "learning_rate": 3.884337479842671e-06, + "loss": 0.93363583, + "num_input_tokens_seen": 48798370, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.32006836, + "step": 2254, + "time_per_iteration": 2.949700117111206 + }, + { + "auxiliary_loss_clip": 0.01679305, + "auxiliary_loss_mlp": 0.01077563, + "balance_loss_clip": 1.42450094, + "balance_loss_mlp": 1.04394603, + "epoch": 0.13557793476627086, + "flos": 21627091647360.0, + "grad_norm": 1.7491077381864728, + "language_loss": 0.86366153, + "learning_rate": 3.884206920366591e-06, + "loss": 0.89123023, + "num_input_tokens_seen": 48817955, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.33618164, + "step": 2255, + "time_per_iteration": 5.677891731262207 + }, + { + "auxiliary_loss_clip": 0.01660898, + "auxiliary_loss_mlp": 0.01082293, + "balance_loss_clip": 1.41341424, + "balance_loss_mlp": 1.04903388, + "epoch": 0.13563805801893883, + "flos": 24936922131840.0, + "grad_norm": 3.4424764159190784, + "language_loss": 0.76227313, + "learning_rate": 3.884076289441196e-06, + "loss": 0.78970504, + "num_input_tokens_seen": 48836330, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.33276367, + "step": 2256, + "time_per_iteration": 2.937514066696167 + }, + { + "auxiliary_loss_clip": 0.01667709, + "auxiliary_loss_mlp": 0.01077921, + "balance_loss_clip": 1.41406727, + "balance_loss_mlp": 1.04387486, + "epoch": 0.1356981812716068, + "flos": 14756701403520.0, + "grad_norm": 1.8689864515134464, + "language_loss": 0.8449868, + "learning_rate": 3.88394558707144e-06, + "loss": 0.87244308, + "num_input_tokens_seen": 48851890, + "router_z_loss_clip": 2.5390625, + "router_z_loss_mlp": 0.34033203, + "step": 2257, + "time_per_iteration": 2.8200438022613525 + }, + { + "auxiliary_loss_clip": 0.01681505, + "auxiliary_loss_mlp": 0.01079882, + "balance_loss_clip": 1.42657995, + "balance_loss_mlp": 1.04690862, + "epoch": 0.13575830452427476, + "flos": 11115840827520.0, + "grad_norm": 2.5465193847240295, + "language_loss": 0.83550143, + "learning_rate": 3.883814813262277e-06, + "loss": 0.86311531, + "num_input_tokens_seen": 48865510, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.32983398, + "step": 2258, + "time_per_iteration": 2.8580639362335205 + }, + { + "auxiliary_loss_clip": 0.01670677, + "auxiliary_loss_mlp": 0.01071001, + "balance_loss_clip": 1.41687489, + "balance_loss_mlp": 1.03857636, + "epoch": 0.13581842777694272, + "flos": 17967363989760.0, + "grad_norm": 3.1690223270620055, + "language_loss": 0.84782302, + "learning_rate": 3.883683968018669e-06, + "loss": 0.87523979, + "num_input_tokens_seen": 48882360, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.32421875, + "step": 2259, + "time_per_iteration": 2.7921218872070312 + }, + { + "auxiliary_loss_clip": 0.01648263, + "auxiliary_loss_mlp": 0.01081888, + "balance_loss_clip": 1.40323055, + "balance_loss_mlp": 1.05060768, + "epoch": 0.1358785510296107, + "flos": 22867499854080.0, + "grad_norm": 3.877350175983871, + "language_loss": 0.74765307, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.77495456, + "num_input_tokens_seen": 48902700, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.31298828, + "step": 2260, + "time_per_iteration": 2.8964359760284424 + }, + { + "auxiliary_loss_clip": 0.0164955, + "auxiliary_loss_mlp": 0.01074562, + "balance_loss_clip": 1.40301788, + "balance_loss_mlp": 1.04344821, + "epoch": 0.13593867428227868, + "flos": 25750480464000.0, + "grad_norm": 2.31451821812165, + "language_loss": 0.76886541, + "learning_rate": 3.883422063247961e-06, + "loss": 0.79610658, + "num_input_tokens_seen": 48922525, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.3112793, + "step": 2261, + "time_per_iteration": 2.8680355548858643 + }, + { + "auxiliary_loss_clip": 0.01654392, + "auxiliary_loss_mlp": 0.01073179, + "balance_loss_clip": 1.40667892, + "balance_loss_mlp": 1.04068255, + "epoch": 0.13599879753494665, + "flos": 31261839338880.0, + "grad_norm": 2.3535543114740345, + "language_loss": 0.64015353, + "learning_rate": 3.883291003730794e-06, + "loss": 0.66742921, + "num_input_tokens_seen": 48942510, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.32519531, + "step": 2262, + "time_per_iteration": 2.9303812980651855 + }, + { + "auxiliary_loss_clip": 0.01660415, + "auxiliary_loss_mlp": 0.01064852, + "balance_loss_clip": 1.41097331, + "balance_loss_mlp": 1.03416777, + "epoch": 0.1360589207876146, + "flos": 23925616185600.0, + "grad_norm": 2.7275256750757784, + "language_loss": 0.84277189, + "learning_rate": 3.883159872799043e-06, + "loss": 0.87002462, + "num_input_tokens_seen": 48962625, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.30664062, + "step": 2263, + "time_per_iteration": 2.912876844406128 + }, + { + "auxiliary_loss_clip": 0.01656477, + "auxiliary_loss_mlp": 0.01079756, + "balance_loss_clip": 1.40524697, + "balance_loss_mlp": 1.04575753, + "epoch": 0.13611904404028258, + "flos": 19983659592960.0, + "grad_norm": 7.81441838823987, + "language_loss": 0.88546008, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.91282248, + "num_input_tokens_seen": 48982525, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.33984375, + "step": 2264, + "time_per_iteration": 2.870145082473755 + }, + { + "auxiliary_loss_clip": 0.01657004, + "auxiliary_loss_mlp": 0.01073734, + "balance_loss_clip": 1.4040767, + "balance_loss_mlp": 1.03856683, + "epoch": 0.13617916729295054, + "flos": 15348939580800.0, + "grad_norm": 5.522414941165018, + "language_loss": 0.72830987, + "learning_rate": 3.882897396711683e-06, + "loss": 0.75561726, + "num_input_tokens_seen": 48997605, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.35180664, + "step": 2265, + "time_per_iteration": 2.8204309940338135 + }, + { + "auxiliary_loss_clip": 0.01635791, + "auxiliary_loss_mlp": 0.01069001, + "balance_loss_clip": 1.3912468, + "balance_loss_mlp": 1.03567028, + "epoch": 0.1362392905456185, + "flos": 27462241445760.0, + "grad_norm": 2.0237558501741666, + "language_loss": 0.67840707, + "learning_rate": 3.882766051566027e-06, + "loss": 0.70545495, + "num_input_tokens_seen": 49018535, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.33325195, + "step": 2266, + "time_per_iteration": 2.8790993690490723 + }, + { + "auxiliary_loss_clip": 0.01653075, + "auxiliary_loss_mlp": 0.01073313, + "balance_loss_clip": 1.40594435, + "balance_loss_mlp": 1.03936195, + "epoch": 0.1362994137982865, + "flos": 25019729395200.0, + "grad_norm": 1.581117339619779, + "language_loss": 0.7757796, + "learning_rate": 3.882634635025694e-06, + "loss": 0.80304354, + "num_input_tokens_seen": 49038865, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.33935547, + "step": 2267, + "time_per_iteration": 2.9164888858795166 + }, + { + "auxiliary_loss_clip": 0.0164442, + "auxiliary_loss_mlp": 0.01068306, + "balance_loss_clip": 1.39749694, + "balance_loss_mlp": 1.03578568, + "epoch": 0.13635953705095447, + "flos": 20312698913280.0, + "grad_norm": 2.277653474466681, + "language_loss": 0.83072019, + "learning_rate": 3.882503147095667e-06, + "loss": 0.85784745, + "num_input_tokens_seen": 49058010, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.32495117, + "step": 2268, + "time_per_iteration": 2.82979679107666 + }, + { + "auxiliary_loss_clip": 0.01642001, + "auxiliary_loss_mlp": 0.01068814, + "balance_loss_clip": 1.3992033, + "balance_loss_mlp": 1.03526831, + "epoch": 0.13641966030362243, + "flos": 31371006337920.0, + "grad_norm": 1.7735982068693383, + "language_loss": 0.77372926, + "learning_rate": 3.882371587780931e-06, + "loss": 0.8008374, + "num_input_tokens_seen": 49080330, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.33569336, + "step": 2269, + "time_per_iteration": 2.9534313678741455 + }, + { + "auxiliary_loss_clip": 0.01660239, + "auxiliary_loss_mlp": 0.01065574, + "balance_loss_clip": 1.41085374, + "balance_loss_mlp": 1.03338742, + "epoch": 0.1364797835562904, + "flos": 20485896583680.0, + "grad_norm": 1.8665080539816428, + "language_loss": 0.81702566, + "learning_rate": 3.882239957086477e-06, + "loss": 0.84428376, + "num_input_tokens_seen": 49097035, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.32202148, + "step": 2270, + "time_per_iteration": 2.8775389194488525 + }, + { + "auxiliary_loss_clip": 0.01639669, + "auxiliary_loss_mlp": 0.01068205, + "balance_loss_clip": 1.39274335, + "balance_loss_mlp": 1.03430176, + "epoch": 0.13653990680895836, + "flos": 13086004717440.0, + "grad_norm": 2.830614972113301, + "language_loss": 0.7773959, + "learning_rate": 3.882108255017295e-06, + "loss": 0.80447465, + "num_input_tokens_seen": 49113945, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.33886719, + "step": 2271, + "time_per_iteration": 2.9800453186035156 + }, + { + "auxiliary_loss_clip": 0.01635982, + "auxiliary_loss_mlp": 0.01069304, + "balance_loss_clip": 1.38929415, + "balance_loss_mlp": 1.0356636, + "epoch": 0.13660003006162633, + "flos": 16955605595520.0, + "grad_norm": 2.1387795377239236, + "language_loss": 0.8202256, + "learning_rate": 3.881976481578379e-06, + "loss": 0.84727848, + "num_input_tokens_seen": 49132855, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.33666992, + "step": 2272, + "time_per_iteration": 2.921325445175171 + }, + { + "auxiliary_loss_clip": 0.0137176, + "auxiliary_loss_mlp": 0.01040989, + "balance_loss_clip": 1.2307651, + "balance_loss_mlp": 1.0183872, + "epoch": 0.1366601533142943, + "flos": 68714336004480.0, + "grad_norm": 0.7019627995990968, + "language_loss": 0.60813165, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.63225913, + "num_input_tokens_seen": 49198310, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.22558594, + "step": 2273, + "time_per_iteration": 3.500204086303711 + }, + { + "auxiliary_loss_clip": 0.01638454, + "auxiliary_loss_mlp": 0.01052516, + "balance_loss_clip": 1.39592695, + "balance_loss_mlp": 1.02044916, + "epoch": 0.13672027656696228, + "flos": 19253451461760.0, + "grad_norm": 1.6407824163487523, + "language_loss": 0.78502727, + "learning_rate": 3.881712720611336e-06, + "loss": 0.81193703, + "num_input_tokens_seen": 49217250, + "router_z_loss_clip": 2.42773438, + "router_z_loss_mlp": 0.32055664, + "step": 2274, + "time_per_iteration": 2.8962368965148926 + }, + { + "auxiliary_loss_clip": 0.01646454, + "auxiliary_loss_mlp": 0.01060091, + "balance_loss_clip": 1.39985061, + "balance_loss_mlp": 1.02594924, + "epoch": 0.13678039981963025, + "flos": 24545571442560.0, + "grad_norm": 1.8785823736114449, + "language_loss": 0.79385293, + "learning_rate": 3.881580733093211e-06, + "loss": 0.82091832, + "num_input_tokens_seen": 49236615, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.34130859, + "step": 2275, + "time_per_iteration": 2.924734354019165 + }, + { + "auxiliary_loss_clip": 0.01642125, + "auxiliary_loss_mlp": 0.0104823, + "balance_loss_clip": 1.39433694, + "balance_loss_mlp": 1.01625824, + "epoch": 0.13684052307229821, + "flos": 15677797921920.0, + "grad_norm": 3.1705923874028636, + "language_loss": 0.8263731, + "learning_rate": 3.881448674225356e-06, + "loss": 0.85327667, + "num_input_tokens_seen": 49253935, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.31958008, + "step": 2276, + "time_per_iteration": 2.933454751968384 + }, + { + "auxiliary_loss_clip": 0.01663632, + "auxiliary_loss_mlp": 0.01060852, + "balance_loss_clip": 1.4104178, + "balance_loss_mlp": 1.02597153, + "epoch": 0.13690064632496618, + "flos": 28376008306560.0, + "grad_norm": 2.2695959983170453, + "language_loss": 0.72001362, + "learning_rate": 3.881316544012779e-06, + "loss": 0.74725854, + "num_input_tokens_seen": 49273605, + "router_z_loss_clip": 2.53320312, + "router_z_loss_mlp": 0.34863281, + "step": 2277, + "time_per_iteration": 2.898597240447998 + }, + { + "auxiliary_loss_clip": 0.01659231, + "auxiliary_loss_mlp": 0.01059157, + "balance_loss_clip": 1.40751433, + "balance_loss_mlp": 1.02620721, + "epoch": 0.13696076957763414, + "flos": 23415370865280.0, + "grad_norm": 3.0364842659991287, + "language_loss": 0.81212533, + "learning_rate": 3.88118434246049e-06, + "loss": 0.83930922, + "num_input_tokens_seen": 49291785, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.32958984, + "step": 2278, + "time_per_iteration": 2.833397626876831 + }, + { + "auxiliary_loss_clip": 0.01646031, + "auxiliary_loss_mlp": 0.01054383, + "balance_loss_clip": 1.39948809, + "balance_loss_mlp": 1.02238691, + "epoch": 0.1370208928303021, + "flos": 37210816350720.0, + "grad_norm": 2.4041500475165125, + "language_loss": 0.7602998, + "learning_rate": 3.881052069573502e-06, + "loss": 0.78730386, + "num_input_tokens_seen": 49311405, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.31982422, + "step": 2279, + "time_per_iteration": 3.051732063293457 + }, + { + "auxiliary_loss_clip": 0.01656124, + "auxiliary_loss_mlp": 0.01054392, + "balance_loss_clip": 1.40606272, + "balance_loss_mlp": 1.02134705, + "epoch": 0.13708101608297008, + "flos": 26986771393920.0, + "grad_norm": 1.8178205147775817, + "language_loss": 0.77806687, + "learning_rate": 3.880919725356831e-06, + "loss": 0.80517203, + "num_input_tokens_seen": 49331835, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 0.33056641, + "step": 2280, + "time_per_iteration": 2.9367496967315674 + }, + { + "auxiliary_loss_clip": 0.01637707, + "auxiliary_loss_mlp": 0.01053937, + "balance_loss_clip": 1.39479423, + "balance_loss_mlp": 1.02217996, + "epoch": 0.13714113933563807, + "flos": 32568314232960.0, + "grad_norm": 2.2179777110813275, + "language_loss": 0.806638, + "learning_rate": 3.880787309815496e-06, + "loss": 0.83355439, + "num_input_tokens_seen": 49352290, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.31787109, + "step": 2281, + "time_per_iteration": 4.357746124267578 + }, + { + "auxiliary_loss_clip": 0.016762, + "auxiliary_loss_mlp": 0.01062428, + "balance_loss_clip": 1.42039394, + "balance_loss_mlp": 1.03024113, + "epoch": 0.13720126258830603, + "flos": 16108312625280.0, + "grad_norm": 1.6606055116247374, + "language_loss": 0.84687352, + "learning_rate": 3.880654822954518e-06, + "loss": 0.87425983, + "num_input_tokens_seen": 49370285, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.32177734, + "step": 2282, + "time_per_iteration": 2.9281721115112305 + }, + { + "auxiliary_loss_clip": 0.01657257, + "auxiliary_loss_mlp": 0.0105731, + "balance_loss_clip": 1.41208744, + "balance_loss_mlp": 1.02529025, + "epoch": 0.137261385840974, + "flos": 18962987973120.0, + "grad_norm": 1.608301336826467, + "language_loss": 0.74434549, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.77149117, + "num_input_tokens_seen": 49389610, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.31982422, + "step": 2283, + "time_per_iteration": 2.8594882488250732 + }, + { + "auxiliary_loss_clip": 0.01646385, + "auxiliary_loss_mlp": 0.01058813, + "balance_loss_clip": 1.40360332, + "balance_loss_mlp": 1.02750826, + "epoch": 0.13732150909364196, + "flos": 23305932397440.0, + "grad_norm": 2.4711927899980637, + "language_loss": 0.86167693, + "learning_rate": 3.880389635293729e-06, + "loss": 0.88872886, + "num_input_tokens_seen": 49408390, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.31298828, + "step": 2284, + "time_per_iteration": 2.8668174743652344 + }, + { + "auxiliary_loss_clip": 0.01671103, + "auxiliary_loss_mlp": 0.01067691, + "balance_loss_clip": 1.41624928, + "balance_loss_mlp": 1.03393114, + "epoch": 0.13738163234630993, + "flos": 29363216757120.0, + "grad_norm": 2.858938210318736, + "language_loss": 0.76481891, + "learning_rate": 3.880256934503974e-06, + "loss": 0.79220688, + "num_input_tokens_seen": 49427725, + "router_z_loss_clip": 2.54882812, + "router_z_loss_mlp": 0.33789062, + "step": 2285, + "time_per_iteration": 2.924229621887207 + }, + { + "auxiliary_loss_clip": 0.01652349, + "auxiliary_loss_mlp": 0.01059683, + "balance_loss_clip": 1.40612292, + "balance_loss_mlp": 1.02587509, + "epoch": 0.1374417555989779, + "flos": 26662573267200.0, + "grad_norm": 1.63354201659485, + "language_loss": 0.75588089, + "learning_rate": 3.880124162414689e-06, + "loss": 0.78300124, + "num_input_tokens_seen": 49449000, + "router_z_loss_clip": 2.46679688, + "router_z_loss_mlp": 0.33813477, + "step": 2286, + "time_per_iteration": 2.8651490211486816 + }, + { + "auxiliary_loss_clip": 0.01669816, + "auxiliary_loss_mlp": 0.01056692, + "balance_loss_clip": 1.41990328, + "balance_loss_mlp": 1.02283621, + "epoch": 0.1375018788516459, + "flos": 28415308055040.0, + "grad_norm": 2.1169969417480305, + "language_loss": 0.87405366, + "learning_rate": 3.879991319030908e-06, + "loss": 0.90131879, + "num_input_tokens_seen": 49468360, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.33862305, + "step": 2287, + "time_per_iteration": 2.8983330726623535 + }, + { + "auxiliary_loss_clip": 0.01651091, + "auxiliary_loss_mlp": 0.01059394, + "balance_loss_clip": 1.40427899, + "balance_loss_mlp": 1.02685022, + "epoch": 0.13756200210431385, + "flos": 37427612025600.0, + "grad_norm": 1.8316114544046702, + "language_loss": 0.69960749, + "learning_rate": 3.879858404357666e-06, + "loss": 0.72671229, + "num_input_tokens_seen": 49493450, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.32543945, + "step": 2288, + "time_per_iteration": 4.37264347076416 + }, + { + "auxiliary_loss_clip": 0.01648408, + "auxiliary_loss_mlp": 0.01061103, + "balance_loss_clip": 1.40008843, + "balance_loss_mlp": 1.02622271, + "epoch": 0.13762212535698182, + "flos": 22721295346560.0, + "grad_norm": 2.6579113753426897, + "language_loss": 0.88650221, + "learning_rate": 3.879725418400005e-06, + "loss": 0.9135974, + "num_input_tokens_seen": 49511220, + "router_z_loss_clip": 2.48242188, + "router_z_loss_mlp": 0.34863281, + "step": 2289, + "time_per_iteration": 2.8573062419891357 + }, + { + "auxiliary_loss_clip": 0.01633658, + "auxiliary_loss_mlp": 0.01051392, + "balance_loss_clip": 1.39061356, + "balance_loss_mlp": 1.02070785, + "epoch": 0.13768224860964978, + "flos": 23962879918080.0, + "grad_norm": 1.7489766907309152, + "language_loss": 0.75444436, + "learning_rate": 3.879592361162969e-06, + "loss": 0.78129482, + "num_input_tokens_seen": 49529820, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.30639648, + "step": 2290, + "time_per_iteration": 5.800458908081055 + }, + { + "auxiliary_loss_clip": 0.0136763, + "auxiliary_loss_mlp": 0.01027953, + "balance_loss_clip": 1.22327924, + "balance_loss_mlp": 1.00239408, + "epoch": 0.13774237186231775, + "flos": 63624777569280.0, + "grad_norm": 0.7047354480376266, + "language_loss": 0.5167712, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.54072702, + "num_input_tokens_seen": 49595325, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.25585938, + "step": 2291, + "time_per_iteration": 3.4303178787231445 + }, + { + "auxiliary_loss_clip": 0.01675091, + "auxiliary_loss_mlp": 0.01054533, + "balance_loss_clip": 1.42584825, + "balance_loss_mlp": 1.02134526, + "epoch": 0.1378024951149857, + "flos": 24290018956800.0, + "grad_norm": 1.9185490705451196, + "language_loss": 0.72157234, + "learning_rate": 3.879326032870952e-06, + "loss": 0.74886858, + "num_input_tokens_seen": 49615850, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.33178711, + "step": 2292, + "time_per_iteration": 2.906963348388672 + }, + { + "auxiliary_loss_clip": 0.01659613, + "auxiliary_loss_mlp": 0.01062348, + "balance_loss_clip": 1.4115634, + "balance_loss_mlp": 1.02782488, + "epoch": 0.13786261836765368, + "flos": 14028031595520.0, + "grad_norm": 2.3395717324450906, + "language_loss": 0.81437409, + "learning_rate": 3.879192761826071e-06, + "loss": 0.84159362, + "num_input_tokens_seen": 49631860, + "router_z_loss_clip": 2.47851562, + "router_z_loss_mlp": 0.34545898, + "step": 2293, + "time_per_iteration": 2.9694292545318604 + }, + { + "auxiliary_loss_clip": 0.01674629, + "auxiliary_loss_mlp": 0.01056532, + "balance_loss_clip": 1.42215657, + "balance_loss_mlp": 1.02541888, + "epoch": 0.13792274162032167, + "flos": 28890959086080.0, + "grad_norm": 2.062802444330001, + "language_loss": 0.80037081, + "learning_rate": 3.879059419522011e-06, + "loss": 0.8276825, + "num_input_tokens_seen": 49652145, + "router_z_loss_clip": 2.52734375, + "router_z_loss_mlp": 0.31103516, + "step": 2294, + "time_per_iteration": 3.0505993366241455 + }, + { + "auxiliary_loss_clip": 0.01639186, + "auxiliary_loss_mlp": 0.01054567, + "balance_loss_clip": 1.39530706, + "balance_loss_mlp": 1.02421641, + "epoch": 0.13798286487298964, + "flos": 21150761944320.0, + "grad_norm": 2.1494785522713826, + "language_loss": 0.81638575, + "learning_rate": 3.878926005963831e-06, + "loss": 0.84332329, + "num_input_tokens_seen": 49669880, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.3034668, + "step": 2295, + "time_per_iteration": 2.993037223815918 + }, + { + "auxiliary_loss_clip": 0.01643949, + "auxiliary_loss_mlp": 0.01058472, + "balance_loss_clip": 1.39673519, + "balance_loss_mlp": 1.02661967, + "epoch": 0.1380429881256576, + "flos": 22496491342080.0, + "grad_norm": 2.167183569038902, + "language_loss": 0.79739577, + "learning_rate": 3.878792521156588e-06, + "loss": 0.82441998, + "num_input_tokens_seen": 49687255, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.31860352, + "step": 2296, + "time_per_iteration": 2.994899034500122 + }, + { + "auxiliary_loss_clip": 0.01645051, + "auxiliary_loss_mlp": 0.01057628, + "balance_loss_clip": 1.39886665, + "balance_loss_mlp": 1.02625251, + "epoch": 0.13810311137832557, + "flos": 21402604356480.0, + "grad_norm": 2.2192163318827203, + "language_loss": 0.78922355, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.81625032, + "num_input_tokens_seen": 49706650, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.3137207, + "step": 2297, + "time_per_iteration": 3.0134103298187256 + }, + { + "auxiliary_loss_clip": 0.01644474, + "auxiliary_loss_mlp": 0.0106019, + "balance_loss_clip": 1.40007186, + "balance_loss_mlp": 1.02731204, + "epoch": 0.13816323463099353, + "flos": 25999698677760.0, + "grad_norm": 2.1538941314988445, + "language_loss": 0.69829965, + "learning_rate": 3.878525337815164e-06, + "loss": 0.72534633, + "num_input_tokens_seen": 49725715, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.32861328, + "step": 2298, + "time_per_iteration": 2.923275947570801 + }, + { + "auxiliary_loss_clip": 0.01662359, + "auxiliary_loss_mlp": 0.01067735, + "balance_loss_clip": 1.41073513, + "balance_loss_mlp": 1.03464246, + "epoch": 0.1382233578836615, + "flos": 19253541951360.0, + "grad_norm": 1.7355539396019553, + "language_loss": 0.88303435, + "learning_rate": 3.878391639291116e-06, + "loss": 0.9103353, + "num_input_tokens_seen": 49744710, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.33056641, + "step": 2299, + "time_per_iteration": 2.9149131774902344 + }, + { + "auxiliary_loss_clip": 0.01651722, + "auxiliary_loss_mlp": 0.01072056, + "balance_loss_clip": 1.40090823, + "balance_loss_mlp": 1.0390116, + "epoch": 0.1382834811363295, + "flos": 25677310343040.0, + "grad_norm": 5.734216485344874, + "language_loss": 0.77082896, + "learning_rate": 3.878257869538267e-06, + "loss": 0.79806674, + "num_input_tokens_seen": 49764300, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.33032227, + "step": 2300, + "time_per_iteration": 2.8860790729522705 + }, + { + "auxiliary_loss_clip": 0.01634787, + "auxiliary_loss_mlp": 0.01064214, + "balance_loss_clip": 1.38969612, + "balance_loss_mlp": 1.0308826, + "epoch": 0.13834360438899745, + "flos": 19791640085760.0, + "grad_norm": 2.6183225809528055, + "language_loss": 0.837901, + "learning_rate": 3.878124028561692e-06, + "loss": 0.86489105, + "num_input_tokens_seen": 49778380, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.33325195, + "step": 2301, + "time_per_iteration": 2.7982566356658936 + }, + { + "auxiliary_loss_clip": 0.01619347, + "auxiliary_loss_mlp": 0.01065431, + "balance_loss_clip": 1.3763541, + "balance_loss_mlp": 1.0302403, + "epoch": 0.13840372764166542, + "flos": 26663251939200.0, + "grad_norm": 2.334312110046111, + "language_loss": 0.86689854, + "learning_rate": 3.877990116366466e-06, + "loss": 0.89374626, + "num_input_tokens_seen": 49797460, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.35180664, + "step": 2302, + "time_per_iteration": 2.933627128601074 + }, + { + "auxiliary_loss_clip": 0.01375083, + "auxiliary_loss_mlp": 0.01096188, + "balance_loss_clip": 1.23007524, + "balance_loss_mlp": 1.0637629, + "epoch": 0.13846385089433338, + "flos": 70544086721280.0, + "grad_norm": 0.7694075619459086, + "language_loss": 0.65696061, + "learning_rate": 3.877856132957667e-06, + "loss": 0.68167329, + "num_input_tokens_seen": 49868005, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.32421875, + "step": 2303, + "time_per_iteration": 3.4920811653137207 + }, + { + "auxiliary_loss_clip": 0.0163514, + "auxiliary_loss_mlp": 0.01057899, + "balance_loss_clip": 1.39082217, + "balance_loss_mlp": 1.0250448, + "epoch": 0.13852397414700135, + "flos": 17357905526400.0, + "grad_norm": 1.8102974722059613, + "language_loss": 0.79097563, + "learning_rate": 3.877722078340374e-06, + "loss": 0.81790602, + "num_input_tokens_seen": 49885825, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.32836914, + "step": 2304, + "time_per_iteration": 2.86325740814209 + }, + { + "auxiliary_loss_clip": 0.01642673, + "auxiliary_loss_mlp": 0.01066193, + "balance_loss_clip": 1.39341402, + "balance_loss_mlp": 1.03412545, + "epoch": 0.13858409739966931, + "flos": 21553831036800.0, + "grad_norm": 1.7465539057971116, + "language_loss": 0.78547144, + "learning_rate": 3.877587952519672e-06, + "loss": 0.81256008, + "num_input_tokens_seen": 49905975, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.32055664, + "step": 2305, + "time_per_iteration": 2.905118227005005 + }, + { + "auxiliary_loss_clip": 0.01616813, + "auxiliary_loss_mlp": 0.01053882, + "balance_loss_clip": 1.37605703, + "balance_loss_mlp": 1.02133751, + "epoch": 0.13864422065233728, + "flos": 21589873159680.0, + "grad_norm": 1.8400297015789688, + "language_loss": 0.88436997, + "learning_rate": 3.877453755500647e-06, + "loss": 0.9110769, + "num_input_tokens_seen": 49925800, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.32543945, + "step": 2306, + "time_per_iteration": 2.9000682830810547 + }, + { + "auxiliary_loss_clip": 0.01372888, + "auxiliary_loss_mlp": 0.01042115, + "balance_loss_clip": 1.22738898, + "balance_loss_mlp": 1.01312292, + "epoch": 0.13870434390500527, + "flos": 53392525320960.0, + "grad_norm": 0.8759171136330309, + "language_loss": 0.59178662, + "learning_rate": 3.877319487288387e-06, + "loss": 0.61593664, + "num_input_tokens_seen": 49977620, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.2890625, + "step": 2307, + "time_per_iteration": 3.4423067569732666 + }, + { + "auxiliary_loss_clip": 0.01638553, + "auxiliary_loss_mlp": 0.01055021, + "balance_loss_clip": 1.38810611, + "balance_loss_mlp": 1.022668, + "epoch": 0.13876446715767324, + "flos": 22576357693440.0, + "grad_norm": 1.719251173169034, + "language_loss": 0.81373048, + "learning_rate": 3.877185147887984e-06, + "loss": 0.84066623, + "num_input_tokens_seen": 49996650, + "router_z_loss_clip": 2.50585938, + "router_z_loss_mlp": 0.32348633, + "step": 2308, + "time_per_iteration": 2.871844530105591 + }, + { + "auxiliary_loss_clip": 0.01635757, + "auxiliary_loss_mlp": 0.01055701, + "balance_loss_clip": 1.39072704, + "balance_loss_mlp": 1.02201259, + "epoch": 0.1388245904103412, + "flos": 20715225068160.0, + "grad_norm": 3.1599756978129134, + "language_loss": 0.79424459, + "learning_rate": 3.877050737304533e-06, + "loss": 0.82115918, + "num_input_tokens_seen": 50015640, + "router_z_loss_clip": 2.44921875, + "router_z_loss_mlp": 0.33642578, + "step": 2309, + "time_per_iteration": 2.9168717861175537 + }, + { + "auxiliary_loss_clip": 0.01650808, + "auxiliary_loss_mlp": 0.01060165, + "balance_loss_clip": 1.39875507, + "balance_loss_mlp": 1.02213693, + "epoch": 0.13888471366300917, + "flos": 20563500695040.0, + "grad_norm": 2.541335455070293, + "language_loss": 0.69368148, + "learning_rate": 3.876916255543129e-06, + "loss": 0.72079116, + "num_input_tokens_seen": 50033500, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.38061523, + "step": 2310, + "time_per_iteration": 2.9068870544433594 + }, + { + "auxiliary_loss_clip": 0.01627544, + "auxiliary_loss_mlp": 0.01051722, + "balance_loss_clip": 1.38296056, + "balance_loss_mlp": 1.01898658, + "epoch": 0.13894483691567713, + "flos": 13844201397120.0, + "grad_norm": 1.920746616908058, + "language_loss": 0.84710497, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.87389767, + "num_input_tokens_seen": 50050075, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.32739258, + "step": 2311, + "time_per_iteration": 2.7985341548919678 + }, + { + "auxiliary_loss_clip": 0.01646788, + "auxiliary_loss_mlp": 0.0105646, + "balance_loss_clip": 1.39500189, + "balance_loss_mlp": 1.02281928, + "epoch": 0.1390049601683451, + "flos": 28041087162240.0, + "grad_norm": 2.790235069413957, + "language_loss": 0.83126694, + "learning_rate": 3.876647078506866e-06, + "loss": 0.85829943, + "num_input_tokens_seen": 50070080, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.33642578, + "step": 2312, + "time_per_iteration": 2.9148976802825928 + }, + { + "auxiliary_loss_clip": 0.01640634, + "auxiliary_loss_mlp": 0.01055744, + "balance_loss_clip": 1.38843, + "balance_loss_mlp": 1.02229369, + "epoch": 0.13906508342101306, + "flos": 26767125296640.0, + "grad_norm": 2.036075385292159, + "language_loss": 0.87701476, + "learning_rate": 3.876512383242215e-06, + "loss": 0.90397859, + "num_input_tokens_seen": 50090040, + "router_z_loss_clip": 2.51953125, + "router_z_loss_mlp": 0.3347168, + "step": 2313, + "time_per_iteration": 2.88066029548645 + }, + { + "auxiliary_loss_clip": 0.01630722, + "auxiliary_loss_mlp": 0.01059798, + "balance_loss_clip": 1.38329327, + "balance_loss_mlp": 1.02703941, + "epoch": 0.13912520667368106, + "flos": 24545797666560.0, + "grad_norm": 3.530057320834044, + "language_loss": 0.81492472, + "learning_rate": 3.876377616820024e-06, + "loss": 0.84182996, + "num_input_tokens_seen": 50110595, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.32763672, + "step": 2314, + "time_per_iteration": 2.9306485652923584 + }, + { + "auxiliary_loss_clip": 0.01632271, + "auxiliary_loss_mlp": 0.01052571, + "balance_loss_clip": 1.38508272, + "balance_loss_mlp": 1.01990771, + "epoch": 0.13918532992634902, + "flos": 19390652254080.0, + "grad_norm": 3.2142751539982592, + "language_loss": 0.86546069, + "learning_rate": 3.876242779245409e-06, + "loss": 0.89230907, + "num_input_tokens_seen": 50125430, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.32666016, + "step": 2315, + "time_per_iteration": 2.8000879287719727 + }, + { + "auxiliary_loss_clip": 0.0161785, + "auxiliary_loss_mlp": 0.0105618, + "balance_loss_clip": 1.37289786, + "balance_loss_mlp": 1.02289653, + "epoch": 0.139245453179017, + "flos": 21333461022720.0, + "grad_norm": 9.465105356672096, + "language_loss": 0.79370141, + "learning_rate": 3.876107870523477e-06, + "loss": 0.82044172, + "num_input_tokens_seen": 50144120, + "router_z_loss_clip": 2.44726562, + "router_z_loss_mlp": 0.33276367, + "step": 2316, + "time_per_iteration": 4.340923309326172 + }, + { + "auxiliary_loss_clip": 0.01638209, + "auxiliary_loss_mlp": 0.01066801, + "balance_loss_clip": 1.39142644, + "balance_loss_mlp": 1.03239703, + "epoch": 0.13930557643168495, + "flos": 19510135067520.0, + "grad_norm": 1.7566421288694996, + "language_loss": 0.78027683, + "learning_rate": 3.875972890659349e-06, + "loss": 0.80732691, + "num_input_tokens_seen": 50162500, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.34399414, + "step": 2317, + "time_per_iteration": 2.9390931129455566 + }, + { + "auxiliary_loss_clip": 0.01627939, + "auxiliary_loss_mlp": 0.01053554, + "balance_loss_clip": 1.37960374, + "balance_loss_mlp": 1.02100945, + "epoch": 0.13936569968435292, + "flos": 25421757857280.0, + "grad_norm": 1.82452261010462, + "language_loss": 0.8217653, + "learning_rate": 3.875837839658139e-06, + "loss": 0.8485803, + "num_input_tokens_seen": 50182415, + "router_z_loss_clip": 2.48632812, + "router_z_loss_mlp": 0.32519531, + "step": 2318, + "time_per_iteration": 2.9527127742767334 + }, + { + "auxiliary_loss_clip": 0.01352149, + "auxiliary_loss_mlp": 0.01050376, + "balance_loss_clip": 1.21307516, + "balance_loss_mlp": 1.02815557, + "epoch": 0.13942582293702088, + "flos": 70805430541440.0, + "grad_norm": 0.8594569330444389, + "language_loss": 0.59070164, + "learning_rate": 3.87570271752497e-06, + "loss": 0.6147269, + "num_input_tokens_seen": 50245160, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.22265625, + "step": 2319, + "time_per_iteration": 3.4224491119384766 + }, + { + "auxiliary_loss_clip": 0.01642888, + "auxiliary_loss_mlp": 0.01059322, + "balance_loss_clip": 1.39111853, + "balance_loss_mlp": 1.02572858, + "epoch": 0.13948594618968888, + "flos": 35604602784000.0, + "grad_norm": 2.1918677684076338, + "language_loss": 0.66975778, + "learning_rate": 3.875567524264967e-06, + "loss": 0.69677985, + "num_input_tokens_seen": 50268215, + "router_z_loss_clip": 2.515625, + "router_z_loss_mlp": 0.3359375, + "step": 2320, + "time_per_iteration": 3.0264053344726562 + }, + { + "auxiliary_loss_clip": 0.01623993, + "auxiliary_loss_mlp": 0.01056995, + "balance_loss_clip": 1.38063312, + "balance_loss_mlp": 1.02392662, + "epoch": 0.13954606944235684, + "flos": 21114946045440.0, + "grad_norm": 1.8762078423529711, + "language_loss": 0.71743512, + "learning_rate": 3.875432259883256e-06, + "loss": 0.74424505, + "num_input_tokens_seen": 50288575, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.33081055, + "step": 2321, + "time_per_iteration": 2.8691439628601074 + }, + { + "auxiliary_loss_clip": 0.0165045, + "auxiliary_loss_mlp": 0.01068449, + "balance_loss_clip": 1.40010047, + "balance_loss_mlp": 1.0340457, + "epoch": 0.1396061926950248, + "flos": 25054866622080.0, + "grad_norm": 1.7783384032489562, + "language_loss": 0.87280405, + "learning_rate": 3.875296924384965e-06, + "loss": 0.89999306, + "num_input_tokens_seen": 50308735, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.34423828, + "step": 2322, + "time_per_iteration": 2.893277168273926 + }, + { + "auxiliary_loss_clip": 0.0159457, + "auxiliary_loss_mlp": 0.01064542, + "balance_loss_clip": 1.3581804, + "balance_loss_mlp": 1.03237975, + "epoch": 0.13966631594769277, + "flos": 37648253508480.0, + "grad_norm": 1.7267943961018501, + "language_loss": 0.68017215, + "learning_rate": 3.875161517775226e-06, + "loss": 0.70676327, + "num_input_tokens_seen": 50331025, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.32128906, + "step": 2323, + "time_per_iteration": 4.3679749965667725 + }, + { + "auxiliary_loss_clip": 0.01664538, + "auxiliary_loss_mlp": 0.01065105, + "balance_loss_clip": 1.40908074, + "balance_loss_mlp": 1.0327282, + "epoch": 0.13972643920036074, + "flos": 16699736396160.0, + "grad_norm": 1.927920641441456, + "language_loss": 0.90066803, + "learning_rate": 3.875026040059175e-06, + "loss": 0.92796439, + "num_input_tokens_seen": 50349725, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.32373047, + "step": 2324, + "time_per_iteration": 2.8434062004089355 + }, + { + "auxiliary_loss_clip": 0.01632375, + "auxiliary_loss_mlp": 0.01071455, + "balance_loss_clip": 1.38592601, + "balance_loss_mlp": 1.03845787, + "epoch": 0.1397865624530287, + "flos": 23341431582720.0, + "grad_norm": 2.1257148628097555, + "language_loss": 0.73364228, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.76068056, + "num_input_tokens_seen": 50367965, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.32958984, + "step": 2325, + "time_per_iteration": 5.6795454025268555 + }, + { + "auxiliary_loss_clip": 0.01649918, + "auxiliary_loss_mlp": 0.01075518, + "balance_loss_clip": 1.40032721, + "balance_loss_mlp": 1.04366517, + "epoch": 0.13984668570569667, + "flos": 22787135809920.0, + "grad_norm": 3.7035930695530275, + "language_loss": 0.83274931, + "learning_rate": 3.874754871328688e-06, + "loss": 0.86000371, + "num_input_tokens_seen": 50385605, + "router_z_loss_clip": 2.49804688, + "router_z_loss_mlp": 0.31835938, + "step": 2326, + "time_per_iteration": 2.8711376190185547 + }, + { + "auxiliary_loss_clip": 0.01614925, + "auxiliary_loss_mlp": 0.01075556, + "balance_loss_clip": 1.37575102, + "balance_loss_mlp": 1.0432744, + "epoch": 0.13990680895836466, + "flos": 19474183434240.0, + "grad_norm": 1.7635959704967323, + "language_loss": 0.90110981, + "learning_rate": 3.874619180324534e-06, + "loss": 0.92801458, + "num_input_tokens_seen": 50403985, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.32275391, + "step": 2327, + "time_per_iteration": 3.0145423412323 + }, + { + "auxiliary_loss_clip": 0.01628261, + "auxiliary_loss_mlp": 0.01073342, + "balance_loss_clip": 1.38491499, + "balance_loss_mlp": 1.04280066, + "epoch": 0.13996693221103262, + "flos": 20313015626880.0, + "grad_norm": 3.4510086391731516, + "language_loss": 0.86017036, + "learning_rate": 3.874483418234632e-06, + "loss": 0.88718641, + "num_input_tokens_seen": 50421590, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.30541992, + "step": 2328, + "time_per_iteration": 2.8392908573150635 + }, + { + "auxiliary_loss_clip": 0.01633239, + "auxiliary_loss_mlp": 0.01075743, + "balance_loss_clip": 1.38645601, + "balance_loss_mlp": 1.04224467, + "epoch": 0.1400270554637006, + "flos": 26629698280320.0, + "grad_norm": 1.6475550783505535, + "language_loss": 0.75199139, + "learning_rate": 3.874347585064131e-06, + "loss": 0.77908123, + "num_input_tokens_seen": 50443945, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.33496094, + "step": 2329, + "time_per_iteration": 2.9233975410461426 + }, + { + "auxiliary_loss_clip": 0.01652117, + "auxiliary_loss_mlp": 0.01078985, + "balance_loss_clip": 1.40495968, + "balance_loss_mlp": 1.04779935, + "epoch": 0.14008717871636855, + "flos": 19401058558080.0, + "grad_norm": 2.3462378288022814, + "language_loss": 0.7931031, + "learning_rate": 3.874211680818183e-06, + "loss": 0.82041419, + "num_input_tokens_seen": 50462065, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.31152344, + "step": 2330, + "time_per_iteration": 2.835026264190674 + }, + { + "auxiliary_loss_clip": 0.01632307, + "auxiliary_loss_mlp": 0.01067094, + "balance_loss_clip": 1.38814712, + "balance_loss_mlp": 1.03583753, + "epoch": 0.14014730196903652, + "flos": 15312580744320.0, + "grad_norm": 2.3779340487929557, + "language_loss": 0.73317617, + "learning_rate": 3.87407570550194e-06, + "loss": 0.76017016, + "num_input_tokens_seen": 50479565, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.3125, + "step": 2331, + "time_per_iteration": 2.884646415710449 + }, + { + "auxiliary_loss_clip": 0.01612648, + "auxiliary_loss_mlp": 0.01070681, + "balance_loss_clip": 1.3779223, + "balance_loss_mlp": 1.03858995, + "epoch": 0.14020742522170448, + "flos": 14947996993920.0, + "grad_norm": 1.7071936378706467, + "language_loss": 0.73611492, + "learning_rate": 3.873939659120557e-06, + "loss": 0.76294822, + "num_input_tokens_seen": 50497305, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.32055664, + "step": 2332, + "time_per_iteration": 2.852910041809082 + }, + { + "auxiliary_loss_clip": 0.013816, + "auxiliary_loss_mlp": 0.01055754, + "balance_loss_clip": 1.23955917, + "balance_loss_mlp": 1.03381908, + "epoch": 0.14026754847437245, + "flos": 48848439922560.0, + "grad_norm": 0.8318201338907145, + "language_loss": 0.56159484, + "learning_rate": 3.873803541679196e-06, + "loss": 0.58596838, + "num_input_tokens_seen": 50549735, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.21972656, + "step": 2333, + "time_per_iteration": 3.272254228591919 + }, + { + "auxiliary_loss_clip": 0.01616496, + "auxiliary_loss_mlp": 0.01066817, + "balance_loss_clip": 1.37790334, + "balance_loss_mlp": 1.03577447, + "epoch": 0.14032767172704044, + "flos": 25783672164480.0, + "grad_norm": 6.766443035333321, + "language_loss": 0.83434403, + "learning_rate": 3.873667353183016e-06, + "loss": 0.86117721, + "num_input_tokens_seen": 50570100, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.30981445, + "step": 2334, + "time_per_iteration": 2.9175710678100586 + }, + { + "auxiliary_loss_clip": 0.01612441, + "auxiliary_loss_mlp": 0.01065029, + "balance_loss_clip": 1.36926818, + "balance_loss_mlp": 1.03238964, + "epoch": 0.1403877949797084, + "flos": 21226511018880.0, + "grad_norm": 1.6988844538218988, + "language_loss": 0.81767982, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.84445453, + "num_input_tokens_seen": 50589185, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.32617188, + "step": 2335, + "time_per_iteration": 2.881754159927368 + }, + { + "auxiliary_loss_clip": 0.01635649, + "auxiliary_loss_mlp": 0.01058145, + "balance_loss_clip": 1.38581192, + "balance_loss_mlp": 1.02402771, + "epoch": 0.14044791823237637, + "flos": 22758378099840.0, + "grad_norm": 1.9862901466903873, + "language_loss": 0.82837868, + "learning_rate": 3.873394763046862e-06, + "loss": 0.85531664, + "num_input_tokens_seen": 50609645, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.34106445, + "step": 2336, + "time_per_iteration": 2.901790142059326 + }, + { + "auxiliary_loss_clip": 0.01607115, + "auxiliary_loss_mlp": 0.0105752, + "balance_loss_clip": 1.3682971, + "balance_loss_mlp": 1.02709782, + "epoch": 0.14050804148504434, + "flos": 22974268878720.0, + "grad_norm": 1.6517150943130394, + "language_loss": 0.81906366, + "learning_rate": 3.873258361417225e-06, + "loss": 0.84571004, + "num_input_tokens_seen": 50628385, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.30371094, + "step": 2337, + "time_per_iteration": 2.9112491607666016 + }, + { + "auxiliary_loss_clip": 0.01624454, + "auxiliary_loss_mlp": 0.01063361, + "balance_loss_clip": 1.38274181, + "balance_loss_mlp": 1.03241444, + "epoch": 0.1405681647377123, + "flos": 22210507088640.0, + "grad_norm": 1.8362264257672276, + "language_loss": 0.79956633, + "learning_rate": 3.873121888753442e-06, + "loss": 0.82644451, + "num_input_tokens_seen": 50647260, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.30957031, + "step": 2338, + "time_per_iteration": 2.862473487854004 + }, + { + "auxiliary_loss_clip": 0.01640933, + "auxiliary_loss_mlp": 0.01058551, + "balance_loss_clip": 1.39533687, + "balance_loss_mlp": 1.02522027, + "epoch": 0.14062828799038027, + "flos": 23743369555200.0, + "grad_norm": 3.9264249806134552, + "language_loss": 0.82020855, + "learning_rate": 3.87298534506069e-06, + "loss": 0.84720337, + "num_input_tokens_seen": 50666130, + "router_z_loss_clip": 2.45703125, + "router_z_loss_mlp": 0.33325195, + "step": 2339, + "time_per_iteration": 2.879000425338745 + }, + { + "auxiliary_loss_clip": 0.01617225, + "auxiliary_loss_mlp": 0.01057463, + "balance_loss_clip": 1.37799239, + "balance_loss_mlp": 1.02651584, + "epoch": 0.14068841124304826, + "flos": 39217022363520.0, + "grad_norm": 1.6947342328099662, + "language_loss": 0.66542888, + "learning_rate": 3.872848730344146e-06, + "loss": 0.69217575, + "num_input_tokens_seen": 50687440, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.30932617, + "step": 2340, + "time_per_iteration": 2.9939119815826416 + }, + { + "auxiliary_loss_clip": 0.01605963, + "auxiliary_loss_mlp": 0.01054857, + "balance_loss_clip": 1.37140179, + "balance_loss_mlp": 1.02441144, + "epoch": 0.14074853449571623, + "flos": 20201586387840.0, + "grad_norm": 2.4732193786028627, + "language_loss": 0.80035716, + "learning_rate": 3.87271204460899e-06, + "loss": 0.82696533, + "num_input_tokens_seen": 50704030, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.3046875, + "step": 2341, + "time_per_iteration": 2.8511369228363037 + }, + { + "auxiliary_loss_clip": 0.01611395, + "auxiliary_loss_mlp": 0.01058356, + "balance_loss_clip": 1.37125754, + "balance_loss_mlp": 1.02688491, + "epoch": 0.1408086577483842, + "flos": 18415071717120.0, + "grad_norm": 2.006100483965079, + "language_loss": 0.8335678, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.86026525, + "num_input_tokens_seen": 50723305, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.31445312, + "step": 2342, + "time_per_iteration": 2.8208444118499756 + }, + { + "auxiliary_loss_clip": 0.01609853, + "auxiliary_loss_mlp": 0.01062728, + "balance_loss_clip": 1.37654722, + "balance_loss_mlp": 1.03094697, + "epoch": 0.14086878100105216, + "flos": 25275417615360.0, + "grad_norm": 1.8388903944664412, + "language_loss": 0.79156315, + "learning_rate": 3.87243846010358e-06, + "loss": 0.81828898, + "num_input_tokens_seen": 50743270, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.31762695, + "step": 2343, + "time_per_iteration": 2.875600576400757 + }, + { + "auxiliary_loss_clip": 0.01406192, + "auxiliary_loss_mlp": 0.01075144, + "balance_loss_clip": 1.26912808, + "balance_loss_mlp": 1.05435371, + "epoch": 0.14092890425372012, + "flos": 66008127404160.0, + "grad_norm": 0.8398256394052303, + "language_loss": 0.61598265, + "learning_rate": 3.872301561343699e-06, + "loss": 0.64079601, + "num_input_tokens_seen": 50802710, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.20800781, + "step": 2344, + "time_per_iteration": 3.286590814590454 + }, + { + "auxiliary_loss_clip": 0.01599927, + "auxiliary_loss_mlp": 0.01054911, + "balance_loss_clip": 1.36303604, + "balance_loss_mlp": 1.02539515, + "epoch": 0.1409890275063881, + "flos": 23705200926720.0, + "grad_norm": 1.6145457035707937, + "language_loss": 0.66377115, + "learning_rate": 3.872164591585956e-06, + "loss": 0.69031948, + "num_input_tokens_seen": 50822625, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.29492188, + "step": 2345, + "time_per_iteration": 2.8507354259490967 + }, + { + "auxiliary_loss_clip": 0.01629398, + "auxiliary_loss_mlp": 0.01068856, + "balance_loss_clip": 1.380422, + "balance_loss_mlp": 1.03645444, + "epoch": 0.14104915075905605, + "flos": 23633659618560.0, + "grad_norm": 5.623152688799827, + "language_loss": 0.75418937, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.78117192, + "num_input_tokens_seen": 50842330, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.32373047, + "step": 2346, + "time_per_iteration": 2.9753735065460205 + }, + { + "auxiliary_loss_clip": 0.01617351, + "auxiliary_loss_mlp": 0.01061118, + "balance_loss_clip": 1.37582469, + "balance_loss_mlp": 1.03012383, + "epoch": 0.14110927401172405, + "flos": 20604972193920.0, + "grad_norm": 1.9414593574026042, + "language_loss": 0.78218257, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.80896729, + "num_input_tokens_seen": 50861035, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.30981445, + "step": 2347, + "time_per_iteration": 2.8268840312957764 + }, + { + "auxiliary_loss_clip": 0.01617053, + "auxiliary_loss_mlp": 0.01064168, + "balance_loss_clip": 1.37634468, + "balance_loss_mlp": 1.03434181, + "epoch": 0.141169397264392, + "flos": 28559657525760.0, + "grad_norm": 2.2796688253939856, + "language_loss": 0.77822477, + "learning_rate": 3.8717532563775e-06, + "loss": 0.80503696, + "num_input_tokens_seen": 50880105, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.2980957, + "step": 2348, + "time_per_iteration": 3.0220208168029785 + }, + { + "auxiliary_loss_clip": 0.01609035, + "auxiliary_loss_mlp": 0.01062174, + "balance_loss_clip": 1.36837077, + "balance_loss_mlp": 1.03167999, + "epoch": 0.14122952051705998, + "flos": 17101086186240.0, + "grad_norm": 1.7163639182311072, + "language_loss": 0.87789166, + "learning_rate": 3.871616002680272e-06, + "loss": 0.90460378, + "num_input_tokens_seen": 50897720, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.3046875, + "step": 2349, + "time_per_iteration": 2.894247055053711 + }, + { + "auxiliary_loss_clip": 0.01610279, + "auxiliary_loss_mlp": 0.01065742, + "balance_loss_clip": 1.37187839, + "balance_loss_mlp": 1.03248239, + "epoch": 0.14128964376972794, + "flos": 28957840179840.0, + "grad_norm": 1.749751954119047, + "language_loss": 0.89668572, + "learning_rate": 3.871478678011177e-06, + "loss": 0.92344594, + "num_input_tokens_seen": 50918385, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.33251953, + "step": 2350, + "time_per_iteration": 2.912184476852417 + }, + { + "auxiliary_loss_clip": 0.01614644, + "auxiliary_loss_mlp": 0.01054722, + "balance_loss_clip": 1.37495542, + "balance_loss_mlp": 1.02191591, + "epoch": 0.1413497670223959, + "flos": 18999165830400.0, + "grad_norm": 1.6404993629008684, + "language_loss": 0.82245195, + "learning_rate": 3.871341282375423e-06, + "loss": 0.84914565, + "num_input_tokens_seen": 50938270, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.328125, + "step": 2351, + "time_per_iteration": 4.283477306365967 + }, + { + "auxiliary_loss_clip": 0.0162686, + "auxiliary_loss_mlp": 0.01065809, + "balance_loss_clip": 1.38428259, + "balance_loss_mlp": 1.03405178, + "epoch": 0.14140989027506387, + "flos": 29874366973440.0, + "grad_norm": 2.395217832178772, + "language_loss": 0.85020399, + "learning_rate": 3.871203815778219e-06, + "loss": 0.87713069, + "num_input_tokens_seen": 50958155, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.31738281, + "step": 2352, + "time_per_iteration": 2.9445481300354004 + }, + { + "auxiliary_loss_clip": 0.01392869, + "auxiliary_loss_mlp": 0.01028167, + "balance_loss_clip": 1.25452483, + "balance_loss_mlp": 1.00222671, + "epoch": 0.14147001352773186, + "flos": 62109587836800.0, + "grad_norm": 0.9085418427079294, + "language_loss": 0.61959434, + "learning_rate": 3.87106627822478e-06, + "loss": 0.64380473, + "num_input_tokens_seen": 51020705, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.25976562, + "step": 2353, + "time_per_iteration": 3.305818796157837 + }, + { + "auxiliary_loss_clip": 0.01594299, + "auxiliary_loss_mlp": 0.01053789, + "balance_loss_clip": 1.35818315, + "balance_loss_mlp": 1.02329564, + "epoch": 0.14153013678039983, + "flos": 22027536541440.0, + "grad_norm": 1.8057413114390988, + "language_loss": 0.88154662, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.90802753, + "num_input_tokens_seen": 51039995, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.3046875, + "step": 2354, + "time_per_iteration": 2.889789581298828 + }, + { + "auxiliary_loss_clip": 0.01604279, + "auxiliary_loss_mlp": 0.01067295, + "balance_loss_clip": 1.36557984, + "balance_loss_mlp": 1.03494215, + "epoch": 0.1415902600330678, + "flos": 19729826409600.0, + "grad_norm": 1.9319551634847458, + "language_loss": 0.75634724, + "learning_rate": 3.870790990270057e-06, + "loss": 0.78306299, + "num_input_tokens_seen": 51059075, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.32324219, + "step": 2355, + "time_per_iteration": 2.846365451812744 + }, + { + "auxiliary_loss_clip": 0.01379029, + "auxiliary_loss_mlp": 0.01050853, + "balance_loss_clip": 1.24227977, + "balance_loss_mlp": 1.03092134, + "epoch": 0.14165038328573576, + "flos": 65929482662400.0, + "grad_norm": 0.691843487536347, + "language_loss": 0.51918292, + "learning_rate": 3.870653239879212e-06, + "loss": 0.54348171, + "num_input_tokens_seen": 51120380, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.19921875, + "step": 2356, + "time_per_iteration": 3.268338918685913 + }, + { + "auxiliary_loss_clip": 0.01600487, + "auxiliary_loss_mlp": 0.01067468, + "balance_loss_clip": 1.36443973, + "balance_loss_mlp": 1.03687882, + "epoch": 0.14171050653840372, + "flos": 12137688812160.0, + "grad_norm": 2.92411820521841, + "language_loss": 0.7202217, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.74690127, + "num_input_tokens_seen": 51136950, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.30566406, + "step": 2357, + "time_per_iteration": 4.253419876098633 + }, + { + "auxiliary_loss_clip": 0.01610081, + "auxiliary_loss_mlp": 0.01076981, + "balance_loss_clip": 1.3675406, + "balance_loss_mlp": 1.04560494, + "epoch": 0.1417706297910717, + "flos": 20422182625920.0, + "grad_norm": 2.022395953579635, + "language_loss": 0.83696049, + "learning_rate": 3.870377526296674e-06, + "loss": 0.8638311, + "num_input_tokens_seen": 51155175, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.31323242, + "step": 2358, + "time_per_iteration": 2.841686248779297 + }, + { + "auxiliary_loss_clip": 0.01623141, + "auxiliary_loss_mlp": 0.01075058, + "balance_loss_clip": 1.37743437, + "balance_loss_mlp": 1.04229939, + "epoch": 0.14183075304373965, + "flos": 22390265255040.0, + "grad_norm": 2.3433211237524185, + "language_loss": 0.73283213, + "learning_rate": 3.870239563115436e-06, + "loss": 0.75981414, + "num_input_tokens_seen": 51174500, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.32739258, + "step": 2359, + "time_per_iteration": 2.8791491985321045 + }, + { + "auxiliary_loss_clip": 0.01620303, + "auxiliary_loss_mlp": 0.01085294, + "balance_loss_clip": 1.37884593, + "balance_loss_mlp": 1.05491948, + "epoch": 0.14189087629640765, + "flos": 21591140014080.0, + "grad_norm": 2.9742112684483266, + "language_loss": 0.77477169, + "learning_rate": 3.870101529014526e-06, + "loss": 0.80182767, + "num_input_tokens_seen": 51194270, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.30371094, + "step": 2360, + "time_per_iteration": 4.3731913566589355 + }, + { + "auxiliary_loss_clip": 0.01604834, + "auxiliary_loss_mlp": 0.01090343, + "balance_loss_clip": 1.37089419, + "balance_loss_mlp": 1.05872917, + "epoch": 0.1419509995490756, + "flos": 20017484720640.0, + "grad_norm": 2.252993672458751, + "language_loss": 0.822909, + "learning_rate": 3.869963423999178e-06, + "loss": 0.84986079, + "num_input_tokens_seen": 51211850, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.31616211, + "step": 2361, + "time_per_iteration": 2.8596935272216797 + }, + { + "auxiliary_loss_clip": 0.01592287, + "auxiliary_loss_mlp": 0.010972, + "balance_loss_clip": 1.35875511, + "balance_loss_mlp": 1.06425071, + "epoch": 0.14201112280174358, + "flos": 31954014576000.0, + "grad_norm": 4.316711822545349, + "language_loss": 0.75756466, + "learning_rate": 3.86982524807463e-06, + "loss": 0.78445947, + "num_input_tokens_seen": 51233545, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.32910156, + "step": 2362, + "time_per_iteration": 3.011115550994873 + }, + { + "auxiliary_loss_clip": 0.01600215, + "auxiliary_loss_mlp": 0.01093529, + "balance_loss_clip": 1.36895275, + "balance_loss_mlp": 1.06110382, + "epoch": 0.14207124605441154, + "flos": 41479142820480.0, + "grad_norm": 2.1826705631937333, + "language_loss": 0.75180179, + "learning_rate": 3.869687001246122e-06, + "loss": 0.77873921, + "num_input_tokens_seen": 51257615, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.32421875, + "step": 2363, + "time_per_iteration": 3.109760046005249 + }, + { + "auxiliary_loss_clip": 0.01601792, + "auxiliary_loss_mlp": 0.01096876, + "balance_loss_clip": 1.36596751, + "balance_loss_mlp": 1.06418872, + "epoch": 0.1421313693070795, + "flos": 31917655739520.0, + "grad_norm": 2.2864797542817756, + "language_loss": 0.73866224, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.76564896, + "num_input_tokens_seen": 51279645, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.32666016, + "step": 2364, + "time_per_iteration": 2.9079737663269043 + }, + { + "auxiliary_loss_clip": 0.01586114, + "auxiliary_loss_mlp": 0.01093332, + "balance_loss_clip": 1.35753059, + "balance_loss_mlp": 1.06386375, + "epoch": 0.14219149255974747, + "flos": 26882717057280.0, + "grad_norm": 2.387982881766686, + "language_loss": 0.91494268, + "learning_rate": 3.869410294898195e-06, + "loss": 0.94173712, + "num_input_tokens_seen": 51299775, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.29467773, + "step": 2365, + "time_per_iteration": 2.870861053466797 + }, + { + "auxiliary_loss_clip": 0.01600975, + "auxiliary_loss_mlp": 0.0107681, + "balance_loss_clip": 1.36483407, + "balance_loss_mlp": 1.04572022, + "epoch": 0.14225161581241544, + "flos": 27465996764160.0, + "grad_norm": 1.6627374264620778, + "language_loss": 0.66033643, + "learning_rate": 3.869271835389268e-06, + "loss": 0.6871143, + "num_input_tokens_seen": 51319430, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.31079102, + "step": 2366, + "time_per_iteration": 2.870525360107422 + }, + { + "auxiliary_loss_clip": 0.01596831, + "auxiliary_loss_mlp": 0.01081826, + "balance_loss_clip": 1.36514783, + "balance_loss_mlp": 1.05023527, + "epoch": 0.14231173906508343, + "flos": 10568603243520.0, + "grad_norm": 2.0408473866747947, + "language_loss": 0.81706887, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.84385538, + "num_input_tokens_seen": 51336045, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.31542969, + "step": 2367, + "time_per_iteration": 2.840629816055298 + }, + { + "auxiliary_loss_clip": 0.01620593, + "auxiliary_loss_mlp": 0.01082158, + "balance_loss_clip": 1.38275743, + "balance_loss_mlp": 1.04861271, + "epoch": 0.1423718623177514, + "flos": 28371212357760.0, + "grad_norm": 13.71021952936621, + "language_loss": 0.84252203, + "learning_rate": 3.868994703727742e-06, + "loss": 0.86954951, + "num_input_tokens_seen": 51357030, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.33544922, + "step": 2368, + "time_per_iteration": 2.9260501861572266 + }, + { + "auxiliary_loss_clip": 0.01612974, + "auxiliary_loss_mlp": 0.01077798, + "balance_loss_clip": 1.37518716, + "balance_loss_mlp": 1.04775691, + "epoch": 0.14243198557041936, + "flos": 19363613846400.0, + "grad_norm": 2.4487238844388406, + "language_loss": 0.87895322, + "learning_rate": 3.868856031585652e-06, + "loss": 0.9058609, + "num_input_tokens_seen": 51374890, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.30053711, + "step": 2369, + "time_per_iteration": 2.8076565265655518 + }, + { + "auxiliary_loss_clip": 0.01619131, + "auxiliary_loss_mlp": 0.01069732, + "balance_loss_clip": 1.37982154, + "balance_loss_mlp": 1.03873801, + "epoch": 0.14249210882308733, + "flos": 28819417777920.0, + "grad_norm": 1.4827280377032062, + "language_loss": 0.7668035, + "learning_rate": 3.868717288576354e-06, + "loss": 0.79369217, + "num_input_tokens_seen": 51398100, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.30981445, + "step": 2370, + "time_per_iteration": 2.9657251834869385 + }, + { + "auxiliary_loss_clip": 0.01594248, + "auxiliary_loss_mlp": 0.01067187, + "balance_loss_clip": 1.36166203, + "balance_loss_mlp": 1.03616846, + "epoch": 0.1425522320757553, + "flos": 21844973197440.0, + "grad_norm": 1.5559044979928462, + "language_loss": 0.83770508, + "learning_rate": 3.868578474705109e-06, + "loss": 0.86431944, + "num_input_tokens_seen": 51418745, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.31005859, + "step": 2371, + "time_per_iteration": 2.863511323928833 + }, + { + "auxiliary_loss_clip": 0.01607922, + "auxiliary_loss_mlp": 0.01058515, + "balance_loss_clip": 1.37283063, + "balance_loss_mlp": 1.02728176, + "epoch": 0.14261235532842326, + "flos": 17320415569920.0, + "grad_norm": 1.8955851715887884, + "language_loss": 0.83669901, + "learning_rate": 3.868439589977181e-06, + "loss": 0.86336344, + "num_input_tokens_seen": 51437455, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.31201172, + "step": 2372, + "time_per_iteration": 2.8337435722351074 + }, + { + "auxiliary_loss_clip": 0.01597041, + "auxiliary_loss_mlp": 0.01063265, + "balance_loss_clip": 1.362795, + "balance_loss_mlp": 1.03141248, + "epoch": 0.14267247858109125, + "flos": 18815923814400.0, + "grad_norm": 2.112139036569057, + "language_loss": 0.86047512, + "learning_rate": 3.868300634397836e-06, + "loss": 0.88707817, + "num_input_tokens_seen": 51455710, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.31835938, + "step": 2373, + "time_per_iteration": 2.8205771446228027 + }, + { + "auxiliary_loss_clip": 0.01597377, + "auxiliary_loss_mlp": 0.01053792, + "balance_loss_clip": 1.36404777, + "balance_loss_mlp": 1.02458572, + "epoch": 0.14273260183375922, + "flos": 11364923306880.0, + "grad_norm": 2.749592346777685, + "language_loss": 0.86694163, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.89345336, + "num_input_tokens_seen": 51471270, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.29223633, + "step": 2374, + "time_per_iteration": 2.808540105819702 + }, + { + "auxiliary_loss_clip": 0.01612694, + "auxiliary_loss_mlp": 0.01056301, + "balance_loss_clip": 1.37529993, + "balance_loss_mlp": 1.02554488, + "epoch": 0.14279272508642718, + "flos": 27580276425600.0, + "grad_norm": 2.038114009381148, + "language_loss": 0.79840702, + "learning_rate": 3.868022510705977e-06, + "loss": 0.82509696, + "num_input_tokens_seen": 51492705, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.30737305, + "step": 2375, + "time_per_iteration": 2.9454026222229004 + }, + { + "auxiliary_loss_clip": 0.01596752, + "auxiliary_loss_mlp": 0.0106175, + "balance_loss_clip": 1.36423099, + "balance_loss_mlp": 1.03120875, + "epoch": 0.14285284833909515, + "flos": 16261032384000.0, + "grad_norm": 2.372152200894251, + "language_loss": 0.78125644, + "learning_rate": 3.867883342604009e-06, + "loss": 0.80784148, + "num_input_tokens_seen": 51510780, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.30541992, + "step": 2376, + "time_per_iteration": 2.8123724460601807 + }, + { + "auxiliary_loss_clip": 0.01608547, + "auxiliary_loss_mlp": 0.0105849, + "balance_loss_clip": 1.37433434, + "balance_loss_mlp": 1.02773452, + "epoch": 0.1429129715917631, + "flos": 19765099370880.0, + "grad_norm": 1.672735133373138, + "language_loss": 0.94192588, + "learning_rate": 3.867744103671717e-06, + "loss": 0.96859622, + "num_input_tokens_seen": 51531400, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.30737305, + "step": 2377, + "time_per_iteration": 2.8993492126464844 + }, + { + "auxiliary_loss_clip": 0.01593244, + "auxiliary_loss_mlp": 0.01056885, + "balance_loss_clip": 1.35885954, + "balance_loss_mlp": 1.02512801, + "epoch": 0.14297309484443108, + "flos": 21145649281920.0, + "grad_norm": 1.9915095951989057, + "language_loss": 0.92629075, + "learning_rate": 3.867604793914382e-06, + "loss": 0.95279205, + "num_input_tokens_seen": 51548215, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.31738281, + "step": 2378, + "time_per_iteration": 2.8385934829711914 + }, + { + "auxiliary_loss_clip": 0.01611593, + "auxiliary_loss_mlp": 0.0105057, + "balance_loss_clip": 1.37269735, + "balance_loss_mlp": 1.01914608, + "epoch": 0.14303321809709904, + "flos": 23597165047680.0, + "grad_norm": 1.6268927608668478, + "language_loss": 0.7502929, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.77691448, + "num_input_tokens_seen": 51566820, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.31420898, + "step": 2379, + "time_per_iteration": 2.8775177001953125 + }, + { + "auxiliary_loss_clip": 0.01608942, + "auxiliary_loss_mlp": 0.01059672, + "balance_loss_clip": 1.37264907, + "balance_loss_mlp": 1.02834356, + "epoch": 0.14309334134976703, + "flos": 15896810592000.0, + "grad_norm": 2.1356605410274536, + "language_loss": 0.80074501, + "learning_rate": 3.867325961945714e-06, + "loss": 0.82743108, + "num_input_tokens_seen": 51585075, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.31323242, + "step": 2380, + "time_per_iteration": 2.8050918579101562 + }, + { + "auxiliary_loss_clip": 0.01614913, + "auxiliary_loss_mlp": 0.0105147, + "balance_loss_clip": 1.37609935, + "balance_loss_mlp": 1.01904488, + "epoch": 0.143153464602435, + "flos": 16334112015360.0, + "grad_norm": 6.7112238299017735, + "language_loss": 0.89772177, + "learning_rate": 3.867186439744955e-06, + "loss": 0.92438561, + "num_input_tokens_seen": 51603185, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.32421875, + "step": 2381, + "time_per_iteration": 2.837433338165283 + }, + { + "auxiliary_loss_clip": 0.01599519, + "auxiliary_loss_mlp": 0.01058189, + "balance_loss_clip": 1.36684406, + "balance_loss_mlp": 1.02626514, + "epoch": 0.14321358785510296, + "flos": 17100859962240.0, + "grad_norm": 3.2887071472440463, + "language_loss": 0.78557754, + "learning_rate": 3.867046846740299e-06, + "loss": 0.81215465, + "num_input_tokens_seen": 51620880, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.3190918, + "step": 2382, + "time_per_iteration": 2.831972599029541 + }, + { + "auxiliary_loss_clip": 0.01608142, + "auxiliary_loss_mlp": 0.01056688, + "balance_loss_clip": 1.37095809, + "balance_loss_mlp": 1.02669466, + "epoch": 0.14327371110777093, + "flos": 26334076884480.0, + "grad_norm": 6.409145745830681, + "language_loss": 0.78877008, + "learning_rate": 3.866907182937039e-06, + "loss": 0.81541836, + "num_input_tokens_seen": 51640170, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.29980469, + "step": 2383, + "time_per_iteration": 2.8583290576934814 + }, + { + "auxiliary_loss_clip": 0.01602983, + "auxiliary_loss_mlp": 0.01058576, + "balance_loss_clip": 1.36641049, + "balance_loss_mlp": 1.02712882, + "epoch": 0.1433338343604389, + "flos": 18085308480000.0, + "grad_norm": 2.215159205351764, + "language_loss": 0.890674, + "learning_rate": 3.866767448340471e-06, + "loss": 0.91728956, + "num_input_tokens_seen": 51656580, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.31469727, + "step": 2384, + "time_per_iteration": 2.8275039196014404 + }, + { + "auxiliary_loss_clip": 0.01613571, + "auxiliary_loss_mlp": 0.01072719, + "balance_loss_clip": 1.37281752, + "balance_loss_mlp": 1.03996038, + "epoch": 0.14339395761310686, + "flos": 15531050476800.0, + "grad_norm": 2.3934938586838603, + "language_loss": 0.81163818, + "learning_rate": 3.866627642955895e-06, + "loss": 0.8385011, + "num_input_tokens_seen": 51674645, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.32739258, + "step": 2385, + "time_per_iteration": 2.8215246200561523 + }, + { + "auxiliary_loss_clip": 0.01594399, + "auxiliary_loss_mlp": 0.01063634, + "balance_loss_clip": 1.35633123, + "balance_loss_mlp": 1.02999365, + "epoch": 0.14345408086577485, + "flos": 28560562421760.0, + "grad_norm": 2.2438886305930166, + "language_loss": 0.76533985, + "learning_rate": 3.866487766788612e-06, + "loss": 0.79192019, + "num_input_tokens_seen": 51695770, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.33642578, + "step": 2386, + "time_per_iteration": 4.308884620666504 + }, + { + "auxiliary_loss_clip": 0.01602323, + "auxiliary_loss_mlp": 0.01059319, + "balance_loss_clip": 1.36807406, + "balance_loss_mlp": 1.02858698, + "epoch": 0.14351420411844282, + "flos": 20239709771520.0, + "grad_norm": 2.020131010993685, + "language_loss": 0.79871094, + "learning_rate": 3.866347819843925e-06, + "loss": 0.82532734, + "num_input_tokens_seen": 51714165, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.30712891, + "step": 2387, + "time_per_iteration": 2.8708927631378174 + }, + { + "auxiliary_loss_clip": 0.01607053, + "auxiliary_loss_mlp": 0.0106969, + "balance_loss_clip": 1.37247372, + "balance_loss_mlp": 1.03781295, + "epoch": 0.14357432737111078, + "flos": 19874537838720.0, + "grad_norm": 1.9970171609362568, + "language_loss": 0.83954918, + "learning_rate": 3.866207802127143e-06, + "loss": 0.86631662, + "num_input_tokens_seen": 51734440, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.31860352, + "step": 2388, + "time_per_iteration": 2.9311416149139404 + }, + { + "auxiliary_loss_clip": 0.01610535, + "auxiliary_loss_mlp": 0.0106912, + "balance_loss_clip": 1.37482572, + "balance_loss_mlp": 1.03865027, + "epoch": 0.14363445062377875, + "flos": 28268515365120.0, + "grad_norm": 2.146347198167039, + "language_loss": 0.84124887, + "learning_rate": 3.866067713643573e-06, + "loss": 0.86804545, + "num_input_tokens_seen": 51753730, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.3046875, + "step": 2389, + "time_per_iteration": 2.9113101959228516 + }, + { + "auxiliary_loss_clip": 0.01632146, + "auxiliary_loss_mlp": 0.01068372, + "balance_loss_clip": 1.38858056, + "balance_loss_mlp": 1.0363282, + "epoch": 0.1436945738764467, + "flos": 18195742333440.0, + "grad_norm": 1.9013253007090372, + "language_loss": 0.84623563, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.87324083, + "num_input_tokens_seen": 51771195, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.3203125, + "step": 2390, + "time_per_iteration": 2.827235698699951 + }, + { + "auxiliary_loss_clip": 0.01612364, + "auxiliary_loss_mlp": 0.01071307, + "balance_loss_clip": 1.37565136, + "balance_loss_mlp": 1.04000306, + "epoch": 0.14375469712911468, + "flos": 27319294563840.0, + "grad_norm": 1.5995499239569122, + "language_loss": 0.75786912, + "learning_rate": 3.865787324397324e-06, + "loss": 0.78470588, + "num_input_tokens_seen": 51792290, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.31298828, + "step": 2391, + "time_per_iteration": 2.95215106010437 + }, + { + "auxiliary_loss_clip": 0.01394926, + "auxiliary_loss_mlp": 0.01024197, + "balance_loss_clip": 1.26203871, + "balance_loss_mlp": 1.01008272, + "epoch": 0.14381482038178264, + "flos": 56918309811840.0, + "grad_norm": 1.1034181602899682, + "language_loss": 0.61851257, + "learning_rate": 3.865647023645277e-06, + "loss": 0.64270383, + "num_input_tokens_seen": 51843675, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.14160156, + "step": 2392, + "time_per_iteration": 4.752182245254517 + }, + { + "auxiliary_loss_clip": 0.01614418, + "auxiliary_loss_mlp": 0.01059714, + "balance_loss_clip": 1.37473547, + "balance_loss_mlp": 1.02750349, + "epoch": 0.14387494363445064, + "flos": 14290099332480.0, + "grad_norm": 2.249445640393279, + "language_loss": 0.78363097, + "learning_rate": 3.865506652147709e-06, + "loss": 0.81037223, + "num_input_tokens_seen": 51860285, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.32226562, + "step": 2393, + "time_per_iteration": 2.8432326316833496 + }, + { + "auxiliary_loss_clip": 0.0159347, + "auxiliary_loss_mlp": 0.01063395, + "balance_loss_clip": 1.35851943, + "balance_loss_mlp": 1.03278255, + "epoch": 0.1439350668871186, + "flos": 26772961875840.0, + "grad_norm": 2.1132475218764113, + "language_loss": 0.77863598, + "learning_rate": 3.865366209909941e-06, + "loss": 0.80520469, + "num_input_tokens_seen": 51880105, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.3059082, + "step": 2394, + "time_per_iteration": 4.298219442367554 + }, + { + "auxiliary_loss_clip": 0.01594426, + "auxiliary_loss_mlp": 0.01064728, + "balance_loss_clip": 1.36001563, + "balance_loss_mlp": 1.03387702, + "epoch": 0.14399519013978657, + "flos": 40715381030400.0, + "grad_norm": 1.5343387424281707, + "language_loss": 0.86661482, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.89320636, + "num_input_tokens_seen": 51905175, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.30834961, + "step": 2395, + "time_per_iteration": 4.431222200393677 + }, + { + "auxiliary_loss_clip": 0.01596243, + "auxiliary_loss_mlp": 0.01052844, + "balance_loss_clip": 1.36416352, + "balance_loss_mlp": 1.02208769, + "epoch": 0.14405531339245453, + "flos": 20567029789440.0, + "grad_norm": 1.6233470949404256, + "language_loss": 0.83255321, + "learning_rate": 3.865085113235113e-06, + "loss": 0.85904408, + "num_input_tokens_seen": 51924490, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.30786133, + "step": 2396, + "time_per_iteration": 2.863414764404297 + }, + { + "auxiliary_loss_clip": 0.01593356, + "auxiliary_loss_mlp": 0.01050513, + "balance_loss_clip": 1.36140323, + "balance_loss_mlp": 1.01715803, + "epoch": 0.1441154366451225, + "flos": 19582309802880.0, + "grad_norm": 2.156972783251991, + "language_loss": 0.84967053, + "learning_rate": 3.864944458808712e-06, + "loss": 0.87610924, + "num_input_tokens_seen": 51940490, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.33349609, + "step": 2397, + "time_per_iteration": 2.841856002807617 + }, + { + "auxiliary_loss_clip": 0.01602675, + "auxiliary_loss_mlp": 0.01047348, + "balance_loss_clip": 1.36672473, + "balance_loss_mlp": 1.01849914, + "epoch": 0.14417555989779046, + "flos": 18525053122560.0, + "grad_norm": 2.1261393892660925, + "language_loss": 0.81257457, + "learning_rate": 3.86480373366343e-06, + "loss": 0.83907479, + "num_input_tokens_seen": 51957910, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.28857422, + "step": 2398, + "time_per_iteration": 2.8316028118133545 + }, + { + "auxiliary_loss_clip": 0.01592775, + "auxiliary_loss_mlp": 0.01055806, + "balance_loss_clip": 1.36033463, + "balance_loss_mlp": 1.02347636, + "epoch": 0.14423568315045843, + "flos": 26042934723840.0, + "grad_norm": 2.034103437813451, + "language_loss": 0.664698, + "learning_rate": 3.864662937804603e-06, + "loss": 0.69118387, + "num_input_tokens_seen": 51978010, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.32275391, + "step": 2399, + "time_per_iteration": 2.901139974594116 + }, + { + "auxiliary_loss_clip": 0.015951, + "auxiliary_loss_mlp": 0.01044679, + "balance_loss_clip": 1.36251307, + "balance_loss_mlp": 1.0145669, + "epoch": 0.14429580640312642, + "flos": 21298821488640.0, + "grad_norm": 2.5643106553571333, + "language_loss": 0.835513, + "learning_rate": 3.864522071237571e-06, + "loss": 0.86191082, + "num_input_tokens_seen": 51998515, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.30126953, + "step": 2400, + "time_per_iteration": 2.841078042984009 + }, + { + "auxiliary_loss_clip": 0.01605178, + "auxiliary_loss_mlp": 0.01050308, + "balance_loss_clip": 1.36731982, + "balance_loss_mlp": 1.01876545, + "epoch": 0.14435592965579438, + "flos": 25638372552960.0, + "grad_norm": 1.6554615050129031, + "language_loss": 0.76015818, + "learning_rate": 3.864381133967676e-06, + "loss": 0.786713, + "num_input_tokens_seen": 52019270, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.31518555, + "step": 2401, + "time_per_iteration": 2.9077584743499756 + }, + { + "auxiliary_loss_clip": 0.01586079, + "auxiliary_loss_mlp": 0.01050051, + "balance_loss_clip": 1.35593474, + "balance_loss_mlp": 1.02020133, + "epoch": 0.14441605290846235, + "flos": 22975038040320.0, + "grad_norm": 1.5019647851175775, + "language_loss": 0.81528318, + "learning_rate": 3.86424012600026e-06, + "loss": 0.84164447, + "num_input_tokens_seen": 52039315, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.2980957, + "step": 2402, + "time_per_iteration": 2.8525078296661377 + }, + { + "auxiliary_loss_clip": 0.01599304, + "auxiliary_loss_mlp": 0.0105837, + "balance_loss_clip": 1.36496878, + "balance_loss_mlp": 1.02675557, + "epoch": 0.14447617616113032, + "flos": 17356548182400.0, + "grad_norm": 2.1686336451362607, + "language_loss": 0.85697448, + "learning_rate": 3.864099047340673e-06, + "loss": 0.88355124, + "num_input_tokens_seen": 52056555, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.31591797, + "step": 2403, + "time_per_iteration": 2.9182560443878174 + }, + { + "auxiliary_loss_clip": 0.01593269, + "auxiliary_loss_mlp": 0.01056956, + "balance_loss_clip": 1.36167645, + "balance_loss_mlp": 1.0249846, + "epoch": 0.14453629941379828, + "flos": 24070599083520.0, + "grad_norm": 1.6363283321188002, + "language_loss": 0.71530497, + "learning_rate": 3.863957897994262e-06, + "loss": 0.74180722, + "num_input_tokens_seen": 52075800, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.31958008, + "step": 2404, + "time_per_iteration": 2.9120168685913086 + }, + { + "auxiliary_loss_clip": 0.01579333, + "auxiliary_loss_mlp": 0.01053011, + "balance_loss_clip": 1.34751749, + "balance_loss_mlp": 1.02282739, + "epoch": 0.14459642266646625, + "flos": 14437887408000.0, + "grad_norm": 2.0298217178332507, + "language_loss": 0.74419796, + "learning_rate": 3.863816677966381e-06, + "loss": 0.7705214, + "num_input_tokens_seen": 52092585, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.30151367, + "step": 2405, + "time_per_iteration": 2.83675217628479 + }, + { + "auxiliary_loss_clip": 0.01586212, + "auxiliary_loss_mlp": 0.01056759, + "balance_loss_clip": 1.35589576, + "balance_loss_mlp": 1.0255022, + "epoch": 0.14465654591913424, + "flos": 9873713318400.0, + "grad_norm": 2.0825444185469424, + "language_loss": 0.74851823, + "learning_rate": 3.863675387262386e-06, + "loss": 0.77494794, + "num_input_tokens_seen": 52108990, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.3125, + "step": 2406, + "time_per_iteration": 2.946133613586426 + }, + { + "auxiliary_loss_clip": 0.01590213, + "auxiliary_loss_mlp": 0.01059567, + "balance_loss_clip": 1.35655117, + "balance_loss_mlp": 1.02802372, + "epoch": 0.1447166691718022, + "flos": 24983732517120.0, + "grad_norm": 4.04976035652705, + "language_loss": 0.76477027, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.79126811, + "num_input_tokens_seen": 52125385, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.31542969, + "step": 2407, + "time_per_iteration": 2.867593288421631 + }, + { + "auxiliary_loss_clip": 0.01580872, + "auxiliary_loss_mlp": 0.01055166, + "balance_loss_clip": 1.35105896, + "balance_loss_mlp": 1.02421904, + "epoch": 0.14477679242447017, + "flos": 21918007584000.0, + "grad_norm": 1.7706805467702038, + "language_loss": 0.79922009, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.82558054, + "num_input_tokens_seen": 52144985, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.30932617, + "step": 2408, + "time_per_iteration": 2.9143497943878174 + }, + { + "auxiliary_loss_clip": 0.0158526, + "auxiliary_loss_mlp": 0.0105749, + "balance_loss_clip": 1.35634005, + "balance_loss_mlp": 1.02516067, + "epoch": 0.14483691567713813, + "flos": 20750498029440.0, + "grad_norm": 5.409536954959013, + "language_loss": 0.83895898, + "learning_rate": 3.863251091147299e-06, + "loss": 0.86538649, + "num_input_tokens_seen": 52163885, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.32324219, + "step": 2409, + "time_per_iteration": 2.8502981662750244 + }, + { + "auxiliary_loss_clip": 0.01600371, + "auxiliary_loss_mlp": 0.01056002, + "balance_loss_clip": 1.36257112, + "balance_loss_mlp": 1.02605629, + "epoch": 0.1448970389298061, + "flos": 35421179788800.0, + "grad_norm": 2.888511675735519, + "language_loss": 0.76108909, + "learning_rate": 3.863109517792446e-06, + "loss": 0.78765285, + "num_input_tokens_seen": 52184325, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.29956055, + "step": 2410, + "time_per_iteration": 3.0120110511779785 + }, + { + "auxiliary_loss_clip": 0.01581924, + "auxiliary_loss_mlp": 0.0104922, + "balance_loss_clip": 1.35067391, + "balance_loss_mlp": 1.01954901, + "epoch": 0.14495716218247406, + "flos": 15422878863360.0, + "grad_norm": 1.7346306939130545, + "language_loss": 0.81896108, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.84527254, + "num_input_tokens_seen": 52202740, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.29650879, + "step": 2411, + "time_per_iteration": 2.8616509437561035 + }, + { + "auxiliary_loss_clip": 0.01577283, + "auxiliary_loss_mlp": 0.01061336, + "balance_loss_clip": 1.34692359, + "balance_loss_mlp": 1.02862465, + "epoch": 0.14501728543514203, + "flos": 33706342160640.0, + "grad_norm": 2.6051819251714283, + "language_loss": 0.71400374, + "learning_rate": 3.862826159140214e-06, + "loss": 0.74038988, + "num_input_tokens_seen": 52223100, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.32714844, + "step": 2412, + "time_per_iteration": 2.929386854171753 + }, + { + "auxiliary_loss_clip": 0.01575743, + "auxiliary_loss_mlp": 0.01052976, + "balance_loss_clip": 1.34697652, + "balance_loss_mlp": 1.02152848, + "epoch": 0.14507740868781002, + "flos": 15604492066560.0, + "grad_norm": 1.8761303436893757, + "language_loss": 0.7789284, + "learning_rate": 3.862684373853579e-06, + "loss": 0.8052156, + "num_input_tokens_seen": 52239690, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.31469727, + "step": 2413, + "time_per_iteration": 2.852015495300293 + }, + { + "auxiliary_loss_clip": 0.01370912, + "auxiliary_loss_mlp": 0.01075062, + "balance_loss_clip": 1.23643339, + "balance_loss_mlp": 1.05589318, + "epoch": 0.145137531940478, + "flos": 66706229710080.0, + "grad_norm": 0.9317366031918292, + "language_loss": 0.58966589, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.61412561, + "num_input_tokens_seen": 52296705, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.19140625, + "step": 2414, + "time_per_iteration": 3.2903523445129395 + }, + { + "auxiliary_loss_clip": 0.01365527, + "auxiliary_loss_mlp": 0.01036143, + "balance_loss_clip": 1.23498487, + "balance_loss_mlp": 1.01983476, + "epoch": 0.14519765519314595, + "flos": 67554518065920.0, + "grad_norm": 0.8550514228093964, + "language_loss": 0.62377703, + "learning_rate": 3.862400591386154e-06, + "loss": 0.64779377, + "num_input_tokens_seen": 52361830, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.16308594, + "step": 2415, + "time_per_iteration": 3.317106246948242 + }, + { + "auxiliary_loss_clip": 0.01574332, + "auxiliary_loss_mlp": 0.01059263, + "balance_loss_clip": 1.34486985, + "balance_loss_mlp": 1.02748179, + "epoch": 0.14525777844581392, + "flos": 17207040804480.0, + "grad_norm": 2.144750038224124, + "language_loss": 0.73410118, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.76043713, + "num_input_tokens_seen": 52379420, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.31787109, + "step": 2416, + "time_per_iteration": 2.9066548347473145 + }, + { + "auxiliary_loss_clip": 0.01361149, + "auxiliary_loss_mlp": 0.01065733, + "balance_loss_clip": 1.23172808, + "balance_loss_mlp": 1.05095088, + "epoch": 0.14531790169848188, + "flos": 65436792324480.0, + "grad_norm": 0.7329295152518381, + "language_loss": 0.60450971, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.62877846, + "num_input_tokens_seen": 52446290, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.14746094, + "step": 2417, + "time_per_iteration": 3.3311612606048584 + }, + { + "auxiliary_loss_clip": 0.01587677, + "auxiliary_loss_mlp": 0.01061117, + "balance_loss_clip": 1.35425115, + "balance_loss_mlp": 1.03174376, + "epoch": 0.14537802495114985, + "flos": 32575372421760.0, + "grad_norm": 8.367554620212376, + "language_loss": 0.80068278, + "learning_rate": 3.861974388030356e-06, + "loss": 0.82717073, + "num_input_tokens_seen": 52467295, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.29345703, + "step": 2418, + "time_per_iteration": 3.0119051933288574 + }, + { + "auxiliary_loss_clip": 0.01556466, + "auxiliary_loss_mlp": 0.01069577, + "balance_loss_clip": 1.33280754, + "balance_loss_mlp": 1.03996539, + "epoch": 0.1454381482038178, + "flos": 20235456760320.0, + "grad_norm": 1.9943320614385767, + "language_loss": 0.72661495, + "learning_rate": 3.861832179025394e-06, + "loss": 0.75287545, + "num_input_tokens_seen": 52487295, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.29577637, + "step": 2419, + "time_per_iteration": 2.874368906021118 + }, + { + "auxiliary_loss_clip": 0.01569359, + "auxiliary_loss_mlp": 0.01075027, + "balance_loss_clip": 1.34072208, + "balance_loss_mlp": 1.04560614, + "epoch": 0.1454982714564858, + "flos": 22903360997760.0, + "grad_norm": 2.4676035133631777, + "language_loss": 0.91476238, + "learning_rate": 3.861689899419569e-06, + "loss": 0.94120634, + "num_input_tokens_seen": 52504220, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.29443359, + "step": 2420, + "time_per_iteration": 2.8443875312805176 + }, + { + "auxiliary_loss_clip": 0.01580572, + "auxiliary_loss_mlp": 0.0107745, + "balance_loss_clip": 1.35306334, + "balance_loss_mlp": 1.04898262, + "epoch": 0.14555839470915377, + "flos": 20239438302720.0, + "grad_norm": 2.056867247746963, + "language_loss": 0.83415735, + "learning_rate": 3.861547549218276e-06, + "loss": 0.86073756, + "num_input_tokens_seen": 52521900, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.28466797, + "step": 2421, + "time_per_iteration": 4.289407253265381 + }, + { + "auxiliary_loss_clip": 0.01583608, + "auxiliary_loss_mlp": 0.01082025, + "balance_loss_clip": 1.35238981, + "balance_loss_mlp": 1.05293846, + "epoch": 0.14561851796182174, + "flos": 22246322987520.0, + "grad_norm": 1.5365264888965833, + "language_loss": 0.82462537, + "learning_rate": 3.861405128426914e-06, + "loss": 0.85128164, + "num_input_tokens_seen": 52540495, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.29101562, + "step": 2422, + "time_per_iteration": 2.8514363765716553 + }, + { + "auxiliary_loss_clip": 0.01364927, + "auxiliary_loss_mlp": 0.01050362, + "balance_loss_clip": 1.23624635, + "balance_loss_mlp": 1.03596163, + "epoch": 0.1456786412144897, + "flos": 52666226225280.0, + "grad_norm": 0.9183582729142833, + "language_loss": 0.63465953, + "learning_rate": 3.861262637050883e-06, + "loss": 0.6588124, + "num_input_tokens_seen": 52603305, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.14355469, + "step": 2423, + "time_per_iteration": 3.37003755569458 + }, + { + "auxiliary_loss_clip": 0.01581372, + "auxiliary_loss_mlp": 0.0108397, + "balance_loss_clip": 1.35357964, + "balance_loss_mlp": 1.05590761, + "epoch": 0.14573876446715767, + "flos": 23232038359680.0, + "grad_norm": 1.8448660604883096, + "language_loss": 0.8257817, + "learning_rate": 3.861120075095585e-06, + "loss": 0.85243511, + "num_input_tokens_seen": 52623435, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.28076172, + "step": 2424, + "time_per_iteration": 2.9177470207214355 + }, + { + "auxiliary_loss_clip": 0.01575357, + "auxiliary_loss_mlp": 0.01081601, + "balance_loss_clip": 1.34791541, + "balance_loss_mlp": 1.05186987, + "epoch": 0.14579888771982563, + "flos": 18123612842880.0, + "grad_norm": 2.3030985538441424, + "language_loss": 0.79468012, + "learning_rate": 3.860977442566429e-06, + "loss": 0.8212496, + "num_input_tokens_seen": 52642255, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.29736328, + "step": 2425, + "time_per_iteration": 2.8158977031707764 + }, + { + "auxiliary_loss_clip": 0.01583432, + "auxiliary_loss_mlp": 0.01069328, + "balance_loss_clip": 1.35475028, + "balance_loss_mlp": 1.03990746, + "epoch": 0.14585901097249362, + "flos": 23010989673600.0, + "grad_norm": 3.2397204750564756, + "language_loss": 0.84608501, + "learning_rate": 3.860834739468821e-06, + "loss": 0.8726126, + "num_input_tokens_seen": 52658700, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.29394531, + "step": 2426, + "time_per_iteration": 2.8816447257995605 + }, + { + "auxiliary_loss_clip": 0.01583948, + "auxiliary_loss_mlp": 0.01057699, + "balance_loss_clip": 1.35471892, + "balance_loss_mlp": 1.03006601, + "epoch": 0.1459191342251616, + "flos": 21918821990400.0, + "grad_norm": 1.90532483152315, + "language_loss": 0.88048673, + "learning_rate": 3.860691965808173e-06, + "loss": 0.90690321, + "num_input_tokens_seen": 52678140, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.27648926, + "step": 2427, + "time_per_iteration": 4.269078254699707 + }, + { + "auxiliary_loss_clip": 0.01579121, + "auxiliary_loss_mlp": 0.01055558, + "balance_loss_clip": 1.34598458, + "balance_loss_mlp": 1.02599406, + "epoch": 0.14597925747782955, + "flos": 14983631913600.0, + "grad_norm": 2.7558737183506437, + "language_loss": 0.68661225, + "learning_rate": 3.8605491215899e-06, + "loss": 0.71295905, + "num_input_tokens_seen": 52696825, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.29541016, + "step": 2428, + "time_per_iteration": 2.81520414352417 + }, + { + "auxiliary_loss_clip": 0.01562188, + "auxiliary_loss_mlp": 0.01058032, + "balance_loss_clip": 1.33635032, + "balance_loss_mlp": 1.02903974, + "epoch": 0.14603938073049752, + "flos": 21078677698560.0, + "grad_norm": 5.233510220377944, + "language_loss": 0.84185523, + "learning_rate": 3.860406206819417e-06, + "loss": 0.86805749, + "num_input_tokens_seen": 52715125, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.28955078, + "step": 2429, + "time_per_iteration": 4.34030294418335 + }, + { + "auxiliary_loss_clip": 0.01566806, + "auxiliary_loss_mlp": 0.01064581, + "balance_loss_clip": 1.34114206, + "balance_loss_mlp": 1.02886629, + "epoch": 0.14609950398316549, + "flos": 19874221125120.0, + "grad_norm": 1.7173646408902759, + "language_loss": 0.79886985, + "learning_rate": 3.860263221502145e-06, + "loss": 0.82518369, + "num_input_tokens_seen": 52734015, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.35742188, + "step": 2430, + "time_per_iteration": 4.372245788574219 + }, + { + "auxiliary_loss_clip": 0.01591279, + "auxiliary_loss_mlp": 0.01053677, + "balance_loss_clip": 1.36054361, + "balance_loss_mlp": 1.02451897, + "epoch": 0.14615962723583345, + "flos": 22428976821120.0, + "grad_norm": 3.1586732169302643, + "language_loss": 0.84933364, + "learning_rate": 3.860120165643504e-06, + "loss": 0.87578321, + "num_input_tokens_seen": 52753025, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.29150391, + "step": 2431, + "time_per_iteration": 2.9006173610687256 + }, + { + "auxiliary_loss_clip": 0.01601984, + "auxiliary_loss_mlp": 0.01052978, + "balance_loss_clip": 1.36653161, + "balance_loss_mlp": 1.0224607, + "epoch": 0.14621975048850142, + "flos": 22356440127360.0, + "grad_norm": 1.8143097874824667, + "language_loss": 0.8007319, + "learning_rate": 3.859977039248921e-06, + "loss": 0.82728148, + "num_input_tokens_seen": 52773420, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.30517578, + "step": 2432, + "time_per_iteration": 2.853865623474121 + }, + { + "auxiliary_loss_clip": 0.01576944, + "auxiliary_loss_mlp": 0.01053462, + "balance_loss_clip": 1.34744596, + "balance_loss_mlp": 1.0248518, + "epoch": 0.1462798737411694, + "flos": 24399819383040.0, + "grad_norm": 1.9265081765371002, + "language_loss": 0.81709933, + "learning_rate": 3.859833842323822e-06, + "loss": 0.8434034, + "num_input_tokens_seen": 52792870, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.28637695, + "step": 2433, + "time_per_iteration": 2.9274325370788574 + }, + { + "auxiliary_loss_clip": 0.01568038, + "auxiliary_loss_mlp": 0.01052679, + "balance_loss_clip": 1.34438705, + "balance_loss_mlp": 1.0231626, + "epoch": 0.14633999699383737, + "flos": 19253994399360.0, + "grad_norm": 1.9570749873944084, + "language_loss": 0.79375911, + "learning_rate": 3.859690574873638e-06, + "loss": 0.81996632, + "num_input_tokens_seen": 52811615, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.29541016, + "step": 2434, + "time_per_iteration": 2.8409407138824463 + }, + { + "auxiliary_loss_clip": 0.01343004, + "auxiliary_loss_mlp": 0.01020744, + "balance_loss_clip": 1.21680844, + "balance_loss_mlp": 1.00329196, + "epoch": 0.14640012024650534, + "flos": 62690152855680.0, + "grad_norm": 0.8585401572324304, + "language_loss": 0.5845117, + "learning_rate": 3.8595472369038e-06, + "loss": 0.60814917, + "num_input_tokens_seen": 52873230, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.17480469, + "step": 2435, + "time_per_iteration": 3.321195363998413 + }, + { + "auxiliary_loss_clip": 0.01555757, + "auxiliary_loss_mlp": 0.01052843, + "balance_loss_clip": 1.33559108, + "balance_loss_mlp": 1.02342176, + "epoch": 0.1464602434991733, + "flos": 12283848074880.0, + "grad_norm": 2.5635179615582615, + "language_loss": 0.89512515, + "learning_rate": 3.859403828419744e-06, + "loss": 0.92121112, + "num_input_tokens_seen": 52889325, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.29443359, + "step": 2436, + "time_per_iteration": 2.836540460586548 + }, + { + "auxiliary_loss_clip": 0.01585934, + "auxiliary_loss_mlp": 0.01056023, + "balance_loss_clip": 1.3560164, + "balance_loss_mlp": 1.02774692, + "epoch": 0.14652036675184127, + "flos": 20931885008640.0, + "grad_norm": 1.8102987678387317, + "language_loss": 0.75726384, + "learning_rate": 3.85926034942691e-06, + "loss": 0.78368342, + "num_input_tokens_seen": 52909705, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.28295898, + "step": 2437, + "time_per_iteration": 2.8478307723999023 + }, + { + "auxiliary_loss_clip": 0.0157737, + "auxiliary_loss_mlp": 0.01049367, + "balance_loss_clip": 1.34832513, + "balance_loss_mlp": 1.01756191, + "epoch": 0.14658049000450923, + "flos": 27713721899520.0, + "grad_norm": 2.544263970186643, + "language_loss": 0.75034577, + "learning_rate": 3.859116799930736e-06, + "loss": 0.77661312, + "num_input_tokens_seen": 52930300, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.31787109, + "step": 2438, + "time_per_iteration": 2.9239120483398438 + }, + { + "auxiliary_loss_clip": 0.01590221, + "auxiliary_loss_mlp": 0.01051473, + "balance_loss_clip": 1.36167932, + "balance_loss_mlp": 1.0230062, + "epoch": 0.14664061325717723, + "flos": 24947328435840.0, + "grad_norm": 2.29104043141979, + "language_loss": 0.75217468, + "learning_rate": 3.858973179936668e-06, + "loss": 0.77859163, + "num_input_tokens_seen": 52949955, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.28491211, + "step": 2439, + "time_per_iteration": 2.8377459049224854 + }, + { + "auxiliary_loss_clip": 0.01574495, + "auxiliary_loss_mlp": 0.01047685, + "balance_loss_clip": 1.34895861, + "balance_loss_mlp": 1.01953971, + "epoch": 0.1467007365098452, + "flos": 40312628651520.0, + "grad_norm": 2.208045634929891, + "language_loss": 0.75875598, + "learning_rate": 3.85882948945015e-06, + "loss": 0.78497779, + "num_input_tokens_seen": 52972905, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.28137207, + "step": 2440, + "time_per_iteration": 2.9897854328155518 + }, + { + "auxiliary_loss_clip": 0.01560824, + "auxiliary_loss_mlp": 0.01051798, + "balance_loss_clip": 1.33978009, + "balance_loss_mlp": 1.02267528, + "epoch": 0.14676085976251316, + "flos": 26552094168960.0, + "grad_norm": 1.5289094849396603, + "language_loss": 0.83531106, + "learning_rate": 3.85868572847663e-06, + "loss": 0.86143726, + "num_input_tokens_seen": 52994850, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.2911377, + "step": 2441, + "time_per_iteration": 2.8824429512023926 + }, + { + "auxiliary_loss_clip": 0.01593841, + "auxiliary_loss_mlp": 0.01049517, + "balance_loss_clip": 1.36109829, + "balance_loss_mlp": 1.01973855, + "epoch": 0.14682098301518112, + "flos": 23560579987200.0, + "grad_norm": 4.927821074044505, + "language_loss": 0.73849154, + "learning_rate": 3.858541897021563e-06, + "loss": 0.76492506, + "num_input_tokens_seen": 53014740, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.29772949, + "step": 2442, + "time_per_iteration": 2.905592679977417 + }, + { + "auxiliary_loss_clip": 0.01606083, + "auxiliary_loss_mlp": 0.01047369, + "balance_loss_clip": 1.36716187, + "balance_loss_mlp": 1.01747131, + "epoch": 0.1468811062678491, + "flos": 11657241832320.0, + "grad_norm": 3.3918386416994726, + "language_loss": 0.83834481, + "learning_rate": 3.8583979950904e-06, + "loss": 0.86487937, + "num_input_tokens_seen": 53029780, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.29882812, + "step": 2443, + "time_per_iteration": 2.887540817260742 + }, + { + "auxiliary_loss_clip": 0.01572524, + "auxiliary_loss_mlp": 0.01048618, + "balance_loss_clip": 1.34579921, + "balance_loss_mlp": 1.01852942, + "epoch": 0.14694122952051705, + "flos": 23012889955200.0, + "grad_norm": 2.4454216051832858, + "language_loss": 0.83900464, + "learning_rate": 3.858254022688599e-06, + "loss": 0.86521602, + "num_input_tokens_seen": 53048620, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.30102539, + "step": 2444, + "time_per_iteration": 2.8506932258605957 + }, + { + "auxiliary_loss_clip": 0.01580207, + "auxiliary_loss_mlp": 0.01050748, + "balance_loss_clip": 1.35115623, + "balance_loss_mlp": 1.02149463, + "epoch": 0.14700135277318502, + "flos": 26513563582080.0, + "grad_norm": 3.999043060794391, + "language_loss": 0.72562361, + "learning_rate": 3.85810997982162e-06, + "loss": 0.75193322, + "num_input_tokens_seen": 53070055, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.29272461, + "step": 2445, + "time_per_iteration": 2.9209847450256348 + }, + { + "auxiliary_loss_clip": 0.01346392, + "auxiliary_loss_mlp": 0.01023744, + "balance_loss_clip": 1.2227, + "balance_loss_mlp": 1.0120616, + "epoch": 0.147061476025853, + "flos": 59477997191040.0, + "grad_norm": 0.8312323359699864, + "language_loss": 0.63300699, + "learning_rate": 3.857965866494923e-06, + "loss": 0.65670836, + "num_input_tokens_seen": 53126945, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.11669922, + "step": 2446, + "time_per_iteration": 3.293339490890503 + }, + { + "auxiliary_loss_clip": 0.015799, + "auxiliary_loss_mlp": 0.0105626, + "balance_loss_clip": 1.35137892, + "balance_loss_mlp": 1.02686274, + "epoch": 0.14712159927852098, + "flos": 28342907095680.0, + "grad_norm": 1.5366549896560606, + "language_loss": 0.75865173, + "learning_rate": 3.857821682713975e-06, + "loss": 0.78501332, + "num_input_tokens_seen": 53149130, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.29394531, + "step": 2447, + "time_per_iteration": 2.956453323364258 + }, + { + "auxiliary_loss_clip": 0.01577446, + "auxiliary_loss_mlp": 0.01055541, + "balance_loss_clip": 1.35084093, + "balance_loss_mlp": 1.02633488, + "epoch": 0.14718172253118894, + "flos": 27101367768960.0, + "grad_norm": 1.9467285956481803, + "language_loss": 0.86396098, + "learning_rate": 3.857677428484242e-06, + "loss": 0.89029086, + "num_input_tokens_seen": 53167120, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.29174805, + "step": 2448, + "time_per_iteration": 3.068194627761841 + }, + { + "auxiliary_loss_clip": 0.01349095, + "auxiliary_loss_mlp": 0.01058682, + "balance_loss_clip": 1.22351503, + "balance_loss_mlp": 1.04227912, + "epoch": 0.1472418457838569, + "flos": 66736344764160.0, + "grad_norm": 0.7725532966744346, + "language_loss": 0.56921339, + "learning_rate": 3.857533103811195e-06, + "loss": 0.59329116, + "num_input_tokens_seen": 53227945, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.1640625, + "step": 2449, + "time_per_iteration": 3.2737629413604736 + }, + { + "auxiliary_loss_clip": 0.01560233, + "auxiliary_loss_mlp": 0.01057431, + "balance_loss_clip": 1.3374387, + "balance_loss_mlp": 1.02693677, + "epoch": 0.14730196903652487, + "flos": 19583124209280.0, + "grad_norm": 2.422147013724452, + "language_loss": 0.86359602, + "learning_rate": 3.857388708700307e-06, + "loss": 0.88977265, + "num_input_tokens_seen": 53244615, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.30493164, + "step": 2450, + "time_per_iteration": 2.8711159229278564 + }, + { + "auxiliary_loss_clip": 0.01580493, + "auxiliary_loss_mlp": 0.01053035, + "balance_loss_clip": 1.3503201, + "balance_loss_mlp": 1.02403164, + "epoch": 0.14736209228919284, + "flos": 16079690649600.0, + "grad_norm": 1.8123697517463242, + "language_loss": 0.76290739, + "learning_rate": 3.857244243157052e-06, + "loss": 0.78924263, + "num_input_tokens_seen": 53262205, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.28967285, + "step": 2451, + "time_per_iteration": 2.82853102684021 + }, + { + "auxiliary_loss_clip": 0.01561362, + "auxiliary_loss_mlp": 0.01053265, + "balance_loss_clip": 1.34118867, + "balance_loss_mlp": 1.0243454, + "epoch": 0.1474222155418608, + "flos": 23049836974080.0, + "grad_norm": 1.9823887850699196, + "language_loss": 0.82716966, + "learning_rate": 3.85709970718691e-06, + "loss": 0.85331589, + "num_input_tokens_seen": 53282445, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.28881836, + "step": 2452, + "time_per_iteration": 2.8779518604278564 + }, + { + "auxiliary_loss_clip": 0.01573375, + "auxiliary_loss_mlp": 0.01053269, + "balance_loss_clip": 1.34931266, + "balance_loss_mlp": 1.02486098, + "epoch": 0.1474823387945288, + "flos": 17027282638080.0, + "grad_norm": 1.851489603150712, + "language_loss": 0.74730116, + "learning_rate": 3.856955100795361e-06, + "loss": 0.77356762, + "num_input_tokens_seen": 53299060, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.28417969, + "step": 2453, + "time_per_iteration": 2.8405158519744873 + }, + { + "auxiliary_loss_clip": 0.01584998, + "auxiliary_loss_mlp": 0.01056789, + "balance_loss_clip": 1.35608816, + "balance_loss_mlp": 1.02777338, + "epoch": 0.14754246204719676, + "flos": 17904102480000.0, + "grad_norm": 2.0879907031203224, + "language_loss": 0.78202295, + "learning_rate": 3.856810423987889e-06, + "loss": 0.8084408, + "num_input_tokens_seen": 53315970, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.29003906, + "step": 2454, + "time_per_iteration": 2.8011245727539062 + }, + { + "auxiliary_loss_clip": 0.01568388, + "auxiliary_loss_mlp": 0.01058141, + "balance_loss_clip": 1.34540582, + "balance_loss_mlp": 1.03007889, + "epoch": 0.14760258529986472, + "flos": 13086954858240.0, + "grad_norm": 3.7247532349304953, + "language_loss": 0.84016323, + "learning_rate": 3.856665676769979e-06, + "loss": 0.86642855, + "num_input_tokens_seen": 53332940, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.28076172, + "step": 2455, + "time_per_iteration": 2.8156449794769287 + }, + { + "auxiliary_loss_clip": 0.01594737, + "auxiliary_loss_mlp": 0.01058132, + "balance_loss_clip": 1.36274457, + "balance_loss_mlp": 1.02925968, + "epoch": 0.1476627085525327, + "flos": 30817615461120.0, + "grad_norm": 1.8067855492741638, + "language_loss": 0.84960383, + "learning_rate": 3.85652085914712e-06, + "loss": 0.87613249, + "num_input_tokens_seen": 53353295, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.28881836, + "step": 2456, + "time_per_iteration": 4.3665876388549805 + }, + { + "auxiliary_loss_clip": 0.0156687, + "auxiliary_loss_mlp": 0.01055587, + "balance_loss_clip": 1.34509969, + "balance_loss_mlp": 1.02659535, + "epoch": 0.14772283180520066, + "flos": 21699447361920.0, + "grad_norm": 1.6570976695327901, + "language_loss": 0.85331202, + "learning_rate": 3.856375971124805e-06, + "loss": 0.87953663, + "num_input_tokens_seen": 53373410, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.28955078, + "step": 2457, + "time_per_iteration": 2.893263101577759 + }, + { + "auxiliary_loss_clip": 0.01561137, + "auxiliary_loss_mlp": 0.01056159, + "balance_loss_clip": 1.34075928, + "balance_loss_mlp": 1.02590418, + "epoch": 0.14778295505786862, + "flos": 18779610222720.0, + "grad_norm": 2.0235330563948737, + "language_loss": 0.76298487, + "learning_rate": 3.856231012708527e-06, + "loss": 0.78915787, + "num_input_tokens_seen": 53391430, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.30273438, + "step": 2458, + "time_per_iteration": 2.8886444568634033 + }, + { + "auxiliary_loss_clip": 0.01599291, + "auxiliary_loss_mlp": 0.01060264, + "balance_loss_clip": 1.36543226, + "balance_loss_mlp": 1.03060508, + "epoch": 0.1478430783105366, + "flos": 22903903935360.0, + "grad_norm": 2.072392561101042, + "language_loss": 0.84571671, + "learning_rate": 3.856085983903782e-06, + "loss": 0.87231225, + "num_input_tokens_seen": 53409960, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.29602051, + "step": 2459, + "time_per_iteration": 2.815490245819092 + }, + { + "auxiliary_loss_clip": 0.01565602, + "auxiliary_loss_mlp": 0.0105169, + "balance_loss_clip": 1.3433665, + "balance_loss_mlp": 1.02362835, + "epoch": 0.14790320156320458, + "flos": 15093568074240.0, + "grad_norm": 2.3041233555635605, + "language_loss": 0.76149702, + "learning_rate": 3.855940884716071e-06, + "loss": 0.7876699, + "num_input_tokens_seen": 53426160, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.28076172, + "step": 2460, + "time_per_iteration": 2.8124966621398926 + }, + { + "auxiliary_loss_clip": 0.01574041, + "auxiliary_loss_mlp": 0.01053948, + "balance_loss_clip": 1.34450173, + "balance_loss_mlp": 1.02502751, + "epoch": 0.14796332481587254, + "flos": 26515825822080.0, + "grad_norm": 1.6567060379051366, + "language_loss": 0.82039517, + "learning_rate": 3.855795715150896e-06, + "loss": 0.84667504, + "num_input_tokens_seen": 53448530, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.28930664, + "step": 2461, + "time_per_iteration": 2.8879010677337646 + }, + { + "auxiliary_loss_clip": 0.01567956, + "auxiliary_loss_mlp": 0.010576, + "balance_loss_clip": 1.34080243, + "balance_loss_mlp": 1.02662992, + "epoch": 0.1480234480685405, + "flos": 17571262596480.0, + "grad_norm": 2.4670100001332678, + "language_loss": 0.68136472, + "learning_rate": 3.855650475213761e-06, + "loss": 0.70762032, + "num_input_tokens_seen": 53465915, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.30981445, + "step": 2462, + "time_per_iteration": 4.256190299987793 + }, + { + "auxiliary_loss_clip": 0.01576922, + "auxiliary_loss_mlp": 0.01055861, + "balance_loss_clip": 1.34876978, + "balance_loss_mlp": 1.02543879, + "epoch": 0.14808357132120847, + "flos": 53601584135040.0, + "grad_norm": 1.754159694087517, + "language_loss": 0.68145126, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.70777905, + "num_input_tokens_seen": 53496055, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.30444336, + "step": 2463, + "time_per_iteration": 3.153669595718384 + }, + { + "auxiliary_loss_clip": 0.01576067, + "auxiliary_loss_mlp": 0.01053601, + "balance_loss_clip": 1.34836817, + "balance_loss_mlp": 1.02370334, + "epoch": 0.14814369457387644, + "flos": 19838857674240.0, + "grad_norm": 2.018223381303197, + "language_loss": 0.78488469, + "learning_rate": 3.855359784245646e-06, + "loss": 0.81118131, + "num_input_tokens_seen": 53513790, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.29882812, + "step": 2464, + "time_per_iteration": 5.678634166717529 + }, + { + "auxiliary_loss_clip": 0.01565264, + "auxiliary_loss_mlp": 0.01046345, + "balance_loss_clip": 1.3419764, + "balance_loss_mlp": 1.01853299, + "epoch": 0.1482038178265444, + "flos": 23925525696000.0, + "grad_norm": 1.8028690473382691, + "language_loss": 0.80604744, + "learning_rate": 3.855214333225688e-06, + "loss": 0.83216351, + "num_input_tokens_seen": 53533410, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.2779541, + "step": 2465, + "time_per_iteration": 2.90708327293396 + }, + { + "auxiliary_loss_clip": 0.01594262, + "auxiliary_loss_mlp": 0.01054808, + "balance_loss_clip": 1.36080277, + "balance_loss_mlp": 1.02257442, + "epoch": 0.1482639410792124, + "flos": 24181168671360.0, + "grad_norm": 1.9008397545447315, + "language_loss": 0.77257395, + "learning_rate": 3.855068811855817e-06, + "loss": 0.79906464, + "num_input_tokens_seen": 53554775, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.32226562, + "step": 2466, + "time_per_iteration": 2.842912435531616 + }, + { + "auxiliary_loss_clip": 0.0137556, + "auxiliary_loss_mlp": 0.01036337, + "balance_loss_clip": 1.23823953, + "balance_loss_mlp": 1.01764512, + "epoch": 0.14832406433188036, + "flos": 66219222234240.0, + "grad_norm": 0.8151143362827461, + "language_loss": 0.60201919, + "learning_rate": 3.854923220141551e-06, + "loss": 0.62613821, + "num_input_tokens_seen": 53609675, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.18652344, + "step": 2467, + "time_per_iteration": 3.4398300647735596 + }, + { + "auxiliary_loss_clip": 0.01583321, + "auxiliary_loss_mlp": 0.0104779, + "balance_loss_clip": 1.35543346, + "balance_loss_mlp": 1.016891, + "epoch": 0.14838418758454833, + "flos": 25422300794880.0, + "grad_norm": 2.2672539722125156, + "language_loss": 0.88651955, + "learning_rate": 3.85477755808841e-06, + "loss": 0.91283065, + "num_input_tokens_seen": 53626950, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.30883789, + "step": 2468, + "time_per_iteration": 2.9043235778808594 + }, + { + "auxiliary_loss_clip": 0.01588538, + "auxiliary_loss_mlp": 0.01052995, + "balance_loss_clip": 1.35529876, + "balance_loss_mlp": 1.02152348, + "epoch": 0.1484443108372163, + "flos": 23299055187840.0, + "grad_norm": 2.0019168657756117, + "language_loss": 0.77245438, + "learning_rate": 3.854631825701919e-06, + "loss": 0.79886973, + "num_input_tokens_seen": 53644200, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.31469727, + "step": 2469, + "time_per_iteration": 2.849865674972534 + }, + { + "auxiliary_loss_clip": 0.01573398, + "auxiliary_loss_mlp": 0.01050516, + "balance_loss_clip": 1.34565401, + "balance_loss_mlp": 1.02083337, + "epoch": 0.14850443408988426, + "flos": 14655859447680.0, + "grad_norm": 2.0674842589291527, + "language_loss": 0.77087063, + "learning_rate": 3.854486022987603e-06, + "loss": 0.79710978, + "num_input_tokens_seen": 53659650, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.296875, + "step": 2470, + "time_per_iteration": 2.8888399600982666 + }, + { + "auxiliary_loss_clip": 0.01569185, + "auxiliary_loss_mlp": 0.01046759, + "balance_loss_clip": 1.34607434, + "balance_loss_mlp": 1.01779079, + "epoch": 0.14856455734255222, + "flos": 23558317747200.0, + "grad_norm": 1.732708116492121, + "language_loss": 0.73290253, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.75906193, + "num_input_tokens_seen": 53680275, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.28955078, + "step": 2471, + "time_per_iteration": 2.8913724422454834 + }, + { + "auxiliary_loss_clip": 0.01592351, + "auxiliary_loss_mlp": 0.01053073, + "balance_loss_clip": 1.35794723, + "balance_loss_mlp": 1.02129173, + "epoch": 0.1486246805952202, + "flos": 18086349110400.0, + "grad_norm": 1.7085807737250474, + "language_loss": 0.90655851, + "learning_rate": 3.854194206597615e-06, + "loss": 0.93301278, + "num_input_tokens_seen": 53698270, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.31811523, + "step": 2472, + "time_per_iteration": 2.8207309246063232 + }, + { + "auxiliary_loss_clip": 0.01575005, + "auxiliary_loss_mlp": 0.01049289, + "balance_loss_clip": 1.34886146, + "balance_loss_mlp": 1.01901042, + "epoch": 0.14868480384788818, + "flos": 19362844684800.0, + "grad_norm": 2.4397001065258848, + "language_loss": 0.82335877, + "learning_rate": 3.854048192933008e-06, + "loss": 0.84960175, + "num_input_tokens_seen": 53716845, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.30322266, + "step": 2473, + "time_per_iteration": 2.7941675186157227 + }, + { + "auxiliary_loss_clip": 0.01584387, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_clip": 1.35510969, + "balance_loss_mlp": 1.02383351, + "epoch": 0.14874492710055615, + "flos": 22210326109440.0, + "grad_norm": 2.1202060645004175, + "language_loss": 0.79939437, + "learning_rate": 3.853902108962709e-06, + "loss": 0.82578009, + "num_input_tokens_seen": 53734970, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.30322266, + "step": 2474, + "time_per_iteration": 2.8389248847961426 + }, + { + "auxiliary_loss_clip": 0.01593018, + "auxiliary_loss_mlp": 0.01054813, + "balance_loss_clip": 1.35854149, + "balance_loss_mlp": 1.02429569, + "epoch": 0.1488050503532241, + "flos": 21112729050240.0, + "grad_norm": 1.7330099272939044, + "language_loss": 0.83658731, + "learning_rate": 3.853755954692255e-06, + "loss": 0.8630656, + "num_input_tokens_seen": 53753415, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.30541992, + "step": 2475, + "time_per_iteration": 2.8134829998016357 + }, + { + "auxiliary_loss_clip": 0.01579512, + "auxiliary_loss_mlp": 0.01056311, + "balance_loss_clip": 1.35233831, + "balance_loss_mlp": 1.02488708, + "epoch": 0.14886517360589208, + "flos": 12794183884800.0, + "grad_norm": 1.726402244021351, + "language_loss": 0.81743729, + "learning_rate": 3.85360973012719e-06, + "loss": 0.84379548, + "num_input_tokens_seen": 53770305, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.31420898, + "step": 2476, + "time_per_iteration": 2.816865921020508 + }, + { + "auxiliary_loss_clip": 0.01562594, + "auxiliary_loss_mlp": 0.01049872, + "balance_loss_clip": 1.34267426, + "balance_loss_mlp": 1.01890159, + "epoch": 0.14892529685856004, + "flos": 29034720374400.0, + "grad_norm": 2.134126377427737, + "language_loss": 0.7875638, + "learning_rate": 3.853463435273058e-06, + "loss": 0.81368846, + "num_input_tokens_seen": 53788895, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.30957031, + "step": 2477, + "time_per_iteration": 3.0155012607574463 + }, + { + "auxiliary_loss_clip": 0.01366724, + "auxiliary_loss_mlp": 0.01040245, + "balance_loss_clip": 1.22768402, + "balance_loss_mlp": 1.02002716, + "epoch": 0.148985420111228, + "flos": 61954198634880.0, + "grad_norm": 0.8136519674463651, + "language_loss": 0.60190082, + "learning_rate": 3.853317070135407e-06, + "loss": 0.62597048, + "num_input_tokens_seen": 53850260, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.20214844, + "step": 2478, + "time_per_iteration": 3.433900833129883 + }, + { + "auxiliary_loss_clip": 0.01578561, + "auxiliary_loss_mlp": 0.01055321, + "balance_loss_clip": 1.35040665, + "balance_loss_mlp": 1.02644849, + "epoch": 0.149045543363896, + "flos": 23925706675200.0, + "grad_norm": 3.3462954398109206, + "language_loss": 0.71683741, + "learning_rate": 3.853170634719787e-06, + "loss": 0.74317622, + "num_input_tokens_seen": 53867520, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.2890625, + "step": 2479, + "time_per_iteration": 2.8847897052764893 + }, + { + "auxiliary_loss_clip": 0.01586486, + "auxiliary_loss_mlp": 0.01055305, + "balance_loss_clip": 1.3568666, + "balance_loss_mlp": 1.02590847, + "epoch": 0.14910566661656396, + "flos": 23663548448640.0, + "grad_norm": 1.5766715478820439, + "language_loss": 0.81642568, + "learning_rate": 3.853024129031751e-06, + "loss": 0.84284353, + "num_input_tokens_seen": 53886620, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.29394531, + "step": 2480, + "time_per_iteration": 2.863640785217285 + }, + { + "auxiliary_loss_clip": 0.01608447, + "auxiliary_loss_mlp": 0.01059295, + "balance_loss_clip": 1.37329602, + "balance_loss_mlp": 1.03051805, + "epoch": 0.14916578986923193, + "flos": 20523296050560.0, + "grad_norm": 2.4328650103556186, + "language_loss": 0.84873092, + "learning_rate": 3.852877553076854e-06, + "loss": 0.87540829, + "num_input_tokens_seen": 53902230, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.2878418, + "step": 2481, + "time_per_iteration": 2.8146698474884033 + }, + { + "auxiliary_loss_clip": 0.0158862, + "auxiliary_loss_mlp": 0.01060844, + "balance_loss_clip": 1.35699892, + "balance_loss_mlp": 1.02925384, + "epoch": 0.1492259131218999, + "flos": 22501965962880.0, + "grad_norm": 1.913244575723662, + "language_loss": 0.78515172, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.8116464, + "num_input_tokens_seen": 53919475, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.31591797, + "step": 2482, + "time_per_iteration": 2.8864920139312744 + }, + { + "auxiliary_loss_clip": 0.01614341, + "auxiliary_loss_mlp": 0.01062443, + "balance_loss_clip": 1.37569475, + "balance_loss_mlp": 1.03323638, + "epoch": 0.14928603637456786, + "flos": 23196403440000.0, + "grad_norm": 2.1115634098474523, + "language_loss": 0.81500131, + "learning_rate": 3.852584190388713e-06, + "loss": 0.8417691, + "num_input_tokens_seen": 53939150, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.29223633, + "step": 2483, + "time_per_iteration": 2.8445591926574707 + }, + { + "auxiliary_loss_clip": 0.0159343, + "auxiliary_loss_mlp": 0.01062421, + "balance_loss_clip": 1.3684566, + "balance_loss_mlp": 1.03361988, + "epoch": 0.14934615962723582, + "flos": 21663314749440.0, + "grad_norm": 1.6152145951689327, + "language_loss": 0.71160209, + "learning_rate": 3.852437403666595e-06, + "loss": 0.73816061, + "num_input_tokens_seen": 53958735, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.28808594, + "step": 2484, + "time_per_iteration": 2.895587682723999 + }, + { + "auxiliary_loss_clip": 0.01599585, + "auxiliary_loss_mlp": 0.01066277, + "balance_loss_clip": 1.36653781, + "balance_loss_mlp": 1.03604579, + "epoch": 0.1494062828799038, + "flos": 27019827360000.0, + "grad_norm": 1.8859334464284532, + "language_loss": 0.85205758, + "learning_rate": 3.852290546699863e-06, + "loss": 0.87871617, + "num_input_tokens_seen": 53975065, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.30249023, + "step": 2485, + "time_per_iteration": 2.8532397747039795 + }, + { + "auxiliary_loss_clip": 0.01601726, + "auxiliary_loss_mlp": 0.01066782, + "balance_loss_clip": 1.36763835, + "balance_loss_mlp": 1.03837395, + "epoch": 0.14946640613257178, + "flos": 21224791716480.0, + "grad_norm": 1.9856483252174038, + "language_loss": 0.86183703, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.88852215, + "num_input_tokens_seen": 53993330, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.2845459, + "step": 2486, + "time_per_iteration": 2.85714054107666 + }, + { + "auxiliary_loss_clip": 0.01586767, + "auxiliary_loss_mlp": 0.01059346, + "balance_loss_clip": 1.36299908, + "balance_loss_mlp": 1.03172541, + "epoch": 0.14952652938523975, + "flos": 13378775690880.0, + "grad_norm": 2.157492212902992, + "language_loss": 0.75486183, + "learning_rate": 3.851996622054842e-06, + "loss": 0.78132296, + "num_input_tokens_seen": 54010515, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.27648926, + "step": 2487, + "time_per_iteration": 2.8233225345611572 + }, + { + "auxiliary_loss_clip": 0.01599413, + "auxiliary_loss_mlp": 0.01064572, + "balance_loss_clip": 1.36829543, + "balance_loss_mlp": 1.03519833, + "epoch": 0.1495866526379077, + "flos": 35531342173440.0, + "grad_norm": 1.854263091786796, + "language_loss": 0.72683024, + "learning_rate": 3.8518495543877e-06, + "loss": 0.75347006, + "num_input_tokens_seen": 54031315, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.29345703, + "step": 2488, + "time_per_iteration": 2.973961114883423 + }, + { + "auxiliary_loss_clip": 0.01621543, + "auxiliary_loss_mlp": 0.01065134, + "balance_loss_clip": 1.38320971, + "balance_loss_mlp": 1.03676164, + "epoch": 0.14964677589057568, + "flos": 17639953482240.0, + "grad_norm": 3.3625234778704334, + "language_loss": 0.71787453, + "learning_rate": 3.851702416498235e-06, + "loss": 0.74474126, + "num_input_tokens_seen": 54045965, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.28369141, + "step": 2489, + "time_per_iteration": 2.8675944805145264 + }, + { + "auxiliary_loss_clip": 0.01618391, + "auxiliary_loss_mlp": 0.01076761, + "balance_loss_clip": 1.38224351, + "balance_loss_mlp": 1.04733992, + "epoch": 0.14970689914324364, + "flos": 20192763651840.0, + "grad_norm": 4.7721797399625885, + "language_loss": 0.83096647, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.85791796, + "num_input_tokens_seen": 54059960, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.29443359, + "step": 2490, + "time_per_iteration": 4.32551646232605 + }, + { + "auxiliary_loss_clip": 0.01628426, + "auxiliary_loss_mlp": 0.01079905, + "balance_loss_clip": 1.39350033, + "balance_loss_mlp": 1.05264211, + "epoch": 0.1497670223959116, + "flos": 37241791056000.0, + "grad_norm": 2.438645370294875, + "language_loss": 0.81092119, + "learning_rate": 3.851407930074666e-06, + "loss": 0.83800453, + "num_input_tokens_seen": 54079330, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.27282715, + "step": 2491, + "time_per_iteration": 2.9879095554351807 + }, + { + "auxiliary_loss_clip": 0.01613618, + "auxiliary_loss_mlp": 0.01070407, + "balance_loss_clip": 1.37878942, + "balance_loss_mlp": 1.04101038, + "epoch": 0.1498271456485796, + "flos": 24464890684800.0, + "grad_norm": 1.6562272764995425, + "language_loss": 0.91951412, + "learning_rate": 3.851260581551727e-06, + "loss": 0.94635439, + "num_input_tokens_seen": 54097555, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.29370117, + "step": 2492, + "time_per_iteration": 2.9076340198516846 + }, + { + "auxiliary_loss_clip": 0.01612428, + "auxiliary_loss_mlp": 0.01071034, + "balance_loss_clip": 1.38002348, + "balance_loss_mlp": 1.04211402, + "epoch": 0.14988726890124757, + "flos": 16262344483200.0, + "grad_norm": 2.279238099326968, + "language_loss": 0.82309246, + "learning_rate": 3.851113162828802e-06, + "loss": 0.84992707, + "num_input_tokens_seen": 54115600, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.28930664, + "step": 2493, + "time_per_iteration": 2.7941198348999023 + }, + { + "auxiliary_loss_clip": 0.01622126, + "auxiliary_loss_mlp": 0.0106894, + "balance_loss_clip": 1.38497853, + "balance_loss_mlp": 1.03947115, + "epoch": 0.14994739215391553, + "flos": 20675970564480.0, + "grad_norm": 1.9662286675992946, + "language_loss": 0.81213319, + "learning_rate": 3.85096567391148e-06, + "loss": 0.8390438, + "num_input_tokens_seen": 54135220, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.29492188, + "step": 2494, + "time_per_iteration": 2.8706235885620117 + }, + { + "auxiliary_loss_clip": 0.01603992, + "auxiliary_loss_mlp": 0.01064057, + "balance_loss_clip": 1.37315524, + "balance_loss_mlp": 1.03506505, + "epoch": 0.1500075154065835, + "flos": 70674844769280.0, + "grad_norm": 1.799707891145228, + "language_loss": 0.67484945, + "learning_rate": 3.850818114805354e-06, + "loss": 0.70152998, + "num_input_tokens_seen": 54161065, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.2902832, + "step": 2495, + "time_per_iteration": 3.234606981277466 + }, + { + "auxiliary_loss_clip": 0.01373944, + "auxiliary_loss_mlp": 0.01064887, + "balance_loss_clip": 1.23830998, + "balance_loss_mlp": 1.04361999, + "epoch": 0.15006763865925146, + "flos": 68039697767040.0, + "grad_norm": 0.8938968667786334, + "language_loss": 0.59531236, + "learning_rate": 3.850670485516019e-06, + "loss": 0.61970061, + "num_input_tokens_seen": 54225095, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.21289062, + "step": 2496, + "time_per_iteration": 3.397343635559082 + }, + { + "auxiliary_loss_clip": 0.01622583, + "auxiliary_loss_mlp": 0.01054606, + "balance_loss_clip": 1.38503027, + "balance_loss_mlp": 1.02632964, + "epoch": 0.15012776191191943, + "flos": 18925498016640.0, + "grad_norm": 1.8480056741254496, + "language_loss": 0.66835529, + "learning_rate": 3.850522786049075e-06, + "loss": 0.69512719, + "num_input_tokens_seen": 54243750, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.28295898, + "step": 2497, + "time_per_iteration": 4.300506591796875 + }, + { + "auxiliary_loss_clip": 0.016113, + "auxiliary_loss_mlp": 0.01058123, + "balance_loss_clip": 1.3771522, + "balance_loss_mlp": 1.03068137, + "epoch": 0.1501878851645874, + "flos": 23713209256320.0, + "grad_norm": 1.5262253072980674, + "language_loss": 0.76015729, + "learning_rate": 3.850375016410121e-06, + "loss": 0.78685153, + "num_input_tokens_seen": 54266185, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.2746582, + "step": 2498, + "time_per_iteration": 4.32857871055603 + }, + { + "auxiliary_loss_clip": 0.01624837, + "auxiliary_loss_mlp": 0.01055719, + "balance_loss_clip": 1.38760042, + "balance_loss_mlp": 1.02462912, + "epoch": 0.15024800841725539, + "flos": 20422227870720.0, + "grad_norm": 2.1325572635186716, + "language_loss": 0.72917664, + "learning_rate": 3.850227176604761e-06, + "loss": 0.75598216, + "num_input_tokens_seen": 54283940, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.3112793, + "step": 2499, + "time_per_iteration": 4.273524522781372 + }, + { + "auxiliary_loss_clip": 0.01617872, + "auxiliary_loss_mlp": 0.01052227, + "balance_loss_clip": 1.38276005, + "balance_loss_mlp": 1.0227828, + "epoch": 0.15030813166992335, + "flos": 31843264008960.0, + "grad_norm": 1.8672666170380017, + "language_loss": 0.72872341, + "learning_rate": 3.850079266638601e-06, + "loss": 0.75542444, + "num_input_tokens_seen": 54304830, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.29443359, + "step": 2500, + "time_per_iteration": 2.925649642944336 + }, + { + "auxiliary_loss_clip": 0.01604748, + "auxiliary_loss_mlp": 0.01052024, + "balance_loss_clip": 1.37215281, + "balance_loss_mlp": 1.0232712, + "epoch": 0.15036825492259132, + "flos": 35669990799360.0, + "grad_norm": 1.972318449261161, + "language_loss": 0.66499323, + "learning_rate": 3.849931286517249e-06, + "loss": 0.69156098, + "num_input_tokens_seen": 54325595, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.28735352, + "step": 2501, + "time_per_iteration": 2.989948034286499 + }, + { + "auxiliary_loss_clip": 0.0160788, + "auxiliary_loss_mlp": 0.01052206, + "balance_loss_clip": 1.37502563, + "balance_loss_mlp": 1.02266574, + "epoch": 0.15042837817525928, + "flos": 18846038868480.0, + "grad_norm": 2.7061487107907083, + "language_loss": 0.84778643, + "learning_rate": 3.849783236246318e-06, + "loss": 0.87438732, + "num_input_tokens_seen": 54342180, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.29589844, + "step": 2502, + "time_per_iteration": 2.8654980659484863 + }, + { + "auxiliary_loss_clip": 0.01605688, + "auxiliary_loss_mlp": 0.01056408, + "balance_loss_clip": 1.37317264, + "balance_loss_mlp": 1.02741647, + "epoch": 0.15048850142792725, + "flos": 19544684112000.0, + "grad_norm": 1.8374711822035608, + "language_loss": 0.77874076, + "learning_rate": 3.849635115831421e-06, + "loss": 0.80536175, + "num_input_tokens_seen": 54360255, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.28979492, + "step": 2503, + "time_per_iteration": 2.840341091156006 + }, + { + "auxiliary_loss_clip": 0.01605196, + "auxiliary_loss_mlp": 0.01050208, + "balance_loss_clip": 1.37503326, + "balance_loss_mlp": 1.02171719, + "epoch": 0.1505486246805952, + "flos": 22027581786240.0, + "grad_norm": 1.7615489857848237, + "language_loss": 0.86261505, + "learning_rate": 3.849486925278176e-06, + "loss": 0.88916916, + "num_input_tokens_seen": 54378260, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.2845459, + "step": 2504, + "time_per_iteration": 2.8831355571746826 + }, + { + "auxiliary_loss_clip": 0.01593642, + "auxiliary_loss_mlp": 0.01052581, + "balance_loss_clip": 1.36484826, + "balance_loss_mlp": 1.02373278, + "epoch": 0.15060874793326318, + "flos": 20753529431040.0, + "grad_norm": 1.5854309510794777, + "language_loss": 0.83310044, + "learning_rate": 3.8493386645922e-06, + "loss": 0.8595627, + "num_input_tokens_seen": 54399745, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.28857422, + "step": 2505, + "time_per_iteration": 2.8599815368652344 + }, + { + "auxiliary_loss_clip": 0.01602427, + "auxiliary_loss_mlp": 0.01049483, + "balance_loss_clip": 1.37026608, + "balance_loss_mlp": 1.01980019, + "epoch": 0.15066887118593117, + "flos": 16480542746880.0, + "grad_norm": 2.2385796362695713, + "language_loss": 0.77354944, + "learning_rate": 3.849190333779117e-06, + "loss": 0.8000685, + "num_input_tokens_seen": 54417105, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.29663086, + "step": 2506, + "time_per_iteration": 2.868196487426758 + }, + { + "auxiliary_loss_clip": 0.0162217, + "auxiliary_loss_mlp": 0.01052109, + "balance_loss_clip": 1.38238823, + "balance_loss_mlp": 1.02234256, + "epoch": 0.15072899443859913, + "flos": 19867343915520.0, + "grad_norm": 2.973015422606169, + "language_loss": 0.78165102, + "learning_rate": 3.849041932844552e-06, + "loss": 0.80839384, + "num_input_tokens_seen": 54433920, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.29797363, + "step": 2507, + "time_per_iteration": 2.8609442710876465 + }, + { + "auxiliary_loss_clip": 0.01601825, + "auxiliary_loss_mlp": 0.01058746, + "balance_loss_clip": 1.37402892, + "balance_loss_mlp": 1.0287528, + "epoch": 0.1507891176912671, + "flos": 20785725745920.0, + "grad_norm": 2.051840978766195, + "language_loss": 0.70352858, + "learning_rate": 3.848893461794131e-06, + "loss": 0.73013425, + "num_input_tokens_seen": 54451540, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.30029297, + "step": 2508, + "time_per_iteration": 2.8914198875427246 + }, + { + "auxiliary_loss_clip": 0.01625115, + "auxiliary_loss_mlp": 0.01054986, + "balance_loss_clip": 1.38872039, + "balance_loss_mlp": 1.0262686, + "epoch": 0.15084924094393506, + "flos": 23597119802880.0, + "grad_norm": 2.267611538489173, + "language_loss": 0.78644097, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.81324196, + "num_input_tokens_seen": 54470800, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.28723145, + "step": 2509, + "time_per_iteration": 2.9159038066864014 + }, + { + "auxiliary_loss_clip": 0.01631631, + "auxiliary_loss_mlp": 0.01058989, + "balance_loss_clip": 1.38951218, + "balance_loss_mlp": 1.02792335, + "epoch": 0.15090936419660303, + "flos": 18919254234240.0, + "grad_norm": 3.982505650285804, + "language_loss": 0.83239502, + "learning_rate": 3.848596309368246e-06, + "loss": 0.85930121, + "num_input_tokens_seen": 54486525, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.3104248, + "step": 2510, + "time_per_iteration": 2.8010637760162354 + }, + { + "auxiliary_loss_clip": 0.01610491, + "auxiliary_loss_mlp": 0.01051064, + "balance_loss_clip": 1.37485719, + "balance_loss_mlp": 1.01990235, + "epoch": 0.150969487449271, + "flos": 17936434529280.0, + "grad_norm": 2.0322452607826156, + "language_loss": 0.7523759, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.77899146, + "num_input_tokens_seen": 54503795, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.31152344, + "step": 2511, + "time_per_iteration": 2.8725454807281494 + }, + { + "auxiliary_loss_clip": 0.01599533, + "auxiliary_loss_mlp": 0.01051421, + "balance_loss_clip": 1.36880207, + "balance_loss_mlp": 1.02259648, + "epoch": 0.151029610701939, + "flos": 24253660120320.0, + "grad_norm": 2.1215117536320194, + "language_loss": 0.70300722, + "learning_rate": 3.848298876546534e-06, + "loss": 0.72951674, + "num_input_tokens_seen": 54523025, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.28845215, + "step": 2512, + "time_per_iteration": 2.8383121490478516 + }, + { + "auxiliary_loss_clip": 0.01598163, + "auxiliary_loss_mlp": 0.01053415, + "balance_loss_clip": 1.36750031, + "balance_loss_mlp": 1.02559137, + "epoch": 0.15108973395460695, + "flos": 30274449909120.0, + "grad_norm": 2.3107800456872636, + "language_loss": 0.75363833, + "learning_rate": 3.84815005500134e-06, + "loss": 0.78015411, + "num_input_tokens_seen": 54545025, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.27819824, + "step": 2513, + "time_per_iteration": 3.0004611015319824 + }, + { + "auxiliary_loss_clip": 0.0135552, + "auxiliary_loss_mlp": 0.01021172, + "balance_loss_clip": 1.22169447, + "balance_loss_mlp": 1.0036248, + "epoch": 0.15114985720727492, + "flos": 60467241657600.0, + "grad_norm": 0.875193166726578, + "language_loss": 0.6494174, + "learning_rate": 3.84800116337411e-06, + "loss": 0.67318434, + "num_input_tokens_seen": 54604545, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.17578125, + "step": 2514, + "time_per_iteration": 3.2983970642089844 + }, + { + "auxiliary_loss_clip": 0.01597282, + "auxiliary_loss_mlp": 0.0104902, + "balance_loss_clip": 1.3673358, + "balance_loss_mlp": 1.02008843, + "epoch": 0.15120998045994288, + "flos": 20531349624960.0, + "grad_norm": 2.4441521147422716, + "language_loss": 0.74837184, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.77483481, + "num_input_tokens_seen": 54620590, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.28930664, + "step": 2515, + "time_per_iteration": 2.829244613647461 + }, + { + "auxiliary_loss_clip": 0.01585766, + "auxiliary_loss_mlp": 0.01044615, + "balance_loss_clip": 1.35671401, + "balance_loss_mlp": 1.01710105, + "epoch": 0.15127010371261085, + "flos": 21188840083200.0, + "grad_norm": 2.140563342594692, + "language_loss": 0.78517151, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.81147528, + "num_input_tokens_seen": 54640410, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.27514648, + "step": 2516, + "time_per_iteration": 2.871809959411621 + }, + { + "auxiliary_loss_clip": 0.01357589, + "auxiliary_loss_mlp": 0.01023917, + "balance_loss_clip": 1.21955538, + "balance_loss_mlp": 1.00417638, + "epoch": 0.1513302269652788, + "flos": 65349958273920.0, + "grad_norm": 0.7381256305676883, + "language_loss": 0.54631591, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.570131, + "num_input_tokens_seen": 54701430, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.19726562, + "step": 2517, + "time_per_iteration": 3.304560661315918 + }, + { + "auxiliary_loss_clip": 0.01594815, + "auxiliary_loss_mlp": 0.01051012, + "balance_loss_clip": 1.3648839, + "balance_loss_mlp": 1.0225215, + "epoch": 0.15139035021794678, + "flos": 19145279848320.0, + "grad_norm": 2.8061316718824196, + "language_loss": 0.79389095, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.82034922, + "num_input_tokens_seen": 54720845, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.28491211, + "step": 2518, + "time_per_iteration": 2.8398139476776123 + }, + { + "auxiliary_loss_clip": 0.01618596, + "auxiliary_loss_mlp": 0.01059716, + "balance_loss_clip": 1.38198149, + "balance_loss_mlp": 1.031165, + "epoch": 0.15145047347061477, + "flos": 26589855594240.0, + "grad_norm": 1.9058183080191238, + "language_loss": 0.71580529, + "learning_rate": 3.847255654205137e-06, + "loss": 0.74258846, + "num_input_tokens_seen": 54740495, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.28552246, + "step": 2519, + "time_per_iteration": 2.8662071228027344 + }, + { + "auxiliary_loss_clip": 0.01603511, + "auxiliary_loss_mlp": 0.0104319, + "balance_loss_clip": 1.3705864, + "balance_loss_mlp": 1.01554501, + "epoch": 0.15151059672328274, + "flos": 20312608423680.0, + "grad_norm": 1.9077953421611842, + "language_loss": 0.80221963, + "learning_rate": 3.847106342204354e-06, + "loss": 0.82868671, + "num_input_tokens_seen": 54758415, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.27624512, + "step": 2520, + "time_per_iteration": 2.944742441177368 + }, + { + "auxiliary_loss_clip": 0.01609853, + "auxiliary_loss_mlp": 0.01052924, + "balance_loss_clip": 1.373945, + "balance_loss_mlp": 1.02457631, + "epoch": 0.1515707199759507, + "flos": 27238025623680.0, + "grad_norm": 1.7239883373332952, + "language_loss": 0.76870579, + "learning_rate": 3.846956960161114e-06, + "loss": 0.79533356, + "num_input_tokens_seen": 54779355, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.28308105, + "step": 2521, + "time_per_iteration": 2.9181275367736816 + }, + { + "auxiliary_loss_clip": 0.01603707, + "auxiliary_loss_mlp": 0.01062517, + "balance_loss_clip": 1.36643648, + "balance_loss_mlp": 1.03414559, + "epoch": 0.15163084322861867, + "flos": 23598024698880.0, + "grad_norm": 2.0711503737590036, + "language_loss": 0.83304751, + "learning_rate": 3.84680750808108e-06, + "loss": 0.85970974, + "num_input_tokens_seen": 54799465, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.28393555, + "step": 2522, + "time_per_iteration": 2.8675577640533447 + }, + { + "auxiliary_loss_clip": 0.0134799, + "auxiliary_loss_mlp": 0.01036755, + "balance_loss_clip": 1.21232629, + "balance_loss_mlp": 1.01100564, + "epoch": 0.15169096648128663, + "flos": 66919360556160.0, + "grad_norm": 0.828044025361081, + "language_loss": 0.57985806, + "learning_rate": 3.846657985969922e-06, + "loss": 0.60370553, + "num_input_tokens_seen": 54857665, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.2578125, + "step": 2523, + "time_per_iteration": 3.4418258666992188 + }, + { + "auxiliary_loss_clip": 0.01591048, + "auxiliary_loss_mlp": 0.01054671, + "balance_loss_clip": 1.36251783, + "balance_loss_mlp": 1.02680027, + "epoch": 0.1517510897339546, + "flos": 29107211823360.0, + "grad_norm": 7.081408362405522, + "language_loss": 0.75517678, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.78163397, + "num_input_tokens_seen": 54879895, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.27880859, + "step": 2524, + "time_per_iteration": 2.9421255588531494 + }, + { + "auxiliary_loss_clip": 0.01612422, + "auxiliary_loss_mlp": 0.01059374, + "balance_loss_clip": 1.37821436, + "balance_loss_mlp": 1.03201509, + "epoch": 0.1518112129866226, + "flos": 18415931368320.0, + "grad_norm": 1.9758242374627144, + "language_loss": 0.7601856, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.78690362, + "num_input_tokens_seen": 54898245, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.27392578, + "step": 2525, + "time_per_iteration": 2.871349334716797 + }, + { + "auxiliary_loss_clip": 0.0161856, + "auxiliary_loss_mlp": 0.01062267, + "balance_loss_clip": 1.38014829, + "balance_loss_mlp": 1.03260744, + "epoch": 0.15187133623929056, + "flos": 19434340748160.0, + "grad_norm": 1.6956979987806005, + "language_loss": 0.81109738, + "learning_rate": 3.846208999506402e-06, + "loss": 0.83790565, + "num_input_tokens_seen": 54917060, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.29663086, + "step": 2526, + "time_per_iteration": 4.267542839050293 + }, + { + "auxiliary_loss_clip": 0.0160476, + "auxiliary_loss_mlp": 0.01051937, + "balance_loss_clip": 1.37509894, + "balance_loss_mlp": 1.02516246, + "epoch": 0.15193145949195852, + "flos": 17575063159680.0, + "grad_norm": 1.5872837807902482, + "language_loss": 0.85716796, + "learning_rate": 3.846059197327466e-06, + "loss": 0.88373482, + "num_input_tokens_seen": 54936365, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.26782227, + "step": 2527, + "time_per_iteration": 2.932478427886963 + }, + { + "auxiliary_loss_clip": 0.01615607, + "auxiliary_loss_mlp": 0.0105715, + "balance_loss_clip": 1.38213837, + "balance_loss_mlp": 1.0288496, + "epoch": 0.15199158274462649, + "flos": 36190551934080.0, + "grad_norm": 1.893735875856623, + "language_loss": 0.70337021, + "learning_rate": 3.845909325145779e-06, + "loss": 0.73009777, + "num_input_tokens_seen": 54961365, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.28320312, + "step": 2528, + "time_per_iteration": 2.9962005615234375 + }, + { + "auxiliary_loss_clip": 0.01600693, + "auxiliary_loss_mlp": 0.01055812, + "balance_loss_clip": 1.37083828, + "balance_loss_mlp": 1.0283457, + "epoch": 0.15205170599729445, + "flos": 23083933570560.0, + "grad_norm": 1.7599441703041991, + "language_loss": 0.88067943, + "learning_rate": 3.845759382967026e-06, + "loss": 0.90724444, + "num_input_tokens_seen": 54980750, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.27429199, + "step": 2529, + "time_per_iteration": 2.877779960632324 + }, + { + "auxiliary_loss_clip": 0.01598379, + "auxiliary_loss_mlp": 0.01051888, + "balance_loss_clip": 1.36963797, + "balance_loss_mlp": 1.02368271, + "epoch": 0.15211182924996242, + "flos": 21918460032000.0, + "grad_norm": 1.900061628596223, + "language_loss": 0.84238911, + "learning_rate": 3.845609370796893e-06, + "loss": 0.86889184, + "num_input_tokens_seen": 54999675, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.28222656, + "step": 2530, + "time_per_iteration": 2.8227410316467285 + }, + { + "auxiliary_loss_clip": 0.01608967, + "auxiliary_loss_mlp": 0.01059353, + "balance_loss_clip": 1.37574029, + "balance_loss_mlp": 1.03012323, + "epoch": 0.15217195250263038, + "flos": 13889518704000.0, + "grad_norm": 1.9667472561004713, + "language_loss": 0.81538713, + "learning_rate": 3.845459288641066e-06, + "loss": 0.84207034, + "num_input_tokens_seen": 55018295, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.29248047, + "step": 2531, + "time_per_iteration": 2.8331246376037598 + }, + { + "auxiliary_loss_clip": 0.01602951, + "auxiliary_loss_mlp": 0.0105516, + "balance_loss_clip": 1.37220788, + "balance_loss_mlp": 1.02752745, + "epoch": 0.15223207575529837, + "flos": 24546069135360.0, + "grad_norm": 1.6903550646707037, + "language_loss": 0.79937828, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.82595944, + "num_input_tokens_seen": 55037975, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.27624512, + "step": 2532, + "time_per_iteration": 4.296976804733276 + }, + { + "auxiliary_loss_clip": 0.01607795, + "auxiliary_loss_mlp": 0.01054614, + "balance_loss_clip": 1.37674105, + "balance_loss_mlp": 1.0247159, + "epoch": 0.15229219900796634, + "flos": 25567781385600.0, + "grad_norm": 1.801418016022388, + "language_loss": 0.88761526, + "learning_rate": 3.845158914395105e-06, + "loss": 0.91423935, + "num_input_tokens_seen": 55057135, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.29907227, + "step": 2533, + "time_per_iteration": 4.333013534545898 + }, + { + "auxiliary_loss_clip": 0.01601476, + "auxiliary_loss_mlp": 0.01051406, + "balance_loss_clip": 1.3673563, + "balance_loss_mlp": 1.02081656, + "epoch": 0.1523523222606343, + "flos": 18225495429120.0, + "grad_norm": 2.279287925521899, + "language_loss": 0.80412591, + "learning_rate": 3.84500862231636e-06, + "loss": 0.83065474, + "num_input_tokens_seen": 55075525, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.30541992, + "step": 2534, + "time_per_iteration": 4.271279335021973 + }, + { + "auxiliary_loss_clip": 0.0161644, + "auxiliary_loss_mlp": 0.01054225, + "balance_loss_clip": 1.37415671, + "balance_loss_mlp": 1.02430367, + "epoch": 0.15241244551330227, + "flos": 13267617920640.0, + "grad_norm": 2.4348671456241715, + "language_loss": 0.78204334, + "learning_rate": 3.844858260274702e-06, + "loss": 0.80875003, + "num_input_tokens_seen": 55090845, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.29907227, + "step": 2535, + "time_per_iteration": 2.8210387229919434 + }, + { + "auxiliary_loss_clip": 0.01621277, + "auxiliary_loss_mlp": 0.01054582, + "balance_loss_clip": 1.38071632, + "balance_loss_mlp": 1.02399278, + "epoch": 0.15247256876597023, + "flos": 19724125564800.0, + "grad_norm": 2.158950239940822, + "language_loss": 0.78930646, + "learning_rate": 3.844707828275835e-06, + "loss": 0.81606501, + "num_input_tokens_seen": 55108750, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.3059082, + "step": 2536, + "time_per_iteration": 2.8771963119506836 + }, + { + "auxiliary_loss_clip": 0.01601775, + "auxiliary_loss_mlp": 0.01044813, + "balance_loss_clip": 1.3705709, + "balance_loss_mlp": 1.01629794, + "epoch": 0.1525326920186382, + "flos": 20385507075840.0, + "grad_norm": 2.631227871317481, + "language_loss": 0.76710749, + "learning_rate": 3.844557326325461e-06, + "loss": 0.79357326, + "num_input_tokens_seen": 55126750, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.28503418, + "step": 2537, + "time_per_iteration": 2.831979751586914 + }, + { + "auxiliary_loss_clip": 0.01610423, + "auxiliary_loss_mlp": 0.01058816, + "balance_loss_clip": 1.37476563, + "balance_loss_mlp": 1.02839398, + "epoch": 0.15259281527130616, + "flos": 13597969340160.0, + "grad_norm": 3.3427726505628166, + "language_loss": 0.78977305, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.81646544, + "num_input_tokens_seen": 55144690, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.30395508, + "step": 2538, + "time_per_iteration": 2.8561840057373047 + }, + { + "auxiliary_loss_clip": 0.01606132, + "auxiliary_loss_mlp": 0.01050029, + "balance_loss_clip": 1.37579846, + "balance_loss_mlp": 1.02053642, + "epoch": 0.15265293852397416, + "flos": 22871571886080.0, + "grad_norm": 2.0223592652886597, + "language_loss": 0.90815651, + "learning_rate": 3.844256112593029e-06, + "loss": 0.93471807, + "num_input_tokens_seen": 55166055, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.29492188, + "step": 2539, + "time_per_iteration": 2.8530733585357666 + }, + { + "auxiliary_loss_clip": 0.01620063, + "auxiliary_loss_mlp": 0.01051825, + "balance_loss_clip": 1.38335359, + "balance_loss_mlp": 1.02321482, + "epoch": 0.15271306177664212, + "flos": 29249072830080.0, + "grad_norm": 1.7946378020313043, + "language_loss": 0.94076133, + "learning_rate": 3.844105400822391e-06, + "loss": 0.96748018, + "num_input_tokens_seen": 55186285, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.28613281, + "step": 2540, + "time_per_iteration": 2.9425933361053467 + }, + { + "auxiliary_loss_clip": 0.01613366, + "auxiliary_loss_mlp": 0.0105178, + "balance_loss_clip": 1.3796804, + "balance_loss_mlp": 1.02357519, + "epoch": 0.1527731850293101, + "flos": 31258129265280.0, + "grad_norm": 1.636194491730063, + "language_loss": 0.76019597, + "learning_rate": 3.843954619123092e-06, + "loss": 0.78684747, + "num_input_tokens_seen": 55207915, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.2824707, + "step": 2541, + "time_per_iteration": 2.909701347351074 + }, + { + "auxiliary_loss_clip": 0.01601418, + "auxiliary_loss_mlp": 0.01060243, + "balance_loss_clip": 1.36797404, + "balance_loss_mlp": 1.02972567, + "epoch": 0.15283330828197805, + "flos": 22392120291840.0, + "grad_norm": 1.589992988885785, + "language_loss": 0.8223393, + "learning_rate": 3.84380376750085e-06, + "loss": 0.84895593, + "num_input_tokens_seen": 55227860, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.30493164, + "step": 2542, + "time_per_iteration": 2.8625752925872803 + }, + { + "auxiliary_loss_clip": 0.01622406, + "auxiliary_loss_mlp": 0.01054275, + "balance_loss_clip": 1.38269484, + "balance_loss_mlp": 1.02437687, + "epoch": 0.15289343153464602, + "flos": 25531060590720.0, + "grad_norm": 3.143938363340901, + "language_loss": 0.79537344, + "learning_rate": 3.843652845961383e-06, + "loss": 0.82214022, + "num_input_tokens_seen": 55247330, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.29882812, + "step": 2543, + "time_per_iteration": 2.9173190593719482 + }, + { + "auxiliary_loss_clip": 0.01612847, + "auxiliary_loss_mlp": 0.01049985, + "balance_loss_clip": 1.3794055, + "balance_loss_mlp": 1.02175641, + "epoch": 0.15295355478731398, + "flos": 22720028492160.0, + "grad_norm": 1.9130884976258637, + "language_loss": 0.87528157, + "learning_rate": 3.843501854510416e-06, + "loss": 0.90190995, + "num_input_tokens_seen": 55266195, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.2824707, + "step": 2544, + "time_per_iteration": 2.8403937816619873 + }, + { + "auxiliary_loss_clip": 0.01641615, + "auxiliary_loss_mlp": 0.0106493, + "balance_loss_clip": 1.39800692, + "balance_loss_mlp": 1.03419816, + "epoch": 0.15301367803998198, + "flos": 23260660335360.0, + "grad_norm": 2.3069622516942787, + "language_loss": 0.83658034, + "learning_rate": 3.843350793153673e-06, + "loss": 0.86364579, + "num_input_tokens_seen": 55283305, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.30712891, + "step": 2545, + "time_per_iteration": 2.8957438468933105 + }, + { + "auxiliary_loss_clip": 0.01625337, + "auxiliary_loss_mlp": 0.01070759, + "balance_loss_clip": 1.38932848, + "balance_loss_mlp": 1.04064715, + "epoch": 0.15307380129264994, + "flos": 25897635112320.0, + "grad_norm": 2.043272240600342, + "language_loss": 0.71978742, + "learning_rate": 3.843199661896884e-06, + "loss": 0.74674839, + "num_input_tokens_seen": 55303035, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.30090332, + "step": 2546, + "time_per_iteration": 2.917083263397217 + }, + { + "auxiliary_loss_clip": 0.01618081, + "auxiliary_loss_mlp": 0.01060333, + "balance_loss_clip": 1.38059497, + "balance_loss_mlp": 1.03177083, + "epoch": 0.1531339245453179, + "flos": 46989596799360.0, + "grad_norm": 1.9186969118974935, + "language_loss": 0.79109573, + "learning_rate": 3.843048460745779e-06, + "loss": 0.81787992, + "num_input_tokens_seen": 55327570, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.28515625, + "step": 2547, + "time_per_iteration": 3.1293866634368896 + }, + { + "auxiliary_loss_clip": 0.0162461, + "auxiliary_loss_mlp": 0.01068745, + "balance_loss_clip": 1.38408995, + "balance_loss_mlp": 1.03748846, + "epoch": 0.15319404779798587, + "flos": 35895654455040.0, + "grad_norm": 1.98241495911055, + "language_loss": 0.75145614, + "learning_rate": 3.842897189706092e-06, + "loss": 0.77838975, + "num_input_tokens_seen": 55351090, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.3125, + "step": 2548, + "time_per_iteration": 3.0342581272125244 + }, + { + "auxiliary_loss_clip": 0.01606617, + "auxiliary_loss_mlp": 0.01059911, + "balance_loss_clip": 1.37318587, + "balance_loss_mlp": 1.02979875, + "epoch": 0.15325417105065384, + "flos": 25675229082240.0, + "grad_norm": 5.4083055913706595, + "language_loss": 0.81362462, + "learning_rate": 3.842745848783558e-06, + "loss": 0.84028983, + "num_input_tokens_seen": 55371050, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.30078125, + "step": 2549, + "time_per_iteration": 2.9085745811462402 + }, + { + "auxiliary_loss_clip": 0.01630905, + "auxiliary_loss_mlp": 0.01053578, + "balance_loss_clip": 1.39250612, + "balance_loss_mlp": 1.02453828, + "epoch": 0.1533142943033218, + "flos": 18780153160320.0, + "grad_norm": 2.158907651393464, + "language_loss": 0.76123041, + "learning_rate": 3.842594437983917e-06, + "loss": 0.78807521, + "num_input_tokens_seen": 55390375, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.29003906, + "step": 2550, + "time_per_iteration": 2.8572804927825928 + }, + { + "auxiliary_loss_clip": 0.0163423, + "auxiliary_loss_mlp": 0.0104573, + "balance_loss_clip": 1.39037204, + "balance_loss_mlp": 1.01542664, + "epoch": 0.15337441755598977, + "flos": 23116446599040.0, + "grad_norm": 2.274522318010894, + "language_loss": 0.78222334, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.8090229, + "num_input_tokens_seen": 55408890, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.30273438, + "step": 2551, + "time_per_iteration": 2.8460793495178223 + }, + { + "auxiliary_loss_clip": 0.01363331, + "auxiliary_loss_mlp": 0.0103843, + "balance_loss_clip": 1.22314072, + "balance_loss_mlp": 1.0220269, + "epoch": 0.15343454080865776, + "flos": 59892422728320.0, + "grad_norm": 0.9713160147500561, + "language_loss": 0.56856275, + "learning_rate": 3.842291406776283e-06, + "loss": 0.59258044, + "num_input_tokens_seen": 55463815, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.1640625, + "step": 2552, + "time_per_iteration": 3.309238910675049 + }, + { + "auxiliary_loss_clip": 0.01640009, + "auxiliary_loss_mlp": 0.0105274, + "balance_loss_clip": 1.398772, + "balance_loss_mlp": 1.02129292, + "epoch": 0.15349466406132573, + "flos": 11918268938880.0, + "grad_norm": 1.8297669066218012, + "language_loss": 0.90286374, + "learning_rate": 3.84213978637978e-06, + "loss": 0.92979121, + "num_input_tokens_seen": 55481050, + "router_z_loss_clip": 2.4140625, + "router_z_loss_mlp": 0.31445312, + "step": 2553, + "time_per_iteration": 2.8197031021118164 + }, + { + "auxiliary_loss_clip": 0.01658465, + "auxiliary_loss_mlp": 0.01056023, + "balance_loss_clip": 1.41397941, + "balance_loss_mlp": 1.02438462, + "epoch": 0.1535547873139937, + "flos": 24107410368000.0, + "grad_norm": 2.566335381084273, + "language_loss": 0.79279679, + "learning_rate": 3.841988096129152e-06, + "loss": 0.81994164, + "num_input_tokens_seen": 55500050, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.31665039, + "step": 2554, + "time_per_iteration": 2.896726131439209 + }, + { + "auxiliary_loss_clip": 0.01654047, + "auxiliary_loss_mlp": 0.01056247, + "balance_loss_clip": 1.4095118, + "balance_loss_mlp": 1.02437091, + "epoch": 0.15361491056666166, + "flos": 17575379873280.0, + "grad_norm": 2.2512903173466223, + "language_loss": 0.79565585, + "learning_rate": 3.841836336030151e-06, + "loss": 0.82275879, + "num_input_tokens_seen": 55518125, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.31860352, + "step": 2555, + "time_per_iteration": 2.859384298324585 + }, + { + "auxiliary_loss_clip": 0.01631185, + "auxiliary_loss_mlp": 0.01049292, + "balance_loss_clip": 1.39451671, + "balance_loss_mlp": 1.01953769, + "epoch": 0.15367503381932962, + "flos": 25056902638080.0, + "grad_norm": 1.4847337663701996, + "language_loss": 0.7829572, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.80976188, + "num_input_tokens_seen": 55540960, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.29760742, + "step": 2556, + "time_per_iteration": 2.873593330383301 + }, + { + "auxiliary_loss_clip": 0.01632628, + "auxiliary_loss_mlp": 0.01045278, + "balance_loss_clip": 1.39608479, + "balance_loss_mlp": 1.01392555, + "epoch": 0.15373515707199759, + "flos": 21517381710720.0, + "grad_norm": 1.8732318244536432, + "language_loss": 0.9135133, + "learning_rate": 3.84153260631005e-06, + "loss": 0.94029236, + "num_input_tokens_seen": 55559210, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.31347656, + "step": 2557, + "time_per_iteration": 2.8639166355133057 + }, + { + "auxiliary_loss_clip": 0.01648201, + "auxiliary_loss_mlp": 0.01050964, + "balance_loss_clip": 1.40629637, + "balance_loss_mlp": 1.01982677, + "epoch": 0.15379528032466555, + "flos": 26005851970560.0, + "grad_norm": 1.9350034792994426, + "language_loss": 0.71392351, + "learning_rate": 3.841380636700468e-06, + "loss": 0.74091518, + "num_input_tokens_seen": 55578925, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.31152344, + "step": 2558, + "time_per_iteration": 2.8799209594726562 + }, + { + "auxiliary_loss_clip": 0.01649946, + "auxiliary_loss_mlp": 0.0104901, + "balance_loss_clip": 1.40728509, + "balance_loss_mlp": 1.01782441, + "epoch": 0.15385540357733354, + "flos": 19285647776640.0, + "grad_norm": 2.1118031545197593, + "language_loss": 0.92297351, + "learning_rate": 3.841228597265548e-06, + "loss": 0.94996303, + "num_input_tokens_seen": 55597255, + "router_z_loss_clip": 2.42578125, + "router_z_loss_mlp": 0.31201172, + "step": 2559, + "time_per_iteration": 2.8617568016052246 + }, + { + "auxiliary_loss_clip": 0.01659146, + "auxiliary_loss_mlp": 0.01056209, + "balance_loss_clip": 1.41513181, + "balance_loss_mlp": 1.02387977, + "epoch": 0.1539155268300015, + "flos": 28561014869760.0, + "grad_norm": 3.3760310849836648, + "language_loss": 0.6536662, + "learning_rate": 3.841076488011055e-06, + "loss": 0.68081981, + "num_input_tokens_seen": 55619515, + "router_z_loss_clip": 2.43945312, + "router_z_loss_mlp": 0.32299805, + "step": 2560, + "time_per_iteration": 4.383769512176514 + }, + { + "auxiliary_loss_clip": 0.01664383, + "auxiliary_loss_mlp": 0.01055329, + "balance_loss_clip": 1.4165231, + "balance_loss_mlp": 1.02185559, + "epoch": 0.15397565008266947, + "flos": 23557774809600.0, + "grad_norm": 1.5144029758520932, + "language_loss": 0.88866031, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.91585737, + "num_input_tokens_seen": 55640050, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.33496094, + "step": 2561, + "time_per_iteration": 2.88041615486145 + }, + { + "auxiliary_loss_clip": 0.01631185, + "auxiliary_loss_mlp": 0.01046782, + "balance_loss_clip": 1.39907479, + "balance_loss_mlp": 1.01671767, + "epoch": 0.15403577333533744, + "flos": 17138938101120.0, + "grad_norm": 1.7497227769065473, + "language_loss": 0.84260285, + "learning_rate": 3.840772060066425e-06, + "loss": 0.8693825, + "num_input_tokens_seen": 55658695, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.30102539, + "step": 2562, + "time_per_iteration": 2.9252164363861084 + }, + { + "auxiliary_loss_clip": 0.01667845, + "auxiliary_loss_mlp": 0.01057996, + "balance_loss_clip": 1.41801012, + "balance_loss_mlp": 1.02356863, + "epoch": 0.1540958965880054, + "flos": 17903423808000.0, + "grad_norm": 1.6810025159445527, + "language_loss": 0.75984597, + "learning_rate": 3.840619741387832e-06, + "loss": 0.78710437, + "num_input_tokens_seen": 55676340, + "router_z_loss_clip": 2.50195312, + "router_z_loss_mlp": 0.34423828, + "step": 2563, + "time_per_iteration": 2.8953909873962402 + }, + { + "auxiliary_loss_clip": 0.01675115, + "auxiliary_loss_mlp": 0.01047234, + "balance_loss_clip": 1.42406607, + "balance_loss_mlp": 1.01404631, + "epoch": 0.15415601984067337, + "flos": 32173841652480.0, + "grad_norm": 2.7384725173102673, + "language_loss": 0.77780342, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.80502689, + "num_input_tokens_seen": 55698890, + "router_z_loss_clip": 2.51171875, + "router_z_loss_mlp": 0.33203125, + "step": 2564, + "time_per_iteration": 2.9364709854125977 + }, + { + "auxiliary_loss_clip": 0.01646751, + "auxiliary_loss_mlp": 0.01056328, + "balance_loss_clip": 1.40505695, + "balance_loss_mlp": 1.02557158, + "epoch": 0.15421614309334136, + "flos": 24035009408640.0, + "grad_norm": 1.8395371730323034, + "language_loss": 0.71759623, + "learning_rate": 3.840314894646969e-06, + "loss": 0.744627, + "num_input_tokens_seen": 55718535, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.30737305, + "step": 2565, + "time_per_iteration": 2.9317879676818848 + }, + { + "auxiliary_loss_clip": 0.0164565, + "auxiliary_loss_mlp": 0.01053375, + "balance_loss_clip": 1.40488219, + "balance_loss_mlp": 1.02157009, + "epoch": 0.15427626634600933, + "flos": 24396199799040.0, + "grad_norm": 2.1125994759249704, + "language_loss": 0.73441565, + "learning_rate": 3.840162366596259e-06, + "loss": 0.76140594, + "num_input_tokens_seen": 55738970, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.31787109, + "step": 2566, + "time_per_iteration": 2.8782708644866943 + }, + { + "auxiliary_loss_clip": 0.0163274, + "auxiliary_loss_mlp": 0.01051025, + "balance_loss_clip": 1.39658511, + "balance_loss_mlp": 1.01969671, + "epoch": 0.1543363895986773, + "flos": 23342019765120.0, + "grad_norm": 1.7208219148838477, + "language_loss": 0.86316633, + "learning_rate": 3.840009768766408e-06, + "loss": 0.89000404, + "num_input_tokens_seen": 55759585, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.31347656, + "step": 2567, + "time_per_iteration": 4.276357173919678 + }, + { + "auxiliary_loss_clip": 0.01637526, + "auxiliary_loss_mlp": 0.01056177, + "balance_loss_clip": 1.40028715, + "balance_loss_mlp": 1.02556372, + "epoch": 0.15439651285134526, + "flos": 24283503705600.0, + "grad_norm": 3.011891565577649, + "language_loss": 0.79423457, + "learning_rate": 3.839857101163202e-06, + "loss": 0.82117158, + "num_input_tokens_seen": 55779250, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.3059082, + "step": 2568, + "time_per_iteration": 2.8969149589538574 + }, + { + "auxiliary_loss_clip": 0.01648715, + "auxiliary_loss_mlp": 0.01049655, + "balance_loss_clip": 1.4097954, + "balance_loss_mlp": 1.01875579, + "epoch": 0.15445663610401322, + "flos": 22466512022400.0, + "grad_norm": 1.760446726466699, + "language_loss": 0.70999748, + "learning_rate": 3.83970436379243e-06, + "loss": 0.73698115, + "num_input_tokens_seen": 55800470, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.30883789, + "step": 2569, + "time_per_iteration": 5.690165281295776 + }, + { + "auxiliary_loss_clip": 0.0164007, + "auxiliary_loss_mlp": 0.01049718, + "balance_loss_clip": 1.40339041, + "balance_loss_mlp": 1.01951027, + "epoch": 0.1545167593566812, + "flos": 22058375512320.0, + "grad_norm": 1.7697668783169271, + "language_loss": 0.77817261, + "learning_rate": 3.839551556659884e-06, + "loss": 0.80507052, + "num_input_tokens_seen": 55817795, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.30175781, + "step": 2570, + "time_per_iteration": 2.8607263565063477 + }, + { + "auxiliary_loss_clip": 0.01642218, + "auxiliary_loss_mlp": 0.01048776, + "balance_loss_clip": 1.40605521, + "balance_loss_mlp": 1.01894951, + "epoch": 0.15457688260934915, + "flos": 19327752702720.0, + "grad_norm": 2.4000825507321344, + "language_loss": 0.7923736, + "learning_rate": 3.839398679771359e-06, + "loss": 0.81928355, + "num_input_tokens_seen": 55836125, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.29833984, + "step": 2571, + "time_per_iteration": 2.845846176147461 + }, + { + "auxiliary_loss_clip": 0.01645298, + "auxiliary_loss_mlp": 0.0105296, + "balance_loss_clip": 1.40611112, + "balance_loss_mlp": 1.0237056, + "epoch": 0.15463700586201715, + "flos": 24144628855680.0, + "grad_norm": 1.8250865490177208, + "language_loss": 0.83798134, + "learning_rate": 3.839245733132652e-06, + "loss": 0.86496389, + "num_input_tokens_seen": 55855280, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.29272461, + "step": 2572, + "time_per_iteration": 2.9069573879241943 + }, + { + "auxiliary_loss_clip": 0.01646523, + "auxiliary_loss_mlp": 0.01054361, + "balance_loss_clip": 1.4043386, + "balance_loss_mlp": 1.02446294, + "epoch": 0.1546971291146851, + "flos": 22431601019520.0, + "grad_norm": 1.8138126767720473, + "language_loss": 0.91531181, + "learning_rate": 3.839092716749563e-06, + "loss": 0.94232064, + "num_input_tokens_seen": 55875695, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.29931641, + "step": 2573, + "time_per_iteration": 2.816009759902954 + }, + { + "auxiliary_loss_clip": 0.0163679, + "auxiliary_loss_mlp": 0.01049755, + "balance_loss_clip": 1.39792073, + "balance_loss_mlp": 1.01918936, + "epoch": 0.15475725236735308, + "flos": 17539202016000.0, + "grad_norm": 2.8787973916440524, + "language_loss": 0.71869111, + "learning_rate": 3.838939630627893e-06, + "loss": 0.74555653, + "num_input_tokens_seen": 55894575, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.30566406, + "step": 2574, + "time_per_iteration": 2.8743228912353516 + }, + { + "auxiliary_loss_clip": 0.01648084, + "auxiliary_loss_mlp": 0.01052061, + "balance_loss_clip": 1.40736151, + "balance_loss_mlp": 1.02137685, + "epoch": 0.15481737562002104, + "flos": 22571109296640.0, + "grad_norm": 1.7093769091606277, + "language_loss": 0.8355512, + "learning_rate": 3.838786474773448e-06, + "loss": 0.86255264, + "num_input_tokens_seen": 55912855, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.30664062, + "step": 2575, + "time_per_iteration": 2.91850209236145 + }, + { + "auxiliary_loss_clip": 0.01618323, + "auxiliary_loss_mlp": 0.01053939, + "balance_loss_clip": 1.38124096, + "balance_loss_mlp": 1.0240413, + "epoch": 0.154877498872689, + "flos": 24911512536960.0, + "grad_norm": 2.2171883783522297, + "language_loss": 0.85930359, + "learning_rate": 3.838633249192036e-06, + "loss": 0.88602626, + "num_input_tokens_seen": 55932375, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.29833984, + "step": 2576, + "time_per_iteration": 2.93196177482605 + }, + { + "auxiliary_loss_clip": 0.01621224, + "auxiliary_loss_mlp": 0.01050313, + "balance_loss_clip": 1.38507676, + "balance_loss_mlp": 1.02184582, + "epoch": 0.15493762212535697, + "flos": 28159212631680.0, + "grad_norm": 1.5179753885282103, + "language_loss": 0.83058643, + "learning_rate": 3.838479953889465e-06, + "loss": 0.85730177, + "num_input_tokens_seen": 55953970, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.28491211, + "step": 2577, + "time_per_iteration": 2.9156129360198975 + }, + { + "auxiliary_loss_clip": 0.016335, + "auxiliary_loss_mlp": 0.01052816, + "balance_loss_clip": 1.396119, + "balance_loss_mlp": 1.02291846, + "epoch": 0.15499774537802496, + "flos": 25422074570880.0, + "grad_norm": 2.1036529255703185, + "language_loss": 0.78839374, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.81525689, + "num_input_tokens_seen": 55973120, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.29907227, + "step": 2578, + "time_per_iteration": 2.8711061477661133 + }, + { + "auxiliary_loss_clip": 0.0163146, + "auxiliary_loss_mlp": 0.01053, + "balance_loss_clip": 1.38903952, + "balance_loss_mlp": 1.02245867, + "epoch": 0.15505786863069293, + "flos": 22101883027200.0, + "grad_norm": 2.175117819425214, + "language_loss": 0.83617848, + "learning_rate": 3.83817315414411e-06, + "loss": 0.8630231, + "num_input_tokens_seen": 55993260, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.30493164, + "step": 2579, + "time_per_iteration": 2.898716926574707 + }, + { + "auxiliary_loss_clip": 0.01643176, + "auxiliary_loss_mlp": 0.01057406, + "balance_loss_clip": 1.4018507, + "balance_loss_mlp": 1.02781808, + "epoch": 0.1551179918833609, + "flos": 18926583891840.0, + "grad_norm": 1.6545136227095534, + "language_loss": 0.81470209, + "learning_rate": 3.838019649712958e-06, + "loss": 0.84170789, + "num_input_tokens_seen": 56012130, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.29541016, + "step": 2580, + "time_per_iteration": 2.8383419513702393 + }, + { + "auxiliary_loss_clip": 0.01367625, + "auxiliary_loss_mlp": 0.01069783, + "balance_loss_clip": 1.22730815, + "balance_loss_mlp": 1.05214047, + "epoch": 0.15517811513602886, + "flos": 66270376120320.0, + "grad_norm": 0.8451640433253461, + "language_loss": 0.58839059, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.61276466, + "num_input_tokens_seen": 56079045, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.17675781, + "step": 2581, + "time_per_iteration": 3.509321928024292 + }, + { + "auxiliary_loss_clip": 0.01636611, + "auxiliary_loss_mlp": 0.01053532, + "balance_loss_clip": 1.39536929, + "balance_loss_mlp": 1.02384901, + "epoch": 0.15523823838869683, + "flos": 24030801642240.0, + "grad_norm": 1.8538154073485402, + "language_loss": 0.85848027, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.88538164, + "num_input_tokens_seen": 56098745, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.29711914, + "step": 2582, + "time_per_iteration": 2.8957931995391846 + }, + { + "auxiliary_loss_clip": 0.01640808, + "auxiliary_loss_mlp": 0.01060677, + "balance_loss_clip": 1.39889836, + "balance_loss_mlp": 1.03011131, + "epoch": 0.1552983616413648, + "flos": 20494674074880.0, + "grad_norm": 2.1890321762433214, + "language_loss": 0.79397821, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.82099301, + "num_input_tokens_seen": 56117655, + "router_z_loss_clip": 2.41601562, + "router_z_loss_mlp": 0.30541992, + "step": 2583, + "time_per_iteration": 2.997246265411377 + }, + { + "auxiliary_loss_clip": 0.01647022, + "auxiliary_loss_mlp": 0.01051806, + "balance_loss_clip": 1.40578878, + "balance_loss_mlp": 1.02166963, + "epoch": 0.15535848489403276, + "flos": 32136261206400.0, + "grad_norm": 4.0253241972033, + "language_loss": 0.76962703, + "learning_rate": 3.837404935067705e-06, + "loss": 0.7966153, + "num_input_tokens_seen": 56141960, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.30126953, + "step": 2584, + "time_per_iteration": 3.0020172595977783 + }, + { + "auxiliary_loss_clip": 0.01642277, + "auxiliary_loss_mlp": 0.01049477, + "balance_loss_clip": 1.39979959, + "balance_loss_mlp": 1.01943612, + "epoch": 0.15541860814670075, + "flos": 19107654157440.0, + "grad_norm": 1.644858288043672, + "language_loss": 0.76656365, + "learning_rate": 3.837251082205368e-06, + "loss": 0.79348123, + "num_input_tokens_seen": 56161430, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.30053711, + "step": 2585, + "time_per_iteration": 2.8652732372283936 + }, + { + "auxiliary_loss_clip": 0.01631761, + "auxiliary_loss_mlp": 0.0104904, + "balance_loss_clip": 1.39608371, + "balance_loss_mlp": 1.02066827, + "epoch": 0.1554787313993687, + "flos": 19181502950400.0, + "grad_norm": 2.0707799477094913, + "language_loss": 0.62286085, + "learning_rate": 3.837097159674286e-06, + "loss": 0.64966881, + "num_input_tokens_seen": 56179390, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.28344727, + "step": 2586, + "time_per_iteration": 2.8492679595947266 + }, + { + "auxiliary_loss_clip": 0.01636353, + "auxiliary_loss_mlp": 0.01053844, + "balance_loss_clip": 1.39523315, + "balance_loss_mlp": 1.0229454, + "epoch": 0.15553885465203668, + "flos": 16152453567360.0, + "grad_norm": 1.6819079601048255, + "language_loss": 0.82534117, + "learning_rate": 3.836943167480296e-06, + "loss": 0.85224313, + "num_input_tokens_seen": 56198020, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.30908203, + "step": 2587, + "time_per_iteration": 2.8134825229644775 + }, + { + "auxiliary_loss_clip": 0.0165496, + "auxiliary_loss_mlp": 0.01056467, + "balance_loss_clip": 1.40857446, + "balance_loss_mlp": 1.02592576, + "epoch": 0.15559897790470464, + "flos": 25348678225920.0, + "grad_norm": 2.761328846378903, + "language_loss": 0.90401161, + "learning_rate": 3.836789105629236e-06, + "loss": 0.93112588, + "num_input_tokens_seen": 56218165, + "router_z_loss_clip": 2.4609375, + "router_z_loss_mlp": 0.30541992, + "step": 2588, + "time_per_iteration": 2.870312213897705 + }, + { + "auxiliary_loss_clip": 0.01632087, + "auxiliary_loss_mlp": 0.01053354, + "balance_loss_clip": 1.3934195, + "balance_loss_mlp": 1.02357531, + "epoch": 0.1556591011573726, + "flos": 23159139707520.0, + "grad_norm": 2.1257687359530637, + "language_loss": 0.66311598, + "learning_rate": 3.83663497412695e-06, + "loss": 0.68997037, + "num_input_tokens_seen": 56237160, + "router_z_loss_clip": 2.38476562, + "router_z_loss_mlp": 0.29785156, + "step": 2589, + "time_per_iteration": 2.8901350498199463 + }, + { + "auxiliary_loss_clip": 0.01627282, + "auxiliary_loss_mlp": 0.01051066, + "balance_loss_clip": 1.38932514, + "balance_loss_mlp": 1.0210017, + "epoch": 0.15571922441004057, + "flos": 25380919785600.0, + "grad_norm": 1.776521864989682, + "language_loss": 0.84029883, + "learning_rate": 3.836480772979281e-06, + "loss": 0.8670823, + "num_input_tokens_seen": 56257610, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.30053711, + "step": 2590, + "time_per_iteration": 2.872222423553467 + }, + { + "auxiliary_loss_clip": 0.0163915, + "auxiliary_loss_mlp": 0.01056081, + "balance_loss_clip": 1.3979435, + "balance_loss_mlp": 1.02601647, + "epoch": 0.15577934766270854, + "flos": 14509428716160.0, + "grad_norm": 7.249075988758377, + "language_loss": 0.81759346, + "learning_rate": 3.836326502192077e-06, + "loss": 0.84454572, + "num_input_tokens_seen": 56275215, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.30029297, + "step": 2591, + "time_per_iteration": 2.824805498123169 + }, + { + "auxiliary_loss_clip": 0.0162729, + "auxiliary_loss_mlp": 0.01053223, + "balance_loss_clip": 1.39095283, + "balance_loss_mlp": 1.02476799, + "epoch": 0.15583947091537653, + "flos": 37428562166400.0, + "grad_norm": 2.1291681946592553, + "language_loss": 0.66590863, + "learning_rate": 3.836172161771189e-06, + "loss": 0.69271374, + "num_input_tokens_seen": 56297130, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.28479004, + "step": 2592, + "time_per_iteration": 2.9606101512908936 + }, + { + "auxiliary_loss_clip": 0.01659898, + "auxiliary_loss_mlp": 0.01054634, + "balance_loss_clip": 1.41462398, + "balance_loss_mlp": 1.02594066, + "epoch": 0.1558995941680445, + "flos": 21844611239040.0, + "grad_norm": 2.3890943494258385, + "language_loss": 0.83769858, + "learning_rate": 3.836017751722467e-06, + "loss": 0.8648439, + "num_input_tokens_seen": 56314995, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.28674316, + "step": 2593, + "time_per_iteration": 2.856794834136963 + }, + { + "auxiliary_loss_clip": 0.01633834, + "auxiliary_loss_mlp": 0.01053213, + "balance_loss_clip": 1.39889145, + "balance_loss_mlp": 1.02493644, + "epoch": 0.15595971742071246, + "flos": 19802272613760.0, + "grad_norm": 1.9298025127565106, + "language_loss": 0.74484283, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.77171326, + "num_input_tokens_seen": 56334005, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.28295898, + "step": 2594, + "time_per_iteration": 2.8282275199890137 + }, + { + "auxiliary_loss_clip": 0.01609529, + "auxiliary_loss_mlp": 0.01054237, + "balance_loss_clip": 1.37908435, + "balance_loss_mlp": 1.02526891, + "epoch": 0.15601984067338043, + "flos": 26732847720960.0, + "grad_norm": 1.9759041152572763, + "language_loss": 0.82317543, + "learning_rate": 3.835708722764952e-06, + "loss": 0.8498131, + "num_input_tokens_seen": 56353795, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.28955078, + "step": 2595, + "time_per_iteration": 2.907241106033325 + }, + { + "auxiliary_loss_clip": 0.01638377, + "auxiliary_loss_mlp": 0.01056838, + "balance_loss_clip": 1.39898801, + "balance_loss_mlp": 1.027632, + "epoch": 0.1560799639260484, + "flos": 18378441411840.0, + "grad_norm": 1.7251041729597314, + "language_loss": 0.88243955, + "learning_rate": 3.835554103867876e-06, + "loss": 0.9093917, + "num_input_tokens_seen": 56373195, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.29199219, + "step": 2596, + "time_per_iteration": 4.3701560497283936 + }, + { + "auxiliary_loss_clip": 0.01624927, + "auxiliary_loss_mlp": 0.01051928, + "balance_loss_clip": 1.39265728, + "balance_loss_mlp": 1.02424717, + "epoch": 0.15614008717871636, + "flos": 22608554008320.0, + "grad_norm": 1.6739902056060947, + "language_loss": 0.69295347, + "learning_rate": 3.835399415366404e-06, + "loss": 0.71972203, + "num_input_tokens_seen": 56391525, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.27697754, + "step": 2597, + "time_per_iteration": 2.8819804191589355 + }, + { + "auxiliary_loss_clip": 0.01600648, + "auxiliary_loss_mlp": 0.01060049, + "balance_loss_clip": 1.37275147, + "balance_loss_mlp": 1.03115249, + "epoch": 0.15620021043138435, + "flos": 22756975511040.0, + "grad_norm": 1.6257081396852486, + "language_loss": 0.80769765, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.83430469, + "num_input_tokens_seen": 56410715, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.28881836, + "step": 2598, + "time_per_iteration": 2.8694796562194824 + }, + { + "auxiliary_loss_clip": 0.01601019, + "auxiliary_loss_mlp": 0.01053026, + "balance_loss_clip": 1.37291944, + "balance_loss_mlp": 1.02422535, + "epoch": 0.15626033368405232, + "flos": 13122635022720.0, + "grad_norm": 2.004055032550572, + "language_loss": 0.84066558, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.8672061, + "num_input_tokens_seen": 56429170, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.2878418, + "step": 2599, + "time_per_iteration": 2.8526926040649414 + }, + { + "auxiliary_loss_clip": 0.01629077, + "auxiliary_loss_mlp": 0.01052619, + "balance_loss_clip": 1.38870692, + "balance_loss_mlp": 1.02324617, + "epoch": 0.15632045693672028, + "flos": 16481040439680.0, + "grad_norm": 1.9435163368768915, + "language_loss": 0.82507885, + "learning_rate": 3.834934932294287e-06, + "loss": 0.85189581, + "num_input_tokens_seen": 56445685, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.29345703, + "step": 2600, + "time_per_iteration": 2.8128502368927 + }, + { + "auxiliary_loss_clip": 0.0160758, + "auxiliary_loss_mlp": 0.01056959, + "balance_loss_clip": 1.37216425, + "balance_loss_mlp": 1.02689457, + "epoch": 0.15638058018938825, + "flos": 20860162721280.0, + "grad_norm": 3.314469505972626, + "language_loss": 0.88979423, + "learning_rate": 3.834779965433917e-06, + "loss": 0.91643959, + "num_input_tokens_seen": 56465900, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.30004883, + "step": 2601, + "time_per_iteration": 4.260806322097778 + }, + { + "auxiliary_loss_clip": 0.01623239, + "auxiliary_loss_mlp": 0.01069632, + "balance_loss_clip": 1.38507938, + "balance_loss_mlp": 1.03861356, + "epoch": 0.1564407034420562, + "flos": 21882372664320.0, + "grad_norm": 1.6580691003442998, + "language_loss": 0.79554772, + "learning_rate": 3.834624928998508e-06, + "loss": 0.82247645, + "num_input_tokens_seen": 56485020, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.31005859, + "step": 2602, + "time_per_iteration": 2.904654026031494 + }, + { + "auxiliary_loss_clip": 0.01614628, + "auxiliary_loss_mlp": 0.01060934, + "balance_loss_clip": 1.37922549, + "balance_loss_mlp": 1.03089309, + "epoch": 0.15650082669472418, + "flos": 21844385015040.0, + "grad_norm": 1.937439385571446, + "language_loss": 0.75048679, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.77724242, + "num_input_tokens_seen": 56505205, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.30065918, + "step": 2603, + "time_per_iteration": 4.331468105316162 + }, + { + "auxiliary_loss_clip": 0.01614298, + "auxiliary_loss_mlp": 0.01053195, + "balance_loss_clip": 1.38080955, + "balance_loss_mlp": 1.02320218, + "epoch": 0.15656094994739214, + "flos": 13806439971840.0, + "grad_norm": 3.102604255606653, + "language_loss": 0.88840109, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.91507608, + "num_input_tokens_seen": 56521495, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.30029297, + "step": 2604, + "time_per_iteration": 4.3682026863098145 + }, + { + "auxiliary_loss_clip": 0.01622214, + "auxiliary_loss_mlp": 0.0105485, + "balance_loss_clip": 1.38359237, + "balance_loss_mlp": 1.02585793, + "epoch": 0.15662107320006013, + "flos": 27319656522240.0, + "grad_norm": 2.7819181130353225, + "language_loss": 0.85959208, + "learning_rate": 3.834159402300841e-06, + "loss": 0.88636273, + "num_input_tokens_seen": 56540665, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.28979492, + "step": 2605, + "time_per_iteration": 2.950221061706543 + }, + { + "auxiliary_loss_clip": 0.01638175, + "auxiliary_loss_mlp": 0.01052516, + "balance_loss_clip": 1.39072037, + "balance_loss_mlp": 1.02226114, + "epoch": 0.1566811964527281, + "flos": 26695629233280.0, + "grad_norm": 3.287559990298224, + "language_loss": 0.74554956, + "learning_rate": 3.834004087624087e-06, + "loss": 0.77245653, + "num_input_tokens_seen": 56560805, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.30224609, + "step": 2606, + "time_per_iteration": 2.907951593399048 + }, + { + "auxiliary_loss_clip": 0.01620137, + "auxiliary_loss_mlp": 0.01048257, + "balance_loss_clip": 1.38536084, + "balance_loss_mlp": 1.01974177, + "epoch": 0.15674131970539606, + "flos": 16111027313280.0, + "grad_norm": 2.9467967731541864, + "language_loss": 0.77702117, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.80370522, + "num_input_tokens_seen": 56576335, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.28540039, + "step": 2607, + "time_per_iteration": 2.7910096645355225 + }, + { + "auxiliary_loss_clip": 0.01611903, + "auxiliary_loss_mlp": 0.01049577, + "balance_loss_clip": 1.37883162, + "balance_loss_mlp": 1.02056122, + "epoch": 0.15680144295806403, + "flos": 19178471548800.0, + "grad_norm": 1.7571131684443941, + "language_loss": 0.8295809, + "learning_rate": 3.833693249639615e-06, + "loss": 0.85619569, + "num_input_tokens_seen": 56595880, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.29003906, + "step": 2608, + "time_per_iteration": 2.850051164627075 + }, + { + "auxiliary_loss_clip": 0.0162538, + "auxiliary_loss_mlp": 0.01054015, + "balance_loss_clip": 1.385234, + "balance_loss_mlp": 1.02297282, + "epoch": 0.156861566210732, + "flos": 20823125212800.0, + "grad_norm": 2.1172031564850733, + "language_loss": 0.73623216, + "learning_rate": 3.833537726343684e-06, + "loss": 0.76302612, + "num_input_tokens_seen": 56615130, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.31030273, + "step": 2609, + "time_per_iteration": 2.816152334213257 + }, + { + "auxiliary_loss_clip": 0.01619397, + "auxiliary_loss_mlp": 0.01048576, + "balance_loss_clip": 1.38121343, + "balance_loss_mlp": 1.01844001, + "epoch": 0.15692168946339996, + "flos": 20057598875520.0, + "grad_norm": 2.002125679261872, + "language_loss": 0.73359919, + "learning_rate": 3.833382133519818e-06, + "loss": 0.76027894, + "num_input_tokens_seen": 56634005, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.30102539, + "step": 2610, + "time_per_iteration": 2.8395426273345947 + }, + { + "auxiliary_loss_clip": 0.01625978, + "auxiliary_loss_mlp": 0.01052899, + "balance_loss_clip": 1.38267303, + "balance_loss_mlp": 1.02278638, + "epoch": 0.15698181271606793, + "flos": 21408078977280.0, + "grad_norm": 1.69024868178728, + "language_loss": 0.7370851, + "learning_rate": 3.833226471173919e-06, + "loss": 0.76387388, + "num_input_tokens_seen": 56653480, + "router_z_loss_clip": 2.43164062, + "router_z_loss_mlp": 0.30102539, + "step": 2611, + "time_per_iteration": 2.832233428955078 + }, + { + "auxiliary_loss_clip": 0.01611017, + "auxiliary_loss_mlp": 0.01050401, + "balance_loss_clip": 1.37721848, + "balance_loss_mlp": 1.02033663, + "epoch": 0.15704193596873592, + "flos": 20854416631680.0, + "grad_norm": 2.5824780507557676, + "language_loss": 0.71838188, + "learning_rate": 3.833070739311887e-06, + "loss": 0.74499607, + "num_input_tokens_seen": 56672270, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.30078125, + "step": 2612, + "time_per_iteration": 2.8446860313415527 + }, + { + "auxiliary_loss_clip": 0.01617556, + "auxiliary_loss_mlp": 0.01054512, + "balance_loss_clip": 1.38031936, + "balance_loss_mlp": 1.02466202, + "epoch": 0.15710205922140388, + "flos": 21773069930880.0, + "grad_norm": 1.915964718607551, + "language_loss": 0.77198845, + "learning_rate": 3.83291493793963e-06, + "loss": 0.79870909, + "num_input_tokens_seen": 56691510, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.29858398, + "step": 2613, + "time_per_iteration": 2.849599599838257 + }, + { + "auxiliary_loss_clip": 0.01608759, + "auxiliary_loss_mlp": 0.01052729, + "balance_loss_clip": 1.37265515, + "balance_loss_mlp": 1.02223551, + "epoch": 0.15716218247407185, + "flos": 25018145827200.0, + "grad_norm": 1.591344254563311, + "language_loss": 0.67076796, + "learning_rate": 3.832759067063055e-06, + "loss": 0.69738281, + "num_input_tokens_seen": 56712230, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.3046875, + "step": 2614, + "time_per_iteration": 2.8693268299102783 + }, + { + "auxiliary_loss_clip": 0.01620297, + "auxiliary_loss_mlp": 0.01054102, + "balance_loss_clip": 1.38018143, + "balance_loss_mlp": 1.02301192, + "epoch": 0.1572223057267398, + "flos": 20200952960640.0, + "grad_norm": 2.02623135942265, + "language_loss": 0.75767171, + "learning_rate": 3.832603126688072e-06, + "loss": 0.78441572, + "num_input_tokens_seen": 56727490, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.31079102, + "step": 2615, + "time_per_iteration": 2.8613414764404297 + }, + { + "auxiliary_loss_clip": 0.01589078, + "auxiliary_loss_mlp": 0.0105237, + "balance_loss_clip": 1.3614471, + "balance_loss_mlp": 1.02247202, + "epoch": 0.15728242897940778, + "flos": 20969284475520.0, + "grad_norm": 1.8688301357663177, + "language_loss": 0.73472822, + "learning_rate": 3.832447116820594e-06, + "loss": 0.76114267, + "num_input_tokens_seen": 56747385, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.29907227, + "step": 2616, + "time_per_iteration": 2.852351427078247 + }, + { + "auxiliary_loss_clip": 0.01608666, + "auxiliary_loss_mlp": 0.01048719, + "balance_loss_clip": 1.37402534, + "balance_loss_mlp": 1.0199182, + "epoch": 0.15734255223207574, + "flos": 23048343895680.0, + "grad_norm": 3.121672885322192, + "language_loss": 0.73382115, + "learning_rate": 3.832291037466539e-06, + "loss": 0.76039505, + "num_input_tokens_seen": 56768055, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.2878418, + "step": 2617, + "time_per_iteration": 2.9475014209747314 + }, + { + "auxiliary_loss_clip": 0.01600495, + "auxiliary_loss_mlp": 0.01050506, + "balance_loss_clip": 1.36931086, + "balance_loss_mlp": 1.02072811, + "epoch": 0.15740267548474374, + "flos": 20559157194240.0, + "grad_norm": 2.0214347638481853, + "language_loss": 0.75326204, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.77977204, + "num_input_tokens_seen": 56785110, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.29785156, + "step": 2618, + "time_per_iteration": 3.0378968715667725 + }, + { + "auxiliary_loss_clip": 0.01634777, + "auxiliary_loss_mlp": 0.0105651, + "balance_loss_clip": 1.39019561, + "balance_loss_mlp": 1.02563429, + "epoch": 0.1574627987374117, + "flos": 22675208878080.0, + "grad_norm": 2.326808137895127, + "language_loss": 0.79776192, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.82467473, + "num_input_tokens_seen": 56804975, + "router_z_loss_clip": 2.44140625, + "router_z_loss_mlp": 0.30859375, + "step": 2619, + "time_per_iteration": 2.874711036682129 + }, + { + "auxiliary_loss_clip": 0.01599739, + "auxiliary_loss_mlp": 0.01059076, + "balance_loss_clip": 1.36874652, + "balance_loss_mlp": 1.0295831, + "epoch": 0.15752292199007967, + "flos": 16809491577600.0, + "grad_norm": 8.26704663131024, + "language_loss": 0.77854824, + "learning_rate": 3.831822382544101e-06, + "loss": 0.80513638, + "num_input_tokens_seen": 56822470, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.29516602, + "step": 2620, + "time_per_iteration": 2.871803045272827 + }, + { + "auxiliary_loss_clip": 0.01606977, + "auxiliary_loss_mlp": 0.01059856, + "balance_loss_clip": 1.37299037, + "balance_loss_mlp": 1.02809858, + "epoch": 0.15758304524274763, + "flos": 29838189116160.0, + "grad_norm": 1.7140126462016083, + "language_loss": 0.72915179, + "learning_rate": 3.831666025302944e-06, + "loss": 0.75582016, + "num_input_tokens_seen": 56842100, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.31738281, + "step": 2621, + "time_per_iteration": 2.9047112464904785 + }, + { + "auxiliary_loss_clip": 0.01617432, + "auxiliary_loss_mlp": 0.01052878, + "balance_loss_clip": 1.37913704, + "balance_loss_mlp": 1.02307546, + "epoch": 0.1576431684954156, + "flos": 53595566576640.0, + "grad_norm": 2.34860319127087, + "language_loss": 0.73683274, + "learning_rate": 3.831509598604828e-06, + "loss": 0.76353586, + "num_input_tokens_seen": 56865920, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.29785156, + "step": 2622, + "time_per_iteration": 3.155513048171997 + }, + { + "auxiliary_loss_clip": 0.01597266, + "auxiliary_loss_mlp": 0.01052297, + "balance_loss_clip": 1.36777747, + "balance_loss_mlp": 1.02285194, + "epoch": 0.15770329174808356, + "flos": 20823351436800.0, + "grad_norm": 1.5996322987874487, + "language_loss": 0.8877691, + "learning_rate": 3.831353102455684e-06, + "loss": 0.91426474, + "num_input_tokens_seen": 56885265, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.29418945, + "step": 2623, + "time_per_iteration": 2.8539695739746094 + }, + { + "auxiliary_loss_clip": 0.01597133, + "auxiliary_loss_mlp": 0.01072343, + "balance_loss_clip": 1.3657403, + "balance_loss_mlp": 1.04115748, + "epoch": 0.15776341500075153, + "flos": 24985089861120.0, + "grad_norm": 1.647105977284795, + "language_loss": 0.82549006, + "learning_rate": 3.831196536861448e-06, + "loss": 0.85218489, + "num_input_tokens_seen": 56906710, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.31176758, + "step": 2624, + "time_per_iteration": 2.8981943130493164 + }, + { + "auxiliary_loss_clip": 0.0160729, + "auxiliary_loss_mlp": 0.01065559, + "balance_loss_clip": 1.36984444, + "balance_loss_mlp": 1.03535187, + "epoch": 0.15782353825341952, + "flos": 21917871849600.0, + "grad_norm": 2.064116908777265, + "language_loss": 0.81132948, + "learning_rate": 3.831039901828054e-06, + "loss": 0.83805799, + "num_input_tokens_seen": 56924275, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.30175781, + "step": 2625, + "time_per_iteration": 2.8564672470092773 + }, + { + "auxiliary_loss_clip": 0.0160038, + "auxiliary_loss_mlp": 0.01058964, + "balance_loss_clip": 1.36514795, + "balance_loss_mlp": 1.02885211, + "epoch": 0.15788366150608749, + "flos": 26188370069760.0, + "grad_norm": 2.108462591272416, + "language_loss": 0.81901252, + "learning_rate": 3.830883197361445e-06, + "loss": 0.84560597, + "num_input_tokens_seen": 56941525, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.30078125, + "step": 2626, + "time_per_iteration": 2.9134159088134766 + }, + { + "auxiliary_loss_clip": 0.01612493, + "auxiliary_loss_mlp": 0.01056829, + "balance_loss_clip": 1.37884748, + "balance_loss_mlp": 1.0263114, + "epoch": 0.15794378475875545, + "flos": 27721458760320.0, + "grad_norm": 1.5806516177776577, + "language_loss": 0.75221908, + "learning_rate": 3.830726423467561e-06, + "loss": 0.77891237, + "num_input_tokens_seen": 56962145, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.30517578, + "step": 2627, + "time_per_iteration": 2.9704458713531494 + }, + { + "auxiliary_loss_clip": 0.01580814, + "auxiliary_loss_mlp": 0.01063142, + "balance_loss_clip": 1.35104084, + "balance_loss_mlp": 1.03286278, + "epoch": 0.15800390801142342, + "flos": 12137688812160.0, + "grad_norm": 2.0847956288084206, + "language_loss": 0.86668396, + "learning_rate": 3.830569580152348e-06, + "loss": 0.89312351, + "num_input_tokens_seen": 56977505, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.30249023, + "step": 2628, + "time_per_iteration": 2.7923219203948975 + }, + { + "auxiliary_loss_clip": 0.01588342, + "auxiliary_loss_mlp": 0.01061353, + "balance_loss_clip": 1.35676146, + "balance_loss_mlp": 1.03038216, + "epoch": 0.15806403126409138, + "flos": 20714591640960.0, + "grad_norm": 1.6580929155312911, + "language_loss": 0.77814448, + "learning_rate": 3.830412667421752e-06, + "loss": 0.80464137, + "num_input_tokens_seen": 56996770, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.31005859, + "step": 2629, + "time_per_iteration": 2.8830883502960205 + }, + { + "auxiliary_loss_clip": 0.01604155, + "auxiliary_loss_mlp": 0.01062098, + "balance_loss_clip": 1.36911631, + "balance_loss_mlp": 1.0314852, + "epoch": 0.15812415451675935, + "flos": 17830660890240.0, + "grad_norm": 2.004518686400378, + "language_loss": 0.74859357, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.77525616, + "num_input_tokens_seen": 57014970, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.30615234, + "step": 2630, + "time_per_iteration": 2.8465445041656494 + }, + { + "auxiliary_loss_clip": 0.01616332, + "auxiliary_loss_mlp": 0.01057651, + "balance_loss_clip": 1.3761946, + "balance_loss_mlp": 1.0271337, + "epoch": 0.15818427776942734, + "flos": 20093640998400.0, + "grad_norm": 2.740947650616946, + "language_loss": 0.85272038, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.87946022, + "num_input_tokens_seen": 57034045, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.30517578, + "step": 2631, + "time_per_iteration": 4.258861303329468 + }, + { + "auxiliary_loss_clip": 0.01591371, + "auxiliary_loss_mlp": 0.01052354, + "balance_loss_clip": 1.3582803, + "balance_loss_mlp": 1.02102613, + "epoch": 0.1582444010220953, + "flos": 21224746471680.0, + "grad_norm": 1.5943112649802726, + "language_loss": 0.80922747, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.83566475, + "num_input_tokens_seen": 57053695, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.31298828, + "step": 2632, + "time_per_iteration": 2.8729360103607178 + }, + { + "auxiliary_loss_clip": 0.01609232, + "auxiliary_loss_mlp": 0.01057632, + "balance_loss_clip": 1.37338853, + "balance_loss_mlp": 1.02620888, + "epoch": 0.15830452427476327, + "flos": 17867336440320.0, + "grad_norm": 2.179945172219195, + "language_loss": 0.84350991, + "learning_rate": 3.829784322464594e-06, + "loss": 0.87017858, + "num_input_tokens_seen": 57071290, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.31396484, + "step": 2633, + "time_per_iteration": 2.8289215564727783 + }, + { + "auxiliary_loss_clip": 0.01613383, + "auxiliary_loss_mlp": 0.01058635, + "balance_loss_clip": 1.37582099, + "balance_loss_mlp": 1.02890444, + "epoch": 0.15836464752743123, + "flos": 24545616687360.0, + "grad_norm": 4.769344172342034, + "language_loss": 0.7843591, + "learning_rate": 3.829627062746394e-06, + "loss": 0.81107932, + "num_input_tokens_seen": 57091465, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.29711914, + "step": 2634, + "time_per_iteration": 2.882767915725708 + }, + { + "auxiliary_loss_clip": 0.01601833, + "auxiliary_loss_mlp": 0.0106642, + "balance_loss_clip": 1.36312985, + "balance_loss_mlp": 1.03416193, + "epoch": 0.1584247707800992, + "flos": 20130497527680.0, + "grad_norm": 1.806932623852216, + "language_loss": 0.906142, + "learning_rate": 3.829469733648552e-06, + "loss": 0.93282455, + "num_input_tokens_seen": 57110075, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.32250977, + "step": 2635, + "time_per_iteration": 2.864513874053955 + }, + { + "auxiliary_loss_clip": 0.01626635, + "auxiliary_loss_mlp": 0.01057267, + "balance_loss_clip": 1.38550043, + "balance_loss_mlp": 1.02601039, + "epoch": 0.15848489403276717, + "flos": 20385642810240.0, + "grad_norm": 2.3002295264599764, + "language_loss": 0.77474111, + "learning_rate": 3.829312335177034e-06, + "loss": 0.80158013, + "num_input_tokens_seen": 57128945, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.3125, + "step": 2636, + "time_per_iteration": 4.223943710327148 + }, + { + "auxiliary_loss_clip": 0.01604533, + "auxiliary_loss_mlp": 0.01056552, + "balance_loss_clip": 1.36422682, + "balance_loss_mlp": 1.02407885, + "epoch": 0.15854501728543513, + "flos": 39361055120640.0, + "grad_norm": 2.4787139436363574, + "language_loss": 0.73554635, + "learning_rate": 3.82915486733781e-06, + "loss": 0.7621572, + "num_input_tokens_seen": 57152385, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.32495117, + "step": 2637, + "time_per_iteration": 3.032208204269409 + }, + { + "auxiliary_loss_clip": 0.01609362, + "auxiliary_loss_mlp": 0.01053684, + "balance_loss_clip": 1.3735292, + "balance_loss_mlp": 1.02361941, + "epoch": 0.15860514053810312, + "flos": 24875198945280.0, + "grad_norm": 2.0738940886622803, + "language_loss": 0.79394639, + "learning_rate": 3.82899733013685e-06, + "loss": 0.82057691, + "num_input_tokens_seen": 57172620, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.30053711, + "step": 2638, + "time_per_iteration": 4.290754556655884 + }, + { + "auxiliary_loss_clip": 0.0161002, + "auxiliary_loss_mlp": 0.0105545, + "balance_loss_clip": 1.37200952, + "balance_loss_mlp": 1.02455139, + "epoch": 0.1586652637907711, + "flos": 26189003496960.0, + "grad_norm": 1.698112253020446, + "language_loss": 0.76742268, + "learning_rate": 3.828839723580128e-06, + "loss": 0.7940774, + "num_input_tokens_seen": 57194680, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.30908203, + "step": 2639, + "time_per_iteration": 4.3289453983306885 + }, + { + "auxiliary_loss_clip": 0.01612791, + "auxiliary_loss_mlp": 0.01059362, + "balance_loss_clip": 1.37601399, + "balance_loss_mlp": 1.02738976, + "epoch": 0.15872538704343905, + "flos": 19801548696960.0, + "grad_norm": 1.7634147481983948, + "language_loss": 0.81772196, + "learning_rate": 3.82868204767362e-06, + "loss": 0.8444435, + "num_input_tokens_seen": 57214675, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.31958008, + "step": 2640, + "time_per_iteration": 2.850390672683716 + }, + { + "auxiliary_loss_clip": 0.01590862, + "auxiliary_loss_mlp": 0.01054014, + "balance_loss_clip": 1.36016381, + "balance_loss_mlp": 1.02247167, + "epoch": 0.15878551029610702, + "flos": 28487030342400.0, + "grad_norm": 3.074691249356448, + "language_loss": 0.6780107, + "learning_rate": 3.828524302423306e-06, + "loss": 0.70445943, + "num_input_tokens_seen": 57235830, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.31542969, + "step": 2641, + "time_per_iteration": 2.9436044692993164 + }, + { + "auxiliary_loss_clip": 0.01635535, + "auxiliary_loss_mlp": 0.01058979, + "balance_loss_clip": 1.38949549, + "balance_loss_mlp": 1.02738881, + "epoch": 0.15884563354877498, + "flos": 24217301283840.0, + "grad_norm": 2.214073619051438, + "language_loss": 0.76988006, + "learning_rate": 3.828366487835167e-06, + "loss": 0.79682511, + "num_input_tokens_seen": 57255970, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.31567383, + "step": 2642, + "time_per_iteration": 2.8773348331451416 + }, + { + "auxiliary_loss_clip": 0.01601123, + "auxiliary_loss_mlp": 0.01053487, + "balance_loss_clip": 1.36640704, + "balance_loss_mlp": 1.02325523, + "epoch": 0.15890575680144295, + "flos": 23959939006080.0, + "grad_norm": 2.235749034698769, + "language_loss": 0.71417439, + "learning_rate": 3.828208603915186e-06, + "loss": 0.74072051, + "num_input_tokens_seen": 57274435, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.30249023, + "step": 2643, + "time_per_iteration": 2.9342520236968994 + }, + { + "auxiliary_loss_clip": 0.0160646, + "auxiliary_loss_mlp": 0.01055976, + "balance_loss_clip": 1.37334967, + "balance_loss_mlp": 1.02481484, + "epoch": 0.15896588005411091, + "flos": 21224882206080.0, + "grad_norm": 2.008854224017572, + "language_loss": 0.79803884, + "learning_rate": 3.828050650669353e-06, + "loss": 0.82466316, + "num_input_tokens_seen": 57293115, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.3112793, + "step": 2644, + "time_per_iteration": 2.8819541931152344 + }, + { + "auxiliary_loss_clip": 0.01611601, + "auxiliary_loss_mlp": 0.01050496, + "balance_loss_clip": 1.37814069, + "balance_loss_mlp": 1.01971591, + "epoch": 0.1590260033067789, + "flos": 24362962853760.0, + "grad_norm": 3.531467892318732, + "language_loss": 0.83227396, + "learning_rate": 3.827892628103657e-06, + "loss": 0.85889488, + "num_input_tokens_seen": 57312565, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.30761719, + "step": 2645, + "time_per_iteration": 2.8480734825134277 + }, + { + "auxiliary_loss_clip": 0.01612952, + "auxiliary_loss_mlp": 0.01058966, + "balance_loss_clip": 1.37552297, + "balance_loss_mlp": 1.02768576, + "epoch": 0.15908612655944687, + "flos": 32061009824640.0, + "grad_norm": 2.0677842008116176, + "language_loss": 0.71077991, + "learning_rate": 3.827734536224087e-06, + "loss": 0.73749912, + "num_input_tokens_seen": 57333360, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.3125, + "step": 2646, + "time_per_iteration": 2.953688383102417 + }, + { + "auxiliary_loss_clip": 0.01612105, + "auxiliary_loss_mlp": 0.01055931, + "balance_loss_clip": 1.37933159, + "balance_loss_mlp": 1.02262425, + "epoch": 0.15914624981211484, + "flos": 17794347298560.0, + "grad_norm": 2.6887426794725466, + "language_loss": 0.64177781, + "learning_rate": 3.827576375036642e-06, + "loss": 0.66845822, + "num_input_tokens_seen": 57350575, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.33300781, + "step": 2647, + "time_per_iteration": 2.903324842453003 + }, + { + "auxiliary_loss_clip": 0.01615079, + "auxiliary_loss_mlp": 0.01048587, + "balance_loss_clip": 1.38211095, + "balance_loss_mlp": 1.01604295, + "epoch": 0.1592063730647828, + "flos": 17721493891200.0, + "grad_norm": 1.9949579235957824, + "language_loss": 0.90778589, + "learning_rate": 3.827418144547318e-06, + "loss": 0.93442255, + "num_input_tokens_seen": 57367570, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.32568359, + "step": 2648, + "time_per_iteration": 2.8464112281799316 + }, + { + "auxiliary_loss_clip": 0.0159995, + "auxiliary_loss_mlp": 0.01049785, + "balance_loss_clip": 1.37163115, + "balance_loss_mlp": 1.02050734, + "epoch": 0.15926649631745077, + "flos": 18811897027200.0, + "grad_norm": 1.7659156693240317, + "language_loss": 0.92583275, + "learning_rate": 3.827259844762114e-06, + "loss": 0.95233011, + "num_input_tokens_seen": 57383980, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.29272461, + "step": 2649, + "time_per_iteration": 2.8318769931793213 + }, + { + "auxiliary_loss_clip": 0.01646041, + "auxiliary_loss_mlp": 0.01057363, + "balance_loss_clip": 1.39780092, + "balance_loss_mlp": 1.02462816, + "epoch": 0.15932661957011873, + "flos": 17575108404480.0, + "grad_norm": 2.475911996594256, + "language_loss": 0.73907804, + "learning_rate": 3.827101475687033e-06, + "loss": 0.76611209, + "num_input_tokens_seen": 57400840, + "router_z_loss_clip": 2.48046875, + "router_z_loss_mlp": 0.32714844, + "step": 2650, + "time_per_iteration": 2.856846809387207 + }, + { + "auxiliary_loss_clip": 0.01599089, + "auxiliary_loss_mlp": 0.0104906, + "balance_loss_clip": 1.37090087, + "balance_loss_mlp": 1.01787508, + "epoch": 0.15938674282278673, + "flos": 13342235875200.0, + "grad_norm": 9.06325930139845, + "language_loss": 0.73372757, + "learning_rate": 3.826943037328082e-06, + "loss": 0.76020908, + "num_input_tokens_seen": 57419230, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.31176758, + "step": 2651, + "time_per_iteration": 2.8261358737945557 + }, + { + "auxiliary_loss_clip": 0.01615088, + "auxiliary_loss_mlp": 0.01051003, + "balance_loss_clip": 1.38042283, + "balance_loss_mlp": 1.01931739, + "epoch": 0.1594468660754547, + "flos": 22498210644480.0, + "grad_norm": 2.2080079602833216, + "language_loss": 0.81045806, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.83711892, + "num_input_tokens_seen": 57439315, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.31689453, + "step": 2652, + "time_per_iteration": 2.842430591583252 + }, + { + "auxiliary_loss_clip": 0.01594656, + "auxiliary_loss_mlp": 0.01047915, + "balance_loss_clip": 1.3647778, + "balance_loss_mlp": 1.01947224, + "epoch": 0.15950698932812266, + "flos": 15014244660480.0, + "grad_norm": 3.3576818225381397, + "language_loss": 0.71258742, + "learning_rate": 3.826625952782601e-06, + "loss": 0.73901308, + "num_input_tokens_seen": 57454635, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.28405762, + "step": 2653, + "time_per_iteration": 2.859027624130249 + }, + { + "auxiliary_loss_clip": 0.01607326, + "auxiliary_loss_mlp": 0.01046195, + "balance_loss_clip": 1.37278712, + "balance_loss_mlp": 1.01620197, + "epoch": 0.15956711258079062, + "flos": 30167499905280.0, + "grad_norm": 2.3601181635757125, + "language_loss": 0.78717971, + "learning_rate": 3.826467306608095e-06, + "loss": 0.81371486, + "num_input_tokens_seen": 57476805, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.30004883, + "step": 2654, + "time_per_iteration": 2.93780779838562 + }, + { + "auxiliary_loss_clip": 0.01593966, + "auxiliary_loss_mlp": 0.01050175, + "balance_loss_clip": 1.3645823, + "balance_loss_mlp": 1.02080154, + "epoch": 0.1596272358334586, + "flos": 21042545086080.0, + "grad_norm": 1.7879940375890833, + "language_loss": 0.82858014, + "learning_rate": 3.826308591173765e-06, + "loss": 0.85502148, + "num_input_tokens_seen": 57496400, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.29370117, + "step": 2655, + "time_per_iteration": 2.8724172115325928 + }, + { + "auxiliary_loss_clip": 0.01603668, + "auxiliary_loss_mlp": 0.01055927, + "balance_loss_clip": 1.37008476, + "balance_loss_mlp": 1.0262084, + "epoch": 0.15968735908612655, + "flos": 15276402887040.0, + "grad_norm": 2.2824374618427776, + "language_loss": 0.74793398, + "learning_rate": 3.826149806485631e-06, + "loss": 0.77452993, + "num_input_tokens_seen": 57513700, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.29699707, + "step": 2656, + "time_per_iteration": 2.8425276279449463 + }, + { + "auxiliary_loss_clip": 0.01580695, + "auxiliary_loss_mlp": 0.01047362, + "balance_loss_clip": 1.35638452, + "balance_loss_mlp": 1.01767898, + "epoch": 0.15974748233879452, + "flos": 52683745242240.0, + "grad_norm": 1.8806308988735125, + "language_loss": 0.78882951, + "learning_rate": 3.825990952549713e-06, + "loss": 0.81511009, + "num_input_tokens_seen": 57536180, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.296875, + "step": 2657, + "time_per_iteration": 3.185101270675659 + }, + { + "auxiliary_loss_clip": 0.01593332, + "auxiliary_loss_mlp": 0.01048499, + "balance_loss_clip": 1.36397934, + "balance_loss_mlp": 1.01979351, + "epoch": 0.1598076055914625, + "flos": 18742256000640.0, + "grad_norm": 1.9121184795779922, + "language_loss": 0.76022953, + "learning_rate": 3.825832029372035e-06, + "loss": 0.78664786, + "num_input_tokens_seen": 57555025, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.28723145, + "step": 2658, + "time_per_iteration": 2.8831698894500732 + }, + { + "auxiliary_loss_clip": 0.01600454, + "auxiliary_loss_mlp": 0.01055891, + "balance_loss_clip": 1.36806679, + "balance_loss_mlp": 1.02372789, + "epoch": 0.15986772884413047, + "flos": 34362339540480.0, + "grad_norm": 1.9309123985165926, + "language_loss": 0.76653248, + "learning_rate": 3.825673036958624e-06, + "loss": 0.79309595, + "num_input_tokens_seen": 57577660, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.3215332, + "step": 2659, + "time_per_iteration": 2.9735920429229736 + }, + { + "auxiliary_loss_clip": 0.01608684, + "auxiliary_loss_mlp": 0.01054269, + "balance_loss_clip": 1.3704617, + "balance_loss_mlp": 1.02394271, + "epoch": 0.15992785209679844, + "flos": 22065252721920.0, + "grad_norm": 2.092294779487242, + "language_loss": 0.91538513, + "learning_rate": 3.825513975315508e-06, + "loss": 0.94201463, + "num_input_tokens_seen": 57596335, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.30322266, + "step": 2660, + "time_per_iteration": 2.889688730239868 + }, + { + "auxiliary_loss_clip": 0.01611046, + "auxiliary_loss_mlp": 0.01050055, + "balance_loss_clip": 1.3747344, + "balance_loss_mlp": 1.01994288, + "epoch": 0.1599879753494664, + "flos": 33077971370880.0, + "grad_norm": 2.306836262570851, + "language_loss": 0.79088128, + "learning_rate": 3.82535484444872e-06, + "loss": 0.81749225, + "num_input_tokens_seen": 57616830, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.30102539, + "step": 2661, + "time_per_iteration": 2.9748692512512207 + }, + { + "auxiliary_loss_clip": 0.01587119, + "auxiliary_loss_mlp": 0.01047929, + "balance_loss_clip": 1.35695589, + "balance_loss_mlp": 1.01750684, + "epoch": 0.16004809860213437, + "flos": 28049683674240.0, + "grad_norm": 1.8473604986134236, + "language_loss": 0.74831957, + "learning_rate": 3.825195644364292e-06, + "loss": 0.77467012, + "num_input_tokens_seen": 57635515, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.30419922, + "step": 2662, + "time_per_iteration": 2.98846697807312 + }, + { + "auxiliary_loss_clip": 0.01607017, + "auxiliary_loss_mlp": 0.01050391, + "balance_loss_clip": 1.37133384, + "balance_loss_mlp": 1.02061212, + "epoch": 0.16010822185480234, + "flos": 22789398049920.0, + "grad_norm": 1.8779810872207148, + "language_loss": 0.82742882, + "learning_rate": 3.825036375068263e-06, + "loss": 0.85400295, + "num_input_tokens_seen": 57654250, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.29760742, + "step": 2663, + "time_per_iteration": 2.8696935176849365 + }, + { + "auxiliary_loss_clip": 0.01600454, + "auxiliary_loss_mlp": 0.01046339, + "balance_loss_clip": 1.36755121, + "balance_loss_mlp": 1.01575017, + "epoch": 0.16016834510747033, + "flos": 20093595753600.0, + "grad_norm": 3.015397025760737, + "language_loss": 0.81480312, + "learning_rate": 3.824877036566672e-06, + "loss": 0.84127104, + "num_input_tokens_seen": 57672645, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.30566406, + "step": 2664, + "time_per_iteration": 2.8440945148468018 + }, + { + "auxiliary_loss_clip": 0.0159103, + "auxiliary_loss_mlp": 0.0105058, + "balance_loss_clip": 1.35932112, + "balance_loss_mlp": 1.02032495, + "epoch": 0.1602284683601383, + "flos": 21182460566400.0, + "grad_norm": 1.6213041638617758, + "language_loss": 0.9485153, + "learning_rate": 3.824717628865561e-06, + "loss": 0.97493136, + "num_input_tokens_seen": 57691055, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.30273438, + "step": 2665, + "time_per_iteration": 2.8728368282318115 + }, + { + "auxiliary_loss_clip": 0.01590397, + "auxiliary_loss_mlp": 0.01048334, + "balance_loss_clip": 1.35747623, + "balance_loss_mlp": 1.01698184, + "epoch": 0.16028859161280626, + "flos": 14655723713280.0, + "grad_norm": 2.223637824848212, + "language_loss": 0.8592754, + "learning_rate": 3.824558151970974e-06, + "loss": 0.88566267, + "num_input_tokens_seen": 57707235, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.3137207, + "step": 2666, + "time_per_iteration": 4.247317790985107 + }, + { + "auxiliary_loss_clip": 0.01585727, + "auxiliary_loss_mlp": 0.01053972, + "balance_loss_clip": 1.35585093, + "balance_loss_mlp": 1.02476621, + "epoch": 0.16034871486547422, + "flos": 20998630368000.0, + "grad_norm": 1.9034036223601567, + "language_loss": 0.82624924, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.85264623, + "num_input_tokens_seen": 57724190, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.29199219, + "step": 2667, + "time_per_iteration": 2.845552682876587 + }, + { + "auxiliary_loss_clip": 0.01591541, + "auxiliary_loss_mlp": 0.01056965, + "balance_loss_clip": 1.36273456, + "balance_loss_mlp": 1.02613759, + "epoch": 0.1604088381181422, + "flos": 21407717018880.0, + "grad_norm": 1.7699693423341676, + "language_loss": 0.75165534, + "learning_rate": 3.824238990625567e-06, + "loss": 0.77814043, + "num_input_tokens_seen": 57743620, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.30859375, + "step": 2668, + "time_per_iteration": 2.896623373031616 + }, + { + "auxiliary_loss_clip": 0.0158348, + "auxiliary_loss_mlp": 0.01053431, + "balance_loss_clip": 1.35352409, + "balance_loss_mlp": 1.0241766, + "epoch": 0.16046896137081015, + "flos": 23887221333120.0, + "grad_norm": 1.7203920087994096, + "language_loss": 0.78023243, + "learning_rate": 3.824079306186848e-06, + "loss": 0.80660146, + "num_input_tokens_seen": 57764810, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.29248047, + "step": 2669, + "time_per_iteration": 2.8599913120269775 + }, + { + "auxiliary_loss_clip": 0.01375579, + "auxiliary_loss_mlp": 0.01057861, + "balance_loss_clip": 1.24050307, + "balance_loss_mlp": 1.03640366, + "epoch": 0.16052908462347812, + "flos": 59833396247040.0, + "grad_norm": 0.8129268739191908, + "language_loss": 0.55579388, + "learning_rate": 3.823919552578861e-06, + "loss": 0.58012831, + "num_input_tokens_seen": 57824390, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.21484375, + "step": 2670, + "time_per_iteration": 3.2614214420318604 + }, + { + "auxiliary_loss_clip": 0.01592961, + "auxiliary_loss_mlp": 0.01055774, + "balance_loss_clip": 1.35915184, + "balance_loss_mlp": 1.02544713, + "epoch": 0.1605892078761461, + "flos": 18305768983680.0, + "grad_norm": 1.9794305651126223, + "language_loss": 0.7896781, + "learning_rate": 3.82375972980766e-06, + "loss": 0.81616545, + "num_input_tokens_seen": 57843665, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.30322266, + "step": 2671, + "time_per_iteration": 4.320765495300293 + }, + { + "auxiliary_loss_clip": 0.01593988, + "auxiliary_loss_mlp": 0.01057038, + "balance_loss_clip": 1.36140537, + "balance_loss_mlp": 1.0261631, + "epoch": 0.16064933112881408, + "flos": 32173434449280.0, + "grad_norm": 1.7858999089260297, + "language_loss": 0.65871245, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.68522269, + "num_input_tokens_seen": 57863305, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.30883789, + "step": 2672, + "time_per_iteration": 2.9302310943603516 + }, + { + "auxiliary_loss_clip": 0.0159015, + "auxiliary_loss_mlp": 0.01064486, + "balance_loss_clip": 1.35478151, + "balance_loss_mlp": 1.03353894, + "epoch": 0.16070945438148204, + "flos": 19838359981440.0, + "grad_norm": 1.7102616335339451, + "language_loss": 0.87586981, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.90241611, + "num_input_tokens_seen": 57883025, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.30932617, + "step": 2673, + "time_per_iteration": 4.291688442230225 + }, + { + "auxiliary_loss_clip": 0.01581296, + "auxiliary_loss_mlp": 0.01062934, + "balance_loss_clip": 1.35150528, + "balance_loss_mlp": 1.03379941, + "epoch": 0.16076957763415, + "flos": 18921561719040.0, + "grad_norm": 2.1323214392725607, + "language_loss": 0.73599589, + "learning_rate": 3.823279846575403e-06, + "loss": 0.76243818, + "num_input_tokens_seen": 57901430, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.29125977, + "step": 2674, + "time_per_iteration": 4.266111612319946 + }, + { + "auxiliary_loss_clip": 0.015852, + "auxiliary_loss_mlp": 0.01071962, + "balance_loss_clip": 1.35572255, + "balance_loss_mlp": 1.04132557, + "epoch": 0.16082970088681797, + "flos": 16773223230720.0, + "grad_norm": 1.5026744987343803, + "language_loss": 0.85460067, + "learning_rate": 3.823119747211986e-06, + "loss": 0.88117224, + "num_input_tokens_seen": 57919550, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.30639648, + "step": 2675, + "time_per_iteration": 2.881798505783081 + }, + { + "auxiliary_loss_clip": 0.01591788, + "auxiliary_loss_mlp": 0.01062902, + "balance_loss_clip": 1.35874534, + "balance_loss_mlp": 1.03224111, + "epoch": 0.16088982413948594, + "flos": 35163138839040.0, + "grad_norm": 1.7432678347691335, + "language_loss": 0.83230954, + "learning_rate": 3.822959578715685e-06, + "loss": 0.85885644, + "num_input_tokens_seen": 57939890, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.30664062, + "step": 2676, + "time_per_iteration": 2.959638833999634 + }, + { + "auxiliary_loss_clip": 0.01588409, + "auxiliary_loss_mlp": 0.01068798, + "balance_loss_clip": 1.36366427, + "balance_loss_mlp": 1.03913903, + "epoch": 0.1609499473921539, + "flos": 18634446345600.0, + "grad_norm": 1.687461807249356, + "language_loss": 0.75042188, + "learning_rate": 3.822799341092573e-06, + "loss": 0.77699393, + "num_input_tokens_seen": 57957410, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.29638672, + "step": 2677, + "time_per_iteration": 2.864696741104126 + }, + { + "auxiliary_loss_clip": 0.01596053, + "auxiliary_loss_mlp": 0.01061102, + "balance_loss_clip": 1.36756015, + "balance_loss_mlp": 1.03284943, + "epoch": 0.1610100706448219, + "flos": 33158652128640.0, + "grad_norm": 1.7158489402383736, + "language_loss": 0.77605093, + "learning_rate": 3.822639034348728e-06, + "loss": 0.80262244, + "num_input_tokens_seen": 57977900, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.28198242, + "step": 2678, + "time_per_iteration": 2.9880831241607666 + }, + { + "auxiliary_loss_clip": 0.01579959, + "auxiliary_loss_mlp": 0.01054422, + "balance_loss_clip": 1.35108423, + "balance_loss_mlp": 1.02461982, + "epoch": 0.16107019389748986, + "flos": 34689659558400.0, + "grad_norm": 1.9616875181724773, + "language_loss": 0.71030337, + "learning_rate": 3.822478658490228e-06, + "loss": 0.73664719, + "num_input_tokens_seen": 57998210, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.29785156, + "step": 2679, + "time_per_iteration": 2.965186595916748 + }, + { + "auxiliary_loss_clip": 0.01378908, + "auxiliary_loss_mlp": 0.01037569, + "balance_loss_clip": 1.23960102, + "balance_loss_mlp": 1.01811433, + "epoch": 0.16113031715015783, + "flos": 65740585046400.0, + "grad_norm": 0.7918206362109462, + "language_loss": 0.51853359, + "learning_rate": 3.822318213523154e-06, + "loss": 0.54269826, + "num_input_tokens_seen": 58059420, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.19433594, + "step": 2680, + "time_per_iteration": 3.416412591934204 + }, + { + "auxiliary_loss_clip": 0.01595864, + "auxiliary_loss_mlp": 0.01057234, + "balance_loss_clip": 1.36326814, + "balance_loss_mlp": 1.02645421, + "epoch": 0.1611904404028258, + "flos": 20819324649600.0, + "grad_norm": 1.6152444160079937, + "language_loss": 0.81228745, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.83881843, + "num_input_tokens_seen": 58078370, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.30761719, + "step": 2681, + "time_per_iteration": 2.8343873023986816 + }, + { + "auxiliary_loss_clip": 0.01593064, + "auxiliary_loss_mlp": 0.01059294, + "balance_loss_clip": 1.36561704, + "balance_loss_mlp": 1.03130364, + "epoch": 0.16125056365549376, + "flos": 27023944636800.0, + "grad_norm": 1.7766701767194217, + "language_loss": 0.70335263, + "learning_rate": 3.821997116287627e-06, + "loss": 0.72987622, + "num_input_tokens_seen": 58097395, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.28027344, + "step": 2682, + "time_per_iteration": 2.9119699001312256 + }, + { + "auxiliary_loss_clip": 0.01585919, + "auxiliary_loss_mlp": 0.01053084, + "balance_loss_clip": 1.35715437, + "balance_loss_mlp": 1.02459252, + "epoch": 0.16131068690816172, + "flos": 19284652391040.0, + "grad_norm": 1.9559692945712817, + "language_loss": 0.88658893, + "learning_rate": 3.821836464031348e-06, + "loss": 0.91297895, + "num_input_tokens_seen": 58115630, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.28442383, + "step": 2683, + "time_per_iteration": 2.8799540996551514 + }, + { + "auxiliary_loss_clip": 0.01592412, + "auxiliary_loss_mlp": 0.0106876, + "balance_loss_clip": 1.36380267, + "balance_loss_mlp": 1.04014933, + "epoch": 0.16137081016082971, + "flos": 35352217434240.0, + "grad_norm": 1.5974300914206567, + "language_loss": 0.75010633, + "learning_rate": 3.821675742690849e-06, + "loss": 0.77671802, + "num_input_tokens_seen": 58138655, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.28613281, + "step": 2684, + "time_per_iteration": 2.9951331615448 + }, + { + "auxiliary_loss_clip": 0.01594362, + "auxiliary_loss_mlp": 0.01061377, + "balance_loss_clip": 1.3617897, + "balance_loss_mlp": 1.03228974, + "epoch": 0.16143093341349768, + "flos": 34247154983040.0, + "grad_norm": 1.7600596975420553, + "language_loss": 0.7113508, + "learning_rate": 3.821514952272223e-06, + "loss": 0.73790818, + "num_input_tokens_seen": 58157440, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.29077148, + "step": 2685, + "time_per_iteration": 2.950395107269287 + }, + { + "auxiliary_loss_clip": 0.01573704, + "auxiliary_loss_mlp": 0.01055227, + "balance_loss_clip": 1.35068476, + "balance_loss_mlp": 1.0256151, + "epoch": 0.16149105666616564, + "flos": 28010338680960.0, + "grad_norm": 2.247434190655626, + "language_loss": 0.72669697, + "learning_rate": 3.821354092781567e-06, + "loss": 0.75298631, + "num_input_tokens_seen": 58176660, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.29589844, + "step": 2686, + "time_per_iteration": 2.895186424255371 + }, + { + "auxiliary_loss_clip": 0.01582732, + "auxiliary_loss_mlp": 0.01053929, + "balance_loss_clip": 1.35291028, + "balance_loss_mlp": 1.02384067, + "epoch": 0.1615511799188336, + "flos": 19430313960960.0, + "grad_norm": 4.068738315098536, + "language_loss": 0.82724714, + "learning_rate": 3.821193164224981e-06, + "loss": 0.85361373, + "num_input_tokens_seen": 58195085, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.30053711, + "step": 2687, + "time_per_iteration": 2.91774845123291 + }, + { + "auxiliary_loss_clip": 0.01602428, + "auxiliary_loss_mlp": 0.01055369, + "balance_loss_clip": 1.36609113, + "balance_loss_mlp": 1.02439833, + "epoch": 0.16161130317150157, + "flos": 22864694676480.0, + "grad_norm": 2.391970464282639, + "language_loss": 0.7313624, + "learning_rate": 3.821032166608568e-06, + "loss": 0.75794041, + "num_input_tokens_seen": 58213540, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.30981445, + "step": 2688, + "time_per_iteration": 2.8329267501831055 + }, + { + "auxiliary_loss_clip": 0.01591972, + "auxiliary_loss_mlp": 0.01051436, + "balance_loss_clip": 1.36088467, + "balance_loss_mlp": 1.02232516, + "epoch": 0.16167142642416954, + "flos": 26123389257600.0, + "grad_norm": 1.659713035245241, + "language_loss": 0.7688719, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.79530597, + "num_input_tokens_seen": 58236995, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.29101562, + "step": 2689, + "time_per_iteration": 2.927610158920288 + }, + { + "auxiliary_loss_clip": 0.01578839, + "auxiliary_loss_mlp": 0.01048609, + "balance_loss_clip": 1.3539412, + "balance_loss_mlp": 1.01992774, + "epoch": 0.1617315496768375, + "flos": 22788809867520.0, + "grad_norm": 1.733325902024169, + "language_loss": 0.88225782, + "learning_rate": 3.820709964220683e-06, + "loss": 0.90853226, + "num_input_tokens_seen": 58257230, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.28710938, + "step": 2690, + "time_per_iteration": 2.9339325428009033 + }, + { + "auxiliary_loss_clip": 0.01585746, + "auxiliary_loss_mlp": 0.01047692, + "balance_loss_clip": 1.35783434, + "balance_loss_mlp": 1.0189867, + "epoch": 0.1617916729295055, + "flos": 22027265072640.0, + "grad_norm": 1.5984491444769409, + "language_loss": 0.88968188, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.91601616, + "num_input_tokens_seen": 58277080, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.28735352, + "step": 2691, + "time_per_iteration": 2.8716928958892822 + }, + { + "auxiliary_loss_clip": 0.01609192, + "auxiliary_loss_mlp": 0.01054179, + "balance_loss_clip": 1.37140656, + "balance_loss_mlp": 1.02282691, + "epoch": 0.16185179618217346, + "flos": 23448064872960.0, + "grad_norm": 1.8690730831954052, + "language_loss": 0.83437407, + "learning_rate": 3.820387485666784e-06, + "loss": 0.86100775, + "num_input_tokens_seen": 58294815, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.3137207, + "step": 2692, + "time_per_iteration": 2.908597230911255 + }, + { + "auxiliary_loss_clip": 0.01607296, + "auxiliary_loss_mlp": 0.01055683, + "balance_loss_clip": 1.37011027, + "balance_loss_mlp": 1.0241636, + "epoch": 0.16191191943484143, + "flos": 25677084119040.0, + "grad_norm": 2.1541684727812345, + "language_loss": 0.83114487, + "learning_rate": 3.820226142842862e-06, + "loss": 0.85777467, + "num_input_tokens_seen": 58313215, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.31518555, + "step": 2693, + "time_per_iteration": 2.8760287761688232 + }, + { + "auxiliary_loss_clip": 0.01573365, + "auxiliary_loss_mlp": 0.01051779, + "balance_loss_clip": 1.34891105, + "balance_loss_mlp": 1.02252543, + "epoch": 0.1619720426875094, + "flos": 23487862314240.0, + "grad_norm": 1.501855241174925, + "language_loss": 0.8533839, + "learning_rate": 3.820064730995783e-06, + "loss": 0.87963539, + "num_input_tokens_seen": 58333215, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.29272461, + "step": 2694, + "time_per_iteration": 2.876495838165283 + }, + { + "auxiliary_loss_clip": 0.01593449, + "auxiliary_loss_mlp": 0.01062834, + "balance_loss_clip": 1.35982609, + "balance_loss_mlp": 1.02914536, + "epoch": 0.16203216594017736, + "flos": 24144221652480.0, + "grad_norm": 1.8835494212949921, + "language_loss": 0.70133942, + "learning_rate": 3.819903250131667e-06, + "loss": 0.72790229, + "num_input_tokens_seen": 58351160, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.3371582, + "step": 2695, + "time_per_iteration": 2.869046688079834 + }, + { + "auxiliary_loss_clip": 0.01605448, + "auxiliary_loss_mlp": 0.01054915, + "balance_loss_clip": 1.37189412, + "balance_loss_mlp": 1.02437329, + "epoch": 0.16209228919284532, + "flos": 22349698652160.0, + "grad_norm": 1.8630886311549049, + "language_loss": 0.8382442, + "learning_rate": 3.819741700256637e-06, + "loss": 0.86484778, + "num_input_tokens_seen": 58368505, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.30541992, + "step": 2696, + "time_per_iteration": 2.847719430923462 + }, + { + "auxiliary_loss_clip": 0.01609281, + "auxiliary_loss_mlp": 0.01059733, + "balance_loss_clip": 1.36838996, + "balance_loss_mlp": 1.02780831, + "epoch": 0.1621524124455133, + "flos": 15823595226240.0, + "grad_norm": 2.095555561456398, + "language_loss": 0.90546346, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.93215358, + "num_input_tokens_seen": 58385085, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.3190918, + "step": 2697, + "time_per_iteration": 2.947194814682007 + }, + { + "auxiliary_loss_clip": 0.01575833, + "auxiliary_loss_mlp": 0.01048603, + "balance_loss_clip": 1.35069537, + "balance_loss_mlp": 1.01901484, + "epoch": 0.16221253569818128, + "flos": 30197886428160.0, + "grad_norm": 1.638272167800017, + "language_loss": 0.8192848, + "learning_rate": 3.819418393498343e-06, + "loss": 0.8455292, + "num_input_tokens_seen": 58406985, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.29589844, + "step": 2698, + "time_per_iteration": 2.9250917434692383 + }, + { + "auxiliary_loss_clip": 0.01580149, + "auxiliary_loss_mlp": 0.0105099, + "balance_loss_clip": 1.35692525, + "balance_loss_mlp": 1.02028179, + "epoch": 0.16227265895084925, + "flos": 24616162609920.0, + "grad_norm": 1.5416982538881954, + "language_loss": 0.78489995, + "learning_rate": 3.819256636627339e-06, + "loss": 0.81121135, + "num_input_tokens_seen": 58426205, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.30688477, + "step": 2699, + "time_per_iteration": 2.9487640857696533 + }, + { + "auxiliary_loss_clip": 0.01588639, + "auxiliary_loss_mlp": 0.01053927, + "balance_loss_clip": 1.3591013, + "balance_loss_mlp": 1.02500629, + "epoch": 0.1623327822035172, + "flos": 19582807495680.0, + "grad_norm": 2.064988544353501, + "language_loss": 0.87959051, + "learning_rate": 3.81909481076994e-06, + "loss": 0.90601611, + "num_input_tokens_seen": 58443830, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.2890625, + "step": 2700, + "time_per_iteration": 4.248523473739624 + }, + { + "auxiliary_loss_clip": 0.01594848, + "auxiliary_loss_mlp": 0.01054137, + "balance_loss_clip": 1.3659507, + "balance_loss_mlp": 1.02282083, + "epoch": 0.16239290545618518, + "flos": 26479557475200.0, + "grad_norm": 1.5642403883383815, + "language_loss": 0.81302071, + "learning_rate": 3.818932915932284e-06, + "loss": 0.83951056, + "num_input_tokens_seen": 58464405, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.31311035, + "step": 2701, + "time_per_iteration": 2.858170747756958 + }, + { + "auxiliary_loss_clip": 0.01588956, + "auxiliary_loss_mlp": 0.01053957, + "balance_loss_clip": 1.35817051, + "balance_loss_mlp": 1.02391624, + "epoch": 0.16245302870885314, + "flos": 15860497000320.0, + "grad_norm": 1.500038689765602, + "language_loss": 0.74944043, + "learning_rate": 3.818770952120511e-06, + "loss": 0.77586961, + "num_input_tokens_seen": 58483295, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.30053711, + "step": 2702, + "time_per_iteration": 2.863884449005127 + }, + { + "auxiliary_loss_clip": 0.01598279, + "auxiliary_loss_mlp": 0.01052962, + "balance_loss_clip": 1.36484575, + "balance_loss_mlp": 1.02263534, + "epoch": 0.1625131519615211, + "flos": 14764573998720.0, + "grad_norm": 1.8711781713318716, + "language_loss": 0.73938787, + "learning_rate": 3.81860891934076e-06, + "loss": 0.76590031, + "num_input_tokens_seen": 58501205, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.3034668, + "step": 2703, + "time_per_iteration": 2.7934072017669678 + }, + { + "auxiliary_loss_clip": 0.01595509, + "auxiliary_loss_mlp": 0.01055165, + "balance_loss_clip": 1.36306775, + "balance_loss_mlp": 1.02536297, + "epoch": 0.1625732752141891, + "flos": 28232247018240.0, + "grad_norm": 1.7764492351537695, + "language_loss": 0.715734, + "learning_rate": 3.818446817599176e-06, + "loss": 0.74224073, + "num_input_tokens_seen": 58522315, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.2980957, + "step": 2704, + "time_per_iteration": 2.937492847442627 + }, + { + "auxiliary_loss_clip": 0.0135888, + "auxiliary_loss_mlp": 0.01023523, + "balance_loss_clip": 1.22629595, + "balance_loss_mlp": 1.00673795, + "epoch": 0.16263339846685707, + "flos": 67357928833920.0, + "grad_norm": 0.7880556130178895, + "language_loss": 0.53408283, + "learning_rate": 3.818284646901907e-06, + "loss": 0.55790687, + "num_input_tokens_seen": 58586695, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.16796875, + "step": 2705, + "time_per_iteration": 3.357480049133301 + }, + { + "auxiliary_loss_clip": 0.01599516, + "auxiliary_loss_mlp": 0.01053716, + "balance_loss_clip": 1.36505532, + "balance_loss_mlp": 1.02522492, + "epoch": 0.16269352171952503, + "flos": 14327227330560.0, + "grad_norm": 2.185713522063839, + "language_loss": 0.76696676, + "learning_rate": 3.818122407255102e-06, + "loss": 0.79349911, + "num_input_tokens_seen": 58602435, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.28491211, + "step": 2706, + "time_per_iteration": 4.326091289520264 + }, + { + "auxiliary_loss_clip": 0.01585112, + "auxiliary_loss_mlp": 0.01050743, + "balance_loss_clip": 1.3537432, + "balance_loss_mlp": 1.0223, + "epoch": 0.162753644972193, + "flos": 28371619560960.0, + "grad_norm": 1.9106819087472995, + "language_loss": 0.74335587, + "learning_rate": 3.817960098664914e-06, + "loss": 0.76971447, + "num_input_tokens_seen": 58621275, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.28466797, + "step": 2707, + "time_per_iteration": 2.905479907989502 + }, + { + "auxiliary_loss_clip": 0.01585845, + "auxiliary_loss_mlp": 0.01056884, + "balance_loss_clip": 1.35736537, + "balance_loss_mlp": 1.02932262, + "epoch": 0.16281376822486096, + "flos": 19947346001280.0, + "grad_norm": 2.5729147518875974, + "language_loss": 0.85108483, + "learning_rate": 3.817797721137495e-06, + "loss": 0.8775121, + "num_input_tokens_seen": 58637550, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.27587891, + "step": 2708, + "time_per_iteration": 4.251897573471069 + }, + { + "auxiliary_loss_clip": 0.01603282, + "auxiliary_loss_mlp": 0.0105321, + "balance_loss_clip": 1.36826265, + "balance_loss_mlp": 1.02262115, + "epoch": 0.16287389147752893, + "flos": 21261648245760.0, + "grad_norm": 2.0650967786453687, + "language_loss": 0.86887348, + "learning_rate": 3.817635274679006e-06, + "loss": 0.89543843, + "num_input_tokens_seen": 58654135, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.3059082, + "step": 2709, + "time_per_iteration": 4.352931499481201 + }, + { + "auxiliary_loss_clip": 0.01604238, + "auxiliary_loss_mlp": 0.0105051, + "balance_loss_clip": 1.37168622, + "balance_loss_mlp": 1.023736, + "epoch": 0.1629340147301969, + "flos": 19253994399360.0, + "grad_norm": 2.241123526908354, + "language_loss": 0.92233521, + "learning_rate": 3.817472759295605e-06, + "loss": 0.9488827, + "num_input_tokens_seen": 58674320, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.26757812, + "step": 2710, + "time_per_iteration": 2.839712381362915 + }, + { + "auxiliary_loss_clip": 0.01594448, + "auxiliary_loss_mlp": 0.01054685, + "balance_loss_clip": 1.36519611, + "balance_loss_mlp": 1.02652752, + "epoch": 0.16299413798286488, + "flos": 21259250271360.0, + "grad_norm": 2.2486898392510826, + "language_loss": 0.82904702, + "learning_rate": 3.817310174993453e-06, + "loss": 0.85553837, + "num_input_tokens_seen": 58691000, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.28173828, + "step": 2711, + "time_per_iteration": 2.8382749557495117 + }, + { + "auxiliary_loss_clip": 0.01620201, + "auxiliary_loss_mlp": 0.01057233, + "balance_loss_clip": 1.38112998, + "balance_loss_mlp": 1.02909994, + "epoch": 0.16305426123553285, + "flos": 18779836446720.0, + "grad_norm": 2.115160502879445, + "language_loss": 0.82627457, + "learning_rate": 3.817147521778719e-06, + "loss": 0.85304892, + "num_input_tokens_seen": 58710230, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.28149414, + "step": 2712, + "time_per_iteration": 2.850048542022705 + }, + { + "auxiliary_loss_clip": 0.01609243, + "auxiliary_loss_mlp": 0.01058863, + "balance_loss_clip": 1.37273932, + "balance_loss_mlp": 1.03175521, + "epoch": 0.16311438448820081, + "flos": 22096951344000.0, + "grad_norm": 1.6347075971863712, + "language_loss": 0.78146076, + "learning_rate": 3.816984799657568e-06, + "loss": 0.80814183, + "num_input_tokens_seen": 58728610, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.27148438, + "step": 2713, + "time_per_iteration": 2.8194034099578857 + }, + { + "auxiliary_loss_clip": 0.0157939, + "auxiliary_loss_mlp": 0.01053864, + "balance_loss_clip": 1.35503006, + "balance_loss_mlp": 1.02592182, + "epoch": 0.16317450774086878, + "flos": 16475249105280.0, + "grad_norm": 2.1529777362633467, + "language_loss": 0.80450857, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.83084106, + "num_input_tokens_seen": 58744385, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.27954102, + "step": 2714, + "time_per_iteration": 2.847712516784668 + }, + { + "auxiliary_loss_clip": 0.01603275, + "auxiliary_loss_mlp": 0.01059504, + "balance_loss_clip": 1.37555337, + "balance_loss_mlp": 1.03239632, + "epoch": 0.16323463099353674, + "flos": 24363641525760.0, + "grad_norm": 2.078578984082396, + "language_loss": 0.79171741, + "learning_rate": 3.816659148720702e-06, + "loss": 0.81834519, + "num_input_tokens_seen": 58763905, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.27148438, + "step": 2715, + "time_per_iteration": 2.907170295715332 + }, + { + "auxiliary_loss_clip": 0.01598268, + "auxiliary_loss_mlp": 0.01047684, + "balance_loss_clip": 1.36796737, + "balance_loss_mlp": 1.02087343, + "epoch": 0.1632947542462047, + "flos": 24911784005760.0, + "grad_norm": 2.141270755157285, + "language_loss": 0.82947874, + "learning_rate": 3.816496219917336e-06, + "loss": 0.85593826, + "num_input_tokens_seen": 58785580, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.26806641, + "step": 2716, + "time_per_iteration": 2.920557975769043 + }, + { + "auxiliary_loss_clip": 0.01608257, + "auxiliary_loss_mlp": 0.01054143, + "balance_loss_clip": 1.37573314, + "balance_loss_mlp": 1.02686739, + "epoch": 0.1633548774988727, + "flos": 24911150578560.0, + "grad_norm": 1.6589996224188939, + "language_loss": 0.87833977, + "learning_rate": 3.816333222232251e-06, + "loss": 0.90496373, + "num_input_tokens_seen": 58806075, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.27282715, + "step": 2717, + "time_per_iteration": 2.873380184173584 + }, + { + "auxiliary_loss_clip": 0.01594787, + "auxiliary_loss_mlp": 0.01053976, + "balance_loss_clip": 1.36740768, + "balance_loss_mlp": 1.02686739, + "epoch": 0.16341500075154067, + "flos": 30452986465920.0, + "grad_norm": 3.4529431227198617, + "language_loss": 0.77940929, + "learning_rate": 3.816170155671629e-06, + "loss": 0.80589688, + "num_input_tokens_seen": 58827405, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.27087402, + "step": 2718, + "time_per_iteration": 2.921759843826294 + }, + { + "auxiliary_loss_clip": 0.01608118, + "auxiliary_loss_mlp": 0.01055188, + "balance_loss_clip": 1.37610018, + "balance_loss_mlp": 1.02705431, + "epoch": 0.16347512400420863, + "flos": 22794963160320.0, + "grad_norm": 1.79458183173792, + "language_loss": 0.75323707, + "learning_rate": 3.816007020241652e-06, + "loss": 0.77987015, + "num_input_tokens_seen": 58847205, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.28125, + "step": 2719, + "time_per_iteration": 2.9134223461151123 + }, + { + "auxiliary_loss_clip": 0.01596169, + "auxiliary_loss_mlp": 0.01055139, + "balance_loss_clip": 1.36702919, + "balance_loss_mlp": 1.02787566, + "epoch": 0.1635352472568766, + "flos": 22642288646400.0, + "grad_norm": 1.6900624207785235, + "language_loss": 0.73370445, + "learning_rate": 3.815843815948507e-06, + "loss": 0.76021749, + "num_input_tokens_seen": 58866865, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.27258301, + "step": 2720, + "time_per_iteration": 2.832446813583374 + }, + { + "auxiliary_loss_clip": 0.01590967, + "auxiliary_loss_mlp": 0.01053426, + "balance_loss_clip": 1.36393452, + "balance_loss_mlp": 1.02510142, + "epoch": 0.16359537050954456, + "flos": 15531321945600.0, + "grad_norm": 2.0157200159257203, + "language_loss": 0.76900476, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.79544872, + "num_input_tokens_seen": 58885200, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.28295898, + "step": 2721, + "time_per_iteration": 2.861409902572632 + }, + { + "auxiliary_loss_clip": 0.01602785, + "auxiliary_loss_mlp": 0.01051729, + "balance_loss_clip": 1.37004828, + "balance_loss_mlp": 1.02456129, + "epoch": 0.16365549376221253, + "flos": 22100118480000.0, + "grad_norm": 1.893099167812414, + "language_loss": 0.79755056, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.82409573, + "num_input_tokens_seen": 58906385, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.27172852, + "step": 2722, + "time_per_iteration": 2.8554818630218506 + }, + { + "auxiliary_loss_clip": 0.01603838, + "auxiliary_loss_mlp": 0.01060137, + "balance_loss_clip": 1.36861515, + "balance_loss_mlp": 1.02938056, + "epoch": 0.1637156170148805, + "flos": 24070870552320.0, + "grad_norm": 1.938939394878388, + "language_loss": 0.86279172, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.88943148, + "num_input_tokens_seen": 58925040, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.30761719, + "step": 2723, + "time_per_iteration": 2.9678895473480225 + }, + { + "auxiliary_loss_clip": 0.01579694, + "auxiliary_loss_mlp": 0.01045007, + "balance_loss_clip": 1.35644579, + "balance_loss_mlp": 1.01802969, + "epoch": 0.1637757402675485, + "flos": 26695991191680.0, + "grad_norm": 1.9178513351905633, + "language_loss": 0.71912241, + "learning_rate": 3.815190310268058e-06, + "loss": 0.74536943, + "num_input_tokens_seen": 58944790, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26965332, + "step": 2724, + "time_per_iteration": 2.8785109519958496 + }, + { + "auxiliary_loss_clip": 0.01588206, + "auxiliary_loss_mlp": 0.01051457, + "balance_loss_clip": 1.3640275, + "balance_loss_mlp": 1.02502787, + "epoch": 0.16383586352021645, + "flos": 16115461303680.0, + "grad_norm": 1.7879332302423578, + "language_loss": 0.71879905, + "learning_rate": 3.815026761751955e-06, + "loss": 0.74519569, + "num_input_tokens_seen": 58962500, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.26428223, + "step": 2725, + "time_per_iteration": 2.8342082500457764 + }, + { + "auxiliary_loss_clip": 0.01592833, + "auxiliary_loss_mlp": 0.01042416, + "balance_loss_clip": 1.36833858, + "balance_loss_mlp": 1.016011, + "epoch": 0.16389598677288442, + "flos": 19173811334400.0, + "grad_norm": 1.7683384916261575, + "language_loss": 0.8869341, + "learning_rate": 3.814863144409855e-06, + "loss": 0.91328663, + "num_input_tokens_seen": 58980355, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.26403809, + "step": 2726, + "time_per_iteration": 2.8463456630706787 + }, + { + "auxiliary_loss_clip": 0.01593928, + "auxiliary_loss_mlp": 0.01044843, + "balance_loss_clip": 1.36755335, + "balance_loss_mlp": 1.01823592, + "epoch": 0.16395611002555238, + "flos": 21516974507520.0, + "grad_norm": 1.8115273600200583, + "language_loss": 0.7472235, + "learning_rate": 3.814699458247963e-06, + "loss": 0.77361119, + "num_input_tokens_seen": 58999505, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.26623535, + "step": 2727, + "time_per_iteration": 2.8463990688323975 + }, + { + "auxiliary_loss_clip": 0.01588306, + "auxiliary_loss_mlp": 0.01048298, + "balance_loss_clip": 1.36570191, + "balance_loss_mlp": 1.02326393, + "epoch": 0.16401623327822035, + "flos": 21480977629440.0, + "grad_norm": 1.5288230464573362, + "language_loss": 0.83873481, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.86510086, + "num_input_tokens_seen": 59017930, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.25048828, + "step": 2728, + "time_per_iteration": 2.853574275970459 + }, + { + "auxiliary_loss_clip": 0.01608897, + "auxiliary_loss_mlp": 0.01048966, + "balance_loss_clip": 1.3768611, + "balance_loss_mlp": 1.02181005, + "epoch": 0.1640763565308883, + "flos": 13634192442240.0, + "grad_norm": 2.021904478365285, + "language_loss": 0.86451054, + "learning_rate": 3.814371879489633e-06, + "loss": 0.8910892, + "num_input_tokens_seen": 59035130, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.27124023, + "step": 2729, + "time_per_iteration": 2.8147494792938232 + }, + { + "auxiliary_loss_clip": 0.01597602, + "auxiliary_loss_mlp": 0.01051486, + "balance_loss_clip": 1.36969411, + "balance_loss_mlp": 1.02363825, + "epoch": 0.16413647978355628, + "flos": 15460142595840.0, + "grad_norm": 1.856458318423306, + "language_loss": 0.73904687, + "learning_rate": 3.814207986905616e-06, + "loss": 0.76553774, + "num_input_tokens_seen": 59053080, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.27880859, + "step": 2730, + "time_per_iteration": 2.8573720455169678 + }, + { + "auxiliary_loss_clip": 0.01603044, + "auxiliary_loss_mlp": 0.01054538, + "balance_loss_clip": 1.37058663, + "balance_loss_mlp": 1.02514148, + "epoch": 0.16419660303622427, + "flos": 45894759672960.0, + "grad_norm": 1.5285417413020395, + "language_loss": 0.75369555, + "learning_rate": 3.814044025526651e-06, + "loss": 0.78027141, + "num_input_tokens_seen": 59075610, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.29394531, + "step": 2731, + "time_per_iteration": 3.055819511413574 + }, + { + "auxiliary_loss_clip": 0.01605237, + "auxiliary_loss_mlp": 0.01049768, + "balance_loss_clip": 1.37328148, + "balance_loss_mlp": 1.02245712, + "epoch": 0.16425672628889224, + "flos": 18962083077120.0, + "grad_norm": 2.096944475819252, + "language_loss": 0.8103013, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.83685136, + "num_input_tokens_seen": 59094555, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.27319336, + "step": 2732, + "time_per_iteration": 2.863966703414917 + }, + { + "auxiliary_loss_clip": 0.01596423, + "auxiliary_loss_mlp": 0.01051613, + "balance_loss_clip": 1.36686659, + "balance_loss_mlp": 1.02488601, + "epoch": 0.1643168495415602, + "flos": 24322803454080.0, + "grad_norm": 1.8209462009688147, + "language_loss": 0.70347261, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.72995305, + "num_input_tokens_seen": 59113515, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.26733398, + "step": 2733, + "time_per_iteration": 2.928468704223633 + }, + { + "auxiliary_loss_clip": 0.01604449, + "auxiliary_loss_mlp": 0.01060298, + "balance_loss_clip": 1.37542129, + "balance_loss_mlp": 1.03130674, + "epoch": 0.16437697279422817, + "flos": 26438583669120.0, + "grad_norm": 1.669515368625835, + "language_loss": 0.82145083, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.8480984, + "num_input_tokens_seen": 59133275, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.28955078, + "step": 2734, + "time_per_iteration": 2.9648277759552 + }, + { + "auxiliary_loss_clip": 0.01595056, + "auxiliary_loss_mlp": 0.01051337, + "balance_loss_clip": 1.36549258, + "balance_loss_mlp": 1.02403808, + "epoch": 0.16443709604689613, + "flos": 34545536311680.0, + "grad_norm": 2.2059251172237246, + "language_loss": 0.82975954, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.85622346, + "num_input_tokens_seen": 59154095, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.27294922, + "step": 2735, + "time_per_iteration": 4.3453216552734375 + }, + { + "auxiliary_loss_clip": 0.01584857, + "auxiliary_loss_mlp": 0.01047989, + "balance_loss_clip": 1.36327088, + "balance_loss_mlp": 1.02187061, + "epoch": 0.1644972192995641, + "flos": 23268397196160.0, + "grad_norm": 2.3714330018903627, + "language_loss": 0.79278493, + "learning_rate": 3.813223186925296e-06, + "loss": 0.81911337, + "num_input_tokens_seen": 59173795, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.26098633, + "step": 2736, + "time_per_iteration": 2.904789447784424 + }, + { + "auxiliary_loss_clip": 0.01604648, + "auxiliary_loss_mlp": 0.01060634, + "balance_loss_clip": 1.37664306, + "balance_loss_mlp": 1.03309703, + "epoch": 0.1645573425522321, + "flos": 26990843425920.0, + "grad_norm": 2.026261997995536, + "language_loss": 0.82117724, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.84783012, + "num_input_tokens_seen": 59191610, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.27563477, + "step": 2737, + "time_per_iteration": 2.9587416648864746 + }, + { + "auxiliary_loss_clip": 0.01608238, + "auxiliary_loss_mlp": 0.01052486, + "balance_loss_clip": 1.37728405, + "balance_loss_mlp": 1.0272851, + "epoch": 0.16461746580490005, + "flos": 28743759192960.0, + "grad_norm": 1.7758283393732583, + "language_loss": 0.88713461, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.91374183, + "num_input_tokens_seen": 59213000, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.25170898, + "step": 2738, + "time_per_iteration": 2.9515221118927 + }, + { + "auxiliary_loss_clip": 0.0160604, + "auxiliary_loss_mlp": 0.01055116, + "balance_loss_clip": 1.37575412, + "balance_loss_mlp": 1.02800739, + "epoch": 0.16467758905756802, + "flos": 24939320106240.0, + "grad_norm": 1.808680046642556, + "language_loss": 0.72810495, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.75471652, + "num_input_tokens_seen": 59232340, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.27148438, + "step": 2739, + "time_per_iteration": 2.8563249111175537 + }, + { + "auxiliary_loss_clip": 0.01600997, + "auxiliary_loss_mlp": 0.01052333, + "balance_loss_clip": 1.37344003, + "balance_loss_mlp": 1.02562976, + "epoch": 0.16473771231023598, + "flos": 24837166051200.0, + "grad_norm": 1.6378156737256593, + "language_loss": 0.82714778, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.85368109, + "num_input_tokens_seen": 59253950, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.26733398, + "step": 2740, + "time_per_iteration": 4.390120983123779 + }, + { + "auxiliary_loss_clip": 0.01622298, + "auxiliary_loss_mlp": 0.01069029, + "balance_loss_clip": 1.38910198, + "balance_loss_mlp": 1.04134881, + "epoch": 0.16479783556290395, + "flos": 39910871658240.0, + "grad_norm": 2.177495743261992, + "language_loss": 0.70371485, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.73062813, + "num_input_tokens_seen": 59275545, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.27685547, + "step": 2741, + "time_per_iteration": 3.0364878177642822 + }, + { + "auxiliary_loss_clip": 0.01626802, + "auxiliary_loss_mlp": 0.01063983, + "balance_loss_clip": 1.3949697, + "balance_loss_mlp": 1.03633881, + "epoch": 0.16485795881557191, + "flos": 19905422054400.0, + "grad_norm": 1.773815238008752, + "language_loss": 0.80618739, + "learning_rate": 3.812235911671472e-06, + "loss": 0.83309525, + "num_input_tokens_seen": 59293480, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.27648926, + "step": 2742, + "time_per_iteration": 2.8167736530303955 + }, + { + "auxiliary_loss_clip": 0.01616064, + "auxiliary_loss_mlp": 0.01059975, + "balance_loss_clip": 1.38725471, + "balance_loss_mlp": 1.03225923, + "epoch": 0.16491808206823988, + "flos": 20565265242240.0, + "grad_norm": 1.6991793314733272, + "language_loss": 0.86206609, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.88882649, + "num_input_tokens_seen": 59313435, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.27709961, + "step": 2743, + "time_per_iteration": 5.794186353683472 + }, + { + "auxiliary_loss_clip": 0.01601915, + "auxiliary_loss_mlp": 0.01054882, + "balance_loss_clip": 1.37516189, + "balance_loss_mlp": 1.02928829, + "epoch": 0.16497820532090787, + "flos": 23810114914560.0, + "grad_norm": 1.5287917868819372, + "language_loss": 0.86309063, + "learning_rate": 3.811906270092265e-06, + "loss": 0.88965857, + "num_input_tokens_seen": 59331535, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.25634766, + "step": 2744, + "time_per_iteration": 2.83689284324646 + }, + { + "auxiliary_loss_clip": 0.01589881, + "auxiliary_loss_mlp": 0.01057203, + "balance_loss_clip": 1.36994553, + "balance_loss_mlp": 1.03156066, + "epoch": 0.16503832857357584, + "flos": 25493299165440.0, + "grad_norm": 1.7879826868946753, + "language_loss": 0.83612132, + "learning_rate": 3.811741346238036e-06, + "loss": 0.86259222, + "num_input_tokens_seen": 59350680, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25646973, + "step": 2745, + "time_per_iteration": 2.9046008586883545 + }, + { + "auxiliary_loss_clip": 0.0161549, + "auxiliary_loss_mlp": 0.0106651, + "balance_loss_clip": 1.38643146, + "balance_loss_mlp": 1.03931832, + "epoch": 0.1650984518262438, + "flos": 17685044565120.0, + "grad_norm": 1.8826265246282474, + "language_loss": 0.77842623, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.80524623, + "num_input_tokens_seen": 59367020, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.27172852, + "step": 2746, + "time_per_iteration": 2.894489049911499 + }, + { + "auxiliary_loss_clip": 0.01599921, + "auxiliary_loss_mlp": 0.01055336, + "balance_loss_clip": 1.37301099, + "balance_loss_mlp": 1.02934813, + "epoch": 0.16515857507891177, + "flos": 18707752200960.0, + "grad_norm": 1.6605029303979764, + "language_loss": 0.8120116, + "learning_rate": 3.811411292431592e-06, + "loss": 0.83856416, + "num_input_tokens_seen": 59386075, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.26000977, + "step": 2747, + "time_per_iteration": 2.8995914459228516 + }, + { + "auxiliary_loss_clip": 0.01617245, + "auxiliary_loss_mlp": 0.01054455, + "balance_loss_clip": 1.39019036, + "balance_loss_mlp": 1.02838373, + "epoch": 0.16521869833157973, + "flos": 15018226202880.0, + "grad_norm": 6.6997436522379195, + "language_loss": 0.71750724, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.74422419, + "num_input_tokens_seen": 59402690, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.26074219, + "step": 2748, + "time_per_iteration": 2.8298449516296387 + }, + { + "auxiliary_loss_clip": 0.0160504, + "auxiliary_loss_mlp": 0.01059302, + "balance_loss_clip": 1.37904882, + "balance_loss_mlp": 1.03233707, + "epoch": 0.1652788215842477, + "flos": 22129690596480.0, + "grad_norm": 2.3616412096320434, + "language_loss": 0.89128625, + "learning_rate": 3.811080963869561e-06, + "loss": 0.91792971, + "num_input_tokens_seen": 59421130, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.26953125, + "step": 2749, + "time_per_iteration": 2.833669424057007 + }, + { + "auxiliary_loss_clip": 0.01602778, + "auxiliary_loss_mlp": 0.01048203, + "balance_loss_clip": 1.37270153, + "balance_loss_mlp": 1.02160788, + "epoch": 0.16533894483691566, + "flos": 18342308799360.0, + "grad_norm": 1.8462377256156128, + "language_loss": 0.80379081, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.83030057, + "num_input_tokens_seen": 59438970, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.26623535, + "step": 2750, + "time_per_iteration": 2.877185106277466 + }, + { + "auxiliary_loss_clip": 0.01598146, + "auxiliary_loss_mlp": 0.01049827, + "balance_loss_clip": 1.37143171, + "balance_loss_mlp": 1.02219439, + "epoch": 0.16539906808958366, + "flos": 22392165536640.0, + "grad_norm": 1.6527178372335332, + "language_loss": 0.95802462, + "learning_rate": 3.8107503606020455e-06, + "loss": 0.9845044, + "num_input_tokens_seen": 59458510, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.27624512, + "step": 2751, + "time_per_iteration": 2.852721691131592 + }, + { + "auxiliary_loss_clip": 0.01601404, + "auxiliary_loss_mlp": 0.01050252, + "balance_loss_clip": 1.3777492, + "balance_loss_mlp": 1.02342963, + "epoch": 0.16545919134225162, + "flos": 22721385836160.0, + "grad_norm": 2.9745700888473676, + "language_loss": 0.71423185, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.74074841, + "num_input_tokens_seen": 59477110, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.26806641, + "step": 2752, + "time_per_iteration": 2.9071462154388428 + }, + { + "auxiliary_loss_clip": 0.01363652, + "auxiliary_loss_mlp": 0.01047124, + "balance_loss_clip": 1.22961998, + "balance_loss_mlp": 1.02538037, + "epoch": 0.1655193145949196, + "flos": 67833263151360.0, + "grad_norm": 0.7816193247288641, + "language_loss": 0.54139882, + "learning_rate": 3.810419482679192e-06, + "loss": 0.56550658, + "num_input_tokens_seen": 59541155, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.21777344, + "step": 2753, + "time_per_iteration": 3.4628214836120605 + }, + { + "auxiliary_loss_clip": 0.01598749, + "auxiliary_loss_mlp": 0.01042787, + "balance_loss_clip": 1.37384295, + "balance_loss_mlp": 1.01654935, + "epoch": 0.16557943784758755, + "flos": 24291195321600.0, + "grad_norm": 2.117350844613627, + "language_loss": 0.76228255, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.78869784, + "num_input_tokens_seen": 59561155, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.26257324, + "step": 2754, + "time_per_iteration": 2.8683784008026123 + }, + { + "auxiliary_loss_clip": 0.01617654, + "auxiliary_loss_mlp": 0.01054258, + "balance_loss_clip": 1.38281131, + "balance_loss_mlp": 1.02517056, + "epoch": 0.16563956110025552, + "flos": 20093098060800.0, + "grad_norm": 2.049923216316248, + "language_loss": 0.88161379, + "learning_rate": 3.810088330151188e-06, + "loss": 0.90833288, + "num_input_tokens_seen": 59580460, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.2902832, + "step": 2755, + "time_per_iteration": 2.913374662399292 + }, + { + "auxiliary_loss_clip": 0.01600084, + "auxiliary_loss_mlp": 0.01053088, + "balance_loss_clip": 1.37415957, + "balance_loss_mlp": 1.02689791, + "epoch": 0.16569968435292348, + "flos": 28045204439040.0, + "grad_norm": 2.171444428920699, + "language_loss": 0.74609953, + "learning_rate": 3.80992265092595e-06, + "loss": 0.77263129, + "num_input_tokens_seen": 59600025, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.26184082, + "step": 2756, + "time_per_iteration": 2.9047093391418457 + }, + { + "auxiliary_loss_clip": 0.01582794, + "auxiliary_loss_mlp": 0.01049403, + "balance_loss_clip": 1.3619163, + "balance_loss_mlp": 1.02293825, + "epoch": 0.16575980760559147, + "flos": 26261449701120.0, + "grad_norm": 1.525332380133221, + "language_loss": 0.7650488, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.79137075, + "num_input_tokens_seen": 59620600, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.2644043, + "step": 2757, + "time_per_iteration": 2.9518797397613525 + }, + { + "auxiliary_loss_clip": 0.01598562, + "auxiliary_loss_mlp": 0.01043807, + "balance_loss_clip": 1.37401974, + "balance_loss_mlp": 1.01742578, + "epoch": 0.16581993085825944, + "flos": 26955479975040.0, + "grad_norm": 1.6351623188186124, + "language_loss": 0.85588235, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.88230598, + "num_input_tokens_seen": 59641385, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26403809, + "step": 2758, + "time_per_iteration": 2.9060680866241455 + }, + { + "auxiliary_loss_clip": 0.01606054, + "auxiliary_loss_mlp": 0.01050195, + "balance_loss_clip": 1.37945616, + "balance_loss_mlp": 1.02381361, + "epoch": 0.1658800541109274, + "flos": 21663812442240.0, + "grad_norm": 1.9506120249273695, + "language_loss": 0.80618447, + "learning_rate": 3.809425201480689e-06, + "loss": 0.83274698, + "num_input_tokens_seen": 59659865, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.26379395, + "step": 2759, + "time_per_iteration": 2.8620259761810303 + }, + { + "auxiliary_loss_clip": 0.01603685, + "auxiliary_loss_mlp": 0.01051071, + "balance_loss_clip": 1.37382078, + "balance_loss_mlp": 1.02452302, + "epoch": 0.16594017736359537, + "flos": 16444138665600.0, + "grad_norm": 2.0966080529369004, + "language_loss": 0.7677834, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.79433095, + "num_input_tokens_seen": 59678780, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.26538086, + "step": 2760, + "time_per_iteration": 2.845219135284424 + }, + { + "auxiliary_loss_clip": 0.01612188, + "auxiliary_loss_mlp": 0.01053424, + "balance_loss_clip": 1.38032818, + "balance_loss_mlp": 1.02557635, + "epoch": 0.16600030061626334, + "flos": 22647356064000.0, + "grad_norm": 1.6984243941316202, + "language_loss": 0.74273872, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.76939487, + "num_input_tokens_seen": 59698795, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.27856445, + "step": 2761, + "time_per_iteration": 2.850893497467041 + }, + { + "auxiliary_loss_clip": 0.01605943, + "auxiliary_loss_mlp": 0.01049256, + "balance_loss_clip": 1.37816739, + "balance_loss_mlp": 1.02121794, + "epoch": 0.1660604238689313, + "flos": 26407925677440.0, + "grad_norm": 1.800612317312686, + "language_loss": 0.89373171, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.92028368, + "num_input_tokens_seen": 59718795, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.28027344, + "step": 2762, + "time_per_iteration": 2.905747413635254 + }, + { + "auxiliary_loss_clip": 0.01609356, + "auxiliary_loss_mlp": 0.01049375, + "balance_loss_clip": 1.3762269, + "balance_loss_mlp": 1.0181663, + "epoch": 0.16612054712159927, + "flos": 23050515646080.0, + "grad_norm": 1.6678352312702054, + "language_loss": 0.89313686, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.91972417, + "num_input_tokens_seen": 59737555, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.31225586, + "step": 2763, + "time_per_iteration": 2.8344943523406982 + }, + { + "auxiliary_loss_clip": 0.01383506, + "auxiliary_loss_mlp": 0.01026317, + "balance_loss_clip": 1.24781585, + "balance_loss_mlp": 1.00915086, + "epoch": 0.16618067037426726, + "flos": 59272512716160.0, + "grad_norm": 0.7818588704811839, + "language_loss": 0.59703863, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.62113678, + "num_input_tokens_seen": 59800915, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.171875, + "step": 2764, + "time_per_iteration": 3.3896310329437256 + }, + { + "auxiliary_loss_clip": 0.01600294, + "auxiliary_loss_mlp": 0.01054392, + "balance_loss_clip": 1.37272048, + "balance_loss_mlp": 1.02625847, + "epoch": 0.16624079362693522, + "flos": 27210263299200.0, + "grad_norm": 3.0038514368067455, + "language_loss": 0.8278631, + "learning_rate": 3.808428450193401e-06, + "loss": 0.85440993, + "num_input_tokens_seen": 59822910, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.28149414, + "step": 2765, + "time_per_iteration": 2.929936647415161 + }, + { + "auxiliary_loss_clip": 0.01630398, + "auxiliary_loss_mlp": 0.01048708, + "balance_loss_clip": 1.39151156, + "balance_loss_mlp": 1.01988316, + "epoch": 0.1663009168796032, + "flos": 10932010629120.0, + "grad_norm": 2.2816297940319896, + "language_loss": 0.71593374, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.74272478, + "num_input_tokens_seen": 59838805, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.28796387, + "step": 2766, + "time_per_iteration": 2.7745814323425293 + }, + { + "auxiliary_loss_clip": 0.01611286, + "auxiliary_loss_mlp": 0.01047835, + "balance_loss_clip": 1.38481927, + "balance_loss_mlp": 1.02034569, + "epoch": 0.16636104013227115, + "flos": 17903333318400.0, + "grad_norm": 2.1142283960677584, + "language_loss": 0.90370381, + "learning_rate": 3.808095651090769e-06, + "loss": 0.93029505, + "num_input_tokens_seen": 59855345, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.27453613, + "step": 2767, + "time_per_iteration": 2.8963611125946045 + }, + { + "auxiliary_loss_clip": 0.01375673, + "auxiliary_loss_mlp": 0.0102693, + "balance_loss_clip": 1.24150515, + "balance_loss_mlp": 1.00995469, + "epoch": 0.16642116338493912, + "flos": 66760641221760.0, + "grad_norm": 0.6493214302939047, + "language_loss": 0.53006876, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.55409479, + "num_input_tokens_seen": 59917710, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.16992188, + "step": 2768, + "time_per_iteration": 3.4092953205108643 + }, + { + "auxiliary_loss_clip": 0.01629141, + "auxiliary_loss_mlp": 0.01046672, + "balance_loss_clip": 1.39357114, + "balance_loss_mlp": 1.01930106, + "epoch": 0.16648128663760708, + "flos": 19035388932480.0, + "grad_norm": 2.221244132513017, + "language_loss": 0.86572361, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.89248168, + "num_input_tokens_seen": 59935105, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.27380371, + "step": 2769, + "time_per_iteration": 2.875262975692749 + }, + { + "auxiliary_loss_clip": 0.01378907, + "auxiliary_loss_mlp": 0.01039843, + "balance_loss_clip": 1.24386811, + "balance_loss_mlp": 1.01924348, + "epoch": 0.16654140989027508, + "flos": 70165069879680.0, + "grad_norm": 0.8256878176546514, + "language_loss": 0.5762465, + "learning_rate": 3.80759593822885e-06, + "loss": 0.60043406, + "num_input_tokens_seen": 59984085, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.20605469, + "step": 2770, + "time_per_iteration": 4.558475017547607 + }, + { + "auxiliary_loss_clip": 0.01371159, + "auxiliary_loss_mlp": 0.01032562, + "balance_loss_clip": 1.23561716, + "balance_loss_mlp": 1.01177144, + "epoch": 0.16660153314294304, + "flos": 70300325145600.0, + "grad_norm": 0.8669892154633343, + "language_loss": 0.56280625, + "learning_rate": 3.807429230178015e-06, + "loss": 0.58684343, + "num_input_tokens_seen": 60043470, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.20800781, + "step": 2771, + "time_per_iteration": 3.1377315521240234 + }, + { + "auxiliary_loss_clip": 0.0161457, + "auxiliary_loss_mlp": 0.01058525, + "balance_loss_clip": 1.38789535, + "balance_loss_mlp": 1.02974844, + "epoch": 0.166661656395611, + "flos": 23085200424960.0, + "grad_norm": 1.9914496041899008, + "language_loss": 0.71804589, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.74477684, + "num_input_tokens_seen": 60063045, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.2878418, + "step": 2772, + "time_per_iteration": 2.93401837348938 + }, + { + "auxiliary_loss_clip": 0.01599499, + "auxiliary_loss_mlp": 0.01049049, + "balance_loss_clip": 1.37349856, + "balance_loss_mlp": 1.02048635, + "epoch": 0.16672177964827897, + "flos": 28378089567360.0, + "grad_norm": 1.7744444628120086, + "language_loss": 0.87364137, + "learning_rate": 3.807095608468975e-06, + "loss": 0.90012681, + "num_input_tokens_seen": 60081945, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.28564453, + "step": 2773, + "time_per_iteration": 2.962458848953247 + }, + { + "auxiliary_loss_clip": 0.016082, + "auxiliary_loss_mlp": 0.01050883, + "balance_loss_clip": 1.38043809, + "balance_loss_mlp": 1.02253485, + "epoch": 0.16678190290094694, + "flos": 19098288483840.0, + "grad_norm": 2.9035173147030573, + "language_loss": 0.82424635, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.85083717, + "num_input_tokens_seen": 60096820, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.28320312, + "step": 2774, + "time_per_iteration": 2.8210043907165527 + }, + { + "auxiliary_loss_clip": 0.0161443, + "auxiliary_loss_mlp": 0.01052821, + "balance_loss_clip": 1.38447618, + "balance_loss_mlp": 1.02337623, + "epoch": 0.1668420261536149, + "flos": 21809202543360.0, + "grad_norm": 2.6816967603304107, + "language_loss": 0.84382415, + "learning_rate": 3.806761712658952e-06, + "loss": 0.87049669, + "num_input_tokens_seen": 60116140, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.29443359, + "step": 2775, + "time_per_iteration": 2.9129908084869385 + }, + { + "auxiliary_loss_clip": 0.01601791, + "auxiliary_loss_mlp": 0.01059828, + "balance_loss_clip": 1.37349606, + "balance_loss_mlp": 1.03233802, + "epoch": 0.16690214940628287, + "flos": 19071883503360.0, + "grad_norm": 1.7854333428420543, + "language_loss": 0.81832075, + "learning_rate": 3.806594661981897e-06, + "loss": 0.84493691, + "num_input_tokens_seen": 60134235, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.27490234, + "step": 2776, + "time_per_iteration": 4.377525091171265 + }, + { + "auxiliary_loss_clip": 0.01594496, + "auxiliary_loss_mlp": 0.01051699, + "balance_loss_clip": 1.37337494, + "balance_loss_mlp": 1.02389908, + "epoch": 0.16696227265895086, + "flos": 18597861285120.0, + "grad_norm": 1.776314486351668, + "language_loss": 0.80625343, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.83271539, + "num_input_tokens_seen": 60153275, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.27832031, + "step": 2777, + "time_per_iteration": 2.8709166049957275 + }, + { + "auxiliary_loss_clip": 0.01609522, + "auxiliary_loss_mlp": 0.01055268, + "balance_loss_clip": 1.38025033, + "balance_loss_mlp": 1.02638376, + "epoch": 0.16702239591161883, + "flos": 23304394074240.0, + "grad_norm": 1.7477294245594004, + "language_loss": 0.85741556, + "learning_rate": 3.806260355115371e-06, + "loss": 0.88406348, + "num_input_tokens_seen": 60173215, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.28894043, + "step": 2778, + "time_per_iteration": 4.250521183013916 + }, + { + "auxiliary_loss_clip": 0.016164, + "auxiliary_loss_mlp": 0.01053506, + "balance_loss_clip": 1.38450801, + "balance_loss_mlp": 1.02558708, + "epoch": 0.1670825191642868, + "flos": 24436313953920.0, + "grad_norm": 1.9941612244790587, + "language_loss": 0.75028366, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.77698267, + "num_input_tokens_seen": 60190515, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.27929688, + "step": 2779, + "time_per_iteration": 4.389432191848755 + }, + { + "auxiliary_loss_clip": 0.01619454, + "auxiliary_loss_mlp": 0.01056769, + "balance_loss_clip": 1.38575304, + "balance_loss_mlp": 1.02734804, + "epoch": 0.16714264241695476, + "flos": 26808868264320.0, + "grad_norm": 1.9632521668421017, + "language_loss": 0.66767561, + "learning_rate": 3.805925774274554e-06, + "loss": 0.69443786, + "num_input_tokens_seen": 60211655, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.29418945, + "step": 2780, + "time_per_iteration": 2.913623094558716 + }, + { + "auxiliary_loss_clip": 0.01611811, + "auxiliary_loss_mlp": 0.01050401, + "balance_loss_clip": 1.38283253, + "balance_loss_mlp": 1.02226758, + "epoch": 0.16720276566962272, + "flos": 21845335155840.0, + "grad_norm": 2.2742941893791433, + "language_loss": 0.79272687, + "learning_rate": 3.805758381129643e-06, + "loss": 0.81934893, + "num_input_tokens_seen": 60230860, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.28125, + "step": 2781, + "time_per_iteration": 2.8073558807373047 + }, + { + "auxiliary_loss_clip": 0.01616329, + "auxiliary_loss_mlp": 0.01044999, + "balance_loss_clip": 1.38484335, + "balance_loss_mlp": 1.01754498, + "epoch": 0.1672628889222907, + "flos": 21480344202240.0, + "grad_norm": 1.4941241268741021, + "language_loss": 0.75631189, + "learning_rate": 3.805590919510193e-06, + "loss": 0.78292513, + "num_input_tokens_seen": 60250535, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.27441406, + "step": 2782, + "time_per_iteration": 2.857926607131958 + }, + { + "auxiliary_loss_clip": 0.01634169, + "auxiliary_loss_mlp": 0.01050465, + "balance_loss_clip": 1.39194274, + "balance_loss_mlp": 1.02248621, + "epoch": 0.16732301217495865, + "flos": 30786278797440.0, + "grad_norm": 2.01513793046619, + "language_loss": 0.68764257, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.71448886, + "num_input_tokens_seen": 60269530, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.27966309, + "step": 2783, + "time_per_iteration": 2.880960464477539 + }, + { + "auxiliary_loss_clip": 0.01610119, + "auxiliary_loss_mlp": 0.01049322, + "balance_loss_clip": 1.38040268, + "balance_loss_mlp": 1.0232029, + "epoch": 0.16738313542762664, + "flos": 23484423709440.0, + "grad_norm": 1.678104323982807, + "language_loss": 0.70707327, + "learning_rate": 3.805255790873081e-06, + "loss": 0.73366761, + "num_input_tokens_seen": 60289900, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.26135254, + "step": 2784, + "time_per_iteration": 2.884366035461426 + }, + { + "auxiliary_loss_clip": 0.01609809, + "auxiliary_loss_mlp": 0.01054782, + "balance_loss_clip": 1.3765316, + "balance_loss_mlp": 1.02497923, + "epoch": 0.1674432586802946, + "flos": 29801151607680.0, + "grad_norm": 1.8626186891941043, + "language_loss": 0.62335467, + "learning_rate": 3.805088123868126e-06, + "loss": 0.65000057, + "num_input_tokens_seen": 60310025, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.29785156, + "step": 2785, + "time_per_iteration": 2.8714518547058105 + }, + { + "auxiliary_loss_clip": 0.0133823, + "auxiliary_loss_mlp": 0.0105088, + "balance_loss_clip": 1.20751691, + "balance_loss_mlp": 1.03533506, + "epoch": 0.16750338193296258, + "flos": 66168900737280.0, + "grad_norm": 0.8034414241294885, + "language_loss": 0.589607, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.61349815, + "num_input_tokens_seen": 60377800, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.15527344, + "step": 2786, + "time_per_iteration": 3.4300501346588135 + }, + { + "auxiliary_loss_clip": 0.01623086, + "auxiliary_loss_mlp": 0.01053165, + "balance_loss_clip": 1.38767767, + "balance_loss_mlp": 1.0249958, + "epoch": 0.16756350518563054, + "flos": 25706249032320.0, + "grad_norm": 1.6358852934009838, + "language_loss": 0.77025211, + "learning_rate": 3.80475258451721e-06, + "loss": 0.79701459, + "num_input_tokens_seen": 60398215, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.28137207, + "step": 2787, + "time_per_iteration": 2.9150922298431396 + }, + { + "auxiliary_loss_clip": 0.01624278, + "auxiliary_loss_mlp": 0.01054225, + "balance_loss_clip": 1.39066386, + "balance_loss_mlp": 1.02712882, + "epoch": 0.1676236284382985, + "flos": 23845116407040.0, + "grad_norm": 1.86449052251365, + "language_loss": 0.78385091, + "learning_rate": 3.804584712183972e-06, + "loss": 0.81063592, + "num_input_tokens_seen": 60416910, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.27111816, + "step": 2788, + "time_per_iteration": 2.8436031341552734 + }, + { + "auxiliary_loss_clip": 0.01350726, + "auxiliary_loss_mlp": 0.01065554, + "balance_loss_clip": 1.21678388, + "balance_loss_mlp": 1.04953194, + "epoch": 0.16768375169096647, + "flos": 59900367830400.0, + "grad_norm": 0.8723305933151231, + "language_loss": 0.59402454, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.61818731, + "num_input_tokens_seen": 60468660, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.16015625, + "step": 2789, + "time_per_iteration": 3.2209463119506836 + }, + { + "auxiliary_loss_clip": 0.01627909, + "auxiliary_loss_mlp": 0.01069503, + "balance_loss_clip": 1.39139032, + "balance_loss_mlp": 1.03905678, + "epoch": 0.16774387494363446, + "flos": 38449821968640.0, + "grad_norm": 1.4652685085799015, + "language_loss": 0.71048516, + "learning_rate": 3.804248762233765e-06, + "loss": 0.7374593, + "num_input_tokens_seen": 60492370, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.30444336, + "step": 2790, + "time_per_iteration": 3.08988881111145 + }, + { + "auxiliary_loss_clip": 0.01626645, + "auxiliary_loss_mlp": 0.01068408, + "balance_loss_clip": 1.39407873, + "balance_loss_mlp": 1.0415504, + "epoch": 0.16780399819630243, + "flos": 22647763267200.0, + "grad_norm": 1.8347936673068679, + "language_loss": 0.80022144, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.82717198, + "num_input_tokens_seen": 60512655, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.26855469, + "step": 2791, + "time_per_iteration": 2.8578810691833496 + }, + { + "auxiliary_loss_clip": 0.01625309, + "auxiliary_loss_mlp": 0.01070108, + "balance_loss_clip": 1.39460397, + "balance_loss_mlp": 1.04159367, + "epoch": 0.1678641214489704, + "flos": 32904683210880.0, + "grad_norm": 1.6821492542686085, + "language_loss": 0.72790956, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.75486368, + "num_input_tokens_seen": 60533090, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.28491211, + "step": 2792, + "time_per_iteration": 3.012328624725342 + }, + { + "auxiliary_loss_clip": 0.01614637, + "auxiliary_loss_mlp": 0.01064513, + "balance_loss_clip": 1.38145101, + "balance_loss_mlp": 1.03709435, + "epoch": 0.16792424470163836, + "flos": 19984564488960.0, + "grad_norm": 1.890484521125794, + "language_loss": 0.72768176, + "learning_rate": 3.803744324194691e-06, + "loss": 0.75447333, + "num_input_tokens_seen": 60553190, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.27441406, + "step": 2793, + "time_per_iteration": 2.834085464477539 + }, + { + "auxiliary_loss_clip": 0.01615966, + "auxiliary_loss_mlp": 0.01063767, + "balance_loss_clip": 1.38325179, + "balance_loss_mlp": 1.03569293, + "epoch": 0.16798436795430632, + "flos": 19729554940800.0, + "grad_norm": 1.8527835437950408, + "language_loss": 0.78148866, + "learning_rate": 3.803576041376831e-06, + "loss": 0.80828601, + "num_input_tokens_seen": 60571995, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.28088379, + "step": 2794, + "time_per_iteration": 2.8562545776367188 + }, + { + "auxiliary_loss_clip": 0.01617735, + "auxiliary_loss_mlp": 0.01057524, + "balance_loss_clip": 1.38319361, + "balance_loss_mlp": 1.02918768, + "epoch": 0.1680444912069743, + "flos": 28114935955200.0, + "grad_norm": 2.107896202389036, + "language_loss": 0.72491789, + "learning_rate": 3.803407690167187e-06, + "loss": 0.75167048, + "num_input_tokens_seen": 60591275, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.2833252, + "step": 2795, + "time_per_iteration": 2.916116714477539 + }, + { + "auxiliary_loss_clip": 0.01600953, + "auxiliary_loss_mlp": 0.01060805, + "balance_loss_clip": 1.37129819, + "balance_loss_mlp": 1.03312504, + "epoch": 0.16810461445964225, + "flos": 18083724912000.0, + "grad_norm": 1.9095803018111115, + "language_loss": 0.84981704, + "learning_rate": 3.803239270572142e-06, + "loss": 0.87643456, + "num_input_tokens_seen": 60609235, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.27661133, + "step": 2796, + "time_per_iteration": 2.8631703853607178 + }, + { + "auxiliary_loss_clip": 0.01631265, + "auxiliary_loss_mlp": 0.01060051, + "balance_loss_clip": 1.3931191, + "balance_loss_mlp": 1.03217983, + "epoch": 0.16816473771231025, + "flos": 23889257349120.0, + "grad_norm": 2.156130284384972, + "language_loss": 0.83095014, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.85786331, + "num_input_tokens_seen": 60629880, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.27868652, + "step": 2797, + "time_per_iteration": 2.881422996520996 + }, + { + "auxiliary_loss_clip": 0.01592851, + "auxiliary_loss_mlp": 0.01050881, + "balance_loss_clip": 1.36933303, + "balance_loss_mlp": 1.02471471, + "epoch": 0.1682248609649782, + "flos": 22793786795520.0, + "grad_norm": 1.409335606914219, + "language_loss": 0.76224887, + "learning_rate": 3.802902226251401e-06, + "loss": 0.78868616, + "num_input_tokens_seen": 60651175, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26171875, + "step": 2798, + "time_per_iteration": 2.8413352966308594 + }, + { + "auxiliary_loss_clip": 0.01623268, + "auxiliary_loss_mlp": 0.01052766, + "balance_loss_clip": 1.39022005, + "balance_loss_mlp": 1.02570581, + "epoch": 0.16828498421764618, + "flos": 20715089333760.0, + "grad_norm": 1.392369436209934, + "language_loss": 0.80759901, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.83435929, + "num_input_tokens_seen": 60670210, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.27099609, + "step": 2799, + "time_per_iteration": 2.871710777282715 + }, + { + "auxiliary_loss_clip": 0.0162851, + "auxiliary_loss_mlp": 0.01048016, + "balance_loss_clip": 1.39297366, + "balance_loss_mlp": 1.01917887, + "epoch": 0.16834510747031414, + "flos": 29431636174080.0, + "grad_norm": 2.0399396974007438, + "language_loss": 0.71797371, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.74473894, + "num_input_tokens_seen": 60690895, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.28808594, + "step": 2800, + "time_per_iteration": 2.903916358947754 + }, + { + "auxiliary_loss_clip": 0.01619026, + "auxiliary_loss_mlp": 0.01049481, + "balance_loss_clip": 1.38622439, + "balance_loss_mlp": 1.02126408, + "epoch": 0.1684052307229821, + "flos": 18153184959360.0, + "grad_norm": 1.973732544480156, + "language_loss": 0.847247, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.87393206, + "num_input_tokens_seen": 60708280, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.28186035, + "step": 2801, + "time_per_iteration": 2.8598334789276123 + }, + { + "auxiliary_loss_clip": 0.01630832, + "auxiliary_loss_mlp": 0.01052073, + "balance_loss_clip": 1.39588833, + "balance_loss_mlp": 1.02515531, + "epoch": 0.16846535397565007, + "flos": 16581927640320.0, + "grad_norm": 2.1392848212626583, + "language_loss": 0.8443954, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.87122446, + "num_input_tokens_seen": 60724150, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.26953125, + "step": 2802, + "time_per_iteration": 2.815573215484619 + }, + { + "auxiliary_loss_clip": 0.01629104, + "auxiliary_loss_mlp": 0.01057139, + "balance_loss_clip": 1.39627147, + "balance_loss_mlp": 1.02783751, + "epoch": 0.16852547722831807, + "flos": 30420971130240.0, + "grad_norm": 1.4172072361107468, + "language_loss": 0.82270634, + "learning_rate": 3.802058419152413e-06, + "loss": 0.84956872, + "num_input_tokens_seen": 60746485, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.29321289, + "step": 2803, + "time_per_iteration": 2.9673140048980713 + }, + { + "auxiliary_loss_clip": 0.01616471, + "auxiliary_loss_mlp": 0.01044616, + "balance_loss_clip": 1.38573802, + "balance_loss_mlp": 1.0167923, + "epoch": 0.16858560048098603, + "flos": 33519616295040.0, + "grad_norm": 2.4350058750240655, + "language_loss": 0.77881527, + "learning_rate": 3.801889452704297e-06, + "loss": 0.80542612, + "num_input_tokens_seen": 60762875, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.27832031, + "step": 2804, + "time_per_iteration": 2.990938901901245 + }, + { + "auxiliary_loss_clip": 0.01326541, + "auxiliary_loss_mlp": 0.01051316, + "balance_loss_clip": 1.19291353, + "balance_loss_mlp": 1.02747393, + "epoch": 0.168645723733654, + "flos": 67402069776000.0, + "grad_norm": 0.873745274085285, + "language_loss": 0.55485404, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.57863265, + "num_input_tokens_seen": 60825510, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.23828125, + "step": 2805, + "time_per_iteration": 4.784535884857178 + }, + { + "auxiliary_loss_clip": 0.01608137, + "auxiliary_loss_mlp": 0.01047849, + "balance_loss_clip": 1.38376284, + "balance_loss_mlp": 1.02176571, + "epoch": 0.16870584698632196, + "flos": 21334546897920.0, + "grad_norm": 1.904249226389861, + "language_loss": 0.74102151, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.76758146, + "num_input_tokens_seen": 60844440, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.26123047, + "step": 2806, + "time_per_iteration": 2.864797830581665 + }, + { + "auxiliary_loss_clip": 0.01596721, + "auxiliary_loss_mlp": 0.01049811, + "balance_loss_clip": 1.37034154, + "balance_loss_mlp": 1.0219872, + "epoch": 0.16876597023898993, + "flos": 20750498029440.0, + "grad_norm": 1.7486331085808124, + "language_loss": 0.71319467, + "learning_rate": 3.80138214341862e-06, + "loss": 0.73966002, + "num_input_tokens_seen": 60863210, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.2779541, + "step": 2807, + "time_per_iteration": 2.9024243354797363 + }, + { + "auxiliary_loss_clip": 0.01606184, + "auxiliary_loss_mlp": 0.01049983, + "balance_loss_clip": 1.37690067, + "balance_loss_mlp": 1.02142072, + "epoch": 0.1688260934916579, + "flos": 20313287095680.0, + "grad_norm": 2.3214225518164673, + "language_loss": 0.7161938, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.74275547, + "num_input_tokens_seen": 60882510, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.28552246, + "step": 2808, + "time_per_iteration": 2.837751865386963 + }, + { + "auxiliary_loss_clip": 0.01628368, + "auxiliary_loss_mlp": 0.01049174, + "balance_loss_clip": 1.3933022, + "balance_loss_mlp": 1.02006352, + "epoch": 0.16888621674432586, + "flos": 20350686562560.0, + "grad_norm": 2.161041544060095, + "language_loss": 0.81397098, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.8407464, + "num_input_tokens_seen": 60901105, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.29150391, + "step": 2809, + "time_per_iteration": 2.8653998374938965 + }, + { + "auxiliary_loss_clip": 0.01628159, + "auxiliary_loss_mlp": 0.0105088, + "balance_loss_clip": 1.3919313, + "balance_loss_mlp": 1.02324688, + "epoch": 0.16894633999699385, + "flos": 16251259507200.0, + "grad_norm": 1.9906322657510123, + "language_loss": 0.88974905, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.91653949, + "num_input_tokens_seen": 60915340, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.27661133, + "step": 2810, + "time_per_iteration": 2.8020527362823486 + }, + { + "auxiliary_loss_clip": 0.01621038, + "auxiliary_loss_mlp": 0.01057835, + "balance_loss_clip": 1.38579369, + "balance_loss_mlp": 1.03001189, + "epoch": 0.16900646324966181, + "flos": 19619754514560.0, + "grad_norm": 1.7612844260987612, + "language_loss": 0.93506283, + "learning_rate": 3.800704774747416e-06, + "loss": 0.9618516, + "num_input_tokens_seen": 60933735, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.27807617, + "step": 2811, + "time_per_iteration": 4.2328972816467285 + }, + { + "auxiliary_loss_clip": 0.01620167, + "auxiliary_loss_mlp": 0.01049526, + "balance_loss_clip": 1.38676262, + "balance_loss_mlp": 1.02253687, + "epoch": 0.16906658650232978, + "flos": 22028260458240.0, + "grad_norm": 2.02111539014105, + "language_loss": 0.79789007, + "learning_rate": 3.800535261856291e-06, + "loss": 0.82458705, + "num_input_tokens_seen": 60953105, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.27026367, + "step": 2812, + "time_per_iteration": 2.869966983795166 + }, + { + "auxiliary_loss_clip": 0.01623165, + "auxiliary_loss_mlp": 0.01053876, + "balance_loss_clip": 1.39141703, + "balance_loss_mlp": 1.02750695, + "epoch": 0.16912670975499774, + "flos": 11770707087360.0, + "grad_norm": 2.072021587414466, + "language_loss": 0.7617147, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.78848505, + "num_input_tokens_seen": 60969150, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.26367188, + "step": 2813, + "time_per_iteration": 4.243016958236694 + }, + { + "auxiliary_loss_clip": 0.01633459, + "auxiliary_loss_mlp": 0.01056463, + "balance_loss_clip": 1.39576721, + "balance_loss_mlp": 1.02873504, + "epoch": 0.1691868330076657, + "flos": 17169234134400.0, + "grad_norm": 4.409729988489119, + "language_loss": 0.70997715, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.73687631, + "num_input_tokens_seen": 60982825, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.27758789, + "step": 2814, + "time_per_iteration": 4.360554218292236 + }, + { + "auxiliary_loss_clip": 0.01623903, + "auxiliary_loss_mlp": 0.01057984, + "balance_loss_clip": 1.39090264, + "balance_loss_mlp": 1.03001738, + "epoch": 0.16924695626033368, + "flos": 22426081153920.0, + "grad_norm": 1.7123896977945778, + "language_loss": 0.63036692, + "learning_rate": 3.800026313549776e-06, + "loss": 0.65718585, + "num_input_tokens_seen": 61000875, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.27954102, + "step": 2815, + "time_per_iteration": 2.8277604579925537 + }, + { + "auxiliary_loss_clip": 0.01612454, + "auxiliary_loss_mlp": 0.01058628, + "balance_loss_clip": 1.38140047, + "balance_loss_mlp": 1.03085256, + "epoch": 0.16930707951300164, + "flos": 25750797177600.0, + "grad_norm": 1.531774173569356, + "language_loss": 0.83058381, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.85729462, + "num_input_tokens_seen": 61021940, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.27783203, + "step": 2816, + "time_per_iteration": 2.9322831630706787 + }, + { + "auxiliary_loss_clip": 0.01631358, + "auxiliary_loss_mlp": 0.01056747, + "balance_loss_clip": 1.39629841, + "balance_loss_mlp": 1.02782726, + "epoch": 0.16936720276566963, + "flos": 22757201735040.0, + "grad_norm": 1.9634597170648742, + "language_loss": 0.89029086, + "learning_rate": 3.799686673382153e-06, + "loss": 0.9171719, + "num_input_tokens_seen": 61040285, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.28955078, + "step": 2817, + "time_per_iteration": 2.8612382411956787 + }, + { + "auxiliary_loss_clip": 0.01605863, + "auxiliary_loss_mlp": 0.01068504, + "balance_loss_clip": 1.37635565, + "balance_loss_mlp": 1.04003668, + "epoch": 0.1694273260183376, + "flos": 19583621902080.0, + "grad_norm": 1.6186674899766902, + "language_loss": 0.82643259, + "learning_rate": 3.799516750928672e-06, + "loss": 0.85317624, + "num_input_tokens_seen": 61059020, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.28491211, + "step": 2818, + "time_per_iteration": 2.8798255920410156 + }, + { + "auxiliary_loss_clip": 0.01621188, + "auxiliary_loss_mlp": 0.01057403, + "balance_loss_clip": 1.38933063, + "balance_loss_mlp": 1.0305804, + "epoch": 0.16948744927100556, + "flos": 12465280298880.0, + "grad_norm": 2.731194865262301, + "language_loss": 0.82945752, + "learning_rate": 3.799346760237336e-06, + "loss": 0.85624343, + "num_input_tokens_seen": 61074245, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.26818848, + "step": 2819, + "time_per_iteration": 2.819330930709839 + }, + { + "auxiliary_loss_clip": 0.01347946, + "auxiliary_loss_mlp": 0.01082433, + "balance_loss_clip": 1.21149695, + "balance_loss_mlp": 1.06402719, + "epoch": 0.16954757252367353, + "flos": 71319277463040.0, + "grad_norm": 0.9517858556259979, + "language_loss": 0.61326921, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.637573, + "num_input_tokens_seen": 61127080, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.18359375, + "step": 2820, + "time_per_iteration": 3.2812106609344482 + }, + { + "auxiliary_loss_clip": 0.01639983, + "auxiliary_loss_mlp": 0.01062917, + "balance_loss_clip": 1.40477633, + "balance_loss_mlp": 1.03591573, + "epoch": 0.1696076957763415, + "flos": 29618497774080.0, + "grad_norm": 2.4370234912406183, + "language_loss": 0.79150999, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.81853902, + "num_input_tokens_seen": 61146955, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.26989746, + "step": 2821, + "time_per_iteration": 2.9248547554016113 + }, + { + "auxiliary_loss_clip": 0.01633746, + "auxiliary_loss_mlp": 0.01062869, + "balance_loss_clip": 1.3996464, + "balance_loss_mlp": 1.03440154, + "epoch": 0.16966781902900946, + "flos": 24399095466240.0, + "grad_norm": 1.9089503889405433, + "language_loss": 0.79614437, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.82311058, + "num_input_tokens_seen": 61166605, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.28430176, + "step": 2822, + "time_per_iteration": 2.869173526763916 + }, + { + "auxiliary_loss_clip": 0.01610703, + "auxiliary_loss_mlp": 0.01055513, + "balance_loss_clip": 1.3819344, + "balance_loss_mlp": 1.02792811, + "epoch": 0.16972794228167745, + "flos": 23049022567680.0, + "grad_norm": 1.773372201814986, + "language_loss": 0.75966936, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.78633153, + "num_input_tokens_seen": 61186535, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.27612305, + "step": 2823, + "time_per_iteration": 2.8967485427856445 + }, + { + "auxiliary_loss_clip": 0.01643595, + "auxiliary_loss_mlp": 0.01067333, + "balance_loss_clip": 1.40696633, + "balance_loss_mlp": 1.03860354, + "epoch": 0.16978806553434542, + "flos": 35243005190400.0, + "grad_norm": 1.752924672250813, + "language_loss": 0.60586309, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.63297242, + "num_input_tokens_seen": 61208965, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.28723145, + "step": 2824, + "time_per_iteration": 2.9743940830230713 + }, + { + "auxiliary_loss_clip": 0.01627224, + "auxiliary_loss_mlp": 0.01059007, + "balance_loss_clip": 1.39524722, + "balance_loss_mlp": 1.03007519, + "epoch": 0.16984818878701338, + "flos": 32026822738560.0, + "grad_norm": 2.332364146939632, + "language_loss": 0.74103343, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.7678957, + "num_input_tokens_seen": 61230670, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.28942871, + "step": 2825, + "time_per_iteration": 2.9573376178741455 + }, + { + "auxiliary_loss_clip": 0.01666574, + "auxiliary_loss_mlp": 0.01067032, + "balance_loss_clip": 1.42101514, + "balance_loss_mlp": 1.03937578, + "epoch": 0.16990831203968135, + "flos": 22828426329600.0, + "grad_norm": 1.9742259902937975, + "language_loss": 0.86451685, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.89185292, + "num_input_tokens_seen": 61249510, + "router_z_loss_clip": 2.453125, + "router_z_loss_mlp": 0.2767334, + "step": 2826, + "time_per_iteration": 2.94449520111084 + }, + { + "auxiliary_loss_clip": 0.01640563, + "auxiliary_loss_mlp": 0.01060833, + "balance_loss_clip": 1.40151918, + "balance_loss_mlp": 1.03349829, + "epoch": 0.1699684352923493, + "flos": 23050017953280.0, + "grad_norm": 1.725359448694524, + "language_loss": 0.8317672, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.85878122, + "num_input_tokens_seen": 61269440, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.27331543, + "step": 2827, + "time_per_iteration": 2.8417346477508545 + }, + { + "auxiliary_loss_clip": 0.01647723, + "auxiliary_loss_mlp": 0.01060087, + "balance_loss_clip": 1.40631211, + "balance_loss_mlp": 1.03167963, + "epoch": 0.17002855854501728, + "flos": 21444075855360.0, + "grad_norm": 1.7150798976900403, + "language_loss": 0.74646032, + "learning_rate": 3.797813774376267e-06, + "loss": 0.77353841, + "num_input_tokens_seen": 61288195, + "router_z_loss_clip": 2.41015625, + "router_z_loss_mlp": 0.28405762, + "step": 2828, + "time_per_iteration": 2.9025771617889404 + }, + { + "auxiliary_loss_clip": 0.01341808, + "auxiliary_loss_mlp": 0.01017136, + "balance_loss_clip": 1.20686519, + "balance_loss_mlp": 1.00082779, + "epoch": 0.17008868179768524, + "flos": 71485253948160.0, + "grad_norm": 0.7614167373342754, + "language_loss": 0.56567609, + "learning_rate": 3.797643101661336e-06, + "loss": 0.58926558, + "num_input_tokens_seen": 61350850, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.16308594, + "step": 2829, + "time_per_iteration": 3.401987075805664 + }, + { + "auxiliary_loss_clip": 0.01622315, + "auxiliary_loss_mlp": 0.01059285, + "balance_loss_clip": 1.38945925, + "balance_loss_mlp": 1.03290427, + "epoch": 0.17014880505035324, + "flos": 24911195823360.0, + "grad_norm": 1.748877294333387, + "language_loss": 0.84664994, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.87346596, + "num_input_tokens_seen": 61370765, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.26391602, + "step": 2830, + "time_per_iteration": 2.9089419841766357 + }, + { + "auxiliary_loss_clip": 0.01640631, + "auxiliary_loss_mlp": 0.010556, + "balance_loss_clip": 1.40171957, + "balance_loss_mlp": 1.02644145, + "epoch": 0.1702089283030212, + "flos": 29874593197440.0, + "grad_norm": 2.4733871200220263, + "language_loss": 0.79934978, + "learning_rate": 3.797301551737529e-06, + "loss": 0.82631207, + "num_input_tokens_seen": 61388935, + "router_z_loss_clip": 2.390625, + "router_z_loss_mlp": 0.29174805, + "step": 2831, + "time_per_iteration": 2.9880707263946533 + }, + { + "auxiliary_loss_clip": 0.01647137, + "auxiliary_loss_mlp": 0.01051107, + "balance_loss_clip": 1.40727878, + "balance_loss_mlp": 1.02371264, + "epoch": 0.17026905155568917, + "flos": 17751925658880.0, + "grad_norm": 1.9290870453598357, + "language_loss": 0.8064847, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.83346713, + "num_input_tokens_seen": 61407350, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.27392578, + "step": 2832, + "time_per_iteration": 2.889906644821167 + }, + { + "auxiliary_loss_clip": 0.01623735, + "auxiliary_loss_mlp": 0.01061064, + "balance_loss_clip": 1.38978839, + "balance_loss_mlp": 1.03341961, + "epoch": 0.17032917480835713, + "flos": 23159003973120.0, + "grad_norm": 1.6185858591342732, + "language_loss": 0.89880717, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.92565513, + "num_input_tokens_seen": 61429010, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.27648926, + "step": 2833, + "time_per_iteration": 2.9115099906921387 + }, + { + "auxiliary_loss_clip": 0.01622766, + "auxiliary_loss_mlp": 0.01058704, + "balance_loss_clip": 1.3910799, + "balance_loss_mlp": 1.0319773, + "epoch": 0.1703892980610251, + "flos": 39217112853120.0, + "grad_norm": 2.090492320068467, + "language_loss": 0.74131179, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.76812649, + "num_input_tokens_seen": 61450040, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.26733398, + "step": 2834, + "time_per_iteration": 3.008330821990967 + }, + { + "auxiliary_loss_clip": 0.0162429, + "auxiliary_loss_mlp": 0.01060576, + "balance_loss_clip": 1.38863993, + "balance_loss_mlp": 1.0328958, + "epoch": 0.17044942131369306, + "flos": 23049384526080.0, + "grad_norm": 2.6425822801442584, + "language_loss": 0.87634879, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.90319741, + "num_input_tokens_seen": 61468585, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.27661133, + "step": 2835, + "time_per_iteration": 2.8872783184051514 + }, + { + "auxiliary_loss_clip": 0.01632597, + "auxiliary_loss_mlp": 0.0106033, + "balance_loss_clip": 1.3909992, + "balance_loss_mlp": 1.03213751, + "epoch": 0.17050954456636103, + "flos": 17063596229760.0, + "grad_norm": 2.3010789670334524, + "language_loss": 0.75271428, + "learning_rate": 3.796446484348989e-06, + "loss": 0.77964354, + "num_input_tokens_seen": 61486330, + "router_z_loss_clip": 2.41796875, + "router_z_loss_mlp": 0.28173828, + "step": 2836, + "time_per_iteration": 2.81679105758667 + }, + { + "auxiliary_loss_clip": 0.01646348, + "auxiliary_loss_mlp": 0.01048971, + "balance_loss_clip": 1.40619111, + "balance_loss_mlp": 1.02086174, + "epoch": 0.17056966781902902, + "flos": 16845759924480.0, + "grad_norm": 2.1592794453561304, + "language_loss": 0.80802172, + "learning_rate": 3.796275266481036e-06, + "loss": 0.83497494, + "num_input_tokens_seen": 61503950, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.28100586, + "step": 2837, + "time_per_iteration": 2.832899570465088 + }, + { + "auxiliary_loss_clip": 0.01612678, + "auxiliary_loss_mlp": 0.01053941, + "balance_loss_clip": 1.38628578, + "balance_loss_mlp": 1.02611804, + "epoch": 0.17062979107169698, + "flos": 17721539136000.0, + "grad_norm": 1.8882289705037207, + "language_loss": 0.84764183, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.87430799, + "num_input_tokens_seen": 61523550, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.27832031, + "step": 2838, + "time_per_iteration": 2.9394476413726807 + }, + { + "auxiliary_loss_clip": 0.0162409, + "auxiliary_loss_mlp": 0.01052391, + "balance_loss_clip": 1.3926059, + "balance_loss_mlp": 1.02525949, + "epoch": 0.17068991432436495, + "flos": 22534343256960.0, + "grad_norm": 1.6464511963149713, + "language_loss": 0.94335306, + "learning_rate": 3.795932626406812e-06, + "loss": 0.97011787, + "num_input_tokens_seen": 61542720, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.27124023, + "step": 2839, + "time_per_iteration": 4.3152430057525635 + }, + { + "auxiliary_loss_clip": 0.01642902, + "auxiliary_loss_mlp": 0.01055714, + "balance_loss_clip": 1.40643537, + "balance_loss_mlp": 1.02674603, + "epoch": 0.17075003757703291, + "flos": 25893427345920.0, + "grad_norm": 1.8042351478714167, + "language_loss": 0.84555954, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.87254572, + "num_input_tokens_seen": 61563040, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.28955078, + "step": 2840, + "time_per_iteration": 2.87135910987854 + }, + { + "auxiliary_loss_clip": 0.01624254, + "auxiliary_loss_mlp": 0.01056115, + "balance_loss_clip": 1.39069688, + "balance_loss_mlp": 1.02926922, + "epoch": 0.17081016082970088, + "flos": 20130361793280.0, + "grad_norm": 2.0316283431778817, + "language_loss": 0.77820551, + "learning_rate": 3.79558971392481e-06, + "loss": 0.80500925, + "num_input_tokens_seen": 61581890, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.26855469, + "step": 2841, + "time_per_iteration": 2.852778673171997 + }, + { + "auxiliary_loss_clip": 0.01636451, + "auxiliary_loss_mlp": 0.01053294, + "balance_loss_clip": 1.40133142, + "balance_loss_mlp": 1.02606666, + "epoch": 0.17087028408236885, + "flos": 24947283191040.0, + "grad_norm": 1.7500509155725317, + "language_loss": 0.77878666, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.80568409, + "num_input_tokens_seen": 61602095, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.2722168, + "step": 2842, + "time_per_iteration": 2.8538520336151123 + }, + { + "auxiliary_loss_clip": 0.01618756, + "auxiliary_loss_mlp": 0.01053739, + "balance_loss_clip": 1.39110851, + "balance_loss_mlp": 1.02489042, + "epoch": 0.17093040733503684, + "flos": 19066001679360.0, + "grad_norm": 1.9276078649091348, + "language_loss": 0.86907762, + "learning_rate": 3.795246529087043e-06, + "loss": 0.89580262, + "num_input_tokens_seen": 61620400, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.28833008, + "step": 2843, + "time_per_iteration": 2.868248462677002 + }, + { + "auxiliary_loss_clip": 0.0163288, + "auxiliary_loss_mlp": 0.0105591, + "balance_loss_clip": 1.4012152, + "balance_loss_mlp": 1.0295763, + "epoch": 0.1709905305877048, + "flos": 13086773879040.0, + "grad_norm": 2.8605043197785722, + "language_loss": 0.70111001, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.7279979, + "num_input_tokens_seen": 61637680, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.26330566, + "step": 2844, + "time_per_iteration": 2.8288321495056152 + }, + { + "auxiliary_loss_clip": 0.01636478, + "auxiliary_loss_mlp": 0.01054393, + "balance_loss_clip": 1.40265238, + "balance_loss_mlp": 1.02680826, + "epoch": 0.17105065384037277, + "flos": 19218947662080.0, + "grad_norm": 2.184491945257295, + "language_loss": 0.79141927, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.81832796, + "num_input_tokens_seen": 61655630, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.27600098, + "step": 2845, + "time_per_iteration": 2.8958675861358643 + }, + { + "auxiliary_loss_clip": 0.01639165, + "auxiliary_loss_mlp": 0.01050365, + "balance_loss_clip": 1.40375876, + "balance_loss_mlp": 1.02274394, + "epoch": 0.17111077709304073, + "flos": 18524374450560.0, + "grad_norm": 2.177958445239607, + "language_loss": 0.79524291, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.82213825, + "num_input_tokens_seen": 61673475, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.27587891, + "step": 2846, + "time_per_iteration": 4.243098735809326 + }, + { + "auxiliary_loss_clip": 0.01624325, + "auxiliary_loss_mlp": 0.01055314, + "balance_loss_clip": 1.39539266, + "balance_loss_mlp": 1.02741957, + "epoch": 0.1711709003457087, + "flos": 25093578188160.0, + "grad_norm": 1.76729089380069, + "language_loss": 0.80911696, + "learning_rate": 3.794559342552472e-06, + "loss": 0.83591342, + "num_input_tokens_seen": 61693370, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.27856445, + "step": 2847, + "time_per_iteration": 2.876802921295166 + }, + { + "auxiliary_loss_clip": 0.01639016, + "auxiliary_loss_mlp": 0.01052588, + "balance_loss_clip": 1.40262842, + "balance_loss_mlp": 1.02526522, + "epoch": 0.17123102359837666, + "flos": 17575289383680.0, + "grad_norm": 2.4383593959308842, + "language_loss": 0.88338697, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.910303, + "num_input_tokens_seen": 61710820, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.2734375, + "step": 2848, + "time_per_iteration": 4.2619102001190186 + }, + { + "auxiliary_loss_clip": 0.0162222, + "auxiliary_loss_mlp": 0.01047007, + "balance_loss_clip": 1.3883481, + "balance_loss_mlp": 1.01846826, + "epoch": 0.17129114685104463, + "flos": 26183890834560.0, + "grad_norm": 1.810445930291935, + "language_loss": 0.75843549, + "learning_rate": 3.794215340959902e-06, + "loss": 0.78512782, + "num_input_tokens_seen": 61729855, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.28540039, + "step": 2849, + "time_per_iteration": 4.429996490478516 + }, + { + "auxiliary_loss_clip": 0.01336644, + "auxiliary_loss_mlp": 0.01024633, + "balance_loss_clip": 1.20783257, + "balance_loss_mlp": 1.01004148, + "epoch": 0.17135127010371262, + "flos": 69302275925760.0, + "grad_norm": 0.7985522339736943, + "language_loss": 0.57518601, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.59879887, + "num_input_tokens_seen": 61790290, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.14550781, + "step": 2850, + "time_per_iteration": 3.3634066581726074 + }, + { + "auxiliary_loss_clip": 0.01618069, + "auxiliary_loss_mlp": 0.01049314, + "balance_loss_clip": 1.39044285, + "balance_loss_mlp": 1.0233978, + "epoch": 0.1714113933563806, + "flos": 23560579987200.0, + "grad_norm": 2.1162594894939364, + "language_loss": 0.81841159, + "learning_rate": 3.793871067220031e-06, + "loss": 0.84508538, + "num_input_tokens_seen": 61809265, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.25927734, + "step": 2851, + "time_per_iteration": 2.8618435859680176 + }, + { + "auxiliary_loss_clip": 0.01619241, + "auxiliary_loss_mlp": 0.01044214, + "balance_loss_clip": 1.39160442, + "balance_loss_mlp": 1.0185957, + "epoch": 0.17147151660904855, + "flos": 21152390757120.0, + "grad_norm": 1.732074913186605, + "language_loss": 0.94374502, + "learning_rate": 3.7936988283111764e-06, + "loss": 0.97037959, + "num_input_tokens_seen": 61828980, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.25610352, + "step": 2852, + "time_per_iteration": 2.8610286712646484 + }, + { + "auxiliary_loss_clip": 0.01646739, + "auxiliary_loss_mlp": 0.01060638, + "balance_loss_clip": 1.40904355, + "balance_loss_mlp": 1.03283858, + "epoch": 0.17153163986171652, + "flos": 18634174876800.0, + "grad_norm": 1.8005997772224238, + "language_loss": 0.70013833, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.72721213, + "num_input_tokens_seen": 61847915, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.2779541, + "step": 2853, + "time_per_iteration": 2.9042739868164062 + }, + { + "auxiliary_loss_clip": 0.0163249, + "auxiliary_loss_mlp": 0.01053343, + "balance_loss_clip": 1.39740908, + "balance_loss_mlp": 1.02587724, + "epoch": 0.17159176311438448, + "flos": 18232372638720.0, + "grad_norm": 1.9651989038372577, + "language_loss": 0.67544657, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.7023049, + "num_input_tokens_seen": 61865570, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.2746582, + "step": 2854, + "time_per_iteration": 2.813811779022217 + }, + { + "auxiliary_loss_clip": 0.01607327, + "auxiliary_loss_mlp": 0.01054934, + "balance_loss_clip": 1.37902641, + "balance_loss_mlp": 1.02833879, + "epoch": 0.17165188636705245, + "flos": 20748281034240.0, + "grad_norm": 1.4999436386006164, + "language_loss": 0.89763689, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.92425954, + "num_input_tokens_seen": 61883340, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.26599121, + "step": 2855, + "time_per_iteration": 2.8968088626861572 + }, + { + "auxiliary_loss_clip": 0.01621035, + "auxiliary_loss_mlp": 0.01048449, + "balance_loss_clip": 1.39150953, + "balance_loss_mlp": 1.02199674, + "epoch": 0.17171200961972044, + "flos": 24910652885760.0, + "grad_norm": 3.2177419471930815, + "language_loss": 0.84161317, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.86830795, + "num_input_tokens_seen": 61900610, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.26452637, + "step": 2856, + "time_per_iteration": 2.875809907913208 + }, + { + "auxiliary_loss_clip": 0.01631782, + "auxiliary_loss_mlp": 0.01051543, + "balance_loss_clip": 1.39961922, + "balance_loss_mlp": 1.02365971, + "epoch": 0.1717721328723884, + "flos": 20166901608960.0, + "grad_norm": 1.8378713394671833, + "language_loss": 0.87111503, + "learning_rate": 3.792836613639026e-06, + "loss": 0.89794827, + "num_input_tokens_seen": 61916795, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.27868652, + "step": 2857, + "time_per_iteration": 2.8666584491729736 + }, + { + "auxiliary_loss_clip": 0.01636696, + "auxiliary_loss_mlp": 0.01056204, + "balance_loss_clip": 1.40285432, + "balance_loss_mlp": 1.02804708, + "epoch": 0.17183225612505637, + "flos": 23370506006400.0, + "grad_norm": 1.9723516071370553, + "language_loss": 0.78922021, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.81614912, + "num_input_tokens_seen": 61936665, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.28198242, + "step": 2858, + "time_per_iteration": 2.875959873199463 + }, + { + "auxiliary_loss_clip": 0.01647275, + "auxiliary_loss_mlp": 0.01059591, + "balance_loss_clip": 1.40510428, + "balance_loss_mlp": 1.03043246, + "epoch": 0.17189237937772434, + "flos": 18123431863680.0, + "grad_norm": 1.9171488850389518, + "language_loss": 0.77805775, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.80512643, + "num_input_tokens_seen": 61954415, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.29150391, + "step": 2859, + "time_per_iteration": 2.8499233722686768 + }, + { + "auxiliary_loss_clip": 0.01622195, + "auxiliary_loss_mlp": 0.0105319, + "balance_loss_clip": 1.3912468, + "balance_loss_mlp": 1.02671373, + "epoch": 0.1719525026303923, + "flos": 23268849644160.0, + "grad_norm": 1.8187726868356588, + "language_loss": 0.7751565, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.8019104, + "num_input_tokens_seen": 61973940, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.26477051, + "step": 2860, + "time_per_iteration": 2.903247117996216 + }, + { + "auxiliary_loss_clip": 0.01633084, + "auxiliary_loss_mlp": 0.01052044, + "balance_loss_clip": 1.39671302, + "balance_loss_mlp": 1.02530503, + "epoch": 0.17201262588306027, + "flos": 20819324649600.0, + "grad_norm": 2.1701632797666126, + "language_loss": 0.82243323, + "learning_rate": 3.792145618140317e-06, + "loss": 0.84928453, + "num_input_tokens_seen": 61991845, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.2677002, + "step": 2861, + "time_per_iteration": 2.834003210067749 + }, + { + "auxiliary_loss_clip": 0.01634026, + "auxiliary_loss_mlp": 0.01053526, + "balance_loss_clip": 1.39978838, + "balance_loss_mlp": 1.02728772, + "epoch": 0.17207274913572823, + "flos": 20385507075840.0, + "grad_norm": 2.0616255801719494, + "language_loss": 0.87142777, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.89830327, + "num_input_tokens_seen": 62009395, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.26269531, + "step": 2862, + "time_per_iteration": 2.8595054149627686 + }, + { + "auxiliary_loss_clip": 0.01617773, + "auxiliary_loss_mlp": 0.01049987, + "balance_loss_clip": 1.38941741, + "balance_loss_mlp": 1.0230813, + "epoch": 0.17213287238839622, + "flos": 26809139733120.0, + "grad_norm": 1.8477965123029554, + "language_loss": 0.7916314, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.81830895, + "num_input_tokens_seen": 62029005, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.26916504, + "step": 2863, + "time_per_iteration": 2.8680238723754883 + }, + { + "auxiliary_loss_clip": 0.01619498, + "auxiliary_loss_mlp": 0.01046168, + "balance_loss_clip": 1.39095819, + "balance_loss_mlp": 1.01840448, + "epoch": 0.1721929956410642, + "flos": 26041848848640.0, + "grad_norm": 2.1053294688458384, + "language_loss": 0.73797679, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.76463342, + "num_input_tokens_seen": 62048730, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.27758789, + "step": 2864, + "time_per_iteration": 2.9081218242645264 + }, + { + "auxiliary_loss_clip": 0.01649655, + "auxiliary_loss_mlp": 0.01049555, + "balance_loss_clip": 1.41287899, + "balance_loss_mlp": 1.02156425, + "epoch": 0.17225311889373215, + "flos": 22283224761600.0, + "grad_norm": 1.754793282992755, + "language_loss": 0.73624116, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.7632333, + "num_input_tokens_seen": 62069000, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.28027344, + "step": 2865, + "time_per_iteration": 2.842339277267456 + }, + { + "auxiliary_loss_clip": 0.01633465, + "auxiliary_loss_mlp": 0.0104925, + "balance_loss_clip": 1.40062165, + "balance_loss_mlp": 1.0214262, + "epoch": 0.17231324214640012, + "flos": 21297599879040.0, + "grad_norm": 2.250846319648036, + "language_loss": 0.80163974, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.82846689, + "num_input_tokens_seen": 62086750, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.27783203, + "step": 2866, + "time_per_iteration": 2.8709352016448975 + }, + { + "auxiliary_loss_clip": 0.01627577, + "auxiliary_loss_mlp": 0.01046946, + "balance_loss_clip": 1.3927834, + "balance_loss_mlp": 1.01801372, + "epoch": 0.17237336539906808, + "flos": 19689712254720.0, + "grad_norm": 1.6613665084014997, + "language_loss": 0.80434775, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.83109295, + "num_input_tokens_seen": 62106240, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.28942871, + "step": 2867, + "time_per_iteration": 2.8786492347717285 + }, + { + "auxiliary_loss_clip": 0.01627817, + "auxiliary_loss_mlp": 0.01053462, + "balance_loss_clip": 1.39518309, + "balance_loss_mlp": 1.02470875, + "epoch": 0.17243348865173605, + "flos": 17538478099200.0, + "grad_norm": 1.9220886547893297, + "language_loss": 0.80312109, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.82993388, + "num_input_tokens_seen": 62124895, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.28771973, + "step": 2868, + "time_per_iteration": 2.807342767715454 + }, + { + "auxiliary_loss_clip": 0.01641906, + "auxiliary_loss_mlp": 0.01048285, + "balance_loss_clip": 1.40398574, + "balance_loss_mlp": 1.01977038, + "epoch": 0.17249361190440402, + "flos": 18269002944000.0, + "grad_norm": 2.5580785067046277, + "language_loss": 0.84989595, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.87679785, + "num_input_tokens_seen": 62143510, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.28540039, + "step": 2869, + "time_per_iteration": 2.8509926795959473 + }, + { + "auxiliary_loss_clip": 0.01633436, + "auxiliary_loss_mlp": 0.01051731, + "balance_loss_clip": 1.39994264, + "balance_loss_mlp": 1.02221465, + "epoch": 0.172553735157072, + "flos": 21183139238400.0, + "grad_norm": 1.817340716412683, + "language_loss": 0.78106964, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.80792129, + "num_input_tokens_seen": 62162285, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.29541016, + "step": 2870, + "time_per_iteration": 2.8495724201202393 + }, + { + "auxiliary_loss_clip": 0.01619664, + "auxiliary_loss_mlp": 0.01036897, + "balance_loss_clip": 1.39453387, + "balance_loss_mlp": 1.01152956, + "epoch": 0.17261385840973997, + "flos": 22283450985600.0, + "grad_norm": 1.714521544714359, + "language_loss": 0.78355873, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.81012428, + "num_input_tokens_seen": 62180970, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.25378418, + "step": 2871, + "time_per_iteration": 2.9107699394226074 + }, + { + "auxiliary_loss_clip": 0.01640803, + "auxiliary_loss_mlp": 0.01056483, + "balance_loss_clip": 1.40466702, + "balance_loss_mlp": 1.02858853, + "epoch": 0.17267398166240794, + "flos": 27932236876800.0, + "grad_norm": 2.110833919427104, + "language_loss": 0.75499499, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.78196782, + "num_input_tokens_seen": 62198965, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.27905273, + "step": 2872, + "time_per_iteration": 2.9394659996032715 + }, + { + "auxiliary_loss_clip": 0.01625771, + "auxiliary_loss_mlp": 0.01050417, + "balance_loss_clip": 1.39689016, + "balance_loss_mlp": 1.02298737, + "epoch": 0.1727341049150759, + "flos": 21955271316480.0, + "grad_norm": 1.6396490380994089, + "language_loss": 0.83021456, + "learning_rate": 3.790066109323988e-06, + "loss": 0.85697645, + "num_input_tokens_seen": 62219890, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.27429199, + "step": 2873, + "time_per_iteration": 2.8937807083129883 + }, + { + "auxiliary_loss_clip": 0.01627455, + "auxiliary_loss_mlp": 0.01049929, + "balance_loss_clip": 1.3965919, + "balance_loss_mlp": 1.02066326, + "epoch": 0.17279422816774387, + "flos": 18115423534080.0, + "grad_norm": 2.01797401827311, + "language_loss": 0.75674272, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.78351653, + "num_input_tokens_seen": 62237140, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.29284668, + "step": 2874, + "time_per_iteration": 4.301179647445679 + }, + { + "auxiliary_loss_clip": 0.01636421, + "auxiliary_loss_mlp": 0.010555, + "balance_loss_clip": 1.40091753, + "balance_loss_mlp": 1.02722394, + "epoch": 0.17285435142041183, + "flos": 21845516135040.0, + "grad_norm": 2.1079780662817367, + "language_loss": 0.81981921, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.8467384, + "num_input_tokens_seen": 62255405, + "router_z_loss_clip": 2.35742188, + "router_z_loss_mlp": 0.28271484, + "step": 2875, + "time_per_iteration": 2.865319013595581 + }, + { + "auxiliary_loss_clip": 0.01639776, + "auxiliary_loss_mlp": 0.01056509, + "balance_loss_clip": 1.40273213, + "balance_loss_mlp": 1.02866232, + "epoch": 0.17291447467307983, + "flos": 18377400781440.0, + "grad_norm": 2.317335940964352, + "language_loss": 0.88980877, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.91677165, + "num_input_tokens_seen": 62271280, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.27856445, + "step": 2876, + "time_per_iteration": 2.820697546005249 + }, + { + "auxiliary_loss_clip": 0.01641822, + "auxiliary_loss_mlp": 0.01052387, + "balance_loss_clip": 1.40844154, + "balance_loss_mlp": 1.02468276, + "epoch": 0.1729745979257478, + "flos": 18633858163200.0, + "grad_norm": 2.042506142251443, + "language_loss": 0.86009514, + "learning_rate": 3.789370767013681e-06, + "loss": 0.88703716, + "num_input_tokens_seen": 62289140, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.27697754, + "step": 2877, + "time_per_iteration": 2.8731465339660645 + }, + { + "auxiliary_loss_clip": 0.01643229, + "auxiliary_loss_mlp": 0.01053168, + "balance_loss_clip": 1.4077388, + "balance_loss_mlp": 1.02644074, + "epoch": 0.17303472117841576, + "flos": 23007370089600.0, + "grad_norm": 2.585433965030499, + "language_loss": 0.80597019, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.8329342, + "num_input_tokens_seen": 62307490, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.26733398, + "step": 2878, + "time_per_iteration": 2.8330061435699463 + }, + { + "auxiliary_loss_clip": 0.01641771, + "auxiliary_loss_mlp": 0.01056844, + "balance_loss_clip": 1.40820384, + "balance_loss_mlp": 1.03016472, + "epoch": 0.17309484443108372, + "flos": 25674912368640.0, + "grad_norm": 1.5165889813248241, + "language_loss": 0.71685362, + "learning_rate": 3.78902268871344e-06, + "loss": 0.7438398, + "num_input_tokens_seen": 62328570, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.26660156, + "step": 2879, + "time_per_iteration": 2.906954050064087 + }, + { + "auxiliary_loss_clip": 0.01633527, + "auxiliary_loss_mlp": 0.01054234, + "balance_loss_clip": 1.39746094, + "balance_loss_mlp": 1.02567124, + "epoch": 0.1731549676837517, + "flos": 13560750852480.0, + "grad_norm": 2.075902235316382, + "language_loss": 0.84916943, + "learning_rate": 3.78884854780014e-06, + "loss": 0.87604707, + "num_input_tokens_seen": 62345735, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.28588867, + "step": 2880, + "time_per_iteration": 2.7952699661254883 + }, + { + "auxiliary_loss_clip": 0.01650575, + "auxiliary_loss_mlp": 0.01051124, + "balance_loss_clip": 1.4116025, + "balance_loss_mlp": 1.02381301, + "epoch": 0.17321509093641965, + "flos": 22867409364480.0, + "grad_norm": 2.0587562863674034, + "language_loss": 0.82571435, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.85273135, + "num_input_tokens_seen": 62365525, + "router_z_loss_clip": 2.39257812, + "router_z_loss_mlp": 0.27355957, + "step": 2881, + "time_per_iteration": 4.250097036361694 + }, + { + "auxiliary_loss_clip": 0.01631244, + "auxiliary_loss_mlp": 0.01049813, + "balance_loss_clip": 1.39733875, + "balance_loss_mlp": 1.02471972, + "epoch": 0.17327521418908762, + "flos": 24363460546560.0, + "grad_norm": 1.7693101890567067, + "language_loss": 0.77868998, + "learning_rate": 3.788500062480197e-06, + "loss": 0.80550063, + "num_input_tokens_seen": 62385160, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.25097656, + "step": 2882, + "time_per_iteration": 2.892826795578003 + }, + { + "auxiliary_loss_clip": 0.01636301, + "auxiliary_loss_mlp": 0.01054038, + "balance_loss_clip": 1.40472627, + "balance_loss_mlp": 1.02726388, + "epoch": 0.1733353374417556, + "flos": 33117633077760.0, + "grad_norm": 1.9491577240449371, + "language_loss": 0.77310652, + "learning_rate": 3.788325718086769e-06, + "loss": 0.80000985, + "num_input_tokens_seen": 62405280, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.2677002, + "step": 2883, + "time_per_iteration": 4.383300065994263 + }, + { + "auxiliary_loss_clip": 0.01625288, + "auxiliary_loss_mlp": 0.01052051, + "balance_loss_clip": 1.3917774, + "balance_loss_mlp": 1.02434695, + "epoch": 0.17339546069442358, + "flos": 24399547914240.0, + "grad_norm": 2.015922795578572, + "language_loss": 0.86269844, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.88947183, + "num_input_tokens_seen": 62423665, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.27709961, + "step": 2884, + "time_per_iteration": 4.3652215003967285 + }, + { + "auxiliary_loss_clip": 0.01629365, + "auxiliary_loss_mlp": 0.01048082, + "balance_loss_clip": 1.39744365, + "balance_loss_mlp": 1.022452, + "epoch": 0.17345558394709154, + "flos": 27465680050560.0, + "grad_norm": 1.4877797200063771, + "language_loss": 0.75731319, + "learning_rate": 3.787976825866055e-06, + "loss": 0.78408766, + "num_input_tokens_seen": 62445170, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.25646973, + "step": 2885, + "time_per_iteration": 2.9380688667297363 + }, + { + "auxiliary_loss_clip": 0.01630786, + "auxiliary_loss_mlp": 0.01047394, + "balance_loss_clip": 1.40302038, + "balance_loss_mlp": 1.02199006, + "epoch": 0.1735157071997595, + "flos": 24693178538880.0, + "grad_norm": 1.4327145704245927, + "language_loss": 0.7188493, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.7456311, + "num_input_tokens_seen": 62466135, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.25427246, + "step": 2886, + "time_per_iteration": 2.932492971420288 + }, + { + "auxiliary_loss_clip": 0.01628649, + "auxiliary_loss_mlp": 0.01056137, + "balance_loss_clip": 1.39567304, + "balance_loss_mlp": 1.0283618, + "epoch": 0.17357583045242747, + "flos": 21698813934720.0, + "grad_norm": 2.1183313962572847, + "language_loss": 0.70203257, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.72888041, + "num_input_tokens_seen": 62483910, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.27783203, + "step": 2887, + "time_per_iteration": 2.8332271575927734 + }, + { + "auxiliary_loss_clip": 0.01633627, + "auxiliary_loss_mlp": 0.01053127, + "balance_loss_clip": 1.40089309, + "balance_loss_mlp": 1.02712786, + "epoch": 0.17363595370509544, + "flos": 15383669604480.0, + "grad_norm": 1.6030740472724412, + "language_loss": 0.85973579, + "learning_rate": 3.787452979049585e-06, + "loss": 0.88660336, + "num_input_tokens_seen": 62501530, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.26013184, + "step": 2888, + "time_per_iteration": 2.8336009979248047 + }, + { + "auxiliary_loss_clip": 0.01649746, + "auxiliary_loss_mlp": 0.01054382, + "balance_loss_clip": 1.41486764, + "balance_loss_mlp": 1.02527082, + "epoch": 0.1736960769577634, + "flos": 23451458232960.0, + "grad_norm": 2.203216711235482, + "language_loss": 0.80506301, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.83210427, + "num_input_tokens_seen": 62521295, + "router_z_loss_clip": 2.34570312, + "router_z_loss_mlp": 0.29101562, + "step": 2889, + "time_per_iteration": 2.886361837387085 + }, + { + "auxiliary_loss_clip": 0.01625194, + "auxiliary_loss_mlp": 0.01054048, + "balance_loss_clip": 1.39716625, + "balance_loss_mlp": 1.02667785, + "epoch": 0.1737562002104314, + "flos": 18596956389120.0, + "grad_norm": 2.650369707809917, + "language_loss": 0.85291713, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.8797096, + "num_input_tokens_seen": 62539615, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.27392578, + "step": 2890, + "time_per_iteration": 2.8939504623413086 + }, + { + "auxiliary_loss_clip": 0.01650691, + "auxiliary_loss_mlp": 0.01050952, + "balance_loss_clip": 1.41426754, + "balance_loss_mlp": 1.02343893, + "epoch": 0.17381632346309936, + "flos": 16006701507840.0, + "grad_norm": 2.6683810226414804, + "language_loss": 0.83437383, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.86139023, + "num_input_tokens_seen": 62556820, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.27539062, + "step": 2891, + "time_per_iteration": 2.871232032775879 + }, + { + "auxiliary_loss_clip": 0.01659437, + "auxiliary_loss_mlp": 0.01052409, + "balance_loss_clip": 1.418648, + "balance_loss_mlp": 1.02387071, + "epoch": 0.17387644671576732, + "flos": 13377463591680.0, + "grad_norm": 2.157608811100299, + "language_loss": 0.82369673, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.85081518, + "num_input_tokens_seen": 62572450, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.28540039, + "step": 2892, + "time_per_iteration": 2.776482582092285 + }, + { + "auxiliary_loss_clip": 0.01660172, + "auxiliary_loss_mlp": 0.01058968, + "balance_loss_clip": 1.42324495, + "balance_loss_mlp": 1.03076267, + "epoch": 0.1739365699684353, + "flos": 26626938347520.0, + "grad_norm": 1.6740093401938398, + "language_loss": 0.75374609, + "learning_rate": 3.786578545502627e-06, + "loss": 0.78093755, + "num_input_tokens_seen": 62592580, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.28173828, + "step": 2893, + "time_per_iteration": 2.9211268424987793 + }, + { + "auxiliary_loss_clip": 0.0164104, + "auxiliary_loss_mlp": 0.01048002, + "balance_loss_clip": 1.40517497, + "balance_loss_mlp": 1.02102542, + "epoch": 0.17399669322110325, + "flos": 23378469091200.0, + "grad_norm": 1.8835910226118764, + "language_loss": 0.82699746, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.85388792, + "num_input_tokens_seen": 62611220, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.26928711, + "step": 2894, + "time_per_iteration": 2.839189291000366 + }, + { + "auxiliary_loss_clip": 0.01648383, + "auxiliary_loss_mlp": 0.01051619, + "balance_loss_clip": 1.41346383, + "balance_loss_mlp": 1.02117324, + "epoch": 0.17405681647377122, + "flos": 22064166846720.0, + "grad_norm": 1.8884708948098978, + "language_loss": 0.75275254, + "learning_rate": 3.786228297806741e-06, + "loss": 0.77975255, + "num_input_tokens_seen": 62629185, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.30444336, + "step": 2895, + "time_per_iteration": 2.8679981231689453 + }, + { + "auxiliary_loss_clip": 0.0136592, + "auxiliary_loss_mlp": 0.01036913, + "balance_loss_clip": 1.23162127, + "balance_loss_mlp": 1.0207963, + "epoch": 0.1741169397264392, + "flos": 61487008381440.0, + "grad_norm": 0.877650649642171, + "language_loss": 0.62851411, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.65254241, + "num_input_tokens_seen": 62691895, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.16113281, + "step": 2896, + "time_per_iteration": 3.499652147293091 + }, + { + "auxiliary_loss_clip": 0.01640483, + "auxiliary_loss_mlp": 0.01050256, + "balance_loss_clip": 1.40570939, + "balance_loss_mlp": 1.02129984, + "epoch": 0.17417706297910718, + "flos": 27029419257600.0, + "grad_norm": 1.6974573428065702, + "language_loss": 0.76829791, + "learning_rate": 3.785877779175034e-06, + "loss": 0.7952053, + "num_input_tokens_seen": 62713790, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.28967285, + "step": 2897, + "time_per_iteration": 2.9249603748321533 + }, + { + "auxiliary_loss_clip": 0.01624301, + "auxiliary_loss_mlp": 0.01043834, + "balance_loss_clip": 1.39589, + "balance_loss_mlp": 1.01615369, + "epoch": 0.17423718623177514, + "flos": 33521426087040.0, + "grad_norm": 1.6961086473796378, + "language_loss": 0.69573861, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.72241998, + "num_input_tokens_seen": 62736285, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.27697754, + "step": 2898, + "time_per_iteration": 2.999910354614258 + }, + { + "auxiliary_loss_clip": 0.01656325, + "auxiliary_loss_mlp": 0.01048403, + "balance_loss_clip": 1.41804075, + "balance_loss_mlp": 1.02067542, + "epoch": 0.1742973094844431, + "flos": 27210037075200.0, + "grad_norm": 2.342783761968962, + "language_loss": 0.77579415, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.80284148, + "num_input_tokens_seen": 62756240, + "router_z_loss_clip": 2.38085938, + "router_z_loss_mlp": 0.27746582, + "step": 2899, + "time_per_iteration": 2.897024393081665 + }, + { + "auxiliary_loss_clip": 0.01621999, + "auxiliary_loss_mlp": 0.01044264, + "balance_loss_clip": 1.39324284, + "balance_loss_mlp": 1.01648855, + "epoch": 0.17435743273711107, + "flos": 22720933388160.0, + "grad_norm": 1.8292987891557757, + "language_loss": 0.73842347, + "learning_rate": 3.785351493339121e-06, + "loss": 0.76508605, + "num_input_tokens_seen": 62775910, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.27746582, + "step": 2900, + "time_per_iteration": 2.9009578227996826 + }, + { + "auxiliary_loss_clip": 0.01639036, + "auxiliary_loss_mlp": 0.01050714, + "balance_loss_clip": 1.40720308, + "balance_loss_mlp": 1.02224684, + "epoch": 0.17441755598977904, + "flos": 41661479940480.0, + "grad_norm": 1.4913788490848425, + "language_loss": 0.70868152, + "learning_rate": 3.785175929316863e-06, + "loss": 0.73557901, + "num_input_tokens_seen": 62799385, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.28466797, + "step": 2901, + "time_per_iteration": 3.035282850265503 + }, + { + "auxiliary_loss_clip": 0.01652639, + "auxiliary_loss_mlp": 0.01042661, + "balance_loss_clip": 1.41750813, + "balance_loss_mlp": 1.01467037, + "epoch": 0.174477679242447, + "flos": 26298396720000.0, + "grad_norm": 1.6980890118681062, + "language_loss": 0.76763457, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.79458761, + "num_input_tokens_seen": 62819380, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.27978516, + "step": 2902, + "time_per_iteration": 2.892570734024048 + }, + { + "auxiliary_loss_clip": 0.01643983, + "auxiliary_loss_mlp": 0.01050635, + "balance_loss_clip": 1.41057622, + "balance_loss_mlp": 1.02228713, + "epoch": 0.174537802495115, + "flos": 17867426929920.0, + "grad_norm": 4.941679178011067, + "language_loss": 0.82878745, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.85573363, + "num_input_tokens_seen": 62836205, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.28344727, + "step": 2903, + "time_per_iteration": 2.89803409576416 + }, + { + "auxiliary_loss_clip": 0.01628102, + "auxiliary_loss_mlp": 0.01046342, + "balance_loss_clip": 1.39958954, + "balance_loss_mlp": 1.01959109, + "epoch": 0.17459792574778296, + "flos": 16948049713920.0, + "grad_norm": 2.0291559735126308, + "language_loss": 0.74938911, + "learning_rate": 3.784648831112429e-06, + "loss": 0.77613354, + "num_input_tokens_seen": 62854045, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.2677002, + "step": 2904, + "time_per_iteration": 2.804347515106201 + }, + { + "auxiliary_loss_clip": 0.01627131, + "auxiliary_loss_mlp": 0.01046638, + "balance_loss_clip": 1.39500189, + "balance_loss_mlp": 1.02063894, + "epoch": 0.17465804900045093, + "flos": 25530924856320.0, + "grad_norm": 1.7724340846560571, + "language_loss": 0.65704334, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.68378103, + "num_input_tokens_seen": 62873075, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.26013184, + "step": 2905, + "time_per_iteration": 2.922879219055176 + }, + { + "auxiliary_loss_clip": 0.01657209, + "auxiliary_loss_mlp": 0.01052628, + "balance_loss_clip": 1.41621935, + "balance_loss_mlp": 1.02296829, + "epoch": 0.1747181722531189, + "flos": 24139244724480.0, + "grad_norm": 1.8968010165143598, + "language_loss": 0.80352318, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.83062154, + "num_input_tokens_seen": 62892675, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.29663086, + "step": 2906, + "time_per_iteration": 2.8492133617401123 + }, + { + "auxiliary_loss_clip": 0.01641726, + "auxiliary_loss_mlp": 0.01061291, + "balance_loss_clip": 1.40648675, + "balance_loss_mlp": 1.03352702, + "epoch": 0.17477829550578686, + "flos": 17757762238080.0, + "grad_norm": 1.8111513567333353, + "language_loss": 0.82217312, + "learning_rate": 3.784121123841449e-06, + "loss": 0.84920329, + "num_input_tokens_seen": 62910675, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.27770996, + "step": 2907, + "time_per_iteration": 2.8808391094207764 + }, + { + "auxiliary_loss_clip": 0.01636596, + "auxiliary_loss_mlp": 0.01052954, + "balance_loss_clip": 1.40230727, + "balance_loss_mlp": 1.02584541, + "epoch": 0.17483841875845482, + "flos": 15385705620480.0, + "grad_norm": 2.006332569175584, + "language_loss": 0.83116955, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.85806501, + "num_input_tokens_seen": 62928130, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.27124023, + "step": 2908, + "time_per_iteration": 2.8707082271575928 + }, + { + "auxiliary_loss_clip": 0.0162986, + "auxiliary_loss_mlp": 0.01059422, + "balance_loss_clip": 1.39904976, + "balance_loss_mlp": 1.03119349, + "epoch": 0.17489854201112282, + "flos": 17171586864000.0, + "grad_norm": 2.318113387163912, + "language_loss": 0.81748843, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.84438127, + "num_input_tokens_seen": 62944290, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.28186035, + "step": 2909, + "time_per_iteration": 4.289958715438843 + }, + { + "auxiliary_loss_clip": 0.01642522, + "auxiliary_loss_mlp": 0.01056213, + "balance_loss_clip": 1.40884233, + "balance_loss_mlp": 1.02910447, + "epoch": 0.17495866526379078, + "flos": 19764692167680.0, + "grad_norm": 1.7836296017424638, + "language_loss": 0.77106416, + "learning_rate": 3.783592807684017e-06, + "loss": 0.79805148, + "num_input_tokens_seen": 62963505, + "router_z_loss_clip": 2.3359375, + "router_z_loss_mlp": 0.27124023, + "step": 2910, + "time_per_iteration": 2.894209384918213 + }, + { + "auxiliary_loss_clip": 0.01631241, + "auxiliary_loss_mlp": 0.01051031, + "balance_loss_clip": 1.40010786, + "balance_loss_mlp": 1.02305245, + "epoch": 0.17501878851645875, + "flos": 28522620017280.0, + "grad_norm": 10.751554962096531, + "language_loss": 0.87556946, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.90239215, + "num_input_tokens_seen": 62985020, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.27990723, + "step": 2911, + "time_per_iteration": 2.943964719772339 + }, + { + "auxiliary_loss_clip": 0.01645683, + "auxiliary_loss_mlp": 0.01049063, + "balance_loss_clip": 1.41398501, + "balance_loss_mlp": 1.02281296, + "epoch": 0.1750789117691267, + "flos": 17940189847680.0, + "grad_norm": 2.0176788172800153, + "language_loss": 0.90612984, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.93307739, + "num_input_tokens_seen": 63001745, + "router_z_loss_clip": 2.31640625, + "router_z_loss_mlp": 0.26269531, + "step": 2912, + "time_per_iteration": 2.7971255779266357 + }, + { + "auxiliary_loss_clip": 0.01656304, + "auxiliary_loss_mlp": 0.01055331, + "balance_loss_clip": 1.41879404, + "balance_loss_mlp": 1.02848518, + "epoch": 0.17513903502179468, + "flos": 18268460006400.0, + "grad_norm": 1.821388732532384, + "language_loss": 0.74081528, + "learning_rate": 3.783063882820439e-06, + "loss": 0.76793158, + "num_input_tokens_seen": 63019750, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.26879883, + "step": 2913, + "time_per_iteration": 2.8408894538879395 + }, + { + "auxiliary_loss_clip": 0.01634102, + "auxiliary_loss_mlp": 0.01050385, + "balance_loss_clip": 1.40582705, + "balance_loss_mlp": 1.02382517, + "epoch": 0.17519915827446264, + "flos": 20714591640960.0, + "grad_norm": 1.8280092042543734, + "language_loss": 0.70523477, + "learning_rate": 3.782887439295741e-06, + "loss": 0.73207963, + "num_input_tokens_seen": 63039500, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.26550293, + "step": 2914, + "time_per_iteration": 2.816277265548706 + }, + { + "auxiliary_loss_clip": 0.01630143, + "auxiliary_loss_mlp": 0.01050798, + "balance_loss_clip": 1.40147889, + "balance_loss_mlp": 1.02450085, + "epoch": 0.1752592815271306, + "flos": 20533521375360.0, + "grad_norm": 2.410958784187916, + "language_loss": 0.94405955, + "learning_rate": 3.782710928163772e-06, + "loss": 0.97086906, + "num_input_tokens_seen": 63059785, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.26306152, + "step": 2915, + "time_per_iteration": 2.861619472503662 + }, + { + "auxiliary_loss_clip": 0.01621858, + "auxiliary_loss_mlp": 0.01055466, + "balance_loss_clip": 1.3949188, + "balance_loss_mlp": 1.02823877, + "epoch": 0.1753194047797986, + "flos": 21809293032960.0, + "grad_norm": 1.7615862567716367, + "language_loss": 0.81756401, + "learning_rate": 3.782534349431226e-06, + "loss": 0.84433722, + "num_input_tokens_seen": 63079385, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.27246094, + "step": 2916, + "time_per_iteration": 4.355432033538818 + }, + { + "auxiliary_loss_clip": 0.01643309, + "auxiliary_loss_mlp": 0.01054005, + "balance_loss_clip": 1.41060579, + "balance_loss_mlp": 1.0269326, + "epoch": 0.17537952803246656, + "flos": 20678232804480.0, + "grad_norm": 1.6689210587084746, + "language_loss": 0.74754357, + "learning_rate": 3.782357703104799e-06, + "loss": 0.7745167, + "num_input_tokens_seen": 63098970, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.27075195, + "step": 2917, + "time_per_iteration": 2.8803858757019043 + }, + { + "auxiliary_loss_clip": 0.01630025, + "auxiliary_loss_mlp": 0.01051694, + "balance_loss_clip": 1.40310526, + "balance_loss_mlp": 1.02527761, + "epoch": 0.17543965128513453, + "flos": 23305570439040.0, + "grad_norm": 1.9342664611213547, + "language_loss": 0.77496624, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.80178344, + "num_input_tokens_seen": 63118750, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.26403809, + "step": 2918, + "time_per_iteration": 4.300485849380493 + }, + { + "auxiliary_loss_clip": 0.01661164, + "auxiliary_loss_mlp": 0.01053977, + "balance_loss_clip": 1.42300403, + "balance_loss_mlp": 1.02643955, + "epoch": 0.1754997745378025, + "flos": 29107438047360.0, + "grad_norm": 2.034844487956608, + "language_loss": 0.75399876, + "learning_rate": 3.782004207697098e-06, + "loss": 0.78115022, + "num_input_tokens_seen": 63136865, + "router_z_loss_clip": 2.3828125, + "router_z_loss_mlp": 0.27563477, + "step": 2919, + "time_per_iteration": 4.387455940246582 + }, + { + "auxiliary_loss_clip": 0.01643606, + "auxiliary_loss_mlp": 0.01057846, + "balance_loss_clip": 1.40769362, + "balance_loss_mlp": 1.03153682, + "epoch": 0.17555989779047046, + "flos": 30383797887360.0, + "grad_norm": 1.720157431518989, + "language_loss": 0.75575703, + "learning_rate": 3.781827358629228e-06, + "loss": 0.78277159, + "num_input_tokens_seen": 63158325, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.26306152, + "step": 2920, + "time_per_iteration": 2.927471160888672 + }, + { + "auxiliary_loss_clip": 0.01622832, + "auxiliary_loss_mlp": 0.0104534, + "balance_loss_clip": 1.39551866, + "balance_loss_mlp": 1.02085447, + "epoch": 0.17562002104313842, + "flos": 23296340499840.0, + "grad_norm": 2.0757910658212024, + "language_loss": 0.80180454, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.8284862, + "num_input_tokens_seen": 63173115, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.24462891, + "step": 2921, + "time_per_iteration": 2.810398817062378 + }, + { + "auxiliary_loss_clip": 0.0165343, + "auxiliary_loss_mlp": 0.01053292, + "balance_loss_clip": 1.41668248, + "balance_loss_mlp": 1.02632642, + "epoch": 0.1756801442958064, + "flos": 24801259662720.0, + "grad_norm": 1.5969847538369375, + "language_loss": 0.88529325, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.91236043, + "num_input_tokens_seen": 63192880, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.26989746, + "step": 2922, + "time_per_iteration": 2.945934772491455 + }, + { + "auxiliary_loss_clip": 0.01635525, + "auxiliary_loss_mlp": 0.01052364, + "balance_loss_clip": 1.40031171, + "balance_loss_mlp": 1.02563739, + "epoch": 0.17574026754847438, + "flos": 25781636148480.0, + "grad_norm": 3.8933677402424056, + "language_loss": 0.64108264, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.66796154, + "num_input_tokens_seen": 63214395, + "router_z_loss_clip": 2.3515625, + "router_z_loss_mlp": 0.26757812, + "step": 2923, + "time_per_iteration": 2.9647111892700195 + }, + { + "auxiliary_loss_clip": 0.01641889, + "auxiliary_loss_mlp": 0.01047811, + "balance_loss_clip": 1.40977192, + "balance_loss_mlp": 1.02111959, + "epoch": 0.17580039080114235, + "flos": 17465398467840.0, + "grad_norm": 2.703590365731881, + "language_loss": 0.82282996, + "learning_rate": 3.78111928675413e-06, + "loss": 0.84972697, + "num_input_tokens_seen": 63231020, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.26696777, + "step": 2924, + "time_per_iteration": 2.8882668018341064 + }, + { + "auxiliary_loss_clip": 0.01651215, + "auxiliary_loss_mlp": 0.01057336, + "balance_loss_clip": 1.41392851, + "balance_loss_mlp": 1.02919078, + "epoch": 0.1758605140538103, + "flos": 14872383653760.0, + "grad_norm": 1.8390265932495413, + "language_loss": 0.72686321, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.75394869, + "num_input_tokens_seen": 63246245, + "router_z_loss_clip": 2.37109375, + "router_z_loss_mlp": 0.28149414, + "step": 2925, + "time_per_iteration": 2.7982075214385986 + }, + { + "auxiliary_loss_clip": 0.01623006, + "auxiliary_loss_mlp": 0.01051523, + "balance_loss_clip": 1.39615619, + "balance_loss_mlp": 1.02327085, + "epoch": 0.17592063730647828, + "flos": 23014971216000.0, + "grad_norm": 1.622814802581916, + "language_loss": 0.71985406, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.74659932, + "num_input_tokens_seen": 63267790, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.28271484, + "step": 2926, + "time_per_iteration": 2.894348621368408 + }, + { + "auxiliary_loss_clip": 0.01653045, + "auxiliary_loss_mlp": 0.01046597, + "balance_loss_clip": 1.41603494, + "balance_loss_mlp": 1.01860631, + "epoch": 0.17598076055914624, + "flos": 20751448170240.0, + "grad_norm": 1.7417307172847563, + "language_loss": 0.86313659, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.89013302, + "num_input_tokens_seen": 63286830, + "router_z_loss_clip": 2.36914062, + "router_z_loss_mlp": 0.2800293, + "step": 2927, + "time_per_iteration": 2.902040958404541 + }, + { + "auxiliary_loss_clip": 0.01623528, + "auxiliary_loss_mlp": 0.01046521, + "balance_loss_clip": 1.39614916, + "balance_loss_mlp": 1.02087879, + "epoch": 0.1760408838118142, + "flos": 34105112997120.0, + "grad_norm": 3.294574193743095, + "language_loss": 0.72379196, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.75049245, + "num_input_tokens_seen": 63308870, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.25634766, + "step": 2928, + "time_per_iteration": 2.943513870239258 + }, + { + "auxiliary_loss_clip": 0.01619244, + "auxiliary_loss_mlp": 0.01045458, + "balance_loss_clip": 1.3934741, + "balance_loss_mlp": 1.01945829, + "epoch": 0.1761010070644822, + "flos": 24178499228160.0, + "grad_norm": 1.9034731352695664, + "language_loss": 0.83693314, + "learning_rate": 3.780232677305744e-06, + "loss": 0.86358017, + "num_input_tokens_seen": 63329005, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.26037598, + "step": 2929, + "time_per_iteration": 2.8696253299713135 + }, + { + "auxiliary_loss_clip": 0.01630605, + "auxiliary_loss_mlp": 0.01047196, + "balance_loss_clip": 1.40099072, + "balance_loss_mlp": 1.02151799, + "epoch": 0.17616113031715017, + "flos": 26587502864640.0, + "grad_norm": 1.698358140683655, + "language_loss": 0.80336797, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.83014596, + "num_input_tokens_seen": 63349390, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.25695801, + "step": 2930, + "time_per_iteration": 2.8695695400238037 + }, + { + "auxiliary_loss_clip": 0.01642028, + "auxiliary_loss_mlp": 0.01046963, + "balance_loss_clip": 1.40805709, + "balance_loss_mlp": 1.01892519, + "epoch": 0.17622125356981813, + "flos": 25677808035840.0, + "grad_norm": 2.0377069883495555, + "language_loss": 0.78336084, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.81025082, + "num_input_tokens_seen": 63368835, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.28027344, + "step": 2931, + "time_per_iteration": 2.900754690170288 + }, + { + "auxiliary_loss_clip": 0.01627901, + "auxiliary_loss_mlp": 0.0104158, + "balance_loss_clip": 1.3988477, + "balance_loss_mlp": 1.01548505, + "epoch": 0.1762813768224861, + "flos": 16517218296960.0, + "grad_norm": 2.694097764737487, + "language_loss": 0.76317978, + "learning_rate": 3.779699901503696e-06, + "loss": 0.78987461, + "num_input_tokens_seen": 63385220, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.26098633, + "step": 2932, + "time_per_iteration": 2.802534580230713 + }, + { + "auxiliary_loss_clip": 0.01642127, + "auxiliary_loss_mlp": 0.01046134, + "balance_loss_clip": 1.40504503, + "balance_loss_mlp": 1.01962233, + "epoch": 0.17634150007515406, + "flos": 11217542434560.0, + "grad_norm": 2.2659570715040056, + "language_loss": 0.91414493, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.94102752, + "num_input_tokens_seen": 63400865, + "router_z_loss_clip": 2.375, + "router_z_loss_mlp": 0.26550293, + "step": 2933, + "time_per_iteration": 2.8136985301971436 + }, + { + "auxiliary_loss_clip": 0.01621127, + "auxiliary_loss_mlp": 0.01051061, + "balance_loss_clip": 1.39368033, + "balance_loss_mlp": 1.0239526, + "epoch": 0.17640162332782203, + "flos": 23670516147840.0, + "grad_norm": 1.6739750940612477, + "language_loss": 0.88702762, + "learning_rate": 3.779344380192448e-06, + "loss": 0.91374946, + "num_input_tokens_seen": 63421390, + "router_z_loss_clip": 2.2734375, + "router_z_loss_mlp": 0.27124023, + "step": 2934, + "time_per_iteration": 2.8906068801879883 + }, + { + "auxiliary_loss_clip": 0.01617, + "auxiliary_loss_mlp": 0.01044954, + "balance_loss_clip": 1.39147055, + "balance_loss_mlp": 1.01988423, + "epoch": 0.17646174658049, + "flos": 53815348408320.0, + "grad_norm": 1.5956990718187216, + "language_loss": 0.71835947, + "learning_rate": 3.779166518324077e-06, + "loss": 0.74497896, + "num_input_tokens_seen": 63444715, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.25036621, + "step": 2935, + "time_per_iteration": 3.1136581897735596 + }, + { + "auxiliary_loss_clip": 0.01656298, + "auxiliary_loss_mlp": 0.01043461, + "balance_loss_clip": 1.41606569, + "balance_loss_mlp": 1.0168891, + "epoch": 0.17652186983315798, + "flos": 24254655505920.0, + "grad_norm": 1.9671984420325368, + "language_loss": 0.71587646, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.74287403, + "num_input_tokens_seen": 63465525, + "router_z_loss_clip": 2.40234375, + "router_z_loss_mlp": 0.26574707, + "step": 2936, + "time_per_iteration": 2.9083011150360107 + }, + { + "auxiliary_loss_clip": 0.01635802, + "auxiliary_loss_mlp": 0.0104069, + "balance_loss_clip": 1.40728438, + "balance_loss_mlp": 1.01568007, + "epoch": 0.17658199308582595, + "flos": 27465815784960.0, + "grad_norm": 3.6092565786113657, + "language_loss": 0.72567838, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.75244331, + "num_input_tokens_seen": 63485815, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.25012207, + "step": 2937, + "time_per_iteration": 2.8812270164489746 + }, + { + "auxiliary_loss_clip": 0.01645452, + "auxiliary_loss_mlp": 0.01046486, + "balance_loss_clip": 1.40912163, + "balance_loss_mlp": 1.02016497, + "epoch": 0.17664211633849392, + "flos": 22428750597120.0, + "grad_norm": 2.5949458988276564, + "language_loss": 0.77297288, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.79989225, + "num_input_tokens_seen": 63503905, + "router_z_loss_clip": 2.36523438, + "router_z_loss_mlp": 0.26342773, + "step": 2938, + "time_per_iteration": 2.865772247314453 + }, + { + "auxiliary_loss_clip": 0.01644154, + "auxiliary_loss_mlp": 0.01053537, + "balance_loss_clip": 1.40884066, + "balance_loss_mlp": 1.02576137, + "epoch": 0.17670223959116188, + "flos": 24725239119360.0, + "grad_norm": 2.187387106995236, + "language_loss": 0.71918607, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.74616301, + "num_input_tokens_seen": 63521985, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.27783203, + "step": 2939, + "time_per_iteration": 2.8809831142425537 + }, + { + "auxiliary_loss_clip": 0.01633497, + "auxiliary_loss_mlp": 0.01049019, + "balance_loss_clip": 1.40299845, + "balance_loss_mlp": 1.02291214, + "epoch": 0.17676236284382985, + "flos": 22536876965760.0, + "grad_norm": 2.083775139836246, + "language_loss": 0.74803144, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.77485657, + "num_input_tokens_seen": 63539830, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.2611084, + "step": 2940, + "time_per_iteration": 2.8164851665496826 + }, + { + "auxiliary_loss_clip": 0.01655813, + "auxiliary_loss_mlp": 0.01050521, + "balance_loss_clip": 1.42109883, + "balance_loss_mlp": 1.02278125, + "epoch": 0.1768224860964978, + "flos": 12392653115520.0, + "grad_norm": 2.4586285286176697, + "language_loss": 0.87024164, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.89730501, + "num_input_tokens_seen": 63555495, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.27746582, + "step": 2941, + "time_per_iteration": 2.8304901123046875 + }, + { + "auxiliary_loss_clip": 0.01648675, + "auxiliary_loss_mlp": 0.01056187, + "balance_loss_clip": 1.41319454, + "balance_loss_mlp": 1.02882874, + "epoch": 0.1768826093491658, + "flos": 24364184463360.0, + "grad_norm": 2.1935046670056697, + "language_loss": 0.77445686, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.80150551, + "num_input_tokens_seen": 63575290, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.27380371, + "step": 2942, + "time_per_iteration": 2.919424057006836 + }, + { + "auxiliary_loss_clip": 0.01651599, + "auxiliary_loss_mlp": 0.0105045, + "balance_loss_clip": 1.4149276, + "balance_loss_mlp": 1.02362752, + "epoch": 0.17694273260183377, + "flos": 23597662740480.0, + "grad_norm": 1.680600467525539, + "language_loss": 0.80957931, + "learning_rate": 3.77774119516197e-06, + "loss": 0.83659983, + "num_input_tokens_seen": 63594670, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.26855469, + "step": 2943, + "time_per_iteration": 2.9306061267852783 + }, + { + "auxiliary_loss_clip": 0.01661323, + "auxiliary_loss_mlp": 0.01051633, + "balance_loss_clip": 1.42393816, + "balance_loss_mlp": 1.02545428, + "epoch": 0.17700285585450173, + "flos": 26772328448640.0, + "grad_norm": 2.825912607285867, + "language_loss": 0.82061505, + "learning_rate": 3.777562726341155e-06, + "loss": 0.84774458, + "num_input_tokens_seen": 63614780, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.26184082, + "step": 2944, + "time_per_iteration": 4.327883005142212 + }, + { + "auxiliary_loss_clip": 0.01626638, + "auxiliary_loss_mlp": 0.01049496, + "balance_loss_clip": 1.39399886, + "balance_loss_mlp": 1.02352071, + "epoch": 0.1770629791071697, + "flos": 42791318559360.0, + "grad_norm": 1.766634963712301, + "language_loss": 0.7472102, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.77397156, + "num_input_tokens_seen": 63637190, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.26013184, + "step": 2945, + "time_per_iteration": 3.016019821166992 + }, + { + "auxiliary_loss_clip": 0.01648341, + "auxiliary_loss_mlp": 0.01046474, + "balance_loss_clip": 1.41502905, + "balance_loss_mlp": 1.02086735, + "epoch": 0.17712310235983766, + "flos": 17353923984000.0, + "grad_norm": 2.2989242289044394, + "language_loss": 0.7939626, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.82091069, + "num_input_tokens_seen": 63652140, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.25585938, + "step": 2946, + "time_per_iteration": 2.829660177230835 + }, + { + "auxiliary_loss_clip": 0.0163548, + "auxiliary_loss_mlp": 0.01054483, + "balance_loss_clip": 1.40610456, + "balance_loss_mlp": 1.02868652, + "epoch": 0.17718322561250563, + "flos": 23889031125120.0, + "grad_norm": 1.842446015105371, + "language_loss": 0.7738834, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.80078304, + "num_input_tokens_seen": 63671700, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.25793457, + "step": 2947, + "time_per_iteration": 2.955284833908081 + }, + { + "auxiliary_loss_clip": 0.01636186, + "auxiliary_loss_mlp": 0.01048579, + "balance_loss_clip": 1.40516829, + "balance_loss_mlp": 1.0218519, + "epoch": 0.1772433488651736, + "flos": 36480246261120.0, + "grad_norm": 1.9503009720327058, + "language_loss": 0.73518908, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.76203674, + "num_input_tokens_seen": 63691685, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.26721191, + "step": 2948, + "time_per_iteration": 2.9634721279144287 + }, + { + "auxiliary_loss_clip": 0.01609223, + "auxiliary_loss_mlp": 0.01048962, + "balance_loss_clip": 1.38399613, + "balance_loss_mlp": 1.02355814, + "epoch": 0.1773034721178416, + "flos": 26695176785280.0, + "grad_norm": 2.0575593281937725, + "language_loss": 0.82462519, + "learning_rate": 3.776669371292171e-06, + "loss": 0.85120702, + "num_input_tokens_seen": 63711720, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.25415039, + "step": 2949, + "time_per_iteration": 2.907304286956787 + }, + { + "auxiliary_loss_clip": 0.01380706, + "auxiliary_loss_mlp": 0.01095307, + "balance_loss_clip": 1.24239326, + "balance_loss_mlp": 1.07442188, + "epoch": 0.17736359537050955, + "flos": 57146235707520.0, + "grad_norm": 0.7702825210729315, + "language_loss": 0.65063787, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.67539799, + "num_input_tokens_seen": 63776280, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.20898438, + "step": 2950, + "time_per_iteration": 4.854967355728149 + }, + { + "auxiliary_loss_clip": 0.01611571, + "auxiliary_loss_mlp": 0.01046875, + "balance_loss_clip": 1.38550007, + "balance_loss_mlp": 1.02273524, + "epoch": 0.17742371862317752, + "flos": 27209494137600.0, + "grad_norm": 3.1502187614615953, + "language_loss": 0.85082304, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.87740755, + "num_input_tokens_seen": 63797535, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.24133301, + "step": 2951, + "time_per_iteration": 2.8940465450286865 + }, + { + "auxiliary_loss_clip": 0.01628466, + "auxiliary_loss_mlp": 0.01055359, + "balance_loss_clip": 1.39576638, + "balance_loss_mlp": 1.03028917, + "epoch": 0.17748384187584548, + "flos": 20969510699520.0, + "grad_norm": 2.408494408044741, + "language_loss": 0.81750107, + "learning_rate": 3.776132549750806e-06, + "loss": 0.84433931, + "num_input_tokens_seen": 63817045, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.25073242, + "step": 2952, + "time_per_iteration": 2.8993639945983887 + }, + { + "auxiliary_loss_clip": 0.01626639, + "auxiliary_loss_mlp": 0.01049495, + "balance_loss_clip": 1.39694929, + "balance_loss_mlp": 1.02126646, + "epoch": 0.17754396512851345, + "flos": 25020951004800.0, + "grad_norm": 1.7956034835201298, + "language_loss": 0.80914211, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.83590353, + "num_input_tokens_seen": 63837665, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.28222656, + "step": 2953, + "time_per_iteration": 4.326072454452515 + }, + { + "auxiliary_loss_clip": 0.01629756, + "auxiliary_loss_mlp": 0.01048982, + "balance_loss_clip": 1.39822245, + "balance_loss_mlp": 1.02394819, + "epoch": 0.1776040883811814, + "flos": 32064312695040.0, + "grad_norm": 5.316177094787945, + "language_loss": 0.89274025, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.91952765, + "num_input_tokens_seen": 63858455, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.25061035, + "step": 2954, + "time_per_iteration": 4.37099289894104 + }, + { + "auxiliary_loss_clip": 0.01629607, + "auxiliary_loss_mlp": 0.01056179, + "balance_loss_clip": 1.39975619, + "balance_loss_mlp": 1.02997637, + "epoch": 0.17766421163384938, + "flos": 21582588746880.0, + "grad_norm": 1.7321218794579412, + "language_loss": 0.86359781, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.89045566, + "num_input_tokens_seen": 63876935, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.26208496, + "step": 2955, + "time_per_iteration": 2.8261091709136963 + }, + { + "auxiliary_loss_clip": 0.01616259, + "auxiliary_loss_mlp": 0.01046224, + "balance_loss_clip": 1.38821959, + "balance_loss_mlp": 1.02156007, + "epoch": 0.17772433488651737, + "flos": 22429565003520.0, + "grad_norm": 1.6816381996325576, + "language_loss": 0.72227752, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.74890232, + "num_input_tokens_seen": 63896815, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.24682617, + "step": 2956, + "time_per_iteration": 2.9160516262054443 + }, + { + "auxiliary_loss_clip": 0.01604729, + "auxiliary_loss_mlp": 0.01050459, + "balance_loss_clip": 1.38040805, + "balance_loss_mlp": 1.02438831, + "epoch": 0.17778445813918534, + "flos": 25640363324160.0, + "grad_norm": 1.8372343765698587, + "language_loss": 0.83878374, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.86533558, + "num_input_tokens_seen": 63916140, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.26086426, + "step": 2957, + "time_per_iteration": 2.8634071350097656 + }, + { + "auxiliary_loss_clip": 0.01606613, + "auxiliary_loss_mlp": 0.01045167, + "balance_loss_clip": 1.38088655, + "balance_loss_mlp": 1.01963234, + "epoch": 0.1778445813918533, + "flos": 25639639407360.0, + "grad_norm": 1.569165147851712, + "language_loss": 0.75876069, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.78527844, + "num_input_tokens_seen": 63935220, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.25537109, + "step": 2958, + "time_per_iteration": 2.899592638015747 + }, + { + "auxiliary_loss_clip": 0.01632393, + "auxiliary_loss_mlp": 0.01046973, + "balance_loss_clip": 1.40318155, + "balance_loss_mlp": 1.02145004, + "epoch": 0.17790470464452127, + "flos": 22355535231360.0, + "grad_norm": 2.6932169538337267, + "language_loss": 0.81959009, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.84638381, + "num_input_tokens_seen": 63954550, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.25524902, + "step": 2959, + "time_per_iteration": 2.886813163757324 + }, + { + "auxiliary_loss_clip": 0.01638388, + "auxiliary_loss_mlp": 0.01045764, + "balance_loss_clip": 1.40372586, + "balance_loss_mlp": 1.01813138, + "epoch": 0.17796482789718923, + "flos": 18773909377920.0, + "grad_norm": 1.8258093158987274, + "language_loss": 0.5291487, + "learning_rate": 3.774698062689362e-06, + "loss": 0.55599022, + "num_input_tokens_seen": 63972425, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.27624512, + "step": 2960, + "time_per_iteration": 2.872018337249756 + }, + { + "auxiliary_loss_clip": 0.0161701, + "auxiliary_loss_mlp": 0.01048812, + "balance_loss_clip": 1.38705218, + "balance_loss_mlp": 1.02194238, + "epoch": 0.1780249511498572, + "flos": 23451458232960.0, + "grad_norm": 1.796405615850173, + "language_loss": 0.90128624, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.92794448, + "num_input_tokens_seen": 63992165, + "router_z_loss_clip": 2.30078125, + "router_z_loss_mlp": 0.26867676, + "step": 2961, + "time_per_iteration": 2.9000484943389893 + }, + { + "auxiliary_loss_clip": 0.01640946, + "auxiliary_loss_mlp": 0.01050057, + "balance_loss_clip": 1.40679646, + "balance_loss_mlp": 1.02200675, + "epoch": 0.1780850744025252, + "flos": 23377835664000.0, + "grad_norm": 1.5396357020657288, + "language_loss": 0.8010301, + "learning_rate": 3.774338767820631e-06, + "loss": 0.82794011, + "num_input_tokens_seen": 64013470, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.28063965, + "step": 2962, + "time_per_iteration": 2.850339889526367 + }, + { + "auxiliary_loss_clip": 0.01631854, + "auxiliary_loss_mlp": 0.01043336, + "balance_loss_clip": 1.39933097, + "balance_loss_mlp": 1.01647806, + "epoch": 0.17814519765519315, + "flos": 13779944501760.0, + "grad_norm": 1.617243927754289, + "language_loss": 0.76183093, + "learning_rate": 3.774159019458203e-06, + "loss": 0.7885828, + "num_input_tokens_seen": 64030975, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.26867676, + "step": 2963, + "time_per_iteration": 2.890949249267578 + }, + { + "auxiliary_loss_clip": 0.0165682, + "auxiliary_loss_mlp": 0.01047661, + "balance_loss_clip": 1.41783059, + "balance_loss_mlp": 1.01921797, + "epoch": 0.17820532090786112, + "flos": 21985205391360.0, + "grad_norm": 1.473427185578327, + "language_loss": 0.7942754, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.82132018, + "num_input_tokens_seen": 64050075, + "router_z_loss_clip": 2.38867188, + "router_z_loss_mlp": 0.28417969, + "step": 2964, + "time_per_iteration": 2.840341806411743 + }, + { + "auxiliary_loss_clip": 0.01627709, + "auxiliary_loss_mlp": 0.01050203, + "balance_loss_clip": 1.3959794, + "balance_loss_mlp": 1.02404881, + "epoch": 0.17826544416052909, + "flos": 24801576376320.0, + "grad_norm": 1.5932770154028075, + "language_loss": 0.82208145, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.84886056, + "num_input_tokens_seen": 64071920, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.26171875, + "step": 2965, + "time_per_iteration": 2.9253883361816406 + }, + { + "auxiliary_loss_clip": 0.01629136, + "auxiliary_loss_mlp": 0.01048609, + "balance_loss_clip": 1.39808202, + "balance_loss_mlp": 1.02271652, + "epoch": 0.17832556741319705, + "flos": 13887075484800.0, + "grad_norm": 2.705253662378235, + "language_loss": 0.95503563, + "learning_rate": 3.7736193707404698e-06, + "loss": 0.98181301, + "num_input_tokens_seen": 64086835, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.2590332, + "step": 2966, + "time_per_iteration": 3.0353846549987793 + }, + { + "auxiliary_loss_clip": 0.01634378, + "auxiliary_loss_mlp": 0.01048985, + "balance_loss_clip": 1.40333652, + "balance_loss_mlp": 1.02125669, + "epoch": 0.17838569066586502, + "flos": 36653805889920.0, + "grad_norm": 2.467933445685487, + "language_loss": 0.73162389, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.75845754, + "num_input_tokens_seen": 64107360, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.27734375, + "step": 2967, + "time_per_iteration": 2.972776412963867 + }, + { + "auxiliary_loss_clip": 0.01618129, + "auxiliary_loss_mlp": 0.01051719, + "balance_loss_clip": 1.39357269, + "balance_loss_mlp": 1.02550554, + "epoch": 0.17844581391853298, + "flos": 18734745363840.0, + "grad_norm": 1.9656983214698367, + "language_loss": 0.77106416, + "learning_rate": 3.773259268638157e-06, + "loss": 0.79776269, + "num_input_tokens_seen": 64124690, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.2623291, + "step": 2968, + "time_per_iteration": 2.8052916526794434 + }, + { + "auxiliary_loss_clip": 0.01624457, + "auxiliary_loss_mlp": 0.01046865, + "balance_loss_clip": 1.39600265, + "balance_loss_mlp": 1.02067506, + "epoch": 0.17850593717120097, + "flos": 27388980835200.0, + "grad_norm": 2.2786863643561426, + "language_loss": 0.76668841, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.7934016, + "num_input_tokens_seen": 64146315, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.26171875, + "step": 2969, + "time_per_iteration": 2.8895046710968018 + }, + { + "auxiliary_loss_clip": 0.01384487, + "auxiliary_loss_mlp": 0.01019812, + "balance_loss_clip": 1.25388145, + "balance_loss_mlp": 0.99930811, + "epoch": 0.17856606042386894, + "flos": 67024998460800.0, + "grad_norm": 0.8316687423556082, + "language_loss": 0.69017637, + "learning_rate": 3.772898897567171e-06, + "loss": 0.71421933, + "num_input_tokens_seen": 64210875, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.20507812, + "step": 2970, + "time_per_iteration": 3.4533793926239014 + }, + { + "auxiliary_loss_clip": 0.01621727, + "auxiliary_loss_mlp": 0.01047074, + "balance_loss_clip": 1.38884199, + "balance_loss_mlp": 1.01958418, + "epoch": 0.1786261836765369, + "flos": 36990989274240.0, + "grad_norm": 1.9521587244786585, + "language_loss": 0.68928576, + "learning_rate": 3.772718611185505e-06, + "loss": 0.71597379, + "num_input_tokens_seen": 64230740, + "router_z_loss_clip": 2.328125, + "router_z_loss_mlp": 0.27526855, + "step": 2971, + "time_per_iteration": 2.982680559158325 + }, + { + "auxiliary_loss_clip": 0.01617922, + "auxiliary_loss_mlp": 0.01051552, + "balance_loss_clip": 1.38717365, + "balance_loss_mlp": 1.02251291, + "epoch": 0.17868630692920487, + "flos": 24835808707200.0, + "grad_norm": 1.8815234562354384, + "language_loss": 0.90671802, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.93341279, + "num_input_tokens_seen": 64252300, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.2902832, + "step": 2972, + "time_per_iteration": 2.8624298572540283 + }, + { + "auxiliary_loss_clip": 0.01621218, + "auxiliary_loss_mlp": 0.01053471, + "balance_loss_clip": 1.39070415, + "balance_loss_mlp": 1.02699471, + "epoch": 0.17874643018187283, + "flos": 16990561843200.0, + "grad_norm": 1.8729448739556978, + "language_loss": 0.88928127, + "learning_rate": 3.77235783676401e-06, + "loss": 0.9160282, + "num_input_tokens_seen": 64270105, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.26489258, + "step": 2973, + "time_per_iteration": 2.8614211082458496 + }, + { + "auxiliary_loss_clip": 0.0161896, + "auxiliary_loss_mlp": 0.01048597, + "balance_loss_clip": 1.38838482, + "balance_loss_mlp": 1.02125001, + "epoch": 0.1788065534345408, + "flos": 21041956903680.0, + "grad_norm": 8.615802125905443, + "language_loss": 0.76767939, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.79435498, + "num_input_tokens_seen": 64287250, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.27392578, + "step": 2974, + "time_per_iteration": 2.8256115913391113 + }, + { + "auxiliary_loss_clip": 0.01616259, + "auxiliary_loss_mlp": 0.01056347, + "balance_loss_clip": 1.38759375, + "balance_loss_mlp": 1.02801061, + "epoch": 0.17886667668720876, + "flos": 23998243368960.0, + "grad_norm": 2.189199356447207, + "language_loss": 0.76109707, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.78782314, + "num_input_tokens_seen": 64307140, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.28320312, + "step": 2975, + "time_per_iteration": 2.863884925842285 + }, + { + "auxiliary_loss_clip": 0.0160543, + "auxiliary_loss_mlp": 0.0104719, + "balance_loss_clip": 1.3807472, + "balance_loss_mlp": 1.02067769, + "epoch": 0.17892679993987676, + "flos": 25750570953600.0, + "grad_norm": 1.5029797385412778, + "language_loss": 0.7392534, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.76577961, + "num_input_tokens_seen": 64328760, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.26513672, + "step": 2976, + "time_per_iteration": 2.878466844558716 + }, + { + "auxiliary_loss_clip": 0.01589862, + "auxiliary_loss_mlp": 0.01047731, + "balance_loss_clip": 1.37133121, + "balance_loss_mlp": 1.02322197, + "epoch": 0.17898692319254472, + "flos": 25709913861120.0, + "grad_norm": 1.5082773337038489, + "language_loss": 0.77882051, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.8051964, + "num_input_tokens_seen": 64348800, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24511719, + "step": 2977, + "time_per_iteration": 2.975379705429077 + }, + { + "auxiliary_loss_clip": 0.01623631, + "auxiliary_loss_mlp": 0.01054665, + "balance_loss_clip": 1.39691019, + "balance_loss_mlp": 1.02721155, + "epoch": 0.1790470464452127, + "flos": 19327164520320.0, + "grad_norm": 2.054767639507437, + "language_loss": 0.80732775, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.83411068, + "num_input_tokens_seen": 64367955, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.2746582, + "step": 2978, + "time_per_iteration": 2.973639488220215 + }, + { + "auxiliary_loss_clip": 0.01619872, + "auxiliary_loss_mlp": 0.01059272, + "balance_loss_clip": 1.39080286, + "balance_loss_mlp": 1.02975559, + "epoch": 0.17910716969788065, + "flos": 30056342135040.0, + "grad_norm": 1.4666729429733372, + "language_loss": 0.77505744, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.80184889, + "num_input_tokens_seen": 64389805, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.29541016, + "step": 2979, + "time_per_iteration": 4.302835941314697 + }, + { + "auxiliary_loss_clip": 0.01597727, + "auxiliary_loss_mlp": 0.01059374, + "balance_loss_clip": 1.37575638, + "balance_loss_mlp": 1.03106165, + "epoch": 0.17916729295054862, + "flos": 19437236415360.0, + "grad_norm": 1.77361566499179, + "language_loss": 0.70758104, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.73415208, + "num_input_tokens_seen": 64408220, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.28344727, + "step": 2980, + "time_per_iteration": 2.9176223278045654 + }, + { + "auxiliary_loss_clip": 0.01613347, + "auxiliary_loss_mlp": 0.01050449, + "balance_loss_clip": 1.38462484, + "balance_loss_mlp": 1.01902556, + "epoch": 0.17922741620321658, + "flos": 14619817324800.0, + "grad_norm": 1.7247385515337654, + "language_loss": 0.71766913, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.74430716, + "num_input_tokens_seen": 64426380, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.31420898, + "step": 2981, + "time_per_iteration": 2.9716856479644775 + }, + { + "auxiliary_loss_clip": 0.01630647, + "auxiliary_loss_mlp": 0.01055466, + "balance_loss_clip": 1.39822352, + "balance_loss_mlp": 1.02723694, + "epoch": 0.17928753945588458, + "flos": 17174165817600.0, + "grad_norm": 2.117777545824384, + "language_loss": 0.82878584, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.85564697, + "num_input_tokens_seen": 64444355, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.28234863, + "step": 2982, + "time_per_iteration": 2.853656768798828 + }, + { + "auxiliary_loss_clip": 0.01596498, + "auxiliary_loss_mlp": 0.01052374, + "balance_loss_clip": 1.37164021, + "balance_loss_mlp": 1.0243721, + "epoch": 0.17934766270855254, + "flos": 31408405804800.0, + "grad_norm": 1.4726549271428797, + "language_loss": 0.83827752, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.86476624, + "num_input_tokens_seen": 64467800, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.27990723, + "step": 2983, + "time_per_iteration": 3.0725090503692627 + }, + { + "auxiliary_loss_clip": 0.01617942, + "auxiliary_loss_mlp": 0.01052617, + "balance_loss_clip": 1.38429534, + "balance_loss_mlp": 1.02467477, + "epoch": 0.1794077859612205, + "flos": 20824075353600.0, + "grad_norm": 2.0526427173994675, + "language_loss": 0.86775917, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.89446473, + "num_input_tokens_seen": 64487230, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.27954102, + "step": 2984, + "time_per_iteration": 2.9185433387756348 + }, + { + "auxiliary_loss_clip": 0.01621136, + "auxiliary_loss_mlp": 0.01046714, + "balance_loss_clip": 1.39019859, + "balance_loss_mlp": 1.01891446, + "epoch": 0.17946790921388847, + "flos": 28998316293120.0, + "grad_norm": 1.3893461191685796, + "language_loss": 0.89761209, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.92429054, + "num_input_tokens_seen": 64509165, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.27783203, + "step": 2985, + "time_per_iteration": 2.9402990341186523 + }, + { + "auxiliary_loss_clip": 0.01596829, + "auxiliary_loss_mlp": 0.01044294, + "balance_loss_clip": 1.37592387, + "balance_loss_mlp": 1.01803195, + "epoch": 0.17952803246655644, + "flos": 20746290263040.0, + "grad_norm": 1.9970787276600273, + "language_loss": 0.71329236, + "learning_rate": 3.770006252694922e-06, + "loss": 0.7397036, + "num_input_tokens_seen": 64527940, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.26269531, + "step": 2986, + "time_per_iteration": 4.266680479049683 + }, + { + "auxiliary_loss_clip": 0.01596939, + "auxiliary_loss_mlp": 0.01045282, + "balance_loss_clip": 1.37221336, + "balance_loss_mlp": 1.01729202, + "epoch": 0.1795881557192244, + "flos": 28267474734720.0, + "grad_norm": 2.6105307330933787, + "language_loss": 0.78426725, + "learning_rate": 3.769824891588688e-06, + "loss": 0.81068945, + "num_input_tokens_seen": 64545230, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.2800293, + "step": 2987, + "time_per_iteration": 2.8936524391174316 + }, + { + "auxiliary_loss_clip": 0.01625778, + "auxiliary_loss_mlp": 0.01047969, + "balance_loss_clip": 1.39434302, + "balance_loss_mlp": 1.0201931, + "epoch": 0.17964827897189237, + "flos": 18561321469440.0, + "grad_norm": 1.7373078006280476, + "language_loss": 0.79623419, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.82297164, + "num_input_tokens_seen": 64563820, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.27783203, + "step": 2988, + "time_per_iteration": 5.822129726409912 + }, + { + "auxiliary_loss_clip": 0.01383321, + "auxiliary_loss_mlp": 0.01050976, + "balance_loss_clip": 1.25470257, + "balance_loss_mlp": 1.02818298, + "epoch": 0.17970840222456036, + "flos": 58191412026240.0, + "grad_norm": 0.7665769679826688, + "language_loss": 0.62763727, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.65198028, + "num_input_tokens_seen": 64621315, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.22753906, + "step": 2989, + "time_per_iteration": 3.3023786544799805 + }, + { + "auxiliary_loss_clip": 0.01612458, + "auxiliary_loss_mlp": 0.01049321, + "balance_loss_clip": 1.38476038, + "balance_loss_mlp": 1.02121186, + "epoch": 0.17976852547722832, + "flos": 20309893735680.0, + "grad_norm": 1.8565261101538135, + "language_loss": 0.7194463, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.74606407, + "num_input_tokens_seen": 64639885, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.28100586, + "step": 2990, + "time_per_iteration": 2.8790295124053955 + }, + { + "auxiliary_loss_clip": 0.01611332, + "auxiliary_loss_mlp": 0.01052472, + "balance_loss_clip": 1.38038731, + "balance_loss_mlp": 1.02448201, + "epoch": 0.1798286487298963, + "flos": 39682086111360.0, + "grad_norm": 2.0921015494584974, + "language_loss": 0.70240724, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.72904527, + "num_input_tokens_seen": 64661220, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.28027344, + "step": 2991, + "time_per_iteration": 2.9883179664611816 + }, + { + "auxiliary_loss_clip": 0.01600666, + "auxiliary_loss_mlp": 0.01057478, + "balance_loss_clip": 1.37411189, + "balance_loss_mlp": 1.029917, + "epoch": 0.17988877198256426, + "flos": 25531286814720.0, + "grad_norm": 2.011859847521587, + "language_loss": 0.83643544, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.86301684, + "num_input_tokens_seen": 64682530, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.27587891, + "step": 2992, + "time_per_iteration": 2.887079954147339 + }, + { + "auxiliary_loss_clip": 0.01581138, + "auxiliary_loss_mlp": 0.01049317, + "balance_loss_clip": 1.36227262, + "balance_loss_mlp": 1.02284026, + "epoch": 0.17994889523523222, + "flos": 18816873955200.0, + "grad_norm": 1.9624032307797579, + "language_loss": 0.83637226, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.86267686, + "num_input_tokens_seen": 64701025, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.26489258, + "step": 2993, + "time_per_iteration": 2.8448290824890137 + }, + { + "auxiliary_loss_clip": 0.01606904, + "auxiliary_loss_mlp": 0.01050248, + "balance_loss_clip": 1.3782258, + "balance_loss_mlp": 1.02312815, + "epoch": 0.18000901848790019, + "flos": 21113995904640.0, + "grad_norm": 1.7209832391186357, + "language_loss": 0.79388607, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.82045758, + "num_input_tokens_seen": 64719570, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.27124023, + "step": 2994, + "time_per_iteration": 2.8345654010772705 + }, + { + "auxiliary_loss_clip": 0.01616864, + "auxiliary_loss_mlp": 0.01055484, + "balance_loss_clip": 1.38722324, + "balance_loss_mlp": 1.02803016, + "epoch": 0.18006914174056818, + "flos": 19655977616640.0, + "grad_norm": 2.2088684044925695, + "language_loss": 0.81640399, + "learning_rate": 3.768371587287296e-06, + "loss": 0.84312743, + "num_input_tokens_seen": 64738110, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.27453613, + "step": 2995, + "time_per_iteration": 2.866931200027466 + }, + { + "auxiliary_loss_clip": 0.01614502, + "auxiliary_loss_mlp": 0.01049488, + "balance_loss_clip": 1.3873806, + "balance_loss_mlp": 1.02439463, + "epoch": 0.18012926499323614, + "flos": 19509230171520.0, + "grad_norm": 1.5732920666537045, + "language_loss": 0.85159761, + "learning_rate": 3.768189622421512e-06, + "loss": 0.87823755, + "num_input_tokens_seen": 64756345, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.25097656, + "step": 2996, + "time_per_iteration": 2.8279361724853516 + }, + { + "auxiliary_loss_clip": 0.01591131, + "auxiliary_loss_mlp": 0.01051631, + "balance_loss_clip": 1.3711952, + "balance_loss_mlp": 1.02519035, + "epoch": 0.1801893882459041, + "flos": 19474183434240.0, + "grad_norm": 1.5162828933255263, + "language_loss": 0.88859338, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.915021, + "num_input_tokens_seen": 64776375, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.2644043, + "step": 2997, + "time_per_iteration": 2.891843557357788 + }, + { + "auxiliary_loss_clip": 0.01612156, + "auxiliary_loss_mlp": 0.01051717, + "balance_loss_clip": 1.3797996, + "balance_loss_mlp": 1.02421546, + "epoch": 0.18024951149857207, + "flos": 26881993140480.0, + "grad_norm": 1.715242369542393, + "language_loss": 0.86400568, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.89064431, + "num_input_tokens_seen": 64796210, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.27478027, + "step": 2998, + "time_per_iteration": 2.875412702560425 + }, + { + "auxiliary_loss_clip": 0.01595914, + "auxiliary_loss_mlp": 0.01046023, + "balance_loss_clip": 1.37371993, + "balance_loss_mlp": 1.0191294, + "epoch": 0.18030963475124004, + "flos": 30238724499840.0, + "grad_norm": 1.555222437926751, + "language_loss": 0.85787821, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.88429755, + "num_input_tokens_seen": 64818590, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.26928711, + "step": 2999, + "time_per_iteration": 2.9452645778656006 + }, + { + "auxiliary_loss_clip": 0.01599154, + "auxiliary_loss_mlp": 0.01047772, + "balance_loss_clip": 1.37348986, + "balance_loss_mlp": 1.01975775, + "epoch": 0.180369758003908, + "flos": 22317095134080.0, + "grad_norm": 3.1077099751266144, + "language_loss": 0.76249516, + "learning_rate": 3.76746109252814e-06, + "loss": 0.78896439, + "num_input_tokens_seen": 64838350, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.28015137, + "step": 3000, + "time_per_iteration": 2.9183127880096436 + }, + { + "auxiliary_loss_clip": 0.0158489, + "auxiliary_loss_mlp": 0.01054641, + "balance_loss_clip": 1.36469936, + "balance_loss_mlp": 1.02654326, + "epoch": 0.18042988125657597, + "flos": 23742419414400.0, + "grad_norm": 1.7054782225155767, + "language_loss": 0.7210319, + "learning_rate": 3.76727879248177e-06, + "loss": 0.74742723, + "num_input_tokens_seen": 64858065, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.28100586, + "step": 3001, + "time_per_iteration": 2.851259231567383 + }, + { + "auxiliary_loss_clip": 0.01607875, + "auxiliary_loss_mlp": 0.01048931, + "balance_loss_clip": 1.37844682, + "balance_loss_mlp": 1.02139354, + "epoch": 0.18049000450924396, + "flos": 24103202601600.0, + "grad_norm": 2.2414862607342507, + "language_loss": 0.90090787, + "learning_rate": 3.767096425420011e-06, + "loss": 0.92747593, + "num_input_tokens_seen": 64877305, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.2755127, + "step": 3002, + "time_per_iteration": 3.0116467475891113 + }, + { + "auxiliary_loss_clip": 0.01595342, + "auxiliary_loss_mlp": 0.01046933, + "balance_loss_clip": 1.37004817, + "balance_loss_mlp": 1.01984906, + "epoch": 0.18055012776191193, + "flos": 22173152866560.0, + "grad_norm": 2.0011063982639956, + "language_loss": 0.81863594, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.84505874, + "num_input_tokens_seen": 64896955, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.27099609, + "step": 3003, + "time_per_iteration": 2.8391895294189453 + }, + { + "auxiliary_loss_clip": 0.01595438, + "auxiliary_loss_mlp": 0.01048747, + "balance_loss_clip": 1.36877847, + "balance_loss_mlp": 1.01992202, + "epoch": 0.1806102510145799, + "flos": 28925372396160.0, + "grad_norm": 1.9514632307231041, + "language_loss": 0.68379909, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.71024096, + "num_input_tokens_seen": 64917080, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.28808594, + "step": 3004, + "time_per_iteration": 2.925210952758789 + }, + { + "auxiliary_loss_clip": 0.01604791, + "auxiliary_loss_mlp": 0.01043112, + "balance_loss_clip": 1.37688792, + "balance_loss_mlp": 1.01576591, + "epoch": 0.18067037426724786, + "flos": 19034936484480.0, + "grad_norm": 1.668662036338473, + "language_loss": 0.86156332, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.88804239, + "num_input_tokens_seen": 64935215, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.2734375, + "step": 3005, + "time_per_iteration": 2.8200674057006836 + }, + { + "auxiliary_loss_clip": 0.01585038, + "auxiliary_loss_mlp": 0.01045773, + "balance_loss_clip": 1.36439681, + "balance_loss_mlp": 1.01903439, + "epoch": 0.18073049751991582, + "flos": 27465001378560.0, + "grad_norm": 1.4200395099157528, + "language_loss": 0.8350445, + "learning_rate": 3.766366287157432e-06, + "loss": 0.86135268, + "num_input_tokens_seen": 64956275, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.26745605, + "step": 3006, + "time_per_iteration": 2.8907268047332764 + }, + { + "auxiliary_loss_clip": 0.01595843, + "auxiliary_loss_mlp": 0.01051116, + "balance_loss_clip": 1.37060285, + "balance_loss_mlp": 1.02229154, + "epoch": 0.1807906207725838, + "flos": 28740320588160.0, + "grad_norm": 2.080788231387073, + "language_loss": 0.78320837, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.80967796, + "num_input_tokens_seen": 64979390, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.2878418, + "step": 3007, + "time_per_iteration": 2.9011638164520264 + }, + { + "auxiliary_loss_clip": 0.01372586, + "auxiliary_loss_mlp": 0.0106574, + "balance_loss_clip": 1.2390368, + "balance_loss_mlp": 1.03980052, + "epoch": 0.18085074402525175, + "flos": 64501190208000.0, + "grad_norm": 0.8189027334258258, + "language_loss": 0.56896174, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.59334505, + "num_input_tokens_seen": 65043135, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.25976562, + "step": 3008, + "time_per_iteration": 3.518815279006958 + }, + { + "auxiliary_loss_clip": 0.01599811, + "auxiliary_loss_mlp": 0.01052934, + "balance_loss_clip": 1.37089169, + "balance_loss_mlp": 1.02480125, + "epoch": 0.18091086727791975, + "flos": 23487364621440.0, + "grad_norm": 1.6635814046776505, + "language_loss": 0.69269216, + "learning_rate": 3.765817980138021e-06, + "loss": 0.71921957, + "num_input_tokens_seen": 65062845, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.28149414, + "step": 3009, + "time_per_iteration": 2.9329047203063965 + }, + { + "auxiliary_loss_clip": 0.01601786, + "auxiliary_loss_mlp": 0.0104115, + "balance_loss_clip": 1.37486005, + "balance_loss_mlp": 1.01493585, + "epoch": 0.1809709905305877, + "flos": 24181123426560.0, + "grad_norm": 1.821955060163229, + "language_loss": 0.770661, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.79709041, + "num_input_tokens_seen": 65082110, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.2623291, + "step": 3010, + "time_per_iteration": 2.8764238357543945 + }, + { + "auxiliary_loss_clip": 0.01563878, + "auxiliary_loss_mlp": 0.01043254, + "balance_loss_clip": 1.34680688, + "balance_loss_mlp": 1.0171833, + "epoch": 0.18103111378325568, + "flos": 21660373837440.0, + "grad_norm": 1.7261735831896263, + "language_loss": 0.69120538, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.71727669, + "num_input_tokens_seen": 65101985, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.26098633, + "step": 3011, + "time_per_iteration": 2.834564685821533 + }, + { + "auxiliary_loss_clip": 0.0158813, + "auxiliary_loss_mlp": 0.01048524, + "balance_loss_clip": 1.36339331, + "balance_loss_mlp": 1.02074838, + "epoch": 0.18109123703592364, + "flos": 53705095534080.0, + "grad_norm": 1.6236451804044802, + "language_loss": 0.7213937, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.7477603, + "num_input_tokens_seen": 65129295, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.27783203, + "step": 3012, + "time_per_iteration": 3.198803663253784 + }, + { + "auxiliary_loss_clip": 0.0157766, + "auxiliary_loss_mlp": 0.01048352, + "balance_loss_clip": 1.35901237, + "balance_loss_mlp": 1.02009976, + "epoch": 0.1811513602885916, + "flos": 35859159884160.0, + "grad_norm": 2.1952698678065383, + "language_loss": 0.63972127, + "learning_rate": 3.765085966704609e-06, + "loss": 0.66598141, + "num_input_tokens_seen": 65150625, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.28222656, + "step": 3013, + "time_per_iteration": 2.95149302482605 + }, + { + "auxiliary_loss_clip": 0.01596709, + "auxiliary_loss_mlp": 0.0104878, + "balance_loss_clip": 1.37229824, + "balance_loss_mlp": 1.02182722, + "epoch": 0.18121148354125957, + "flos": 23743098086400.0, + "grad_norm": 1.6542926420986608, + "language_loss": 0.768538, + "learning_rate": 3.764902795998309e-06, + "loss": 0.79499292, + "num_input_tokens_seen": 65170880, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26953125, + "step": 3014, + "time_per_iteration": 4.232020378112793 + }, + { + "auxiliary_loss_clip": 0.01621062, + "auxiliary_loss_mlp": 0.01046864, + "balance_loss_clip": 1.38858461, + "balance_loss_mlp": 1.01808667, + "epoch": 0.18127160679392756, + "flos": 28739415692160.0, + "grad_norm": 2.6504177648213, + "language_loss": 0.66336465, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.69004393, + "num_input_tokens_seen": 65192530, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.2878418, + "step": 3015, + "time_per_iteration": 2.9526240825653076 + }, + { + "auxiliary_loss_clip": 0.01588193, + "auxiliary_loss_mlp": 0.01042533, + "balance_loss_clip": 1.36676002, + "balance_loss_mlp": 1.01636672, + "epoch": 0.18133173004659553, + "flos": 20494674074880.0, + "grad_norm": 1.6132220761490403, + "language_loss": 0.78970933, + "learning_rate": 3.764536253816785e-06, + "loss": 0.81601655, + "num_input_tokens_seen": 65211675, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.26196289, + "step": 3016, + "time_per_iteration": 2.9327392578125 + }, + { + "auxiliary_loss_clip": 0.01616927, + "auxiliary_loss_mlp": 0.01055377, + "balance_loss_clip": 1.3875432, + "balance_loss_mlp": 1.02869785, + "epoch": 0.1813918532992635, + "flos": 22861120337280.0, + "grad_norm": 2.639956202503164, + "language_loss": 0.84328079, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.87000376, + "num_input_tokens_seen": 65231185, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.26647949, + "step": 3017, + "time_per_iteration": 2.880951404571533 + }, + { + "auxiliary_loss_clip": 0.01589937, + "auxiliary_loss_mlp": 0.01039348, + "balance_loss_clip": 1.37006938, + "balance_loss_mlp": 1.01376581, + "epoch": 0.18145197655193146, + "flos": 36078941715840.0, + "grad_norm": 1.7469010677750592, + "language_loss": 0.68389225, + "learning_rate": 3.764169443989697e-06, + "loss": 0.71018517, + "num_input_tokens_seen": 65251645, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.25598145, + "step": 3018, + "time_per_iteration": 3.005589246749878 + }, + { + "auxiliary_loss_clip": 0.0159934, + "auxiliary_loss_mlp": 0.01041548, + "balance_loss_clip": 1.37099195, + "balance_loss_mlp": 1.01526237, + "epoch": 0.18151209980459942, + "flos": 24034421226240.0, + "grad_norm": 2.2229472042761564, + "language_loss": 0.77318674, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.79959565, + "num_input_tokens_seen": 65271125, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.26318359, + "step": 3019, + "time_per_iteration": 2.8761940002441406 + }, + { + "auxiliary_loss_clip": 0.01605896, + "auxiliary_loss_mlp": 0.01051326, + "balance_loss_clip": 1.37794065, + "balance_loss_mlp": 1.02355015, + "epoch": 0.1815722230572674, + "flos": 23962246490880.0, + "grad_norm": 1.9909950487827597, + "language_loss": 0.82786679, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.85443902, + "num_input_tokens_seen": 65290600, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.27746582, + "step": 3020, + "time_per_iteration": 4.235971450805664 + }, + { + "auxiliary_loss_clip": 0.01594797, + "auxiliary_loss_mlp": 0.01044329, + "balance_loss_clip": 1.37069631, + "balance_loss_mlp": 1.01848459, + "epoch": 0.18163234630993536, + "flos": 24396426023040.0, + "grad_norm": 2.262727653193273, + "language_loss": 0.79409486, + "learning_rate": 3.763618727535352e-06, + "loss": 0.82048607, + "num_input_tokens_seen": 65311040, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.25830078, + "step": 3021, + "time_per_iteration": 2.8747997283935547 + }, + { + "auxiliary_loss_clip": 0.01573456, + "auxiliary_loss_mlp": 0.01049085, + "balance_loss_clip": 1.35263133, + "balance_loss_mlp": 1.02111816, + "epoch": 0.18169246956260335, + "flos": 24692137908480.0, + "grad_norm": 1.5309916704481707, + "language_loss": 0.85972619, + "learning_rate": 3.763435021621422e-06, + "loss": 0.88595164, + "num_input_tokens_seen": 65332115, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.27954102, + "step": 3022, + "time_per_iteration": 2.898169755935669 + }, + { + "auxiliary_loss_clip": 0.01590368, + "auxiliary_loss_mlp": 0.01047169, + "balance_loss_clip": 1.36484706, + "balance_loss_mlp": 1.02044272, + "epoch": 0.1817525928152713, + "flos": 24253931589120.0, + "grad_norm": 1.7296631339070057, + "language_loss": 0.70935148, + "learning_rate": 3.763251248837859e-06, + "loss": 0.73572683, + "num_input_tokens_seen": 65352210, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.26721191, + "step": 3023, + "time_per_iteration": 5.948983192443848 + }, + { + "auxiliary_loss_clip": 0.0157593, + "auxiliary_loss_mlp": 0.01045735, + "balance_loss_clip": 1.35315979, + "balance_loss_mlp": 1.02000976, + "epoch": 0.18181271606793928, + "flos": 16480768970880.0, + "grad_norm": 1.6427241051333257, + "language_loss": 0.75114805, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.77736473, + "num_input_tokens_seen": 65370600, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.25744629, + "step": 3024, + "time_per_iteration": 2.8479886054992676 + }, + { + "auxiliary_loss_clip": 0.01581776, + "auxiliary_loss_mlp": 0.01041632, + "balance_loss_clip": 1.35910428, + "balance_loss_mlp": 1.0165143, + "epoch": 0.18187283932060724, + "flos": 18588405121920.0, + "grad_norm": 2.716885197252315, + "language_loss": 0.89975691, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.92599094, + "num_input_tokens_seen": 65387270, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.25097656, + "step": 3025, + "time_per_iteration": 2.8938727378845215 + }, + { + "auxiliary_loss_clip": 0.01585352, + "auxiliary_loss_mlp": 0.01050309, + "balance_loss_clip": 1.36393666, + "balance_loss_mlp": 1.02416599, + "epoch": 0.1819329625732752, + "flos": 20276566300800.0, + "grad_norm": 1.7875175134911327, + "language_loss": 0.79645306, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.82280964, + "num_input_tokens_seen": 65406550, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.26147461, + "step": 3026, + "time_per_iteration": 2.8368606567382812 + }, + { + "auxiliary_loss_clip": 0.0160756, + "auxiliary_loss_mlp": 0.01045796, + "balance_loss_clip": 1.38061786, + "balance_loss_mlp": 1.01896167, + "epoch": 0.18199308582594317, + "flos": 25924583030400.0, + "grad_norm": 1.7312307695549096, + "language_loss": 0.76823127, + "learning_rate": 3.762515489146692e-06, + "loss": 0.79476482, + "num_input_tokens_seen": 65425955, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.26806641, + "step": 3027, + "time_per_iteration": 2.8911874294281006 + }, + { + "auxiliary_loss_clip": 0.01610078, + "auxiliary_loss_mlp": 0.01052672, + "balance_loss_clip": 1.37770486, + "balance_loss_mlp": 1.02511096, + "epoch": 0.18205320907861114, + "flos": 15385931844480.0, + "grad_norm": 2.07349572309412, + "language_loss": 0.86137521, + "learning_rate": 3.762331382119546e-06, + "loss": 0.88800275, + "num_input_tokens_seen": 65442820, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.27563477, + "step": 3028, + "time_per_iteration": 2.7945613861083984 + }, + { + "auxiliary_loss_clip": 0.01600691, + "auxiliary_loss_mlp": 0.01046652, + "balance_loss_clip": 1.3737191, + "balance_loss_mlp": 1.02019906, + "epoch": 0.18211333233127913, + "flos": 25633893317760.0, + "grad_norm": 1.6982789691425906, + "language_loss": 0.83440506, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.86087847, + "num_input_tokens_seen": 65461825, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.26452637, + "step": 3029, + "time_per_iteration": 2.88871431350708 + }, + { + "auxiliary_loss_clip": 0.01613224, + "auxiliary_loss_mlp": 0.01053521, + "balance_loss_clip": 1.38581991, + "balance_loss_mlp": 1.02662754, + "epoch": 0.1821734555839471, + "flos": 14984265340800.0, + "grad_norm": 1.8990703094625885, + "language_loss": 0.79456413, + "learning_rate": 3.761962967588891e-06, + "loss": 0.82123154, + "num_input_tokens_seen": 65479480, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.26904297, + "step": 3030, + "time_per_iteration": 2.8022425174713135 + }, + { + "auxiliary_loss_clip": 0.01601537, + "auxiliary_loss_mlp": 0.01054784, + "balance_loss_clip": 1.37310338, + "balance_loss_mlp": 1.02797365, + "epoch": 0.18223357883661506, + "flos": 20203758138240.0, + "grad_norm": 2.42246219127193, + "language_loss": 0.86471856, + "learning_rate": 3.761778660099352e-06, + "loss": 0.89128178, + "num_input_tokens_seen": 65497775, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.26818848, + "step": 3031, + "time_per_iteration": 2.833371639251709 + }, + { + "auxiliary_loss_clip": 0.01602777, + "auxiliary_loss_mlp": 0.01051704, + "balance_loss_clip": 1.37483692, + "balance_loss_mlp": 1.02472734, + "epoch": 0.18229370208928303, + "flos": 15240451253760.0, + "grad_norm": 1.6932489490197948, + "language_loss": 0.81326652, + "learning_rate": 3.76159428580299e-06, + "loss": 0.83981133, + "num_input_tokens_seen": 65516505, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.27001953, + "step": 3032, + "time_per_iteration": 2.7804315090179443 + }, + { + "auxiliary_loss_clip": 0.0161761, + "auxiliary_loss_mlp": 0.01059246, + "balance_loss_clip": 1.38292086, + "balance_loss_mlp": 1.03330612, + "epoch": 0.182353825341951, + "flos": 23850636272640.0, + "grad_norm": 1.9536347406283008, + "language_loss": 0.82120931, + "learning_rate": 3.761409844706795e-06, + "loss": 0.84797788, + "num_input_tokens_seen": 65536160, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.2598877, + "step": 3033, + "time_per_iteration": 2.872609853744507 + }, + { + "auxiliary_loss_clip": 0.01342822, + "auxiliary_loss_mlp": 0.01041541, + "balance_loss_clip": 1.20859289, + "balance_loss_mlp": 1.01655507, + "epoch": 0.18241394859461896, + "flos": 61217430007680.0, + "grad_norm": 0.8938365965950319, + "language_loss": 0.63502586, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.6588695, + "num_input_tokens_seen": 65589375, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.25, + "step": 3034, + "time_per_iteration": 3.263434648513794 + }, + { + "auxiliary_loss_clip": 0.0159771, + "auxiliary_loss_mlp": 0.01049952, + "balance_loss_clip": 1.37023187, + "balance_loss_mlp": 1.0226295, + "epoch": 0.18247407184728695, + "flos": 18479057143680.0, + "grad_norm": 1.9252464848018913, + "language_loss": 0.81432569, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.84080231, + "num_input_tokens_seen": 65606720, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.27331543, + "step": 3035, + "time_per_iteration": 2.8233375549316406 + }, + { + "auxiliary_loss_clip": 0.01577208, + "auxiliary_loss_mlp": 0.01051572, + "balance_loss_clip": 1.35646927, + "balance_loss_mlp": 1.02501202, + "epoch": 0.18253419509995492, + "flos": 21804270860160.0, + "grad_norm": 1.9406629030865525, + "language_loss": 0.85605156, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.88233942, + "num_input_tokens_seen": 65625495, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.26586914, + "step": 3036, + "time_per_iteration": 2.9440064430236816 + }, + { + "auxiliary_loss_clip": 0.01566036, + "auxiliary_loss_mlp": 0.01041941, + "balance_loss_clip": 1.35066855, + "balance_loss_mlp": 1.01570296, + "epoch": 0.18259431835262288, + "flos": 20157490690560.0, + "grad_norm": 1.87785190411536, + "language_loss": 0.80992097, + "learning_rate": 3.760671412463617e-06, + "loss": 0.83600074, + "num_input_tokens_seen": 65643515, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.26245117, + "step": 3037, + "time_per_iteration": 2.8726749420166016 + }, + { + "auxiliary_loss_clip": 0.0158822, + "auxiliary_loss_mlp": 0.01051862, + "balance_loss_clip": 1.36233878, + "balance_loss_mlp": 1.02322817, + "epoch": 0.18265444160529085, + "flos": 16990154640000.0, + "grad_norm": 3.3174196948556665, + "language_loss": 0.81417024, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.84057105, + "num_input_tokens_seen": 65658155, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.28662109, + "step": 3038, + "time_per_iteration": 2.8423168659210205 + }, + { + "auxiliary_loss_clip": 0.01576221, + "auxiliary_loss_mlp": 0.01052155, + "balance_loss_clip": 1.35638511, + "balance_loss_mlp": 1.02566695, + "epoch": 0.1827145648579588, + "flos": 34436640781440.0, + "grad_norm": 2.0546133293214526, + "language_loss": 0.68376994, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.71005368, + "num_input_tokens_seen": 65679310, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.26489258, + "step": 3039, + "time_per_iteration": 2.9753291606903076 + }, + { + "auxiliary_loss_clip": 0.01590552, + "auxiliary_loss_mlp": 0.0105184, + "balance_loss_clip": 1.36614609, + "balance_loss_mlp": 1.02629352, + "epoch": 0.18277468811062678, + "flos": 53305465046400.0, + "grad_norm": 1.7009241634455452, + "language_loss": 0.74687195, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.77329582, + "num_input_tokens_seen": 65705235, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.25585938, + "step": 3040, + "time_per_iteration": 3.1346511840820312 + }, + { + "auxiliary_loss_clip": 0.01584328, + "auxiliary_loss_mlp": 0.01054913, + "balance_loss_clip": 1.36361814, + "balance_loss_mlp": 1.02613616, + "epoch": 0.18283481136329474, + "flos": 31663415352960.0, + "grad_norm": 1.600453395815543, + "language_loss": 0.61668408, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.64307648, + "num_input_tokens_seen": 65727575, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.28808594, + "step": 3041, + "time_per_iteration": 2.9518566131591797 + }, + { + "auxiliary_loss_clip": 0.01596128, + "auxiliary_loss_mlp": 0.01051832, + "balance_loss_clip": 1.37067115, + "balance_loss_mlp": 1.02429461, + "epoch": 0.18289493461596273, + "flos": 53155188506880.0, + "grad_norm": 1.4857973684734103, + "language_loss": 0.60755801, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.63403767, + "num_input_tokens_seen": 65751370, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.27539062, + "step": 3042, + "time_per_iteration": 3.129289388656616 + }, + { + "auxiliary_loss_clip": 0.01590056, + "auxiliary_loss_mlp": 0.01050955, + "balance_loss_clip": 1.3668561, + "balance_loss_mlp": 1.02275014, + "epoch": 0.1829550578686307, + "flos": 25599344273280.0, + "grad_norm": 1.482706649275083, + "language_loss": 0.88618481, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.91259497, + "num_input_tokens_seen": 65771040, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.28222656, + "step": 3043, + "time_per_iteration": 2.871034860610962 + }, + { + "auxiliary_loss_clip": 0.01589577, + "auxiliary_loss_mlp": 0.01046097, + "balance_loss_clip": 1.36364007, + "balance_loss_mlp": 1.01882195, + "epoch": 0.18301518112129866, + "flos": 22611630654720.0, + "grad_norm": 1.8556504998421095, + "language_loss": 0.71551585, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.74187255, + "num_input_tokens_seen": 65789345, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.27294922, + "step": 3044, + "time_per_iteration": 2.83789324760437 + }, + { + "auxiliary_loss_clip": 0.01620762, + "auxiliary_loss_mlp": 0.01052758, + "balance_loss_clip": 1.39048076, + "balance_loss_mlp": 1.02423084, + "epoch": 0.18307530437396663, + "flos": 34033797912960.0, + "grad_norm": 2.6027762387892523, + "language_loss": 0.6581406, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.68487585, + "num_input_tokens_seen": 65810990, + "router_z_loss_clip": 2.30273438, + "router_z_loss_mlp": 0.28527832, + "step": 3045, + "time_per_iteration": 2.992873191833496 + }, + { + "auxiliary_loss_clip": 0.0160302, + "auxiliary_loss_mlp": 0.01049151, + "balance_loss_clip": 1.3807795, + "balance_loss_mlp": 1.02334189, + "epoch": 0.1831354276266346, + "flos": 21287510288640.0, + "grad_norm": 2.6264895227660583, + "language_loss": 0.80077684, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.82729852, + "num_input_tokens_seen": 65827230, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.2578125, + "step": 3046, + "time_per_iteration": 2.885042428970337 + }, + { + "auxiliary_loss_clip": 0.01601826, + "auxiliary_loss_mlp": 0.0105003, + "balance_loss_clip": 1.37252033, + "balance_loss_mlp": 1.02184868, + "epoch": 0.18319555087930256, + "flos": 21042861799680.0, + "grad_norm": 1.7252790470988058, + "language_loss": 0.79782617, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.82434469, + "num_input_tokens_seen": 65845900, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.28161621, + "step": 3047, + "time_per_iteration": 2.8199899196624756 + }, + { + "auxiliary_loss_clip": 0.01589127, + "auxiliary_loss_mlp": 0.01048802, + "balance_loss_clip": 1.36805117, + "balance_loss_mlp": 1.02219462, + "epoch": 0.18325567413197055, + "flos": 34395350261760.0, + "grad_norm": 2.8035015328506825, + "language_loss": 0.81619906, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.84257835, + "num_input_tokens_seen": 65868730, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26635742, + "step": 3048, + "time_per_iteration": 2.9440200328826904 + }, + { + "auxiliary_loss_clip": 0.01598112, + "auxiliary_loss_mlp": 0.01048835, + "balance_loss_clip": 1.37411475, + "balance_loss_mlp": 1.02250195, + "epoch": 0.18331579738463852, + "flos": 20568206154240.0, + "grad_norm": 1.9058231202847264, + "language_loss": 0.87774944, + "learning_rate": 3.758449708105424e-06, + "loss": 0.90421903, + "num_input_tokens_seen": 65888420, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.26367188, + "step": 3049, + "time_per_iteration": 4.247037887573242 + }, + { + "auxiliary_loss_clip": 0.01623745, + "auxiliary_loss_mlp": 0.01057916, + "balance_loss_clip": 1.38744581, + "balance_loss_mlp": 1.03025973, + "epoch": 0.18337592063730648, + "flos": 19616858847360.0, + "grad_norm": 2.7706086819088855, + "language_loss": 0.78558272, + "learning_rate": 3.75826413248424e-06, + "loss": 0.81239933, + "num_input_tokens_seen": 65905840, + "router_z_loss_clip": 2.36132812, + "router_z_loss_mlp": 0.27661133, + "step": 3050, + "time_per_iteration": 2.787618398666382 + }, + { + "auxiliary_loss_clip": 0.01598559, + "auxiliary_loss_mlp": 0.01055698, + "balance_loss_clip": 1.37307954, + "balance_loss_mlp": 1.02916241, + "epoch": 0.18343604388997445, + "flos": 20860841393280.0, + "grad_norm": 2.213225992882047, + "language_loss": 1.00510514, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.03164768, + "num_input_tokens_seen": 65922845, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.265625, + "step": 3051, + "time_per_iteration": 2.9342830181121826 + }, + { + "auxiliary_loss_clip": 0.01600912, + "auxiliary_loss_mlp": 0.01059523, + "balance_loss_clip": 1.37968016, + "balance_loss_mlp": 1.0330112, + "epoch": 0.1834961671426424, + "flos": 24406379879040.0, + "grad_norm": 1.4188719815068753, + "language_loss": 0.87281567, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.89942002, + "num_input_tokens_seen": 65945555, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.26501465, + "step": 3052, + "time_per_iteration": 2.8994123935699463 + }, + { + "auxiliary_loss_clip": 0.01590446, + "auxiliary_loss_mlp": 0.01056849, + "balance_loss_clip": 1.36940038, + "balance_loss_mlp": 1.03183913, + "epoch": 0.18355629039531038, + "flos": 21261512511360.0, + "grad_norm": 1.7822702781039785, + "language_loss": 0.74749172, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.77396464, + "num_input_tokens_seen": 65963965, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.25, + "step": 3053, + "time_per_iteration": 2.835766315460205 + }, + { + "auxiliary_loss_clip": 0.01621529, + "auxiliary_loss_mlp": 0.01069215, + "balance_loss_clip": 1.39365673, + "balance_loss_mlp": 1.04053307, + "epoch": 0.18361641364797834, + "flos": 28667874384000.0, + "grad_norm": 1.8770222838086152, + "language_loss": 0.63112718, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.65803462, + "num_input_tokens_seen": 65985965, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.28710938, + "step": 3054, + "time_per_iteration": 2.9282031059265137 + }, + { + "auxiliary_loss_clip": 0.01600925, + "auxiliary_loss_mlp": 0.01048639, + "balance_loss_clip": 1.37507975, + "balance_loss_mlp": 1.02399874, + "epoch": 0.18367653690064634, + "flos": 20927496263040.0, + "grad_norm": 1.828336230725532, + "language_loss": 0.79577047, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.8222661, + "num_input_tokens_seen": 66005645, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.24658203, + "step": 3055, + "time_per_iteration": 2.8252155780792236 + }, + { + "auxiliary_loss_clip": 0.01591882, + "auxiliary_loss_mlp": 0.01069175, + "balance_loss_clip": 1.37298083, + "balance_loss_mlp": 1.04404533, + "epoch": 0.1837366601533143, + "flos": 28776905648640.0, + "grad_norm": 1.7063745000853896, + "language_loss": 0.70883799, + "learning_rate": 3.757149278859014e-06, + "loss": 0.73544854, + "num_input_tokens_seen": 66025675, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.25109863, + "step": 3056, + "time_per_iteration": 4.451047420501709 + }, + { + "auxiliary_loss_clip": 0.01593275, + "auxiliary_loss_mlp": 0.01056205, + "balance_loss_clip": 1.3692323, + "balance_loss_mlp": 1.03180289, + "epoch": 0.18379678340598227, + "flos": 21261286287360.0, + "grad_norm": 1.7862100494979685, + "language_loss": 0.80993521, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.83643007, + "num_input_tokens_seen": 66046125, + "router_z_loss_clip": 2.23730469, + "router_z_loss_mlp": 0.24414062, + "step": 3057, + "time_per_iteration": 4.323470115661621 + }, + { + "auxiliary_loss_clip": 0.01633036, + "auxiliary_loss_mlp": 0.01066663, + "balance_loss_clip": 1.39809716, + "balance_loss_mlp": 1.03910196, + "epoch": 0.18385690665865023, + "flos": 20459491603200.0, + "grad_norm": 2.0278979092414064, + "language_loss": 0.83253753, + "learning_rate": 3.756777127858533e-06, + "loss": 0.8595345, + "num_input_tokens_seen": 66064375, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.27612305, + "step": 3058, + "time_per_iteration": 4.268897533416748 + }, + { + "auxiliary_loss_clip": 0.01604676, + "auxiliary_loss_mlp": 0.0105737, + "balance_loss_clip": 1.37649333, + "balance_loss_mlp": 1.03183579, + "epoch": 0.1839170299113182, + "flos": 26151694519680.0, + "grad_norm": 2.289172897646812, + "language_loss": 0.86897445, + "learning_rate": 3.756590952429017e-06, + "loss": 0.89559484, + "num_input_tokens_seen": 66084590, + "router_z_loss_clip": 2.28320312, + "router_z_loss_mlp": 0.2557373, + "step": 3059, + "time_per_iteration": 2.980544328689575 + }, + { + "auxiliary_loss_clip": 0.01598213, + "auxiliary_loss_mlp": 0.01050105, + "balance_loss_clip": 1.37391472, + "balance_loss_mlp": 1.02494025, + "epoch": 0.18397715316398616, + "flos": 31770274867200.0, + "grad_norm": 1.5384141609670152, + "language_loss": 0.73199058, + "learning_rate": 3.756404710389396e-06, + "loss": 0.75847375, + "num_input_tokens_seen": 66107105, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.25170898, + "step": 3060, + "time_per_iteration": 3.0302295684814453 + }, + { + "auxiliary_loss_clip": 0.01592993, + "auxiliary_loss_mlp": 0.01051649, + "balance_loss_clip": 1.36662543, + "balance_loss_mlp": 1.02480352, + "epoch": 0.18403727641665413, + "flos": 24623718491520.0, + "grad_norm": 1.5342711444995145, + "language_loss": 0.73482126, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.76126766, + "num_input_tokens_seen": 66129295, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.26843262, + "step": 3061, + "time_per_iteration": 2.931584358215332 + }, + { + "auxiliary_loss_clip": 0.0160961, + "auxiliary_loss_mlp": 0.01051453, + "balance_loss_clip": 1.38447404, + "balance_loss_mlp": 1.02643096, + "epoch": 0.18409739966932212, + "flos": 23450146133760.0, + "grad_norm": 1.6370563509800602, + "language_loss": 0.81887186, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.84548247, + "num_input_tokens_seen": 66146910, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.25024414, + "step": 3062, + "time_per_iteration": 2.819403886795044 + }, + { + "auxiliary_loss_clip": 0.01610298, + "auxiliary_loss_mlp": 0.01053328, + "balance_loss_clip": 1.3805604, + "balance_loss_mlp": 1.02593398, + "epoch": 0.18415752292199009, + "flos": 21882236929920.0, + "grad_norm": 1.8567265541757927, + "language_loss": 0.747298, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.77393425, + "num_input_tokens_seen": 66165370, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.27441406, + "step": 3063, + "time_per_iteration": 2.861109495162964 + }, + { + "auxiliary_loss_clip": 0.01589805, + "auxiliary_loss_mlp": 0.01042849, + "balance_loss_clip": 1.36544287, + "balance_loss_mlp": 1.0183754, + "epoch": 0.18421764617465805, + "flos": 25421576878080.0, + "grad_norm": 1.6457789439200605, + "language_loss": 0.66683215, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.69315869, + "num_input_tokens_seen": 66186210, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.24475098, + "step": 3064, + "time_per_iteration": 2.8432421684265137 + }, + { + "auxiliary_loss_clip": 0.01597441, + "auxiliary_loss_mlp": 0.01049273, + "balance_loss_clip": 1.37401342, + "balance_loss_mlp": 1.02174747, + "epoch": 0.18427776942732602, + "flos": 27209403648000.0, + "grad_norm": 2.013671463011374, + "language_loss": 0.69988918, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.72635639, + "num_input_tokens_seen": 66204800, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.27490234, + "step": 3065, + "time_per_iteration": 2.8677608966827393 + }, + { + "auxiliary_loss_clip": 0.01611328, + "auxiliary_loss_mlp": 0.01049372, + "balance_loss_clip": 1.38190746, + "balance_loss_mlp": 1.02231133, + "epoch": 0.18433789267999398, + "flos": 27863138787840.0, + "grad_norm": 3.2088341141597363, + "language_loss": 0.73982489, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.76643187, + "num_input_tokens_seen": 66222195, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.27062988, + "step": 3066, + "time_per_iteration": 2.869561195373535 + }, + { + "auxiliary_loss_clip": 0.01600077, + "auxiliary_loss_mlp": 0.01043098, + "balance_loss_clip": 1.37322176, + "balance_loss_mlp": 1.01608515, + "epoch": 0.18439801593266195, + "flos": 17865571893120.0, + "grad_norm": 2.1749540738599484, + "language_loss": 0.83962184, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.86605358, + "num_input_tokens_seen": 66239505, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.2701416, + "step": 3067, + "time_per_iteration": 2.826500415802002 + }, + { + "auxiliary_loss_clip": 0.0134491, + "auxiliary_loss_mlp": 0.01030496, + "balance_loss_clip": 1.21193385, + "balance_loss_mlp": 1.00531864, + "epoch": 0.18445813918532994, + "flos": 56418561285120.0, + "grad_norm": 0.7960857806283291, + "language_loss": 0.59891534, + "learning_rate": 3.754912376956657e-06, + "loss": 0.62266934, + "num_input_tokens_seen": 66295695, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.25195312, + "step": 3068, + "time_per_iteration": 3.22337007522583 + }, + { + "auxiliary_loss_clip": 0.01589571, + "auxiliary_loss_mlp": 0.01050094, + "balance_loss_clip": 1.36847401, + "balance_loss_mlp": 1.02222323, + "epoch": 0.1845182624379979, + "flos": 20966343563520.0, + "grad_norm": 1.6228102970853626, + "language_loss": 0.7708047, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.7972014, + "num_input_tokens_seen": 66315315, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.27880859, + "step": 3069, + "time_per_iteration": 2.852297067642212 + }, + { + "auxiliary_loss_clip": 0.01607469, + "auxiliary_loss_mlp": 0.01046422, + "balance_loss_clip": 1.38000727, + "balance_loss_mlp": 1.01863384, + "epoch": 0.18457838569066587, + "flos": 20494809809280.0, + "grad_norm": 1.8623436679341008, + "language_loss": 0.86134636, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.88788533, + "num_input_tokens_seen": 66333675, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.27770996, + "step": 3070, + "time_per_iteration": 2.9017724990844727 + }, + { + "auxiliary_loss_clip": 0.01607099, + "auxiliary_loss_mlp": 0.01046627, + "balance_loss_clip": 1.38117766, + "balance_loss_mlp": 1.01903057, + "epoch": 0.18463850894333383, + "flos": 25020996249600.0, + "grad_norm": 1.856812338504629, + "language_loss": 0.78813571, + "learning_rate": 3.754351653708265e-06, + "loss": 0.81467301, + "num_input_tokens_seen": 66354075, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.27600098, + "step": 3071, + "time_per_iteration": 2.872328281402588 + }, + { + "auxiliary_loss_clip": 0.01607673, + "auxiliary_loss_mlp": 0.01050075, + "balance_loss_clip": 1.37988663, + "balance_loss_mlp": 1.02277577, + "epoch": 0.1846986321960018, + "flos": 16809446332800.0, + "grad_norm": 2.1287905309281197, + "language_loss": 0.79425293, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.8208304, + "num_input_tokens_seen": 66372520, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.27282715, + "step": 3072, + "time_per_iteration": 2.8471953868865967 + }, + { + "auxiliary_loss_clip": 0.01586199, + "auxiliary_loss_mlp": 0.01044691, + "balance_loss_clip": 1.35995579, + "balance_loss_mlp": 1.01803565, + "epoch": 0.18475875544866976, + "flos": 20824256332800.0, + "grad_norm": 1.736123124276282, + "language_loss": 0.8752318, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.90154076, + "num_input_tokens_seen": 66390745, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.26696777, + "step": 3073, + "time_per_iteration": 2.812178373336792 + }, + { + "auxiliary_loss_clip": 0.01601434, + "auxiliary_loss_mlp": 0.01050143, + "balance_loss_clip": 1.37336254, + "balance_loss_mlp": 1.02303445, + "epoch": 0.18481887870133773, + "flos": 22611630654720.0, + "grad_norm": 2.130435358591496, + "language_loss": 0.93073797, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.95725381, + "num_input_tokens_seen": 66410525, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.27124023, + "step": 3074, + "time_per_iteration": 2.8817856311798096 + }, + { + "auxiliary_loss_clip": 0.0159196, + "auxiliary_loss_mlp": 0.01047226, + "balance_loss_clip": 1.36770582, + "balance_loss_mlp": 1.01911616, + "epoch": 0.18487900195400572, + "flos": 29470302495360.0, + "grad_norm": 1.7090676629681654, + "language_loss": 0.65549338, + "learning_rate": 3.75360309139087e-06, + "loss": 0.68188518, + "num_input_tokens_seen": 66432535, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.28149414, + "step": 3075, + "time_per_iteration": 2.915580987930298 + }, + { + "auxiliary_loss_clip": 0.01586594, + "auxiliary_loss_mlp": 0.01049849, + "balance_loss_clip": 1.36525321, + "balance_loss_mlp": 1.02483916, + "epoch": 0.1849391252066737, + "flos": 20637847180800.0, + "grad_norm": 1.8422215436623135, + "language_loss": 0.73748791, + "learning_rate": 3.753415784551761e-06, + "loss": 0.76385224, + "num_input_tokens_seen": 66450620, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.25012207, + "step": 3076, + "time_per_iteration": 2.81750226020813 + }, + { + "auxiliary_loss_clip": 0.01608004, + "auxiliary_loss_mlp": 0.01049633, + "balance_loss_clip": 1.38103187, + "balance_loss_mlp": 1.02351427, + "epoch": 0.18499924845934165, + "flos": 14436620553600.0, + "grad_norm": 2.2051118359075796, + "language_loss": 0.81717765, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.84375399, + "num_input_tokens_seen": 66467865, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.26147461, + "step": 3077, + "time_per_iteration": 2.808136463165283 + }, + { + "auxiliary_loss_clip": 0.01585913, + "auxiliary_loss_mlp": 0.01047312, + "balance_loss_clip": 1.36635745, + "balance_loss_mlp": 1.02182472, + "epoch": 0.18505937171200962, + "flos": 23736899548800.0, + "grad_norm": 1.7644228152478623, + "language_loss": 0.79103899, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.81737125, + "num_input_tokens_seen": 66486245, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25500488, + "step": 3078, + "time_per_iteration": 2.8228471279144287 + }, + { + "auxiliary_loss_clip": 0.01589914, + "auxiliary_loss_mlp": 0.01049566, + "balance_loss_clip": 1.36853755, + "balance_loss_mlp": 1.02393651, + "epoch": 0.18511949496467758, + "flos": 25968407258880.0, + "grad_norm": 2.381254428289374, + "language_loss": 0.78845322, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.81484807, + "num_input_tokens_seen": 66506510, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.2565918, + "step": 3079, + "time_per_iteration": 2.8915460109710693 + }, + { + "auxiliary_loss_clip": 0.01579392, + "auxiliary_loss_mlp": 0.01048225, + "balance_loss_clip": 1.35788453, + "balance_loss_mlp": 1.02193975, + "epoch": 0.18517961821734555, + "flos": 42428997048960.0, + "grad_norm": 2.1670813518155767, + "language_loss": 0.82928956, + "learning_rate": 3.752665892369369e-06, + "loss": 0.85556567, + "num_input_tokens_seen": 66530960, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.26281738, + "step": 3080, + "time_per_iteration": 3.0204455852508545 + }, + { + "auxiliary_loss_clip": 0.01610891, + "auxiliary_loss_mlp": 0.01054851, + "balance_loss_clip": 1.37876654, + "balance_loss_mlp": 1.02805293, + "epoch": 0.18523974147001354, + "flos": 24108224774400.0, + "grad_norm": 2.346149553027522, + "language_loss": 0.7522769, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.77893436, + "num_input_tokens_seen": 66550275, + "router_z_loss_clip": 2.3203125, + "router_z_loss_mlp": 0.26831055, + "step": 3081, + "time_per_iteration": 2.852064371109009 + }, + { + "auxiliary_loss_clip": 0.01595403, + "auxiliary_loss_mlp": 0.01054909, + "balance_loss_clip": 1.37214148, + "balance_loss_mlp": 1.02870679, + "epoch": 0.1852998647226815, + "flos": 27385225516800.0, + "grad_norm": 2.170746243683545, + "language_loss": 0.72607875, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.75258189, + "num_input_tokens_seen": 66569040, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26245117, + "step": 3082, + "time_per_iteration": 2.949664354324341 + }, + { + "auxiliary_loss_clip": 0.01605714, + "auxiliary_loss_mlp": 0.01047599, + "balance_loss_clip": 1.37885666, + "balance_loss_mlp": 1.02237391, + "epoch": 0.18535998797534947, + "flos": 18341901596160.0, + "grad_norm": 2.0191752026535954, + "language_loss": 0.7183038, + "learning_rate": 3.752102775364407e-06, + "loss": 0.74483693, + "num_input_tokens_seen": 66587775, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.25219727, + "step": 3083, + "time_per_iteration": 2.7898952960968018 + }, + { + "auxiliary_loss_clip": 0.01576542, + "auxiliary_loss_mlp": 0.010496, + "balance_loss_clip": 1.35810089, + "balance_loss_mlp": 1.02507854, + "epoch": 0.18542011122801744, + "flos": 37858307708160.0, + "grad_norm": 2.8464044840684415, + "language_loss": 0.70341402, + "learning_rate": 3.751914936806767e-06, + "loss": 0.72967541, + "num_input_tokens_seen": 66610800, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24523926, + "step": 3084, + "time_per_iteration": 4.505472898483276 + }, + { + "auxiliary_loss_clip": 0.01589471, + "auxiliary_loss_mlp": 0.01045367, + "balance_loss_clip": 1.36871672, + "balance_loss_mlp": 1.02032113, + "epoch": 0.1854802344806854, + "flos": 25195506019200.0, + "grad_norm": 1.5634431893791922, + "language_loss": 0.78721899, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.8135674, + "num_input_tokens_seen": 66630960, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.25061035, + "step": 3085, + "time_per_iteration": 2.9087185859680176 + }, + { + "auxiliary_loss_clip": 0.01577388, + "auxiliary_loss_mlp": 0.01047763, + "balance_loss_clip": 1.35466838, + "balance_loss_mlp": 1.02353978, + "epoch": 0.18554035773335337, + "flos": 26695267274880.0, + "grad_norm": 1.7348941432389504, + "language_loss": 0.74432713, + "learning_rate": 3.751539060400244e-06, + "loss": 0.77057862, + "num_input_tokens_seen": 66650585, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.24230957, + "step": 3086, + "time_per_iteration": 2.859485149383545 + }, + { + "auxiliary_loss_clip": 0.01588021, + "auxiliary_loss_mlp": 0.01049415, + "balance_loss_clip": 1.36376607, + "balance_loss_mlp": 1.02428532, + "epoch": 0.18560048098602133, + "flos": 22357254533760.0, + "grad_norm": 2.200940108815574, + "language_loss": 0.7042948, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.73066914, + "num_input_tokens_seen": 66670045, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.25085449, + "step": 3087, + "time_per_iteration": 2.8246090412139893 + }, + { + "auxiliary_loss_clip": 0.01595528, + "auxiliary_loss_mlp": 0.0105542, + "balance_loss_clip": 1.37228227, + "balance_loss_mlp": 1.02883673, + "epoch": 0.18566060423868933, + "flos": 17757400279680.0, + "grad_norm": 2.0853841219800264, + "language_loss": 0.73787624, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.76438582, + "num_input_tokens_seen": 66688790, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.26574707, + "step": 3088, + "time_per_iteration": 2.8525400161743164 + }, + { + "auxiliary_loss_clip": 0.0157282, + "auxiliary_loss_mlp": 0.01045386, + "balance_loss_clip": 1.35476875, + "balance_loss_mlp": 1.02140141, + "epoch": 0.1857207274913573, + "flos": 24687025246080.0, + "grad_norm": 1.7420464362943386, + "language_loss": 0.92892414, + "learning_rate": 3.7509747476692663e-06, + "loss": 0.9551062, + "num_input_tokens_seen": 66708090, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.23986816, + "step": 3089, + "time_per_iteration": 2.8494317531585693 + }, + { + "auxiliary_loss_clip": 0.01586337, + "auxiliary_loss_mlp": 0.01047262, + "balance_loss_clip": 1.36514401, + "balance_loss_mlp": 1.02294302, + "epoch": 0.18578085074402526, + "flos": 28159619834880.0, + "grad_norm": 4.026178448699983, + "language_loss": 0.58596957, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.61230558, + "num_input_tokens_seen": 66727320, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.2434082, + "step": 3090, + "time_per_iteration": 4.317478179931641 + }, + { + "auxiliary_loss_clip": 0.01574362, + "auxiliary_loss_mlp": 0.01043692, + "balance_loss_clip": 1.35477138, + "balance_loss_mlp": 1.01903987, + "epoch": 0.18584097399669322, + "flos": 23962563204480.0, + "grad_norm": 1.9706383359133086, + "language_loss": 0.82662368, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.85280418, + "num_input_tokens_seen": 66747505, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24682617, + "step": 3091, + "time_per_iteration": 2.8373935222625732 + }, + { + "auxiliary_loss_clip": 0.01596492, + "auxiliary_loss_mlp": 0.01048154, + "balance_loss_clip": 1.37011766, + "balance_loss_mlp": 1.02236867, + "epoch": 0.18590109724936119, + "flos": 17210524654080.0, + "grad_norm": 6.603371660398848, + "language_loss": 0.8560307, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.88247722, + "num_input_tokens_seen": 66766425, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.25805664, + "step": 3092, + "time_per_iteration": 2.8349838256835938 + }, + { + "auxiliary_loss_clip": 0.01596332, + "auxiliary_loss_mlp": 0.01049586, + "balance_loss_clip": 1.36758494, + "balance_loss_mlp": 1.02397943, + "epoch": 0.18596122050202915, + "flos": 17242585234560.0, + "grad_norm": 2.0359302782008353, + "language_loss": 0.94295681, + "learning_rate": 3.750221401168038e-06, + "loss": 0.9694159, + "num_input_tokens_seen": 66781130, + "router_z_loss_clip": 2.28710938, + "router_z_loss_mlp": 0.25622559, + "step": 3093, + "time_per_iteration": 5.73705792427063 + }, + { + "auxiliary_loss_clip": 0.01587062, + "auxiliary_loss_mlp": 0.01050137, + "balance_loss_clip": 1.3635478, + "balance_loss_mlp": 1.02428055, + "epoch": 0.18602134375469712, + "flos": 19028692702080.0, + "grad_norm": 1.7760503156419998, + "language_loss": 0.78137785, + "learning_rate": 3.750032898603443e-06, + "loss": 0.80774987, + "num_input_tokens_seen": 66797535, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.25830078, + "step": 3094, + "time_per_iteration": 2.8374452590942383 + }, + { + "auxiliary_loss_clip": 0.01576762, + "auxiliary_loss_mlp": 0.01048331, + "balance_loss_clip": 1.35852695, + "balance_loss_mlp": 1.02299929, + "epoch": 0.1860814670073651, + "flos": 50967459780480.0, + "grad_norm": 1.5040545335937183, + "language_loss": 0.7075128, + "learning_rate": 3.749844329677425e-06, + "loss": 0.73376375, + "num_input_tokens_seen": 66821720, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.25354004, + "step": 3095, + "time_per_iteration": 3.1230666637420654 + }, + { + "auxiliary_loss_clip": 0.01605037, + "auxiliary_loss_mlp": 0.01050539, + "balance_loss_clip": 1.37517095, + "balance_loss_mlp": 1.02395535, + "epoch": 0.18614159026003307, + "flos": 19400425130880.0, + "grad_norm": 1.9666147567122005, + "language_loss": 0.8160435, + "learning_rate": 3.749655694397135e-06, + "loss": 0.84259927, + "num_input_tokens_seen": 66839060, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.26586914, + "step": 3096, + "time_per_iteration": 2.849066734313965 + }, + { + "auxiliary_loss_clip": 0.01593557, + "auxiliary_loss_mlp": 0.01053632, + "balance_loss_clip": 1.36806417, + "balance_loss_mlp": 1.02838373, + "epoch": 0.18620171351270104, + "flos": 21808795340160.0, + "grad_norm": 1.8990177457502415, + "language_loss": 0.76614517, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.79261708, + "num_input_tokens_seen": 66857760, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.25231934, + "step": 3097, + "time_per_iteration": 2.8417656421661377 + }, + { + "auxiliary_loss_clip": 0.01580902, + "auxiliary_loss_mlp": 0.01043567, + "balance_loss_clip": 1.36206722, + "balance_loss_mlp": 1.01881897, + "epoch": 0.186261836765369, + "flos": 16371601971840.0, + "grad_norm": 2.5236827785577796, + "language_loss": 0.67612875, + "learning_rate": 3.749278224802352e-06, + "loss": 0.70237345, + "num_input_tokens_seen": 66876460, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.24755859, + "step": 3098, + "time_per_iteration": 2.8494958877563477 + }, + { + "auxiliary_loss_clip": 0.01597372, + "auxiliary_loss_mlp": 0.01053154, + "balance_loss_clip": 1.36911786, + "balance_loss_mlp": 1.02535391, + "epoch": 0.18632196001803697, + "flos": 23381093289600.0, + "grad_norm": 1.6121854396080253, + "language_loss": 0.70674115, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.73324645, + "num_input_tokens_seen": 66897960, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.27807617, + "step": 3099, + "time_per_iteration": 2.896617889404297 + }, + { + "auxiliary_loss_clip": 0.01588226, + "auxiliary_loss_mlp": 0.0104522, + "balance_loss_clip": 1.36664987, + "balance_loss_mlp": 1.01929212, + "epoch": 0.18638208327070493, + "flos": 22502192186880.0, + "grad_norm": 1.400517788288609, + "language_loss": 0.72710669, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.75344121, + "num_input_tokens_seen": 66917675, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.25927734, + "step": 3100, + "time_per_iteration": 2.82242488861084 + }, + { + "auxiliary_loss_clip": 0.01585829, + "auxiliary_loss_mlp": 0.01054425, + "balance_loss_clip": 1.36110568, + "balance_loss_mlp": 1.02787685, + "epoch": 0.18644220652337293, + "flos": 29176219422720.0, + "grad_norm": 1.64665895939195, + "language_loss": 0.80493307, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.83133566, + "num_input_tokens_seen": 66936000, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.26574707, + "step": 3101, + "time_per_iteration": 2.8969950675964355 + }, + { + "auxiliary_loss_clip": 0.01570735, + "auxiliary_loss_mlp": 0.01039504, + "balance_loss_clip": 1.35500503, + "balance_loss_mlp": 1.01510143, + "epoch": 0.1865023297760409, + "flos": 24254565016320.0, + "grad_norm": 1.678994472610055, + "language_loss": 0.77573967, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.80184209, + "num_input_tokens_seen": 66955700, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.24414062, + "step": 3102, + "time_per_iteration": 2.8357949256896973 + }, + { + "auxiliary_loss_clip": 0.01583343, + "auxiliary_loss_mlp": 0.01048053, + "balance_loss_clip": 1.35898161, + "balance_loss_mlp": 1.02325714, + "epoch": 0.18656245302870886, + "flos": 19136321377920.0, + "grad_norm": 2.0220148303767194, + "language_loss": 0.77671146, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.80302548, + "num_input_tokens_seen": 66972815, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.24816895, + "step": 3103, + "time_per_iteration": 2.8197343349456787 + }, + { + "auxiliary_loss_clip": 0.01587597, + "auxiliary_loss_mlp": 0.01048269, + "balance_loss_clip": 1.36489046, + "balance_loss_mlp": 1.02284133, + "epoch": 0.18662257628137682, + "flos": 17795252194560.0, + "grad_norm": 1.5422258845471915, + "language_loss": 0.80031168, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.82667035, + "num_input_tokens_seen": 66992280, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.25463867, + "step": 3104, + "time_per_iteration": 2.8937339782714844 + }, + { + "auxiliary_loss_clip": 0.01587057, + "auxiliary_loss_mlp": 0.01046773, + "balance_loss_clip": 1.36821222, + "balance_loss_mlp": 1.02293134, + "epoch": 0.1866826995340448, + "flos": 24034964163840.0, + "grad_norm": 1.9829630349458243, + "language_loss": 0.86477798, + "learning_rate": 3.747954992113354e-06, + "loss": 0.89111626, + "num_input_tokens_seen": 67012220, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.23840332, + "step": 3105, + "time_per_iteration": 2.860485792160034 + }, + { + "auxiliary_loss_clip": 0.01599053, + "auxiliary_loss_mlp": 0.01050577, + "balance_loss_clip": 1.37010515, + "balance_loss_mlp": 1.02442288, + "epoch": 0.18674282278671275, + "flos": 26152554170880.0, + "grad_norm": 2.2503152319466393, + "language_loss": 0.88044596, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.90694225, + "num_input_tokens_seen": 67032030, + "router_z_loss_clip": 2.29296875, + "router_z_loss_mlp": 0.26171875, + "step": 3106, + "time_per_iteration": 2.9390101432800293 + }, + { + "auxiliary_loss_clip": 0.01611999, + "auxiliary_loss_mlp": 0.01050351, + "balance_loss_clip": 1.38261056, + "balance_loss_mlp": 1.02333808, + "epoch": 0.18680294603938072, + "flos": 19209627233280.0, + "grad_norm": 1.7627480877800392, + "language_loss": 0.79199135, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.81861484, + "num_input_tokens_seen": 67048920, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.2701416, + "step": 3107, + "time_per_iteration": 2.808230400085449 + }, + { + "auxiliary_loss_clip": 0.01583091, + "auxiliary_loss_mlp": 0.01049799, + "balance_loss_clip": 1.35752559, + "balance_loss_mlp": 1.0237515, + "epoch": 0.1868630692920487, + "flos": 28555675983360.0, + "grad_norm": 2.0043511386521944, + "language_loss": 0.75405759, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.78038651, + "num_input_tokens_seen": 67068645, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.26049805, + "step": 3108, + "time_per_iteration": 2.871408700942993 + }, + { + "auxiliary_loss_clip": 0.015852, + "auxiliary_loss_mlp": 0.01046015, + "balance_loss_clip": 1.36164904, + "balance_loss_mlp": 1.01958632, + "epoch": 0.18692319254471668, + "flos": 17246838245760.0, + "grad_norm": 1.8943349607537183, + "language_loss": 0.75499725, + "learning_rate": 3.747197400772658e-06, + "loss": 0.78130937, + "num_input_tokens_seen": 67087075, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.26428223, + "step": 3109, + "time_per_iteration": 2.7994892597198486 + }, + { + "auxiliary_loss_clip": 0.01585231, + "auxiliary_loss_mlp": 0.01044649, + "balance_loss_clip": 1.36222601, + "balance_loss_mlp": 1.01905417, + "epoch": 0.18698331579738464, + "flos": 23195543788800.0, + "grad_norm": 1.4826598194296665, + "language_loss": 0.85746121, + "learning_rate": 3.747007837284772e-06, + "loss": 0.88375998, + "num_input_tokens_seen": 67108040, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25610352, + "step": 3110, + "time_per_iteration": 2.8691396713256836 + }, + { + "auxiliary_loss_clip": 0.01597003, + "auxiliary_loss_mlp": 0.01046001, + "balance_loss_clip": 1.3732183, + "balance_loss_mlp": 1.01950121, + "epoch": 0.1870434390500526, + "flos": 25526762334720.0, + "grad_norm": 1.4901486294504438, + "language_loss": 0.85208362, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.87851369, + "num_input_tokens_seen": 67127605, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.26489258, + "step": 3111, + "time_per_iteration": 2.9198365211486816 + }, + { + "auxiliary_loss_clip": 0.01590754, + "auxiliary_loss_mlp": 0.01043847, + "balance_loss_clip": 1.36846089, + "balance_loss_mlp": 1.01894462, + "epoch": 0.18710356230272057, + "flos": 19510406536320.0, + "grad_norm": 1.826205982519557, + "language_loss": 0.77759337, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.80393934, + "num_input_tokens_seen": 67145785, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.24914551, + "step": 3112, + "time_per_iteration": 2.850834608078003 + }, + { + "auxiliary_loss_clip": 0.0158572, + "auxiliary_loss_mlp": 0.0104575, + "balance_loss_clip": 1.36319566, + "balance_loss_mlp": 1.02132416, + "epoch": 0.18716368555538854, + "flos": 26772147469440.0, + "grad_norm": 2.4476659706045685, + "language_loss": 0.65902406, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.6853388, + "num_input_tokens_seen": 67165930, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.2442627, + "step": 3113, + "time_per_iteration": 2.931129217147827 + }, + { + "auxiliary_loss_clip": 0.01593778, + "auxiliary_loss_mlp": 0.01049287, + "balance_loss_clip": 1.36923981, + "balance_loss_mlp": 1.02267981, + "epoch": 0.1872238088080565, + "flos": 25200166233600.0, + "grad_norm": 2.2963519187591355, + "language_loss": 0.83063322, + "learning_rate": 3.746248920938024e-06, + "loss": 0.85706389, + "num_input_tokens_seen": 67185830, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.26635742, + "step": 3114, + "time_per_iteration": 2.8817389011383057 + }, + { + "auxiliary_loss_clip": 0.01591971, + "auxiliary_loss_mlp": 0.01054448, + "balance_loss_clip": 1.36721897, + "balance_loss_mlp": 1.02776921, + "epoch": 0.1872839320607245, + "flos": 24145036058880.0, + "grad_norm": 2.1944214557027775, + "language_loss": 0.5835095, + "learning_rate": 3.74605902628851e-06, + "loss": 0.60997367, + "num_input_tokens_seen": 67206930, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.2668457, + "step": 3115, + "time_per_iteration": 2.9038774967193604 + }, + { + "auxiliary_loss_clip": 0.01598096, + "auxiliary_loss_mlp": 0.01050945, + "balance_loss_clip": 1.37683654, + "balance_loss_mlp": 1.02523112, + "epoch": 0.18734405531339246, + "flos": 21182415321600.0, + "grad_norm": 1.590893337288736, + "language_loss": 0.72453189, + "learning_rate": 3.745869065428261e-06, + "loss": 0.75102222, + "num_input_tokens_seen": 67226290, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.25732422, + "step": 3116, + "time_per_iteration": 2.837353229522705 + }, + { + "auxiliary_loss_clip": 0.01571462, + "auxiliary_loss_mlp": 0.01040123, + "balance_loss_clip": 1.35603535, + "balance_loss_mlp": 1.01668644, + "epoch": 0.18740417856606043, + "flos": 17246431042560.0, + "grad_norm": 3.7191052120659966, + "language_loss": 0.80037874, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.82649457, + "num_input_tokens_seen": 67244410, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.23449707, + "step": 3117, + "time_per_iteration": 2.9050958156585693 + }, + { + "auxiliary_loss_clip": 0.01569233, + "auxiliary_loss_mlp": 0.0104374, + "balance_loss_clip": 1.35454333, + "balance_loss_mlp": 1.01805079, + "epoch": 0.1874643018187284, + "flos": 32569807311360.0, + "grad_norm": 1.547163060619897, + "language_loss": 0.84943068, + "learning_rate": 3.745488945104381e-06, + "loss": 0.8755604, + "num_input_tokens_seen": 67264470, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.25708008, + "step": 3118, + "time_per_iteration": 2.978087902069092 + }, + { + "auxiliary_loss_clip": 0.01590921, + "auxiliary_loss_mlp": 0.01049839, + "balance_loss_clip": 1.36653662, + "balance_loss_mlp": 1.02572274, + "epoch": 0.18752442507139636, + "flos": 23268532930560.0, + "grad_norm": 2.840494128987577, + "language_loss": 0.77771026, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.80411792, + "num_input_tokens_seen": 67284315, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.24133301, + "step": 3119, + "time_per_iteration": 4.25692081451416 + }, + { + "auxiliary_loss_clip": 0.01581536, + "auxiliary_loss_mlp": 0.01046367, + "balance_loss_clip": 1.36002707, + "balance_loss_mlp": 1.02160728, + "epoch": 0.18758454832406432, + "flos": 21770581466880.0, + "grad_norm": 1.7316178218629545, + "language_loss": 0.830284, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.85656303, + "num_input_tokens_seen": 67302780, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.24780273, + "step": 3120, + "time_per_iteration": 2.9192965030670166 + }, + { + "auxiliary_loss_clip": 0.01580911, + "auxiliary_loss_mlp": 0.01043073, + "balance_loss_clip": 1.36139274, + "balance_loss_mlp": 1.01882529, + "epoch": 0.1876446715767323, + "flos": 29582229427200.0, + "grad_norm": 1.8069756972226652, + "language_loss": 0.85813129, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.88437104, + "num_input_tokens_seen": 67323405, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.24243164, + "step": 3121, + "time_per_iteration": 2.9062294960021973 + }, + { + "auxiliary_loss_clip": 0.0159161, + "auxiliary_loss_mlp": 0.01039883, + "balance_loss_clip": 1.37297118, + "balance_loss_mlp": 1.01574278, + "epoch": 0.18770479482940028, + "flos": 30353139895680.0, + "grad_norm": 1.6043620037311233, + "language_loss": 0.71786362, + "learning_rate": 3.744727910244937e-06, + "loss": 0.74417853, + "num_input_tokens_seen": 67345800, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24169922, + "step": 3122, + "time_per_iteration": 2.929694652557373 + }, + { + "auxiliary_loss_clip": 0.01588379, + "auxiliary_loss_mlp": 0.01047755, + "balance_loss_clip": 1.36917233, + "balance_loss_mlp": 1.02278054, + "epoch": 0.18776491808206824, + "flos": 14473522327680.0, + "grad_norm": 3.3403560020744787, + "language_loss": 0.72174466, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.74810606, + "num_input_tokens_seen": 67363575, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.25, + "step": 3123, + "time_per_iteration": 2.8495266437530518 + }, + { + "auxiliary_loss_clip": 0.01576781, + "auxiliary_loss_mlp": 0.01042212, + "balance_loss_clip": 1.3606987, + "balance_loss_mlp": 1.01804841, + "epoch": 0.1878250413347362, + "flos": 24509212606080.0, + "grad_norm": 1.8164713711841194, + "language_loss": 0.75616777, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.78235763, + "num_input_tokens_seen": 67381765, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.24145508, + "step": 3124, + "time_per_iteration": 2.869356155395508 + }, + { + "auxiliary_loss_clip": 0.01584355, + "auxiliary_loss_mlp": 0.01048333, + "balance_loss_clip": 1.36234462, + "balance_loss_mlp": 1.02159429, + "epoch": 0.18788516458740417, + "flos": 39800845008000.0, + "grad_norm": 2.015170282472192, + "language_loss": 0.81808943, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.84441632, + "num_input_tokens_seen": 67405000, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.26745605, + "step": 3125, + "time_per_iteration": 4.401974678039551 + }, + { + "auxiliary_loss_clip": 0.01359135, + "auxiliary_loss_mlp": 0.01033646, + "balance_loss_clip": 1.22696829, + "balance_loss_mlp": 1.01380944, + "epoch": 0.18794528784007214, + "flos": 64728102735360.0, + "grad_norm": 0.9735989894225896, + "language_loss": 0.63697881, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.66090661, + "num_input_tokens_seen": 67467140, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.19824219, + "step": 3126, + "time_per_iteration": 3.4352986812591553 + }, + { + "auxiliary_loss_clip": 0.01581972, + "auxiliary_loss_mlp": 0.01048275, + "balance_loss_clip": 1.36365223, + "balance_loss_mlp": 1.02289557, + "epoch": 0.1880054110927401, + "flos": 28633958766720.0, + "grad_norm": 1.651650625777953, + "language_loss": 0.82102275, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.84732521, + "num_input_tokens_seen": 67487980, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25390625, + "step": 3127, + "time_per_iteration": 4.402543783187866 + }, + { + "auxiliary_loss_clip": 0.01350581, + "auxiliary_loss_mlp": 0.01033535, + "balance_loss_clip": 1.217453, + "balance_loss_mlp": 1.00549686, + "epoch": 0.1880655343454081, + "flos": 64519994062080.0, + "grad_norm": 0.7685551272780244, + "language_loss": 0.6197114, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.64355254, + "num_input_tokens_seen": 67552500, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.28125, + "step": 3128, + "time_per_iteration": 4.892225503921509 + }, + { + "auxiliary_loss_clip": 0.01589486, + "auxiliary_loss_mlp": 0.01047048, + "balance_loss_clip": 1.36585546, + "balance_loss_mlp": 1.02097726, + "epoch": 0.18812565759807606, + "flos": 32137211347200.0, + "grad_norm": 2.1490249533292336, + "language_loss": 0.72626084, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.75262618, + "num_input_tokens_seen": 67573295, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26098633, + "step": 3129, + "time_per_iteration": 3.0445449352264404 + }, + { + "auxiliary_loss_clip": 0.01585369, + "auxiliary_loss_mlp": 0.01046369, + "balance_loss_clip": 1.36416221, + "balance_loss_mlp": 1.02117991, + "epoch": 0.18818578085074403, + "flos": 20632508294400.0, + "grad_norm": 1.9181718491171293, + "language_loss": 0.85652661, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.88284397, + "num_input_tokens_seen": 67590010, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.25195312, + "step": 3130, + "time_per_iteration": 2.950244903564453 + }, + { + "auxiliary_loss_clip": 0.01570039, + "auxiliary_loss_mlp": 0.01044461, + "balance_loss_clip": 1.3491683, + "balance_loss_mlp": 1.01974893, + "epoch": 0.188245904103412, + "flos": 28852338009600.0, + "grad_norm": 1.853391730323631, + "language_loss": 0.77780187, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.80394685, + "num_input_tokens_seen": 67611110, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.24731445, + "step": 3131, + "time_per_iteration": 2.9085958003997803 + }, + { + "auxiliary_loss_clip": 0.01564238, + "auxiliary_loss_mlp": 0.01047548, + "balance_loss_clip": 1.34777474, + "balance_loss_mlp": 1.02071357, + "epoch": 0.18830602735607996, + "flos": 29431274215680.0, + "grad_norm": 1.7790181437957948, + "language_loss": 0.82812393, + "learning_rate": 3.74282069289017e-06, + "loss": 0.85424167, + "num_input_tokens_seen": 67631990, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.26794434, + "step": 3132, + "time_per_iteration": 2.923907995223999 + }, + { + "auxiliary_loss_clip": 0.01602914, + "auxiliary_loss_mlp": 0.01046883, + "balance_loss_clip": 1.376122, + "balance_loss_mlp": 1.02125299, + "epoch": 0.18836615060874792, + "flos": 28883719918080.0, + "grad_norm": 2.0463172809136254, + "language_loss": 0.8052392, + "learning_rate": 3.742629607551614e-06, + "loss": 0.83173716, + "num_input_tokens_seen": 67650490, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.25646973, + "step": 3133, + "time_per_iteration": 2.923058032989502 + }, + { + "auxiliary_loss_clip": 0.01568736, + "auxiliary_loss_mlp": 0.01050731, + "balance_loss_clip": 1.34821379, + "balance_loss_mlp": 1.02437353, + "epoch": 0.18842627386141592, + "flos": 22611947368320.0, + "grad_norm": 1.9597009988494538, + "language_loss": 0.83933455, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.86552918, + "num_input_tokens_seen": 67668860, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26318359, + "step": 3134, + "time_per_iteration": 2.870147466659546 + }, + { + "auxiliary_loss_clip": 0.01575129, + "auxiliary_loss_mlp": 0.0105036, + "balance_loss_clip": 1.35624349, + "balance_loss_mlp": 1.02464628, + "epoch": 0.18848639711408388, + "flos": 24584825946240.0, + "grad_norm": 1.4995475234490754, + "language_loss": 0.83689392, + "learning_rate": 3.742247238639684e-06, + "loss": 0.86314881, + "num_input_tokens_seen": 67690220, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.25744629, + "step": 3135, + "time_per_iteration": 3.0020697116851807 + }, + { + "auxiliary_loss_clip": 0.01572072, + "auxiliary_loss_mlp": 0.01046891, + "balance_loss_clip": 1.35194159, + "balance_loss_mlp": 1.02126086, + "epoch": 0.18854652036675185, + "flos": 34180002420480.0, + "grad_norm": 1.8433972534520813, + "language_loss": 0.79523027, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.82141984, + "num_input_tokens_seen": 67709820, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.25598145, + "step": 3136, + "time_per_iteration": 2.9804697036743164 + }, + { + "auxiliary_loss_clip": 0.01583548, + "auxiliary_loss_mlp": 0.01051345, + "balance_loss_clip": 1.36327267, + "balance_loss_mlp": 1.02534521, + "epoch": 0.1886066436194198, + "flos": 24209157219840.0, + "grad_norm": 1.9329436979961732, + "language_loss": 0.82266217, + "learning_rate": 3.741864605462996e-06, + "loss": 0.84901112, + "num_input_tokens_seen": 67729490, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.26000977, + "step": 3137, + "time_per_iteration": 2.935488224029541 + }, + { + "auxiliary_loss_clip": 0.01586322, + "auxiliary_loss_mlp": 0.01046991, + "balance_loss_clip": 1.36569595, + "balance_loss_mlp": 1.02052689, + "epoch": 0.18866676687208778, + "flos": 21260879084160.0, + "grad_norm": 1.6190350428345992, + "language_loss": 0.81924915, + "learning_rate": 3.741673189793504e-06, + "loss": 0.84558231, + "num_input_tokens_seen": 67749665, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.26501465, + "step": 3138, + "time_per_iteration": 2.8867318630218506 + }, + { + "auxiliary_loss_clip": 0.0159655, + "auxiliary_loss_mlp": 0.01058622, + "balance_loss_clip": 1.37180257, + "balance_loss_mlp": 1.03146625, + "epoch": 0.18872689012475574, + "flos": 37323602933760.0, + "grad_norm": 1.6560756618210744, + "language_loss": 0.64399552, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.67054725, + "num_input_tokens_seen": 67776230, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.2713623, + "step": 3139, + "time_per_iteration": 2.998769760131836 + }, + { + "auxiliary_loss_clip": 0.01572381, + "auxiliary_loss_mlp": 0.01047252, + "balance_loss_clip": 1.35099769, + "balance_loss_mlp": 1.02051353, + "epoch": 0.1887870133774237, + "flos": 21662138384640.0, + "grad_norm": 3.024390013021514, + "language_loss": 0.72151548, + "learning_rate": 3.741290160328514e-06, + "loss": 0.74771184, + "num_input_tokens_seen": 67795080, + "router_z_loss_clip": 2.21191406, + "router_z_loss_mlp": 0.26745605, + "step": 3140, + "time_per_iteration": 2.889688491821289 + }, + { + "auxiliary_loss_clip": 0.01572464, + "auxiliary_loss_mlp": 0.01051052, + "balance_loss_clip": 1.34947443, + "balance_loss_mlp": 1.02344322, + "epoch": 0.1888471366300917, + "flos": 15933169428480.0, + "grad_norm": 3.4574888421783543, + "language_loss": 0.88838363, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.91461879, + "num_input_tokens_seen": 67813110, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.27612305, + "step": 3141, + "time_per_iteration": 2.806288719177246 + }, + { + "auxiliary_loss_clip": 0.01590465, + "auxiliary_loss_mlp": 0.01050153, + "balance_loss_clip": 1.3641001, + "balance_loss_mlp": 1.02395034, + "epoch": 0.18890725988275966, + "flos": 18561230979840.0, + "grad_norm": 1.9605831349115537, + "language_loss": 0.78162777, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.80803394, + "num_input_tokens_seen": 67831070, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.26208496, + "step": 3142, + "time_per_iteration": 2.8683576583862305 + }, + { + "auxiliary_loss_clip": 0.01573383, + "auxiliary_loss_mlp": 0.01045915, + "balance_loss_clip": 1.35593534, + "balance_loss_mlp": 1.01989186, + "epoch": 0.18896738313542763, + "flos": 28852790457600.0, + "grad_norm": 1.625834258340113, + "language_loss": 0.79664505, + "learning_rate": 3.740715120924971e-06, + "loss": 0.82283807, + "num_input_tokens_seen": 67852170, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.26025391, + "step": 3143, + "time_per_iteration": 2.8949053287506104 + }, + { + "auxiliary_loss_clip": 0.01574516, + "auxiliary_loss_mlp": 0.01049702, + "balance_loss_clip": 1.3529563, + "balance_loss_mlp": 1.02239108, + "epoch": 0.1890275063880956, + "flos": 22421285205120.0, + "grad_norm": 2.28951866884211, + "language_loss": 0.72816223, + "learning_rate": 3.740523309097912e-06, + "loss": 0.75440443, + "num_input_tokens_seen": 67869945, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.27331543, + "step": 3144, + "time_per_iteration": 2.8312060832977295 + }, + { + "auxiliary_loss_clip": 0.01579794, + "auxiliary_loss_mlp": 0.01054268, + "balance_loss_clip": 1.35553062, + "balance_loss_mlp": 1.02830446, + "epoch": 0.18908762964076356, + "flos": 24254700750720.0, + "grad_norm": 2.2119200425956254, + "language_loss": 0.7517854, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.77812606, + "num_input_tokens_seen": 67890240, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.25976562, + "step": 3145, + "time_per_iteration": 2.903623342514038 + }, + { + "auxiliary_loss_clip": 0.01560943, + "auxiliary_loss_mlp": 0.01045631, + "balance_loss_clip": 1.34371567, + "balance_loss_mlp": 1.02084708, + "epoch": 0.18914775289343153, + "flos": 16991738208000.0, + "grad_norm": 2.7309401302308483, + "language_loss": 0.77655154, + "learning_rate": 3.740139487448616e-06, + "loss": 0.80261731, + "num_input_tokens_seen": 67907825, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.24804688, + "step": 3146, + "time_per_iteration": 2.843816041946411 + }, + { + "auxiliary_loss_clip": 0.0158161, + "auxiliary_loss_mlp": 0.01040152, + "balance_loss_clip": 1.35920358, + "balance_loss_mlp": 1.01435471, + "epoch": 0.1892078761460995, + "flos": 21553831036800.0, + "grad_norm": 1.7162968386169448, + "language_loss": 0.79957664, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.82579428, + "num_input_tokens_seen": 67926670, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.25793457, + "step": 3147, + "time_per_iteration": 2.886833667755127 + }, + { + "auxiliary_loss_clip": 0.01571583, + "auxiliary_loss_mlp": 0.01046008, + "balance_loss_clip": 1.35192704, + "balance_loss_mlp": 1.02018762, + "epoch": 0.18926799939876748, + "flos": 23011532611200.0, + "grad_norm": 2.199921705597021, + "language_loss": 0.6775803, + "learning_rate": 3.739755401854267e-06, + "loss": 0.70375621, + "num_input_tokens_seen": 67943645, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25830078, + "step": 3148, + "time_per_iteration": 2.8633103370666504 + }, + { + "auxiliary_loss_clip": 0.01575107, + "auxiliary_loss_mlp": 0.01046339, + "balance_loss_clip": 1.35085726, + "balance_loss_mlp": 1.0196718, + "epoch": 0.18932812265143545, + "flos": 22283134272000.0, + "grad_norm": 2.900369567083367, + "language_loss": 0.77392524, + "learning_rate": 3.739563260095902e-06, + "loss": 0.80013967, + "num_input_tokens_seen": 67962345, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.26696777, + "step": 3149, + "time_per_iteration": 2.8908467292785645 + }, + { + "auxiliary_loss_clip": 0.0155615, + "auxiliary_loss_mlp": 0.01052308, + "balance_loss_clip": 1.34022152, + "balance_loss_mlp": 1.02607048, + "epoch": 0.1893882459041034, + "flos": 18633586694400.0, + "grad_norm": 2.048909697756744, + "language_loss": 0.82001507, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.84609967, + "num_input_tokens_seen": 67979760, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.26245117, + "step": 3150, + "time_per_iteration": 2.8559305667877197 + }, + { + "auxiliary_loss_clip": 0.01585718, + "auxiliary_loss_mlp": 0.01050019, + "balance_loss_clip": 1.36180663, + "balance_loss_mlp": 1.02283883, + "epoch": 0.18944836915677138, + "flos": 22903180018560.0, + "grad_norm": 2.198466313895822, + "language_loss": 0.86300385, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.88936126, + "num_input_tokens_seen": 67996895, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.27148438, + "step": 3151, + "time_per_iteration": 2.900867462158203 + }, + { + "auxiliary_loss_clip": 0.01567729, + "auxiliary_loss_mlp": 0.01050215, + "balance_loss_clip": 1.34872818, + "balance_loss_mlp": 1.02293992, + "epoch": 0.18950849240943934, + "flos": 26807646654720.0, + "grad_norm": 1.657777748678977, + "language_loss": 0.75840318, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.78458261, + "num_input_tokens_seen": 68018365, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.27270508, + "step": 3152, + "time_per_iteration": 2.9809913635253906 + }, + { + "auxiliary_loss_clip": 0.01572855, + "auxiliary_loss_mlp": 0.01057012, + "balance_loss_clip": 1.35191989, + "balance_loss_mlp": 1.02996349, + "epoch": 0.1895686156621073, + "flos": 24981606011520.0, + "grad_norm": 1.8178600894816515, + "language_loss": 0.76181173, + "learning_rate": 3.738794033491209e-06, + "loss": 0.78811038, + "num_input_tokens_seen": 68037985, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.27050781, + "step": 3153, + "time_per_iteration": 2.898897886276245 + }, + { + "auxiliary_loss_clip": 0.01575594, + "auxiliary_loss_mlp": 0.01054863, + "balance_loss_clip": 1.35347247, + "balance_loss_mlp": 1.02941132, + "epoch": 0.1896287389147753, + "flos": 21954547399680.0, + "grad_norm": 3.4844041477462473, + "language_loss": 0.80967152, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.83597612, + "num_input_tokens_seen": 68057975, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.25463867, + "step": 3154, + "time_per_iteration": 4.2570414543151855 + }, + { + "auxiliary_loss_clip": 0.0159768, + "auxiliary_loss_mlp": 0.01053287, + "balance_loss_clip": 1.36994326, + "balance_loss_mlp": 1.02614307, + "epoch": 0.18968886216744327, + "flos": 18186150435840.0, + "grad_norm": 2.2987345238783825, + "language_loss": 0.73777759, + "learning_rate": 3.738409024548223e-06, + "loss": 0.76428723, + "num_input_tokens_seen": 68074175, + "router_z_loss_clip": 2.27734375, + "router_z_loss_mlp": 0.2713623, + "step": 3155, + "time_per_iteration": 2.8232548236846924 + }, + { + "auxiliary_loss_clip": 0.01562849, + "auxiliary_loss_mlp": 0.01053507, + "balance_loss_clip": 1.34331608, + "balance_loss_mlp": 1.02717364, + "epoch": 0.18974898542011123, + "flos": 20422182625920.0, + "grad_norm": 1.8368039108003942, + "language_loss": 0.74944377, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.77560735, + "num_input_tokens_seen": 68095230, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.26342773, + "step": 3156, + "time_per_iteration": 2.885035753250122 + }, + { + "auxiliary_loss_clip": 0.01586772, + "auxiliary_loss_mlp": 0.01050511, + "balance_loss_clip": 1.36095393, + "balance_loss_mlp": 1.02477384, + "epoch": 0.1898091086727792, + "flos": 23994850008960.0, + "grad_norm": 2.046962983755719, + "language_loss": 0.69333756, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.71971041, + "num_input_tokens_seen": 68113805, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.25756836, + "step": 3157, + "time_per_iteration": 2.909311056137085 + }, + { + "auxiliary_loss_clip": 0.01583055, + "auxiliary_loss_mlp": 0.01047504, + "balance_loss_clip": 1.36011314, + "balance_loss_mlp": 1.02121854, + "epoch": 0.18986923192544716, + "flos": 27648741087360.0, + "grad_norm": 1.6996184345048786, + "language_loss": 0.81410092, + "learning_rate": 3.737831016747176e-06, + "loss": 0.84040648, + "num_input_tokens_seen": 68133190, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.26306152, + "step": 3158, + "time_per_iteration": 2.900313377380371 + }, + { + "auxiliary_loss_clip": 0.01609863, + "auxiliary_loss_mlp": 0.01050496, + "balance_loss_clip": 1.37944078, + "balance_loss_mlp": 1.02221954, + "epoch": 0.18992935517811513, + "flos": 25495199447040.0, + "grad_norm": 1.6809999893446657, + "language_loss": 0.72908485, + "learning_rate": 3.737638215672964e-06, + "loss": 0.75568849, + "num_input_tokens_seen": 68152330, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.28271484, + "step": 3159, + "time_per_iteration": 2.8512120246887207 + }, + { + "auxiliary_loss_clip": 0.01584003, + "auxiliary_loss_mlp": 0.01047339, + "balance_loss_clip": 1.36071897, + "balance_loss_mlp": 1.02058816, + "epoch": 0.1899894784307831, + "flos": 17429763548160.0, + "grad_norm": 2.1022971506225487, + "language_loss": 0.85579586, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.88210928, + "num_input_tokens_seen": 68170185, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.26757812, + "step": 3160, + "time_per_iteration": 4.165325403213501 + }, + { + "auxiliary_loss_clip": 0.01571868, + "auxiliary_loss_mlp": 0.01051803, + "balance_loss_clip": 1.35514867, + "balance_loss_mlp": 1.02582741, + "epoch": 0.19004960168345109, + "flos": 27504165392640.0, + "grad_norm": 1.6844320090357818, + "language_loss": 0.74285394, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.76909065, + "num_input_tokens_seen": 68191665, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.2598877, + "step": 3161, + "time_per_iteration": 2.8833272457122803 + }, + { + "auxiliary_loss_clip": 0.01575602, + "auxiliary_loss_mlp": 0.01042444, + "balance_loss_clip": 1.35613811, + "balance_loss_mlp": 1.0168494, + "epoch": 0.19010972493611905, + "flos": 38668925128320.0, + "grad_norm": 2.071936983361805, + "language_loss": 0.81926751, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.84544796, + "num_input_tokens_seen": 68214635, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.25598145, + "step": 3162, + "time_per_iteration": 3.012822151184082 + }, + { + "auxiliary_loss_clip": 0.01585004, + "auxiliary_loss_mlp": 0.01042653, + "balance_loss_clip": 1.36145067, + "balance_loss_mlp": 1.01525879, + "epoch": 0.19016984818878702, + "flos": 19254084888960.0, + "grad_norm": 3.3685948786342044, + "language_loss": 0.76738483, + "learning_rate": 3.73686635253511e-06, + "loss": 0.79366142, + "num_input_tokens_seen": 68232150, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.27416992, + "step": 3163, + "time_per_iteration": 5.8205296993255615 + }, + { + "auxiliary_loss_clip": 0.01586474, + "auxiliary_loss_mlp": 0.01051437, + "balance_loss_clip": 1.36879122, + "balance_loss_mlp": 1.02481723, + "epoch": 0.19022997144145498, + "flos": 37610944531200.0, + "grad_norm": 1.7011990888707766, + "language_loss": 0.75612962, + "learning_rate": 3.736673222076982e-06, + "loss": 0.78250873, + "num_input_tokens_seen": 68253370, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.26611328, + "step": 3164, + "time_per_iteration": 3.0000579357147217 + }, + { + "auxiliary_loss_clip": 0.01594567, + "auxiliary_loss_mlp": 0.0104712, + "balance_loss_clip": 1.37244916, + "balance_loss_mlp": 1.01960695, + "epoch": 0.19029009469412295, + "flos": 61551021070080.0, + "grad_norm": 1.4890671270348457, + "language_loss": 0.67479306, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.7012099, + "num_input_tokens_seen": 68278895, + "router_z_loss_clip": 2.22363281, + "router_z_loss_mlp": 0.27514648, + "step": 3165, + "time_per_iteration": 3.2436959743499756 + }, + { + "auxiliary_loss_clip": 0.01587906, + "auxiliary_loss_mlp": 0.01046314, + "balance_loss_clip": 1.36904025, + "balance_loss_mlp": 1.01727414, + "epoch": 0.1903502179467909, + "flos": 13962960293760.0, + "grad_norm": 2.08591175565006, + "language_loss": 0.75765777, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.78399992, + "num_input_tokens_seen": 68294880, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.29040527, + "step": 3166, + "time_per_iteration": 2.8089373111724854 + }, + { + "auxiliary_loss_clip": 0.01358795, + "auxiliary_loss_mlp": 0.01029027, + "balance_loss_clip": 1.22600949, + "balance_loss_mlp": 1.00079811, + "epoch": 0.1904103411994589, + "flos": 66931983855360.0, + "grad_norm": 0.8028460833196479, + "language_loss": 0.505198, + "learning_rate": 3.736093435602968e-06, + "loss": 0.52907622, + "num_input_tokens_seen": 68359665, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.28320312, + "step": 3167, + "time_per_iteration": 3.372403621673584 + }, + { + "auxiliary_loss_clip": 0.0156611, + "auxiliary_loss_mlp": 0.01050964, + "balance_loss_clip": 1.35044646, + "balance_loss_mlp": 1.02459502, + "epoch": 0.19047046445212687, + "flos": 21918912480000.0, + "grad_norm": 1.8639202814268288, + "language_loss": 0.75131738, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.77748811, + "num_input_tokens_seen": 68378950, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.26391602, + "step": 3168, + "time_per_iteration": 2.822801113128662 + }, + { + "auxiliary_loss_clip": 0.01358899, + "auxiliary_loss_mlp": 0.01029057, + "balance_loss_clip": 1.22566354, + "balance_loss_mlp": 1.00235391, + "epoch": 0.19053058770479483, + "flos": 59280593552640.0, + "grad_norm": 0.8927012824871449, + "language_loss": 0.60137618, + "learning_rate": 3.73570658211056e-06, + "loss": 0.6252557, + "num_input_tokens_seen": 68434235, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.26757812, + "step": 3169, + "time_per_iteration": 3.2251923084259033 + }, + { + "auxiliary_loss_clip": 0.01602767, + "auxiliary_loss_mlp": 0.01047079, + "balance_loss_clip": 1.37611556, + "balance_loss_mlp": 1.02037573, + "epoch": 0.1905907109574628, + "flos": 23961839287680.0, + "grad_norm": 1.672524400730476, + "language_loss": 0.79639339, + "learning_rate": 3.735513056633436e-06, + "loss": 0.82289183, + "num_input_tokens_seen": 68453830, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.26721191, + "step": 3170, + "time_per_iteration": 2.870544195175171 + }, + { + "auxiliary_loss_clip": 0.01582535, + "auxiliary_loss_mlp": 0.01045175, + "balance_loss_clip": 1.36352396, + "balance_loss_mlp": 1.01812673, + "epoch": 0.19065083421013077, + "flos": 20821722624000.0, + "grad_norm": 1.762526474556542, + "language_loss": 0.79549479, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.82177192, + "num_input_tokens_seen": 68473005, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.27038574, + "step": 3171, + "time_per_iteration": 2.871091365814209 + }, + { + "auxiliary_loss_clip": 0.01613734, + "auxiliary_loss_mlp": 0.01045354, + "balance_loss_clip": 1.38215482, + "balance_loss_mlp": 1.01849663, + "epoch": 0.19071095746279873, + "flos": 31297429013760.0, + "grad_norm": 2.1491517315433035, + "language_loss": 0.80146438, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.82805526, + "num_input_tokens_seen": 68493470, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.26879883, + "step": 3172, + "time_per_iteration": 3.0304644107818604 + }, + { + "auxiliary_loss_clip": 0.01583974, + "auxiliary_loss_mlp": 0.01050931, + "balance_loss_clip": 1.36377609, + "balance_loss_mlp": 1.02518141, + "epoch": 0.1907710807154667, + "flos": 14364310083840.0, + "grad_norm": 1.9593912954730246, + "language_loss": 0.8177948, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.84414387, + "num_input_tokens_seen": 68511290, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.25756836, + "step": 3173, + "time_per_iteration": 2.82743501663208 + }, + { + "auxiliary_loss_clip": 0.01598949, + "auxiliary_loss_mlp": 0.01048347, + "balance_loss_clip": 1.37270784, + "balance_loss_mlp": 1.02264535, + "epoch": 0.1908312039681347, + "flos": 26918442466560.0, + "grad_norm": 5.630213003996178, + "language_loss": 0.79590869, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.82238162, + "num_input_tokens_seen": 68532575, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.25708008, + "step": 3174, + "time_per_iteration": 2.914959192276001 + }, + { + "auxiliary_loss_clip": 0.01605777, + "auxiliary_loss_mlp": 0.01048846, + "balance_loss_clip": 1.38010073, + "balance_loss_mlp": 1.02207136, + "epoch": 0.19089132722080265, + "flos": 14501013183360.0, + "grad_norm": 2.189019942797107, + "language_loss": 0.82303667, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.84958285, + "num_input_tokens_seen": 68548760, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.26782227, + "step": 3175, + "time_per_iteration": 2.836819887161255 + }, + { + "auxiliary_loss_clip": 0.01607444, + "auxiliary_loss_mlp": 0.01055218, + "balance_loss_clip": 1.38012362, + "balance_loss_mlp": 1.02863431, + "epoch": 0.19095145047347062, + "flos": 13960471829760.0, + "grad_norm": 2.0221176055758874, + "language_loss": 0.87212706, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.89875364, + "num_input_tokens_seen": 68563100, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.26611328, + "step": 3176, + "time_per_iteration": 2.8335506916046143 + }, + { + "auxiliary_loss_clip": 0.0161666, + "auxiliary_loss_mlp": 0.01045417, + "balance_loss_clip": 1.38599944, + "balance_loss_mlp": 1.01696157, + "epoch": 0.19101157372613858, + "flos": 25312862327040.0, + "grad_norm": 1.9335281878751003, + "language_loss": 0.83261561, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.85923642, + "num_input_tokens_seen": 68581650, + "router_z_loss_clip": 2.3046875, + "router_z_loss_mlp": 0.28491211, + "step": 3177, + "time_per_iteration": 2.8637807369232178 + }, + { + "auxiliary_loss_clip": 0.01594735, + "auxiliary_loss_mlp": 0.01042105, + "balance_loss_clip": 1.3718338, + "balance_loss_mlp": 1.01593864, + "epoch": 0.19107169697880655, + "flos": 20567889440640.0, + "grad_norm": 2.0585533787105343, + "language_loss": 0.767537, + "learning_rate": 3.73396248424356e-06, + "loss": 0.79390544, + "num_input_tokens_seen": 68600360, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.26196289, + "step": 3178, + "time_per_iteration": 2.8514297008514404 + }, + { + "auxiliary_loss_clip": 0.01604265, + "auxiliary_loss_mlp": 0.01041751, + "balance_loss_clip": 1.37830174, + "balance_loss_mlp": 1.0154295, + "epoch": 0.19113182023147451, + "flos": 22173198111360.0, + "grad_norm": 1.7623080237531925, + "language_loss": 0.82116759, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.84762776, + "num_input_tokens_seen": 68617885, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.26379395, + "step": 3179, + "time_per_iteration": 2.9932940006256104 + }, + { + "auxiliary_loss_clip": 0.01604155, + "auxiliary_loss_mlp": 0.01044849, + "balance_loss_clip": 1.37981856, + "balance_loss_mlp": 1.01846862, + "epoch": 0.19119194348414248, + "flos": 18589355262720.0, + "grad_norm": 2.473546059181213, + "language_loss": 0.80593711, + "learning_rate": 3.733574183478691e-06, + "loss": 0.83242714, + "num_input_tokens_seen": 68634550, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.26379395, + "step": 3180, + "time_per_iteration": 2.796189069747925 + }, + { + "auxiliary_loss_clip": 0.01593418, + "auxiliary_loss_mlp": 0.01047769, + "balance_loss_clip": 1.37156522, + "balance_loss_mlp": 1.02149546, + "epoch": 0.19125206673681047, + "flos": 19036112849280.0, + "grad_norm": 2.0358117694384505, + "language_loss": 0.80586076, + "learning_rate": 3.733379934486615e-06, + "loss": 0.83227265, + "num_input_tokens_seen": 68651895, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.26269531, + "step": 3181, + "time_per_iteration": 2.809083938598633 + }, + { + "auxiliary_loss_clip": 0.01600307, + "auxiliary_loss_mlp": 0.01048199, + "balance_loss_clip": 1.37592936, + "balance_loss_mlp": 1.0215317, + "epoch": 0.19131218998947844, + "flos": 21700352257920.0, + "grad_norm": 2.2364548236052313, + "language_loss": 0.74678075, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.77326578, + "num_input_tokens_seen": 68671500, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26696777, + "step": 3182, + "time_per_iteration": 2.817657709121704 + }, + { + "auxiliary_loss_clip": 0.01590158, + "auxiliary_loss_mlp": 0.01042887, + "balance_loss_clip": 1.36903358, + "balance_loss_mlp": 1.01670861, + "epoch": 0.1913723132421464, + "flos": 18451747267200.0, + "grad_norm": 1.846974170174953, + "language_loss": 0.66717911, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.69350958, + "num_input_tokens_seen": 68690570, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.26196289, + "step": 3183, + "time_per_iteration": 2.8341546058654785 + }, + { + "auxiliary_loss_clip": 0.01612259, + "auxiliary_loss_mlp": 0.01044866, + "balance_loss_clip": 1.38235044, + "balance_loss_mlp": 1.01837778, + "epoch": 0.19143243649481437, + "flos": 27170963550720.0, + "grad_norm": 1.4884668406026045, + "language_loss": 0.73681599, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.7633872, + "num_input_tokens_seen": 68709735, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.26513672, + "step": 3184, + "time_per_iteration": 2.8783516883850098 + }, + { + "auxiliary_loss_clip": 0.01603129, + "auxiliary_loss_mlp": 0.01050011, + "balance_loss_clip": 1.37638557, + "balance_loss_mlp": 1.02008915, + "epoch": 0.19149255974748233, + "flos": 21727164441600.0, + "grad_norm": 1.772221773319966, + "language_loss": 0.89248055, + "learning_rate": 3.732602281292598e-06, + "loss": 0.91901195, + "num_input_tokens_seen": 68727565, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.29882812, + "step": 3185, + "time_per_iteration": 2.7932181358337402 + }, + { + "auxiliary_loss_clip": 0.01585889, + "auxiliary_loss_mlp": 0.01040624, + "balance_loss_clip": 1.36216378, + "balance_loss_mlp": 1.01405263, + "epoch": 0.1915526830001503, + "flos": 22972821045120.0, + "grad_norm": 1.8431155412777869, + "language_loss": 0.73857898, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.76484406, + "num_input_tokens_seen": 68748110, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.26574707, + "step": 3186, + "time_per_iteration": 2.9249303340911865 + }, + { + "auxiliary_loss_clip": 0.01608186, + "auxiliary_loss_mlp": 0.01049689, + "balance_loss_clip": 1.3814677, + "balance_loss_mlp": 1.02224731, + "epoch": 0.1916128062528183, + "flos": 26151920743680.0, + "grad_norm": 1.8240701997055138, + "language_loss": 0.84880418, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.8753829, + "num_input_tokens_seen": 68769765, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.2746582, + "step": 3187, + "time_per_iteration": 2.8734729290008545 + }, + { + "auxiliary_loss_clip": 0.01351451, + "auxiliary_loss_mlp": 0.01042981, + "balance_loss_clip": 1.2171154, + "balance_loss_mlp": 1.0181855, + "epoch": 0.19167292950548626, + "flos": 54953303829120.0, + "grad_norm": 0.8802834394269181, + "language_loss": 0.55876303, + "learning_rate": 3.732018351516544e-06, + "loss": 0.58270729, + "num_input_tokens_seen": 68826815, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.24707031, + "step": 3188, + "time_per_iteration": 4.797913312911987 + }, + { + "auxiliary_loss_clip": 0.01596319, + "auxiliary_loss_mlp": 0.01051803, + "balance_loss_clip": 1.37255955, + "balance_loss_mlp": 1.02521968, + "epoch": 0.19173305275815422, + "flos": 29947853808000.0, + "grad_norm": 1.5767004351929843, + "language_loss": 0.7047919, + "learning_rate": 3.731823576891397e-06, + "loss": 0.73127306, + "num_input_tokens_seen": 68847585, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26574707, + "step": 3189, + "time_per_iteration": 2.9395627975463867 + }, + { + "auxiliary_loss_clip": 0.01582261, + "auxiliary_loss_mlp": 0.01043256, + "balance_loss_clip": 1.36426187, + "balance_loss_mlp": 1.01753068, + "epoch": 0.1917931760108222, + "flos": 24762819565440.0, + "grad_norm": 1.8111492738173383, + "language_loss": 0.74900985, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.7752651, + "num_input_tokens_seen": 68866620, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25756836, + "step": 3190, + "time_per_iteration": 2.8578155040740967 + }, + { + "auxiliary_loss_clip": 0.01586477, + "auxiliary_loss_mlp": 0.01049947, + "balance_loss_clip": 1.36669457, + "balance_loss_mlp": 1.02397144, + "epoch": 0.19185329926349015, + "flos": 18852825588480.0, + "grad_norm": 2.287782808313809, + "language_loss": 0.85813051, + "learning_rate": 3.73143383063572e-06, + "loss": 0.88449478, + "num_input_tokens_seen": 68885515, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.25976562, + "step": 3191, + "time_per_iteration": 2.8653664588928223 + }, + { + "auxiliary_loss_clip": 0.01587691, + "auxiliary_loss_mlp": 0.01044797, + "balance_loss_clip": 1.36862862, + "balance_loss_mlp": 1.01860666, + "epoch": 0.19191342251615812, + "flos": 22095955958400.0, + "grad_norm": 2.0842952667217585, + "language_loss": 0.90604949, + "learning_rate": 3.73123885901997e-06, + "loss": 0.93237436, + "num_input_tokens_seen": 68903225, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.26184082, + "step": 3192, + "time_per_iteration": 2.844270706176758 + }, + { + "auxiliary_loss_clip": 0.01607469, + "auxiliary_loss_mlp": 0.01052666, + "balance_loss_clip": 1.37988842, + "balance_loss_mlp": 1.02453244, + "epoch": 0.19197354576882608, + "flos": 22208968765440.0, + "grad_norm": 1.9865084382874445, + "language_loss": 0.75272024, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.77932155, + "num_input_tokens_seen": 68922860, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.28173828, + "step": 3193, + "time_per_iteration": 2.8871216773986816 + }, + { + "auxiliary_loss_clip": 0.01599558, + "auxiliary_loss_mlp": 0.0104935, + "balance_loss_clip": 1.37238002, + "balance_loss_mlp": 1.02190816, + "epoch": 0.19203366902149407, + "flos": 24905992671360.0, + "grad_norm": 1.7028475204125064, + "language_loss": 0.75825262, + "learning_rate": 3.730848718849612e-06, + "loss": 0.7847417, + "num_input_tokens_seen": 68943000, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.27441406, + "step": 3194, + "time_per_iteration": 2.9212520122528076 + }, + { + "auxiliary_loss_clip": 0.01333945, + "auxiliary_loss_mlp": 0.01034432, + "balance_loss_clip": 1.19878316, + "balance_loss_mlp": 1.00963616, + "epoch": 0.19209379227416204, + "flos": 68445481547520.0, + "grad_norm": 0.7951739055743999, + "language_loss": 0.68538529, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.70906907, + "num_input_tokens_seen": 69000255, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.24804688, + "step": 3195, + "time_per_iteration": 4.729078769683838 + }, + { + "auxiliary_loss_clip": 0.01597646, + "auxiliary_loss_mlp": 0.01054015, + "balance_loss_clip": 1.374331, + "balance_loss_mlp": 1.027336, + "epoch": 0.19215391552683, + "flos": 22065524190720.0, + "grad_norm": 2.09716449351901, + "language_loss": 0.7462787, + "learning_rate": 3.730458316143429e-06, + "loss": 0.77279532, + "num_input_tokens_seen": 69019665, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.2668457, + "step": 3196, + "time_per_iteration": 2.8374783992767334 + }, + { + "auxiliary_loss_clip": 0.01601126, + "auxiliary_loss_mlp": 0.01049476, + "balance_loss_clip": 1.37838554, + "balance_loss_mlp": 1.02263069, + "epoch": 0.19221403877949797, + "flos": 20312608423680.0, + "grad_norm": 1.7391405017110613, + "language_loss": 0.84154677, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.86805272, + "num_input_tokens_seen": 69039055, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26831055, + "step": 3197, + "time_per_iteration": 2.8432765007019043 + }, + { + "auxiliary_loss_clip": 0.01593632, + "auxiliary_loss_mlp": 0.01049425, + "balance_loss_clip": 1.36735654, + "balance_loss_mlp": 1.02193511, + "epoch": 0.19227416203216594, + "flos": 23195724768000.0, + "grad_norm": 2.488101655320581, + "language_loss": 0.81374812, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.84017873, + "num_input_tokens_seen": 69056370, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.27490234, + "step": 3198, + "time_per_iteration": 4.30849552154541 + }, + { + "auxiliary_loss_clip": 0.01602188, + "auxiliary_loss_mlp": 0.0104692, + "balance_loss_clip": 1.37634027, + "balance_loss_mlp": 1.01913261, + "epoch": 0.1923342852848339, + "flos": 25787879930880.0, + "grad_norm": 3.119331425499335, + "language_loss": 0.79682016, + "learning_rate": 3.729872219959029e-06, + "loss": 0.82331121, + "num_input_tokens_seen": 69075915, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.27770996, + "step": 3199, + "time_per_iteration": 4.211105585098267 + }, + { + "auxiliary_loss_clip": 0.01593932, + "auxiliary_loss_mlp": 0.01044026, + "balance_loss_clip": 1.370785, + "balance_loss_mlp": 1.01815772, + "epoch": 0.19239440853750187, + "flos": 17137083064320.0, + "grad_norm": 2.1531842717003657, + "language_loss": 0.85675061, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.88313019, + "num_input_tokens_seen": 69094145, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25878906, + "step": 3200, + "time_per_iteration": 2.8035526275634766 + }, + { + "auxiliary_loss_clip": 0.01594119, + "auxiliary_loss_mlp": 0.01055649, + "balance_loss_clip": 1.37127519, + "balance_loss_mlp": 1.02897024, + "epoch": 0.19245453179016986, + "flos": 16443640972800.0, + "grad_norm": 2.040986317692293, + "language_loss": 0.80025339, + "learning_rate": 3.729481161172443e-06, + "loss": 0.82675099, + "num_input_tokens_seen": 69111110, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26696777, + "step": 3201, + "time_per_iteration": 2.8297629356384277 + }, + { + "auxiliary_loss_clip": 0.01591571, + "auxiliary_loss_mlp": 0.01046334, + "balance_loss_clip": 1.36688066, + "balance_loss_mlp": 1.01865339, + "epoch": 0.19251465504283782, + "flos": 20239981240320.0, + "grad_norm": 2.21565989526193, + "language_loss": 0.7046659, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.73104495, + "num_input_tokens_seen": 69130280, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.27734375, + "step": 3202, + "time_per_iteration": 2.8338985443115234 + }, + { + "auxiliary_loss_clip": 0.01576039, + "auxiliary_loss_mlp": 0.0104723, + "balance_loss_clip": 1.35748434, + "balance_loss_mlp": 1.01944244, + "epoch": 0.1925747782955058, + "flos": 19473957210240.0, + "grad_norm": 1.7830657217416914, + "language_loss": 0.91912627, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.94535899, + "num_input_tokens_seen": 69149570, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.27783203, + "step": 3203, + "time_per_iteration": 2.8469111919403076 + }, + { + "auxiliary_loss_clip": 0.01599124, + "auxiliary_loss_mlp": 0.01054967, + "balance_loss_clip": 1.37198436, + "balance_loss_mlp": 1.02838349, + "epoch": 0.19263490154817375, + "flos": 17794618767360.0, + "grad_norm": 2.349201103598365, + "language_loss": 0.83755696, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.86409789, + "num_input_tokens_seen": 69168190, + "router_z_loss_clip": 2.27148438, + "router_z_loss_mlp": 0.26635742, + "step": 3204, + "time_per_iteration": 2.830904483795166 + }, + { + "auxiliary_loss_clip": 0.01600775, + "auxiliary_loss_mlp": 0.01045722, + "balance_loss_clip": 1.3787117, + "balance_loss_mlp": 1.01926982, + "epoch": 0.19269502480084172, + "flos": 17465986650240.0, + "grad_norm": 1.8727635968072005, + "language_loss": 0.76045895, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.78692389, + "num_input_tokens_seen": 69186950, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.26464844, + "step": 3205, + "time_per_iteration": 2.8485400676727295 + }, + { + "auxiliary_loss_clip": 0.01615377, + "auxiliary_loss_mlp": 0.01051843, + "balance_loss_clip": 1.38800359, + "balance_loss_mlp": 1.02496076, + "epoch": 0.19275514805350968, + "flos": 21516838773120.0, + "grad_norm": 2.7750057895904674, + "language_loss": 0.8462128, + "learning_rate": 3.728502366649107e-06, + "loss": 0.87288499, + "num_input_tokens_seen": 69204850, + "router_z_loss_clip": 2.27539062, + "router_z_loss_mlp": 0.2689209, + "step": 3206, + "time_per_iteration": 2.8545782566070557 + }, + { + "auxiliary_loss_clip": 0.01320873, + "auxiliary_loss_mlp": 0.01032591, + "balance_loss_clip": 1.18672788, + "balance_loss_mlp": 1.0119921, + "epoch": 0.19281527130617768, + "flos": 47720184871680.0, + "grad_norm": 0.8620259903875853, + "language_loss": 0.6063863, + "learning_rate": 3.728306411079786e-06, + "loss": 0.62992096, + "num_input_tokens_seen": 69259200, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.20605469, + "step": 3207, + "time_per_iteration": 3.2039854526519775 + }, + { + "auxiliary_loss_clip": 0.01596403, + "auxiliary_loss_mlp": 0.01060979, + "balance_loss_clip": 1.36947465, + "balance_loss_mlp": 1.03458595, + "epoch": 0.19287539455884564, + "flos": 11808559002240.0, + "grad_norm": 2.8521803347963894, + "language_loss": 0.76094502, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.78751886, + "num_input_tokens_seen": 69275835, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.26367188, + "step": 3208, + "time_per_iteration": 2.8869776725769043 + }, + { + "auxiliary_loss_clip": 0.01619295, + "auxiliary_loss_mlp": 0.01068733, + "balance_loss_clip": 1.39156532, + "balance_loss_mlp": 1.04157662, + "epoch": 0.1929355178115136, + "flos": 20641195296000.0, + "grad_norm": 1.9126386356728184, + "language_loss": 0.61829174, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.645172, + "num_input_tokens_seen": 69294810, + "router_z_loss_clip": 2.27929688, + "router_z_loss_mlp": 0.27197266, + "step": 3209, + "time_per_iteration": 2.841313362121582 + }, + { + "auxiliary_loss_clip": 0.01602133, + "auxiliary_loss_mlp": 0.01066796, + "balance_loss_clip": 1.375139, + "balance_loss_mlp": 1.04103506, + "epoch": 0.19299564106418157, + "flos": 40822873971840.0, + "grad_norm": 1.8264743848593412, + "language_loss": 0.80984139, + "learning_rate": 3.727718151176243e-06, + "loss": 0.83653069, + "num_input_tokens_seen": 69316065, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.2578125, + "step": 3210, + "time_per_iteration": 3.0539228916168213 + }, + { + "auxiliary_loss_clip": 0.01584488, + "auxiliary_loss_mlp": 0.0106464, + "balance_loss_clip": 1.3660655, + "balance_loss_mlp": 1.03872442, + "epoch": 0.19305576431684954, + "flos": 11368090442880.0, + "grad_norm": 1.9813870026955729, + "language_loss": 0.84017658, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.86666787, + "num_input_tokens_seen": 69332900, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.2590332, + "step": 3211, + "time_per_iteration": 2.8057193756103516 + }, + { + "auxiliary_loss_clip": 0.01319548, + "auxiliary_loss_mlp": 0.01037316, + "balance_loss_clip": 1.18636191, + "balance_loss_mlp": 1.0087055, + "epoch": 0.1931158875695175, + "flos": 54536073114240.0, + "grad_norm": 0.9769680721396009, + "language_loss": 0.63659501, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.66016364, + "num_input_tokens_seen": 69382535, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.28515625, + "step": 3212, + "time_per_iteration": 3.239260196685791 + }, + { + "auxiliary_loss_clip": 0.0159117, + "auxiliary_loss_mlp": 0.01063672, + "balance_loss_clip": 1.37036574, + "balance_loss_mlp": 1.03859043, + "epoch": 0.19317601082218547, + "flos": 19837907533440.0, + "grad_norm": 1.5492710082922068, + "language_loss": 0.77521461, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.801763, + "num_input_tokens_seen": 69400600, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25109863, + "step": 3213, + "time_per_iteration": 2.8692479133605957 + }, + { + "auxiliary_loss_clip": 0.01612785, + "auxiliary_loss_mlp": 0.01067722, + "balance_loss_clip": 1.38442326, + "balance_loss_mlp": 1.04146004, + "epoch": 0.19323613407485346, + "flos": 13159944000000.0, + "grad_norm": 4.360238204682326, + "language_loss": 0.72277731, + "learning_rate": 3.726932887459503e-06, + "loss": 0.74958235, + "num_input_tokens_seen": 69417350, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.26281738, + "step": 3214, + "time_per_iteration": 2.8058691024780273 + }, + { + "auxiliary_loss_clip": 0.01596984, + "auxiliary_loss_mlp": 0.01078921, + "balance_loss_clip": 1.37240791, + "balance_loss_mlp": 1.05035853, + "epoch": 0.19329625732752143, + "flos": 14035225518720.0, + "grad_norm": 2.4609375825808555, + "language_loss": 0.76370353, + "learning_rate": 3.72673640779803e-06, + "loss": 0.79046255, + "num_input_tokens_seen": 69431845, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.2857666, + "step": 3215, + "time_per_iteration": 2.935652732849121 + }, + { + "auxiliary_loss_clip": 0.01591032, + "auxiliary_loss_mlp": 0.01073795, + "balance_loss_clip": 1.37166977, + "balance_loss_mlp": 1.04678226, + "epoch": 0.1933563805801894, + "flos": 23452453618560.0, + "grad_norm": 1.687377028570692, + "language_loss": 0.88767099, + "learning_rate": 3.72653986265854e-06, + "loss": 0.91431922, + "num_input_tokens_seen": 69453275, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.27001953, + "step": 3216, + "time_per_iteration": 2.852292776107788 + }, + { + "auxiliary_loss_clip": 0.01593834, + "auxiliary_loss_mlp": 0.0107024, + "balance_loss_clip": 1.37295151, + "balance_loss_mlp": 1.04552794, + "epoch": 0.19341650383285736, + "flos": 20494990788480.0, + "grad_norm": 1.8937768693185784, + "language_loss": 0.80887246, + "learning_rate": 3.726343252048485e-06, + "loss": 0.83551323, + "num_input_tokens_seen": 69471830, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.24719238, + "step": 3217, + "time_per_iteration": 2.855722665786743 + }, + { + "auxiliary_loss_clip": 0.01627515, + "auxiliary_loss_mlp": 0.01068219, + "balance_loss_clip": 1.39379144, + "balance_loss_mlp": 1.04010987, + "epoch": 0.19347662708552532, + "flos": 17867517419520.0, + "grad_norm": 2.8794496926170363, + "language_loss": 0.64319146, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.67014873, + "num_input_tokens_seen": 69489320, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.28125, + "step": 3218, + "time_per_iteration": 2.773016929626465 + }, + { + "auxiliary_loss_clip": 0.01610255, + "auxiliary_loss_mlp": 0.01069544, + "balance_loss_clip": 1.38459754, + "balance_loss_mlp": 1.04298401, + "epoch": 0.1935367503381933, + "flos": 18196466250240.0, + "grad_norm": 1.6043076997911772, + "language_loss": 0.80759645, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.83439445, + "num_input_tokens_seen": 69506665, + "router_z_loss_clip": 2.2578125, + "router_z_loss_mlp": 0.26611328, + "step": 3219, + "time_per_iteration": 2.803114414215088 + }, + { + "auxiliary_loss_clip": 0.01588709, + "auxiliary_loss_mlp": 0.01058174, + "balance_loss_clip": 1.3689568, + "balance_loss_mlp": 1.03180504, + "epoch": 0.19359687359086128, + "flos": 15964596581760.0, + "grad_norm": 2.425071376728323, + "language_loss": 0.86856198, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.8950308, + "num_input_tokens_seen": 69523835, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.26367188, + "step": 3220, + "time_per_iteration": 2.8387198448181152 + }, + { + "auxiliary_loss_clip": 0.01568554, + "auxiliary_loss_mlp": 0.01050713, + "balance_loss_clip": 1.35427785, + "balance_loss_mlp": 1.0254643, + "epoch": 0.19365699684352924, + "flos": 21225108430080.0, + "grad_norm": 2.284279702136911, + "language_loss": 0.84565628, + "learning_rate": 3.725556155051766e-06, + "loss": 0.87184894, + "num_input_tokens_seen": 69542620, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.25244141, + "step": 3221, + "time_per_iteration": 2.7967641353607178 + }, + { + "auxiliary_loss_clip": 0.01587566, + "auxiliary_loss_mlp": 0.0106314, + "balance_loss_clip": 1.3699975, + "balance_loss_mlp": 1.03747439, + "epoch": 0.1937171200961972, + "flos": 17319963121920.0, + "grad_norm": 2.011883242117417, + "language_loss": 0.87132215, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.89782917, + "num_input_tokens_seen": 69561130, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.2565918, + "step": 3222, + "time_per_iteration": 2.91170597076416 + }, + { + "auxiliary_loss_clip": 0.01591441, + "auxiliary_loss_mlp": 0.01051263, + "balance_loss_clip": 1.36776328, + "balance_loss_mlp": 1.02514386, + "epoch": 0.19377724334886517, + "flos": 22645682006400.0, + "grad_norm": 1.6817588034285855, + "language_loss": 0.78897297, + "learning_rate": 3.72516221392398e-06, + "loss": 0.81540006, + "num_input_tokens_seen": 69580425, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.26135254, + "step": 3223, + "time_per_iteration": 2.830341339111328 + }, + { + "auxiliary_loss_clip": 0.01589622, + "auxiliary_loss_mlp": 0.0105731, + "balance_loss_clip": 1.37039995, + "balance_loss_mlp": 1.03164411, + "epoch": 0.19383736660153314, + "flos": 15084111911040.0, + "grad_norm": 1.8272496385227819, + "language_loss": 0.76433623, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.79080552, + "num_input_tokens_seen": 69597085, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25683594, + "step": 3224, + "time_per_iteration": 4.192519187927246 + }, + { + "auxiliary_loss_clip": 0.01589658, + "auxiliary_loss_mlp": 0.01049856, + "balance_loss_clip": 1.36779058, + "balance_loss_mlp": 1.02306986, + "epoch": 0.1938974898542011, + "flos": 47136525223680.0, + "grad_norm": 2.3436830097037857, + "language_loss": 0.72352314, + "learning_rate": 3.7247680111229e-06, + "loss": 0.74991834, + "num_input_tokens_seen": 69618885, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.26794434, + "step": 3225, + "time_per_iteration": 3.0543878078460693 + }, + { + "auxiliary_loss_clip": 0.01582704, + "auxiliary_loss_mlp": 0.01043167, + "balance_loss_clip": 1.36156082, + "balance_loss_mlp": 1.01797855, + "epoch": 0.19395761310686907, + "flos": 25823695829760.0, + "grad_norm": 2.1239632362484535, + "language_loss": 0.70306295, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.72932172, + "num_input_tokens_seen": 69638200, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.25195312, + "step": 3226, + "time_per_iteration": 2.8952291011810303 + }, + { + "auxiliary_loss_clip": 0.01576765, + "auxiliary_loss_mlp": 0.01045953, + "balance_loss_clip": 1.36182499, + "balance_loss_mlp": 1.0184629, + "epoch": 0.19401773635953706, + "flos": 23050017953280.0, + "grad_norm": 1.773626252894541, + "language_loss": 0.77144039, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.7976675, + "num_input_tokens_seen": 69657550, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.27453613, + "step": 3227, + "time_per_iteration": 2.8513972759246826 + }, + { + "auxiliary_loss_clip": 0.01581917, + "auxiliary_loss_mlp": 0.01045037, + "balance_loss_clip": 1.36153793, + "balance_loss_mlp": 1.01813149, + "epoch": 0.19407785961220503, + "flos": 15928282990080.0, + "grad_norm": 2.0070380154588463, + "language_loss": 0.70349103, + "learning_rate": 3.724176216414662e-06, + "loss": 0.72976053, + "num_input_tokens_seen": 69675005, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.2689209, + "step": 3228, + "time_per_iteration": 2.7930748462677 + }, + { + "auxiliary_loss_clip": 0.01572314, + "auxiliary_loss_mlp": 0.01043931, + "balance_loss_clip": 1.35513091, + "balance_loss_mlp": 1.01770544, + "epoch": 0.194137982864873, + "flos": 25932817584000.0, + "grad_norm": 1.8911257198333322, + "language_loss": 0.74871588, + "learning_rate": 3.72397882074007e-06, + "loss": 0.77487838, + "num_input_tokens_seen": 69696455, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.26220703, + "step": 3229, + "time_per_iteration": 2.905484914779663 + }, + { + "auxiliary_loss_clip": 0.01580427, + "auxiliary_loss_mlp": 0.01050576, + "balance_loss_clip": 1.36251915, + "balance_loss_mlp": 1.02398014, + "epoch": 0.19419810611754096, + "flos": 13269201488640.0, + "grad_norm": 1.802127469998507, + "language_loss": 0.66891289, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.69522291, + "num_input_tokens_seen": 69714245, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.26574707, + "step": 3230, + "time_per_iteration": 2.8357434272766113 + }, + { + "auxiliary_loss_clip": 0.01567465, + "auxiliary_loss_mlp": 0.01043945, + "balance_loss_clip": 1.35409427, + "balance_loss_mlp": 1.01609755, + "epoch": 0.19425822937020892, + "flos": 15713975779200.0, + "grad_norm": 3.8864357563038427, + "language_loss": 0.82241488, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.84852892, + "num_input_tokens_seen": 69731515, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.27856445, + "step": 3231, + "time_per_iteration": 4.33530592918396 + }, + { + "auxiliary_loss_clip": 0.01582729, + "auxiliary_loss_mlp": 0.01043303, + "balance_loss_clip": 1.36430192, + "balance_loss_mlp": 1.01598072, + "epoch": 0.1943183526228769, + "flos": 23113958135040.0, + "grad_norm": 2.0536648829359048, + "language_loss": 0.87550187, + "learning_rate": 3.72338624150555e-06, + "loss": 0.90176225, + "num_input_tokens_seen": 69748885, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.2734375, + "step": 3232, + "time_per_iteration": 2.9359803199768066 + }, + { + "auxiliary_loss_clip": 0.01572505, + "auxiliary_loss_mlp": 0.01046496, + "balance_loss_clip": 1.35708928, + "balance_loss_mlp": 1.0189352, + "epoch": 0.19437847587554485, + "flos": 24722071983360.0, + "grad_norm": 1.6786380220459685, + "language_loss": 0.85871375, + "learning_rate": 3.723188584382096e-06, + "loss": 0.88490373, + "num_input_tokens_seen": 69767540, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.27587891, + "step": 3233, + "time_per_iteration": 4.296800851821899 + }, + { + "auxiliary_loss_clip": 0.0158688, + "auxiliary_loss_mlp": 0.01049253, + "balance_loss_clip": 1.36515856, + "balance_loss_mlp": 1.02225184, + "epoch": 0.19443859912821285, + "flos": 23127486330240.0, + "grad_norm": 1.6079227148715274, + "language_loss": 0.89754075, + "learning_rate": 3.722990861915158e-06, + "loss": 0.92390203, + "num_input_tokens_seen": 69789340, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.2701416, + "step": 3234, + "time_per_iteration": 2.8633687496185303 + }, + { + "auxiliary_loss_clip": 0.01574593, + "auxiliary_loss_mlp": 0.01044705, + "balance_loss_clip": 1.35291719, + "balance_loss_mlp": 1.01762033, + "epoch": 0.1944987223808808, + "flos": 15092391709440.0, + "grad_norm": 2.2392622915992964, + "language_loss": 0.7960093, + "learning_rate": 3.722793074112234e-06, + "loss": 0.82220232, + "num_input_tokens_seen": 69806470, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.27099609, + "step": 3235, + "time_per_iteration": 2.793816089630127 + }, + { + "auxiliary_loss_clip": 0.01575111, + "auxiliary_loss_mlp": 0.01041642, + "balance_loss_clip": 1.35941243, + "balance_loss_mlp": 1.01546383, + "epoch": 0.19455884563354878, + "flos": 17135228027520.0, + "grad_norm": 1.8324927368184571, + "language_loss": 0.80006051, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.82622802, + "num_input_tokens_seen": 69822655, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.26171875, + "step": 3236, + "time_per_iteration": 2.837857246398926 + }, + { + "auxiliary_loss_clip": 0.01567549, + "auxiliary_loss_mlp": 0.01045271, + "balance_loss_clip": 1.35373664, + "balance_loss_mlp": 1.01832926, + "epoch": 0.19461896888621674, + "flos": 20202988976640.0, + "grad_norm": 1.9527497248702526, + "language_loss": 0.77429587, + "learning_rate": 3.72239730252843e-06, + "loss": 0.80042404, + "num_input_tokens_seen": 69841895, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.26977539, + "step": 3237, + "time_per_iteration": 2.8135523796081543 + }, + { + "auxiliary_loss_clip": 0.01581638, + "auxiliary_loss_mlp": 0.01044406, + "balance_loss_clip": 1.36006284, + "balance_loss_mlp": 1.01767969, + "epoch": 0.1946790921388847, + "flos": 25312274144640.0, + "grad_norm": 1.923058663366835, + "language_loss": 0.76085436, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.78711486, + "num_input_tokens_seen": 69862220, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.26745605, + "step": 3238, + "time_per_iteration": 2.8834903240203857 + }, + { + "auxiliary_loss_clip": 0.01556076, + "auxiliary_loss_mlp": 0.01042301, + "balance_loss_clip": 1.34154749, + "balance_loss_mlp": 1.01540732, + "epoch": 0.19473921539155267, + "flos": 20202988976640.0, + "grad_norm": 1.802520751952537, + "language_loss": 0.74851322, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.77449703, + "num_input_tokens_seen": 69881830, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.2689209, + "step": 3239, + "time_per_iteration": 2.794116735458374 + }, + { + "auxiliary_loss_clip": 0.01571223, + "auxiliary_loss_mlp": 0.01044349, + "balance_loss_clip": 1.35656118, + "balance_loss_mlp": 1.01582253, + "epoch": 0.19479933864422067, + "flos": 20897562188160.0, + "grad_norm": 1.6813017588338317, + "language_loss": 0.74472404, + "learning_rate": 3.721803155320412e-06, + "loss": 0.77087975, + "num_input_tokens_seen": 69900515, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.28491211, + "step": 3240, + "time_per_iteration": 2.854987859725952 + }, + { + "auxiliary_loss_clip": 0.01573688, + "auxiliary_loss_mlp": 0.01043678, + "balance_loss_clip": 1.3580128, + "balance_loss_mlp": 1.01628375, + "epoch": 0.19485946189688863, + "flos": 23305570439040.0, + "grad_norm": 1.9677566399792343, + "language_loss": 0.67736065, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.70353425, + "num_input_tokens_seen": 69920060, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.27416992, + "step": 3241, + "time_per_iteration": 2.8206679821014404 + }, + { + "auxiliary_loss_clip": 0.01580424, + "auxiliary_loss_mlp": 0.01046258, + "balance_loss_clip": 1.36328077, + "balance_loss_mlp": 1.01864934, + "epoch": 0.1949195851495566, + "flos": 23305525194240.0, + "grad_norm": 1.4361777861871758, + "language_loss": 0.83968264, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.86594945, + "num_input_tokens_seen": 69939820, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.27587891, + "step": 3242, + "time_per_iteration": 2.869105100631714 + }, + { + "auxiliary_loss_clip": 0.01327075, + "auxiliary_loss_mlp": 0.01066762, + "balance_loss_clip": 1.19077694, + "balance_loss_mlp": 1.04177594, + "epoch": 0.19497970840222456, + "flos": 64993925790720.0, + "grad_norm": 0.8324449489093395, + "language_loss": 0.57583505, + "learning_rate": 3.721208420493875e-06, + "loss": 0.59977341, + "num_input_tokens_seen": 70002145, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.25, + "step": 3243, + "time_per_iteration": 3.376127004623413 + }, + { + "auxiliary_loss_clip": 0.01579206, + "auxiliary_loss_mlp": 0.01050857, + "balance_loss_clip": 1.36200571, + "balance_loss_mlp": 1.02267623, + "epoch": 0.19503983165489253, + "flos": 19653805866240.0, + "grad_norm": 1.7547136988143541, + "language_loss": 0.84563404, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.87193465, + "num_input_tokens_seen": 70020510, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.28149414, + "step": 3244, + "time_per_iteration": 2.869676351547241 + }, + { + "auxiliary_loss_clip": 0.0158971, + "auxiliary_loss_mlp": 0.01059038, + "balance_loss_clip": 1.37021792, + "balance_loss_mlp": 1.03003407, + "epoch": 0.1950999549075605, + "flos": 21151802574720.0, + "grad_norm": 2.0642628983726325, + "language_loss": 0.77685481, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.80334228, + "num_input_tokens_seen": 70040760, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.28967285, + "step": 3245, + "time_per_iteration": 2.9199533462524414 + }, + { + "auxiliary_loss_clip": 0.01567959, + "auxiliary_loss_mlp": 0.01046076, + "balance_loss_clip": 1.34968722, + "balance_loss_mlp": 1.01775146, + "epoch": 0.19516007816022846, + "flos": 20894123583360.0, + "grad_norm": 2.0565330467835152, + "language_loss": 0.85669935, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.88283968, + "num_input_tokens_seen": 70058720, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.2833252, + "step": 3246, + "time_per_iteration": 2.9189541339874268 + }, + { + "auxiliary_loss_clip": 0.015785, + "auxiliary_loss_mlp": 0.01053579, + "balance_loss_clip": 1.35883546, + "balance_loss_mlp": 1.02721024, + "epoch": 0.19522020141289645, + "flos": 16919472983040.0, + "grad_norm": 2.5872328865770227, + "language_loss": 0.77856827, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.80488908, + "num_input_tokens_seen": 70076470, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.26342773, + "step": 3247, + "time_per_iteration": 2.8074357509613037 + }, + { + "auxiliary_loss_clip": 0.01564071, + "auxiliary_loss_mlp": 0.01047897, + "balance_loss_clip": 1.34780395, + "balance_loss_mlp": 1.02122951, + "epoch": 0.19528032466556441, + "flos": 26736241080960.0, + "grad_norm": 1.595723199207394, + "language_loss": 0.76442903, + "learning_rate": 3.720215890515421e-06, + "loss": 0.79054868, + "num_input_tokens_seen": 70096220, + "router_z_loss_clip": 2.1640625, + "router_z_loss_mlp": 0.26696777, + "step": 3248, + "time_per_iteration": 2.8972721099853516 + }, + { + "auxiliary_loss_clip": 0.0157143, + "auxiliary_loss_mlp": 0.01049355, + "balance_loss_clip": 1.35244513, + "balance_loss_mlp": 1.02148438, + "epoch": 0.19534044791823238, + "flos": 21042680820480.0, + "grad_norm": 1.7930897151430056, + "language_loss": 0.7994222, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.82563007, + "num_input_tokens_seen": 70114800, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.27856445, + "step": 3249, + "time_per_iteration": 2.823413610458374 + }, + { + "auxiliary_loss_clip": 0.01564447, + "auxiliary_loss_mlp": 0.01048886, + "balance_loss_clip": 1.34753895, + "balance_loss_mlp": 1.02058578, + "epoch": 0.19540057117090034, + "flos": 22353680194560.0, + "grad_norm": 1.4867276700605674, + "language_loss": 0.73765576, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.76378906, + "num_input_tokens_seen": 70134930, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.28283691, + "step": 3250, + "time_per_iteration": 2.8628592491149902 + }, + { + "auxiliary_loss_clip": 0.01560029, + "auxiliary_loss_mlp": 0.01044637, + "balance_loss_clip": 1.34588027, + "balance_loss_mlp": 1.01677799, + "epoch": 0.1954606944235683, + "flos": 20310934366080.0, + "grad_norm": 1.8631908430314594, + "language_loss": 0.80786932, + "learning_rate": 3.719619589699017e-06, + "loss": 0.83391601, + "num_input_tokens_seen": 70152045, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.27868652, + "step": 3251, + "time_per_iteration": 2.808612823486328 + }, + { + "auxiliary_loss_clip": 0.01570814, + "auxiliary_loss_mlp": 0.01044148, + "balance_loss_clip": 1.35213494, + "balance_loss_mlp": 1.01608562, + "epoch": 0.19552081767623627, + "flos": 17355552796800.0, + "grad_norm": 2.1548846916450066, + "language_loss": 0.84364057, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.86979014, + "num_input_tokens_seen": 70169240, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.28039551, + "step": 3252, + "time_per_iteration": 2.8758645057678223 + }, + { + "auxiliary_loss_clip": 0.01589967, + "auxiliary_loss_mlp": 0.01056121, + "balance_loss_clip": 1.36461067, + "balance_loss_mlp": 1.02660477, + "epoch": 0.19558094092890424, + "flos": 31990101943680.0, + "grad_norm": 1.7733686687297636, + "language_loss": 0.73826563, + "learning_rate": 3.719221729768117e-06, + "loss": 0.76472658, + "num_input_tokens_seen": 70192690, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.29528809, + "step": 3253, + "time_per_iteration": 2.9374611377716064 + }, + { + "auxiliary_loss_clip": 0.01586863, + "auxiliary_loss_mlp": 0.01051653, + "balance_loss_clip": 1.36211753, + "balance_loss_mlp": 1.0239253, + "epoch": 0.19564106418157223, + "flos": 22277931120000.0, + "grad_norm": 1.711652845036668, + "language_loss": 0.77712566, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.80351084, + "num_input_tokens_seen": 70209685, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.27709961, + "step": 3254, + "time_per_iteration": 2.841430902481079 + }, + { + "auxiliary_loss_clip": 0.01324157, + "auxiliary_loss_mlp": 0.01055939, + "balance_loss_clip": 1.19419456, + "balance_loss_mlp": 1.02942657, + "epoch": 0.1957011874342402, + "flos": 54388330283520.0, + "grad_norm": 0.7666990244990247, + "language_loss": 0.55396461, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.57776558, + "num_input_tokens_seen": 70265050, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.265625, + "step": 3255, + "time_per_iteration": 3.313453197479248 + }, + { + "auxiliary_loss_clip": 0.01582981, + "auxiliary_loss_mlp": 0.01048807, + "balance_loss_clip": 1.36220741, + "balance_loss_mlp": 1.02118611, + "epoch": 0.19576131068690816, + "flos": 16514910812160.0, + "grad_norm": 2.3783829668126395, + "language_loss": 0.72555542, + "learning_rate": 3.718624450942688e-06, + "loss": 0.75187337, + "num_input_tokens_seen": 70281830, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.27624512, + "step": 3256, + "time_per_iteration": 2.8090317249298096 + }, + { + "auxiliary_loss_clip": 0.01569907, + "auxiliary_loss_mlp": 0.01049409, + "balance_loss_clip": 1.35294616, + "balance_loss_mlp": 1.02326643, + "epoch": 0.19582143393957613, + "flos": 14727626979840.0, + "grad_norm": 2.4476956177306817, + "language_loss": 0.81569719, + "learning_rate": 3.718425227649987e-06, + "loss": 0.84189039, + "num_input_tokens_seen": 70297420, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.26123047, + "step": 3257, + "time_per_iteration": 2.859912872314453 + }, + { + "auxiliary_loss_clip": 0.01587281, + "auxiliary_loss_mlp": 0.01049698, + "balance_loss_clip": 1.36866498, + "balance_loss_mlp": 1.02142203, + "epoch": 0.1958815571922441, + "flos": 24436132974720.0, + "grad_norm": 2.1871293228393953, + "language_loss": 0.76089621, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.78726602, + "num_input_tokens_seen": 70319210, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.2824707, + "step": 3258, + "time_per_iteration": 4.422747850418091 + }, + { + "auxiliary_loss_clip": 0.01589469, + "auxiliary_loss_mlp": 0.0104634, + "balance_loss_clip": 1.36576915, + "balance_loss_mlp": 1.01839721, + "epoch": 0.19594168044491206, + "flos": 24911150578560.0, + "grad_norm": 1.6158583556339396, + "language_loss": 0.75236058, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.77871871, + "num_input_tokens_seen": 70339045, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.27978516, + "step": 3259, + "time_per_iteration": 2.86773681640625 + }, + { + "auxiliary_loss_clip": 0.01591987, + "auxiliary_loss_mlp": 0.01051956, + "balance_loss_clip": 1.36578357, + "balance_loss_mlp": 1.02322626, + "epoch": 0.19600180369758005, + "flos": 12064473446400.0, + "grad_norm": 2.355805514697153, + "language_loss": 0.79099214, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.81743151, + "num_input_tokens_seen": 70356505, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.28735352, + "step": 3260, + "time_per_iteration": 2.8117856979370117 + }, + { + "auxiliary_loss_clip": 0.01591244, + "auxiliary_loss_mlp": 0.01049733, + "balance_loss_clip": 1.3690486, + "balance_loss_mlp": 1.0219574, + "epoch": 0.19606192695024802, + "flos": 20859981742080.0, + "grad_norm": 1.8854894405731202, + "language_loss": 0.83283174, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.85924155, + "num_input_tokens_seen": 70375410, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.27783203, + "step": 3261, + "time_per_iteration": 2.8003501892089844 + }, + { + "auxiliary_loss_clip": 0.01589941, + "auxiliary_loss_mlp": 0.01051255, + "balance_loss_clip": 1.37095082, + "balance_loss_mlp": 1.02183378, + "epoch": 0.19612205020291598, + "flos": 28487211321600.0, + "grad_norm": 1.6839341576947946, + "language_loss": 0.77180433, + "learning_rate": 3.717428133894807e-06, + "loss": 0.79821628, + "num_input_tokens_seen": 70396315, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.29394531, + "step": 3262, + "time_per_iteration": 2.8650102615356445 + }, + { + "auxiliary_loss_clip": 0.01575305, + "auxiliary_loss_mlp": 0.01051269, + "balance_loss_clip": 1.35811329, + "balance_loss_mlp": 1.023839, + "epoch": 0.19618217345558395, + "flos": 25567555161600.0, + "grad_norm": 1.9578037795642376, + "language_loss": 0.87068266, + "learning_rate": 3.71722851973837e-06, + "loss": 0.8969484, + "num_input_tokens_seen": 70417945, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.27453613, + "step": 3263, + "time_per_iteration": 2.905756950378418 + }, + { + "auxiliary_loss_clip": 0.0158448, + "auxiliary_loss_mlp": 0.0104575, + "balance_loss_clip": 1.36397469, + "balance_loss_mlp": 1.01961923, + "epoch": 0.1962422967082519, + "flos": 25275191391360.0, + "grad_norm": 2.022196669350236, + "language_loss": 0.75336623, + "learning_rate": 3.717028840464455e-06, + "loss": 0.77966851, + "num_input_tokens_seen": 70438690, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.26135254, + "step": 3264, + "time_per_iteration": 2.844958543777466 + }, + { + "auxiliary_loss_clip": 0.01575382, + "auxiliary_loss_mlp": 0.01051938, + "balance_loss_clip": 1.35946536, + "balance_loss_mlp": 1.02401876, + "epoch": 0.19630241996091988, + "flos": 18816692976000.0, + "grad_norm": 2.5296978800359, + "language_loss": 0.8015877, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.82786095, + "num_input_tokens_seen": 70455385, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.27929688, + "step": 3265, + "time_per_iteration": 2.938898801803589 + }, + { + "auxiliary_loss_clip": 0.01316181, + "auxiliary_loss_mlp": 0.01030773, + "balance_loss_clip": 1.18841136, + "balance_loss_mlp": 1.00674081, + "epoch": 0.19636254321358784, + "flos": 62347675829760.0, + "grad_norm": 0.8272363273666344, + "language_loss": 0.53571332, + "learning_rate": 3.716629286594483e-06, + "loss": 0.55918288, + "num_input_tokens_seen": 70514280, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.24023438, + "step": 3266, + "time_per_iteration": 4.7649595737457275 + }, + { + "auxiliary_loss_clip": 0.01593469, + "auxiliary_loss_mlp": 0.01061476, + "balance_loss_clip": 1.36798525, + "balance_loss_mlp": 1.0335815, + "epoch": 0.19642266646625584, + "flos": 21079220636160.0, + "grad_norm": 1.8769965550384446, + "language_loss": 0.80970526, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.83625472, + "num_input_tokens_seen": 70531800, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.27880859, + "step": 3267, + "time_per_iteration": 2.8376312255859375 + }, + { + "auxiliary_loss_clip": 0.01570926, + "auxiliary_loss_mlp": 0.01049045, + "balance_loss_clip": 1.35274935, + "balance_loss_mlp": 1.02205658, + "epoch": 0.1964827897189238, + "flos": 14546918672640.0, + "grad_norm": 2.1806490223785886, + "language_loss": 0.8751868, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.9013865, + "num_input_tokens_seen": 70550615, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.26953125, + "step": 3268, + "time_per_iteration": 5.63650107383728 + }, + { + "auxiliary_loss_clip": 0.01575541, + "auxiliary_loss_mlp": 0.01048013, + "balance_loss_clip": 1.36006308, + "balance_loss_mlp": 1.02163184, + "epoch": 0.19654291297159177, + "flos": 19253949154560.0, + "grad_norm": 2.3611376335432577, + "language_loss": 0.70229471, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.72853029, + "num_input_tokens_seen": 70568690, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.26379395, + "step": 3269, + "time_per_iteration": 2.8611230850219727 + }, + { + "auxiliary_loss_clip": 0.01593569, + "auxiliary_loss_mlp": 0.01053334, + "balance_loss_clip": 1.37081456, + "balance_loss_mlp": 1.02429438, + "epoch": 0.19660303622425973, + "flos": 25786748810880.0, + "grad_norm": 1.7618721242590047, + "language_loss": 0.81536609, + "learning_rate": 3.715829397778135e-06, + "loss": 0.84183514, + "num_input_tokens_seen": 70588665, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.29052734, + "step": 3270, + "time_per_iteration": 2.880014181137085 + }, + { + "auxiliary_loss_clip": 0.01575099, + "auxiliary_loss_mlp": 0.01050469, + "balance_loss_clip": 1.35694432, + "balance_loss_mlp": 1.02353978, + "epoch": 0.1966631594769277, + "flos": 20604881704320.0, + "grad_norm": 1.9367749141975046, + "language_loss": 0.85599506, + "learning_rate": 3.715629262894028e-06, + "loss": 0.88225067, + "num_input_tokens_seen": 70606900, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.26940918, + "step": 3271, + "time_per_iteration": 2.8191463947296143 + }, + { + "auxiliary_loss_clip": 0.01578483, + "auxiliary_loss_mlp": 0.01046783, + "balance_loss_clip": 1.3633523, + "balance_loss_mlp": 1.01991296, + "epoch": 0.19672328272959566, + "flos": 23633704863360.0, + "grad_norm": 1.7973831789291228, + "language_loss": 0.80932856, + "learning_rate": 3.715429062953087e-06, + "loss": 0.83558118, + "num_input_tokens_seen": 70625955, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.26843262, + "step": 3272, + "time_per_iteration": 2.870612859725952 + }, + { + "auxiliary_loss_clip": 0.01593214, + "auxiliary_loss_mlp": 0.01049045, + "balance_loss_clip": 1.37231028, + "balance_loss_mlp": 1.02098322, + "epoch": 0.19678340598226365, + "flos": 23120925834240.0, + "grad_norm": 1.7185000993549815, + "language_loss": 0.81942904, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.84585154, + "num_input_tokens_seen": 70646090, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.28039551, + "step": 3273, + "time_per_iteration": 2.965625762939453 + }, + { + "auxiliary_loss_clip": 0.0158643, + "auxiliary_loss_mlp": 0.01046986, + "balance_loss_clip": 1.36639011, + "balance_loss_mlp": 1.02099788, + "epoch": 0.19684352923493162, + "flos": 24545480952960.0, + "grad_norm": 1.7118373648642495, + "language_loss": 0.79090285, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.81723702, + "num_input_tokens_seen": 70666065, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26025391, + "step": 3274, + "time_per_iteration": 2.835944890975952 + }, + { + "auxiliary_loss_clip": 0.01598243, + "auxiliary_loss_mlp": 0.01048098, + "balance_loss_clip": 1.37688828, + "balance_loss_mlp": 1.02068019, + "epoch": 0.19690365248759958, + "flos": 21805854428160.0, + "grad_norm": 3.299325342709566, + "language_loss": 0.82618499, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.85264844, + "num_input_tokens_seen": 70681580, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.27416992, + "step": 3275, + "time_per_iteration": 2.74751615524292 + }, + { + "auxiliary_loss_clip": 0.01584286, + "auxiliary_loss_mlp": 0.01049378, + "balance_loss_clip": 1.3653636, + "balance_loss_mlp": 1.02134037, + "epoch": 0.19696377574026755, + "flos": 19064463356160.0, + "grad_norm": 1.8626873822115226, + "language_loss": 0.81949317, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.84582984, + "num_input_tokens_seen": 70697745, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.28076172, + "step": 3276, + "time_per_iteration": 2.842057466506958 + }, + { + "auxiliary_loss_clip": 0.01595761, + "auxiliary_loss_mlp": 0.01044811, + "balance_loss_clip": 1.37639832, + "balance_loss_mlp": 1.01797724, + "epoch": 0.19702389899293551, + "flos": 22831005283200.0, + "grad_norm": 1.8039233631257374, + "language_loss": 0.90372175, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.9301275, + "num_input_tokens_seen": 70715110, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.26843262, + "step": 3277, + "time_per_iteration": 2.803459882736206 + }, + { + "auxiliary_loss_clip": 0.01595908, + "auxiliary_loss_mlp": 0.0105551, + "balance_loss_clip": 1.36929929, + "balance_loss_mlp": 1.02751923, + "epoch": 0.19708402224560348, + "flos": 22904763586560.0, + "grad_norm": 3.299260808621769, + "language_loss": 0.64157486, + "learning_rate": 3.714226497539239e-06, + "loss": 0.66808897, + "num_input_tokens_seen": 70734715, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.27966309, + "step": 3278, + "time_per_iteration": 2.829946994781494 + }, + { + "auxiliary_loss_clip": 0.01588959, + "auxiliary_loss_mlp": 0.01055691, + "balance_loss_clip": 1.36745572, + "balance_loss_mlp": 1.02876127, + "epoch": 0.19714414549827144, + "flos": 25672559639040.0, + "grad_norm": 1.83437975091346, + "language_loss": 0.74868053, + "learning_rate": 3.714025842413166e-06, + "loss": 0.77512705, + "num_input_tokens_seen": 70752650, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.26928711, + "step": 3279, + "time_per_iteration": 2.8498685359954834 + }, + { + "auxiliary_loss_clip": 0.01574778, + "auxiliary_loss_mlp": 0.01045624, + "balance_loss_clip": 1.35542917, + "balance_loss_mlp": 1.01863539, + "epoch": 0.19720426875093944, + "flos": 23926611571200.0, + "grad_norm": 1.559209505308059, + "language_loss": 0.83174062, + "learning_rate": 3.713825122291061e-06, + "loss": 0.85794461, + "num_input_tokens_seen": 70772365, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.2701416, + "step": 3280, + "time_per_iteration": 2.814842939376831 + }, + { + "auxiliary_loss_clip": 0.01586388, + "auxiliary_loss_mlp": 0.01046851, + "balance_loss_clip": 1.36675549, + "balance_loss_mlp": 1.01998186, + "epoch": 0.1972643920036074, + "flos": 13890061641600.0, + "grad_norm": 1.8944552797378194, + "language_loss": 0.78216481, + "learning_rate": 3.713624337180536e-06, + "loss": 0.80849719, + "num_input_tokens_seen": 70790340, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.26867676, + "step": 3281, + "time_per_iteration": 2.8537449836730957 + }, + { + "auxiliary_loss_clip": 0.0157253, + "auxiliary_loss_mlp": 0.01043535, + "balance_loss_clip": 1.35994291, + "balance_loss_mlp": 1.01842988, + "epoch": 0.19732451525627537, + "flos": 19872728046720.0, + "grad_norm": 1.66284930184393, + "language_loss": 0.8089689, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.83512956, + "num_input_tokens_seen": 70809295, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.25085449, + "step": 3282, + "time_per_iteration": 2.8340342044830322 + }, + { + "auxiliary_loss_clip": 0.01586392, + "auxiliary_loss_mlp": 0.01049159, + "balance_loss_clip": 1.36637807, + "balance_loss_mlp": 1.02239609, + "epoch": 0.19738463850894333, + "flos": 24984365944320.0, + "grad_norm": 1.9907781379727785, + "language_loss": 0.72700632, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.75336182, + "num_input_tokens_seen": 70828765, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26806641, + "step": 3283, + "time_per_iteration": 2.86289644241333 + }, + { + "auxiliary_loss_clip": 0.01585751, + "auxiliary_loss_mlp": 0.01049407, + "balance_loss_clip": 1.36573052, + "balance_loss_mlp": 1.02341986, + "epoch": 0.1974447617616113, + "flos": 18377943719040.0, + "grad_norm": 1.6910251332203223, + "language_loss": 0.79795992, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.82431155, + "num_input_tokens_seen": 70846805, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.26013184, + "step": 3284, + "time_per_iteration": 2.8177490234375 + }, + { + "auxiliary_loss_clip": 0.01597004, + "auxiliary_loss_mlp": 0.01050511, + "balance_loss_clip": 1.37327623, + "balance_loss_mlp": 1.02414227, + "epoch": 0.19750488501427926, + "flos": 22903134773760.0, + "grad_norm": 1.8992745504546273, + "language_loss": 0.86891937, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.89539456, + "num_input_tokens_seen": 70863805, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.26379395, + "step": 3285, + "time_per_iteration": 2.860649347305298 + }, + { + "auxiliary_loss_clip": 0.01570705, + "auxiliary_loss_mlp": 0.01049647, + "balance_loss_clip": 1.35573959, + "balance_loss_mlp": 1.02404141, + "epoch": 0.19756500826694723, + "flos": 21881467768320.0, + "grad_norm": 1.895361939845148, + "language_loss": 0.89118326, + "learning_rate": 3.712619437068174e-06, + "loss": 0.91738677, + "num_input_tokens_seen": 70882660, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.25610352, + "step": 3286, + "time_per_iteration": 2.8846237659454346 + }, + { + "auxiliary_loss_clip": 0.01605014, + "auxiliary_loss_mlp": 0.01049584, + "balance_loss_clip": 1.38119519, + "balance_loss_mlp": 1.02137959, + "epoch": 0.19762513151961522, + "flos": 15167416867200.0, + "grad_norm": 2.414848410490199, + "language_loss": 0.7892282, + "learning_rate": 3.712418262187102e-06, + "loss": 0.8157742, + "num_input_tokens_seen": 70898765, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.28222656, + "step": 3287, + "time_per_iteration": 2.866270065307617 + }, + { + "auxiliary_loss_clip": 0.016037, + "auxiliary_loss_mlp": 0.01048567, + "balance_loss_clip": 1.38001096, + "balance_loss_mlp": 1.02132785, + "epoch": 0.1976852547722832, + "flos": 16987032748800.0, + "grad_norm": 1.8352889834187307, + "language_loss": 0.82701433, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.8535369, + "num_input_tokens_seen": 70916370, + "router_z_loss_clip": 2.23632812, + "router_z_loss_mlp": 0.27258301, + "step": 3288, + "time_per_iteration": 2.877490997314453 + }, + { + "auxiliary_loss_clip": 0.01570923, + "auxiliary_loss_mlp": 0.01051048, + "balance_loss_clip": 1.35718226, + "balance_loss_mlp": 1.02378476, + "epoch": 0.19774537802495115, + "flos": 20312563178880.0, + "grad_norm": 1.6694984093507008, + "language_loss": 0.73968446, + "learning_rate": 3.712015717627374e-06, + "loss": 0.76590419, + "num_input_tokens_seen": 70934870, + "router_z_loss_clip": 2.13769531, + "router_z_loss_mlp": 0.27258301, + "step": 3289, + "time_per_iteration": 2.854494094848633 + }, + { + "auxiliary_loss_clip": 0.01581319, + "auxiliary_loss_mlp": 0.01051385, + "balance_loss_clip": 1.36480486, + "balance_loss_mlp": 1.02475357, + "epoch": 0.19780550127761912, + "flos": 27246984094080.0, + "grad_norm": 1.6480766040507855, + "language_loss": 0.80345517, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.82978225, + "num_input_tokens_seen": 70955140, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.26647949, + "step": 3290, + "time_per_iteration": 2.9184963703155518 + }, + { + "auxiliary_loss_clip": 0.01357308, + "auxiliary_loss_mlp": 0.01031009, + "balance_loss_clip": 1.22887802, + "balance_loss_mlp": 1.00564098, + "epoch": 0.19786562453028708, + "flos": 63584527680000.0, + "grad_norm": 0.8989204677889219, + "language_loss": 0.6041075, + "learning_rate": 3.711612913388418e-06, + "loss": 0.62799072, + "num_input_tokens_seen": 71012005, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.25390625, + "step": 3291, + "time_per_iteration": 3.396043300628662 + }, + { + "auxiliary_loss_clip": 0.01597661, + "auxiliary_loss_mlp": 0.01051569, + "balance_loss_clip": 1.37280846, + "balance_loss_mlp": 1.02405596, + "epoch": 0.19792574778295505, + "flos": 26297853782400.0, + "grad_norm": 1.7428547501556686, + "language_loss": 0.82499373, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.85148609, + "num_input_tokens_seen": 71031140, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.27539062, + "step": 3292, + "time_per_iteration": 2.8627769947052 + }, + { + "auxiliary_loss_clip": 0.01573328, + "auxiliary_loss_mlp": 0.0104617, + "balance_loss_clip": 1.3595103, + "balance_loss_mlp": 1.02057528, + "epoch": 0.19798587103562304, + "flos": 19947662714880.0, + "grad_norm": 2.796477354993951, + "language_loss": 0.82360756, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.84980249, + "num_input_tokens_seen": 71050250, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.25610352, + "step": 3293, + "time_per_iteration": 4.233789920806885 + }, + { + "auxiliary_loss_clip": 0.01619639, + "auxiliary_loss_mlp": 0.01056208, + "balance_loss_clip": 1.38883793, + "balance_loss_mlp": 1.02912331, + "epoch": 0.198045994288291, + "flos": 20129818855680.0, + "grad_norm": 1.7274948888138428, + "language_loss": 0.62711805, + "learning_rate": 3.711008220265093e-06, + "loss": 0.65387648, + "num_input_tokens_seen": 71068665, + "router_z_loss_clip": 2.31054688, + "router_z_loss_mlp": 0.27099609, + "step": 3294, + "time_per_iteration": 2.975546360015869 + }, + { + "auxiliary_loss_clip": 0.01583735, + "auxiliary_loss_mlp": 0.01056071, + "balance_loss_clip": 1.36488271, + "balance_loss_mlp": 1.03127575, + "epoch": 0.19810611754095897, + "flos": 17976639173760.0, + "grad_norm": 1.7586315843173386, + "language_loss": 0.88180828, + "learning_rate": 3.710806526117251e-06, + "loss": 0.90820634, + "num_input_tokens_seen": 71085320, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.24804688, + "step": 3295, + "time_per_iteration": 2.785015344619751 + }, + { + "auxiliary_loss_clip": 0.01576409, + "auxiliary_loss_mlp": 0.01056151, + "balance_loss_clip": 1.36092722, + "balance_loss_mlp": 1.03110468, + "epoch": 0.19816624079362694, + "flos": 15093160871040.0, + "grad_norm": 2.141233259771441, + "language_loss": 0.82214653, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.84847206, + "num_input_tokens_seen": 71102020, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.25012207, + "step": 3296, + "time_per_iteration": 2.8380656242370605 + }, + { + "auxiliary_loss_clip": 0.01600071, + "auxiliary_loss_mlp": 0.01058223, + "balance_loss_clip": 1.3772676, + "balance_loss_mlp": 1.03068531, + "epoch": 0.1982263640462949, + "flos": 24911195823360.0, + "grad_norm": 1.69564701735555, + "language_loss": 0.68706948, + "learning_rate": 3.710402943207354e-06, + "loss": 0.71365243, + "num_input_tokens_seen": 71123390, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.27563477, + "step": 3297, + "time_per_iteration": 2.8697750568389893 + }, + { + "auxiliary_loss_clip": 0.01580398, + "auxiliary_loss_mlp": 0.01049208, + "balance_loss_clip": 1.3647027, + "balance_loss_mlp": 1.0252347, + "epoch": 0.19828648729896287, + "flos": 20385914279040.0, + "grad_norm": 2.062370779515307, + "language_loss": 0.82210505, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.84840113, + "num_input_tokens_seen": 71141800, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.23986816, + "step": 3298, + "time_per_iteration": 2.858400583267212 + }, + { + "auxiliary_loss_clip": 0.01609899, + "auxiliary_loss_mlp": 0.01059131, + "balance_loss_clip": 1.38383722, + "balance_loss_mlp": 1.0323329, + "epoch": 0.19834661055163083, + "flos": 18889320159360.0, + "grad_norm": 1.9131701166903587, + "language_loss": 0.86167145, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.88836181, + "num_input_tokens_seen": 71159505, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.26794434, + "step": 3299, + "time_per_iteration": 2.8041765689849854 + }, + { + "auxiliary_loss_clip": 0.01342641, + "auxiliary_loss_mlp": 0.01031592, + "balance_loss_clip": 1.21271873, + "balance_loss_mlp": 1.00937176, + "epoch": 0.19840673380429882, + "flos": 60289790976000.0, + "grad_norm": 0.7618795897688294, + "language_loss": 0.53281176, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.55655408, + "num_input_tokens_seen": 71223265, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.22265625, + "step": 3300, + "time_per_iteration": 3.3363473415374756 + }, + { + "auxiliary_loss_clip": 0.01601072, + "auxiliary_loss_mlp": 0.01067574, + "balance_loss_clip": 1.38106489, + "balance_loss_mlp": 1.04000044, + "epoch": 0.1984668570569668, + "flos": 19911665836800.0, + "grad_norm": 1.764988500880052, + "language_loss": 0.74175435, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.76844084, + "num_input_tokens_seen": 71242385, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.27563477, + "step": 3301, + "time_per_iteration": 4.391502857208252 + }, + { + "auxiliary_loss_clip": 0.01600292, + "auxiliary_loss_mlp": 0.01056735, + "balance_loss_clip": 1.37992895, + "balance_loss_mlp": 1.03219008, + "epoch": 0.19852698030963475, + "flos": 15638679152640.0, + "grad_norm": 3.00413769759054, + "language_loss": 0.90051693, + "learning_rate": 3.709392851040235e-06, + "loss": 0.92708719, + "num_input_tokens_seen": 71258990, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.2454834, + "step": 3302, + "time_per_iteration": 4.369137763977051 + }, + { + "auxiliary_loss_clip": 0.01602047, + "auxiliary_loss_mlp": 0.01062443, + "balance_loss_clip": 1.38206446, + "balance_loss_mlp": 1.03780222, + "epoch": 0.19858710356230272, + "flos": 43158571752960.0, + "grad_norm": 3.0994433223930735, + "language_loss": 0.74771023, + "learning_rate": 3.709190638115111e-06, + "loss": 0.77435517, + "num_input_tokens_seen": 71282770, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.24633789, + "step": 3303, + "time_per_iteration": 4.514683723449707 + }, + { + "auxiliary_loss_clip": 0.01610283, + "auxiliary_loss_mlp": 0.01066139, + "balance_loss_clip": 1.38752294, + "balance_loss_mlp": 1.03919792, + "epoch": 0.19864722681497068, + "flos": 35155356733440.0, + "grad_norm": 1.8128670174043895, + "language_loss": 0.76257706, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.78934133, + "num_input_tokens_seen": 71301410, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.26928711, + "step": 3304, + "time_per_iteration": 2.94378399848938 + }, + { + "auxiliary_loss_clip": 0.01594818, + "auxiliary_loss_mlp": 0.01059994, + "balance_loss_clip": 1.37661719, + "balance_loss_mlp": 1.03579473, + "epoch": 0.19870735006763865, + "flos": 19435336133760.0, + "grad_norm": 1.8040168864481487, + "language_loss": 0.87150466, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.89805281, + "num_input_tokens_seen": 71319670, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.2421875, + "step": 3305, + "time_per_iteration": 2.8326048851013184 + }, + { + "auxiliary_loss_clip": 0.01606519, + "auxiliary_loss_mlp": 0.01059601, + "balance_loss_clip": 1.38179207, + "balance_loss_mlp": 1.03501987, + "epoch": 0.19876747332030664, + "flos": 23557141382400.0, + "grad_norm": 1.601841815749736, + "language_loss": 0.69354033, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.72020149, + "num_input_tokens_seen": 71339850, + "router_z_loss_clip": 2.24609375, + "router_z_loss_mlp": 0.24597168, + "step": 3306, + "time_per_iteration": 2.9686787128448486 + }, + { + "auxiliary_loss_clip": 0.0159914, + "auxiliary_loss_mlp": 0.01056396, + "balance_loss_clip": 1.37847805, + "balance_loss_mlp": 1.03223228, + "epoch": 0.1988275965729746, + "flos": 19839129143040.0, + "grad_norm": 1.6280183443263112, + "language_loss": 0.7763176, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.80287302, + "num_input_tokens_seen": 71359795, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.24169922, + "step": 3307, + "time_per_iteration": 3.003679037094116 + }, + { + "auxiliary_loss_clip": 0.01601747, + "auxiliary_loss_mlp": 0.01049487, + "balance_loss_clip": 1.3816005, + "balance_loss_mlp": 1.02548993, + "epoch": 0.19888771982564257, + "flos": 23524085416320.0, + "grad_norm": 1.7467661581858134, + "language_loss": 0.76948971, + "learning_rate": 3.708178601452737e-06, + "loss": 0.79600203, + "num_input_tokens_seen": 71378885, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.23986816, + "step": 3308, + "time_per_iteration": 2.930572509765625 + }, + { + "auxiliary_loss_clip": 0.01591086, + "auxiliary_loss_mlp": 0.01047848, + "balance_loss_clip": 1.37043917, + "balance_loss_mlp": 1.02141953, + "epoch": 0.19894784307831054, + "flos": 18159654965760.0, + "grad_norm": 1.781612265836147, + "language_loss": 0.77083975, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.79722911, + "num_input_tokens_seen": 71397285, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26452637, + "step": 3309, + "time_per_iteration": 2.790924072265625 + }, + { + "auxiliary_loss_clip": 0.01571177, + "auxiliary_loss_mlp": 0.01051761, + "balance_loss_clip": 1.35825253, + "balance_loss_mlp": 1.02594018, + "epoch": 0.1990079663309785, + "flos": 24285720700800.0, + "grad_norm": 1.7273304963489382, + "language_loss": 0.88641691, + "learning_rate": 3.707773333313917e-06, + "loss": 0.91264635, + "num_input_tokens_seen": 71415775, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.25793457, + "step": 3310, + "time_per_iteration": 2.8617002964019775 + }, + { + "auxiliary_loss_clip": 0.01585834, + "auxiliary_loss_mlp": 0.01042228, + "balance_loss_clip": 1.36844134, + "balance_loss_mlp": 1.01674092, + "epoch": 0.19906808958364647, + "flos": 34911205937280.0, + "grad_norm": 2.271604908780493, + "language_loss": 0.65788293, + "learning_rate": 3.70757060210226e-06, + "loss": 0.68416357, + "num_input_tokens_seen": 71437315, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.25512695, + "step": 3311, + "time_per_iteration": 2.946624279022217 + }, + { + "auxiliary_loss_clip": 0.01587747, + "auxiliary_loss_mlp": 0.01049179, + "balance_loss_clip": 1.36673045, + "balance_loss_mlp": 1.0222857, + "epoch": 0.19912821283631443, + "flos": 24035597591040.0, + "grad_norm": 2.287732287187971, + "language_loss": 0.75920779, + "learning_rate": 3.707367806139355e-06, + "loss": 0.78557706, + "num_input_tokens_seen": 71456320, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.26916504, + "step": 3312, + "time_per_iteration": 2.867140293121338 + }, + { + "auxiliary_loss_clip": 0.0159813, + "auxiliary_loss_mlp": 0.01047008, + "balance_loss_clip": 1.37907028, + "balance_loss_mlp": 1.02085376, + "epoch": 0.19918833608898243, + "flos": 19866981957120.0, + "grad_norm": 1.9764003994313841, + "language_loss": 0.84555376, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.87200516, + "num_input_tokens_seen": 71475360, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.26135254, + "step": 3313, + "time_per_iteration": 2.8573191165924072 + }, + { + "auxiliary_loss_clip": 0.01585958, + "auxiliary_loss_mlp": 0.01042945, + "balance_loss_clip": 1.36775017, + "balance_loss_mlp": 1.01809025, + "epoch": 0.1992484593416504, + "flos": 29107664271360.0, + "grad_norm": 2.022588819567331, + "language_loss": 0.82256997, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.84885901, + "num_input_tokens_seen": 71496155, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.2487793, + "step": 3314, + "time_per_iteration": 2.892676830291748 + }, + { + "auxiliary_loss_clip": 0.01562121, + "auxiliary_loss_mlp": 0.01048627, + "balance_loss_clip": 1.3487289, + "balance_loss_mlp": 1.02244854, + "epoch": 0.19930858259431836, + "flos": 23305479949440.0, + "grad_norm": 1.6198228547806657, + "language_loss": 0.886774, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.91288149, + "num_input_tokens_seen": 71517295, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.26171875, + "step": 3315, + "time_per_iteration": 2.8754024505615234 + }, + { + "auxiliary_loss_clip": 0.01588824, + "auxiliary_loss_mlp": 0.0104716, + "balance_loss_clip": 1.36892164, + "balance_loss_mlp": 1.02220988, + "epoch": 0.19936870584698632, + "flos": 25390918886400.0, + "grad_norm": 1.5243603334168598, + "language_loss": 0.72640395, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.75276375, + "num_input_tokens_seen": 71540000, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24951172, + "step": 3316, + "time_per_iteration": 2.88999342918396 + }, + { + "auxiliary_loss_clip": 0.0133288, + "auxiliary_loss_mlp": 0.01047002, + "balance_loss_clip": 1.19430089, + "balance_loss_mlp": 1.02201545, + "epoch": 0.1994288290996543, + "flos": 62202167976960.0, + "grad_norm": 0.8398729977474604, + "language_loss": 0.66383195, + "learning_rate": 3.706352855325342e-06, + "loss": 0.68763077, + "num_input_tokens_seen": 71607880, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.25, + "step": 3317, + "time_per_iteration": 3.430983304977417 + }, + { + "auxiliary_loss_clip": 0.0160353, + "auxiliary_loss_mlp": 0.01051441, + "balance_loss_clip": 1.38019681, + "balance_loss_mlp": 1.02533436, + "epoch": 0.19948895235232225, + "flos": 19035117463680.0, + "grad_norm": 1.929866835525955, + "language_loss": 0.75623906, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.78278875, + "num_input_tokens_seen": 71625695, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.26098633, + "step": 3318, + "time_per_iteration": 2.819131374359131 + }, + { + "auxiliary_loss_clip": 0.01588628, + "auxiliary_loss_mlp": 0.01044774, + "balance_loss_clip": 1.37260079, + "balance_loss_mlp": 1.01933527, + "epoch": 0.19954907560499022, + "flos": 37829911956480.0, + "grad_norm": 1.8023782455180206, + "language_loss": 0.8049854, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.83131945, + "num_input_tokens_seen": 71648520, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.25488281, + "step": 3319, + "time_per_iteration": 2.964843511581421 + }, + { + "auxiliary_loss_clip": 0.01600713, + "auxiliary_loss_mlp": 0.01049664, + "balance_loss_clip": 1.38024092, + "balance_loss_mlp": 1.02309251, + "epoch": 0.1996091988576582, + "flos": 49582611613440.0, + "grad_norm": 1.998896533093924, + "language_loss": 0.77414382, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.80064756, + "num_input_tokens_seen": 71672185, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.26574707, + "step": 3320, + "time_per_iteration": 3.0875680446624756 + }, + { + "auxiliary_loss_clip": 0.01599697, + "auxiliary_loss_mlp": 0.010512, + "balance_loss_clip": 1.38045859, + "balance_loss_mlp": 1.02505779, + "epoch": 0.19966932211032618, + "flos": 22645455782400.0, + "grad_norm": 1.4908708700039546, + "language_loss": 0.80924153, + "learning_rate": 3.705539729936701e-06, + "loss": 0.83575046, + "num_input_tokens_seen": 71692890, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.26135254, + "step": 3321, + "time_per_iteration": 2.8509926795959473 + }, + { + "auxiliary_loss_clip": 0.01333701, + "auxiliary_loss_mlp": 0.01027025, + "balance_loss_clip": 1.19938636, + "balance_loss_mlp": 1.00489962, + "epoch": 0.19972944536299414, + "flos": 54108273098880.0, + "grad_norm": 0.8735789687098219, + "language_loss": 0.65390891, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.67751616, + "num_input_tokens_seen": 71745815, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.22167969, + "step": 3322, + "time_per_iteration": 3.1475648880004883 + }, + { + "auxiliary_loss_clip": 0.01334422, + "auxiliary_loss_mlp": 0.01022758, + "balance_loss_clip": 1.20032489, + "balance_loss_mlp": 1.00139618, + "epoch": 0.1997895686156621, + "flos": 69381355132800.0, + "grad_norm": 0.7856044536088392, + "language_loss": 0.57146972, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.59504151, + "num_input_tokens_seen": 71806915, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.21386719, + "step": 3323, + "time_per_iteration": 3.4460341930389404 + }, + { + "auxiliary_loss_clip": 0.01588804, + "auxiliary_loss_mlp": 0.01064371, + "balance_loss_clip": 1.37481916, + "balance_loss_mlp": 1.03943205, + "epoch": 0.19984969186833007, + "flos": 18561366714240.0, + "grad_norm": 1.892816342301302, + "language_loss": 0.81916547, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.84569722, + "num_input_tokens_seen": 71824645, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.24951172, + "step": 3324, + "time_per_iteration": 2.8029603958129883 + }, + { + "auxiliary_loss_clip": 0.01602037, + "auxiliary_loss_mlp": 0.01068454, + "balance_loss_clip": 1.38360667, + "balance_loss_mlp": 1.04241884, + "epoch": 0.19990981512099804, + "flos": 26440167237120.0, + "grad_norm": 1.4669472397419587, + "language_loss": 0.54415047, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.57085538, + "num_input_tokens_seen": 71845125, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.26049805, + "step": 3325, + "time_per_iteration": 2.9312896728515625 + }, + { + "auxiliary_loss_clip": 0.0160502, + "auxiliary_loss_mlp": 0.01074724, + "balance_loss_clip": 1.38558745, + "balance_loss_mlp": 1.0495584, + "epoch": 0.19996993837366603, + "flos": 16334609708160.0, + "grad_norm": 1.9647414134266359, + "language_loss": 0.86422229, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.8910197, + "num_input_tokens_seen": 71863500, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25158691, + "step": 3326, + "time_per_iteration": 2.879739999771118 + }, + { + "auxiliary_loss_clip": 0.01584698, + "auxiliary_loss_mlp": 0.01065265, + "balance_loss_clip": 1.37037134, + "balance_loss_mlp": 1.0396229, + "epoch": 0.200030061626334, + "flos": 20852380615680.0, + "grad_norm": 1.8704057608302813, + "language_loss": 0.72588658, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.75238621, + "num_input_tokens_seen": 71881845, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.25598145, + "step": 3327, + "time_per_iteration": 2.973954439163208 + }, + { + "auxiliary_loss_clip": 0.01602212, + "auxiliary_loss_mlp": 0.01077083, + "balance_loss_clip": 1.38121891, + "balance_loss_mlp": 1.05033231, + "epoch": 0.20009018487900196, + "flos": 23771493838080.0, + "grad_norm": 1.8698855012982523, + "language_loss": 0.77425569, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.80104858, + "num_input_tokens_seen": 71900940, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.26757812, + "step": 3328, + "time_per_iteration": 4.272123336791992 + }, + { + "auxiliary_loss_clip": 0.01567568, + "auxiliary_loss_mlp": 0.01067463, + "balance_loss_clip": 1.35751426, + "balance_loss_mlp": 1.04337108, + "epoch": 0.20015030813166992, + "flos": 28123396732800.0, + "grad_norm": 1.6536606501837847, + "language_loss": 0.69739771, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.72374797, + "num_input_tokens_seen": 71921925, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.2409668, + "step": 3329, + "time_per_iteration": 2.949007034301758 + }, + { + "auxiliary_loss_clip": 0.01593675, + "auxiliary_loss_mlp": 0.01068403, + "balance_loss_clip": 1.3756876, + "balance_loss_mlp": 1.03907752, + "epoch": 0.2002104313843379, + "flos": 26078162440320.0, + "grad_norm": 1.5945467520206502, + "language_loss": 0.82213843, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.84875911, + "num_input_tokens_seen": 71941855, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.29333496, + "step": 3330, + "time_per_iteration": 2.862922191619873 + }, + { + "auxiliary_loss_clip": 0.01588819, + "auxiliary_loss_mlp": 0.01063399, + "balance_loss_clip": 1.36899567, + "balance_loss_mlp": 1.03683913, + "epoch": 0.20027055463700585, + "flos": 22977028811520.0, + "grad_norm": 3.8702071395681266, + "language_loss": 0.76899815, + "learning_rate": 3.703502390349417e-06, + "loss": 0.79552037, + "num_input_tokens_seen": 71960915, + "router_z_loss_clip": 2.1953125, + "router_z_loss_mlp": 0.26550293, + "step": 3331, + "time_per_iteration": 2.867485284805298 + }, + { + "auxiliary_loss_clip": 0.01608316, + "auxiliary_loss_mlp": 0.01079765, + "balance_loss_clip": 1.38770103, + "balance_loss_mlp": 1.05210829, + "epoch": 0.20033067788967382, + "flos": 17174618265600.0, + "grad_norm": 1.9192987625308315, + "language_loss": 0.80208671, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.82896751, + "num_input_tokens_seen": 71979220, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.27661133, + "step": 3332, + "time_per_iteration": 2.809671640396118 + }, + { + "auxiliary_loss_clip": 0.01329291, + "auxiliary_loss_mlp": 0.01059762, + "balance_loss_clip": 1.19492495, + "balance_loss_mlp": 1.0351572, + "epoch": 0.2003908011423418, + "flos": 60851805626880.0, + "grad_norm": 0.9533613491468712, + "language_loss": 0.62021172, + "learning_rate": 3.703094147020776e-06, + "loss": 0.64410228, + "num_input_tokens_seen": 72033950, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.24511719, + "step": 3333, + "time_per_iteration": 3.2643260955810547 + }, + { + "auxiliary_loss_clip": 0.01586205, + "auxiliary_loss_mlp": 0.01056286, + "balance_loss_clip": 1.36522865, + "balance_loss_mlp": 1.03001237, + "epoch": 0.20045092439500978, + "flos": 24216713101440.0, + "grad_norm": 1.7806025653426771, + "language_loss": 0.82160246, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.84802735, + "num_input_tokens_seen": 72051395, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26281738, + "step": 3334, + "time_per_iteration": 2.874650478363037 + }, + { + "auxiliary_loss_clip": 0.01588854, + "auxiliary_loss_mlp": 0.01059805, + "balance_loss_clip": 1.36777544, + "balance_loss_mlp": 1.03278041, + "epoch": 0.20051104764767774, + "flos": 29399530348800.0, + "grad_norm": 2.4537902393964823, + "language_loss": 0.75845802, + "learning_rate": 3.702685645366134e-06, + "loss": 0.78494465, + "num_input_tokens_seen": 72071305, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.27062988, + "step": 3335, + "time_per_iteration": 4.2998998165130615 + }, + { + "auxiliary_loss_clip": 0.01586759, + "auxiliary_loss_mlp": 0.0105666, + "balance_loss_clip": 1.36848915, + "balance_loss_mlp": 1.0301832, + "epoch": 0.2005711709003457, + "flos": 23524311640320.0, + "grad_norm": 2.0922648465740235, + "language_loss": 0.81007087, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.83650506, + "num_input_tokens_seen": 72090165, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.26525879, + "step": 3336, + "time_per_iteration": 2.887704849243164 + }, + { + "auxiliary_loss_clip": 0.0158974, + "auxiliary_loss_mlp": 0.01051289, + "balance_loss_clip": 1.36758733, + "balance_loss_mlp": 1.02351367, + "epoch": 0.20063129415301367, + "flos": 22532261996160.0, + "grad_norm": 3.5389855101489682, + "language_loss": 0.79080951, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.81721985, + "num_input_tokens_seen": 72107210, + "router_z_loss_clip": 2.22070312, + "router_z_loss_mlp": 0.27746582, + "step": 3337, + "time_per_iteration": 4.26805567741394 + }, + { + "auxiliary_loss_clip": 0.01586676, + "auxiliary_loss_mlp": 0.01045316, + "balance_loss_clip": 1.36757278, + "balance_loss_mlp": 1.01870894, + "epoch": 0.20069141740568164, + "flos": 25969357399680.0, + "grad_norm": 1.9834392692861786, + "language_loss": 0.70029432, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.72661424, + "num_input_tokens_seen": 72126315, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.26611328, + "step": 3338, + "time_per_iteration": 4.306506156921387 + }, + { + "auxiliary_loss_clip": 0.01582696, + "auxiliary_loss_mlp": 0.01048834, + "balance_loss_clip": 1.36461306, + "balance_loss_mlp": 1.02289438, + "epoch": 0.2007515406583496, + "flos": 24801395397120.0, + "grad_norm": 2.0088450045011372, + "language_loss": 0.70591664, + "learning_rate": 3.701867867326735e-06, + "loss": 0.73223197, + "num_input_tokens_seen": 72146470, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.25927734, + "step": 3339, + "time_per_iteration": 2.9037976264953613 + }, + { + "auxiliary_loss_clip": 0.01593626, + "auxiliary_loss_mlp": 0.01042267, + "balance_loss_clip": 1.37043238, + "balance_loss_mlp": 1.01513481, + "epoch": 0.2008116639110176, + "flos": 37939802872320.0, + "grad_norm": 2.222300680736885, + "language_loss": 0.673666, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.7000249, + "num_input_tokens_seen": 72166600, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.27124023, + "step": 3340, + "time_per_iteration": 3.0024187564849854 + }, + { + "auxiliary_loss_clip": 0.01586252, + "auxiliary_loss_mlp": 0.0104539, + "balance_loss_clip": 1.36362672, + "balance_loss_mlp": 1.01806736, + "epoch": 0.20087178716368556, + "flos": 20750226560640.0, + "grad_norm": 2.1512200938767165, + "language_loss": 0.74818164, + "learning_rate": 3.701458591066019e-06, + "loss": 0.77449805, + "num_input_tokens_seen": 72185160, + "router_z_loss_clip": 2.22460938, + "router_z_loss_mlp": 0.2734375, + "step": 3341, + "time_per_iteration": 2.9285974502563477 + }, + { + "auxiliary_loss_clip": 0.01564848, + "auxiliary_loss_mlp": 0.01047288, + "balance_loss_clip": 1.35291338, + "balance_loss_mlp": 1.02157474, + "epoch": 0.20093191041635353, + "flos": 23852898512640.0, + "grad_norm": 1.771185986963292, + "language_loss": 0.73245651, + "learning_rate": 3.70125385615256e-06, + "loss": 0.75857782, + "num_input_tokens_seen": 72205160, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.25744629, + "step": 3342, + "time_per_iteration": 2.849736452102661 + }, + { + "auxiliary_loss_clip": 0.01582418, + "auxiliary_loss_mlp": 0.01044066, + "balance_loss_clip": 1.36224377, + "balance_loss_mlp": 1.01788759, + "epoch": 0.2009920336690215, + "flos": 21800334562560.0, + "grad_norm": 1.7792059932016684, + "language_loss": 0.7330147, + "learning_rate": 3.701049056727384e-06, + "loss": 0.75927961, + "num_input_tokens_seen": 72223555, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26220703, + "step": 3343, + "time_per_iteration": 2.8210582733154297 + }, + { + "auxiliary_loss_clip": 0.01574326, + "auxiliary_loss_mlp": 0.01053009, + "balance_loss_clip": 1.355986, + "balance_loss_mlp": 1.0248754, + "epoch": 0.20105215692168946, + "flos": 26370345231360.0, + "grad_norm": 2.0180269688608314, + "language_loss": 0.82461232, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.85088563, + "num_input_tokens_seen": 72242465, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.28173828, + "step": 3344, + "time_per_iteration": 2.861100673675537 + }, + { + "auxiliary_loss_clip": 0.01586797, + "auxiliary_loss_mlp": 0.01048644, + "balance_loss_clip": 1.36431682, + "balance_loss_mlp": 1.02103496, + "epoch": 0.20111228017435742, + "flos": 18816602486400.0, + "grad_norm": 14.210659737902155, + "language_loss": 0.84576428, + "learning_rate": 3.700639264372948e-06, + "loss": 0.87211871, + "num_input_tokens_seen": 72260655, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.27587891, + "step": 3345, + "time_per_iteration": 2.9493563175201416 + }, + { + "auxiliary_loss_clip": 0.01552171, + "auxiliary_loss_mlp": 0.01046852, + "balance_loss_clip": 1.34472752, + "balance_loss_mlp": 1.02090049, + "epoch": 0.20117240342702541, + "flos": 19984745468160.0, + "grad_norm": 2.2528793433826606, + "language_loss": 0.68853819, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.71452844, + "num_input_tokens_seen": 72279055, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.2598877, + "step": 3346, + "time_per_iteration": 2.8190698623657227 + }, + { + "auxiliary_loss_clip": 0.01566376, + "auxiliary_loss_mlp": 0.01050645, + "balance_loss_clip": 1.35100985, + "balance_loss_mlp": 1.02322721, + "epoch": 0.20123252667969338, + "flos": 23151493336320.0, + "grad_norm": 2.0325933506550817, + "language_loss": 0.75358665, + "learning_rate": 3.70022921406487e-06, + "loss": 0.77975684, + "num_input_tokens_seen": 72297895, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.27416992, + "step": 3347, + "time_per_iteration": 2.85384202003479 + }, + { + "auxiliary_loss_clip": 0.01575192, + "auxiliary_loss_mlp": 0.01046571, + "balance_loss_clip": 1.36037636, + "balance_loss_mlp": 1.02203774, + "epoch": 0.20129264993236134, + "flos": 23232038359680.0, + "grad_norm": 1.5476273844590394, + "language_loss": 0.87424833, + "learning_rate": 3.70002409219765e-06, + "loss": 0.90046597, + "num_input_tokens_seen": 72318385, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.2454834, + "step": 3348, + "time_per_iteration": 2.8500397205352783 + }, + { + "auxiliary_loss_clip": 0.01561283, + "auxiliary_loss_mlp": 0.01049103, + "balance_loss_clip": 1.35008824, + "balance_loss_mlp": 1.02117264, + "epoch": 0.2013527731850293, + "flos": 21881422523520.0, + "grad_norm": 1.6204012455760757, + "language_loss": 0.72134966, + "learning_rate": 3.699818905865346e-06, + "loss": 0.74745357, + "num_input_tokens_seen": 72338235, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.27905273, + "step": 3349, + "time_per_iteration": 2.827141046524048 + }, + { + "auxiliary_loss_clip": 0.01560071, + "auxiliary_loss_mlp": 0.01047, + "balance_loss_clip": 1.34725988, + "balance_loss_mlp": 1.01906896, + "epoch": 0.20141289643769728, + "flos": 18049854539520.0, + "grad_norm": 1.6261137327040167, + "language_loss": 0.72691935, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.75299007, + "num_input_tokens_seen": 72357825, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.27929688, + "step": 3350, + "time_per_iteration": 2.8394765853881836 + }, + { + "auxiliary_loss_clip": 0.01562639, + "auxiliary_loss_mlp": 0.01045775, + "balance_loss_clip": 1.3454603, + "balance_loss_mlp": 1.01871431, + "epoch": 0.20147301969036524, + "flos": 23961975022080.0, + "grad_norm": 2.7163386080977507, + "language_loss": 0.77451253, + "learning_rate": 3.69940833983661e-06, + "loss": 0.80059659, + "num_input_tokens_seen": 72376335, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.27062988, + "step": 3351, + "time_per_iteration": 2.8969576358795166 + }, + { + "auxiliary_loss_clip": 0.01585412, + "auxiliary_loss_mlp": 0.01046356, + "balance_loss_clip": 1.36264205, + "balance_loss_mlp": 1.01891398, + "epoch": 0.2015331429430332, + "flos": 25598213153280.0, + "grad_norm": 1.468398291704413, + "language_loss": 0.81827581, + "learning_rate": 3.699202960155748e-06, + "loss": 0.84459341, + "num_input_tokens_seen": 72395440, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.27416992, + "step": 3352, + "time_per_iteration": 2.9369630813598633 + }, + { + "auxiliary_loss_clip": 0.01568057, + "auxiliary_loss_mlp": 0.01044903, + "balance_loss_clip": 1.35160506, + "balance_loss_mlp": 1.01830745, + "epoch": 0.2015932661957012, + "flos": 26736422060160.0, + "grad_norm": 3.909242091666632, + "language_loss": 0.81689525, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.84302485, + "num_input_tokens_seen": 72414670, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.26599121, + "step": 3353, + "time_per_iteration": 2.8768374919891357 + }, + { + "auxiliary_loss_clip": 0.01568384, + "auxiliary_loss_mlp": 0.01042793, + "balance_loss_clip": 1.3553865, + "balance_loss_mlp": 1.01772332, + "epoch": 0.20165338944836916, + "flos": 15641665309440.0, + "grad_norm": 1.6819665675367104, + "language_loss": 0.91336012, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.9394719, + "num_input_tokens_seen": 72432210, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.25073242, + "step": 3354, + "time_per_iteration": 2.879809617996216 + }, + { + "auxiliary_loss_clip": 0.01334869, + "auxiliary_loss_mlp": 0.01029093, + "balance_loss_clip": 1.20712066, + "balance_loss_mlp": 1.00572836, + "epoch": 0.20171351270103713, + "flos": 57939524369280.0, + "grad_norm": 0.8542660202662613, + "language_loss": 0.55932653, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.58296621, + "num_input_tokens_seen": 72489225, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.23339844, + "step": 3355, + "time_per_iteration": 3.3341612815856934 + }, + { + "auxiliary_loss_clip": 0.01555245, + "auxiliary_loss_mlp": 0.01056335, + "balance_loss_clip": 1.3441633, + "balance_loss_mlp": 1.03142083, + "epoch": 0.2017736359537051, + "flos": 20824301577600.0, + "grad_norm": 1.5353229483779283, + "language_loss": 0.84946835, + "learning_rate": 3.698380797170751e-06, + "loss": 0.87558424, + "num_input_tokens_seen": 72508715, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.24902344, + "step": 3356, + "time_per_iteration": 2.9603211879730225 + }, + { + "auxiliary_loss_clip": 0.01596698, + "auxiliary_loss_mlp": 0.01051274, + "balance_loss_clip": 1.36980689, + "balance_loss_mlp": 1.02364111, + "epoch": 0.20183375920637306, + "flos": 17100814717440.0, + "grad_norm": 5.12005868688605, + "language_loss": 0.70976049, + "learning_rate": 3.698175095398085e-06, + "loss": 0.73624015, + "num_input_tokens_seen": 72525135, + "router_z_loss_clip": 2.26757812, + "router_z_loss_mlp": 0.27612305, + "step": 3357, + "time_per_iteration": 2.899057149887085 + }, + { + "auxiliary_loss_clip": 0.01574057, + "auxiliary_loss_mlp": 0.01045464, + "balance_loss_clip": 1.35492134, + "balance_loss_mlp": 1.0194521, + "epoch": 0.20189388245904102, + "flos": 18670624202880.0, + "grad_norm": 1.82743947712233, + "language_loss": 0.73174816, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.75794333, + "num_input_tokens_seen": 72543690, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.2598877, + "step": 3358, + "time_per_iteration": 2.7870848178863525 + }, + { + "auxiliary_loss_clip": 0.01541686, + "auxiliary_loss_mlp": 0.01053729, + "balance_loss_clip": 1.3343184, + "balance_loss_mlp": 1.02879024, + "epoch": 0.20195400571170902, + "flos": 16805555280000.0, + "grad_norm": 1.6989074160663957, + "language_loss": 0.84124184, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.86719596, + "num_input_tokens_seen": 72560725, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.24926758, + "step": 3359, + "time_per_iteration": 2.8331081867218018 + }, + { + "auxiliary_loss_clip": 0.01326239, + "auxiliary_loss_mlp": 0.01051141, + "balance_loss_clip": 1.19755876, + "balance_loss_mlp": 1.03120935, + "epoch": 0.20201412896437698, + "flos": 67202765856000.0, + "grad_norm": 0.7756399794665564, + "language_loss": 0.59048927, + "learning_rate": 3.697557603741482e-06, + "loss": 0.61426306, + "num_input_tokens_seen": 72621940, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.19921875, + "step": 3360, + "time_per_iteration": 3.3646411895751953 + }, + { + "auxiliary_loss_clip": 0.01575638, + "auxiliary_loss_mlp": 0.01049979, + "balance_loss_clip": 1.35691047, + "balance_loss_mlp": 1.02301383, + "epoch": 0.20207425221704495, + "flos": 21335044590720.0, + "grad_norm": 2.1923864211094988, + "language_loss": 0.64366353, + "learning_rate": 3.697351644435763e-06, + "loss": 0.66991973, + "num_input_tokens_seen": 72639135, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.26953125, + "step": 3361, + "time_per_iteration": 2.8059589862823486 + }, + { + "auxiliary_loss_clip": 0.01563754, + "auxiliary_loss_mlp": 0.01050612, + "balance_loss_clip": 1.35095561, + "balance_loss_mlp": 1.02488685, + "epoch": 0.2021343754697129, + "flos": 22537012700160.0, + "grad_norm": 2.1454466725536627, + "language_loss": 0.7700336, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.79617727, + "num_input_tokens_seen": 72658525, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.25732422, + "step": 3362, + "time_per_iteration": 2.8746564388275146 + }, + { + "auxiliary_loss_clip": 0.01563777, + "auxiliary_loss_mlp": 0.01048195, + "balance_loss_clip": 1.3508513, + "balance_loss_mlp": 1.02297008, + "epoch": 0.20219449872238088, + "flos": 19072200216960.0, + "grad_norm": 1.6759195322879317, + "language_loss": 0.77539647, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.80151612, + "num_input_tokens_seen": 72678085, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.25231934, + "step": 3363, + "time_per_iteration": 4.254496812820435 + }, + { + "auxiliary_loss_clip": 0.01563041, + "auxiliary_loss_mlp": 0.0104686, + "balance_loss_clip": 1.34997082, + "balance_loss_mlp": 1.02181375, + "epoch": 0.20225462197504884, + "flos": 24727637093760.0, + "grad_norm": 1.4112282268537386, + "language_loss": 0.76006937, + "learning_rate": 3.696733380367391e-06, + "loss": 0.7861684, + "num_input_tokens_seen": 72698695, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.25048828, + "step": 3364, + "time_per_iteration": 2.881741523742676 + }, + { + "auxiliary_loss_clip": 0.01581925, + "auxiliary_loss_mlp": 0.01052476, + "balance_loss_clip": 1.36285067, + "balance_loss_mlp": 1.02559435, + "epoch": 0.2023147452277168, + "flos": 22028531927040.0, + "grad_norm": 1.8897377600214003, + "language_loss": 0.72265816, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.74900222, + "num_input_tokens_seen": 72717880, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.26879883, + "step": 3365, + "time_per_iteration": 2.8943593502044678 + }, + { + "auxiliary_loss_clip": 0.01569021, + "auxiliary_loss_mlp": 0.01050553, + "balance_loss_clip": 1.35467625, + "balance_loss_mlp": 1.02433896, + "epoch": 0.2023748684803848, + "flos": 17753418737280.0, + "grad_norm": 1.806542196539308, + "language_loss": 0.86498231, + "learning_rate": 3.696320882607286e-06, + "loss": 0.89117801, + "num_input_tokens_seen": 72736410, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.26257324, + "step": 3366, + "time_per_iteration": 2.8551669120788574 + }, + { + "auxiliary_loss_clip": 0.0156845, + "auxiliary_loss_mlp": 0.01042616, + "balance_loss_clip": 1.35550869, + "balance_loss_mlp": 1.01659298, + "epoch": 0.20243499173305277, + "flos": 31151314995840.0, + "grad_norm": 1.5525971625876667, + "language_loss": 0.70110261, + "learning_rate": 3.696114537236335e-06, + "loss": 0.72721326, + "num_input_tokens_seen": 72758295, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.26013184, + "step": 3367, + "time_per_iteration": 2.920126438140869 + }, + { + "auxiliary_loss_clip": 0.01581005, + "auxiliary_loss_mlp": 0.01044249, + "balance_loss_clip": 1.36041212, + "balance_loss_mlp": 1.01616359, + "epoch": 0.20249511498572073, + "flos": 33852410933760.0, + "grad_norm": 1.8250431160483767, + "language_loss": 0.69113064, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.71738315, + "num_input_tokens_seen": 72782495, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.28063965, + "step": 3368, + "time_per_iteration": 2.926661491394043 + }, + { + "auxiliary_loss_clip": 0.01561295, + "auxiliary_loss_mlp": 0.01048439, + "balance_loss_clip": 1.34946942, + "balance_loss_mlp": 1.02196252, + "epoch": 0.2025552382383887, + "flos": 21225651367680.0, + "grad_norm": 1.5039366155660086, + "language_loss": 0.78484261, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.81093991, + "num_input_tokens_seen": 72801885, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.26464844, + "step": 3369, + "time_per_iteration": 2.895616292953491 + }, + { + "auxiliary_loss_clip": 0.01570482, + "auxiliary_loss_mlp": 0.01057808, + "balance_loss_clip": 1.35147834, + "balance_loss_mlp": 1.03131962, + "epoch": 0.20261536149105666, + "flos": 14655497489280.0, + "grad_norm": 3.791362517507997, + "language_loss": 0.67005777, + "learning_rate": 3.695495115253795e-06, + "loss": 0.69634068, + "num_input_tokens_seen": 72816990, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.26513672, + "step": 3370, + "time_per_iteration": 2.795042037963867 + }, + { + "auxiliary_loss_clip": 0.01313703, + "auxiliary_loss_mlp": 0.01058501, + "balance_loss_clip": 1.19372869, + "balance_loss_mlp": 1.03694785, + "epoch": 0.20267548474372463, + "flos": 66814265589120.0, + "grad_norm": 0.6895483383539973, + "language_loss": 0.5820756, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.60579765, + "num_input_tokens_seen": 72879240, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.21582031, + "step": 3371, + "time_per_iteration": 4.844992399215698 + }, + { + "auxiliary_loss_clip": 0.01557455, + "auxiliary_loss_mlp": 0.01050549, + "balance_loss_clip": 1.34516788, + "balance_loss_mlp": 1.02361929, + "epoch": 0.2027356079963926, + "flos": 24691459236480.0, + "grad_norm": 1.524627567641125, + "language_loss": 0.92225122, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.94833124, + "num_input_tokens_seen": 72899030, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.26940918, + "step": 3372, + "time_per_iteration": 4.343122243881226 + }, + { + "auxiliary_loss_clip": 0.01573193, + "auxiliary_loss_mlp": 0.01053591, + "balance_loss_clip": 1.35607803, + "balance_loss_mlp": 1.0251472, + "epoch": 0.20279573124906058, + "flos": 26403129728640.0, + "grad_norm": 1.695656185109376, + "language_loss": 0.79044497, + "learning_rate": 3.694875114631167e-06, + "loss": 0.81671274, + "num_input_tokens_seen": 72919190, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.28442383, + "step": 3373, + "time_per_iteration": 4.210289001464844 + }, + { + "auxiliary_loss_clip": 0.01551678, + "auxiliary_loss_mlp": 0.01048452, + "balance_loss_clip": 1.34427392, + "balance_loss_mlp": 1.02112961, + "epoch": 0.20285585450172855, + "flos": 33812432513280.0, + "grad_norm": 1.7280921838951797, + "language_loss": 0.72238815, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.74838948, + "num_input_tokens_seen": 72939720, + "router_z_loss_clip": 2.07519531, + "router_z_loss_mlp": 0.27294922, + "step": 3374, + "time_per_iteration": 2.946377992630005 + }, + { + "auxiliary_loss_clip": 0.01313018, + "auxiliary_loss_mlp": 0.010288, + "balance_loss_clip": 1.19479227, + "balance_loss_mlp": 1.00247908, + "epoch": 0.20291597775439651, + "flos": 71197324168320.0, + "grad_norm": 0.9735946020547792, + "language_loss": 0.62473059, + "learning_rate": 3.694461459520516e-06, + "loss": 0.64814878, + "num_input_tokens_seen": 73000015, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.26367188, + "step": 3375, + "time_per_iteration": 3.3196218013763428 + }, + { + "auxiliary_loss_clip": 0.01553456, + "auxiliary_loss_mlp": 0.01048339, + "balance_loss_clip": 1.34194064, + "balance_loss_mlp": 1.02199411, + "epoch": 0.20297610100706448, + "flos": 19502171982720.0, + "grad_norm": 1.577872935996985, + "language_loss": 0.83048964, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.85650754, + "num_input_tokens_seen": 73017675, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.26367188, + "step": 3376, + "time_per_iteration": 2.822425127029419 + }, + { + "auxiliary_loss_clip": 0.01573252, + "auxiliary_loss_mlp": 0.01050424, + "balance_loss_clip": 1.35553503, + "balance_loss_mlp": 1.02236223, + "epoch": 0.20303622425973245, + "flos": 25054640398080.0, + "grad_norm": 1.919334084219983, + "language_loss": 0.82510549, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.85134226, + "num_input_tokens_seen": 73036135, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.28051758, + "step": 3377, + "time_per_iteration": 2.8600258827209473 + }, + { + "auxiliary_loss_clip": 0.01568752, + "auxiliary_loss_mlp": 0.01056629, + "balance_loss_clip": 1.35478139, + "balance_loss_mlp": 1.03036773, + "epoch": 0.2030963475124004, + "flos": 21989729871360.0, + "grad_norm": 1.8155610280813599, + "language_loss": 0.78310513, + "learning_rate": 3.69384049496805e-06, + "loss": 0.80935895, + "num_input_tokens_seen": 73054075, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.26281738, + "step": 3378, + "time_per_iteration": 2.8825161457061768 + }, + { + "auxiliary_loss_clip": 0.01559555, + "auxiliary_loss_mlp": 0.01049681, + "balance_loss_clip": 1.34288204, + "balance_loss_mlp": 1.02190542, + "epoch": 0.2031564707650684, + "flos": 19509863598720.0, + "grad_norm": 1.8209140951875677, + "language_loss": 0.81249887, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.83859122, + "num_input_tokens_seen": 73073530, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.27770996, + "step": 3379, + "time_per_iteration": 2.8480472564697266 + }, + { + "auxiliary_loss_clip": 0.01546424, + "auxiliary_loss_mlp": 0.0105419, + "balance_loss_clip": 1.33819032, + "balance_loss_mlp": 1.028512, + "epoch": 0.20321659401773637, + "flos": 22757246979840.0, + "grad_norm": 1.6918580032740358, + "language_loss": 0.87753922, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.90354538, + "num_input_tokens_seen": 73092820, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.25708008, + "step": 3380, + "time_per_iteration": 2.8253355026245117 + }, + { + "auxiliary_loss_clip": 0.01566852, + "auxiliary_loss_mlp": 0.01050224, + "balance_loss_clip": 1.35207546, + "balance_loss_mlp": 1.02402163, + "epoch": 0.20327671727040433, + "flos": 22466104819200.0, + "grad_norm": 1.708777914703145, + "language_loss": 0.76351237, + "learning_rate": 3.693218952340186e-06, + "loss": 0.78968316, + "num_input_tokens_seen": 73113385, + "router_z_loss_clip": 2.14550781, + "router_z_loss_mlp": 0.26208496, + "step": 3381, + "time_per_iteration": 2.870901107788086 + }, + { + "auxiliary_loss_clip": 0.01576797, + "auxiliary_loss_mlp": 0.01053446, + "balance_loss_clip": 1.35819292, + "balance_loss_mlp": 1.026564, + "epoch": 0.2033368405230723, + "flos": 19544005440000.0, + "grad_norm": 1.8255344953339325, + "language_loss": 0.80834186, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.83464426, + "num_input_tokens_seen": 73131195, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.26855469, + "step": 3382, + "time_per_iteration": 2.830556631088257 + }, + { + "auxiliary_loss_clip": 0.01579837, + "auxiliary_loss_mlp": 0.01049087, + "balance_loss_clip": 1.36190856, + "balance_loss_mlp": 1.02097702, + "epoch": 0.20339696377574026, + "flos": 13817389213440.0, + "grad_norm": 1.7896673548708524, + "language_loss": 0.81402779, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.84031701, + "num_input_tokens_seen": 73148850, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.28100586, + "step": 3383, + "time_per_iteration": 2.8066368103027344 + }, + { + "auxiliary_loss_clip": 0.01568814, + "auxiliary_loss_mlp": 0.01046383, + "balance_loss_clip": 1.35407245, + "balance_loss_mlp": 1.01970446, + "epoch": 0.20345708702840823, + "flos": 20349057749760.0, + "grad_norm": 1.8452084591068358, + "language_loss": 0.75707567, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.78322762, + "num_input_tokens_seen": 73166775, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.26672363, + "step": 3384, + "time_per_iteration": 2.803147792816162 + }, + { + "auxiliary_loss_clip": 0.01592791, + "auxiliary_loss_mlp": 0.0105075, + "balance_loss_clip": 1.36652792, + "balance_loss_mlp": 1.022259, + "epoch": 0.2035172102810762, + "flos": 20342180540160.0, + "grad_norm": 2.720252133128083, + "language_loss": 0.78308415, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.80951959, + "num_input_tokens_seen": 73183215, + "router_z_loss_clip": 2.25976562, + "router_z_loss_mlp": 0.28479004, + "step": 3385, + "time_per_iteration": 2.77998423576355 + }, + { + "auxiliary_loss_clip": 0.0157207, + "auxiliary_loss_mlp": 0.01060598, + "balance_loss_clip": 1.35660267, + "balance_loss_mlp": 1.03284597, + "epoch": 0.2035773335337442, + "flos": 23341341093120.0, + "grad_norm": 1.8022732265513657, + "language_loss": 0.70314646, + "learning_rate": 3.692181763924639e-06, + "loss": 0.72947311, + "num_input_tokens_seen": 73203290, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.27770996, + "step": 3386, + "time_per_iteration": 2.8637287616729736 + }, + { + "auxiliary_loss_clip": 0.01579987, + "auxiliary_loss_mlp": 0.01056308, + "balance_loss_clip": 1.36350214, + "balance_loss_mlp": 1.02920032, + "epoch": 0.20363745678641215, + "flos": 28342137934080.0, + "grad_norm": 1.3274381715266834, + "language_loss": 0.81804127, + "learning_rate": 3.691974133706947e-06, + "loss": 0.84440422, + "num_input_tokens_seen": 73226185, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.27148438, + "step": 3387, + "time_per_iteration": 2.885659694671631 + }, + { + "auxiliary_loss_clip": 0.01555458, + "auxiliary_loss_mlp": 0.01049114, + "balance_loss_clip": 1.34636378, + "balance_loss_mlp": 1.02216101, + "epoch": 0.20369758003908012, + "flos": 18924728855040.0, + "grad_norm": 2.395837630485103, + "language_loss": 0.81010854, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.83615428, + "num_input_tokens_seen": 73243300, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.26965332, + "step": 3388, + "time_per_iteration": 2.8331298828125 + }, + { + "auxiliary_loss_clip": 0.01581407, + "auxiliary_loss_mlp": 0.01050164, + "balance_loss_clip": 1.36288714, + "balance_loss_mlp": 1.02331781, + "epoch": 0.20375770329174808, + "flos": 19215373322880.0, + "grad_norm": 1.627191866807711, + "language_loss": 0.72680414, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.75311983, + "num_input_tokens_seen": 73261490, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.26818848, + "step": 3389, + "time_per_iteration": 2.800248622894287 + }, + { + "auxiliary_loss_clip": 0.01570444, + "auxiliary_loss_mlp": 0.01048659, + "balance_loss_clip": 1.35702658, + "balance_loss_mlp": 1.02119267, + "epoch": 0.20381782654441605, + "flos": 19400470375680.0, + "grad_norm": 1.713563831960336, + "language_loss": 0.88053715, + "learning_rate": 3.691350858126404e-06, + "loss": 0.90672815, + "num_input_tokens_seen": 73280180, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.27453613, + "step": 3390, + "time_per_iteration": 2.814483642578125 + }, + { + "auxiliary_loss_clip": 0.01562017, + "auxiliary_loss_mlp": 0.01045176, + "balance_loss_clip": 1.35012078, + "balance_loss_mlp": 1.01794863, + "epoch": 0.203877949797084, + "flos": 24838659129600.0, + "grad_norm": 1.9458072995582858, + "language_loss": 0.73194063, + "learning_rate": 3.691142971316662e-06, + "loss": 0.75801253, + "num_input_tokens_seen": 73300680, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.27246094, + "step": 3391, + "time_per_iteration": 2.8968234062194824 + }, + { + "auxiliary_loss_clip": 0.01564363, + "auxiliary_loss_mlp": 0.01049732, + "balance_loss_clip": 1.35157716, + "balance_loss_mlp": 1.02329135, + "epoch": 0.20393807304975198, + "flos": 18012590807040.0, + "grad_norm": 2.2514636244515254, + "language_loss": 0.88136178, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.90750277, + "num_input_tokens_seen": 73316760, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.26452637, + "step": 3392, + "time_per_iteration": 2.8103151321411133 + }, + { + "auxiliary_loss_clip": 0.01575677, + "auxiliary_loss_mlp": 0.01050656, + "balance_loss_clip": 1.35782075, + "balance_loss_mlp": 1.02252245, + "epoch": 0.20399819630241997, + "flos": 24217663242240.0, + "grad_norm": 1.430969129202825, + "language_loss": 0.81655252, + "learning_rate": 3.69072700532013e-06, + "loss": 0.84281582, + "num_input_tokens_seen": 73339385, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.28149414, + "step": 3393, + "time_per_iteration": 2.9099128246307373 + }, + { + "auxiliary_loss_clip": 0.01563193, + "auxiliary_loss_mlp": 0.01042828, + "balance_loss_clip": 1.34995019, + "balance_loss_mlp": 1.01629186, + "epoch": 0.20405831955508794, + "flos": 20786856865920.0, + "grad_norm": 1.6902799845061478, + "language_loss": 0.86766255, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.89372271, + "num_input_tokens_seen": 73357235, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.26501465, + "step": 3394, + "time_per_iteration": 2.833338975906372 + }, + { + "auxiliary_loss_clip": 0.01562853, + "auxiliary_loss_mlp": 0.01047036, + "balance_loss_clip": 1.34879136, + "balance_loss_mlp": 1.02104819, + "epoch": 0.2041184428077559, + "flos": 15495325067520.0, + "grad_norm": 2.173526952963263, + "language_loss": 0.85175169, + "learning_rate": 3.69031078287345e-06, + "loss": 0.87785053, + "num_input_tokens_seen": 73374435, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.26025391, + "step": 3395, + "time_per_iteration": 2.8285036087036133 + }, + { + "auxiliary_loss_clip": 0.01576541, + "auxiliary_loss_mlp": 0.01045667, + "balance_loss_clip": 1.35749245, + "balance_loss_mlp": 1.0185231, + "epoch": 0.20417856606042387, + "flos": 15594854924160.0, + "grad_norm": 1.8811318074710468, + "language_loss": 0.85199505, + "learning_rate": 3.690102575501033e-06, + "loss": 0.8782171, + "num_input_tokens_seen": 73391025, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.2713623, + "step": 3396, + "time_per_iteration": 2.806762933731079 + }, + { + "auxiliary_loss_clip": 0.01568357, + "auxiliary_loss_mlp": 0.01047175, + "balance_loss_clip": 1.3544575, + "balance_loss_mlp": 1.01997185, + "epoch": 0.20423868931309183, + "flos": 24289883222400.0, + "grad_norm": 1.6999120911642334, + "language_loss": 0.77907741, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.80523276, + "num_input_tokens_seen": 73409270, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.27209473, + "step": 3397, + "time_per_iteration": 2.865786075592041 + }, + { + "auxiliary_loss_clip": 0.0156347, + "auxiliary_loss_mlp": 0.01045203, + "balance_loss_clip": 1.34764361, + "balance_loss_mlp": 1.0178566, + "epoch": 0.2042988125657598, + "flos": 18621551577600.0, + "grad_norm": 2.3538477056925227, + "language_loss": 0.88582194, + "learning_rate": 3.689685968497518e-06, + "loss": 0.91190863, + "num_input_tokens_seen": 73425225, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.2734375, + "step": 3398, + "time_per_iteration": 4.220952272415161 + }, + { + "auxiliary_loss_clip": 0.01580934, + "auxiliary_loss_mlp": 0.01045578, + "balance_loss_clip": 1.36369407, + "balance_loss_mlp": 1.01838589, + "epoch": 0.2043589358184278, + "flos": 17858377969920.0, + "grad_norm": 1.7882190994765854, + "language_loss": 0.78763294, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.81389809, + "num_input_tokens_seen": 73440940, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.27185059, + "step": 3399, + "time_per_iteration": 2.88181734085083 + }, + { + "auxiliary_loss_clip": 0.01573484, + "auxiliary_loss_mlp": 0.01048196, + "balance_loss_clip": 1.35576153, + "balance_loss_mlp": 1.02090871, + "epoch": 0.20441905907109575, + "flos": 21445478444160.0, + "grad_norm": 1.870615310201243, + "language_loss": 0.77180648, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.79802328, + "num_input_tokens_seen": 73458805, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.27282715, + "step": 3400, + "time_per_iteration": 2.8886966705322266 + }, + { + "auxiliary_loss_clip": 0.01554146, + "auxiliary_loss_mlp": 0.01046168, + "balance_loss_clip": 1.34287536, + "balance_loss_mlp": 1.0205853, + "epoch": 0.20447918232376372, + "flos": 27718789317120.0, + "grad_norm": 1.6491158864075368, + "language_loss": 0.806054, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.83205712, + "num_input_tokens_seen": 73479380, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.25610352, + "step": 3401, + "time_per_iteration": 2.8712358474731445 + }, + { + "auxiliary_loss_clip": 0.01568978, + "auxiliary_loss_mlp": 0.010455, + "balance_loss_clip": 1.35155797, + "balance_loss_mlp": 1.01866579, + "epoch": 0.20453930557643168, + "flos": 30537874990080.0, + "grad_norm": 1.8592813987203245, + "language_loss": 0.70438313, + "learning_rate": 3.688851985676991e-06, + "loss": 0.73052794, + "num_input_tokens_seen": 73505105, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.26818848, + "step": 3402, + "time_per_iteration": 2.9641377925872803 + }, + { + "auxiliary_loss_clip": 0.0157107, + "auxiliary_loss_mlp": 0.01047197, + "balance_loss_clip": 1.35342193, + "balance_loss_mlp": 1.01899242, + "epoch": 0.20459942882909965, + "flos": 18996632121600.0, + "grad_norm": 2.2130724212321238, + "language_loss": 0.82180291, + "learning_rate": 3.688643329848496e-06, + "loss": 0.84798557, + "num_input_tokens_seen": 73523700, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.28222656, + "step": 3403, + "time_per_iteration": 2.833972930908203 + }, + { + "auxiliary_loss_clip": 0.01571017, + "auxiliary_loss_mlp": 0.01047482, + "balance_loss_clip": 1.35335696, + "balance_loss_mlp": 1.02057588, + "epoch": 0.20465955208176762, + "flos": 20348605301760.0, + "grad_norm": 4.097731364701076, + "language_loss": 0.85115016, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.87733513, + "num_input_tokens_seen": 73542625, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.26916504, + "step": 3404, + "time_per_iteration": 2.8034298419952393 + }, + { + "auxiliary_loss_clip": 0.01568347, + "auxiliary_loss_mlp": 0.01055846, + "balance_loss_clip": 1.34937835, + "balance_loss_mlp": 1.02958453, + "epoch": 0.20471967533443558, + "flos": 21261105308160.0, + "grad_norm": 1.8670021321033277, + "language_loss": 0.8656106, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.8918525, + "num_input_tokens_seen": 73561450, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.26269531, + "step": 3405, + "time_per_iteration": 2.8429055213928223 + }, + { + "auxiliary_loss_clip": 0.01564625, + "auxiliary_loss_mlp": 0.01051034, + "balance_loss_clip": 1.34766698, + "balance_loss_mlp": 1.02498651, + "epoch": 0.20477979858710357, + "flos": 14509654940160.0, + "grad_norm": 2.23537153822877, + "language_loss": 0.85337508, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.87953162, + "num_input_tokens_seen": 73577155, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.26074219, + "step": 3406, + "time_per_iteration": 4.2840211391448975 + }, + { + "auxiliary_loss_clip": 0.01565572, + "auxiliary_loss_mlp": 0.01053449, + "balance_loss_clip": 1.35008192, + "balance_loss_mlp": 1.0268774, + "epoch": 0.20483992183977154, + "flos": 11407616415360.0, + "grad_norm": 1.9386880301435354, + "language_loss": 0.6910395, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.71722972, + "num_input_tokens_seen": 73594900, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.265625, + "step": 3407, + "time_per_iteration": 5.625287294387817 + }, + { + "auxiliary_loss_clip": 0.01560028, + "auxiliary_loss_mlp": 0.01048817, + "balance_loss_clip": 1.3459909, + "balance_loss_mlp": 1.0229249, + "epoch": 0.2049000450924395, + "flos": 19069033080960.0, + "grad_norm": 1.911111672445809, + "language_loss": 0.85104221, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.87713063, + "num_input_tokens_seen": 73613810, + "router_z_loss_clip": 2.14355469, + "router_z_loss_mlp": 0.2590332, + "step": 3408, + "time_per_iteration": 2.885241746902466 + }, + { + "auxiliary_loss_clip": 0.01580582, + "auxiliary_loss_mlp": 0.01063523, + "balance_loss_clip": 1.35981679, + "balance_loss_mlp": 1.03555703, + "epoch": 0.20496016834510747, + "flos": 14582689326720.0, + "grad_norm": 2.494459554719174, + "language_loss": 0.65775156, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.6841926, + "num_input_tokens_seen": 73631495, + "router_z_loss_clip": 2.20996094, + "router_z_loss_mlp": 0.27990723, + "step": 3409, + "time_per_iteration": 2.82252836227417 + }, + { + "auxiliary_loss_clip": 0.0158383, + "auxiliary_loss_mlp": 0.01066613, + "balance_loss_clip": 1.36650383, + "balance_loss_mlp": 1.04020858, + "epoch": 0.20502029159777543, + "flos": 22136613050880.0, + "grad_norm": 1.309368188669326, + "language_loss": 0.81072164, + "learning_rate": 3.687180946553745e-06, + "loss": 0.83722609, + "num_input_tokens_seen": 73652840, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.26416016, + "step": 3410, + "time_per_iteration": 2.962275981903076 + }, + { + "auxiliary_loss_clip": 0.0157824, + "auxiliary_loss_mlp": 0.01071021, + "balance_loss_clip": 1.36184382, + "balance_loss_mlp": 1.04531956, + "epoch": 0.2050804148504434, + "flos": 25377164467200.0, + "grad_norm": 1.9124316837304327, + "language_loss": 0.76939702, + "learning_rate": 3.686971778678803e-06, + "loss": 0.79588962, + "num_input_tokens_seen": 73672150, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.25695801, + "step": 3411, + "time_per_iteration": 2.903496026992798 + }, + { + "auxiliary_loss_clip": 0.01580842, + "auxiliary_loss_mlp": 0.01075601, + "balance_loss_clip": 1.36636209, + "balance_loss_mlp": 1.04893434, + "epoch": 0.2051405381031114, + "flos": 23629904300160.0, + "grad_norm": 1.8779360452222869, + "language_loss": 0.7509191, + "learning_rate": 3.686762546833722e-06, + "loss": 0.77748346, + "num_input_tokens_seen": 73691940, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.26696777, + "step": 3412, + "time_per_iteration": 2.900313377380371 + }, + { + "auxiliary_loss_clip": 0.01584663, + "auxiliary_loss_mlp": 0.01076936, + "balance_loss_clip": 1.36348057, + "balance_loss_mlp": 1.05055451, + "epoch": 0.20520066135577936, + "flos": 19572989374080.0, + "grad_norm": 1.9818385946795365, + "language_loss": 0.78916478, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.81578082, + "num_input_tokens_seen": 73709080, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.26416016, + "step": 3413, + "time_per_iteration": 2.8083720207214355 + }, + { + "auxiliary_loss_clip": 0.01578856, + "auxiliary_loss_mlp": 0.01077696, + "balance_loss_clip": 1.36824775, + "balance_loss_mlp": 1.05132675, + "epoch": 0.20526078460844732, + "flos": 17685406523520.0, + "grad_norm": 1.829279381207386, + "language_loss": 0.85364246, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.88020802, + "num_input_tokens_seen": 73727670, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.26367188, + "step": 3414, + "time_per_iteration": 2.8397960662841797 + }, + { + "auxiliary_loss_clip": 0.01581591, + "auxiliary_loss_mlp": 0.01064901, + "balance_loss_clip": 1.36653423, + "balance_loss_mlp": 1.04014087, + "epoch": 0.2053209078611153, + "flos": 21508558974720.0, + "grad_norm": 1.8043040873657037, + "language_loss": 0.81929553, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.84576046, + "num_input_tokens_seen": 73747170, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.24755859, + "step": 3415, + "time_per_iteration": 2.8536548614501953 + }, + { + "auxiliary_loss_clip": 0.01590431, + "auxiliary_loss_mlp": 0.01067713, + "balance_loss_clip": 1.3775692, + "balance_loss_mlp": 1.04314435, + "epoch": 0.20538103111378325, + "flos": 25673826493440.0, + "grad_norm": 2.508462868875211, + "language_loss": 0.74312854, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.76970994, + "num_input_tokens_seen": 73767690, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.24584961, + "step": 3416, + "time_per_iteration": 2.909085988998413 + }, + { + "auxiliary_loss_clip": 0.01587066, + "auxiliary_loss_mlp": 0.01065772, + "balance_loss_clip": 1.36920357, + "balance_loss_mlp": 1.04066622, + "epoch": 0.20544115436645122, + "flos": 23159365931520.0, + "grad_norm": 2.074122135644117, + "language_loss": 0.79561067, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.82213902, + "num_input_tokens_seen": 73786900, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.25109863, + "step": 3417, + "time_per_iteration": 2.881777763366699 + }, + { + "auxiliary_loss_clip": 0.015977, + "auxiliary_loss_mlp": 0.01066917, + "balance_loss_clip": 1.37863374, + "balance_loss_mlp": 1.0402503, + "epoch": 0.20550127761911918, + "flos": 19399746458880.0, + "grad_norm": 2.0769245581817093, + "language_loss": 0.88567913, + "learning_rate": 3.685505812834798e-06, + "loss": 0.91232526, + "num_input_tokens_seen": 73804515, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.26647949, + "step": 3418, + "time_per_iteration": 2.7952401638031006 + }, + { + "auxiliary_loss_clip": 0.0159254, + "auxiliary_loss_mlp": 0.01058128, + "balance_loss_clip": 1.37403691, + "balance_loss_mlp": 1.03175867, + "epoch": 0.20556140087178718, + "flos": 22903360997760.0, + "grad_norm": 2.072394255276017, + "language_loss": 0.63064229, + "learning_rate": 3.685296133421035e-06, + "loss": 0.65714896, + "num_input_tokens_seen": 73822910, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.26391602, + "step": 3419, + "time_per_iteration": 2.841963291168213 + }, + { + "auxiliary_loss_clip": 0.01621516, + "auxiliary_loss_mlp": 0.01065689, + "balance_loss_clip": 1.39964283, + "balance_loss_mlp": 1.0388906, + "epoch": 0.20562152412445514, + "flos": 19798517295360.0, + "grad_norm": 1.6759174243060417, + "language_loss": 0.86828744, + "learning_rate": 3.685086390100674e-06, + "loss": 0.8951596, + "num_input_tokens_seen": 73841160, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.26818848, + "step": 3420, + "time_per_iteration": 2.834890127182007 + }, + { + "auxiliary_loss_clip": 0.01586799, + "auxiliary_loss_mlp": 0.01058655, + "balance_loss_clip": 1.3695966, + "balance_loss_mlp": 1.03284669, + "epoch": 0.2056816473771231, + "flos": 31513998464640.0, + "grad_norm": 2.1598218163010587, + "language_loss": 0.72670245, + "learning_rate": 3.684876582881668e-06, + "loss": 0.75315696, + "num_input_tokens_seen": 73862795, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.25817871, + "step": 3421, + "time_per_iteration": 3.0526740550994873 + }, + { + "auxiliary_loss_clip": 0.01588247, + "auxiliary_loss_mlp": 0.01048246, + "balance_loss_clip": 1.3733784, + "balance_loss_mlp": 1.02134025, + "epoch": 0.20574177062979107, + "flos": 23268804399360.0, + "grad_norm": 1.97278585463943, + "language_loss": 0.72680414, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.75316912, + "num_input_tokens_seen": 73881525, + "router_z_loss_clip": 2.14746094, + "router_z_loss_mlp": 0.26916504, + "step": 3422, + "time_per_iteration": 2.877342939376831 + }, + { + "auxiliary_loss_clip": 0.0133716, + "auxiliary_loss_mlp": 0.01046688, + "balance_loss_clip": 1.21383905, + "balance_loss_mlp": 1.01884103, + "epoch": 0.20580189388245904, + "flos": 70341389441280.0, + "grad_norm": 0.7601778547504336, + "language_loss": 0.55680513, + "learning_rate": 3.684456776779548e-06, + "loss": 0.58064365, + "num_input_tokens_seen": 73937775, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.27929688, + "step": 3423, + "time_per_iteration": 3.4115304946899414 + }, + { + "auxiliary_loss_clip": 0.01599721, + "auxiliary_loss_mlp": 0.01047493, + "balance_loss_clip": 1.37859416, + "balance_loss_mlp": 1.02086139, + "epoch": 0.205862017135127, + "flos": 30750191429760.0, + "grad_norm": 2.094599541194899, + "language_loss": 0.72404921, + "learning_rate": 3.684246777912353e-06, + "loss": 0.75052136, + "num_input_tokens_seen": 73958250, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.26660156, + "step": 3424, + "time_per_iteration": 2.9402096271514893 + }, + { + "auxiliary_loss_clip": 0.01586837, + "auxiliary_loss_mlp": 0.01047708, + "balance_loss_clip": 1.37233555, + "balance_loss_mlp": 1.020087, + "epoch": 0.20592214038779497, + "flos": 21334456408320.0, + "grad_norm": 1.4247170936094284, + "language_loss": 0.7572099, + "learning_rate": 3.684036715178351e-06, + "loss": 0.78355527, + "num_input_tokens_seen": 73977775, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.27624512, + "step": 3425, + "time_per_iteration": 2.832782030105591 + }, + { + "auxiliary_loss_clip": 0.01568186, + "auxiliary_loss_mlp": 0.01049307, + "balance_loss_clip": 1.35779631, + "balance_loss_mlp": 1.02252054, + "epoch": 0.20598226364046296, + "flos": 22901505960960.0, + "grad_norm": 1.70594571267528, + "language_loss": 0.89215666, + "learning_rate": 3.683826588585508e-06, + "loss": 0.91833162, + "num_input_tokens_seen": 73996590, + "router_z_loss_clip": 2.10449219, + "router_z_loss_mlp": 0.26818848, + "step": 3426, + "time_per_iteration": 2.865995168685913 + }, + { + "auxiliary_loss_clip": 0.01568928, + "auxiliary_loss_mlp": 0.01045632, + "balance_loss_clip": 1.35639298, + "balance_loss_mlp": 1.0184164, + "epoch": 0.20604238689313092, + "flos": 23889121614720.0, + "grad_norm": 1.4613035675479986, + "language_loss": 0.78380322, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.8099488, + "num_input_tokens_seen": 74015935, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.27185059, + "step": 3427, + "time_per_iteration": 2.8604531288146973 + }, + { + "auxiliary_loss_clip": 0.01587445, + "auxiliary_loss_mlp": 0.01043859, + "balance_loss_clip": 1.36824512, + "balance_loss_mlp": 1.01675141, + "epoch": 0.2061025101457989, + "flos": 22501604004480.0, + "grad_norm": 1.4534748991415538, + "language_loss": 0.74764431, + "learning_rate": 3.683406143855174e-06, + "loss": 0.77395743, + "num_input_tokens_seen": 74036575, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.27087402, + "step": 3428, + "time_per_iteration": 2.8366570472717285 + }, + { + "auxiliary_loss_clip": 0.0159098, + "auxiliary_loss_mlp": 0.01047224, + "balance_loss_clip": 1.37088776, + "balance_loss_mlp": 1.0191741, + "epoch": 0.20616263339846685, + "flos": 22787904971520.0, + "grad_norm": 1.705467179096323, + "language_loss": 0.74469471, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.77107674, + "num_input_tokens_seen": 74055365, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.28015137, + "step": 3429, + "time_per_iteration": 2.875523090362549 + }, + { + "auxiliary_loss_clip": 0.01596922, + "auxiliary_loss_mlp": 0.0104943, + "balance_loss_clip": 1.37611783, + "balance_loss_mlp": 1.0215584, + "epoch": 0.20622275665113482, + "flos": 20890820712960.0, + "grad_norm": 1.670757551701038, + "language_loss": 0.85881221, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.88527572, + "num_input_tokens_seen": 74074875, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.27880859, + "step": 3430, + "time_per_iteration": 2.8344533443450928 + }, + { + "auxiliary_loss_clip": 0.01576353, + "auxiliary_loss_mlp": 0.01047941, + "balance_loss_clip": 1.35923457, + "balance_loss_mlp": 1.0204041, + "epoch": 0.20628287990380278, + "flos": 19363840070400.0, + "grad_norm": 1.6153644499145472, + "language_loss": 0.70356554, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.72980845, + "num_input_tokens_seen": 74094505, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.27563477, + "step": 3431, + "time_per_iteration": 2.83349347114563 + }, + { + "auxiliary_loss_clip": 0.01329041, + "auxiliary_loss_mlp": 0.01079151, + "balance_loss_clip": 1.19957983, + "balance_loss_mlp": 1.04691648, + "epoch": 0.20634300315647078, + "flos": 71547474827520.0, + "grad_norm": 0.8294178462842249, + "language_loss": 0.6026237, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.62670565, + "num_input_tokens_seen": 74158500, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.32226562, + "step": 3432, + "time_per_iteration": 3.505629301071167 + }, + { + "auxiliary_loss_clip": 0.01582415, + "auxiliary_loss_mlp": 0.01047693, + "balance_loss_clip": 1.36637437, + "balance_loss_mlp": 1.02001262, + "epoch": 0.20640312640913874, + "flos": 21733453468800.0, + "grad_norm": 1.5153538888255014, + "language_loss": 0.73550439, + "learning_rate": 3.682353915057679e-06, + "loss": 0.76180547, + "num_input_tokens_seen": 74176685, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.27685547, + "step": 3433, + "time_per_iteration": 4.274772882461548 + }, + { + "auxiliary_loss_clip": 0.01588292, + "auxiliary_loss_mlp": 0.01045972, + "balance_loss_clip": 1.36679935, + "balance_loss_mlp": 1.01769543, + "epoch": 0.2064632496618067, + "flos": 20563364960640.0, + "grad_norm": 2.122072724626134, + "language_loss": 0.87259561, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.89893824, + "num_input_tokens_seen": 74194935, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.28295898, + "step": 3434, + "time_per_iteration": 2.8769097328186035 + }, + { + "auxiliary_loss_clip": 0.01581808, + "auxiliary_loss_mlp": 0.01051806, + "balance_loss_clip": 1.36162806, + "balance_loss_mlp": 1.0241735, + "epoch": 0.20652337291447467, + "flos": 29834750511360.0, + "grad_norm": 1.626771597166887, + "language_loss": 0.70639431, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.73273045, + "num_input_tokens_seen": 74215400, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.27661133, + "step": 3435, + "time_per_iteration": 2.933743953704834 + }, + { + "auxiliary_loss_clip": 0.01570664, + "auxiliary_loss_mlp": 0.0105372, + "balance_loss_clip": 1.35717177, + "balance_loss_mlp": 1.02346456, + "epoch": 0.20658349616714264, + "flos": 26224819395840.0, + "grad_norm": 1.656085285368096, + "language_loss": 0.90389282, + "learning_rate": 3.681721812174988e-06, + "loss": 0.93013668, + "num_input_tokens_seen": 74234090, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.30249023, + "step": 3436, + "time_per_iteration": 2.8787035942077637 + }, + { + "auxiliary_loss_clip": 0.01575209, + "auxiliary_loss_mlp": 0.01049943, + "balance_loss_clip": 1.35999656, + "balance_loss_mlp": 1.02097559, + "epoch": 0.2066436194198106, + "flos": 26005399522560.0, + "grad_norm": 1.8394290682720231, + "language_loss": 0.77946204, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.80571353, + "num_input_tokens_seen": 74253345, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.28955078, + "step": 3437, + "time_per_iteration": 2.900306224822998 + }, + { + "auxiliary_loss_clip": 0.01578671, + "auxiliary_loss_mlp": 0.01049961, + "balance_loss_clip": 1.36131191, + "balance_loss_mlp": 1.0226624, + "epoch": 0.20670374267247857, + "flos": 21370996224000.0, + "grad_norm": 2.1893917164752366, + "language_loss": 0.78904295, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.81532925, + "num_input_tokens_seen": 74271615, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.27294922, + "step": 3438, + "time_per_iteration": 2.8084795475006104 + }, + { + "auxiliary_loss_clip": 0.01325891, + "auxiliary_loss_mlp": 0.01052426, + "balance_loss_clip": 1.19715881, + "balance_loss_mlp": 1.02267098, + "epoch": 0.20676386592514656, + "flos": 66414046919040.0, + "grad_norm": 0.8534668033017735, + "language_loss": 0.67208529, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.69586849, + "num_input_tokens_seen": 74331390, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.296875, + "step": 3439, + "time_per_iteration": 3.3585543632507324 + }, + { + "auxiliary_loss_clip": 0.01581094, + "auxiliary_loss_mlp": 0.01049264, + "balance_loss_clip": 1.36050439, + "balance_loss_mlp": 1.02198887, + "epoch": 0.20682398917781453, + "flos": 17283423306240.0, + "grad_norm": 1.937195905097359, + "language_loss": 0.8508023, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.87710583, + "num_input_tokens_seen": 74347335, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.27319336, + "step": 3440, + "time_per_iteration": 2.8208167552948 + }, + { + "auxiliary_loss_clip": 0.01592484, + "auxiliary_loss_mlp": 0.01049226, + "balance_loss_clip": 1.37389922, + "balance_loss_mlp": 1.02135491, + "epoch": 0.2068841124304825, + "flos": 18086349110400.0, + "grad_norm": 2.482453324957035, + "language_loss": 0.85666394, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.88308102, + "num_input_tokens_seen": 74366310, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.27880859, + "step": 3441, + "time_per_iteration": 4.186328887939453 + }, + { + "auxiliary_loss_clip": 0.0157685, + "auxiliary_loss_mlp": 0.01047439, + "balance_loss_clip": 1.36308777, + "balance_loss_mlp": 1.01931798, + "epoch": 0.20694423568315046, + "flos": 27359092005120.0, + "grad_norm": 1.6412561347328074, + "language_loss": 0.86323321, + "learning_rate": 3.680455884806959e-06, + "loss": 0.88947612, + "num_input_tokens_seen": 74387100, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.28161621, + "step": 3442, + "time_per_iteration": 4.448321580886841 + }, + { + "auxiliary_loss_clip": 0.01584089, + "auxiliary_loss_mlp": 0.01047397, + "balance_loss_clip": 1.36305642, + "balance_loss_mlp": 1.01939511, + "epoch": 0.20700435893581842, + "flos": 20239438302720.0, + "grad_norm": 1.9501647463495295, + "language_loss": 0.74013257, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.76644742, + "num_input_tokens_seen": 74404460, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.28015137, + "step": 3443, + "time_per_iteration": 2.82975697517395 + }, + { + "auxiliary_loss_clip": 0.01563841, + "auxiliary_loss_mlp": 0.0104767, + "balance_loss_clip": 1.35025549, + "balance_loss_mlp": 1.02116942, + "epoch": 0.2070644821884864, + "flos": 20640788092800.0, + "grad_norm": 1.7762047370447032, + "language_loss": 0.86638212, + "learning_rate": 3.680033399147797e-06, + "loss": 0.89249718, + "num_input_tokens_seen": 74423790, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.26501465, + "step": 3444, + "time_per_iteration": 2.8425731658935547 + }, + { + "auxiliary_loss_clip": 0.01315696, + "auxiliary_loss_mlp": 0.01038195, + "balance_loss_clip": 1.19226766, + "balance_loss_mlp": 1.00882173, + "epoch": 0.20712460544115438, + "flos": 65970592202880.0, + "grad_norm": 0.7043319373230057, + "language_loss": 0.57186842, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.59540731, + "num_input_tokens_seen": 74488130, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.29296875, + "step": 3445, + "time_per_iteration": 3.3234875202178955 + }, + { + "auxiliary_loss_clip": 0.01566762, + "auxiliary_loss_mlp": 0.01050476, + "balance_loss_clip": 1.3539362, + "balance_loss_mlp": 1.02334428, + "epoch": 0.20718472869382235, + "flos": 19434747951360.0, + "grad_norm": 1.6958014434502875, + "language_loss": 0.78874898, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.81492138, + "num_input_tokens_seen": 74506720, + "router_z_loss_clip": 2.12792969, + "router_z_loss_mlp": 0.27172852, + "step": 3446, + "time_per_iteration": 2.9626452922821045 + }, + { + "auxiliary_loss_clip": 0.01602171, + "auxiliary_loss_mlp": 0.0105754, + "balance_loss_clip": 1.37538815, + "balance_loss_mlp": 1.02921557, + "epoch": 0.2072448519464903, + "flos": 24509846033280.0, + "grad_norm": 2.4479927321522728, + "language_loss": 0.63686377, + "learning_rate": 3.679399192876334e-06, + "loss": 0.66346085, + "num_input_tokens_seen": 74525330, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.28283691, + "step": 3447, + "time_per_iteration": 2.8674073219299316 + }, + { + "auxiliary_loss_clip": 0.0156301, + "auxiliary_loss_mlp": 0.01047311, + "balance_loss_clip": 1.34550107, + "balance_loss_mlp": 1.01990485, + "epoch": 0.20730497519915828, + "flos": 23086014831360.0, + "grad_norm": 1.6221661041949098, + "language_loss": 0.87681437, + "learning_rate": 3.679187663409184e-06, + "loss": 0.90291756, + "num_input_tokens_seen": 74544535, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.27404785, + "step": 3448, + "time_per_iteration": 2.855940818786621 + }, + { + "auxiliary_loss_clip": 0.01566543, + "auxiliary_loss_mlp": 0.01047412, + "balance_loss_clip": 1.34997296, + "balance_loss_mlp": 1.01995826, + "epoch": 0.20736509845182624, + "flos": 21078858677760.0, + "grad_norm": 2.6909934959531654, + "language_loss": 0.76208544, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.78822494, + "num_input_tokens_seen": 74562300, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.27478027, + "step": 3449, + "time_per_iteration": 2.8234074115753174 + }, + { + "auxiliary_loss_clip": 0.01582788, + "auxiliary_loss_mlp": 0.01051529, + "balance_loss_clip": 1.36156487, + "balance_loss_mlp": 1.02363348, + "epoch": 0.2074252217044942, + "flos": 17640632154240.0, + "grad_norm": 1.756368339629202, + "language_loss": 0.77057946, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.79692256, + "num_input_tokens_seen": 74580080, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.27880859, + "step": 3450, + "time_per_iteration": 2.84580659866333 + }, + { + "auxiliary_loss_clip": 0.0157278, + "auxiliary_loss_mlp": 0.01045852, + "balance_loss_clip": 1.35498309, + "balance_loss_mlp": 1.01903009, + "epoch": 0.20748534495716217, + "flos": 23556643689600.0, + "grad_norm": 1.6468851500814907, + "language_loss": 0.83355117, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.85973746, + "num_input_tokens_seen": 74598980, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.26831055, + "step": 3451, + "time_per_iteration": 2.8549022674560547 + }, + { + "auxiliary_loss_clip": 0.01311624, + "auxiliary_loss_mlp": 0.01027815, + "balance_loss_clip": 1.18950224, + "balance_loss_mlp": 0.99329203, + "epoch": 0.20754546820983016, + "flos": 52277346017280.0, + "grad_norm": 0.8066930990930274, + "language_loss": 0.56667078, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.59006512, + "num_input_tokens_seen": 74655275, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.34570312, + "step": 3452, + "time_per_iteration": 3.279888868331909 + }, + { + "auxiliary_loss_clip": 0.01570087, + "auxiliary_loss_mlp": 0.01054446, + "balance_loss_clip": 1.35325408, + "balance_loss_mlp": 1.02707577, + "epoch": 0.20760559146249813, + "flos": 20422046891520.0, + "grad_norm": 2.691345957880228, + "language_loss": 0.89589322, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.92213857, + "num_input_tokens_seen": 74674560, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.27416992, + "step": 3453, + "time_per_iteration": 2.8707380294799805 + }, + { + "auxiliary_loss_clip": 0.01581392, + "auxiliary_loss_mlp": 0.01049587, + "balance_loss_clip": 1.36344481, + "balance_loss_mlp": 1.02133489, + "epoch": 0.2076657147151661, + "flos": 23196584419200.0, + "grad_norm": 1.5410929440294723, + "language_loss": 0.81685019, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.84315991, + "num_input_tokens_seen": 74694500, + "router_z_loss_clip": 2.18066406, + "router_z_loss_mlp": 0.28271484, + "step": 3454, + "time_per_iteration": 2.872332811355591 + }, + { + "auxiliary_loss_clip": 0.01570322, + "auxiliary_loss_mlp": 0.01051253, + "balance_loss_clip": 1.35353553, + "balance_loss_mlp": 1.02483582, + "epoch": 0.20772583796783406, + "flos": 18301561217280.0, + "grad_norm": 2.7930499586468382, + "language_loss": 0.78942257, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.8156383, + "num_input_tokens_seen": 74710485, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.26416016, + "step": 3455, + "time_per_iteration": 2.9198086261749268 + }, + { + "auxiliary_loss_clip": 0.01562658, + "auxiliary_loss_mlp": 0.01048461, + "balance_loss_clip": 1.34996319, + "balance_loss_mlp": 1.02287877, + "epoch": 0.20778596122050202, + "flos": 17611693464960.0, + "grad_norm": 1.6471282649710617, + "language_loss": 0.81703985, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.84315097, + "num_input_tokens_seen": 74727450, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.25610352, + "step": 3456, + "time_per_iteration": 2.846808671951294 + }, + { + "auxiliary_loss_clip": 0.01582619, + "auxiliary_loss_mlp": 0.01048933, + "balance_loss_clip": 1.36470819, + "balance_loss_mlp": 1.02239668, + "epoch": 0.20784608447317, + "flos": 23816132472960.0, + "grad_norm": 1.5406226566986498, + "language_loss": 0.79434925, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.82066482, + "num_input_tokens_seen": 74746725, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.26538086, + "step": 3457, + "time_per_iteration": 2.9080049991607666 + }, + { + "auxiliary_loss_clip": 0.01578715, + "auxiliary_loss_mlp": 0.01058672, + "balance_loss_clip": 1.3605057, + "balance_loss_mlp": 1.03123021, + "epoch": 0.20790620772583795, + "flos": 17648188035840.0, + "grad_norm": 1.7295085509806147, + "language_loss": 0.84657764, + "learning_rate": 3.677068867939333e-06, + "loss": 0.87295163, + "num_input_tokens_seen": 74765255, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.27441406, + "step": 3458, + "time_per_iteration": 2.8262007236480713 + }, + { + "auxiliary_loss_clip": 0.01576111, + "auxiliary_loss_mlp": 0.01043747, + "balance_loss_clip": 1.36307859, + "balance_loss_mlp": 1.01808119, + "epoch": 0.20796633097850595, + "flos": 27685145168640.0, + "grad_norm": 1.849088510950195, + "language_loss": 0.76521945, + "learning_rate": 3.676856638489272e-06, + "loss": 0.79141802, + "num_input_tokens_seen": 74785710, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.25695801, + "step": 3459, + "time_per_iteration": 2.8809902667999268 + }, + { + "auxiliary_loss_clip": 0.01571354, + "auxiliary_loss_mlp": 0.01046575, + "balance_loss_clip": 1.35959804, + "balance_loss_mlp": 1.01991987, + "epoch": 0.2080264542311739, + "flos": 19255170764160.0, + "grad_norm": 1.7809797590188388, + "language_loss": 0.77902198, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.80520129, + "num_input_tokens_seen": 74804490, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.26660156, + "step": 3460, + "time_per_iteration": 2.873784303665161 + }, + { + "auxiliary_loss_clip": 0.01566899, + "auxiliary_loss_mlp": 0.01040981, + "balance_loss_clip": 1.3518126, + "balance_loss_mlp": 1.01524377, + "epoch": 0.20808657748384188, + "flos": 27537130869120.0, + "grad_norm": 1.9552043096463112, + "language_loss": 0.76221961, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.78829849, + "num_input_tokens_seen": 74826340, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.25744629, + "step": 3461, + "time_per_iteration": 2.917376756668091 + }, + { + "auxiliary_loss_clip": 0.01580365, + "auxiliary_loss_mlp": 0.01043064, + "balance_loss_clip": 1.35902429, + "balance_loss_mlp": 1.01729083, + "epoch": 0.20814670073650984, + "flos": 26918306732160.0, + "grad_norm": 1.8356328498650532, + "language_loss": 0.89310598, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.91934019, + "num_input_tokens_seen": 74844960, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.25769043, + "step": 3462, + "time_per_iteration": 2.887219190597534 + }, + { + "auxiliary_loss_clip": 0.01304387, + "auxiliary_loss_mlp": 0.01039173, + "balance_loss_clip": 1.18221819, + "balance_loss_mlp": 1.00941837, + "epoch": 0.2082068239891778, + "flos": 70206948581760.0, + "grad_norm": 0.7629100914793379, + "language_loss": 0.59075403, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.61418962, + "num_input_tokens_seen": 74909075, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.296875, + "step": 3463, + "time_per_iteration": 3.4774398803710938 + }, + { + "auxiliary_loss_clip": 0.0157642, + "auxiliary_loss_mlp": 0.01050762, + "balance_loss_clip": 1.3556124, + "balance_loss_mlp": 1.02329624, + "epoch": 0.20826694724184577, + "flos": 24618786808320.0, + "grad_norm": 2.4439582278070344, + "language_loss": 0.68087924, + "learning_rate": 3.675794537601429e-06, + "loss": 0.70715117, + "num_input_tokens_seen": 74928125, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.27490234, + "step": 3464, + "time_per_iteration": 2.8441696166992188 + }, + { + "auxiliary_loss_clip": 0.01573408, + "auxiliary_loss_mlp": 0.0105048, + "balance_loss_clip": 1.35481811, + "balance_loss_mlp": 1.02408743, + "epoch": 0.20832707049451377, + "flos": 12899640810240.0, + "grad_norm": 1.956848906998702, + "language_loss": 0.85159594, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.8778348, + "num_input_tokens_seen": 74945090, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.26403809, + "step": 3465, + "time_per_iteration": 2.8777337074279785 + }, + { + "auxiliary_loss_clip": 0.01583619, + "auxiliary_loss_mlp": 0.01048306, + "balance_loss_clip": 1.36592889, + "balance_loss_mlp": 1.02142465, + "epoch": 0.20838719374718173, + "flos": 22208606807040.0, + "grad_norm": 2.34575015798552, + "language_loss": 0.8317886, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.85810781, + "num_input_tokens_seen": 74963630, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.2689209, + "step": 3466, + "time_per_iteration": 2.8336503505706787 + }, + { + "auxiliary_loss_clip": 0.01562946, + "auxiliary_loss_mlp": 0.01043095, + "balance_loss_clip": 1.35179138, + "balance_loss_mlp": 1.01823974, + "epoch": 0.2084473169998497, + "flos": 15167009664000.0, + "grad_norm": 1.8280312526848845, + "language_loss": 0.83002418, + "learning_rate": 3.675156514448716e-06, + "loss": 0.85608464, + "num_input_tokens_seen": 74981875, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.24853516, + "step": 3467, + "time_per_iteration": 2.8921115398406982 + }, + { + "auxiliary_loss_clip": 0.01546349, + "auxiliary_loss_mlp": 0.01046472, + "balance_loss_clip": 1.34109533, + "balance_loss_mlp": 1.02165294, + "epoch": 0.20850744025251766, + "flos": 17465624691840.0, + "grad_norm": 1.7567668956238731, + "language_loss": 0.82468915, + "learning_rate": 3.674943713009518e-06, + "loss": 0.85061741, + "num_input_tokens_seen": 74999155, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.24816895, + "step": 3468, + "time_per_iteration": 4.346414089202881 + }, + { + "auxiliary_loss_clip": 0.01591818, + "auxiliary_loss_mlp": 0.01053599, + "balance_loss_clip": 1.370435, + "balance_loss_mlp": 1.02607393, + "epoch": 0.20856756350518563, + "flos": 25709008965120.0, + "grad_norm": 1.9841593939856008, + "language_loss": 0.90477729, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.9312315, + "num_input_tokens_seen": 75017850, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.2755127, + "step": 3469, + "time_per_iteration": 2.8687009811401367 + }, + { + "auxiliary_loss_clip": 0.01587589, + "auxiliary_loss_mlp": 0.01051727, + "balance_loss_clip": 1.37218702, + "balance_loss_mlp": 1.02677608, + "epoch": 0.2086276867578536, + "flos": 37903263056640.0, + "grad_norm": 2.192599147486033, + "language_loss": 0.77558178, + "learning_rate": 3.674517919597092e-06, + "loss": 0.80197495, + "num_input_tokens_seen": 75039270, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.24951172, + "step": 3470, + "time_per_iteration": 3.006735324859619 + }, + { + "auxiliary_loss_clip": 0.01568449, + "auxiliary_loss_mlp": 0.01049197, + "balance_loss_clip": 1.35650826, + "balance_loss_mlp": 1.02180254, + "epoch": 0.20868781001052156, + "flos": 25568098099200.0, + "grad_norm": 1.7531097033802499, + "language_loss": 0.76274133, + "learning_rate": 3.674304927640011e-06, + "loss": 0.78891778, + "num_input_tokens_seen": 75059350, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.27416992, + "step": 3471, + "time_per_iteration": 2.876418352127075 + }, + { + "auxiliary_loss_clip": 0.0160164, + "auxiliary_loss_mlp": 0.01057172, + "balance_loss_clip": 1.37876439, + "balance_loss_mlp": 1.02970624, + "epoch": 0.20874793326318955, + "flos": 27541248145920.0, + "grad_norm": 2.0526749794347046, + "language_loss": 0.76658249, + "learning_rate": 3.67409187219312e-06, + "loss": 0.79317063, + "num_input_tokens_seen": 75080150, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.27490234, + "step": 3472, + "time_per_iteration": 2.9014899730682373 + }, + { + "auxiliary_loss_clip": 0.01574532, + "auxiliary_loss_mlp": 0.01044703, + "balance_loss_clip": 1.36149693, + "balance_loss_mlp": 1.02019358, + "epoch": 0.20880805651585752, + "flos": 18557249437440.0, + "grad_norm": 2.1459270630503733, + "language_loss": 0.85590672, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.88209903, + "num_input_tokens_seen": 75097920, + "router_z_loss_clip": 2.13183594, + "router_z_loss_mlp": 0.24523926, + "step": 3473, + "time_per_iteration": 2.845736265182495 + }, + { + "auxiliary_loss_clip": 0.01301817, + "auxiliary_loss_mlp": 0.01031173, + "balance_loss_clip": 1.17915881, + "balance_loss_mlp": 1.00618637, + "epoch": 0.20886817976852548, + "flos": 65977605146880.0, + "grad_norm": 0.8837869229572406, + "language_loss": 0.63720304, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.66053295, + "num_input_tokens_seen": 75152410, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.25, + "step": 3474, + "time_per_iteration": 3.3063149452209473 + }, + { + "auxiliary_loss_clip": 0.01594132, + "auxiliary_loss_mlp": 0.01047849, + "balance_loss_clip": 1.37707818, + "balance_loss_mlp": 1.021945, + "epoch": 0.20892830302119345, + "flos": 36553054423680.0, + "grad_norm": 2.204921392941378, + "language_loss": 0.71433204, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.74075186, + "num_input_tokens_seen": 75173265, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.2590332, + "step": 3475, + "time_per_iteration": 4.432063579559326 + }, + { + "auxiliary_loss_clip": 0.01597499, + "auxiliary_loss_mlp": 0.01050006, + "balance_loss_clip": 1.38082993, + "balance_loss_mlp": 1.02563941, + "epoch": 0.2089884262738614, + "flos": 20965800625920.0, + "grad_norm": 1.4856491517583763, + "language_loss": 0.71122444, + "learning_rate": 3.673239015669065e-06, + "loss": 0.73769945, + "num_input_tokens_seen": 75193640, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.24353027, + "step": 3476, + "time_per_iteration": 2.982536792755127 + }, + { + "auxiliary_loss_clip": 0.01582436, + "auxiliary_loss_mlp": 0.01055747, + "balance_loss_clip": 1.36985803, + "balance_loss_mlp": 1.02961612, + "epoch": 0.20904854952652938, + "flos": 22794329733120.0, + "grad_norm": 1.7864167679467173, + "language_loss": 0.90042526, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.92680717, + "num_input_tokens_seen": 75212545, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.26135254, + "step": 3477, + "time_per_iteration": 5.613992929458618 + }, + { + "auxiliary_loss_clip": 0.01579775, + "auxiliary_loss_mlp": 0.01055212, + "balance_loss_clip": 1.36698556, + "balance_loss_mlp": 1.03245544, + "epoch": 0.20910867277919734, + "flos": 27313457984640.0, + "grad_norm": 2.40986115101093, + "language_loss": 0.68918264, + "learning_rate": 3.672812206678344e-06, + "loss": 0.71553254, + "num_input_tokens_seen": 75230865, + "router_z_loss_clip": 2.12792969, + "router_z_loss_mlp": 0.22753906, + "step": 3478, + "time_per_iteration": 2.8909878730773926 + }, + { + "auxiliary_loss_clip": 0.01601871, + "auxiliary_loss_mlp": 0.01058144, + "balance_loss_clip": 1.38493633, + "balance_loss_mlp": 1.03291917, + "epoch": 0.20916879603186533, + "flos": 14327317820160.0, + "grad_norm": 2.0379919708209533, + "language_loss": 0.85422349, + "learning_rate": 3.672598707029127e-06, + "loss": 0.88082361, + "num_input_tokens_seen": 75248285, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.25231934, + "step": 3479, + "time_per_iteration": 2.8117213249206543 + }, + { + "auxiliary_loss_clip": 0.01622109, + "auxiliary_loss_mlp": 0.01064644, + "balance_loss_clip": 1.40176225, + "balance_loss_mlp": 1.03883505, + "epoch": 0.2092289192845333, + "flos": 22283134272000.0, + "grad_norm": 2.229480018969664, + "language_loss": 0.75258768, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.77945518, + "num_input_tokens_seen": 75266310, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.25817871, + "step": 3480, + "time_per_iteration": 2.8442554473876953 + }, + { + "auxiliary_loss_clip": 0.01595128, + "auxiliary_loss_mlp": 0.0106301, + "balance_loss_clip": 1.37992084, + "balance_loss_mlp": 1.03796363, + "epoch": 0.20928904253720126, + "flos": 14839101463680.0, + "grad_norm": 2.326773163435039, + "language_loss": 0.77308244, + "learning_rate": 3.67217151746346e-06, + "loss": 0.79966378, + "num_input_tokens_seen": 75284175, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.25073242, + "step": 3481, + "time_per_iteration": 2.9033396244049072 + }, + { + "auxiliary_loss_clip": 0.01600601, + "auxiliary_loss_mlp": 0.01067667, + "balance_loss_clip": 1.38445234, + "balance_loss_mlp": 1.04357517, + "epoch": 0.20934916578986923, + "flos": 23269799784960.0, + "grad_norm": 1.696646483573241, + "language_loss": 0.85875076, + "learning_rate": 3.671957827563209e-06, + "loss": 0.88543344, + "num_input_tokens_seen": 75303465, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.2409668, + "step": 3482, + "time_per_iteration": 2.9559326171875 + }, + { + "auxiliary_loss_clip": 0.01614426, + "auxiliary_loss_mlp": 0.01058478, + "balance_loss_clip": 1.39730644, + "balance_loss_mlp": 1.03408766, + "epoch": 0.2094092890425372, + "flos": 32026958472960.0, + "grad_norm": 2.5832225196608833, + "language_loss": 0.73108906, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.7578181, + "num_input_tokens_seen": 75325290, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.24401855, + "step": 3483, + "time_per_iteration": 2.978679656982422 + }, + { + "auxiliary_loss_clip": 0.01632857, + "auxiliary_loss_mlp": 0.01071987, + "balance_loss_clip": 1.41006529, + "balance_loss_mlp": 1.04694116, + "epoch": 0.20946941229520516, + "flos": 20020108919040.0, + "grad_norm": 1.9102037208420837, + "language_loss": 0.75814462, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.78519309, + "num_input_tokens_seen": 75343895, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.25085449, + "step": 3484, + "time_per_iteration": 2.9299166202545166 + }, + { + "auxiliary_loss_clip": 0.01622443, + "auxiliary_loss_mlp": 0.01059284, + "balance_loss_clip": 1.40380394, + "balance_loss_mlp": 1.03469157, + "epoch": 0.20952953554787315, + "flos": 30752996607360.0, + "grad_norm": 1.5675497283726032, + "language_loss": 0.71051073, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.73732793, + "num_input_tokens_seen": 75367100, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.24609375, + "step": 3485, + "time_per_iteration": 3.113206148147583 + }, + { + "auxiliary_loss_clip": 0.01633383, + "auxiliary_loss_mlp": 0.01074511, + "balance_loss_clip": 1.40995705, + "balance_loss_mlp": 1.04799855, + "epoch": 0.20958965880054112, + "flos": 27059670046080.0, + "grad_norm": 1.8766670558984113, + "language_loss": 0.83890104, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.86598003, + "num_input_tokens_seen": 75389925, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.26525879, + "step": 3486, + "time_per_iteration": 2.933392286300659 + }, + { + "auxiliary_loss_clip": 0.01610553, + "auxiliary_loss_mlp": 0.01065502, + "balance_loss_clip": 1.3928839, + "balance_loss_mlp": 1.04189825, + "epoch": 0.20964978205320908, + "flos": 34217401887360.0, + "grad_norm": 1.7770383573052646, + "language_loss": 0.88052475, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.90728533, + "num_input_tokens_seen": 75408575, + "router_z_loss_clip": 2.17578125, + "router_z_loss_mlp": 0.23596191, + "step": 3487, + "time_per_iteration": 2.9854443073272705 + }, + { + "auxiliary_loss_clip": 0.01608429, + "auxiliary_loss_mlp": 0.01063088, + "balance_loss_clip": 1.39152384, + "balance_loss_mlp": 1.03925824, + "epoch": 0.20970990530587705, + "flos": 23487907559040.0, + "grad_norm": 2.815392010081029, + "language_loss": 0.73144519, + "learning_rate": 3.670674357028504e-06, + "loss": 0.75816035, + "num_input_tokens_seen": 75427155, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.23828125, + "step": 3488, + "time_per_iteration": 2.88979172706604 + }, + { + "auxiliary_loss_clip": 0.01610514, + "auxiliary_loss_mlp": 0.01066098, + "balance_loss_clip": 1.39491677, + "balance_loss_mlp": 1.04220796, + "epoch": 0.209770028558545, + "flos": 18560597552640.0, + "grad_norm": 8.835210202889051, + "language_loss": 0.81599128, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.84275734, + "num_input_tokens_seen": 75444450, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.23901367, + "step": 3489, + "time_per_iteration": 2.8530938625335693 + }, + { + "auxiliary_loss_clip": 0.01622772, + "auxiliary_loss_mlp": 0.01059693, + "balance_loss_clip": 1.40302742, + "balance_loss_mlp": 1.03513539, + "epoch": 0.20983015181121298, + "flos": 21627136892160.0, + "grad_norm": 1.9792191495799596, + "language_loss": 0.73740691, + "learning_rate": 3.670246026613266e-06, + "loss": 0.76423156, + "num_input_tokens_seen": 75462625, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.24536133, + "step": 3490, + "time_per_iteration": 2.8296351432800293 + }, + { + "auxiliary_loss_clip": 0.01612006, + "auxiliary_loss_mlp": 0.01056729, + "balance_loss_clip": 1.40089011, + "balance_loss_mlp": 1.03332853, + "epoch": 0.20989027506388094, + "flos": 16622584732800.0, + "grad_norm": 2.567326456785195, + "language_loss": 0.71212959, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.73881692, + "num_input_tokens_seen": 75480640, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.23376465, + "step": 3491, + "time_per_iteration": 2.8325035572052 + }, + { + "auxiliary_loss_clip": 0.01621829, + "auxiliary_loss_mlp": 0.0105274, + "balance_loss_clip": 1.40133989, + "balance_loss_mlp": 1.02786076, + "epoch": 0.20995039831654894, + "flos": 23225930311680.0, + "grad_norm": 3.621408117576884, + "language_loss": 0.80587894, + "learning_rate": 3.669817442854444e-06, + "loss": 0.83262461, + "num_input_tokens_seen": 75494900, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 0.24902344, + "step": 3492, + "time_per_iteration": 2.8666093349456787 + }, + { + "auxiliary_loss_clip": 0.01606245, + "auxiliary_loss_mlp": 0.01047689, + "balance_loss_clip": 1.39156723, + "balance_loss_mlp": 1.02370358, + "epoch": 0.2100105215692169, + "flos": 18155854402560.0, + "grad_norm": 1.8439052769180586, + "language_loss": 0.87604666, + "learning_rate": 3.669603055991502e-06, + "loss": 0.90258598, + "num_input_tokens_seen": 75513370, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.2401123, + "step": 3493, + "time_per_iteration": 2.825314998626709 + }, + { + "auxiliary_loss_clip": 0.01588111, + "auxiliary_loss_mlp": 0.0104452, + "balance_loss_clip": 1.3790226, + "balance_loss_mlp": 1.02106023, + "epoch": 0.21007064482188487, + "flos": 15970252181760.0, + "grad_norm": 1.6635366680287995, + "language_loss": 0.7081722, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.73449856, + "num_input_tokens_seen": 75532480, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.23461914, + "step": 3494, + "time_per_iteration": 2.8576161861419678 + }, + { + "auxiliary_loss_clip": 0.01638385, + "auxiliary_loss_mlp": 0.01046959, + "balance_loss_clip": 1.41564059, + "balance_loss_mlp": 1.02281868, + "epoch": 0.21013076807455283, + "flos": 32247192752640.0, + "grad_norm": 1.6782030846400946, + "language_loss": 0.79556483, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.82241827, + "num_input_tokens_seen": 75552745, + "router_z_loss_clip": 2.23046875, + "router_z_loss_mlp": 0.24145508, + "step": 3495, + "time_per_iteration": 2.9335973262786865 + }, + { + "auxiliary_loss_clip": 0.01619292, + "auxiliary_loss_mlp": 0.0105216, + "balance_loss_clip": 1.40227365, + "balance_loss_mlp": 1.02739978, + "epoch": 0.2101908913272208, + "flos": 23707282187520.0, + "grad_norm": 1.5612574830074857, + "language_loss": 0.77909982, + "learning_rate": 3.668959515566116e-06, + "loss": 0.80581439, + "num_input_tokens_seen": 75574355, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.24743652, + "step": 3496, + "time_per_iteration": 2.9463846683502197 + }, + { + "auxiliary_loss_clip": 0.01624157, + "auxiliary_loss_mlp": 0.01049636, + "balance_loss_clip": 1.40591598, + "balance_loss_mlp": 1.02474523, + "epoch": 0.21025101457988876, + "flos": 20385416586240.0, + "grad_norm": 5.154272264854059, + "language_loss": 0.82637465, + "learning_rate": 3.668744875505915e-06, + "loss": 0.85311264, + "num_input_tokens_seen": 75592215, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.24914551, + "step": 3497, + "time_per_iteration": 2.8832790851593018 + }, + { + "auxiliary_loss_clip": 0.01631579, + "auxiliary_loss_mlp": 0.01049221, + "balance_loss_clip": 1.40826488, + "balance_loss_mlp": 1.02305448, + "epoch": 0.21031113783255675, + "flos": 25786658321280.0, + "grad_norm": 3.9267061128959067, + "language_loss": 0.68484813, + "learning_rate": 3.668530172166741e-06, + "loss": 0.71165615, + "num_input_tokens_seen": 75610740, + "router_z_loss_clip": 2.23242188, + "router_z_loss_mlp": 0.26184082, + "step": 3498, + "time_per_iteration": 2.9090375900268555 + }, + { + "auxiliary_loss_clip": 0.0163645, + "auxiliary_loss_mlp": 0.01047746, + "balance_loss_clip": 1.41190791, + "balance_loss_mlp": 1.02144802, + "epoch": 0.21037126108522472, + "flos": 22028396192640.0, + "grad_norm": 1.9131055775479922, + "language_loss": 0.82330751, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.85014945, + "num_input_tokens_seen": 75631005, + "router_z_loss_clip": 2.24804688, + "router_z_loss_mlp": 0.26318359, + "step": 3499, + "time_per_iteration": 2.8462469577789307 + }, + { + "auxiliary_loss_clip": 0.01594628, + "auxiliary_loss_mlp": 0.0104597, + "balance_loss_clip": 1.38113487, + "balance_loss_mlp": 1.0221045, + "epoch": 0.21043138433789269, + "flos": 25344425214720.0, + "grad_norm": 1.5147462120788404, + "language_loss": 0.791022, + "learning_rate": 3.668100575684043e-06, + "loss": 0.81742799, + "num_input_tokens_seen": 75650655, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.23864746, + "step": 3500, + "time_per_iteration": 2.860731363296509 + }, + { + "auxiliary_loss_clip": 0.01608012, + "auxiliary_loss_mlp": 0.01042763, + "balance_loss_clip": 1.39095616, + "balance_loss_mlp": 1.01824152, + "epoch": 0.21049150759056065, + "flos": 25567600406400.0, + "grad_norm": 1.490225138089557, + "language_loss": 0.75115263, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.77766037, + "num_input_tokens_seen": 75669895, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.2454834, + "step": 3501, + "time_per_iteration": 2.8910651206970215 + }, + { + "auxiliary_loss_clip": 0.01597083, + "auxiliary_loss_mlp": 0.01044155, + "balance_loss_clip": 1.38256061, + "balance_loss_mlp": 1.01805973, + "epoch": 0.21055163084322862, + "flos": 24505502532480.0, + "grad_norm": 1.8392569566153247, + "language_loss": 0.76333463, + "learning_rate": 3.667670726183183e-06, + "loss": 0.789747, + "num_input_tokens_seen": 75689535, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.26098633, + "step": 3502, + "time_per_iteration": 2.8671019077301025 + }, + { + "auxiliary_loss_clip": 0.01595, + "auxiliary_loss_mlp": 0.01038743, + "balance_loss_clip": 1.38082588, + "balance_loss_mlp": 1.01441216, + "epoch": 0.21061175409589658, + "flos": 25750028016000.0, + "grad_norm": 1.9442301028509381, + "language_loss": 0.77892268, + "learning_rate": 3.667455706571316e-06, + "loss": 0.80526006, + "num_input_tokens_seen": 75709265, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.24328613, + "step": 3503, + "time_per_iteration": 4.338297367095947 + }, + { + "auxiliary_loss_clip": 0.01626891, + "auxiliary_loss_mlp": 0.01052381, + "balance_loss_clip": 1.40275407, + "balance_loss_mlp": 1.0259409, + "epoch": 0.21067187734856455, + "flos": 18998396668800.0, + "grad_norm": 2.137116509951048, + "language_loss": 0.80015254, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.8269453, + "num_input_tokens_seen": 75727050, + "router_z_loss_clip": 2.24414062, + "router_z_loss_mlp": 0.26477051, + "step": 3504, + "time_per_iteration": 2.8929390907287598 + }, + { + "auxiliary_loss_clip": 0.0162298, + "auxiliary_loss_mlp": 0.0104858, + "balance_loss_clip": 1.39946198, + "balance_loss_mlp": 1.02252054, + "epoch": 0.21073200060123254, + "flos": 24692183153280.0, + "grad_norm": 1.4355710442292404, + "language_loss": 0.77748561, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.80420119, + "num_input_tokens_seen": 75747175, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.26074219, + "step": 3505, + "time_per_iteration": 2.8696703910827637 + }, + { + "auxiliary_loss_clip": 0.01588001, + "auxiliary_loss_mlp": 0.01047957, + "balance_loss_clip": 1.37700319, + "balance_loss_mlp": 1.02318537, + "epoch": 0.2107921238539005, + "flos": 28561286338560.0, + "grad_norm": 1.6056810631882448, + "language_loss": 0.63848633, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.66484594, + "num_input_tokens_seen": 75767690, + "router_z_loss_clip": 2.11035156, + "router_z_loss_mlp": 0.24780273, + "step": 3506, + "time_per_iteration": 2.9022254943847656 + }, + { + "auxiliary_loss_clip": 0.0160299, + "auxiliary_loss_mlp": 0.01045227, + "balance_loss_clip": 1.38715816, + "balance_loss_mlp": 1.02067029, + "epoch": 0.21085224710656847, + "flos": 25897092174720.0, + "grad_norm": 1.6276633356893873, + "language_loss": 0.82981437, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.85629654, + "num_input_tokens_seen": 75787255, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.24584961, + "step": 3507, + "time_per_iteration": 2.932844877243042 + }, + { + "auxiliary_loss_clip": 0.01592165, + "auxiliary_loss_mlp": 0.01045425, + "balance_loss_clip": 1.37726188, + "balance_loss_mlp": 1.02062917, + "epoch": 0.21091237035923643, + "flos": 14984627299200.0, + "grad_norm": 2.3467543470847296, + "language_loss": 0.76895618, + "learning_rate": 3.666379660223824e-06, + "loss": 0.79533207, + "num_input_tokens_seen": 75805890, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.2479248, + "step": 3508, + "time_per_iteration": 2.837642192840576 + }, + { + "auxiliary_loss_clip": 0.01604906, + "auxiliary_loss_mlp": 0.01041787, + "balance_loss_clip": 1.38636589, + "balance_loss_mlp": 1.01686049, + "epoch": 0.2109724936119044, + "flos": 16371149523840.0, + "grad_norm": 3.2327358738596232, + "language_loss": 0.86295873, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.88942564, + "num_input_tokens_seen": 75821620, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.24914551, + "step": 3509, + "time_per_iteration": 2.8366899490356445 + }, + { + "auxiliary_loss_clip": 0.01607103, + "auxiliary_loss_mlp": 0.01044944, + "balance_loss_clip": 1.38645077, + "balance_loss_mlp": 1.01882565, + "epoch": 0.21103261686457236, + "flos": 31514631891840.0, + "grad_norm": 1.7805922693776797, + "language_loss": 0.68937206, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.71589255, + "num_input_tokens_seen": 75842490, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.26123047, + "step": 3510, + "time_per_iteration": 4.338866233825684 + }, + { + "auxiliary_loss_clip": 0.01613645, + "auxiliary_loss_mlp": 0.01040088, + "balance_loss_clip": 1.39160967, + "balance_loss_mlp": 1.01603127, + "epoch": 0.21109274011724033, + "flos": 27355381931520.0, + "grad_norm": 1.7580826776592287, + "language_loss": 0.73476881, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.76130617, + "num_input_tokens_seen": 75865985, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.24072266, + "step": 3511, + "time_per_iteration": 2.8831405639648438 + }, + { + "auxiliary_loss_clip": 0.01625148, + "auxiliary_loss_mlp": 0.01049113, + "balance_loss_clip": 1.40165734, + "balance_loss_mlp": 1.02258945, + "epoch": 0.21115286336990832, + "flos": 17828986832640.0, + "grad_norm": 2.9789023990048475, + "language_loss": 0.71046019, + "learning_rate": 3.665517685689794e-06, + "loss": 0.73720276, + "num_input_tokens_seen": 75882745, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.26525879, + "step": 3512, + "time_per_iteration": 4.243883371353149 + }, + { + "auxiliary_loss_clip": 0.01609222, + "auxiliary_loss_mlp": 0.01048406, + "balance_loss_clip": 1.39012742, + "balance_loss_mlp": 1.02355075, + "epoch": 0.2112129866225763, + "flos": 27209222668800.0, + "grad_norm": 2.4406001463792824, + "language_loss": 0.74434853, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.77092481, + "num_input_tokens_seen": 75904305, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.24865723, + "step": 3513, + "time_per_iteration": 2.941117286682129 + }, + { + "auxiliary_loss_clip": 0.01585409, + "auxiliary_loss_mlp": 0.01043253, + "balance_loss_clip": 1.37246287, + "balance_loss_mlp": 1.01802838, + "epoch": 0.21127310987524425, + "flos": 23741469273600.0, + "grad_norm": 1.710699049417334, + "language_loss": 0.74827337, + "learning_rate": 3.665086319450502e-06, + "loss": 0.77455992, + "num_input_tokens_seen": 75923710, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.25219727, + "step": 3514, + "time_per_iteration": 2.8781352043151855 + }, + { + "auxiliary_loss_clip": 0.01612381, + "auxiliary_loss_mlp": 0.0104305, + "balance_loss_clip": 1.39207721, + "balance_loss_mlp": 1.0184691, + "epoch": 0.21133323312791222, + "flos": 18341584882560.0, + "grad_norm": 1.7471150855919921, + "language_loss": 0.77663159, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.80318594, + "num_input_tokens_seen": 75942625, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.24584961, + "step": 3515, + "time_per_iteration": 2.8819997310638428 + }, + { + "auxiliary_loss_clip": 0.01613535, + "auxiliary_loss_mlp": 0.01045932, + "balance_loss_clip": 1.39507246, + "balance_loss_mlp": 1.02051663, + "epoch": 0.21139335638058018, + "flos": 17940189847680.0, + "grad_norm": 1.7204826934822057, + "language_loss": 0.69107807, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.71767271, + "num_input_tokens_seen": 75959930, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.25439453, + "step": 3516, + "time_per_iteration": 2.824127674102783 + }, + { + "auxiliary_loss_clip": 0.01596081, + "auxiliary_loss_mlp": 0.01052561, + "balance_loss_clip": 1.37701166, + "balance_loss_mlp": 1.02561975, + "epoch": 0.21145347963324815, + "flos": 24582925664640.0, + "grad_norm": 1.6369528189905638, + "language_loss": 0.85576469, + "learning_rate": 3.664438796560225e-06, + "loss": 0.88225114, + "num_input_tokens_seen": 75980335, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.26965332, + "step": 3517, + "time_per_iteration": 2.863694667816162 + }, + { + "auxiliary_loss_clip": 0.01592952, + "auxiliary_loss_mlp": 0.0104105, + "balance_loss_clip": 1.37450719, + "balance_loss_mlp": 1.01637423, + "epoch": 0.21151360288591614, + "flos": 35859657576960.0, + "grad_norm": 1.7023532740344416, + "language_loss": 0.64183241, + "learning_rate": 3.664222829354512e-06, + "loss": 0.66817242, + "num_input_tokens_seen": 76002095, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.24707031, + "step": 3518, + "time_per_iteration": 2.924001932144165 + }, + { + "auxiliary_loss_clip": 0.01596346, + "auxiliary_loss_mlp": 0.01048961, + "balance_loss_clip": 1.38104558, + "balance_loss_mlp": 1.02408171, + "epoch": 0.2115737261385841, + "flos": 24651526060800.0, + "grad_norm": 1.7994553508380786, + "language_loss": 0.90247154, + "learning_rate": 3.664006799041303e-06, + "loss": 0.92892456, + "num_input_tokens_seen": 76020425, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.24902344, + "step": 3519, + "time_per_iteration": 2.8809826374053955 + }, + { + "auxiliary_loss_clip": 0.01603211, + "auxiliary_loss_mlp": 0.01051682, + "balance_loss_clip": 1.3835243, + "balance_loss_mlp": 1.02477658, + "epoch": 0.21163384939125207, + "flos": 25237384721280.0, + "grad_norm": 1.7498774942476862, + "language_loss": 0.82515794, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.85170686, + "num_input_tokens_seen": 76041210, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26904297, + "step": 3520, + "time_per_iteration": 2.8607378005981445 + }, + { + "auxiliary_loss_clip": 0.01574699, + "auxiliary_loss_mlp": 0.01044417, + "balance_loss_clip": 1.3629936, + "balance_loss_mlp": 1.01882255, + "epoch": 0.21169397264392004, + "flos": 26078614888320.0, + "grad_norm": 1.6677683836674955, + "language_loss": 0.76997483, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.79616594, + "num_input_tokens_seen": 76062685, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.25610352, + "step": 3521, + "time_per_iteration": 2.859482526779175 + }, + { + "auxiliary_loss_clip": 0.01584429, + "auxiliary_loss_mlp": 0.01049864, + "balance_loss_clip": 1.3694905, + "balance_loss_mlp": 1.0253787, + "epoch": 0.211754095896588, + "flos": 23117577719040.0, + "grad_norm": 4.116591803440022, + "language_loss": 0.767694, + "learning_rate": 3.663358329538626e-06, + "loss": 0.79403692, + "num_input_tokens_seen": 76082300, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.24499512, + "step": 3522, + "time_per_iteration": 2.9286365509033203 + }, + { + "auxiliary_loss_clip": 0.01592193, + "auxiliary_loss_mlp": 0.0105444, + "balance_loss_clip": 1.37383842, + "balance_loss_mlp": 1.02671206, + "epoch": 0.21181421914925597, + "flos": 27932372611200.0, + "grad_norm": 2.3299385700694266, + "language_loss": 0.7130096, + "learning_rate": 3.663142046877374e-06, + "loss": 0.73947585, + "num_input_tokens_seen": 76101135, + "router_z_loss_clip": 2.18359375, + "router_z_loss_mlp": 0.27734375, + "step": 3523, + "time_per_iteration": 2.867607593536377 + }, + { + "auxiliary_loss_clip": 0.0160601, + "auxiliary_loss_mlp": 0.01046652, + "balance_loss_clip": 1.38733816, + "balance_loss_mlp": 1.02070022, + "epoch": 0.21187434240192393, + "flos": 17137445022720.0, + "grad_norm": 2.1416928345858004, + "language_loss": 0.78649628, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.81302285, + "num_input_tokens_seen": 76119320, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.25952148, + "step": 3524, + "time_per_iteration": 2.828070878982544 + }, + { + "auxiliary_loss_clip": 0.01609367, + "auxiliary_loss_mlp": 0.01043666, + "balance_loss_clip": 1.3862679, + "balance_loss_mlp": 1.01763105, + "epoch": 0.21193446565459192, + "flos": 22357797471360.0, + "grad_norm": 1.8285900966656392, + "language_loss": 0.82587856, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.85240889, + "num_input_tokens_seen": 76137445, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.26037598, + "step": 3525, + "time_per_iteration": 2.864640712738037 + }, + { + "auxiliary_loss_clip": 0.01595902, + "auxiliary_loss_mlp": 0.01043031, + "balance_loss_clip": 1.38002491, + "balance_loss_mlp": 1.01767492, + "epoch": 0.2119945889072599, + "flos": 27210896726400.0, + "grad_norm": 1.7822670062951536, + "language_loss": 0.75673151, + "learning_rate": 3.662492820527356e-06, + "loss": 0.78312081, + "num_input_tokens_seen": 76159500, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.25341797, + "step": 3526, + "time_per_iteration": 2.8851542472839355 + }, + { + "auxiliary_loss_clip": 0.0160026, + "auxiliary_loss_mlp": 0.0104083, + "balance_loss_clip": 1.38109815, + "balance_loss_mlp": 1.01473475, + "epoch": 0.21205471215992786, + "flos": 21000575894400.0, + "grad_norm": 1.74345902127393, + "language_loss": 0.78210163, + "learning_rate": 3.662276285649284e-06, + "loss": 0.80851257, + "num_input_tokens_seen": 76177990, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.26123047, + "step": 3527, + "time_per_iteration": 2.9474925994873047 + }, + { + "auxiliary_loss_clip": 0.01601548, + "auxiliary_loss_mlp": 0.01047434, + "balance_loss_clip": 1.38534713, + "balance_loss_mlp": 1.02114844, + "epoch": 0.21211483541259582, + "flos": 20787535537920.0, + "grad_norm": 1.5656628358249758, + "language_loss": 0.79128903, + "learning_rate": 3.662059687737528e-06, + "loss": 0.81777883, + "num_input_tokens_seen": 76197125, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.26306152, + "step": 3528, + "time_per_iteration": 3.0328245162963867 + }, + { + "auxiliary_loss_clip": 0.01586257, + "auxiliary_loss_mlp": 0.01050214, + "balance_loss_clip": 1.37186694, + "balance_loss_mlp": 1.02514458, + "epoch": 0.21217495866526379, + "flos": 18999437299200.0, + "grad_norm": 1.6608645527152934, + "language_loss": 0.8202008, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.84656549, + "num_input_tokens_seen": 76216215, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.25085449, + "step": 3529, + "time_per_iteration": 2.961333751678467 + }, + { + "auxiliary_loss_clip": 0.01600479, + "auxiliary_loss_mlp": 0.01054678, + "balance_loss_clip": 1.37992382, + "balance_loss_mlp": 1.0279876, + "epoch": 0.21223508191793175, + "flos": 20676920705280.0, + "grad_norm": 2.3878362373181052, + "language_loss": 0.78481841, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.81137002, + "num_input_tokens_seen": 76237010, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.26721191, + "step": 3530, + "time_per_iteration": 3.006397247314453 + }, + { + "auxiliary_loss_clip": 0.01594714, + "auxiliary_loss_mlp": 0.0104719, + "balance_loss_clip": 1.37973571, + "balance_loss_mlp": 1.02150011, + "epoch": 0.21229520517059972, + "flos": 21626458220160.0, + "grad_norm": 4.772064312878051, + "language_loss": 0.84751761, + "learning_rate": 3.661409515882308e-06, + "loss": 0.87393665, + "num_input_tokens_seen": 76255965, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.25671387, + "step": 3531, + "time_per_iteration": 2.8433279991149902 + }, + { + "auxiliary_loss_clip": 0.01605111, + "auxiliary_loss_mlp": 0.01047232, + "balance_loss_clip": 1.3848902, + "balance_loss_mlp": 1.01986182, + "epoch": 0.2123553284232677, + "flos": 13999092906240.0, + "grad_norm": 2.363788024184872, + "language_loss": 0.74933153, + "learning_rate": 3.661192665917977e-06, + "loss": 0.77585495, + "num_input_tokens_seen": 76272150, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.27392578, + "step": 3532, + "time_per_iteration": 2.843433380126953 + }, + { + "auxiliary_loss_clip": 0.01592506, + "auxiliary_loss_mlp": 0.01044337, + "balance_loss_clip": 1.37631714, + "balance_loss_mlp": 1.01908875, + "epoch": 0.21241545167593567, + "flos": 18306040452480.0, + "grad_norm": 1.7630882461947641, + "language_loss": 0.74979436, + "learning_rate": 3.660975752961054e-06, + "loss": 0.7761628, + "num_input_tokens_seen": 76291425, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.25256348, + "step": 3533, + "time_per_iteration": 2.822777032852173 + }, + { + "auxiliary_loss_clip": 0.01610223, + "auxiliary_loss_mlp": 0.01045438, + "balance_loss_clip": 1.38916183, + "balance_loss_mlp": 1.02020168, + "epoch": 0.21247557492860364, + "flos": 34726063639680.0, + "grad_norm": 2.24192967123436, + "language_loss": 0.72468746, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.75124407, + "num_input_tokens_seen": 76313975, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.25219727, + "step": 3534, + "time_per_iteration": 3.0029408931732178 + }, + { + "auxiliary_loss_clip": 0.0160068, + "auxiliary_loss_mlp": 0.01047396, + "balance_loss_clip": 1.38212395, + "balance_loss_mlp": 1.02266049, + "epoch": 0.2125356981812716, + "flos": 22063804888320.0, + "grad_norm": 1.8614097300370627, + "language_loss": 0.7315675, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.75804824, + "num_input_tokens_seen": 76330955, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.24731445, + "step": 3535, + "time_per_iteration": 2.874300479888916 + }, + { + "auxiliary_loss_clip": 0.01598754, + "auxiliary_loss_mlp": 0.01052757, + "balance_loss_clip": 1.3824966, + "balance_loss_mlp": 1.02684128, + "epoch": 0.21259582143393957, + "flos": 28560019484160.0, + "grad_norm": 2.0171326098589946, + "language_loss": 0.71203214, + "learning_rate": 3.660324636216996e-06, + "loss": 0.73854727, + "num_input_tokens_seen": 76352680, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.25915527, + "step": 3536, + "time_per_iteration": 2.9106850624084473 + }, + { + "auxiliary_loss_clip": 0.01599832, + "auxiliary_loss_mlp": 0.01048906, + "balance_loss_clip": 1.38078785, + "balance_loss_mlp": 1.02378881, + "epoch": 0.21265594468660753, + "flos": 20130588017280.0, + "grad_norm": 1.7489473813769747, + "language_loss": 0.88243389, + "learning_rate": 3.660107471371981e-06, + "loss": 0.90892136, + "num_input_tokens_seen": 76370750, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.25109863, + "step": 3537, + "time_per_iteration": 2.888338088989258 + }, + { + "auxiliary_loss_clip": 0.01588907, + "auxiliary_loss_mlp": 0.01048494, + "balance_loss_clip": 1.37448454, + "balance_loss_mlp": 1.02347195, + "epoch": 0.21271606793927553, + "flos": 23086603013760.0, + "grad_norm": 1.6665571130565497, + "language_loss": 0.81713498, + "learning_rate": 3.659890243575524e-06, + "loss": 0.84350896, + "num_input_tokens_seen": 76390610, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.25024414, + "step": 3538, + "time_per_iteration": 4.356805086135864 + }, + { + "auxiliary_loss_clip": 0.01583812, + "auxiliary_loss_mlp": 0.0105418, + "balance_loss_clip": 1.36951447, + "balance_loss_mlp": 1.02901459, + "epoch": 0.2127761911919435, + "flos": 26397383639040.0, + "grad_norm": 1.6562930289070181, + "language_loss": 0.88398004, + "learning_rate": 3.659672952835863e-06, + "loss": 0.91035998, + "num_input_tokens_seen": 76408860, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.25195312, + "step": 3539, + "time_per_iteration": 2.89227557182312 + }, + { + "auxiliary_loss_clip": 0.01593713, + "auxiliary_loss_mlp": 0.01057363, + "balance_loss_clip": 1.37602496, + "balance_loss_mlp": 1.03110075, + "epoch": 0.21283631444461146, + "flos": 20237447531520.0, + "grad_norm": 2.033168966038709, + "language_loss": 0.59025502, + "learning_rate": 3.659455599161237e-06, + "loss": 0.61676574, + "num_input_tokens_seen": 76424980, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.26281738, + "step": 3540, + "time_per_iteration": 2.8385040760040283 + }, + { + "auxiliary_loss_clip": 0.01592498, + "auxiliary_loss_mlp": 0.01049495, + "balance_loss_clip": 1.37539446, + "balance_loss_mlp": 1.02462816, + "epoch": 0.21289643769727942, + "flos": 13524980198400.0, + "grad_norm": 2.314725713029893, + "language_loss": 0.77262658, + "learning_rate": 3.659238182559888e-06, + "loss": 0.79904652, + "num_input_tokens_seen": 76443135, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.24841309, + "step": 3541, + "time_per_iteration": 2.9411773681640625 + }, + { + "auxiliary_loss_clip": 0.01587378, + "auxiliary_loss_mlp": 0.01050375, + "balance_loss_clip": 1.37440419, + "balance_loss_mlp": 1.02535343, + "epoch": 0.2129565609499474, + "flos": 24837799478400.0, + "grad_norm": 2.3855973139699027, + "language_loss": 0.7042101, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.73058766, + "num_input_tokens_seen": 76462470, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.25036621, + "step": 3542, + "time_per_iteration": 2.939868688583374 + }, + { + "auxiliary_loss_clip": 0.0159153, + "auxiliary_loss_mlp": 0.0104937, + "balance_loss_clip": 1.37846398, + "balance_loss_mlp": 1.02480125, + "epoch": 0.21301668420261535, + "flos": 23669656496640.0, + "grad_norm": 1.7017816288046186, + "language_loss": 0.76849282, + "learning_rate": 3.658803160610004e-06, + "loss": 0.79490185, + "num_input_tokens_seen": 76481995, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.24572754, + "step": 3543, + "time_per_iteration": 2.861057996749878 + }, + { + "auxiliary_loss_clip": 0.01589556, + "auxiliary_loss_mlp": 0.01048497, + "balance_loss_clip": 1.37518907, + "balance_loss_mlp": 1.02264011, + "epoch": 0.21307680745528332, + "flos": 16371511482240.0, + "grad_norm": 1.8250785780890129, + "language_loss": 0.67554438, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.70192486, + "num_input_tokens_seen": 76500245, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.25891113, + "step": 3544, + "time_per_iteration": 2.8570363521575928 + }, + { + "auxiliary_loss_clip": 0.01577763, + "auxiliary_loss_mlp": 0.0105253, + "balance_loss_clip": 1.364959, + "balance_loss_mlp": 1.02726972, + "epoch": 0.2131369307079513, + "flos": 19108423319040.0, + "grad_norm": 1.5934270387805176, + "language_loss": 0.71300721, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.73931015, + "num_input_tokens_seen": 76519535, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.25280762, + "step": 3545, + "time_per_iteration": 2.8272645473480225 + }, + { + "auxiliary_loss_clip": 0.01603803, + "auxiliary_loss_mlp": 0.01051534, + "balance_loss_clip": 1.38572693, + "balance_loss_mlp": 1.02722692, + "epoch": 0.21319705396061928, + "flos": 30384340824960.0, + "grad_norm": 1.749022083145176, + "language_loss": 0.73032147, + "learning_rate": 3.658150155940946e-06, + "loss": 0.75687486, + "num_input_tokens_seen": 76542065, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.24304199, + "step": 3546, + "time_per_iteration": 4.282550811767578 + }, + { + "auxiliary_loss_clip": 0.01609174, + "auxiliary_loss_mlp": 0.01054754, + "balance_loss_clip": 1.39090848, + "balance_loss_mlp": 1.0291841, + "epoch": 0.21325717721328724, + "flos": 21764609153280.0, + "grad_norm": 1.8629614973496167, + "language_loss": 0.80669463, + "learning_rate": 3.657932361952479e-06, + "loss": 0.83333397, + "num_input_tokens_seen": 76560540, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.25585938, + "step": 3547, + "time_per_iteration": 4.317594528198242 + }, + { + "auxiliary_loss_clip": 0.01582162, + "auxiliary_loss_mlp": 0.01045442, + "balance_loss_clip": 1.36335969, + "balance_loss_mlp": 1.02033627, + "epoch": 0.2133173004659552, + "flos": 28742628072960.0, + "grad_norm": 3.0369105610627534, + "language_loss": 0.76639795, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.79267395, + "num_input_tokens_seen": 76581760, + "router_z_loss_clip": 2.1875, + "router_z_loss_mlp": 0.25109863, + "step": 3548, + "time_per_iteration": 4.370636701583862 + }, + { + "auxiliary_loss_clip": 0.01600879, + "auxiliary_loss_mlp": 0.01056087, + "balance_loss_clip": 1.38006163, + "balance_loss_mlp": 1.02959895, + "epoch": 0.21337742371862317, + "flos": 16845895658880.0, + "grad_norm": 1.9232872134587669, + "language_loss": 0.74692184, + "learning_rate": 3.657496585376922e-06, + "loss": 0.7734915, + "num_input_tokens_seen": 76599940, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.26513672, + "step": 3549, + "time_per_iteration": 2.852412700653076 + }, + { + "auxiliary_loss_clip": 0.01598084, + "auxiliary_loss_mlp": 0.01049061, + "balance_loss_clip": 1.38273752, + "balance_loss_mlp": 1.0244925, + "epoch": 0.21343754697129114, + "flos": 24435409057920.0, + "grad_norm": 1.6190791493447203, + "language_loss": 0.8183859, + "learning_rate": 3.657278602806357e-06, + "loss": 0.84485739, + "num_input_tokens_seen": 76619580, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.24560547, + "step": 3550, + "time_per_iteration": 2.844397783279419 + }, + { + "auxiliary_loss_clip": 0.01560779, + "auxiliary_loss_mlp": 0.0104666, + "balance_loss_clip": 1.35312152, + "balance_loss_mlp": 1.02341402, + "epoch": 0.21349767022395913, + "flos": 19286281203840.0, + "grad_norm": 1.581824519582713, + "language_loss": 0.88346559, + "learning_rate": 3.657060557391621e-06, + "loss": 0.90954, + "num_input_tokens_seen": 76638195, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.23242188, + "step": 3551, + "time_per_iteration": 2.8811280727386475 + }, + { + "auxiliary_loss_clip": 0.01576837, + "auxiliary_loss_mlp": 0.01046372, + "balance_loss_clip": 1.36579359, + "balance_loss_mlp": 1.02129006, + "epoch": 0.2135577934766271, + "flos": 17356412448000.0, + "grad_norm": 1.782918639996052, + "language_loss": 0.84069085, + "learning_rate": 3.656842449140983e-06, + "loss": 0.86692297, + "num_input_tokens_seen": 76656695, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.25109863, + "step": 3552, + "time_per_iteration": 2.860135316848755 + }, + { + "auxiliary_loss_clip": 0.01574883, + "auxiliary_loss_mlp": 0.01053348, + "balance_loss_clip": 1.36339283, + "balance_loss_mlp": 1.02887464, + "epoch": 0.21361791672929506, + "flos": 24066843765120.0, + "grad_norm": 1.7336998963771433, + "language_loss": 0.77323949, + "learning_rate": 3.656624278062713e-06, + "loss": 0.7995218, + "num_input_tokens_seen": 76677430, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.24475098, + "step": 3553, + "time_per_iteration": 2.8982932567596436 + }, + { + "auxiliary_loss_clip": 0.01555221, + "auxiliary_loss_mlp": 0.01051444, + "balance_loss_clip": 1.3449403, + "balance_loss_mlp": 1.0281744, + "epoch": 0.21367803998196302, + "flos": 22172066991360.0, + "grad_norm": 2.2388048075564924, + "language_loss": 0.73559529, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.76166189, + "num_input_tokens_seen": 76697615, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.23278809, + "step": 3554, + "time_per_iteration": 2.911975860595703 + }, + { + "auxiliary_loss_clip": 0.01589831, + "auxiliary_loss_mlp": 0.0104841, + "balance_loss_clip": 1.37454462, + "balance_loss_mlp": 1.02289927, + "epoch": 0.213738163234631, + "flos": 20896702536960.0, + "grad_norm": 1.8149443956057874, + "language_loss": 0.68795967, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.714342, + "num_input_tokens_seen": 76715685, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.25524902, + "step": 3555, + "time_per_iteration": 2.8772294521331787 + }, + { + "auxiliary_loss_clip": 0.01574632, + "auxiliary_loss_mlp": 0.01046223, + "balance_loss_clip": 1.36138713, + "balance_loss_mlp": 1.021523, + "epoch": 0.21379828648729896, + "flos": 28414946096640.0, + "grad_norm": 1.7479400012686521, + "language_loss": 0.6647988, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.69100738, + "num_input_tokens_seen": 76735405, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.24694824, + "step": 3556, + "time_per_iteration": 2.9523372650146484 + }, + { + "auxiliary_loss_clip": 0.01561764, + "auxiliary_loss_mlp": 0.01049355, + "balance_loss_clip": 1.35057902, + "balance_loss_mlp": 1.0235461, + "epoch": 0.21385840973996692, + "flos": 25489543847040.0, + "grad_norm": 1.9068913549539808, + "language_loss": 0.73414379, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.76025498, + "num_input_tokens_seen": 76754395, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.25805664, + "step": 3557, + "time_per_iteration": 2.905759811401367 + }, + { + "auxiliary_loss_clip": 0.01578956, + "auxiliary_loss_mlp": 0.01046947, + "balance_loss_clip": 1.36361456, + "balance_loss_mlp": 1.01857519, + "epoch": 0.2139185329926349, + "flos": 28085906776320.0, + "grad_norm": 1.7161115953673018, + "language_loss": 0.68552649, + "learning_rate": 3.655532480546528e-06, + "loss": 0.7117855, + "num_input_tokens_seen": 76777210, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.28369141, + "step": 3558, + "time_per_iteration": 2.9231388568878174 + }, + { + "auxiliary_loss_clip": 0.01577167, + "auxiliary_loss_mlp": 0.01046343, + "balance_loss_clip": 1.35813653, + "balance_loss_mlp": 1.01981878, + "epoch": 0.21397865624530288, + "flos": 19617899477760.0, + "grad_norm": 1.7470466078190239, + "language_loss": 0.81468153, + "learning_rate": 3.655313932676286e-06, + "loss": 0.84091657, + "num_input_tokens_seen": 76795830, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.26550293, + "step": 3559, + "time_per_iteration": 2.830726146697998 + }, + { + "auxiliary_loss_clip": 0.01556485, + "auxiliary_loss_mlp": 0.01043462, + "balance_loss_clip": 1.34616137, + "balance_loss_mlp": 1.0195483, + "epoch": 0.21403877949797084, + "flos": 24692318887680.0, + "grad_norm": 1.600755865953125, + "language_loss": 0.68583149, + "learning_rate": 3.655095322036373e-06, + "loss": 0.71183097, + "num_input_tokens_seen": 76814700, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.23925781, + "step": 3560, + "time_per_iteration": 2.883713722229004 + }, + { + "auxiliary_loss_clip": 0.01570302, + "auxiliary_loss_mlp": 0.01045478, + "balance_loss_clip": 1.35480165, + "balance_loss_mlp": 1.01993132, + "epoch": 0.2140989027506388, + "flos": 19869651400320.0, + "grad_norm": 2.473314289049543, + "language_loss": 0.73953652, + "learning_rate": 3.65487664863508e-06, + "loss": 0.76569432, + "num_input_tokens_seen": 76833400, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.2557373, + "step": 3561, + "time_per_iteration": 2.8458545207977295 + }, + { + "auxiliary_loss_clip": 0.01574025, + "auxiliary_loss_mlp": 0.01050214, + "balance_loss_clip": 1.36071134, + "balance_loss_mlp": 1.02391624, + "epoch": 0.21415902600330677, + "flos": 19144646421120.0, + "grad_norm": 2.154303731695337, + "language_loss": 0.79401881, + "learning_rate": 3.654657912480698e-06, + "loss": 0.82026118, + "num_input_tokens_seen": 76850645, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.26306152, + "step": 3562, + "time_per_iteration": 2.82829213142395 + }, + { + "auxiliary_loss_clip": 0.01560826, + "auxiliary_loss_mlp": 0.01049684, + "balance_loss_clip": 1.35008192, + "balance_loss_mlp": 1.02317238, + "epoch": 0.21421914925597474, + "flos": 22282636579200.0, + "grad_norm": 1.6952815700046682, + "language_loss": 0.85330123, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.87940633, + "num_input_tokens_seen": 76870135, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.26513672, + "step": 3563, + "time_per_iteration": 2.8837172985076904 + }, + { + "auxiliary_loss_clip": 0.01570874, + "auxiliary_loss_mlp": 0.0104563, + "balance_loss_clip": 1.36050844, + "balance_loss_mlp": 1.02046537, + "epoch": 0.2142792725086427, + "flos": 33888136343040.0, + "grad_norm": 1.3707779395693789, + "language_loss": 0.77472192, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.80088699, + "num_input_tokens_seen": 76893905, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.25158691, + "step": 3564, + "time_per_iteration": 2.9383113384246826 + }, + { + "auxiliary_loss_clip": 0.01564279, + "auxiliary_loss_mlp": 0.01043722, + "balance_loss_clip": 1.35457253, + "balance_loss_mlp": 1.01822305, + "epoch": 0.2143393957613107, + "flos": 19868475035520.0, + "grad_norm": 1.6287056030957199, + "language_loss": 0.89388496, + "learning_rate": 3.654001327581981e-06, + "loss": 0.91996491, + "num_input_tokens_seen": 76914205, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.25500488, + "step": 3565, + "time_per_iteration": 2.8876523971557617 + }, + { + "auxiliary_loss_clip": 0.01368994, + "auxiliary_loss_mlp": 0.01031396, + "balance_loss_clip": 1.247293, + "balance_loss_mlp": 1.00660086, + "epoch": 0.21439951901397866, + "flos": 68559896943360.0, + "grad_norm": 0.8671009271569773, + "language_loss": 0.52261698, + "learning_rate": 3.653782340498215e-06, + "loss": 0.54662091, + "num_input_tokens_seen": 76975650, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.24707031, + "step": 3566, + "time_per_iteration": 3.303924798965454 + }, + { + "auxiliary_loss_clip": 0.01537463, + "auxiliary_loss_mlp": 0.01039977, + "balance_loss_clip": 1.33453131, + "balance_loss_mlp": 1.01447773, + "epoch": 0.21445964226664663, + "flos": 19692562677120.0, + "grad_norm": 1.9501930004331203, + "language_loss": 0.67926174, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.70503616, + "num_input_tokens_seen": 76992615, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.25500488, + "step": 3567, + "time_per_iteration": 2.8587489128112793 + }, + { + "auxiliary_loss_clip": 0.01553285, + "auxiliary_loss_mlp": 0.01042577, + "balance_loss_clip": 1.34777725, + "balance_loss_mlp": 1.01784134, + "epoch": 0.2145197655193146, + "flos": 31120883228160.0, + "grad_norm": 1.4523267130627369, + "language_loss": 0.75046784, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.77642649, + "num_input_tokens_seen": 77017005, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.24743652, + "step": 3568, + "time_per_iteration": 2.989978313446045 + }, + { + "auxiliary_loss_clip": 0.01556425, + "auxiliary_loss_mlp": 0.01048688, + "balance_loss_clip": 1.34894931, + "balance_loss_mlp": 1.02359462, + "epoch": 0.21457988877198256, + "flos": 20130452282880.0, + "grad_norm": 1.6910397335315217, + "language_loss": 0.79047072, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.81652182, + "num_input_tokens_seen": 77034990, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.25073242, + "step": 3569, + "time_per_iteration": 2.8444364070892334 + }, + { + "auxiliary_loss_clip": 0.01584858, + "auxiliary_loss_mlp": 0.01043062, + "balance_loss_clip": 1.36536837, + "balance_loss_mlp": 1.01634753, + "epoch": 0.21464001202465052, + "flos": 18597227857920.0, + "grad_norm": 2.263610051622323, + "language_loss": 0.71189207, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.73817122, + "num_input_tokens_seen": 77052610, + "router_z_loss_clip": 2.19726562, + "router_z_loss_mlp": 0.26745605, + "step": 3570, + "time_per_iteration": 2.8805501461029053 + }, + { + "auxiliary_loss_clip": 0.01570285, + "auxiliary_loss_mlp": 0.01047154, + "balance_loss_clip": 1.35344148, + "balance_loss_mlp": 1.02192903, + "epoch": 0.21470013527731852, + "flos": 21845154176640.0, + "grad_norm": 2.398553477349351, + "language_loss": 0.80674201, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.83291638, + "num_input_tokens_seen": 77072475, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.25219727, + "step": 3571, + "time_per_iteration": 2.9900119304656982 + }, + { + "auxiliary_loss_clip": 0.01579013, + "auxiliary_loss_mlp": 0.01046653, + "balance_loss_clip": 1.36509371, + "balance_loss_mlp": 1.02140498, + "epoch": 0.21476025852998648, + "flos": 17612191157760.0, + "grad_norm": 1.9521905860463338, + "language_loss": 0.84714627, + "learning_rate": 3.652467101342991e-06, + "loss": 0.87340295, + "num_input_tokens_seen": 77089930, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.25244141, + "step": 3572, + "time_per_iteration": 2.864039659500122 + }, + { + "auxiliary_loss_clip": 0.01580178, + "auxiliary_loss_mlp": 0.01045716, + "balance_loss_clip": 1.36275697, + "balance_loss_mlp": 1.02117074, + "epoch": 0.21482038178265445, + "flos": 24839202067200.0, + "grad_norm": 2.378261763657597, + "language_loss": 0.6655215, + "learning_rate": 3.652247675452598e-06, + "loss": 0.69178045, + "num_input_tokens_seen": 77108970, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.24560547, + "step": 3573, + "time_per_iteration": 4.325469970703125 + }, + { + "auxiliary_loss_clip": 0.01554092, + "auxiliary_loss_mlp": 0.0104586, + "balance_loss_clip": 1.34754837, + "balance_loss_mlp": 1.02074218, + "epoch": 0.2148805050353224, + "flos": 23268623420160.0, + "grad_norm": 2.643786979905976, + "language_loss": 0.76062584, + "learning_rate": 3.652028186908807e-06, + "loss": 0.78662527, + "num_input_tokens_seen": 77126045, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.25109863, + "step": 3574, + "time_per_iteration": 2.901113748550415 + }, + { + "auxiliary_loss_clip": 0.01571102, + "auxiliary_loss_mlp": 0.01049791, + "balance_loss_clip": 1.35715091, + "balance_loss_mlp": 1.02355266, + "epoch": 0.21494062828799038, + "flos": 21330112907520.0, + "grad_norm": 2.0122282900967563, + "language_loss": 0.73819768, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.76440662, + "num_input_tokens_seen": 77144600, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.26208496, + "step": 3575, + "time_per_iteration": 2.828514575958252 + }, + { + "auxiliary_loss_clip": 0.01560336, + "auxiliary_loss_mlp": 0.01051157, + "balance_loss_clip": 1.35096502, + "balance_loss_mlp": 1.02624261, + "epoch": 0.21500075154065834, + "flos": 18852554119680.0, + "grad_norm": 1.5782074065758862, + "language_loss": 0.68976164, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.71587658, + "num_input_tokens_seen": 77162965, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.24926758, + "step": 3576, + "time_per_iteration": 2.867933750152588 + }, + { + "auxiliary_loss_clip": 0.01578948, + "auxiliary_loss_mlp": 0.01051997, + "balance_loss_clip": 1.36154783, + "balance_loss_mlp": 1.02637887, + "epoch": 0.2150608747933263, + "flos": 18451113840000.0, + "grad_norm": 1.927781035100966, + "language_loss": 0.90569955, + "learning_rate": 3.651369345440292e-06, + "loss": 0.93200898, + "num_input_tokens_seen": 77179960, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.25610352, + "step": 3577, + "time_per_iteration": 2.813812017440796 + }, + { + "auxiliary_loss_clip": 0.01356938, + "auxiliary_loss_mlp": 0.01072759, + "balance_loss_clip": 1.23683333, + "balance_loss_mlp": 1.04777241, + "epoch": 0.2151209980459943, + "flos": 66628037416320.0, + "grad_norm": 0.8051943400803414, + "language_loss": 0.56147867, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.58577561, + "num_input_tokens_seen": 77239500, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.24902344, + "step": 3578, + "time_per_iteration": 3.331730604171753 + }, + { + "auxiliary_loss_clip": 0.01577319, + "auxiliary_loss_mlp": 0.01049485, + "balance_loss_clip": 1.36441755, + "balance_loss_mlp": 1.02607203, + "epoch": 0.21518112129866226, + "flos": 21584805742080.0, + "grad_norm": 1.61923645566343, + "language_loss": 0.89354634, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.91981447, + "num_input_tokens_seen": 77254680, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.23425293, + "step": 3579, + "time_per_iteration": 2.8296000957489014 + }, + { + "auxiliary_loss_clip": 0.01574991, + "auxiliary_loss_mlp": 0.0104757, + "balance_loss_clip": 1.35909092, + "balance_loss_mlp": 1.02106953, + "epoch": 0.21524124455133023, + "flos": 20057236917120.0, + "grad_norm": 1.5972009738400594, + "language_loss": 0.79194653, + "learning_rate": 3.650709940390972e-06, + "loss": 0.8181721, + "num_input_tokens_seen": 77274060, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.26538086, + "step": 3580, + "time_per_iteration": 2.8478307723999023 + }, + { + "auxiliary_loss_clip": 0.01558759, + "auxiliary_loss_mlp": 0.01050606, + "balance_loss_clip": 1.34753847, + "balance_loss_mlp": 1.02579808, + "epoch": 0.2153013678039982, + "flos": 23962472714880.0, + "grad_norm": 1.6999756918307452, + "language_loss": 0.74477464, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.77086824, + "num_input_tokens_seen": 77293255, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.24780273, + "step": 3581, + "time_per_iteration": 4.280422925949097 + }, + { + "auxiliary_loss_clip": 0.01562229, + "auxiliary_loss_mlp": 0.01046665, + "balance_loss_clip": 1.35068679, + "balance_loss_mlp": 1.02005708, + "epoch": 0.21536149105666616, + "flos": 20604564990720.0, + "grad_norm": 2.5211134782884175, + "language_loss": 0.73372829, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.75981718, + "num_input_tokens_seen": 77312390, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.26623535, + "step": 3582, + "time_per_iteration": 5.679165601730347 + }, + { + "auxiliary_loss_clip": 0.0156574, + "auxiliary_loss_mlp": 0.01047945, + "balance_loss_clip": 1.35311937, + "balance_loss_mlp": 1.02130198, + "epoch": 0.21542161430933413, + "flos": 12867806453760.0, + "grad_norm": 2.918641567440283, + "language_loss": 0.85631639, + "learning_rate": 3.650049971985889e-06, + "loss": 0.8824532, + "num_input_tokens_seen": 77330985, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.26696777, + "step": 3583, + "time_per_iteration": 2.934835195541382 + }, + { + "auxiliary_loss_clip": 0.01571211, + "auxiliary_loss_mlp": 0.01049923, + "balance_loss_clip": 1.35461533, + "balance_loss_mlp": 1.024984, + "epoch": 0.21548173756200212, + "flos": 26115018969600.0, + "grad_norm": 2.6796054145273347, + "language_loss": 0.84107769, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.86728907, + "num_input_tokens_seen": 77350770, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.24963379, + "step": 3584, + "time_per_iteration": 2.876629114151001 + }, + { + "auxiliary_loss_clip": 0.01567144, + "auxiliary_loss_mlp": 0.01050702, + "balance_loss_clip": 1.35678363, + "balance_loss_mlp": 1.02521467, + "epoch": 0.21554186081467008, + "flos": 22173922028160.0, + "grad_norm": 1.7771126255102319, + "language_loss": 0.91799068, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.94416916, + "num_input_tokens_seen": 77370510, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.25524902, + "step": 3585, + "time_per_iteration": 2.942531108856201 + }, + { + "auxiliary_loss_clip": 0.01556402, + "auxiliary_loss_mlp": 0.01045142, + "balance_loss_clip": 1.34506893, + "balance_loss_mlp": 1.02048957, + "epoch": 0.21560198406733805, + "flos": 22977164545920.0, + "grad_norm": 1.7406462751215042, + "language_loss": 0.75575858, + "learning_rate": 3.649389440450277e-06, + "loss": 0.78177404, + "num_input_tokens_seen": 77390645, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.24658203, + "step": 3586, + "time_per_iteration": 2.912104606628418 + }, + { + "auxiliary_loss_clip": 0.01576726, + "auxiliary_loss_mlp": 0.01053799, + "balance_loss_clip": 1.36188221, + "balance_loss_mlp": 1.02681041, + "epoch": 0.215662107320006, + "flos": 22794374977920.0, + "grad_norm": 1.6189818566684133, + "language_loss": 0.83512032, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.86142558, + "num_input_tokens_seen": 77409655, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.26977539, + "step": 3587, + "time_per_iteration": 2.9209725856781006 + }, + { + "auxiliary_loss_clip": 0.01562479, + "auxiliary_loss_mlp": 0.01047027, + "balance_loss_clip": 1.34976196, + "balance_loss_mlp": 1.02149212, + "epoch": 0.21572223057267398, + "flos": 30896260202880.0, + "grad_norm": 2.9519668658711704, + "language_loss": 0.77104509, + "learning_rate": 3.648948773354224e-06, + "loss": 0.79714024, + "num_input_tokens_seen": 77430560, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.25524902, + "step": 3588, + "time_per_iteration": 2.916823387145996 + }, + { + "auxiliary_loss_clip": 0.01570061, + "auxiliary_loss_mlp": 0.01045348, + "balance_loss_clip": 1.35588777, + "balance_loss_mlp": 1.01962328, + "epoch": 0.21578235382534194, + "flos": 26922785967360.0, + "grad_norm": 1.7227345585824707, + "language_loss": 0.81842542, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.84457946, + "num_input_tokens_seen": 77455000, + "router_z_loss_clip": 2.13964844, + "router_z_loss_mlp": 0.25732422, + "step": 3589, + "time_per_iteration": 2.910130023956299 + }, + { + "auxiliary_loss_clip": 0.01568073, + "auxiliary_loss_mlp": 0.01042674, + "balance_loss_clip": 1.3561877, + "balance_loss_mlp": 1.0180577, + "epoch": 0.2158424770780099, + "flos": 24436404443520.0, + "grad_norm": 2.296552286858966, + "language_loss": 0.73864567, + "learning_rate": 3.648507856144961e-06, + "loss": 0.76475316, + "num_input_tokens_seen": 77475075, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.24633789, + "step": 3590, + "time_per_iteration": 2.9115357398986816 + }, + { + "auxiliary_loss_clip": 0.01564487, + "auxiliary_loss_mlp": 0.01048482, + "balance_loss_clip": 1.3495059, + "balance_loss_mlp": 1.02181458, + "epoch": 0.2159026003306779, + "flos": 23960165230080.0, + "grad_norm": 1.7384292984470875, + "language_loss": 0.85507435, + "learning_rate": 3.648287303768775e-06, + "loss": 0.88120401, + "num_input_tokens_seen": 77495945, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.26660156, + "step": 3591, + "time_per_iteration": 2.8779680728912354 + }, + { + "auxiliary_loss_clip": 0.01572831, + "auxiliary_loss_mlp": 0.01054148, + "balance_loss_clip": 1.35691798, + "balance_loss_mlp": 1.02477455, + "epoch": 0.21596272358334587, + "flos": 30052179613440.0, + "grad_norm": 2.294141451608414, + "language_loss": 0.69446993, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.72073972, + "num_input_tokens_seen": 77517140, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.29394531, + "step": 3592, + "time_per_iteration": 3.005282402038574 + }, + { + "auxiliary_loss_clip": 0.0157025, + "auxiliary_loss_mlp": 0.01051779, + "balance_loss_clip": 1.35537124, + "balance_loss_mlp": 1.0246824, + "epoch": 0.21602284683601383, + "flos": 20385959523840.0, + "grad_norm": 2.4961006470982134, + "language_loss": 0.85901427, + "learning_rate": 3.647846011515108e-06, + "loss": 0.88523459, + "num_input_tokens_seen": 77536085, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.27124023, + "step": 3593, + "time_per_iteration": 2.812030553817749 + }, + { + "auxiliary_loss_clip": 0.01564725, + "auxiliary_loss_mlp": 0.01045844, + "balance_loss_clip": 1.35045171, + "balance_loss_mlp": 1.01850939, + "epoch": 0.2160829700886818, + "flos": 20787128334720.0, + "grad_norm": 2.3192501397241148, + "language_loss": 0.7676338, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.7937395, + "num_input_tokens_seen": 77553675, + "router_z_loss_clip": 2.14355469, + "router_z_loss_mlp": 0.27319336, + "step": 3594, + "time_per_iteration": 2.858562469482422 + }, + { + "auxiliary_loss_clip": 0.01541905, + "auxiliary_loss_mlp": 0.01052174, + "balance_loss_clip": 1.33315253, + "balance_loss_mlp": 1.02381468, + "epoch": 0.21614309334134976, + "flos": 22319945556480.0, + "grad_norm": 1.4681669366861925, + "language_loss": 0.80517483, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.8311156, + "num_input_tokens_seen": 77573360, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.28356934, + "step": 3595, + "time_per_iteration": 2.876054286956787 + }, + { + "auxiliary_loss_clip": 0.01560411, + "auxiliary_loss_mlp": 0.01044241, + "balance_loss_clip": 1.34365726, + "balance_loss_mlp": 1.01611984, + "epoch": 0.21620321659401773, + "flos": 19618940108160.0, + "grad_norm": 1.7686373818494592, + "language_loss": 0.79415727, + "learning_rate": 3.647183604506897e-06, + "loss": 0.82020372, + "num_input_tokens_seen": 77591865, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.28137207, + "step": 3596, + "time_per_iteration": 2.8432259559631348 + }, + { + "auxiliary_loss_clip": 0.01542106, + "auxiliary_loss_mlp": 0.01048202, + "balance_loss_clip": 1.33244753, + "balance_loss_mlp": 1.02040279, + "epoch": 0.2162633398466857, + "flos": 18854092442880.0, + "grad_norm": 1.7124125220120825, + "language_loss": 0.84406114, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.86996424, + "num_input_tokens_seen": 77611600, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.27783203, + "step": 3597, + "time_per_iteration": 2.949237108230591 + }, + { + "auxiliary_loss_clip": 0.01551093, + "auxiliary_loss_mlp": 0.01045736, + "balance_loss_clip": 1.33647919, + "balance_loss_mlp": 1.01842523, + "epoch": 0.21632346309935369, + "flos": 18777619451520.0, + "grad_norm": 1.9652168595141264, + "language_loss": 0.8142364, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.84020472, + "num_input_tokens_seen": 77630665, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.27307129, + "step": 3598, + "time_per_iteration": 2.8100430965423584 + }, + { + "auxiliary_loss_clip": 0.01555826, + "auxiliary_loss_mlp": 0.01055847, + "balance_loss_clip": 1.33930945, + "balance_loss_mlp": 1.02586579, + "epoch": 0.21638358635202165, + "flos": 26335660452480.0, + "grad_norm": 1.850066971157753, + "language_loss": 0.82858479, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.85470152, + "num_input_tokens_seen": 77650835, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.29968262, + "step": 3599, + "time_per_iteration": 2.920530319213867 + }, + { + "auxiliary_loss_clip": 0.01554111, + "auxiliary_loss_mlp": 0.01052029, + "balance_loss_clip": 1.34192872, + "balance_loss_mlp": 1.0248853, + "epoch": 0.21644370960468962, + "flos": 20750633763840.0, + "grad_norm": 2.1092629461700745, + "language_loss": 0.77308524, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.79914665, + "num_input_tokens_seen": 77669000, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.27172852, + "step": 3600, + "time_per_iteration": 2.810134172439575 + }, + { + "auxiliary_loss_clip": 0.01542805, + "auxiliary_loss_mlp": 0.01045602, + "balance_loss_clip": 1.3324182, + "balance_loss_mlp": 1.01886368, + "epoch": 0.21650383285735758, + "flos": 23963015652480.0, + "grad_norm": 3.5468157250702594, + "language_loss": 0.81333619, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.83922029, + "num_input_tokens_seen": 77688745, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.26745605, + "step": 3601, + "time_per_iteration": 2.883336305618286 + }, + { + "auxiliary_loss_clip": 0.01549092, + "auxiliary_loss_mlp": 0.01053414, + "balance_loss_clip": 1.33372653, + "balance_loss_mlp": 1.0253756, + "epoch": 0.21656395611002555, + "flos": 23706603515520.0, + "grad_norm": 2.490618195569459, + "language_loss": 0.84998697, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.87601203, + "num_input_tokens_seen": 77708445, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.28076172, + "step": 3602, + "time_per_iteration": 2.842400074005127 + }, + { + "auxiliary_loss_clip": 0.015648, + "auxiliary_loss_mlp": 0.01051603, + "balance_loss_clip": 1.34993076, + "balance_loss_mlp": 1.02103758, + "epoch": 0.2166240793626935, + "flos": 20675337137280.0, + "grad_norm": 1.8413025666392315, + "language_loss": 0.76029509, + "learning_rate": 3.645635802397693e-06, + "loss": 0.78645909, + "num_input_tokens_seen": 77728465, + "router_z_loss_clip": 2.14941406, + "router_z_loss_mlp": 0.30566406, + "step": 3603, + "time_per_iteration": 2.8345208168029785 + }, + { + "auxiliary_loss_clip": 0.01544753, + "auxiliary_loss_mlp": 0.0105387, + "balance_loss_clip": 1.33798039, + "balance_loss_mlp": 1.02490258, + "epoch": 0.2166842026153615, + "flos": 21590506586880.0, + "grad_norm": 1.918004274854854, + "language_loss": 0.75438344, + "learning_rate": 3.645414438132855e-06, + "loss": 0.7803697, + "num_input_tokens_seen": 77746735, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.29003906, + "step": 3604, + "time_per_iteration": 2.881065845489502 + }, + { + "auxiliary_loss_clip": 0.01538934, + "auxiliary_loss_mlp": 0.01049836, + "balance_loss_clip": 1.33085799, + "balance_loss_mlp": 1.02153563, + "epoch": 0.21674432586802947, + "flos": 25640996751360.0, + "grad_norm": 1.6446844282423814, + "language_loss": 0.80805838, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.83394611, + "num_input_tokens_seen": 77768105, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.28320312, + "step": 3605, + "time_per_iteration": 2.868868112564087 + }, + { + "auxiliary_loss_clip": 0.01347993, + "auxiliary_loss_mlp": 0.01241698, + "balance_loss_clip": 1.22505736, + "balance_loss_mlp": 1.20431352, + "epoch": 0.21680444912069743, + "flos": 56444694796800.0, + "grad_norm": 0.7745646414400693, + "language_loss": 0.58451337, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.61041027, + "num_input_tokens_seen": 77833750, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.37304688, + "step": 3606, + "time_per_iteration": 3.4698266983032227 + }, + { + "auxiliary_loss_clip": 0.01560145, + "auxiliary_loss_mlp": 0.01055158, + "balance_loss_clip": 1.34326971, + "balance_loss_mlp": 1.02597594, + "epoch": 0.2168645723733654, + "flos": 23889619307520.0, + "grad_norm": 1.9694989565320489, + "language_loss": 0.742378, + "learning_rate": 3.644749971006248e-06, + "loss": 0.76853102, + "num_input_tokens_seen": 77853780, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.29199219, + "step": 3607, + "time_per_iteration": 4.312813758850098 + }, + { + "auxiliary_loss_clip": 0.01557829, + "auxiliary_loss_mlp": 0.01062482, + "balance_loss_clip": 1.34220386, + "balance_loss_mlp": 1.0330615, + "epoch": 0.21692469562603336, + "flos": 16954926923520.0, + "grad_norm": 1.9709374938909772, + "language_loss": 0.78174067, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.80794382, + "num_input_tokens_seen": 77872575, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.29394531, + "step": 3608, + "time_per_iteration": 2.8579771518707275 + }, + { + "auxiliary_loss_clip": 0.0156179, + "auxiliary_loss_mlp": 0.01064434, + "balance_loss_clip": 1.34636974, + "balance_loss_mlp": 1.03680146, + "epoch": 0.21698481887870133, + "flos": 25129937024640.0, + "grad_norm": 2.0227403527813856, + "language_loss": 0.75246203, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.77872425, + "num_input_tokens_seen": 77892700, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.27685547, + "step": 3609, + "time_per_iteration": 2.8933165073394775 + }, + { + "auxiliary_loss_clip": 0.01553657, + "auxiliary_loss_mlp": 0.01079384, + "balance_loss_clip": 1.34022963, + "balance_loss_mlp": 1.04886651, + "epoch": 0.2170449421313693, + "flos": 17903650032000.0, + "grad_norm": 2.0838356384336363, + "language_loss": 0.89748889, + "learning_rate": 3.6440849425579e-06, + "loss": 0.9238193, + "num_input_tokens_seen": 77911060, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.30517578, + "step": 3610, + "time_per_iteration": 2.891892910003662 + }, + { + "auxiliary_loss_clip": 0.01548629, + "auxiliary_loss_mlp": 0.0108281, + "balance_loss_clip": 1.33856845, + "balance_loss_mlp": 1.05379438, + "epoch": 0.2171050653840373, + "flos": 22648803897600.0, + "grad_norm": 1.6808863240937366, + "language_loss": 0.7883575, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.81467187, + "num_input_tokens_seen": 77929930, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.2902832, + "step": 3611, + "time_per_iteration": 2.909013271331787 + }, + { + "auxiliary_loss_clip": 0.01546404, + "auxiliary_loss_mlp": 0.01098888, + "balance_loss_clip": 1.3364383, + "balance_loss_mlp": 1.07139826, + "epoch": 0.21716518863670525, + "flos": 19509365905920.0, + "grad_norm": 3.0529351993419005, + "language_loss": 0.64275777, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.66921067, + "num_input_tokens_seen": 77949060, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.27478027, + "step": 3612, + "time_per_iteration": 2.8417248725891113 + }, + { + "auxiliary_loss_clip": 0.01554507, + "auxiliary_loss_mlp": 0.01094298, + "balance_loss_clip": 1.34069204, + "balance_loss_mlp": 1.06674862, + "epoch": 0.21722531188937322, + "flos": 19801503452160.0, + "grad_norm": 1.7351041988385476, + "language_loss": 0.7726289, + "learning_rate": 3.643419353014776e-06, + "loss": 0.79911703, + "num_input_tokens_seen": 77967920, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.27514648, + "step": 3613, + "time_per_iteration": 2.8021726608276367 + }, + { + "auxiliary_loss_clip": 0.01551739, + "auxiliary_loss_mlp": 0.01101072, + "balance_loss_clip": 1.34172082, + "balance_loss_mlp": 1.07374895, + "epoch": 0.21728543514204118, + "flos": 13342281120000.0, + "grad_norm": 1.9598516872721383, + "language_loss": 0.72838777, + "learning_rate": 3.643197365185261e-06, + "loss": 0.75491589, + "num_input_tokens_seen": 77985330, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.27331543, + "step": 3614, + "time_per_iteration": 2.9022440910339355 + }, + { + "auxiliary_loss_clip": 0.01549668, + "auxiliary_loss_mlp": 0.01105539, + "balance_loss_clip": 1.33733869, + "balance_loss_mlp": 1.07773948, + "epoch": 0.21734555839470915, + "flos": 15240315519360.0, + "grad_norm": 1.5754842326616403, + "language_loss": 0.73954558, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.76609766, + "num_input_tokens_seen": 78003105, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.27807617, + "step": 3615, + "time_per_iteration": 2.807662010192871 + }, + { + "auxiliary_loss_clip": 0.01581337, + "auxiliary_loss_mlp": 0.01089343, + "balance_loss_clip": 1.36232305, + "balance_loss_mlp": 1.06228209, + "epoch": 0.2174056816473771, + "flos": 19983388124160.0, + "grad_norm": 2.251969703188635, + "language_loss": 0.9106493, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.93735611, + "num_input_tokens_seen": 78019655, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.27087402, + "step": 3616, + "time_per_iteration": 4.206519365310669 + }, + { + "auxiliary_loss_clip": 0.01576884, + "auxiliary_loss_mlp": 0.01085695, + "balance_loss_clip": 1.36186445, + "balance_loss_mlp": 1.05849111, + "epoch": 0.21746580490004508, + "flos": 16695076181760.0, + "grad_norm": 1.9325679661898285, + "language_loss": 0.82048261, + "learning_rate": 3.642531027869148e-06, + "loss": 0.84710848, + "num_input_tokens_seen": 78036025, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.27209473, + "step": 3617, + "time_per_iteration": 4.319268703460693 + }, + { + "auxiliary_loss_clip": 0.01571902, + "auxiliary_loss_mlp": 0.01086563, + "balance_loss_clip": 1.35742414, + "balance_loss_mlp": 1.06011057, + "epoch": 0.21752592815271307, + "flos": 25782450554880.0, + "grad_norm": 1.7090524231480697, + "language_loss": 0.76580828, + "learning_rate": 3.642308790849329e-06, + "loss": 0.79239297, + "num_input_tokens_seen": 78055645, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.26489258, + "step": 3618, + "time_per_iteration": 4.363492727279663 + }, + { + "auxiliary_loss_clip": 0.01568508, + "auxiliary_loss_mlp": 0.01082781, + "balance_loss_clip": 1.35338068, + "balance_loss_mlp": 1.05581605, + "epoch": 0.21758605140538104, + "flos": 11262045335040.0, + "grad_norm": 2.032160888929606, + "language_loss": 0.69966739, + "learning_rate": 3.642086491552996e-06, + "loss": 0.72618032, + "num_input_tokens_seen": 78071660, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.26977539, + "step": 3619, + "time_per_iteration": 2.7954177856445312 + }, + { + "auxiliary_loss_clip": 0.01570055, + "auxiliary_loss_mlp": 0.01078063, + "balance_loss_clip": 1.35500097, + "balance_loss_mlp": 1.05139542, + "epoch": 0.217646174658049, + "flos": 19251415445760.0, + "grad_norm": 1.5822913790636493, + "language_loss": 0.78680336, + "learning_rate": 3.641864129988579e-06, + "loss": 0.81328452, + "num_input_tokens_seen": 78091265, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.2668457, + "step": 3620, + "time_per_iteration": 2.852590560913086 + }, + { + "auxiliary_loss_clip": 0.01554469, + "auxiliary_loss_mlp": 0.01061546, + "balance_loss_clip": 1.34600282, + "balance_loss_mlp": 1.03515327, + "epoch": 0.21770629791071697, + "flos": 21955226071680.0, + "grad_norm": 1.4695835619853503, + "language_loss": 0.80876571, + "learning_rate": 3.641641706164509e-06, + "loss": 0.83492589, + "num_input_tokens_seen": 78110095, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.26391602, + "step": 3621, + "time_per_iteration": 2.9717319011688232 + }, + { + "auxiliary_loss_clip": 0.01560484, + "auxiliary_loss_mlp": 0.01054599, + "balance_loss_clip": 1.34750366, + "balance_loss_mlp": 1.02687073, + "epoch": 0.21776642116338493, + "flos": 24947645149440.0, + "grad_norm": 1.654074014635563, + "language_loss": 0.88439906, + "learning_rate": 3.641419220089221e-06, + "loss": 0.91054988, + "num_input_tokens_seen": 78129475, + "router_z_loss_clip": 2.12988281, + "router_z_loss_mlp": 0.27746582, + "step": 3622, + "time_per_iteration": 2.8861684799194336 + }, + { + "auxiliary_loss_clip": 0.01586252, + "auxiliary_loss_mlp": 0.01054647, + "balance_loss_clip": 1.36733961, + "balance_loss_mlp": 1.02753866, + "epoch": 0.2178265444160529, + "flos": 17830163197440.0, + "grad_norm": 1.8246283818750202, + "language_loss": 0.78141761, + "learning_rate": 3.641196671771152e-06, + "loss": 0.80782658, + "num_input_tokens_seen": 78146880, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.27075195, + "step": 3623, + "time_per_iteration": 2.8534955978393555 + }, + { + "auxiliary_loss_clip": 0.01573518, + "auxiliary_loss_mlp": 0.01057151, + "balance_loss_clip": 1.35740018, + "balance_loss_mlp": 1.02871919, + "epoch": 0.2178866676687209, + "flos": 17721991584000.0, + "grad_norm": 1.8972716539453516, + "language_loss": 0.85494578, + "learning_rate": 3.640974061218741e-06, + "loss": 0.88125247, + "num_input_tokens_seen": 78165065, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.28405762, + "step": 3624, + "time_per_iteration": 2.8529136180877686 + }, + { + "auxiliary_loss_clip": 0.01567822, + "auxiliary_loss_mlp": 0.01057711, + "balance_loss_clip": 1.35203028, + "balance_loss_mlp": 1.02893424, + "epoch": 0.21794679092138886, + "flos": 16954519720320.0, + "grad_norm": 2.2119978052108022, + "language_loss": 0.79330331, + "learning_rate": 3.640751388440429e-06, + "loss": 0.81955862, + "num_input_tokens_seen": 78180005, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.2878418, + "step": 3625, + "time_per_iteration": 2.8427653312683105 + }, + { + "auxiliary_loss_clip": 0.01344556, + "auxiliary_loss_mlp": 0.01029507, + "balance_loss_clip": 1.21580672, + "balance_loss_mlp": 0.99956173, + "epoch": 0.21800691417405682, + "flos": 63748314432000.0, + "grad_norm": 0.8196046102317003, + "language_loss": 0.6073935, + "learning_rate": 3.64052865344466e-06, + "loss": 0.63113415, + "num_input_tokens_seen": 78245350, + "router_z_loss_clip": 1.2890625, + "router_z_loss_mlp": 0.29882812, + "step": 3626, + "time_per_iteration": 3.525327682495117 + }, + { + "auxiliary_loss_clip": 0.01561687, + "auxiliary_loss_mlp": 0.01058128, + "balance_loss_clip": 1.34565854, + "balance_loss_mlp": 1.02775371, + "epoch": 0.21806703742672479, + "flos": 21626367730560.0, + "grad_norm": 2.439441746122754, + "language_loss": 0.91610467, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.94230282, + "num_input_tokens_seen": 78264165, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.30371094, + "step": 3627, + "time_per_iteration": 2.869502067565918 + }, + { + "auxiliary_loss_clip": 0.01560913, + "auxiliary_loss_mlp": 0.01046255, + "balance_loss_clip": 1.3495152, + "balance_loss_mlp": 1.01733494, + "epoch": 0.21812716067939275, + "flos": 19364428252800.0, + "grad_norm": 1.567586291762664, + "language_loss": 0.74360406, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.76967579, + "num_input_tokens_seen": 78283745, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.28930664, + "step": 3628, + "time_per_iteration": 2.878415584564209 + }, + { + "auxiliary_loss_clip": 0.01543001, + "auxiliary_loss_mlp": 0.01048683, + "balance_loss_clip": 1.33244967, + "balance_loss_mlp": 1.01890445, + "epoch": 0.21818728393206072, + "flos": 23558001033600.0, + "grad_norm": 1.9045103106186956, + "language_loss": 0.78829753, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.81421435, + "num_input_tokens_seen": 78302900, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.29785156, + "step": 3629, + "time_per_iteration": 2.8743019104003906 + }, + { + "auxiliary_loss_clip": 0.01550203, + "auxiliary_loss_mlp": 0.01053615, + "balance_loss_clip": 1.34075618, + "balance_loss_mlp": 1.0257082, + "epoch": 0.21824740718472868, + "flos": 30237502890240.0, + "grad_norm": 1.5831673489698208, + "language_loss": 0.72456503, + "learning_rate": 3.63963709145597e-06, + "loss": 0.7506032, + "num_input_tokens_seen": 78326470, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.27893066, + "step": 3630, + "time_per_iteration": 2.971954822540283 + }, + { + "auxiliary_loss_clip": 0.01534599, + "auxiliary_loss_mlp": 0.01048106, + "balance_loss_clip": 1.33023024, + "balance_loss_mlp": 1.02110541, + "epoch": 0.21830753043739667, + "flos": 26144274372480.0, + "grad_norm": 1.6850470883380413, + "language_loss": 0.77442551, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.8002525, + "num_input_tokens_seen": 78345810, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.27001953, + "step": 3631, + "time_per_iteration": 2.911485195159912 + }, + { + "auxiliary_loss_clip": 0.01567206, + "auxiliary_loss_mlp": 0.01046896, + "balance_loss_clip": 1.35222077, + "balance_loss_mlp": 1.01933479, + "epoch": 0.21836765369006464, + "flos": 21729381436800.0, + "grad_norm": 2.875348684553046, + "language_loss": 0.76117909, + "learning_rate": 3.639190937376594e-06, + "loss": 0.78732014, + "num_input_tokens_seen": 78364085, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.27587891, + "step": 3632, + "time_per_iteration": 2.832697629928589 + }, + { + "auxiliary_loss_clip": 0.01535879, + "auxiliary_loss_mlp": 0.01048474, + "balance_loss_clip": 1.32951164, + "balance_loss_mlp": 1.02198565, + "epoch": 0.2184277769427326, + "flos": 19947029287680.0, + "grad_norm": 8.445167202456247, + "language_loss": 0.85058415, + "learning_rate": 3.638967767095249e-06, + "loss": 0.87642765, + "num_input_tokens_seen": 78381385, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.26513672, + "step": 3633, + "time_per_iteration": 2.858062982559204 + }, + { + "auxiliary_loss_clip": 0.01553551, + "auxiliary_loss_mlp": 0.01053779, + "balance_loss_clip": 1.34263575, + "balance_loss_mlp": 1.02547848, + "epoch": 0.21848790019540057, + "flos": 20350098380160.0, + "grad_norm": 2.0142021819604117, + "language_loss": 0.82031989, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.84639323, + "num_input_tokens_seen": 78400500, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.28344727, + "step": 3634, + "time_per_iteration": 2.857800006866455 + }, + { + "auxiliary_loss_clip": 0.01557966, + "auxiliary_loss_mlp": 0.01047886, + "balance_loss_clip": 1.34459472, + "balance_loss_mlp": 1.0199194, + "epoch": 0.21854802344806853, + "flos": 15459328189440.0, + "grad_norm": 1.7712082112106888, + "language_loss": 0.76205093, + "learning_rate": 3.638521240091558e-06, + "loss": 0.78810942, + "num_input_tokens_seen": 78418340, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.27941895, + "step": 3635, + "time_per_iteration": 2.8408913612365723 + }, + { + "auxiliary_loss_clip": 0.01534712, + "auxiliary_loss_mlp": 0.01055022, + "balance_loss_clip": 1.32823944, + "balance_loss_mlp": 1.02715123, + "epoch": 0.2186081467007365, + "flos": 16327958722560.0, + "grad_norm": 3.144301965428661, + "language_loss": 0.89419532, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.92009264, + "num_input_tokens_seen": 78434375, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.27905273, + "step": 3636, + "time_per_iteration": 2.794177770614624 + }, + { + "auxiliary_loss_clip": 0.01554667, + "auxiliary_loss_mlp": 0.01051753, + "balance_loss_clip": 1.34286618, + "balance_loss_mlp": 1.02354836, + "epoch": 0.2186682699534045, + "flos": 21699221137920.0, + "grad_norm": 2.2392941826777686, + "language_loss": 0.76917398, + "learning_rate": 3.638074464556311e-06, + "loss": 0.7952382, + "num_input_tokens_seen": 78451735, + "router_z_loss_clip": 2.11816406, + "router_z_loss_mlp": 0.28198242, + "step": 3637, + "time_per_iteration": 2.8681886196136475 + }, + { + "auxiliary_loss_clip": 0.01574191, + "auxiliary_loss_mlp": 0.01047752, + "balance_loss_clip": 1.35809314, + "balance_loss_mlp": 1.01933229, + "epoch": 0.21872839320607246, + "flos": 17745727121280.0, + "grad_norm": 2.56757592520503, + "language_loss": 0.91048813, + "learning_rate": 3.63785098361053e-06, + "loss": 0.9367075, + "num_input_tokens_seen": 78462730, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.28442383, + "step": 3638, + "time_per_iteration": 2.7905232906341553 + }, + { + "auxiliary_loss_clip": 0.01558089, + "auxiliary_loss_mlp": 0.01054073, + "balance_loss_clip": 1.34548569, + "balance_loss_mlp": 1.0253675, + "epoch": 0.21878851645874042, + "flos": 18659222513280.0, + "grad_norm": 4.039486675225097, + "language_loss": 0.90264744, + "learning_rate": 3.637627440557275e-06, + "loss": 0.92876899, + "num_input_tokens_seen": 78476300, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.28723145, + "step": 3639, + "time_per_iteration": 2.7648720741271973 + }, + { + "auxiliary_loss_clip": 0.01555462, + "auxiliary_loss_mlp": 0.01041831, + "balance_loss_clip": 1.3452878, + "balance_loss_mlp": 1.01486623, + "epoch": 0.2188486397114084, + "flos": 25568414812800.0, + "grad_norm": 1.6636724489399652, + "language_loss": 0.80248511, + "learning_rate": 3.637403835405024e-06, + "loss": 0.82845807, + "num_input_tokens_seen": 78496135, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.26977539, + "step": 3640, + "time_per_iteration": 2.8911850452423096 + }, + { + "auxiliary_loss_clip": 0.01566267, + "auxiliary_loss_mlp": 0.01054612, + "balance_loss_clip": 1.35282707, + "balance_loss_mlp": 1.0251193, + "epoch": 0.21890876296407635, + "flos": 17900347161600.0, + "grad_norm": 1.9284827349117362, + "language_loss": 0.73597336, + "learning_rate": 3.637180168162255e-06, + "loss": 0.76218218, + "num_input_tokens_seen": 78513855, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.29516602, + "step": 3641, + "time_per_iteration": 2.816746950149536 + }, + { + "auxiliary_loss_clip": 0.01569276, + "auxiliary_loss_mlp": 0.01052224, + "balance_loss_clip": 1.35795927, + "balance_loss_mlp": 1.02425742, + "epoch": 0.21896888621674432, + "flos": 17758124196480.0, + "grad_norm": 2.166661236949752, + "language_loss": 0.82277179, + "learning_rate": 3.63695643883745e-06, + "loss": 0.84898674, + "num_input_tokens_seen": 78531740, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.27978516, + "step": 3642, + "time_per_iteration": 4.281655550003052 + }, + { + "auxiliary_loss_clip": 0.01572777, + "auxiliary_loss_mlp": 0.01049048, + "balance_loss_clip": 1.35734725, + "balance_loss_mlp": 1.01978195, + "epoch": 0.21902900946941228, + "flos": 23086603013760.0, + "grad_norm": 1.8649919917963906, + "language_loss": 0.72853112, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.75474942, + "num_input_tokens_seen": 78549600, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.29284668, + "step": 3643, + "time_per_iteration": 2.873709201812744 + }, + { + "auxiliary_loss_clip": 0.0156781, + "auxiliary_loss_mlp": 0.01048579, + "balance_loss_clip": 1.35548282, + "balance_loss_mlp": 1.02135158, + "epoch": 0.21908913272208028, + "flos": 48195546451200.0, + "grad_norm": 1.8979379737637807, + "language_loss": 0.68783379, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.71399772, + "num_input_tokens_seen": 78573350, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.27233887, + "step": 3644, + "time_per_iteration": 3.078646659851074 + }, + { + "auxiliary_loss_clip": 0.01590829, + "auxiliary_loss_mlp": 0.01051936, + "balance_loss_clip": 1.37001014, + "balance_loss_mlp": 1.02363539, + "epoch": 0.21914925597474824, + "flos": 22246911169920.0, + "grad_norm": 2.6831741740501145, + "language_loss": 0.79479009, + "learning_rate": 3.636284878455669e-06, + "loss": 0.82121772, + "num_input_tokens_seen": 78591005, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.28295898, + "step": 3645, + "time_per_iteration": 2.8720359802246094 + }, + { + "auxiliary_loss_clip": 0.01557569, + "auxiliary_loss_mlp": 0.01052666, + "balance_loss_clip": 1.34948862, + "balance_loss_mlp": 1.02481925, + "epoch": 0.2192093792274162, + "flos": 22135300951680.0, + "grad_norm": 1.6963448253196738, + "language_loss": 0.83317339, + "learning_rate": 3.636060900887582e-06, + "loss": 0.8592757, + "num_input_tokens_seen": 78610645, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.27880859, + "step": 3646, + "time_per_iteration": 2.8609414100646973 + }, + { + "auxiliary_loss_clip": 0.01561116, + "auxiliary_loss_mlp": 0.01048159, + "balance_loss_clip": 1.35315347, + "balance_loss_mlp": 1.02008533, + "epoch": 0.21926950248008417, + "flos": 15677933656320.0, + "grad_norm": 1.8496548428486472, + "language_loss": 0.84131402, + "learning_rate": 3.635836861279901e-06, + "loss": 0.86740673, + "num_input_tokens_seen": 78628340, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.28112793, + "step": 3647, + "time_per_iteration": 2.8129632472991943 + }, + { + "auxiliary_loss_clip": 0.01557128, + "auxiliary_loss_mlp": 0.01054944, + "balance_loss_clip": 1.34731174, + "balance_loss_mlp": 1.02704859, + "epoch": 0.21932962573275214, + "flos": 30274449909120.0, + "grad_norm": 1.728672460955334, + "language_loss": 0.73199528, + "learning_rate": 3.635612759641123e-06, + "loss": 0.75811601, + "num_input_tokens_seen": 78649355, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.27929688, + "step": 3648, + "time_per_iteration": 2.9695637226104736 + }, + { + "auxiliary_loss_clip": 0.0157683, + "auxiliary_loss_mlp": 0.01054595, + "balance_loss_clip": 1.3591783, + "balance_loss_mlp": 1.0272963, + "epoch": 0.2193897489854201, + "flos": 10787299200000.0, + "grad_norm": 2.279313432773206, + "language_loss": 0.74590254, + "learning_rate": 3.635388595979745e-06, + "loss": 0.7722168, + "num_input_tokens_seen": 78664915, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.27319336, + "step": 3649, + "time_per_iteration": 2.8115382194519043 + }, + { + "auxiliary_loss_clip": 0.0155767, + "auxiliary_loss_mlp": 0.010464, + "balance_loss_clip": 1.34889829, + "balance_loss_mlp": 1.01960182, + "epoch": 0.21944987223808807, + "flos": 19142112712320.0, + "grad_norm": 1.8622976473207737, + "language_loss": 0.87135887, + "learning_rate": 3.635164370304267e-06, + "loss": 0.8973996, + "num_input_tokens_seen": 78681475, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.26806641, + "step": 3650, + "time_per_iteration": 2.8774044513702393 + }, + { + "auxiliary_loss_clip": 0.01569695, + "auxiliary_loss_mlp": 0.01047039, + "balance_loss_clip": 1.35647726, + "balance_loss_mlp": 1.01957369, + "epoch": 0.21950999549075606, + "flos": 22721385836160.0, + "grad_norm": 2.09268474080192, + "language_loss": 0.84587395, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.87204129, + "num_input_tokens_seen": 78702300, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.27478027, + "step": 3651, + "time_per_iteration": 4.382604122161865 + }, + { + "auxiliary_loss_clip": 0.01575232, + "auxiliary_loss_mlp": 0.0104918, + "balance_loss_clip": 1.3614924, + "balance_loss_mlp": 1.02177382, + "epoch": 0.21957011874342403, + "flos": 10568467509120.0, + "grad_norm": 2.158004919971209, + "language_loss": 0.75310826, + "learning_rate": 3.634715732945027e-06, + "loss": 0.77935237, + "num_input_tokens_seen": 78720230, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.27416992, + "step": 3652, + "time_per_iteration": 4.16953706741333 + }, + { + "auxiliary_loss_clip": 0.01363819, + "auxiliary_loss_mlp": 0.01036501, + "balance_loss_clip": 1.22806644, + "balance_loss_mlp": 1.00178778, + "epoch": 0.219630241996092, + "flos": 65774591153280.0, + "grad_norm": 0.7441403570779467, + "language_loss": 0.51660192, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.54060513, + "num_input_tokens_seen": 78780200, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.34765625, + "step": 3653, + "time_per_iteration": 4.77081561088562 + }, + { + "auxiliary_loss_clip": 0.01599535, + "auxiliary_loss_mlp": 0.01049809, + "balance_loss_clip": 1.38431334, + "balance_loss_mlp": 1.02268863, + "epoch": 0.21969036524875996, + "flos": 23707146453120.0, + "grad_norm": 2.1890048283841548, + "language_loss": 0.76188982, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.78838325, + "num_input_tokens_seen": 78800575, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.27124023, + "step": 3654, + "time_per_iteration": 2.857222557067871 + }, + { + "auxiliary_loss_clip": 0.01592434, + "auxiliary_loss_mlp": 0.01054607, + "balance_loss_clip": 1.37447405, + "balance_loss_mlp": 1.02646172, + "epoch": 0.21975048850142792, + "flos": 19649009917440.0, + "grad_norm": 2.0203538279675284, + "language_loss": 0.73486674, + "learning_rate": 3.634042312013064e-06, + "loss": 0.76133716, + "num_input_tokens_seen": 78819585, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.28137207, + "step": 3655, + "time_per_iteration": 2.878793954849243 + }, + { + "auxiliary_loss_clip": 0.01561552, + "auxiliary_loss_mlp": 0.01046115, + "balance_loss_clip": 1.35129702, + "balance_loss_mlp": 1.01937628, + "epoch": 0.21981061175409589, + "flos": 22456920124800.0, + "grad_norm": 1.6227779920421104, + "language_loss": 0.8116287, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.83770537, + "num_input_tokens_seen": 78837330, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.26757812, + "step": 3656, + "time_per_iteration": 3.042649745941162 + }, + { + "auxiliary_loss_clip": 0.01584677, + "auxiliary_loss_mlp": 0.0104784, + "balance_loss_clip": 1.37143278, + "balance_loss_mlp": 1.02128053, + "epoch": 0.21987073500676388, + "flos": 18160333637760.0, + "grad_norm": 1.9518946221750473, + "language_loss": 0.86256379, + "learning_rate": 3.63359305489566e-06, + "loss": 0.88888896, + "num_input_tokens_seen": 78854955, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.26574707, + "step": 3657, + "time_per_iteration": 2.9165942668914795 + }, + { + "auxiliary_loss_clip": 0.01579089, + "auxiliary_loss_mlp": 0.01047062, + "balance_loss_clip": 1.3630259, + "balance_loss_mlp": 1.02028775, + "epoch": 0.21993085825943184, + "flos": 25636246047360.0, + "grad_norm": 1.5225178479224017, + "language_loss": 0.81132483, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.83758634, + "num_input_tokens_seen": 78874965, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.26745605, + "step": 3658, + "time_per_iteration": 2.8570139408111572 + }, + { + "auxiliary_loss_clip": 0.01338338, + "auxiliary_loss_mlp": 0.0104454, + "balance_loss_clip": 1.20915735, + "balance_loss_mlp": 1.01268721, + "epoch": 0.2199909815120998, + "flos": 70959236175360.0, + "grad_norm": 0.7863018379968482, + "language_loss": 0.58287382, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.60670257, + "num_input_tokens_seen": 78937740, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.31835938, + "step": 3659, + "time_per_iteration": 3.439894676208496 + }, + { + "auxiliary_loss_clip": 0.01564439, + "auxiliary_loss_mlp": 0.01049627, + "balance_loss_clip": 1.35254359, + "balance_loss_mlp": 1.02305555, + "epoch": 0.22005110476476777, + "flos": 21553514323200.0, + "grad_norm": 2.3620239272260974, + "language_loss": 0.75152677, + "learning_rate": 3.632918704645772e-06, + "loss": 0.7776674, + "num_input_tokens_seen": 78955055, + "router_z_loss_clip": 2.11816406, + "router_z_loss_mlp": 0.26574707, + "step": 3660, + "time_per_iteration": 2.8405041694641113 + }, + { + "auxiliary_loss_clip": 0.01574071, + "auxiliary_loss_mlp": 0.01044586, + "balance_loss_clip": 1.35928833, + "balance_loss_mlp": 1.01769257, + "epoch": 0.22011122801743574, + "flos": 22064664539520.0, + "grad_norm": 1.4876682536621197, + "language_loss": 0.81491292, + "learning_rate": 3.632693797376893e-06, + "loss": 0.84109944, + "num_input_tokens_seen": 78974895, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.2689209, + "step": 3661, + "time_per_iteration": 2.8389694690704346 + }, + { + "auxiliary_loss_clip": 0.01554399, + "auxiliary_loss_mlp": 0.01044477, + "balance_loss_clip": 1.34467947, + "balance_loss_mlp": 1.01856089, + "epoch": 0.2201713512701037, + "flos": 26698796369280.0, + "grad_norm": 1.6593231991851383, + "language_loss": 0.74362397, + "learning_rate": 3.632468828196102e-06, + "loss": 0.76961267, + "num_input_tokens_seen": 78994990, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.2590332, + "step": 3662, + "time_per_iteration": 2.941701889038086 + }, + { + "auxiliary_loss_clip": 0.01540209, + "auxiliary_loss_mlp": 0.0105043, + "balance_loss_clip": 1.33290553, + "balance_loss_mlp": 1.02494359, + "epoch": 0.22023147452277167, + "flos": 22171931256960.0, + "grad_norm": 1.5128658993929442, + "language_loss": 0.79960454, + "learning_rate": 3.632243797111929e-06, + "loss": 0.82551098, + "num_input_tokens_seen": 79014405, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.25512695, + "step": 3663, + "time_per_iteration": 2.846776247024536 + }, + { + "auxiliary_loss_clip": 0.01584774, + "auxiliary_loss_mlp": 0.01056414, + "balance_loss_clip": 1.36585748, + "balance_loss_mlp": 1.02891183, + "epoch": 0.22029159777543966, + "flos": 22532126261760.0, + "grad_norm": 1.6002012756667423, + "language_loss": 0.81034684, + "learning_rate": 3.632018704132908e-06, + "loss": 0.83675867, + "num_input_tokens_seen": 79032375, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.27514648, + "step": 3664, + "time_per_iteration": 2.879239797592163 + }, + { + "auxiliary_loss_clip": 0.01575674, + "auxiliary_loss_mlp": 0.01048834, + "balance_loss_clip": 1.3564049, + "balance_loss_mlp": 1.02179718, + "epoch": 0.22035172102810763, + "flos": 13050279308160.0, + "grad_norm": 2.654225571970054, + "language_loss": 0.7869646, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.81320971, + "num_input_tokens_seen": 79049635, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.27050781, + "step": 3665, + "time_per_iteration": 2.8054111003875732 + }, + { + "auxiliary_loss_clip": 0.0155607, + "auxiliary_loss_mlp": 0.01049548, + "balance_loss_clip": 1.34342301, + "balance_loss_mlp": 1.02398992, + "epoch": 0.2204118442807756, + "flos": 12173142752640.0, + "grad_norm": 2.3829151101585824, + "language_loss": 0.98858523, + "learning_rate": 3.631568332524466e-06, + "loss": 1.0146414, + "num_input_tokens_seen": 79062890, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.2557373, + "step": 3666, + "time_per_iteration": 2.7684144973754883 + }, + { + "auxiliary_loss_clip": 0.01544255, + "auxiliary_loss_mlp": 0.01053444, + "balance_loss_clip": 1.33312631, + "balance_loss_mlp": 1.02540648, + "epoch": 0.22047196753344356, + "flos": 40122056977920.0, + "grad_norm": 1.6755184617301044, + "language_loss": 0.81650007, + "learning_rate": 3.631343053912122e-06, + "loss": 0.84247708, + "num_input_tokens_seen": 79085495, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.28039551, + "step": 3667, + "time_per_iteration": 3.074862241744995 + }, + { + "auxiliary_loss_clip": 0.0157822, + "auxiliary_loss_mlp": 0.0105323, + "balance_loss_clip": 1.35922194, + "balance_loss_mlp": 1.02551436, + "epoch": 0.22053209078611152, + "flos": 20710067160960.0, + "grad_norm": 1.62350957704438, + "language_loss": 0.78158367, + "learning_rate": 3.631117713439087e-06, + "loss": 0.80789816, + "num_input_tokens_seen": 79101820, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.27722168, + "step": 3668, + "time_per_iteration": 2.8252410888671875 + }, + { + "auxiliary_loss_clip": 0.01538559, + "auxiliary_loss_mlp": 0.01047239, + "balance_loss_clip": 1.32895494, + "balance_loss_mlp": 1.02033305, + "epoch": 0.2205922140387795, + "flos": 24726732197760.0, + "grad_norm": 1.4755785125719523, + "language_loss": 0.72163588, + "learning_rate": 3.630892311113904e-06, + "loss": 0.74749386, + "num_input_tokens_seen": 79123320, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.26916504, + "step": 3669, + "time_per_iteration": 2.996981143951416 + }, + { + "auxiliary_loss_clip": 0.01544059, + "auxiliary_loss_mlp": 0.01045837, + "balance_loss_clip": 1.33235455, + "balance_loss_mlp": 1.01936007, + "epoch": 0.22065233729144745, + "flos": 23487726579840.0, + "grad_norm": 1.7339900196806786, + "language_loss": 0.86347997, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.88937891, + "num_input_tokens_seen": 79141615, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.26477051, + "step": 3670, + "time_per_iteration": 2.840299367904663 + }, + { + "auxiliary_loss_clip": 0.01561917, + "auxiliary_loss_mlp": 0.01054283, + "balance_loss_clip": 1.34628165, + "balance_loss_mlp": 1.0278182, + "epoch": 0.22071246054411545, + "flos": 35239068892800.0, + "grad_norm": 1.7642298718453084, + "language_loss": 0.77511561, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.80127764, + "num_input_tokens_seen": 79164910, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.26489258, + "step": 3671, + "time_per_iteration": 2.9431350231170654 + }, + { + "auxiliary_loss_clip": 0.01548795, + "auxiliary_loss_mlp": 0.01047736, + "balance_loss_clip": 1.33756471, + "balance_loss_mlp": 1.02166545, + "epoch": 0.2207725837967834, + "flos": 18159383496960.0, + "grad_norm": 1.9276752036350355, + "language_loss": 0.81306082, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.83902615, + "num_input_tokens_seen": 79179685, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.26098633, + "step": 3672, + "time_per_iteration": 2.77610445022583 + }, + { + "auxiliary_loss_clip": 0.01549402, + "auxiliary_loss_mlp": 0.01049845, + "balance_loss_clip": 1.33903682, + "balance_loss_mlp": 1.02502525, + "epoch": 0.22083270704945138, + "flos": 20487887354880.0, + "grad_norm": 1.8814614766587932, + "language_loss": 0.74478024, + "learning_rate": 3.629990083462682e-06, + "loss": 0.7707727, + "num_input_tokens_seen": 79196285, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.24768066, + "step": 3673, + "time_per_iteration": 2.8167409896850586 + }, + { + "auxiliary_loss_clip": 0.01545796, + "auxiliary_loss_mlp": 0.01045756, + "balance_loss_clip": 1.33597517, + "balance_loss_mlp": 1.0188024, + "epoch": 0.22089283030211934, + "flos": 34137671270400.0, + "grad_norm": 2.3003851577677925, + "language_loss": 0.77533615, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.80125165, + "num_input_tokens_seen": 79216060, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.26916504, + "step": 3674, + "time_per_iteration": 2.9564268589019775 + }, + { + "auxiliary_loss_clip": 0.01550146, + "auxiliary_loss_mlp": 0.01053364, + "balance_loss_clip": 1.33844984, + "balance_loss_mlp": 1.02612519, + "epoch": 0.2209529535547873, + "flos": 18086258620800.0, + "grad_norm": 1.8842058019719292, + "language_loss": 0.76367682, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.78971195, + "num_input_tokens_seen": 79235145, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.27233887, + "step": 3675, + "time_per_iteration": 2.8900671005249023 + }, + { + "auxiliary_loss_clip": 0.01553198, + "auxiliary_loss_mlp": 0.01053109, + "balance_loss_clip": 1.34019494, + "balance_loss_mlp": 1.02656066, + "epoch": 0.22101307680745527, + "flos": 27246803114880.0, + "grad_norm": 1.7013513683409005, + "language_loss": 0.81008852, + "learning_rate": 3.629312763695772e-06, + "loss": 0.8361516, + "num_input_tokens_seen": 79256960, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.26550293, + "step": 3676, + "time_per_iteration": 2.9181697368621826 + }, + { + "auxiliary_loss_clip": 0.01547327, + "auxiliary_loss_mlp": 0.01047256, + "balance_loss_clip": 1.33278227, + "balance_loss_mlp": 1.02136374, + "epoch": 0.22107320006012326, + "flos": 16551495872640.0, + "grad_norm": 2.0143616168436695, + "language_loss": 0.77178288, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.79772872, + "num_input_tokens_seen": 79274860, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.2590332, + "step": 3677, + "time_per_iteration": 4.333882093429565 + }, + { + "auxiliary_loss_clip": 0.0154144, + "auxiliary_loss_mlp": 0.01046929, + "balance_loss_clip": 1.3289547, + "balance_loss_mlp": 1.02090549, + "epoch": 0.22113332331279123, + "flos": 22064619294720.0, + "grad_norm": 2.069746614824054, + "language_loss": 0.84606987, + "learning_rate": 3.628860908251712e-06, + "loss": 0.87195355, + "num_input_tokens_seen": 79294005, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.26049805, + "step": 3678, + "time_per_iteration": 2.8162953853607178 + }, + { + "auxiliary_loss_clip": 0.01550182, + "auxiliary_loss_mlp": 0.01052458, + "balance_loss_clip": 1.34009469, + "balance_loss_mlp": 1.02700698, + "epoch": 0.2211934465654592, + "flos": 26623499742720.0, + "grad_norm": 1.7727009447070683, + "language_loss": 0.90151638, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.92754275, + "num_input_tokens_seen": 79314005, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.2545166, + "step": 3679, + "time_per_iteration": 2.8922386169433594 + }, + { + "auxiliary_loss_clip": 0.01592638, + "auxiliary_loss_mlp": 0.0105132, + "balance_loss_clip": 1.37444603, + "balance_loss_mlp": 1.02470016, + "epoch": 0.22125356981812716, + "flos": 16368434835840.0, + "grad_norm": 2.066163582602331, + "language_loss": 0.87748849, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.90392816, + "num_input_tokens_seen": 79331030, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.26623535, + "step": 3680, + "time_per_iteration": 2.822935104370117 + }, + { + "auxiliary_loss_clip": 0.01540964, + "auxiliary_loss_mlp": 0.01043826, + "balance_loss_clip": 1.33385611, + "balance_loss_mlp": 1.01829159, + "epoch": 0.22131369307079513, + "flos": 21660645306240.0, + "grad_norm": 1.8352116791114714, + "language_loss": 0.81804973, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.84389764, + "num_input_tokens_seen": 79348560, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.25537109, + "step": 3681, + "time_per_iteration": 2.8633689880371094 + }, + { + "auxiliary_loss_clip": 0.01524854, + "auxiliary_loss_mlp": 0.01047278, + "balance_loss_clip": 1.321908, + "balance_loss_mlp": 1.02166021, + "epoch": 0.2213738163234631, + "flos": 19618940108160.0, + "grad_norm": 2.638627396821017, + "language_loss": 0.81246483, + "learning_rate": 3.62795645623335e-06, + "loss": 0.83818614, + "num_input_tokens_seen": 79367175, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.25646973, + "step": 3682, + "time_per_iteration": 2.8272817134857178 + }, + { + "auxiliary_loss_clip": 0.01548082, + "auxiliary_loss_mlp": 0.01049493, + "balance_loss_clip": 1.33675599, + "balance_loss_mlp": 1.02300453, + "epoch": 0.22143393957613106, + "flos": 23633976332160.0, + "grad_norm": 1.5819038677764574, + "language_loss": 0.78053868, + "learning_rate": 3.627730188876638e-06, + "loss": 0.80651438, + "num_input_tokens_seen": 79388435, + "router_z_loss_clip": 2.11425781, + "router_z_loss_mlp": 0.26513672, + "step": 3683, + "time_per_iteration": 2.8330986499786377 + }, + { + "auxiliary_loss_clip": 0.01552038, + "auxiliary_loss_mlp": 0.01052489, + "balance_loss_clip": 1.33509672, + "balance_loss_mlp": 1.02542806, + "epoch": 0.22149406282879905, + "flos": 26189139231360.0, + "grad_norm": 2.3624098391980493, + "language_loss": 0.73754454, + "learning_rate": 3.627503859796234e-06, + "loss": 0.76358986, + "num_input_tokens_seen": 79407910, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.27075195, + "step": 3684, + "time_per_iteration": 2.8971121311187744 + }, + { + "auxiliary_loss_clip": 0.01533122, + "auxiliary_loss_mlp": 0.01054824, + "balance_loss_clip": 1.32338214, + "balance_loss_mlp": 1.02735806, + "epoch": 0.221554186081467, + "flos": 14547054407040.0, + "grad_norm": 2.4743531564303898, + "language_loss": 0.81101257, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.83689207, + "num_input_tokens_seen": 79424020, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.2746582, + "step": 3685, + "time_per_iteration": 2.81502103805542 + }, + { + "auxiliary_loss_clip": 0.01518332, + "auxiliary_loss_mlp": 0.01061509, + "balance_loss_clip": 1.31386423, + "balance_loss_mlp": 1.03423405, + "epoch": 0.22161430933413498, + "flos": 22248811451520.0, + "grad_norm": 1.4117014025950925, + "language_loss": 0.87684786, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.9026463, + "num_input_tokens_seen": 79445605, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.27319336, + "step": 3686, + "time_per_iteration": 4.306074380874634 + }, + { + "auxiliary_loss_clip": 0.01525819, + "auxiliary_loss_mlp": 0.01070648, + "balance_loss_clip": 1.31715465, + "balance_loss_mlp": 1.04084587, + "epoch": 0.22167443258680294, + "flos": 23486821683840.0, + "grad_norm": 1.827666874056116, + "language_loss": 0.79183149, + "learning_rate": 3.626824502298707e-06, + "loss": 0.81779623, + "num_input_tokens_seen": 79463850, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.29785156, + "step": 3687, + "time_per_iteration": 4.319127559661865 + }, + { + "auxiliary_loss_clip": 0.015522, + "auxiliary_loss_mlp": 0.01064916, + "balance_loss_clip": 1.33489823, + "balance_loss_mlp": 1.03523254, + "epoch": 0.2217345558394709, + "flos": 23231359687680.0, + "grad_norm": 2.341458328658131, + "language_loss": 0.86218399, + "learning_rate": 3.626597926409383e-06, + "loss": 0.88835514, + "num_input_tokens_seen": 79482845, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.29663086, + "step": 3688, + "time_per_iteration": 4.248947620391846 + }, + { + "auxiliary_loss_clip": 0.01542868, + "auxiliary_loss_mlp": 0.01056624, + "balance_loss_clip": 1.32886028, + "balance_loss_mlp": 1.028157, + "epoch": 0.22179467909213887, + "flos": 20020651856640.0, + "grad_norm": 1.7384778086244979, + "language_loss": 0.82426679, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.85026169, + "num_input_tokens_seen": 79501550, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.28466797, + "step": 3689, + "time_per_iteration": 2.8132243156433105 + }, + { + "auxiliary_loss_clip": 0.01522264, + "auxiliary_loss_mlp": 0.01074015, + "balance_loss_clip": 1.3145597, + "balance_loss_mlp": 1.04499984, + "epoch": 0.22185480234480687, + "flos": 19692607921920.0, + "grad_norm": 1.6242320643922146, + "language_loss": 0.71671546, + "learning_rate": 3.626144589597061e-06, + "loss": 0.74267828, + "num_input_tokens_seen": 79519680, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.2902832, + "step": 3690, + "time_per_iteration": 2.8555452823638916 + }, + { + "auxiliary_loss_clip": 0.01532952, + "auxiliary_loss_mlp": 0.01062989, + "balance_loss_clip": 1.3188808, + "balance_loss_mlp": 1.0328052, + "epoch": 0.22191492559747483, + "flos": 21991403928960.0, + "grad_norm": 1.6744829651121327, + "language_loss": 0.73484838, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.76080775, + "num_input_tokens_seen": 79539000, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.30175781, + "step": 3691, + "time_per_iteration": 2.8474085330963135 + }, + { + "auxiliary_loss_clip": 0.01517481, + "auxiliary_loss_mlp": 0.01068514, + "balance_loss_clip": 1.30975842, + "balance_loss_mlp": 1.03594601, + "epoch": 0.2219750488501428, + "flos": 23232536052480.0, + "grad_norm": 1.9043043595744231, + "language_loss": 0.72802353, + "learning_rate": 3.625691006130477e-06, + "loss": 0.75388348, + "num_input_tokens_seen": 79559695, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.32568359, + "step": 3692, + "time_per_iteration": 2.8302063941955566 + }, + { + "auxiliary_loss_clip": 0.01531918, + "auxiliary_loss_mlp": 0.01060433, + "balance_loss_clip": 1.31971872, + "balance_loss_mlp": 1.03034472, + "epoch": 0.22203517210281076, + "flos": 22463390131200.0, + "grad_norm": 2.07943778477618, + "language_loss": 0.87677133, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.90269482, + "num_input_tokens_seen": 79579095, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.30102539, + "step": 3693, + "time_per_iteration": 2.8835949897766113 + }, + { + "auxiliary_loss_clip": 0.01508168, + "auxiliary_loss_mlp": 0.01055709, + "balance_loss_clip": 1.30488992, + "balance_loss_mlp": 1.02645516, + "epoch": 0.22209529535547873, + "flos": 17572981898880.0, + "grad_norm": 2.265172959033575, + "language_loss": 0.8597343, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.88537312, + "num_input_tokens_seen": 79596430, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.29223633, + "step": 3694, + "time_per_iteration": 2.807481288909912 + }, + { + "auxiliary_loss_clip": 0.01538653, + "auxiliary_loss_mlp": 0.01049744, + "balance_loss_clip": 1.32070518, + "balance_loss_mlp": 1.02025175, + "epoch": 0.2221554186081467, + "flos": 21478579655040.0, + "grad_norm": 2.1405591901886263, + "language_loss": 0.70808375, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.73396766, + "num_input_tokens_seen": 79615825, + "router_z_loss_clip": 2.18164062, + "router_z_loss_mlp": 0.29443359, + "step": 3695, + "time_per_iteration": 2.8378751277923584 + }, + { + "auxiliary_loss_clip": 0.01504991, + "auxiliary_loss_mlp": 0.01052052, + "balance_loss_clip": 1.30282021, + "balance_loss_mlp": 1.0232029, + "epoch": 0.22221554186081466, + "flos": 27685099923840.0, + "grad_norm": 1.4591319382742496, + "language_loss": 0.72429812, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.74986857, + "num_input_tokens_seen": 79637875, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.28857422, + "step": 3696, + "time_per_iteration": 2.880035400390625 + }, + { + "auxiliary_loss_clip": 0.01521868, + "auxiliary_loss_mlp": 0.01051763, + "balance_loss_clip": 1.31282651, + "balance_loss_mlp": 1.02060127, + "epoch": 0.22227566511348265, + "flos": 25970217050880.0, + "grad_norm": 1.6236684126671401, + "language_loss": 0.88405144, + "learning_rate": 3.624555968803217e-06, + "loss": 0.90978765, + "num_input_tokens_seen": 79656970, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.31152344, + "step": 3697, + "time_per_iteration": 2.925234079360962 + }, + { + "auxiliary_loss_clip": 0.01512653, + "auxiliary_loss_mlp": 0.01047719, + "balance_loss_clip": 1.31021452, + "balance_loss_mlp": 1.01962113, + "epoch": 0.22233578836615062, + "flos": 39218515441920.0, + "grad_norm": 1.625401334914006, + "language_loss": 0.66932386, + "learning_rate": 3.624328776493346e-06, + "loss": 0.69492757, + "num_input_tokens_seen": 79680275, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.28112793, + "step": 3698, + "time_per_iteration": 3.032222032546997 + }, + { + "auxiliary_loss_clip": 0.01531077, + "auxiliary_loss_mlp": 0.01052922, + "balance_loss_clip": 1.32219326, + "balance_loss_mlp": 1.02526593, + "epoch": 0.22239591161881858, + "flos": 36297592427520.0, + "grad_norm": 1.745801909224781, + "language_loss": 0.8371138, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.86295378, + "num_input_tokens_seen": 79701255, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.27685547, + "step": 3699, + "time_per_iteration": 3.0708842277526855 + }, + { + "auxiliary_loss_clip": 0.01518083, + "auxiliary_loss_mlp": 0.01052441, + "balance_loss_clip": 1.3117578, + "balance_loss_mlp": 1.0244267, + "epoch": 0.22245603487148655, + "flos": 19729192982400.0, + "grad_norm": 1.4674400596607815, + "language_loss": 0.8061806, + "learning_rate": 3.62387420709809e-06, + "loss": 0.83188581, + "num_input_tokens_seen": 79721315, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.2800293, + "step": 3700, + "time_per_iteration": 2.8814496994018555 + }, + { + "auxiliary_loss_clip": 0.01524298, + "auxiliary_loss_mlp": 0.01050346, + "balance_loss_clip": 1.31281376, + "balance_loss_mlp": 1.02230823, + "epoch": 0.2225161581241545, + "flos": 46296742890240.0, + "grad_norm": 1.8739201316578045, + "language_loss": 0.73064053, + "learning_rate": 3.623646830029943e-06, + "loss": 0.756387, + "num_input_tokens_seen": 79742705, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.28051758, + "step": 3701, + "time_per_iteration": 3.0360448360443115 + }, + { + "auxiliary_loss_clip": 0.01533225, + "auxiliary_loss_mlp": 0.01056941, + "balance_loss_clip": 1.32457376, + "balance_loss_mlp": 1.02780616, + "epoch": 0.22257628137682248, + "flos": 23706739249920.0, + "grad_norm": 1.7040491240413036, + "language_loss": 0.81475317, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.84065485, + "num_input_tokens_seen": 79763000, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.29125977, + "step": 3702, + "time_per_iteration": 2.8876004219055176 + }, + { + "auxiliary_loss_clip": 0.01504455, + "auxiliary_loss_mlp": 0.01061735, + "balance_loss_clip": 1.30291271, + "balance_loss_mlp": 1.03279126, + "epoch": 0.22263640462949044, + "flos": 19363297132800.0, + "grad_norm": 1.9879889272703812, + "language_loss": 0.79133493, + "learning_rate": 3.623191891195723e-06, + "loss": 0.81699681, + "num_input_tokens_seen": 79781335, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.28881836, + "step": 3703, + "time_per_iteration": 2.903937816619873 + }, + { + "auxiliary_loss_clip": 0.01534618, + "auxiliary_loss_mlp": 0.01066673, + "balance_loss_clip": 1.32198787, + "balance_loss_mlp": 1.03675115, + "epoch": 0.22269652788215843, + "flos": 20785499521920.0, + "grad_norm": 2.027828373196379, + "language_loss": 0.76006395, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.78607684, + "num_input_tokens_seen": 79800150, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.29907227, + "step": 3704, + "time_per_iteration": 2.8594677448272705 + }, + { + "auxiliary_loss_clip": 0.01509844, + "auxiliary_loss_mlp": 0.01073549, + "balance_loss_clip": 1.30759084, + "balance_loss_mlp": 1.04626155, + "epoch": 0.2227566511348264, + "flos": 47975583640320.0, + "grad_norm": 1.7312814155415173, + "language_loss": 0.65309811, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.67893195, + "num_input_tokens_seen": 79822390, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.27331543, + "step": 3705, + "time_per_iteration": 3.0836493968963623 + }, + { + "auxiliary_loss_clip": 0.01327022, + "auxiliary_loss_mlp": 0.01080512, + "balance_loss_clip": 1.20234537, + "balance_loss_mlp": 1.04598927, + "epoch": 0.22281677438749437, + "flos": 66244903297920.0, + "grad_norm": 1.4423330771618499, + "language_loss": 0.6528728, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.67694807, + "num_input_tokens_seen": 79873350, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.34570312, + "step": 3706, + "time_per_iteration": 3.2226431369781494 + }, + { + "auxiliary_loss_clip": 0.01529484, + "auxiliary_loss_mlp": 0.01068963, + "balance_loss_clip": 1.31984937, + "balance_loss_mlp": 1.04134202, + "epoch": 0.22287689764016233, + "flos": 21881603502720.0, + "grad_norm": 2.049113596674448, + "language_loss": 0.81535149, + "learning_rate": 3.622281274977141e-06, + "loss": 0.84133589, + "num_input_tokens_seen": 79891715, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.27648926, + "step": 3707, + "time_per_iteration": 2.856508493423462 + }, + { + "auxiliary_loss_clip": 0.01515275, + "auxiliary_loss_mlp": 0.01070343, + "balance_loss_clip": 1.31057656, + "balance_loss_mlp": 1.04116035, + "epoch": 0.2229370208928303, + "flos": 27683878314240.0, + "grad_norm": 2.0399772428031526, + "language_loss": 0.79370618, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.81956238, + "num_input_tokens_seen": 79911175, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.29199219, + "step": 3708, + "time_per_iteration": 2.914273738861084 + }, + { + "auxiliary_loss_clip": 0.01538146, + "auxiliary_loss_mlp": 0.01069448, + "balance_loss_clip": 1.32525301, + "balance_loss_mlp": 1.0413624, + "epoch": 0.22299714414549826, + "flos": 30167002212480.0, + "grad_norm": 1.8900729925800335, + "language_loss": 0.81362772, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.83970368, + "num_input_tokens_seen": 79931875, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.28063965, + "step": 3709, + "time_per_iteration": 2.8960230350494385 + }, + { + "auxiliary_loss_clip": 0.01539169, + "auxiliary_loss_mlp": 0.01067588, + "balance_loss_clip": 1.32792854, + "balance_loss_mlp": 1.03916883, + "epoch": 0.22305726739816625, + "flos": 23152036273920.0, + "grad_norm": 1.7383643031380074, + "language_loss": 0.69278044, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.71884799, + "num_input_tokens_seen": 79952445, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.28393555, + "step": 3710, + "time_per_iteration": 2.89029598236084 + }, + { + "auxiliary_loss_clip": 0.01529957, + "auxiliary_loss_mlp": 0.01062262, + "balance_loss_clip": 1.31907535, + "balance_loss_mlp": 1.03349674, + "epoch": 0.22311739065083422, + "flos": 19181457705600.0, + "grad_norm": 1.983189018332301, + "language_loss": 0.91576064, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.94168282, + "num_input_tokens_seen": 79971030, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.28747559, + "step": 3711, + "time_per_iteration": 2.7950806617736816 + }, + { + "auxiliary_loss_clip": 0.01533214, + "auxiliary_loss_mlp": 0.01063422, + "balance_loss_clip": 1.32236636, + "balance_loss_mlp": 1.03501415, + "epoch": 0.22317751390350218, + "flos": 13623514669440.0, + "grad_norm": 2.7561736582618495, + "language_loss": 0.90810961, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.93407601, + "num_input_tokens_seen": 79982085, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.28430176, + "step": 3712, + "time_per_iteration": 4.2227325439453125 + }, + { + "auxiliary_loss_clip": 0.01515393, + "auxiliary_loss_mlp": 0.01061958, + "balance_loss_clip": 1.3127768, + "balance_loss_mlp": 1.0322032, + "epoch": 0.22323763715617015, + "flos": 11034617132160.0, + "grad_norm": 3.117126213710573, + "language_loss": 0.76435262, + "learning_rate": 3.620913505310117e-06, + "loss": 0.79012609, + "num_input_tokens_seen": 79997460, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.29724121, + "step": 3713, + "time_per_iteration": 2.7940170764923096 + }, + { + "auxiliary_loss_clip": 0.01528658, + "auxiliary_loss_mlp": 0.0105681, + "balance_loss_clip": 1.32027483, + "balance_loss_mlp": 1.02696049, + "epoch": 0.22329776040883811, + "flos": 41365677565440.0, + "grad_norm": 2.011605690957797, + "language_loss": 0.63568819, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.66154277, + "num_input_tokens_seen": 80022450, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.29833984, + "step": 3714, + "time_per_iteration": 3.0150184631347656 + }, + { + "auxiliary_loss_clip": 0.01530868, + "auxiliary_loss_mlp": 0.01046693, + "balance_loss_clip": 1.3231349, + "balance_loss_mlp": 1.01910758, + "epoch": 0.22335788366150608, + "flos": 25130479962240.0, + "grad_norm": 4.880643617434365, + "language_loss": 0.80048847, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.82626408, + "num_input_tokens_seen": 80042100, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.27587891, + "step": 3715, + "time_per_iteration": 2.8704912662506104 + }, + { + "auxiliary_loss_clip": 0.01526097, + "auxiliary_loss_mlp": 0.01051503, + "balance_loss_clip": 1.31759357, + "balance_loss_mlp": 1.02320325, + "epoch": 0.22341800691417404, + "flos": 16992326390400.0, + "grad_norm": 1.7505197049793304, + "language_loss": 0.78563833, + "learning_rate": 3.620228790579645e-06, + "loss": 0.81141424, + "num_input_tokens_seen": 80059690, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.28271484, + "step": 3716, + "time_per_iteration": 2.8237810134887695 + }, + { + "auxiliary_loss_clip": 0.01539449, + "auxiliary_loss_mlp": 0.01052589, + "balance_loss_clip": 1.32963288, + "balance_loss_mlp": 1.02354908, + "epoch": 0.22347813016684204, + "flos": 14144754476160.0, + "grad_norm": 2.04961603196267, + "language_loss": 0.80359203, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.82951236, + "num_input_tokens_seen": 80076060, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.29077148, + "step": 3717, + "time_per_iteration": 2.8803696632385254 + }, + { + "auxiliary_loss_clip": 0.01540288, + "auxiliary_loss_mlp": 0.01050632, + "balance_loss_clip": 1.32624543, + "balance_loss_mlp": 1.02226031, + "epoch": 0.22353825341951, + "flos": 23592731057280.0, + "grad_norm": 1.8158578987619551, + "language_loss": 0.6810385, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.70694768, + "num_input_tokens_seen": 80094760, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.28369141, + "step": 3718, + "time_per_iteration": 2.984872341156006 + }, + { + "auxiliary_loss_clip": 0.01532644, + "auxiliary_loss_mlp": 0.01051017, + "balance_loss_clip": 1.3212806, + "balance_loss_mlp": 1.02138114, + "epoch": 0.22359837667217797, + "flos": 29835926876160.0, + "grad_norm": 1.4559992634706882, + "language_loss": 0.8173362, + "learning_rate": 3.619543522896045e-06, + "loss": 0.84317279, + "num_input_tokens_seen": 80114475, + "router_z_loss_clip": 2.11425781, + "router_z_loss_mlp": 0.29638672, + "step": 3719, + "time_per_iteration": 2.9412968158721924 + }, + { + "auxiliary_loss_clip": 0.01551766, + "auxiliary_loss_mlp": 0.01048242, + "balance_loss_clip": 1.33589101, + "balance_loss_mlp": 1.01907194, + "epoch": 0.22365849992484593, + "flos": 17612055423360.0, + "grad_norm": 2.232010442755566, + "language_loss": 0.87658888, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.90258896, + "num_input_tokens_seen": 80132920, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.29162598, + "step": 3720, + "time_per_iteration": 4.215553522109985 + }, + { + "auxiliary_loss_clip": 0.01532494, + "auxiliary_loss_mlp": 0.01045035, + "balance_loss_clip": 1.32721972, + "balance_loss_mlp": 1.01535177, + "epoch": 0.2237186231775139, + "flos": 22721204856960.0, + "grad_norm": 2.1636039356843293, + "language_loss": 0.75079429, + "learning_rate": 3.619086370692945e-06, + "loss": 0.7765696, + "num_input_tokens_seen": 80152845, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.296875, + "step": 3721, + "time_per_iteration": 2.8520843982696533 + }, + { + "auxiliary_loss_clip": 0.0155088, + "auxiliary_loss_mlp": 0.01054266, + "balance_loss_clip": 1.33496654, + "balance_loss_mlp": 1.02541721, + "epoch": 0.22377874643018186, + "flos": 13379590097280.0, + "grad_norm": 11.694820842464114, + "language_loss": 0.7990551, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.82510662, + "num_input_tokens_seen": 80170680, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.28881836, + "step": 3722, + "time_per_iteration": 4.309507369995117 + }, + { + "auxiliary_loss_clip": 0.01538341, + "auxiliary_loss_mlp": 0.01048809, + "balance_loss_clip": 1.32884693, + "balance_loss_mlp": 1.02120066, + "epoch": 0.22383886968284986, + "flos": 17904011990400.0, + "grad_norm": 2.126990282034098, + "language_loss": 0.83411515, + "learning_rate": 3.618628972906178e-06, + "loss": 0.85998666, + "num_input_tokens_seen": 80189030, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.27600098, + "step": 3723, + "time_per_iteration": 4.243016719818115 + }, + { + "auxiliary_loss_clip": 0.01547693, + "auxiliary_loss_mlp": 0.01049168, + "balance_loss_clip": 1.33448362, + "balance_loss_mlp": 1.02045083, + "epoch": 0.22389899293551782, + "flos": 23889845531520.0, + "grad_norm": 1.7704934336216331, + "language_loss": 0.8582375, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.88420612, + "num_input_tokens_seen": 80208365, + "router_z_loss_clip": 2.13183594, + "router_z_loss_mlp": 0.28723145, + "step": 3724, + "time_per_iteration": 2.870314598083496 + }, + { + "auxiliary_loss_clip": 0.01537887, + "auxiliary_loss_mlp": 0.01049965, + "balance_loss_clip": 1.32806158, + "balance_loss_mlp": 1.02195072, + "epoch": 0.2239591161881858, + "flos": 27284066847360.0, + "grad_norm": 2.0011244910583916, + "language_loss": 0.80182076, + "learning_rate": 3.618171329605121e-06, + "loss": 0.8276993, + "num_input_tokens_seen": 80228685, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.28051758, + "step": 3725, + "time_per_iteration": 2.8906025886535645 + }, + { + "auxiliary_loss_clip": 0.01535217, + "auxiliary_loss_mlp": 0.01047398, + "balance_loss_clip": 1.3256222, + "balance_loss_mlp": 1.0187875, + "epoch": 0.22401923944085375, + "flos": 22247046904320.0, + "grad_norm": 2.549225671169216, + "language_loss": 0.77984941, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.80567551, + "num_input_tokens_seen": 80247635, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.28613281, + "step": 3726, + "time_per_iteration": 2.824796676635742 + }, + { + "auxiliary_loss_clip": 0.01569909, + "auxiliary_loss_mlp": 0.0105241, + "balance_loss_clip": 1.34903967, + "balance_loss_mlp": 1.02239263, + "epoch": 0.22407936269352172, + "flos": 12060265680000.0, + "grad_norm": 4.2315692056733285, + "language_loss": 0.74257636, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.76879954, + "num_input_tokens_seen": 80260045, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.30029297, + "step": 3727, + "time_per_iteration": 2.855062246322632 + }, + { + "auxiliary_loss_clip": 0.01550058, + "auxiliary_loss_mlp": 0.01050436, + "balance_loss_clip": 1.33492005, + "balance_loss_mlp": 1.02039528, + "epoch": 0.22413948594618968, + "flos": 19362754195200.0, + "grad_norm": 2.076897817432705, + "language_loss": 0.87962246, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.90562737, + "num_input_tokens_seen": 80277680, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.30029297, + "step": 3728, + "time_per_iteration": 2.8009896278381348 + }, + { + "auxiliary_loss_clip": 0.01544985, + "auxiliary_loss_mlp": 0.01050648, + "balance_loss_clip": 1.33253193, + "balance_loss_mlp": 1.02024961, + "epoch": 0.22419960919885765, + "flos": 24180535244160.0, + "grad_norm": 3.12615415539506, + "language_loss": 0.8177073, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.84366357, + "num_input_tokens_seen": 80294795, + "router_z_loss_clip": 2.12402344, + "router_z_loss_mlp": 0.30371094, + "step": 3729, + "time_per_iteration": 2.8567426204681396 + }, + { + "auxiliary_loss_clip": 0.0153295, + "auxiliary_loss_mlp": 0.01055105, + "balance_loss_clip": 1.32444918, + "balance_loss_mlp": 1.02718639, + "epoch": 0.22425973245152564, + "flos": 27389885731200.0, + "grad_norm": 1.6469958283355879, + "language_loss": 0.87690419, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.90278471, + "num_input_tokens_seen": 80315425, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.27905273, + "step": 3730, + "time_per_iteration": 2.886948347091675 + }, + { + "auxiliary_loss_clip": 0.01540269, + "auxiliary_loss_mlp": 0.0104761, + "balance_loss_clip": 1.33250141, + "balance_loss_mlp": 1.01888108, + "epoch": 0.2243198557041936, + "flos": 13743630910080.0, + "grad_norm": 1.8795527029717858, + "language_loss": 0.7484799, + "learning_rate": 3.616796927310559e-06, + "loss": 0.77435869, + "num_input_tokens_seen": 80333905, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.28710938, + "step": 3731, + "time_per_iteration": 2.8478658199310303 + }, + { + "auxiliary_loss_clip": 0.01564747, + "auxiliary_loss_mlp": 0.01046847, + "balance_loss_clip": 1.35088468, + "balance_loss_mlp": 1.01941705, + "epoch": 0.22437997895686157, + "flos": 19538576064000.0, + "grad_norm": 2.8598084193124893, + "language_loss": 0.76624972, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.79236567, + "num_input_tokens_seen": 80352165, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.27416992, + "step": 3732, + "time_per_iteration": 2.8200795650482178 + }, + { + "auxiliary_loss_clip": 0.01542485, + "auxiliary_loss_mlp": 0.01053702, + "balance_loss_clip": 1.33263206, + "balance_loss_mlp": 1.02506781, + "epoch": 0.22444010220952954, + "flos": 23706784494720.0, + "grad_norm": 1.6276970271148343, + "language_loss": 0.89356577, + "learning_rate": 3.616338302646873e-06, + "loss": 0.91952759, + "num_input_tokens_seen": 80371305, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.28662109, + "step": 3733, + "time_per_iteration": 2.858466625213623 + }, + { + "auxiliary_loss_clip": 0.01544236, + "auxiliary_loss_mlp": 0.01052947, + "balance_loss_clip": 1.33369946, + "balance_loss_mlp": 1.02207148, + "epoch": 0.2245002254621975, + "flos": 22393160922240.0, + "grad_norm": 1.5147335682843577, + "language_loss": 0.85221756, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.87818933, + "num_input_tokens_seen": 80391020, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.30883789, + "step": 3734, + "time_per_iteration": 2.8464205265045166 + }, + { + "auxiliary_loss_clip": 0.015403, + "auxiliary_loss_mlp": 0.01059528, + "balance_loss_clip": 1.33037925, + "balance_loss_mlp": 1.0311327, + "epoch": 0.22456034871486547, + "flos": 26953398714240.0, + "grad_norm": 1.922770622506954, + "language_loss": 0.77510822, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.80110651, + "num_input_tokens_seen": 80411365, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.28393555, + "step": 3735, + "time_per_iteration": 2.876671552658081 + }, + { + "auxiliary_loss_clip": 0.01531448, + "auxiliary_loss_mlp": 0.01050451, + "balance_loss_clip": 1.32588935, + "balance_loss_mlp": 1.02112556, + "epoch": 0.22462047196753343, + "flos": 28994651464320.0, + "grad_norm": 1.6350834856147414, + "language_loss": 0.85378397, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.87960291, + "num_input_tokens_seen": 80431075, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.29284668, + "step": 3736, + "time_per_iteration": 2.954570770263672 + }, + { + "auxiliary_loss_clip": 0.01556441, + "auxiliary_loss_mlp": 0.01045969, + "balance_loss_clip": 1.34338951, + "balance_loss_mlp": 1.01709652, + "epoch": 0.22468059522020142, + "flos": 20020968570240.0, + "grad_norm": 1.669360671691999, + "language_loss": 0.87051022, + "learning_rate": 3.615420317888586e-06, + "loss": 0.89653432, + "num_input_tokens_seen": 80449240, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.28881836, + "step": 3737, + "time_per_iteration": 2.85334849357605 + }, + { + "auxiliary_loss_clip": 0.01552174, + "auxiliary_loss_mlp": 0.01057983, + "balance_loss_clip": 1.33818412, + "balance_loss_mlp": 1.0264163, + "epoch": 0.2247407184728694, + "flos": 29326043514240.0, + "grad_norm": 2.4424319613994316, + "language_loss": 0.79944551, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.8255471, + "num_input_tokens_seen": 80467900, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.31567383, + "step": 3738, + "time_per_iteration": 2.961815118789673 + }, + { + "auxiliary_loss_clip": 0.01545414, + "auxiliary_loss_mlp": 0.01051719, + "balance_loss_clip": 1.33292365, + "balance_loss_mlp": 1.02363324, + "epoch": 0.22480084172553735, + "flos": 22320714718080.0, + "grad_norm": 1.6782565651325472, + "language_loss": 0.77466273, + "learning_rate": 3.614960957933224e-06, + "loss": 0.80063403, + "num_input_tokens_seen": 80487100, + "router_z_loss_clip": 2.12207031, + "router_z_loss_mlp": 0.28076172, + "step": 3739, + "time_per_iteration": 2.836958885192871 + }, + { + "auxiliary_loss_clip": 0.01534717, + "auxiliary_loss_mlp": 0.01052016, + "balance_loss_clip": 1.32427025, + "balance_loss_mlp": 1.01985335, + "epoch": 0.22486096497820532, + "flos": 25601742247680.0, + "grad_norm": 4.571255832643259, + "language_loss": 0.75497323, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.78084058, + "num_input_tokens_seen": 80508625, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.3215332, + "step": 3740, + "time_per_iteration": 2.8794212341308594 + }, + { + "auxiliary_loss_clip": 0.01526577, + "auxiliary_loss_mlp": 0.01047601, + "balance_loss_clip": 1.31819129, + "balance_loss_mlp": 1.01880002, + "epoch": 0.22492108823087328, + "flos": 17648866707840.0, + "grad_norm": 1.887467425843772, + "language_loss": 0.77047449, + "learning_rate": 3.614501353019939e-06, + "loss": 0.79621625, + "num_input_tokens_seen": 80527345, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.28796387, + "step": 3741, + "time_per_iteration": 2.8430638313293457 + }, + { + "auxiliary_loss_clip": 0.0153776, + "auxiliary_loss_mlp": 0.01047233, + "balance_loss_clip": 1.33054364, + "balance_loss_mlp": 1.01881313, + "epoch": 0.22498121148354125, + "flos": 16043331813120.0, + "grad_norm": 1.715347611607743, + "language_loss": 0.88517159, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.91102153, + "num_input_tokens_seen": 80545545, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.28381348, + "step": 3742, + "time_per_iteration": 2.911713123321533 + }, + { + "auxiliary_loss_clip": 0.01521661, + "auxiliary_loss_mlp": 0.01053344, + "balance_loss_clip": 1.31659913, + "balance_loss_mlp": 1.02294612, + "epoch": 0.22504133473620924, + "flos": 24034195002240.0, + "grad_norm": 2.4258360730147785, + "language_loss": 0.8243317, + "learning_rate": 3.614041503218444e-06, + "loss": 0.85008168, + "num_input_tokens_seen": 80565040, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.30419922, + "step": 3743, + "time_per_iteration": 3.0027434825897217 + }, + { + "auxiliary_loss_clip": 0.01534911, + "auxiliary_loss_mlp": 0.0104612, + "balance_loss_clip": 1.3240521, + "balance_loss_mlp": 1.01736689, + "epoch": 0.2251014579888772, + "flos": 16772725537920.0, + "grad_norm": 2.069787498989708, + "language_loss": 0.64260972, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.66841996, + "num_input_tokens_seen": 80582815, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.28735352, + "step": 3744, + "time_per_iteration": 2.8116295337677 + }, + { + "auxiliary_loss_clip": 0.01521203, + "auxiliary_loss_mlp": 0.01050282, + "balance_loss_clip": 1.31002629, + "balance_loss_mlp": 1.0218153, + "epoch": 0.22516158124154517, + "flos": 13999047661440.0, + "grad_norm": 3.0721771725421743, + "language_loss": 0.76982129, + "learning_rate": 3.613581408598489e-06, + "loss": 0.79553616, + "num_input_tokens_seen": 80600865, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.28466797, + "step": 3745, + "time_per_iteration": 2.823042392730713 + }, + { + "auxiliary_loss_clip": 0.01525826, + "auxiliary_loss_mlp": 0.01048829, + "balance_loss_clip": 1.31792748, + "balance_loss_mlp": 1.02054048, + "epoch": 0.22522170449421314, + "flos": 14397547029120.0, + "grad_norm": 1.8942995207489424, + "language_loss": 0.81737006, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.84311658, + "num_input_tokens_seen": 80617455, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.28295898, + "step": 3746, + "time_per_iteration": 4.264824628829956 + }, + { + "auxiliary_loss_clip": 0.01529026, + "auxiliary_loss_mlp": 0.0105502, + "balance_loss_clip": 1.31770778, + "balance_loss_mlp": 1.02620721, + "epoch": 0.2252818277468811, + "flos": 23816087228160.0, + "grad_norm": 2.2278963911297613, + "language_loss": 0.87581521, + "learning_rate": 3.613121069229862e-06, + "loss": 0.90165555, + "num_input_tokens_seen": 80635125, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.28820801, + "step": 3747, + "time_per_iteration": 2.819066286087036 + }, + { + "auxiliary_loss_clip": 0.01532456, + "auxiliary_loss_mlp": 0.01047654, + "balance_loss_clip": 1.32214904, + "balance_loss_mlp": 1.01928186, + "epoch": 0.22534195099954907, + "flos": 24728903948160.0, + "grad_norm": 1.8023029524812735, + "language_loss": 0.77460176, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.80040288, + "num_input_tokens_seen": 80656370, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.28417969, + "step": 3748, + "time_per_iteration": 2.922307252883911 + }, + { + "auxiliary_loss_clip": 0.01526722, + "auxiliary_loss_mlp": 0.01050875, + "balance_loss_clip": 1.31868196, + "balance_loss_mlp": 1.02147841, + "epoch": 0.22540207425221703, + "flos": 21042092638080.0, + "grad_norm": 1.9816409818737228, + "language_loss": 0.80710018, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.83287609, + "num_input_tokens_seen": 80676495, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.29418945, + "step": 3749, + "time_per_iteration": 2.8502230644226074 + }, + { + "auxiliary_loss_clip": 0.01512205, + "auxiliary_loss_mlp": 0.01048275, + "balance_loss_clip": 1.30581737, + "balance_loss_mlp": 1.01959372, + "epoch": 0.22546219750488503, + "flos": 19399655969280.0, + "grad_norm": 1.525089940026836, + "language_loss": 0.80334485, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.82894957, + "num_input_tokens_seen": 80694755, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.28686523, + "step": 3750, + "time_per_iteration": 2.8381307125091553 + }, + { + "auxiliary_loss_clip": 0.01539488, + "auxiliary_loss_mlp": 0.01055856, + "balance_loss_clip": 1.32651067, + "balance_loss_mlp": 1.02750778, + "epoch": 0.225522320757553, + "flos": 25203695328000.0, + "grad_norm": 5.416841267111513, + "language_loss": 0.83306801, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.85902143, + "num_input_tokens_seen": 80713670, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.28356934, + "step": 3751, + "time_per_iteration": 2.883805751800537 + }, + { + "auxiliary_loss_clip": 0.01537812, + "auxiliary_loss_mlp": 0.01054101, + "balance_loss_clip": 1.33054662, + "balance_loss_mlp": 1.02563381, + "epoch": 0.22558244401022096, + "flos": 17171405884800.0, + "grad_norm": 2.5850435224094546, + "language_loss": 0.84657884, + "learning_rate": 3.611969150491165e-06, + "loss": 0.87249798, + "num_input_tokens_seen": 80731450, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.28466797, + "step": 3752, + "time_per_iteration": 2.7997288703918457 + }, + { + "auxiliary_loss_clip": 0.01519272, + "auxiliary_loss_mlp": 0.01040973, + "balance_loss_clip": 1.3126328, + "balance_loss_mlp": 1.01533091, + "epoch": 0.22564256726288892, + "flos": 15239229644160.0, + "grad_norm": 1.7513019381399806, + "language_loss": 0.79261029, + "learning_rate": 3.611738583330375e-06, + "loss": 0.81821269, + "num_input_tokens_seen": 80748415, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.25634766, + "step": 3753, + "time_per_iteration": 2.832828998565674 + }, + { + "auxiliary_loss_clip": 0.0152849, + "auxiliary_loss_mlp": 0.01045703, + "balance_loss_clip": 1.32181883, + "balance_loss_mlp": 1.01767659, + "epoch": 0.2257026905155569, + "flos": 34581940392960.0, + "grad_norm": 1.7818187463908335, + "language_loss": 0.79554057, + "learning_rate": 3.611507955052295e-06, + "loss": 0.82128251, + "num_input_tokens_seen": 80770835, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.28039551, + "step": 3754, + "time_per_iteration": 2.9197747707366943 + }, + { + "auxiliary_loss_clip": 0.01521631, + "auxiliary_loss_mlp": 0.01048182, + "balance_loss_clip": 1.31670439, + "balance_loss_mlp": 1.01995289, + "epoch": 0.22576281376822485, + "flos": 19948069918080.0, + "grad_norm": 1.7790457126556039, + "language_loss": 0.7103247, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.73602283, + "num_input_tokens_seen": 80787840, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.28234863, + "step": 3755, + "time_per_iteration": 4.252835512161255 + }, + { + "auxiliary_loss_clip": 0.0153807, + "auxiliary_loss_mlp": 0.01053668, + "balance_loss_clip": 1.32546556, + "balance_loss_mlp": 1.02601171, + "epoch": 0.22582293702089282, + "flos": 24611140437120.0, + "grad_norm": 2.280353507914846, + "language_loss": 0.7817024, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.80761975, + "num_input_tokens_seen": 80806335, + "router_z_loss_clip": 2.12597656, + "router_z_loss_mlp": 0.27685547, + "step": 3756, + "time_per_iteration": 2.9065020084381104 + }, + { + "auxiliary_loss_clip": 0.01549891, + "auxiliary_loss_mlp": 0.01043932, + "balance_loss_clip": 1.33700562, + "balance_loss_mlp": 1.01653814, + "epoch": 0.2258830602735608, + "flos": 23044769556480.0, + "grad_norm": 13.18143871256427, + "language_loss": 0.83606517, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.86200339, + "num_input_tokens_seen": 80825355, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.27404785, + "step": 3757, + "time_per_iteration": 4.252346515655518 + }, + { + "auxiliary_loss_clip": 0.01543328, + "auxiliary_loss_mlp": 0.01043795, + "balance_loss_clip": 1.33022404, + "balance_loss_mlp": 1.01597166, + "epoch": 0.22594318352622877, + "flos": 22167180552960.0, + "grad_norm": 2.3032334805131, + "language_loss": 0.73818803, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.76405931, + "num_input_tokens_seen": 80842570, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.27832031, + "step": 3758, + "time_per_iteration": 4.20307469367981 + }, + { + "auxiliary_loss_clip": 0.01532222, + "auxiliary_loss_mlp": 0.01048852, + "balance_loss_clip": 1.32114172, + "balance_loss_mlp": 1.02086139, + "epoch": 0.22600330677889674, + "flos": 20603886318720.0, + "grad_norm": 2.182321432903683, + "language_loss": 0.77273977, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.79855049, + "num_input_tokens_seen": 80858745, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.27978516, + "step": 3759, + "time_per_iteration": 2.8294897079467773 + }, + { + "auxiliary_loss_clip": 0.01539222, + "auxiliary_loss_mlp": 0.01049388, + "balance_loss_clip": 1.32678676, + "balance_loss_mlp": 1.02207673, + "epoch": 0.2260634300315647, + "flos": 35672479263360.0, + "grad_norm": 1.711283843432931, + "language_loss": 0.78981626, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.81570238, + "num_input_tokens_seen": 80880085, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.2734375, + "step": 3760, + "time_per_iteration": 2.969528913497925 + }, + { + "auxiliary_loss_clip": 0.0129122, + "auxiliary_loss_mlp": 0.01043986, + "balance_loss_clip": 1.16764665, + "balance_loss_mlp": 1.01461303, + "epoch": 0.22612355328423267, + "flos": 72117226339200.0, + "grad_norm": 0.954599524700344, + "language_loss": 0.60130632, + "learning_rate": 3.609891846556569e-06, + "loss": 0.62465835, + "num_input_tokens_seen": 80937660, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.29296875, + "step": 3761, + "time_per_iteration": 3.2857141494750977 + }, + { + "auxiliary_loss_clip": 0.01551985, + "auxiliary_loss_mlp": 0.01055522, + "balance_loss_clip": 1.33575988, + "balance_loss_mlp": 1.02957034, + "epoch": 0.22618367653690064, + "flos": 22794013019520.0, + "grad_norm": 2.9064264919473635, + "language_loss": 0.77778077, + "learning_rate": 3.609660729655211e-06, + "loss": 0.80385584, + "num_input_tokens_seen": 80956265, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.25927734, + "step": 3762, + "time_per_iteration": 2.872756004333496 + }, + { + "auxiliary_loss_clip": 0.0155121, + "auxiliary_loss_mlp": 0.01044578, + "balance_loss_clip": 1.33875978, + "balance_loss_mlp": 1.0177803, + "epoch": 0.22624379978956863, + "flos": 20457772300800.0, + "grad_norm": 3.42500249029195, + "language_loss": 0.79914749, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.82510543, + "num_input_tokens_seen": 80975185, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.26831055, + "step": 3763, + "time_per_iteration": 2.823897123336792 + }, + { + "auxiliary_loss_clip": 0.01553856, + "auxiliary_loss_mlp": 0.01054136, + "balance_loss_clip": 1.33790994, + "balance_loss_mlp": 1.02649117, + "epoch": 0.2263039230422366, + "flos": 17503431361920.0, + "grad_norm": 1.713214065018996, + "language_loss": 0.92705798, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.95313793, + "num_input_tokens_seen": 80992830, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.27661133, + "step": 3764, + "time_per_iteration": 2.8070759773254395 + }, + { + "auxiliary_loss_clip": 0.01534094, + "auxiliary_loss_mlp": 0.01053869, + "balance_loss_clip": 1.32784247, + "balance_loss_mlp": 1.02797627, + "epoch": 0.22636404629490456, + "flos": 28341459262080.0, + "grad_norm": 1.6772547502973378, + "language_loss": 0.75946146, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.78534108, + "num_input_tokens_seen": 81013675, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.25878906, + "step": 3765, + "time_per_iteration": 2.8975167274475098 + }, + { + "auxiliary_loss_clip": 0.01540107, + "auxiliary_loss_mlp": 0.01055675, + "balance_loss_clip": 1.33289647, + "balance_loss_mlp": 1.02965176, + "epoch": 0.22642416954757252, + "flos": 17496916110720.0, + "grad_norm": 1.88371499543693, + "language_loss": 0.91106641, + "learning_rate": 3.608735651752494e-06, + "loss": 0.93702424, + "num_input_tokens_seen": 81030345, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.26013184, + "step": 3766, + "time_per_iteration": 2.7890732288360596 + }, + { + "auxiliary_loss_clip": 0.01533339, + "auxiliary_loss_mlp": 0.01054196, + "balance_loss_clip": 1.33021259, + "balance_loss_mlp": 1.02726626, + "epoch": 0.2264842928002405, + "flos": 24393982803840.0, + "grad_norm": 1.5401573321021198, + "language_loss": 0.7571919, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.78306723, + "num_input_tokens_seen": 81051000, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.26940918, + "step": 3767, + "time_per_iteration": 2.8692991733551025 + }, + { + "auxiliary_loss_clip": 0.01553821, + "auxiliary_loss_mlp": 0.01061337, + "balance_loss_clip": 1.34168208, + "balance_loss_mlp": 1.03459883, + "epoch": 0.22654441605290845, + "flos": 19840531731840.0, + "grad_norm": 1.4938241348761294, + "language_loss": 0.72595799, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.75210965, + "num_input_tokens_seen": 81071205, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.26733398, + "step": 3768, + "time_per_iteration": 2.849514961242676 + }, + { + "auxiliary_loss_clip": 0.01539211, + "auxiliary_loss_mlp": 0.01060941, + "balance_loss_clip": 1.33475351, + "balance_loss_mlp": 1.03492975, + "epoch": 0.22660453930557642, + "flos": 27466177743360.0, + "grad_norm": 1.7611297152614769, + "language_loss": 0.78781283, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.8138144, + "num_input_tokens_seen": 81091880, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.26013184, + "step": 3769, + "time_per_iteration": 2.9468629360198975 + }, + { + "auxiliary_loss_clip": 0.01538642, + "auxiliary_loss_mlp": 0.01060281, + "balance_loss_clip": 1.32757461, + "balance_loss_mlp": 1.03390026, + "epoch": 0.2266646625582444, + "flos": 23998786306560.0, + "grad_norm": 2.6881760447460725, + "language_loss": 0.70133084, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.72732002, + "num_input_tokens_seen": 81113290, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.26391602, + "step": 3770, + "time_per_iteration": 2.8608558177948 + }, + { + "auxiliary_loss_clip": 0.01552444, + "auxiliary_loss_mlp": 0.01062649, + "balance_loss_clip": 1.34342861, + "balance_loss_mlp": 1.03808022, + "epoch": 0.22672478581091238, + "flos": 26038364999040.0, + "grad_norm": 1.6779546908087195, + "language_loss": 0.80978119, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.83593214, + "num_input_tokens_seen": 81133535, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.24584961, + "step": 3771, + "time_per_iteration": 2.8651278018951416 + }, + { + "auxiliary_loss_clip": 0.01529995, + "auxiliary_loss_mlp": 0.01057275, + "balance_loss_clip": 1.32688236, + "balance_loss_mlp": 1.03189528, + "epoch": 0.22678490906358034, + "flos": 23852310330240.0, + "grad_norm": 1.4786098606571236, + "language_loss": 0.79192054, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.81779325, + "num_input_tokens_seen": 81154650, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.25378418, + "step": 3772, + "time_per_iteration": 2.885185480117798 + }, + { + "auxiliary_loss_clip": 0.0129708, + "auxiliary_loss_mlp": 0.01034669, + "balance_loss_clip": 1.17372942, + "balance_loss_mlp": 1.01187575, + "epoch": 0.2268450323162483, + "flos": 65082343409280.0, + "grad_norm": 0.8540738676136503, + "language_loss": 0.54510063, + "learning_rate": 3.607114417129261e-06, + "loss": 0.56841815, + "num_input_tokens_seen": 81221240, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.22753906, + "step": 3773, + "time_per_iteration": 3.4410598278045654 + }, + { + "auxiliary_loss_clip": 0.01530119, + "auxiliary_loss_mlp": 0.01049097, + "balance_loss_clip": 1.32599497, + "balance_loss_mlp": 1.02421844, + "epoch": 0.22690515556891627, + "flos": 22535655356160.0, + "grad_norm": 1.652865665104445, + "language_loss": 0.71300256, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.7387948, + "num_input_tokens_seen": 81241520, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.24902344, + "step": 3774, + "time_per_iteration": 2.8518316745758057 + }, + { + "auxiliary_loss_clip": 0.0153746, + "auxiliary_loss_mlp": 0.01049573, + "balance_loss_clip": 1.33238626, + "balance_loss_mlp": 1.02494383, + "epoch": 0.22696527882158424, + "flos": 18232598862720.0, + "grad_norm": 1.8973620743449378, + "language_loss": 0.74713433, + "learning_rate": 3.606650658627658e-06, + "loss": 0.77300471, + "num_input_tokens_seen": 81256825, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.24633789, + "step": 3775, + "time_per_iteration": 2.814811944961548 + }, + { + "auxiliary_loss_clip": 0.01515778, + "auxiliary_loss_mlp": 0.0104858, + "balance_loss_clip": 1.31195414, + "balance_loss_mlp": 1.02357018, + "epoch": 0.22702540207425223, + "flos": 17028051799680.0, + "grad_norm": 1.804581600827397, + "language_loss": 0.8418473, + "learning_rate": 3.606418687985928e-06, + "loss": 0.86749089, + "num_input_tokens_seen": 81275695, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.25012207, + "step": 3776, + "time_per_iteration": 2.8440778255462646 + }, + { + "auxiliary_loss_clip": 0.01535956, + "auxiliary_loss_mlp": 0.01049977, + "balance_loss_clip": 1.32706952, + "balance_loss_mlp": 1.02407324, + "epoch": 0.2270855253269202, + "flos": 21335949486720.0, + "grad_norm": 1.939496226483114, + "language_loss": 0.83524668, + "learning_rate": 3.606186656428641e-06, + "loss": 0.86110598, + "num_input_tokens_seen": 81294920, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.2590332, + "step": 3777, + "time_per_iteration": 2.85398530960083 + }, + { + "auxiliary_loss_clip": 0.01541398, + "auxiliary_loss_mlp": 0.01047962, + "balance_loss_clip": 1.33302307, + "balance_loss_mlp": 1.02247524, + "epoch": 0.22714564857958816, + "flos": 23561032435200.0, + "grad_norm": 2.4028679252884175, + "language_loss": 0.74043608, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.76632971, + "num_input_tokens_seen": 81314275, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.25500488, + "step": 3778, + "time_per_iteration": 2.890629529953003 + }, + { + "auxiliary_loss_clip": 0.0153561, + "auxiliary_loss_mlp": 0.01041788, + "balance_loss_clip": 1.32413578, + "balance_loss_mlp": 1.01676619, + "epoch": 0.22720577183225613, + "flos": 25999743922560.0, + "grad_norm": 2.1086742078639182, + "language_loss": 0.65628982, + "learning_rate": 3.605722410602591e-06, + "loss": 0.68206376, + "num_input_tokens_seen": 81333890, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.25, + "step": 3779, + "time_per_iteration": 2.8753128051757812 + }, + { + "auxiliary_loss_clip": 0.01521602, + "auxiliary_loss_mlp": 0.01046344, + "balance_loss_clip": 1.3178333, + "balance_loss_mlp": 1.02120304, + "epoch": 0.2272658950849241, + "flos": 20823939619200.0, + "grad_norm": 1.8408528013001781, + "language_loss": 0.71956348, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.74524289, + "num_input_tokens_seen": 81353640, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.25170898, + "step": 3780, + "time_per_iteration": 2.8263442516326904 + }, + { + "auxiliary_loss_clip": 0.01526005, + "auxiliary_loss_mlp": 0.01045312, + "balance_loss_clip": 1.32105732, + "balance_loss_mlp": 1.01834726, + "epoch": 0.22732601833759206, + "flos": 23918422262400.0, + "grad_norm": 1.615438358259811, + "language_loss": 0.90308058, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.92879373, + "num_input_tokens_seen": 81371595, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.26977539, + "step": 3781, + "time_per_iteration": 4.281549453735352 + }, + { + "auxiliary_loss_clip": 0.01523221, + "auxiliary_loss_mlp": 0.01044097, + "balance_loss_clip": 1.31571126, + "balance_loss_mlp": 1.0185982, + "epoch": 0.22738614159026002, + "flos": 15932852714880.0, + "grad_norm": 2.024344874085057, + "language_loss": 0.75779092, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.78346407, + "num_input_tokens_seen": 81388435, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.25524902, + "step": 3782, + "time_per_iteration": 2.845776319503784 + }, + { + "auxiliary_loss_clip": 0.01508731, + "auxiliary_loss_mlp": 0.01046256, + "balance_loss_clip": 1.30632675, + "balance_loss_mlp": 1.0225209, + "epoch": 0.22744626484292801, + "flos": 24215808205440.0, + "grad_norm": 3.1051367821642604, + "language_loss": 0.83631891, + "learning_rate": 3.604793188351095e-06, + "loss": 0.86186874, + "num_input_tokens_seen": 81410195, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.23730469, + "step": 3783, + "time_per_iteration": 2.897343397140503 + }, + { + "auxiliary_loss_clip": 0.01515332, + "auxiliary_loss_mlp": 0.01047023, + "balance_loss_clip": 1.30939543, + "balance_loss_mlp": 1.02172649, + "epoch": 0.22750638809559598, + "flos": 24802662251520.0, + "grad_norm": 1.6407136672388425, + "language_loss": 0.76511174, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.7907353, + "num_input_tokens_seen": 81430060, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.2532959, + "step": 3784, + "time_per_iteration": 2.879348039627075 + }, + { + "auxiliary_loss_clip": 0.01519427, + "auxiliary_loss_mlp": 0.01045311, + "balance_loss_clip": 1.31431389, + "balance_loss_mlp": 1.0195384, + "epoch": 0.22756651134826394, + "flos": 22246594456320.0, + "grad_norm": 1.54643732950845, + "language_loss": 0.71536744, + "learning_rate": 3.604328212066594e-06, + "loss": 0.74101484, + "num_input_tokens_seen": 81447375, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.25756836, + "step": 3785, + "time_per_iteration": 2.9326696395874023 + }, + { + "auxiliary_loss_clip": 0.01283158, + "auxiliary_loss_mlp": 0.01030409, + "balance_loss_clip": 1.16936898, + "balance_loss_mlp": 1.00980949, + "epoch": 0.2276266346009319, + "flos": 62739677928960.0, + "grad_norm": 0.8190006572904106, + "language_loss": 0.61987746, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.64301312, + "num_input_tokens_seen": 81505235, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.20605469, + "step": 3786, + "time_per_iteration": 3.3358380794525146 + }, + { + "auxiliary_loss_clip": 0.01527153, + "auxiliary_loss_mlp": 0.01044492, + "balance_loss_clip": 1.3194809, + "balance_loss_mlp": 1.01879096, + "epoch": 0.22768675785359987, + "flos": 18620375212800.0, + "grad_norm": 2.4930385852486303, + "language_loss": 0.88609755, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.91181397, + "num_input_tokens_seen": 81518685, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.25744629, + "step": 3787, + "time_per_iteration": 2.819997787475586 + }, + { + "auxiliary_loss_clip": 0.01514061, + "auxiliary_loss_mlp": 0.01049403, + "balance_loss_clip": 1.31169367, + "balance_loss_mlp": 1.0261215, + "epoch": 0.22774688110626784, + "flos": 26881495447680.0, + "grad_norm": 1.258954494919232, + "language_loss": 0.7344901, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.7601248, + "num_input_tokens_seen": 81538940, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.23278809, + "step": 3788, + "time_per_iteration": 2.9018099308013916 + }, + { + "auxiliary_loss_clip": 0.01517587, + "auxiliary_loss_mlp": 0.0104005, + "balance_loss_clip": 1.31217909, + "balance_loss_mlp": 1.01520729, + "epoch": 0.2278070043589358, + "flos": 15560396369280.0, + "grad_norm": 2.2962587955421405, + "language_loss": 0.69134581, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.71692216, + "num_input_tokens_seen": 81555525, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.24853516, + "step": 3789, + "time_per_iteration": 2.7884645462036133 + }, + { + "auxiliary_loss_clip": 0.01520704, + "auxiliary_loss_mlp": 0.01044189, + "balance_loss_clip": 1.31639504, + "balance_loss_mlp": 1.01914287, + "epoch": 0.2278671276116038, + "flos": 22426397867520.0, + "grad_norm": 1.9281422678271343, + "language_loss": 0.77082491, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.79647386, + "num_input_tokens_seen": 81576305, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.25036621, + "step": 3790, + "time_per_iteration": 4.3906683921813965 + }, + { + "auxiliary_loss_clip": 0.0150886, + "auxiliary_loss_mlp": 0.01045698, + "balance_loss_clip": 1.30742908, + "balance_loss_mlp": 1.02067637, + "epoch": 0.22792725086427176, + "flos": 20641104806400.0, + "grad_norm": 1.8882639228446665, + "language_loss": 0.91806614, + "learning_rate": 3.602931823424522e-06, + "loss": 0.94361162, + "num_input_tokens_seen": 81594115, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.25012207, + "step": 3791, + "time_per_iteration": 2.8177073001861572 + }, + { + "auxiliary_loss_clip": 0.01516961, + "auxiliary_loss_mlp": 0.01041766, + "balance_loss_clip": 1.30892777, + "balance_loss_mlp": 1.0182817, + "epoch": 0.22798737411693973, + "flos": 31440918833280.0, + "grad_norm": 2.338137298927301, + "language_loss": 0.83888352, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.86447084, + "num_input_tokens_seen": 81615355, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.23486328, + "step": 3792, + "time_per_iteration": 4.303308010101318 + }, + { + "auxiliary_loss_clip": 0.01299784, + "auxiliary_loss_mlp": 0.0105803, + "balance_loss_clip": 1.18557048, + "balance_loss_mlp": 1.03695345, + "epoch": 0.2280474973696077, + "flos": 52420564368000.0, + "grad_norm": 1.1502437050314154, + "language_loss": 0.65722501, + "learning_rate": 3.602465874182981e-06, + "loss": 0.68080318, + "num_input_tokens_seen": 81662075, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.2109375, + "step": 3793, + "time_per_iteration": 4.536312818527222 + }, + { + "auxiliary_loss_clip": 0.01554919, + "auxiliary_loss_mlp": 0.01048281, + "balance_loss_clip": 1.33965635, + "balance_loss_mlp": 1.02343822, + "epoch": 0.22810762062227566, + "flos": 26407292250240.0, + "grad_norm": 2.7371352098440367, + "language_loss": 0.78309703, + "learning_rate": 3.602232808409293e-06, + "loss": 0.809129, + "num_input_tokens_seen": 81681625, + "router_z_loss_clip": 2.15527344, + "router_z_loss_mlp": 0.24853516, + "step": 3794, + "time_per_iteration": 2.9299237728118896 + }, + { + "auxiliary_loss_clip": 0.01529926, + "auxiliary_loss_mlp": 0.01043613, + "balance_loss_clip": 1.32226872, + "balance_loss_mlp": 1.01984262, + "epoch": 0.22816774387494362, + "flos": 25641403954560.0, + "grad_norm": 2.6955563051320888, + "language_loss": 0.81579369, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.84152907, + "num_input_tokens_seen": 81701170, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.23791504, + "step": 3795, + "time_per_iteration": 2.8729264736175537 + }, + { + "auxiliary_loss_clip": 0.01511653, + "auxiliary_loss_mlp": 0.01052372, + "balance_loss_clip": 1.30880344, + "balance_loss_mlp": 1.02720702, + "epoch": 0.22822786712761162, + "flos": 22460765932800.0, + "grad_norm": 1.8015410200842634, + "language_loss": 0.77992463, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.80556488, + "num_input_tokens_seen": 81721265, + "router_z_loss_clip": 2.02636719, + "router_z_loss_mlp": 0.25170898, + "step": 3796, + "time_per_iteration": 2.9117472171783447 + }, + { + "auxiliary_loss_clip": 0.01518918, + "auxiliary_loss_mlp": 0.0104227, + "balance_loss_clip": 1.31307411, + "balance_loss_mlp": 1.01817763, + "epoch": 0.22828799038027958, + "flos": 12209954037120.0, + "grad_norm": 2.374112670172366, + "language_loss": 0.96527576, + "learning_rate": 3.6015332465826188e-06, + "loss": 0.99088758, + "num_input_tokens_seen": 81736565, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.24084473, + "step": 3797, + "time_per_iteration": 2.956239938735962 + }, + { + "auxiliary_loss_clip": 0.01511443, + "auxiliary_loss_mlp": 0.01051763, + "balance_loss_clip": 1.30681276, + "balance_loss_mlp": 1.02715802, + "epoch": 0.22834811363294755, + "flos": 22094508124800.0, + "grad_norm": 1.5635256360694203, + "language_loss": 0.8264519, + "learning_rate": 3.601299937834666e-06, + "loss": 0.85208392, + "num_input_tokens_seen": 81756240, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.24597168, + "step": 3798, + "time_per_iteration": 2.912991762161255 + }, + { + "auxiliary_loss_clip": 0.01515723, + "auxiliary_loss_mlp": 0.01043174, + "balance_loss_clip": 1.30821931, + "balance_loss_mlp": 1.01783037, + "epoch": 0.2284082368856155, + "flos": 24870674465280.0, + "grad_norm": 2.555309979074055, + "language_loss": 0.79915035, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.82473934, + "num_input_tokens_seen": 81775720, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.25354004, + "step": 3799, + "time_per_iteration": 2.892906904220581 + }, + { + "auxiliary_loss_clip": 0.01511826, + "auxiliary_loss_mlp": 0.01050382, + "balance_loss_clip": 1.30658972, + "balance_loss_mlp": 1.02520478, + "epoch": 0.22846836013828348, + "flos": 23302810506240.0, + "grad_norm": 1.4736745245906775, + "language_loss": 0.75538671, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.78100872, + "num_input_tokens_seen": 81795830, + "router_z_loss_clip": 2.05371094, + "router_z_loss_mlp": 0.25170898, + "step": 3800, + "time_per_iteration": 2.865077257156372 + }, + { + "auxiliary_loss_clip": 0.01506285, + "auxiliary_loss_mlp": 0.01046678, + "balance_loss_clip": 1.30257344, + "balance_loss_mlp": 1.02226424, + "epoch": 0.22852848339095144, + "flos": 27427420932480.0, + "grad_norm": 3.8275672034762995, + "language_loss": 0.64806861, + "learning_rate": 3.600599647297484e-06, + "loss": 0.67359817, + "num_input_tokens_seen": 81815745, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.24438477, + "step": 3801, + "time_per_iteration": 2.9472155570983887 + }, + { + "auxiliary_loss_clip": 0.01507876, + "auxiliary_loss_mlp": 0.01044503, + "balance_loss_clip": 1.30689156, + "balance_loss_mlp": 1.02093506, + "epoch": 0.2285886066436194, + "flos": 26331859889280.0, + "grad_norm": 1.5824162300255884, + "language_loss": 0.82478034, + "learning_rate": 3.60036609571682e-06, + "loss": 0.85030413, + "num_input_tokens_seen": 81835155, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.23571777, + "step": 3802, + "time_per_iteration": 2.8514151573181152 + }, + { + "auxiliary_loss_clip": 0.01519531, + "auxiliary_loss_mlp": 0.01050161, + "balance_loss_clip": 1.31310093, + "balance_loss_mlp": 1.02579427, + "epoch": 0.2286487298962874, + "flos": 29728614913920.0, + "grad_norm": 1.6975647854672962, + "language_loss": 0.80000943, + "learning_rate": 3.600132483450114e-06, + "loss": 0.82570636, + "num_input_tokens_seen": 81855655, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.24365234, + "step": 3803, + "time_per_iteration": 2.894819498062134 + }, + { + "auxiliary_loss_clip": 0.01523605, + "auxiliary_loss_mlp": 0.01047469, + "balance_loss_clip": 1.31516576, + "balance_loss_mlp": 1.02104068, + "epoch": 0.22870885314895537, + "flos": 21296875962240.0, + "grad_norm": 1.623900769825681, + "language_loss": 0.86255479, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.88826549, + "num_input_tokens_seen": 81876385, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.26428223, + "step": 3804, + "time_per_iteration": 2.826904773712158 + }, + { + "auxiliary_loss_clip": 0.01520119, + "auxiliary_loss_mlp": 0.01043168, + "balance_loss_clip": 1.3101176, + "balance_loss_mlp": 1.01821721, + "epoch": 0.22876897640162333, + "flos": 14947680280320.0, + "grad_norm": 2.1463268173569667, + "language_loss": 0.77824879, + "learning_rate": 3.59966507689401e-06, + "loss": 0.80388165, + "num_input_tokens_seen": 81893225, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.24963379, + "step": 3805, + "time_per_iteration": 2.832686424255371 + }, + { + "auxiliary_loss_clip": 0.01535721, + "auxiliary_loss_mlp": 0.01051242, + "balance_loss_clip": 1.32509029, + "balance_loss_mlp": 1.02555239, + "epoch": 0.2288290996542913, + "flos": 18123024660480.0, + "grad_norm": 2.8446736156739196, + "language_loss": 0.805453, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.83132261, + "num_input_tokens_seen": 81911350, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.25708008, + "step": 3806, + "time_per_iteration": 2.849250316619873 + }, + { + "auxiliary_loss_clip": 0.0152286, + "auxiliary_loss_mlp": 0.01052118, + "balance_loss_clip": 1.31543541, + "balance_loss_mlp": 1.02667928, + "epoch": 0.22888922290695926, + "flos": 39869173935360.0, + "grad_norm": 2.0760826761741153, + "language_loss": 0.7103883, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.73613811, + "num_input_tokens_seen": 81935420, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.2545166, + "step": 3807, + "time_per_iteration": 3.0215892791748047 + }, + { + "auxiliary_loss_clip": 0.01529003, + "auxiliary_loss_mlp": 0.01053, + "balance_loss_clip": 1.31840932, + "balance_loss_mlp": 1.02702379, + "epoch": 0.22894934615962723, + "flos": 23414104010880.0, + "grad_norm": 2.438672231817149, + "language_loss": 0.66775322, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.6935733, + "num_input_tokens_seen": 81953845, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.2598877, + "step": 3808, + "time_per_iteration": 2.8938193321228027 + }, + { + "auxiliary_loss_clip": 0.0153659, + "auxiliary_loss_mlp": 0.01054024, + "balance_loss_clip": 1.32715023, + "balance_loss_mlp": 1.02870417, + "epoch": 0.22900946941229522, + "flos": 18852146916480.0, + "grad_norm": 1.7358063062615237, + "language_loss": 0.76403511, + "learning_rate": 3.598729535939222e-06, + "loss": 0.78994119, + "num_input_tokens_seen": 81972100, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.25341797, + "step": 3809, + "time_per_iteration": 2.8038406372070312 + }, + { + "auxiliary_loss_clip": 0.014995, + "auxiliary_loss_mlp": 0.01051466, + "balance_loss_clip": 1.29721606, + "balance_loss_mlp": 1.02633667, + "epoch": 0.22906959266496318, + "flos": 22939584099840.0, + "grad_norm": 1.710742637321474, + "language_loss": 0.82467496, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.85018462, + "num_input_tokens_seen": 81992760, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.25170898, + "step": 3810, + "time_per_iteration": 2.878061532974243 + }, + { + "auxiliary_loss_clip": 0.01515901, + "auxiliary_loss_mlp": 0.01043397, + "balance_loss_clip": 1.31073105, + "balance_loss_mlp": 1.01882839, + "epoch": 0.22912971591763115, + "flos": 19363975804800.0, + "grad_norm": 1.8663052651893375, + "language_loss": 0.79596245, + "learning_rate": 3.598261401682441e-06, + "loss": 0.82155538, + "num_input_tokens_seen": 82009080, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.24560547, + "step": 3811, + "time_per_iteration": 2.868255615234375 + }, + { + "auxiliary_loss_clip": 0.01508975, + "auxiliary_loss_mlp": 0.01044566, + "balance_loss_clip": 1.30397236, + "balance_loss_mlp": 1.01962721, + "epoch": 0.22918983917029911, + "flos": 19941916625280.0, + "grad_norm": 1.7012535544235612, + "language_loss": 0.83682203, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.86235744, + "num_input_tokens_seen": 82026705, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.24926758, + "step": 3812, + "time_per_iteration": 2.8483266830444336 + }, + { + "auxiliary_loss_clip": 0.01535754, + "auxiliary_loss_mlp": 0.01050638, + "balance_loss_clip": 1.32394958, + "balance_loss_mlp": 1.0255568, + "epoch": 0.22924996242296708, + "flos": 16699600661760.0, + "grad_norm": 2.3808113378939275, + "language_loss": 0.85594612, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.88181007, + "num_input_tokens_seen": 82043245, + "router_z_loss_clip": 2.12207031, + "router_z_loss_mlp": 0.25097656, + "step": 3813, + "time_per_iteration": 2.8966779708862305 + }, + { + "auxiliary_loss_clip": 0.01512091, + "auxiliary_loss_mlp": 0.01047555, + "balance_loss_clip": 1.30658317, + "balance_loss_mlp": 1.02247357, + "epoch": 0.22931008567563504, + "flos": 33049032681600.0, + "grad_norm": 1.7129411647859194, + "language_loss": 0.71444857, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.74004507, + "num_input_tokens_seen": 82066870, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.25061035, + "step": 3814, + "time_per_iteration": 2.961540937423706 + }, + { + "auxiliary_loss_clip": 0.01500098, + "auxiliary_loss_mlp": 0.01046431, + "balance_loss_clip": 1.29717398, + "balance_loss_mlp": 1.02113485, + "epoch": 0.229370208928303, + "flos": 23340979134720.0, + "grad_norm": 5.390795405642149, + "language_loss": 0.68171167, + "learning_rate": 3.597324405965139e-06, + "loss": 0.70717704, + "num_input_tokens_seen": 82083180, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.25317383, + "step": 3815, + "time_per_iteration": 2.9316020011901855 + }, + { + "auxiliary_loss_clip": 0.01515603, + "auxiliary_loss_mlp": 0.01053511, + "balance_loss_clip": 1.30957651, + "balance_loss_mlp": 1.02616417, + "epoch": 0.229430332180971, + "flos": 28627850718720.0, + "grad_norm": 1.7854511675695954, + "language_loss": 0.83857799, + "learning_rate": 3.597090005586848e-06, + "loss": 0.86426908, + "num_input_tokens_seen": 82102950, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.27368164, + "step": 3816, + "time_per_iteration": 4.373397588729858 + }, + { + "auxiliary_loss_clip": 0.01517791, + "auxiliary_loss_mlp": 0.01047058, + "balance_loss_clip": 1.31253672, + "balance_loss_mlp": 1.02018785, + "epoch": 0.22949045543363897, + "flos": 17247064469760.0, + "grad_norm": 2.429173079461321, + "language_loss": 0.89164776, + "learning_rate": 3.596855544646742e-06, + "loss": 0.91729623, + "num_input_tokens_seen": 82119510, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.26879883, + "step": 3817, + "time_per_iteration": 2.8765721321105957 + }, + { + "auxiliary_loss_clip": 0.01520426, + "auxiliary_loss_mlp": 0.01048622, + "balance_loss_clip": 1.31274569, + "balance_loss_mlp": 1.02190697, + "epoch": 0.22955057868630693, + "flos": 27500636298240.0, + "grad_norm": 1.7909923140204635, + "language_loss": 0.75933719, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.78502768, + "num_input_tokens_seen": 82140095, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.26708984, + "step": 3818, + "time_per_iteration": 2.987168788909912 + }, + { + "auxiliary_loss_clip": 0.01512737, + "auxiliary_loss_mlp": 0.01047348, + "balance_loss_clip": 1.30730176, + "balance_loss_mlp": 1.0201925, + "epoch": 0.2296107019389749, + "flos": 23486685949440.0, + "grad_norm": 2.483988088044078, + "language_loss": 0.75828874, + "learning_rate": 3.596386441116659e-06, + "loss": 0.78388965, + "num_input_tokens_seen": 82159510, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.27148438, + "step": 3819, + "time_per_iteration": 2.922292470932007 + }, + { + "auxiliary_loss_clip": 0.01526321, + "auxiliary_loss_mlp": 0.01048675, + "balance_loss_clip": 1.31853902, + "balance_loss_mlp": 1.0209825, + "epoch": 0.22967082519164286, + "flos": 31297655237760.0, + "grad_norm": 1.6556528879471908, + "language_loss": 0.81615829, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.84190828, + "num_input_tokens_seen": 82179580, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.27697754, + "step": 3820, + "time_per_iteration": 3.0351929664611816 + }, + { + "auxiliary_loss_clip": 0.01531671, + "auxiliary_loss_mlp": 0.01052838, + "balance_loss_clip": 1.31968915, + "balance_loss_mlp": 1.02363181, + "epoch": 0.22973094844431083, + "flos": 14649887134080.0, + "grad_norm": 1.932972429633902, + "language_loss": 0.70239294, + "learning_rate": 3.595917095446042e-06, + "loss": 0.72823805, + "num_input_tokens_seen": 82195585, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.29223633, + "step": 3821, + "time_per_iteration": 2.9400064945220947 + }, + { + "auxiliary_loss_clip": 0.01509055, + "auxiliary_loss_mlp": 0.01047933, + "balance_loss_clip": 1.30304527, + "balance_loss_mlp": 1.02013373, + "epoch": 0.2297910716969788, + "flos": 22835032070400.0, + "grad_norm": 1.5739326809067737, + "language_loss": 0.83673239, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.8623023, + "num_input_tokens_seen": 82217530, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.27807617, + "step": 3822, + "time_per_iteration": 2.9545109272003174 + }, + { + "auxiliary_loss_clip": 0.01523612, + "auxiliary_loss_mlp": 0.01043386, + "balance_loss_clip": 1.31916595, + "balance_loss_mlp": 1.01588404, + "epoch": 0.2298511949496468, + "flos": 23049294036480.0, + "grad_norm": 2.367886776627778, + "language_loss": 0.67082816, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.69649816, + "num_input_tokens_seen": 82237980, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.27539062, + "step": 3823, + "time_per_iteration": 2.942615032196045 + }, + { + "auxiliary_loss_clip": 0.0134908, + "auxiliary_loss_mlp": 0.01063075, + "balance_loss_clip": 1.22748172, + "balance_loss_mlp": 1.03331995, + "epoch": 0.22991131820231475, + "flos": 66920989368960.0, + "grad_norm": 0.7953184186567328, + "language_loss": 0.56801695, + "learning_rate": 3.595212623082357e-06, + "loss": 0.59213847, + "num_input_tokens_seen": 82301785, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.296875, + "step": 3824, + "time_per_iteration": 3.438488721847534 + }, + { + "auxiliary_loss_clip": 0.01508972, + "auxiliary_loss_mlp": 0.01049791, + "balance_loss_clip": 1.3066628, + "balance_loss_mlp": 1.02246809, + "epoch": 0.22997144145498272, + "flos": 17894284358400.0, + "grad_norm": 1.9630434992854373, + "language_loss": 0.73693883, + "learning_rate": 3.594977677968009e-06, + "loss": 0.76252645, + "num_input_tokens_seen": 82317355, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.27319336, + "step": 3825, + "time_per_iteration": 4.256927728652954 + }, + { + "auxiliary_loss_clip": 0.01526873, + "auxiliary_loss_mlp": 0.0104937, + "balance_loss_clip": 1.32095647, + "balance_loss_mlp": 1.02190459, + "epoch": 0.23003156470765068, + "flos": 24686482308480.0, + "grad_norm": 1.9875408202496487, + "language_loss": 0.89175379, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.91751611, + "num_input_tokens_seen": 82336645, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.27490234, + "step": 3826, + "time_per_iteration": 2.8776278495788574 + }, + { + "auxiliary_loss_clip": 0.01534695, + "auxiliary_loss_mlp": 0.01050533, + "balance_loss_clip": 1.32122338, + "balance_loss_mlp": 1.02274561, + "epoch": 0.23009168796031865, + "flos": 15822328371840.0, + "grad_norm": 5.547488682335743, + "language_loss": 0.8277427, + "learning_rate": 3.594507606303083e-06, + "loss": 0.85359496, + "num_input_tokens_seen": 82354225, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.2779541, + "step": 3827, + "time_per_iteration": 4.230492830276489 + }, + { + "auxiliary_loss_clip": 0.01511908, + "auxiliary_loss_mlp": 0.01048459, + "balance_loss_clip": 1.30848145, + "balance_loss_mlp": 1.02096939, + "epoch": 0.2301518112129866, + "flos": 16220465781120.0, + "grad_norm": 2.0575405309738617, + "language_loss": 0.87915051, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.90475416, + "num_input_tokens_seen": 82370240, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.27539062, + "step": 3828, + "time_per_iteration": 4.276114225387573 + }, + { + "auxiliary_loss_clip": 0.0150788, + "auxiliary_loss_mlp": 0.01049189, + "balance_loss_clip": 1.3014195, + "balance_loss_mlp": 1.02106702, + "epoch": 0.2302119344656546, + "flos": 20605334152320.0, + "grad_norm": 2.047180537134642, + "language_loss": 0.72018778, + "learning_rate": 3.594037292782607e-06, + "loss": 0.74575847, + "num_input_tokens_seen": 82389145, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.28088379, + "step": 3829, + "time_per_iteration": 2.8294689655303955 + }, + { + "auxiliary_loss_clip": 0.01507556, + "auxiliary_loss_mlp": 0.01046975, + "balance_loss_clip": 1.30606651, + "balance_loss_mlp": 1.02102351, + "epoch": 0.23027205771832257, + "flos": 26808099102720.0, + "grad_norm": 1.7332053064842587, + "language_loss": 0.85712552, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.88267088, + "num_input_tokens_seen": 82409185, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.25976562, + "step": 3830, + "time_per_iteration": 2.9193902015686035 + }, + { + "auxiliary_loss_clip": 0.01506904, + "auxiliary_loss_mlp": 0.01054061, + "balance_loss_clip": 1.30358028, + "balance_loss_mlp": 1.02647638, + "epoch": 0.23033218097099054, + "flos": 43889006108160.0, + "grad_norm": 1.6201235904048665, + "language_loss": 0.67711276, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.70272243, + "num_input_tokens_seen": 82432070, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.27587891, + "step": 3831, + "time_per_iteration": 3.027104139328003 + }, + { + "auxiliary_loss_clip": 0.01524282, + "auxiliary_loss_mlp": 0.01056283, + "balance_loss_clip": 1.31652498, + "balance_loss_mlp": 1.02891254, + "epoch": 0.2303923042236585, + "flos": 26078795867520.0, + "grad_norm": 2.1144973810420087, + "language_loss": 0.76967245, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.79547811, + "num_input_tokens_seen": 82450625, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.27331543, + "step": 3832, + "time_per_iteration": 2.9248671531677246 + }, + { + "auxiliary_loss_clip": 0.01522181, + "auxiliary_loss_mlp": 0.01049896, + "balance_loss_clip": 1.31523895, + "balance_loss_mlp": 1.02192974, + "epoch": 0.23045242747632647, + "flos": 18305542759680.0, + "grad_norm": 1.8935753615826372, + "language_loss": 0.8856473, + "learning_rate": 3.593095940460389e-06, + "loss": 0.91136813, + "num_input_tokens_seen": 82468575, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.27954102, + "step": 3833, + "time_per_iteration": 2.8466296195983887 + }, + { + "auxiliary_loss_clip": 0.01522867, + "auxiliary_loss_mlp": 0.01049713, + "balance_loss_clip": 1.31313682, + "balance_loss_mlp": 1.02248573, + "epoch": 0.23051255072899443, + "flos": 25531558283520.0, + "grad_norm": 1.5652774458094958, + "language_loss": 0.75684839, + "learning_rate": 3.592860451331624e-06, + "loss": 0.78257418, + "num_input_tokens_seen": 82488655, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.2722168, + "step": 3834, + "time_per_iteration": 2.932950735092163 + }, + { + "auxiliary_loss_clip": 0.01503223, + "auxiliary_loss_mlp": 0.0105698, + "balance_loss_clip": 1.29984784, + "balance_loss_mlp": 1.03044438, + "epoch": 0.2305726739816624, + "flos": 21225108430080.0, + "grad_norm": 1.7411795032767157, + "language_loss": 0.87098718, + "learning_rate": 3.592624901801432e-06, + "loss": 0.89658916, + "num_input_tokens_seen": 82507220, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.26586914, + "step": 3835, + "time_per_iteration": 2.88079833984375 + }, + { + "auxiliary_loss_clip": 0.01528339, + "auxiliary_loss_mlp": 0.01051853, + "balance_loss_clip": 1.31744885, + "balance_loss_mlp": 1.02585387, + "epoch": 0.2306327972343304, + "flos": 23341522072320.0, + "grad_norm": 1.9472584044208716, + "language_loss": 0.83300984, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.8588118, + "num_input_tokens_seen": 82527920, + "router_z_loss_clip": 2.11035156, + "router_z_loss_mlp": 0.26013184, + "step": 3836, + "time_per_iteration": 2.9056904315948486 + }, + { + "auxiliary_loss_clip": 0.01525412, + "auxiliary_loss_mlp": 0.01051228, + "balance_loss_clip": 1.319682, + "balance_loss_mlp": 1.02493072, + "epoch": 0.23069292048699835, + "flos": 20676196788480.0, + "grad_norm": 1.7099337264441898, + "language_loss": 0.80473101, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.83049744, + "num_input_tokens_seen": 82549040, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.26318359, + "step": 3837, + "time_per_iteration": 2.9525349140167236 + }, + { + "auxiliary_loss_clip": 0.01340646, + "auxiliary_loss_mlp": 0.01067775, + "balance_loss_clip": 1.21964765, + "balance_loss_mlp": 1.03782964, + "epoch": 0.23075304373966632, + "flos": 70482798000000.0, + "grad_norm": 0.9035344452207386, + "language_loss": 0.65402484, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.67810905, + "num_input_tokens_seen": 82604070, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.29882812, + "step": 3838, + "time_per_iteration": 3.3570077419281006 + }, + { + "auxiliary_loss_clip": 0.01508644, + "auxiliary_loss_mlp": 0.01053544, + "balance_loss_clip": 1.30378556, + "balance_loss_mlp": 1.02808106, + "epoch": 0.23081316699233428, + "flos": 16626702009600.0, + "grad_norm": 2.1265525965954333, + "language_loss": 0.76633221, + "learning_rate": 3.591682099845058e-06, + "loss": 0.79195404, + "num_input_tokens_seen": 82619665, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.25500488, + "step": 3839, + "time_per_iteration": 2.8439126014709473 + }, + { + "auxiliary_loss_clip": 0.01542412, + "auxiliary_loss_mlp": 0.01055568, + "balance_loss_clip": 1.33032274, + "balance_loss_mlp": 1.03016436, + "epoch": 0.23087329024500225, + "flos": 13306012773120.0, + "grad_norm": 1.7578290286056804, + "language_loss": 0.70466173, + "learning_rate": 3.591446248441752e-06, + "loss": 0.7306416, + "num_input_tokens_seen": 82637530, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.25402832, + "step": 3840, + "time_per_iteration": 3.1429245471954346 + }, + { + "auxiliary_loss_clip": 0.01525479, + "auxiliary_loss_mlp": 0.01054055, + "balance_loss_clip": 1.32076955, + "balance_loss_mlp": 1.02767372, + "epoch": 0.23093341349767021, + "flos": 17794799746560.0, + "grad_norm": 1.992562560846032, + "language_loss": 0.80357003, + "learning_rate": 3.591210336690645e-06, + "loss": 0.82936543, + "num_input_tokens_seen": 82656130, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.26379395, + "step": 3841, + "time_per_iteration": 2.8717758655548096 + }, + { + "auxiliary_loss_clip": 0.01518106, + "auxiliary_loss_mlp": 0.01054344, + "balance_loss_clip": 1.31247532, + "balance_loss_mlp": 1.02948904, + "epoch": 0.23099353675033818, + "flos": 23998695816960.0, + "grad_norm": 1.8843729536880263, + "language_loss": 0.8358109, + "learning_rate": 3.590974364600683e-06, + "loss": 0.86153537, + "num_input_tokens_seen": 82675295, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.24865723, + "step": 3842, + "time_per_iteration": 3.116657257080078 + }, + { + "auxiliary_loss_clip": 0.01511554, + "auxiliary_loss_mlp": 0.01053056, + "balance_loss_clip": 1.307024, + "balance_loss_mlp": 1.02655613, + "epoch": 0.23105366000300617, + "flos": 36009753137280.0, + "grad_norm": 1.473702479008749, + "language_loss": 0.6686241, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.69427025, + "num_input_tokens_seen": 82703260, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.26513672, + "step": 3843, + "time_per_iteration": 3.087411642074585 + }, + { + "auxiliary_loss_clip": 0.01505543, + "auxiliary_loss_mlp": 0.01057126, + "balance_loss_clip": 1.30546463, + "balance_loss_mlp": 1.03017235, + "epoch": 0.23111378325567414, + "flos": 31256952900480.0, + "grad_norm": 1.7955261084913654, + "language_loss": 0.78057277, + "learning_rate": 3.590502239439987e-06, + "loss": 0.80619943, + "num_input_tokens_seen": 82725060, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.26928711, + "step": 3844, + "time_per_iteration": 2.9715194702148438 + }, + { + "auxiliary_loss_clip": 0.01520963, + "auxiliary_loss_mlp": 0.01063751, + "balance_loss_clip": 1.31561768, + "balance_loss_mlp": 1.03650022, + "epoch": 0.2311739065083421, + "flos": 19217545073280.0, + "grad_norm": 1.572769212965825, + "language_loss": 0.78657776, + "learning_rate": 3.590266086387156e-06, + "loss": 0.8124249, + "num_input_tokens_seen": 82742960, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.27270508, + "step": 3845, + "time_per_iteration": 2.8821213245391846 + }, + { + "auxiliary_loss_clip": 0.01490689, + "auxiliary_loss_mlp": 0.01049309, + "balance_loss_clip": 1.29299068, + "balance_loss_mlp": 1.02426314, + "epoch": 0.23123402976101007, + "flos": 23368650969600.0, + "grad_norm": 2.201840776289063, + "language_loss": 0.7737121, + "learning_rate": 3.590029873031276e-06, + "loss": 0.79911208, + "num_input_tokens_seen": 82760205, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.25061035, + "step": 3846, + "time_per_iteration": 2.803565502166748 + }, + { + "auxiliary_loss_clip": 0.0152686, + "auxiliary_loss_mlp": 0.0105939, + "balance_loss_clip": 1.32012355, + "balance_loss_mlp": 1.03409362, + "epoch": 0.23129415301367803, + "flos": 13743268951680.0, + "grad_norm": 1.8799741490335318, + "language_loss": 0.70656127, + "learning_rate": 3.589793599381304e-06, + "loss": 0.73242378, + "num_input_tokens_seen": 82778590, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.25305176, + "step": 3847, + "time_per_iteration": 2.795222759246826 + }, + { + "auxiliary_loss_clip": 0.01322603, + "auxiliary_loss_mlp": 0.01051413, + "balance_loss_clip": 1.20194364, + "balance_loss_mlp": 1.02862024, + "epoch": 0.231354276266346, + "flos": 69767882611200.0, + "grad_norm": 0.7821371794875414, + "language_loss": 0.61053944, + "learning_rate": 3.589557265446198e-06, + "loss": 0.63427961, + "num_input_tokens_seen": 82833925, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.22753906, + "step": 3848, + "time_per_iteration": 3.2593278884887695 + }, + { + "auxiliary_loss_clip": 0.01512643, + "auxiliary_loss_mlp": 0.01055466, + "balance_loss_clip": 1.30643034, + "balance_loss_mlp": 1.0303123, + "epoch": 0.231414399519014, + "flos": 18843188446080.0, + "grad_norm": 2.0091925360634932, + "language_loss": 0.79245031, + "learning_rate": 3.589320871234923e-06, + "loss": 0.81813145, + "num_input_tokens_seen": 82850625, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.25146484, + "step": 3849, + "time_per_iteration": 2.8323168754577637 + }, + { + "auxiliary_loss_clip": 0.01528527, + "auxiliary_loss_mlp": 0.01050524, + "balance_loss_clip": 1.32004154, + "balance_loss_mlp": 1.02430964, + "epoch": 0.23147452277168196, + "flos": 36148627987200.0, + "grad_norm": 1.8826138668491732, + "language_loss": 0.72611177, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.75190228, + "num_input_tokens_seen": 82872105, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.26245117, + "step": 3850, + "time_per_iteration": 2.9565370082855225 + }, + { + "auxiliary_loss_clip": 0.01519966, + "auxiliary_loss_mlp": 0.01050478, + "balance_loss_clip": 1.31555581, + "balance_loss_mlp": 1.02496696, + "epoch": 0.23153464602434992, + "flos": 20822491785600.0, + "grad_norm": 2.0293828935096756, + "language_loss": 0.77586699, + "learning_rate": 3.588847902019718e-06, + "loss": 0.80157137, + "num_input_tokens_seen": 82890595, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.25512695, + "step": 3851, + "time_per_iteration": 4.262117624282837 + }, + { + "auxiliary_loss_clip": 0.01516017, + "auxiliary_loss_mlp": 0.01047738, + "balance_loss_clip": 1.31168652, + "balance_loss_mlp": 1.02157211, + "epoch": 0.2315947692770179, + "flos": 19949110548480.0, + "grad_norm": 1.4657336424047718, + "language_loss": 0.70489842, + "learning_rate": 3.588611327033723e-06, + "loss": 0.73053598, + "num_input_tokens_seen": 82908910, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.26184082, + "step": 3852, + "time_per_iteration": 2.9430229663848877 + }, + { + "auxiliary_loss_clip": 0.01528416, + "auxiliary_loss_mlp": 0.01042778, + "balance_loss_clip": 1.32009053, + "balance_loss_mlp": 1.01844776, + "epoch": 0.23165489252968585, + "flos": 12862150853760.0, + "grad_norm": 2.041903088619222, + "language_loss": 0.6842013, + "learning_rate": 3.588374691807428e-06, + "loss": 0.70991325, + "num_input_tokens_seen": 82925405, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.24353027, + "step": 3853, + "time_per_iteration": 2.8629841804504395 + }, + { + "auxiliary_loss_clip": 0.01540725, + "auxiliary_loss_mlp": 0.010431, + "balance_loss_clip": 1.33196604, + "balance_loss_mlp": 1.01767302, + "epoch": 0.23171501578235382, + "flos": 30640571982720.0, + "grad_norm": 3.388508573080948, + "language_loss": 0.81241018, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.83824843, + "num_input_tokens_seen": 82945615, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.25415039, + "step": 3854, + "time_per_iteration": 2.954526901245117 + }, + { + "auxiliary_loss_clip": 0.0154411, + "auxiliary_loss_mlp": 0.01050566, + "balance_loss_clip": 1.32860231, + "balance_loss_mlp": 1.02392244, + "epoch": 0.23177513903502178, + "flos": 23853531939840.0, + "grad_norm": 1.9414652408464406, + "language_loss": 0.66988856, + "learning_rate": 3.587901240669831e-06, + "loss": 0.69583535, + "num_input_tokens_seen": 82967570, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.26623535, + "step": 3855, + "time_per_iteration": 2.9186127185821533 + }, + { + "auxiliary_loss_clip": 0.01518805, + "auxiliary_loss_mlp": 0.01044502, + "balance_loss_clip": 1.31045258, + "balance_loss_mlp": 1.01808548, + "epoch": 0.23183526228768978, + "flos": 29582319916800.0, + "grad_norm": 1.8180448679859638, + "language_loss": 0.72504663, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.75067973, + "num_input_tokens_seen": 82987435, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.26416016, + "step": 3856, + "time_per_iteration": 3.006605386734009 + }, + { + "auxiliary_loss_clip": 0.01520522, + "auxiliary_loss_mlp": 0.01045066, + "balance_loss_clip": 1.31537807, + "balance_loss_mlp": 1.02040207, + "epoch": 0.23189538554035774, + "flos": 34471823253120.0, + "grad_norm": 1.533476206159904, + "language_loss": 0.77935898, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.80501485, + "num_input_tokens_seen": 83010505, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.2467041, + "step": 3857, + "time_per_iteration": 3.0112574100494385 + }, + { + "auxiliary_loss_clip": 0.0154004, + "auxiliary_loss_mlp": 0.01050712, + "balance_loss_clip": 1.3279686, + "balance_loss_mlp": 1.02219737, + "epoch": 0.2319555087930257, + "flos": 18012726541440.0, + "grad_norm": 2.3248480762763193, + "language_loss": 0.92222828, + "learning_rate": 3.587190612385584e-06, + "loss": 0.94813573, + "num_input_tokens_seen": 83026705, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.28491211, + "step": 3858, + "time_per_iteration": 2.9500911235809326 + }, + { + "auxiliary_loss_clip": 0.01514306, + "auxiliary_loss_mlp": 0.01041804, + "balance_loss_clip": 1.31179309, + "balance_loss_mlp": 1.01609063, + "epoch": 0.23201563204569367, + "flos": 23152941169920.0, + "grad_norm": 1.8404635649070624, + "language_loss": 0.77873868, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.80429971, + "num_input_tokens_seen": 83046500, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.25720215, + "step": 3859, + "time_per_iteration": 2.8936314582824707 + }, + { + "auxiliary_loss_clip": 0.01509283, + "auxiliary_loss_mlp": 0.01045465, + "balance_loss_clip": 1.30481267, + "balance_loss_mlp": 1.01987076, + "epoch": 0.23207575529836164, + "flos": 20677780356480.0, + "grad_norm": 1.9977278197932442, + "language_loss": 0.85560262, + "learning_rate": 3.58671655924898e-06, + "loss": 0.88115013, + "num_input_tokens_seen": 83065280, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.25585938, + "step": 3860, + "time_per_iteration": 4.295123100280762 + }, + { + "auxiliary_loss_clip": 0.01520325, + "auxiliary_loss_mlp": 0.01039714, + "balance_loss_clip": 1.31433797, + "balance_loss_mlp": 1.01252258, + "epoch": 0.2321358785510296, + "flos": 16480904705280.0, + "grad_norm": 3.053601436795869, + "language_loss": 0.84179431, + "learning_rate": 3.586479442423508e-06, + "loss": 0.86739469, + "num_input_tokens_seen": 83082310, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.27172852, + "step": 3861, + "time_per_iteration": 2.883608102798462 + }, + { + "auxiliary_loss_clip": 0.01515988, + "auxiliary_loss_mlp": 0.0104514, + "balance_loss_clip": 1.31002855, + "balance_loss_mlp": 1.0193435, + "epoch": 0.2321960018036976, + "flos": 21626277240960.0, + "grad_norm": 1.506432260095734, + "language_loss": 0.86830103, + "learning_rate": 3.586242265438576e-06, + "loss": 0.89391226, + "num_input_tokens_seen": 83102065, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.25793457, + "step": 3862, + "time_per_iteration": 4.443707227706909 + }, + { + "auxiliary_loss_clip": 0.01505306, + "auxiliary_loss_mlp": 0.01047366, + "balance_loss_clip": 1.30348217, + "balance_loss_mlp": 1.02099681, + "epoch": 0.23225612505636556, + "flos": 22281188745600.0, + "grad_norm": 1.4662452042329233, + "language_loss": 0.75485778, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.78038448, + "num_input_tokens_seen": 83121445, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.26379395, + "step": 3863, + "time_per_iteration": 4.290844440460205 + }, + { + "auxiliary_loss_clip": 0.01505752, + "auxiliary_loss_mlp": 0.01046673, + "balance_loss_clip": 1.30383539, + "balance_loss_mlp": 1.02109075, + "epoch": 0.23231624830903352, + "flos": 17060248114560.0, + "grad_norm": 1.8252468242558253, + "language_loss": 0.75243795, + "learning_rate": 3.58576773102631e-06, + "loss": 0.77796221, + "num_input_tokens_seen": 83138175, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.25598145, + "step": 3864, + "time_per_iteration": 2.8840951919555664 + }, + { + "auxiliary_loss_clip": 0.01507766, + "auxiliary_loss_mlp": 0.01042518, + "balance_loss_clip": 1.30466986, + "balance_loss_mlp": 1.01550508, + "epoch": 0.2323763715617015, + "flos": 34652938763520.0, + "grad_norm": 2.8617960873854584, + "language_loss": 0.71255785, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.73806071, + "num_input_tokens_seen": 83161975, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.27026367, + "step": 3865, + "time_per_iteration": 2.9787814617156982 + }, + { + "auxiliary_loss_clip": 0.01538096, + "auxiliary_loss_mlp": 0.01042825, + "balance_loss_clip": 1.32416153, + "balance_loss_mlp": 1.01613438, + "epoch": 0.23243649481436945, + "flos": 25561356624000.0, + "grad_norm": 1.7503839206954328, + "language_loss": 0.95924795, + "learning_rate": 3.5852929560841617e-06, + "loss": 0.98505712, + "num_input_tokens_seen": 83180905, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.26721191, + "step": 3866, + "time_per_iteration": 2.8740692138671875 + }, + { + "auxiliary_loss_clip": 0.01520179, + "auxiliary_loss_mlp": 0.01044245, + "balance_loss_clip": 1.3147558, + "balance_loss_mlp": 1.01896095, + "epoch": 0.23249661806703742, + "flos": 20492683303680.0, + "grad_norm": 2.4076170663905625, + "language_loss": 0.74594247, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.77158678, + "num_input_tokens_seen": 83196390, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.25268555, + "step": 3867, + "time_per_iteration": 2.8725171089172363 + }, + { + "auxiliary_loss_clip": 0.01521784, + "auxiliary_loss_mlp": 0.01045289, + "balance_loss_clip": 1.31389415, + "balance_loss_mlp": 1.01815712, + "epoch": 0.23255674131970538, + "flos": 20386457216640.0, + "grad_norm": 2.068177945725956, + "language_loss": 0.83336735, + "learning_rate": 3.584817940684145e-06, + "loss": 0.859038, + "num_input_tokens_seen": 83216165, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.27148438, + "step": 3868, + "time_per_iteration": 2.846949338912964 + }, + { + "auxiliary_loss_clip": 0.01501407, + "auxiliary_loss_mlp": 0.01045827, + "balance_loss_clip": 1.29997933, + "balance_loss_mlp": 1.01917171, + "epoch": 0.23261686457237338, + "flos": 17064998818560.0, + "grad_norm": 1.668636103915524, + "language_loss": 0.74540198, + "learning_rate": 3.58458034283495e-06, + "loss": 0.77087432, + "num_input_tokens_seen": 83233845, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.2668457, + "step": 3869, + "time_per_iteration": 2.874662160873413 + }, + { + "auxiliary_loss_clip": 0.01508485, + "auxiliary_loss_mlp": 0.01046403, + "balance_loss_clip": 1.30452538, + "balance_loss_mlp": 1.02057028, + "epoch": 0.23267698782504134, + "flos": 29182056001920.0, + "grad_norm": 1.6687691059617609, + "language_loss": 0.80637872, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.8319276, + "num_input_tokens_seen": 83254930, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.25793457, + "step": 3870, + "time_per_iteration": 2.980065107345581 + }, + { + "auxiliary_loss_clip": 0.01530835, + "auxiliary_loss_mlp": 0.01055081, + "balance_loss_clip": 1.32002449, + "balance_loss_mlp": 1.02885449, + "epoch": 0.2327371110777093, + "flos": 21183501196800.0, + "grad_norm": 1.7768967071235278, + "language_loss": 0.72552371, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.75138283, + "num_input_tokens_seen": 83272095, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.26257324, + "step": 3871, + "time_per_iteration": 2.8243227005004883 + }, + { + "auxiliary_loss_clip": 0.01521553, + "auxiliary_loss_mlp": 0.01055138, + "balance_loss_clip": 1.31240928, + "balance_loss_mlp": 1.02836394, + "epoch": 0.23279723433037727, + "flos": 24873796356480.0, + "grad_norm": 2.409060755990464, + "language_loss": 0.70464933, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.73041618, + "num_input_tokens_seen": 83290980, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.26806641, + "step": 3872, + "time_per_iteration": 2.886056900024414 + }, + { + "auxiliary_loss_clip": 0.01539028, + "auxiliary_loss_mlp": 0.01045185, + "balance_loss_clip": 1.32599556, + "balance_loss_mlp": 1.0181129, + "epoch": 0.23285735758304524, + "flos": 38814903411840.0, + "grad_norm": 1.4904964757839079, + "language_loss": 0.78840417, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.8142463, + "num_input_tokens_seen": 83315175, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.27087402, + "step": 3873, + "time_per_iteration": 3.005896806716919 + }, + { + "auxiliary_loss_clip": 0.01322267, + "auxiliary_loss_mlp": 0.01034104, + "balance_loss_clip": 1.20188689, + "balance_loss_mlp": 1.00606585, + "epoch": 0.2329174808357132, + "flos": 53972293916160.0, + "grad_norm": 0.8598714885059744, + "language_loss": 0.60612535, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.6296891, + "num_input_tokens_seen": 83372060, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.28125, + "step": 3874, + "time_per_iteration": 3.312131881713867 + }, + { + "auxiliary_loss_clip": 0.01512598, + "auxiliary_loss_mlp": 0.01045006, + "balance_loss_clip": 1.30679631, + "balance_loss_mlp": 1.01800489, + "epoch": 0.23297760408838117, + "flos": 21226149060480.0, + "grad_norm": 2.4342135270417207, + "language_loss": 0.81927443, + "learning_rate": 3.583153494218927e-06, + "loss": 0.84485054, + "num_input_tokens_seen": 83389795, + "router_z_loss_clip": 2.05566406, + "router_z_loss_mlp": 0.2701416, + "step": 3875, + "time_per_iteration": 2.8949317932128906 + }, + { + "auxiliary_loss_clip": 0.01510687, + "auxiliary_loss_mlp": 0.01043459, + "balance_loss_clip": 1.30493569, + "balance_loss_mlp": 1.01860344, + "epoch": 0.23303772734104916, + "flos": 28414810362240.0, + "grad_norm": 1.5287518477438544, + "language_loss": 0.61862868, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.64417017, + "num_input_tokens_seen": 83410005, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.24865723, + "step": 3876, + "time_per_iteration": 2.9404914379119873 + }, + { + "auxiliary_loss_clip": 0.01529996, + "auxiliary_loss_mlp": 0.01044619, + "balance_loss_clip": 1.3212018, + "balance_loss_mlp": 1.01814222, + "epoch": 0.23309785059371713, + "flos": 24324975204480.0, + "grad_norm": 1.5914533168067075, + "language_loss": 0.71841818, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.74416435, + "num_input_tokens_seen": 83430250, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.26477051, + "step": 3877, + "time_per_iteration": 2.925236701965332 + }, + { + "auxiliary_loss_clip": 0.01515952, + "auxiliary_loss_mlp": 0.01048728, + "balance_loss_clip": 1.30835915, + "balance_loss_mlp": 1.02092886, + "epoch": 0.2331579738463851, + "flos": 16000231501440.0, + "grad_norm": 2.4030564407106763, + "language_loss": 0.82014352, + "learning_rate": 3.582439259339073e-06, + "loss": 0.84579027, + "num_input_tokens_seen": 83447950, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.27807617, + "step": 3878, + "time_per_iteration": 2.9005205631256104 + }, + { + "auxiliary_loss_clip": 0.01530361, + "auxiliary_loss_mlp": 0.01052122, + "balance_loss_clip": 1.31811285, + "balance_loss_mlp": 1.02404833, + "epoch": 0.23321809709905306, + "flos": 36440086861440.0, + "grad_norm": 1.7109184154819705, + "language_loss": 0.75481313, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.78063798, + "num_input_tokens_seen": 83467785, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.28027344, + "step": 3879, + "time_per_iteration": 3.0409903526306152 + }, + { + "auxiliary_loss_clip": 0.01515412, + "auxiliary_loss_mlp": 0.01043645, + "balance_loss_clip": 1.30699086, + "balance_loss_mlp": 1.01697803, + "epoch": 0.23327822035172102, + "flos": 21334727877120.0, + "grad_norm": 2.2963304996056704, + "language_loss": 0.90426576, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.9298563, + "num_input_tokens_seen": 83485390, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.2668457, + "step": 3880, + "time_per_iteration": 2.864959239959717 + }, + { + "auxiliary_loss_clip": 0.01530436, + "auxiliary_loss_mlp": 0.01048886, + "balance_loss_clip": 1.31790876, + "balance_loss_mlp": 1.02343524, + "epoch": 0.233338343604389, + "flos": 19181231481600.0, + "grad_norm": 1.592694136361489, + "language_loss": 0.72640562, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.75219887, + "num_input_tokens_seen": 83504890, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.25476074, + "step": 3881, + "time_per_iteration": 2.8695712089538574 + }, + { + "auxiliary_loss_clip": 0.01521284, + "auxiliary_loss_mlp": 0.01049481, + "balance_loss_clip": 1.31443143, + "balance_loss_mlp": 1.02449441, + "epoch": 0.23339846685705698, + "flos": 26920116524160.0, + "grad_norm": 2.527193248406329, + "language_loss": 0.69068158, + "learning_rate": 3.581486106120537e-06, + "loss": 0.71638918, + "num_input_tokens_seen": 83526475, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.24963379, + "step": 3882, + "time_per_iteration": 2.949002265930176 + }, + { + "auxiliary_loss_clip": 0.01538943, + "auxiliary_loss_mlp": 0.01052866, + "balance_loss_clip": 1.32790172, + "balance_loss_mlp": 1.0267477, + "epoch": 0.23345859010972494, + "flos": 32355907303680.0, + "grad_norm": 1.9182576662426278, + "language_loss": 0.77711183, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.80302989, + "num_input_tokens_seen": 83546620, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.26123047, + "step": 3883, + "time_per_iteration": 3.0050430297851562 + }, + { + "auxiliary_loss_clip": 0.01309034, + "auxiliary_loss_mlp": 0.01030575, + "balance_loss_clip": 1.19264674, + "balance_loss_mlp": 1.0038718, + "epoch": 0.2335187133623929, + "flos": 58517709396480.0, + "grad_norm": 0.7757320585735973, + "language_loss": 0.59073198, + "learning_rate": 3.58100916965445e-06, + "loss": 0.61412811, + "num_input_tokens_seen": 83616160, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.26757812, + "step": 3884, + "time_per_iteration": 3.5351359844207764 + }, + { + "auxiliary_loss_clip": 0.0153637, + "auxiliary_loss_mlp": 0.010479, + "balance_loss_clip": 1.32326639, + "balance_loss_mlp": 1.02263927, + "epoch": 0.23357883661506088, + "flos": 24513963310080.0, + "grad_norm": 1.7390223326734005, + "language_loss": 0.81264067, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.83848333, + "num_input_tokens_seen": 83636795, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.25280762, + "step": 3885, + "time_per_iteration": 2.894704818725586 + }, + { + "auxiliary_loss_clip": 0.0152452, + "auxiliary_loss_mlp": 0.01049873, + "balance_loss_clip": 1.31652927, + "balance_loss_mlp": 1.02469563, + "epoch": 0.23363895986772884, + "flos": 18956653701120.0, + "grad_norm": 2.379114922718705, + "language_loss": 0.88544381, + "learning_rate": 3.580531993380261e-06, + "loss": 0.91118765, + "num_input_tokens_seen": 83654050, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.25195312, + "step": 3886, + "time_per_iteration": 2.8602209091186523 + }, + { + "auxiliary_loss_clip": 0.01540652, + "auxiliary_loss_mlp": 0.01051055, + "balance_loss_clip": 1.32988954, + "balance_loss_mlp": 1.02459025, + "epoch": 0.2336990831203968, + "flos": 31699502720640.0, + "grad_norm": 1.8140613898382878, + "language_loss": 0.74040115, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.7663182, + "num_input_tokens_seen": 83673720, + "router_z_loss_clip": 2.10644531, + "router_z_loss_mlp": 0.26489258, + "step": 3887, + "time_per_iteration": 4.324591398239136 + }, + { + "auxiliary_loss_clip": 0.01537951, + "auxiliary_loss_mlp": 0.01044333, + "balance_loss_clip": 1.32627845, + "balance_loss_mlp": 1.01971626, + "epoch": 0.23375920637306477, + "flos": 27721368270720.0, + "grad_norm": 1.7353037166693437, + "language_loss": 0.85340917, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.87923193, + "num_input_tokens_seen": 83693470, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.24633789, + "step": 3888, + "time_per_iteration": 2.945384979248047 + }, + { + "auxiliary_loss_clip": 0.01530975, + "auxiliary_loss_mlp": 0.010518, + "balance_loss_clip": 1.32305694, + "balance_loss_mlp": 1.02640879, + "epoch": 0.23381932962573276, + "flos": 17684501627520.0, + "grad_norm": 2.2237801328897477, + "language_loss": 0.8927865, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.91861415, + "num_input_tokens_seen": 83711620, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.25390625, + "step": 3889, + "time_per_iteration": 2.8776543140411377 + }, + { + "auxiliary_loss_clip": 0.01525122, + "auxiliary_loss_mlp": 0.01043992, + "balance_loss_clip": 1.31505775, + "balance_loss_mlp": 1.02006698, + "epoch": 0.23387945287840073, + "flos": 14398587659520.0, + "grad_norm": 2.5782118886322762, + "language_loss": 0.78585315, + "learning_rate": 3.579576921697125e-06, + "loss": 0.8115443, + "num_input_tokens_seen": 83727890, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.23901367, + "step": 3890, + "time_per_iteration": 2.887502670288086 + }, + { + "auxiliary_loss_clip": 0.01531216, + "auxiliary_loss_mlp": 0.01049046, + "balance_loss_clip": 1.32173514, + "balance_loss_mlp": 1.02299917, + "epoch": 0.2339395761310687, + "flos": 46114224791040.0, + "grad_norm": 1.673112527744224, + "language_loss": 0.74351966, + "learning_rate": 3.579338004009412e-06, + "loss": 0.76932234, + "num_input_tokens_seen": 83749370, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.26025391, + "step": 3891, + "time_per_iteration": 3.1023240089416504 + }, + { + "auxiliary_loss_clip": 0.01522513, + "auxiliary_loss_mlp": 0.01046857, + "balance_loss_clip": 1.31765425, + "balance_loss_mlp": 1.02183449, + "epoch": 0.23399969938373666, + "flos": 22392075047040.0, + "grad_norm": 1.5751556357755865, + "language_loss": 0.83465695, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.86035061, + "num_input_tokens_seen": 83769560, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.25024414, + "step": 3892, + "time_per_iteration": 2.985633134841919 + }, + { + "auxiliary_loss_clip": 0.01544975, + "auxiliary_loss_mlp": 0.01043555, + "balance_loss_clip": 1.33311391, + "balance_loss_mlp": 1.01846159, + "epoch": 0.23405982263640462, + "flos": 43524377112960.0, + "grad_norm": 1.5379876767542195, + "language_loss": 0.65531898, + "learning_rate": 3.578859988977082e-06, + "loss": 0.68120426, + "num_input_tokens_seen": 83795635, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.25097656, + "step": 3893, + "time_per_iteration": 3.0850818157196045 + }, + { + "auxiliary_loss_clip": 0.01529881, + "auxiliary_loss_mlp": 0.01045905, + "balance_loss_clip": 1.32521558, + "balance_loss_mlp": 1.01991749, + "epoch": 0.2341199458890726, + "flos": 22574819370240.0, + "grad_norm": 2.094694898739344, + "language_loss": 0.80124891, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.82700682, + "num_input_tokens_seen": 83814090, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.2598877, + "step": 3894, + "time_per_iteration": 2.950762987136841 + }, + { + "auxiliary_loss_clip": 0.01520726, + "auxiliary_loss_mlp": 0.01040876, + "balance_loss_clip": 1.31567121, + "balance_loss_mlp": 1.01592541, + "epoch": 0.23418006914174055, + "flos": 25646064168960.0, + "grad_norm": 1.9694474495115317, + "language_loss": 0.82453299, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.85014904, + "num_input_tokens_seen": 83836870, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.24963379, + "step": 3895, + "time_per_iteration": 4.411352872848511 + }, + { + "auxiliary_loss_clip": 0.0152709, + "auxiliary_loss_mlp": 0.01039986, + "balance_loss_clip": 1.3207159, + "balance_loss_mlp": 1.01379621, + "epoch": 0.23424019239440855, + "flos": 13551973361280.0, + "grad_norm": 3.452224374386368, + "language_loss": 0.81507087, + "learning_rate": 3.578142517422292e-06, + "loss": 0.84074163, + "num_input_tokens_seen": 83853275, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.26208496, + "step": 3896, + "time_per_iteration": 4.28142786026001 + }, + { + "auxiliary_loss_clip": 0.01535632, + "auxiliary_loss_mlp": 0.01048984, + "balance_loss_clip": 1.32405353, + "balance_loss_mlp": 1.02305627, + "epoch": 0.2343003156470765, + "flos": 22429791227520.0, + "grad_norm": 2.1273465984258713, + "language_loss": 0.84026778, + "learning_rate": 3.577903240538623e-06, + "loss": 0.8661139, + "num_input_tokens_seen": 83872340, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.2590332, + "step": 3897, + "time_per_iteration": 2.8864715099334717 + }, + { + "auxiliary_loss_clip": 0.01546992, + "auxiliary_loss_mlp": 0.01047647, + "balance_loss_clip": 1.33330107, + "balance_loss_mlp": 1.0209564, + "epoch": 0.23436043889974448, + "flos": 14798308636800.0, + "grad_norm": 1.6713852780429568, + "language_loss": 0.79789764, + "learning_rate": 3.577663903820705e-06, + "loss": 0.82384402, + "num_input_tokens_seen": 83888795, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.2668457, + "step": 3898, + "time_per_iteration": 4.402770757675171 + }, + { + "auxiliary_loss_clip": 0.0151621, + "auxiliary_loss_mlp": 0.01045622, + "balance_loss_clip": 1.31384015, + "balance_loss_mlp": 1.02021849, + "epoch": 0.23442056215241244, + "flos": 22975852446720.0, + "grad_norm": 2.110013659373313, + "language_loss": 0.74920154, + "learning_rate": 3.577424507277614e-06, + "loss": 0.77481985, + "num_input_tokens_seen": 83906820, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.25402832, + "step": 3899, + "time_per_iteration": 3.0654308795928955 + }, + { + "auxiliary_loss_clip": 0.0153304, + "auxiliary_loss_mlp": 0.01040166, + "balance_loss_clip": 1.32414353, + "balance_loss_mlp": 1.01507282, + "epoch": 0.2344806854050804, + "flos": 23081218882560.0, + "grad_norm": 1.5543533618030059, + "language_loss": 0.7661137, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.79184574, + "num_input_tokens_seen": 83926370, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.25109863, + "step": 3900, + "time_per_iteration": 2.9907684326171875 + }, + { + "auxiliary_loss_clip": 0.01517686, + "auxiliary_loss_mlp": 0.01045357, + "balance_loss_clip": 1.31121171, + "balance_loss_mlp": 1.01964366, + "epoch": 0.23454080865774837, + "flos": 16335967052160.0, + "grad_norm": 2.518979459363364, + "language_loss": 0.67658055, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.70221102, + "num_input_tokens_seen": 83944600, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.25720215, + "step": 3901, + "time_per_iteration": 2.8332109451293945 + }, + { + "auxiliary_loss_clip": 0.01313409, + "auxiliary_loss_mlp": 0.01041212, + "balance_loss_clip": 1.19439197, + "balance_loss_mlp": 1.02232969, + "epoch": 0.23460093191041637, + "flos": 67789800881280.0, + "grad_norm": 0.7693293793050607, + "language_loss": 0.58197355, + "learning_rate": 3.576705958788091e-06, + "loss": 0.60551977, + "num_input_tokens_seen": 84005100, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.18847656, + "step": 3902, + "time_per_iteration": 3.3710453510284424 + }, + { + "auxiliary_loss_clip": 0.01521177, + "auxiliary_loss_mlp": 0.01049879, + "balance_loss_clip": 1.31593668, + "balance_loss_mlp": 1.0222342, + "epoch": 0.23466105516308433, + "flos": 20085542179200.0, + "grad_norm": 1.9462754592324514, + "language_loss": 0.81076485, + "learning_rate": 3.576466323035108e-06, + "loss": 0.83647537, + "num_input_tokens_seen": 84023775, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.27648926, + "step": 3903, + "time_per_iteration": 2.8782460689544678 + }, + { + "auxiliary_loss_clip": 0.01517306, + "auxiliary_loss_mlp": 0.0104538, + "balance_loss_clip": 1.31115782, + "balance_loss_mlp": 1.01976192, + "epoch": 0.2347211784157523, + "flos": 24546566828160.0, + "grad_norm": 2.0615651854189228, + "language_loss": 0.83592987, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.86155677, + "num_input_tokens_seen": 84042605, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.25622559, + "step": 3904, + "time_per_iteration": 2.8895552158355713 + }, + { + "auxiliary_loss_clip": 0.01521769, + "auxiliary_loss_mlp": 0.0105036, + "balance_loss_clip": 1.31571901, + "balance_loss_mlp": 1.02487326, + "epoch": 0.23478130166842026, + "flos": 23815453800960.0, + "grad_norm": 4.189836373182601, + "language_loss": 0.72241211, + "learning_rate": 3.57598687219895e-06, + "loss": 0.74813336, + "num_input_tokens_seen": 84061520, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.25512695, + "step": 3905, + "time_per_iteration": 2.8616042137145996 + }, + { + "auxiliary_loss_clip": 0.01512216, + "auxiliary_loss_mlp": 0.01041888, + "balance_loss_clip": 1.30953181, + "balance_loss_mlp": 1.01690221, + "epoch": 0.23484142492108823, + "flos": 24103247846400.0, + "grad_norm": 1.6131483515808744, + "language_loss": 0.7199837, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.74552476, + "num_input_tokens_seen": 84081800, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.24987793, + "step": 3906, + "time_per_iteration": 2.9244043827056885 + }, + { + "auxiliary_loss_clip": 0.01533829, + "auxiliary_loss_mlp": 0.01042888, + "balance_loss_clip": 1.32046556, + "balance_loss_mlp": 1.01710343, + "epoch": 0.2349015481737562, + "flos": 29107709516160.0, + "grad_norm": 2.349421809353929, + "language_loss": 0.7440778, + "learning_rate": 3.575507182316473e-06, + "loss": 0.76984501, + "num_input_tokens_seen": 84102340, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.25817871, + "step": 3907, + "time_per_iteration": 2.9132325649261475 + }, + { + "auxiliary_loss_clip": 0.01523632, + "auxiliary_loss_mlp": 0.01047711, + "balance_loss_clip": 1.31576991, + "balance_loss_mlp": 1.02125812, + "epoch": 0.23496167142642416, + "flos": 18925543261440.0, + "grad_norm": 2.4983415284705215, + "language_loss": 0.73553085, + "learning_rate": 3.575267247755601e-06, + "loss": 0.7612443, + "num_input_tokens_seen": 84120370, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.26464844, + "step": 3908, + "time_per_iteration": 2.8181638717651367 + }, + { + "auxiliary_loss_clip": 0.0130743, + "auxiliary_loss_mlp": 0.01027295, + "balance_loss_clip": 1.18858635, + "balance_loss_mlp": 1.0073632, + "epoch": 0.23502179467909215, + "flos": 55894199587200.0, + "grad_norm": 1.0346300906367807, + "language_loss": 0.73429942, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.75764668, + "num_input_tokens_seen": 84165515, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.19921875, + "step": 3909, + "time_per_iteration": 3.1355514526367188 + }, + { + "auxiliary_loss_clip": 0.01514674, + "auxiliary_loss_mlp": 0.01042105, + "balance_loss_clip": 1.30687785, + "balance_loss_mlp": 1.01655817, + "epoch": 0.23508191793176011, + "flos": 23411570302080.0, + "grad_norm": 1.5439285983534448, + "language_loss": 0.88671935, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.91228718, + "num_input_tokens_seen": 84184540, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.25585938, + "step": 3910, + "time_per_iteration": 2.904994487762451 + }, + { + "auxiliary_loss_clip": 0.01524678, + "auxiliary_loss_mlp": 0.01046066, + "balance_loss_clip": 1.31789446, + "balance_loss_mlp": 1.02149653, + "epoch": 0.23514204118442808, + "flos": 20057282161920.0, + "grad_norm": 1.827705736771759, + "language_loss": 0.77116811, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.7968756, + "num_input_tokens_seen": 84202025, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.24572754, + "step": 3911, + "time_per_iteration": 2.906074047088623 + }, + { + "auxiliary_loss_clip": 0.01502391, + "auxiliary_loss_mlp": 0.01044611, + "balance_loss_clip": 1.30240893, + "balance_loss_mlp": 1.02051914, + "epoch": 0.23520216443709605, + "flos": 21590732810880.0, + "grad_norm": 1.5507428808047905, + "language_loss": 0.82086313, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.84633315, + "num_input_tokens_seen": 84221895, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.24084473, + "step": 3912, + "time_per_iteration": 3.0106732845306396 + }, + { + "auxiliary_loss_clip": 0.01501162, + "auxiliary_loss_mlp": 0.01045044, + "balance_loss_clip": 1.30060363, + "balance_loss_mlp": 1.01897311, + "epoch": 0.235262287689764, + "flos": 23196312950400.0, + "grad_norm": 39.04365376997832, + "language_loss": 0.72738969, + "learning_rate": 3.574066679118909e-06, + "loss": 0.75285184, + "num_input_tokens_seen": 84240455, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.26062012, + "step": 3913, + "time_per_iteration": 2.892909526824951 + }, + { + "auxiliary_loss_clip": 0.01531442, + "auxiliary_loss_mlp": 0.01048515, + "balance_loss_clip": 1.32210803, + "balance_loss_mlp": 1.02221727, + "epoch": 0.23532241094243198, + "flos": 23195589033600.0, + "grad_norm": 2.645132650676352, + "language_loss": 0.76532364, + "learning_rate": 3.57382638628884e-06, + "loss": 0.79112321, + "num_input_tokens_seen": 84261605, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.26293945, + "step": 3914, + "time_per_iteration": 2.9661648273468018 + }, + { + "auxiliary_loss_clip": 0.01521835, + "auxiliary_loss_mlp": 0.01046195, + "balance_loss_clip": 1.31599402, + "balance_loss_mlp": 1.02050579, + "epoch": 0.23538253419509997, + "flos": 17028142289280.0, + "grad_norm": 3.6052963570970307, + "language_loss": 0.9037503, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.9294306, + "num_input_tokens_seen": 84278675, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.25695801, + "step": 3915, + "time_per_iteration": 2.8724114894866943 + }, + { + "auxiliary_loss_clip": 0.01306756, + "auxiliary_loss_mlp": 0.01030756, + "balance_loss_clip": 1.18833947, + "balance_loss_mlp": 1.00872612, + "epoch": 0.23544265744776793, + "flos": 63474483047040.0, + "grad_norm": 0.8161060406184484, + "language_loss": 0.59429026, + "learning_rate": 3.573345621598854e-06, + "loss": 0.61766535, + "num_input_tokens_seen": 84329765, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.22070312, + "step": 3916, + "time_per_iteration": 3.3638880252838135 + }, + { + "auxiliary_loss_clip": 0.0130461, + "auxiliary_loss_mlp": 0.01047193, + "balance_loss_clip": 1.18631697, + "balance_loss_mlp": 1.02707005, + "epoch": 0.2355027807004359, + "flos": 70549561342080.0, + "grad_norm": 0.8017934055820477, + "language_loss": 0.49514854, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.51866657, + "num_input_tokens_seen": 84393680, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.20117188, + "step": 3917, + "time_per_iteration": 3.39274263381958 + }, + { + "auxiliary_loss_clip": 0.01533794, + "auxiliary_loss_mlp": 0.01053236, + "balance_loss_clip": 1.32387567, + "balance_loss_mlp": 1.02585363, + "epoch": 0.23556290395310386, + "flos": 21444302079360.0, + "grad_norm": 2.2310418226901993, + "language_loss": 0.7706787, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.79654908, + "num_input_tokens_seen": 84412640, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.27380371, + "step": 3918, + "time_per_iteration": 2.9718573093414307 + }, + { + "auxiliary_loss_clip": 0.01525352, + "auxiliary_loss_mlp": 0.01047833, + "balance_loss_clip": 1.31588542, + "balance_loss_mlp": 1.0208801, + "epoch": 0.23562302720577183, + "flos": 18195425619840.0, + "grad_norm": 1.7888915722780365, + "language_loss": 0.70243579, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.72816765, + "num_input_tokens_seen": 84431605, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.26940918, + "step": 3919, + "time_per_iteration": 2.9170117378234863 + }, + { + "auxiliary_loss_clip": 0.01505012, + "auxiliary_loss_mlp": 0.01048823, + "balance_loss_clip": 1.30373693, + "balance_loss_mlp": 1.02315772, + "epoch": 0.2356831504584398, + "flos": 33743832117120.0, + "grad_norm": 1.6909528770438067, + "language_loss": 0.70612615, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.73166454, + "num_input_tokens_seen": 84454210, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.25708008, + "step": 3920, + "time_per_iteration": 3.037109851837158 + }, + { + "auxiliary_loss_clip": 0.01524089, + "auxiliary_loss_mlp": 0.01051416, + "balance_loss_clip": 1.3197844, + "balance_loss_mlp": 1.02575004, + "epoch": 0.23574327371110776, + "flos": 24942622976640.0, + "grad_norm": 1.6166843749479267, + "language_loss": 0.77853066, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.80428571, + "num_input_tokens_seen": 84475540, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.25695801, + "step": 3921, + "time_per_iteration": 2.9396636486053467 + }, + { + "auxiliary_loss_clip": 0.01517882, + "auxiliary_loss_mlp": 0.01044224, + "balance_loss_clip": 1.31237209, + "balance_loss_mlp": 1.01762807, + "epoch": 0.23580339696377575, + "flos": 17831158583040.0, + "grad_norm": 2.1594104470764117, + "language_loss": 0.76845366, + "learning_rate": 3.571901895946612e-06, + "loss": 0.79407471, + "num_input_tokens_seen": 84494580, + "router_z_loss_clip": 2.05566406, + "router_z_loss_mlp": 0.26599121, + "step": 3922, + "time_per_iteration": 4.266889333724976 + }, + { + "auxiliary_loss_clip": 0.01510646, + "auxiliary_loss_mlp": 0.01045753, + "balance_loss_clip": 1.30670297, + "balance_loss_mlp": 1.02161312, + "epoch": 0.23586352021644372, + "flos": 26297808537600.0, + "grad_norm": 1.9644969624818611, + "language_loss": 0.81235725, + "learning_rate": 3.571661066327956e-06, + "loss": 0.83792126, + "num_input_tokens_seen": 84513850, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.24157715, + "step": 3923, + "time_per_iteration": 2.944692611694336 + }, + { + "auxiliary_loss_clip": 0.01522672, + "auxiliary_loss_mlp": 0.01048711, + "balance_loss_clip": 1.31808555, + "balance_loss_mlp": 1.02203214, + "epoch": 0.23592364346911168, + "flos": 14254871616000.0, + "grad_norm": 1.7653891236082997, + "language_loss": 0.7501303, + "learning_rate": 3.571420177111754e-06, + "loss": 0.7758441, + "num_input_tokens_seen": 84532315, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.26672363, + "step": 3924, + "time_per_iteration": 3.051602602005005 + }, + { + "auxiliary_loss_clip": 0.01523703, + "auxiliary_loss_mlp": 0.01051804, + "balance_loss_clip": 1.31932521, + "balance_loss_mlp": 1.02590001, + "epoch": 0.23598376672177965, + "flos": 18597092123520.0, + "grad_norm": 2.173866935397387, + "language_loss": 0.83098733, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.85674244, + "num_input_tokens_seen": 84550970, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.25915527, + "step": 3925, + "time_per_iteration": 2.975119113922119 + }, + { + "auxiliary_loss_clip": 0.01525763, + "auxiliary_loss_mlp": 0.01053814, + "balance_loss_clip": 1.31665826, + "balance_loss_mlp": 1.02699196, + "epoch": 0.2360438899744476, + "flos": 22686067630080.0, + "grad_norm": 2.076292465817274, + "language_loss": 0.60473275, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.63052857, + "num_input_tokens_seen": 84571655, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.26831055, + "step": 3926, + "time_per_iteration": 2.9609193801879883 + }, + { + "auxiliary_loss_clip": 0.01496723, + "auxiliary_loss_mlp": 0.01046206, + "balance_loss_clip": 1.29562259, + "balance_loss_mlp": 1.02126789, + "epoch": 0.23610401322711558, + "flos": 29582138937600.0, + "grad_norm": 2.012968638031437, + "language_loss": 0.72763968, + "learning_rate": 3.570697151969235e-06, + "loss": 0.75306892, + "num_input_tokens_seen": 84593130, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.24938965, + "step": 3927, + "time_per_iteration": 3.014434576034546 + }, + { + "auxiliary_loss_clip": 0.01504857, + "auxiliary_loss_mlp": 0.01046519, + "balance_loss_clip": 1.30192327, + "balance_loss_mlp": 1.02172327, + "epoch": 0.23616413647978354, + "flos": 17867562664320.0, + "grad_norm": 1.69707312890078, + "language_loss": 0.75539434, + "learning_rate": 3.570456024454221e-06, + "loss": 0.78090811, + "num_input_tokens_seen": 84612410, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.24816895, + "step": 3928, + "time_per_iteration": 2.8798787593841553 + }, + { + "auxiliary_loss_clip": 0.01530552, + "auxiliary_loss_mlp": 0.01054589, + "balance_loss_clip": 1.32254958, + "balance_loss_mlp": 1.02740908, + "epoch": 0.23622425973245154, + "flos": 11041901544960.0, + "grad_norm": 2.806922488238597, + "language_loss": 0.83738416, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.86323559, + "num_input_tokens_seen": 84627610, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.27160645, + "step": 3929, + "time_per_iteration": 2.860170364379883 + }, + { + "auxiliary_loss_clip": 0.01547299, + "auxiliary_loss_mlp": 0.01053624, + "balance_loss_clip": 1.3337816, + "balance_loss_mlp": 1.02490604, + "epoch": 0.2362843829851195, + "flos": 23414465969280.0, + "grad_norm": 2.3950859696548443, + "language_loss": 0.72985625, + "learning_rate": 3.569973590777789e-06, + "loss": 0.75586545, + "num_input_tokens_seen": 84648415, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.28723145, + "step": 3930, + "time_per_iteration": 4.292843818664551 + }, + { + "auxiliary_loss_clip": 0.0151579, + "auxiliary_loss_mlp": 0.0104374, + "balance_loss_clip": 1.31145334, + "balance_loss_mlp": 1.01708496, + "epoch": 0.23634450623778747, + "flos": 39544206647040.0, + "grad_norm": 1.804919467961483, + "language_loss": 0.7524094, + "learning_rate": 3.569732284634665e-06, + "loss": 0.77800471, + "num_input_tokens_seen": 84670080, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.26672363, + "step": 3931, + "time_per_iteration": 3.0175082683563232 + }, + { + "auxiliary_loss_clip": 0.0152675, + "auxiliary_loss_mlp": 0.01044326, + "balance_loss_clip": 1.32015824, + "balance_loss_mlp": 1.01758766, + "epoch": 0.23640462949045543, + "flos": 24217889466240.0, + "grad_norm": 2.0665505112122884, + "language_loss": 0.80694991, + "learning_rate": 3.569490918967136e-06, + "loss": 0.83266068, + "num_input_tokens_seen": 84686465, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.26733398, + "step": 3932, + "time_per_iteration": 4.3228371143341064 + }, + { + "auxiliary_loss_clip": 0.01506237, + "auxiliary_loss_mlp": 0.01039975, + "balance_loss_clip": 1.30519938, + "balance_loss_mlp": 1.01548982, + "epoch": 0.2364647527431234, + "flos": 26188913007360.0, + "grad_norm": 1.5131036586349695, + "language_loss": 0.87020159, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.89566374, + "num_input_tokens_seen": 84708825, + "router_z_loss_clip": 2.01269531, + "router_z_loss_mlp": 0.24462891, + "step": 3933, + "time_per_iteration": 4.3813183307647705 + }, + { + "auxiliary_loss_clip": 0.01529288, + "auxiliary_loss_mlp": 0.01046306, + "balance_loss_clip": 1.32021904, + "balance_loss_mlp": 1.0196979, + "epoch": 0.23652487599579136, + "flos": 22646994105600.0, + "grad_norm": 5.146381076287382, + "language_loss": 0.84030402, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.8660599, + "num_input_tokens_seen": 84726165, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.26647949, + "step": 3934, + "time_per_iteration": 2.867572784423828 + }, + { + "auxiliary_loss_clip": 0.01524097, + "auxiliary_loss_mlp": 0.01050698, + "balance_loss_clip": 1.31749368, + "balance_loss_mlp": 1.02368557, + "epoch": 0.23658499924845935, + "flos": 21772255524480.0, + "grad_norm": 1.548105574142038, + "language_loss": 0.79836065, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.8241086, + "num_input_tokens_seen": 84745815, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.27050781, + "step": 3935, + "time_per_iteration": 2.88531494140625 + }, + { + "auxiliary_loss_clip": 0.01496948, + "auxiliary_loss_mlp": 0.01044236, + "balance_loss_clip": 1.29628265, + "balance_loss_mlp": 1.01858211, + "epoch": 0.23664512250112732, + "flos": 21809066808960.0, + "grad_norm": 2.0576660674508007, + "language_loss": 0.80937278, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.83478463, + "num_input_tokens_seen": 84765415, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.25683594, + "step": 3936, + "time_per_iteration": 2.902045965194702 + }, + { + "auxiliary_loss_clip": 0.01509063, + "auxiliary_loss_mlp": 0.01046653, + "balance_loss_clip": 1.3055898, + "balance_loss_mlp": 1.02024841, + "epoch": 0.23670524575379528, + "flos": 22648079980800.0, + "grad_norm": 1.4812020059566067, + "language_loss": 0.79734623, + "learning_rate": 3.568283198083826e-06, + "loss": 0.82290339, + "num_input_tokens_seen": 84787080, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.26428223, + "step": 3937, + "time_per_iteration": 2.90594482421875 + }, + { + "auxiliary_loss_clip": 0.01515817, + "auxiliary_loss_mlp": 0.01045846, + "balance_loss_clip": 1.31608665, + "balance_loss_mlp": 1.02120554, + "epoch": 0.23676536900646325, + "flos": 16733109075840.0, + "grad_norm": 1.900343828252305, + "language_loss": 0.86501133, + "learning_rate": 3.568041475462147e-06, + "loss": 0.89062798, + "num_input_tokens_seen": 84805395, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.24658203, + "step": 3938, + "time_per_iteration": 2.838836908340454 + }, + { + "auxiliary_loss_clip": 0.01512248, + "auxiliary_loss_mlp": 0.01046738, + "balance_loss_clip": 1.31018317, + "balance_loss_mlp": 1.02090526, + "epoch": 0.23682549225913122, + "flos": 11140436016000.0, + "grad_norm": 2.420949367332862, + "language_loss": 0.95157403, + "learning_rate": 3.5677996933801785e-06, + "loss": 0.97716391, + "num_input_tokens_seen": 84818090, + "router_z_loss_clip": 2.02148438, + "router_z_loss_mlp": 0.25830078, + "step": 3939, + "time_per_iteration": 2.8480775356292725 + }, + { + "auxiliary_loss_clip": 0.01526239, + "auxiliary_loss_mlp": 0.01042682, + "balance_loss_clip": 1.31889963, + "balance_loss_mlp": 1.01680136, + "epoch": 0.23688561551179918, + "flos": 22567942160640.0, + "grad_norm": 2.01045704807033, + "language_loss": 0.8294906, + "learning_rate": 3.567557851847088e-06, + "loss": 0.85517979, + "num_input_tokens_seen": 84837695, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.25891113, + "step": 3940, + "time_per_iteration": 2.8664603233337402 + }, + { + "auxiliary_loss_clip": 0.01535542, + "auxiliary_loss_mlp": 0.01051121, + "balance_loss_clip": 1.32186103, + "balance_loss_mlp": 1.02370286, + "epoch": 0.23694573876446715, + "flos": 18524012492160.0, + "grad_norm": 2.115666496689497, + "language_loss": 0.90610313, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.93196982, + "num_input_tokens_seen": 84854630, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.27404785, + "step": 3941, + "time_per_iteration": 2.824141502380371 + }, + { + "auxiliary_loss_clip": 0.01534211, + "auxiliary_loss_mlp": 0.01054373, + "balance_loss_clip": 1.32313204, + "balance_loss_mlp": 1.02812362, + "epoch": 0.23700586201713514, + "flos": 15342741043200.0, + "grad_norm": 2.31504447821371, + "language_loss": 0.85485953, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.88074541, + "num_input_tokens_seen": 84871805, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.26257324, + "step": 3942, + "time_per_iteration": 2.820502519607544 + }, + { + "auxiliary_loss_clip": 0.0153715, + "auxiliary_loss_mlp": 0.01051893, + "balance_loss_clip": 1.32605267, + "balance_loss_mlp": 1.02452302, + "epoch": 0.2370659852698031, + "flos": 23957179073280.0, + "grad_norm": 1.7431627216912864, + "language_loss": 0.81624353, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.842134, + "num_input_tokens_seen": 84889815, + "router_z_loss_clip": 2.11425781, + "router_z_loss_mlp": 0.27331543, + "step": 3943, + "time_per_iteration": 2.899634599685669 + }, + { + "auxiliary_loss_clip": 0.01545779, + "auxiliary_loss_mlp": 0.01042762, + "balance_loss_clip": 1.33030927, + "balance_loss_mlp": 1.01713252, + "epoch": 0.23712610852247107, + "flos": 15338714256000.0, + "grad_norm": 2.2959552895159714, + "language_loss": 0.69467711, + "learning_rate": 3.566589891386959e-06, + "loss": 0.72056252, + "num_input_tokens_seen": 84904380, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.25634766, + "step": 3944, + "time_per_iteration": 2.823150157928467 + }, + { + "auxiliary_loss_clip": 0.01537529, + "auxiliary_loss_mlp": 0.01050029, + "balance_loss_clip": 1.32729316, + "balance_loss_mlp": 1.02342129, + "epoch": 0.23718623177513903, + "flos": 19692019739520.0, + "grad_norm": 1.8703510367380072, + "language_loss": 0.76936924, + "learning_rate": 3.566347752735866e-06, + "loss": 0.79524487, + "num_input_tokens_seen": 84922935, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.26611328, + "step": 3945, + "time_per_iteration": 2.8503623008728027 + }, + { + "auxiliary_loss_clip": 0.01546571, + "auxiliary_loss_mlp": 0.01042914, + "balance_loss_clip": 1.33678102, + "balance_loss_mlp": 1.01833344, + "epoch": 0.237246355027807, + "flos": 24984230209920.0, + "grad_norm": 1.4537830073880582, + "language_loss": 0.65083402, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.67672884, + "num_input_tokens_seen": 84943685, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.24597168, + "step": 3946, + "time_per_iteration": 2.9498884677886963 + }, + { + "auxiliary_loss_clip": 0.01538533, + "auxiliary_loss_mlp": 0.01043687, + "balance_loss_clip": 1.3293196, + "balance_loss_mlp": 1.01738906, + "epoch": 0.23730647828047496, + "flos": 15385796110080.0, + "grad_norm": 2.204825824296122, + "language_loss": 0.77855986, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.80438209, + "num_input_tokens_seen": 84959505, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.26330566, + "step": 3947, + "time_per_iteration": 2.8044888973236084 + }, + { + "auxiliary_loss_clip": 0.0154898, + "auxiliary_loss_mlp": 0.01044658, + "balance_loss_clip": 1.33813, + "balance_loss_mlp": 1.01973116, + "epoch": 0.23736660153314296, + "flos": 28163194174080.0, + "grad_norm": 1.5797423680134133, + "language_loss": 0.81444597, + "learning_rate": 3.565620980442944e-06, + "loss": 0.8403824, + "num_input_tokens_seen": 84982130, + "router_z_loss_clip": 2.10839844, + "router_z_loss_mlp": 0.24926758, + "step": 3948, + "time_per_iteration": 2.944415330886841 + }, + { + "auxiliary_loss_clip": 0.01550657, + "auxiliary_loss_mlp": 0.01047706, + "balance_loss_clip": 1.33860159, + "balance_loss_mlp": 1.02133703, + "epoch": 0.23742672478581092, + "flos": 22095594000000.0, + "grad_norm": 1.7664978038736934, + "language_loss": 0.81595725, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.84194082, + "num_input_tokens_seen": 85000640, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.26367188, + "step": 3949, + "time_per_iteration": 2.8414676189422607 + }, + { + "auxiliary_loss_clip": 0.01551433, + "auxiliary_loss_mlp": 0.01046507, + "balance_loss_clip": 1.33843923, + "balance_loss_mlp": 1.02072167, + "epoch": 0.2374868480384789, + "flos": 19546629638400.0, + "grad_norm": 2.0594572906467827, + "language_loss": 0.74260879, + "learning_rate": 3.565136168723163e-06, + "loss": 0.76858819, + "num_input_tokens_seen": 85018970, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.25793457, + "step": 3950, + "time_per_iteration": 2.869485378265381 + }, + { + "auxiliary_loss_clip": 0.01534121, + "auxiliary_loss_mlp": 0.01043098, + "balance_loss_clip": 1.32555771, + "balance_loss_mlp": 1.01917243, + "epoch": 0.23754697129114685, + "flos": 19430675919360.0, + "grad_norm": 1.9937069474101101, + "language_loss": 0.7342304, + "learning_rate": 3.564893673833495e-06, + "loss": 0.76000249, + "num_input_tokens_seen": 85035905, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.23950195, + "step": 3951, + "time_per_iteration": 2.8497581481933594 + }, + { + "auxiliary_loss_clip": 0.01560433, + "auxiliary_loss_mlp": 0.01047814, + "balance_loss_clip": 1.34930682, + "balance_loss_mlp": 1.02102768, + "epoch": 0.23760709454381482, + "flos": 19510632760320.0, + "grad_norm": 1.9246186225149102, + "language_loss": 0.74813879, + "learning_rate": 3.564651119602903e-06, + "loss": 0.77422118, + "num_input_tokens_seen": 85054560, + "router_z_loss_clip": 2.11425781, + "router_z_loss_mlp": 0.26794434, + "step": 3952, + "time_per_iteration": 2.83380126953125 + }, + { + "auxiliary_loss_clip": 0.01561434, + "auxiliary_loss_mlp": 0.01048419, + "balance_loss_clip": 1.34831405, + "balance_loss_mlp": 1.02274096, + "epoch": 0.23766721779648278, + "flos": 27648152904960.0, + "grad_norm": 1.594603718909292, + "language_loss": 0.71831346, + "learning_rate": 3.564408506040583e-06, + "loss": 0.74441195, + "num_input_tokens_seen": 85074425, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.25683594, + "step": 3953, + "time_per_iteration": 2.9337158203125 + }, + { + "auxiliary_loss_clip": 0.01555479, + "auxiliary_loss_mlp": 0.01049008, + "balance_loss_clip": 1.34030294, + "balance_loss_mlp": 1.02212584, + "epoch": 0.23772734104915075, + "flos": 23414827927680.0, + "grad_norm": 1.7963930911697839, + "language_loss": 0.82804978, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.85409462, + "num_input_tokens_seen": 85092865, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.26904297, + "step": 3954, + "time_per_iteration": 2.9956421852111816 + }, + { + "auxiliary_loss_clip": 0.01551672, + "auxiliary_loss_mlp": 0.01048966, + "balance_loss_clip": 1.34014988, + "balance_loss_mlp": 1.02257252, + "epoch": 0.23778746430181874, + "flos": 15713885289600.0, + "grad_norm": 2.092371956123049, + "language_loss": 0.67076349, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.69676989, + "num_input_tokens_seen": 85110175, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.26416016, + "step": 3955, + "time_per_iteration": 2.897327423095703 + }, + { + "auxiliary_loss_clip": 0.01544539, + "auxiliary_loss_mlp": 0.01046572, + "balance_loss_clip": 1.33564019, + "balance_loss_mlp": 1.02147841, + "epoch": 0.2378475875544867, + "flos": 19435652847360.0, + "grad_norm": 1.4244923638869071, + "language_loss": 0.84607583, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.87198693, + "num_input_tokens_seen": 85129925, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.25097656, + "step": 3956, + "time_per_iteration": 2.9360969066619873 + }, + { + "auxiliary_loss_clip": 0.01537056, + "auxiliary_loss_mlp": 0.01043131, + "balance_loss_clip": 1.32904935, + "balance_loss_mlp": 1.01746535, + "epoch": 0.23790771080715467, + "flos": 22277523916800.0, + "grad_norm": 2.0241728108383006, + "language_loss": 0.86191249, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.88771439, + "num_input_tokens_seen": 85147755, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.25683594, + "step": 3957, + "time_per_iteration": 4.318829774856567 + }, + { + "auxiliary_loss_clip": 0.01537035, + "auxiliary_loss_mlp": 0.01044214, + "balance_loss_clip": 1.326092, + "balance_loss_mlp": 1.01898956, + "epoch": 0.23796783405982264, + "flos": 20056829713920.0, + "grad_norm": 1.8770273319911206, + "language_loss": 0.71343458, + "learning_rate": 3.563194548575151e-06, + "loss": 0.73924708, + "num_input_tokens_seen": 85165270, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.25244141, + "step": 3958, + "time_per_iteration": 2.8778750896453857 + }, + { + "auxiliary_loss_clip": 0.01545847, + "auxiliary_loss_mlp": 0.01043699, + "balance_loss_clip": 1.3342644, + "balance_loss_mlp": 1.01773477, + "epoch": 0.2380279573124906, + "flos": 14253921475200.0, + "grad_norm": 2.4743879552785883, + "language_loss": 0.67419732, + "learning_rate": 3.562951579215745e-06, + "loss": 0.70009279, + "num_input_tokens_seen": 85181555, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.25964355, + "step": 3959, + "time_per_iteration": 2.8004953861236572 + }, + { + "auxiliary_loss_clip": 0.01544644, + "auxiliary_loss_mlp": 0.01042274, + "balance_loss_clip": 1.33655643, + "balance_loss_mlp": 1.01788402, + "epoch": 0.23808808056515857, + "flos": 21189202041600.0, + "grad_norm": 1.742023749824331, + "language_loss": 0.73315847, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.75902772, + "num_input_tokens_seen": 85199455, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.24389648, + "step": 3960, + "time_per_iteration": 2.849541425704956 + }, + { + "auxiliary_loss_clip": 0.01543037, + "auxiliary_loss_mlp": 0.01042553, + "balance_loss_clip": 1.3329699, + "balance_loss_mlp": 1.01619625, + "epoch": 0.23814820381782653, + "flos": 22538460533760.0, + "grad_norm": 1.7519187691688702, + "language_loss": 0.75916147, + "learning_rate": 3.562465462704307e-06, + "loss": 0.78501737, + "num_input_tokens_seen": 85219170, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.26367188, + "step": 3961, + "time_per_iteration": 2.8578577041625977 + }, + { + "auxiliary_loss_clip": 0.01553307, + "auxiliary_loss_mlp": 0.01048509, + "balance_loss_clip": 1.33975244, + "balance_loss_mlp": 1.02023244, + "epoch": 0.23820832707049452, + "flos": 22313113591680.0, + "grad_norm": 2.182549379608525, + "language_loss": 0.66652668, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.69254482, + "num_input_tokens_seen": 85238480, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.28295898, + "step": 3962, + "time_per_iteration": 2.8278043270111084 + }, + { + "auxiliary_loss_clip": 0.01539144, + "auxiliary_loss_mlp": 0.01047168, + "balance_loss_clip": 1.32896233, + "balance_loss_mlp": 1.02248013, + "epoch": 0.2382684503231625, + "flos": 24875198945280.0, + "grad_norm": 1.9090306286048286, + "language_loss": 0.74962616, + "learning_rate": 3.561979109197483e-06, + "loss": 0.77548927, + "num_input_tokens_seen": 85259180, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.24694824, + "step": 3963, + "time_per_iteration": 2.8934361934661865 + }, + { + "auxiliary_loss_clip": 0.01558306, + "auxiliary_loss_mlp": 0.01036869, + "balance_loss_clip": 1.34373379, + "balance_loss_mlp": 1.01145375, + "epoch": 0.23832857357583045, + "flos": 21881603502720.0, + "grad_norm": 1.6912593943241845, + "language_loss": 0.78495979, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.8109116, + "num_input_tokens_seen": 85278550, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.25415039, + "step": 3964, + "time_per_iteration": 2.8404552936553955 + }, + { + "auxiliary_loss_clip": 0.01539181, + "auxiliary_loss_mlp": 0.01045653, + "balance_loss_clip": 1.33210993, + "balance_loss_mlp": 1.02096438, + "epoch": 0.23838869682849842, + "flos": 21297961837440.0, + "grad_norm": 2.2331401482797557, + "language_loss": 0.73290926, + "learning_rate": 3.561492518769045e-06, + "loss": 0.75875753, + "num_input_tokens_seen": 85297345, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.24719238, + "step": 3965, + "time_per_iteration": 4.219902992248535 + }, + { + "auxiliary_loss_clip": 0.01541221, + "auxiliary_loss_mlp": 0.01047154, + "balance_loss_clip": 1.33383346, + "balance_loss_mlp": 1.02248943, + "epoch": 0.23844882008116638, + "flos": 16189310096640.0, + "grad_norm": 1.7703392915046345, + "language_loss": 0.79375303, + "learning_rate": 3.561249134732282e-06, + "loss": 0.81963682, + "num_input_tokens_seen": 85315105, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.24633789, + "step": 3966, + "time_per_iteration": 2.8615241050720215 + }, + { + "auxiliary_loss_clip": 0.01556463, + "auxiliary_loss_mlp": 0.0104511, + "balance_loss_clip": 1.34637749, + "balance_loss_mlp": 1.02044594, + "epoch": 0.23850894333383435, + "flos": 21079718328960.0, + "grad_norm": 1.6226753905816218, + "language_loss": 0.69612134, + "learning_rate": 3.561005691492797e-06, + "loss": 0.72213709, + "num_input_tokens_seen": 85334735, + "router_z_loss_clip": 2.10253906, + "router_z_loss_mlp": 0.24682617, + "step": 3967, + "time_per_iteration": 4.395959377288818 + }, + { + "auxiliary_loss_clip": 0.01555515, + "auxiliary_loss_mlp": 0.01054256, + "balance_loss_clip": 1.34432912, + "balance_loss_mlp": 1.02963972, + "epoch": 0.23856906658650234, + "flos": 17210434164480.0, + "grad_norm": 2.1114591538507073, + "language_loss": 0.69540739, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.72150505, + "num_input_tokens_seen": 85352875, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.24645996, + "step": 3968, + "time_per_iteration": 4.234614372253418 + }, + { + "auxiliary_loss_clip": 0.01543077, + "auxiliary_loss_mlp": 0.01052968, + "balance_loss_clip": 1.3331995, + "balance_loss_mlp": 1.02768397, + "epoch": 0.2386291898391703, + "flos": 29505439722240.0, + "grad_norm": 6.56384161639236, + "language_loss": 0.77630305, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.80226344, + "num_input_tokens_seen": 85372205, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.2532959, + "step": 3969, + "time_per_iteration": 2.91475248336792 + }, + { + "auxiliary_loss_clip": 0.01557475, + "auxiliary_loss_mlp": 0.01043001, + "balance_loss_clip": 1.35054588, + "balance_loss_mlp": 1.01847959, + "epoch": 0.23868931309183827, + "flos": 21152390757120.0, + "grad_norm": 1.9853770457409146, + "language_loss": 0.77703917, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.80304396, + "num_input_tokens_seen": 85389705, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.2454834, + "step": 3970, + "time_per_iteration": 2.8673665523529053 + }, + { + "auxiliary_loss_clip": 0.01554926, + "auxiliary_loss_mlp": 0.01047218, + "balance_loss_clip": 1.34227157, + "balance_loss_mlp": 1.02270818, + "epoch": 0.23874943634450624, + "flos": 25668623341440.0, + "grad_norm": 2.5531048161158774, + "language_loss": 0.85936046, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.88538188, + "num_input_tokens_seen": 85407855, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.24511719, + "step": 3971, + "time_per_iteration": 2.8758463859558105 + }, + { + "auxiliary_loss_clip": 0.01278858, + "auxiliary_loss_mlp": 0.0105489, + "balance_loss_clip": 1.1657629, + "balance_loss_mlp": 1.036484, + "epoch": 0.2388095595971742, + "flos": 59018046105600.0, + "grad_norm": 0.7573690089249212, + "language_loss": 0.62818253, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.65152001, + "num_input_tokens_seen": 85470885, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.18359375, + "step": 3972, + "time_per_iteration": 3.4521329402923584 + }, + { + "auxiliary_loss_clip": 0.01559966, + "auxiliary_loss_mlp": 0.01039806, + "balance_loss_clip": 1.34985948, + "balance_loss_mlp": 1.01545143, + "epoch": 0.23886968284984217, + "flos": 16809084374400.0, + "grad_norm": 2.2145282374838744, + "language_loss": 0.82892931, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.85492706, + "num_input_tokens_seen": 85488460, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.24353027, + "step": 3973, + "time_per_iteration": 2.8343052864074707 + }, + { + "auxiliary_loss_clip": 0.0154603, + "auxiliary_loss_mlp": 0.01052576, + "balance_loss_clip": 1.33825254, + "balance_loss_mlp": 1.02778006, + "epoch": 0.23892980610251013, + "flos": 22392844208640.0, + "grad_norm": 1.532413655494669, + "language_loss": 0.80047673, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.82646275, + "num_input_tokens_seen": 85508590, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.24804688, + "step": 3974, + "time_per_iteration": 2.84763503074646 + }, + { + "auxiliary_loss_clip": 0.01563468, + "auxiliary_loss_mlp": 0.01050878, + "balance_loss_clip": 1.35205662, + "balance_loss_mlp": 1.02537942, + "epoch": 0.23898992935517813, + "flos": 12830859434880.0, + "grad_norm": 1.9499612186035467, + "language_loss": 0.85732222, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.88346571, + "num_input_tokens_seen": 85525970, + "router_z_loss_clip": 2.11425781, + "router_z_loss_mlp": 0.25463867, + "step": 3975, + "time_per_iteration": 2.8622958660125732 + }, + { + "auxiliary_loss_clip": 0.01543374, + "auxiliary_loss_mlp": 0.01042008, + "balance_loss_clip": 1.33566916, + "balance_loss_mlp": 1.0188216, + "epoch": 0.2390500526078461, + "flos": 22355716210560.0, + "grad_norm": 2.282040057258212, + "language_loss": 0.84178734, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.86764109, + "num_input_tokens_seen": 85543700, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.23168945, + "step": 3976, + "time_per_iteration": 2.8790998458862305 + }, + { + "auxiliary_loss_clip": 0.01534009, + "auxiliary_loss_mlp": 0.01042624, + "balance_loss_clip": 1.329584, + "balance_loss_mlp": 1.02017677, + "epoch": 0.23911017586051406, + "flos": 22644641376000.0, + "grad_norm": 1.8535276895342452, + "language_loss": 0.75277513, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.77854145, + "num_input_tokens_seen": 85562765, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.2244873, + "step": 3977, + "time_per_iteration": 2.872523546218872 + }, + { + "auxiliary_loss_clip": 0.01536477, + "auxiliary_loss_mlp": 0.0104763, + "balance_loss_clip": 1.32876635, + "balance_loss_mlp": 1.0235734, + "epoch": 0.23917029911318202, + "flos": 23662145859840.0, + "grad_norm": 1.6669788033809092, + "language_loss": 0.72807872, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.75391978, + "num_input_tokens_seen": 85581755, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.24060059, + "step": 3978, + "time_per_iteration": 2.8388442993164062 + }, + { + "auxiliary_loss_clip": 0.01564621, + "auxiliary_loss_mlp": 0.01052795, + "balance_loss_clip": 1.35112047, + "balance_loss_mlp": 1.02844048, + "epoch": 0.23923042236585, + "flos": 22793786795520.0, + "grad_norm": 2.05705396466009, + "language_loss": 0.79720128, + "learning_rate": 3.558079758168997e-06, + "loss": 0.82337546, + "num_input_tokens_seen": 85599455, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.24365234, + "step": 3979, + "time_per_iteration": 2.824420690536499 + }, + { + "auxiliary_loss_clip": 0.01543135, + "auxiliary_loss_mlp": 0.01050839, + "balance_loss_clip": 1.3367269, + "balance_loss_mlp": 1.02611494, + "epoch": 0.23929054561851795, + "flos": 28159484100480.0, + "grad_norm": 1.8326109414324316, + "language_loss": 0.82766134, + "learning_rate": 3.557835546134977e-06, + "loss": 0.8536011, + "num_input_tokens_seen": 85619970, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.24731445, + "step": 3980, + "time_per_iteration": 2.884382724761963 + }, + { + "auxiliary_loss_clip": 0.01536719, + "auxiliary_loss_mlp": 0.01047602, + "balance_loss_clip": 1.33072937, + "balance_loss_mlp": 1.02458239, + "epoch": 0.23935066887118592, + "flos": 21695782533120.0, + "grad_norm": 1.6902024244381624, + "language_loss": 0.84396827, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.86981153, + "num_input_tokens_seen": 85638850, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.23034668, + "step": 3981, + "time_per_iteration": 2.8399159908294678 + }, + { + "auxiliary_loss_clip": 0.01560133, + "auxiliary_loss_mlp": 0.01051556, + "balance_loss_clip": 1.34734535, + "balance_loss_mlp": 1.02629602, + "epoch": 0.2394107921238539, + "flos": 32134451414400.0, + "grad_norm": 1.9458902225879464, + "language_loss": 0.779055, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.80517197, + "num_input_tokens_seen": 85656285, + "router_z_loss_clip": 2.12597656, + "router_z_loss_mlp": 0.25268555, + "step": 3982, + "time_per_iteration": 2.907567024230957 + }, + { + "auxiliary_loss_clip": 0.01535111, + "auxiliary_loss_mlp": 0.01053673, + "balance_loss_clip": 1.32864189, + "balance_loss_mlp": 1.03089178, + "epoch": 0.23947091537652188, + "flos": 17027961310080.0, + "grad_norm": 1.8724007398630402, + "language_loss": 0.78483593, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.81072378, + "num_input_tokens_seen": 85673020, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.2277832, + "step": 3983, + "time_per_iteration": 2.8125290870666504 + }, + { + "auxiliary_loss_clip": 0.01531507, + "auxiliary_loss_mlp": 0.01054416, + "balance_loss_clip": 1.32427561, + "balance_loss_mlp": 1.030586, + "epoch": 0.23953103862918984, + "flos": 20602981422720.0, + "grad_norm": 4.237696213511316, + "language_loss": 0.74380469, + "learning_rate": 3.556858107358737e-06, + "loss": 0.76966393, + "num_input_tokens_seen": 85692565, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.23828125, + "step": 3984, + "time_per_iteration": 2.903290033340454 + }, + { + "auxiliary_loss_clip": 0.01538796, + "auxiliary_loss_mlp": 0.01056329, + "balance_loss_clip": 1.3296982, + "balance_loss_mlp": 1.03247535, + "epoch": 0.2395911618818578, + "flos": 20714184437760.0, + "grad_norm": 2.0649574702825637, + "language_loss": 0.79572308, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.82167435, + "num_input_tokens_seen": 85709730, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.23864746, + "step": 3985, + "time_per_iteration": 2.8235702514648438 + }, + { + "auxiliary_loss_clip": 0.01540894, + "auxiliary_loss_mlp": 0.01057544, + "balance_loss_clip": 1.33374977, + "balance_loss_mlp": 1.0325458, + "epoch": 0.23965128513452577, + "flos": 27065099422080.0, + "grad_norm": 2.033189932205901, + "language_loss": 0.748052, + "learning_rate": 3.556369033716254e-06, + "loss": 0.77403641, + "num_input_tokens_seen": 85730045, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.25024414, + "step": 3986, + "time_per_iteration": 2.9143483638763428 + }, + { + "auxiliary_loss_clip": 0.01549494, + "auxiliary_loss_mlp": 0.01054651, + "balance_loss_clip": 1.335271, + "balance_loss_mlp": 1.03067815, + "epoch": 0.23971140838719374, + "flos": 23153529352320.0, + "grad_norm": 2.068527092619971, + "language_loss": 0.88230395, + "learning_rate": 3.556124408363871e-06, + "loss": 0.9083454, + "num_input_tokens_seen": 85747590, + "router_z_loss_clip": 2.14160156, + "router_z_loss_mlp": 0.23950195, + "step": 3987, + "time_per_iteration": 2.859154462814331 + }, + { + "auxiliary_loss_clip": 0.01518085, + "auxiliary_loss_mlp": 0.01046579, + "balance_loss_clip": 1.31889343, + "balance_loss_mlp": 1.02431095, + "epoch": 0.23977153163986173, + "flos": 18041981944320.0, + "grad_norm": 2.1375307163852737, + "language_loss": 0.84268945, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.86833608, + "num_input_tokens_seen": 85763460, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.22277832, + "step": 3988, + "time_per_iteration": 2.8060498237609863 + }, + { + "auxiliary_loss_clip": 0.01519512, + "auxiliary_loss_mlp": 0.01050452, + "balance_loss_clip": 1.31495762, + "balance_loss_mlp": 1.02581096, + "epoch": 0.2398316548925297, + "flos": 18122345988480.0, + "grad_norm": 2.7227803388797907, + "language_loss": 0.85829449, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.8839941, + "num_input_tokens_seen": 85782050, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.2467041, + "step": 3989, + "time_per_iteration": 2.8503901958465576 + }, + { + "auxiliary_loss_clip": 0.01512145, + "auxiliary_loss_mlp": 0.01045729, + "balance_loss_clip": 1.30960822, + "balance_loss_mlp": 1.02231622, + "epoch": 0.23989177814519766, + "flos": 12575306949120.0, + "grad_norm": 2.023636911443924, + "language_loss": 0.86251968, + "learning_rate": 3.555390178293477e-06, + "loss": 0.88809842, + "num_input_tokens_seen": 85797400, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.23413086, + "step": 3990, + "time_per_iteration": 2.8243470191955566 + }, + { + "auxiliary_loss_clip": 0.01532489, + "auxiliary_loss_mlp": 0.01049036, + "balance_loss_clip": 1.32638168, + "balance_loss_mlp": 1.0271014, + "epoch": 0.23995190139786562, + "flos": 25275417615360.0, + "grad_norm": 1.712671784107316, + "language_loss": 0.7729001, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.79871535, + "num_input_tokens_seen": 85818995, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.21936035, + "step": 3991, + "time_per_iteration": 2.930565118789673 + }, + { + "auxiliary_loss_clip": 0.01291292, + "auxiliary_loss_mlp": 0.01023174, + "balance_loss_clip": 1.17641592, + "balance_loss_mlp": 1.00772405, + "epoch": 0.2400120246505336, + "flos": 61987888028160.0, + "grad_norm": 0.8879148971818479, + "language_loss": 0.63918686, + "learning_rate": 3.554900396661656e-06, + "loss": 0.66233152, + "num_input_tokens_seen": 85876695, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.15429688, + "step": 3992, + "time_per_iteration": 4.721688508987427 + }, + { + "auxiliary_loss_clip": 0.01291246, + "auxiliary_loss_mlp": 0.01026016, + "balance_loss_clip": 1.17552853, + "balance_loss_mlp": 1.00722837, + "epoch": 0.24007214790320155, + "flos": 66738607004160.0, + "grad_norm": 0.7567890470030744, + "language_loss": 0.62995815, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.65313077, + "num_input_tokens_seen": 85940990, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.1875, + "step": 3993, + "time_per_iteration": 3.388216257095337 + }, + { + "auxiliary_loss_clip": 0.0155336, + "auxiliary_loss_mlp": 0.01047156, + "balance_loss_clip": 1.34361041, + "balance_loss_mlp": 1.02243197, + "epoch": 0.24013227115586952, + "flos": 25819307084160.0, + "grad_norm": 1.5419392633269222, + "language_loss": 0.78012401, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.8061291, + "num_input_tokens_seen": 85961165, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.24707031, + "step": 3994, + "time_per_iteration": 2.9148221015930176 + }, + { + "auxiliary_loss_clip": 0.01550853, + "auxiliary_loss_mlp": 0.01050264, + "balance_loss_clip": 1.34106588, + "balance_loss_mlp": 1.02515829, + "epoch": 0.2401923944085375, + "flos": 25568641036800.0, + "grad_norm": 2.0870410129169743, + "language_loss": 0.79295528, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.81896639, + "num_input_tokens_seen": 85982710, + "router_z_loss_clip": 2.09765625, + "router_z_loss_mlp": 0.2512207, + "step": 3995, + "time_per_iteration": 2.9044837951660156 + }, + { + "auxiliary_loss_clip": 0.01288486, + "auxiliary_loss_mlp": 0.01019374, + "balance_loss_clip": 1.17029428, + "balance_loss_mlp": 1.000301, + "epoch": 0.24025251766120548, + "flos": 54972668603520.0, + "grad_norm": 0.9113811358755965, + "language_loss": 0.63485277, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.65793133, + "num_input_tokens_seen": 86046935, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.19042969, + "step": 3996, + "time_per_iteration": 3.454390525817871 + }, + { + "auxiliary_loss_clip": 0.01555453, + "auxiliary_loss_mlp": 0.01046159, + "balance_loss_clip": 1.3443346, + "balance_loss_mlp": 1.02213788, + "epoch": 0.24031264091387344, + "flos": 20640742848000.0, + "grad_norm": 2.4698875964816254, + "language_loss": 0.70258915, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.72860521, + "num_input_tokens_seen": 86064355, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.2401123, + "step": 3997, + "time_per_iteration": 2.9914352893829346 + }, + { + "auxiliary_loss_clip": 0.01529168, + "auxiliary_loss_mlp": 0.01046879, + "balance_loss_clip": 1.32387996, + "balance_loss_mlp": 1.02294135, + "epoch": 0.2403727641665414, + "flos": 20895933375360.0, + "grad_norm": 2.13307114814015, + "language_loss": 0.87700909, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.90276957, + "num_input_tokens_seen": 86081340, + "router_z_loss_clip": 2.05371094, + "router_z_loss_mlp": 0.23962402, + "step": 3998, + "time_per_iteration": 2.8382608890533447 + }, + { + "auxiliary_loss_clip": 0.01559851, + "auxiliary_loss_mlp": 0.01044844, + "balance_loss_clip": 1.34476542, + "balance_loss_mlp": 1.01928544, + "epoch": 0.24043288741920937, + "flos": 22830236121600.0, + "grad_norm": 1.7185887107814417, + "language_loss": 0.76394033, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.78998733, + "num_input_tokens_seen": 86102260, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.25598145, + "step": 3999, + "time_per_iteration": 2.880476236343384 + }, + { + "auxiliary_loss_clip": 0.01529831, + "auxiliary_loss_mlp": 0.01043965, + "balance_loss_clip": 1.32556725, + "balance_loss_mlp": 1.01957428, + "epoch": 0.24049301067187734, + "flos": 27969907812480.0, + "grad_norm": 2.652143356394441, + "language_loss": 0.73558527, + "learning_rate": 3.552938912398679e-06, + "loss": 0.76132321, + "num_input_tokens_seen": 86123400, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.24414062, + "step": 4000, + "time_per_iteration": 4.264817237854004 + }, + { + "auxiliary_loss_clip": 0.01562282, + "auxiliary_loss_mlp": 0.01048122, + "balance_loss_clip": 1.34797573, + "balance_loss_mlp": 1.02346933, + "epoch": 0.24055313392454533, + "flos": 27462196200960.0, + "grad_norm": 2.2128145542419686, + "language_loss": 0.67895555, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.70505959, + "num_input_tokens_seen": 86144060, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.24645996, + "step": 4001, + "time_per_iteration": 2.8735036849975586 + }, + { + "auxiliary_loss_clip": 0.01541141, + "auxiliary_loss_mlp": 0.01047616, + "balance_loss_clip": 1.33126009, + "balance_loss_mlp": 1.02280807, + "epoch": 0.2406132571772133, + "flos": 25567555161600.0, + "grad_norm": 1.709900383598382, + "language_loss": 0.84027964, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.86616725, + "num_input_tokens_seen": 86163005, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.2479248, + "step": 4002, + "time_per_iteration": 4.30093789100647 + }, + { + "auxiliary_loss_clip": 0.01536093, + "auxiliary_loss_mlp": 0.01038845, + "balance_loss_clip": 1.32999289, + "balance_loss_mlp": 1.01502693, + "epoch": 0.24067338042988126, + "flos": 24802390782720.0, + "grad_norm": 1.8115652009772572, + "language_loss": 0.83823979, + "learning_rate": 3.552202383898897e-06, + "loss": 0.86398917, + "num_input_tokens_seen": 86182580, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.23815918, + "step": 4003, + "time_per_iteration": 2.8722972869873047 + }, + { + "auxiliary_loss_clip": 0.01566514, + "auxiliary_loss_mlp": 0.01050647, + "balance_loss_clip": 1.35509157, + "balance_loss_mlp": 1.02666235, + "epoch": 0.24073350368254923, + "flos": 21187618473600.0, + "grad_norm": 4.515549902131664, + "language_loss": 0.88857913, + "learning_rate": 3.551956756667215e-06, + "loss": 0.91475075, + "num_input_tokens_seen": 86200665, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.23999023, + "step": 4004, + "time_per_iteration": 4.215851068496704 + }, + { + "auxiliary_loss_clip": 0.01547329, + "auxiliary_loss_mlp": 0.01050915, + "balance_loss_clip": 1.33335495, + "balance_loss_mlp": 1.02662063, + "epoch": 0.2407936269352172, + "flos": 22505087854080.0, + "grad_norm": 2.208901344746885, + "language_loss": 0.78745961, + "learning_rate": 3.551711070585177e-06, + "loss": 0.81344205, + "num_input_tokens_seen": 86221640, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.24291992, + "step": 4005, + "time_per_iteration": 2.895538568496704 + }, + { + "auxiliary_loss_clip": 0.01525872, + "auxiliary_loss_mlp": 0.01047044, + "balance_loss_clip": 1.32198119, + "balance_loss_mlp": 1.0229044, + "epoch": 0.24085375018788516, + "flos": 18560190349440.0, + "grad_norm": 1.5902826744612062, + "language_loss": 0.79683352, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.82256269, + "num_input_tokens_seen": 86240795, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.24133301, + "step": 4006, + "time_per_iteration": 2.847377300262451 + }, + { + "auxiliary_loss_clip": 0.01564399, + "auxiliary_loss_mlp": 0.010528, + "balance_loss_clip": 1.34714985, + "balance_loss_mlp": 1.02697945, + "epoch": 0.24091387344055312, + "flos": 24181213916160.0, + "grad_norm": 1.6692640534790306, + "language_loss": 0.72501504, + "learning_rate": 3.551219521907302e-06, + "loss": 0.75118703, + "num_input_tokens_seen": 86262000, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.25842285, + "step": 4007, + "time_per_iteration": 2.857990264892578 + }, + { + "auxiliary_loss_clip": 0.01529198, + "auxiliary_loss_mlp": 0.01048103, + "balance_loss_clip": 1.32464862, + "balance_loss_mlp": 1.02421331, + "epoch": 0.24097399669322112, + "flos": 11043756581760.0, + "grad_norm": 1.6861190504102508, + "language_loss": 0.7697804, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.79555333, + "num_input_tokens_seen": 86279680, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.2388916, + "step": 4008, + "time_per_iteration": 2.8299713134765625 + }, + { + "auxiliary_loss_clip": 0.0153837, + "auxiliary_loss_mlp": 0.01044556, + "balance_loss_clip": 1.32974744, + "balance_loss_mlp": 1.02111912, + "epoch": 0.24103411994588908, + "flos": 17173894348800.0, + "grad_norm": 2.9686758625456924, + "language_loss": 0.76010048, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.78592968, + "num_input_tokens_seen": 86297180, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.234375, + "step": 4009, + "time_per_iteration": 2.930962324142456 + }, + { + "auxiliary_loss_clip": 0.01526328, + "auxiliary_loss_mlp": 0.01046668, + "balance_loss_clip": 1.32139695, + "balance_loss_mlp": 1.02347028, + "epoch": 0.24109424319855705, + "flos": 20677735111680.0, + "grad_norm": 13.245457932985847, + "language_loss": 0.81277978, + "learning_rate": 3.550481757745804e-06, + "loss": 0.83850968, + "num_input_tokens_seen": 86317660, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.23193359, + "step": 4010, + "time_per_iteration": 2.8428843021392822 + }, + { + "auxiliary_loss_clip": 0.01543345, + "auxiliary_loss_mlp": 0.01047697, + "balance_loss_clip": 1.33203673, + "balance_loss_mlp": 1.02244794, + "epoch": 0.241154366451225, + "flos": 28192947269760.0, + "grad_norm": 1.9971350706065911, + "language_loss": 0.71395445, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.73986483, + "num_input_tokens_seen": 86338325, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.25256348, + "step": 4011, + "time_per_iteration": 2.9492876529693604 + }, + { + "auxiliary_loss_clip": 0.01542071, + "auxiliary_loss_mlp": 0.01045698, + "balance_loss_clip": 1.3323257, + "balance_loss_mlp": 1.02121234, + "epoch": 0.24121448970389298, + "flos": 21699809320320.0, + "grad_norm": 1.6965998898968548, + "language_loss": 0.69654548, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.72242314, + "num_input_tokens_seen": 86357615, + "router_z_loss_clip": 2.09472656, + "router_z_loss_mlp": 0.24499512, + "step": 4012, + "time_per_iteration": 2.838156223297119 + }, + { + "auxiliary_loss_clip": 0.01534962, + "auxiliary_loss_mlp": 0.01042953, + "balance_loss_clip": 1.32604218, + "balance_loss_mlp": 1.01756096, + "epoch": 0.24127461295656094, + "flos": 39690682623360.0, + "grad_norm": 1.611192746141382, + "language_loss": 0.75061011, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.77638924, + "num_input_tokens_seen": 86380355, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.25390625, + "step": 4013, + "time_per_iteration": 3.0236399173736572 + }, + { + "auxiliary_loss_clip": 0.01553305, + "auxiliary_loss_mlp": 0.01051083, + "balance_loss_clip": 1.34252453, + "balance_loss_mlp": 1.02722967, + "epoch": 0.2413347362092289, + "flos": 19145325093120.0, + "grad_norm": 1.7014246774073183, + "language_loss": 0.89092404, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.91696793, + "num_input_tokens_seen": 86399125, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.23852539, + "step": 4014, + "time_per_iteration": 2.8191099166870117 + }, + { + "auxiliary_loss_clip": 0.01554705, + "auxiliary_loss_mlp": 0.0104499, + "balance_loss_clip": 1.33850956, + "balance_loss_mlp": 1.02086258, + "epoch": 0.2413948594618969, + "flos": 26949779130240.0, + "grad_norm": 2.351975150329219, + "language_loss": 0.95784211, + "learning_rate": 3.549250975045952e-06, + "loss": 0.98383904, + "num_input_tokens_seen": 86418625, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.24133301, + "step": 4015, + "time_per_iteration": 2.9241411685943604 + }, + { + "auxiliary_loss_clip": 0.01539523, + "auxiliary_loss_mlp": 0.01043596, + "balance_loss_clip": 1.32938147, + "balance_loss_mlp": 1.01939702, + "epoch": 0.24145498271456486, + "flos": 25238923044480.0, + "grad_norm": 1.5769877397366443, + "language_loss": 0.84483731, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.87066853, + "num_input_tokens_seen": 86438375, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.2421875, + "step": 4016, + "time_per_iteration": 2.9136672019958496 + }, + { + "auxiliary_loss_clip": 0.01521718, + "auxiliary_loss_mlp": 0.01047632, + "balance_loss_clip": 1.31805253, + "balance_loss_mlp": 1.02431512, + "epoch": 0.24151510596723283, + "flos": 40676624219520.0, + "grad_norm": 2.0821015072609788, + "language_loss": 0.69363886, + "learning_rate": 3.54875825066639e-06, + "loss": 0.71933234, + "num_input_tokens_seen": 86463230, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.2331543, + "step": 4017, + "time_per_iteration": 3.003466844558716 + }, + { + "auxiliary_loss_clip": 0.01557129, + "auxiliary_loss_mlp": 0.01055072, + "balance_loss_clip": 1.34181488, + "balance_loss_mlp": 1.03000271, + "epoch": 0.2415752292199008, + "flos": 18154813772160.0, + "grad_norm": 1.5877086002957073, + "language_loss": 0.85219491, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.87831688, + "num_input_tokens_seen": 86481230, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.25073242, + "step": 4018, + "time_per_iteration": 2.8351943492889404 + }, + { + "auxiliary_loss_clip": 0.01279373, + "auxiliary_loss_mlp": 0.01036015, + "balance_loss_clip": 1.15836358, + "balance_loss_mlp": 1.01760864, + "epoch": 0.24163535247256876, + "flos": 67317588455040.0, + "grad_norm": 0.8247855191503591, + "language_loss": 0.60777897, + "learning_rate": 3.548265291370558e-06, + "loss": 0.63093281, + "num_input_tokens_seen": 86541260, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.18359375, + "step": 4019, + "time_per_iteration": 3.410660982131958 + }, + { + "auxiliary_loss_clip": 0.01527388, + "auxiliary_loss_mlp": 0.01046649, + "balance_loss_clip": 1.31916332, + "balance_loss_mlp": 1.02391553, + "epoch": 0.24169547572523672, + "flos": 24939048637440.0, + "grad_norm": 1.987518149210506, + "language_loss": 0.74177909, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.76751947, + "num_input_tokens_seen": 86559580, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.22741699, + "step": 4020, + "time_per_iteration": 2.89510440826416 + }, + { + "auxiliary_loss_clip": 0.01551486, + "auxiliary_loss_mlp": 0.01049487, + "balance_loss_clip": 1.34226525, + "balance_loss_mlp": 1.02600336, + "epoch": 0.24175559897790472, + "flos": 18736057463040.0, + "grad_norm": 1.7970668602907283, + "language_loss": 0.82996666, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.8559764, + "num_input_tokens_seen": 86577560, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.23474121, + "step": 4021, + "time_per_iteration": 2.8438777923583984 + }, + { + "auxiliary_loss_clip": 0.01547918, + "auxiliary_loss_mlp": 0.01053425, + "balance_loss_clip": 1.3360424, + "balance_loss_mlp": 1.02687752, + "epoch": 0.24181572223057268, + "flos": 23049610750080.0, + "grad_norm": 2.0345470685334637, + "language_loss": 0.7733866, + "learning_rate": 3.547525412122378e-06, + "loss": 0.79939997, + "num_input_tokens_seen": 86595350, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.265625, + "step": 4022, + "time_per_iteration": 2.845194101333618 + }, + { + "auxiliary_loss_clip": 0.01558214, + "auxiliary_loss_mlp": 0.01048734, + "balance_loss_clip": 1.34009981, + "balance_loss_mlp": 1.02442753, + "epoch": 0.24187584548324065, + "flos": 20385869034240.0, + "grad_norm": 1.704665729797788, + "language_loss": 0.76065254, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.78672206, + "num_input_tokens_seen": 86614805, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.24304199, + "step": 4023, + "time_per_iteration": 2.870590925216675 + }, + { + "auxiliary_loss_clip": 0.01547401, + "auxiliary_loss_mlp": 0.01051221, + "balance_loss_clip": 1.33567381, + "balance_loss_mlp": 1.02635407, + "epoch": 0.2419359687359086, + "flos": 21407445550080.0, + "grad_norm": 2.0502549368112946, + "language_loss": 0.83288229, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.8588686, + "num_input_tokens_seen": 86633700, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.2487793, + "step": 4024, + "time_per_iteration": 2.845041513442993 + }, + { + "auxiliary_loss_clip": 0.01528912, + "auxiliary_loss_mlp": 0.01048381, + "balance_loss_clip": 1.32253933, + "balance_loss_mlp": 1.02439594, + "epoch": 0.24199609198857658, + "flos": 18378577146240.0, + "grad_norm": 1.9630240144777524, + "language_loss": 0.86579263, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.89156562, + "num_input_tokens_seen": 86650905, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.23999023, + "step": 4025, + "time_per_iteration": 2.8311245441436768 + }, + { + "auxiliary_loss_clip": 0.01555609, + "auxiliary_loss_mlp": 0.01057011, + "balance_loss_clip": 1.34124827, + "balance_loss_mlp": 1.03265643, + "epoch": 0.24205621524124454, + "flos": 19473323783040.0, + "grad_norm": 2.6797582123147996, + "language_loss": 0.72414231, + "learning_rate": 3.546538084949365e-06, + "loss": 0.75026846, + "num_input_tokens_seen": 86669185, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.2434082, + "step": 4026, + "time_per_iteration": 2.8220276832580566 + }, + { + "auxiliary_loss_clip": 0.01528933, + "auxiliary_loss_mlp": 0.01048661, + "balance_loss_clip": 1.32348263, + "balance_loss_mlp": 1.02566516, + "epoch": 0.2421163384939125, + "flos": 14984672544000.0, + "grad_norm": 2.0145058809473233, + "language_loss": 0.65943408, + "learning_rate": 3.546291106520509e-06, + "loss": 0.68520999, + "num_input_tokens_seen": 86686805, + "router_z_loss_clip": 2.05371094, + "router_z_loss_mlp": 0.22998047, + "step": 4027, + "time_per_iteration": 4.277646064758301 + }, + { + "auxiliary_loss_clip": 0.01545477, + "auxiliary_loss_mlp": 0.01054072, + "balance_loss_clip": 1.33059132, + "balance_loss_mlp": 1.0265348, + "epoch": 0.2421764617465805, + "flos": 18671529098880.0, + "grad_norm": 1.7947282324871627, + "language_loss": 0.7247051, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.75070059, + "num_input_tokens_seen": 86705520, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.27502441, + "step": 4028, + "time_per_iteration": 2.871005058288574 + }, + { + "auxiliary_loss_clip": 0.01275817, + "auxiliary_loss_mlp": 0.01021617, + "balance_loss_clip": 1.15912127, + "balance_loss_mlp": 1.00492752, + "epoch": 0.24223658499924847, + "flos": 64379445154560.0, + "grad_norm": 0.8560531053294638, + "language_loss": 0.55393541, + "learning_rate": 3.545796973765623e-06, + "loss": 0.57690978, + "num_input_tokens_seen": 86767320, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.16699219, + "step": 4029, + "time_per_iteration": 3.3756840229034424 + }, + { + "auxiliary_loss_clip": 0.01540592, + "auxiliary_loss_mlp": 0.01050898, + "balance_loss_clip": 1.33094108, + "balance_loss_mlp": 1.02566087, + "epoch": 0.24229670825191643, + "flos": 25786160628480.0, + "grad_norm": 2.113636184320302, + "language_loss": 0.74870384, + "learning_rate": 3.54554981945833e-06, + "loss": 0.77461869, + "num_input_tokens_seen": 86788110, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.25256348, + "step": 4030, + "time_per_iteration": 2.873448133468628 + }, + { + "auxiliary_loss_clip": 0.01540326, + "auxiliary_loss_mlp": 0.01047563, + "balance_loss_clip": 1.33140099, + "balance_loss_mlp": 1.02360177, + "epoch": 0.2423568315045844, + "flos": 20676649236480.0, + "grad_norm": 1.7125512941718748, + "language_loss": 0.77676946, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.80264837, + "num_input_tokens_seen": 86807640, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.23974609, + "step": 4031, + "time_per_iteration": 2.884960174560547 + }, + { + "auxiliary_loss_clip": 0.01556613, + "auxiliary_loss_mlp": 0.0105394, + "balance_loss_clip": 1.34080589, + "balance_loss_mlp": 1.02950168, + "epoch": 0.24241695475725236, + "flos": 22426397867520.0, + "grad_norm": 2.339947637372905, + "language_loss": 0.66330278, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.6894083, + "num_input_tokens_seen": 86826795, + "router_z_loss_clip": 2.15820312, + "router_z_loss_mlp": 0.24450684, + "step": 4032, + "time_per_iteration": 2.8432087898254395 + }, + { + "auxiliary_loss_clip": 0.01531974, + "auxiliary_loss_mlp": 0.01041704, + "balance_loss_clip": 1.32540989, + "balance_loss_mlp": 1.01869678, + "epoch": 0.24247707800992033, + "flos": 17137897470720.0, + "grad_norm": 1.93527291778241, + "language_loss": 0.82039106, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.84612787, + "num_input_tokens_seen": 86843175, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.23010254, + "step": 4033, + "time_per_iteration": 2.829012870788574 + }, + { + "auxiliary_loss_clip": 0.01532042, + "auxiliary_loss_mlp": 0.01042522, + "balance_loss_clip": 1.32627308, + "balance_loss_mlp": 1.01820314, + "epoch": 0.2425372012625883, + "flos": 31626377844480.0, + "grad_norm": 1.8499002502884996, + "language_loss": 0.70237982, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.72812539, + "num_input_tokens_seen": 86863185, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.2434082, + "step": 4034, + "time_per_iteration": 2.9322004318237305 + }, + { + "auxiliary_loss_clip": 0.01539856, + "auxiliary_loss_mlp": 0.01041679, + "balance_loss_clip": 1.3294003, + "balance_loss_mlp": 1.01778924, + "epoch": 0.24259732451525629, + "flos": 16334654952960.0, + "grad_norm": 2.234998207512587, + "language_loss": 0.96860647, + "learning_rate": 3.5443131689983283e-06, + "loss": 0.9944219, + "num_input_tokens_seen": 86880040, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.2388916, + "step": 4035, + "time_per_iteration": 4.255556344985962 + }, + { + "auxiliary_loss_clip": 0.01524644, + "auxiliary_loss_mlp": 0.01046692, + "balance_loss_clip": 1.32124877, + "balance_loss_mlp": 1.02220654, + "epoch": 0.24265744776792425, + "flos": 22866866426880.0, + "grad_norm": 1.6641152161470683, + "language_loss": 0.79322374, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.81893706, + "num_input_tokens_seen": 86900610, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.24487305, + "step": 4036, + "time_per_iteration": 2.9280831813812256 + }, + { + "auxiliary_loss_clip": 0.01535853, + "auxiliary_loss_mlp": 0.01040767, + "balance_loss_clip": 1.3275609, + "balance_loss_mlp": 1.01599514, + "epoch": 0.24271757102059222, + "flos": 21881558257920.0, + "grad_norm": 2.028943667364006, + "language_loss": 0.7511158, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.77688193, + "num_input_tokens_seen": 86919385, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.24768066, + "step": 4037, + "time_per_iteration": 4.310131072998047 + }, + { + "auxiliary_loss_clip": 0.01538517, + "auxiliary_loss_mlp": 0.01044958, + "balance_loss_clip": 1.32954121, + "balance_loss_mlp": 1.01924443, + "epoch": 0.24277769427326018, + "flos": 19217997521280.0, + "grad_norm": 2.1515862743842376, + "language_loss": 0.78209341, + "learning_rate": 3.543570475921171e-06, + "loss": 0.80792814, + "num_input_tokens_seen": 86938885, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.25756836, + "step": 4038, + "time_per_iteration": 2.8268887996673584 + }, + { + "auxiliary_loss_clip": 0.01533606, + "auxiliary_loss_mlp": 0.01050211, + "balance_loss_clip": 1.3254441, + "balance_loss_mlp": 1.02418745, + "epoch": 0.24283781752592815, + "flos": 19509230171520.0, + "grad_norm": 2.22871959886407, + "language_loss": 0.72558099, + "learning_rate": 3.543322794484905e-06, + "loss": 0.75141919, + "num_input_tokens_seen": 86957705, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.26037598, + "step": 4039, + "time_per_iteration": 4.3983776569366455 + }, + { + "auxiliary_loss_clip": 0.01547426, + "auxiliary_loss_mlp": 0.01043507, + "balance_loss_clip": 1.33706844, + "balance_loss_mlp": 1.01749527, + "epoch": 0.2428979407785961, + "flos": 19911892060800.0, + "grad_norm": 1.6588451499159933, + "language_loss": 0.79931104, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.82522035, + "num_input_tokens_seen": 86975845, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.26013184, + "step": 4040, + "time_per_iteration": 2.860842704772949 + }, + { + "auxiliary_loss_clip": 0.0153238, + "auxiliary_loss_mlp": 0.01040452, + "balance_loss_clip": 1.32837009, + "balance_loss_mlp": 1.01613379, + "epoch": 0.2429580640312641, + "flos": 24726189260160.0, + "grad_norm": 1.8497861512407998, + "language_loss": 0.80993712, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.83566546, + "num_input_tokens_seen": 86994800, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.2434082, + "step": 4041, + "time_per_iteration": 2.876401662826538 + }, + { + "auxiliary_loss_clip": 0.01533402, + "auxiliary_loss_mlp": 0.01044823, + "balance_loss_clip": 1.3273685, + "balance_loss_mlp": 1.020576, + "epoch": 0.24301818728393207, + "flos": 25641494444160.0, + "grad_norm": 9.02462556591432, + "language_loss": 0.77303213, + "learning_rate": 3.542579399075957e-06, + "loss": 0.79881442, + "num_input_tokens_seen": 87016845, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.24243164, + "step": 4042, + "time_per_iteration": 2.88173246383667 + }, + { + "auxiliary_loss_clip": 0.01531543, + "auxiliary_loss_mlp": 0.01040605, + "balance_loss_clip": 1.32709336, + "balance_loss_mlp": 1.01445007, + "epoch": 0.24307831053660003, + "flos": 26152735150080.0, + "grad_norm": 2.303921665234922, + "language_loss": 0.82395589, + "learning_rate": 3.542331483604246e-06, + "loss": 0.84967738, + "num_input_tokens_seen": 87036270, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.26184082, + "step": 4043, + "time_per_iteration": 2.9143826961517334 + }, + { + "auxiliary_loss_clip": 0.01557097, + "auxiliary_loss_mlp": 0.01046132, + "balance_loss_clip": 1.34262538, + "balance_loss_mlp": 1.01993024, + "epoch": 0.243138433789268, + "flos": 14979740860800.0, + "grad_norm": 3.7439719246110763, + "language_loss": 0.73978198, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.76581424, + "num_input_tokens_seen": 87049920, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.26245117, + "step": 4044, + "time_per_iteration": 2.810695171356201 + }, + { + "auxiliary_loss_clip": 0.01541471, + "auxiliary_loss_mlp": 0.01044989, + "balance_loss_clip": 1.33366275, + "balance_loss_mlp": 1.01984787, + "epoch": 0.24319855704193596, + "flos": 25202518963200.0, + "grad_norm": 3.3186675044968323, + "language_loss": 0.85431659, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.88018119, + "num_input_tokens_seen": 87068230, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.25170898, + "step": 4045, + "time_per_iteration": 2.916626453399658 + }, + { + "auxiliary_loss_clip": 0.01541437, + "auxiliary_loss_mlp": 0.01048006, + "balance_loss_clip": 1.33408451, + "balance_loss_mlp": 1.02280533, + "epoch": 0.24325868029460393, + "flos": 22137336967680.0, + "grad_norm": 1.6749230965042554, + "language_loss": 0.87596768, + "learning_rate": 3.541587386314541e-06, + "loss": 0.90186214, + "num_input_tokens_seen": 87086435, + "router_z_loss_clip": 2.07324219, + "router_z_loss_mlp": 0.2520752, + "step": 4046, + "time_per_iteration": 2.851457118988037 + }, + { + "auxiliary_loss_clip": 0.01524633, + "auxiliary_loss_mlp": 0.01041332, + "balance_loss_clip": 1.32160234, + "balance_loss_mlp": 1.01527309, + "epoch": 0.2433188035472719, + "flos": 23591418958080.0, + "grad_norm": 2.3070992050247288, + "language_loss": 0.73487997, + "learning_rate": 3.5413392369578e-06, + "loss": 0.76053959, + "num_input_tokens_seen": 87105340, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.26037598, + "step": 4047, + "time_per_iteration": 2.908132791519165 + }, + { + "auxiliary_loss_clip": 0.01533618, + "auxiliary_loss_mlp": 0.01042129, + "balance_loss_clip": 1.3254987, + "balance_loss_mlp": 1.01584291, + "epoch": 0.2433789267999399, + "flos": 24473668176000.0, + "grad_norm": 3.41546600792429, + "language_loss": 0.74041557, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.76617301, + "num_input_tokens_seen": 87125780, + "router_z_loss_clip": 2.07910156, + "router_z_loss_mlp": 0.26257324, + "step": 4048, + "time_per_iteration": 2.876168966293335 + }, + { + "auxiliary_loss_clip": 0.01533787, + "auxiliary_loss_mlp": 0.01043567, + "balance_loss_clip": 1.32754803, + "balance_loss_mlp": 1.0189265, + "epoch": 0.24343905005260785, + "flos": 16736773904640.0, + "grad_norm": 2.069100843552389, + "language_loss": 0.74153674, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.76731026, + "num_input_tokens_seen": 87144470, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.2467041, + "step": 4049, + "time_per_iteration": 2.877272844314575 + }, + { + "auxiliary_loss_clip": 0.01527621, + "auxiliary_loss_mlp": 0.01044162, + "balance_loss_clip": 1.32245398, + "balance_loss_mlp": 1.01915145, + "epoch": 0.24349917330527582, + "flos": 20052350478720.0, + "grad_norm": 1.6155221165559606, + "language_loss": 0.75157601, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.77729386, + "num_input_tokens_seen": 87162830, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.25024414, + "step": 4050, + "time_per_iteration": 2.8639228343963623 + }, + { + "auxiliary_loss_clip": 0.01521953, + "auxiliary_loss_mlp": 0.0104516, + "balance_loss_clip": 1.31957769, + "balance_loss_mlp": 1.01983964, + "epoch": 0.24355929655794378, + "flos": 17429446834560.0, + "grad_norm": 2.4282746700594355, + "language_loss": 0.75961912, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.78529024, + "num_input_tokens_seen": 87180905, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.25341797, + "step": 4051, + "time_per_iteration": 2.844693899154663 + }, + { + "auxiliary_loss_clip": 0.01537284, + "auxiliary_loss_mlp": 0.01042053, + "balance_loss_clip": 1.32925391, + "balance_loss_mlp": 1.01779389, + "epoch": 0.24361941981061175, + "flos": 25421893591680.0, + "grad_norm": 1.7619503585485203, + "language_loss": 0.71723431, + "learning_rate": 3.540097613646296e-06, + "loss": 0.74302769, + "num_input_tokens_seen": 87202290, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.24267578, + "step": 4052, + "time_per_iteration": 3.0478668212890625 + }, + { + "auxiliary_loss_clip": 0.01526787, + "auxiliary_loss_mlp": 0.01048049, + "balance_loss_clip": 1.32364154, + "balance_loss_mlp": 1.02244294, + "epoch": 0.2436795430632797, + "flos": 22831186262400.0, + "grad_norm": 1.5345535711094274, + "language_loss": 0.82018745, + "learning_rate": 3.539849113744351e-06, + "loss": 0.84593582, + "num_input_tokens_seen": 87221650, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.25622559, + "step": 4053, + "time_per_iteration": 2.8567092418670654 + }, + { + "auxiliary_loss_clip": 0.0152854, + "auxiliary_loss_mlp": 0.01041399, + "balance_loss_clip": 1.32173932, + "balance_loss_mlp": 1.01642454, + "epoch": 0.2437396663159477, + "flos": 15165923788800.0, + "grad_norm": 2.0303203919320927, + "language_loss": 0.78611827, + "learning_rate": 3.539600555451172e-06, + "loss": 0.81181777, + "num_input_tokens_seen": 87238515, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.24963379, + "step": 4054, + "time_per_iteration": 2.8815085887908936 + }, + { + "auxiliary_loss_clip": 0.01530953, + "auxiliary_loss_mlp": 0.01047494, + "balance_loss_clip": 1.32496703, + "balance_loss_mlp": 1.02249575, + "epoch": 0.24379978956861567, + "flos": 22101340089600.0, + "grad_norm": 1.6729472135361405, + "language_loss": 0.84812385, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.87390834, + "num_input_tokens_seen": 87256290, + "router_z_loss_clip": 2.05761719, + "router_z_loss_mlp": 0.25024414, + "step": 4055, + "time_per_iteration": 2.858440399169922 + }, + { + "auxiliary_loss_clip": 0.01555821, + "auxiliary_loss_mlp": 0.01043987, + "balance_loss_clip": 1.34277296, + "balance_loss_mlp": 1.01883364, + "epoch": 0.24385991282128364, + "flos": 31480354316160.0, + "grad_norm": 2.8092468307728025, + "language_loss": 0.56427968, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.59027779, + "num_input_tokens_seen": 87277085, + "router_z_loss_clip": 2.13183594, + "router_z_loss_mlp": 0.25109863, + "step": 4056, + "time_per_iteration": 2.8953638076782227 + }, + { + "auxiliary_loss_clip": 0.01553509, + "auxiliary_loss_mlp": 0.0104653, + "balance_loss_clip": 1.34256506, + "balance_loss_mlp": 1.02051878, + "epoch": 0.2439200360739516, + "flos": 23848509767040.0, + "grad_norm": 3.568979611954272, + "language_loss": 0.81092751, + "learning_rate": 3.538854530318506e-06, + "loss": 0.83692789, + "num_input_tokens_seen": 87293020, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.26025391, + "step": 4057, + "time_per_iteration": 2.878134250640869 + }, + { + "auxiliary_loss_clip": 0.0154353, + "auxiliary_loss_mlp": 0.0104337, + "balance_loss_clip": 1.33804238, + "balance_loss_mlp": 1.01855087, + "epoch": 0.24398015932661957, + "flos": 19178833507200.0, + "grad_norm": 1.6804874197186, + "language_loss": 0.80812585, + "learning_rate": 3.538605738554673e-06, + "loss": 0.83399481, + "num_input_tokens_seen": 87311445, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.24829102, + "step": 4058, + "time_per_iteration": 2.8491127490997314 + }, + { + "auxiliary_loss_clip": 0.0154463, + "auxiliary_loss_mlp": 0.01042265, + "balance_loss_clip": 1.3356992, + "balance_loss_mlp": 1.01705194, + "epoch": 0.24404028257928753, + "flos": 25272748172160.0, + "grad_norm": 1.6027979269298358, + "language_loss": 0.86212075, + "learning_rate": 3.538356888446756e-06, + "loss": 0.8879897, + "num_input_tokens_seen": 87332055, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.2520752, + "step": 4059, + "time_per_iteration": 2.909658432006836 + }, + { + "auxiliary_loss_clip": 0.01539266, + "auxiliary_loss_mlp": 0.01042384, + "balance_loss_clip": 1.33642495, + "balance_loss_mlp": 1.01882827, + "epoch": 0.2441004058319555, + "flos": 26478471600000.0, + "grad_norm": 1.5790547177953014, + "language_loss": 0.74764204, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.77345848, + "num_input_tokens_seen": 87351295, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.23547363, + "step": 4060, + "time_per_iteration": 2.8980133533477783 + }, + { + "auxiliary_loss_clip": 0.01553974, + "auxiliary_loss_mlp": 0.01046642, + "balance_loss_clip": 1.344154, + "balance_loss_mlp": 1.02070248, + "epoch": 0.2441605290846235, + "flos": 26771740266240.0, + "grad_norm": 1.617433868957443, + "language_loss": 0.74813801, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.77414417, + "num_input_tokens_seen": 87370650, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.25976562, + "step": 4061, + "time_per_iteration": 2.9292373657226562 + }, + { + "auxiliary_loss_clip": 0.01529249, + "auxiliary_loss_mlp": 0.01053038, + "balance_loss_clip": 1.32762694, + "balance_loss_mlp": 1.02858853, + "epoch": 0.24422065233729146, + "flos": 21115760451840.0, + "grad_norm": 1.7754115236026111, + "language_loss": 0.77037323, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.7961961, + "num_input_tokens_seen": 87389020, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.24450684, + "step": 4062, + "time_per_iteration": 4.366536855697632 + }, + { + "auxiliary_loss_clip": 0.01536323, + "auxiliary_loss_mlp": 0.01043596, + "balance_loss_clip": 1.33613503, + "balance_loss_mlp": 1.02064848, + "epoch": 0.24428077558995942, + "flos": 25273607823360.0, + "grad_norm": 1.4955271960761864, + "language_loss": 0.85614043, + "learning_rate": 3.537360904763011e-06, + "loss": 0.88193959, + "num_input_tokens_seen": 87409695, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.22937012, + "step": 4063, + "time_per_iteration": 2.8736603260040283 + }, + { + "auxiliary_loss_clip": 0.01538275, + "auxiliary_loss_mlp": 0.01038196, + "balance_loss_clip": 1.32901478, + "balance_loss_mlp": 1.01417542, + "epoch": 0.24434089884262739, + "flos": 20494945543680.0, + "grad_norm": 2.0498084043999216, + "language_loss": 0.69325483, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.71901953, + "num_input_tokens_seen": 87428250, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.24035645, + "step": 4064, + "time_per_iteration": 2.8824684619903564 + }, + { + "auxiliary_loss_clip": 0.01557877, + "auxiliary_loss_mlp": 0.010422, + "balance_loss_clip": 1.34638429, + "balance_loss_mlp": 1.01625979, + "epoch": 0.24440102209529535, + "flos": 23632076050560.0, + "grad_norm": 1.4709937078484387, + "language_loss": 0.70602274, + "learning_rate": 3.536862563102088e-06, + "loss": 0.73202342, + "num_input_tokens_seen": 87449380, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.25952148, + "step": 4065, + "time_per_iteration": 2.9009432792663574 + }, + { + "auxiliary_loss_clip": 0.0154939, + "auxiliary_loss_mlp": 0.01043664, + "balance_loss_clip": 1.33864784, + "balance_loss_mlp": 1.0178194, + "epoch": 0.24446114534796332, + "flos": 20563500695040.0, + "grad_norm": 1.7488322837146757, + "language_loss": 0.85410953, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.88004011, + "num_input_tokens_seen": 87465365, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.25854492, + "step": 4066, + "time_per_iteration": 2.8591020107269287 + }, + { + "auxiliary_loss_clip": 0.0130175, + "auxiliary_loss_mlp": 0.01039092, + "balance_loss_clip": 1.18631864, + "balance_loss_mlp": 1.01429641, + "epoch": 0.24452126860063128, + "flos": 60416178261120.0, + "grad_norm": 0.7570450696352516, + "language_loss": 0.5234533, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.54686171, + "num_input_tokens_seen": 87522525, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.24707031, + "step": 4067, + "time_per_iteration": 3.323737144470215 + }, + { + "auxiliary_loss_clip": 0.01543515, + "auxiliary_loss_mlp": 0.010452, + "balance_loss_clip": 1.3353765, + "balance_loss_mlp": 1.02001131, + "epoch": 0.24458139185329927, + "flos": 15129881665920.0, + "grad_norm": 2.534726198565204, + "language_loss": 0.73941612, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.76530325, + "num_input_tokens_seen": 87539170, + "router_z_loss_clip": 2.078125, + "router_z_loss_mlp": 0.25170898, + "step": 4068, + "time_per_iteration": 2.8170695304870605 + }, + { + "auxiliary_loss_clip": 0.01530996, + "auxiliary_loss_mlp": 0.01047479, + "balance_loss_clip": 1.32688034, + "balance_loss_mlp": 1.0221467, + "epoch": 0.24464151510596724, + "flos": 28009660008960.0, + "grad_norm": 1.5152240841300242, + "language_loss": 0.78581524, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.81159997, + "num_input_tokens_seen": 87558875, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.25341797, + "step": 4069, + "time_per_iteration": 4.318330526351929 + }, + { + "auxiliary_loss_clip": 0.01530332, + "auxiliary_loss_mlp": 0.01051838, + "balance_loss_clip": 1.32828248, + "balance_loss_mlp": 1.02681637, + "epoch": 0.2447016383586352, + "flos": 19802589327360.0, + "grad_norm": 1.7111457335156701, + "language_loss": 0.81132317, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.83714485, + "num_input_tokens_seen": 87576485, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.25036621, + "step": 4070, + "time_per_iteration": 2.8873612880706787 + }, + { + "auxiliary_loss_clip": 0.01541261, + "auxiliary_loss_mlp": 0.01045257, + "balance_loss_clip": 1.3348068, + "balance_loss_mlp": 1.02038991, + "epoch": 0.24476176161130317, + "flos": 26078388664320.0, + "grad_norm": 4.535724780700914, + "language_loss": 0.84827626, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.87414145, + "num_input_tokens_seen": 87598620, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.24841309, + "step": 4071, + "time_per_iteration": 2.9025285243988037 + }, + { + "auxiliary_loss_clip": 0.01564541, + "auxiliary_loss_mlp": 0.0105842, + "balance_loss_clip": 1.35068429, + "balance_loss_mlp": 1.03259969, + "epoch": 0.24482188486397113, + "flos": 18852825588480.0, + "grad_norm": 1.8858993192776379, + "language_loss": 0.81069362, + "learning_rate": 3.535116532028798e-06, + "loss": 0.83692324, + "num_input_tokens_seen": 87616595, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.25830078, + "step": 4072, + "time_per_iteration": 4.234854221343994 + }, + { + "auxiliary_loss_clip": 0.0153423, + "auxiliary_loss_mlp": 0.01049778, + "balance_loss_clip": 1.33388424, + "balance_loss_mlp": 1.02621043, + "epoch": 0.2448820081166391, + "flos": 21261738735360.0, + "grad_norm": 1.436359096763441, + "language_loss": 0.7108981, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.7367382, + "num_input_tokens_seen": 87635755, + "router_z_loss_clip": 2.00390625, + "router_z_loss_mlp": 0.23571777, + "step": 4073, + "time_per_iteration": 2.8814799785614014 + }, + { + "auxiliary_loss_clip": 0.01540846, + "auxiliary_loss_mlp": 0.01054039, + "balance_loss_clip": 1.33829176, + "balance_loss_mlp": 1.0301851, + "epoch": 0.2449421313693071, + "flos": 23960934391680.0, + "grad_norm": 2.379739349614642, + "language_loss": 0.68706942, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.71301824, + "num_input_tokens_seen": 87652885, + "router_z_loss_clip": 2.02148438, + "router_z_loss_mlp": 0.23852539, + "step": 4074, + "time_per_iteration": 4.3329758644104 + }, + { + "auxiliary_loss_clip": 0.01306369, + "auxiliary_loss_mlp": 0.01031518, + "balance_loss_clip": 1.1916734, + "balance_loss_mlp": 1.00462472, + "epoch": 0.24500225462197506, + "flos": 60715962178560.0, + "grad_norm": 1.0349972936049683, + "language_loss": 0.68711865, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.7104975, + "num_input_tokens_seen": 87713220, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.26953125, + "step": 4075, + "time_per_iteration": 3.4778990745544434 + }, + { + "auxiliary_loss_clip": 0.01531343, + "auxiliary_loss_mlp": 0.01051537, + "balance_loss_clip": 1.32964897, + "balance_loss_mlp": 1.02589512, + "epoch": 0.24506237787464302, + "flos": 26294777136000.0, + "grad_norm": 1.794202287332855, + "language_loss": 0.80614114, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.83196998, + "num_input_tokens_seen": 87732680, + "router_z_loss_clip": 2.01855469, + "router_z_loss_mlp": 0.25671387, + "step": 4076, + "time_per_iteration": 2.9264001846313477 + }, + { + "auxiliary_loss_clip": 0.01549646, + "auxiliary_loss_mlp": 0.01052303, + "balance_loss_clip": 1.33847535, + "balance_loss_mlp": 1.02655423, + "epoch": 0.245122501127311, + "flos": 20560424048640.0, + "grad_norm": 2.3873004631587875, + "language_loss": 0.8373028, + "learning_rate": 3.533867620434151e-06, + "loss": 0.86332226, + "num_input_tokens_seen": 87751880, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.25744629, + "step": 4077, + "time_per_iteration": 2.841952085494995 + }, + { + "auxiliary_loss_clip": 0.01547825, + "auxiliary_loss_mlp": 0.01052801, + "balance_loss_clip": 1.33900547, + "balance_loss_mlp": 1.02707577, + "epoch": 0.24518262437997895, + "flos": 29144566045440.0, + "grad_norm": 1.814100034227778, + "language_loss": 0.63204509, + "learning_rate": 3.533617663584082e-06, + "loss": 0.65805137, + "num_input_tokens_seen": 87771795, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.25695801, + "step": 4078, + "time_per_iteration": 2.9186201095581055 + }, + { + "auxiliary_loss_clip": 0.01540514, + "auxiliary_loss_mlp": 0.01053533, + "balance_loss_clip": 1.33660793, + "balance_loss_mlp": 1.0297749, + "epoch": 0.24524274763264692, + "flos": 23487319376640.0, + "grad_norm": 1.5111688765312712, + "language_loss": 0.76275754, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.78869802, + "num_input_tokens_seen": 87793640, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.23742676, + "step": 4079, + "time_per_iteration": 2.8980133533477783 + }, + { + "auxiliary_loss_clip": 0.01538826, + "auxiliary_loss_mlp": 0.01045383, + "balance_loss_clip": 1.33406591, + "balance_loss_mlp": 1.02011061, + "epoch": 0.24530287088531488, + "flos": 17209891226880.0, + "grad_norm": 2.5840070852938735, + "language_loss": 0.75403917, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.77988124, + "num_input_tokens_seen": 87812390, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.25268555, + "step": 4080, + "time_per_iteration": 2.8104021549224854 + }, + { + "auxiliary_loss_clip": 0.01540562, + "auxiliary_loss_mlp": 0.01045216, + "balance_loss_clip": 1.33856595, + "balance_loss_mlp": 1.02156484, + "epoch": 0.24536299413798288, + "flos": 14875912748160.0, + "grad_norm": 1.7918077701428412, + "language_loss": 0.84733588, + "learning_rate": 3.532867444142186e-06, + "loss": 0.87319362, + "num_input_tokens_seen": 87830640, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.23657227, + "step": 4081, + "time_per_iteration": 2.8599462509155273 + }, + { + "auxiliary_loss_clip": 0.01535605, + "auxiliary_loss_mlp": 0.01042514, + "balance_loss_clip": 1.33312023, + "balance_loss_mlp": 1.01863599, + "epoch": 0.24542311739065084, + "flos": 35275563463680.0, + "grad_norm": 2.0886886451046296, + "language_loss": 0.74776137, + "learning_rate": 3.532617254729267e-06, + "loss": 0.77354252, + "num_input_tokens_seen": 87850450, + "router_z_loss_clip": 2.02636719, + "router_z_loss_mlp": 0.23876953, + "step": 4082, + "time_per_iteration": 3.0146734714508057 + }, + { + "auxiliary_loss_clip": 0.01543753, + "auxiliary_loss_mlp": 0.01042305, + "balance_loss_clip": 1.34329128, + "balance_loss_mlp": 1.01942873, + "epoch": 0.2454832406433188, + "flos": 21512088069120.0, + "grad_norm": 1.7551364899048196, + "language_loss": 0.72628725, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.75214779, + "num_input_tokens_seen": 87868810, + "router_z_loss_clip": 2.00390625, + "router_z_loss_mlp": 0.2286377, + "step": 4083, + "time_per_iteration": 2.8884942531585693 + }, + { + "auxiliary_loss_clip": 0.01556115, + "auxiliary_loss_mlp": 0.01045287, + "balance_loss_clip": 1.3479985, + "balance_loss_mlp": 1.01912045, + "epoch": 0.24554336389598677, + "flos": 14764754977920.0, + "grad_norm": 2.0945899474975396, + "language_loss": 0.758448, + "learning_rate": 3.532116701561919e-06, + "loss": 0.78446198, + "num_input_tokens_seen": 87885685, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.26159668, + "step": 4084, + "time_per_iteration": 2.827548027038574 + }, + { + "auxiliary_loss_clip": 0.01542778, + "auxiliary_loss_mlp": 0.01040296, + "balance_loss_clip": 1.34062386, + "balance_loss_mlp": 1.01687145, + "epoch": 0.24560348714865474, + "flos": 14984763033600.0, + "grad_norm": 1.7272352791747034, + "language_loss": 0.86402535, + "learning_rate": 3.531866337826471e-06, + "loss": 0.8898561, + "num_input_tokens_seen": 87903715, + "router_z_loss_clip": 2.02148438, + "router_z_loss_mlp": 0.23425293, + "step": 4085, + "time_per_iteration": 2.810129404067993 + }, + { + "auxiliary_loss_clip": 0.01554419, + "auxiliary_loss_mlp": 0.01051528, + "balance_loss_clip": 1.34897351, + "balance_loss_mlp": 1.02675581, + "epoch": 0.2456636104013227, + "flos": 22685796161280.0, + "grad_norm": 1.8950149038432358, + "language_loss": 0.80176699, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.82782644, + "num_input_tokens_seen": 87923375, + "router_z_loss_clip": 2.05371094, + "router_z_loss_mlp": 0.24768066, + "step": 4086, + "time_per_iteration": 2.954594373703003 + }, + { + "auxiliary_loss_clip": 0.01538319, + "auxiliary_loss_mlp": 0.01037584, + "balance_loss_clip": 1.33620095, + "balance_loss_mlp": 1.0148623, + "epoch": 0.2457237336539907, + "flos": 27429411703680.0, + "grad_norm": 1.5850204229208797, + "language_loss": 0.75713003, + "learning_rate": 3.531365436099496e-06, + "loss": 0.78288907, + "num_input_tokens_seen": 87943115, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.22717285, + "step": 4087, + "time_per_iteration": 2.9384562969207764 + }, + { + "auxiliary_loss_clip": 0.01561505, + "auxiliary_loss_mlp": 0.01047593, + "balance_loss_clip": 1.35675228, + "balance_loss_mlp": 1.02255881, + "epoch": 0.24578385690665866, + "flos": 20422046891520.0, + "grad_norm": 2.366702696993089, + "language_loss": 0.80769265, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.83378363, + "num_input_tokens_seen": 87959505, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.25024414, + "step": 4088, + "time_per_iteration": 2.8505074977874756 + }, + { + "auxiliary_loss_clip": 0.01530171, + "auxiliary_loss_mlp": 0.01040265, + "balance_loss_clip": 1.33140492, + "balance_loss_mlp": 1.01684082, + "epoch": 0.24584398015932662, + "flos": 23926068633600.0, + "grad_norm": 1.4072810400846387, + "language_loss": 0.77616215, + "learning_rate": 3.5308643020944e-06, + "loss": 0.80186653, + "num_input_tokens_seen": 87979725, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.234375, + "step": 4089, + "time_per_iteration": 2.901097536087036 + }, + { + "auxiliary_loss_clip": 0.01541691, + "auxiliary_loss_mlp": 0.0104356, + "balance_loss_clip": 1.33663487, + "balance_loss_mlp": 1.0205642, + "epoch": 0.2459041034119946, + "flos": 41510253260160.0, + "grad_norm": 1.813346053141106, + "language_loss": 0.82103157, + "learning_rate": 3.530613648011309e-06, + "loss": 0.84688413, + "num_input_tokens_seen": 87998270, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.2298584, + "step": 4090, + "time_per_iteration": 3.008744239807129 + }, + { + "auxiliary_loss_clip": 0.01548201, + "auxiliary_loss_mlp": 0.01040514, + "balance_loss_clip": 1.34338784, + "balance_loss_mlp": 1.01534855, + "epoch": 0.24596422666466256, + "flos": 19946033902080.0, + "grad_norm": 1.7057450910068406, + "language_loss": 0.74652076, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.77240789, + "num_input_tokens_seen": 88016760, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.25183105, + "step": 4091, + "time_per_iteration": 2.863926887512207 + }, + { + "auxiliary_loss_clip": 0.01548547, + "auxiliary_loss_mlp": 0.01042967, + "balance_loss_clip": 1.34617341, + "balance_loss_mlp": 1.01911378, + "epoch": 0.24602434991733052, + "flos": 21554600198400.0, + "grad_norm": 2.047574358846292, + "language_loss": 0.7796098, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.80552495, + "num_input_tokens_seen": 88036465, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.23852539, + "step": 4092, + "time_per_iteration": 2.872969388961792 + }, + { + "auxiliary_loss_clip": 0.01565151, + "auxiliary_loss_mlp": 0.01051109, + "balance_loss_clip": 1.35434425, + "balance_loss_mlp": 1.02639651, + "epoch": 0.24608447316999849, + "flos": 23195408054400.0, + "grad_norm": 2.5746071629125518, + "language_loss": 0.82919145, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.85535407, + "num_input_tokens_seen": 88053270, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.24755859, + "step": 4093, + "time_per_iteration": 2.8568148612976074 + }, + { + "auxiliary_loss_clip": 0.01568061, + "auxiliary_loss_mlp": 0.01044095, + "balance_loss_clip": 1.35908258, + "balance_loss_mlp": 1.01850116, + "epoch": 0.24614459642266648, + "flos": 19650322016640.0, + "grad_norm": 1.7985818326356795, + "language_loss": 0.88609481, + "learning_rate": 3.529610451363797e-06, + "loss": 0.91221637, + "num_input_tokens_seen": 88072305, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.25622559, + "step": 4094, + "time_per_iteration": 2.906229019165039 + }, + { + "auxiliary_loss_clip": 0.01293463, + "auxiliary_loss_mlp": 0.01023829, + "balance_loss_clip": 1.18143249, + "balance_loss_mlp": 1.0058049, + "epoch": 0.24620471967533444, + "flos": 61766839342080.0, + "grad_norm": 0.8192335639071074, + "language_loss": 0.57632244, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.59949529, + "num_input_tokens_seen": 88137995, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.18066406, + "step": 4095, + "time_per_iteration": 3.483731508255005 + }, + { + "auxiliary_loss_clip": 0.01297974, + "auxiliary_loss_mlp": 0.01020958, + "balance_loss_clip": 1.18779445, + "balance_loss_mlp": 1.00159883, + "epoch": 0.2462648429280024, + "flos": 69185462555520.0, + "grad_norm": 0.6514127323740173, + "language_loss": 0.5627017, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.58589101, + "num_input_tokens_seen": 88208490, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.19335938, + "step": 4096, + "time_per_iteration": 3.4202136993408203 + }, + { + "auxiliary_loss_clip": 0.01560568, + "auxiliary_loss_mlp": 0.01046935, + "balance_loss_clip": 1.35347104, + "balance_loss_mlp": 1.02297413, + "epoch": 0.24632496618067037, + "flos": 29471297880960.0, + "grad_norm": 1.9120760817027531, + "language_loss": 0.78284454, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.80891961, + "num_input_tokens_seen": 88228050, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.23974609, + "step": 4097, + "time_per_iteration": 4.400754928588867 + }, + { + "auxiliary_loss_clip": 0.01573687, + "auxiliary_loss_mlp": 0.01056039, + "balance_loss_clip": 1.35971725, + "balance_loss_mlp": 1.02907336, + "epoch": 0.24638508943333834, + "flos": 24327146954880.0, + "grad_norm": 1.9044455490195293, + "language_loss": 0.77431715, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.80061436, + "num_input_tokens_seen": 88248090, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.27001953, + "step": 4098, + "time_per_iteration": 2.902709484100342 + }, + { + "auxiliary_loss_clip": 0.0155832, + "auxiliary_loss_mlp": 0.01051489, + "balance_loss_clip": 1.35237598, + "balance_loss_mlp": 1.02836227, + "epoch": 0.2464452126860063, + "flos": 26624087925120.0, + "grad_norm": 2.8683576200770746, + "language_loss": 0.69863862, + "learning_rate": 3.528355150558764e-06, + "loss": 0.72473669, + "num_input_tokens_seen": 88267545, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.23120117, + "step": 4099, + "time_per_iteration": 2.941516876220703 + }, + { + "auxiliary_loss_clip": 0.01549948, + "auxiliary_loss_mlp": 0.01051489, + "balance_loss_clip": 1.34902382, + "balance_loss_mlp": 1.02802813, + "epoch": 0.24650533593867427, + "flos": 31224937564800.0, + "grad_norm": 2.7745483876845882, + "language_loss": 0.66782731, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.6938417, + "num_input_tokens_seen": 88289785, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.23474121, + "step": 4100, + "time_per_iteration": 2.9128377437591553 + }, + { + "auxiliary_loss_clip": 0.01294438, + "auxiliary_loss_mlp": 0.01015944, + "balance_loss_clip": 1.18318462, + "balance_loss_mlp": 0.99849206, + "epoch": 0.24656545919134226, + "flos": 68523900065280.0, + "grad_norm": 0.7042903294625029, + "language_loss": 0.61568069, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.63878453, + "num_input_tokens_seen": 88357320, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.17480469, + "step": 4101, + "time_per_iteration": 3.4796142578125 + }, + { + "auxiliary_loss_clip": 0.01548991, + "auxiliary_loss_mlp": 0.01049898, + "balance_loss_clip": 1.34568357, + "balance_loss_mlp": 1.02526927, + "epoch": 0.24662558244401023, + "flos": 20093821977600.0, + "grad_norm": 1.5384883944137095, + "language_loss": 0.74234504, + "learning_rate": 3.527601274535012e-06, + "loss": 0.76833391, + "num_input_tokens_seen": 88377040, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.24609375, + "step": 4102, + "time_per_iteration": 2.8833229541778564 + }, + { + "auxiliary_loss_clip": 0.01560013, + "auxiliary_loss_mlp": 0.01048634, + "balance_loss_clip": 1.35325813, + "balance_loss_mlp": 1.0246371, + "epoch": 0.2466857056966782, + "flos": 30713696858880.0, + "grad_norm": 1.9763270026652755, + "language_loss": 0.75891072, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.78499722, + "num_input_tokens_seen": 88395085, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.2401123, + "step": 4103, + "time_per_iteration": 2.9058477878570557 + }, + { + "auxiliary_loss_clip": 0.01569296, + "auxiliary_loss_mlp": 0.01050061, + "balance_loss_clip": 1.35890436, + "balance_loss_mlp": 1.02631474, + "epoch": 0.24674582894934616, + "flos": 22538732002560.0, + "grad_norm": 1.8657194927541842, + "language_loss": 0.79586881, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.82206237, + "num_input_tokens_seen": 88413205, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.23754883, + "step": 4104, + "time_per_iteration": 4.2830493450164795 + }, + { + "auxiliary_loss_clip": 0.015482, + "auxiliary_loss_mlp": 0.0104942, + "balance_loss_clip": 1.34369218, + "balance_loss_mlp": 1.02394545, + "epoch": 0.24680595220201412, + "flos": 20714048703360.0, + "grad_norm": 2.048217336683823, + "language_loss": 0.84657305, + "learning_rate": 3.526846877170133e-06, + "loss": 0.8725493, + "num_input_tokens_seen": 88431525, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.2545166, + "step": 4105, + "time_per_iteration": 2.9072978496551514 + }, + { + "auxiliary_loss_clip": 0.01549938, + "auxiliary_loss_mlp": 0.01045989, + "balance_loss_clip": 1.34490216, + "balance_loss_mlp": 1.02280295, + "epoch": 0.2468660754546821, + "flos": 21840132003840.0, + "grad_norm": 1.7919197440714565, + "language_loss": 0.76977468, + "learning_rate": 3.52659529557275e-06, + "loss": 0.79573393, + "num_input_tokens_seen": 88451210, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.23193359, + "step": 4106, + "time_per_iteration": 2.865534782409668 + }, + { + "auxiliary_loss_clip": 0.01558593, + "auxiliary_loss_mlp": 0.0104368, + "balance_loss_clip": 1.35232854, + "balance_loss_mlp": 1.01964724, + "epoch": 0.24692619870735008, + "flos": 15275995683840.0, + "grad_norm": 2.2030554627586034, + "language_loss": 0.73829514, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.76431787, + "num_input_tokens_seen": 88467790, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.24035645, + "step": 4107, + "time_per_iteration": 4.289412260055542 + }, + { + "auxiliary_loss_clip": 0.01572962, + "auxiliary_loss_mlp": 0.01041706, + "balance_loss_clip": 1.36491132, + "balance_loss_mlp": 1.01862741, + "epoch": 0.24698632196001805, + "flos": 29692256077440.0, + "grad_norm": 4.194945048330935, + "language_loss": 0.66653407, + "learning_rate": 3.526091958721587e-06, + "loss": 0.69268084, + "num_input_tokens_seen": 88490330, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.23083496, + "step": 4108, + "time_per_iteration": 2.9575953483581543 + }, + { + "auxiliary_loss_clip": 0.01569527, + "auxiliary_loss_mlp": 0.01049614, + "balance_loss_clip": 1.36006796, + "balance_loss_mlp": 1.02320862, + "epoch": 0.247046445212686, + "flos": 39179849120640.0, + "grad_norm": 1.7209873532949953, + "language_loss": 0.73621082, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.76240224, + "num_input_tokens_seen": 88512435, + "router_z_loss_clip": 2.09472656, + "router_z_loss_mlp": 0.26416016, + "step": 4109, + "time_per_iteration": 4.443131685256958 + }, + { + "auxiliary_loss_clip": 0.01565444, + "auxiliary_loss_mlp": 0.0104645, + "balance_loss_clip": 1.35530627, + "balance_loss_mlp": 1.02021253, + "epoch": 0.24710656846535398, + "flos": 23008139251200.0, + "grad_norm": 5.244666295388618, + "language_loss": 0.79407692, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.82019579, + "num_input_tokens_seen": 88529780, + "router_z_loss_clip": 2.10058594, + "router_z_loss_mlp": 0.2623291, + "step": 4110, + "time_per_iteration": 2.8426079750061035 + }, + { + "auxiliary_loss_clip": 0.01577607, + "auxiliary_loss_mlp": 0.01046237, + "balance_loss_clip": 1.36764288, + "balance_loss_mlp": 1.02146494, + "epoch": 0.24716669171802194, + "flos": 26444103534720.0, + "grad_norm": 2.151932752146124, + "language_loss": 0.81960827, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.84584671, + "num_input_tokens_seen": 88547200, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.2479248, + "step": 4111, + "time_per_iteration": 2.899045944213867 + }, + { + "auxiliary_loss_clip": 0.01564583, + "auxiliary_loss_mlp": 0.01042086, + "balance_loss_clip": 1.35476315, + "balance_loss_mlp": 1.01882792, + "epoch": 0.2472268149706899, + "flos": 23339938504320.0, + "grad_norm": 1.8933064218841427, + "language_loss": 0.75820613, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.78427279, + "num_input_tokens_seen": 88566415, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.23254395, + "step": 4112, + "time_per_iteration": 2.87064790725708 + }, + { + "auxiliary_loss_clip": 0.01560512, + "auxiliary_loss_mlp": 0.01037931, + "balance_loss_clip": 1.35267353, + "balance_loss_mlp": 1.01368368, + "epoch": 0.24728693822335787, + "flos": 23779185454080.0, + "grad_norm": 2.6461200641347324, + "language_loss": 0.83264863, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.85863304, + "num_input_tokens_seen": 88585225, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.24267578, + "step": 4113, + "time_per_iteration": 2.8673107624053955 + }, + { + "auxiliary_loss_clip": 0.01551523, + "auxiliary_loss_mlp": 0.01040032, + "balance_loss_clip": 1.34475076, + "balance_loss_mlp": 1.01511717, + "epoch": 0.24734706147602586, + "flos": 19326802561920.0, + "grad_norm": 2.20006467496588, + "language_loss": 0.8846696, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.91058517, + "num_input_tokens_seen": 88603280, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.24926758, + "step": 4114, + "time_per_iteration": 2.8257949352264404 + }, + { + "auxiliary_loss_clip": 0.01549332, + "auxiliary_loss_mlp": 0.01037529, + "balance_loss_clip": 1.34278095, + "balance_loss_mlp": 1.013592, + "epoch": 0.24740718472869383, + "flos": 28048326330240.0, + "grad_norm": 2.124041177375743, + "language_loss": 0.76166427, + "learning_rate": 3.524328457352734e-06, + "loss": 0.78753293, + "num_input_tokens_seen": 88624925, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.23913574, + "step": 4115, + "time_per_iteration": 2.922628402709961 + }, + { + "auxiliary_loss_clip": 0.01285802, + "auxiliary_loss_mlp": 0.0102445, + "balance_loss_clip": 1.17146206, + "balance_loss_mlp": 1.00680697, + "epoch": 0.2474673079813618, + "flos": 68141236377600.0, + "grad_norm": 0.6944937417467176, + "language_loss": 0.5841136, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.60721612, + "num_input_tokens_seen": 88691475, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.17675781, + "step": 4116, + "time_per_iteration": 3.4839670658111572 + }, + { + "auxiliary_loss_clip": 0.01556195, + "auxiliary_loss_mlp": 0.01045268, + "balance_loss_clip": 1.34922743, + "balance_loss_mlp": 1.02078259, + "epoch": 0.24752743123402976, + "flos": 29474465016960.0, + "grad_norm": 1.6386755091344234, + "language_loss": 0.84182668, + "learning_rate": 3.523824079451235e-06, + "loss": 0.8678413, + "num_input_tokens_seen": 88713425, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.24499512, + "step": 4117, + "time_per_iteration": 2.917466402053833 + }, + { + "auxiliary_loss_clip": 0.01286704, + "auxiliary_loss_mlp": 0.01024362, + "balance_loss_clip": 1.17173672, + "balance_loss_mlp": 1.00738704, + "epoch": 0.24758755448669773, + "flos": 58377594954240.0, + "grad_norm": 0.8966812671241432, + "language_loss": 0.63467383, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.65778446, + "num_input_tokens_seen": 88769995, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.16992188, + "step": 4118, + "time_per_iteration": 3.164216995239258 + }, + { + "auxiliary_loss_clip": 0.0155157, + "auxiliary_loss_mlp": 0.01040925, + "balance_loss_clip": 1.34608448, + "balance_loss_mlp": 1.01683307, + "epoch": 0.2476476777393657, + "flos": 20493859668480.0, + "grad_norm": 2.2635271845579688, + "language_loss": 0.80151308, + "learning_rate": 3.523319470415491e-06, + "loss": 0.82743806, + "num_input_tokens_seen": 88789970, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.2409668, + "step": 4119, + "time_per_iteration": 2.847882032394409 + }, + { + "auxiliary_loss_clip": 0.01541145, + "auxiliary_loss_mlp": 0.01044582, + "balance_loss_clip": 1.33763826, + "balance_loss_mlp": 1.02027547, + "epoch": 0.24770780099203366, + "flos": 20495669460480.0, + "grad_norm": 2.58032762765821, + "language_loss": 0.75201511, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.77787232, + "num_input_tokens_seen": 88810000, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.24304199, + "step": 4120, + "time_per_iteration": 2.8408749103546143 + }, + { + "auxiliary_loss_clip": 0.01541548, + "auxiliary_loss_mlp": 0.01047365, + "balance_loss_clip": 1.33533859, + "balance_loss_mlp": 1.02283168, + "epoch": 0.24776792424470165, + "flos": 15160403923200.0, + "grad_norm": 2.0802273664543174, + "language_loss": 0.89138079, + "learning_rate": 3.522814630322041e-06, + "loss": 0.91726995, + "num_input_tokens_seen": 88827515, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.24511719, + "step": 4121, + "time_per_iteration": 2.8349595069885254 + }, + { + "auxiliary_loss_clip": 0.01558679, + "auxiliary_loss_mlp": 0.0104009, + "balance_loss_clip": 1.34981775, + "balance_loss_mlp": 1.01488876, + "epoch": 0.2478280474973696, + "flos": 21735353750400.0, + "grad_norm": 2.0962796903888536, + "language_loss": 0.70904028, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.73502803, + "num_input_tokens_seen": 88845025, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.25231934, + "step": 4122, + "time_per_iteration": 2.824787139892578 + }, + { + "auxiliary_loss_clip": 0.01545081, + "auxiliary_loss_mlp": 0.01043738, + "balance_loss_clip": 1.33681202, + "balance_loss_mlp": 1.01775002, + "epoch": 0.24788817075003758, + "flos": 20421820667520.0, + "grad_norm": 3.0800651570564646, + "language_loss": 0.80694616, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.83283436, + "num_input_tokens_seen": 88861740, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.2598877, + "step": 4123, + "time_per_iteration": 2.8626742362976074 + }, + { + "auxiliary_loss_clip": 0.01536376, + "auxiliary_loss_mlp": 0.01043061, + "balance_loss_clip": 1.33249378, + "balance_loss_mlp": 1.01740754, + "epoch": 0.24794829400270554, + "flos": 22602853163520.0, + "grad_norm": 2.3377607126552657, + "language_loss": 0.75539041, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.78118479, + "num_input_tokens_seen": 88879740, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.25646973, + "step": 4124, + "time_per_iteration": 2.9859230518341064 + }, + { + "auxiliary_loss_clip": 0.01540795, + "auxiliary_loss_mlp": 0.01039952, + "balance_loss_clip": 1.33808208, + "balance_loss_mlp": 1.01621747, + "epoch": 0.2480084172553735, + "flos": 39690501644160.0, + "grad_norm": 1.4101275848800583, + "language_loss": 0.74208021, + "learning_rate": 3.521804257268357e-06, + "loss": 0.76788771, + "num_input_tokens_seen": 88904095, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.23718262, + "step": 4125, + "time_per_iteration": 3.105130910873413 + }, + { + "auxiliary_loss_clip": 0.01571795, + "auxiliary_loss_mlp": 0.01045072, + "balance_loss_clip": 1.35775042, + "balance_loss_mlp": 1.02069414, + "epoch": 0.24806854050804147, + "flos": 22063669153920.0, + "grad_norm": 2.0233768407485377, + "language_loss": 0.69749087, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.72365952, + "num_input_tokens_seen": 88920740, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.24377441, + "step": 4126, + "time_per_iteration": 2.858269453048706 + }, + { + "auxiliary_loss_clip": 0.01530659, + "auxiliary_loss_mlp": 0.01043774, + "balance_loss_clip": 1.3251946, + "balance_loss_mlp": 1.01874042, + "epoch": 0.24812866376070947, + "flos": 15495144088320.0, + "grad_norm": 2.0573556051273822, + "language_loss": 0.82499254, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.85073686, + "num_input_tokens_seen": 88938510, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.25048828, + "step": 4127, + "time_per_iteration": 2.82283091545105 + }, + { + "auxiliary_loss_clip": 0.01541804, + "auxiliary_loss_mlp": 0.01040619, + "balance_loss_clip": 1.33517313, + "balance_loss_mlp": 1.01664615, + "epoch": 0.24818878701337743, + "flos": 14765705118720.0, + "grad_norm": 2.180847698415879, + "language_loss": 0.85056186, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.87638605, + "num_input_tokens_seen": 88955235, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.23974609, + "step": 4128, + "time_per_iteration": 2.8791873455047607 + }, + { + "auxiliary_loss_clip": 0.01557207, + "auxiliary_loss_mlp": 0.01045602, + "balance_loss_clip": 1.34750247, + "balance_loss_mlp": 1.02077079, + "epoch": 0.2482489102660454, + "flos": 27100960565760.0, + "grad_norm": 3.0121155658487693, + "language_loss": 0.66878819, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.69481623, + "num_input_tokens_seen": 88975210, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.24829102, + "step": 4129, + "time_per_iteration": 2.882044792175293 + }, + { + "auxiliary_loss_clip": 0.01535609, + "auxiliary_loss_mlp": 0.01045233, + "balance_loss_clip": 1.32907009, + "balance_loss_mlp": 1.01912594, + "epoch": 0.24830903351871336, + "flos": 26478381110400.0, + "grad_norm": 1.6167037300283822, + "language_loss": 0.75998193, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.78579032, + "num_input_tokens_seen": 88996120, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.2611084, + "step": 4130, + "time_per_iteration": 2.895861864089966 + }, + { + "auxiliary_loss_clip": 0.0153372, + "auxiliary_loss_mlp": 0.01047536, + "balance_loss_clip": 1.328251, + "balance_loss_mlp": 1.02276444, + "epoch": 0.24836915677138133, + "flos": 10234315526400.0, + "grad_norm": 4.354161046787067, + "language_loss": 0.78188908, + "learning_rate": 3.520286966670535e-06, + "loss": 0.80770165, + "num_input_tokens_seen": 89008685, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.24780273, + "step": 4131, + "time_per_iteration": 2.941476583480835 + }, + { + "auxiliary_loss_clip": 0.01530441, + "auxiliary_loss_mlp": 0.01039262, + "balance_loss_clip": 1.32865667, + "balance_loss_mlp": 1.0148716, + "epoch": 0.2484292800240493, + "flos": 30092429502720.0, + "grad_norm": 1.5786114224072478, + "language_loss": 0.84938645, + "learning_rate": 3.520033883075255e-06, + "loss": 0.87508345, + "num_input_tokens_seen": 89031160, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.24414062, + "step": 4132, + "time_per_iteration": 4.39780855178833 + }, + { + "auxiliary_loss_clip": 0.01542177, + "auxiliary_loss_mlp": 0.01042296, + "balance_loss_clip": 1.33744574, + "balance_loss_mlp": 1.01585567, + "epoch": 0.24848940327671726, + "flos": 13450633712640.0, + "grad_norm": 2.213269750036648, + "language_loss": 0.72392821, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.74977291, + "num_input_tokens_seen": 89047235, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.26428223, + "step": 4133, + "time_per_iteration": 2.821784496307373 + }, + { + "auxiliary_loss_clip": 0.01580571, + "auxiliary_loss_mlp": 0.01046618, + "balance_loss_clip": 1.36352706, + "balance_loss_mlp": 1.01943851, + "epoch": 0.24854952652938525, + "flos": 19978682664960.0, + "grad_norm": 2.250191465096188, + "language_loss": 0.63015759, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.65642947, + "num_input_tokens_seen": 89064790, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.27160645, + "step": 4134, + "time_per_iteration": 2.9214653968811035 + }, + { + "auxiliary_loss_clip": 0.01551589, + "auxiliary_loss_mlp": 0.01043089, + "balance_loss_clip": 1.34336793, + "balance_loss_mlp": 1.01791239, + "epoch": 0.24860964978205322, + "flos": 18159247762560.0, + "grad_norm": 1.9121575264078476, + "language_loss": 0.79356509, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.81951189, + "num_input_tokens_seen": 89083250, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.2520752, + "step": 4135, + "time_per_iteration": 2.834266424179077 + }, + { + "auxiliary_loss_clip": 0.01552683, + "auxiliary_loss_mlp": 0.010434, + "balance_loss_clip": 1.34585977, + "balance_loss_mlp": 1.0189265, + "epoch": 0.24866977303472118, + "flos": 11736022308480.0, + "grad_norm": 2.373647640151077, + "language_loss": 0.84245098, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.86841178, + "num_input_tokens_seen": 89100905, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.24511719, + "step": 4136, + "time_per_iteration": 2.8805932998657227 + }, + { + "auxiliary_loss_clip": 0.015648, + "auxiliary_loss_mlp": 0.01042624, + "balance_loss_clip": 1.35274005, + "balance_loss_mlp": 1.01818585, + "epoch": 0.24872989628738915, + "flos": 34836814206720.0, + "grad_norm": 1.6043786979033687, + "language_loss": 0.72031641, + "learning_rate": 3.518767600693314e-06, + "loss": 0.74639064, + "num_input_tokens_seen": 89122630, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.24450684, + "step": 4137, + "time_per_iteration": 2.956493377685547 + }, + { + "auxiliary_loss_clip": 0.01565997, + "auxiliary_loss_mlp": 0.01044528, + "balance_loss_clip": 1.35433137, + "balance_loss_mlp": 1.02036428, + "epoch": 0.2487900195400571, + "flos": 13707362563200.0, + "grad_norm": 2.035887232189882, + "language_loss": 0.68102264, + "learning_rate": 3.518514171403042e-06, + "loss": 0.70712793, + "num_input_tokens_seen": 89141050, + "router_z_loss_clip": 2.11425781, + "router_z_loss_mlp": 0.24157715, + "step": 4138, + "time_per_iteration": 2.8355743885040283 + }, + { + "auxiliary_loss_clip": 0.01547981, + "auxiliary_loss_mlp": 0.01041441, + "balance_loss_clip": 1.34353197, + "balance_loss_mlp": 1.01808834, + "epoch": 0.24885014279272508, + "flos": 25348723470720.0, + "grad_norm": 1.8602379610798447, + "language_loss": 0.85181248, + "learning_rate": 3.51826068453056e-06, + "loss": 0.87770671, + "num_input_tokens_seen": 89160810, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.23352051, + "step": 4139, + "time_per_iteration": 2.8749048709869385 + }, + { + "auxiliary_loss_clip": 0.01566811, + "auxiliary_loss_mlp": 0.0105019, + "balance_loss_clip": 1.35222673, + "balance_loss_mlp": 1.02390385, + "epoch": 0.24891026604539307, + "flos": 20641195296000.0, + "grad_norm": 1.51579934623308, + "language_loss": 0.79580003, + "learning_rate": 3.518007140085481e-06, + "loss": 0.82196999, + "num_input_tokens_seen": 89180610, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.26281738, + "step": 4140, + "time_per_iteration": 4.284176826477051 + }, + { + "auxiliary_loss_clip": 0.01314255, + "auxiliary_loss_mlp": 0.01030413, + "balance_loss_clip": 1.1912508, + "balance_loss_mlp": 1.00599909, + "epoch": 0.24897038929806103, + "flos": 66989454030720.0, + "grad_norm": 0.8386226834367205, + "language_loss": 0.61086941, + "learning_rate": 3.51775353807742e-06, + "loss": 0.63431609, + "num_input_tokens_seen": 89241880, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.24414062, + "step": 4141, + "time_per_iteration": 4.808851718902588 + }, + { + "auxiliary_loss_clip": 0.01569104, + "auxiliary_loss_mlp": 0.01046549, + "balance_loss_clip": 1.35810602, + "balance_loss_mlp": 1.02176547, + "epoch": 0.249030512550729, + "flos": 36406216488960.0, + "grad_norm": 1.7673376466490704, + "language_loss": 0.73644614, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.76260269, + "num_input_tokens_seen": 89263340, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.24804688, + "step": 4142, + "time_per_iteration": 3.0371172428131104 + }, + { + "auxiliary_loss_clip": 0.01551135, + "auxiliary_loss_mlp": 0.01045075, + "balance_loss_clip": 1.34466648, + "balance_loss_mlp": 1.02112567, + "epoch": 0.24909063580339696, + "flos": 20163553493760.0, + "grad_norm": 2.3074238284393966, + "language_loss": 0.82060218, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.84656429, + "num_input_tokens_seen": 89282870, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.23950195, + "step": 4143, + "time_per_iteration": 2.8668746948242188 + }, + { + "auxiliary_loss_clip": 0.01551768, + "auxiliary_loss_mlp": 0.0104499, + "balance_loss_clip": 1.34482682, + "balance_loss_mlp": 1.02095723, + "epoch": 0.24915075905606493, + "flos": 26407925677440.0, + "grad_norm": 2.0285207578531255, + "language_loss": 0.59896815, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.62493575, + "num_input_tokens_seen": 89303830, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.24060059, + "step": 4144, + "time_per_iteration": 4.340224027633667 + }, + { + "auxiliary_loss_clip": 0.01536908, + "auxiliary_loss_mlp": 0.0103772, + "balance_loss_clip": 1.33105135, + "balance_loss_mlp": 1.01372313, + "epoch": 0.2492108823087329, + "flos": 27538759681920.0, + "grad_norm": 2.3742930967975107, + "language_loss": 0.7994616, + "learning_rate": 3.516738554607708e-06, + "loss": 0.82520795, + "num_input_tokens_seen": 89324350, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.23999023, + "step": 4145, + "time_per_iteration": 2.956942319869995 + }, + { + "auxiliary_loss_clip": 0.01568417, + "auxiliary_loss_mlp": 0.01048247, + "balance_loss_clip": 1.35352504, + "balance_loss_mlp": 1.01950526, + "epoch": 0.24927100556140086, + "flos": 16700234088960.0, + "grad_norm": 2.150635292223525, + "language_loss": 0.66812062, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.6942873, + "num_input_tokens_seen": 89342875, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.28771973, + "step": 4146, + "time_per_iteration": 2.843592643737793 + }, + { + "auxiliary_loss_clip": 0.01310694, + "auxiliary_loss_mlp": 0.01029171, + "balance_loss_clip": 1.19017816, + "balance_loss_mlp": 1.00590181, + "epoch": 0.24933112881406885, + "flos": 62802622725120.0, + "grad_norm": 0.9373230418655215, + "language_loss": 0.67289877, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.69629741, + "num_input_tokens_seen": 89404925, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.23242188, + "step": 4147, + "time_per_iteration": 3.4775214195251465 + }, + { + "auxiliary_loss_clip": 0.01549029, + "auxiliary_loss_mlp": 0.01043582, + "balance_loss_clip": 1.34282041, + "balance_loss_mlp": 1.0179286, + "epoch": 0.24939125206673682, + "flos": 26663297184000.0, + "grad_norm": 1.8537607040913286, + "language_loss": 0.89696056, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.92288661, + "num_input_tokens_seen": 89425090, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.2565918, + "step": 4148, + "time_per_iteration": 2.945061683654785 + }, + { + "auxiliary_loss_clip": 0.01566273, + "auxiliary_loss_mlp": 0.01047232, + "balance_loss_clip": 1.35335863, + "balance_loss_mlp": 1.01964664, + "epoch": 0.24945137531940478, + "flos": 20714093948160.0, + "grad_norm": 1.840706232990269, + "language_loss": 0.69624335, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.72237843, + "num_input_tokens_seen": 89442615, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.27600098, + "step": 4149, + "time_per_iteration": 2.8541877269744873 + }, + { + "auxiliary_loss_clip": 0.01551508, + "auxiliary_loss_mlp": 0.01040027, + "balance_loss_clip": 1.34472251, + "balance_loss_mlp": 1.01431334, + "epoch": 0.24951149857207275, + "flos": 23779094964480.0, + "grad_norm": 1.8718381932563004, + "language_loss": 0.72190928, + "learning_rate": 3.515468531258095e-06, + "loss": 0.74782461, + "num_input_tokens_seen": 89463025, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.25708008, + "step": 4150, + "time_per_iteration": 2.966066837310791 + }, + { + "auxiliary_loss_clip": 0.01548874, + "auxiliary_loss_mlp": 0.01040106, + "balance_loss_clip": 1.33679175, + "balance_loss_mlp": 1.01464319, + "epoch": 0.2495716218247407, + "flos": 15672187566720.0, + "grad_norm": 2.109604906928087, + "language_loss": 0.7362417, + "learning_rate": 3.515214354149478e-06, + "loss": 0.76213157, + "num_input_tokens_seen": 89480225, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.2545166, + "step": 4151, + "time_per_iteration": 2.818089008331299 + }, + { + "auxiliary_loss_clip": 0.01571111, + "auxiliary_loss_mlp": 0.0103821, + "balance_loss_clip": 1.35430884, + "balance_loss_mlp": 1.0138917, + "epoch": 0.24963174507740868, + "flos": 24060916696320.0, + "grad_norm": 2.974834033877862, + "language_loss": 0.65797788, + "learning_rate": 3.514960119583781e-06, + "loss": 0.68407112, + "num_input_tokens_seen": 89496985, + "router_z_loss_clip": 2.16601562, + "router_z_loss_mlp": 0.24328613, + "step": 4152, + "time_per_iteration": 2.8644957542419434 + }, + { + "auxiliary_loss_clip": 0.01536699, + "auxiliary_loss_mlp": 0.0104137, + "balance_loss_clip": 1.3329078, + "balance_loss_mlp": 1.01653814, + "epoch": 0.24969186833007664, + "flos": 21809700236160.0, + "grad_norm": 1.9344260272248734, + "language_loss": 0.78211004, + "learning_rate": 3.514705827570645e-06, + "loss": 0.80789077, + "num_input_tokens_seen": 89514420, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.24853516, + "step": 4153, + "time_per_iteration": 2.8515055179595947 + }, + { + "auxiliary_loss_clip": 0.01544796, + "auxiliary_loss_mlp": 0.01043401, + "balance_loss_clip": 1.33732486, + "balance_loss_mlp": 1.01790202, + "epoch": 0.24975199158274464, + "flos": 19947707959680.0, + "grad_norm": 1.8393199048058204, + "language_loss": 0.77895445, + "learning_rate": 3.514451478119711e-06, + "loss": 0.80483639, + "num_input_tokens_seen": 89532925, + "router_z_loss_clip": 2.07519531, + "router_z_loss_mlp": 0.25500488, + "step": 4154, + "time_per_iteration": 2.8365697860717773 + }, + { + "auxiliary_loss_clip": 0.01562775, + "auxiliary_loss_mlp": 0.01045358, + "balance_loss_clip": 1.34551513, + "balance_loss_mlp": 1.01853561, + "epoch": 0.2498121148354126, + "flos": 25349447387520.0, + "grad_norm": 3.1480393824861337, + "language_loss": 0.71448779, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.74056911, + "num_input_tokens_seen": 89552855, + "router_z_loss_clip": 2.171875, + "router_z_loss_mlp": 0.26831055, + "step": 4155, + "time_per_iteration": 2.954972267150879 + }, + { + "auxiliary_loss_clip": 0.01561155, + "auxiliary_loss_mlp": 0.0104241, + "balance_loss_clip": 1.34666371, + "balance_loss_mlp": 1.01768589, + "epoch": 0.24987223808808057, + "flos": 20568613357440.0, + "grad_norm": 5.027013849976075, + "language_loss": 0.75961411, + "learning_rate": 3.513942606943036e-06, + "loss": 0.78564978, + "num_input_tokens_seen": 89572830, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.24743652, + "step": 4156, + "time_per_iteration": 2.844348669052124 + }, + { + "auxiliary_loss_clip": 0.01536613, + "auxiliary_loss_mlp": 0.01040209, + "balance_loss_clip": 1.32984984, + "balance_loss_mlp": 1.01577163, + "epoch": 0.24993236134074853, + "flos": 19756819572480.0, + "grad_norm": 2.2572891174708833, + "language_loss": 0.77839047, + "learning_rate": 3.513688085236591e-06, + "loss": 0.80415869, + "num_input_tokens_seen": 89590345, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.24438477, + "step": 4157, + "time_per_iteration": 2.8707756996154785 + }, + { + "auxiliary_loss_clip": 0.01551269, + "auxiliary_loss_mlp": 0.01049222, + "balance_loss_clip": 1.34053791, + "balance_loss_mlp": 1.02355611, + "epoch": 0.2499924845934165, + "flos": 18779474488320.0, + "grad_norm": 1.9600691760994138, + "language_loss": 0.82339525, + "learning_rate": 3.513433506130942e-06, + "loss": 0.84940016, + "num_input_tokens_seen": 89610295, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.25671387, + "step": 4158, + "time_per_iteration": 2.8424744606018066 + }, + { + "auxiliary_loss_clip": 0.01544118, + "auxiliary_loss_mlp": 0.01042379, + "balance_loss_clip": 1.3344692, + "balance_loss_mlp": 1.01742899, + "epoch": 0.25005260784608446, + "flos": 16880670927360.0, + "grad_norm": 3.102606699580561, + "language_loss": 0.76927948, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.79514444, + "num_input_tokens_seen": 89627795, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.24963379, + "step": 4159, + "time_per_iteration": 2.8461995124816895 + }, + { + "auxiliary_loss_clip": 0.01560712, + "auxiliary_loss_mlp": 0.01047704, + "balance_loss_clip": 1.34614587, + "balance_loss_mlp": 1.02259898, + "epoch": 0.2501127310987524, + "flos": 22134441300480.0, + "grad_norm": 1.7265624399435509, + "language_loss": 0.71721274, + "learning_rate": 3.512924175760649e-06, + "loss": 0.74329692, + "num_input_tokens_seen": 89648090, + "router_z_loss_clip": 2.14257812, + "router_z_loss_mlp": 0.25134277, + "step": 4160, + "time_per_iteration": 2.8609492778778076 + }, + { + "auxiliary_loss_clip": 0.01315628, + "auxiliary_loss_mlp": 0.01028599, + "balance_loss_clip": 1.18603945, + "balance_loss_mlp": 1.00246847, + "epoch": 0.2501728543514204, + "flos": 69492214172160.0, + "grad_norm": 0.747175042815921, + "language_loss": 0.56823742, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.59167969, + "num_input_tokens_seen": 89710345, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.26171875, + "step": 4161, + "time_per_iteration": 3.4035096168518066 + }, + { + "auxiliary_loss_clip": 0.01561925, + "auxiliary_loss_mlp": 0.01045671, + "balance_loss_clip": 1.34488273, + "balance_loss_mlp": 1.01963532, + "epoch": 0.25023297760408836, + "flos": 16298703319680.0, + "grad_norm": 1.893776371134238, + "language_loss": 0.8211239, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.84719992, + "num_input_tokens_seen": 89729390, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.26049805, + "step": 4162, + "time_per_iteration": 2.870893955230713 + }, + { + "auxiliary_loss_clip": 0.01551594, + "auxiliary_loss_mlp": 0.01050661, + "balance_loss_clip": 1.33702791, + "balance_loss_mlp": 1.02484012, + "epoch": 0.2502931008567563, + "flos": 12245136508800.0, + "grad_norm": 2.3838014003026524, + "language_loss": 0.88711321, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.91313577, + "num_input_tokens_seen": 89742805, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.25817871, + "step": 4163, + "time_per_iteration": 2.8320767879486084 + }, + { + "auxiliary_loss_clip": 0.01558015, + "auxiliary_loss_mlp": 0.01046184, + "balance_loss_clip": 1.34607971, + "balance_loss_mlp": 1.02050686, + "epoch": 0.25035322410942434, + "flos": 23191381267200.0, + "grad_norm": 1.7011125765934234, + "language_loss": 0.8414377, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.86747968, + "num_input_tokens_seen": 89761145, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.2565918, + "step": 4164, + "time_per_iteration": 2.901123285293579 + }, + { + "auxiliary_loss_clip": 0.01520591, + "auxiliary_loss_mlp": 0.01047904, + "balance_loss_clip": 1.31722438, + "balance_loss_mlp": 1.02223873, + "epoch": 0.2504133473620923, + "flos": 20925867450240.0, + "grad_norm": 1.688983524663447, + "language_loss": 0.75229323, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.77797812, + "num_input_tokens_seen": 89780905, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.2565918, + "step": 4165, + "time_per_iteration": 2.9231045246124268 + }, + { + "auxiliary_loss_clip": 0.015417, + "auxiliary_loss_mlp": 0.01050601, + "balance_loss_clip": 1.32904506, + "balance_loss_mlp": 1.02505469, + "epoch": 0.2504734706147603, + "flos": 20786132949120.0, + "grad_norm": 1.8006735071649989, + "language_loss": 0.7426796, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.76860261, + "num_input_tokens_seen": 89799230, + "router_z_loss_clip": 2.12792969, + "router_z_loss_mlp": 0.25561523, + "step": 4166, + "time_per_iteration": 2.8615317344665527 + }, + { + "auxiliary_loss_clip": 0.01528713, + "auxiliary_loss_mlp": 0.01050988, + "balance_loss_clip": 1.31936371, + "balance_loss_mlp": 1.02522695, + "epoch": 0.25053359386742824, + "flos": 24359162290560.0, + "grad_norm": 1.956298370550903, + "language_loss": 0.82273245, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.84852946, + "num_input_tokens_seen": 89818240, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.25720215, + "step": 4167, + "time_per_iteration": 4.385786056518555 + }, + { + "auxiliary_loss_clip": 0.01527819, + "auxiliary_loss_mlp": 0.01041843, + "balance_loss_clip": 1.32215858, + "balance_loss_mlp": 1.01649928, + "epoch": 0.2505937171200962, + "flos": 21224022554880.0, + "grad_norm": 2.0543361420483786, + "language_loss": 0.81217974, + "learning_rate": 3.51088456024312e-06, + "loss": 0.83787632, + "num_input_tokens_seen": 89834485, + "router_z_loss_clip": 2.05566406, + "router_z_loss_mlp": 0.25390625, + "step": 4168, + "time_per_iteration": 2.8361828327178955 + }, + { + "auxiliary_loss_clip": 0.01559574, + "auxiliary_loss_mlp": 0.01043592, + "balance_loss_clip": 1.34217596, + "balance_loss_mlp": 1.01659155, + "epoch": 0.25065384037276417, + "flos": 41442105312000.0, + "grad_norm": 2.0885710463698035, + "language_loss": 0.70115638, + "learning_rate": 3.510629350383849e-06, + "loss": 0.72718811, + "num_input_tokens_seen": 89855645, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.27001953, + "step": 4169, + "time_per_iteration": 3.017787456512451 + }, + { + "auxiliary_loss_clip": 0.01528144, + "auxiliary_loss_mlp": 0.01044518, + "balance_loss_clip": 1.32069087, + "balance_loss_mlp": 1.01965141, + "epoch": 0.25071396362543213, + "flos": 26113073443200.0, + "grad_norm": 3.6430497085183573, + "language_loss": 0.78590167, + "learning_rate": 3.510374083241361e-06, + "loss": 0.81162822, + "num_input_tokens_seen": 89874895, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.2487793, + "step": 4170, + "time_per_iteration": 2.928492546081543 + }, + { + "auxiliary_loss_clip": 0.01536518, + "auxiliary_loss_mlp": 0.01043871, + "balance_loss_clip": 1.32550764, + "balance_loss_mlp": 1.01708496, + "epoch": 0.2507740868781001, + "flos": 19108332829440.0, + "grad_norm": 2.1207014495117362, + "language_loss": 0.77624774, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.8020516, + "num_input_tokens_seen": 89891700, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.2677002, + "step": 4171, + "time_per_iteration": 2.8056704998016357 + }, + { + "auxiliary_loss_clip": 0.01301313, + "auxiliary_loss_mlp": 0.01048854, + "balance_loss_clip": 1.17628121, + "balance_loss_mlp": 1.01852667, + "epoch": 0.25083421013076806, + "flos": 64372567944960.0, + "grad_norm": 0.9371618396865574, + "language_loss": 0.60094047, + "learning_rate": 3.509863377145458e-06, + "loss": 0.6244421, + "num_input_tokens_seen": 89955775, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.30273438, + "step": 4172, + "time_per_iteration": 3.351564407348633 + }, + { + "auxiliary_loss_clip": 0.01532683, + "auxiliary_loss_mlp": 0.01046202, + "balance_loss_clip": 1.32163262, + "balance_loss_mlp": 1.02005947, + "epoch": 0.25089433338343603, + "flos": 24290199936000.0, + "grad_norm": 1.5938772910401175, + "language_loss": 0.79872042, + "learning_rate": 3.509607938211409e-06, + "loss": 0.82450926, + "num_input_tokens_seen": 89977150, + "router_z_loss_clip": 2.10839844, + "router_z_loss_mlp": 0.26159668, + "step": 4173, + "time_per_iteration": 2.8763394355773926 + }, + { + "auxiliary_loss_clip": 0.0154076, + "auxiliary_loss_mlp": 0.01052544, + "balance_loss_clip": 1.330814, + "balance_loss_mlp": 1.02686679, + "epoch": 0.250954456636104, + "flos": 14729889219840.0, + "grad_norm": 3.20039148214578, + "language_loss": 0.84098446, + "learning_rate": 3.509352442032875e-06, + "loss": 0.86691749, + "num_input_tokens_seen": 89994925, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.25695801, + "step": 4174, + "time_per_iteration": 4.23157525062561 + }, + { + "auxiliary_loss_clip": 0.0153795, + "auxiliary_loss_mlp": 0.01044158, + "balance_loss_clip": 1.32779765, + "balance_loss_mlp": 1.01924372, + "epoch": 0.25101457988877196, + "flos": 22283858188800.0, + "grad_norm": 1.9827364250993678, + "language_loss": 0.72213364, + "learning_rate": 3.509096888619545e-06, + "loss": 0.74795473, + "num_input_tokens_seen": 90013235, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.24914551, + "step": 4175, + "time_per_iteration": 2.88515305519104 + }, + { + "auxiliary_loss_clip": 0.01541232, + "auxiliary_loss_mlp": 0.01041888, + "balance_loss_clip": 1.32756782, + "balance_loss_mlp": 1.01531625, + "epoch": 0.2510747031414399, + "flos": 25199035113600.0, + "grad_norm": 2.0307854223475563, + "language_loss": 0.82128352, + "learning_rate": 3.50884127798111e-06, + "loss": 0.84711474, + "num_input_tokens_seen": 90032150, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.26574707, + "step": 4176, + "time_per_iteration": 2.8935353755950928 + }, + { + "auxiliary_loss_clip": 0.01535028, + "auxiliary_loss_mlp": 0.01042244, + "balance_loss_clip": 1.32412231, + "balance_loss_mlp": 1.01512432, + "epoch": 0.25113482639410795, + "flos": 20713912968960.0, + "grad_norm": 2.0573190221891906, + "language_loss": 0.83679163, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.86256433, + "num_input_tokens_seen": 90049085, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.27124023, + "step": 4177, + "time_per_iteration": 4.202538251876831 + }, + { + "auxiliary_loss_clip": 0.01532499, + "auxiliary_loss_mlp": 0.01048559, + "balance_loss_clip": 1.32371736, + "balance_loss_mlp": 1.02306008, + "epoch": 0.2511949496467759, + "flos": 21516884017920.0, + "grad_norm": 3.9475979705968003, + "language_loss": 0.83931816, + "learning_rate": 3.508329885067698e-06, + "loss": 0.86512876, + "num_input_tokens_seen": 90067695, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.25476074, + "step": 4178, + "time_per_iteration": 2.971217632293701 + }, + { + "auxiliary_loss_clip": 0.01523931, + "auxiliary_loss_mlp": 0.01042613, + "balance_loss_clip": 1.31763315, + "balance_loss_mlp": 1.01846159, + "epoch": 0.2512550728994439, + "flos": 20711153036160.0, + "grad_norm": 2.3139927911324976, + "language_loss": 0.76380014, + "learning_rate": 3.508074102812112e-06, + "loss": 0.78946555, + "num_input_tokens_seen": 90083890, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.24145508, + "step": 4179, + "time_per_iteration": 4.259655714035034 + }, + { + "auxiliary_loss_clip": 0.0153536, + "auxiliary_loss_mlp": 0.01053319, + "balance_loss_clip": 1.32370067, + "balance_loss_mlp": 1.02760601, + "epoch": 0.25131519615211184, + "flos": 18487789390080.0, + "grad_norm": 1.9625158235792273, + "language_loss": 0.71811378, + "learning_rate": 3.507818263370206e-06, + "loss": 0.74400061, + "num_input_tokens_seen": 90100995, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.25720215, + "step": 4180, + "time_per_iteration": 2.916630744934082 + }, + { + "auxiliary_loss_clip": 0.01529193, + "auxiliary_loss_mlp": 0.01054008, + "balance_loss_clip": 1.32307267, + "balance_loss_mlp": 1.02859282, + "epoch": 0.2513753194047798, + "flos": 20494538340480.0, + "grad_norm": 2.6340340189286313, + "language_loss": 0.86067271, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.88650471, + "num_input_tokens_seen": 90120365, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.25427246, + "step": 4181, + "time_per_iteration": 2.867891311645508 + }, + { + "auxiliary_loss_clip": 0.01527204, + "auxiliary_loss_mlp": 0.01050235, + "balance_loss_clip": 1.31958175, + "balance_loss_mlp": 1.02495027, + "epoch": 0.25143544265744777, + "flos": 37684205141760.0, + "grad_norm": 2.612496587648746, + "language_loss": 0.69849813, + "learning_rate": 3.507306412966238e-06, + "loss": 0.72427249, + "num_input_tokens_seen": 90142610, + "router_z_loss_clip": 2.07519531, + "router_z_loss_mlp": 0.25292969, + "step": 4182, + "time_per_iteration": 3.0725393295288086 + }, + { + "auxiliary_loss_clip": 0.01301271, + "auxiliary_loss_mlp": 0.01063787, + "balance_loss_clip": 1.17256773, + "balance_loss_mlp": 1.03899157, + "epoch": 0.25149556591011574, + "flos": 69397009833600.0, + "grad_norm": 0.9848465998454152, + "language_loss": 0.70206356, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.72571415, + "num_input_tokens_seen": 90200555, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.24804688, + "step": 4183, + "time_per_iteration": 3.4219133853912354 + }, + { + "auxiliary_loss_clip": 0.01531881, + "auxiliary_loss_mlp": 0.01047217, + "balance_loss_clip": 1.3214525, + "balance_loss_mlp": 1.02214742, + "epoch": 0.2515556891627837, + "flos": 13998278499840.0, + "grad_norm": 1.7110555973656565, + "language_loss": 0.75452155, + "learning_rate": 3.506794333933431e-06, + "loss": 0.78031254, + "num_input_tokens_seen": 90218120, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.25061035, + "step": 4184, + "time_per_iteration": 2.978550910949707 + }, + { + "auxiliary_loss_clip": 0.01529584, + "auxiliary_loss_mlp": 0.01049241, + "balance_loss_clip": 1.32228577, + "balance_loss_mlp": 1.02418351, + "epoch": 0.25161581241545167, + "flos": 22173605314560.0, + "grad_norm": 1.7464299157045184, + "language_loss": 0.83694935, + "learning_rate": 3.506538208705484e-06, + "loss": 0.86273766, + "num_input_tokens_seen": 90236790, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.25036621, + "step": 4185, + "time_per_iteration": 2.9578592777252197 + }, + { + "auxiliary_loss_clip": 0.01309932, + "auxiliary_loss_mlp": 0.0102033, + "balance_loss_clip": 1.18226206, + "balance_loss_mlp": 1.00039816, + "epoch": 0.25167593566811963, + "flos": 69385608144000.0, + "grad_norm": 0.7901497563145897, + "language_loss": 0.61514294, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.6384455, + "num_input_tokens_seen": 90297070, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.19921875, + "step": 4186, + "time_per_iteration": 3.287594795227051 + }, + { + "auxiliary_loss_clip": 0.01531929, + "auxiliary_loss_mlp": 0.01043509, + "balance_loss_clip": 1.32294226, + "balance_loss_mlp": 1.01773643, + "epoch": 0.2517360589207876, + "flos": 13269337223040.0, + "grad_norm": 1.8831487239081453, + "language_loss": 0.80176532, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.82751977, + "num_input_tokens_seen": 90315255, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.25817871, + "step": 4187, + "time_per_iteration": 2.9569191932678223 + }, + { + "auxiliary_loss_clip": 0.01527167, + "auxiliary_loss_mlp": 0.01048781, + "balance_loss_clip": 1.32097793, + "balance_loss_mlp": 1.02291274, + "epoch": 0.25179618217345556, + "flos": 20386728685440.0, + "grad_norm": 1.4860743309875963, + "language_loss": 0.80742657, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.83318609, + "num_input_tokens_seen": 90334990, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.25878906, + "step": 4188, + "time_per_iteration": 2.869967222213745 + }, + { + "auxiliary_loss_clip": 0.01541411, + "auxiliary_loss_mlp": 0.01046458, + "balance_loss_clip": 1.33296895, + "balance_loss_mlp": 1.02224672, + "epoch": 0.25185630542612353, + "flos": 27674105437440.0, + "grad_norm": 1.8490157684553794, + "language_loss": 0.75509262, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.78097129, + "num_input_tokens_seen": 90351825, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.24182129, + "step": 4189, + "time_per_iteration": 2.937288999557495 + }, + { + "auxiliary_loss_clip": 0.01521506, + "auxiliary_loss_mlp": 0.0104703, + "balance_loss_clip": 1.31734574, + "balance_loss_mlp": 1.02421319, + "epoch": 0.25191642867879155, + "flos": 21006321984000.0, + "grad_norm": 1.9743094490745905, + "language_loss": 0.8580417, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.88372707, + "num_input_tokens_seen": 90369860, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.22827148, + "step": 4190, + "time_per_iteration": 2.832486867904663 + }, + { + "auxiliary_loss_clip": 0.015235, + "auxiliary_loss_mlp": 0.01054588, + "balance_loss_clip": 1.31599689, + "balance_loss_mlp": 1.02814698, + "epoch": 0.2519765519314595, + "flos": 21115534227840.0, + "grad_norm": 2.377170522915985, + "language_loss": 0.76019627, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.78597713, + "num_input_tokens_seen": 90389245, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.26452637, + "step": 4191, + "time_per_iteration": 2.876530408859253 + }, + { + "auxiliary_loss_clip": 0.01297433, + "auxiliary_loss_mlp": 0.01030499, + "balance_loss_clip": 1.17525125, + "balance_loss_mlp": 1.01009011, + "epoch": 0.2520366751841275, + "flos": 62777484599040.0, + "grad_norm": 0.7207595514930212, + "language_loss": 0.57275057, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.59602988, + "num_input_tokens_seen": 90456735, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.20410156, + "step": 4192, + "time_per_iteration": 3.453383684158325 + }, + { + "auxiliary_loss_clip": 0.01531145, + "auxiliary_loss_mlp": 0.01052255, + "balance_loss_clip": 1.32703948, + "balance_loss_mlp": 1.02682805, + "epoch": 0.25209679843679544, + "flos": 22239310043520.0, + "grad_norm": 2.022261416183438, + "language_loss": 0.76602805, + "learning_rate": 3.504487151087323e-06, + "loss": 0.79186201, + "num_input_tokens_seen": 90474165, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.25427246, + "step": 4193, + "time_per_iteration": 2.835674285888672 + }, + { + "auxiliary_loss_clip": 0.0152968, + "auxiliary_loss_mlp": 0.01050561, + "balance_loss_clip": 1.32064915, + "balance_loss_mlp": 1.02499115, + "epoch": 0.2521569216894634, + "flos": 12174183383040.0, + "grad_norm": 2.0979232296172614, + "language_loss": 0.84814847, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.8739509, + "num_input_tokens_seen": 90491660, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.2557373, + "step": 4194, + "time_per_iteration": 2.8352138996124268 + }, + { + "auxiliary_loss_clip": 0.01535041, + "auxiliary_loss_mlp": 0.01058467, + "balance_loss_clip": 1.32416821, + "balance_loss_mlp": 1.03419614, + "epoch": 0.2522170449421314, + "flos": 23711263729920.0, + "grad_norm": 1.5637017349119167, + "language_loss": 0.88841426, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.91434938, + "num_input_tokens_seen": 90514025, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.24267578, + "step": 4195, + "time_per_iteration": 2.912318468093872 + }, + { + "auxiliary_loss_clip": 0.01543547, + "auxiliary_loss_mlp": 0.01052836, + "balance_loss_clip": 1.33422089, + "balance_loss_mlp": 1.02688396, + "epoch": 0.25227716819479934, + "flos": 20964171813120.0, + "grad_norm": 1.7695295400440785, + "language_loss": 0.87001407, + "learning_rate": 3.503717062883053e-06, + "loss": 0.89597785, + "num_input_tokens_seen": 90533530, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.25964355, + "step": 4196, + "time_per_iteration": 2.8785829544067383 + }, + { + "auxiliary_loss_clip": 0.01539305, + "auxiliary_loss_mlp": 0.01051446, + "balance_loss_clip": 1.32774568, + "balance_loss_mlp": 1.02656746, + "epoch": 0.2523372914474673, + "flos": 23341793541120.0, + "grad_norm": 1.9834047177443144, + "language_loss": 0.84497219, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.87087971, + "num_input_tokens_seen": 90554025, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.2487793, + "step": 4197, + "time_per_iteration": 2.9115893840789795 + }, + { + "auxiliary_loss_clip": 0.01528974, + "auxiliary_loss_mlp": 0.01052734, + "balance_loss_clip": 1.31840324, + "balance_loss_mlp": 1.0268898, + "epoch": 0.25239741470013527, + "flos": 36982573741440.0, + "grad_norm": 2.245463716551549, + "language_loss": 0.73933476, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.76515186, + "num_input_tokens_seen": 90576930, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.25878906, + "step": 4198, + "time_per_iteration": 2.972625494003296 + }, + { + "auxiliary_loss_clip": 0.01536599, + "auxiliary_loss_mlp": 0.01048744, + "balance_loss_clip": 1.32364082, + "balance_loss_mlp": 1.02357864, + "epoch": 0.25245753795280323, + "flos": 18525098367360.0, + "grad_norm": 1.8066810630308692, + "language_loss": 0.77994812, + "learning_rate": 3.50294646148888e-06, + "loss": 0.80580157, + "num_input_tokens_seen": 90595710, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.25158691, + "step": 4199, + "time_per_iteration": 2.845914602279663 + }, + { + "auxiliary_loss_clip": 0.01540875, + "auxiliary_loss_mlp": 0.01051058, + "balance_loss_clip": 1.32765007, + "balance_loss_mlp": 1.02515364, + "epoch": 0.2525176612054712, + "flos": 32358079054080.0, + "grad_norm": 1.6322777585658708, + "language_loss": 0.74173975, + "learning_rate": 3.502689480360739e-06, + "loss": 0.76765907, + "num_input_tokens_seen": 90617945, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.2590332, + "step": 4200, + "time_per_iteration": 2.9290146827697754 + }, + { + "auxiliary_loss_clip": 0.01520343, + "auxiliary_loss_mlp": 0.01042965, + "balance_loss_clip": 1.31199682, + "balance_loss_mlp": 1.0184201, + "epoch": 0.25257778445813917, + "flos": 45274080499200.0, + "grad_norm": 1.5378463036957457, + "language_loss": 0.82981312, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.85544622, + "num_input_tokens_seen": 90640855, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.24560547, + "step": 4201, + "time_per_iteration": 3.0643723011016846 + }, + { + "auxiliary_loss_clip": 0.01534806, + "auxiliary_loss_mlp": 0.01047535, + "balance_loss_clip": 1.32228959, + "balance_loss_mlp": 1.02216744, + "epoch": 0.25263790771080713, + "flos": 23378242867200.0, + "grad_norm": 1.7711968013888884, + "language_loss": 0.76044232, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.78626567, + "num_input_tokens_seen": 90661350, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.25390625, + "step": 4202, + "time_per_iteration": 2.895042896270752 + }, + { + "auxiliary_loss_clip": 0.01519595, + "auxiliary_loss_mlp": 0.01041021, + "balance_loss_clip": 1.31436014, + "balance_loss_mlp": 1.01649976, + "epoch": 0.25269803096347515, + "flos": 18524193471360.0, + "grad_norm": 1.8901598671031354, + "language_loss": 0.7452482, + "learning_rate": 3.501918195122491e-06, + "loss": 0.77085435, + "num_input_tokens_seen": 90680540, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.2454834, + "step": 4203, + "time_per_iteration": 4.242213249206543 + }, + { + "auxiliary_loss_clip": 0.01542016, + "auxiliary_loss_mlp": 0.01046139, + "balance_loss_clip": 1.32970238, + "balance_loss_mlp": 1.02042532, + "epoch": 0.2527581542161431, + "flos": 24621908699520.0, + "grad_norm": 1.382876129681569, + "language_loss": 0.78170729, + "learning_rate": 3.501660986124297e-06, + "loss": 0.80758888, + "num_input_tokens_seen": 90703460, + "router_z_loss_clip": 2.125, + "router_z_loss_mlp": 0.25708008, + "step": 4204, + "time_per_iteration": 2.908308744430542 + }, + { + "auxiliary_loss_clip": 0.01529384, + "auxiliary_loss_mlp": 0.0104784, + "balance_loss_clip": 1.32053828, + "balance_loss_mlp": 1.02377176, + "epoch": 0.2528182774688111, + "flos": 12649020007680.0, + "grad_norm": 2.1470519940759374, + "language_loss": 0.73202235, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.75779462, + "num_input_tokens_seen": 90718815, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.24060059, + "step": 4205, + "time_per_iteration": 2.8494181632995605 + }, + { + "auxiliary_loss_clip": 0.01505781, + "auxiliary_loss_mlp": 0.01045847, + "balance_loss_clip": 1.30538285, + "balance_loss_mlp": 1.02139688, + "epoch": 0.25287840072147905, + "flos": 46953147473280.0, + "grad_norm": 1.4204120761923895, + "language_loss": 0.76603758, + "learning_rate": 3.50114639730826e-06, + "loss": 0.79155397, + "num_input_tokens_seen": 90742125, + "router_z_loss_clip": 2.00390625, + "router_z_loss_mlp": 0.24438477, + "step": 4206, + "time_per_iteration": 3.091952085494995 + }, + { + "auxiliary_loss_clip": 0.01532394, + "auxiliary_loss_mlp": 0.01043994, + "balance_loss_clip": 1.32248795, + "balance_loss_mlp": 1.0191747, + "epoch": 0.252938523974147, + "flos": 18888912956160.0, + "grad_norm": 1.5612702676217622, + "language_loss": 0.80061448, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.82637835, + "num_input_tokens_seen": 90760785, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.24816895, + "step": 4207, + "time_per_iteration": 2.85799241065979 + }, + { + "auxiliary_loss_clip": 0.01515582, + "auxiliary_loss_mlp": 0.01044272, + "balance_loss_clip": 1.30971956, + "balance_loss_mlp": 1.01951265, + "epoch": 0.252998647226815, + "flos": 21444845016960.0, + "grad_norm": 1.550928577560789, + "language_loss": 0.77069825, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.79629683, + "num_input_tokens_seen": 90780045, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.24780273, + "step": 4208, + "time_per_iteration": 2.8857839107513428 + }, + { + "auxiliary_loss_clip": 0.01511107, + "auxiliary_loss_mlp": 0.01043727, + "balance_loss_clip": 1.31037617, + "balance_loss_mlp": 1.01906276, + "epoch": 0.25305877047948294, + "flos": 25451782421760.0, + "grad_norm": 2.020424720531339, + "language_loss": 0.70451498, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.73006332, + "num_input_tokens_seen": 90797980, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.24682617, + "step": 4209, + "time_per_iteration": 4.440242528915405 + }, + { + "auxiliary_loss_clip": 0.01295206, + "auxiliary_loss_mlp": 0.01047014, + "balance_loss_clip": 1.1674614, + "balance_loss_mlp": 1.02202821, + "epoch": 0.2531188937321509, + "flos": 60216032672640.0, + "grad_norm": 0.7638864027724452, + "language_loss": 0.55144906, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.5748713, + "num_input_tokens_seen": 90864865, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.25, + "step": 4210, + "time_per_iteration": 3.418522834777832 + }, + { + "auxiliary_loss_clip": 0.01541881, + "auxiliary_loss_mlp": 0.0103726, + "balance_loss_clip": 1.33202267, + "balance_loss_mlp": 1.0129056, + "epoch": 0.25317901698481887, + "flos": 19691793515520.0, + "grad_norm": 2.2781343232214453, + "language_loss": 0.80875993, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.83455133, + "num_input_tokens_seen": 90882885, + "router_z_loss_clip": 2.09863281, + "router_z_loss_mlp": 0.24377441, + "step": 4211, + "time_per_iteration": 4.25178599357605 + }, + { + "auxiliary_loss_clip": 0.0151196, + "auxiliary_loss_mlp": 0.01045475, + "balance_loss_clip": 1.30905259, + "balance_loss_mlp": 1.02145386, + "epoch": 0.25323914023748684, + "flos": 24434685141120.0, + "grad_norm": 1.6466273105075449, + "language_loss": 0.79180539, + "learning_rate": 3.499601265005622e-06, + "loss": 0.81737971, + "num_input_tokens_seen": 90902985, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.24023438, + "step": 4212, + "time_per_iteration": 2.868617057800293 + }, + { + "auxiliary_loss_clip": 0.01522943, + "auxiliary_loss_mlp": 0.01041812, + "balance_loss_clip": 1.3161633, + "balance_loss_mlp": 1.01717162, + "epoch": 0.2532992634901548, + "flos": 25458433407360.0, + "grad_norm": 1.8914698764183817, + "language_loss": 0.54502666, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.57067418, + "num_input_tokens_seen": 90923550, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.24633789, + "step": 4213, + "time_per_iteration": 2.9028208255767822 + }, + { + "auxiliary_loss_clip": 0.01527675, + "auxiliary_loss_mlp": 0.01045957, + "balance_loss_clip": 1.31843424, + "balance_loss_mlp": 1.01957583, + "epoch": 0.25335938674282277, + "flos": 18889953586560.0, + "grad_norm": 4.755586770229758, + "language_loss": 0.66283149, + "learning_rate": 3.499085765880308e-06, + "loss": 0.68856788, + "num_input_tokens_seen": 90943260, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.26391602, + "step": 4214, + "time_per_iteration": 4.364147424697876 + }, + { + "auxiliary_loss_clip": 0.01288002, + "auxiliary_loss_mlp": 0.01041407, + "balance_loss_clip": 1.16520286, + "balance_loss_mlp": 1.01470375, + "epoch": 0.25341950999549073, + "flos": 53089094557440.0, + "grad_norm": 0.8515879820347452, + "language_loss": 0.5814395, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.60473359, + "num_input_tokens_seen": 90996295, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.26757812, + "step": 4215, + "time_per_iteration": 3.1124112606048584 + }, + { + "auxiliary_loss_clip": 0.01516066, + "auxiliary_loss_mlp": 0.01038442, + "balance_loss_clip": 1.3097856, + "balance_loss_mlp": 1.01392031, + "epoch": 0.2534796332481587, + "flos": 39034775733120.0, + "grad_norm": 1.575038653877057, + "language_loss": 0.8460052, + "learning_rate": 3.498570039373066e-06, + "loss": 0.87155032, + "num_input_tokens_seen": 91017545, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.2454834, + "step": 4216, + "time_per_iteration": 2.9882240295410156 + }, + { + "auxiliary_loss_clip": 0.01524417, + "auxiliary_loss_mlp": 0.0104387, + "balance_loss_clip": 1.3189708, + "balance_loss_mlp": 1.01983762, + "epoch": 0.2535397565008267, + "flos": 23597572250880.0, + "grad_norm": 1.9262801534543614, + "language_loss": 0.81342852, + "learning_rate": 3.498312090875666e-06, + "loss": 0.83911133, + "num_input_tokens_seen": 91037715, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.24047852, + "step": 4217, + "time_per_iteration": 2.900644302368164 + }, + { + "auxiliary_loss_clip": 0.01515062, + "auxiliary_loss_mlp": 0.01037628, + "balance_loss_clip": 1.30890858, + "balance_loss_mlp": 1.01298714, + "epoch": 0.2535998797534947, + "flos": 19290760439040.0, + "grad_norm": 2.9991357016801268, + "language_loss": 0.764328, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.78985488, + "num_input_tokens_seen": 91055295, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.24658203, + "step": 4218, + "time_per_iteration": 2.8198580741882324 + }, + { + "auxiliary_loss_clip": 0.01537782, + "auxiliary_loss_mlp": 0.01048694, + "balance_loss_clip": 1.32758451, + "balance_loss_mlp": 1.02349305, + "epoch": 0.25366000300616265, + "flos": 24034964163840.0, + "grad_norm": 2.1229275507408296, + "language_loss": 0.75820792, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.7840727, + "num_input_tokens_seen": 91075485, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.25219727, + "step": 4219, + "time_per_iteration": 2.9028990268707275 + }, + { + "auxiliary_loss_clip": 0.01532636, + "auxiliary_loss_mlp": 0.01044718, + "balance_loss_clip": 1.32247007, + "balance_loss_mlp": 1.01975596, + "epoch": 0.2537201262588306, + "flos": 16297934158080.0, + "grad_norm": 1.7573000603866413, + "language_loss": 0.82250094, + "learning_rate": 3.497537904525736e-06, + "loss": 0.84827453, + "num_input_tokens_seen": 91093620, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.24975586, + "step": 4220, + "time_per_iteration": 2.8830039501190186 + }, + { + "auxiliary_loss_clip": 0.01533593, + "auxiliary_loss_mlp": 0.01047949, + "balance_loss_clip": 1.32510841, + "balance_loss_mlp": 1.02368999, + "epoch": 0.2537802495114986, + "flos": 23305072746240.0, + "grad_norm": 1.9146424466104128, + "language_loss": 0.72402924, + "learning_rate": 3.497279728822468e-06, + "loss": 0.74984467, + "num_input_tokens_seen": 91114110, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.24255371, + "step": 4221, + "time_per_iteration": 2.961087942123413 + }, + { + "auxiliary_loss_clip": 0.0152793, + "auxiliary_loss_mlp": 0.01049081, + "balance_loss_clip": 1.31650186, + "balance_loss_mlp": 1.02424943, + "epoch": 0.25384037276416654, + "flos": 17648007056640.0, + "grad_norm": 2.468452544230068, + "language_loss": 0.63044596, + "learning_rate": 3.497021496342202e-06, + "loss": 0.65621608, + "num_input_tokens_seen": 91133135, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.24841309, + "step": 4222, + "time_per_iteration": 2.8441786766052246 + }, + { + "auxiliary_loss_clip": 0.01531223, + "auxiliary_loss_mlp": 0.01044747, + "balance_loss_clip": 1.32211423, + "balance_loss_mlp": 1.02039266, + "epoch": 0.2539004960168345, + "flos": 21517019752320.0, + "grad_norm": 1.7366475569368265, + "language_loss": 0.75597346, + "learning_rate": 3.496763207094731e-06, + "loss": 0.78173316, + "num_input_tokens_seen": 91151805, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.24353027, + "step": 4223, + "time_per_iteration": 2.8664767742156982 + }, + { + "auxiliary_loss_clip": 0.01510607, + "auxiliary_loss_mlp": 0.01039449, + "balance_loss_clip": 1.30846667, + "balance_loss_mlp": 1.01734805, + "epoch": 0.2539606192695025, + "flos": 23961341594880.0, + "grad_norm": 1.6718633837605723, + "language_loss": 0.80781442, + "learning_rate": 3.49650486108985e-06, + "loss": 0.83331501, + "num_input_tokens_seen": 91172270, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.22094727, + "step": 4224, + "time_per_iteration": 2.954723596572876 + }, + { + "auxiliary_loss_clip": 0.0151452, + "auxiliary_loss_mlp": 0.0104452, + "balance_loss_clip": 1.31065249, + "balance_loss_mlp": 1.02015388, + "epoch": 0.25402074252217044, + "flos": 24180263775360.0, + "grad_norm": 1.4953989182170675, + "language_loss": 0.78091311, + "learning_rate": 3.496246458337354e-06, + "loss": 0.80650353, + "num_input_tokens_seen": 91192080, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.24353027, + "step": 4225, + "time_per_iteration": 2.844721555709839 + }, + { + "auxiliary_loss_clip": 0.01518711, + "auxiliary_loss_mlp": 0.01049725, + "balance_loss_clip": 1.31381238, + "balance_loss_mlp": 1.02608609, + "epoch": 0.2540808657748384, + "flos": 22312389674880.0, + "grad_norm": 1.8735665398410137, + "language_loss": 0.85736108, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.88304549, + "num_input_tokens_seen": 91211450, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.23657227, + "step": 4226, + "time_per_iteration": 2.890226125717163 + }, + { + "auxiliary_loss_clip": 0.01520633, + "auxiliary_loss_mlp": 0.01047243, + "balance_loss_clip": 1.31512225, + "balance_loss_mlp": 1.02296054, + "epoch": 0.25414098902750637, + "flos": 27610934417280.0, + "grad_norm": 1.417705534420962, + "language_loss": 0.72182524, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.747504, + "num_input_tokens_seen": 91231835, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.24291992, + "step": 4227, + "time_per_iteration": 2.9160988330841064 + }, + { + "auxiliary_loss_clip": 0.01273838, + "auxiliary_loss_mlp": 0.01032696, + "balance_loss_clip": 1.15205359, + "balance_loss_mlp": 1.01486254, + "epoch": 0.25420111228017434, + "flos": 58198651194240.0, + "grad_norm": 0.9783498061141501, + "language_loss": 0.6185109, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.64157629, + "num_input_tokens_seen": 91288755, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.17871094, + "step": 4228, + "time_per_iteration": 3.2070937156677246 + }, + { + "auxiliary_loss_clip": 0.01539772, + "auxiliary_loss_mlp": 0.01041955, + "balance_loss_clip": 1.32909775, + "balance_loss_mlp": 1.01653934, + "epoch": 0.2542612355328423, + "flos": 11469430091520.0, + "grad_norm": 2.1876230903333385, + "language_loss": 0.87795663, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.90377396, + "num_input_tokens_seen": 91302485, + "router_z_loss_clip": 2.10644531, + "router_z_loss_mlp": 0.25427246, + "step": 4229, + "time_per_iteration": 2.8106462955474854 + }, + { + "auxiliary_loss_clip": 0.01539277, + "auxiliary_loss_mlp": 0.01041898, + "balance_loss_clip": 1.33235669, + "balance_loss_mlp": 1.01762724, + "epoch": 0.2543213587855103, + "flos": 22976078670720.0, + "grad_norm": 2.5018685040894804, + "language_loss": 0.77775216, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.80356395, + "num_input_tokens_seen": 91321120, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.24291992, + "step": 4230, + "time_per_iteration": 2.8537495136260986 + }, + { + "auxiliary_loss_clip": 0.0152094, + "auxiliary_loss_mlp": 0.01048387, + "balance_loss_clip": 1.31353629, + "balance_loss_mlp": 1.02250719, + "epoch": 0.2543814820381783, + "flos": 18260587411200.0, + "grad_norm": 1.9017187069990786, + "language_loss": 0.7595731, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.7852664, + "num_input_tokens_seen": 91338575, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.25878906, + "step": 4231, + "time_per_iteration": 2.9102985858917236 + }, + { + "auxiliary_loss_clip": 0.01532093, + "auxiliary_loss_mlp": 0.01046854, + "balance_loss_clip": 1.32175827, + "balance_loss_mlp": 1.02221322, + "epoch": 0.25444160529084625, + "flos": 15641122371840.0, + "grad_norm": 1.5797516428161937, + "language_loss": 0.74691164, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.77270108, + "num_input_tokens_seen": 91357355, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.24658203, + "step": 4232, + "time_per_iteration": 2.8397443294525146 + }, + { + "auxiliary_loss_clip": 0.0152027, + "auxiliary_loss_mlp": 0.01040207, + "balance_loss_clip": 1.31279039, + "balance_loss_mlp": 1.01566195, + "epoch": 0.2545017285435142, + "flos": 24610642744320.0, + "grad_norm": 1.895604092637888, + "language_loss": 0.87973142, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.90533614, + "num_input_tokens_seen": 91376515, + "router_z_loss_clip": 2.07519531, + "router_z_loss_mlp": 0.2454834, + "step": 4233, + "time_per_iteration": 2.8908908367156982 + }, + { + "auxiliary_loss_clip": 0.01499127, + "auxiliary_loss_mlp": 0.01043612, + "balance_loss_clip": 1.2982645, + "balance_loss_mlp": 1.02054453, + "epoch": 0.2545618517961822, + "flos": 24689287486080.0, + "grad_norm": 1.6657064864018902, + "language_loss": 0.75700706, + "learning_rate": 3.493918281539737e-06, + "loss": 0.7824344, + "num_input_tokens_seen": 91397595, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.23071289, + "step": 4234, + "time_per_iteration": 2.871424674987793 + }, + { + "auxiliary_loss_clip": 0.01525862, + "auxiliary_loss_mlp": 0.01041097, + "balance_loss_clip": 1.31765997, + "balance_loss_mlp": 1.01702881, + "epoch": 0.25462197504885015, + "flos": 23925706675200.0, + "grad_norm": 1.4346920545652044, + "language_loss": 0.75571346, + "learning_rate": 3.493659311850379e-06, + "loss": 0.7813831, + "num_input_tokens_seen": 91417775, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.24084473, + "step": 4235, + "time_per_iteration": 2.889403820037842 + }, + { + "auxiliary_loss_clip": 0.01581572, + "auxiliary_loss_mlp": 0.01041563, + "balance_loss_clip": 1.36144555, + "balance_loss_mlp": 1.01613545, + "epoch": 0.2546820983015181, + "flos": 24799857073920.0, + "grad_norm": 2.680127371715491, + "language_loss": 0.65860617, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.68483752, + "num_input_tokens_seen": 91437665, + "router_z_loss_clip": 2.20019531, + "router_z_loss_mlp": 0.25415039, + "step": 4236, + "time_per_iteration": 2.8805904388427734 + }, + { + "auxiliary_loss_clip": 0.01516845, + "auxiliary_loss_mlp": 0.01038646, + "balance_loss_clip": 1.31270349, + "balance_loss_mlp": 1.01489949, + "epoch": 0.2547422215541861, + "flos": 18743387120640.0, + "grad_norm": 1.5403681037642782, + "language_loss": 0.67890245, + "learning_rate": 3.493141202562354e-06, + "loss": 0.70445734, + "num_input_tokens_seen": 91456705, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.23754883, + "step": 4237, + "time_per_iteration": 2.825218677520752 + }, + { + "auxiliary_loss_clip": 0.01520913, + "auxiliary_loss_mlp": 0.01043776, + "balance_loss_clip": 1.31129646, + "balance_loss_mlp": 1.01839674, + "epoch": 0.25480234480685404, + "flos": 21042228372480.0, + "grad_norm": 1.9989598687617298, + "language_loss": 0.76261437, + "learning_rate": 3.492882062983333e-06, + "loss": 0.78826129, + "num_input_tokens_seen": 91475535, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.25390625, + "step": 4238, + "time_per_iteration": 4.2961297035217285 + }, + { + "auxiliary_loss_clip": 0.01525831, + "auxiliary_loss_mlp": 0.01045981, + "balance_loss_clip": 1.3177228, + "balance_loss_mlp": 1.02098274, + "epoch": 0.254862468059522, + "flos": 25092944760960.0, + "grad_norm": 4.151359097962964, + "language_loss": 0.81479347, + "learning_rate": 3.492622866794074e-06, + "loss": 0.84051162, + "num_input_tokens_seen": 91499140, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.24987793, + "step": 4239, + "time_per_iteration": 2.9050424098968506 + }, + { + "auxiliary_loss_clip": 0.01514068, + "auxiliary_loss_mlp": 0.01043035, + "balance_loss_clip": 1.31100869, + "balance_loss_mlp": 1.01840639, + "epoch": 0.25492259131219, + "flos": 20568115664640.0, + "grad_norm": 1.663637818419357, + "language_loss": 0.78510821, + "learning_rate": 3.492363614004407e-06, + "loss": 0.8106792, + "num_input_tokens_seen": 91518335, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.24633789, + "step": 4240, + "time_per_iteration": 2.81251859664917 + }, + { + "auxiliary_loss_clip": 0.01533732, + "auxiliary_loss_mlp": 0.01042693, + "balance_loss_clip": 1.32295632, + "balance_loss_mlp": 1.01758778, + "epoch": 0.25498271456485794, + "flos": 25052423402880.0, + "grad_norm": 1.7158471663124326, + "language_loss": 0.84004438, + "learning_rate": 3.492104304624162e-06, + "loss": 0.86580867, + "num_input_tokens_seen": 91537655, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.25085449, + "step": 4241, + "time_per_iteration": 2.891291379928589 + }, + { + "auxiliary_loss_clip": 0.01531187, + "auxiliary_loss_mlp": 0.01042448, + "balance_loss_clip": 1.32259905, + "balance_loss_mlp": 1.01830804, + "epoch": 0.2550428378175259, + "flos": 26189908392960.0, + "grad_norm": 2.0104611761517224, + "language_loss": 0.74143374, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.76717007, + "num_input_tokens_seen": 91557545, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.24133301, + "step": 4242, + "time_per_iteration": 2.8750274181365967 + }, + { + "auxiliary_loss_clip": 0.0154318, + "auxiliary_loss_mlp": 0.01044245, + "balance_loss_clip": 1.33308148, + "balance_loss_mlp": 1.01960397, + "epoch": 0.2551029610701939, + "flos": 15275317011840.0, + "grad_norm": 2.6688108268079604, + "language_loss": 0.74190927, + "learning_rate": 3.491585516131273e-06, + "loss": 0.76778352, + "num_input_tokens_seen": 91574405, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.24633789, + "step": 4243, + "time_per_iteration": 2.8517098426818848 + }, + { + "auxiliary_loss_clip": 0.01519511, + "auxiliary_loss_mlp": 0.01048513, + "balance_loss_clip": 1.31219828, + "balance_loss_mlp": 1.02358675, + "epoch": 0.2551630843228619, + "flos": 18120852910080.0, + "grad_norm": 1.611570058369904, + "language_loss": 0.82834005, + "learning_rate": 3.491326037038301e-06, + "loss": 0.85402024, + "num_input_tokens_seen": 91593755, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.24914551, + "step": 4244, + "time_per_iteration": 4.2871222496032715 + }, + { + "auxiliary_loss_clip": 0.01293774, + "auxiliary_loss_mlp": 0.01025171, + "balance_loss_clip": 1.16943383, + "balance_loss_mlp": 1.0066694, + "epoch": 0.25522320757552985, + "flos": 70555669390080.0, + "grad_norm": 0.682864921752293, + "language_loss": 0.57729113, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.60048056, + "num_input_tokens_seen": 91660335, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.18457031, + "step": 4245, + "time_per_iteration": 3.429171085357666 + }, + { + "auxiliary_loss_clip": 0.01543188, + "auxiliary_loss_mlp": 0.01053599, + "balance_loss_clip": 1.33100188, + "balance_loss_mlp": 1.0282793, + "epoch": 0.2552833308281978, + "flos": 22903315752960.0, + "grad_norm": 2.0550025370386056, + "language_loss": 0.66928113, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.69524896, + "num_input_tokens_seen": 91678500, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.25317383, + "step": 4246, + "time_per_iteration": 4.288013696670532 + }, + { + "auxiliary_loss_clip": 0.01515883, + "auxiliary_loss_mlp": 0.01040404, + "balance_loss_clip": 1.31519651, + "balance_loss_mlp": 1.01768243, + "epoch": 0.2553434540808658, + "flos": 22063533419520.0, + "grad_norm": 1.7218867763908496, + "language_loss": 0.82214606, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.84770894, + "num_input_tokens_seen": 91696430, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.22705078, + "step": 4247, + "time_per_iteration": 2.856114149093628 + }, + { + "auxiliary_loss_clip": 0.01554774, + "auxiliary_loss_mlp": 0.01046682, + "balance_loss_clip": 1.33868551, + "balance_loss_mlp": 1.02123094, + "epoch": 0.25540357733353375, + "flos": 16552672237440.0, + "grad_norm": 2.1450477017325893, + "language_loss": 0.84519672, + "learning_rate": 3.490287555252514e-06, + "loss": 0.87121123, + "num_input_tokens_seen": 91713270, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.25476074, + "step": 4248, + "time_per_iteration": 2.804044246673584 + }, + { + "auxiliary_loss_clip": 0.01548175, + "auxiliary_loss_mlp": 0.01049273, + "balance_loss_clip": 1.33798599, + "balance_loss_mlp": 1.02457309, + "epoch": 0.2554637005862017, + "flos": 17573570081280.0, + "grad_norm": 2.0952920265659265, + "language_loss": 0.85303271, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.87900716, + "num_input_tokens_seen": 91728865, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.24707031, + "step": 4249, + "time_per_iteration": 4.22739839553833 + }, + { + "auxiliary_loss_clip": 0.01294335, + "auxiliary_loss_mlp": 0.01027265, + "balance_loss_clip": 1.16865146, + "balance_loss_mlp": 1.00704753, + "epoch": 0.2555238238388697, + "flos": 72273746661120.0, + "grad_norm": 0.7678677995863831, + "language_loss": 0.56356514, + "learning_rate": 3.489767975249115e-06, + "loss": 0.58678114, + "num_input_tokens_seen": 91787470, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.20214844, + "step": 4250, + "time_per_iteration": 3.402534008026123 + }, + { + "auxiliary_loss_clip": 0.01535048, + "auxiliary_loss_mlp": 0.01048438, + "balance_loss_clip": 1.32534611, + "balance_loss_mlp": 1.02277207, + "epoch": 0.25558394709153764, + "flos": 24400136096640.0, + "grad_norm": 1.9149506597383026, + "language_loss": 0.82218701, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.84802186, + "num_input_tokens_seen": 91805640, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.25683594, + "step": 4251, + "time_per_iteration": 2.895263195037842 + }, + { + "auxiliary_loss_clip": 0.01290421, + "auxiliary_loss_mlp": 0.01055961, + "balance_loss_clip": 1.16685295, + "balance_loss_mlp": 1.03307283, + "epoch": 0.2556440703442056, + "flos": 69263247646080.0, + "grad_norm": 0.8041048749186148, + "language_loss": 0.66197085, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.6854347, + "num_input_tokens_seen": 91869695, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.22851562, + "step": 4252, + "time_per_iteration": 3.3405096530914307 + }, + { + "auxiliary_loss_clip": 0.0151283, + "auxiliary_loss_mlp": 0.01041234, + "balance_loss_clip": 1.31069922, + "balance_loss_mlp": 1.01808345, + "epoch": 0.2557041935968736, + "flos": 24874791742080.0, + "grad_norm": 2.3969089601448266, + "language_loss": 0.74501812, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.77055871, + "num_input_tokens_seen": 91889920, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.23144531, + "step": 4253, + "time_per_iteration": 2.8995838165283203 + }, + { + "auxiliary_loss_clip": 0.01541234, + "auxiliary_loss_mlp": 0.01043054, + "balance_loss_clip": 1.33403349, + "balance_loss_mlp": 1.01867533, + "epoch": 0.25576431684954154, + "flos": 22502327921280.0, + "grad_norm": 1.9192156814862793, + "language_loss": 0.74707383, + "learning_rate": 3.488728137415357e-06, + "loss": 0.77291673, + "num_input_tokens_seen": 91908665, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.24389648, + "step": 4254, + "time_per_iteration": 2.865361452102661 + }, + { + "auxiliary_loss_clip": 0.01527511, + "auxiliary_loss_mlp": 0.01043118, + "balance_loss_clip": 1.31876385, + "balance_loss_mlp": 1.01723742, + "epoch": 0.2558244401022095, + "flos": 19835781027840.0, + "grad_norm": 2.312488940571898, + "language_loss": 0.81931609, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.84502238, + "num_input_tokens_seen": 91927855, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.2590332, + "step": 4255, + "time_per_iteration": 2.8436686992645264 + }, + { + "auxiliary_loss_clip": 0.01518855, + "auxiliary_loss_mlp": 0.01049802, + "balance_loss_clip": 1.3150878, + "balance_loss_mlp": 1.02469623, + "epoch": 0.2558845633548775, + "flos": 23230545281280.0, + "grad_norm": 1.5558396809392558, + "language_loss": 0.86166632, + "learning_rate": 3.488207879742721e-06, + "loss": 0.88735282, + "num_input_tokens_seen": 91948500, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.25134277, + "step": 4256, + "time_per_iteration": 2.859417676925659 + }, + { + "auxiliary_loss_clip": 0.01545739, + "auxiliary_loss_mlp": 0.01040808, + "balance_loss_clip": 1.33175755, + "balance_loss_mlp": 1.01458156, + "epoch": 0.2559446866075455, + "flos": 16846891044480.0, + "grad_norm": 1.7354723665652005, + "language_loss": 0.75842619, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.78429163, + "num_input_tokens_seen": 91968375, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.26208496, + "step": 4257, + "time_per_iteration": 2.8664653301239014 + }, + { + "auxiliary_loss_clip": 0.01283138, + "auxiliary_loss_mlp": 0.01029174, + "balance_loss_clip": 1.15620267, + "balance_loss_mlp": 1.0049504, + "epoch": 0.25600480986021346, + "flos": 57623198837760.0, + "grad_norm": 0.7945477927640593, + "language_loss": 0.65354002, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.67666316, + "num_input_tokens_seen": 92028490, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.2421875, + "step": 4258, + "time_per_iteration": 3.30615234375 + }, + { + "auxiliary_loss_clip": 0.01503733, + "auxiliary_loss_mlp": 0.01048463, + "balance_loss_clip": 1.30224156, + "balance_loss_mlp": 1.0222609, + "epoch": 0.2560649331128814, + "flos": 27831259186560.0, + "grad_norm": 1.7647732466044816, + "language_loss": 0.77528644, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.80080843, + "num_input_tokens_seen": 92048060, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.26208496, + "step": 4259, + "time_per_iteration": 2.8937032222747803 + }, + { + "auxiliary_loss_clip": 0.01280626, + "auxiliary_loss_mlp": 0.01028213, + "balance_loss_clip": 1.1565516, + "balance_loss_mlp": 1.00742328, + "epoch": 0.2561250563655494, + "flos": 70984148077440.0, + "grad_norm": 0.7964492978757748, + "language_loss": 0.58503646, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.60812485, + "num_input_tokens_seen": 92118180, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.20800781, + "step": 4260, + "time_per_iteration": 3.443023204803467 + }, + { + "auxiliary_loss_clip": 0.01503401, + "auxiliary_loss_mlp": 0.01048093, + "balance_loss_clip": 1.29709923, + "balance_loss_mlp": 1.02124739, + "epoch": 0.25618517961821735, + "flos": 27022677782400.0, + "grad_norm": 1.8433915950467596, + "language_loss": 0.77362537, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.79914033, + "num_input_tokens_seen": 92137570, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.26879883, + "step": 4261, + "time_per_iteration": 3.0200788974761963 + }, + { + "auxiliary_loss_clip": 0.01509356, + "auxiliary_loss_mlp": 0.01045848, + "balance_loss_clip": 1.30533862, + "balance_loss_mlp": 1.02126718, + "epoch": 0.2562453028708853, + "flos": 23076694402560.0, + "grad_norm": 1.659973431836026, + "language_loss": 0.84148133, + "learning_rate": 3.486645752648842e-06, + "loss": 0.86703336, + "num_input_tokens_seen": 92157625, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.24584961, + "step": 4262, + "time_per_iteration": 2.9245333671569824 + }, + { + "auxiliary_loss_clip": 0.01522719, + "auxiliary_loss_mlp": 0.01048608, + "balance_loss_clip": 1.30945086, + "balance_loss_mlp": 1.02204847, + "epoch": 0.2563054261235533, + "flos": 15128976769920.0, + "grad_norm": 2.4630944052797044, + "language_loss": 0.75269675, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.77841002, + "num_input_tokens_seen": 92175350, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.26586914, + "step": 4263, + "time_per_iteration": 2.9228169918060303 + }, + { + "auxiliary_loss_clip": 0.01510316, + "auxiliary_loss_mlp": 0.0104551, + "balance_loss_clip": 1.30652905, + "balance_loss_mlp": 1.02060699, + "epoch": 0.25636554937622125, + "flos": 27866305923840.0, + "grad_norm": 1.5491104365930486, + "language_loss": 0.83495986, + "learning_rate": 3.486124592522163e-06, + "loss": 0.8605181, + "num_input_tokens_seen": 92196070, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.24902344, + "step": 4264, + "time_per_iteration": 2.9376823902130127 + }, + { + "auxiliary_loss_clip": 0.01532702, + "auxiliary_loss_mlp": 0.01048565, + "balance_loss_clip": 1.32345641, + "balance_loss_mlp": 1.02274418, + "epoch": 0.2564256726288892, + "flos": 28916549660160.0, + "grad_norm": 1.6470340666173287, + "language_loss": 0.75773448, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.78354716, + "num_input_tokens_seen": 92216310, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.25817871, + "step": 4265, + "time_per_iteration": 2.952333688735962 + }, + { + "auxiliary_loss_clip": 0.01518302, + "auxiliary_loss_mlp": 0.01038908, + "balance_loss_clip": 1.31232166, + "balance_loss_mlp": 1.01330149, + "epoch": 0.2564857958815572, + "flos": 18524057736960.0, + "grad_norm": 2.110845222443224, + "language_loss": 0.82922381, + "learning_rate": 3.485603206979513e-06, + "loss": 0.85479593, + "num_input_tokens_seen": 92234510, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.25610352, + "step": 4266, + "time_per_iteration": 2.8531272411346436 + }, + { + "auxiliary_loss_clip": 0.01494427, + "auxiliary_loss_mlp": 0.01044828, + "balance_loss_clip": 1.2921015, + "balance_loss_mlp": 1.01961589, + "epoch": 0.25654591913422514, + "flos": 25818311698560.0, + "grad_norm": 1.6224734294874987, + "language_loss": 0.80235124, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.82774377, + "num_input_tokens_seen": 92254070, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.25231934, + "step": 4267, + "time_per_iteration": 2.8710947036743164 + }, + { + "auxiliary_loss_clip": 0.01494479, + "auxiliary_loss_mlp": 0.01045165, + "balance_loss_clip": 1.29596639, + "balance_loss_mlp": 1.02040565, + "epoch": 0.2566060423868931, + "flos": 19109192480640.0, + "grad_norm": 1.5682987505188923, + "language_loss": 0.7991811, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.82457757, + "num_input_tokens_seen": 92275060, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.24743652, + "step": 4268, + "time_per_iteration": 2.8485147953033447 + }, + { + "auxiliary_loss_clip": 0.01504564, + "auxiliary_loss_mlp": 0.010491, + "balance_loss_clip": 1.30006981, + "balance_loss_mlp": 1.02444708, + "epoch": 0.25666616563956113, + "flos": 23853396205440.0, + "grad_norm": 1.8988597897716362, + "language_loss": 0.69483197, + "learning_rate": 3.484820706183595e-06, + "loss": 0.72036856, + "num_input_tokens_seen": 92293610, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.2467041, + "step": 4269, + "time_per_iteration": 2.850463628768921 + }, + { + "auxiliary_loss_clip": 0.01523322, + "auxiliary_loss_mlp": 0.0104968, + "balance_loss_clip": 1.31502795, + "balance_loss_mlp": 1.02536166, + "epoch": 0.2567262888922291, + "flos": 14610632630400.0, + "grad_norm": 2.808533407213428, + "language_loss": 0.81224775, + "learning_rate": 3.484559759962666e-06, + "loss": 0.83797777, + "num_input_tokens_seen": 92308305, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.24316406, + "step": 4270, + "time_per_iteration": 2.8116774559020996 + }, + { + "auxiliary_loss_clip": 0.01539784, + "auxiliary_loss_mlp": 0.010552, + "balance_loss_clip": 1.32476544, + "balance_loss_mlp": 1.02760339, + "epoch": 0.25678641214489706, + "flos": 32935024488960.0, + "grad_norm": 1.914207737353283, + "language_loss": 0.69032907, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.71627891, + "num_input_tokens_seen": 92329875, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.27600098, + "step": 4271, + "time_per_iteration": 2.9345316886901855 + }, + { + "auxiliary_loss_clip": 0.01529935, + "auxiliary_loss_mlp": 0.01053619, + "balance_loss_clip": 1.31832242, + "balance_loss_mlp": 1.02883577, + "epoch": 0.256846535397565, + "flos": 24109944076800.0, + "grad_norm": 1.4221674582723618, + "language_loss": 0.8788054, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.90464103, + "num_input_tokens_seen": 92348780, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.24804688, + "step": 4272, + "time_per_iteration": 4.295428037643433 + }, + { + "auxiliary_loss_clip": 0.01535963, + "auxiliary_loss_mlp": 0.01058626, + "balance_loss_clip": 1.32505584, + "balance_loss_mlp": 1.03286493, + "epoch": 0.256906658650233, + "flos": 19727745148800.0, + "grad_norm": 3.2289228564465384, + "language_loss": 0.829391, + "learning_rate": 3.483776583571541e-06, + "loss": 0.8553369, + "num_input_tokens_seen": 92368175, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.25805664, + "step": 4273, + "time_per_iteration": 2.8580329418182373 + }, + { + "auxiliary_loss_clip": 0.01522021, + "auxiliary_loss_mlp": 0.01058066, + "balance_loss_clip": 1.31774497, + "balance_loss_mlp": 1.03361678, + "epoch": 0.25696678190290095, + "flos": 22935331088640.0, + "grad_norm": 1.7194352439697924, + "language_loss": 0.78180712, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.80760801, + "num_input_tokens_seen": 92387755, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.24462891, + "step": 4274, + "time_per_iteration": 2.8791096210479736 + }, + { + "auxiliary_loss_clip": 0.0151791, + "auxiliary_loss_mlp": 0.01053349, + "balance_loss_clip": 1.31382465, + "balance_loss_mlp": 1.02811241, + "epoch": 0.2570269051555689, + "flos": 27319204074240.0, + "grad_norm": 1.7523613899439572, + "language_loss": 0.84449375, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.8702063, + "num_input_tokens_seen": 92409850, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.25256348, + "step": 4275, + "time_per_iteration": 2.9033493995666504 + }, + { + "auxiliary_loss_clip": 0.01541382, + "auxiliary_loss_mlp": 0.01051201, + "balance_loss_clip": 1.33039904, + "balance_loss_mlp": 1.02594066, + "epoch": 0.2570870284082369, + "flos": 27574892294400.0, + "grad_norm": 2.053966000544112, + "language_loss": 0.79462945, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.82055533, + "num_input_tokens_seen": 92431250, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.25280762, + "step": 4276, + "time_per_iteration": 2.9344139099121094 + }, + { + "auxiliary_loss_clip": 0.01535806, + "auxiliary_loss_mlp": 0.01062937, + "balance_loss_clip": 1.32854807, + "balance_loss_mlp": 1.03808141, + "epoch": 0.25714715166090485, + "flos": 28742582828160.0, + "grad_norm": 1.9552830878413154, + "language_loss": 0.80314827, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.82913566, + "num_input_tokens_seen": 92452065, + "router_z_loss_clip": 2.07519531, + "router_z_loss_mlp": 0.24841309, + "step": 4277, + "time_per_iteration": 2.9190521240234375 + }, + { + "auxiliary_loss_clip": 0.01545369, + "auxiliary_loss_mlp": 0.01057213, + "balance_loss_clip": 1.3363626, + "balance_loss_mlp": 1.03229845, + "epoch": 0.2572072749135728, + "flos": 20124615703680.0, + "grad_norm": 10.969114845266425, + "language_loss": 0.80072403, + "learning_rate": 3.482470164419295e-06, + "loss": 0.8267498, + "num_input_tokens_seen": 92470025, + "router_z_loss_clip": 2.08691406, + "router_z_loss_mlp": 0.24914551, + "step": 4278, + "time_per_iteration": 2.885235071182251 + }, + { + "auxiliary_loss_clip": 0.01552526, + "auxiliary_loss_mlp": 0.01051946, + "balance_loss_clip": 1.34156883, + "balance_loss_mlp": 1.02713871, + "epoch": 0.2572673981662408, + "flos": 26041894093440.0, + "grad_norm": 1.7394480729513695, + "language_loss": 0.75070459, + "learning_rate": 3.482208711902952e-06, + "loss": 0.77674937, + "num_input_tokens_seen": 92489825, + "router_z_loss_clip": 2.11035156, + "router_z_loss_mlp": 0.24816895, + "step": 4279, + "time_per_iteration": 4.330492973327637 + }, + { + "auxiliary_loss_clip": 0.01541743, + "auxiliary_loss_mlp": 0.01062, + "balance_loss_clip": 1.32998419, + "balance_loss_mlp": 1.03522539, + "epoch": 0.25732752141890874, + "flos": 16115054100480.0, + "grad_norm": 2.3956694291503795, + "language_loss": 0.86648828, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.89252573, + "num_input_tokens_seen": 92507270, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.26794434, + "step": 4280, + "time_per_iteration": 2.836162805557251 + }, + { + "auxiliary_loss_clip": 0.01549014, + "auxiliary_loss_mlp": 0.01055447, + "balance_loss_clip": 1.33415115, + "balance_loss_mlp": 1.0297339, + "epoch": 0.2573876446715767, + "flos": 22533574095360.0, + "grad_norm": 2.2098473110294883, + "language_loss": 0.79914051, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.82518512, + "num_input_tokens_seen": 92526300, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.25744629, + "step": 4281, + "time_per_iteration": 4.271686315536499 + }, + { + "auxiliary_loss_clip": 0.01536491, + "auxiliary_loss_mlp": 0.01051189, + "balance_loss_clip": 1.32717156, + "balance_loss_mlp": 1.02596462, + "epoch": 0.2574477679242447, + "flos": 23961251105280.0, + "grad_norm": 2.022837623378409, + "language_loss": 0.87542099, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.90129781, + "num_input_tokens_seen": 92546465, + "router_z_loss_clip": 2.09082031, + "router_z_loss_mlp": 0.25219727, + "step": 4282, + "time_per_iteration": 2.9051856994628906 + }, + { + "auxiliary_loss_clip": 0.01549059, + "auxiliary_loss_mlp": 0.01049844, + "balance_loss_clip": 1.33772635, + "balance_loss_mlp": 1.02593052, + "epoch": 0.2575078911769127, + "flos": 21991901621760.0, + "grad_norm": 1.5672992739997922, + "language_loss": 0.71403074, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.74001974, + "num_input_tokens_seen": 92567260, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.23925781, + "step": 4283, + "time_per_iteration": 2.90817928314209 + }, + { + "auxiliary_loss_clip": 0.01518651, + "auxiliary_loss_mlp": 0.01052179, + "balance_loss_clip": 1.31660581, + "balance_loss_mlp": 1.02752614, + "epoch": 0.25756801442958066, + "flos": 21955633274880.0, + "grad_norm": 1.7312683618808944, + "language_loss": 0.81499964, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.84070796, + "num_input_tokens_seen": 92585425, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.24658203, + "step": 4284, + "time_per_iteration": 4.3241307735443115 + }, + { + "auxiliary_loss_clip": 0.01549474, + "auxiliary_loss_mlp": 0.01048931, + "balance_loss_clip": 1.33843017, + "balance_loss_mlp": 1.02443385, + "epoch": 0.2576281376822486, + "flos": 35275472974080.0, + "grad_norm": 1.7973952594267986, + "language_loss": 0.71157438, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.73755848, + "num_input_tokens_seen": 92604770, + "router_z_loss_clip": 2.11132812, + "router_z_loss_mlp": 0.24511719, + "step": 4285, + "time_per_iteration": 2.9777517318725586 + }, + { + "auxiliary_loss_clip": 0.01543469, + "auxiliary_loss_mlp": 0.01045784, + "balance_loss_clip": 1.33498716, + "balance_loss_mlp": 1.02090526, + "epoch": 0.2576882609349166, + "flos": 14139144120960.0, + "grad_norm": 1.786972001403901, + "language_loss": 0.59457946, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.62047195, + "num_input_tokens_seen": 92622635, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.24902344, + "step": 4286, + "time_per_iteration": 2.8152787685394287 + }, + { + "auxiliary_loss_clip": 0.01570535, + "auxiliary_loss_mlp": 0.01047243, + "balance_loss_clip": 1.35712552, + "balance_loss_mlp": 1.02344882, + "epoch": 0.25774838418758456, + "flos": 23268759154560.0, + "grad_norm": 8.051339501603794, + "language_loss": 0.65385038, + "learning_rate": 3.480115069207354e-06, + "loss": 0.68002814, + "num_input_tokens_seen": 92642960, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.23803711, + "step": 4287, + "time_per_iteration": 2.8763434886932373 + }, + { + "auxiliary_loss_clip": 0.01554794, + "auxiliary_loss_mlp": 0.01046921, + "balance_loss_clip": 1.33969152, + "balance_loss_mlp": 1.01968145, + "epoch": 0.2578085074402525, + "flos": 22612037857920.0, + "grad_norm": 1.983639907734974, + "language_loss": 0.72985113, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.75586832, + "num_input_tokens_seen": 92662455, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.27209473, + "step": 4288, + "time_per_iteration": 2.8657469749450684 + }, + { + "auxiliary_loss_clip": 0.01537275, + "auxiliary_loss_mlp": 0.01046399, + "balance_loss_clip": 1.33098376, + "balance_loss_mlp": 1.02177024, + "epoch": 0.2578686306929205, + "flos": 24582608951040.0, + "grad_norm": 1.5897684120963937, + "language_loss": 0.77462614, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.8004629, + "num_input_tokens_seen": 92683520, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.24645996, + "step": 4289, + "time_per_iteration": 2.9498164653778076 + }, + { + "auxiliary_loss_clip": 0.01545845, + "auxiliary_loss_mlp": 0.01046383, + "balance_loss_clip": 1.33643329, + "balance_loss_mlp": 1.02033615, + "epoch": 0.25792875394558845, + "flos": 18123069905280.0, + "grad_norm": 2.8048928586557555, + "language_loss": 0.84936357, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.87528586, + "num_input_tokens_seen": 92701450, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.26062012, + "step": 4290, + "time_per_iteration": 2.8241350650787354 + }, + { + "auxiliary_loss_clip": 0.01550638, + "auxiliary_loss_mlp": 0.01046113, + "balance_loss_clip": 1.33734274, + "balance_loss_mlp": 1.01977932, + "epoch": 0.2579888771982564, + "flos": 17721991584000.0, + "grad_norm": 3.1491716086476935, + "language_loss": 0.73665947, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.76262695, + "num_input_tokens_seen": 92720355, + "router_z_loss_clip": 2.13378906, + "router_z_loss_mlp": 0.26379395, + "step": 4291, + "time_per_iteration": 2.841290235519409 + }, + { + "auxiliary_loss_clip": 0.01551736, + "auxiliary_loss_mlp": 0.01045318, + "balance_loss_clip": 1.33960295, + "balance_loss_mlp": 1.01792419, + "epoch": 0.2580490004509244, + "flos": 16443369504000.0, + "grad_norm": 2.651628341899296, + "language_loss": 0.81892836, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.84489894, + "num_input_tokens_seen": 92736755, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.27404785, + "step": 4292, + "time_per_iteration": 2.851317882537842 + }, + { + "auxiliary_loss_clip": 0.01546222, + "auxiliary_loss_mlp": 0.01044875, + "balance_loss_clip": 1.33707404, + "balance_loss_mlp": 1.01773119, + "epoch": 0.25810912370359235, + "flos": 33847253026560.0, + "grad_norm": 2.1187369557933753, + "language_loss": 0.69604284, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.72195381, + "num_input_tokens_seen": 92757655, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.27148438, + "step": 4293, + "time_per_iteration": 2.953986406326294 + }, + { + "auxiliary_loss_clip": 0.01545088, + "auxiliary_loss_mlp": 0.01047629, + "balance_loss_clip": 1.3369565, + "balance_loss_mlp": 1.02272594, + "epoch": 0.2581692469562603, + "flos": 25203152390400.0, + "grad_norm": 2.914496276230053, + "language_loss": 0.76670265, + "learning_rate": 3.478280185054542e-06, + "loss": 0.79262978, + "num_input_tokens_seen": 92776100, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.24914551, + "step": 4294, + "time_per_iteration": 2.9885363578796387 + }, + { + "auxiliary_loss_clip": 0.01536971, + "auxiliary_loss_mlp": 0.01047338, + "balance_loss_clip": 1.32877147, + "balance_loss_mlp": 1.02069449, + "epoch": 0.2582293702089283, + "flos": 34945257288960.0, + "grad_norm": 2.2207129698642025, + "language_loss": 0.81603003, + "learning_rate": 3.478017834441318e-06, + "loss": 0.84187317, + "num_input_tokens_seen": 92798880, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.26672363, + "step": 4295, + "time_per_iteration": 2.962592124938965 + }, + { + "auxiliary_loss_clip": 0.01553953, + "auxiliary_loss_mlp": 0.01043, + "balance_loss_clip": 1.33813834, + "balance_loss_mlp": 1.0165354, + "epoch": 0.2582894934615963, + "flos": 26845046121600.0, + "grad_norm": 2.4275269131903743, + "language_loss": 0.73627061, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.76224011, + "num_input_tokens_seen": 92817750, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.26489258, + "step": 4296, + "time_per_iteration": 2.891291856765747 + }, + { + "auxiliary_loss_clip": 0.01561237, + "auxiliary_loss_mlp": 0.01041827, + "balance_loss_clip": 1.34872866, + "balance_loss_mlp": 1.01569629, + "epoch": 0.25834961671426426, + "flos": 23526121432320.0, + "grad_norm": 2.2023706091803117, + "language_loss": 0.87716484, + "learning_rate": 3.477492965085067e-06, + "loss": 0.9031955, + "num_input_tokens_seen": 92837995, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.26123047, + "step": 4297, + "time_per_iteration": 2.904721260070801 + }, + { + "auxiliary_loss_clip": 0.01556792, + "auxiliary_loss_mlp": 0.01046975, + "balance_loss_clip": 1.34492564, + "balance_loss_mlp": 1.0223825, + "epoch": 0.25840973996693223, + "flos": 22460042016000.0, + "grad_norm": 1.697232557508499, + "language_loss": 0.85359848, + "learning_rate": 3.477230446361943e-06, + "loss": 0.87963617, + "num_input_tokens_seen": 92857245, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.24621582, + "step": 4298, + "time_per_iteration": 2.838688850402832 + }, + { + "auxiliary_loss_clip": 0.01558589, + "auxiliary_loss_mlp": 0.01047022, + "balance_loss_clip": 1.34696364, + "balance_loss_mlp": 1.02050972, + "epoch": 0.2584698632196002, + "flos": 11298132702720.0, + "grad_norm": 1.9688893338296947, + "language_loss": 0.84200859, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.86806464, + "num_input_tokens_seen": 92873265, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.26538086, + "step": 4299, + "time_per_iteration": 2.839909553527832 + }, + { + "auxiliary_loss_clip": 0.01531507, + "auxiliary_loss_mlp": 0.01043547, + "balance_loss_clip": 1.32603359, + "balance_loss_mlp": 1.01945484, + "epoch": 0.25852998647226816, + "flos": 17938651524480.0, + "grad_norm": 2.4053938938950674, + "language_loss": 0.83931786, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.86506838, + "num_input_tokens_seen": 92890880, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.2409668, + "step": 4300, + "time_per_iteration": 2.8212552070617676 + }, + { + "auxiliary_loss_clip": 0.01548493, + "auxiliary_loss_mlp": 0.01043154, + "balance_loss_clip": 1.33656168, + "balance_loss_mlp": 1.01906133, + "epoch": 0.2585901097249361, + "flos": 33269312206080.0, + "grad_norm": 2.0393505920931334, + "language_loss": 0.68277049, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.70868695, + "num_input_tokens_seen": 92910770, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.24084473, + "step": 4301, + "time_per_iteration": 2.9867055416107178 + }, + { + "auxiliary_loss_clip": 0.01557165, + "auxiliary_loss_mlp": 0.0105263, + "balance_loss_clip": 1.34103405, + "balance_loss_mlp": 1.0252955, + "epoch": 0.2586502329776041, + "flos": 18450208944000.0, + "grad_norm": 2.2601280610731416, + "language_loss": 0.82826036, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.85435832, + "num_input_tokens_seen": 92929520, + "router_z_loss_clip": 2.16210938, + "router_z_loss_mlp": 0.27331543, + "step": 4302, + "time_per_iteration": 2.836373805999756 + }, + { + "auxiliary_loss_clip": 0.01544165, + "auxiliary_loss_mlp": 0.0105071, + "balance_loss_clip": 1.33412194, + "balance_loss_mlp": 1.02462673, + "epoch": 0.25871035623027205, + "flos": 17977227356160.0, + "grad_norm": 2.1166353188074494, + "language_loss": 0.92955744, + "learning_rate": 3.475917012694595e-06, + "loss": 0.95550621, + "num_input_tokens_seen": 92947890, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.26074219, + "step": 4303, + "time_per_iteration": 2.8549914360046387 + }, + { + "auxiliary_loss_clip": 0.0153996, + "auxiliary_loss_mlp": 0.01054705, + "balance_loss_clip": 1.32778442, + "balance_loss_mlp": 1.02429497, + "epoch": 0.25877047948294, + "flos": 27788068385280.0, + "grad_norm": 1.8917274396719457, + "language_loss": 0.68113804, + "learning_rate": 3.475654158020507e-06, + "loss": 0.70708466, + "num_input_tokens_seen": 92967690, + "router_z_loss_clip": 2.12109375, + "router_z_loss_mlp": 0.30432129, + "step": 4304, + "time_per_iteration": 2.9784390926361084 + }, + { + "auxiliary_loss_clip": 0.01547092, + "auxiliary_loss_mlp": 0.01042773, + "balance_loss_clip": 1.33320189, + "balance_loss_mlp": 1.01744056, + "epoch": 0.258830602735608, + "flos": 27137274157440.0, + "grad_norm": 2.3599063192984477, + "language_loss": 0.73215675, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.75805545, + "num_input_tokens_seen": 92986830, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.25317383, + "step": 4305, + "time_per_iteration": 2.887453079223633 + }, + { + "auxiliary_loss_clip": 0.01551682, + "auxiliary_loss_mlp": 0.01046345, + "balance_loss_clip": 1.33708, + "balance_loss_mlp": 1.01957059, + "epoch": 0.25889072598827595, + "flos": 17899623244800.0, + "grad_norm": 1.9113160780371612, + "language_loss": 0.7685281, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.79450834, + "num_input_tokens_seen": 93002740, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.26782227, + "step": 4306, + "time_per_iteration": 2.8557512760162354 + }, + { + "auxiliary_loss_clip": 0.01274753, + "auxiliary_loss_mlp": 0.0105202, + "balance_loss_clip": 1.1498239, + "balance_loss_mlp": 1.03294647, + "epoch": 0.2589508492409439, + "flos": 53960756492160.0, + "grad_norm": 0.8659033066560984, + "language_loss": 0.57197279, + "learning_rate": 3.474865258296403e-06, + "loss": 0.59524053, + "num_input_tokens_seen": 93058645, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.19042969, + "step": 4307, + "time_per_iteration": 4.747069597244263 + }, + { + "auxiliary_loss_clip": 0.01534782, + "auxiliary_loss_mlp": 0.01048489, + "balance_loss_clip": 1.32948637, + "balance_loss_mlp": 1.02272749, + "epoch": 0.2590109724936119, + "flos": 22135527175680.0, + "grad_norm": 1.6019545817153567, + "language_loss": 0.72435629, + "learning_rate": 3.474602179854327e-06, + "loss": 0.75018907, + "num_input_tokens_seen": 93077140, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.25769043, + "step": 4308, + "time_per_iteration": 2.8435721397399902 + }, + { + "auxiliary_loss_clip": 0.01555887, + "auxiliary_loss_mlp": 0.01046554, + "balance_loss_clip": 1.34114563, + "balance_loss_mlp": 1.02150774, + "epoch": 0.2590710957462799, + "flos": 13480658277120.0, + "grad_norm": 1.7034627613247233, + "language_loss": 0.84752935, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.87355375, + "num_input_tokens_seen": 93093580, + "router_z_loss_clip": 2.1484375, + "router_z_loss_mlp": 0.25061035, + "step": 4309, + "time_per_iteration": 2.8347713947296143 + }, + { + "auxiliary_loss_clip": 0.01529549, + "auxiliary_loss_mlp": 0.01044892, + "balance_loss_clip": 1.3236376, + "balance_loss_mlp": 1.02059698, + "epoch": 0.25913121899894787, + "flos": 22316597441280.0, + "grad_norm": 1.5331207278404484, + "language_loss": 0.84971333, + "learning_rate": 3.474075855228966e-06, + "loss": 0.87545776, + "num_input_tokens_seen": 93112345, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.24304199, + "step": 4310, + "time_per_iteration": 2.852576494216919 + }, + { + "auxiliary_loss_clip": 0.01559177, + "auxiliary_loss_mlp": 0.01052964, + "balance_loss_clip": 1.34663117, + "balance_loss_mlp": 1.02460384, + "epoch": 0.25919134225161583, + "flos": 25822655199360.0, + "grad_norm": 4.255204712540294, + "language_loss": 0.78808552, + "learning_rate": 3.473812609065639e-06, + "loss": 0.81420696, + "num_input_tokens_seen": 93131545, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.28356934, + "step": 4311, + "time_per_iteration": 2.8843564987182617 + }, + { + "auxiliary_loss_clip": 0.01551971, + "auxiliary_loss_mlp": 0.01049885, + "balance_loss_clip": 1.33880591, + "balance_loss_mlp": 1.02408791, + "epoch": 0.2592514655042838, + "flos": 31224756585600.0, + "grad_norm": 4.676813196961946, + "language_loss": 0.73626405, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.76228261, + "num_input_tokens_seen": 93150730, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.25805664, + "step": 4312, + "time_per_iteration": 2.9447107315063477 + }, + { + "auxiliary_loss_clip": 0.01536258, + "auxiliary_loss_mlp": 0.01044688, + "balance_loss_clip": 1.32628775, + "balance_loss_mlp": 1.02035737, + "epoch": 0.25931158875695176, + "flos": 18483219665280.0, + "grad_norm": 1.7922669088839218, + "language_loss": 0.7127347, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.73854411, + "num_input_tokens_seen": 93167895, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.24353027, + "step": 4313, + "time_per_iteration": 2.8080410957336426 + }, + { + "auxiliary_loss_clip": 0.01536808, + "auxiliary_loss_mlp": 0.0105155, + "balance_loss_clip": 1.32850313, + "balance_loss_mlp": 1.02647996, + "epoch": 0.2593717120096197, + "flos": 19217183114880.0, + "grad_norm": 1.8536092888964166, + "language_loss": 0.81387448, + "learning_rate": 3.473022535292867e-06, + "loss": 0.83975804, + "num_input_tokens_seen": 93187650, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.25048828, + "step": 4314, + "time_per_iteration": 4.291544437408447 + }, + { + "auxiliary_loss_clip": 0.01548774, + "auxiliary_loss_mlp": 0.01062026, + "balance_loss_clip": 1.33460152, + "balance_loss_mlp": 1.03621757, + "epoch": 0.2594318352622877, + "flos": 31260436750080.0, + "grad_norm": 1.990484932738092, + "language_loss": 0.6831556, + "learning_rate": 3.472759065640968e-06, + "loss": 0.70926368, + "num_input_tokens_seen": 93207370, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.25805664, + "step": 4315, + "time_per_iteration": 2.9445948600769043 + }, + { + "auxiliary_loss_clip": 0.01530844, + "auxiliary_loss_mlp": 0.01054272, + "balance_loss_clip": 1.32362318, + "balance_loss_mlp": 1.0304656, + "epoch": 0.25949195851495566, + "flos": 22247408862720.0, + "grad_norm": 1.5073653180334778, + "language_loss": 0.80207789, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.82792902, + "num_input_tokens_seen": 93227925, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.23828125, + "step": 4316, + "time_per_iteration": 4.272040367126465 + }, + { + "auxiliary_loss_clip": 0.01548486, + "auxiliary_loss_mlp": 0.01049725, + "balance_loss_clip": 1.33228493, + "balance_loss_mlp": 1.02321303, + "epoch": 0.2595520817676236, + "flos": 28087716568320.0, + "grad_norm": 1.7240259445029567, + "language_loss": 0.78552389, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.81150603, + "num_input_tokens_seen": 93250020, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.26513672, + "step": 4317, + "time_per_iteration": 2.9294583797454834 + }, + { + "auxiliary_loss_clip": 0.01536915, + "auxiliary_loss_mlp": 0.0105629, + "balance_loss_clip": 1.32860267, + "balance_loss_mlp": 1.02990901, + "epoch": 0.2596122050202916, + "flos": 20199912330240.0, + "grad_norm": 1.9960473553112512, + "language_loss": 0.78942406, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.81535608, + "num_input_tokens_seen": 93269070, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.26379395, + "step": 4318, + "time_per_iteration": 2.8939156532287598 + }, + { + "auxiliary_loss_clip": 0.01526185, + "auxiliary_loss_mlp": 0.01054258, + "balance_loss_clip": 1.31833911, + "balance_loss_mlp": 1.02687573, + "epoch": 0.25967232827295955, + "flos": 22538008085760.0, + "grad_norm": 1.7701020642633152, + "language_loss": 0.77048516, + "learning_rate": 3.471704628661598e-06, + "loss": 0.79628962, + "num_input_tokens_seen": 93290250, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.27380371, + "step": 4319, + "time_per_iteration": 4.279749155044556 + }, + { + "auxiliary_loss_clip": 0.01527982, + "auxiliary_loss_mlp": 0.01047231, + "balance_loss_clip": 1.32004642, + "balance_loss_mlp": 1.02175605, + "epoch": 0.2597324515256275, + "flos": 21077863292160.0, + "grad_norm": 2.327396823357656, + "language_loss": 0.77148795, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.79724014, + "num_input_tokens_seen": 93310090, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.25476074, + "step": 4320, + "time_per_iteration": 2.8872604370117188 + }, + { + "auxiliary_loss_clip": 0.01535316, + "auxiliary_loss_mlp": 0.01040482, + "balance_loss_clip": 1.32543516, + "balance_loss_mlp": 1.01541209, + "epoch": 0.2597925747782955, + "flos": 22059008939520.0, + "grad_norm": 1.9484503461200693, + "language_loss": 0.72303355, + "learning_rate": 3.471177075288801e-06, + "loss": 0.74879158, + "num_input_tokens_seen": 93329570, + "router_z_loss_clip": 2.09667969, + "router_z_loss_mlp": 0.25061035, + "step": 4321, + "time_per_iteration": 2.879722833633423 + }, + { + "auxiliary_loss_clip": 0.01554151, + "auxiliary_loss_mlp": 0.01043259, + "balance_loss_clip": 1.33849108, + "balance_loss_mlp": 1.01672304, + "epoch": 0.2598526980309635, + "flos": 19546448659200.0, + "grad_norm": 2.0339592354195823, + "language_loss": 0.75761747, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.78359151, + "num_input_tokens_seen": 93347920, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.265625, + "step": 4322, + "time_per_iteration": 2.8381519317626953 + }, + { + "auxiliary_loss_clip": 0.01538885, + "auxiliary_loss_mlp": 0.01049649, + "balance_loss_clip": 1.32688355, + "balance_loss_mlp": 1.02252853, + "epoch": 0.25991282128363147, + "flos": 24505004839680.0, + "grad_norm": 2.4579813246143662, + "language_loss": 0.74213266, + "learning_rate": 3.470649298767278e-06, + "loss": 0.76801801, + "num_input_tokens_seen": 93367145, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.27111816, + "step": 4323, + "time_per_iteration": 2.93650484085083 + }, + { + "auxiliary_loss_clip": 0.01562297, + "auxiliary_loss_mlp": 0.01044502, + "balance_loss_clip": 1.34208572, + "balance_loss_mlp": 1.0176084, + "epoch": 0.25997294453629943, + "flos": 24210695543040.0, + "grad_norm": 2.582305123894663, + "language_loss": 0.67023969, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.69630772, + "num_input_tokens_seen": 93386555, + "router_z_loss_clip": 2.19921875, + "router_z_loss_mlp": 0.26904297, + "step": 4324, + "time_per_iteration": 2.8541853427886963 + }, + { + "auxiliary_loss_clip": 0.01551258, + "auxiliary_loss_mlp": 0.01050293, + "balance_loss_clip": 1.33917475, + "balance_loss_mlp": 1.02548599, + "epoch": 0.2600330677889674, + "flos": 31444085969280.0, + "grad_norm": 1.8090248420102464, + "language_loss": 0.71789181, + "learning_rate": 3.470121299177082e-06, + "loss": 0.74390727, + "num_input_tokens_seen": 93405590, + "router_z_loss_clip": 2.11816406, + "router_z_loss_mlp": 0.24829102, + "step": 4325, + "time_per_iteration": 2.914515972137451 + }, + { + "auxiliary_loss_clip": 0.01541025, + "auxiliary_loss_mlp": 0.01044872, + "balance_loss_clip": 1.33029532, + "balance_loss_mlp": 1.01884842, + "epoch": 0.26009319104163536, + "flos": 32278303192320.0, + "grad_norm": 2.430913848089865, + "language_loss": 0.73754013, + "learning_rate": 3.469857215756257e-06, + "loss": 0.76339906, + "num_input_tokens_seen": 93424750, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.26025391, + "step": 4326, + "time_per_iteration": 2.894897222518921 + }, + { + "auxiliary_loss_clip": 0.01530043, + "auxiliary_loss_mlp": 0.01045814, + "balance_loss_clip": 1.32221341, + "balance_loss_mlp": 1.02172148, + "epoch": 0.26015331429430333, + "flos": 26297989516800.0, + "grad_norm": 1.8475483185590078, + "language_loss": 0.88114959, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.90690815, + "num_input_tokens_seen": 93443465, + "router_z_loss_clip": 2.07714844, + "router_z_loss_mlp": 0.24084473, + "step": 4327, + "time_per_iteration": 2.854595422744751 + }, + { + "auxiliary_loss_clip": 0.01552816, + "auxiliary_loss_mlp": 0.01063959, + "balance_loss_clip": 1.33745885, + "balance_loss_mlp": 1.03543246, + "epoch": 0.2602134375469713, + "flos": 21152255022720.0, + "grad_norm": 1.5252139165794856, + "language_loss": 0.80930191, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.83546966, + "num_input_tokens_seen": 93462580, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.28515625, + "step": 4328, + "time_per_iteration": 2.8479790687561035 + }, + { + "auxiliary_loss_clip": 0.01532305, + "auxiliary_loss_mlp": 0.01056845, + "balance_loss_clip": 1.32056522, + "balance_loss_mlp": 1.0328362, + "epoch": 0.26027356079963926, + "flos": 25932319891200.0, + "grad_norm": 2.2093658083315346, + "language_loss": 0.8832792, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.90917075, + "num_input_tokens_seen": 93482790, + "router_z_loss_clip": 2.11914062, + "router_z_loss_mlp": 0.23986816, + "step": 4329, + "time_per_iteration": 2.8868157863616943 + }, + { + "auxiliary_loss_clip": 0.01530486, + "auxiliary_loss_mlp": 0.01054773, + "balance_loss_clip": 1.32276654, + "balance_loss_mlp": 1.03053808, + "epoch": 0.2603336840523072, + "flos": 26370345231360.0, + "grad_norm": 2.9443156757842823, + "language_loss": 0.79011512, + "learning_rate": 3.468800324801802e-06, + "loss": 0.81596768, + "num_input_tokens_seen": 93498795, + "router_z_loss_clip": 2.07519531, + "router_z_loss_mlp": 0.24255371, + "step": 4330, + "time_per_iteration": 2.8730273246765137 + }, + { + "auxiliary_loss_clip": 0.01542487, + "auxiliary_loss_mlp": 0.0105679, + "balance_loss_clip": 1.32775664, + "balance_loss_mlp": 1.03129053, + "epoch": 0.2603938073049752, + "flos": 23524130661120.0, + "grad_norm": 1.4777406324452096, + "language_loss": 0.76466483, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.79065764, + "num_input_tokens_seen": 93518335, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.25512695, + "step": 4331, + "time_per_iteration": 2.8902511596679688 + }, + { + "auxiliary_loss_clip": 0.0153582, + "auxiliary_loss_mlp": 0.01058447, + "balance_loss_clip": 1.32652569, + "balance_loss_mlp": 1.03368771, + "epoch": 0.26045393055764315, + "flos": 25385218041600.0, + "grad_norm": 1.5539816484942166, + "language_loss": 0.69757259, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.72351521, + "num_input_tokens_seen": 93539170, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.24731445, + "step": 4332, + "time_per_iteration": 2.8708066940307617 + }, + { + "auxiliary_loss_clip": 0.01543269, + "auxiliary_loss_mlp": 0.01055345, + "balance_loss_clip": 1.32920933, + "balance_loss_mlp": 1.0304184, + "epoch": 0.2605140538103111, + "flos": 27646478847360.0, + "grad_norm": 3.18984671962052, + "language_loss": 0.80787873, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.83386493, + "num_input_tokens_seen": 93558480, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.24890137, + "step": 4333, + "time_per_iteration": 2.8994672298431396 + }, + { + "auxiliary_loss_clip": 0.01520277, + "auxiliary_loss_mlp": 0.0104979, + "balance_loss_clip": 1.31424212, + "balance_loss_mlp": 1.02361166, + "epoch": 0.2605741770629791, + "flos": 13777229813760.0, + "grad_norm": 1.9272631156026911, + "language_loss": 0.81590474, + "learning_rate": 3.467742542694501e-06, + "loss": 0.84160542, + "num_input_tokens_seen": 93575220, + "router_z_loss_clip": 2.05566406, + "router_z_loss_mlp": 0.26208496, + "step": 4334, + "time_per_iteration": 2.7789146900177 + }, + { + "auxiliary_loss_clip": 0.0153119, + "auxiliary_loss_mlp": 0.01048621, + "balance_loss_clip": 1.32064021, + "balance_loss_mlp": 1.02222776, + "epoch": 0.26063430031564705, + "flos": 26042934723840.0, + "grad_norm": 1.8163989708306456, + "language_loss": 0.80056745, + "learning_rate": 3.46747795800024e-06, + "loss": 0.82636559, + "num_input_tokens_seen": 93597015, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.26391602, + "step": 4335, + "time_per_iteration": 2.922973871231079 + }, + { + "auxiliary_loss_clip": 0.01296884, + "auxiliary_loss_mlp": 0.01032955, + "balance_loss_clip": 1.1663928, + "balance_loss_mlp": 1.00930405, + "epoch": 0.26069442356831507, + "flos": 62473809628800.0, + "grad_norm": 0.840472374470239, + "language_loss": 0.60826683, + "learning_rate": 3.467213317659068e-06, + "loss": 0.63156521, + "num_input_tokens_seen": 93657775, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.23632812, + "step": 4336, + "time_per_iteration": 3.3345608711242676 + }, + { + "auxiliary_loss_clip": 0.01532544, + "auxiliary_loss_mlp": 0.01055107, + "balance_loss_clip": 1.31960869, + "balance_loss_mlp": 1.02826154, + "epoch": 0.26075454682098304, + "flos": 13634373421440.0, + "grad_norm": 2.1729115752032064, + "language_loss": 0.78052145, + "learning_rate": 3.46694862168102e-06, + "loss": 0.80639791, + "num_input_tokens_seen": 93676145, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.26879883, + "step": 4337, + "time_per_iteration": 2.9384918212890625 + }, + { + "auxiliary_loss_clip": 0.01522997, + "auxiliary_loss_mlp": 0.01048534, + "balance_loss_clip": 1.31211329, + "balance_loss_mlp": 1.02144933, + "epoch": 0.260814670073651, + "flos": 12133119087360.0, + "grad_norm": 2.1358247085364654, + "language_loss": 0.75974, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.78545535, + "num_input_tokens_seen": 93692480, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.27099609, + "step": 4338, + "time_per_iteration": 2.8210909366607666 + }, + { + "auxiliary_loss_clip": 0.0154091, + "auxiliary_loss_mlp": 0.01055746, + "balance_loss_clip": 1.32427144, + "balance_loss_mlp": 1.02754128, + "epoch": 0.26087479332631897, + "flos": 15130967541120.0, + "grad_norm": 2.5868472384030703, + "language_loss": 0.82050383, + "learning_rate": 3.466419062854447e-06, + "loss": 0.84647036, + "num_input_tokens_seen": 93710165, + "router_z_loss_clip": 2.16503906, + "router_z_loss_mlp": 0.28198242, + "step": 4339, + "time_per_iteration": 2.828016519546509 + }, + { + "auxiliary_loss_clip": 0.01512222, + "auxiliary_loss_mlp": 0.01053674, + "balance_loss_clip": 1.30565619, + "balance_loss_mlp": 1.02742422, + "epoch": 0.26093491657898693, + "flos": 24691640215680.0, + "grad_norm": 1.5850070740845916, + "language_loss": 0.77593911, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.80159807, + "num_input_tokens_seen": 93730185, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.26293945, + "step": 4340, + "time_per_iteration": 2.887030601501465 + }, + { + "auxiliary_loss_clip": 0.0153428, + "auxiliary_loss_mlp": 0.01052668, + "balance_loss_clip": 1.32343149, + "balance_loss_mlp": 1.02530932, + "epoch": 0.2609950398316549, + "flos": 25126272195840.0, + "grad_norm": 1.491726682860765, + "language_loss": 0.82819825, + "learning_rate": 3.465889281600845e-06, + "loss": 0.85406768, + "num_input_tokens_seen": 93747690, + "router_z_loss_clip": 2.10644531, + "router_z_loss_mlp": 0.27355957, + "step": 4341, + "time_per_iteration": 2.856893539428711 + }, + { + "auxiliary_loss_clip": 0.01513396, + "auxiliary_loss_mlp": 0.01052665, + "balance_loss_clip": 1.30361438, + "balance_loss_mlp": 1.02547324, + "epoch": 0.26105516308432286, + "flos": 28560336197760.0, + "grad_norm": 2.7702825753716587, + "language_loss": 0.77654958, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.80221021, + "num_input_tokens_seen": 93767405, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.27209473, + "step": 4342, + "time_per_iteration": 4.305824279785156 + }, + { + "auxiliary_loss_clip": 0.01526267, + "auxiliary_loss_mlp": 0.01052165, + "balance_loss_clip": 1.31479979, + "balance_loss_mlp": 1.0243181, + "epoch": 0.2611152863369908, + "flos": 39545156787840.0, + "grad_norm": 2.620990631377114, + "language_loss": 0.66750324, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.69328761, + "num_input_tokens_seen": 93789950, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.27856445, + "step": 4343, + "time_per_iteration": 3.038104772567749 + }, + { + "auxiliary_loss_clip": 0.0153126, + "auxiliary_loss_mlp": 0.01050868, + "balance_loss_clip": 1.31805301, + "balance_loss_mlp": 1.02544069, + "epoch": 0.2611754095896588, + "flos": 13743223706880.0, + "grad_norm": 2.569619971373267, + "language_loss": 0.74668151, + "learning_rate": 3.465094192845553e-06, + "loss": 0.77250278, + "num_input_tokens_seen": 93807835, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.25439453, + "step": 4344, + "time_per_iteration": 2.848231077194214 + }, + { + "auxiliary_loss_clip": 0.01526441, + "auxiliary_loss_mlp": 0.01051836, + "balance_loss_clip": 1.31629348, + "balance_loss_mlp": 1.02497768, + "epoch": 0.26123553284232676, + "flos": 21516522059520.0, + "grad_norm": 3.0636397672614946, + "language_loss": 0.87634742, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.90213019, + "num_input_tokens_seen": 93825670, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.2689209, + "step": 4345, + "time_per_iteration": 2.8392138481140137 + }, + { + "auxiliary_loss_clip": 0.01520914, + "auxiliary_loss_mlp": 0.01050891, + "balance_loss_clip": 1.31482935, + "balance_loss_mlp": 1.02526152, + "epoch": 0.2612956560949947, + "flos": 21149042641920.0, + "grad_norm": 2.409237343793178, + "language_loss": 0.77848101, + "learning_rate": 3.464563855876015e-06, + "loss": 0.80419904, + "num_input_tokens_seen": 93844045, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.25622559, + "step": 4346, + "time_per_iteration": 2.8563320636749268 + }, + { + "auxiliary_loss_clip": 0.01530399, + "auxiliary_loss_mlp": 0.01058969, + "balance_loss_clip": 1.31881511, + "balance_loss_mlp": 1.03105056, + "epoch": 0.2613557793476627, + "flos": 25129891779840.0, + "grad_norm": 1.481790677209732, + "language_loss": 0.76229972, + "learning_rate": 3.464298604081606e-06, + "loss": 0.7881934, + "num_input_tokens_seen": 93864380, + "router_z_loss_clip": 2.1171875, + "router_z_loss_mlp": 0.27929688, + "step": 4347, + "time_per_iteration": 2.8837060928344727 + }, + { + "auxiliary_loss_clip": 0.01518243, + "auxiliary_loss_mlp": 0.01048316, + "balance_loss_clip": 1.31027293, + "balance_loss_mlp": 1.02180386, + "epoch": 0.26141590260033065, + "flos": 26078841112320.0, + "grad_norm": 1.303315032586668, + "language_loss": 0.74417424, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.76983976, + "num_input_tokens_seen": 93885475, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.26513672, + "step": 4348, + "time_per_iteration": 2.912468433380127 + }, + { + "auxiliary_loss_clip": 0.01532938, + "auxiliary_loss_mlp": 0.0104577, + "balance_loss_clip": 1.32359529, + "balance_loss_mlp": 1.02046156, + "epoch": 0.2614760258529987, + "flos": 25712221345920.0, + "grad_norm": 1.7301801179413712, + "language_loss": 0.92008489, + "learning_rate": 3.463767933923799e-06, + "loss": 0.94587195, + "num_input_tokens_seen": 93905545, + "router_z_loss_clip": 2.09472656, + "router_z_loss_mlp": 0.2532959, + "step": 4349, + "time_per_iteration": 4.304780960083008 + }, + { + "auxiliary_loss_clip": 0.01515136, + "auxiliary_loss_mlp": 0.01044367, + "balance_loss_clip": 1.31048977, + "balance_loss_mlp": 1.01779556, + "epoch": 0.26153614910566664, + "flos": 17466077139840.0, + "grad_norm": 2.1628789302838585, + "language_loss": 0.81038678, + "learning_rate": 3.463502515580524e-06, + "loss": 0.83598179, + "num_input_tokens_seen": 93924185, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.26550293, + "step": 4350, + "time_per_iteration": 2.860969066619873 + }, + { + "auxiliary_loss_clip": 0.01506664, + "auxiliary_loss_mlp": 0.01044677, + "balance_loss_clip": 1.30266881, + "balance_loss_mlp": 1.01785505, + "epoch": 0.2615962723583346, + "flos": 17721901094400.0, + "grad_norm": 3.9761005585149674, + "language_loss": 0.63055629, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.6560697, + "num_input_tokens_seen": 93942825, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.26794434, + "step": 4351, + "time_per_iteration": 4.1543285846710205 + }, + { + "auxiliary_loss_clip": 0.01525312, + "auxiliary_loss_mlp": 0.01044105, + "balance_loss_clip": 1.31344199, + "balance_loss_mlp": 1.01719964, + "epoch": 0.26165639561100257, + "flos": 23267989992960.0, + "grad_norm": 2.1742600309903035, + "language_loss": 0.8498807, + "learning_rate": 3.462971512415555e-06, + "loss": 0.87557483, + "num_input_tokens_seen": 93962045, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.2689209, + "step": 4352, + "time_per_iteration": 2.8238565921783447 + }, + { + "auxiliary_loss_clip": 0.0128882, + "auxiliary_loss_mlp": 0.01039168, + "balance_loss_clip": 1.15777326, + "balance_loss_mlp": 1.0118928, + "epoch": 0.26171651886367053, + "flos": 66766188349440.0, + "grad_norm": 0.8151844094540279, + "language_loss": 0.70609063, + "learning_rate": 3.462705927613996e-06, + "loss": 0.72937047, + "num_input_tokens_seen": 94021175, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.2734375, + "step": 4353, + "time_per_iteration": 3.2244582176208496 + }, + { + "auxiliary_loss_clip": 0.01515455, + "auxiliary_loss_mlp": 0.01048229, + "balance_loss_clip": 1.30958331, + "balance_loss_mlp": 1.02113307, + "epoch": 0.2617766421163385, + "flos": 22360647893760.0, + "grad_norm": 1.7085689841927818, + "language_loss": 0.78777766, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.81341445, + "num_input_tokens_seen": 94043370, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.27099609, + "step": 4354, + "time_per_iteration": 4.359747886657715 + }, + { + "auxiliary_loss_clip": 0.01528528, + "auxiliary_loss_mlp": 0.01049155, + "balance_loss_clip": 1.31437993, + "balance_loss_mlp": 1.02297688, + "epoch": 0.26183676536900646, + "flos": 26078117195520.0, + "grad_norm": 2.4481691411642577, + "language_loss": 0.69179416, + "learning_rate": 3.462174591623085e-06, + "loss": 0.71757102, + "num_input_tokens_seen": 94063510, + "router_z_loss_clip": 2.140625, + "router_z_loss_mlp": 0.26171875, + "step": 4355, + "time_per_iteration": 2.8772928714752197 + }, + { + "auxiliary_loss_clip": 0.01518498, + "auxiliary_loss_mlp": 0.01043478, + "balance_loss_clip": 1.30931735, + "balance_loss_mlp": 1.01675153, + "epoch": 0.26189688862167443, + "flos": 21006367228800.0, + "grad_norm": 1.915767444586632, + "language_loss": 0.6803987, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.70601845, + "num_input_tokens_seen": 94083865, + "router_z_loss_clip": 2.09277344, + "router_z_loss_mlp": 0.2677002, + "step": 4356, + "time_per_iteration": 2.8707733154296875 + }, + { + "auxiliary_loss_clip": 0.01295549, + "auxiliary_loss_mlp": 0.01023621, + "balance_loss_clip": 1.16651011, + "balance_loss_mlp": 1.00264001, + "epoch": 0.2619570118743424, + "flos": 65828550216960.0, + "grad_norm": 0.6844197312751421, + "language_loss": 0.53090227, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.55409396, + "num_input_tokens_seen": 94144095, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.20996094, + "step": 4357, + "time_per_iteration": 3.224717617034912 + }, + { + "auxiliary_loss_clip": 0.01535685, + "auxiliary_loss_mlp": 0.01044532, + "balance_loss_clip": 1.32270217, + "balance_loss_mlp": 1.01940215, + "epoch": 0.26201713512701036, + "flos": 28778443971840.0, + "grad_norm": 2.0394699941086, + "language_loss": 0.85458797, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.88039017, + "num_input_tokens_seen": 94163035, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.25085449, + "step": 4358, + "time_per_iteration": 2.8925161361694336 + }, + { + "auxiliary_loss_clip": 0.01557919, + "auxiliary_loss_mlp": 0.01053799, + "balance_loss_clip": 1.33845329, + "balance_loss_mlp": 1.02773964, + "epoch": 0.2620772583796783, + "flos": 26443741576320.0, + "grad_norm": 2.363841264826529, + "language_loss": 0.68988246, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.7159996, + "num_input_tokens_seen": 94182520, + "router_z_loss_clip": 2.19140625, + "router_z_loss_mlp": 0.26074219, + "step": 4359, + "time_per_iteration": 2.8992254734039307 + }, + { + "auxiliary_loss_clip": 0.01540138, + "auxiliary_loss_mlp": 0.01053519, + "balance_loss_clip": 1.32933724, + "balance_loss_mlp": 1.0279721, + "epoch": 0.2621373816323463, + "flos": 20166041957760.0, + "grad_norm": 2.167844238716568, + "language_loss": 0.79266322, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.81859976, + "num_input_tokens_seen": 94201795, + "router_z_loss_clip": 2.10742188, + "router_z_loss_mlp": 0.25561523, + "step": 4360, + "time_per_iteration": 2.8378803730010986 + }, + { + "auxiliary_loss_clip": 0.01505799, + "auxiliary_loss_mlp": 0.01049082, + "balance_loss_clip": 1.3024497, + "balance_loss_mlp": 1.0239054, + "epoch": 0.26219750488501425, + "flos": 28632918136320.0, + "grad_norm": 1.790973432712191, + "language_loss": 0.68711156, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.71266037, + "num_input_tokens_seen": 94222390, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.25170898, + "step": 4361, + "time_per_iteration": 2.932987928390503 + }, + { + "auxiliary_loss_clip": 0.01545726, + "auxiliary_loss_mlp": 0.01059664, + "balance_loss_clip": 1.33280969, + "balance_loss_mlp": 1.03416538, + "epoch": 0.2622576281376823, + "flos": 15049155663360.0, + "grad_norm": 1.7496310228883967, + "language_loss": 0.8450706, + "learning_rate": 3.46031316964119e-06, + "loss": 0.87112451, + "num_input_tokens_seen": 94239980, + "router_z_loss_clip": 2.12890625, + "router_z_loss_mlp": 0.25537109, + "step": 4362, + "time_per_iteration": 2.8265230655670166 + }, + { + "auxiliary_loss_clip": 0.01524059, + "auxiliary_loss_mlp": 0.01057915, + "balance_loss_clip": 1.3179934, + "balance_loss_mlp": 1.02939963, + "epoch": 0.26231775139035024, + "flos": 26407427984640.0, + "grad_norm": 1.730811506941307, + "language_loss": 0.66053694, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.68635672, + "num_input_tokens_seen": 94260715, + "router_z_loss_clip": 2.06152344, + "router_z_loss_mlp": 0.28491211, + "step": 4363, + "time_per_iteration": 2.8585410118103027 + }, + { + "auxiliary_loss_clip": 0.01285645, + "auxiliary_loss_mlp": 0.01031436, + "balance_loss_clip": 1.15961838, + "balance_loss_mlp": 1.00759375, + "epoch": 0.2623778746430182, + "flos": 65442221700480.0, + "grad_norm": 0.8908791774180849, + "language_loss": 0.61177057, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.63494134, + "num_input_tokens_seen": 94321285, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.23828125, + "step": 4364, + "time_per_iteration": 3.4337141513824463 + }, + { + "auxiliary_loss_clip": 0.01540113, + "auxiliary_loss_mlp": 0.01056836, + "balance_loss_clip": 1.32813239, + "balance_loss_mlp": 1.02982354, + "epoch": 0.26243799789568617, + "flos": 12611801520000.0, + "grad_norm": 2.465566876196162, + "language_loss": 0.72888923, + "learning_rate": 3.459514586533184e-06, + "loss": 0.75485873, + "num_input_tokens_seen": 94335420, + "router_z_loss_clip": 2.11816406, + "router_z_loss_mlp": 0.27001953, + "step": 4365, + "time_per_iteration": 2.7960169315338135 + }, + { + "auxiliary_loss_clip": 0.01518978, + "auxiliary_loss_mlp": 0.01051868, + "balance_loss_clip": 1.31402755, + "balance_loss_mlp": 1.02574909, + "epoch": 0.26249812114835414, + "flos": 28636175761920.0, + "grad_norm": 2.728274289119461, + "language_loss": 0.77695417, + "learning_rate": 3.459248281460509e-06, + "loss": 0.80266261, + "num_input_tokens_seen": 94357440, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.2611084, + "step": 4366, + "time_per_iteration": 2.903496503829956 + }, + { + "auxiliary_loss_clip": 0.01539199, + "auxiliary_loss_mlp": 0.01047038, + "balance_loss_clip": 1.32962346, + "balance_loss_mlp": 1.02257609, + "epoch": 0.2625582444010221, + "flos": 14473567572480.0, + "grad_norm": 1.5587902467557506, + "language_loss": 0.76693547, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.7927978, + "num_input_tokens_seen": 94375690, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.24462891, + "step": 4367, + "time_per_iteration": 2.823155641555786 + }, + { + "auxiliary_loss_clip": 0.01530391, + "auxiliary_loss_mlp": 0.0105032, + "balance_loss_clip": 1.32593381, + "balance_loss_mlp": 1.02519059, + "epoch": 0.26261836765369007, + "flos": 16620639206400.0, + "grad_norm": 1.5042125857216515, + "language_loss": 0.70399392, + "learning_rate": 3.458715505320736e-06, + "loss": 0.72980106, + "num_input_tokens_seen": 94393190, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.25109863, + "step": 4368, + "time_per_iteration": 2.7993810176849365 + }, + { + "auxiliary_loss_clip": 0.01518554, + "auxiliary_loss_mlp": 0.01051681, + "balance_loss_clip": 1.31201148, + "balance_loss_mlp": 1.02469254, + "epoch": 0.26267849090635803, + "flos": 20529132629760.0, + "grad_norm": 1.875777821928032, + "language_loss": 0.78976554, + "learning_rate": 3.458449034273841e-06, + "loss": 0.81546789, + "num_input_tokens_seen": 94410975, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.26977539, + "step": 4369, + "time_per_iteration": 2.833770513534546 + }, + { + "auxiliary_loss_clip": 0.01516529, + "auxiliary_loss_mlp": 0.01048082, + "balance_loss_clip": 1.31193447, + "balance_loss_mlp": 1.02239192, + "epoch": 0.262738614159026, + "flos": 21333687246720.0, + "grad_norm": 1.8479244115291924, + "language_loss": 0.84221637, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.86786246, + "num_input_tokens_seen": 94429985, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.25671387, + "step": 4370, + "time_per_iteration": 2.849787712097168 + }, + { + "auxiliary_loss_clip": 0.01540453, + "auxiliary_loss_mlp": 0.01049839, + "balance_loss_clip": 1.32729673, + "balance_loss_mlp": 1.02341032, + "epoch": 0.26279873741169396, + "flos": 17612688850560.0, + "grad_norm": 1.603238933188055, + "language_loss": 0.72193408, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.74783701, + "num_input_tokens_seen": 94448660, + "router_z_loss_clip": 2.13476562, + "router_z_loss_mlp": 0.2644043, + "step": 4371, + "time_per_iteration": 2.8413732051849365 + }, + { + "auxiliary_loss_clip": 0.01284781, + "auxiliary_loss_mlp": 0.01035785, + "balance_loss_clip": 1.15733993, + "balance_loss_mlp": 1.01346934, + "epoch": 0.2628588606643619, + "flos": 60979839707520.0, + "grad_norm": 0.6857680744458399, + "language_loss": 0.56481111, + "learning_rate": 3.457649289346384e-06, + "loss": 0.58801675, + "num_input_tokens_seen": 94515630, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.22363281, + "step": 4372, + "time_per_iteration": 3.4885976314544678 + }, + { + "auxiliary_loss_clip": 0.01514544, + "auxiliary_loss_mlp": 0.010427, + "balance_loss_clip": 1.31047761, + "balance_loss_mlp": 1.01747537, + "epoch": 0.2629189839170299, + "flos": 27027699955200.0, + "grad_norm": 1.5376102209070737, + "language_loss": 0.78757548, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.8131479, + "num_input_tokens_seen": 94535385, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.25244141, + "step": 4373, + "time_per_iteration": 2.884505033493042 + }, + { + "auxiliary_loss_clip": 0.015151, + "auxiliary_loss_mlp": 0.01041468, + "balance_loss_clip": 1.31045091, + "balance_loss_mlp": 1.01592135, + "epoch": 0.26297910716969786, + "flos": 17028820961280.0, + "grad_norm": 3.5563579814290436, + "language_loss": 0.72049582, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.74606144, + "num_input_tokens_seen": 94552650, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.2557373, + "step": 4374, + "time_per_iteration": 2.926116943359375 + }, + { + "auxiliary_loss_clip": 0.01528109, + "auxiliary_loss_mlp": 0.01044733, + "balance_loss_clip": 1.32208014, + "balance_loss_mlp": 1.01854324, + "epoch": 0.2630392304223659, + "flos": 24907485749760.0, + "grad_norm": 2.5622222864093898, + "language_loss": 0.80978173, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.83551013, + "num_input_tokens_seen": 94574075, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.26184082, + "step": 4375, + "time_per_iteration": 2.8686394691467285 + }, + { + "auxiliary_loss_clip": 0.01509123, + "auxiliary_loss_mlp": 0.01041323, + "balance_loss_clip": 1.3056531, + "balance_loss_mlp": 1.01695633, + "epoch": 0.26309935367503384, + "flos": 32866921785600.0, + "grad_norm": 1.825715332964434, + "language_loss": 0.67257237, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.69807684, + "num_input_tokens_seen": 94594255, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.24353027, + "step": 4376, + "time_per_iteration": 2.9296579360961914 + }, + { + "auxiliary_loss_clip": 0.01538352, + "auxiliary_loss_mlp": 0.01047721, + "balance_loss_clip": 1.33103895, + "balance_loss_mlp": 1.0231638, + "epoch": 0.2631594769277018, + "flos": 15896403388800.0, + "grad_norm": 1.9027616750234337, + "language_loss": 0.70038062, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.72624135, + "num_input_tokens_seen": 94611410, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.24560547, + "step": 4377, + "time_per_iteration": 4.25696325302124 + }, + { + "auxiliary_loss_clip": 0.01519416, + "auxiliary_loss_mlp": 0.01046387, + "balance_loss_clip": 1.31376493, + "balance_loss_mlp": 1.02128172, + "epoch": 0.2632196001803698, + "flos": 50822476882560.0, + "grad_norm": 2.168020926484134, + "language_loss": 0.80373544, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.82939339, + "num_input_tokens_seen": 94636575, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.25109863, + "step": 4378, + "time_per_iteration": 3.078158140182495 + }, + { + "auxiliary_loss_clip": 0.01529264, + "auxiliary_loss_mlp": 0.01054869, + "balance_loss_clip": 1.32640028, + "balance_loss_mlp": 1.03170705, + "epoch": 0.26327972343303774, + "flos": 13740328039680.0, + "grad_norm": 2.3896879385624468, + "language_loss": 0.77744091, + "learning_rate": 3.455781283723846e-06, + "loss": 0.8032822, + "num_input_tokens_seen": 94654345, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.23156738, + "step": 4379, + "time_per_iteration": 2.9241106510162354 + }, + { + "auxiliary_loss_clip": 0.01544294, + "auxiliary_loss_mlp": 0.01046294, + "balance_loss_clip": 1.33323979, + "balance_loss_mlp": 1.02013946, + "epoch": 0.2633398466857057, + "flos": 23779592657280.0, + "grad_norm": 1.9693124960458068, + "language_loss": 0.78604573, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.81195164, + "num_input_tokens_seen": 94673985, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.26159668, + "step": 4380, + "time_per_iteration": 2.857485055923462 + }, + { + "auxiliary_loss_clip": 0.01526657, + "auxiliary_loss_mlp": 0.01041194, + "balance_loss_clip": 1.31760788, + "balance_loss_mlp": 1.01630318, + "epoch": 0.26339996993837367, + "flos": 27611794068480.0, + "grad_norm": 2.813304737557254, + "language_loss": 0.65038168, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.6760602, + "num_input_tokens_seen": 94693145, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.24926758, + "step": 4381, + "time_per_iteration": 2.879714250564575 + }, + { + "auxiliary_loss_clip": 0.01528043, + "auxiliary_loss_mlp": 0.01042341, + "balance_loss_clip": 1.32215667, + "balance_loss_mlp": 1.01861846, + "epoch": 0.26346009319104163, + "flos": 16955107902720.0, + "grad_norm": 2.2188558899779776, + "language_loss": 0.83001232, + "learning_rate": 3.454979881632595e-06, + "loss": 0.85571623, + "num_input_tokens_seen": 94710185, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.23730469, + "step": 4382, + "time_per_iteration": 2.8500680923461914 + }, + { + "auxiliary_loss_clip": 0.01550427, + "auxiliary_loss_mlp": 0.01043652, + "balance_loss_clip": 1.33728158, + "balance_loss_mlp": 1.01865387, + "epoch": 0.2635202164437096, + "flos": 37246225046400.0, + "grad_norm": 1.8896720576590502, + "language_loss": 0.71050298, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.73644376, + "num_input_tokens_seen": 94730280, + "router_z_loss_clip": 2.13183594, + "router_z_loss_mlp": 0.24987793, + "step": 4383, + "time_per_iteration": 2.9775073528289795 + }, + { + "auxiliary_loss_clip": 0.01529375, + "auxiliary_loss_mlp": 0.01046995, + "balance_loss_clip": 1.32548118, + "balance_loss_mlp": 1.02364182, + "epoch": 0.26358033969637756, + "flos": 21006231494400.0, + "grad_norm": 1.90476581095725, + "language_loss": 0.70507365, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.73083735, + "num_input_tokens_seen": 94748560, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.23352051, + "step": 4384, + "time_per_iteration": 4.246594429016113 + }, + { + "auxiliary_loss_clip": 0.01521961, + "auxiliary_loss_mlp": 0.01041779, + "balance_loss_clip": 1.31776762, + "balance_loss_mlp": 1.01784182, + "epoch": 0.26364046294904553, + "flos": 27757681862400.0, + "grad_norm": 2.084104467433158, + "language_loss": 0.70594382, + "learning_rate": 3.45417798298451e-06, + "loss": 0.73158121, + "num_input_tokens_seen": 94767570, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.23937988, + "step": 4385, + "time_per_iteration": 2.9262397289276123 + }, + { + "auxiliary_loss_clip": 0.0152244, + "auxiliary_loss_mlp": 0.01044619, + "balance_loss_clip": 1.31925964, + "balance_loss_mlp": 1.0192039, + "epoch": 0.2637005862017135, + "flos": 22903315752960.0, + "grad_norm": 1.9095303948859703, + "language_loss": 0.85500282, + "learning_rate": 3.453910573136482e-06, + "loss": 0.88067341, + "num_input_tokens_seen": 94784985, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.25427246, + "step": 4386, + "time_per_iteration": 2.8517913818359375 + }, + { + "auxiliary_loss_clip": 0.01522979, + "auxiliary_loss_mlp": 0.01042533, + "balance_loss_clip": 1.31972802, + "balance_loss_mlp": 1.01819086, + "epoch": 0.26376070945438146, + "flos": 15057073503360.0, + "grad_norm": 2.090660821915703, + "language_loss": 0.78369153, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.80934668, + "num_input_tokens_seen": 94802545, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.24353027, + "step": 4387, + "time_per_iteration": 4.163708686828613 + }, + { + "auxiliary_loss_clip": 0.01522182, + "auxiliary_loss_mlp": 0.01038966, + "balance_loss_clip": 1.31920481, + "balance_loss_mlp": 1.01543427, + "epoch": 0.2638208327070494, + "flos": 21151621595520.0, + "grad_norm": 2.0069154852581423, + "language_loss": 0.76522291, + "learning_rate": 3.453375588053264e-06, + "loss": 0.79083443, + "num_input_tokens_seen": 94820730, + "router_z_loss_clip": 2.02636719, + "router_z_loss_mlp": 0.23522949, + "step": 4388, + "time_per_iteration": 2.827009677886963 + }, + { + "auxiliary_loss_clip": 0.01518004, + "auxiliary_loss_mlp": 0.01038541, + "balance_loss_clip": 1.31413853, + "balance_loss_mlp": 1.01471102, + "epoch": 0.26388095595971744, + "flos": 21735534729600.0, + "grad_norm": 2.067805992440318, + "language_loss": 0.87815952, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.90372491, + "num_input_tokens_seen": 94839175, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.23828125, + "step": 4389, + "time_per_iteration": 4.225706338882446 + }, + { + "auxiliary_loss_clip": 0.01300892, + "auxiliary_loss_mlp": 0.01038585, + "balance_loss_clip": 1.17686319, + "balance_loss_mlp": 1.01808095, + "epoch": 0.2639410792123854, + "flos": 65548131073920.0, + "grad_norm": 0.8427475568619489, + "language_loss": 0.60358632, + "learning_rate": 3.452840382521457e-06, + "loss": 0.62698102, + "num_input_tokens_seen": 94898865, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.20507812, + "step": 4390, + "time_per_iteration": 3.392824172973633 + }, + { + "auxiliary_loss_clip": 0.01531193, + "auxiliary_loss_mlp": 0.0103818, + "balance_loss_clip": 1.32262111, + "balance_loss_mlp": 1.01483929, + "epoch": 0.2640012024650534, + "flos": 23958400682880.0, + "grad_norm": 1.6314567637237802, + "language_loss": 0.78425699, + "learning_rate": 3.4525726971127e-06, + "loss": 0.80995077, + "num_input_tokens_seen": 94917490, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.23327637, + "step": 4391, + "time_per_iteration": 2.9354658126831055 + }, + { + "auxiliary_loss_clip": 0.01298687, + "auxiliary_loss_mlp": 0.01031012, + "balance_loss_clip": 1.17517698, + "balance_loss_mlp": 1.00955474, + "epoch": 0.26406132571772134, + "flos": 56474149161600.0, + "grad_norm": 0.9106257356736966, + "language_loss": 0.58872283, + "learning_rate": 3.45230495662224e-06, + "loss": 0.61201978, + "num_input_tokens_seen": 94969065, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.21484375, + "step": 4392, + "time_per_iteration": 3.2705657482147217 + }, + { + "auxiliary_loss_clip": 0.01525385, + "auxiliary_loss_mlp": 0.01046056, + "balance_loss_clip": 1.31773686, + "balance_loss_mlp": 1.0214988, + "epoch": 0.2641214489703893, + "flos": 22100616172800.0, + "grad_norm": 1.8795410932087275, + "language_loss": 0.70098758, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.72670197, + "num_input_tokens_seen": 94988540, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.24572754, + "step": 4393, + "time_per_iteration": 2.95015811920166 + }, + { + "auxiliary_loss_clip": 0.01545515, + "auxiliary_loss_mlp": 0.01043983, + "balance_loss_clip": 1.33317471, + "balance_loss_mlp": 1.01797175, + "epoch": 0.26418157222305727, + "flos": 16553396154240.0, + "grad_norm": 2.683938649985518, + "language_loss": 0.85332561, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.8792206, + "num_input_tokens_seen": 95004810, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.26013184, + "step": 4394, + "time_per_iteration": 2.9153456687927246 + }, + { + "auxiliary_loss_clip": 0.01538724, + "auxiliary_loss_mlp": 0.01044347, + "balance_loss_clip": 1.32497358, + "balance_loss_mlp": 1.01937222, + "epoch": 0.26424169547572524, + "flos": 18011052483840.0, + "grad_norm": 4.043430004442886, + "language_loss": 0.71635568, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.74218643, + "num_input_tokens_seen": 95024085, + "router_z_loss_clip": 2.13671875, + "router_z_loss_mlp": 0.24938965, + "step": 4395, + "time_per_iteration": 2.8526618480682373 + }, + { + "auxiliary_loss_clip": 0.01519815, + "auxiliary_loss_mlp": 0.0104424, + "balance_loss_clip": 1.31646478, + "balance_loss_mlp": 1.02019572, + "epoch": 0.2643018187283932, + "flos": 16992054921600.0, + "grad_norm": 1.7516845339848786, + "language_loss": 0.87364721, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.89928776, + "num_input_tokens_seen": 95042515, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.24035645, + "step": 4396, + "time_per_iteration": 2.8595776557922363 + }, + { + "auxiliary_loss_clip": 0.01304253, + "auxiliary_loss_mlp": 0.01033205, + "balance_loss_clip": 1.18003905, + "balance_loss_mlp": 1.01250994, + "epoch": 0.26436194198106117, + "flos": 59691942443520.0, + "grad_norm": 0.7962026812127484, + "language_loss": 0.55065805, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.57403266, + "num_input_tokens_seen": 95094835, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.20703125, + "step": 4397, + "time_per_iteration": 3.13437819480896 + }, + { + "auxiliary_loss_clip": 0.01521974, + "auxiliary_loss_mlp": 0.01044664, + "balance_loss_clip": 1.31868124, + "balance_loss_mlp": 1.02142978, + "epoch": 0.26442206523372913, + "flos": 32932762248960.0, + "grad_norm": 2.2016703717221047, + "language_loss": 0.78573084, + "learning_rate": 3.450697357532435e-06, + "loss": 0.81139719, + "num_input_tokens_seen": 95113480, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.23217773, + "step": 4398, + "time_per_iteration": 2.9811439514160156 + }, + { + "auxiliary_loss_clip": 0.01524789, + "auxiliary_loss_mlp": 0.01042792, + "balance_loss_clip": 1.32125902, + "balance_loss_mlp": 1.01940298, + "epoch": 0.2644821884863971, + "flos": 21040780538880.0, + "grad_norm": 1.967708778928165, + "language_loss": 0.67741418, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.70308995, + "num_input_tokens_seen": 95132580, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.23400879, + "step": 4399, + "time_per_iteration": 2.8819286823272705 + }, + { + "auxiliary_loss_clip": 0.01504437, + "auxiliary_loss_mlp": 0.0103731, + "balance_loss_clip": 1.30779207, + "balance_loss_mlp": 1.01505387, + "epoch": 0.26454231173906506, + "flos": 20786449662720.0, + "grad_norm": 1.554974440848081, + "language_loss": 0.87214875, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.8975662, + "num_input_tokens_seen": 95152375, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.22253418, + "step": 4400, + "time_per_iteration": 2.8537957668304443 + }, + { + "auxiliary_loss_clip": 0.01534232, + "auxiliary_loss_mlp": 0.01045038, + "balance_loss_clip": 1.32601404, + "balance_loss_mlp": 1.0198487, + "epoch": 0.264602434991733, + "flos": 16627561660800.0, + "grad_norm": 1.8715207275812595, + "language_loss": 0.7786777, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.80447036, + "num_input_tokens_seen": 95170265, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.25183105, + "step": 4401, + "time_per_iteration": 2.894500732421875 + }, + { + "auxiliary_loss_clip": 0.01544041, + "auxiliary_loss_mlp": 0.01045512, + "balance_loss_clip": 1.33508027, + "balance_loss_mlp": 1.02084744, + "epoch": 0.26466255824440105, + "flos": 19072064482560.0, + "grad_norm": 1.6518533074602137, + "language_loss": 0.88966846, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.91556406, + "num_input_tokens_seen": 95188655, + "router_z_loss_clip": 2.08789062, + "router_z_loss_mlp": 0.2467041, + "step": 4402, + "time_per_iteration": 2.8054749965667725 + }, + { + "auxiliary_loss_clip": 0.01530281, + "auxiliary_loss_mlp": 0.01041311, + "balance_loss_clip": 1.32660794, + "balance_loss_mlp": 1.01807702, + "epoch": 0.264722681497069, + "flos": 22648939632000.0, + "grad_norm": 1.669792516038524, + "language_loss": 0.78914678, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.81486267, + "num_input_tokens_seen": 95209615, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.2322998, + "step": 4403, + "time_per_iteration": 2.908496856689453 + }, + { + "auxiliary_loss_clip": 0.01525939, + "auxiliary_loss_mlp": 0.01039504, + "balance_loss_clip": 1.32115781, + "balance_loss_mlp": 1.01543581, + "epoch": 0.264782804749737, + "flos": 22502508900480.0, + "grad_norm": 1.7591338941963925, + "language_loss": 0.89979988, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.92545432, + "num_input_tokens_seen": 95227810, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.24060059, + "step": 4404, + "time_per_iteration": 2.854583740234375 + }, + { + "auxiliary_loss_clip": 0.01526712, + "auxiliary_loss_mlp": 0.01042219, + "balance_loss_clip": 1.32048154, + "balance_loss_mlp": 1.01956952, + "epoch": 0.26484292800240494, + "flos": 16808993884800.0, + "grad_norm": 1.8309025703095054, + "language_loss": 0.76851112, + "learning_rate": 3.448819322433709e-06, + "loss": 0.79420042, + "num_input_tokens_seen": 95245890, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.2265625, + "step": 4405, + "time_per_iteration": 2.839958906173706 + }, + { + "auxiliary_loss_clip": 0.01543763, + "auxiliary_loss_mlp": 0.01041139, + "balance_loss_clip": 1.33658028, + "balance_loss_mlp": 1.0163908, + "epoch": 0.2649030512550729, + "flos": 20459446358400.0, + "grad_norm": 1.704466538836302, + "language_loss": 0.71071798, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.73656702, + "num_input_tokens_seen": 95264955, + "router_z_loss_clip": 2.07128906, + "router_z_loss_mlp": 0.24755859, + "step": 4406, + "time_per_iteration": 2.8724164962768555 + }, + { + "auxiliary_loss_clip": 0.01514227, + "auxiliary_loss_mlp": 0.0104093, + "balance_loss_clip": 1.31192744, + "balance_loss_mlp": 1.0172317, + "epoch": 0.2649631745077409, + "flos": 22425538216320.0, + "grad_norm": 1.8125199871560334, + "language_loss": 0.84814143, + "learning_rate": 3.448282246369912e-06, + "loss": 0.87369299, + "num_input_tokens_seen": 95284245, + "router_z_loss_clip": 2.02148438, + "router_z_loss_mlp": 0.23718262, + "step": 4407, + "time_per_iteration": 2.8510217666625977 + }, + { + "auxiliary_loss_clip": 0.01533089, + "auxiliary_loss_mlp": 0.01036134, + "balance_loss_clip": 1.32822227, + "balance_loss_mlp": 1.01290059, + "epoch": 0.26502329776040884, + "flos": 35129585180160.0, + "grad_norm": 2.0657121164559538, + "language_loss": 0.76798666, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.79367888, + "num_input_tokens_seen": 95307125, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.2322998, + "step": 4408, + "time_per_iteration": 2.9734225273132324 + }, + { + "auxiliary_loss_clip": 0.0152472, + "auxiliary_loss_mlp": 0.01041282, + "balance_loss_clip": 1.32181621, + "balance_loss_mlp": 1.0179522, + "epoch": 0.2650834210130768, + "flos": 38700804729600.0, + "grad_norm": 2.0387639700262685, + "language_loss": 0.71677941, + "learning_rate": 3.447744950630084e-06, + "loss": 0.74243939, + "num_input_tokens_seen": 95329150, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.23352051, + "step": 4409, + "time_per_iteration": 2.984628915786743 + }, + { + "auxiliary_loss_clip": 0.01535019, + "auxiliary_loss_mlp": 0.01043131, + "balance_loss_clip": 1.32780707, + "balance_loss_mlp": 1.01794195, + "epoch": 0.26514354426574477, + "flos": 24727410869760.0, + "grad_norm": 1.8166396241438636, + "language_loss": 0.74267077, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.76845229, + "num_input_tokens_seen": 95349880, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.25183105, + "step": 4410, + "time_per_iteration": 2.8831350803375244 + }, + { + "auxiliary_loss_clip": 0.01526927, + "auxiliary_loss_mlp": 0.01036756, + "balance_loss_clip": 1.3209244, + "balance_loss_mlp": 1.01397562, + "epoch": 0.26520366751841273, + "flos": 20349872156160.0, + "grad_norm": 1.8098784404045032, + "language_loss": 0.74623829, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.77187508, + "num_input_tokens_seen": 95368570, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.22766113, + "step": 4411, + "time_per_iteration": 2.8563497066497803 + }, + { + "auxiliary_loss_clip": 0.01511096, + "auxiliary_loss_mlp": 0.01042132, + "balance_loss_clip": 1.30818176, + "balance_loss_mlp": 1.01715767, + "epoch": 0.2652637907710807, + "flos": 22353453970560.0, + "grad_norm": 1.871453537657166, + "language_loss": 0.83076775, + "learning_rate": 3.446938595306071e-06, + "loss": 0.8563, + "num_input_tokens_seen": 95387065, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.24975586, + "step": 4412, + "time_per_iteration": 4.283655643463135 + }, + { + "auxiliary_loss_clip": 0.01530871, + "auxiliary_loss_mlp": 0.010483, + "balance_loss_clip": 1.32708406, + "balance_loss_mlp": 1.02452922, + "epoch": 0.26532391402374866, + "flos": 19363613846400.0, + "grad_norm": 2.531761544680092, + "language_loss": 0.75505376, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.78084546, + "num_input_tokens_seen": 95406345, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.23779297, + "step": 4413, + "time_per_iteration": 2.8651609420776367 + }, + { + "auxiliary_loss_clip": 0.01291085, + "auxiliary_loss_mlp": 0.01049851, + "balance_loss_clip": 1.17188764, + "balance_loss_mlp": 1.03068256, + "epoch": 0.26538403727641663, + "flos": 44813785438080.0, + "grad_norm": 0.8808210720389538, + "language_loss": 0.56982142, + "learning_rate": 3.446400750732793e-06, + "loss": 0.59323078, + "num_input_tokens_seen": 95463595, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.19140625, + "step": 4414, + "time_per_iteration": 3.30176043510437 + }, + { + "auxiliary_loss_clip": 0.01508089, + "auxiliary_loss_mlp": 0.01043895, + "balance_loss_clip": 1.31036043, + "balance_loss_mlp": 1.02055347, + "epoch": 0.26544416052908465, + "flos": 28193354472960.0, + "grad_norm": 1.5811340610465416, + "language_loss": 0.75117469, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.77669454, + "num_input_tokens_seen": 95484115, + "router_z_loss_clip": 1.97753906, + "router_z_loss_mlp": 0.23352051, + "step": 4415, + "time_per_iteration": 2.9457054138183594 + }, + { + "auxiliary_loss_clip": 0.01532268, + "auxiliary_loss_mlp": 0.0104027, + "balance_loss_clip": 1.32484031, + "balance_loss_mlp": 1.01406813, + "epoch": 0.2655042837817526, + "flos": 17573886794880.0, + "grad_norm": 2.564044396578109, + "language_loss": 0.87453306, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.90025842, + "num_input_tokens_seen": 95501435, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.26208496, + "step": 4416, + "time_per_iteration": 2.881178617477417 + }, + { + "auxiliary_loss_clip": 0.01535627, + "auxiliary_loss_mlp": 0.01038096, + "balance_loss_clip": 1.33059335, + "balance_loss_mlp": 1.01306224, + "epoch": 0.2655644070344206, + "flos": 23415189886080.0, + "grad_norm": 1.5074861383172853, + "language_loss": 0.77671099, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.80244827, + "num_input_tokens_seen": 95520135, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.25036621, + "step": 4417, + "time_per_iteration": 2.8869640827178955 + }, + { + "auxiliary_loss_clip": 0.01521311, + "auxiliary_loss_mlp": 0.01044887, + "balance_loss_clip": 1.321733, + "balance_loss_mlp": 1.02139127, + "epoch": 0.26562453028708854, + "flos": 26479557475200.0, + "grad_norm": 1.4721480089396264, + "language_loss": 0.81099552, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.83665752, + "num_input_tokens_seen": 95541705, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.23498535, + "step": 4418, + "time_per_iteration": 2.9317519664764404 + }, + { + "auxiliary_loss_clip": 0.01525477, + "auxiliary_loss_mlp": 0.01045573, + "balance_loss_clip": 1.32039833, + "balance_loss_mlp": 1.02165949, + "epoch": 0.2656846535397565, + "flos": 19216730666880.0, + "grad_norm": 1.9952149919770632, + "language_loss": 0.68677104, + "learning_rate": 3.445055179644071e-06, + "loss": 0.71248162, + "num_input_tokens_seen": 95560300, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.23913574, + "step": 4419, + "time_per_iteration": 4.282297134399414 + }, + { + "auxiliary_loss_clip": 0.01531585, + "auxiliary_loss_mlp": 0.01048353, + "balance_loss_clip": 1.32636416, + "balance_loss_mlp": 1.02291346, + "epoch": 0.2657447767924245, + "flos": 30562560668160.0, + "grad_norm": 2.128527944077912, + "language_loss": 0.79896045, + "learning_rate": 3.444785900995585e-06, + "loss": 0.82475984, + "num_input_tokens_seen": 95580150, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.25427246, + "step": 4420, + "time_per_iteration": 2.9104080200195312 + }, + { + "auxiliary_loss_clip": 0.01537072, + "auxiliary_loss_mlp": 0.01047284, + "balance_loss_clip": 1.33008146, + "balance_loss_mlp": 1.02099788, + "epoch": 0.26580490004509244, + "flos": 20932427946240.0, + "grad_norm": 1.861202400116776, + "language_loss": 0.82594121, + "learning_rate": 3.444516567560673e-06, + "loss": 0.85178483, + "num_input_tokens_seen": 95597570, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.26281738, + "step": 4421, + "time_per_iteration": 2.867424726486206 + }, + { + "auxiliary_loss_clip": 0.01524715, + "auxiliary_loss_mlp": 0.01043998, + "balance_loss_clip": 1.32453895, + "balance_loss_mlp": 1.02009666, + "epoch": 0.2658650232977604, + "flos": 43961452312320.0, + "grad_norm": 1.6331566580627146, + "language_loss": 0.67027295, + "learning_rate": 3.444247179349548e-06, + "loss": 0.6959601, + "num_input_tokens_seen": 95619415, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.23925781, + "step": 4422, + "time_per_iteration": 4.534050703048706 + }, + { + "auxiliary_loss_clip": 0.01529654, + "auxiliary_loss_mlp": 0.01049677, + "balance_loss_clip": 1.32670617, + "balance_loss_mlp": 1.02622819, + "epoch": 0.26592514655042837, + "flos": 29728569669120.0, + "grad_norm": 2.068611112653832, + "language_loss": 0.75676966, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.78256297, + "num_input_tokens_seen": 95639155, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.23461914, + "step": 4423, + "time_per_iteration": 2.9582459926605225 + }, + { + "auxiliary_loss_clip": 0.01533692, + "auxiliary_loss_mlp": 0.01055753, + "balance_loss_clip": 1.32853258, + "balance_loss_mlp": 1.03248358, + "epoch": 0.26598526980309634, + "flos": 46693070507520.0, + "grad_norm": 1.584457524237858, + "language_loss": 0.78762293, + "learning_rate": 3.443708238639522e-06, + "loss": 0.81351733, + "num_input_tokens_seen": 95663320, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.23278809, + "step": 4424, + "time_per_iteration": 4.526980876922607 + }, + { + "auxiliary_loss_clip": 0.01535373, + "auxiliary_loss_mlp": 0.01050341, + "balance_loss_clip": 1.33124638, + "balance_loss_mlp": 1.02621305, + "epoch": 0.2660453930557643, + "flos": 11516692924800.0, + "grad_norm": 1.9968483402637718, + "language_loss": 0.80449462, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.83035177, + "num_input_tokens_seen": 95680260, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.24133301, + "step": 4425, + "time_per_iteration": 2.85209059715271 + }, + { + "auxiliary_loss_clip": 0.01517965, + "auxiliary_loss_mlp": 0.01052054, + "balance_loss_clip": 1.31816006, + "balance_loss_mlp": 1.02821183, + "epoch": 0.26610551630843227, + "flos": 24802617006720.0, + "grad_norm": 1.771907854303047, + "language_loss": 0.81602776, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.84172797, + "num_input_tokens_seen": 95701140, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.23815918, + "step": 4426, + "time_per_iteration": 2.8540475368499756 + }, + { + "auxiliary_loss_clip": 0.01529937, + "auxiliary_loss_mlp": 0.01056883, + "balance_loss_clip": 1.32693303, + "balance_loss_mlp": 1.03090692, + "epoch": 0.26616563956110023, + "flos": 27647881436160.0, + "grad_norm": 1.6511327224614956, + "language_loss": 0.77599728, + "learning_rate": 3.442899417008333e-06, + "loss": 0.80186546, + "num_input_tokens_seen": 95722060, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.25964355, + "step": 4427, + "time_per_iteration": 2.921757936477661 + }, + { + "auxiliary_loss_clip": 0.01516432, + "auxiliary_loss_mlp": 0.01042441, + "balance_loss_clip": 1.3184967, + "balance_loss_mlp": 1.02031541, + "epoch": 0.26622576281376825, + "flos": 28373746066560.0, + "grad_norm": 2.7507957441061546, + "language_loss": 0.77384019, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.79942888, + "num_input_tokens_seen": 95742495, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.22119141, + "step": 4428, + "time_per_iteration": 2.9779465198516846 + }, + { + "auxiliary_loss_clip": 0.01530985, + "auxiliary_loss_mlp": 0.01043754, + "balance_loss_clip": 1.32678163, + "balance_loss_mlp": 1.02047276, + "epoch": 0.2662858860664362, + "flos": 18050533211520.0, + "grad_norm": 2.2413583795885628, + "language_loss": 0.83901978, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.86476713, + "num_input_tokens_seen": 95761510, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.23278809, + "step": 4429, + "time_per_iteration": 2.8244171142578125 + }, + { + "auxiliary_loss_clip": 0.0153072, + "auxiliary_loss_mlp": 0.01048262, + "balance_loss_clip": 1.32850742, + "balance_loss_mlp": 1.02359784, + "epoch": 0.2663460093191042, + "flos": 22755663411840.0, + "grad_norm": 1.8066965703403772, + "language_loss": 0.73460293, + "learning_rate": 3.442090102943143e-06, + "loss": 0.76039279, + "num_input_tokens_seen": 95782385, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.24645996, + "step": 4430, + "time_per_iteration": 2.874713659286499 + }, + { + "auxiliary_loss_clip": 0.01514847, + "auxiliary_loss_mlp": 0.0104741, + "balance_loss_clip": 1.3131429, + "balance_loss_mlp": 1.0241046, + "epoch": 0.26640613257177215, + "flos": 16517535010560.0, + "grad_norm": 1.8357819495139982, + "language_loss": 0.82728243, + "learning_rate": 3.441820222206035e-06, + "loss": 0.85290504, + "num_input_tokens_seen": 95800595, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.2331543, + "step": 4431, + "time_per_iteration": 2.8452835083007812 + }, + { + "auxiliary_loss_clip": 0.01539485, + "auxiliary_loss_mlp": 0.01054399, + "balance_loss_clip": 1.33033347, + "balance_loss_mlp": 1.02912664, + "epoch": 0.2664662558244401, + "flos": 23086331544960.0, + "grad_norm": 3.087713956260392, + "language_loss": 0.76909894, + "learning_rate": 3.44155028679496e-06, + "loss": 0.79503775, + "num_input_tokens_seen": 95818480, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.25268555, + "step": 4432, + "time_per_iteration": 2.861431360244751 + }, + { + "auxiliary_loss_clip": 0.01540196, + "auxiliary_loss_mlp": 0.01042878, + "balance_loss_clip": 1.33436728, + "balance_loss_mlp": 1.01907158, + "epoch": 0.2665263790771081, + "flos": 23779864126080.0, + "grad_norm": 1.9173016058230161, + "language_loss": 0.83984083, + "learning_rate": 3.441280296720154e-06, + "loss": 0.86567152, + "num_input_tokens_seen": 95837205, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.23815918, + "step": 4433, + "time_per_iteration": 3.0163822174072266 + }, + { + "auxiliary_loss_clip": 0.01519967, + "auxiliary_loss_mlp": 0.01050682, + "balance_loss_clip": 1.32062495, + "balance_loss_mlp": 1.02554059, + "epoch": 0.26658650232977604, + "flos": 28012872389760.0, + "grad_norm": 1.9643143083143801, + "language_loss": 0.77007312, + "learning_rate": 3.441010251991854e-06, + "loss": 0.79577959, + "num_input_tokens_seen": 95858395, + "router_z_loss_clip": 1.99316406, + "router_z_loss_mlp": 0.25134277, + "step": 4434, + "time_per_iteration": 2.9614663124084473 + }, + { + "auxiliary_loss_clip": 0.0151703, + "auxiliary_loss_mlp": 0.01047951, + "balance_loss_clip": 1.31697559, + "balance_loss_mlp": 1.02364469, + "epoch": 0.266646625582444, + "flos": 22173741048960.0, + "grad_norm": 1.938497369516178, + "language_loss": 0.83739114, + "learning_rate": 3.440740152620301e-06, + "loss": 0.86304098, + "num_input_tokens_seen": 95877875, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.24316406, + "step": 4435, + "time_per_iteration": 2.869894504547119 + }, + { + "auxiliary_loss_clip": 0.01523381, + "auxiliary_loss_mlp": 0.01056922, + "balance_loss_clip": 1.31536007, + "balance_loss_mlp": 1.03058839, + "epoch": 0.266706748835112, + "flos": 27864631866240.0, + "grad_norm": 2.2367574520744564, + "language_loss": 0.88681746, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.91262054, + "num_input_tokens_seen": 95895820, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.26306152, + "step": 4436, + "time_per_iteration": 2.9244894981384277 + }, + { + "auxiliary_loss_clip": 0.01523095, + "auxiliary_loss_mlp": 0.01042538, + "balance_loss_clip": 1.31738806, + "balance_loss_mlp": 1.01700342, + "epoch": 0.26676687208777994, + "flos": 25823152892160.0, + "grad_norm": 2.0180491898179187, + "language_loss": 0.78995836, + "learning_rate": 3.440199789988407e-06, + "loss": 0.8156147, + "num_input_tokens_seen": 95918025, + "router_z_loss_clip": 2.05566406, + "router_z_loss_mlp": 0.25561523, + "step": 4437, + "time_per_iteration": 2.916325569152832 + }, + { + "auxiliary_loss_clip": 0.01513785, + "auxiliary_loss_mlp": 0.01045737, + "balance_loss_clip": 1.3126992, + "balance_loss_mlp": 1.02016699, + "epoch": 0.2668269953404479, + "flos": 36078263043840.0, + "grad_norm": 2.0832828580137197, + "language_loss": 0.65537935, + "learning_rate": 3.439929526748556e-06, + "loss": 0.6809746, + "num_input_tokens_seen": 95937725, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.25585938, + "step": 4438, + "time_per_iteration": 2.944976568222046 + }, + { + "auxiliary_loss_clip": 0.015196, + "auxiliary_loss_mlp": 0.01044131, + "balance_loss_clip": 1.31672001, + "balance_loss_mlp": 1.01845288, + "epoch": 0.26688711859311587, + "flos": 26580308941440.0, + "grad_norm": 1.9608864294263801, + "language_loss": 0.76580644, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.79144371, + "num_input_tokens_seen": 95956335, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.2565918, + "step": 4439, + "time_per_iteration": 2.8695287704467773 + }, + { + "auxiliary_loss_clip": 0.01523703, + "auxiliary_loss_mlp": 0.01040056, + "balance_loss_clip": 1.31921792, + "balance_loss_mlp": 1.01286459, + "epoch": 0.26694724184578383, + "flos": 26773052365440.0, + "grad_norm": 1.7719274028850802, + "language_loss": 0.72027147, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.74590904, + "num_input_tokens_seen": 95977135, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.27185059, + "step": 4440, + "time_per_iteration": 2.9155635833740234 + }, + { + "auxiliary_loss_clip": 0.01517759, + "auxiliary_loss_mlp": 0.01043453, + "balance_loss_clip": 1.31158912, + "balance_loss_mlp": 1.01622593, + "epoch": 0.2670073650984518, + "flos": 20969374965120.0, + "grad_norm": 1.8410703149429473, + "language_loss": 0.67723465, + "learning_rate": 3.439118409456376e-06, + "loss": 0.70284677, + "num_input_tokens_seen": 95995435, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.27209473, + "step": 4441, + "time_per_iteration": 2.823272466659546 + }, + { + "auxiliary_loss_clip": 0.01519301, + "auxiliary_loss_mlp": 0.01046817, + "balance_loss_clip": 1.31573582, + "balance_loss_mlp": 1.01955354, + "epoch": 0.2670674883511198, + "flos": 28377727608960.0, + "grad_norm": 1.8274250470550508, + "language_loss": 0.77580225, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.80146343, + "num_input_tokens_seen": 96016340, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.27258301, + "step": 4442, + "time_per_iteration": 2.912431001663208 + }, + { + "auxiliary_loss_clip": 0.01300238, + "auxiliary_loss_mlp": 0.01069606, + "balance_loss_clip": 1.1741128, + "balance_loss_mlp": 1.02955174, + "epoch": 0.2671276116037878, + "flos": 58998771820800.0, + "grad_norm": 0.9371487785988872, + "language_loss": 0.61317694, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.63687539, + "num_input_tokens_seen": 96071205, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.40039062, + "step": 4443, + "time_per_iteration": 3.228618860244751 + }, + { + "auxiliary_loss_clip": 0.01521967, + "auxiliary_loss_mlp": 0.01043821, + "balance_loss_clip": 1.31866336, + "balance_loss_mlp": 1.01697528, + "epoch": 0.26718773485645575, + "flos": 43960683150720.0, + "grad_norm": 7.355362075147745, + "language_loss": 0.77027893, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.79593676, + "num_input_tokens_seen": 96094240, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.26806641, + "step": 4444, + "time_per_iteration": 3.070876359939575 + }, + { + "auxiliary_loss_clip": 0.0150962, + "auxiliary_loss_mlp": 0.01049648, + "balance_loss_clip": 1.30809498, + "balance_loss_mlp": 1.02298057, + "epoch": 0.2672478581091237, + "flos": 25239194513280.0, + "grad_norm": 1.6325759822643362, + "language_loss": 0.81355369, + "learning_rate": 3.438036155780158e-06, + "loss": 0.83914632, + "num_input_tokens_seen": 96114105, + "router_z_loss_clip": 2.01269531, + "router_z_loss_mlp": 0.2668457, + "step": 4445, + "time_per_iteration": 2.87665057182312 + }, + { + "auxiliary_loss_clip": 0.01520185, + "auxiliary_loss_mlp": 0.01043872, + "balance_loss_clip": 1.31591606, + "balance_loss_mlp": 1.01758623, + "epoch": 0.2673079813617917, + "flos": 15276583866240.0, + "grad_norm": 1.7372285896816426, + "language_loss": 0.90442276, + "learning_rate": 3.43776545600926e-06, + "loss": 0.93006325, + "num_input_tokens_seen": 96132140, + "router_z_loss_clip": 2.04394531, + "router_z_loss_mlp": 0.26318359, + "step": 4446, + "time_per_iteration": 2.8198487758636475 + }, + { + "auxiliary_loss_clip": 0.0152027, + "auxiliary_loss_mlp": 0.0104747, + "balance_loss_clip": 1.31642783, + "balance_loss_mlp": 1.02223372, + "epoch": 0.26736810461445965, + "flos": 25823922053760.0, + "grad_norm": 4.444928622575966, + "language_loss": 0.68711305, + "learning_rate": 3.437494701718153e-06, + "loss": 0.71279049, + "num_input_tokens_seen": 96152090, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.25256348, + "step": 4447, + "time_per_iteration": 4.251283168792725 + }, + { + "auxiliary_loss_clip": 0.01519034, + "auxiliary_loss_mlp": 0.01041955, + "balance_loss_clip": 1.31543767, + "balance_loss_mlp": 1.01562202, + "epoch": 0.2674282278671276, + "flos": 24322441495680.0, + "grad_norm": 2.450978689143095, + "language_loss": 0.84392059, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.8695305, + "num_input_tokens_seen": 96170015, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.2635498, + "step": 4448, + "time_per_iteration": 2.8767435550689697 + }, + { + "auxiliary_loss_clip": 0.01506109, + "auxiliary_loss_mlp": 0.01056177, + "balance_loss_clip": 1.30550075, + "balance_loss_mlp": 1.02886617, + "epoch": 0.2674883511197956, + "flos": 22824535276800.0, + "grad_norm": 1.4621392229313053, + "language_loss": 0.85171175, + "learning_rate": 3.436953029616378e-06, + "loss": 0.87733459, + "num_input_tokens_seen": 96188065, + "router_z_loss_clip": 2.00488281, + "router_z_loss_mlp": 0.27307129, + "step": 4449, + "time_per_iteration": 2.8537991046905518 + }, + { + "auxiliary_loss_clip": 0.01537159, + "auxiliary_loss_mlp": 0.0105105, + "balance_loss_clip": 1.32391691, + "balance_loss_mlp": 1.02342963, + "epoch": 0.26754847437246354, + "flos": 25380014889600.0, + "grad_norm": 1.6417443680541726, + "language_loss": 0.84410179, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.86998379, + "num_input_tokens_seen": 96205780, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.27648926, + "step": 4450, + "time_per_iteration": 2.8755767345428467 + }, + { + "auxiliary_loss_clip": 0.01493202, + "auxiliary_loss_mlp": 0.01049759, + "balance_loss_clip": 1.29561567, + "balance_loss_mlp": 1.02381921, + "epoch": 0.2676085976251315, + "flos": 20240162219520.0, + "grad_norm": 4.114279523344304, + "language_loss": 0.81907201, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.84450155, + "num_input_tokens_seen": 96224990, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.25939941, + "step": 4451, + "time_per_iteration": 2.8153154850006104 + }, + { + "auxiliary_loss_clip": 0.01511406, + "auxiliary_loss_mlp": 0.0104701, + "balance_loss_clip": 1.31207836, + "balance_loss_mlp": 1.02195239, + "epoch": 0.26766872087779947, + "flos": 28049955143040.0, + "grad_norm": 1.6147705887826143, + "language_loss": 0.86888748, + "learning_rate": 3.436140112818882e-06, + "loss": 0.89447165, + "num_input_tokens_seen": 96245345, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.25048828, + "step": 4452, + "time_per_iteration": 2.923161268234253 + }, + { + "auxiliary_loss_clip": 0.01520066, + "auxiliary_loss_mlp": 0.01051527, + "balance_loss_clip": 1.31836283, + "balance_loss_mlp": 1.02487135, + "epoch": 0.26772884413046744, + "flos": 18332354943360.0, + "grad_norm": 2.3917675978850377, + "language_loss": 0.84740114, + "learning_rate": 3.435869031622194e-06, + "loss": 0.87311703, + "num_input_tokens_seen": 96259000, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.26660156, + "step": 4453, + "time_per_iteration": 2.8075778484344482 + }, + { + "auxiliary_loss_clip": 0.01516238, + "auxiliary_loss_mlp": 0.01059335, + "balance_loss_clip": 1.31352663, + "balance_loss_mlp": 1.03206015, + "epoch": 0.2677889673831354, + "flos": 22137698926080.0, + "grad_norm": 1.6747056314030577, + "language_loss": 0.80427641, + "learning_rate": 3.435597895977208e-06, + "loss": 0.83003217, + "num_input_tokens_seen": 96277000, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.27319336, + "step": 4454, + "time_per_iteration": 4.3820905685424805 + }, + { + "auxiliary_loss_clip": 0.01524215, + "auxiliary_loss_mlp": 0.01046112, + "balance_loss_clip": 1.31974483, + "balance_loss_mlp": 1.02106643, + "epoch": 0.2678490906358034, + "flos": 23739478502400.0, + "grad_norm": 1.6681156457499626, + "language_loss": 0.73431039, + "learning_rate": 3.435326705894206e-06, + "loss": 0.76001364, + "num_input_tokens_seen": 96297010, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.25061035, + "step": 4455, + "time_per_iteration": 2.910524606704712 + }, + { + "auxiliary_loss_clip": 0.0151458, + "auxiliary_loss_mlp": 0.01044663, + "balance_loss_clip": 1.31688273, + "balance_loss_mlp": 1.0205946, + "epoch": 0.2679092138884714, + "flos": 21773024686080.0, + "grad_norm": 1.4910234440050816, + "language_loss": 0.74514723, + "learning_rate": 3.435055461383471e-06, + "loss": 0.77073967, + "num_input_tokens_seen": 96315780, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.24072266, + "step": 4456, + "time_per_iteration": 2.896707534790039 + }, + { + "auxiliary_loss_clip": 0.01535003, + "auxiliary_loss_mlp": 0.01046038, + "balance_loss_clip": 1.32860148, + "balance_loss_mlp": 1.02118301, + "epoch": 0.26796933714113935, + "flos": 19869696645120.0, + "grad_norm": 2.927129881848539, + "language_loss": 0.72150171, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.74731213, + "num_input_tokens_seen": 96333465, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.24865723, + "step": 4457, + "time_per_iteration": 4.166894197463989 + }, + { + "auxiliary_loss_clip": 0.01540205, + "auxiliary_loss_mlp": 0.01041767, + "balance_loss_clip": 1.33508384, + "balance_loss_mlp": 1.01716197, + "epoch": 0.2680294603938073, + "flos": 20057553630720.0, + "grad_norm": 1.6220325537808822, + "language_loss": 0.80135399, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.82717371, + "num_input_tokens_seen": 96352005, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.24609375, + "step": 4458, + "time_per_iteration": 2.846040725708008 + }, + { + "auxiliary_loss_clip": 0.01296127, + "auxiliary_loss_mlp": 0.01022009, + "balance_loss_clip": 1.1768508, + "balance_loss_mlp": 1.00160015, + "epoch": 0.2680895836464753, + "flos": 72146590214400.0, + "grad_norm": 0.8901027373155399, + "language_loss": 0.58835876, + "learning_rate": 3.434241401387739e-06, + "loss": 0.61154014, + "num_input_tokens_seen": 96406265, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.20410156, + "step": 4459, + "time_per_iteration": 4.736678123474121 + }, + { + "auxiliary_loss_clip": 0.01521079, + "auxiliary_loss_mlp": 0.01044809, + "balance_loss_clip": 1.31899548, + "balance_loss_mlp": 1.02111018, + "epoch": 0.26814970689914325, + "flos": 20458767686400.0, + "grad_norm": 1.8678507906264046, + "language_loss": 0.85950977, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.88516861, + "num_input_tokens_seen": 96425225, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.23718262, + "step": 4460, + "time_per_iteration": 2.8161098957061768 + }, + { + "auxiliary_loss_clip": 0.01524257, + "auxiliary_loss_mlp": 0.01043903, + "balance_loss_clip": 1.32424283, + "balance_loss_mlp": 1.02066898, + "epoch": 0.2682098301518112, + "flos": 17575651342080.0, + "grad_norm": 1.7172688425154983, + "language_loss": 0.68880856, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.71449018, + "num_input_tokens_seen": 96443780, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.23217773, + "step": 4461, + "time_per_iteration": 2.8570456504821777 + }, + { + "auxiliary_loss_clip": 0.0152097, + "auxiliary_loss_mlp": 0.01047964, + "balance_loss_clip": 1.32015753, + "balance_loss_mlp": 1.02363312, + "epoch": 0.2682699534044792, + "flos": 18342535023360.0, + "grad_norm": 1.4548136871606214, + "language_loss": 0.68235868, + "learning_rate": 3.43342685191282e-06, + "loss": 0.70804799, + "num_input_tokens_seen": 96464530, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.24353027, + "step": 4462, + "time_per_iteration": 2.8807334899902344 + }, + { + "auxiliary_loss_clip": 0.0152623, + "auxiliary_loss_mlp": 0.01044981, + "balance_loss_clip": 1.32519603, + "balance_loss_mlp": 1.02082896, + "epoch": 0.26833007665714714, + "flos": 25312183655040.0, + "grad_norm": 1.8316420456471962, + "language_loss": 0.7070933, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.73280537, + "num_input_tokens_seen": 96483345, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.24145508, + "step": 4463, + "time_per_iteration": 2.9418694972991943 + }, + { + "auxiliary_loss_clip": 0.01537395, + "auxiliary_loss_mlp": 0.01044958, + "balance_loss_clip": 1.33017135, + "balance_loss_mlp": 1.02134264, + "epoch": 0.2683901999098151, + "flos": 16106412343680.0, + "grad_norm": 4.130139473240399, + "language_loss": 0.78491861, + "learning_rate": 3.432883547133931e-06, + "loss": 0.81074202, + "num_input_tokens_seen": 96498305, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.23632812, + "step": 4464, + "time_per_iteration": 2.913673162460327 + }, + { + "auxiliary_loss_clip": 0.01528349, + "auxiliary_loss_mlp": 0.01044006, + "balance_loss_clip": 1.32602525, + "balance_loss_mlp": 1.02142799, + "epoch": 0.2684503231624831, + "flos": 27319475543040.0, + "grad_norm": 1.824387969057913, + "language_loss": 0.71600354, + "learning_rate": 3.432611813236704e-06, + "loss": 0.74172717, + "num_input_tokens_seen": 96519740, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.22595215, + "step": 4465, + "time_per_iteration": 2.8759219646453857 + }, + { + "auxiliary_loss_clip": 0.0129338, + "auxiliary_loss_mlp": 0.0103697, + "balance_loss_clip": 1.17613339, + "balance_loss_mlp": 1.01646614, + "epoch": 0.26851044641515104, + "flos": 71890675770240.0, + "grad_norm": 0.7178615314687674, + "language_loss": 0.53256559, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.5558691, + "num_input_tokens_seen": 96588870, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.20507812, + "step": 4466, + "time_per_iteration": 3.506331205368042 + }, + { + "auxiliary_loss_clip": 0.01523212, + "auxiliary_loss_mlp": 0.01047766, + "balance_loss_clip": 1.32113171, + "balance_loss_mlp": 1.02406716, + "epoch": 0.268570569667819, + "flos": 18742934672640.0, + "grad_norm": 2.4058971636445454, + "language_loss": 0.74208295, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.7677927, + "num_input_tokens_seen": 96605100, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.23706055, + "step": 4467, + "time_per_iteration": 2.8276045322418213 + }, + { + "auxiliary_loss_clip": 0.01541312, + "auxiliary_loss_mlp": 0.01044913, + "balance_loss_clip": 1.33442187, + "balance_loss_mlp": 1.02142882, + "epoch": 0.268630692920487, + "flos": 18185290784640.0, + "grad_norm": 3.807262057356128, + "language_loss": 0.82034016, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.84620237, + "num_input_tokens_seen": 96621410, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.23486328, + "step": 4468, + "time_per_iteration": 2.8067502975463867 + }, + { + "auxiliary_loss_clip": 0.01295298, + "auxiliary_loss_mlp": 0.01046181, + "balance_loss_clip": 1.17366219, + "balance_loss_mlp": 1.02920556, + "epoch": 0.268690816173155, + "flos": 68766621010560.0, + "grad_norm": 0.8568809874710711, + "language_loss": 0.59745157, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.62086642, + "num_input_tokens_seen": 96684810, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.16992188, + "step": 4469, + "time_per_iteration": 3.368896484375 + }, + { + "auxiliary_loss_clip": 0.01532753, + "auxiliary_loss_mlp": 0.01044411, + "balance_loss_clip": 1.32833123, + "balance_loss_mlp": 1.01919794, + "epoch": 0.26875093942582295, + "flos": 23303760647040.0, + "grad_norm": 1.9444616399261607, + "language_loss": 0.82342112, + "learning_rate": 3.431252329084972e-06, + "loss": 0.84919274, + "num_input_tokens_seen": 96701920, + "router_z_loss_clip": 2.04296875, + "router_z_loss_mlp": 0.25219727, + "step": 4470, + "time_per_iteration": 2.8601126670837402 + }, + { + "auxiliary_loss_clip": 0.01512829, + "auxiliary_loss_mlp": 0.01041984, + "balance_loss_clip": 1.315346, + "balance_loss_mlp": 1.0181303, + "epoch": 0.2688110626784909, + "flos": 21553423833600.0, + "grad_norm": 1.7073796038118079, + "language_loss": 0.837511, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.86305916, + "num_input_tokens_seen": 96721260, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.23852539, + "step": 4471, + "time_per_iteration": 2.8547284603118896 + }, + { + "auxiliary_loss_clip": 0.01511735, + "auxiliary_loss_mlp": 0.01036003, + "balance_loss_clip": 1.31636357, + "balance_loss_mlp": 1.01322162, + "epoch": 0.2688711859311589, + "flos": 28411778960640.0, + "grad_norm": 2.6637533667195177, + "language_loss": 0.7037167, + "learning_rate": 3.43070815543947e-06, + "loss": 0.72919405, + "num_input_tokens_seen": 96740385, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.22753906, + "step": 4472, + "time_per_iteration": 2.91843581199646 + }, + { + "auxiliary_loss_clip": 0.01514896, + "auxiliary_loss_mlp": 0.01046724, + "balance_loss_clip": 1.31592548, + "balance_loss_mlp": 1.02328753, + "epoch": 0.26893130918382685, + "flos": 26006123439360.0, + "grad_norm": 1.6439356154777058, + "language_loss": 0.69349527, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.7191115, + "num_input_tokens_seen": 96761860, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.234375, + "step": 4473, + "time_per_iteration": 2.9201087951660156 + }, + { + "auxiliary_loss_clip": 0.01515815, + "auxiliary_loss_mlp": 0.01041825, + "balance_loss_clip": 1.31863236, + "balance_loss_mlp": 1.01835322, + "epoch": 0.2689914324364948, + "flos": 20349329218560.0, + "grad_norm": 1.6951365580709838, + "language_loss": 0.84205401, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.86763048, + "num_input_tokens_seen": 96781890, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.23498535, + "step": 4474, + "time_per_iteration": 2.894864320755005 + }, + { + "auxiliary_loss_clip": 0.01523863, + "auxiliary_loss_mlp": 0.01042278, + "balance_loss_clip": 1.32642031, + "balance_loss_mlp": 1.01909184, + "epoch": 0.2690515556891628, + "flos": 19474183434240.0, + "grad_norm": 2.2175235405452702, + "language_loss": 0.71801925, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.7436806, + "num_input_tokens_seen": 96800390, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.23193359, + "step": 4475, + "time_per_iteration": 2.8265228271484375 + }, + { + "auxiliary_loss_clip": 0.01531614, + "auxiliary_loss_mlp": 0.01040894, + "balance_loss_clip": 1.32857573, + "balance_loss_mlp": 1.01620603, + "epoch": 0.26911167894183075, + "flos": 18154768527360.0, + "grad_norm": 1.7490499895849227, + "language_loss": 0.74200571, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.76773083, + "num_input_tokens_seen": 96816685, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.24682617, + "step": 4476, + "time_per_iteration": 2.899834156036377 + }, + { + "auxiliary_loss_clip": 0.01518713, + "auxiliary_loss_mlp": 0.01035445, + "balance_loss_clip": 1.32059312, + "balance_loss_mlp": 1.01234245, + "epoch": 0.2691718021944987, + "flos": 19984835957760.0, + "grad_norm": 2.081641303547754, + "language_loss": 0.81800151, + "learning_rate": 3.429346772085922e-06, + "loss": 0.84354305, + "num_input_tokens_seen": 96836285, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.23120117, + "step": 4477, + "time_per_iteration": 2.844987154006958 + }, + { + "auxiliary_loss_clip": 0.01534855, + "auxiliary_loss_mlp": 0.01041472, + "balance_loss_clip": 1.33069563, + "balance_loss_mlp": 1.01776171, + "epoch": 0.2692319254471667, + "flos": 37460260788480.0, + "grad_norm": 1.6903498597265838, + "language_loss": 0.665362, + "learning_rate": 3.429074332770984e-06, + "loss": 0.69112527, + "num_input_tokens_seen": 96857745, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.23681641, + "step": 4478, + "time_per_iteration": 2.975780487060547 + }, + { + "auxiliary_loss_clip": 0.01514371, + "auxiliary_loss_mlp": 0.01037136, + "balance_loss_clip": 1.31386089, + "balance_loss_mlp": 1.01411712, + "epoch": 0.26929204869983464, + "flos": 22138060884480.0, + "grad_norm": 2.07843198557456, + "language_loss": 0.81425226, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.83976734, + "num_input_tokens_seen": 96877295, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.23046875, + "step": 4479, + "time_per_iteration": 2.8427698612213135 + }, + { + "auxiliary_loss_clip": 0.01531587, + "auxiliary_loss_mlp": 0.01047432, + "balance_loss_clip": 1.32863688, + "balance_loss_mlp": 1.02345896, + "epoch": 0.2693521719525026, + "flos": 19802498837760.0, + "grad_norm": 2.0892483971033022, + "language_loss": 0.81991559, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.84570581, + "num_input_tokens_seen": 96896160, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.2401123, + "step": 4480, + "time_per_iteration": 2.8133246898651123 + }, + { + "auxiliary_loss_clip": 0.01515571, + "auxiliary_loss_mlp": 0.01042121, + "balance_loss_clip": 1.3169663, + "balance_loss_mlp": 1.0184226, + "epoch": 0.2694122952051706, + "flos": 21003743030400.0, + "grad_norm": 1.5643643635270468, + "language_loss": 0.78294063, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.80851746, + "num_input_tokens_seen": 96915410, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.23693848, + "step": 4481, + "time_per_iteration": 2.8998193740844727 + }, + { + "auxiliary_loss_clip": 0.01518088, + "auxiliary_loss_mlp": 0.01045882, + "balance_loss_clip": 1.31754112, + "balance_loss_mlp": 1.02175379, + "epoch": 0.2694724184578386, + "flos": 25860507114240.0, + "grad_norm": 1.7556804992802264, + "language_loss": 0.74469578, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.77033544, + "num_input_tokens_seen": 96937865, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.24133301, + "step": 4482, + "time_per_iteration": 4.287272214889526 + }, + { + "auxiliary_loss_clip": 0.01520579, + "auxiliary_loss_mlp": 0.01039507, + "balance_loss_clip": 1.32060075, + "balance_loss_mlp": 1.01536703, + "epoch": 0.26953254171050656, + "flos": 21737254032000.0, + "grad_norm": 1.7941885585558535, + "language_loss": 0.73235637, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.75795722, + "num_input_tokens_seen": 96957710, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.24133301, + "step": 4483, + "time_per_iteration": 2.8306100368499756 + }, + { + "auxiliary_loss_clip": 0.015187, + "auxiliary_loss_mlp": 0.01045371, + "balance_loss_clip": 1.31381774, + "balance_loss_mlp": 1.02095699, + "epoch": 0.2695926649631745, + "flos": 19692472187520.0, + "grad_norm": 1.9586012438449065, + "language_loss": 0.87803388, + "learning_rate": 3.427438559239605e-06, + "loss": 0.9036746, + "num_input_tokens_seen": 96975890, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.24438477, + "step": 4484, + "time_per_iteration": 2.9276130199432373 + }, + { + "auxiliary_loss_clip": 0.0152582, + "auxiliary_loss_mlp": 0.01043915, + "balance_loss_clip": 1.32417393, + "balance_loss_mlp": 1.01988268, + "epoch": 0.2696527882158425, + "flos": 32898484673280.0, + "grad_norm": 1.6773623536912314, + "language_loss": 0.67409897, + "learning_rate": 3.427165740807239e-06, + "loss": 0.69979632, + "num_input_tokens_seen": 96998595, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.24035645, + "step": 4485, + "time_per_iteration": 2.917393207550049 + }, + { + "auxiliary_loss_clip": 0.01521215, + "auxiliary_loss_mlp": 0.01042859, + "balance_loss_clip": 1.32023418, + "balance_loss_mlp": 1.01860046, + "epoch": 0.26971291146851045, + "flos": 12130992581760.0, + "grad_norm": 2.5273858962031874, + "language_loss": 0.74549788, + "learning_rate": 3.426892868256604e-06, + "loss": 0.77113867, + "num_input_tokens_seen": 97013715, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.24279785, + "step": 4486, + "time_per_iteration": 2.8294968605041504 + }, + { + "auxiliary_loss_clip": 0.01532343, + "auxiliary_loss_mlp": 0.01041252, + "balance_loss_clip": 1.32925034, + "balance_loss_mlp": 1.01673067, + "epoch": 0.2697730347211784, + "flos": 22643781724800.0, + "grad_norm": 2.075857005406408, + "language_loss": 0.85435903, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.88009489, + "num_input_tokens_seen": 97031570, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.24511719, + "step": 4487, + "time_per_iteration": 2.8783228397369385 + }, + { + "auxiliary_loss_clip": 0.01549794, + "auxiliary_loss_mlp": 0.01047499, + "balance_loss_clip": 1.34115684, + "balance_loss_mlp": 1.02223873, + "epoch": 0.2698331579738464, + "flos": 23523361499520.0, + "grad_norm": 2.587359203731111, + "language_loss": 0.73266125, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.75863415, + "num_input_tokens_seen": 97049815, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.25292969, + "step": 4488, + "time_per_iteration": 2.876934051513672 + }, + { + "auxiliary_loss_clip": 0.01531321, + "auxiliary_loss_mlp": 0.01052291, + "balance_loss_clip": 1.32812726, + "balance_loss_mlp": 1.02676845, + "epoch": 0.26989328122651435, + "flos": 24650847388800.0, + "grad_norm": 1.6458859640149677, + "language_loss": 0.84273887, + "learning_rate": 3.426073925998578e-06, + "loss": 0.86857498, + "num_input_tokens_seen": 97067570, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.25549316, + "step": 4489, + "time_per_iteration": 4.276723861694336 + }, + { + "auxiliary_loss_clip": 0.01544081, + "auxiliary_loss_mlp": 0.01053778, + "balance_loss_clip": 1.3387084, + "balance_loss_mlp": 1.02841067, + "epoch": 0.2699534044791823, + "flos": 10777752547200.0, + "grad_norm": 2.365556009860475, + "language_loss": 0.91654402, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.94252259, + "num_input_tokens_seen": 97082180, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.25390625, + "step": 4490, + "time_per_iteration": 2.8266777992248535 + }, + { + "auxiliary_loss_clip": 0.01522404, + "auxiliary_loss_mlp": 0.01048883, + "balance_loss_clip": 1.32380557, + "balance_loss_mlp": 1.02541089, + "epoch": 0.2700135277318503, + "flos": 36184670110080.0, + "grad_norm": 2.422056637120002, + "language_loss": 0.73936009, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.765073, + "num_input_tokens_seen": 97103470, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.23498535, + "step": 4491, + "time_per_iteration": 3.000488519668579 + }, + { + "auxiliary_loss_clip": 0.0153894, + "auxiliary_loss_mlp": 0.0104753, + "balance_loss_clip": 1.33541024, + "balance_loss_mlp": 1.02322268, + "epoch": 0.27007365098451824, + "flos": 17427003615360.0, + "grad_norm": 2.6415423951019754, + "language_loss": 0.74846494, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.7743296, + "num_input_tokens_seen": 97118100, + "router_z_loss_clip": 2.03613281, + "router_z_loss_mlp": 0.24328613, + "step": 4492, + "time_per_iteration": 4.165241718292236 + }, + { + "auxiliary_loss_clip": 0.01529333, + "auxiliary_loss_mlp": 0.01048103, + "balance_loss_clip": 1.32946682, + "balance_loss_mlp": 1.02342701, + "epoch": 0.2701337742371862, + "flos": 23196177216000.0, + "grad_norm": 2.2112982659418963, + "language_loss": 0.90074587, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.92652023, + "num_input_tokens_seen": 97136765, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.24707031, + "step": 4493, + "time_per_iteration": 2.8973934650421143 + }, + { + "auxiliary_loss_clip": 0.01544985, + "auxiliary_loss_mlp": 0.01044494, + "balance_loss_clip": 1.34319484, + "balance_loss_mlp": 1.02015185, + "epoch": 0.2701938974898542, + "flos": 24400181341440.0, + "grad_norm": 1.476103825633319, + "language_loss": 0.72259426, + "learning_rate": 3.424707940835998e-06, + "loss": 0.74848908, + "num_input_tokens_seen": 97157470, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.24353027, + "step": 4494, + "time_per_iteration": 4.419919967651367 + }, + { + "auxiliary_loss_clip": 0.01534548, + "auxiliary_loss_mlp": 0.01041899, + "balance_loss_clip": 1.33494902, + "balance_loss_mlp": 1.01889145, + "epoch": 0.2702540207425222, + "flos": 26225679047040.0, + "grad_norm": 1.9245671842252807, + "language_loss": 0.87764406, + "learning_rate": 3.42443458168683e-06, + "loss": 0.90340853, + "num_input_tokens_seen": 97176905, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.23010254, + "step": 4495, + "time_per_iteration": 2.9214870929718018 + }, + { + "auxiliary_loss_clip": 0.01537274, + "auxiliary_loss_mlp": 0.01040778, + "balance_loss_clip": 1.335096, + "balance_loss_mlp": 1.01769948, + "epoch": 0.27031414399519016, + "flos": 22935783536640.0, + "grad_norm": 1.6715632523666182, + "language_loss": 0.77769727, + "learning_rate": 3.424161168522959e-06, + "loss": 0.80347776, + "num_input_tokens_seen": 97196380, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.23071289, + "step": 4496, + "time_per_iteration": 2.839212417602539 + }, + { + "auxiliary_loss_clip": 0.01292383, + "auxiliary_loss_mlp": 0.01028138, + "balance_loss_clip": 1.17786098, + "balance_loss_mlp": 1.00439167, + "epoch": 0.2703742672478581, + "flos": 63047904641280.0, + "grad_norm": 0.6929025080574113, + "language_loss": 0.50212032, + "learning_rate": 3.423887701354754e-06, + "loss": 0.52532554, + "num_input_tokens_seen": 97260100, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.23730469, + "step": 4497, + "time_per_iteration": 3.3999369144439697 + }, + { + "auxiliary_loss_clip": 0.01549161, + "auxiliary_loss_mlp": 0.01043094, + "balance_loss_clip": 1.34698558, + "balance_loss_mlp": 1.02036071, + "epoch": 0.2704343905005261, + "flos": 18849386983680.0, + "grad_norm": 1.7815894381854147, + "language_loss": 0.7360279, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.76195049, + "num_input_tokens_seen": 97277935, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.22741699, + "step": 4498, + "time_per_iteration": 2.859025478363037 + }, + { + "auxiliary_loss_clip": 0.01293681, + "auxiliary_loss_mlp": 0.01026668, + "balance_loss_clip": 1.17508101, + "balance_loss_mlp": 1.00435245, + "epoch": 0.27049451375319405, + "flos": 71264793444480.0, + "grad_norm": 0.7573861723297939, + "language_loss": 0.59180617, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.61500967, + "num_input_tokens_seen": 97338845, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.22363281, + "step": 4499, + "time_per_iteration": 3.299910306930542 + }, + { + "auxiliary_loss_clip": 0.01544718, + "auxiliary_loss_mlp": 0.01043233, + "balance_loss_clip": 1.3423903, + "balance_loss_mlp": 1.01945126, + "epoch": 0.270554637005862, + "flos": 24288933081600.0, + "grad_norm": 2.395059282927292, + "language_loss": 0.74327737, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.76915693, + "num_input_tokens_seen": 97356640, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.23803711, + "step": 4500, + "time_per_iteration": 2.84537935256958 + }, + { + "auxiliary_loss_clip": 0.01531424, + "auxiliary_loss_mlp": 0.01043595, + "balance_loss_clip": 1.33205068, + "balance_loss_mlp": 1.01982522, + "epoch": 0.27061476025853, + "flos": 17639410544640.0, + "grad_norm": 3.464175558924343, + "language_loss": 0.82455879, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.85030895, + "num_input_tokens_seen": 97372585, + "router_z_loss_clip": 1.99316406, + "router_z_loss_mlp": 0.2376709, + "step": 4501, + "time_per_iteration": 2.7714593410491943 + }, + { + "auxiliary_loss_clip": 0.01551137, + "auxiliary_loss_mlp": 0.01046675, + "balance_loss_clip": 1.34441113, + "balance_loss_mlp": 1.0220468, + "epoch": 0.27067488351119795, + "flos": 22720164226560.0, + "grad_norm": 1.8646656525468983, + "language_loss": 0.73067367, + "learning_rate": 3.422519555811735e-06, + "loss": 0.75665188, + "num_input_tokens_seen": 97393315, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.24621582, + "step": 4502, + "time_per_iteration": 2.8765456676483154 + }, + { + "auxiliary_loss_clip": 0.01548797, + "auxiliary_loss_mlp": 0.01039329, + "balance_loss_clip": 1.34215581, + "balance_loss_mlp": 1.0148195, + "epoch": 0.2707350067638659, + "flos": 41734695306240.0, + "grad_norm": 1.9024346202620086, + "language_loss": 0.69334483, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.71922612, + "num_input_tokens_seen": 97417860, + "router_z_loss_clip": 2.06738281, + "router_z_loss_mlp": 0.24511719, + "step": 4503, + "time_per_iteration": 3.0678322315216064 + }, + { + "auxiliary_loss_clip": 0.01532001, + "auxiliary_loss_mlp": 0.0104219, + "balance_loss_clip": 1.32947779, + "balance_loss_mlp": 1.01812184, + "epoch": 0.2707951300165339, + "flos": 20202536528640.0, + "grad_norm": 2.091670978712513, + "language_loss": 0.6880694, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.71381128, + "num_input_tokens_seen": 97436780, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.24060059, + "step": 4504, + "time_per_iteration": 2.8131747245788574 + }, + { + "auxiliary_loss_clip": 0.01539801, + "auxiliary_loss_mlp": 0.01043826, + "balance_loss_clip": 1.33672678, + "balance_loss_mlp": 1.01935232, + "epoch": 0.27085525326920185, + "flos": 21443894876160.0, + "grad_norm": 1.639890872603608, + "language_loss": 0.7628355, + "learning_rate": 3.421698021097902e-06, + "loss": 0.78867179, + "num_input_tokens_seen": 97456190, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.24499512, + "step": 4505, + "time_per_iteration": 2.8475844860076904 + }, + { + "auxiliary_loss_clip": 0.01549067, + "auxiliary_loss_mlp": 0.01045028, + "balance_loss_clip": 1.34139395, + "balance_loss_mlp": 1.01957631, + "epoch": 0.2709153765218698, + "flos": 17684003934720.0, + "grad_norm": 3.1035579874120405, + "language_loss": 0.7515623, + "learning_rate": 3.42142406835758e-06, + "loss": 0.77750325, + "num_input_tokens_seen": 97474545, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.2545166, + "step": 4506, + "time_per_iteration": 2.937532663345337 + }, + { + "auxiliary_loss_clip": 0.01540674, + "auxiliary_loss_mlp": 0.01044014, + "balance_loss_clip": 1.33571398, + "balance_loss_mlp": 1.01951694, + "epoch": 0.2709754997745378, + "flos": 24465388377600.0, + "grad_norm": 1.9194779144569616, + "language_loss": 0.81609458, + "learning_rate": 3.421150061716715e-06, + "loss": 0.84194148, + "num_input_tokens_seen": 97494520, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.24511719, + "step": 4507, + "time_per_iteration": 2.8937251567840576 + }, + { + "auxiliary_loss_clip": 0.01277042, + "auxiliary_loss_mlp": 0.01025878, + "balance_loss_clip": 1.16090786, + "balance_loss_mlp": 1.00585079, + "epoch": 0.2710356230272058, + "flos": 65239976868480.0, + "grad_norm": 0.7388666150128858, + "language_loss": 0.50854933, + "learning_rate": 3.420876001185698e-06, + "loss": 0.53157854, + "num_input_tokens_seen": 97552455, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.20019531, + "step": 4508, + "time_per_iteration": 3.261735200881958 + }, + { + "auxiliary_loss_clip": 0.0152474, + "auxiliary_loss_mlp": 0.01046376, + "balance_loss_clip": 1.32556009, + "balance_loss_mlp": 1.02285612, + "epoch": 0.27109574627987376, + "flos": 25495561405440.0, + "grad_norm": 1.991897825364481, + "language_loss": 0.75892687, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.78463805, + "num_input_tokens_seen": 97572650, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.23535156, + "step": 4509, + "time_per_iteration": 2.8819637298583984 + }, + { + "auxiliary_loss_clip": 0.01505652, + "auxiliary_loss_mlp": 0.01051832, + "balance_loss_clip": 1.30797935, + "balance_loss_mlp": 1.02886069, + "epoch": 0.2711558695325417, + "flos": 19692291208320.0, + "grad_norm": 1.7903139396306431, + "language_loss": 0.72443724, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.75001204, + "num_input_tokens_seen": 97591150, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.22961426, + "step": 4510, + "time_per_iteration": 2.864017963409424 + }, + { + "auxiliary_loss_clip": 0.01525966, + "auxiliary_loss_mlp": 0.01045097, + "balance_loss_clip": 1.32391834, + "balance_loss_mlp": 1.02164865, + "epoch": 0.2712159927852097, + "flos": 18596865899520.0, + "grad_norm": 2.206869137674953, + "language_loss": 0.71527231, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.74098301, + "num_input_tokens_seen": 97607410, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.23449707, + "step": 4511, + "time_per_iteration": 2.7969539165496826 + }, + { + "auxiliary_loss_clip": 0.01538405, + "auxiliary_loss_mlp": 0.01051071, + "balance_loss_clip": 1.33237386, + "balance_loss_mlp": 1.0265379, + "epoch": 0.27127611603787766, + "flos": 25641358709760.0, + "grad_norm": 1.9821475581319983, + "language_loss": 0.8225131, + "learning_rate": 3.419779220367979e-06, + "loss": 0.84840786, + "num_input_tokens_seen": 97626870, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.2454834, + "step": 4512, + "time_per_iteration": 2.8763821125030518 + }, + { + "auxiliary_loss_clip": 0.01523746, + "auxiliary_loss_mlp": 0.01049393, + "balance_loss_clip": 1.32336104, + "balance_loss_mlp": 1.02512217, + "epoch": 0.2713362392905456, + "flos": 23159365931520.0, + "grad_norm": 3.338849466002978, + "language_loss": 0.80903387, + "learning_rate": 3.419504890542124e-06, + "loss": 0.83476526, + "num_input_tokens_seen": 97646595, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.24279785, + "step": 4513, + "time_per_iteration": 2.8452086448669434 + }, + { + "auxiliary_loss_clip": 0.01533667, + "auxiliary_loss_mlp": 0.01051156, + "balance_loss_clip": 1.32888484, + "balance_loss_mlp": 1.0280174, + "epoch": 0.2713963625432136, + "flos": 18374369379840.0, + "grad_norm": 2.684706151923164, + "language_loss": 0.88874876, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.91459703, + "num_input_tokens_seen": 97665485, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.23144531, + "step": 4514, + "time_per_iteration": 2.8170228004455566 + }, + { + "auxiliary_loss_clip": 0.01538351, + "auxiliary_loss_mlp": 0.01049571, + "balance_loss_clip": 1.33589149, + "balance_loss_mlp": 1.02620554, + "epoch": 0.27145648579588155, + "flos": 22501875473280.0, + "grad_norm": 1.5879991795552901, + "language_loss": 0.92382729, + "learning_rate": 3.418956069417517e-06, + "loss": 0.94970649, + "num_input_tokens_seen": 97683800, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.23339844, + "step": 4515, + "time_per_iteration": 2.854952573776245 + }, + { + "auxiliary_loss_clip": 0.01541849, + "auxiliary_loss_mlp": 0.01059893, + "balance_loss_clip": 1.33435988, + "balance_loss_mlp": 1.03247488, + "epoch": 0.2715166090485495, + "flos": 19247071944960.0, + "grad_norm": 5.701665402824538, + "language_loss": 0.75141078, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.77742821, + "num_input_tokens_seen": 97700505, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.27416992, + "step": 4516, + "time_per_iteration": 2.7883284091949463 + }, + { + "auxiliary_loss_clip": 0.01523304, + "auxiliary_loss_mlp": 0.01051539, + "balance_loss_clip": 1.32336271, + "balance_loss_mlp": 1.0271492, + "epoch": 0.2715767323012175, + "flos": 17717829062400.0, + "grad_norm": 1.8205656916879882, + "language_loss": 0.77065909, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.79640758, + "num_input_tokens_seen": 97717410, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.24401855, + "step": 4517, + "time_per_iteration": 4.288268804550171 + }, + { + "auxiliary_loss_clip": 0.01524075, + "auxiliary_loss_mlp": 0.01049378, + "balance_loss_clip": 1.3225044, + "balance_loss_mlp": 1.0257628, + "epoch": 0.27163685555388545, + "flos": 22393070432640.0, + "grad_norm": 3.151959911710973, + "language_loss": 0.78918213, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.81491667, + "num_input_tokens_seen": 97734545, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.23608398, + "step": 4518, + "time_per_iteration": 2.8431448936462402 + }, + { + "auxiliary_loss_clip": 0.01531604, + "auxiliary_loss_mlp": 0.01051724, + "balance_loss_clip": 1.3290987, + "balance_loss_mlp": 1.02900314, + "epoch": 0.2716969788065534, + "flos": 22357028309760.0, + "grad_norm": 1.698227330384358, + "language_loss": 0.69249296, + "learning_rate": 3.41785778156811e-06, + "loss": 0.71832621, + "num_input_tokens_seen": 97754000, + "router_z_loss_clip": 2.02636719, + "router_z_loss_mlp": 0.22717285, + "step": 4519, + "time_per_iteration": 2.847877025604248 + }, + { + "auxiliary_loss_clip": 0.01522266, + "auxiliary_loss_mlp": 0.0104823, + "balance_loss_clip": 1.32212591, + "balance_loss_mlp": 1.02544928, + "epoch": 0.2717571020592214, + "flos": 25239375492480.0, + "grad_norm": 2.023738187590326, + "language_loss": 0.76329529, + "learning_rate": 3.417583075166451e-06, + "loss": 0.78900027, + "num_input_tokens_seen": 97772080, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.2277832, + "step": 4520, + "time_per_iteration": 2.869149923324585 + }, + { + "auxiliary_loss_clip": 0.01541822, + "auxiliary_loss_mlp": 0.01050364, + "balance_loss_clip": 1.3353548, + "balance_loss_mlp": 1.02395868, + "epoch": 0.2718172253118894, + "flos": 20198419251840.0, + "grad_norm": 2.8833189281257345, + "language_loss": 0.76940417, + "learning_rate": 3.4173083150099e-06, + "loss": 0.79532599, + "num_input_tokens_seen": 97789370, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.26403809, + "step": 4521, + "time_per_iteration": 2.819533586502075 + }, + { + "auxiliary_loss_clip": 0.01540183, + "auxiliary_loss_mlp": 0.01056946, + "balance_loss_clip": 1.33379292, + "balance_loss_mlp": 1.03232932, + "epoch": 0.27187734856455736, + "flos": 14327182085760.0, + "grad_norm": 2.8330427663272504, + "language_loss": 0.76224911, + "learning_rate": 3.417033501108875e-06, + "loss": 0.78822041, + "num_input_tokens_seen": 97807385, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.24645996, + "step": 4522, + "time_per_iteration": 2.824429988861084 + }, + { + "auxiliary_loss_clip": 0.01539029, + "auxiliary_loss_mlp": 0.01050308, + "balance_loss_clip": 1.33424997, + "balance_loss_mlp": 1.02479768, + "epoch": 0.27193747181722533, + "flos": 21118067936640.0, + "grad_norm": 1.6635452219029099, + "language_loss": 0.7335614, + "learning_rate": 3.416758633473798e-06, + "loss": 0.75945473, + "num_input_tokens_seen": 97827930, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.25500488, + "step": 4523, + "time_per_iteration": 2.839393377304077 + }, + { + "auxiliary_loss_clip": 0.0152041, + "auxiliary_loss_mlp": 0.01047667, + "balance_loss_clip": 1.32195401, + "balance_loss_mlp": 1.02175057, + "epoch": 0.2719975950698933, + "flos": 19692291208320.0, + "grad_norm": 1.5592061137812998, + "language_loss": 0.74660599, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.77228677, + "num_input_tokens_seen": 97847440, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.25915527, + "step": 4524, + "time_per_iteration": 4.234760284423828 + }, + { + "auxiliary_loss_clip": 0.01540466, + "auxiliary_loss_mlp": 0.0105208, + "balance_loss_clip": 1.33870673, + "balance_loss_mlp": 1.02654564, + "epoch": 0.27205771832256126, + "flos": 24765353274240.0, + "grad_norm": 1.592078818239205, + "language_loss": 0.77258193, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.79850733, + "num_input_tokens_seen": 97867620, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.25512695, + "step": 4525, + "time_per_iteration": 2.926093578338623 + }, + { + "auxiliary_loss_clip": 0.01525974, + "auxiliary_loss_mlp": 0.01050906, + "balance_loss_clip": 1.32428527, + "balance_loss_mlp": 1.02730227, + "epoch": 0.2721178415752292, + "flos": 21763839991680.0, + "grad_norm": 2.339047790230912, + "language_loss": 0.82520175, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.85097063, + "num_input_tokens_seen": 97884345, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.23596191, + "step": 4526, + "time_per_iteration": 2.8065624237060547 + }, + { + "auxiliary_loss_clip": 0.01554951, + "auxiliary_loss_mlp": 0.01046532, + "balance_loss_clip": 1.34348369, + "balance_loss_mlp": 1.02166486, + "epoch": 0.2721779648278972, + "flos": 12684338213760.0, + "grad_norm": 2.2119183252801795, + "language_loss": 0.77919114, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.80520606, + "num_input_tokens_seen": 97901500, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.2487793, + "step": 4527, + "time_per_iteration": 4.177669525146484 + }, + { + "auxiliary_loss_clip": 0.01537586, + "auxiliary_loss_mlp": 0.01046355, + "balance_loss_clip": 1.3349539, + "balance_loss_mlp": 1.02279973, + "epoch": 0.27223808808056515, + "flos": 16261846790400.0, + "grad_norm": 2.19648132928749, + "language_loss": 0.82656884, + "learning_rate": 3.415383489652503e-06, + "loss": 0.85240829, + "num_input_tokens_seen": 97917800, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.23571777, + "step": 4528, + "time_per_iteration": 2.8477084636688232 + }, + { + "auxiliary_loss_clip": 0.01515965, + "auxiliary_loss_mlp": 0.01041892, + "balance_loss_clip": 1.31728482, + "balance_loss_mlp": 1.01753747, + "epoch": 0.2722982113332331, + "flos": 27757319904000.0, + "grad_norm": 5.803982971644948, + "language_loss": 0.78252745, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.80810595, + "num_input_tokens_seen": 97937225, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.24365234, + "step": 4529, + "time_per_iteration": 4.350273609161377 + }, + { + "auxiliary_loss_clip": 0.01538389, + "auxiliary_loss_mlp": 0.01052958, + "balance_loss_clip": 1.33270931, + "balance_loss_mlp": 1.02730429, + "epoch": 0.2723583345859011, + "flos": 21736349136000.0, + "grad_norm": 7.69959924023562, + "language_loss": 0.83446801, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.86038154, + "num_input_tokens_seen": 97956845, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.25671387, + "step": 4530, + "time_per_iteration": 2.8245480060577393 + }, + { + "auxiliary_loss_clip": 0.01522358, + "auxiliary_loss_mlp": 0.01046513, + "balance_loss_clip": 1.32181787, + "balance_loss_mlp": 1.02042961, + "epoch": 0.27241845783856905, + "flos": 17356050489600.0, + "grad_norm": 2.0049636029477926, + "language_loss": 0.92915356, + "learning_rate": 3.4145577592184838e-06, + "loss": 0.95484227, + "num_input_tokens_seen": 97972465, + "router_z_loss_clip": 2.00390625, + "router_z_loss_mlp": 0.26098633, + "step": 4531, + "time_per_iteration": 2.8348162174224854 + }, + { + "auxiliary_loss_clip": 0.01533091, + "auxiliary_loss_mlp": 0.0104841, + "balance_loss_clip": 1.32849991, + "balance_loss_mlp": 1.02196932, + "epoch": 0.272478581091237, + "flos": 24765127050240.0, + "grad_norm": 2.5009026653901936, + "language_loss": 0.7770735, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.80288851, + "num_input_tokens_seen": 97990770, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.26416016, + "step": 4532, + "time_per_iteration": 2.848690986633301 + }, + { + "auxiliary_loss_clip": 0.0151142, + "auxiliary_loss_mlp": 0.01041724, + "balance_loss_clip": 1.31404686, + "balance_loss_mlp": 1.01613009, + "epoch": 0.272538704343905, + "flos": 17898537369600.0, + "grad_norm": 3.462935185690067, + "language_loss": 0.90475899, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.93029046, + "num_input_tokens_seen": 98005775, + "router_z_loss_clip": 1.97460938, + "router_z_loss_mlp": 0.25585938, + "step": 4533, + "time_per_iteration": 2.844538688659668 + }, + { + "auxiliary_loss_clip": 0.01509171, + "auxiliary_loss_mlp": 0.01041826, + "balance_loss_clip": 1.31142092, + "balance_loss_mlp": 1.01618385, + "epoch": 0.272598827596573, + "flos": 22942886970240.0, + "grad_norm": 2.348844818246144, + "language_loss": 0.72197914, + "learning_rate": 3.413731546022929e-06, + "loss": 0.74748909, + "num_input_tokens_seen": 98025750, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.25646973, + "step": 4534, + "time_per_iteration": 2.8592658042907715 + }, + { + "auxiliary_loss_clip": 0.01531149, + "auxiliary_loss_mlp": 0.01046175, + "balance_loss_clip": 1.32564723, + "balance_loss_mlp": 1.01854229, + "epoch": 0.27265895084924097, + "flos": 24247778296320.0, + "grad_norm": 1.6410667037528965, + "language_loss": 0.91781628, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.94358957, + "num_input_tokens_seen": 98044955, + "router_z_loss_clip": 2.05859375, + "router_z_loss_mlp": 0.27685547, + "step": 4535, + "time_per_iteration": 2.8365399837493896 + }, + { + "auxiliary_loss_clip": 0.01529001, + "auxiliary_loss_mlp": 0.01047645, + "balance_loss_clip": 1.32493389, + "balance_loss_mlp": 1.02088201, + "epoch": 0.27271907410190893, + "flos": 27023718412800.0, + "grad_norm": 1.717533848895498, + "language_loss": 0.73555899, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.76132536, + "num_input_tokens_seen": 98065860, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.2677002, + "step": 4536, + "time_per_iteration": 2.9224085807800293 + }, + { + "auxiliary_loss_clip": 0.01535549, + "auxiliary_loss_mlp": 0.01040055, + "balance_loss_clip": 1.33303261, + "balance_loss_mlp": 1.01437747, + "epoch": 0.2727791973545769, + "flos": 34464131637120.0, + "grad_norm": 2.082901301202596, + "language_loss": 0.72126818, + "learning_rate": 3.41290485034781e-06, + "loss": 0.7470243, + "num_input_tokens_seen": 98085450, + "router_z_loss_clip": 2.02636719, + "router_z_loss_mlp": 0.25708008, + "step": 4537, + "time_per_iteration": 2.926367998123169 + }, + { + "auxiliary_loss_clip": 0.01510762, + "auxiliary_loss_mlp": 0.01041109, + "balance_loss_clip": 1.30902731, + "balance_loss_mlp": 1.01404905, + "epoch": 0.27283932060724486, + "flos": 15048115032960.0, + "grad_norm": 2.1746670319933625, + "language_loss": 0.79310179, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.8186205, + "num_input_tokens_seen": 98099115, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.27124023, + "step": 4538, + "time_per_iteration": 2.81303071975708 + }, + { + "auxiliary_loss_clip": 0.01507939, + "auxiliary_loss_mlp": 0.01046757, + "balance_loss_clip": 1.30788612, + "balance_loss_mlp": 1.02055526, + "epoch": 0.2728994438599128, + "flos": 21662274119040.0, + "grad_norm": 1.5855004726806494, + "language_loss": 0.90852118, + "learning_rate": 3.412353451992847e-06, + "loss": 0.93406808, + "num_input_tokens_seen": 98118415, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.26196289, + "step": 4539, + "time_per_iteration": 2.815244674682617 + }, + { + "auxiliary_loss_clip": 0.01503333, + "auxiliary_loss_mlp": 0.01039876, + "balance_loss_clip": 1.3052336, + "balance_loss_mlp": 1.01347065, + "epoch": 0.2729595671125808, + "flos": 17496011214720.0, + "grad_norm": 2.0022747815866566, + "language_loss": 0.89022708, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.91565919, + "num_input_tokens_seen": 98136300, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.26428223, + "step": 4540, + "time_per_iteration": 2.8378312587738037 + }, + { + "auxiliary_loss_clip": 0.01513514, + "auxiliary_loss_mlp": 0.01042352, + "balance_loss_clip": 1.31273699, + "balance_loss_mlp": 1.01637626, + "epoch": 0.27301969036524876, + "flos": 19327616968320.0, + "grad_norm": 1.978725106874439, + "language_loss": 0.82348096, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.84903967, + "num_input_tokens_seen": 98154580, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.25939941, + "step": 4541, + "time_per_iteration": 2.8452773094177246 + }, + { + "auxiliary_loss_clip": 0.01495738, + "auxiliary_loss_mlp": 0.0104445, + "balance_loss_clip": 1.2974658, + "balance_loss_mlp": 1.0174017, + "epoch": 0.2730798136179167, + "flos": 21074605666560.0, + "grad_norm": 1.854041562097251, + "language_loss": 0.80730712, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.83270895, + "num_input_tokens_seen": 98173115, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.27062988, + "step": 4542, + "time_per_iteration": 2.8231611251831055 + }, + { + "auxiliary_loss_clip": 0.01505304, + "auxiliary_loss_mlp": 0.01044562, + "balance_loss_clip": 1.30703092, + "balance_loss_mlp": 1.01803827, + "epoch": 0.2731399368705847, + "flos": 19181050502400.0, + "grad_norm": 2.614988424904129, + "language_loss": 0.91043079, + "learning_rate": 3.411250012687582e-06, + "loss": 0.93592948, + "num_input_tokens_seen": 98190260, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.265625, + "step": 4543, + "time_per_iteration": 2.856334924697876 + }, + { + "auxiliary_loss_clip": 0.01510002, + "auxiliary_loss_mlp": 0.01048766, + "balance_loss_clip": 1.30515695, + "balance_loss_mlp": 1.02127647, + "epoch": 0.27320006012325265, + "flos": 18297670164480.0, + "grad_norm": 2.8272418826105183, + "language_loss": 0.64570796, + "learning_rate": 3.410974019048255e-06, + "loss": 0.67129564, + "num_input_tokens_seen": 98207115, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.27514648, + "step": 4544, + "time_per_iteration": 2.840031147003174 + }, + { + "auxiliary_loss_clip": 0.01500918, + "auxiliary_loss_mlp": 0.01048297, + "balance_loss_clip": 1.30235147, + "balance_loss_mlp": 1.0203191, + "epoch": 0.2732601833759206, + "flos": 34874394652800.0, + "grad_norm": 2.564007933045178, + "language_loss": 0.70604551, + "learning_rate": 3.410697971904651e-06, + "loss": 0.7315377, + "num_input_tokens_seen": 98230610, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.27966309, + "step": 4545, + "time_per_iteration": 3.0034029483795166 + }, + { + "auxiliary_loss_clip": 0.01345538, + "auxiliary_loss_mlp": 0.01076333, + "balance_loss_clip": 1.22234082, + "balance_loss_mlp": 1.02998471, + "epoch": 0.2733203066285886, + "flos": 53939826132480.0, + "grad_norm": 0.7394199027663796, + "language_loss": 0.61664516, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.6408639, + "num_input_tokens_seen": 98293585, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.46289062, + "step": 4546, + "time_per_iteration": 3.3792715072631836 + }, + { + "auxiliary_loss_clip": 0.01496103, + "auxiliary_loss_mlp": 0.01050521, + "balance_loss_clip": 1.29841542, + "balance_loss_mlp": 1.02199435, + "epoch": 0.2733804298812566, + "flos": 20668188458880.0, + "grad_norm": 1.8335473419018207, + "language_loss": 0.6622355, + "learning_rate": 3.410145717146488e-06, + "loss": 0.6877017, + "num_input_tokens_seen": 98311680, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.28503418, + "step": 4547, + "time_per_iteration": 2.8350131511688232 + }, + { + "auxiliary_loss_clip": 0.01486254, + "auxiliary_loss_mlp": 0.01045668, + "balance_loss_clip": 1.29160058, + "balance_loss_mlp": 1.01729572, + "epoch": 0.27344055313392457, + "flos": 25895191893120.0, + "grad_norm": 1.8470132447492067, + "language_loss": 0.79330468, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.8186239, + "num_input_tokens_seen": 98330770, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.28393555, + "step": 4548, + "time_per_iteration": 2.908257484436035 + }, + { + "auxiliary_loss_clip": 0.01498162, + "auxiliary_loss_mlp": 0.01045321, + "balance_loss_clip": 1.29990685, + "balance_loss_mlp": 1.02000046, + "epoch": 0.27350067638659253, + "flos": 22940036547840.0, + "grad_norm": 1.9287180684371499, + "language_loss": 0.83753467, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.8629694, + "num_input_tokens_seen": 98349860, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.25354004, + "step": 4549, + "time_per_iteration": 2.90997314453125 + }, + { + "auxiliary_loss_clip": 0.01507216, + "auxiliary_loss_mlp": 0.01043601, + "balance_loss_clip": 1.30711985, + "balance_loss_mlp": 1.01623094, + "epoch": 0.2735607996392605, + "flos": 16581791905920.0, + "grad_norm": 2.148910374784724, + "language_loss": 0.71807754, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.74358571, + "num_input_tokens_seen": 98367040, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.27404785, + "step": 4550, + "time_per_iteration": 2.8507440090179443 + }, + { + "auxiliary_loss_clip": 0.01491796, + "auxiliary_loss_mlp": 0.01038963, + "balance_loss_clip": 1.29434848, + "balance_loss_mlp": 1.01340508, + "epoch": 0.27362092289192846, + "flos": 19654529783040.0, + "grad_norm": 2.404857785320402, + "language_loss": 0.79648721, + "learning_rate": 3.409040566039563e-06, + "loss": 0.82179481, + "num_input_tokens_seen": 98384010, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.25537109, + "step": 4551, + "time_per_iteration": 2.7933847904205322 + }, + { + "auxiliary_loss_clip": 0.01493162, + "auxiliary_loss_mlp": 0.01046451, + "balance_loss_clip": 1.29456675, + "balance_loss_mlp": 1.01968884, + "epoch": 0.27368104614459643, + "flos": 17648007056640.0, + "grad_norm": 2.8935141184636937, + "language_loss": 0.72593892, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.75133502, + "num_input_tokens_seen": 98399625, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.26757812, + "step": 4552, + "time_per_iteration": 4.182410478591919 + }, + { + "auxiliary_loss_clip": 0.01506672, + "auxiliary_loss_mlp": 0.01043508, + "balance_loss_clip": 1.30659437, + "balance_loss_mlp": 1.01707971, + "epoch": 0.2737411693972644, + "flos": 21590008894080.0, + "grad_norm": 2.0802376294271587, + "language_loss": 0.72684461, + "learning_rate": 3.408487669858431e-06, + "loss": 0.7523464, + "num_input_tokens_seen": 98417310, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.26428223, + "step": 4553, + "time_per_iteration": 2.8192899227142334 + }, + { + "auxiliary_loss_clip": 0.01506273, + "auxiliary_loss_mlp": 0.01051026, + "balance_loss_clip": 1.30738676, + "balance_loss_mlp": 1.02376294, + "epoch": 0.27380129264993236, + "flos": 25495063712640.0, + "grad_norm": 1.7894983915907772, + "language_loss": 0.60348129, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.62905425, + "num_input_tokens_seen": 98438670, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.27270508, + "step": 4554, + "time_per_iteration": 2.847777843475342 + }, + { + "auxiliary_loss_clip": 0.01518703, + "auxiliary_loss_mlp": 0.01050494, + "balance_loss_clip": 1.31324422, + "balance_loss_mlp": 1.02274203, + "epoch": 0.2738614159026003, + "flos": 18670443223680.0, + "grad_norm": 2.04036368862014, + "language_loss": 0.75009131, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.7757833, + "num_input_tokens_seen": 98456060, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.27734375, + "step": 4555, + "time_per_iteration": 2.81400728225708 + }, + { + "auxiliary_loss_clip": 0.01514885, + "auxiliary_loss_mlp": 0.01058563, + "balance_loss_clip": 1.31254518, + "balance_loss_mlp": 1.0315026, + "epoch": 0.2739215391552683, + "flos": 23487590845440.0, + "grad_norm": 2.0856211976569825, + "language_loss": 0.78470099, + "learning_rate": 3.407657925038002e-06, + "loss": 0.81043541, + "num_input_tokens_seen": 98473765, + "router_z_loss_clip": 2.02148438, + "router_z_loss_mlp": 0.27087402, + "step": 4556, + "time_per_iteration": 2.869431495666504 + }, + { + "auxiliary_loss_clip": 0.01545293, + "auxiliary_loss_mlp": 0.01055973, + "balance_loss_clip": 1.33194709, + "balance_loss_mlp": 1.02748203, + "epoch": 0.27398166240793626, + "flos": 17137445022720.0, + "grad_norm": 1.9235899448487848, + "language_loss": 0.83634549, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.86235809, + "num_input_tokens_seen": 98490590, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.28503418, + "step": 4557, + "time_per_iteration": 2.825578212738037 + }, + { + "auxiliary_loss_clip": 0.01499108, + "auxiliary_loss_mlp": 0.01050142, + "balance_loss_clip": 1.29869473, + "balance_loss_mlp": 1.02269983, + "epoch": 0.2740417856606042, + "flos": 23415325620480.0, + "grad_norm": 2.1407080566497534, + "language_loss": 0.74800706, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.77349961, + "num_input_tokens_seen": 98510590, + "router_z_loss_clip": 2.00488281, + "router_z_loss_mlp": 0.27429199, + "step": 4558, + "time_per_iteration": 2.8777639865875244 + }, + { + "auxiliary_loss_clip": 0.01505302, + "auxiliary_loss_mlp": 0.01049647, + "balance_loss_clip": 1.30390239, + "balance_loss_mlp": 1.02275372, + "epoch": 0.2741019089132722, + "flos": 12786447024000.0, + "grad_norm": 2.525979296606505, + "language_loss": 0.69562888, + "learning_rate": 3.406827699810819e-06, + "loss": 0.72117841, + "num_input_tokens_seen": 98527875, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.26904297, + "step": 4559, + "time_per_iteration": 4.185668468475342 + }, + { + "auxiliary_loss_clip": 0.01497495, + "auxiliary_loss_mlp": 0.01047717, + "balance_loss_clip": 1.30041337, + "balance_loss_mlp": 1.02211034, + "epoch": 0.27416203216594015, + "flos": 20641331030400.0, + "grad_norm": 1.7261170266664965, + "language_loss": 0.72878945, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.75424159, + "num_input_tokens_seen": 98547575, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.25622559, + "step": 4560, + "time_per_iteration": 2.8363261222839355 + }, + { + "auxiliary_loss_clip": 0.01514845, + "auxiliary_loss_mlp": 0.01049028, + "balance_loss_clip": 1.31215739, + "balance_loss_mlp": 1.02301669, + "epoch": 0.27422215541860817, + "flos": 26552727596160.0, + "grad_norm": 1.7354299555030641, + "language_loss": 0.82541311, + "learning_rate": 3.406273949573303e-06, + "loss": 0.85105187, + "num_input_tokens_seen": 98566290, + "router_z_loss_clip": 2.02636719, + "router_z_loss_mlp": 0.26037598, + "step": 4561, + "time_per_iteration": 2.943784475326538 + }, + { + "auxiliary_loss_clip": 0.01516818, + "auxiliary_loss_mlp": 0.01052767, + "balance_loss_clip": 1.31353021, + "balance_loss_mlp": 1.02533722, + "epoch": 0.27428227867127614, + "flos": 23341567317120.0, + "grad_norm": 1.7510891669136592, + "language_loss": 0.75606668, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.78176248, + "num_input_tokens_seen": 98586255, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.27441406, + "step": 4562, + "time_per_iteration": 4.257068157196045 + }, + { + "auxiliary_loss_clip": 0.01524084, + "auxiliary_loss_mlp": 0.01040341, + "balance_loss_clip": 1.32107627, + "balance_loss_mlp": 1.01394773, + "epoch": 0.2743424019239441, + "flos": 23044362353280.0, + "grad_norm": 2.851078352068534, + "language_loss": 0.75238842, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.77803266, + "num_input_tokens_seen": 98606030, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.2644043, + "step": 4563, + "time_per_iteration": 2.8459272384643555 + }, + { + "auxiliary_loss_clip": 0.01540977, + "auxiliary_loss_mlp": 0.01048863, + "balance_loss_clip": 1.33139777, + "balance_loss_mlp": 1.02280343, + "epoch": 0.27440252517661207, + "flos": 21991132460160.0, + "grad_norm": 2.1455139158019625, + "language_loss": 0.64605242, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.67195082, + "num_input_tokens_seen": 98625225, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.26098633, + "step": 4564, + "time_per_iteration": 4.349806070327759 + }, + { + "auxiliary_loss_clip": 0.01528239, + "auxiliary_loss_mlp": 0.01051075, + "balance_loss_clip": 1.32500899, + "balance_loss_mlp": 1.02489638, + "epoch": 0.27446264842928003, + "flos": 40202194798080.0, + "grad_norm": 2.5323808437312754, + "language_loss": 0.79746449, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.82325757, + "num_input_tokens_seen": 98649470, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.26184082, + "step": 4565, + "time_per_iteration": 3.005516767501831 + }, + { + "auxiliary_loss_clip": 0.01526095, + "auxiliary_loss_mlp": 0.01049668, + "balance_loss_clip": 1.32414377, + "balance_loss_mlp": 1.0232867, + "epoch": 0.274522771681948, + "flos": 13487444997120.0, + "grad_norm": 1.8894271268054148, + "language_loss": 0.69786227, + "learning_rate": 3.404888640957477e-06, + "loss": 0.72361994, + "num_input_tokens_seen": 98666915, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.2635498, + "step": 4566, + "time_per_iteration": 2.788851022720337 + }, + { + "auxiliary_loss_clip": 0.01515334, + "auxiliary_loss_mlp": 0.01043505, + "balance_loss_clip": 1.31712091, + "balance_loss_mlp": 1.01952028, + "epoch": 0.27458289493461596, + "flos": 28634049256320.0, + "grad_norm": 1.825092094995753, + "language_loss": 0.62178409, + "learning_rate": 3.404611419371723e-06, + "loss": 0.64737248, + "num_input_tokens_seen": 98688240, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.23999023, + "step": 4567, + "time_per_iteration": 2.9288392066955566 + }, + { + "auxiliary_loss_clip": 0.01528007, + "auxiliary_loss_mlp": 0.01048102, + "balance_loss_clip": 1.32770872, + "balance_loss_mlp": 1.02284181, + "epoch": 0.2746430181872839, + "flos": 20129230673280.0, + "grad_norm": 1.5586648814580764, + "language_loss": 0.8322438, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.85800487, + "num_input_tokens_seen": 98708245, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.25268555, + "step": 4568, + "time_per_iteration": 2.8157613277435303 + }, + { + "auxiliary_loss_clip": 0.01542483, + "auxiliary_loss_mlp": 0.01044801, + "balance_loss_clip": 1.33550274, + "balance_loss_mlp": 1.01899183, + "epoch": 0.2747031414399519, + "flos": 20203079466240.0, + "grad_norm": 2.022382924671659, + "language_loss": 0.68958116, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.71545398, + "num_input_tokens_seen": 98724575, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.25817871, + "step": 4569, + "time_per_iteration": 2.854052782058716 + }, + { + "auxiliary_loss_clip": 0.01539644, + "auxiliary_loss_mlp": 0.0104369, + "balance_loss_clip": 1.33467495, + "balance_loss_mlp": 1.01883483, + "epoch": 0.27476326469261986, + "flos": 13524165792000.0, + "grad_norm": 1.9863987952133477, + "language_loss": 0.72014451, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.74597788, + "num_input_tokens_seen": 98740700, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.24853516, + "step": 4570, + "time_per_iteration": 2.798959255218506 + }, + { + "auxiliary_loss_clip": 0.01285946, + "auxiliary_loss_mlp": 0.01052087, + "balance_loss_clip": 1.16532719, + "balance_loss_mlp": 1.0272913, + "epoch": 0.2748233879452878, + "flos": 65965615274880.0, + "grad_norm": 0.7283752934738706, + "language_loss": 0.55874455, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.58212483, + "num_input_tokens_seen": 98803030, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.24707031, + "step": 4571, + "time_per_iteration": 3.4542109966278076 + }, + { + "auxiliary_loss_clip": 0.01540045, + "auxiliary_loss_mlp": 0.01046713, + "balance_loss_clip": 1.33485126, + "balance_loss_mlp": 1.02114236, + "epoch": 0.2748835111979558, + "flos": 17393133242880.0, + "grad_norm": 2.4801956600766855, + "language_loss": 0.78695858, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.81282616, + "num_input_tokens_seen": 98820505, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.25598145, + "step": 4572, + "time_per_iteration": 2.8242697715759277 + }, + { + "auxiliary_loss_clip": 0.01518509, + "auxiliary_loss_mlp": 0.0104233, + "balance_loss_clip": 1.32123899, + "balance_loss_mlp": 1.01888108, + "epoch": 0.27494363445062375, + "flos": 23598296167680.0, + "grad_norm": 1.5960862862018832, + "language_loss": 0.81963313, + "learning_rate": 3.402946971702147e-06, + "loss": 0.84524155, + "num_input_tokens_seen": 98842150, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.23413086, + "step": 4573, + "time_per_iteration": 2.8348841667175293 + }, + { + "auxiliary_loss_clip": 0.01522583, + "auxiliary_loss_mlp": 0.01042949, + "balance_loss_clip": 1.32340789, + "balance_loss_mlp": 1.01873767, + "epoch": 0.2750037577032918, + "flos": 17173260921600.0, + "grad_norm": 1.5907405401441665, + "language_loss": 0.80240077, + "learning_rate": 3.402669377496223e-06, + "loss": 0.8280561, + "num_input_tokens_seen": 98861050, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.2421875, + "step": 4574, + "time_per_iteration": 2.876026153564453 + }, + { + "auxiliary_loss_clip": 0.01539661, + "auxiliary_loss_mlp": 0.01048825, + "balance_loss_clip": 1.33535147, + "balance_loss_mlp": 1.02342129, + "epoch": 0.27506388095595974, + "flos": 24500842318080.0, + "grad_norm": 2.078946460073405, + "language_loss": 0.75125706, + "learning_rate": 3.402391730100936e-06, + "loss": 0.77714193, + "num_input_tokens_seen": 98879695, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.25415039, + "step": 4575, + "time_per_iteration": 2.8231585025787354 + }, + { + "auxiliary_loss_clip": 0.01522383, + "auxiliary_loss_mlp": 0.01043763, + "balance_loss_clip": 1.32447982, + "balance_loss_mlp": 1.01942098, + "epoch": 0.2751240042086277, + "flos": 38779132757760.0, + "grad_norm": 1.684254368231547, + "language_loss": 0.7247293, + "learning_rate": 3.402114029526814e-06, + "loss": 0.75039077, + "num_input_tokens_seen": 98902035, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.24365234, + "step": 4576, + "time_per_iteration": 2.973783493041992 + }, + { + "auxiliary_loss_clip": 0.01534371, + "auxiliary_loss_mlp": 0.01044451, + "balance_loss_clip": 1.33186245, + "balance_loss_mlp": 1.01895261, + "epoch": 0.27518412746129567, + "flos": 26918442466560.0, + "grad_norm": 1.7342720332718886, + "language_loss": 0.73989058, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.76567882, + "num_input_tokens_seen": 98921835, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.25524902, + "step": 4577, + "time_per_iteration": 2.880286931991577 + }, + { + "auxiliary_loss_clip": 0.01547554, + "auxiliary_loss_mlp": 0.01041013, + "balance_loss_clip": 1.34291625, + "balance_loss_mlp": 1.01631284, + "epoch": 0.27524425071396363, + "flos": 24911467292160.0, + "grad_norm": 1.8732453667425142, + "language_loss": 0.76524925, + "learning_rate": 3.401558468884188e-06, + "loss": 0.79113489, + "num_input_tokens_seen": 98939610, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.24682617, + "step": 4578, + "time_per_iteration": 2.832770586013794 + }, + { + "auxiliary_loss_clip": 0.01533945, + "auxiliary_loss_mlp": 0.01052344, + "balance_loss_clip": 1.33109498, + "balance_loss_mlp": 1.02491355, + "epoch": 0.2753043739666316, + "flos": 26299618329600.0, + "grad_norm": 1.5240506280315338, + "language_loss": 0.6728282, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.69869113, + "num_input_tokens_seen": 98962250, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.27392578, + "step": 4579, + "time_per_iteration": 2.9281535148620605 + }, + { + "auxiliary_loss_clip": 0.01543474, + "auxiliary_loss_mlp": 0.0104619, + "balance_loss_clip": 1.33831728, + "balance_loss_mlp": 1.02163315, + "epoch": 0.27536449721929956, + "flos": 24217391773440.0, + "grad_norm": 2.5914645399575007, + "language_loss": 0.8049382, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.83083487, + "num_input_tokens_seen": 98981845, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.24560547, + "step": 4580, + "time_per_iteration": 2.8311805725097656 + }, + { + "auxiliary_loss_clip": 0.01529526, + "auxiliary_loss_mlp": 0.01046478, + "balance_loss_clip": 1.32831049, + "balance_loss_mlp": 1.02018034, + "epoch": 0.27542462047196753, + "flos": 19546855862400.0, + "grad_norm": 6.709255179387057, + "language_loss": 0.68198246, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.70774251, + "num_input_tokens_seen": 99001855, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.26318359, + "step": 4581, + "time_per_iteration": 2.8250272274017334 + }, + { + "auxiliary_loss_clip": 0.01545279, + "auxiliary_loss_mlp": 0.01043098, + "balance_loss_clip": 1.33879662, + "balance_loss_mlp": 1.01873112, + "epoch": 0.2754847437246355, + "flos": 14327227330560.0, + "grad_norm": 1.9370423777982413, + "language_loss": 0.79335976, + "learning_rate": 3.400446709916392e-06, + "loss": 0.81924355, + "num_input_tokens_seen": 99019880, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.24353027, + "step": 4582, + "time_per_iteration": 2.801544427871704 + }, + { + "auxiliary_loss_clip": 0.01521041, + "auxiliary_loss_mlp": 0.01045505, + "balance_loss_clip": 1.32334006, + "balance_loss_mlp": 1.02106738, + "epoch": 0.27554486697730346, + "flos": 18846808030080.0, + "grad_norm": 2.2099114612687023, + "language_loss": 0.85225248, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.87791789, + "num_input_tokens_seen": 99037570, + "router_z_loss_clip": 1.97753906, + "router_z_loss_mlp": 0.24462891, + "step": 4583, + "time_per_iteration": 2.837780714035034 + }, + { + "auxiliary_loss_clip": 0.0156041, + "auxiliary_loss_mlp": 0.01045574, + "balance_loss_clip": 1.35399652, + "balance_loss_mlp": 1.02191091, + "epoch": 0.2756049902299714, + "flos": 22392391760640.0, + "grad_norm": 1.7174657876341697, + "language_loss": 0.68196785, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.70802772, + "num_input_tokens_seen": 99056875, + "router_z_loss_clip": 2.06054688, + "router_z_loss_mlp": 0.23669434, + "step": 4584, + "time_per_iteration": 2.856309175491333 + }, + { + "auxiliary_loss_clip": 0.01533395, + "auxiliary_loss_mlp": 0.010418, + "balance_loss_clip": 1.33317351, + "balance_loss_mlp": 1.01823211, + "epoch": 0.2756651134826394, + "flos": 19583531412480.0, + "grad_norm": 1.8677388301225768, + "language_loss": 0.77715826, + "learning_rate": 3.399612333050327e-06, + "loss": 0.80291021, + "num_input_tokens_seen": 99074685, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.23571777, + "step": 4585, + "time_per_iteration": 2.8364601135253906 + }, + { + "auxiliary_loss_clip": 0.01561923, + "auxiliary_loss_mlp": 0.01046082, + "balance_loss_clip": 1.35306692, + "balance_loss_mlp": 1.0217396, + "epoch": 0.27572523673530736, + "flos": 23597029313280.0, + "grad_norm": 2.2378575714728743, + "language_loss": 0.72696877, + "learning_rate": 3.399334101267362e-06, + "loss": 0.75304878, + "num_input_tokens_seen": 99095300, + "router_z_loss_clip": 2.08886719, + "router_z_loss_mlp": 0.24353027, + "step": 4586, + "time_per_iteration": 2.8900082111358643 + }, + { + "auxiliary_loss_clip": 0.01555409, + "auxiliary_loss_mlp": 0.01039547, + "balance_loss_clip": 1.35243404, + "balance_loss_mlp": 1.01439381, + "epoch": 0.2757853599879754, + "flos": 22830326611200.0, + "grad_norm": 1.548072796292089, + "language_loss": 0.81298763, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.83893716, + "num_input_tokens_seen": 99115965, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.25170898, + "step": 4587, + "time_per_iteration": 4.255146741867065 + }, + { + "auxiliary_loss_clip": 0.01544128, + "auxiliary_loss_mlp": 0.01042786, + "balance_loss_clip": 1.34242654, + "balance_loss_mlp": 1.01870573, + "epoch": 0.27584548324064334, + "flos": 18560914266240.0, + "grad_norm": 2.2646017902912092, + "language_loss": 0.83866489, + "learning_rate": 3.398777478523316e-06, + "loss": 0.86453402, + "num_input_tokens_seen": 99134265, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.24084473, + "step": 4588, + "time_per_iteration": 2.8282077312469482 + }, + { + "auxiliary_loss_clip": 0.01534408, + "auxiliary_loss_mlp": 0.01040199, + "balance_loss_clip": 1.33484697, + "balance_loss_mlp": 1.01564217, + "epoch": 0.2759056064933113, + "flos": 23780633287680.0, + "grad_norm": 1.2902929099492473, + "language_loss": 0.76221764, + "learning_rate": 3.398499087583342e-06, + "loss": 0.78796363, + "num_input_tokens_seen": 99156185, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.24572754, + "step": 4589, + "time_per_iteration": 2.879354953765869 + }, + { + "auxiliary_loss_clip": 0.01534495, + "auxiliary_loss_mlp": 0.01043122, + "balance_loss_clip": 1.33439922, + "balance_loss_mlp": 1.01728892, + "epoch": 0.27596572974597927, + "flos": 24293095603200.0, + "grad_norm": 2.048050114701066, + "language_loss": 0.89341879, + "learning_rate": 3.398220643612143e-06, + "loss": 0.91919494, + "num_input_tokens_seen": 99176735, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.25866699, + "step": 4590, + "time_per_iteration": 2.845214366912842 + }, + { + "auxiliary_loss_clip": 0.01545667, + "auxiliary_loss_mlp": 0.01039793, + "balance_loss_clip": 1.34106159, + "balance_loss_mlp": 1.01422286, + "epoch": 0.27602585299864724, + "flos": 35053881350400.0, + "grad_norm": 1.6500436613029796, + "language_loss": 0.72211373, + "learning_rate": 3.397942146620277e-06, + "loss": 0.74796832, + "num_input_tokens_seen": 99199765, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.25537109, + "step": 4591, + "time_per_iteration": 2.9673690795898438 + }, + { + "auxiliary_loss_clip": 0.01535457, + "auxiliary_loss_mlp": 0.01043206, + "balance_loss_clip": 1.33442307, + "balance_loss_mlp": 1.01968551, + "epoch": 0.2760859762513152, + "flos": 24318278974080.0, + "grad_norm": 2.183959393867276, + "language_loss": 0.80786479, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.83365142, + "num_input_tokens_seen": 99218435, + "router_z_loss_clip": 2.01074219, + "router_z_loss_mlp": 0.23535156, + "step": 4592, + "time_per_iteration": 2.818568468093872 + }, + { + "auxiliary_loss_clip": 0.01305967, + "auxiliary_loss_mlp": 0.0102847, + "balance_loss_clip": 1.18179369, + "balance_loss_mlp": 1.00119495, + "epoch": 0.27614609950398317, + "flos": 71289750591360.0, + "grad_norm": 0.7209575223253534, + "language_loss": 0.61669213, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.64003652, + "num_input_tokens_seen": 99276200, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.2734375, + "step": 4593, + "time_per_iteration": 3.3252077102661133 + }, + { + "auxiliary_loss_clip": 0.01539064, + "auxiliary_loss_mlp": 0.01041592, + "balance_loss_clip": 1.33638752, + "balance_loss_mlp": 1.01628363, + "epoch": 0.27620622275665113, + "flos": 29685650336640.0, + "grad_norm": 1.9582342426761867, + "language_loss": 0.78484404, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.81065059, + "num_input_tokens_seen": 99297625, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.2532959, + "step": 4594, + "time_per_iteration": 4.421769618988037 + }, + { + "auxiliary_loss_clip": 0.01537715, + "auxiliary_loss_mlp": 0.01043863, + "balance_loss_clip": 1.3358717, + "balance_loss_mlp": 1.01875794, + "epoch": 0.2762663460093191, + "flos": 15386610516480.0, + "grad_norm": 1.4470753332296886, + "language_loss": 0.92162383, + "learning_rate": 3.3968276286573866e-06, + "loss": 0.94743967, + "num_input_tokens_seen": 99315790, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.25097656, + "step": 4595, + "time_per_iteration": 2.8010053634643555 + }, + { + "auxiliary_loss_clip": 0.01568263, + "auxiliary_loss_mlp": 0.01050648, + "balance_loss_clip": 1.36113751, + "balance_loss_mlp": 1.02536392, + "epoch": 0.27632646926198706, + "flos": 20713777234560.0, + "grad_norm": 1.7271266298650194, + "language_loss": 0.69684005, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.7230292, + "num_input_tokens_seen": 99334615, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.25317383, + "step": 4596, + "time_per_iteration": 2.852928400039673 + }, + { + "auxiliary_loss_clip": 0.01567218, + "auxiliary_loss_mlp": 0.01045332, + "balance_loss_clip": 1.35689497, + "balance_loss_mlp": 1.02113271, + "epoch": 0.276386592514655, + "flos": 32825178817920.0, + "grad_norm": 2.6592327410852548, + "language_loss": 0.64976323, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.67588878, + "num_input_tokens_seen": 99356685, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.24206543, + "step": 4597, + "time_per_iteration": 4.394801139831543 + }, + { + "auxiliary_loss_clip": 0.01522115, + "auxiliary_loss_mlp": 0.01049137, + "balance_loss_clip": 1.32492149, + "balance_loss_mlp": 1.02291059, + "epoch": 0.276446715767323, + "flos": 18560914266240.0, + "grad_norm": 2.032945242425822, + "language_loss": 0.87233692, + "learning_rate": 3.395991183985887e-06, + "loss": 0.89804947, + "num_input_tokens_seen": 99374810, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.26245117, + "step": 4598, + "time_per_iteration": 2.804931402206421 + }, + { + "auxiliary_loss_clip": 0.01548839, + "auxiliary_loss_mlp": 0.01045242, + "balance_loss_clip": 1.34395301, + "balance_loss_mlp": 1.02035117, + "epoch": 0.27650683901999096, + "flos": 22829693184000.0, + "grad_norm": 2.968458842715942, + "language_loss": 0.80413496, + "learning_rate": 3.395712263209037e-06, + "loss": 0.83007574, + "num_input_tokens_seen": 99391290, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.24902344, + "step": 4599, + "time_per_iteration": 4.319958448410034 + }, + { + "auxiliary_loss_clip": 0.01557726, + "auxiliary_loss_mlp": 0.01048253, + "balance_loss_clip": 1.34716046, + "balance_loss_mlp": 1.02400565, + "epoch": 0.276566962272659, + "flos": 21371584406400.0, + "grad_norm": 1.7698198595303798, + "language_loss": 0.79889119, + "learning_rate": 3.395433289506639e-06, + "loss": 0.82495093, + "num_input_tokens_seen": 99409120, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.24255371, + "step": 4600, + "time_per_iteration": 2.8466014862060547 + }, + { + "auxiliary_loss_clip": 0.01543212, + "auxiliary_loss_mlp": 0.01044938, + "balance_loss_clip": 1.33616173, + "balance_loss_mlp": 1.01978493, + "epoch": 0.27662708552532694, + "flos": 17719095916800.0, + "grad_norm": 5.9980200279512985, + "language_loss": 0.74319249, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.76907396, + "num_input_tokens_seen": 99426180, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.25146484, + "step": 4601, + "time_per_iteration": 2.8012125492095947 + }, + { + "auxiliary_loss_clip": 0.01539215, + "auxiliary_loss_mlp": 0.01047667, + "balance_loss_clip": 1.3354398, + "balance_loss_mlp": 1.0234201, + "epoch": 0.2766872087779949, + "flos": 21262915100160.0, + "grad_norm": 1.6175638417207572, + "language_loss": 0.80865705, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.83452588, + "num_input_tokens_seen": 99447720, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.24255371, + "step": 4602, + "time_per_iteration": 2.8983089923858643 + }, + { + "auxiliary_loss_clip": 0.01557491, + "auxiliary_loss_mlp": 0.0105394, + "balance_loss_clip": 1.34823811, + "balance_loss_mlp": 1.02895415, + "epoch": 0.2767473320306629, + "flos": 12939393006720.0, + "grad_norm": 2.0707761106692626, + "language_loss": 0.77752101, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.80363536, + "num_input_tokens_seen": 99464720, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.25, + "step": 4603, + "time_per_iteration": 2.8350472450256348 + }, + { + "auxiliary_loss_clip": 0.01524923, + "auxiliary_loss_mlp": 0.01042199, + "balance_loss_clip": 1.32448781, + "balance_loss_mlp": 1.01809502, + "epoch": 0.27680745528333084, + "flos": 15021212359680.0, + "grad_norm": 1.662229647945683, + "language_loss": 0.82384354, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.84951472, + "num_input_tokens_seen": 99482310, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.24084473, + "step": 4604, + "time_per_iteration": 2.8759212493896484 + }, + { + "auxiliary_loss_clip": 0.01536921, + "auxiliary_loss_mlp": 0.01040016, + "balance_loss_clip": 1.3306849, + "balance_loss_mlp": 1.01513731, + "epoch": 0.2768675785359988, + "flos": 22648034736000.0, + "grad_norm": 1.7252303119729349, + "language_loss": 0.70884854, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.73461795, + "num_input_tokens_seen": 99501255, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.24890137, + "step": 4605, + "time_per_iteration": 2.8840291500091553 + }, + { + "auxiliary_loss_clip": 0.01314143, + "auxiliary_loss_mlp": 0.01050199, + "balance_loss_clip": 1.18564272, + "balance_loss_mlp": 1.0261668, + "epoch": 0.27692770178866677, + "flos": 66161933038080.0, + "grad_norm": 0.7074546394580616, + "language_loss": 0.57260883, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.59625226, + "num_input_tokens_seen": 99568925, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.24023438, + "step": 4606, + "time_per_iteration": 3.4279680252075195 + }, + { + "auxiliary_loss_clip": 0.01551929, + "auxiliary_loss_mlp": 0.01044134, + "balance_loss_clip": 1.34276605, + "balance_loss_mlp": 1.0177176, + "epoch": 0.27698782504133473, + "flos": 26475168729600.0, + "grad_norm": 1.8599510979991014, + "language_loss": 0.69864291, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.72460353, + "num_input_tokens_seen": 99588455, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.26416016, + "step": 4607, + "time_per_iteration": 2.9115915298461914 + }, + { + "auxiliary_loss_clip": 0.01514675, + "auxiliary_loss_mlp": 0.01042328, + "balance_loss_clip": 1.31645417, + "balance_loss_mlp": 1.01802111, + "epoch": 0.2770479482940027, + "flos": 25895780075520.0, + "grad_norm": 1.672784807577903, + "language_loss": 0.70592916, + "learning_rate": 3.393199595837555e-06, + "loss": 0.7314992, + "num_input_tokens_seen": 99609355, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.24304199, + "step": 4608, + "time_per_iteration": 2.885216236114502 + }, + { + "auxiliary_loss_clip": 0.01540832, + "auxiliary_loss_mlp": 0.01048923, + "balance_loss_clip": 1.33609223, + "balance_loss_mlp": 1.02413917, + "epoch": 0.27710807154667066, + "flos": 22867635588480.0, + "grad_norm": 2.987021878169039, + "language_loss": 0.73727334, + "learning_rate": 3.392920146281499e-06, + "loss": 0.76317096, + "num_input_tokens_seen": 99628780, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.2479248, + "step": 4609, + "time_per_iteration": 2.8323919773101807 + }, + { + "auxiliary_loss_clip": 0.01538839, + "auxiliary_loss_mlp": 0.01057802, + "balance_loss_clip": 1.33321285, + "balance_loss_mlp": 1.03179085, + "epoch": 0.27716819479933863, + "flos": 17719684099200.0, + "grad_norm": 2.640414336835422, + "language_loss": 0.85046774, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.87643421, + "num_input_tokens_seen": 99644545, + "router_z_loss_clip": 2.05566406, + "router_z_loss_mlp": 0.26025391, + "step": 4610, + "time_per_iteration": 2.806337833404541 + }, + { + "auxiliary_loss_clip": 0.01548114, + "auxiliary_loss_mlp": 0.01060704, + "balance_loss_clip": 1.34016192, + "balance_loss_mlp": 1.03491926, + "epoch": 0.2772283180520066, + "flos": 19655479923840.0, + "grad_norm": 2.1286993464098027, + "language_loss": 0.70774925, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.73383743, + "num_input_tokens_seen": 99663125, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.25817871, + "step": 4611, + "time_per_iteration": 2.7924163341522217 + }, + { + "auxiliary_loss_clip": 0.01520174, + "auxiliary_loss_mlp": 0.0105426, + "balance_loss_clip": 1.32112646, + "balance_loss_mlp": 1.02959585, + "epoch": 0.27728844130467456, + "flos": 21042454596480.0, + "grad_norm": 2.4278464311670085, + "language_loss": 0.75246328, + "learning_rate": 3.392081480737698e-06, + "loss": 0.77820766, + "num_input_tokens_seen": 99682645, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.24658203, + "step": 4612, + "time_per_iteration": 2.869947671890259 + }, + { + "auxiliary_loss_clip": 0.01538048, + "auxiliary_loss_mlp": 0.01061051, + "balance_loss_clip": 1.33162093, + "balance_loss_mlp": 1.03561211, + "epoch": 0.2773485645573425, + "flos": 18998441913600.0, + "grad_norm": 2.0319390717385617, + "language_loss": 0.67448604, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.700477, + "num_input_tokens_seen": 99700520, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.25427246, + "step": 4613, + "time_per_iteration": 2.8175766468048096 + }, + { + "auxiliary_loss_clip": 0.01523435, + "auxiliary_loss_mlp": 0.01060593, + "balance_loss_clip": 1.3225956, + "balance_loss_mlp": 1.03552318, + "epoch": 0.27740868781001055, + "flos": 21477855738240.0, + "grad_norm": 1.7541154415001636, + "language_loss": 0.80227017, + "learning_rate": 3.39152210641815e-06, + "loss": 0.8281104, + "num_input_tokens_seen": 99720355, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.25085449, + "step": 4614, + "time_per_iteration": 2.8796653747558594 + }, + { + "auxiliary_loss_clip": 0.01533911, + "auxiliary_loss_mlp": 0.01072058, + "balance_loss_clip": 1.32894158, + "balance_loss_mlp": 1.04729831, + "epoch": 0.2774688110626785, + "flos": 19837138371840.0, + "grad_norm": 2.743448438737496, + "language_loss": 0.82102746, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.84708714, + "num_input_tokens_seen": 99736090, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.24755859, + "step": 4615, + "time_per_iteration": 2.8705389499664307 + }, + { + "auxiliary_loss_clip": 0.01545491, + "auxiliary_loss_mlp": 0.01067086, + "balance_loss_clip": 1.33725977, + "balance_loss_mlp": 1.04133677, + "epoch": 0.2775289343153465, + "flos": 18223504657920.0, + "grad_norm": 2.579180398463482, + "language_loss": 0.65234613, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.67847192, + "num_input_tokens_seen": 99751805, + "router_z_loss_clip": 2.0859375, + "router_z_loss_mlp": 0.25744629, + "step": 4616, + "time_per_iteration": 2.7636282444000244 + }, + { + "auxiliary_loss_clip": 0.01541605, + "auxiliary_loss_mlp": 0.01054772, + "balance_loss_clip": 1.33600295, + "balance_loss_mlp": 1.03027439, + "epoch": 0.27758905756801444, + "flos": 16481266663680.0, + "grad_norm": 1.9163970095237182, + "language_loss": 0.83205897, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.85802281, + "num_input_tokens_seen": 99770610, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.24487305, + "step": 4617, + "time_per_iteration": 2.8285861015319824 + }, + { + "auxiliary_loss_clip": 0.01537357, + "auxiliary_loss_mlp": 0.01062537, + "balance_loss_clip": 1.33036554, + "balance_loss_mlp": 1.03670382, + "epoch": 0.2776491808206824, + "flos": 18735966973440.0, + "grad_norm": 2.506602551667203, + "language_loss": 0.78070891, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.80670786, + "num_input_tokens_seen": 99787305, + "router_z_loss_clip": 2.07226562, + "router_z_loss_mlp": 0.25842285, + "step": 4618, + "time_per_iteration": 2.790485382080078 + }, + { + "auxiliary_loss_clip": 0.01540877, + "auxiliary_loss_mlp": 0.01050244, + "balance_loss_clip": 1.3354857, + "balance_loss_mlp": 1.02503157, + "epoch": 0.27770930407335037, + "flos": 28049909898240.0, + "grad_norm": 2.0981497819521, + "language_loss": 0.85987276, + "learning_rate": 3.390122747388459e-06, + "loss": 0.88578397, + "num_input_tokens_seen": 99808940, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.25231934, + "step": 4619, + "time_per_iteration": 2.9171717166900635 + }, + { + "auxiliary_loss_clip": 0.01518828, + "auxiliary_loss_mlp": 0.01048402, + "balance_loss_clip": 1.32036698, + "balance_loss_mlp": 1.02440488, + "epoch": 0.27776942732601834, + "flos": 23560218028800.0, + "grad_norm": 1.8295519990077045, + "language_loss": 0.77560079, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.80127305, + "num_input_tokens_seen": 99829575, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.2401123, + "step": 4620, + "time_per_iteration": 2.8868675231933594 + }, + { + "auxiliary_loss_clip": 0.01522896, + "auxiliary_loss_mlp": 0.01042262, + "balance_loss_clip": 1.32157016, + "balance_loss_mlp": 1.01677561, + "epoch": 0.2778295505786863, + "flos": 23917653100800.0, + "grad_norm": 2.114915792714337, + "language_loss": 0.79546845, + "learning_rate": 3.389562634707122e-06, + "loss": 0.82112002, + "num_input_tokens_seen": 99847575, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.25488281, + "step": 4621, + "time_per_iteration": 2.855079412460327 + }, + { + "auxiliary_loss_clip": 0.01540873, + "auxiliary_loss_mlp": 0.01046759, + "balance_loss_clip": 1.33473682, + "balance_loss_mlp": 1.02120018, + "epoch": 0.27788967383135427, + "flos": 25564976208000.0, + "grad_norm": 2.6595836273293165, + "language_loss": 0.88628858, + "learning_rate": 3.389282499322611e-06, + "loss": 0.91216493, + "num_input_tokens_seen": 99864995, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.25598145, + "step": 4622, + "time_per_iteration": 4.297405242919922 + }, + { + "auxiliary_loss_clip": 0.01535429, + "auxiliary_loss_mlp": 0.01043658, + "balance_loss_clip": 1.33021593, + "balance_loss_mlp": 1.01881456, + "epoch": 0.27794979708402223, + "flos": 16261122873600.0, + "grad_norm": 2.1568256824721326, + "language_loss": 0.82272172, + "learning_rate": 3.389002311256369e-06, + "loss": 0.84851265, + "num_input_tokens_seen": 99881540, + "router_z_loss_clip": 2.05371094, + "router_z_loss_mlp": 0.24865723, + "step": 4623, + "time_per_iteration": 2.794316053390503 + }, + { + "auxiliary_loss_clip": 0.01543727, + "auxiliary_loss_mlp": 0.0105039, + "balance_loss_clip": 1.33947849, + "balance_loss_mlp": 1.02506983, + "epoch": 0.2780099203366902, + "flos": 20677327908480.0, + "grad_norm": 2.0751413988873972, + "language_loss": 0.82842803, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.85436916, + "num_input_tokens_seen": 99899595, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.2532959, + "step": 4624, + "time_per_iteration": 2.864180326461792 + }, + { + "auxiliary_loss_clip": 0.01543499, + "auxiliary_loss_mlp": 0.01038562, + "balance_loss_clip": 1.3402791, + "balance_loss_mlp": 1.013659, + "epoch": 0.27807004358935816, + "flos": 17746134324480.0, + "grad_norm": 2.43931145898085, + "language_loss": 0.77130038, + "learning_rate": 3.388441777121191e-06, + "loss": 0.79712105, + "num_input_tokens_seen": 99913020, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.24914551, + "step": 4625, + "time_per_iteration": 2.9032764434814453 + }, + { + "auxiliary_loss_clip": 0.01528323, + "auxiliary_loss_mlp": 0.01042368, + "balance_loss_clip": 1.32621503, + "balance_loss_mlp": 1.01677394, + "epoch": 0.2781301668420261, + "flos": 16735778519040.0, + "grad_norm": 1.8912341641458437, + "language_loss": 0.70539951, + "learning_rate": 3.388161431073511e-06, + "loss": 0.73110646, + "num_input_tokens_seen": 99931405, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.25598145, + "step": 4626, + "time_per_iteration": 2.8291382789611816 + }, + { + "auxiliary_loss_clip": 0.0155299, + "auxiliary_loss_mlp": 0.01043955, + "balance_loss_clip": 1.34429312, + "balance_loss_mlp": 1.01813436, + "epoch": 0.27819029009469415, + "flos": 13852662174720.0, + "grad_norm": 2.3986917234663974, + "language_loss": 0.94300973, + "learning_rate": 3.38788103238661e-06, + "loss": 0.96897924, + "num_input_tokens_seen": 99948100, + "router_z_loss_clip": 2.08886719, + "router_z_loss_mlp": 0.25842285, + "step": 4627, + "time_per_iteration": 2.877378463745117 + }, + { + "auxiliary_loss_clip": 0.01546987, + "auxiliary_loss_mlp": 0.01038903, + "balance_loss_clip": 1.33910179, + "balance_loss_mlp": 1.0145365, + "epoch": 0.2782504133473621, + "flos": 27100689096960.0, + "grad_norm": 1.9092315929673354, + "language_loss": 0.86144537, + "learning_rate": 3.387600581071121e-06, + "loss": 0.88730431, + "num_input_tokens_seen": 99966470, + "router_z_loss_clip": 2.08007812, + "router_z_loss_mlp": 0.24377441, + "step": 4628, + "time_per_iteration": 2.883159637451172 + }, + { + "auxiliary_loss_clip": 0.01532752, + "auxiliary_loss_mlp": 0.01048003, + "balance_loss_clip": 1.32973945, + "balance_loss_mlp": 1.02318335, + "epoch": 0.2783105366000301, + "flos": 21078587208960.0, + "grad_norm": 1.4904598099886661, + "language_loss": 0.79874563, + "learning_rate": 3.387320077137679e-06, + "loss": 0.82455313, + "num_input_tokens_seen": 99985930, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.24841309, + "step": 4629, + "time_per_iteration": 4.251392602920532 + }, + { + "auxiliary_loss_clip": 0.01509567, + "auxiliary_loss_mlp": 0.01048994, + "balance_loss_clip": 1.31459594, + "balance_loss_mlp": 1.02422214, + "epoch": 0.27837065985269804, + "flos": 26512115748480.0, + "grad_norm": 1.4693772870595243, + "language_loss": 0.85303301, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.8786186, + "num_input_tokens_seen": 100006235, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.24768066, + "step": 4630, + "time_per_iteration": 2.9031822681427 + }, + { + "auxiliary_loss_clip": 0.01545443, + "auxiliary_loss_mlp": 0.01048075, + "balance_loss_clip": 1.33971214, + "balance_loss_mlp": 1.02231359, + "epoch": 0.278430783105366, + "flos": 20231068014720.0, + "grad_norm": 2.124326738510666, + "language_loss": 0.82340109, + "learning_rate": 3.386758911459485e-06, + "loss": 0.84933627, + "num_input_tokens_seen": 100023655, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.25793457, + "step": 4631, + "time_per_iteration": 2.7962851524353027 + }, + { + "auxiliary_loss_clip": 0.0154781, + "auxiliary_loss_mlp": 0.01046113, + "balance_loss_clip": 1.34119439, + "balance_loss_mlp": 1.02134132, + "epoch": 0.278490906358034, + "flos": 25603778263680.0, + "grad_norm": 1.6455193491755362, + "language_loss": 0.7216841, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.74762332, + "num_input_tokens_seen": 100043280, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.24768066, + "step": 4632, + "time_per_iteration": 4.245196580886841 + }, + { + "auxiliary_loss_clip": 0.01517516, + "auxiliary_loss_mlp": 0.01043686, + "balance_loss_clip": 1.32038212, + "balance_loss_mlp": 1.01804388, + "epoch": 0.27855102961070194, + "flos": 16177908407040.0, + "grad_norm": 1.7271435369225177, + "language_loss": 0.82668477, + "learning_rate": 3.386197535437145e-06, + "loss": 0.85229683, + "num_input_tokens_seen": 100057690, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.25622559, + "step": 4633, + "time_per_iteration": 2.819655418395996 + }, + { + "auxiliary_loss_clip": 0.01531074, + "auxiliary_loss_mlp": 0.01044535, + "balance_loss_clip": 1.32778645, + "balance_loss_mlp": 1.01888108, + "epoch": 0.2786111528633699, + "flos": 22937321859840.0, + "grad_norm": 1.946099015580353, + "language_loss": 0.88504589, + "learning_rate": 3.385916768573529e-06, + "loss": 0.91080201, + "num_input_tokens_seen": 100075875, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.25683594, + "step": 4634, + "time_per_iteration": 4.304944038391113 + }, + { + "auxiliary_loss_clip": 0.01552665, + "auxiliary_loss_mlp": 0.01045972, + "balance_loss_clip": 1.34614277, + "balance_loss_mlp": 1.01973379, + "epoch": 0.27867127611603787, + "flos": 23414646948480.0, + "grad_norm": 3.280566655690463, + "language_loss": 0.77230716, + "learning_rate": 3.38563594915581e-06, + "loss": 0.79829353, + "num_input_tokens_seen": 100092930, + "router_z_loss_clip": 2.06640625, + "router_z_loss_mlp": 0.2623291, + "step": 4635, + "time_per_iteration": 2.887576103210449 + }, + { + "auxiliary_loss_clip": 0.01544783, + "auxiliary_loss_mlp": 0.01047753, + "balance_loss_clip": 1.33936, + "balance_loss_mlp": 1.02243316, + "epoch": 0.27873139936870583, + "flos": 19838631450240.0, + "grad_norm": 1.5570154569175416, + "language_loss": 0.66323161, + "learning_rate": 3.385355077194637e-06, + "loss": 0.68915701, + "num_input_tokens_seen": 100110790, + "router_z_loss_clip": 2.05566406, + "router_z_loss_mlp": 0.25317383, + "step": 4636, + "time_per_iteration": 2.8910937309265137 + }, + { + "auxiliary_loss_clip": 0.0155913, + "auxiliary_loss_mlp": 0.01043839, + "balance_loss_clip": 1.34904361, + "balance_loss_mlp": 1.01770782, + "epoch": 0.2787915226213738, + "flos": 17715385843200.0, + "grad_norm": 3.3214328754646094, + "language_loss": 0.85630822, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.88233793, + "num_input_tokens_seen": 100126970, + "router_z_loss_clip": 2.10351562, + "router_z_loss_mlp": 0.26171875, + "step": 4637, + "time_per_iteration": 2.7995731830596924 + }, + { + "auxiliary_loss_clip": 0.0153495, + "auxiliary_loss_mlp": 0.01039982, + "balance_loss_clip": 1.33287501, + "balance_loss_mlp": 1.01574683, + "epoch": 0.27885164587404176, + "flos": 22100616172800.0, + "grad_norm": 1.5400894595827943, + "language_loss": 0.76772499, + "learning_rate": 3.384793175684533e-06, + "loss": 0.79347426, + "num_input_tokens_seen": 100146720, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.24243164, + "step": 4638, + "time_per_iteration": 2.901843309402466 + }, + { + "auxiliary_loss_clip": 0.01536147, + "auxiliary_loss_mlp": 0.01046521, + "balance_loss_clip": 1.33113837, + "balance_loss_mlp": 1.02191663, + "epoch": 0.27891176912670973, + "flos": 19216866401280.0, + "grad_norm": 5.793029568572898, + "language_loss": 0.72729164, + "learning_rate": 3.38451214615691e-06, + "loss": 0.7531184, + "num_input_tokens_seen": 100165920, + "router_z_loss_clip": 2.04882812, + "router_z_loss_mlp": 0.24597168, + "step": 4639, + "time_per_iteration": 2.833029270172119 + }, + { + "auxiliary_loss_clip": 0.01549509, + "auxiliary_loss_mlp": 0.01042967, + "balance_loss_clip": 1.34194875, + "balance_loss_mlp": 1.01767087, + "epoch": 0.27897189237937775, + "flos": 27611522599680.0, + "grad_norm": 5.232036838104929, + "language_loss": 0.67072976, + "learning_rate": 3.384231064128447e-06, + "loss": 0.69665456, + "num_input_tokens_seen": 100185525, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.25305176, + "step": 4640, + "time_per_iteration": 2.885321617126465 + }, + { + "auxiliary_loss_clip": 0.01542868, + "auxiliary_loss_mlp": 0.01040934, + "balance_loss_clip": 1.33745182, + "balance_loss_mlp": 1.01734233, + "epoch": 0.2790320156320457, + "flos": 21187980432000.0, + "grad_norm": 1.8938355614234992, + "language_loss": 0.7321893, + "learning_rate": 3.383949929609804e-06, + "loss": 0.75802732, + "num_input_tokens_seen": 100204850, + "router_z_loss_clip": 2.05371094, + "router_z_loss_mlp": 0.23596191, + "step": 4641, + "time_per_iteration": 2.919515609741211 + }, + { + "auxiliary_loss_clip": 0.01553053, + "auxiliary_loss_mlp": 0.01047358, + "balance_loss_clip": 1.34426117, + "balance_loss_mlp": 1.02252674, + "epoch": 0.2790921388847137, + "flos": 22794193998720.0, + "grad_norm": 1.700644699238442, + "language_loss": 0.75840861, + "learning_rate": 3.383668742611641e-06, + "loss": 0.78441274, + "num_input_tokens_seen": 100224520, + "router_z_loss_clip": 2.08984375, + "router_z_loss_mlp": 0.24829102, + "step": 4642, + "time_per_iteration": 2.8536155223846436 + }, + { + "auxiliary_loss_clip": 0.01548953, + "auxiliary_loss_mlp": 0.01043175, + "balance_loss_clip": 1.34388447, + "balance_loss_mlp": 1.01858199, + "epoch": 0.27915226213738165, + "flos": 23410167713280.0, + "grad_norm": 1.7938251560830922, + "language_loss": 0.8660773, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.89199859, + "num_input_tokens_seen": 100243935, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.24584961, + "step": 4643, + "time_per_iteration": 2.8598272800445557 + }, + { + "auxiliary_loss_clip": 0.01544538, + "auxiliary_loss_mlp": 0.01050137, + "balance_loss_clip": 1.33976865, + "balance_loss_mlp": 1.02671278, + "epoch": 0.2792123853900496, + "flos": 22758151875840.0, + "grad_norm": 3.5731725664139002, + "language_loss": 0.83688301, + "learning_rate": 3.383106211219407e-06, + "loss": 0.8628298, + "num_input_tokens_seen": 100262290, + "router_z_loss_clip": 2.04785156, + "router_z_loss_mlp": 0.23400879, + "step": 4644, + "time_per_iteration": 2.8783676624298096 + }, + { + "auxiliary_loss_clip": 0.01545827, + "auxiliary_loss_mlp": 0.01048904, + "balance_loss_clip": 1.3401103, + "balance_loss_mlp": 1.02456164, + "epoch": 0.2792725086427176, + "flos": 15057752175360.0, + "grad_norm": 1.9364088351056048, + "language_loss": 0.79577458, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.82172191, + "num_input_tokens_seen": 100280015, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.24353027, + "step": 4645, + "time_per_iteration": 2.861405849456787 + }, + { + "auxiliary_loss_clip": 0.01322871, + "auxiliary_loss_mlp": 0.011084, + "balance_loss_clip": 1.19682479, + "balance_loss_mlp": 1.08226895, + "epoch": 0.27933263189538554, + "flos": 62572706058240.0, + "grad_norm": 0.7973396276459345, + "language_loss": 0.62302721, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.64733994, + "num_input_tokens_seen": 100338935, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.26171875, + "step": 4646, + "time_per_iteration": 3.3534772396087646 + }, + { + "auxiliary_loss_clip": 0.01528596, + "auxiliary_loss_mlp": 0.01047204, + "balance_loss_clip": 1.32900679, + "balance_loss_mlp": 1.02430367, + "epoch": 0.2793927551480535, + "flos": 25129077373440.0, + "grad_norm": 1.5622114300692187, + "language_loss": 0.9004631, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.92622113, + "num_input_tokens_seen": 100359905, + "router_z_loss_clip": 1.99316406, + "router_z_loss_mlp": 0.22924805, + "step": 4647, + "time_per_iteration": 2.863480567932129 + }, + { + "auxiliary_loss_clip": 0.01544224, + "auxiliary_loss_mlp": 0.0104505, + "balance_loss_clip": 1.33771777, + "balance_loss_mlp": 1.01939619, + "epoch": 0.27945287840072147, + "flos": 21334501653120.0, + "grad_norm": 1.840238865979896, + "language_loss": 0.87372589, + "learning_rate": 3.381980519149988e-06, + "loss": 0.89961863, + "num_input_tokens_seen": 100376955, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.2565918, + "step": 4648, + "time_per_iteration": 2.9075167179107666 + }, + { + "auxiliary_loss_clip": 0.01536671, + "auxiliary_loss_mlp": 0.01042887, + "balance_loss_clip": 1.3294642, + "balance_loss_mlp": 1.01847339, + "epoch": 0.27951300165338944, + "flos": 27461110325760.0, + "grad_norm": 2.023937686825664, + "language_loss": 0.73908085, + "learning_rate": 3.38169896509385e-06, + "loss": 0.76487643, + "num_input_tokens_seen": 100397545, + "router_z_loss_clip": 2.06933594, + "router_z_loss_mlp": 0.24401855, + "step": 4649, + "time_per_iteration": 2.8961830139160156 + }, + { + "auxiliary_loss_clip": 0.01518811, + "auxiliary_loss_mlp": 0.01047343, + "balance_loss_clip": 1.31805754, + "balance_loss_mlp": 1.02240491, + "epoch": 0.2795731249060574, + "flos": 15167462112000.0, + "grad_norm": 2.3892794451258843, + "language_loss": 0.82700133, + "learning_rate": 3.381417358643549e-06, + "loss": 0.85266292, + "num_input_tokens_seen": 100415080, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.24926758, + "step": 4650, + "time_per_iteration": 2.867680311203003 + }, + { + "auxiliary_loss_clip": 0.01330618, + "auxiliary_loss_mlp": 0.01052936, + "balance_loss_clip": 1.20462823, + "balance_loss_mlp": 1.02661467, + "epoch": 0.27963324815872537, + "flos": 60152454449280.0, + "grad_norm": 0.8271797006925427, + "language_loss": 0.58898622, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.61282176, + "num_input_tokens_seen": 100471105, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.26367188, + "step": 4651, + "time_per_iteration": 3.4177563190460205 + }, + { + "auxiliary_loss_clip": 0.01540292, + "auxiliary_loss_mlp": 0.01042089, + "balance_loss_clip": 1.3327651, + "balance_loss_mlp": 1.0173533, + "epoch": 0.27969337141139333, + "flos": 21776915738880.0, + "grad_norm": 1.68135448337319, + "language_loss": 0.74842942, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.77425325, + "num_input_tokens_seen": 100492520, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.24755859, + "step": 4652, + "time_per_iteration": 2.8721179962158203 + }, + { + "auxiliary_loss_clip": 0.01544613, + "auxiliary_loss_mlp": 0.01050895, + "balance_loss_clip": 1.33875394, + "balance_loss_mlp": 1.02515721, + "epoch": 0.27975349466406135, + "flos": 39864061272960.0, + "grad_norm": 2.1875686962143894, + "language_loss": 0.8071149, + "learning_rate": 3.380572225034461e-06, + "loss": 0.83306998, + "num_input_tokens_seen": 100512870, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.25744629, + "step": 4653, + "time_per_iteration": 2.997776508331299 + }, + { + "auxiliary_loss_clip": 0.0153355, + "auxiliary_loss_mlp": 0.01049638, + "balance_loss_clip": 1.33197021, + "balance_loss_mlp": 1.02413869, + "epoch": 0.2798136179167293, + "flos": 21589737425280.0, + "grad_norm": 4.3015701274963725, + "language_loss": 0.79453504, + "learning_rate": 3.380290409114312e-06, + "loss": 0.82036692, + "num_input_tokens_seen": 100531655, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.25488281, + "step": 4654, + "time_per_iteration": 2.854058027267456 + }, + { + "auxiliary_loss_clip": 0.0155977, + "auxiliary_loss_mlp": 0.01038234, + "balance_loss_clip": 1.34991658, + "balance_loss_mlp": 1.01386809, + "epoch": 0.2798737411693973, + "flos": 21546139420800.0, + "grad_norm": 3.5504370601875594, + "language_loss": 0.81556749, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.84154761, + "num_input_tokens_seen": 100548005, + "router_z_loss_clip": 2.10253906, + "router_z_loss_mlp": 0.24377441, + "step": 4655, + "time_per_iteration": 2.8716318607330322 + }, + { + "auxiliary_loss_clip": 0.01533665, + "auxiliary_loss_mlp": 0.01043507, + "balance_loss_clip": 1.33143592, + "balance_loss_mlp": 1.01842535, + "epoch": 0.27993386442206525, + "flos": 26992743707520.0, + "grad_norm": 2.008666765525074, + "language_loss": 0.82256258, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.84833431, + "num_input_tokens_seen": 100567980, + "router_z_loss_clip": 2.01855469, + "router_z_loss_mlp": 0.25073242, + "step": 4656, + "time_per_iteration": 2.903451442718506 + }, + { + "auxiliary_loss_clip": 0.01524464, + "auxiliary_loss_mlp": 0.01046749, + "balance_loss_clip": 1.32274771, + "balance_loss_mlp": 1.02201319, + "epoch": 0.2799939876747332, + "flos": 24359795717760.0, + "grad_norm": 1.7331837113931718, + "language_loss": 0.83489799, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.86061013, + "num_input_tokens_seen": 100588630, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.24731445, + "step": 4657, + "time_per_iteration": 4.387537479400635 + }, + { + "auxiliary_loss_clip": 0.01532414, + "auxiliary_loss_mlp": 0.01042696, + "balance_loss_clip": 1.33038568, + "balance_loss_mlp": 1.01868725, + "epoch": 0.2800541109274012, + "flos": 33670164303360.0, + "grad_norm": 2.3015731500631307, + "language_loss": 0.65082824, + "learning_rate": 3.379162622133105e-06, + "loss": 0.67657936, + "num_input_tokens_seen": 100608775, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.23999023, + "step": 4658, + "time_per_iteration": 2.9561588764190674 + }, + { + "auxiliary_loss_clip": 0.01537235, + "auxiliary_loss_mlp": 0.01042702, + "balance_loss_clip": 1.33401549, + "balance_loss_mlp": 1.01764441, + "epoch": 0.28011423418006914, + "flos": 21623879266560.0, + "grad_norm": 1.7010338918134356, + "language_loss": 0.78942442, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.81522381, + "num_input_tokens_seen": 100627975, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.25048828, + "step": 4659, + "time_per_iteration": 2.84621262550354 + }, + { + "auxiliary_loss_clip": 0.01532893, + "auxiliary_loss_mlp": 0.01043276, + "balance_loss_clip": 1.33214092, + "balance_loss_mlp": 1.01952934, + "epoch": 0.2801743574327371, + "flos": 23122826115840.0, + "grad_norm": 1.7197859378833344, + "language_loss": 0.79932857, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.82509029, + "num_input_tokens_seen": 100645430, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.23742676, + "step": 4660, + "time_per_iteration": 2.8436808586120605 + }, + { + "auxiliary_loss_clip": 0.01511795, + "auxiliary_loss_mlp": 0.0104297, + "balance_loss_clip": 1.31515467, + "balance_loss_mlp": 1.01950979, + "epoch": 0.2802344806854051, + "flos": 12649200986880.0, + "grad_norm": 2.5080872965318215, + "language_loss": 0.81914425, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.84469187, + "num_input_tokens_seen": 100663775, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.23474121, + "step": 4661, + "time_per_iteration": 2.9237582683563232 + }, + { + "auxiliary_loss_clip": 0.01564107, + "auxiliary_loss_mlp": 0.01045328, + "balance_loss_clip": 1.36132216, + "balance_loss_mlp": 1.0206871, + "epoch": 0.28029460393807304, + "flos": 37281678986880.0, + "grad_norm": 1.585671035263152, + "language_loss": 0.79747635, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.82357067, + "num_input_tokens_seen": 100686085, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.24658203, + "step": 4662, + "time_per_iteration": 2.99454927444458 + }, + { + "auxiliary_loss_clip": 0.01541011, + "auxiliary_loss_mlp": 0.01046194, + "balance_loss_clip": 1.3343209, + "balance_loss_mlp": 1.02051616, + "epoch": 0.280354727190741, + "flos": 20750950477440.0, + "grad_norm": 2.159628797626251, + "language_loss": 0.70753348, + "learning_rate": 3.377751711782227e-06, + "loss": 0.73340547, + "num_input_tokens_seen": 100705135, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.25695801, + "step": 4663, + "time_per_iteration": 2.9657039642333984 + }, + { + "auxiliary_loss_clip": 0.01551369, + "auxiliary_loss_mlp": 0.01054287, + "balance_loss_clip": 1.34585166, + "balance_loss_mlp": 1.0276556, + "epoch": 0.28041485044340897, + "flos": 21481113363840.0, + "grad_norm": 1.7764748517804245, + "language_loss": 0.78708601, + "learning_rate": 3.377469372935791e-06, + "loss": 0.81314254, + "num_input_tokens_seen": 100724960, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.26647949, + "step": 4664, + "time_per_iteration": 4.294102430343628 + }, + { + "auxiliary_loss_clip": 0.01518802, + "auxiliary_loss_mlp": 0.01047013, + "balance_loss_clip": 1.32325792, + "balance_loss_mlp": 1.02234888, + "epoch": 0.28047497369607693, + "flos": 14802652137600.0, + "grad_norm": 1.8010202999073999, + "language_loss": 0.80932271, + "learning_rate": 3.377186981855578e-06, + "loss": 0.83498085, + "num_input_tokens_seen": 100741995, + "router_z_loss_clip": 1.95117188, + "router_z_loss_mlp": 0.24682617, + "step": 4665, + "time_per_iteration": 2.8403472900390625 + }, + { + "auxiliary_loss_clip": 0.01524698, + "auxiliary_loss_mlp": 0.01043979, + "balance_loss_clip": 1.326859, + "balance_loss_mlp": 1.02104282, + "epoch": 0.2805350969487449, + "flos": 23079771048960.0, + "grad_norm": 2.1218793222469725, + "language_loss": 0.8130908, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.83877754, + "num_input_tokens_seen": 100758985, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.22937012, + "step": 4666, + "time_per_iteration": 2.900665044784546 + }, + { + "auxiliary_loss_clip": 0.01541748, + "auxiliary_loss_mlp": 0.0104713, + "balance_loss_clip": 1.3407129, + "balance_loss_mlp": 1.0228591, + "epoch": 0.2805952202014129, + "flos": 20488113578880.0, + "grad_norm": 2.6795683638836194, + "language_loss": 0.85958481, + "learning_rate": 3.376622043036658e-06, + "loss": 0.88547361, + "num_input_tokens_seen": 100777820, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.24255371, + "step": 4667, + "time_per_iteration": 4.266583681106567 + }, + { + "auxiliary_loss_clip": 0.01554971, + "auxiliary_loss_mlp": 0.01046516, + "balance_loss_clip": 1.35023057, + "balance_loss_mlp": 1.0210892, + "epoch": 0.2806553434540809, + "flos": 27428733031680.0, + "grad_norm": 1.9148913153239218, + "language_loss": 0.8063733, + "learning_rate": 3.376339495319373e-06, + "loss": 0.83238816, + "num_input_tokens_seen": 100798205, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.25439453, + "step": 4668, + "time_per_iteration": 2.870173454284668 + }, + { + "auxiliary_loss_clip": 0.0155874, + "auxiliary_loss_mlp": 0.0105143, + "balance_loss_clip": 1.35190082, + "balance_loss_mlp": 1.02498972, + "epoch": 0.28071546670674885, + "flos": 26516142535680.0, + "grad_norm": 1.535041282629128, + "language_loss": 0.77328372, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.79938543, + "num_input_tokens_seen": 100819800, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.26464844, + "step": 4669, + "time_per_iteration": 4.3043739795684814 + }, + { + "auxiliary_loss_clip": 0.01548904, + "auxiliary_loss_mlp": 0.01054744, + "balance_loss_clip": 1.34664297, + "balance_loss_mlp": 1.03060436, + "epoch": 0.2807755899594168, + "flos": 20568477623040.0, + "grad_norm": 2.228219269067643, + "language_loss": 0.8120259, + "learning_rate": 3.375774243322725e-06, + "loss": 0.83806235, + "num_input_tokens_seen": 100837880, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.24169922, + "step": 4670, + "time_per_iteration": 2.868858575820923 + }, + { + "auxiliary_loss_clip": 0.01558915, + "auxiliary_loss_mlp": 0.01042874, + "balance_loss_clip": 1.35318613, + "balance_loss_mlp": 1.01750636, + "epoch": 0.2808357132120848, + "flos": 24323210657280.0, + "grad_norm": 3.897746809945319, + "language_loss": 0.80677187, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.83278978, + "num_input_tokens_seen": 100856350, + "router_z_loss_clip": 2.05566406, + "router_z_loss_mlp": 0.25341797, + "step": 4671, + "time_per_iteration": 2.8513708114624023 + }, + { + "auxiliary_loss_clip": 0.01536425, + "auxiliary_loss_mlp": 0.01045079, + "balance_loss_clip": 1.34043121, + "balance_loss_mlp": 1.0217973, + "epoch": 0.28089583646475275, + "flos": 26443605841920.0, + "grad_norm": 1.9383736840625814, + "language_loss": 0.76011848, + "learning_rate": 3.37520878264809e-06, + "loss": 0.78593349, + "num_input_tokens_seen": 100876135, + "router_z_loss_clip": 1.95703125, + "router_z_loss_mlp": 0.23291016, + "step": 4672, + "time_per_iteration": 2.9534871578216553 + }, + { + "auxiliary_loss_clip": 0.01558954, + "auxiliary_loss_mlp": 0.01048909, + "balance_loss_clip": 1.35500503, + "balance_loss_mlp": 1.02315927, + "epoch": 0.2809559597174207, + "flos": 23122056954240.0, + "grad_norm": 3.4757278882684592, + "language_loss": 0.77245682, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.79853547, + "num_input_tokens_seen": 100894790, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.25744629, + "step": 4673, + "time_per_iteration": 2.8241679668426514 + }, + { + "auxiliary_loss_clip": 0.01543215, + "auxiliary_loss_mlp": 0.01047148, + "balance_loss_clip": 1.34209037, + "balance_loss_mlp": 1.02263856, + "epoch": 0.2810160829700887, + "flos": 20933378087040.0, + "grad_norm": 2.567518587508363, + "language_loss": 0.72647887, + "learning_rate": 3.374643113381237e-06, + "loss": 0.75238252, + "num_input_tokens_seen": 100915100, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.24499512, + "step": 4674, + "time_per_iteration": 2.904388666152954 + }, + { + "auxiliary_loss_clip": 0.01556005, + "auxiliary_loss_mlp": 0.01048872, + "balance_loss_clip": 1.3535006, + "balance_loss_mlp": 1.02288413, + "epoch": 0.28107620622275664, + "flos": 14364038615040.0, + "grad_norm": 1.8763683492875924, + "language_loss": 0.78239489, + "learning_rate": 3.374360200552541e-06, + "loss": 0.80844367, + "num_input_tokens_seen": 100932795, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.2598877, + "step": 4675, + "time_per_iteration": 2.8612029552459717 + }, + { + "auxiliary_loss_clip": 0.01549115, + "auxiliary_loss_mlp": 0.01044771, + "balance_loss_clip": 1.34530449, + "balance_loss_mlp": 1.01875949, + "epoch": 0.2811363294754246, + "flos": 20927812976640.0, + "grad_norm": 2.5460174394612083, + "language_loss": 0.71106488, + "learning_rate": 3.374077235607968e-06, + "loss": 0.73700368, + "num_input_tokens_seen": 100950505, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.26025391, + "step": 4676, + "time_per_iteration": 2.838313579559326 + }, + { + "auxiliary_loss_clip": 0.0152618, + "auxiliary_loss_mlp": 0.01047178, + "balance_loss_clip": 1.33215666, + "balance_loss_mlp": 1.02326441, + "epoch": 0.28119645272809257, + "flos": 20604564990720.0, + "grad_norm": 1.5642248791697388, + "language_loss": 0.7156117, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.74134529, + "num_input_tokens_seen": 100968790, + "router_z_loss_clip": 1.93945312, + "router_z_loss_mlp": 0.23925781, + "step": 4677, + "time_per_iteration": 2.885960340499878 + }, + { + "auxiliary_loss_clip": 0.01529996, + "auxiliary_loss_mlp": 0.01044483, + "balance_loss_clip": 1.33248019, + "balance_loss_mlp": 1.01885295, + "epoch": 0.28125657598076054, + "flos": 25348225777920.0, + "grad_norm": 1.4571964528613524, + "language_loss": 0.64190924, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.66765398, + "num_input_tokens_seen": 100990205, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.25646973, + "step": 4678, + "time_per_iteration": 3.018908977508545 + }, + { + "auxiliary_loss_clip": 0.01527918, + "auxiliary_loss_mlp": 0.0105479, + "balance_loss_clip": 1.33142817, + "balance_loss_mlp": 1.02964854, + "epoch": 0.2813166992334285, + "flos": 24837889968000.0, + "grad_norm": 1.7639118578293043, + "language_loss": 0.71348417, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.73931122, + "num_input_tokens_seen": 101009815, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.25158691, + "step": 4679, + "time_per_iteration": 2.925842761993408 + }, + { + "auxiliary_loss_clip": 0.01544511, + "auxiliary_loss_mlp": 0.01041787, + "balance_loss_clip": 1.34294033, + "balance_loss_mlp": 1.01594234, + "epoch": 0.2813768224860965, + "flos": 21770355242880.0, + "grad_norm": 1.8225102854136799, + "language_loss": 0.75622869, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.78209168, + "num_input_tokens_seen": 101026780, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.25842285, + "step": 4680, + "time_per_iteration": 2.8713812828063965 + }, + { + "auxiliary_loss_clip": 0.01526636, + "auxiliary_loss_mlp": 0.01041652, + "balance_loss_clip": 1.32852817, + "balance_loss_mlp": 1.01761985, + "epoch": 0.2814369457387645, + "flos": 24327599402880.0, + "grad_norm": 1.7485702854390646, + "language_loss": 0.78409052, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.80977333, + "num_input_tokens_seen": 101046215, + "router_z_loss_clip": 1.97753906, + "router_z_loss_mlp": 0.24023438, + "step": 4681, + "time_per_iteration": 2.8598570823669434 + }, + { + "auxiliary_loss_clip": 0.01562775, + "auxiliary_loss_mlp": 0.01045457, + "balance_loss_clip": 1.35878611, + "balance_loss_mlp": 1.02067327, + "epoch": 0.28149706899143245, + "flos": 18524555429760.0, + "grad_norm": 1.8959955349102147, + "language_loss": 0.75099069, + "learning_rate": 3.372378352108146e-06, + "loss": 0.77707303, + "num_input_tokens_seen": 101063365, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.24768066, + "step": 4682, + "time_per_iteration": 2.8738749027252197 + }, + { + "auxiliary_loss_clip": 0.01533092, + "auxiliary_loss_mlp": 0.01037051, + "balance_loss_clip": 1.33717644, + "balance_loss_mlp": 1.01305437, + "epoch": 0.2815571922441004, + "flos": 24873977335680.0, + "grad_norm": 1.8054203667947146, + "language_loss": 0.81513488, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.84083629, + "num_input_tokens_seen": 101083835, + "router_z_loss_clip": 1.95898438, + "router_z_loss_mlp": 0.23986816, + "step": 4683, + "time_per_iteration": 2.918318510055542 + }, + { + "auxiliary_loss_clip": 0.01543401, + "auxiliary_loss_mlp": 0.01040066, + "balance_loss_clip": 1.34330952, + "balance_loss_mlp": 1.01544929, + "epoch": 0.2816173154967684, + "flos": 19911168144000.0, + "grad_norm": 1.6315426221960077, + "language_loss": 0.76771981, + "learning_rate": 3.371811641167852e-06, + "loss": 0.79355443, + "num_input_tokens_seen": 101101740, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.24633789, + "step": 4684, + "time_per_iteration": 2.915719985961914 + }, + { + "auxiliary_loss_clip": 0.01534375, + "auxiliary_loss_mlp": 0.0104215, + "balance_loss_clip": 1.33789122, + "balance_loss_mlp": 1.01793838, + "epoch": 0.28167743874943635, + "flos": 17499676043520.0, + "grad_norm": 2.4688233561699016, + "language_loss": 0.77871305, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.80447829, + "num_input_tokens_seen": 101120480, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.24230957, + "step": 4685, + "time_per_iteration": 2.8332231044769287 + }, + { + "auxiliary_loss_clip": 0.01532676, + "auxiliary_loss_mlp": 0.01041779, + "balance_loss_clip": 1.33743799, + "balance_loss_mlp": 1.01809239, + "epoch": 0.2817375620021043, + "flos": 25313224285440.0, + "grad_norm": 1.6706722733422557, + "language_loss": 0.76556635, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.79131091, + "num_input_tokens_seen": 101142910, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.23657227, + "step": 4686, + "time_per_iteration": 2.883486270904541 + }, + { + "auxiliary_loss_clip": 0.01555091, + "auxiliary_loss_mlp": 0.01047634, + "balance_loss_clip": 1.35301411, + "balance_loss_mlp": 1.02113402, + "epoch": 0.2817976852547723, + "flos": 18701327439360.0, + "grad_norm": 2.7916709092651657, + "language_loss": 0.65518856, + "learning_rate": 3.370961184640025e-06, + "loss": 0.68121582, + "num_input_tokens_seen": 101160030, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.26501465, + "step": 4687, + "time_per_iteration": 2.833021879196167 + }, + { + "auxiliary_loss_clip": 0.01535521, + "auxiliary_loss_mlp": 0.01044518, + "balance_loss_clip": 1.33768117, + "balance_loss_mlp": 1.01981771, + "epoch": 0.28185780850744024, + "flos": 22751184176640.0, + "grad_norm": 2.074734402016617, + "language_loss": 0.76993173, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.79573214, + "num_input_tokens_seen": 101177675, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.24707031, + "step": 4688, + "time_per_iteration": 2.8543012142181396 + }, + { + "auxiliary_loss_clip": 0.01529211, + "auxiliary_loss_mlp": 0.01039537, + "balance_loss_clip": 1.33251071, + "balance_loss_mlp": 1.01551628, + "epoch": 0.2819179317601082, + "flos": 14940712581120.0, + "grad_norm": 1.8669340414847686, + "language_loss": 0.79972821, + "learning_rate": 3.37039395366863e-06, + "loss": 0.82541567, + "num_input_tokens_seen": 101192225, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.24023438, + "step": 4689, + "time_per_iteration": 2.807666301727295 + }, + { + "auxiliary_loss_clip": 0.01525473, + "auxiliary_loss_mlp": 0.01041461, + "balance_loss_clip": 1.32807481, + "balance_loss_mlp": 1.01505566, + "epoch": 0.2819780550127762, + "flos": 23154796206720.0, + "grad_norm": 1.6115667497613988, + "language_loss": 0.7875281, + "learning_rate": 3.37011026022934e-06, + "loss": 0.81319743, + "num_input_tokens_seen": 101210870, + "router_z_loss_clip": 1.97460938, + "router_z_loss_mlp": 0.26403809, + "step": 4690, + "time_per_iteration": 2.827425241470337 + }, + { + "auxiliary_loss_clip": 0.01530284, + "auxiliary_loss_mlp": 0.01044611, + "balance_loss_clip": 1.33232069, + "balance_loss_mlp": 1.0195291, + "epoch": 0.28203817826544414, + "flos": 21626322485760.0, + "grad_norm": 1.8937802824501626, + "language_loss": 0.88482881, + "learning_rate": 3.369826514835332e-06, + "loss": 0.91057777, + "num_input_tokens_seen": 101229965, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.25073242, + "step": 4691, + "time_per_iteration": 2.9522907733917236 + }, + { + "auxiliary_loss_clip": 0.01554121, + "auxiliary_loss_mlp": 0.01047928, + "balance_loss_clip": 1.3513484, + "balance_loss_mlp": 1.02226186, + "epoch": 0.2820983015181121, + "flos": 24037995565440.0, + "grad_norm": 1.6923797400431964, + "language_loss": 0.82992399, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.8559444, + "num_input_tokens_seen": 101250980, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.25683594, + "step": 4692, + "time_per_iteration": 4.4081690311431885 + }, + { + "auxiliary_loss_clip": 0.01524897, + "auxiliary_loss_mlp": 0.01039562, + "balance_loss_clip": 1.32786357, + "balance_loss_mlp": 1.01452804, + "epoch": 0.2821584247707801, + "flos": 30020028543360.0, + "grad_norm": 1.549928035595499, + "language_loss": 0.74931788, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.77496248, + "num_input_tokens_seen": 101273335, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.25048828, + "step": 4693, + "time_per_iteration": 2.983670473098755 + }, + { + "auxiliary_loss_clip": 0.01526991, + "auxiliary_loss_mlp": 0.01045138, + "balance_loss_clip": 1.32918215, + "balance_loss_mlp": 1.02075982, + "epoch": 0.2822185480234481, + "flos": 21406495409280.0, + "grad_norm": 1.7301246618014963, + "language_loss": 0.78709459, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.8128159, + "num_input_tokens_seen": 101292110, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.24401855, + "step": 4694, + "time_per_iteration": 2.9282925128936768 + }, + { + "auxiliary_loss_clip": 0.01525878, + "auxiliary_loss_mlp": 0.01041046, + "balance_loss_clip": 1.33226407, + "balance_loss_mlp": 1.01617861, + "epoch": 0.28227867127611606, + "flos": 27463960748160.0, + "grad_norm": 1.8442849760530755, + "language_loss": 0.67464554, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.70031476, + "num_input_tokens_seen": 101312815, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.2487793, + "step": 4695, + "time_per_iteration": 2.939635992050171 + }, + { + "auxiliary_loss_clip": 0.01540186, + "auxiliary_loss_mlp": 0.01046176, + "balance_loss_clip": 1.3410027, + "balance_loss_mlp": 1.01894903, + "epoch": 0.282338794528784, + "flos": 22601948267520.0, + "grad_norm": 2.1147615314791506, + "language_loss": 0.77148104, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.79734474, + "num_input_tokens_seen": 101329045, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.27209473, + "step": 4696, + "time_per_iteration": 2.8597843647003174 + }, + { + "auxiliary_loss_clip": 0.01522409, + "auxiliary_loss_mlp": 0.01048266, + "balance_loss_clip": 1.32815826, + "balance_loss_mlp": 1.02269578, + "epoch": 0.282398917781452, + "flos": 42025927956480.0, + "grad_norm": 1.4849399987450038, + "language_loss": 0.63274503, + "learning_rate": 3.368122952024877e-06, + "loss": 0.6584518, + "num_input_tokens_seen": 101352715, + "router_z_loss_clip": 1.94335938, + "router_z_loss_mlp": 0.25561523, + "step": 4697, + "time_per_iteration": 3.0392119884490967 + }, + { + "auxiliary_loss_clip": 0.01509446, + "auxiliary_loss_mlp": 0.01041572, + "balance_loss_clip": 1.31685567, + "balance_loss_mlp": 1.01788557, + "epoch": 0.28245904103411995, + "flos": 23235884167680.0, + "grad_norm": 1.4321939738574294, + "language_loss": 0.73982912, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.76533931, + "num_input_tokens_seen": 101374640, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.23681641, + "step": 4698, + "time_per_iteration": 2.8972625732421875 + }, + { + "auxiliary_loss_clip": 0.01512732, + "auxiliary_loss_mlp": 0.0104463, + "balance_loss_clip": 1.32054329, + "balance_loss_mlp": 1.02122927, + "epoch": 0.2825191642867879, + "flos": 25385625244800.0, + "grad_norm": 1.6860931578177958, + "language_loss": 0.76593262, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.79150629, + "num_input_tokens_seen": 101393595, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.23413086, + "step": 4699, + "time_per_iteration": 4.247616767883301 + }, + { + "auxiliary_loss_clip": 0.01531013, + "auxiliary_loss_mlp": 0.01043027, + "balance_loss_clip": 1.33160102, + "balance_loss_mlp": 1.01923299, + "epoch": 0.2825792875394559, + "flos": 17245299922560.0, + "grad_norm": 2.865809525090896, + "language_loss": 0.81809878, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.84383917, + "num_input_tokens_seen": 101409265, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.23815918, + "step": 4700, + "time_per_iteration": 2.8890793323516846 + }, + { + "auxiliary_loss_clip": 0.0151299, + "auxiliary_loss_mlp": 0.01049351, + "balance_loss_clip": 1.32437623, + "balance_loss_mlp": 1.02593839, + "epoch": 0.28263941079212385, + "flos": 26735155205760.0, + "grad_norm": 4.38560175114185, + "language_loss": 0.83032811, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.85595149, + "num_input_tokens_seen": 101428365, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.23413086, + "step": 4701, + "time_per_iteration": 2.8632400035858154 + }, + { + "auxiliary_loss_clip": 0.01526289, + "auxiliary_loss_mlp": 0.0104902, + "balance_loss_clip": 1.33054101, + "balance_loss_mlp": 1.02454603, + "epoch": 0.2826995340447918, + "flos": 25932319891200.0, + "grad_norm": 2.807110824905541, + "language_loss": 0.74116647, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.76691961, + "num_input_tokens_seen": 101447280, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.24462891, + "step": 4702, + "time_per_iteration": 4.309850454330444 + }, + { + "auxiliary_loss_clip": 0.01517308, + "auxiliary_loss_mlp": 0.01051074, + "balance_loss_clip": 1.32575905, + "balance_loss_mlp": 1.02683866, + "epoch": 0.2827596572974598, + "flos": 22389134135040.0, + "grad_norm": 1.6720260986019393, + "language_loss": 0.78999037, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.81567419, + "num_input_tokens_seen": 101465435, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.2421875, + "step": 4703, + "time_per_iteration": 2.8646583557128906 + }, + { + "auxiliary_loss_clip": 0.01532508, + "auxiliary_loss_mlp": 0.01060054, + "balance_loss_clip": 1.33537972, + "balance_loss_mlp": 1.03580689, + "epoch": 0.28281978055012774, + "flos": 33560182897920.0, + "grad_norm": 2.1112866262858137, + "language_loss": 0.70653123, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.73245692, + "num_input_tokens_seen": 101486355, + "router_z_loss_clip": 1.97167969, + "router_z_loss_mlp": 0.24230957, + "step": 4704, + "time_per_iteration": 4.379939794540405 + }, + { + "auxiliary_loss_clip": 0.01518395, + "auxiliary_loss_mlp": 0.01059889, + "balance_loss_clip": 1.3230114, + "balance_loss_mlp": 1.03456903, + "epoch": 0.2828799038027957, + "flos": 23451367743360.0, + "grad_norm": 2.705748032796369, + "language_loss": 0.71627772, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.74206054, + "num_input_tokens_seen": 101505875, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.2532959, + "step": 4705, + "time_per_iteration": 2.8943123817443848 + }, + { + "auxiliary_loss_clip": 0.01331311, + "auxiliary_loss_mlp": 0.01040551, + "balance_loss_clip": 1.21509302, + "balance_loss_mlp": 1.01098752, + "epoch": 0.2829400270554637, + "flos": 69902187736320.0, + "grad_norm": 0.7276219294092496, + "language_loss": 0.59334534, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.617064, + "num_input_tokens_seen": 101565045, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.29492188, + "step": 4706, + "time_per_iteration": 3.4356019496917725 + }, + { + "auxiliary_loss_clip": 0.01503686, + "auxiliary_loss_mlp": 0.010532, + "balance_loss_clip": 1.3137207, + "balance_loss_mlp": 1.03050303, + "epoch": 0.2830001503081317, + "flos": 24799404625920.0, + "grad_norm": 1.3124473688305813, + "language_loss": 0.82231933, + "learning_rate": 3.365279531475407e-06, + "loss": 0.84788823, + "num_input_tokens_seen": 101585825, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.22705078, + "step": 4707, + "time_per_iteration": 2.8827314376831055 + }, + { + "auxiliary_loss_clip": 0.01528942, + "auxiliary_loss_mlp": 0.010588, + "balance_loss_clip": 1.3306632, + "balance_loss_mlp": 1.03348029, + "epoch": 0.28306027356079966, + "flos": 27679987261440.0, + "grad_norm": 1.5573623328948076, + "language_loss": 0.81530559, + "learning_rate": 3.36499490449902e-06, + "loss": 0.84118301, + "num_input_tokens_seen": 101606105, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.25354004, + "step": 4708, + "time_per_iteration": 2.8968722820281982 + }, + { + "auxiliary_loss_clip": 0.01328079, + "auxiliary_loss_mlp": 0.010404, + "balance_loss_clip": 1.21391368, + "balance_loss_mlp": 1.00988245, + "epoch": 0.2831203968134676, + "flos": 60552609891840.0, + "grad_norm": 0.8861583858936074, + "language_loss": 0.62870002, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.65238482, + "num_input_tokens_seen": 101656875, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.3046875, + "step": 4709, + "time_per_iteration": 3.196225643157959 + }, + { + "auxiliary_loss_clip": 0.01522512, + "auxiliary_loss_mlp": 0.01051835, + "balance_loss_clip": 1.32835698, + "balance_loss_mlp": 1.02832675, + "epoch": 0.2831805200661356, + "flos": 22065026497920.0, + "grad_norm": 1.5064957870738145, + "language_loss": 0.74800897, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.77375245, + "num_input_tokens_seen": 101676225, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.23498535, + "step": 4710, + "time_per_iteration": 2.8652305603027344 + }, + { + "auxiliary_loss_clip": 0.01512123, + "auxiliary_loss_mlp": 0.01054973, + "balance_loss_clip": 1.31593132, + "balance_loss_mlp": 1.03027284, + "epoch": 0.28324064331880355, + "flos": 22610680513920.0, + "grad_norm": 1.9389356358468988, + "language_loss": 0.80494118, + "learning_rate": 3.364140713048579e-06, + "loss": 0.83061212, + "num_input_tokens_seen": 101693710, + "router_z_loss_clip": 1.95996094, + "router_z_loss_mlp": 0.24719238, + "step": 4711, + "time_per_iteration": 2.861617088317871 + }, + { + "auxiliary_loss_clip": 0.01514789, + "auxiliary_loss_mlp": 0.01050309, + "balance_loss_clip": 1.31900597, + "balance_loss_mlp": 1.02558517, + "epoch": 0.2833007665714715, + "flos": 30414998816640.0, + "grad_norm": 1.880188577798531, + "language_loss": 0.71939212, + "learning_rate": 3.363855879093996e-06, + "loss": 0.7450431, + "num_input_tokens_seen": 101714010, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.24743652, + "step": 4712, + "time_per_iteration": 2.925346851348877 + }, + { + "auxiliary_loss_clip": 0.01515472, + "auxiliary_loss_mlp": 0.01050197, + "balance_loss_clip": 1.32021332, + "balance_loss_mlp": 1.02531791, + "epoch": 0.2833608898241395, + "flos": 23559810825600.0, + "grad_norm": 1.878078310937145, + "language_loss": 0.83264327, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.85829997, + "num_input_tokens_seen": 101732995, + "router_z_loss_clip": 1.95117188, + "router_z_loss_mlp": 0.24841309, + "step": 4713, + "time_per_iteration": 2.872859001159668 + }, + { + "auxiliary_loss_clip": 0.01523436, + "auxiliary_loss_mlp": 0.01048645, + "balance_loss_clip": 1.32564282, + "balance_loss_mlp": 1.02502942, + "epoch": 0.28342101307680745, + "flos": 20276430566400.0, + "grad_norm": 1.7921276678469167, + "language_loss": 0.76168084, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.78740162, + "num_input_tokens_seen": 101751385, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.23608398, + "step": 4714, + "time_per_iteration": 2.920032501220703 + }, + { + "auxiliary_loss_clip": 0.01514739, + "auxiliary_loss_mlp": 0.01047364, + "balance_loss_clip": 1.31836128, + "balance_loss_mlp": 1.0225563, + "epoch": 0.2834811363294754, + "flos": 30859358428800.0, + "grad_norm": 1.4611404743647336, + "language_loss": 0.78774172, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.81336278, + "num_input_tokens_seen": 101773825, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.24829102, + "step": 4715, + "time_per_iteration": 2.910038709640503 + }, + { + "auxiliary_loss_clip": 0.01499883, + "auxiliary_loss_mlp": 0.01042473, + "balance_loss_clip": 1.30618119, + "balance_loss_mlp": 1.01807058, + "epoch": 0.2835412595821434, + "flos": 22721295346560.0, + "grad_norm": 1.6829840685349544, + "language_loss": 0.74018884, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.76561236, + "num_input_tokens_seen": 101791920, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.24401855, + "step": 4716, + "time_per_iteration": 2.850165605545044 + }, + { + "auxiliary_loss_clip": 0.01524101, + "auxiliary_loss_mlp": 0.01049362, + "balance_loss_clip": 1.32137024, + "balance_loss_mlp": 1.02457869, + "epoch": 0.28360138283481134, + "flos": 18086892048000.0, + "grad_norm": 2.6400990233305475, + "language_loss": 0.75703943, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.78277409, + "num_input_tokens_seen": 101809515, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.2479248, + "step": 4717, + "time_per_iteration": 2.8362069129943848 + }, + { + "auxiliary_loss_clip": 0.01517023, + "auxiliary_loss_mlp": 0.01059474, + "balance_loss_clip": 1.31715405, + "balance_loss_mlp": 1.03294945, + "epoch": 0.2836615060874793, + "flos": 17862857205120.0, + "grad_norm": 1.6207697111450592, + "language_loss": 0.68132955, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.70709455, + "num_input_tokens_seen": 101827735, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.26538086, + "step": 4718, + "time_per_iteration": 2.907357931137085 + }, + { + "auxiliary_loss_clip": 0.01509635, + "auxiliary_loss_mlp": 0.01044795, + "balance_loss_clip": 1.3126843, + "balance_loss_mlp": 1.01824665, + "epoch": 0.2837216293401473, + "flos": 25751611584000.0, + "grad_norm": 1.7721477108839503, + "language_loss": 0.7357738, + "learning_rate": 3.361860593925566e-06, + "loss": 0.76131815, + "num_input_tokens_seen": 101845970, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.26538086, + "step": 4719, + "time_per_iteration": 2.911893606185913 + }, + { + "auxiliary_loss_clip": 0.01494084, + "auxiliary_loss_mlp": 0.01043923, + "balance_loss_clip": 1.30193686, + "balance_loss_mlp": 1.0187819, + "epoch": 0.2837817525928153, + "flos": 20933423331840.0, + "grad_norm": 1.653835896307464, + "language_loss": 0.81230581, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.83768588, + "num_input_tokens_seen": 101865040, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.2512207, + "step": 4720, + "time_per_iteration": 2.8864338397979736 + }, + { + "auxiliary_loss_clip": 0.01514641, + "auxiliary_loss_mlp": 0.01048426, + "balance_loss_clip": 1.31674194, + "balance_loss_mlp": 1.0216043, + "epoch": 0.28384187584548326, + "flos": 18926040954240.0, + "grad_norm": 1.7317916658375525, + "language_loss": 0.80190432, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.82753503, + "num_input_tokens_seen": 101883735, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.26831055, + "step": 4721, + "time_per_iteration": 3.0064148902893066 + }, + { + "auxiliary_loss_clip": 0.0150957, + "auxiliary_loss_mlp": 0.01044823, + "balance_loss_clip": 1.31390977, + "balance_loss_mlp": 1.01789379, + "epoch": 0.2839019990981512, + "flos": 27355246197120.0, + "grad_norm": 2.7940043409086788, + "language_loss": 0.83557612, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.8611201, + "num_input_tokens_seen": 101903025, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.26904297, + "step": 4722, + "time_per_iteration": 2.9106860160827637 + }, + { + "auxiliary_loss_clip": 0.01510346, + "auxiliary_loss_mlp": 0.01042775, + "balance_loss_clip": 1.31457543, + "balance_loss_mlp": 1.01671636, + "epoch": 0.2839621223508192, + "flos": 18123522353280.0, + "grad_norm": 1.7032323835639431, + "language_loss": 0.71342647, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.73895764, + "num_input_tokens_seen": 101922255, + "router_z_loss_clip": 1.95703125, + "router_z_loss_mlp": 0.26049805, + "step": 4723, + "time_per_iteration": 2.839162588119507 + }, + { + "auxiliary_loss_clip": 0.01496337, + "auxiliary_loss_mlp": 0.01047946, + "balance_loss_clip": 1.30293131, + "balance_loss_mlp": 1.02231598, + "epoch": 0.28402224560348716, + "flos": 26369259356160.0, + "grad_norm": 1.4764526209025686, + "language_loss": 0.79172635, + "learning_rate": 3.360433840760998e-06, + "loss": 0.81716919, + "num_input_tokens_seen": 101943100, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.25634766, + "step": 4724, + "time_per_iteration": 2.869922161102295 + }, + { + "auxiliary_loss_clip": 0.01492281, + "auxiliary_loss_mlp": 0.01045425, + "balance_loss_clip": 1.29643726, + "balance_loss_mlp": 1.02035522, + "epoch": 0.2840823688561551, + "flos": 24071368245120.0, + "grad_norm": 1.7126071159096026, + "language_loss": 0.93138361, + "learning_rate": 3.36014833532143e-06, + "loss": 0.95676064, + "num_input_tokens_seen": 101963160, + "router_z_loss_clip": 1.95996094, + "router_z_loss_mlp": 0.25073242, + "step": 4725, + "time_per_iteration": 2.9126152992248535 + }, + { + "auxiliary_loss_clip": 0.01515567, + "auxiliary_loss_mlp": 0.01051498, + "balance_loss_clip": 1.31765449, + "balance_loss_mlp": 1.02543926, + "epoch": 0.2841424921088231, + "flos": 29471569349760.0, + "grad_norm": 1.5808292075232815, + "language_loss": 0.89436287, + "learning_rate": 3.3598627783049e-06, + "loss": 0.92003351, + "num_input_tokens_seen": 101984300, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.26074219, + "step": 4726, + "time_per_iteration": 2.8982410430908203 + }, + { + "auxiliary_loss_clip": 0.01522212, + "auxiliary_loss_mlp": 0.01049104, + "balance_loss_clip": 1.32205629, + "balance_loss_mlp": 1.02195978, + "epoch": 0.28420261536149105, + "flos": 48115996813440.0, + "grad_norm": 1.6968954269463958, + "language_loss": 0.79549456, + "learning_rate": 3.359577169722238e-06, + "loss": 0.8212077, + "num_input_tokens_seen": 102005765, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.2713623, + "step": 4727, + "time_per_iteration": 3.0966713428497314 + }, + { + "auxiliary_loss_clip": 0.01492299, + "auxiliary_loss_mlp": 0.01048146, + "balance_loss_clip": 1.30095112, + "balance_loss_mlp": 1.02447152, + "epoch": 0.284262738614159, + "flos": 25677129363840.0, + "grad_norm": 3.69142491422953, + "language_loss": 0.67755425, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.7029587, + "num_input_tokens_seen": 102022755, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.23693848, + "step": 4728, + "time_per_iteration": 4.314008951187134 + }, + { + "auxiliary_loss_clip": 0.01495845, + "auxiliary_loss_mlp": 0.01049562, + "balance_loss_clip": 1.30194592, + "balance_loss_mlp": 1.02372968, + "epoch": 0.284322861866827, + "flos": 19728423820800.0, + "grad_norm": 1.7796369760240949, + "language_loss": 0.77424693, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.79970098, + "num_input_tokens_seen": 102041850, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.25842285, + "step": 4729, + "time_per_iteration": 2.8259856700897217 + }, + { + "auxiliary_loss_clip": 0.01514179, + "auxiliary_loss_mlp": 0.0104866, + "balance_loss_clip": 1.31708479, + "balance_loss_mlp": 1.02280378, + "epoch": 0.28438298511949495, + "flos": 23925887654400.0, + "grad_norm": 1.8052719678131146, + "language_loss": 0.67644238, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.70207071, + "num_input_tokens_seen": 102059500, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.25842285, + "step": 4730, + "time_per_iteration": 2.836742639541626 + }, + { + "auxiliary_loss_clip": 0.01520698, + "auxiliary_loss_mlp": 0.01042365, + "balance_loss_clip": 1.3227427, + "balance_loss_mlp": 1.01687777, + "epoch": 0.2844431083721629, + "flos": 26078388664320.0, + "grad_norm": 2.005640200641468, + "language_loss": 0.75179785, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.77742851, + "num_input_tokens_seen": 102080460, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.25512695, + "step": 4731, + "time_per_iteration": 2.9251632690429688 + }, + { + "auxiliary_loss_clip": 0.01510795, + "auxiliary_loss_mlp": 0.01046858, + "balance_loss_clip": 1.31472063, + "balance_loss_mlp": 1.0210855, + "epoch": 0.2845032316248309, + "flos": 25821026386560.0, + "grad_norm": 3.099827876844954, + "language_loss": 0.84337628, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.86895287, + "num_input_tokens_seen": 102100950, + "router_z_loss_clip": 1.95898438, + "router_z_loss_mlp": 0.2578125, + "step": 4732, + "time_per_iteration": 2.893732786178589 + }, + { + "auxiliary_loss_clip": 0.01493991, + "auxiliary_loss_mlp": 0.01044555, + "balance_loss_clip": 1.2989049, + "balance_loss_mlp": 1.0187819, + "epoch": 0.2845633548774989, + "flos": 19831301792640.0, + "grad_norm": 3.1508705382181827, + "language_loss": 0.79753107, + "learning_rate": 3.357862435944109e-06, + "loss": 0.82291651, + "num_input_tokens_seen": 102119345, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.25769043, + "step": 4733, + "time_per_iteration": 2.8954620361328125 + }, + { + "auxiliary_loss_clip": 0.01506488, + "auxiliary_loss_mlp": 0.0104361, + "balance_loss_clip": 1.30770159, + "balance_loss_mlp": 1.01782513, + "epoch": 0.28462347813016686, + "flos": 23192557632000.0, + "grad_norm": 2.647748607671742, + "language_loss": 0.72055209, + "learning_rate": 3.357576466701875e-06, + "loss": 0.74605304, + "num_input_tokens_seen": 102139050, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.25793457, + "step": 4734, + "time_per_iteration": 4.317319631576538 + }, + { + "auxiliary_loss_clip": 0.0149655, + "auxiliary_loss_mlp": 0.01040157, + "balance_loss_clip": 1.30264592, + "balance_loss_mlp": 1.01477695, + "epoch": 0.2846836013828348, + "flos": 18669538327680.0, + "grad_norm": 1.885149835893014, + "language_loss": 0.74659753, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.77196461, + "num_input_tokens_seen": 102157935, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.25366211, + "step": 4735, + "time_per_iteration": 2.8169057369232178 + }, + { + "auxiliary_loss_clip": 0.01497332, + "auxiliary_loss_mlp": 0.01043791, + "balance_loss_clip": 1.3028059, + "balance_loss_mlp": 1.01903117, + "epoch": 0.2847437246355028, + "flos": 14181611005440.0, + "grad_norm": 1.7125840821238163, + "language_loss": 0.80901444, + "learning_rate": 3.357004373789946e-06, + "loss": 0.83442569, + "num_input_tokens_seen": 102175325, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.24768066, + "step": 4736, + "time_per_iteration": 2.8812968730926514 + }, + { + "auxiliary_loss_clip": 0.01508352, + "auxiliary_loss_mlp": 0.01050697, + "balance_loss_clip": 1.31057191, + "balance_loss_mlp": 1.02401829, + "epoch": 0.28480384788817076, + "flos": 29290318104960.0, + "grad_norm": 2.1819452959012633, + "language_loss": 0.61009496, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.63568544, + "num_input_tokens_seen": 102196625, + "router_z_loss_clip": 1.97753906, + "router_z_loss_mlp": 0.26708984, + "step": 4737, + "time_per_iteration": 4.342505693435669 + }, + { + "auxiliary_loss_clip": 0.01498671, + "auxiliary_loss_mlp": 0.01039749, + "balance_loss_clip": 1.30586421, + "balance_loss_mlp": 1.01485825, + "epoch": 0.2848639711408387, + "flos": 22611494920320.0, + "grad_norm": 1.703269928583071, + "language_loss": 0.86801445, + "learning_rate": 3.356432075047052e-06, + "loss": 0.89339864, + "num_input_tokens_seen": 102214975, + "router_z_loss_clip": 1.92773438, + "router_z_loss_mlp": 0.24914551, + "step": 4738, + "time_per_iteration": 4.272680759429932 + }, + { + "auxiliary_loss_clip": 0.01508452, + "auxiliary_loss_mlp": 0.01045792, + "balance_loss_clip": 1.31147277, + "balance_loss_mlp": 1.01926756, + "epoch": 0.2849240943935067, + "flos": 17607485698560.0, + "grad_norm": 2.171795904894641, + "language_loss": 0.90952903, + "learning_rate": 3.356145848516118e-06, + "loss": 0.93507147, + "num_input_tokens_seen": 102231885, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.26538086, + "step": 4739, + "time_per_iteration": 2.801121711730957 + }, + { + "auxiliary_loss_clip": 0.01484506, + "auxiliary_loss_mlp": 0.01047758, + "balance_loss_clip": 1.29239464, + "balance_loss_mlp": 1.02310562, + "epoch": 0.28498421764617465, + "flos": 24873117684480.0, + "grad_norm": 1.4363656083523157, + "language_loss": 0.73054177, + "learning_rate": 3.355859570559998e-06, + "loss": 0.75586438, + "num_input_tokens_seen": 102252725, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.2467041, + "step": 4740, + "time_per_iteration": 2.877530336380005 + }, + { + "auxiliary_loss_clip": 0.01489315, + "auxiliary_loss_mlp": 0.01041054, + "balance_loss_clip": 1.29800463, + "balance_loss_mlp": 1.01528072, + "epoch": 0.2850443408988426, + "flos": 22792474696320.0, + "grad_norm": 1.51428086975398, + "language_loss": 0.78658742, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.81189108, + "num_input_tokens_seen": 102271730, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.2578125, + "step": 4741, + "time_per_iteration": 2.8267436027526855 + }, + { + "auxiliary_loss_clip": 0.01515183, + "auxiliary_loss_mlp": 0.0104106, + "balance_loss_clip": 1.31326234, + "balance_loss_mlp": 1.0147984, + "epoch": 0.2851044641515106, + "flos": 18853459015680.0, + "grad_norm": 2.5112808609059134, + "language_loss": 0.77833521, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.80389768, + "num_input_tokens_seen": 102291325, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.26281738, + "step": 4742, + "time_per_iteration": 2.8497121334075928 + }, + { + "auxiliary_loss_clip": 0.01508945, + "auxiliary_loss_mlp": 0.01053264, + "balance_loss_clip": 1.31153107, + "balance_loss_mlp": 1.02663243, + "epoch": 0.28516458740417855, + "flos": 18889908341760.0, + "grad_norm": 2.0181971032845136, + "language_loss": 0.58533013, + "learning_rate": 3.355000428249086e-06, + "loss": 0.61095226, + "num_input_tokens_seen": 102309000, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.26635742, + "step": 4743, + "time_per_iteration": 2.823291540145874 + }, + { + "auxiliary_loss_clip": 0.01511024, + "auxiliary_loss_mlp": 0.0105371, + "balance_loss_clip": 1.31701422, + "balance_loss_mlp": 1.02792525, + "epoch": 0.2852247106568465, + "flos": 25310328618240.0, + "grad_norm": 1.6221476803885846, + "language_loss": 0.74659848, + "learning_rate": 3.354713944700797e-06, + "loss": 0.77224576, + "num_input_tokens_seen": 102329240, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.25793457, + "step": 4744, + "time_per_iteration": 2.9066648483276367 + }, + { + "auxiliary_loss_clip": 0.01489087, + "auxiliary_loss_mlp": 0.01047678, + "balance_loss_clip": 1.29827225, + "balance_loss_mlp": 1.02283454, + "epoch": 0.2852848339095145, + "flos": 11662671208320.0, + "grad_norm": 2.2828742923004772, + "language_loss": 0.78554976, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.8109175, + "num_input_tokens_seen": 102344440, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.24829102, + "step": 4745, + "time_per_iteration": 2.817720413208008 + }, + { + "auxiliary_loss_clip": 0.01478162, + "auxiliary_loss_mlp": 0.01051318, + "balance_loss_clip": 1.28979075, + "balance_loss_mlp": 1.02561665, + "epoch": 0.2853449571621825, + "flos": 12941067064320.0, + "grad_norm": 1.7490014232934052, + "language_loss": 0.83547139, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.86076629, + "num_input_tokens_seen": 102360985, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.25720215, + "step": 4746, + "time_per_iteration": 2.858774185180664 + }, + { + "auxiliary_loss_clip": 0.01522136, + "auxiliary_loss_mlp": 0.01049907, + "balance_loss_clip": 1.31957936, + "balance_loss_mlp": 1.0238359, + "epoch": 0.28540508041485046, + "flos": 20020018429440.0, + "grad_norm": 1.670729684306003, + "language_loss": 0.80069458, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.82641506, + "num_input_tokens_seen": 102380320, + "router_z_loss_clip": 2.02636719, + "router_z_loss_mlp": 0.26086426, + "step": 4747, + "time_per_iteration": 2.8487648963928223 + }, + { + "auxiliary_loss_clip": 0.01309691, + "auxiliary_loss_mlp": 0.01061469, + "balance_loss_clip": 1.190166, + "balance_loss_mlp": 1.02351284, + "epoch": 0.28546520366751843, + "flos": 68171803879680.0, + "grad_norm": 0.7846862600552561, + "language_loss": 0.60574257, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.6294542, + "num_input_tokens_seen": 102439140, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.37890625, + "step": 4748, + "time_per_iteration": 3.3766870498657227 + }, + { + "auxiliary_loss_clip": 0.01491269, + "auxiliary_loss_mlp": 0.01052005, + "balance_loss_clip": 1.29910278, + "balance_loss_mlp": 1.02565956, + "epoch": 0.2855253269201864, + "flos": 13256849658240.0, + "grad_norm": 2.2001388756965117, + "language_loss": 0.81103456, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.83646727, + "num_input_tokens_seen": 102450990, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.26342773, + "step": 4749, + "time_per_iteration": 2.7959842681884766 + }, + { + "auxiliary_loss_clip": 0.01493549, + "auxiliary_loss_mlp": 0.0106253, + "balance_loss_clip": 1.30045962, + "balance_loss_mlp": 1.03716183, + "epoch": 0.28558545017285436, + "flos": 28632465688320.0, + "grad_norm": 2.0568467800864214, + "language_loss": 0.72014463, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.74570537, + "num_input_tokens_seen": 102471820, + "router_z_loss_clip": 1.93164062, + "router_z_loss_mlp": 0.25378418, + "step": 4750, + "time_per_iteration": 2.9076783657073975 + }, + { + "auxiliary_loss_clip": 0.01486217, + "auxiliary_loss_mlp": 0.01064693, + "balance_loss_clip": 1.29637504, + "balance_loss_mlp": 1.03861022, + "epoch": 0.2856455734255223, + "flos": 34143553094400.0, + "grad_norm": 1.5290321335115002, + "language_loss": 0.82949907, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.85500818, + "num_input_tokens_seen": 102492625, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.26062012, + "step": 4751, + "time_per_iteration": 2.9837379455566406 + }, + { + "auxiliary_loss_clip": 0.0149516, + "auxiliary_loss_mlp": 0.01054832, + "balance_loss_clip": 1.30271709, + "balance_loss_mlp": 1.02882075, + "epoch": 0.2857056966781903, + "flos": 39800799763200.0, + "grad_norm": 1.8631218522601813, + "language_loss": 0.80828583, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.83378577, + "num_input_tokens_seen": 102514145, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.26000977, + "step": 4752, + "time_per_iteration": 3.028844118118286 + }, + { + "auxiliary_loss_clip": 0.01484005, + "auxiliary_loss_mlp": 0.01058803, + "balance_loss_clip": 1.29258692, + "balance_loss_mlp": 1.03368568, + "epoch": 0.28576581993085826, + "flos": 21882463153920.0, + "grad_norm": 1.7938414793930575, + "language_loss": 0.79762179, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.82304984, + "num_input_tokens_seen": 102532365, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.2512207, + "step": 4753, + "time_per_iteration": 2.8503472805023193 + }, + { + "auxiliary_loss_clip": 0.01513889, + "auxiliary_loss_mlp": 0.01065793, + "balance_loss_clip": 1.31567454, + "balance_loss_mlp": 1.03917372, + "epoch": 0.2858259431835262, + "flos": 19099102890240.0, + "grad_norm": 2.096881783836177, + "language_loss": 0.9081322, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.93392909, + "num_input_tokens_seen": 102548425, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.26599121, + "step": 4754, + "time_per_iteration": 2.8935513496398926 + }, + { + "auxiliary_loss_clip": 0.01475193, + "auxiliary_loss_mlp": 0.01056941, + "balance_loss_clip": 1.28873599, + "balance_loss_mlp": 1.03214598, + "epoch": 0.2858860664361942, + "flos": 20342632988160.0, + "grad_norm": 2.0219378729797017, + "language_loss": 0.83061904, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.85594034, + "num_input_tokens_seen": 102566370, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.2479248, + "step": 4755, + "time_per_iteration": 2.8317129611968994 + }, + { + "auxiliary_loss_clip": 0.01487455, + "auxiliary_loss_mlp": 0.01059804, + "balance_loss_clip": 1.2954855, + "balance_loss_mlp": 1.03503215, + "epoch": 0.28594618968886215, + "flos": 24472310832000.0, + "grad_norm": 1.7516452945033922, + "language_loss": 0.84439814, + "learning_rate": 3.351272138300922e-06, + "loss": 0.86987072, + "num_input_tokens_seen": 102588715, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.24780273, + "step": 4756, + "time_per_iteration": 3.0033655166625977 + }, + { + "auxiliary_loss_clip": 0.01297475, + "auxiliary_loss_mlp": 0.01036501, + "balance_loss_clip": 1.17993987, + "balance_loss_mlp": 1.01304078, + "epoch": 0.2860063129415301, + "flos": 71689037103360.0, + "grad_norm": 0.8703360164449953, + "language_loss": 0.6110726, + "learning_rate": 3.350984987779142e-06, + "loss": 0.63441235, + "num_input_tokens_seen": 102656715, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.234375, + "step": 4757, + "time_per_iteration": 3.541386365890503 + }, + { + "auxiliary_loss_clip": 0.01487582, + "auxiliary_loss_mlp": 0.01056089, + "balance_loss_clip": 1.29641056, + "balance_loss_mlp": 1.031353, + "epoch": 0.2860664361941981, + "flos": 20568477623040.0, + "grad_norm": 1.8884014095786694, + "language_loss": 0.66967642, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.69511312, + "num_input_tokens_seen": 102676545, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.24743652, + "step": 4758, + "time_per_iteration": 2.9287109375 + }, + { + "auxiliary_loss_clip": 0.0150636, + "auxiliary_loss_mlp": 0.01062374, + "balance_loss_clip": 1.31024134, + "balance_loss_mlp": 1.03620791, + "epoch": 0.2861265594468661, + "flos": 36011291460480.0, + "grad_norm": 1.8973321578026119, + "language_loss": 0.64079773, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.66648513, + "num_input_tokens_seen": 102702875, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.26184082, + "step": 4759, + "time_per_iteration": 3.0510735511779785 + }, + { + "auxiliary_loss_clip": 0.01482836, + "auxiliary_loss_mlp": 0.01049349, + "balance_loss_clip": 1.29136312, + "balance_loss_mlp": 1.02392149, + "epoch": 0.28618668269953407, + "flos": 20056965448320.0, + "grad_norm": 1.7461428311036613, + "language_loss": 0.75438738, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.77970922, + "num_input_tokens_seen": 102723160, + "router_z_loss_clip": 1.91503906, + "router_z_loss_mlp": 0.25415039, + "step": 4760, + "time_per_iteration": 2.865426540374756 + }, + { + "auxiliary_loss_clip": 0.01480516, + "auxiliary_loss_mlp": 0.01042527, + "balance_loss_clip": 1.29454505, + "balance_loss_mlp": 1.01798201, + "epoch": 0.28624680595220203, + "flos": 24982375173120.0, + "grad_norm": 1.821856558059459, + "language_loss": 0.73229641, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.75752687, + "num_input_tokens_seen": 102743855, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.2454834, + "step": 4761, + "time_per_iteration": 2.8548829555511475 + }, + { + "auxiliary_loss_clip": 0.01514169, + "auxiliary_loss_mlp": 0.01044856, + "balance_loss_clip": 1.31755412, + "balance_loss_mlp": 1.02095437, + "epoch": 0.28630692920487, + "flos": 22502508900480.0, + "grad_norm": 2.327202285670019, + "language_loss": 0.75605893, + "learning_rate": 3.349548466945793e-06, + "loss": 0.78164923, + "num_input_tokens_seen": 102761370, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.23901367, + "step": 4762, + "time_per_iteration": 2.8607499599456787 + }, + { + "auxiliary_loss_clip": 0.01486534, + "auxiliary_loss_mlp": 0.01043035, + "balance_loss_clip": 1.29682314, + "balance_loss_mlp": 1.01640368, + "epoch": 0.28636705245753796, + "flos": 21259340760960.0, + "grad_norm": 1.5307964470388251, + "language_loss": 0.76240659, + "learning_rate": 3.349261009210496e-06, + "loss": 0.7877022, + "num_input_tokens_seen": 102780885, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.26635742, + "step": 4763, + "time_per_iteration": 4.243765830993652 + }, + { + "auxiliary_loss_clip": 0.01496102, + "auxiliary_loss_mlp": 0.01042983, + "balance_loss_clip": 1.30178452, + "balance_loss_mlp": 1.01721025, + "epoch": 0.28642717571020593, + "flos": 24105962534400.0, + "grad_norm": 1.6324543065511585, + "language_loss": 0.77755934, + "learning_rate": 3.348973500311086e-06, + "loss": 0.80295014, + "num_input_tokens_seen": 102801000, + "router_z_loss_clip": 1.94335938, + "router_z_loss_mlp": 0.25769043, + "step": 4764, + "time_per_iteration": 3.0266950130462646 + }, + { + "auxiliary_loss_clip": 0.01499544, + "auxiliary_loss_mlp": 0.01045974, + "balance_loss_clip": 1.30615306, + "balance_loss_mlp": 1.0192709, + "epoch": 0.2864872989628739, + "flos": 22611585409920.0, + "grad_norm": 2.532564079671479, + "language_loss": 0.72652328, + "learning_rate": 3.348685940258466e-06, + "loss": 0.75197846, + "num_input_tokens_seen": 102820230, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.26696777, + "step": 4765, + "time_per_iteration": 2.8975718021392822 + }, + { + "auxiliary_loss_clip": 0.0149401, + "auxiliary_loss_mlp": 0.01041381, + "balance_loss_clip": 1.30144429, + "balance_loss_mlp": 1.01482069, + "epoch": 0.28654742221554186, + "flos": 32758071500160.0, + "grad_norm": 1.4807160349689954, + "language_loss": 0.76799506, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.79334897, + "num_input_tokens_seen": 102842670, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.26586914, + "step": 4766, + "time_per_iteration": 3.0123000144958496 + }, + { + "auxiliary_loss_clip": 0.01479225, + "auxiliary_loss_mlp": 0.01044378, + "balance_loss_clip": 1.28818309, + "balance_loss_mlp": 1.01792574, + "epoch": 0.2866075454682098, + "flos": 26993467624320.0, + "grad_norm": 1.5662624183931255, + "language_loss": 0.78970724, + "learning_rate": 3.348110666737214e-06, + "loss": 0.81494331, + "num_input_tokens_seen": 102864480, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.26477051, + "step": 4767, + "time_per_iteration": 2.912851572036743 + }, + { + "auxiliary_loss_clip": 0.01494732, + "auxiliary_loss_mlp": 0.01048304, + "balance_loss_clip": 1.30335712, + "balance_loss_mlp": 1.0218159, + "epoch": 0.2866676687208778, + "flos": 23263103554560.0, + "grad_norm": 2.8863508008693413, + "language_loss": 0.65998948, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.68541992, + "num_input_tokens_seen": 102883740, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.26513672, + "step": 4768, + "time_per_iteration": 2.8207504749298096 + }, + { + "auxiliary_loss_clip": 0.015151, + "auxiliary_loss_mlp": 0.01048696, + "balance_loss_clip": 1.31645727, + "balance_loss_mlp": 1.02151644, + "epoch": 0.28672779197354575, + "flos": 21589646935680.0, + "grad_norm": 1.612546264536734, + "language_loss": 0.7117641, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.73740208, + "num_input_tokens_seen": 102902945, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.27197266, + "step": 4769, + "time_per_iteration": 4.20481014251709 + }, + { + "auxiliary_loss_clip": 0.01493512, + "auxiliary_loss_mlp": 0.01044725, + "balance_loss_clip": 1.29856801, + "balance_loss_mlp": 1.01926184, + "epoch": 0.2867879152262137, + "flos": 19875216510720.0, + "grad_norm": 2.5031391437997077, + "language_loss": 0.7555992, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.78098154, + "num_input_tokens_seen": 102922405, + "router_z_loss_clip": 1.94921875, + "router_z_loss_mlp": 0.25476074, + "step": 4770, + "time_per_iteration": 2.8624627590179443 + }, + { + "auxiliary_loss_clip": 0.01489837, + "auxiliary_loss_mlp": 0.01050208, + "balance_loss_clip": 1.29682672, + "balance_loss_mlp": 1.02344596, + "epoch": 0.2868480384788817, + "flos": 28223198058240.0, + "grad_norm": 3.043930823755729, + "language_loss": 0.69244659, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.71784699, + "num_input_tokens_seen": 102938980, + "router_z_loss_clip": 1.93164062, + "router_z_loss_mlp": 0.26794434, + "step": 4771, + "time_per_iteration": 2.9029109477996826 + }, + { + "auxiliary_loss_clip": 0.01270399, + "auxiliary_loss_mlp": 0.01094924, + "balance_loss_clip": 1.15784776, + "balance_loss_mlp": 1.06707692, + "epoch": 0.2869081617315497, + "flos": 65452112328960.0, + "grad_norm": 0.7881122780514342, + "language_loss": 0.56926215, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.59291542, + "num_input_tokens_seen": 103000405, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.27929688, + "step": 4772, + "time_per_iteration": 4.717581033706665 + }, + { + "auxiliary_loss_clip": 0.01492246, + "auxiliary_loss_mlp": 0.01047742, + "balance_loss_clip": 1.29880393, + "balance_loss_mlp": 1.02064538, + "epoch": 0.28696828498421767, + "flos": 18669855041280.0, + "grad_norm": 2.451224572889315, + "language_loss": 0.84018207, + "learning_rate": 3.346383619630856e-06, + "loss": 0.86558187, + "num_input_tokens_seen": 103017970, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.27111816, + "step": 4773, + "time_per_iteration": 4.286787748336792 + }, + { + "auxiliary_loss_clip": 0.0150153, + "auxiliary_loss_mlp": 0.01044276, + "balance_loss_clip": 1.30586982, + "balance_loss_mlp": 1.01804948, + "epoch": 0.28702840823688563, + "flos": 23670289923840.0, + "grad_norm": 2.347925815446587, + "language_loss": 0.78770173, + "learning_rate": 3.34609559969027e-06, + "loss": 0.81315982, + "num_input_tokens_seen": 103036385, + "router_z_loss_clip": 1.95703125, + "router_z_loss_mlp": 0.26245117, + "step": 4774, + "time_per_iteration": 2.871081590652466 + }, + { + "auxiliary_loss_clip": 0.01502739, + "auxiliary_loss_mlp": 0.01046156, + "balance_loss_clip": 1.30799973, + "balance_loss_mlp": 1.01995385, + "epoch": 0.2870885314895536, + "flos": 13811733613440.0, + "grad_norm": 2.3785858820081094, + "language_loss": 0.74121994, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.76670885, + "num_input_tokens_seen": 103052170, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.26220703, + "step": 4775, + "time_per_iteration": 2.891812801361084 + }, + { + "auxiliary_loss_clip": 0.01494072, + "auxiliary_loss_mlp": 0.01051011, + "balance_loss_clip": 1.30010927, + "balance_loss_mlp": 1.02449906, + "epoch": 0.28714865474222157, + "flos": 17796111845760.0, + "grad_norm": 1.7950991067530138, + "language_loss": 0.88770175, + "learning_rate": 3.34551940668778e-06, + "loss": 0.91315258, + "num_input_tokens_seen": 103070510, + "router_z_loss_clip": 1.93945312, + "router_z_loss_mlp": 0.26538086, + "step": 4776, + "time_per_iteration": 2.81238055229187 + }, + { + "auxiliary_loss_clip": 0.01484342, + "auxiliary_loss_mlp": 0.01041437, + "balance_loss_clip": 1.29307866, + "balance_loss_mlp": 1.01614034, + "epoch": 0.28720877799488953, + "flos": 16005977591040.0, + "grad_norm": 1.8845034002333572, + "language_loss": 0.75637901, + "learning_rate": 3.345231233647726e-06, + "loss": 0.78163671, + "num_input_tokens_seen": 103089590, + "router_z_loss_clip": 1.91308594, + "router_z_loss_mlp": 0.25292969, + "step": 4777, + "time_per_iteration": 2.8237879276275635 + }, + { + "auxiliary_loss_clip": 0.01523599, + "auxiliary_loss_mlp": 0.01046869, + "balance_loss_clip": 1.32319117, + "balance_loss_mlp": 1.02041602, + "epoch": 0.2872689012475575, + "flos": 20932925639040.0, + "grad_norm": 2.1763213431904735, + "language_loss": 0.81052136, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.83622599, + "num_input_tokens_seen": 103109080, + "router_z_loss_clip": 2.00390625, + "router_z_loss_mlp": 0.26489258, + "step": 4778, + "time_per_iteration": 2.897320508956909 + }, + { + "auxiliary_loss_clip": 0.01486802, + "auxiliary_loss_mlp": 0.0104558, + "balance_loss_clip": 1.29614019, + "balance_loss_mlp": 1.01999784, + "epoch": 0.28732902450022546, + "flos": 21335180325120.0, + "grad_norm": 1.7009157907313364, + "language_loss": 0.74653947, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.77186334, + "num_input_tokens_seen": 103127755, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.25622559, + "step": 4779, + "time_per_iteration": 2.883408784866333 + }, + { + "auxiliary_loss_clip": 0.01505819, + "auxiliary_loss_mlp": 0.01041429, + "balance_loss_clip": 1.30982184, + "balance_loss_mlp": 1.01548862, + "epoch": 0.2873891477528934, + "flos": 20860207966080.0, + "grad_norm": 1.6258429334797018, + "language_loss": 0.77075899, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.79623139, + "num_input_tokens_seen": 103147035, + "router_z_loss_clip": 1.95996094, + "router_z_loss_mlp": 0.2590332, + "step": 4780, + "time_per_iteration": 2.801316022872925 + }, + { + "auxiliary_loss_clip": 0.0149114, + "auxiliary_loss_mlp": 0.01045749, + "balance_loss_clip": 1.30094898, + "balance_loss_mlp": 1.02079868, + "epoch": 0.2874492710055614, + "flos": 17428813407360.0, + "grad_norm": 1.7052157467693, + "language_loss": 0.82078338, + "learning_rate": 3.344078031483784e-06, + "loss": 0.84615231, + "num_input_tokens_seen": 103165410, + "router_z_loss_clip": 1.90136719, + "router_z_loss_mlp": 0.24951172, + "step": 4781, + "time_per_iteration": 2.8098368644714355 + }, + { + "auxiliary_loss_clip": 0.01511118, + "auxiliary_loss_mlp": 0.01045342, + "balance_loss_clip": 1.31322229, + "balance_loss_mlp": 1.01908016, + "epoch": 0.28750939425822936, + "flos": 13414591589760.0, + "grad_norm": 1.8907059606061274, + "language_loss": 0.87813222, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.90369689, + "num_input_tokens_seen": 103183710, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.26269531, + "step": 4782, + "time_per_iteration": 2.8150341510772705 + }, + { + "auxiliary_loss_clip": 0.01531959, + "auxiliary_loss_mlp": 0.01051909, + "balance_loss_clip": 1.33242977, + "balance_loss_mlp": 1.02719665, + "epoch": 0.2875695175108973, + "flos": 21879612731520.0, + "grad_norm": 1.7778164148486508, + "language_loss": 0.71918118, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.74501979, + "num_input_tokens_seen": 103203790, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.24682617, + "step": 4783, + "time_per_iteration": 2.8736379146575928 + }, + { + "auxiliary_loss_clip": 0.01508598, + "auxiliary_loss_mlp": 0.01055518, + "balance_loss_clip": 1.31456077, + "balance_loss_mlp": 1.02960145, + "epoch": 0.2876296407635653, + "flos": 26255432142720.0, + "grad_norm": 1.7084215358520662, + "language_loss": 0.77802449, + "learning_rate": 3.343212594663047e-06, + "loss": 0.80366564, + "num_input_tokens_seen": 103223925, + "router_z_loss_clip": 1.93945312, + "router_z_loss_mlp": 0.25952148, + "step": 4784, + "time_per_iteration": 2.9347474575042725 + }, + { + "auxiliary_loss_clip": 0.01482698, + "auxiliary_loss_mlp": 0.01056091, + "balance_loss_clip": 1.29432261, + "balance_loss_mlp": 1.03145015, + "epoch": 0.28768976401623325, + "flos": 25384403635200.0, + "grad_norm": 1.7530953381454806, + "language_loss": 0.76446629, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.78985417, + "num_input_tokens_seen": 103244760, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.24633789, + "step": 4785, + "time_per_iteration": 2.868757486343384 + }, + { + "auxiliary_loss_clip": 0.01500534, + "auxiliary_loss_mlp": 0.01057192, + "balance_loss_clip": 1.30593991, + "balance_loss_mlp": 1.03303993, + "epoch": 0.28774988726890127, + "flos": 30676568860800.0, + "grad_norm": 2.0965302498574356, + "language_loss": 0.83496881, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.86054599, + "num_input_tokens_seen": 103261995, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.24169922, + "step": 4786, + "time_per_iteration": 2.9376611709594727 + }, + { + "auxiliary_loss_clip": 0.01510732, + "auxiliary_loss_mlp": 0.01062198, + "balance_loss_clip": 1.31845522, + "balance_loss_mlp": 1.03903556, + "epoch": 0.28781001052156924, + "flos": 20605334152320.0, + "grad_norm": 1.7296748370200077, + "language_loss": 0.80825078, + "learning_rate": 3.342346699429516e-06, + "loss": 0.83398008, + "num_input_tokens_seen": 103279780, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.23168945, + "step": 4787, + "time_per_iteration": 2.81431245803833 + }, + { + "auxiliary_loss_clip": 0.01518984, + "auxiliary_loss_mlp": 0.01064791, + "balance_loss_clip": 1.31989765, + "balance_loss_mlp": 1.03956676, + "epoch": 0.2878701337742372, + "flos": 26553677736960.0, + "grad_norm": 1.9175270793084964, + "language_loss": 0.84747088, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.87330866, + "num_input_tokens_seen": 103300580, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.25231934, + "step": 4788, + "time_per_iteration": 2.8786122798919678 + }, + { + "auxiliary_loss_clip": 0.0153837, + "auxiliary_loss_mlp": 0.01066295, + "balance_loss_clip": 1.3374548, + "balance_loss_mlp": 1.04091525, + "epoch": 0.28793025702690517, + "flos": 28158398225280.0, + "grad_norm": 1.7256961203830699, + "language_loss": 0.7505005, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.77654707, + "num_input_tokens_seen": 103320430, + "router_z_loss_clip": 2.0078125, + "router_z_loss_mlp": 0.25390625, + "step": 4789, + "time_per_iteration": 2.9617104530334473 + }, + { + "auxiliary_loss_clip": 0.0149418, + "auxiliary_loss_mlp": 0.01069081, + "balance_loss_clip": 1.30517805, + "balance_loss_mlp": 1.04421449, + "epoch": 0.28799038027957313, + "flos": 23815996738560.0, + "grad_norm": 1.9872898590608237, + "language_loss": 0.85524035, + "learning_rate": 3.341480346078704e-06, + "loss": 0.88087291, + "num_input_tokens_seen": 103337695, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.24865723, + "step": 4790, + "time_per_iteration": 2.900662660598755 + }, + { + "auxiliary_loss_clip": 0.01531064, + "auxiliary_loss_mlp": 0.0105567, + "balance_loss_clip": 1.33426785, + "balance_loss_mlp": 1.03038597, + "epoch": 0.2880505035322411, + "flos": 22353770684160.0, + "grad_norm": 1.6612089252654387, + "language_loss": 0.78785765, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.81372499, + "num_input_tokens_seen": 103357010, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.25292969, + "step": 4791, + "time_per_iteration": 2.8733279705047607 + }, + { + "auxiliary_loss_clip": 0.0153752, + "auxiliary_loss_mlp": 0.01074639, + "balance_loss_clip": 1.33621359, + "balance_loss_mlp": 1.0500226, + "epoch": 0.28811062678490906, + "flos": 18013269479040.0, + "grad_norm": 1.6452938409640467, + "language_loss": 0.71372646, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.73984808, + "num_input_tokens_seen": 103375600, + "router_z_loss_clip": 2.01171875, + "router_z_loss_mlp": 0.24597168, + "step": 4792, + "time_per_iteration": 2.8041181564331055 + }, + { + "auxiliary_loss_clip": 0.01540191, + "auxiliary_loss_mlp": 0.01063425, + "balance_loss_clip": 1.34112024, + "balance_loss_mlp": 1.0403347, + "epoch": 0.28817075003757703, + "flos": 22101023376000.0, + "grad_norm": 1.7104852192781774, + "language_loss": 0.81046593, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.83650208, + "num_input_tokens_seen": 103395225, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.23095703, + "step": 4793, + "time_per_iteration": 2.857980251312256 + }, + { + "auxiliary_loss_clip": 0.01519572, + "auxiliary_loss_mlp": 0.01063189, + "balance_loss_clip": 1.32537687, + "balance_loss_mlp": 1.03698671, + "epoch": 0.288230873290245, + "flos": 41698200735360.0, + "grad_norm": 1.6368201998318224, + "language_loss": 0.78794962, + "learning_rate": 3.340324496161797e-06, + "loss": 0.81377721, + "num_input_tokens_seen": 103417245, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.26220703, + "step": 4794, + "time_per_iteration": 2.997415065765381 + }, + { + "auxiliary_loss_clip": 0.01536792, + "auxiliary_loss_mlp": 0.01069593, + "balance_loss_clip": 1.3385793, + "balance_loss_mlp": 1.04466617, + "epoch": 0.28829099654291296, + "flos": 18633586694400.0, + "grad_norm": 2.152731538696966, + "language_loss": 0.84172487, + "learning_rate": 3.340035406592074e-06, + "loss": 0.86778873, + "num_input_tokens_seen": 103435500, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.24926758, + "step": 4795, + "time_per_iteration": 2.8568642139434814 + }, + { + "auxiliary_loss_clip": 0.0151231, + "auxiliary_loss_mlp": 0.0105711, + "balance_loss_clip": 1.32251263, + "balance_loss_mlp": 1.03289819, + "epoch": 0.2883511197955809, + "flos": 24683948599680.0, + "grad_norm": 6.7672658570282165, + "language_loss": 0.75343412, + "learning_rate": 3.339746266208074e-06, + "loss": 0.77912831, + "num_input_tokens_seen": 103451040, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.2421875, + "step": 4796, + "time_per_iteration": 2.8403103351593018 + }, + { + "auxiliary_loss_clip": 0.01542355, + "auxiliary_loss_mlp": 0.01054884, + "balance_loss_clip": 1.34120059, + "balance_loss_mlp": 1.02912331, + "epoch": 0.2884112430482489, + "flos": 23122464157440.0, + "grad_norm": 1.9491594816427178, + "language_loss": 0.73695242, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.76292485, + "num_input_tokens_seen": 103471330, + "router_z_loss_clip": 2.01074219, + "router_z_loss_mlp": 0.2578125, + "step": 4797, + "time_per_iteration": 2.8224287033081055 + }, + { + "auxiliary_loss_clip": 0.01538706, + "auxiliary_loss_mlp": 0.01059786, + "balance_loss_clip": 1.33951068, + "balance_loss_mlp": 1.03473985, + "epoch": 0.28847136630091685, + "flos": 16882118760960.0, + "grad_norm": 1.848974945977431, + "language_loss": 0.7551378, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.78112268, + "num_input_tokens_seen": 103488060, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.25036621, + "step": 4798, + "time_per_iteration": 4.252542495727539 + }, + { + "auxiliary_loss_clip": 0.01546263, + "auxiliary_loss_mlp": 0.01069613, + "balance_loss_clip": 1.34596443, + "balance_loss_mlp": 1.04338694, + "epoch": 0.2885314895535849, + "flos": 25666270611840.0, + "grad_norm": 2.6080454971333866, + "language_loss": 0.66446519, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.69062394, + "num_input_tokens_seen": 103503600, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.26220703, + "step": 4799, + "time_per_iteration": 2.9233834743499756 + }, + { + "auxiliary_loss_clip": 0.01557067, + "auxiliary_loss_mlp": 0.01060981, + "balance_loss_clip": 1.35420537, + "balance_loss_mlp": 1.03607845, + "epoch": 0.28859161280625284, + "flos": 21117524999040.0, + "grad_norm": 1.791865574619288, + "language_loss": 0.83269471, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.85887516, + "num_input_tokens_seen": 103524195, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.24926758, + "step": 4800, + "time_per_iteration": 2.926154136657715 + }, + { + "auxiliary_loss_clip": 0.01529771, + "auxiliary_loss_mlp": 0.01053814, + "balance_loss_clip": 1.33586669, + "balance_loss_mlp": 1.02901816, + "epoch": 0.2886517360589208, + "flos": 26480914819200.0, + "grad_norm": 1.6723654386955853, + "language_loss": 0.91697717, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.94281298, + "num_input_tokens_seen": 103545235, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.2479248, + "step": 4801, + "time_per_iteration": 2.9288527965545654 + }, + { + "auxiliary_loss_clip": 0.01531605, + "auxiliary_loss_mlp": 0.01044753, + "balance_loss_clip": 1.33637595, + "balance_loss_mlp": 1.0201838, + "epoch": 0.28871185931158877, + "flos": 25276277266560.0, + "grad_norm": 1.8169857862655183, + "language_loss": 0.74297076, + "learning_rate": 3.33801035741839e-06, + "loss": 0.76873434, + "num_input_tokens_seen": 103563305, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.24584961, + "step": 4802, + "time_per_iteration": 2.8692831993103027 + }, + { + "auxiliary_loss_clip": 0.01275066, + "auxiliary_loss_mlp": 0.01029569, + "balance_loss_clip": 1.16424322, + "balance_loss_mlp": 1.00858808, + "epoch": 0.28877198256425674, + "flos": 66693742145280.0, + "grad_norm": 0.7847566583064334, + "language_loss": 0.6301139, + "learning_rate": 3.337720861641558e-06, + "loss": 0.65316027, + "num_input_tokens_seen": 103625025, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.20996094, + "step": 4803, + "time_per_iteration": 3.3381829261779785 + }, + { + "auxiliary_loss_clip": 0.01544043, + "auxiliary_loss_mlp": 0.01049506, + "balance_loss_clip": 1.34608674, + "balance_loss_mlp": 1.02401888, + "epoch": 0.2888321058169247, + "flos": 20312698913280.0, + "grad_norm": 2.9726662510172672, + "language_loss": 0.71983719, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.74577272, + "num_input_tokens_seen": 103644235, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.25512695, + "step": 4804, + "time_per_iteration": 4.236265182495117 + }, + { + "auxiliary_loss_clip": 0.0155845, + "auxiliary_loss_mlp": 0.01048433, + "balance_loss_clip": 1.35582495, + "balance_loss_mlp": 1.0223738, + "epoch": 0.28889222906959267, + "flos": 25526762334720.0, + "grad_norm": 1.9195262912256217, + "language_loss": 0.69079304, + "learning_rate": 3.337141717919346e-06, + "loss": 0.7168619, + "num_input_tokens_seen": 103664700, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.26074219, + "step": 4805, + "time_per_iteration": 2.8494715690612793 + }, + { + "auxiliary_loss_clip": 0.01562349, + "auxiliary_loss_mlp": 0.010503, + "balance_loss_clip": 1.36036122, + "balance_loss_mlp": 1.0248127, + "epoch": 0.28895235232226063, + "flos": 32684086972800.0, + "grad_norm": 1.490165497953458, + "language_loss": 0.70443308, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.73055953, + "num_input_tokens_seen": 103686595, + "router_z_loss_clip": 2.01855469, + "router_z_loss_mlp": 0.25524902, + "step": 4806, + "time_per_iteration": 2.9652867317199707 + }, + { + "auxiliary_loss_clip": 0.01533811, + "auxiliary_loss_mlp": 0.01048598, + "balance_loss_clip": 1.33905625, + "balance_loss_mlp": 1.02324247, + "epoch": 0.2890124755749286, + "flos": 29726352673920.0, + "grad_norm": 1.502460000694783, + "language_loss": 0.71882743, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.7446515, + "num_input_tokens_seen": 103707525, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.25354004, + "step": 4807, + "time_per_iteration": 2.917811870574951 + }, + { + "auxiliary_loss_clip": 0.01536283, + "auxiliary_loss_mlp": 0.01045777, + "balance_loss_clip": 1.33804059, + "balance_loss_mlp": 1.02054, + "epoch": 0.28907259882759656, + "flos": 22684710286080.0, + "grad_norm": 2.1252845371801437, + "language_loss": 0.82176751, + "learning_rate": 3.336272622079382e-06, + "loss": 0.84758806, + "num_input_tokens_seen": 103727905, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.25256348, + "step": 4808, + "time_per_iteration": 4.260455369949341 + }, + { + "auxiliary_loss_clip": 0.01544137, + "auxiliary_loss_mlp": 0.01050307, + "balance_loss_clip": 1.35023117, + "balance_loss_mlp": 1.02392638, + "epoch": 0.2891327220802645, + "flos": 22576538672640.0, + "grad_norm": 1.5421227318371271, + "language_loss": 0.791233, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.81717741, + "num_input_tokens_seen": 103748335, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.26403809, + "step": 4809, + "time_per_iteration": 2.8474299907684326 + }, + { + "auxiliary_loss_clip": 0.0155848, + "auxiliary_loss_mlp": 0.01045971, + "balance_loss_clip": 1.35414088, + "balance_loss_mlp": 1.01819551, + "epoch": 0.2891928453329325, + "flos": 21662771811840.0, + "grad_norm": 5.248377939640719, + "language_loss": 0.79396164, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.82000613, + "num_input_tokens_seen": 103767020, + "router_z_loss_clip": 2.04199219, + "router_z_loss_mlp": 0.27807617, + "step": 4810, + "time_per_iteration": 2.8342230319976807 + }, + { + "auxiliary_loss_clip": 0.01539964, + "auxiliary_loss_mlp": 0.01046677, + "balance_loss_clip": 1.34407663, + "balance_loss_mlp": 1.02141643, + "epoch": 0.28925296858560046, + "flos": 23232581297280.0, + "grad_norm": 1.5946035602143989, + "language_loss": 0.77711838, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.80298483, + "num_input_tokens_seen": 103786355, + "router_z_loss_clip": 1.95703125, + "router_z_loss_mlp": 0.25292969, + "step": 4811, + "time_per_iteration": 2.8543879985809326 + }, + { + "auxiliary_loss_clip": 0.01549954, + "auxiliary_loss_mlp": 0.01045643, + "balance_loss_clip": 1.35037315, + "balance_loss_mlp": 1.01909506, + "epoch": 0.2893130918382685, + "flos": 28633461073920.0, + "grad_norm": 1.4777567360714885, + "language_loss": 0.78272724, + "learning_rate": 3.335113118275117e-06, + "loss": 0.80868322, + "num_input_tokens_seen": 103809345, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.26538086, + "step": 4812, + "time_per_iteration": 2.967966079711914 + }, + { + "auxiliary_loss_clip": 0.0127695, + "auxiliary_loss_mlp": 0.01038676, + "balance_loss_clip": 1.16372037, + "balance_loss_mlp": 1.02189112, + "epoch": 0.28937321509093644, + "flos": 72335487830400.0, + "grad_norm": 0.8816011424443164, + "language_loss": 0.60383868, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.62699497, + "num_input_tokens_seen": 103871180, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.16796875, + "step": 4813, + "time_per_iteration": 3.504157066345215 + }, + { + "auxiliary_loss_clip": 0.01530657, + "auxiliary_loss_mlp": 0.01044034, + "balance_loss_clip": 1.33560109, + "balance_loss_mlp": 1.01830912, + "epoch": 0.2894333383436044, + "flos": 16224945016320.0, + "grad_norm": 1.9976018439025227, + "language_loss": 0.83201283, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.85775977, + "num_input_tokens_seen": 103889040, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.25769043, + "step": 4814, + "time_per_iteration": 2.9470860958099365 + }, + { + "auxiliary_loss_clip": 0.01547586, + "auxiliary_loss_mlp": 0.01051618, + "balance_loss_clip": 1.3461144, + "balance_loss_mlp": 1.02623844, + "epoch": 0.2894934615962724, + "flos": 24839383046400.0, + "grad_norm": 1.887844347334312, + "language_loss": 0.73618543, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.76217747, + "num_input_tokens_seen": 103910380, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.25378418, + "step": 4815, + "time_per_iteration": 3.013873338699341 + }, + { + "auxiliary_loss_clip": 0.01527475, + "auxiliary_loss_mlp": 0.010517, + "balance_loss_clip": 1.33689773, + "balance_loss_mlp": 1.02671373, + "epoch": 0.28955358484894034, + "flos": 20459944051200.0, + "grad_norm": 1.6668992190737464, + "language_loss": 0.71457058, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.74036229, + "num_input_tokens_seen": 103929955, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.24975586, + "step": 4816, + "time_per_iteration": 2.935013771057129 + }, + { + "auxiliary_loss_clip": 0.01551796, + "auxiliary_loss_mlp": 0.01049026, + "balance_loss_clip": 1.34699893, + "balance_loss_mlp": 1.02223969, + "epoch": 0.2896137081016083, + "flos": 22575452797440.0, + "grad_norm": 3.0683797747652894, + "language_loss": 0.77180052, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.79780871, + "num_input_tokens_seen": 103948020, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.26806641, + "step": 4817, + "time_per_iteration": 2.957340955734253 + }, + { + "auxiliary_loss_clip": 0.01545201, + "auxiliary_loss_mlp": 0.01052486, + "balance_loss_clip": 1.34511602, + "balance_loss_mlp": 1.02565205, + "epoch": 0.28967383135427627, + "flos": 26699248817280.0, + "grad_norm": 2.304306623731047, + "language_loss": 0.77743804, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.80341494, + "num_input_tokens_seen": 103968740, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.26818848, + "step": 4818, + "time_per_iteration": 2.926295280456543 + }, + { + "auxiliary_loss_clip": 0.01544049, + "auxiliary_loss_mlp": 0.01051175, + "balance_loss_clip": 1.34426093, + "balance_loss_mlp": 1.02524662, + "epoch": 0.28973395460694423, + "flos": 15566278193280.0, + "grad_norm": 2.0244186916553777, + "language_loss": 0.81146032, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.83741254, + "num_input_tokens_seen": 103986005, + "router_z_loss_clip": 1.99609375, + "router_z_loss_mlp": 0.25952148, + "step": 4819, + "time_per_iteration": 2.852612018585205 + }, + { + "auxiliary_loss_clip": 0.01547162, + "auxiliary_loss_mlp": 0.01045187, + "balance_loss_clip": 1.34558666, + "balance_loss_mlp": 1.01917601, + "epoch": 0.2897940778596122, + "flos": 18706666325760.0, + "grad_norm": 2.1067661997091007, + "language_loss": 0.7977221, + "learning_rate": 3.332791681244776e-06, + "loss": 0.82364559, + "num_input_tokens_seen": 104005070, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.26049805, + "step": 4820, + "time_per_iteration": 2.8610074520111084 + }, + { + "auxiliary_loss_clip": 0.01555825, + "auxiliary_loss_mlp": 0.01042882, + "balance_loss_clip": 1.35320866, + "balance_loss_mlp": 1.01772928, + "epoch": 0.28985420111228016, + "flos": 18779474488320.0, + "grad_norm": 2.317086672323062, + "language_loss": 0.7420398, + "learning_rate": 3.332501274072231e-06, + "loss": 0.76802689, + "num_input_tokens_seen": 104022945, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.25134277, + "step": 4821, + "time_per_iteration": 2.988969564437866 + }, + { + "auxiliary_loss_clip": 0.01536339, + "auxiliary_loss_mlp": 0.01043736, + "balance_loss_clip": 1.33676958, + "balance_loss_mlp": 1.01774859, + "epoch": 0.28991432436494813, + "flos": 23078639928960.0, + "grad_norm": 2.043097632165815, + "language_loss": 0.72881335, + "learning_rate": 3.332210816371104e-06, + "loss": 0.75461411, + "num_input_tokens_seen": 104042080, + "router_z_loss_clip": 1.99511719, + "router_z_loss_mlp": 0.26013184, + "step": 4822, + "time_per_iteration": 2.9859092235565186 + }, + { + "auxiliary_loss_clip": 0.01536833, + "auxiliary_loss_mlp": 0.01046257, + "balance_loss_clip": 1.33908534, + "balance_loss_mlp": 1.02066255, + "epoch": 0.2899744476176161, + "flos": 17612191157760.0, + "grad_norm": 1.8348030889975135, + "language_loss": 0.67054808, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.69637895, + "num_input_tokens_seen": 104060975, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.25561523, + "step": 4823, + "time_per_iteration": 2.836082935333252 + }, + { + "auxiliary_loss_clip": 0.01535753, + "auxiliary_loss_mlp": 0.01049515, + "balance_loss_clip": 1.33913839, + "balance_loss_mlp": 1.02302623, + "epoch": 0.29003457087028406, + "flos": 22319447863680.0, + "grad_norm": 2.3728541739496927, + "language_loss": 0.82517797, + "learning_rate": 3.331629749427164e-06, + "loss": 0.85103065, + "num_input_tokens_seen": 104081395, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.26489258, + "step": 4824, + "time_per_iteration": 2.951359272003174 + }, + { + "auxiliary_loss_clip": 0.01562456, + "auxiliary_loss_mlp": 0.01049698, + "balance_loss_clip": 1.35772693, + "balance_loss_mlp": 1.02192235, + "epoch": 0.2900946941229521, + "flos": 21955090337280.0, + "grad_norm": 1.8306094758322198, + "language_loss": 0.73933882, + "learning_rate": 3.331339140206385e-06, + "loss": 0.76546037, + "num_input_tokens_seen": 104099995, + "router_z_loss_clip": 2.046875, + "router_z_loss_mlp": 0.27783203, + "step": 4825, + "time_per_iteration": 2.8490102291107178 + }, + { + "auxiliary_loss_clip": 0.01549437, + "auxiliary_loss_mlp": 0.01041995, + "balance_loss_clip": 1.34649682, + "balance_loss_mlp": 1.01685405, + "epoch": 0.29015481737562004, + "flos": 17941411457280.0, + "grad_norm": 2.423924847443782, + "language_loss": 0.7482748, + "learning_rate": 3.331048480501092e-06, + "loss": 0.77418911, + "num_input_tokens_seen": 104118930, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.25146484, + "step": 4826, + "time_per_iteration": 2.8724327087402344 + }, + { + "auxiliary_loss_clip": 0.01552093, + "auxiliary_loss_mlp": 0.01042231, + "balance_loss_clip": 1.35058808, + "balance_loss_mlp": 1.01796031, + "epoch": 0.290214940628288, + "flos": 22793696305920.0, + "grad_norm": 2.3918772551749843, + "language_loss": 0.69299293, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.7189362, + "num_input_tokens_seen": 104136940, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.24291992, + "step": 4827, + "time_per_iteration": 2.8559670448303223 + }, + { + "auxiliary_loss_clip": 0.01546761, + "auxiliary_loss_mlp": 0.01040344, + "balance_loss_clip": 1.34841633, + "balance_loss_mlp": 1.01445222, + "epoch": 0.290275063880956, + "flos": 20014860522240.0, + "grad_norm": 2.4213517209957724, + "language_loss": 0.80933261, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.83520359, + "num_input_tokens_seen": 104154280, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.25915527, + "step": 4828, + "time_per_iteration": 2.845991611480713 + }, + { + "auxiliary_loss_clip": 0.01533895, + "auxiliary_loss_mlp": 0.01047723, + "balance_loss_clip": 1.33837795, + "balance_loss_mlp": 1.02125812, + "epoch": 0.29033518713362394, + "flos": 22063533419520.0, + "grad_norm": 1.674046460545599, + "language_loss": 0.80694538, + "learning_rate": 3.33017619858836e-06, + "loss": 0.83276153, + "num_input_tokens_seen": 104172605, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.2644043, + "step": 4829, + "time_per_iteration": 2.8725380897521973 + }, + { + "auxiliary_loss_clip": 0.01524387, + "auxiliary_loss_mlp": 0.01038224, + "balance_loss_clip": 1.33061326, + "balance_loss_mlp": 1.01329732, + "epoch": 0.2903953103862919, + "flos": 25641132485760.0, + "grad_norm": 1.5314307835901912, + "language_loss": 0.83443868, + "learning_rate": 3.329885337055249e-06, + "loss": 0.86006474, + "num_input_tokens_seen": 104194120, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.24926758, + "step": 4830, + "time_per_iteration": 2.8604884147644043 + }, + { + "auxiliary_loss_clip": 0.01556761, + "auxiliary_loss_mlp": 0.01048489, + "balance_loss_clip": 1.35440576, + "balance_loss_mlp": 1.0230732, + "epoch": 0.29045543363895987, + "flos": 16954519720320.0, + "grad_norm": 2.2594201700147782, + "language_loss": 0.80796683, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.83401936, + "num_input_tokens_seen": 104210875, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.25415039, + "step": 4831, + "time_per_iteration": 2.827427864074707 + }, + { + "auxiliary_loss_clip": 0.01523599, + "auxiliary_loss_mlp": 0.01039442, + "balance_loss_clip": 1.33143163, + "balance_loss_mlp": 1.0156002, + "epoch": 0.29051555689162784, + "flos": 26406568333440.0, + "grad_norm": 1.569339014101516, + "language_loss": 0.74944234, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.77507269, + "num_input_tokens_seen": 104229875, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.23840332, + "step": 4832, + "time_per_iteration": 2.9645872116088867 + }, + { + "auxiliary_loss_clip": 0.01530057, + "auxiliary_loss_mlp": 0.01041454, + "balance_loss_clip": 1.33500779, + "balance_loss_mlp": 1.01823163, + "epoch": 0.2905756801442958, + "flos": 21113271987840.0, + "grad_norm": 1.5555517991946426, + "language_loss": 0.76983559, + "learning_rate": 3.329012449923736e-06, + "loss": 0.7955507, + "num_input_tokens_seen": 104250405, + "router_z_loss_clip": 1.94921875, + "router_z_loss_mlp": 0.2322998, + "step": 4833, + "time_per_iteration": 4.333189010620117 + }, + { + "auxiliary_loss_clip": 0.01528442, + "auxiliary_loss_mlp": 0.01040264, + "balance_loss_clip": 1.33234608, + "balance_loss_mlp": 1.01581395, + "epoch": 0.29063580339696377, + "flos": 15714609206400.0, + "grad_norm": 2.1293522336846005, + "language_loss": 0.66124332, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.68693042, + "num_input_tokens_seen": 104269185, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.24450684, + "step": 4834, + "time_per_iteration": 2.855473279953003 + }, + { + "auxiliary_loss_clip": 0.01520831, + "auxiliary_loss_mlp": 0.01038656, + "balance_loss_clip": 1.32796001, + "balance_loss_mlp": 1.01581526, + "epoch": 0.29069592664963173, + "flos": 24655824316800.0, + "grad_norm": 1.4999922117605387, + "language_loss": 0.72862506, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.75421989, + "num_input_tokens_seen": 104289400, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.22839355, + "step": 4835, + "time_per_iteration": 2.851914167404175 + }, + { + "auxiliary_loss_clip": 0.01512091, + "auxiliary_loss_mlp": 0.0103916, + "balance_loss_clip": 1.32027423, + "balance_loss_mlp": 1.01587892, + "epoch": 0.2907560499022997, + "flos": 24984773147520.0, + "grad_norm": 1.9449550242850762, + "language_loss": 0.80424219, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.82975471, + "num_input_tokens_seen": 104310485, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.23266602, + "step": 4836, + "time_per_iteration": 2.896366596221924 + }, + { + "auxiliary_loss_clip": 0.0152986, + "auxiliary_loss_mlp": 0.01045449, + "balance_loss_clip": 1.338099, + "balance_loss_mlp": 1.02051079, + "epoch": 0.29081617315496766, + "flos": 18665783009280.0, + "grad_norm": 1.802684032490924, + "language_loss": 0.81442881, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.84018195, + "num_input_tokens_seen": 104327330, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.24963379, + "step": 4837, + "time_per_iteration": 2.839606761932373 + }, + { + "auxiliary_loss_clip": 0.01526446, + "auxiliary_loss_mlp": 0.01042371, + "balance_loss_clip": 1.33165526, + "balance_loss_mlp": 1.01805234, + "epoch": 0.2908762964076356, + "flos": 35343530432640.0, + "grad_norm": 1.8571268875625093, + "language_loss": 0.68422246, + "learning_rate": 3.327556630259381e-06, + "loss": 0.70991063, + "num_input_tokens_seen": 104350350, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.24328613, + "step": 4838, + "time_per_iteration": 2.9503445625305176 + }, + { + "auxiliary_loss_clip": 0.01538116, + "auxiliary_loss_mlp": 0.01044978, + "balance_loss_clip": 1.34083331, + "balance_loss_mlp": 1.01976538, + "epoch": 0.29093641966030365, + "flos": 23086783992960.0, + "grad_norm": 1.8878325090666845, + "language_loss": 0.72596723, + "learning_rate": 3.327265315259095e-06, + "loss": 0.75179815, + "num_input_tokens_seen": 104369995, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.25219727, + "step": 4839, + "time_per_iteration": 4.245973587036133 + }, + { + "auxiliary_loss_clip": 0.01539673, + "auxiliary_loss_mlp": 0.01043853, + "balance_loss_clip": 1.34275234, + "balance_loss_mlp": 1.01933169, + "epoch": 0.2909965429129716, + "flos": 35969141289600.0, + "grad_norm": 1.7732923098622853, + "language_loss": 0.77105856, + "learning_rate": 3.326973949928776e-06, + "loss": 0.79689384, + "num_input_tokens_seen": 104392285, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.24523926, + "step": 4840, + "time_per_iteration": 2.9662835597991943 + }, + { + "auxiliary_loss_clip": 0.01523114, + "auxiliary_loss_mlp": 0.01040016, + "balance_loss_clip": 1.33011484, + "balance_loss_mlp": 1.01712835, + "epoch": 0.2910566661656396, + "flos": 30891690478080.0, + "grad_norm": 2.2190223097104806, + "language_loss": 0.61041629, + "learning_rate": 3.326682534279471e-06, + "loss": 0.6360476, + "num_input_tokens_seen": 104412640, + "router_z_loss_clip": 1.93066406, + "router_z_loss_mlp": 0.22875977, + "step": 4841, + "time_per_iteration": 2.891633987426758 + }, + { + "auxiliary_loss_clip": 0.01532875, + "auxiliary_loss_mlp": 0.01045133, + "balance_loss_clip": 1.33904815, + "balance_loss_mlp": 1.02064705, + "epoch": 0.29111678941830754, + "flos": 30022878965760.0, + "grad_norm": 1.3610948618208418, + "language_loss": 0.72327399, + "learning_rate": 3.326391068322232e-06, + "loss": 0.74905396, + "num_input_tokens_seen": 104435245, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.24487305, + "step": 4842, + "time_per_iteration": 2.936018466949463 + }, + { + "auxiliary_loss_clip": 0.01535815, + "auxiliary_loss_mlp": 0.01041489, + "balance_loss_clip": 1.34281409, + "balance_loss_mlp": 1.01892316, + "epoch": 0.2911769126709755, + "flos": 22867816567680.0, + "grad_norm": 1.5526561295763968, + "language_loss": 0.74521071, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.77098382, + "num_input_tokens_seen": 104455395, + "router_z_loss_clip": 1.93066406, + "router_z_loss_mlp": 0.22558594, + "step": 4843, + "time_per_iteration": 4.274471998214722 + }, + { + "auxiliary_loss_clip": 0.01526612, + "auxiliary_loss_mlp": 0.01040065, + "balance_loss_clip": 1.33098733, + "balance_loss_mlp": 1.01724863, + "epoch": 0.2912370359236435, + "flos": 21659876144640.0, + "grad_norm": 2.730233097054129, + "language_loss": 0.59739441, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.62306118, + "num_input_tokens_seen": 104473350, + "router_z_loss_clip": 1.95703125, + "router_z_loss_mlp": 0.22839355, + "step": 4844, + "time_per_iteration": 2.886984348297119 + }, + { + "auxiliary_loss_clip": 0.01553472, + "auxiliary_loss_mlp": 0.01043616, + "balance_loss_clip": 1.35481524, + "balance_loss_mlp": 1.0184387, + "epoch": 0.29129715917631144, + "flos": 22903632466560.0, + "grad_norm": 2.176980253439563, + "language_loss": 0.87435961, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.90033048, + "num_input_tokens_seen": 104492265, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.25195312, + "step": 4845, + "time_per_iteration": 2.8728482723236084 + }, + { + "auxiliary_loss_clip": 0.01541299, + "auxiliary_loss_mlp": 0.01048768, + "balance_loss_clip": 1.3456285, + "balance_loss_mlp": 1.02428246, + "epoch": 0.2913572824289794, + "flos": 22684710286080.0, + "grad_norm": 1.6837862498302114, + "language_loss": 0.67999321, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.70589387, + "num_input_tokens_seen": 104510755, + "router_z_loss_clip": 1.95703125, + "router_z_loss_mlp": 0.24462891, + "step": 4846, + "time_per_iteration": 2.8466150760650635 + }, + { + "auxiliary_loss_clip": 0.01514748, + "auxiliary_loss_mlp": 0.01045379, + "balance_loss_clip": 1.32501113, + "balance_loss_mlp": 1.02197862, + "epoch": 0.29141740568164737, + "flos": 23116491843840.0, + "grad_norm": 1.9915218051066874, + "language_loss": 0.71215254, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.73775381, + "num_input_tokens_seen": 104530830, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.23413086, + "step": 4847, + "time_per_iteration": 2.895432949066162 + }, + { + "auxiliary_loss_clip": 0.01538456, + "auxiliary_loss_mlp": 0.01041942, + "balance_loss_clip": 1.34541702, + "balance_loss_mlp": 1.01860094, + "epoch": 0.29147752893431533, + "flos": 23597617495680.0, + "grad_norm": 1.5592810047931016, + "language_loss": 0.74414724, + "learning_rate": 3.324641216731237e-06, + "loss": 0.76995122, + "num_input_tokens_seen": 104550115, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.23352051, + "step": 4848, + "time_per_iteration": 2.884173631668091 + }, + { + "auxiliary_loss_clip": 0.01525543, + "auxiliary_loss_mlp": 0.01044568, + "balance_loss_clip": 1.33031058, + "balance_loss_mlp": 1.02152491, + "epoch": 0.2915376521869833, + "flos": 20600945406720.0, + "grad_norm": 2.6944950377436427, + "language_loss": 0.77402794, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.79972905, + "num_input_tokens_seen": 104566255, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.23022461, + "step": 4849, + "time_per_iteration": 2.8488566875457764 + }, + { + "auxiliary_loss_clip": 0.0155703, + "auxiliary_loss_mlp": 0.01042905, + "balance_loss_clip": 1.35770774, + "balance_loss_mlp": 1.01876485, + "epoch": 0.29159777543965126, + "flos": 20820681993600.0, + "grad_norm": 1.8448419366586088, + "language_loss": 0.79740727, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.82340664, + "num_input_tokens_seen": 104585235, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.24121094, + "step": 4850, + "time_per_iteration": 2.9935688972473145 + }, + { + "auxiliary_loss_clip": 0.01520104, + "auxiliary_loss_mlp": 0.01044738, + "balance_loss_clip": 1.33030248, + "balance_loss_mlp": 1.02002621, + "epoch": 0.29165789869231923, + "flos": 24255017464320.0, + "grad_norm": 2.397375492551384, + "language_loss": 0.76534855, + "learning_rate": 3.323765612674296e-06, + "loss": 0.79099691, + "num_input_tokens_seen": 104605315, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.24694824, + "step": 4851, + "time_per_iteration": 2.8717474937438965 + }, + { + "auxiliary_loss_clip": 0.01531667, + "auxiliary_loss_mlp": 0.01042962, + "balance_loss_clip": 1.34110999, + "balance_loss_mlp": 1.0207417, + "epoch": 0.29171802194498725, + "flos": 28961776477440.0, + "grad_norm": 1.3208301180941222, + "language_loss": 0.78186846, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.80761474, + "num_input_tokens_seen": 104626055, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.22216797, + "step": 4852, + "time_per_iteration": 2.8919177055358887 + }, + { + "auxiliary_loss_clip": 0.01531064, + "auxiliary_loss_mlp": 0.0105082, + "balance_loss_clip": 1.33513737, + "balance_loss_mlp": 1.02628684, + "epoch": 0.2917781451976552, + "flos": 22607603867520.0, + "grad_norm": 1.8763100868142792, + "language_loss": 0.78700179, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.81282067, + "num_input_tokens_seen": 104646005, + "router_z_loss_clip": 1.95996094, + "router_z_loss_mlp": 0.24536133, + "step": 4853, + "time_per_iteration": 2.8952176570892334 + }, + { + "auxiliary_loss_clip": 0.01531968, + "auxiliary_loss_mlp": 0.01047224, + "balance_loss_clip": 1.33793628, + "balance_loss_mlp": 1.02419281, + "epoch": 0.2918382684503232, + "flos": 21582995950080.0, + "grad_norm": 3.878396843782532, + "language_loss": 0.88163388, + "learning_rate": 3.322889556841445e-06, + "loss": 0.90742582, + "num_input_tokens_seen": 104661620, + "router_z_loss_clip": 1.93945312, + "router_z_loss_mlp": 0.23022461, + "step": 4854, + "time_per_iteration": 2.844064712524414 + }, + { + "auxiliary_loss_clip": 0.01527517, + "auxiliary_loss_mlp": 0.01050233, + "balance_loss_clip": 1.33498228, + "balance_loss_mlp": 1.02517498, + "epoch": 0.29189839170299114, + "flos": 24364365442560.0, + "grad_norm": 1.744825566185345, + "language_loss": 0.8709482, + "learning_rate": 3.322597437887519e-06, + "loss": 0.89672565, + "num_input_tokens_seen": 104681445, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.25085449, + "step": 4855, + "time_per_iteration": 2.915282726287842 + }, + { + "auxiliary_loss_clip": 0.01282184, + "auxiliary_loss_mlp": 0.01044068, + "balance_loss_clip": 1.17160237, + "balance_loss_mlp": 1.02089334, + "epoch": 0.2919585149556591, + "flos": 71350813088640.0, + "grad_norm": 0.8062873874133749, + "language_loss": 0.6018914, + "learning_rate": 3.322305268780566e-06, + "loss": 0.6251539, + "num_input_tokens_seen": 104747945, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.23144531, + "step": 4856, + "time_per_iteration": 3.4645884037017822 + }, + { + "auxiliary_loss_clip": 0.01526537, + "auxiliary_loss_mlp": 0.01043622, + "balance_loss_clip": 1.33510208, + "balance_loss_mlp": 1.02127004, + "epoch": 0.2920186382083271, + "flos": 15641755799040.0, + "grad_norm": 2.0002598352783294, + "language_loss": 0.69057965, + "learning_rate": 3.322013049531664e-06, + "loss": 0.71628118, + "num_input_tokens_seen": 104766225, + "router_z_loss_clip": 1.91308594, + "router_z_loss_mlp": 0.22363281, + "step": 4857, + "time_per_iteration": 2.8321139812469482 + }, + { + "auxiliary_loss_clip": 0.0152362, + "auxiliary_loss_mlp": 0.01040764, + "balance_loss_clip": 1.33255827, + "balance_loss_mlp": 1.01741052, + "epoch": 0.29207876146099504, + "flos": 28377682364160.0, + "grad_norm": 2.3010382853208142, + "language_loss": 0.84899032, + "learning_rate": 3.321720780151895e-06, + "loss": 0.87463415, + "num_input_tokens_seen": 104785345, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.23364258, + "step": 4858, + "time_per_iteration": 2.8988840579986572 + }, + { + "auxiliary_loss_clip": 0.01528814, + "auxiliary_loss_mlp": 0.0103673, + "balance_loss_clip": 1.34009886, + "balance_loss_mlp": 1.01493883, + "epoch": 0.292138884713663, + "flos": 21880789096320.0, + "grad_norm": 2.0630649448216087, + "language_loss": 0.78247088, + "learning_rate": 3.321428460652342e-06, + "loss": 0.80812633, + "num_input_tokens_seen": 104804560, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.21789551, + "step": 4859, + "time_per_iteration": 2.86971116065979 + }, + { + "auxiliary_loss_clip": 0.01538906, + "auxiliary_loss_mlp": 0.01040792, + "balance_loss_clip": 1.34105766, + "balance_loss_mlp": 1.016891, + "epoch": 0.29219900796633097, + "flos": 21001435545600.0, + "grad_norm": 2.734799747402687, + "language_loss": 0.69530678, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.72110379, + "num_input_tokens_seen": 104821105, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.23913574, + "step": 4860, + "time_per_iteration": 2.835451364517212 + }, + { + "auxiliary_loss_clip": 0.01528173, + "auxiliary_loss_mlp": 0.01039757, + "balance_loss_clip": 1.339149, + "balance_loss_mlp": 1.01773906, + "epoch": 0.29225913121899894, + "flos": 35017522513920.0, + "grad_norm": 2.3296138396236263, + "language_loss": 0.76085657, + "learning_rate": 3.320843671338222e-06, + "loss": 0.78653586, + "num_input_tokens_seen": 104841440, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.22021484, + "step": 4861, + "time_per_iteration": 2.954454183578491 + }, + { + "auxiliary_loss_clip": 0.01535793, + "auxiliary_loss_mlp": 0.01042646, + "balance_loss_clip": 1.3438921, + "balance_loss_mlp": 1.01832724, + "epoch": 0.2923192544716669, + "flos": 13523306140800.0, + "grad_norm": 1.8386470673134288, + "language_loss": 0.92467928, + "learning_rate": 3.320551201545832e-06, + "loss": 0.95046365, + "num_input_tokens_seen": 104858210, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.24316406, + "step": 4862, + "time_per_iteration": 2.8857734203338623 + }, + { + "auxiliary_loss_clip": 0.01513345, + "auxiliary_loss_mlp": 0.01040262, + "balance_loss_clip": 1.32350779, + "balance_loss_mlp": 1.01690936, + "epoch": 0.29237937772433487, + "flos": 19472871335040.0, + "grad_norm": 2.1054205792569367, + "language_loss": 0.74526262, + "learning_rate": 3.320258681678008e-06, + "loss": 0.7707988, + "num_input_tokens_seen": 104875620, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.23364258, + "step": 4863, + "time_per_iteration": 2.8300278186798096 + }, + { + "auxiliary_loss_clip": 0.01520349, + "auxiliary_loss_mlp": 0.01042846, + "balance_loss_clip": 1.33192468, + "balance_loss_mlp": 1.02062535, + "epoch": 0.29243950097700283, + "flos": 20860479434880.0, + "grad_norm": 1.756460713055701, + "language_loss": 0.79002368, + "learning_rate": 3.319966111745842e-06, + "loss": 0.81565565, + "num_input_tokens_seen": 104894600, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.22229004, + "step": 4864, + "time_per_iteration": 2.8762505054473877 + }, + { + "auxiliary_loss_clip": 0.01538523, + "auxiliary_loss_mlp": 0.01043596, + "balance_loss_clip": 1.34326458, + "balance_loss_mlp": 1.0195868, + "epoch": 0.29249962422967085, + "flos": 23594088401280.0, + "grad_norm": 1.7791752497708395, + "language_loss": 0.82881081, + "learning_rate": 3.319673491760429e-06, + "loss": 0.85463202, + "num_input_tokens_seen": 104914530, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.2401123, + "step": 4865, + "time_per_iteration": 2.9375648498535156 + }, + { + "auxiliary_loss_clip": 0.01542075, + "auxiliary_loss_mlp": 0.0103878, + "balance_loss_clip": 1.34733713, + "balance_loss_mlp": 1.01510537, + "epoch": 0.2925597474823388, + "flos": 22283541475200.0, + "grad_norm": 1.9118610850609186, + "language_loss": 0.86229885, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.88810742, + "num_input_tokens_seen": 104933460, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.23669434, + "step": 4866, + "time_per_iteration": 2.8902969360351562 + }, + { + "auxiliary_loss_clip": 0.01518276, + "auxiliary_loss_mlp": 0.01042259, + "balance_loss_clip": 1.32930064, + "balance_loss_mlp": 1.01841736, + "epoch": 0.2926198707350068, + "flos": 34468565627520.0, + "grad_norm": 2.0160114293431532, + "language_loss": 0.76587665, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.79148197, + "num_input_tokens_seen": 104954495, + "router_z_loss_clip": 1.89160156, + "router_z_loss_mlp": 0.23840332, + "step": 4867, + "time_per_iteration": 3.008183002471924 + }, + { + "auxiliary_loss_clip": 0.01533303, + "auxiliary_loss_mlp": 0.01041746, + "balance_loss_clip": 1.33735347, + "balance_loss_mlp": 1.01759386, + "epoch": 0.29267999398767475, + "flos": 20713867724160.0, + "grad_norm": 1.960050782029983, + "language_loss": 0.74922061, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.77497113, + "num_input_tokens_seen": 104971915, + "router_z_loss_clip": 1.95703125, + "router_z_loss_mlp": 0.24157715, + "step": 4868, + "time_per_iteration": 4.286706209182739 + }, + { + "auxiliary_loss_clip": 0.01519556, + "auxiliary_loss_mlp": 0.01041295, + "balance_loss_clip": 1.33051467, + "balance_loss_mlp": 1.01803708, + "epoch": 0.2927401172403427, + "flos": 18377672250240.0, + "grad_norm": 1.4191968971136724, + "language_loss": 0.75609601, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.78170449, + "num_input_tokens_seen": 104991335, + "router_z_loss_clip": 1.89160156, + "router_z_loss_mlp": 0.2322998, + "step": 4869, + "time_per_iteration": 2.8878872394561768 + }, + { + "auxiliary_loss_clip": 0.01521719, + "auxiliary_loss_mlp": 0.01042349, + "balance_loss_clip": 1.33056462, + "balance_loss_mlp": 1.01861477, + "epoch": 0.2928002404930107, + "flos": 26115154704000.0, + "grad_norm": 1.6219859382381214, + "language_loss": 0.7743417, + "learning_rate": 3.318209641423088e-06, + "loss": 0.79998237, + "num_input_tokens_seen": 105012015, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.23730469, + "step": 4870, + "time_per_iteration": 2.9074721336364746 + }, + { + "auxiliary_loss_clip": 0.01547089, + "auxiliary_loss_mlp": 0.01042147, + "balance_loss_clip": 1.34937668, + "balance_loss_mlp": 1.01816177, + "epoch": 0.29286036374567864, + "flos": 21334682632320.0, + "grad_norm": 2.2010404276808573, + "language_loss": 0.69276273, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.71865511, + "num_input_tokens_seen": 105031460, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.23986816, + "step": 4871, + "time_per_iteration": 2.8736696243286133 + }, + { + "auxiliary_loss_clip": 0.01524867, + "auxiliary_loss_mlp": 0.01039018, + "balance_loss_clip": 1.33290839, + "balance_loss_mlp": 1.01634431, + "epoch": 0.2929204869983466, + "flos": 29581324531200.0, + "grad_norm": 3.9092162221117723, + "language_loss": 0.78789973, + "learning_rate": 3.317623751303933e-06, + "loss": 0.81353855, + "num_input_tokens_seen": 105052965, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.22668457, + "step": 4872, + "time_per_iteration": 2.9338669776916504 + }, + { + "auxiliary_loss_clip": 0.01535388, + "auxiliary_loss_mlp": 0.01042063, + "balance_loss_clip": 1.33821607, + "balance_loss_mlp": 1.01742244, + "epoch": 0.2929806102510146, + "flos": 19066499372160.0, + "grad_norm": 1.984474728434417, + "language_loss": 0.73049825, + "learning_rate": 3.317330731292164e-06, + "loss": 0.75627279, + "num_input_tokens_seen": 105071840, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.2467041, + "step": 4873, + "time_per_iteration": 2.8545923233032227 + }, + { + "auxiliary_loss_clip": 0.01545603, + "auxiliary_loss_mlp": 0.01043365, + "balance_loss_clip": 1.34888148, + "balance_loss_mlp": 1.01971364, + "epoch": 0.29304073350368254, + "flos": 21954140196480.0, + "grad_norm": 4.680627879807124, + "language_loss": 0.79091197, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.81680161, + "num_input_tokens_seen": 105089445, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.23657227, + "step": 4874, + "time_per_iteration": 4.320667028427124 + }, + { + "auxiliary_loss_clip": 0.01552841, + "auxiliary_loss_mlp": 0.01045757, + "balance_loss_clip": 1.3506248, + "balance_loss_mlp": 1.02117586, + "epoch": 0.2931008567563505, + "flos": 15459328189440.0, + "grad_norm": 2.058862323118028, + "language_loss": 0.78623843, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.81222439, + "num_input_tokens_seen": 105106210, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.24584961, + "step": 4875, + "time_per_iteration": 2.9040255546569824 + }, + { + "auxiliary_loss_clip": 0.01530395, + "auxiliary_loss_mlp": 0.01043111, + "balance_loss_clip": 1.33684039, + "balance_loss_mlp": 1.01832759, + "epoch": 0.29316098000901847, + "flos": 16992054921600.0, + "grad_norm": 2.092750627964291, + "language_loss": 0.7026211, + "learning_rate": 3.316451371581431e-06, + "loss": 0.72835618, + "num_input_tokens_seen": 105124200, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.2479248, + "step": 4876, + "time_per_iteration": 2.875425338745117 + }, + { + "auxiliary_loss_clip": 0.01514971, + "auxiliary_loss_mlp": 0.01043358, + "balance_loss_clip": 1.32243049, + "balance_loss_mlp": 1.02006412, + "epoch": 0.29322110326168643, + "flos": 16365086720640.0, + "grad_norm": 1.9180342404878234, + "language_loss": 0.83360881, + "learning_rate": 3.316158151823096e-06, + "loss": 0.85919201, + "num_input_tokens_seen": 105140400, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.23291016, + "step": 4877, + "time_per_iteration": 2.8471710681915283 + }, + { + "auxiliary_loss_clip": 0.01541036, + "auxiliary_loss_mlp": 0.01045883, + "balance_loss_clip": 1.34278631, + "balance_loss_mlp": 1.02213669, + "epoch": 0.29328122651435445, + "flos": 13998866682240.0, + "grad_norm": 1.9828485282389599, + "language_loss": 0.68991065, + "learning_rate": 3.315864882155911e-06, + "loss": 0.71577978, + "num_input_tokens_seen": 105157535, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.23754883, + "step": 4878, + "time_per_iteration": 4.361980676651001 + }, + { + "auxiliary_loss_clip": 0.01521252, + "auxiliary_loss_mlp": 0.01042926, + "balance_loss_clip": 1.3283515, + "balance_loss_mlp": 1.01915598, + "epoch": 0.2933413497670224, + "flos": 25275417615360.0, + "grad_norm": 2.589905879694502, + "language_loss": 0.74170744, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.76734924, + "num_input_tokens_seen": 105175185, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.2376709, + "step": 4879, + "time_per_iteration": 2.876278877258301 + }, + { + "auxiliary_loss_clip": 0.01543646, + "auxiliary_loss_mlp": 0.01049638, + "balance_loss_clip": 1.34660339, + "balance_loss_mlp": 1.02425814, + "epoch": 0.2934014730196904, + "flos": 32136306451200.0, + "grad_norm": 2.1242386139852814, + "language_loss": 0.67465305, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.70058584, + "num_input_tokens_seen": 105194540, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.25378418, + "step": 4880, + "time_per_iteration": 2.945148229598999 + }, + { + "auxiliary_loss_clip": 0.01527222, + "auxiliary_loss_mlp": 0.01051452, + "balance_loss_clip": 1.3306942, + "balance_loss_mlp": 1.02724075, + "epoch": 0.29346159627235835, + "flos": 24363098588160.0, + "grad_norm": 2.7265221894160603, + "language_loss": 0.71906066, + "learning_rate": 3.314984773812481e-06, + "loss": 0.74484748, + "num_input_tokens_seen": 105213215, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.24206543, + "step": 4881, + "time_per_iteration": 2.8709263801574707 + }, + { + "auxiliary_loss_clip": 0.01529342, + "auxiliary_loss_mlp": 0.01039117, + "balance_loss_clip": 1.33305657, + "balance_loss_mlp": 1.01606202, + "epoch": 0.2935217195250263, + "flos": 22756839776640.0, + "grad_norm": 1.6083957014103831, + "language_loss": 0.84018385, + "learning_rate": 3.314691304621127e-06, + "loss": 0.86586845, + "num_input_tokens_seen": 105231585, + "router_z_loss_clip": 1.96191406, + "router_z_loss_mlp": 0.23071289, + "step": 4882, + "time_per_iteration": 2.867452621459961 + }, + { + "auxiliary_loss_clip": 0.01557338, + "auxiliary_loss_mlp": 0.0104781, + "balance_loss_clip": 1.35457015, + "balance_loss_mlp": 1.02223921, + "epoch": 0.2935818427776943, + "flos": 21735489484800.0, + "grad_norm": 2.1191585621505906, + "language_loss": 0.73101318, + "learning_rate": 3.314397785576548e-06, + "loss": 0.75706458, + "num_input_tokens_seen": 105250120, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.25598145, + "step": 4883, + "time_per_iteration": 2.8862459659576416 + }, + { + "auxiliary_loss_clip": 0.01539015, + "auxiliary_loss_mlp": 0.01044109, + "balance_loss_clip": 1.34251916, + "balance_loss_mlp": 1.02051747, + "epoch": 0.29364196603036224, + "flos": 23815363311360.0, + "grad_norm": 4.398786132718032, + "language_loss": 0.93146265, + "learning_rate": 3.3141042166898726e-06, + "loss": 0.95729387, + "num_input_tokens_seen": 105266065, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.23571777, + "step": 4884, + "time_per_iteration": 2.861205577850342 + }, + { + "auxiliary_loss_clip": 0.01548594, + "auxiliary_loss_mlp": 0.01044523, + "balance_loss_clip": 1.35051203, + "balance_loss_mlp": 1.01976347, + "epoch": 0.2937020892830302, + "flos": 23478677619840.0, + "grad_norm": 3.1338603900124498, + "language_loss": 0.7448709, + "learning_rate": 3.313810597972234e-06, + "loss": 0.77080214, + "num_input_tokens_seen": 105282155, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.24768066, + "step": 4885, + "time_per_iteration": 2.8936352729797363 + }, + { + "auxiliary_loss_clip": 0.0153208, + "auxiliary_loss_mlp": 0.01044016, + "balance_loss_clip": 1.33726823, + "balance_loss_mlp": 1.02096057, + "epoch": 0.2937622125356982, + "flos": 24281558179200.0, + "grad_norm": 1.8083308356850656, + "language_loss": 0.86111861, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.88687956, + "num_input_tokens_seen": 105299225, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.23046875, + "step": 4886, + "time_per_iteration": 2.8527188301086426 + }, + { + "auxiliary_loss_clip": 0.01531342, + "auxiliary_loss_mlp": 0.01042517, + "balance_loss_clip": 1.3345468, + "balance_loss_mlp": 1.01980758, + "epoch": 0.29382233578836614, + "flos": 20670812657280.0, + "grad_norm": 3.93025235179736, + "language_loss": 0.77943081, + "learning_rate": 3.313223211088603e-06, + "loss": 0.8051694, + "num_input_tokens_seen": 105315710, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.22692871, + "step": 4887, + "time_per_iteration": 2.857243776321411 + }, + { + "auxiliary_loss_clip": 0.01533465, + "auxiliary_loss_mlp": 0.01042123, + "balance_loss_clip": 1.33469582, + "balance_loss_mlp": 1.01964056, + "epoch": 0.2938824590410341, + "flos": 16553758112640.0, + "grad_norm": 4.884754183185183, + "language_loss": 0.80419457, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.82995045, + "num_input_tokens_seen": 105333505, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.22460938, + "step": 4888, + "time_per_iteration": 2.89253306388855 + }, + { + "auxiliary_loss_clip": 0.01526394, + "auxiliary_loss_mlp": 0.0104772, + "balance_loss_clip": 1.33136714, + "balance_loss_mlp": 1.02329421, + "epoch": 0.29394258229370207, + "flos": 37939667137920.0, + "grad_norm": 1.4564926487818959, + "language_loss": 0.55962968, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.58537078, + "num_input_tokens_seen": 105355605, + "router_z_loss_clip": 1.94921875, + "router_z_loss_mlp": 0.24438477, + "step": 4889, + "time_per_iteration": 3.0274593830108643 + }, + { + "auxiliary_loss_clip": 0.01530776, + "auxiliary_loss_mlp": 0.01050102, + "balance_loss_clip": 1.33372796, + "balance_loss_mlp": 1.02592587, + "epoch": 0.29400270554637004, + "flos": 20053526843520.0, + "grad_norm": 2.7510419294350785, + "language_loss": 0.85621178, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.88202059, + "num_input_tokens_seen": 105374225, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.24194336, + "step": 4890, + "time_per_iteration": 2.86726713180542 + }, + { + "auxiliary_loss_clip": 0.01534436, + "auxiliary_loss_mlp": 0.01050776, + "balance_loss_clip": 1.33387566, + "balance_loss_mlp": 1.02546811, + "epoch": 0.294062828799038, + "flos": 15274321626240.0, + "grad_norm": 2.0845394886231383, + "language_loss": 0.73848218, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.76433432, + "num_input_tokens_seen": 105391565, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.25305176, + "step": 4891, + "time_per_iteration": 2.8367621898651123 + }, + { + "auxiliary_loss_clip": 0.01535862, + "auxiliary_loss_mlp": 0.01046759, + "balance_loss_clip": 1.33701372, + "balance_loss_mlp": 1.0226903, + "epoch": 0.294122952051706, + "flos": 22757337469440.0, + "grad_norm": 2.3204329272565474, + "language_loss": 0.78384876, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.80967498, + "num_input_tokens_seen": 105409840, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.24084473, + "step": 4892, + "time_per_iteration": 2.8875036239624023 + }, + { + "auxiliary_loss_clip": 0.01522769, + "auxiliary_loss_mlp": 0.01036759, + "balance_loss_clip": 1.32821631, + "balance_loss_mlp": 1.01359642, + "epoch": 0.294183075304374, + "flos": 24983687272320.0, + "grad_norm": 1.6372796940592154, + "language_loss": 0.78318542, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.80878073, + "num_input_tokens_seen": 105428645, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.23181152, + "step": 4893, + "time_per_iteration": 3.059711456298828 + }, + { + "auxiliary_loss_clip": 0.01537031, + "auxiliary_loss_mlp": 0.01048844, + "balance_loss_clip": 1.34012079, + "balance_loss_mlp": 1.02534842, + "epoch": 0.29424319855704195, + "flos": 30964408151040.0, + "grad_norm": 1.66569711605378, + "language_loss": 0.85364348, + "learning_rate": 3.311165788957864e-06, + "loss": 0.87950224, + "num_input_tokens_seen": 105447480, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.23510742, + "step": 4894, + "time_per_iteration": 2.9443840980529785 + }, + { + "auxiliary_loss_clip": 0.01540318, + "auxiliary_loss_mlp": 0.01046772, + "balance_loss_clip": 1.34008622, + "balance_loss_mlp": 1.02300119, + "epoch": 0.2943033218097099, + "flos": 15239953560960.0, + "grad_norm": 2.6815384010521788, + "language_loss": 0.91394985, + "learning_rate": 3.310871672543274e-06, + "loss": 0.93982077, + "num_input_tokens_seen": 105464600, + "router_z_loss_clip": 2.00390625, + "router_z_loss_mlp": 0.23742676, + "step": 4895, + "time_per_iteration": 2.908296585083008 + }, + { + "auxiliary_loss_clip": 0.01535644, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_clip": 1.33466816, + "balance_loss_mlp": 1.0193156, + "epoch": 0.2943634450623779, + "flos": 21735896688000.0, + "grad_norm": 2.661632450135291, + "language_loss": 0.87944651, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.90524942, + "num_input_tokens_seen": 105481510, + "router_z_loss_clip": 2.00488281, + "router_z_loss_mlp": 0.25354004, + "step": 4896, + "time_per_iteration": 2.8837602138519287 + }, + { + "auxiliary_loss_clip": 0.01539619, + "auxiliary_loss_mlp": 0.01048052, + "balance_loss_clip": 1.34047794, + "balance_loss_mlp": 1.02415061, + "epoch": 0.29442356831504585, + "flos": 22612490305920.0, + "grad_norm": 1.7921180394575458, + "language_loss": 0.74468344, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.77056015, + "num_input_tokens_seen": 105501390, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.23901367, + "step": 4897, + "time_per_iteration": 2.860987424850464 + }, + { + "auxiliary_loss_clip": 0.01544886, + "auxiliary_loss_mlp": 0.01047452, + "balance_loss_clip": 1.34012115, + "balance_loss_mlp": 1.02223957, + "epoch": 0.2944836915677138, + "flos": 20021059059840.0, + "grad_norm": 1.8816052502821425, + "language_loss": 0.75124109, + "learning_rate": 3.309989025093813e-06, + "loss": 0.77716446, + "num_input_tokens_seen": 105519600, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.25219727, + "step": 4898, + "time_per_iteration": 2.83255672454834 + }, + { + "auxiliary_loss_clip": 0.01544546, + "auxiliary_loss_mlp": 0.01051989, + "balance_loss_clip": 1.34175086, + "balance_loss_mlp": 1.0263474, + "epoch": 0.2945438148203818, + "flos": 20055019921920.0, + "grad_norm": 6.539752892255044, + "language_loss": 0.71773231, + "learning_rate": 3.309694709912618e-06, + "loss": 0.74369764, + "num_input_tokens_seen": 105535970, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.2565918, + "step": 4899, + "time_per_iteration": 2.8461575508117676 + }, + { + "auxiliary_loss_clip": 0.01519946, + "auxiliary_loss_mlp": 0.0105425, + "balance_loss_clip": 1.32440925, + "balance_loss_mlp": 1.02929926, + "epoch": 0.29460393807304974, + "flos": 23744319696000.0, + "grad_norm": 1.9878150960211893, + "language_loss": 0.80258918, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.82833111, + "num_input_tokens_seen": 105556735, + "router_z_loss_clip": 1.95410156, + "router_z_loss_mlp": 0.24975586, + "step": 4900, + "time_per_iteration": 2.876877784729004 + }, + { + "auxiliary_loss_clip": 0.01522909, + "auxiliary_loss_mlp": 0.01045345, + "balance_loss_clip": 1.32488346, + "balance_loss_mlp": 1.02116942, + "epoch": 0.2946640613257177, + "flos": 14984355830400.0, + "grad_norm": 1.8196305564122879, + "language_loss": 0.81531596, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.84099853, + "num_input_tokens_seen": 105574875, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.24182129, + "step": 4901, + "time_per_iteration": 2.814955949783325 + }, + { + "auxiliary_loss_clip": 0.0151258, + "auxiliary_loss_mlp": 0.0104096, + "balance_loss_clip": 1.32187533, + "balance_loss_mlp": 1.0174644, + "epoch": 0.2947241845783857, + "flos": 24254700750720.0, + "grad_norm": 3.5167816290111147, + "language_loss": 0.58869886, + "learning_rate": 3.308811466431157e-06, + "loss": 0.61423433, + "num_input_tokens_seen": 105594225, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.23498535, + "step": 4902, + "time_per_iteration": 2.891887903213501 + }, + { + "auxiliary_loss_clip": 0.01513142, + "auxiliary_loss_mlp": 0.0104231, + "balance_loss_clip": 1.31874824, + "balance_loss_mlp": 1.01802659, + "epoch": 0.29478430783105364, + "flos": 19948024673280.0, + "grad_norm": 1.566105891320166, + "language_loss": 0.7690345, + "learning_rate": 3.308516952661925e-06, + "loss": 0.79458904, + "num_input_tokens_seen": 105614000, + "router_z_loss_clip": 1.94335938, + "router_z_loss_mlp": 0.24291992, + "step": 4903, + "time_per_iteration": 4.317591190338135 + }, + { + "auxiliary_loss_clip": 0.01533441, + "auxiliary_loss_mlp": 0.0104496, + "balance_loss_clip": 1.33746862, + "balance_loss_mlp": 1.01925826, + "epoch": 0.2948444310837216, + "flos": 27392871888000.0, + "grad_norm": 1.8117101815746506, + "language_loss": 0.62750602, + "learning_rate": 3.3082223892736e-06, + "loss": 0.65329003, + "num_input_tokens_seen": 105634575, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.25720215, + "step": 4904, + "time_per_iteration": 2.8760673999786377 + }, + { + "auxiliary_loss_clip": 0.01533691, + "auxiliary_loss_mlp": 0.01043781, + "balance_loss_clip": 1.33501124, + "balance_loss_mlp": 1.01924741, + "epoch": 0.2949045543363896, + "flos": 23416230516480.0, + "grad_norm": 4.5737289393405165, + "language_loss": 0.73943007, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.76520479, + "num_input_tokens_seen": 105654385, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.24536133, + "step": 4905, + "time_per_iteration": 2.903163433074951 + }, + { + "auxiliary_loss_clip": 0.01529577, + "auxiliary_loss_mlp": 0.01041284, + "balance_loss_clip": 1.33389771, + "balance_loss_mlp": 1.01609468, + "epoch": 0.2949646775890576, + "flos": 23962065511680.0, + "grad_norm": 1.5996083176144953, + "language_loss": 0.81970352, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.84541214, + "num_input_tokens_seen": 105673570, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.25183105, + "step": 4906, + "time_per_iteration": 2.906320571899414 + }, + { + "auxiliary_loss_clip": 0.01520631, + "auxiliary_loss_mlp": 0.01039048, + "balance_loss_clip": 1.32962823, + "balance_loss_mlp": 1.01530206, + "epoch": 0.29502480084172555, + "flos": 22794691691520.0, + "grad_norm": 2.9017536736950533, + "language_loss": 0.88206303, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.90765977, + "num_input_tokens_seen": 105691940, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.23742676, + "step": 4907, + "time_per_iteration": 2.9583616256713867 + }, + { + "auxiliary_loss_clip": 0.01556811, + "auxiliary_loss_mlp": 0.01045705, + "balance_loss_clip": 1.3575058, + "balance_loss_mlp": 1.01946676, + "epoch": 0.2950849240943935, + "flos": 19656249085440.0, + "grad_norm": 2.2702432711454947, + "language_loss": 0.83048761, + "learning_rate": 3.307043639752782e-06, + "loss": 0.85651273, + "num_input_tokens_seen": 105709825, + "router_z_loss_clip": 1.99023438, + "router_z_loss_mlp": 0.2623291, + "step": 4908, + "time_per_iteration": 2.87441349029541 + }, + { + "auxiliary_loss_clip": 0.01274833, + "auxiliary_loss_mlp": 0.01023378, + "balance_loss_clip": 1.16208935, + "balance_loss_mlp": 0.99839133, + "epoch": 0.2951450473470615, + "flos": 71031121459200.0, + "grad_norm": 0.7829443288956217, + "language_loss": 0.57302344, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.59600562, + "num_input_tokens_seen": 105766880, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.25, + "step": 4909, + "time_per_iteration": 4.642868518829346 + }, + { + "auxiliary_loss_clip": 0.01512257, + "auxiliary_loss_mlp": 0.01040394, + "balance_loss_clip": 1.31901681, + "balance_loss_mlp": 1.01706493, + "epoch": 0.29520517059972945, + "flos": 22976621608320.0, + "grad_norm": 1.4920629757163228, + "language_loss": 0.87257779, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.89810431, + "num_input_tokens_seen": 105786875, + "router_z_loss_clip": 1.93164062, + "router_z_loss_mlp": 0.23352051, + "step": 4910, + "time_per_iteration": 2.8913161754608154 + }, + { + "auxiliary_loss_clip": 0.01511527, + "auxiliary_loss_mlp": 0.01045439, + "balance_loss_clip": 1.31962204, + "balance_loss_mlp": 1.02062011, + "epoch": 0.2952652938523974, + "flos": 20495443236480.0, + "grad_norm": 1.735403264659366, + "language_loss": 0.73970205, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.76527172, + "num_input_tokens_seen": 105805315, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.24841309, + "step": 4911, + "time_per_iteration": 2.8653385639190674 + }, + { + "auxiliary_loss_clip": 0.01515409, + "auxiliary_loss_mlp": 0.01040059, + "balance_loss_clip": 1.32649553, + "balance_loss_mlp": 1.01533473, + "epoch": 0.2953254171050654, + "flos": 19656158595840.0, + "grad_norm": 2.611400387935349, + "language_loss": 0.90530622, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.93086082, + "num_input_tokens_seen": 105825125, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.24731445, + "step": 4912, + "time_per_iteration": 5.723008394241333 + }, + { + "auxiliary_loss_clip": 0.01521487, + "auxiliary_loss_mlp": 0.01046157, + "balance_loss_clip": 1.32773948, + "balance_loss_mlp": 1.02163577, + "epoch": 0.29538554035773334, + "flos": 22758378099840.0, + "grad_norm": 1.955514526276525, + "language_loss": 0.84367937, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.8693558, + "num_input_tokens_seen": 105846085, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.24536133, + "step": 4913, + "time_per_iteration": 2.8549680709838867 + }, + { + "auxiliary_loss_clip": 0.01510345, + "auxiliary_loss_mlp": 0.01043727, + "balance_loss_clip": 1.31843436, + "balance_loss_mlp": 1.01955128, + "epoch": 0.2954456636104013, + "flos": 21881920216320.0, + "grad_norm": 1.9858125470664223, + "language_loss": 0.77735823, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.80289888, + "num_input_tokens_seen": 105865400, + "router_z_loss_clip": 1.91503906, + "router_z_loss_mlp": 0.24169922, + "step": 4914, + "time_per_iteration": 2.876253366470337 + }, + { + "auxiliary_loss_clip": 0.01514595, + "auxiliary_loss_mlp": 0.01039453, + "balance_loss_clip": 1.32313275, + "balance_loss_mlp": 1.01471758, + "epoch": 0.2955057868630693, + "flos": 40457294835840.0, + "grad_norm": 1.7967215235318004, + "language_loss": 0.82578081, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.85132128, + "num_input_tokens_seen": 105887920, + "router_z_loss_clip": 1.91503906, + "router_z_loss_mlp": 0.24755859, + "step": 4915, + "time_per_iteration": 3.078629970550537 + }, + { + "auxiliary_loss_clip": 0.01533861, + "auxiliary_loss_mlp": 0.01044353, + "balance_loss_clip": 1.33960664, + "balance_loss_mlp": 1.01968884, + "epoch": 0.29556591011573724, + "flos": 22574593146240.0, + "grad_norm": 1.8510158137637434, + "language_loss": 0.85425466, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.88003671, + "num_input_tokens_seen": 105904035, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.24694824, + "step": 4916, + "time_per_iteration": 2.8459367752075195 + }, + { + "auxiliary_loss_clip": 0.01531898, + "auxiliary_loss_mlp": 0.01038162, + "balance_loss_clip": 1.33787382, + "balance_loss_mlp": 1.01380754, + "epoch": 0.2956260333684052, + "flos": 22099032604800.0, + "grad_norm": 1.7742324559194904, + "language_loss": 0.70701897, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.73271966, + "num_input_tokens_seen": 105922685, + "router_z_loss_clip": 1.94335938, + "router_z_loss_mlp": 0.24389648, + "step": 4917, + "time_per_iteration": 2.854931354522705 + }, + { + "auxiliary_loss_clip": 0.01529164, + "auxiliary_loss_mlp": 0.01039537, + "balance_loss_clip": 1.33460867, + "balance_loss_mlp": 1.01562333, + "epoch": 0.2956861566210732, + "flos": 16444138665600.0, + "grad_norm": 2.1228934180457837, + "language_loss": 0.91581422, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.9415012, + "num_input_tokens_seen": 105940425, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.23913574, + "step": 4918, + "time_per_iteration": 2.838308334350586 + }, + { + "auxiliary_loss_clip": 0.01529754, + "auxiliary_loss_mlp": 0.01038648, + "balance_loss_clip": 1.33318543, + "balance_loss_mlp": 1.01370931, + "epoch": 0.2957462798737412, + "flos": 25823107647360.0, + "grad_norm": 2.001310918680199, + "language_loss": 0.73416328, + "learning_rate": 3.303797991757425e-06, + "loss": 0.75984728, + "num_input_tokens_seen": 105960550, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.24926758, + "step": 4919, + "time_per_iteration": 2.8938424587249756 + }, + { + "auxiliary_loss_clip": 0.01517927, + "auxiliary_loss_mlp": 0.01043835, + "balance_loss_clip": 1.32574129, + "balance_loss_mlp": 1.0194329, + "epoch": 0.29580640312640916, + "flos": 16699555416960.0, + "grad_norm": 1.7684296112037556, + "language_loss": 0.77451813, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.80013573, + "num_input_tokens_seen": 105978820, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.2442627, + "step": 4920, + "time_per_iteration": 2.852980375289917 + }, + { + "auxiliary_loss_clip": 0.0153525, + "auxiliary_loss_mlp": 0.01045647, + "balance_loss_clip": 1.33689332, + "balance_loss_mlp": 1.02019632, + "epoch": 0.2958665263790771, + "flos": 23954645364480.0, + "grad_norm": 2.881678797187391, + "language_loss": 0.69476038, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.72056937, + "num_input_tokens_seen": 105997545, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.25439453, + "step": 4921, + "time_per_iteration": 2.878619432449341 + }, + { + "auxiliary_loss_clip": 0.01550206, + "auxiliary_loss_mlp": 0.01047892, + "balance_loss_clip": 1.3486526, + "balance_loss_mlp": 1.02239323, + "epoch": 0.2959266496317451, + "flos": 18487517921280.0, + "grad_norm": 1.889584123608708, + "language_loss": 0.7557826, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.78176355, + "num_input_tokens_seen": 106015320, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.25512695, + "step": 4922, + "time_per_iteration": 2.823582887649536 + }, + { + "auxiliary_loss_clip": 0.01549174, + "auxiliary_loss_mlp": 0.01042332, + "balance_loss_clip": 1.34748793, + "balance_loss_mlp": 1.01907468, + "epoch": 0.29598677288441305, + "flos": 25968407258880.0, + "grad_norm": 1.9237383426609478, + "language_loss": 0.77484345, + "learning_rate": 3.302616272134737e-06, + "loss": 0.8007586, + "num_input_tokens_seen": 106034555, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.23242188, + "step": 4923, + "time_per_iteration": 2.8569960594177246 + }, + { + "auxiliary_loss_clip": 0.01537164, + "auxiliary_loss_mlp": 0.01043409, + "balance_loss_clip": 1.34068477, + "balance_loss_mlp": 1.01937628, + "epoch": 0.296046896137081, + "flos": 25167019777920.0, + "grad_norm": 2.406129168654899, + "language_loss": 0.87425554, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.90006125, + "num_input_tokens_seen": 106054200, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.24047852, + "step": 4924, + "time_per_iteration": 2.894080877304077 + }, + { + "auxiliary_loss_clip": 0.01524385, + "auxiliary_loss_mlp": 0.01038871, + "balance_loss_clip": 1.3308717, + "balance_loss_mlp": 1.0135746, + "epoch": 0.296107019389749, + "flos": 21770852935680.0, + "grad_norm": 1.4736664396761814, + "language_loss": 0.82107008, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.84670264, + "num_input_tokens_seen": 106074700, + "router_z_loss_clip": 1.93359375, + "router_z_loss_mlp": 0.25305176, + "step": 4925, + "time_per_iteration": 2.867567300796509 + }, + { + "auxiliary_loss_clip": 0.01530033, + "auxiliary_loss_mlp": 0.01040193, + "balance_loss_clip": 1.33626437, + "balance_loss_mlp": 1.01648271, + "epoch": 0.29616714264241695, + "flos": 17967047276160.0, + "grad_norm": 3.0619678910917556, + "language_loss": 0.87820709, + "learning_rate": 3.301729463727452e-06, + "loss": 0.90390933, + "num_input_tokens_seen": 106091415, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.23706055, + "step": 4926, + "time_per_iteration": 2.9052720069885254 + }, + { + "auxiliary_loss_clip": 0.01548832, + "auxiliary_loss_mlp": 0.01044928, + "balance_loss_clip": 1.35090256, + "balance_loss_mlp": 1.021873, + "epoch": 0.2962272658950849, + "flos": 15021121870080.0, + "grad_norm": 2.0302793594268245, + "language_loss": 0.87627745, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.902215, + "num_input_tokens_seen": 106109135, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.23071289, + "step": 4927, + "time_per_iteration": 2.831144332885742 + }, + { + "auxiliary_loss_clip": 0.01527697, + "auxiliary_loss_mlp": 0.01040979, + "balance_loss_clip": 1.33528423, + "balance_loss_mlp": 1.01862741, + "epoch": 0.2962873891477529, + "flos": 14729120058240.0, + "grad_norm": 1.7805875483029454, + "language_loss": 0.81284195, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.83852869, + "num_input_tokens_seen": 106125750, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.22351074, + "step": 4928, + "time_per_iteration": 2.859617233276367 + }, + { + "auxiliary_loss_clip": 0.01541598, + "auxiliary_loss_mlp": 0.01048103, + "balance_loss_clip": 1.33818555, + "balance_loss_mlp": 1.02264023, + "epoch": 0.29634751240042084, + "flos": 26734928981760.0, + "grad_norm": 2.3288392180537456, + "language_loss": 0.73596847, + "learning_rate": 3.300842211064773e-06, + "loss": 0.7618655, + "num_input_tokens_seen": 106142835, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.25500488, + "step": 4929, + "time_per_iteration": 2.881685972213745 + }, + { + "auxiliary_loss_clip": 0.01547543, + "auxiliary_loss_mlp": 0.01045848, + "balance_loss_clip": 1.34762061, + "balance_loss_mlp": 1.02074289, + "epoch": 0.2964076356530888, + "flos": 14578572049920.0, + "grad_norm": 2.2212626972885823, + "language_loss": 0.73724723, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.76318115, + "num_input_tokens_seen": 106160680, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.25109863, + "step": 4930, + "time_per_iteration": 2.8367295265197754 + }, + { + "auxiliary_loss_clip": 0.01278756, + "auxiliary_loss_mlp": 0.0102704, + "balance_loss_clip": 1.16273761, + "balance_loss_mlp": 1.00567782, + "epoch": 0.29646775890575683, + "flos": 63135462608640.0, + "grad_norm": 0.8263645429582543, + "language_loss": 0.60752988, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.63058788, + "num_input_tokens_seen": 106224415, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.21386719, + "step": 4931, + "time_per_iteration": 3.326747179031372 + }, + { + "auxiliary_loss_clip": 0.01278786, + "auxiliary_loss_mlp": 0.01025153, + "balance_loss_clip": 1.16257524, + "balance_loss_mlp": 1.00627029, + "epoch": 0.2965278821584248, + "flos": 63098787058560.0, + "grad_norm": 0.7453569473888378, + "language_loss": 0.52513027, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.54816967, + "num_input_tokens_seen": 106279140, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.18847656, + "step": 4932, + "time_per_iteration": 3.1977884769439697 + }, + { + "auxiliary_loss_clip": 0.01527654, + "auxiliary_loss_mlp": 0.01036676, + "balance_loss_clip": 1.3333993, + "balance_loss_mlp": 1.01442003, + "epoch": 0.29658800541109276, + "flos": 23779185454080.0, + "grad_norm": 1.622578645638351, + "language_loss": 0.82749587, + "learning_rate": 3.299658516973972e-06, + "loss": 0.85313922, + "num_input_tokens_seen": 106298190, + "router_z_loss_clip": 1.94140625, + "router_z_loss_mlp": 0.22265625, + "step": 4933, + "time_per_iteration": 2.9025139808654785 + }, + { + "auxiliary_loss_clip": 0.01520699, + "auxiliary_loss_mlp": 0.01042828, + "balance_loss_clip": 1.32953, + "balance_loss_mlp": 1.01828289, + "epoch": 0.2966481286637607, + "flos": 23999283999360.0, + "grad_norm": 2.2530013668319535, + "language_loss": 0.76106381, + "learning_rate": 3.299362470215261e-06, + "loss": 0.78669918, + "num_input_tokens_seen": 106319065, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.2454834, + "step": 4934, + "time_per_iteration": 2.9833431243896484 + }, + { + "auxiliary_loss_clip": 0.01540117, + "auxiliary_loss_mlp": 0.01049731, + "balance_loss_clip": 1.34180045, + "balance_loss_mlp": 1.02397013, + "epoch": 0.2967082519164287, + "flos": 17173984838400.0, + "grad_norm": 1.6968028482853748, + "language_loss": 0.63725448, + "learning_rate": 3.299066374184594e-06, + "loss": 0.66315293, + "num_input_tokens_seen": 106338040, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.25744629, + "step": 4935, + "time_per_iteration": 2.817028760910034 + }, + { + "auxiliary_loss_clip": 0.01531671, + "auxiliary_loss_mlp": 0.0104657, + "balance_loss_clip": 1.33714342, + "balance_loss_mlp": 1.02291894, + "epoch": 0.29676837516909665, + "flos": 29399666083200.0, + "grad_norm": 1.5728541186778446, + "language_loss": 0.80157572, + "learning_rate": 3.2987702288932e-06, + "loss": 0.82735807, + "num_input_tokens_seen": 106358900, + "router_z_loss_clip": 1.94335938, + "router_z_loss_mlp": 0.2364502, + "step": 4936, + "time_per_iteration": 3.0583746433258057 + }, + { + "auxiliary_loss_clip": 0.01548331, + "auxiliary_loss_mlp": 0.01044851, + "balance_loss_clip": 1.34896338, + "balance_loss_mlp": 1.02092528, + "epoch": 0.2968284984217646, + "flos": 34764594226560.0, + "grad_norm": 1.5952664404043264, + "language_loss": 0.75325072, + "learning_rate": 3.298474034352309e-06, + "loss": 0.77918255, + "num_input_tokens_seen": 106381805, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.23925781, + "step": 4937, + "time_per_iteration": 2.949100971221924 + }, + { + "auxiliary_loss_clip": 0.015389, + "auxiliary_loss_mlp": 0.01044432, + "balance_loss_clip": 1.34227467, + "balance_loss_mlp": 1.01986337, + "epoch": 0.2968886216744326, + "flos": 21554238240000.0, + "grad_norm": 1.8662305753533548, + "language_loss": 0.78422713, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.8100605, + "num_input_tokens_seen": 106402365, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.24597168, + "step": 4938, + "time_per_iteration": 2.8709776401519775 + }, + { + "auxiliary_loss_clip": 0.0154116, + "auxiliary_loss_mlp": 0.01045951, + "balance_loss_clip": 1.33958232, + "balance_loss_mlp": 1.02096462, + "epoch": 0.29694874492710055, + "flos": 12795134025600.0, + "grad_norm": 1.9466539686794282, + "language_loss": 0.7774719, + "learning_rate": 3.297881497566964e-06, + "loss": 0.803343, + "num_input_tokens_seen": 106419800, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.25, + "step": 4939, + "time_per_iteration": 4.238704442977905 + }, + { + "auxiliary_loss_clip": 0.01543144, + "auxiliary_loss_mlp": 0.0104763, + "balance_loss_clip": 1.34260213, + "balance_loss_mlp": 1.02397943, + "epoch": 0.2970088681797685, + "flos": 24580256221440.0, + "grad_norm": 1.6596308280847512, + "language_loss": 0.79094684, + "learning_rate": 3.297585155344979e-06, + "loss": 0.8168546, + "num_input_tokens_seen": 106440300, + "router_z_loss_clip": 2.00488281, + "router_z_loss_mlp": 0.23657227, + "step": 4940, + "time_per_iteration": 2.878542184829712 + }, + { + "auxiliary_loss_clip": 0.01544533, + "auxiliary_loss_mlp": 0.01048367, + "balance_loss_clip": 1.34374642, + "balance_loss_mlp": 1.0223552, + "epoch": 0.2970689914324365, + "flos": 23669566007040.0, + "grad_norm": 1.6658249036613395, + "language_loss": 0.75946504, + "learning_rate": 3.297288763918435e-06, + "loss": 0.78539407, + "num_input_tokens_seen": 106460035, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.26013184, + "step": 4941, + "time_per_iteration": 2.9013755321502686 + }, + { + "auxiliary_loss_clip": 0.01551939, + "auxiliary_loss_mlp": 0.01044954, + "balance_loss_clip": 1.34769964, + "balance_loss_mlp": 1.01862025, + "epoch": 0.29712911468510445, + "flos": 39683262476160.0, + "grad_norm": 2.268665035435123, + "language_loss": 0.75386113, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.77982998, + "num_input_tokens_seen": 106481095, + "router_z_loss_clip": 2.03710938, + "router_z_loss_mlp": 0.26391602, + "step": 4942, + "time_per_iteration": 3.0222926139831543 + }, + { + "auxiliary_loss_clip": 0.01541787, + "auxiliary_loss_mlp": 0.0104559, + "balance_loss_clip": 1.34043276, + "balance_loss_mlp": 1.01929283, + "epoch": 0.2971892379377724, + "flos": 26406342109440.0, + "grad_norm": 1.7119260650401216, + "language_loss": 0.71180439, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.73767817, + "num_input_tokens_seen": 106501590, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.26306152, + "step": 4943, + "time_per_iteration": 2.888421058654785 + }, + { + "auxiliary_loss_clip": 0.01542261, + "auxiliary_loss_mlp": 0.01044376, + "balance_loss_clip": 1.34138083, + "balance_loss_mlp": 1.01935363, + "epoch": 0.2972493611904404, + "flos": 17612191157760.0, + "grad_norm": 2.2296683555622123, + "language_loss": 0.80351478, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.82938111, + "num_input_tokens_seen": 106519430, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.25012207, + "step": 4944, + "time_per_iteration": 4.278645753860474 + }, + { + "auxiliary_loss_clip": 0.01521804, + "auxiliary_loss_mlp": 0.01043189, + "balance_loss_clip": 1.32758474, + "balance_loss_mlp": 1.01940656, + "epoch": 0.2973094844431084, + "flos": 20422544584320.0, + "grad_norm": 2.178044085322441, + "language_loss": 0.84634256, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.87199253, + "num_input_tokens_seen": 106535870, + "router_z_loss_clip": 1.94335938, + "router_z_loss_mlp": 0.23791504, + "step": 4945, + "time_per_iteration": 2.871676206588745 + }, + { + "auxiliary_loss_clip": 0.01520657, + "auxiliary_loss_mlp": 0.01042029, + "balance_loss_clip": 1.32732844, + "balance_loss_mlp": 1.01599324, + "epoch": 0.29736960769577636, + "flos": 17502390731520.0, + "grad_norm": 1.7564182488903646, + "language_loss": 0.68326652, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.70889342, + "num_input_tokens_seen": 106553560, + "router_z_loss_clip": 1.93359375, + "router_z_loss_mlp": 0.26037598, + "step": 4946, + "time_per_iteration": 2.8632309436798096 + }, + { + "auxiliary_loss_clip": 0.01529374, + "auxiliary_loss_mlp": 0.01041595, + "balance_loss_clip": 1.33287787, + "balance_loss_mlp": 1.01640582, + "epoch": 0.2974297309484443, + "flos": 26115064214400.0, + "grad_norm": 1.819632709315618, + "language_loss": 0.75210977, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.77781951, + "num_input_tokens_seen": 106574115, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.25195312, + "step": 4947, + "time_per_iteration": 5.790498495101929 + }, + { + "auxiliary_loss_clip": 0.01539364, + "auxiliary_loss_mlp": 0.01042387, + "balance_loss_clip": 1.3405807, + "balance_loss_mlp": 1.01754403, + "epoch": 0.2974898542011123, + "flos": 25677491322240.0, + "grad_norm": 3.1122686081790785, + "language_loss": 0.73835534, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.76417285, + "num_input_tokens_seen": 106593070, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.24865723, + "step": 4948, + "time_per_iteration": 2.9019315242767334 + }, + { + "auxiliary_loss_clip": 0.01513586, + "auxiliary_loss_mlp": 0.01038954, + "balance_loss_clip": 1.31934214, + "balance_loss_mlp": 1.01469517, + "epoch": 0.29754997745378026, + "flos": 18670624202880.0, + "grad_norm": 1.8736951914597135, + "language_loss": 0.84322369, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.86874908, + "num_input_tokens_seen": 106610695, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.24255371, + "step": 4949, + "time_per_iteration": 2.8188557624816895 + }, + { + "auxiliary_loss_clip": 0.01527276, + "auxiliary_loss_mlp": 0.01041859, + "balance_loss_clip": 1.33073854, + "balance_loss_mlp": 1.01608562, + "epoch": 0.2976101007064482, + "flos": 22285125043200.0, + "grad_norm": 2.039329958056863, + "language_loss": 0.71320117, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.73889256, + "num_input_tokens_seen": 106631300, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.25793457, + "step": 4950, + "time_per_iteration": 2.886660099029541 + }, + { + "auxiliary_loss_clip": 0.01503092, + "auxiliary_loss_mlp": 0.01041337, + "balance_loss_clip": 1.31415164, + "balance_loss_mlp": 1.01619589, + "epoch": 0.2976702239591162, + "flos": 21955949988480.0, + "grad_norm": 1.7473776854985792, + "language_loss": 0.84170836, + "learning_rate": 3.294322145875789e-06, + "loss": 0.86715263, + "num_input_tokens_seen": 106650065, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.25158691, + "step": 4951, + "time_per_iteration": 2.8846027851104736 + }, + { + "auxiliary_loss_clip": 0.01520187, + "auxiliary_loss_mlp": 0.0103933, + "balance_loss_clip": 1.32296979, + "balance_loss_mlp": 1.01480877, + "epoch": 0.29773034721178415, + "flos": 24646006195200.0, + "grad_norm": 3.5215008992561443, + "language_loss": 0.76546621, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.7910614, + "num_input_tokens_seen": 106668230, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.24511719, + "step": 4952, + "time_per_iteration": 2.896153211593628 + }, + { + "auxiliary_loss_clip": 0.0151318, + "auxiliary_loss_mlp": 0.01046037, + "balance_loss_clip": 1.31937957, + "balance_loss_mlp": 1.02108657, + "epoch": 0.2977904704644521, + "flos": 20567075034240.0, + "grad_norm": 1.6881428555717954, + "language_loss": 0.85002607, + "learning_rate": 3.293728232937228e-06, + "loss": 0.87561822, + "num_input_tokens_seen": 106687785, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.24975586, + "step": 4953, + "time_per_iteration": 2.8993093967437744 + }, + { + "auxiliary_loss_clip": 0.01524572, + "auxiliary_loss_mlp": 0.01040424, + "balance_loss_clip": 1.32702374, + "balance_loss_mlp": 1.01554453, + "epoch": 0.2978505937171201, + "flos": 18925724240640.0, + "grad_norm": 2.0966530801809706, + "language_loss": 0.75144899, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.77709889, + "num_input_tokens_seen": 106706875, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.24902344, + "step": 4954, + "time_per_iteration": 2.8388619422912598 + }, + { + "auxiliary_loss_clip": 0.0152548, + "auxiliary_loss_mlp": 0.01041571, + "balance_loss_clip": 1.32953119, + "balance_loss_mlp": 1.01683497, + "epoch": 0.29791071696978805, + "flos": 19327119275520.0, + "grad_norm": 1.826172120530772, + "language_loss": 0.76162696, + "learning_rate": 3.293134123765452e-06, + "loss": 0.78729737, + "num_input_tokens_seen": 106725105, + "router_z_loss_clip": 1.95898438, + "router_z_loss_mlp": 0.24743652, + "step": 4955, + "time_per_iteration": 2.85305118560791 + }, + { + "auxiliary_loss_clip": 0.01530099, + "auxiliary_loss_mlp": 0.01043974, + "balance_loss_clip": 1.33097315, + "balance_loss_mlp": 1.01870179, + "epoch": 0.297970840222456, + "flos": 18816014304000.0, + "grad_norm": 1.8902690553195527, + "language_loss": 0.73187441, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.75761515, + "num_input_tokens_seen": 106744780, + "router_z_loss_clip": 1.98730469, + "router_z_loss_mlp": 0.25305176, + "step": 4956, + "time_per_iteration": 2.853522539138794 + }, + { + "auxiliary_loss_clip": 0.0154677, + "auxiliary_loss_mlp": 0.01044376, + "balance_loss_clip": 1.34551096, + "balance_loss_mlp": 1.01912713, + "epoch": 0.298030963475124, + "flos": 22861979988480.0, + "grad_norm": 1.702060332079338, + "language_loss": 0.79969275, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.8256042, + "num_input_tokens_seen": 106764670, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.25268555, + "step": 4957, + "time_per_iteration": 2.86283278465271 + }, + { + "auxiliary_loss_clip": 0.01524023, + "auxiliary_loss_mlp": 0.01042327, + "balance_loss_clip": 1.32707787, + "balance_loss_mlp": 1.01562452, + "epoch": 0.298091086727792, + "flos": 21877667205120.0, + "grad_norm": 1.5202312820928867, + "language_loss": 0.7104528, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.73611629, + "num_input_tokens_seen": 106783695, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.26708984, + "step": 4958, + "time_per_iteration": 2.877079963684082 + }, + { + "auxiliary_loss_clip": 0.01514125, + "auxiliary_loss_mlp": 0.010436, + "balance_loss_clip": 1.32210779, + "balance_loss_mlp": 1.01780295, + "epoch": 0.29815120998045996, + "flos": 21183953644800.0, + "grad_norm": 1.796746825018875, + "language_loss": 0.79515374, + "learning_rate": 3.291945317082743e-06, + "loss": 0.82073104, + "num_input_tokens_seen": 106803150, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.25793457, + "step": 4959, + "time_per_iteration": 2.853182554244995 + }, + { + "auxiliary_loss_clip": 0.01511748, + "auxiliary_loss_mlp": 0.01040226, + "balance_loss_clip": 1.31702185, + "balance_loss_mlp": 1.01495337, + "epoch": 0.29821133323312793, + "flos": 19904426668800.0, + "grad_norm": 1.692841854667108, + "language_loss": 0.80107194, + "learning_rate": 3.291647992907147e-06, + "loss": 0.82659167, + "num_input_tokens_seen": 106820705, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.25280762, + "step": 4960, + "time_per_iteration": 2.8593592643737793 + }, + { + "auxiliary_loss_clip": 0.01532626, + "auxiliary_loss_mlp": 0.01051997, + "balance_loss_clip": 1.33092117, + "balance_loss_mlp": 1.02398324, + "epoch": 0.2982714564857959, + "flos": 12758548965120.0, + "grad_norm": 4.830231658893543, + "language_loss": 0.75470066, + "learning_rate": 3.291350619752129e-06, + "loss": 0.7805469, + "num_input_tokens_seen": 106837335, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.2800293, + "step": 4961, + "time_per_iteration": 2.8379149436950684 + }, + { + "auxiliary_loss_clip": 0.01541198, + "auxiliary_loss_mlp": 0.01049305, + "balance_loss_clip": 1.34103012, + "balance_loss_mlp": 1.02316189, + "epoch": 0.29833157973846386, + "flos": 22281731683200.0, + "grad_norm": 2.042139832968929, + "language_loss": 0.62831706, + "learning_rate": 3.291053197628967e-06, + "loss": 0.65422201, + "num_input_tokens_seen": 106856250, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.26147461, + "step": 4962, + "time_per_iteration": 2.873258113861084 + }, + { + "auxiliary_loss_clip": 0.01528471, + "auxiliary_loss_mlp": 0.01056165, + "balance_loss_clip": 1.33278012, + "balance_loss_mlp": 1.03020167, + "epoch": 0.2983917029911318, + "flos": 15380230999680.0, + "grad_norm": 1.900275798432957, + "language_loss": 0.83791649, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.86376286, + "num_input_tokens_seen": 106873370, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.25976562, + "step": 4963, + "time_per_iteration": 2.832347869873047 + }, + { + "auxiliary_loss_clip": 0.01513683, + "auxiliary_loss_mlp": 0.01049572, + "balance_loss_clip": 1.31993067, + "balance_loss_mlp": 1.02356017, + "epoch": 0.2984518262437998, + "flos": 15386067578880.0, + "grad_norm": 2.006067391860304, + "language_loss": 0.6746937, + "learning_rate": 3.290458206523322e-06, + "loss": 0.7003262, + "num_input_tokens_seen": 106890330, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.26000977, + "step": 4964, + "time_per_iteration": 2.8442320823669434 + }, + { + "auxiliary_loss_clip": 0.01513246, + "auxiliary_loss_mlp": 0.01052445, + "balance_loss_clip": 1.31909394, + "balance_loss_mlp": 1.02837658, + "epoch": 0.29851194949646775, + "flos": 18116147450880.0, + "grad_norm": 1.7477787239090943, + "language_loss": 0.71926272, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.74491966, + "num_input_tokens_seen": 106909190, + "router_z_loss_clip": 1.94335938, + "router_z_loss_mlp": 0.24060059, + "step": 4965, + "time_per_iteration": 2.8743093013763428 + }, + { + "auxiliary_loss_clip": 0.01527714, + "auxiliary_loss_mlp": 0.01060035, + "balance_loss_clip": 1.33255482, + "balance_loss_mlp": 1.03416622, + "epoch": 0.2985720727491357, + "flos": 22028441437440.0, + "grad_norm": 1.8045038025303382, + "language_loss": 0.67669845, + "learning_rate": 3.289863019680461e-06, + "loss": 0.70257586, + "num_input_tokens_seen": 106927825, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.25854492, + "step": 4966, + "time_per_iteration": 2.8956854343414307 + }, + { + "auxiliary_loss_clip": 0.01527474, + "auxiliary_loss_mlp": 0.01054026, + "balance_loss_clip": 1.33111978, + "balance_loss_mlp": 1.0288136, + "epoch": 0.2986321960018037, + "flos": 13048876719360.0, + "grad_norm": 2.3789950621654232, + "language_loss": 0.7452122, + "learning_rate": 3.289565352885785e-06, + "loss": 0.77102721, + "num_input_tokens_seen": 106943155, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.25244141, + "step": 4967, + "time_per_iteration": 2.8474502563476562 + }, + { + "auxiliary_loss_clip": 0.01519001, + "auxiliary_loss_mlp": 0.0104906, + "balance_loss_clip": 1.32364357, + "balance_loss_mlp": 1.026088, + "epoch": 0.29869231925447165, + "flos": 14473115124480.0, + "grad_norm": 2.3891407498057173, + "language_loss": 0.72118884, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.74686944, + "num_input_tokens_seen": 106960295, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.22961426, + "step": 4968, + "time_per_iteration": 2.8301682472229004 + }, + { + "auxiliary_loss_clip": 0.01522728, + "auxiliary_loss_mlp": 0.01046147, + "balance_loss_clip": 1.32503438, + "balance_loss_mlp": 1.02182817, + "epoch": 0.2987524425071396, + "flos": 31662872415360.0, + "grad_norm": 1.7421132792310432, + "language_loss": 0.77528214, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.80097091, + "num_input_tokens_seen": 106982870, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.2434082, + "step": 4969, + "time_per_iteration": 2.9342589378356934 + }, + { + "auxiliary_loss_clip": 0.01512507, + "auxiliary_loss_mlp": 0.0105319, + "balance_loss_clip": 1.3201375, + "balance_loss_mlp": 1.02961063, + "epoch": 0.2988125657598076, + "flos": 21443216204160.0, + "grad_norm": 1.7156062496099085, + "language_loss": 0.70525753, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.73091447, + "num_input_tokens_seen": 107002405, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.23583984, + "step": 4970, + "time_per_iteration": 2.856318950653076 + }, + { + "auxiliary_loss_clip": 0.01539169, + "auxiliary_loss_mlp": 0.01049566, + "balance_loss_clip": 1.33881044, + "balance_loss_mlp": 1.02471042, + "epoch": 0.2988726890124756, + "flos": 18085941907200.0, + "grad_norm": 2.167375756619445, + "language_loss": 0.85900652, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.88489383, + "num_input_tokens_seen": 107017310, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.24865723, + "step": 4971, + "time_per_iteration": 2.818199634552002 + }, + { + "auxiliary_loss_clip": 0.01496284, + "auxiliary_loss_mlp": 0.01042077, + "balance_loss_clip": 1.30743814, + "balance_loss_mlp": 1.01729321, + "epoch": 0.29893281226514357, + "flos": 21763930481280.0, + "grad_norm": 1.7889536580720473, + "language_loss": 0.8001219, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.82550544, + "num_input_tokens_seen": 107034645, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.24804688, + "step": 4972, + "time_per_iteration": 2.9020330905914307 + }, + { + "auxiliary_loss_clip": 0.01497496, + "auxiliary_loss_mlp": 0.01043458, + "balance_loss_clip": 1.30496895, + "balance_loss_mlp": 1.01915097, + "epoch": 0.29899293551781153, + "flos": 16845443210880.0, + "grad_norm": 1.9351163121622845, + "language_loss": 0.86447096, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.88988054, + "num_input_tokens_seen": 107051125, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.24304199, + "step": 4973, + "time_per_iteration": 2.8049018383026123 + }, + { + "auxiliary_loss_clip": 0.01492302, + "auxiliary_loss_mlp": 0.0103955, + "balance_loss_clip": 1.30572879, + "balance_loss_mlp": 1.01589894, + "epoch": 0.2990530587704795, + "flos": 11736746225280.0, + "grad_norm": 2.6538637345197076, + "language_loss": 0.7871415, + "learning_rate": 3.287480316742863e-06, + "loss": 0.81246001, + "num_input_tokens_seen": 107068815, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.2364502, + "step": 4974, + "time_per_iteration": 4.238566875457764 + }, + { + "auxiliary_loss_clip": 0.01515449, + "auxiliary_loss_mlp": 0.01043264, + "balance_loss_clip": 1.32252228, + "balance_loss_mlp": 1.01882625, + "epoch": 0.29911318202314746, + "flos": 28052036403840.0, + "grad_norm": 1.8149858797459129, + "language_loss": 0.73349226, + "learning_rate": 3.287182259060815e-06, + "loss": 0.7590794, + "num_input_tokens_seen": 107090420, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.24438477, + "step": 4975, + "time_per_iteration": 2.9395973682403564 + }, + { + "auxiliary_loss_clip": 0.01507922, + "auxiliary_loss_mlp": 0.01043565, + "balance_loss_clip": 1.3148973, + "balance_loss_mlp": 1.01843536, + "epoch": 0.2991733052758154, + "flos": 18742663203840.0, + "grad_norm": 2.4746998979590606, + "language_loss": 0.76267493, + "learning_rate": 3.286884152568687e-06, + "loss": 0.78818977, + "num_input_tokens_seen": 107107255, + "router_z_loss_clip": 1.93164062, + "router_z_loss_mlp": 0.25134277, + "step": 4976, + "time_per_iteration": 2.8173210620880127 + }, + { + "auxiliary_loss_clip": 0.01499193, + "auxiliary_loss_mlp": 0.01041528, + "balance_loss_clip": 1.30946422, + "balance_loss_mlp": 1.01816344, + "epoch": 0.2992334285284834, + "flos": 15567409313280.0, + "grad_norm": 2.4600671299763532, + "language_loss": 0.87565279, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.90105987, + "num_input_tokens_seen": 107123840, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.23376465, + "step": 4977, + "time_per_iteration": 2.873326539993286 + }, + { + "auxiliary_loss_clip": 0.01505436, + "auxiliary_loss_mlp": 0.01044668, + "balance_loss_clip": 1.31373334, + "balance_loss_mlp": 1.02118349, + "epoch": 0.29929355178115136, + "flos": 21807257016960.0, + "grad_norm": 1.750267219674661, + "language_loss": 0.69379723, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.71929824, + "num_input_tokens_seen": 107143475, + "router_z_loss_clip": 1.91503906, + "router_z_loss_mlp": 0.23461914, + "step": 4978, + "time_per_iteration": 2.860891819000244 + }, + { + "auxiliary_loss_clip": 0.01512573, + "auxiliary_loss_mlp": 0.01045108, + "balance_loss_clip": 1.31996012, + "balance_loss_mlp": 1.02169538, + "epoch": 0.2993536750338193, + "flos": 21188387635200.0, + "grad_norm": 2.2195261885006365, + "language_loss": 0.77389264, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.79946947, + "num_input_tokens_seen": 107161725, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.23425293, + "step": 4979, + "time_per_iteration": 4.3676581382751465 + }, + { + "auxiliary_loss_clip": 0.0151199, + "auxiliary_loss_mlp": 0.01048577, + "balance_loss_clip": 1.31707621, + "balance_loss_mlp": 1.02634406, + "epoch": 0.2994137982864873, + "flos": 32134270435200.0, + "grad_norm": 1.7356472964633352, + "language_loss": 0.69432449, + "learning_rate": 3.285691238725484e-06, + "loss": 0.71993023, + "num_input_tokens_seen": 107183935, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.22241211, + "step": 4980, + "time_per_iteration": 2.945446014404297 + }, + { + "auxiliary_loss_clip": 0.01498829, + "auxiliary_loss_mlp": 0.01045639, + "balance_loss_clip": 1.3095777, + "balance_loss_mlp": 1.02319193, + "epoch": 0.29947392153915525, + "flos": 21115217514240.0, + "grad_norm": 1.7895894692323624, + "language_loss": 0.74294406, + "learning_rate": 3.285392888352555e-06, + "loss": 0.76838875, + "num_input_tokens_seen": 107204285, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.22436523, + "step": 4981, + "time_per_iteration": 2.882982015609741 + }, + { + "auxiliary_loss_clip": 0.01522537, + "auxiliary_loss_mlp": 0.01051158, + "balance_loss_clip": 1.32408321, + "balance_loss_mlp": 1.02700627, + "epoch": 0.2995340447918232, + "flos": 21552383203200.0, + "grad_norm": 1.584956523796422, + "language_loss": 0.87124664, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.89698356, + "num_input_tokens_seen": 107225265, + "router_z_loss_clip": 1.98535156, + "router_z_loss_mlp": 0.24157715, + "step": 4982, + "time_per_iteration": 5.755779266357422 + }, + { + "auxiliary_loss_clip": 0.01530647, + "auxiliary_loss_mlp": 0.01047773, + "balance_loss_clip": 1.33214724, + "balance_loss_mlp": 1.02333474, + "epoch": 0.2995941680444912, + "flos": 16733290055040.0, + "grad_norm": 2.1879555637998416, + "language_loss": 0.87098384, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.89676803, + "num_input_tokens_seen": 107241335, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.2442627, + "step": 4983, + "time_per_iteration": 2.8243215084075928 + }, + { + "auxiliary_loss_clip": 0.01501857, + "auxiliary_loss_mlp": 0.0104763, + "balance_loss_clip": 1.30848968, + "balance_loss_mlp": 1.02285826, + "epoch": 0.2996542912971592, + "flos": 20933378087040.0, + "grad_norm": 2.0700609002736767, + "language_loss": 0.79478157, + "learning_rate": 3.284497544825668e-06, + "loss": 0.8202765, + "num_input_tokens_seen": 107259375, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.2479248, + "step": 4984, + "time_per_iteration": 2.8983683586120605 + }, + { + "auxiliary_loss_clip": 0.01521545, + "auxiliary_loss_mlp": 0.01053525, + "balance_loss_clip": 1.32681537, + "balance_loss_mlp": 1.0279429, + "epoch": 0.29971441454982717, + "flos": 25090049093760.0, + "grad_norm": 1.6872794277233043, + "language_loss": 0.7945683, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.82031906, + "num_input_tokens_seen": 107279890, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.25561523, + "step": 4985, + "time_per_iteration": 2.9446492195129395 + }, + { + "auxiliary_loss_clip": 0.01525984, + "auxiliary_loss_mlp": 0.0104985, + "balance_loss_clip": 1.32597566, + "balance_loss_mlp": 1.02484012, + "epoch": 0.29977453780249513, + "flos": 52573809081600.0, + "grad_norm": 1.985353855925173, + "language_loss": 0.72177476, + "learning_rate": 3.283900405580837e-06, + "loss": 0.74753308, + "num_input_tokens_seen": 107303430, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.25012207, + "step": 4986, + "time_per_iteration": 3.163069009780884 + }, + { + "auxiliary_loss_clip": 0.01519211, + "auxiliary_loss_mlp": 0.01046026, + "balance_loss_clip": 1.32081616, + "balance_loss_mlp": 1.02132583, + "epoch": 0.2998346610551631, + "flos": 22247408862720.0, + "grad_norm": 1.6742762955298107, + "language_loss": 0.74654102, + "learning_rate": 3.283601762924312e-06, + "loss": 0.77219337, + "num_input_tokens_seen": 107323700, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.24707031, + "step": 4987, + "time_per_iteration": 2.9177708625793457 + }, + { + "auxiliary_loss_clip": 0.01501834, + "auxiliary_loss_mlp": 0.01041162, + "balance_loss_clip": 1.30989242, + "balance_loss_mlp": 1.01668811, + "epoch": 0.29989478430783106, + "flos": 16881937781760.0, + "grad_norm": 1.624740352452569, + "language_loss": 0.81307065, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.83850056, + "num_input_tokens_seen": 107341965, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.24487305, + "step": 4988, + "time_per_iteration": 2.865680694580078 + }, + { + "auxiliary_loss_clip": 0.01487355, + "auxiliary_loss_mlp": 0.01041334, + "balance_loss_clip": 1.29613817, + "balance_loss_mlp": 1.01756334, + "epoch": 0.29995490756049903, + "flos": 23779366433280.0, + "grad_norm": 1.6917469884855654, + "language_loss": 0.71577239, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.74105924, + "num_input_tokens_seen": 107362615, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.23742676, + "step": 4989, + "time_per_iteration": 2.8918540477752686 + }, + { + "auxiliary_loss_clip": 0.01514302, + "auxiliary_loss_mlp": 0.01045633, + "balance_loss_clip": 1.31595039, + "balance_loss_mlp": 1.0192759, + "epoch": 0.300015030813167, + "flos": 14473477082880.0, + "grad_norm": 1.8890737684807088, + "language_loss": 0.86367601, + "learning_rate": 3.282705542954199e-06, + "loss": 0.88927537, + "num_input_tokens_seen": 107378980, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.26379395, + "step": 4990, + "time_per_iteration": 2.898923873901367 + }, + { + "auxiliary_loss_clip": 0.01520327, + "auxiliary_loss_mlp": 0.01041915, + "balance_loss_clip": 1.3186214, + "balance_loss_mlp": 1.01647544, + "epoch": 0.30007515406583496, + "flos": 25202880921600.0, + "grad_norm": 1.953462398326638, + "language_loss": 0.67288941, + "learning_rate": 3.28240670566841e-06, + "loss": 0.69851184, + "num_input_tokens_seen": 107397640, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.25463867, + "step": 4991, + "time_per_iteration": 2.952831983566284 + }, + { + "auxiliary_loss_clip": 0.01527473, + "auxiliary_loss_mlp": 0.01041484, + "balance_loss_clip": 1.32706022, + "balance_loss_mlp": 1.01553214, + "epoch": 0.3001352773185029, + "flos": 19400696599680.0, + "grad_norm": 1.9867963497681904, + "language_loss": 0.79910946, + "learning_rate": 3.28210781975363e-06, + "loss": 0.82479906, + "num_input_tokens_seen": 107416020, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.25964355, + "step": 4992, + "time_per_iteration": 2.8430426120758057 + }, + { + "auxiliary_loss_clip": 0.01505301, + "auxiliary_loss_mlp": 0.01039147, + "balance_loss_clip": 1.31215453, + "balance_loss_mlp": 1.01374388, + "epoch": 0.3001954005711709, + "flos": 21553921526400.0, + "grad_norm": 2.037121126740254, + "language_loss": 0.83733428, + "learning_rate": 3.281808885221193e-06, + "loss": 0.86277878, + "num_input_tokens_seen": 107436340, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.25402832, + "step": 4993, + "time_per_iteration": 2.878878116607666 + }, + { + "auxiliary_loss_clip": 0.01524339, + "auxiliary_loss_mlp": 0.01045769, + "balance_loss_clip": 1.32477164, + "balance_loss_mlp": 1.01843429, + "epoch": 0.30025552382383885, + "flos": 17393268977280.0, + "grad_norm": 2.1251955251298313, + "language_loss": 0.88319087, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.90889204, + "num_input_tokens_seen": 107454585, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.2734375, + "step": 4994, + "time_per_iteration": 2.831803321838379 + }, + { + "auxiliary_loss_clip": 0.01509835, + "auxiliary_loss_mlp": 0.01043702, + "balance_loss_clip": 1.31396866, + "balance_loss_mlp": 1.01783395, + "epoch": 0.3003156470765068, + "flos": 29545237163520.0, + "grad_norm": 1.5379373617904444, + "language_loss": 0.81474525, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.84028059, + "num_input_tokens_seen": 107477180, + "router_z_loss_clip": 1.95898438, + "router_z_loss_mlp": 0.25891113, + "step": 4995, + "time_per_iteration": 2.914771318435669 + }, + { + "auxiliary_loss_clip": 0.01503983, + "auxiliary_loss_mlp": 0.01043896, + "balance_loss_clip": 1.31180906, + "balance_loss_mlp": 1.01769376, + "epoch": 0.3003757703291748, + "flos": 43660175316480.0, + "grad_norm": 2.202796520299419, + "language_loss": 0.67534155, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.70082033, + "num_input_tokens_seen": 107500250, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.26208496, + "step": 4996, + "time_per_iteration": 3.064422130584717 + }, + { + "auxiliary_loss_clip": 0.01504605, + "auxiliary_loss_mlp": 0.01041749, + "balance_loss_clip": 1.3113482, + "balance_loss_mlp": 1.01582158, + "epoch": 0.30043589358184275, + "flos": 22538551023360.0, + "grad_norm": 2.052674566961902, + "language_loss": 0.76576674, + "learning_rate": 3.280612661141615e-06, + "loss": 0.7912302, + "num_input_tokens_seen": 107520070, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.25952148, + "step": 4997, + "time_per_iteration": 2.86826229095459 + }, + { + "auxiliary_loss_clip": 0.01491518, + "auxiliary_loss_mlp": 0.01044942, + "balance_loss_clip": 1.30116022, + "balance_loss_mlp": 1.01865673, + "epoch": 0.30049601683451077, + "flos": 21005643312000.0, + "grad_norm": 2.0654344859003926, + "language_loss": 0.79205829, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.81742293, + "num_input_tokens_seen": 107539285, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.26281738, + "step": 4998, + "time_per_iteration": 2.884098768234253 + }, + { + "auxiliary_loss_clip": 0.01505187, + "auxiliary_loss_mlp": 0.01037141, + "balance_loss_clip": 1.31438029, + "balance_loss_mlp": 1.01282263, + "epoch": 0.30055614008717874, + "flos": 23926837795200.0, + "grad_norm": 1.9220901947730018, + "language_loss": 0.74580634, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.77122962, + "num_input_tokens_seen": 107560260, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.2434082, + "step": 4999, + "time_per_iteration": 2.9165408611297607 + }, + { + "auxiliary_loss_clip": 0.01510262, + "auxiliary_loss_mlp": 0.01041397, + "balance_loss_clip": 1.31425762, + "balance_loss_mlp": 1.01698267, + "epoch": 0.3006162633398467, + "flos": 19178607283200.0, + "grad_norm": 1.6043719300483879, + "language_loss": 0.76435542, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.78987199, + "num_input_tokens_seen": 107579260, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.24414062, + "step": 5000, + "time_per_iteration": 2.9508867263793945 + }, + { + "auxiliary_loss_clip": 0.01495431, + "auxiliary_loss_mlp": 0.01049563, + "balance_loss_clip": 1.30537653, + "balance_loss_mlp": 1.02364659, + "epoch": 0.30067638659251467, + "flos": 14686155480960.0, + "grad_norm": 1.7700919753641575, + "language_loss": 0.82287818, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.84832811, + "num_input_tokens_seen": 107595245, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.25927734, + "step": 5001, + "time_per_iteration": 2.8442962169647217 + }, + { + "auxiliary_loss_clip": 0.01517126, + "auxiliary_loss_mlp": 0.01051373, + "balance_loss_clip": 1.32322884, + "balance_loss_mlp": 1.02499235, + "epoch": 0.30073650984518263, + "flos": 23378514336000.0, + "grad_norm": 1.6814079250139846, + "language_loss": 0.81555045, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.8412354, + "num_input_tokens_seen": 107613985, + "router_z_loss_clip": 1.93945312, + "router_z_loss_mlp": 0.26391602, + "step": 5002, + "time_per_iteration": 2.8489949703216553 + }, + { + "auxiliary_loss_clip": 0.01523003, + "auxiliary_loss_mlp": 0.01042172, + "balance_loss_clip": 1.32469165, + "balance_loss_mlp": 1.0173409, + "epoch": 0.3007966330978506, + "flos": 22977028811520.0, + "grad_norm": 1.8240161787655698, + "language_loss": 0.72302115, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.7486729, + "num_input_tokens_seen": 107631435, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.24804688, + "step": 5003, + "time_per_iteration": 2.8865537643432617 + }, + { + "auxiliary_loss_clip": 0.01518396, + "auxiliary_loss_mlp": 0.01044855, + "balance_loss_clip": 1.32051873, + "balance_loss_mlp": 1.01965404, + "epoch": 0.30085675635051856, + "flos": 27829901842560.0, + "grad_norm": 5.293364398473275, + "language_loss": 0.72181964, + "learning_rate": 3.27851739984233e-06, + "loss": 0.74745208, + "num_input_tokens_seen": 107650530, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.25231934, + "step": 5004, + "time_per_iteration": 2.8986434936523438 + }, + { + "auxiliary_loss_clip": 0.01519704, + "auxiliary_loss_mlp": 0.01044421, + "balance_loss_clip": 1.32253504, + "balance_loss_mlp": 1.01733708, + "epoch": 0.3009168796031865, + "flos": 10888141155840.0, + "grad_norm": 2.964168243136023, + "language_loss": 0.82620788, + "learning_rate": 3.278217882782715e-06, + "loss": 0.85184908, + "num_input_tokens_seen": 107662240, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.27099609, + "step": 5005, + "time_per_iteration": 2.818019151687622 + }, + { + "auxiliary_loss_clip": 0.01511439, + "auxiliary_loss_mlp": 0.01043201, + "balance_loss_clip": 1.31888843, + "balance_loss_mlp": 1.01686788, + "epoch": 0.3009770028558545, + "flos": 23815906248960.0, + "grad_norm": 3.1324928037926085, + "language_loss": 0.76322967, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.7887761, + "num_input_tokens_seen": 107680330, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.26306152, + "step": 5006, + "time_per_iteration": 2.874333143234253 + }, + { + "auxiliary_loss_clip": 0.01501803, + "auxiliary_loss_mlp": 0.01042212, + "balance_loss_clip": 1.31086969, + "balance_loss_mlp": 1.01659369, + "epoch": 0.30103712610852246, + "flos": 26479240761600.0, + "grad_norm": 1.8770196595040238, + "language_loss": 0.72344851, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.74888861, + "num_input_tokens_seen": 107700020, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.25634766, + "step": 5007, + "time_per_iteration": 2.8995800018310547 + }, + { + "auxiliary_loss_clip": 0.0151426, + "auxiliary_loss_mlp": 0.01041011, + "balance_loss_clip": 1.31930208, + "balance_loss_mlp": 1.01459455, + "epoch": 0.3010972493611904, + "flos": 22866640202880.0, + "grad_norm": 2.777594654260953, + "language_loss": 0.77397937, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.79953206, + "num_input_tokens_seen": 107718575, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.2644043, + "step": 5008, + "time_per_iteration": 2.852952003479004 + }, + { + "auxiliary_loss_clip": 0.01513172, + "auxiliary_loss_mlp": 0.01038944, + "balance_loss_clip": 1.31935024, + "balance_loss_mlp": 1.01448226, + "epoch": 0.3011573726138584, + "flos": 24062409774720.0, + "grad_norm": 3.336653498929683, + "language_loss": 0.85147703, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.87699819, + "num_input_tokens_seen": 107738635, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.24450684, + "step": 5009, + "time_per_iteration": 4.3645994663238525 + }, + { + "auxiliary_loss_clip": 0.0153901, + "auxiliary_loss_mlp": 0.01048939, + "balance_loss_clip": 1.33754647, + "balance_loss_mlp": 1.01930332, + "epoch": 0.30121749586652635, + "flos": 20267562585600.0, + "grad_norm": 1.8742723982462457, + "language_loss": 0.84066796, + "learning_rate": 3.276719570659604e-06, + "loss": 0.86654752, + "num_input_tokens_seen": 107753415, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.29663086, + "step": 5010, + "time_per_iteration": 2.933987617492676 + }, + { + "auxiliary_loss_clip": 0.01503079, + "auxiliary_loss_mlp": 0.01040413, + "balance_loss_clip": 1.31005549, + "balance_loss_mlp": 1.01602256, + "epoch": 0.3012776191191944, + "flos": 26954348855040.0, + "grad_norm": 2.108992249847228, + "language_loss": 0.85467303, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.880108, + "num_input_tokens_seen": 107773840, + "router_z_loss_clip": 1.93066406, + "router_z_loss_mlp": 0.24389648, + "step": 5011, + "time_per_iteration": 2.984506368637085 + }, + { + "auxiliary_loss_clip": 0.01529189, + "auxiliary_loss_mlp": 0.01045678, + "balance_loss_clip": 1.33096576, + "balance_loss_mlp": 1.01901102, + "epoch": 0.30133774237186234, + "flos": 20421956401920.0, + "grad_norm": 2.6742678105509827, + "language_loss": 0.73522162, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.76097023, + "num_input_tokens_seen": 107792020, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.26660156, + "step": 5012, + "time_per_iteration": 2.825655937194824 + }, + { + "auxiliary_loss_clip": 0.01520602, + "auxiliary_loss_mlp": 0.01041649, + "balance_loss_clip": 1.32367456, + "balance_loss_mlp": 1.01700819, + "epoch": 0.3013978656245303, + "flos": 19802046389760.0, + "grad_norm": 2.1257960519242314, + "language_loss": 0.88943869, + "learning_rate": 3.275820002334819e-06, + "loss": 0.91506124, + "num_input_tokens_seen": 107809595, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.24633789, + "step": 5013, + "time_per_iteration": 4.228014230728149 + }, + { + "auxiliary_loss_clip": 0.01530696, + "auxiliary_loss_mlp": 0.01044302, + "balance_loss_clip": 1.33182645, + "balance_loss_mlp": 1.0185529, + "epoch": 0.30145798887719827, + "flos": 16257367555200.0, + "grad_norm": 1.9666007181115197, + "language_loss": 0.84294629, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.86869621, + "num_input_tokens_seen": 107827230, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.25720215, + "step": 5014, + "time_per_iteration": 2.8550171852111816 + }, + { + "auxiliary_loss_clip": 0.01502742, + "auxiliary_loss_mlp": 0.01045994, + "balance_loss_clip": 1.31220603, + "balance_loss_mlp": 1.02081704, + "epoch": 0.30151811212986623, + "flos": 24582473216640.0, + "grad_norm": 1.6453986563622518, + "language_loss": 0.68970013, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.71518743, + "num_input_tokens_seen": 107847195, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.25170898, + "step": 5015, + "time_per_iteration": 2.873831272125244 + }, + { + "auxiliary_loss_clip": 0.01521954, + "auxiliary_loss_mlp": 0.01047142, + "balance_loss_clip": 1.32829642, + "balance_loss_mlp": 1.01962852, + "epoch": 0.3015782353825342, + "flos": 21882191685120.0, + "grad_norm": 2.614020173913498, + "language_loss": 0.76302409, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.78871512, + "num_input_tokens_seen": 107866420, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.27514648, + "step": 5016, + "time_per_iteration": 4.275460243225098 + }, + { + "auxiliary_loss_clip": 0.01527883, + "auxiliary_loss_mlp": 0.01045877, + "balance_loss_clip": 1.33185673, + "balance_loss_mlp": 1.02091455, + "epoch": 0.30163835863520216, + "flos": 28781611107840.0, + "grad_norm": 1.7305255892810578, + "language_loss": 0.6661582, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.69189584, + "num_input_tokens_seen": 107889090, + "router_z_loss_clip": 1.95898438, + "router_z_loss_mlp": 0.24963379, + "step": 5017, + "time_per_iteration": 4.414970874786377 + }, + { + "auxiliary_loss_clip": 0.01533732, + "auxiliary_loss_mlp": 0.01045715, + "balance_loss_clip": 1.33659124, + "balance_loss_mlp": 1.02171803, + "epoch": 0.30169848188787013, + "flos": 22976485873920.0, + "grad_norm": 3.5568876471714916, + "language_loss": 0.69854516, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.72433966, + "num_input_tokens_seen": 107907520, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.23986816, + "step": 5018, + "time_per_iteration": 2.8537416458129883 + }, + { + "auxiliary_loss_clip": 0.01508884, + "auxiliary_loss_mlp": 0.01047466, + "balance_loss_clip": 1.31870437, + "balance_loss_mlp": 1.02358842, + "epoch": 0.3017586051405381, + "flos": 21845289911040.0, + "grad_norm": 1.9821531468337283, + "language_loss": 0.80332911, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.82889265, + "num_input_tokens_seen": 107925650, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.23901367, + "step": 5019, + "time_per_iteration": 2.8651931285858154 + }, + { + "auxiliary_loss_clip": 0.01526969, + "auxiliary_loss_mlp": 0.01044851, + "balance_loss_clip": 1.33181953, + "balance_loss_mlp": 1.01968598, + "epoch": 0.30181872839320606, + "flos": 22169669016960.0, + "grad_norm": 2.1648294018670553, + "language_loss": 0.7167539, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.74247211, + "num_input_tokens_seen": 107943975, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.25183105, + "step": 5020, + "time_per_iteration": 2.8809921741485596 + }, + { + "auxiliary_loss_clip": 0.01548744, + "auxiliary_loss_mlp": 0.01048579, + "balance_loss_clip": 1.34834516, + "balance_loss_mlp": 1.02429581, + "epoch": 0.301878851645874, + "flos": 18123341374080.0, + "grad_norm": 1.9470972066517187, + "language_loss": 0.79806578, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.82403898, + "num_input_tokens_seen": 107962950, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.24291992, + "step": 5021, + "time_per_iteration": 2.9474642276763916 + }, + { + "auxiliary_loss_clip": 0.01532888, + "auxiliary_loss_mlp": 0.01042714, + "balance_loss_clip": 1.33738256, + "balance_loss_mlp": 1.01987302, + "epoch": 0.301938974898542, + "flos": 17610969548160.0, + "grad_norm": 2.1840732050459954, + "language_loss": 0.76618522, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.79194129, + "num_input_tokens_seen": 107979700, + "router_z_loss_clip": 1.95410156, + "router_z_loss_mlp": 0.22827148, + "step": 5022, + "time_per_iteration": 2.8643288612365723 + }, + { + "auxiliary_loss_clip": 0.01528107, + "auxiliary_loss_mlp": 0.01047842, + "balance_loss_clip": 1.3315953, + "balance_loss_mlp": 1.02273703, + "epoch": 0.30199909815120995, + "flos": 11188422766080.0, + "grad_norm": 1.9577679090714486, + "language_loss": 0.70689785, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.73265731, + "num_input_tokens_seen": 107996645, + "router_z_loss_clip": 1.96191406, + "router_z_loss_mlp": 0.25146484, + "step": 5023, + "time_per_iteration": 2.870351552963257 + }, + { + "auxiliary_loss_clip": 0.01545075, + "auxiliary_loss_mlp": 0.01043421, + "balance_loss_clip": 1.34480941, + "balance_loss_mlp": 1.02036619, + "epoch": 0.302059221403878, + "flos": 21917781360000.0, + "grad_norm": 1.786327559070514, + "language_loss": 0.71987766, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.74576259, + "num_input_tokens_seen": 108015020, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.23059082, + "step": 5024, + "time_per_iteration": 2.8936710357666016 + }, + { + "auxiliary_loss_clip": 0.01525752, + "auxiliary_loss_mlp": 0.01047726, + "balance_loss_clip": 1.33353996, + "balance_loss_mlp": 1.02307367, + "epoch": 0.30211934465654594, + "flos": 26407699453440.0, + "grad_norm": 1.6085231078886566, + "language_loss": 0.75271457, + "learning_rate": 3.272217377978061e-06, + "loss": 0.7784493, + "num_input_tokens_seen": 108036430, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.24658203, + "step": 5025, + "time_per_iteration": 2.9208872318267822 + }, + { + "auxiliary_loss_clip": 0.01511687, + "auxiliary_loss_mlp": 0.01042794, + "balance_loss_clip": 1.32134056, + "balance_loss_mlp": 1.01941741, + "epoch": 0.3021794679092139, + "flos": 23409941489280.0, + "grad_norm": 1.9192242828231918, + "language_loss": 0.6794281, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.70497298, + "num_input_tokens_seen": 108054250, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.23364258, + "step": 5026, + "time_per_iteration": 2.948808193206787 + }, + { + "auxiliary_loss_clip": 0.01535386, + "auxiliary_loss_mlp": 0.01045342, + "balance_loss_clip": 1.34076202, + "balance_loss_mlp": 1.01972401, + "epoch": 0.30223959116188187, + "flos": 20269191398400.0, + "grad_norm": 1.726245893193024, + "language_loss": 0.85899138, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.88479865, + "num_input_tokens_seen": 108071495, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.25622559, + "step": 5027, + "time_per_iteration": 2.9346561431884766 + }, + { + "auxiliary_loss_clip": 0.01531742, + "auxiliary_loss_mlp": 0.01046276, + "balance_loss_clip": 1.33865035, + "balance_loss_mlp": 1.02302992, + "epoch": 0.30229971441454984, + "flos": 26699113082880.0, + "grad_norm": 1.683548450924166, + "language_loss": 0.79013371, + "learning_rate": 3.271315635661351e-06, + "loss": 0.81591386, + "num_input_tokens_seen": 108092135, + "router_z_loss_clip": 1.93066406, + "router_z_loss_mlp": 0.23242188, + "step": 5028, + "time_per_iteration": 2.9031426906585693 + }, + { + "auxiliary_loss_clip": 0.01530489, + "auxiliary_loss_mlp": 0.01048872, + "balance_loss_clip": 1.33724952, + "balance_loss_mlp": 1.02392197, + "epoch": 0.3023598376672178, + "flos": 34357407857280.0, + "grad_norm": 1.9780386088868531, + "language_loss": 0.77898651, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.80478013, + "num_input_tokens_seen": 108112945, + "router_z_loss_clip": 1.93359375, + "router_z_loss_mlp": 0.24938965, + "step": 5029, + "time_per_iteration": 2.990168809890747 + }, + { + "auxiliary_loss_clip": 0.01554291, + "auxiliary_loss_mlp": 0.01050591, + "balance_loss_clip": 1.35575914, + "balance_loss_mlp": 1.02527118, + "epoch": 0.30241996091988577, + "flos": 23122464157440.0, + "grad_norm": 2.0364940490691876, + "language_loss": 0.82661474, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.85266352, + "num_input_tokens_seen": 108130325, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.25354004, + "step": 5030, + "time_per_iteration": 2.8578219413757324 + }, + { + "auxiliary_loss_clip": 0.01551023, + "auxiliary_loss_mlp": 0.01048784, + "balance_loss_clip": 1.35233426, + "balance_loss_mlp": 1.02386951, + "epoch": 0.30248008417255373, + "flos": 19399203521280.0, + "grad_norm": 2.8824655644837236, + "language_loss": 0.70839572, + "learning_rate": 3.270413459468905e-06, + "loss": 0.73439384, + "num_input_tokens_seen": 108150300, + "router_z_loss_clip": 1.98535156, + "router_z_loss_mlp": 0.24902344, + "step": 5031, + "time_per_iteration": 2.871394395828247 + }, + { + "auxiliary_loss_clip": 0.01544148, + "auxiliary_loss_mlp": 0.01044257, + "balance_loss_clip": 1.34823251, + "balance_loss_mlp": 1.02052236, + "epoch": 0.3025402074252217, + "flos": 23780407063680.0, + "grad_norm": 2.2880249802517776, + "language_loss": 0.83021796, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.85610205, + "num_input_tokens_seen": 108170330, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.23730469, + "step": 5032, + "time_per_iteration": 2.866802453994751 + }, + { + "auxiliary_loss_clip": 0.01560751, + "auxiliary_loss_mlp": 0.01052908, + "balance_loss_clip": 1.36173749, + "balance_loss_mlp": 1.02698016, + "epoch": 0.30260033067788966, + "flos": 26005082808960.0, + "grad_norm": 2.2514221181944176, + "language_loss": 0.74727851, + "learning_rate": 3.269811767783906e-06, + "loss": 0.77341509, + "num_input_tokens_seen": 108191265, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.25952148, + "step": 5033, + "time_per_iteration": 2.912060499191284 + }, + { + "auxiliary_loss_clip": 0.01532561, + "auxiliary_loss_mlp": 0.01052146, + "balance_loss_clip": 1.33839476, + "balance_loss_mlp": 1.02786326, + "epoch": 0.3026604539305576, + "flos": 25385534755200.0, + "grad_norm": 1.9790331712501803, + "language_loss": 0.75103998, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.77688706, + "num_input_tokens_seen": 108211615, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.24291992, + "step": 5034, + "time_per_iteration": 2.920081377029419 + }, + { + "auxiliary_loss_clip": 0.01545418, + "auxiliary_loss_mlp": 0.01046882, + "balance_loss_clip": 1.34878302, + "balance_loss_mlp": 1.02244413, + "epoch": 0.3027205771832256, + "flos": 25823922053760.0, + "grad_norm": 1.848475547830947, + "language_loss": 0.72620887, + "learning_rate": 3.269209883493352e-06, + "loss": 0.75213182, + "num_input_tokens_seen": 108231080, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.24450684, + "step": 5035, + "time_per_iteration": 2.8822824954986572 + }, + { + "auxiliary_loss_clip": 0.01530623, + "auxiliary_loss_mlp": 0.0104013, + "balance_loss_clip": 1.33824158, + "balance_loss_mlp": 1.01765919, + "epoch": 0.30278070043589356, + "flos": 27355970113920.0, + "grad_norm": 1.8932384513328882, + "language_loss": 0.8793326, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.90504014, + "num_input_tokens_seen": 108251125, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.22473145, + "step": 5036, + "time_per_iteration": 2.9166908264160156 + }, + { + "auxiliary_loss_clip": 0.01528016, + "auxiliary_loss_mlp": 0.01053396, + "balance_loss_clip": 1.33587706, + "balance_loss_mlp": 1.02775407, + "epoch": 0.3028408236885616, + "flos": 24795875531520.0, + "grad_norm": 1.4067770012799214, + "language_loss": 0.77937269, + "learning_rate": 3.268607806688536e-06, + "loss": 0.80518681, + "num_input_tokens_seen": 108272545, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.25646973, + "step": 5037, + "time_per_iteration": 2.8773856163024902 + }, + { + "auxiliary_loss_clip": 0.01544625, + "auxiliary_loss_mlp": 0.01040871, + "balance_loss_clip": 1.34488988, + "balance_loss_mlp": 1.01658821, + "epoch": 0.30290094694122954, + "flos": 12940116923520.0, + "grad_norm": 3.0673784240923143, + "language_loss": 0.79295754, + "learning_rate": 3.268306696121816e-06, + "loss": 0.81881249, + "num_input_tokens_seen": 108289725, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.24279785, + "step": 5038, + "time_per_iteration": 2.829651117324829 + }, + { + "auxiliary_loss_clip": 0.01530298, + "auxiliary_loss_mlp": 0.01040482, + "balance_loss_clip": 1.3380363, + "balance_loss_mlp": 1.01778412, + "epoch": 0.3029610701938975, + "flos": 25926166598400.0, + "grad_norm": 1.8550284127077146, + "language_loss": 0.74931133, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.77501911, + "num_input_tokens_seen": 108310690, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.22705078, + "step": 5039, + "time_per_iteration": 2.8959546089172363 + }, + { + "auxiliary_loss_clip": 0.0152711, + "auxiliary_loss_mlp": 0.01044962, + "balance_loss_clip": 1.3348819, + "balance_loss_mlp": 1.0224793, + "epoch": 0.3030211934465655, + "flos": 21990996725760.0, + "grad_norm": 1.9936484733854658, + "language_loss": 0.80769992, + "learning_rate": 3.267704330716847e-06, + "loss": 0.83342063, + "num_input_tokens_seen": 108328905, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.22485352, + "step": 5040, + "time_per_iteration": 2.8488874435424805 + }, + { + "auxiliary_loss_clip": 0.01540334, + "auxiliary_loss_mlp": 0.01048471, + "balance_loss_clip": 1.34620011, + "balance_loss_mlp": 1.02601147, + "epoch": 0.30308131669923344, + "flos": 21000711628800.0, + "grad_norm": 1.5696883490311577, + "language_loss": 0.82630336, + "learning_rate": 3.267403075901438e-06, + "loss": 0.85219133, + "num_input_tokens_seen": 108346680, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.22460938, + "step": 5041, + "time_per_iteration": 2.8700506687164307 + }, + { + "auxiliary_loss_clip": 0.01294815, + "auxiliary_loss_mlp": 0.01053805, + "balance_loss_clip": 1.1798358, + "balance_loss_mlp": 1.02042615, + "epoch": 0.3031414399519014, + "flos": 60578987610240.0, + "grad_norm": 0.7792938093820913, + "language_loss": 0.59546387, + "learning_rate": 3.267101773025978e-06, + "loss": 0.61895007, + "num_input_tokens_seen": 108413885, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.33398438, + "step": 5042, + "time_per_iteration": 3.480782985687256 + }, + { + "auxiliary_loss_clip": 0.01550453, + "auxiliary_loss_mlp": 0.01040949, + "balance_loss_clip": 1.35181355, + "balance_loss_mlp": 1.01714325, + "epoch": 0.30320156320456937, + "flos": 21917555136000.0, + "grad_norm": 1.6755703922313625, + "language_loss": 0.72203809, + "learning_rate": 3.266800422101892e-06, + "loss": 0.7479521, + "num_input_tokens_seen": 108433640, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.23815918, + "step": 5043, + "time_per_iteration": 2.887463331222534 + }, + { + "auxiliary_loss_clip": 0.01529934, + "auxiliary_loss_mlp": 0.01038236, + "balance_loss_clip": 1.33521557, + "balance_loss_mlp": 1.01468027, + "epoch": 0.30326168645723733, + "flos": 21662726567040.0, + "grad_norm": 2.0790536364280015, + "language_loss": 0.70544195, + "learning_rate": 3.266499023140606e-06, + "loss": 0.73112369, + "num_input_tokens_seen": 108452640, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.23535156, + "step": 5044, + "time_per_iteration": 4.278761625289917 + }, + { + "auxiliary_loss_clip": 0.01528866, + "auxiliary_loss_mlp": 0.01037872, + "balance_loss_clip": 1.33623266, + "balance_loss_mlp": 1.01407802, + "epoch": 0.3033218097099053, + "flos": 21881196299520.0, + "grad_norm": 1.4170235070145143, + "language_loss": 0.7828747, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.80854213, + "num_input_tokens_seen": 108472470, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.23803711, + "step": 5045, + "time_per_iteration": 2.8708434104919434 + }, + { + "auxiliary_loss_clip": 0.01544721, + "auxiliary_loss_mlp": 0.0104072, + "balance_loss_clip": 1.3487041, + "balance_loss_mlp": 1.01669979, + "epoch": 0.30338193296257326, + "flos": 27101232034560.0, + "grad_norm": 1.6713053571332037, + "language_loss": 0.72626328, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.75211775, + "num_input_tokens_seen": 108493025, + "router_z_loss_clip": 1.95898438, + "router_z_loss_mlp": 0.24023438, + "step": 5046, + "time_per_iteration": 2.945239782333374 + }, + { + "auxiliary_loss_clip": 0.0153785, + "auxiliary_loss_mlp": 0.01039976, + "balance_loss_clip": 1.33790374, + "balance_loss_mlp": 1.01569283, + "epoch": 0.30344205621524123, + "flos": 19543191033600.0, + "grad_norm": 1.5792495399695765, + "language_loss": 0.8179971, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.84377539, + "num_input_tokens_seen": 108513480, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.24267578, + "step": 5047, + "time_per_iteration": 2.9066264629364014 + }, + { + "auxiliary_loss_clip": 0.01526834, + "auxiliary_loss_mlp": 0.01041613, + "balance_loss_clip": 1.33183885, + "balance_loss_mlp": 1.01810467, + "epoch": 0.3035021794679092, + "flos": 23920503523200.0, + "grad_norm": 1.7776022743715796, + "language_loss": 0.72315115, + "learning_rate": 3.265292947152084e-06, + "loss": 0.74883562, + "num_input_tokens_seen": 108533155, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.23510742, + "step": 5048, + "time_per_iteration": 4.317375898361206 + }, + { + "auxiliary_loss_clip": 0.01524983, + "auxiliary_loss_mlp": 0.01040541, + "balance_loss_clip": 1.33164883, + "balance_loss_mlp": 1.01710415, + "epoch": 0.30356230272057716, + "flos": 16152182098560.0, + "grad_norm": 1.7163742270982554, + "language_loss": 0.7567147, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.78236991, + "num_input_tokens_seen": 108551900, + "router_z_loss_clip": 1.93359375, + "router_z_loss_mlp": 0.23461914, + "step": 5049, + "time_per_iteration": 2.8491060733795166 + }, + { + "auxiliary_loss_clip": 0.01531297, + "auxiliary_loss_mlp": 0.01045116, + "balance_loss_clip": 1.33352149, + "balance_loss_mlp": 1.02165556, + "epoch": 0.3036224259732452, + "flos": 28926865474560.0, + "grad_norm": 1.6254399524197258, + "language_loss": 0.83053517, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.8562994, + "num_input_tokens_seen": 108574005, + "router_z_loss_clip": 1.97753906, + "router_z_loss_mlp": 0.23461914, + "step": 5050, + "time_per_iteration": 2.8952255249023438 + }, + { + "auxiliary_loss_clip": 0.01530708, + "auxiliary_loss_mlp": 0.01041984, + "balance_loss_clip": 1.33375192, + "balance_loss_mlp": 1.01743901, + "epoch": 0.30368254922591315, + "flos": 21115443738240.0, + "grad_norm": 2.5510511492734302, + "language_loss": 0.75318909, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.778916, + "num_input_tokens_seen": 108592715, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.2454834, + "step": 5051, + "time_per_iteration": 4.236498832702637 + }, + { + "auxiliary_loss_clip": 0.01522256, + "auxiliary_loss_mlp": 0.01043209, + "balance_loss_clip": 1.32808042, + "balance_loss_mlp": 1.01955795, + "epoch": 0.3037426724785811, + "flos": 23012482752000.0, + "grad_norm": 1.6931053116641501, + "language_loss": 0.76962841, + "learning_rate": 3.264086103483033e-06, + "loss": 0.79528308, + "num_input_tokens_seen": 108611770, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.2364502, + "step": 5052, + "time_per_iteration": 4.219244480133057 + }, + { + "auxiliary_loss_clip": 0.01528806, + "auxiliary_loss_mlp": 0.01045576, + "balance_loss_clip": 1.33126092, + "balance_loss_mlp": 1.02166224, + "epoch": 0.3038027957312491, + "flos": 15641122371840.0, + "grad_norm": 2.3457032938073574, + "language_loss": 0.84189773, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.86764157, + "num_input_tokens_seen": 108629070, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.23901367, + "step": 5053, + "time_per_iteration": 2.839395761489868 + }, + { + "auxiliary_loss_clip": 0.0152284, + "auxiliary_loss_mlp": 0.01044373, + "balance_loss_clip": 1.32882047, + "balance_loss_mlp": 1.01913631, + "epoch": 0.30386291898391704, + "flos": 12721692435840.0, + "grad_norm": 1.7059683250793194, + "language_loss": 0.71990973, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.74558187, + "num_input_tokens_seen": 108646315, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.25231934, + "step": 5054, + "time_per_iteration": 2.8406155109405518 + }, + { + "auxiliary_loss_clip": 0.01517322, + "auxiliary_loss_mlp": 0.01041115, + "balance_loss_clip": 1.32103777, + "balance_loss_mlp": 1.0177263, + "epoch": 0.303923042236585, + "flos": 26370616700160.0, + "grad_norm": 2.847380791250688, + "language_loss": 0.69949341, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.72507787, + "num_input_tokens_seen": 108665920, + "router_z_loss_clip": 1.96191406, + "router_z_loss_mlp": 0.23400879, + "step": 5055, + "time_per_iteration": 2.9113967418670654 + }, + { + "auxiliary_loss_clip": 0.01525565, + "auxiliary_loss_mlp": 0.01044498, + "balance_loss_clip": 1.33059883, + "balance_loss_mlp": 1.02045393, + "epoch": 0.30398316548925297, + "flos": 19728740534400.0, + "grad_norm": 1.7701287922232614, + "language_loss": 0.68249208, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.70819265, + "num_input_tokens_seen": 108683485, + "router_z_loss_clip": 1.94921875, + "router_z_loss_mlp": 0.24047852, + "step": 5056, + "time_per_iteration": 2.9149222373962402 + }, + { + "auxiliary_loss_clip": 0.01513529, + "auxiliary_loss_mlp": 0.01038774, + "balance_loss_clip": 1.32272315, + "balance_loss_mlp": 1.0146699, + "epoch": 0.30404328874192094, + "flos": 24249588088320.0, + "grad_norm": 1.510805006353096, + "language_loss": 0.8302837, + "learning_rate": 3.262576470461507e-06, + "loss": 0.85580677, + "num_input_tokens_seen": 108702700, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.2409668, + "step": 5057, + "time_per_iteration": 2.86606502532959 + }, + { + "auxiliary_loss_clip": 0.01507882, + "auxiliary_loss_mlp": 0.01046126, + "balance_loss_clip": 1.31616426, + "balance_loss_mlp": 1.02086568, + "epoch": 0.3041034119945889, + "flos": 24509710298880.0, + "grad_norm": 1.6326004060494128, + "language_loss": 0.8926931, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.91823316, + "num_input_tokens_seen": 108721860, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.25280762, + "step": 5058, + "time_per_iteration": 2.8984851837158203 + }, + { + "auxiliary_loss_clip": 0.01526471, + "auxiliary_loss_mlp": 0.01045464, + "balance_loss_clip": 1.33104205, + "balance_loss_mlp": 1.02221847, + "epoch": 0.30416353524725687, + "flos": 28299444825600.0, + "grad_norm": 2.022590824876327, + "language_loss": 0.71743989, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.74315929, + "num_input_tokens_seen": 108743215, + "router_z_loss_clip": 1.95410156, + "router_z_loss_mlp": 0.23242188, + "step": 5059, + "time_per_iteration": 2.926253318786621 + }, + { + "auxiliary_loss_clip": 0.01513501, + "auxiliary_loss_mlp": 0.01043321, + "balance_loss_clip": 1.3203094, + "balance_loss_mlp": 1.01742887, + "epoch": 0.30422365849992483, + "flos": 23671104330240.0, + "grad_norm": 1.98683840918947, + "language_loss": 0.73647231, + "learning_rate": 3.26167011603268e-06, + "loss": 0.7620405, + "num_input_tokens_seen": 108765505, + "router_z_loss_clip": 1.93457031, + "router_z_loss_mlp": 0.25878906, + "step": 5060, + "time_per_iteration": 2.941573143005371 + }, + { + "auxiliary_loss_clip": 0.01517197, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_clip": 1.32344222, + "balance_loss_mlp": 1.02043962, + "epoch": 0.3042837817525928, + "flos": 23008048761600.0, + "grad_norm": 1.7835855942515848, + "language_loss": 0.78123784, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.80685937, + "num_input_tokens_seen": 108783370, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.24511719, + "step": 5061, + "time_per_iteration": 2.8847463130950928 + }, + { + "auxiliary_loss_clip": 0.01529473, + "auxiliary_loss_mlp": 0.01042661, + "balance_loss_clip": 1.33217955, + "balance_loss_mlp": 1.01700735, + "epoch": 0.30434390500526076, + "flos": 22090255113600.0, + "grad_norm": 1.9219140385195363, + "language_loss": 0.83078253, + "learning_rate": 3.261065640514415e-06, + "loss": 0.85650384, + "num_input_tokens_seen": 108797430, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.25671387, + "step": 5062, + "time_per_iteration": 2.8271775245666504 + }, + { + "auxiliary_loss_clip": 0.01509587, + "auxiliary_loss_mlp": 0.01039599, + "balance_loss_clip": 1.31712627, + "balance_loss_mlp": 1.01470757, + "epoch": 0.3044040282579287, + "flos": 25494385040640.0, + "grad_norm": 1.7782673870244914, + "language_loss": 0.75091672, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.77640855, + "num_input_tokens_seen": 108816945, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.24902344, + "step": 5063, + "time_per_iteration": 2.921821355819702 + }, + { + "auxiliary_loss_clip": 0.01506384, + "auxiliary_loss_mlp": 0.01046438, + "balance_loss_clip": 1.31407154, + "balance_loss_mlp": 1.02079606, + "epoch": 0.30446415151059675, + "flos": 21955723764480.0, + "grad_norm": 1.968107689238428, + "language_loss": 0.85178673, + "learning_rate": 3.26046097371721e-06, + "loss": 0.87731493, + "num_input_tokens_seen": 108836615, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.25634766, + "step": 5064, + "time_per_iteration": 3.0064053535461426 + }, + { + "auxiliary_loss_clip": 0.01506219, + "auxiliary_loss_mlp": 0.01042985, + "balance_loss_clip": 1.31352019, + "balance_loss_mlp": 1.01658022, + "epoch": 0.3045242747632647, + "flos": 16444048176000.0, + "grad_norm": 2.0874427370300355, + "language_loss": 0.76685095, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.79234302, + "num_input_tokens_seen": 108855165, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.26391602, + "step": 5065, + "time_per_iteration": 2.904982805252075 + }, + { + "auxiliary_loss_clip": 0.01526095, + "auxiliary_loss_mlp": 0.01045243, + "balance_loss_clip": 1.32946324, + "balance_loss_mlp": 1.01920724, + "epoch": 0.3045843980159327, + "flos": 31552845765120.0, + "grad_norm": 1.822831898909084, + "language_loss": 0.63512993, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.66084325, + "num_input_tokens_seen": 108874690, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.26013184, + "step": 5066, + "time_per_iteration": 2.964115619659424 + }, + { + "auxiliary_loss_clip": 0.01529582, + "auxiliary_loss_mlp": 0.01043675, + "balance_loss_clip": 1.32797551, + "balance_loss_mlp": 1.01815248, + "epoch": 0.30464452126860064, + "flos": 17861409371520.0, + "grad_norm": 1.8612124104707453, + "language_loss": 0.83407629, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.8598088, + "num_input_tokens_seen": 108893140, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.25549316, + "step": 5067, + "time_per_iteration": 2.8486366271972656 + }, + { + "auxiliary_loss_clip": 0.01491593, + "auxiliary_loss_mlp": 0.01045772, + "balance_loss_clip": 1.30229402, + "balance_loss_mlp": 1.02029657, + "epoch": 0.3047046445212686, + "flos": 20641150051200.0, + "grad_norm": 1.8499736918320313, + "language_loss": 0.63340563, + "learning_rate": 3.259251066652873e-06, + "loss": 0.65877926, + "num_input_tokens_seen": 108911880, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.25488281, + "step": 5068, + "time_per_iteration": 2.9019763469696045 + }, + { + "auxiliary_loss_clip": 0.015033, + "auxiliary_loss_mlp": 0.01043553, + "balance_loss_clip": 1.31077504, + "balance_loss_mlp": 1.01816106, + "epoch": 0.3047647677739366, + "flos": 21297373655040.0, + "grad_norm": 1.9286109475593438, + "language_loss": 0.76377654, + "learning_rate": 3.258948470480793e-06, + "loss": 0.78924513, + "num_input_tokens_seen": 108930440, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.25427246, + "step": 5069, + "time_per_iteration": 2.9278621673583984 + }, + { + "auxiliary_loss_clip": 0.01497946, + "auxiliary_loss_mlp": 0.01048912, + "balance_loss_clip": 1.30699158, + "balance_loss_mlp": 1.02344871, + "epoch": 0.30482489102660454, + "flos": 21005688556800.0, + "grad_norm": 2.3432113254310494, + "language_loss": 0.76637828, + "learning_rate": 3.258645826569261e-06, + "loss": 0.79184681, + "num_input_tokens_seen": 108949125, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.2545166, + "step": 5070, + "time_per_iteration": 2.8674910068511963 + }, + { + "auxiliary_loss_clip": 0.01523433, + "auxiliary_loss_mlp": 0.01043063, + "balance_loss_clip": 1.32395709, + "balance_loss_mlp": 1.01891112, + "epoch": 0.3048850142792725, + "flos": 26303102179200.0, + "grad_norm": 1.9326963157754316, + "language_loss": 0.82834792, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.85401285, + "num_input_tokens_seen": 108972190, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.24157715, + "step": 5071, + "time_per_iteration": 2.9050464630126953 + }, + { + "auxiliary_loss_clip": 0.01520452, + "auxiliary_loss_mlp": 0.01045717, + "balance_loss_clip": 1.32227969, + "balance_loss_mlp": 1.01983666, + "epoch": 0.30494513753194047, + "flos": 22356440127360.0, + "grad_norm": 1.8908467468030958, + "language_loss": 0.76739907, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.79306078, + "num_input_tokens_seen": 108990325, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.25915527, + "step": 5072, + "time_per_iteration": 2.8716886043548584 + }, + { + "auxiliary_loss_clip": 0.0151209, + "auxiliary_loss_mlp": 0.01049811, + "balance_loss_clip": 1.31663465, + "balance_loss_mlp": 1.02276278, + "epoch": 0.30500526078460843, + "flos": 19547353555200.0, + "grad_norm": 1.8151215722367053, + "language_loss": 0.72694188, + "learning_rate": 3.257737608512723e-06, + "loss": 0.75256085, + "num_input_tokens_seen": 109009505, + "router_z_loss_clip": 1.95410156, + "router_z_loss_mlp": 0.27050781, + "step": 5073, + "time_per_iteration": 2.8332679271698 + }, + { + "auxiliary_loss_clip": 0.01515486, + "auxiliary_loss_mlp": 0.01048169, + "balance_loss_clip": 1.31790209, + "balance_loss_mlp": 1.02219296, + "epoch": 0.3050653840372764, + "flos": 14473477082880.0, + "grad_norm": 5.684932935425009, + "language_loss": 0.77550489, + "learning_rate": 3.257434773758163e-06, + "loss": 0.80114138, + "num_input_tokens_seen": 109026350, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.25964355, + "step": 5074, + "time_per_iteration": 2.804622173309326 + }, + { + "auxiliary_loss_clip": 0.01520965, + "auxiliary_loss_mlp": 0.01046083, + "balance_loss_clip": 1.3247782, + "balance_loss_mlp": 1.02157378, + "epoch": 0.30512550728994436, + "flos": 24254565016320.0, + "grad_norm": 2.530775999402961, + "language_loss": 0.75439668, + "learning_rate": 3.25713189132155e-06, + "loss": 0.78006715, + "num_input_tokens_seen": 109044165, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.24523926, + "step": 5075, + "time_per_iteration": 2.904207229614258 + }, + { + "auxiliary_loss_clip": 0.01535971, + "auxiliary_loss_mlp": 0.01052027, + "balance_loss_clip": 1.33299494, + "balance_loss_mlp": 1.02444184, + "epoch": 0.30518563054261233, + "flos": 16368661059840.0, + "grad_norm": 2.2184586790815857, + "language_loss": 0.76755893, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.79343891, + "num_input_tokens_seen": 109060665, + "router_z_loss_clip": 2.03125, + "router_z_loss_mlp": 0.27587891, + "step": 5076, + "time_per_iteration": 2.8144164085388184 + }, + { + "auxiliary_loss_clip": 0.01528188, + "auxiliary_loss_mlp": 0.01048766, + "balance_loss_clip": 1.33208823, + "balance_loss_mlp": 1.02295685, + "epoch": 0.30524575379528035, + "flos": 21589465956480.0, + "grad_norm": 3.0217734486579735, + "language_loss": 0.79230279, + "learning_rate": 3.25652598344811e-06, + "loss": 0.81807232, + "num_input_tokens_seen": 109080035, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.25830078, + "step": 5077, + "time_per_iteration": 3.0125181674957275 + }, + { + "auxiliary_loss_clip": 0.01501531, + "auxiliary_loss_mlp": 0.01046322, + "balance_loss_clip": 1.31175447, + "balance_loss_mlp": 1.02112126, + "epoch": 0.3053058770479483, + "flos": 16553984336640.0, + "grad_norm": 1.6121717380260605, + "language_loss": 0.76435298, + "learning_rate": 3.256222958034259e-06, + "loss": 0.78983152, + "num_input_tokens_seen": 109097385, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.25219727, + "step": 5078, + "time_per_iteration": 4.274973392486572 + }, + { + "auxiliary_loss_clip": 0.01516922, + "auxiliary_loss_mlp": 0.01043013, + "balance_loss_clip": 1.32236218, + "balance_loss_mlp": 1.01876545, + "epoch": 0.3053660003006163, + "flos": 12320568869760.0, + "grad_norm": 1.9498475854428516, + "language_loss": 0.67800635, + "learning_rate": 3.255919884984307e-06, + "loss": 0.70360571, + "num_input_tokens_seen": 109115495, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.24267578, + "step": 5079, + "time_per_iteration": 2.8443379402160645 + }, + { + "auxiliary_loss_clip": 0.0151861, + "auxiliary_loss_mlp": 0.01052243, + "balance_loss_clip": 1.32167327, + "balance_loss_mlp": 1.02769756, + "epoch": 0.30542612355328425, + "flos": 23122645136640.0, + "grad_norm": 1.8980831751325304, + "language_loss": 0.81326473, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.83897328, + "num_input_tokens_seen": 109134235, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.24523926, + "step": 5080, + "time_per_iteration": 2.875612258911133 + }, + { + "auxiliary_loss_clip": 0.01517294, + "auxiliary_loss_mlp": 0.01046958, + "balance_loss_clip": 1.32179284, + "balance_loss_mlp": 1.02041054, + "epoch": 0.3054862468059522, + "flos": 24400181341440.0, + "grad_norm": 2.658461699923453, + "language_loss": 0.8173455, + "learning_rate": 3.255313596022074e-06, + "loss": 0.84298801, + "num_input_tokens_seen": 109152760, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.26525879, + "step": 5081, + "time_per_iteration": 3.027339458465576 + }, + { + "auxiliary_loss_clip": 0.01508254, + "auxiliary_loss_mlp": 0.01049496, + "balance_loss_clip": 1.31363082, + "balance_loss_mlp": 1.02430725, + "epoch": 0.3055463700586202, + "flos": 29397630067200.0, + "grad_norm": 1.6471427397399279, + "language_loss": 0.72477889, + "learning_rate": 3.255010380132783e-06, + "loss": 0.75035638, + "num_input_tokens_seen": 109173925, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.25183105, + "step": 5082, + "time_per_iteration": 2.9955246448516846 + }, + { + "auxiliary_loss_clip": 0.01511905, + "auxiliary_loss_mlp": 0.01046269, + "balance_loss_clip": 1.31139112, + "balance_loss_mlp": 1.0206387, + "epoch": 0.30560649331128814, + "flos": 25602375674880.0, + "grad_norm": 2.3682605818411777, + "language_loss": 0.73644161, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.76202333, + "num_input_tokens_seen": 109192510, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.25671387, + "step": 5083, + "time_per_iteration": 2.9172441959381104 + }, + { + "auxiliary_loss_clip": 0.01513567, + "auxiliary_loss_mlp": 0.01042558, + "balance_loss_clip": 1.31510139, + "balance_loss_mlp": 1.01822746, + "epoch": 0.3056666165639561, + "flos": 19135823685120.0, + "grad_norm": 2.3110297538242235, + "language_loss": 0.71795344, + "learning_rate": 3.254403805595344e-06, + "loss": 0.74351478, + "num_input_tokens_seen": 109210885, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.2434082, + "step": 5084, + "time_per_iteration": 4.2829132080078125 + }, + { + "auxiliary_loss_clip": 0.01533134, + "auxiliary_loss_mlp": 0.0104426, + "balance_loss_clip": 1.33264279, + "balance_loss_mlp": 1.01953638, + "epoch": 0.30572673981662407, + "flos": 15532407820800.0, + "grad_norm": 2.0020444169105116, + "language_loss": 0.79992092, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.82569492, + "num_input_tokens_seen": 109229180, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.24719238, + "step": 5085, + "time_per_iteration": 2.878168821334839 + }, + { + "auxiliary_loss_clip": 0.01508102, + "auxiliary_loss_mlp": 0.01045313, + "balance_loss_clip": 1.31500268, + "balance_loss_mlp": 1.02013588, + "epoch": 0.30578686306929204, + "flos": 21516386325120.0, + "grad_norm": 1.551367390673303, + "language_loss": 0.78911781, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.81465203, + "num_input_tokens_seen": 109249510, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.2520752, + "step": 5086, + "time_per_iteration": 4.265935659408569 + }, + { + "auxiliary_loss_clip": 0.01508471, + "auxiliary_loss_mlp": 0.01047462, + "balance_loss_clip": 1.31734633, + "balance_loss_mlp": 1.0211997, + "epoch": 0.30584698632196, + "flos": 20962678734720.0, + "grad_norm": 2.5549164270938887, + "language_loss": 0.78040326, + "learning_rate": 3.253493587064563e-06, + "loss": 0.8059625, + "num_input_tokens_seen": 109268200, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.26245117, + "step": 5087, + "time_per_iteration": 4.311020135879517 + }, + { + "auxiliary_loss_clip": 0.01520196, + "auxiliary_loss_mlp": 0.0104224, + "balance_loss_clip": 1.31969726, + "balance_loss_mlp": 1.01647854, + "epoch": 0.30590710957462797, + "flos": 24691866439680.0, + "grad_norm": 3.115184633124134, + "language_loss": 0.73422849, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.75985289, + "num_input_tokens_seen": 109288370, + "router_z_loss_clip": 2.00488281, + "router_z_loss_mlp": 0.25793457, + "step": 5088, + "time_per_iteration": 2.9140238761901855 + }, + { + "auxiliary_loss_clip": 0.01537738, + "auxiliary_loss_mlp": 0.01047319, + "balance_loss_clip": 1.33300662, + "balance_loss_mlp": 1.02187932, + "epoch": 0.30596723282729593, + "flos": 17094344711040.0, + "grad_norm": 3.0642660844719773, + "language_loss": 0.8068068, + "learning_rate": 3.252886537028521e-06, + "loss": 0.8326574, + "num_input_tokens_seen": 109306730, + "router_z_loss_clip": 2.04980469, + "router_z_loss_mlp": 0.25439453, + "step": 5089, + "time_per_iteration": 2.8644042015075684 + }, + { + "auxiliary_loss_clip": 0.01519459, + "auxiliary_loss_mlp": 0.01047748, + "balance_loss_clip": 1.32310915, + "balance_loss_mlp": 1.02284551, + "epoch": 0.30602735607996395, + "flos": 22867454609280.0, + "grad_norm": 1.9535396371306832, + "language_loss": 0.77552128, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.80119324, + "num_input_tokens_seen": 109327360, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.24902344, + "step": 5090, + "time_per_iteration": 2.8599305152893066 + }, + { + "auxiliary_loss_clip": 0.01535799, + "auxiliary_loss_mlp": 0.01049042, + "balance_loss_clip": 1.33339548, + "balance_loss_mlp": 1.02391243, + "epoch": 0.3060874793326319, + "flos": 29873416832640.0, + "grad_norm": 2.0957961040183055, + "language_loss": 0.77294654, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.79879498, + "num_input_tokens_seen": 109348135, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.25134277, + "step": 5091, + "time_per_iteration": 2.9089274406433105 + }, + { + "auxiliary_loss_clip": 0.01529658, + "auxiliary_loss_mlp": 0.01053244, + "balance_loss_clip": 1.33017075, + "balance_loss_mlp": 1.02853143, + "epoch": 0.3061476025852999, + "flos": 20458179504000.0, + "grad_norm": 1.6035818326431206, + "language_loss": 0.72225702, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.74808598, + "num_input_tokens_seen": 109366220, + "router_z_loss_clip": 1.99511719, + "router_z_loss_mlp": 0.24694824, + "step": 5092, + "time_per_iteration": 2.865889072418213 + }, + { + "auxiliary_loss_clip": 0.01533631, + "auxiliary_loss_mlp": 0.01055001, + "balance_loss_clip": 1.33486342, + "balance_loss_mlp": 1.03047991, + "epoch": 0.30620772583796785, + "flos": 19401058558080.0, + "grad_norm": 1.897035939274073, + "language_loss": 0.83435655, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.8602429, + "num_input_tokens_seen": 109385260, + "router_z_loss_clip": 1.98730469, + "router_z_loss_mlp": 0.24536133, + "step": 5093, + "time_per_iteration": 2.850254774093628 + }, + { + "auxiliary_loss_clip": 0.01522415, + "auxiliary_loss_mlp": 0.01050518, + "balance_loss_clip": 1.32685065, + "balance_loss_mlp": 1.02683139, + "epoch": 0.3062678490906358, + "flos": 24035190387840.0, + "grad_norm": 2.5222979367463623, + "language_loss": 0.76076043, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.78648973, + "num_input_tokens_seen": 109405025, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.23706055, + "step": 5094, + "time_per_iteration": 2.870781660079956 + }, + { + "auxiliary_loss_clip": 0.01512669, + "auxiliary_loss_mlp": 0.01043073, + "balance_loss_clip": 1.31848955, + "balance_loss_mlp": 1.01944566, + "epoch": 0.3063279723433038, + "flos": 19763877761280.0, + "grad_norm": 1.892468372967705, + "language_loss": 0.76502579, + "learning_rate": 3.251064247058868e-06, + "loss": 0.79058325, + "num_input_tokens_seen": 109422465, + "router_z_loss_clip": 1.94140625, + "router_z_loss_mlp": 0.2364502, + "step": 5095, + "time_per_iteration": 2.8397059440612793 + }, + { + "auxiliary_loss_clip": 0.0152033, + "auxiliary_loss_mlp": 0.01050057, + "balance_loss_clip": 1.32562113, + "balance_loss_mlp": 1.02399755, + "epoch": 0.30638809559597174, + "flos": 22458775161600.0, + "grad_norm": 1.8875145556916268, + "language_loss": 0.81427395, + "learning_rate": 3.250760365955042e-06, + "loss": 0.83997786, + "num_input_tokens_seen": 109440575, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.26074219, + "step": 5096, + "time_per_iteration": 2.903921127319336 + }, + { + "auxiliary_loss_clip": 0.01534285, + "auxiliary_loss_mlp": 0.01046095, + "balance_loss_clip": 1.33440244, + "balance_loss_mlp": 1.02114415, + "epoch": 0.3064482188486397, + "flos": 17173984838400.0, + "grad_norm": 4.969834589557623, + "language_loss": 0.82575011, + "learning_rate": 3.250456437422258e-06, + "loss": 0.85155392, + "num_input_tokens_seen": 109459050, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.24975586, + "step": 5097, + "time_per_iteration": 2.8769636154174805 + }, + { + "auxiliary_loss_clip": 0.01522134, + "auxiliary_loss_mlp": 0.01043813, + "balance_loss_clip": 1.32470536, + "balance_loss_mlp": 1.01974499, + "epoch": 0.3065083421013077, + "flos": 23778868740480.0, + "grad_norm": 1.789587327546449, + "language_loss": 0.78016669, + "learning_rate": 3.250152461472041e-06, + "loss": 0.80582619, + "num_input_tokens_seen": 109475860, + "router_z_loss_clip": 1.97167969, + "router_z_loss_mlp": 0.24072266, + "step": 5098, + "time_per_iteration": 2.8389575481414795 + }, + { + "auxiliary_loss_clip": 0.01518873, + "auxiliary_loss_mlp": 0.01046955, + "balance_loss_clip": 1.32315099, + "balance_loss_mlp": 1.02240944, + "epoch": 0.30656846535397564, + "flos": 26442881925120.0, + "grad_norm": 1.8258649155228994, + "language_loss": 0.84975684, + "learning_rate": 3.249848438115917e-06, + "loss": 0.87541521, + "num_input_tokens_seen": 109494760, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.24536133, + "step": 5099, + "time_per_iteration": 2.9014062881469727 + }, + { + "auxiliary_loss_clip": 0.01528207, + "auxiliary_loss_mlp": 0.01043988, + "balance_loss_clip": 1.33033872, + "balance_loss_mlp": 1.01846528, + "epoch": 0.3066285886066436, + "flos": 26663161449600.0, + "grad_norm": 1.6260609993425659, + "language_loss": 0.8620078, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.8877297, + "num_input_tokens_seen": 109516480, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.25488281, + "step": 5100, + "time_per_iteration": 2.905745506286621 + }, + { + "auxiliary_loss_clip": 0.01529324, + "auxiliary_loss_mlp": 0.01041761, + "balance_loss_clip": 1.33169007, + "balance_loss_mlp": 1.01694202, + "epoch": 0.30668871185931157, + "flos": 15058476092160.0, + "grad_norm": 2.191230103265372, + "language_loss": 0.7979722, + "learning_rate": 3.249240249232065e-06, + "loss": 0.82368302, + "num_input_tokens_seen": 109534615, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.24841309, + "step": 5101, + "time_per_iteration": 2.8432817459106445 + }, + { + "auxiliary_loss_clip": 0.01544967, + "auxiliary_loss_mlp": 0.01047844, + "balance_loss_clip": 1.34394419, + "balance_loss_mlp": 1.02067614, + "epoch": 0.30674883511197953, + "flos": 20091197779200.0, + "grad_norm": 1.914733799970639, + "language_loss": 0.808514, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.83444214, + "num_input_tokens_seen": 109554040, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.27148438, + "step": 5102, + "time_per_iteration": 2.8522660732269287 + }, + { + "auxiliary_loss_clip": 0.01523484, + "auxiliary_loss_mlp": 0.01043732, + "balance_loss_clip": 1.32636428, + "balance_loss_mlp": 1.01770818, + "epoch": 0.30680895836464755, + "flos": 22904220648960.0, + "grad_norm": 2.49579542049509, + "language_loss": 0.8938908, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.91956294, + "num_input_tokens_seen": 109574345, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.26037598, + "step": 5103, + "time_per_iteration": 2.8759870529174805 + }, + { + "auxiliary_loss_clip": 0.01534162, + "auxiliary_loss_mlp": 0.01041521, + "balance_loss_clip": 1.33456349, + "balance_loss_mlp": 1.01747656, + "epoch": 0.3068690816173155, + "flos": 23706694005120.0, + "grad_norm": 1.6517259168487446, + "language_loss": 0.74499571, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.77075255, + "num_input_tokens_seen": 109593670, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.24035645, + "step": 5104, + "time_per_iteration": 2.9119296073913574 + }, + { + "auxiliary_loss_clip": 0.01546868, + "auxiliary_loss_mlp": 0.01047458, + "balance_loss_clip": 1.34403133, + "balance_loss_mlp": 1.0232352, + "epoch": 0.3069292048699835, + "flos": 23561756352000.0, + "grad_norm": 1.9020941932175348, + "language_loss": 0.73579848, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.7617417, + "num_input_tokens_seen": 109613385, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.24230957, + "step": 5105, + "time_per_iteration": 2.8734936714172363 + }, + { + "auxiliary_loss_clip": 0.01525881, + "auxiliary_loss_mlp": 0.01046086, + "balance_loss_clip": 1.32831848, + "balance_loss_mlp": 1.02136171, + "epoch": 0.30698932812265145, + "flos": 24541454165760.0, + "grad_norm": 2.002436644820995, + "language_loss": 0.87981296, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.9055326, + "num_input_tokens_seen": 109632395, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.24731445, + "step": 5106, + "time_per_iteration": 2.8930745124816895 + }, + { + "auxiliary_loss_clip": 0.01552618, + "auxiliary_loss_mlp": 0.01049641, + "balance_loss_clip": 1.34711361, + "balance_loss_mlp": 1.02509534, + "epoch": 0.3070494513753194, + "flos": 21006321984000.0, + "grad_norm": 2.2326936823105448, + "language_loss": 0.71825612, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.74427873, + "num_input_tokens_seen": 109651380, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.2454834, + "step": 5107, + "time_per_iteration": 2.9113588333129883 + }, + { + "auxiliary_loss_clip": 0.01510151, + "auxiliary_loss_mlp": 0.01043753, + "balance_loss_clip": 1.31499195, + "balance_loss_mlp": 1.01895738, + "epoch": 0.3071095746279874, + "flos": 19035253198080.0, + "grad_norm": 2.304917200351878, + "language_loss": 0.7290076, + "learning_rate": 3.247110096547814e-06, + "loss": 0.75454658, + "num_input_tokens_seen": 109670240, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.24816895, + "step": 5108, + "time_per_iteration": 2.927574872970581 + }, + { + "auxiliary_loss_clip": 0.01528865, + "auxiliary_loss_mlp": 0.01045698, + "balance_loss_clip": 1.33025932, + "balance_loss_mlp": 1.02053344, + "epoch": 0.30716969788065535, + "flos": 21225515633280.0, + "grad_norm": 1.6231989282709887, + "language_loss": 0.86450469, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.89025038, + "num_input_tokens_seen": 109690810, + "router_z_loss_clip": 1.99023438, + "router_z_loss_mlp": 0.25146484, + "step": 5109, + "time_per_iteration": 2.8859140872955322 + }, + { + "auxiliary_loss_clip": 0.01516865, + "auxiliary_loss_mlp": 0.01040363, + "balance_loss_clip": 1.31999874, + "balance_loss_mlp": 1.01681876, + "epoch": 0.3072298211333233, + "flos": 25783038737280.0, + "grad_norm": 1.948210976240241, + "language_loss": 0.68380982, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.70938212, + "num_input_tokens_seen": 109711145, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.2355957, + "step": 5110, + "time_per_iteration": 2.8982293605804443 + }, + { + "auxiliary_loss_clip": 0.01517458, + "auxiliary_loss_mlp": 0.01036944, + "balance_loss_clip": 1.32332313, + "balance_loss_mlp": 1.01465225, + "epoch": 0.3072899443859913, + "flos": 25860054666240.0, + "grad_norm": 1.6656944395037456, + "language_loss": 0.77718282, + "learning_rate": 3.246196464379919e-06, + "loss": 0.80272686, + "num_input_tokens_seen": 109731425, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.22302246, + "step": 5111, + "time_per_iteration": 2.9260807037353516 + }, + { + "auxiliary_loss_clip": 0.01529647, + "auxiliary_loss_mlp": 0.01041984, + "balance_loss_clip": 1.32957387, + "balance_loss_mlp": 1.0167836, + "epoch": 0.30735006763865924, + "flos": 25934174928000.0, + "grad_norm": 1.9415890381017564, + "language_loss": 0.68009096, + "learning_rate": 3.245891825796765e-06, + "loss": 0.70580727, + "num_input_tokens_seen": 109752720, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.25219727, + "step": 5112, + "time_per_iteration": 2.8874776363372803 + }, + { + "auxiliary_loss_clip": 0.01542861, + "auxiliary_loss_mlp": 0.01037787, + "balance_loss_clip": 1.3413198, + "balance_loss_mlp": 1.01275277, + "epoch": 0.3074101908913272, + "flos": 30928547007360.0, + "grad_norm": 2.239094134335687, + "language_loss": 0.80723393, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.8330403, + "num_input_tokens_seen": 109772840, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.25024414, + "step": 5113, + "time_per_iteration": 4.402537107467651 + }, + { + "auxiliary_loss_clip": 0.01538218, + "auxiliary_loss_mlp": 0.01039481, + "balance_loss_clip": 1.33610368, + "balance_loss_mlp": 1.01600885, + "epoch": 0.30747031414399517, + "flos": 18409008913920.0, + "grad_norm": 2.5969231933618446, + "language_loss": 0.7809484, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.80672538, + "num_input_tokens_seen": 109790150, + "router_z_loss_clip": 2.02148438, + "router_z_loss_mlp": 0.23486328, + "step": 5114, + "time_per_iteration": 2.8215558528900146 + }, + { + "auxiliary_loss_clip": 0.01518241, + "auxiliary_loss_mlp": 0.01041387, + "balance_loss_clip": 1.32215071, + "balance_loss_mlp": 1.01659191, + "epoch": 0.30753043739666314, + "flos": 22642152912000.0, + "grad_norm": 2.0155495486802324, + "language_loss": 0.62689435, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.65249068, + "num_input_tokens_seen": 109807985, + "router_z_loss_clip": 1.96191406, + "router_z_loss_mlp": 0.2479248, + "step": 5115, + "time_per_iteration": 2.852583646774292 + }, + { + "auxiliary_loss_clip": 0.01528682, + "auxiliary_loss_mlp": 0.01035872, + "balance_loss_clip": 1.33094668, + "balance_loss_mlp": 1.01256657, + "epoch": 0.3075905606493311, + "flos": 27355381931520.0, + "grad_norm": 2.6441278351990456, + "language_loss": 0.83239281, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.85803831, + "num_input_tokens_seen": 109825920, + "router_z_loss_clip": 1.97753906, + "router_z_loss_mlp": 0.23291016, + "step": 5116, + "time_per_iteration": 2.9011270999908447 + }, + { + "auxiliary_loss_clip": 0.01524375, + "auxiliary_loss_mlp": 0.01038691, + "balance_loss_clip": 1.32757926, + "balance_loss_mlp": 1.01599324, + "epoch": 0.3076506839019991, + "flos": 22100932886400.0, + "grad_norm": 1.6984312372638461, + "language_loss": 0.76331145, + "learning_rate": 3.244367924446952e-06, + "loss": 0.7889421, + "num_input_tokens_seen": 109846220, + "router_z_loss_clip": 1.96582031, + "router_z_loss_mlp": 0.22680664, + "step": 5117, + "time_per_iteration": 2.8562419414520264 + }, + { + "auxiliary_loss_clip": 0.01552827, + "auxiliary_loss_mlp": 0.01045559, + "balance_loss_clip": 1.35241258, + "balance_loss_mlp": 1.02186036, + "epoch": 0.3077108071546671, + "flos": 21299816874240.0, + "grad_norm": 2.731567314424615, + "language_loss": 0.72724116, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.75322503, + "num_input_tokens_seen": 109863870, + "router_z_loss_clip": 2.00488281, + "router_z_loss_mlp": 0.23706055, + "step": 5118, + "time_per_iteration": 2.824920415878296 + }, + { + "auxiliary_loss_clip": 0.01527984, + "auxiliary_loss_mlp": 0.01039711, + "balance_loss_clip": 1.3284142, + "balance_loss_mlp": 1.01634634, + "epoch": 0.30777093040733505, + "flos": 21440139557760.0, + "grad_norm": 1.8109162567834796, + "language_loss": 0.7507714, + "learning_rate": 3.243758033520219e-06, + "loss": 0.77644837, + "num_input_tokens_seen": 109883500, + "router_z_loss_clip": 1.99511719, + "router_z_loss_mlp": 0.23364258, + "step": 5119, + "time_per_iteration": 4.324901342391968 + }, + { + "auxiliary_loss_clip": 0.01548319, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_clip": 1.34546447, + "balance_loss_mlp": 1.02105653, + "epoch": 0.307831053660003, + "flos": 23159773134720.0, + "grad_norm": 2.1368306676196496, + "language_loss": 0.80891538, + "learning_rate": 3.243453017305926e-06, + "loss": 0.83484501, + "num_input_tokens_seen": 109904620, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.23583984, + "step": 5120, + "time_per_iteration": 2.945108652114868 + }, + { + "auxiliary_loss_clip": 0.01530168, + "auxiliary_loss_mlp": 0.01043595, + "balance_loss_clip": 1.3344996, + "balance_loss_mlp": 1.01934814, + "epoch": 0.307891176912671, + "flos": 17028685226880.0, + "grad_norm": 1.7640058377486132, + "language_loss": 0.80951458, + "learning_rate": 3.24314795393977e-06, + "loss": 0.83525223, + "num_input_tokens_seen": 109922275, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.24243164, + "step": 5121, + "time_per_iteration": 2.8610305786132812 + }, + { + "auxiliary_loss_clip": 0.01541617, + "auxiliary_loss_mlp": 0.01038547, + "balance_loss_clip": 1.345433, + "balance_loss_mlp": 1.01543283, + "epoch": 0.30795130016533895, + "flos": 27715441201920.0, + "grad_norm": 1.669702808430349, + "language_loss": 0.8335973, + "learning_rate": 3.242842843433319e-06, + "loss": 0.85939896, + "num_input_tokens_seen": 109944265, + "router_z_loss_clip": 1.96289062, + "router_z_loss_mlp": 0.23120117, + "step": 5122, + "time_per_iteration": 5.926538467407227 + }, + { + "auxiliary_loss_clip": 0.01349946, + "auxiliary_loss_mlp": 0.01097227, + "balance_loss_clip": 1.23812473, + "balance_loss_mlp": 1.06938004, + "epoch": 0.3080114234180069, + "flos": 69093262356480.0, + "grad_norm": 0.7689857478606799, + "language_loss": 0.58620989, + "learning_rate": 3.242537685798143e-06, + "loss": 0.61068165, + "num_input_tokens_seen": 110014160, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.27929688, + "step": 5123, + "time_per_iteration": 3.566836357116699 + }, + { + "auxiliary_loss_clip": 0.01567432, + "auxiliary_loss_mlp": 0.01044474, + "balance_loss_clip": 1.35920274, + "balance_loss_mlp": 1.02051306, + "epoch": 0.3080715466706749, + "flos": 24070644328320.0, + "grad_norm": 1.6339584945848296, + "language_loss": 0.83718997, + "learning_rate": 3.242232481045813e-06, + "loss": 0.86330903, + "num_input_tokens_seen": 110034865, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.23974609, + "step": 5124, + "time_per_iteration": 2.906496286392212 + }, + { + "auxiliary_loss_clip": 0.0154116, + "auxiliary_loss_mlp": 0.01039151, + "balance_loss_clip": 1.33967507, + "balance_loss_mlp": 1.01595235, + "epoch": 0.30813166992334284, + "flos": 25859737952640.0, + "grad_norm": 2.3551759008249102, + "language_loss": 0.80508423, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.83088732, + "num_input_tokens_seen": 110052930, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.23193359, + "step": 5125, + "time_per_iteration": 2.8837056159973145 + }, + { + "auxiliary_loss_clip": 0.01558683, + "auxiliary_loss_mlp": 0.01047149, + "balance_loss_clip": 1.35168862, + "balance_loss_mlp": 1.0222578, + "epoch": 0.3081917931760108, + "flos": 20459491603200.0, + "grad_norm": 1.838447393341427, + "language_loss": 0.65303266, + "learning_rate": 3.241621930235989e-06, + "loss": 0.67909098, + "num_input_tokens_seen": 110071765, + "router_z_loss_clip": 2.0703125, + "router_z_loss_mlp": 0.24902344, + "step": 5126, + "time_per_iteration": 2.945152521133423 + }, + { + "auxiliary_loss_clip": 0.01521672, + "auxiliary_loss_mlp": 0.01043487, + "balance_loss_clip": 1.32662487, + "balance_loss_mlp": 1.01981211, + "epoch": 0.3082519164286788, + "flos": 22177043919360.0, + "grad_norm": 1.644532973600044, + "language_loss": 0.87322229, + "learning_rate": 3.241316584201646e-06, + "loss": 0.89887393, + "num_input_tokens_seen": 110092660, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.23681641, + "step": 5127, + "time_per_iteration": 2.885012626647949 + }, + { + "auxiliary_loss_clip": 0.01538359, + "auxiliary_loss_mlp": 0.01043188, + "balance_loss_clip": 1.34079218, + "balance_loss_mlp": 1.02113461, + "epoch": 0.30831203968134674, + "flos": 28925236661760.0, + "grad_norm": 2.6522698950649555, + "language_loss": 0.69152355, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.71733904, + "num_input_tokens_seen": 110114960, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.22045898, + "step": 5128, + "time_per_iteration": 2.9108567237854004 + }, + { + "auxiliary_loss_clip": 0.01557042, + "auxiliary_loss_mlp": 0.01046047, + "balance_loss_clip": 1.3520174, + "balance_loss_mlp": 1.02231193, + "epoch": 0.3083721629340147, + "flos": 25679120135040.0, + "grad_norm": 2.0036776184323593, + "language_loss": 0.72545362, + "learning_rate": 3.240705750931993e-06, + "loss": 0.75148451, + "num_input_tokens_seen": 110135750, + "router_z_loss_clip": 2.05078125, + "router_z_loss_mlp": 0.23742676, + "step": 5129, + "time_per_iteration": 2.8721532821655273 + }, + { + "auxiliary_loss_clip": 0.01342799, + "auxiliary_loss_mlp": 0.01061602, + "balance_loss_clip": 1.22659898, + "balance_loss_mlp": 1.02917755, + "epoch": 0.3084322861866827, + "flos": 68245245469440.0, + "grad_norm": 0.8419197658500721, + "language_loss": 0.59387404, + "learning_rate": 3.240400263719846e-06, + "loss": 0.61791801, + "num_input_tokens_seen": 110189480, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.32421875, + "step": 5130, + "time_per_iteration": 3.349917411804199 + }, + { + "auxiliary_loss_clip": 0.01571579, + "auxiliary_loss_mlp": 0.01044835, + "balance_loss_clip": 1.36578298, + "balance_loss_mlp": 1.02157688, + "epoch": 0.3084924094393507, + "flos": 20304464359680.0, + "grad_norm": 2.2511400152667886, + "language_loss": 0.74416912, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.77033329, + "num_input_tokens_seen": 110206445, + "router_z_loss_clip": 2.05957031, + "router_z_loss_mlp": 0.23266602, + "step": 5131, + "time_per_iteration": 2.9103968143463135 + }, + { + "auxiliary_loss_clip": 0.01552318, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_clip": 1.35395312, + "balance_loss_mlp": 1.02313387, + "epoch": 0.30855253269201866, + "flos": 23959984250880.0, + "grad_norm": 1.5641806292141003, + "language_loss": 0.71842885, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.74440163, + "num_input_tokens_seen": 110226845, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.21838379, + "step": 5132, + "time_per_iteration": 2.8766536712646484 + }, + { + "auxiliary_loss_clip": 0.01536818, + "auxiliary_loss_mlp": 0.01045376, + "balance_loss_clip": 1.34228539, + "balance_loss_mlp": 1.02348971, + "epoch": 0.3086126559446866, + "flos": 19291393866240.0, + "grad_norm": 1.7900727868980226, + "language_loss": 0.90473872, + "learning_rate": 3.239483519913136e-06, + "loss": 0.93056065, + "num_input_tokens_seen": 110244095, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.21899414, + "step": 5133, + "time_per_iteration": 2.84804105758667 + }, + { + "auxiliary_loss_clip": 0.01553802, + "auxiliary_loss_mlp": 0.01045267, + "balance_loss_clip": 1.34980536, + "balance_loss_mlp": 1.0218066, + "epoch": 0.3086727791973546, + "flos": 33772499337600.0, + "grad_norm": 3.184449077739982, + "language_loss": 0.68374538, + "learning_rate": 3.239177844626102e-06, + "loss": 0.70973611, + "num_input_tokens_seen": 110264240, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.23461914, + "step": 5134, + "time_per_iteration": 2.9653737545013428 + }, + { + "auxiliary_loss_clip": 0.0155578, + "auxiliary_loss_mlp": 0.01048282, + "balance_loss_clip": 1.35136724, + "balance_loss_mlp": 1.02551317, + "epoch": 0.30873290245002255, + "flos": 16042698385920.0, + "grad_norm": 2.181458572614256, + "language_loss": 0.83564591, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.86168659, + "num_input_tokens_seen": 110282450, + "router_z_loss_clip": 2.04492188, + "router_z_loss_mlp": 0.22766113, + "step": 5135, + "time_per_iteration": 2.8393285274505615 + }, + { + "auxiliary_loss_clip": 0.01324988, + "auxiliary_loss_mlp": 0.01030961, + "balance_loss_clip": 1.20963204, + "balance_loss_mlp": 0.99910849, + "epoch": 0.3087930257026905, + "flos": 65082614878080.0, + "grad_norm": 0.7015897598997128, + "language_loss": 0.55332732, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.57688683, + "num_input_tokens_seen": 110343715, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.31835938, + "step": 5136, + "time_per_iteration": 3.4245336055755615 + }, + { + "auxiliary_loss_clip": 0.01546245, + "auxiliary_loss_mlp": 0.01045048, + "balance_loss_clip": 1.34735787, + "balance_loss_mlp": 1.02133775, + "epoch": 0.3088531489553585, + "flos": 74762372442240.0, + "grad_norm": 2.022984386543483, + "language_loss": 0.77013546, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.7960484, + "num_input_tokens_seen": 110368430, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.23718262, + "step": 5137, + "time_per_iteration": 3.24355411529541 + }, + { + "auxiliary_loss_clip": 0.0155015, + "auxiliary_loss_mlp": 0.01045918, + "balance_loss_clip": 1.34925818, + "balance_loss_mlp": 1.02431679, + "epoch": 0.30891327220802645, + "flos": 21152209777920.0, + "grad_norm": 2.4242515525288013, + "language_loss": 0.80838501, + "learning_rate": 3.237954673696424e-06, + "loss": 0.8343457, + "num_input_tokens_seen": 110386735, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.21606445, + "step": 5138, + "time_per_iteration": 2.948270082473755 + }, + { + "auxiliary_loss_clip": 0.01554887, + "auxiliary_loss_mlp": 0.01046799, + "balance_loss_clip": 1.35224485, + "balance_loss_mlp": 1.02399433, + "epoch": 0.3089733954606944, + "flos": 25675048103040.0, + "grad_norm": 1.563443530945742, + "language_loss": 0.82123864, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.84725553, + "num_input_tokens_seen": 110406820, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.22827148, + "step": 5139, + "time_per_iteration": 2.903390407562256 + }, + { + "auxiliary_loss_clip": 0.01567846, + "auxiliary_loss_mlp": 0.0105127, + "balance_loss_clip": 1.35850859, + "balance_loss_mlp": 1.02670145, + "epoch": 0.3090335187133624, + "flos": 19436783967360.0, + "grad_norm": 1.895995878801638, + "language_loss": 0.77577549, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.80196667, + "num_input_tokens_seen": 110424225, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.24560547, + "step": 5140, + "time_per_iteration": 2.831421136856079 + }, + { + "auxiliary_loss_clip": 0.0151548, + "auxiliary_loss_mlp": 0.01050212, + "balance_loss_clip": 1.32320213, + "balance_loss_mlp": 1.02851617, + "epoch": 0.30909364196603034, + "flos": 20020968570240.0, + "grad_norm": 1.8001831515779823, + "language_loss": 0.788001, + "learning_rate": 3.237036802553252e-06, + "loss": 0.81365794, + "num_input_tokens_seen": 110443310, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.21704102, + "step": 5141, + "time_per_iteration": 2.863175868988037 + }, + { + "auxiliary_loss_clip": 0.01549421, + "auxiliary_loss_mlp": 0.01047338, + "balance_loss_clip": 1.34595513, + "balance_loss_mlp": 1.02274561, + "epoch": 0.3091537652186983, + "flos": 19685730712320.0, + "grad_norm": 2.4835727325304315, + "language_loss": 0.87957466, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.90554225, + "num_input_tokens_seen": 110460215, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.24597168, + "step": 5142, + "time_per_iteration": 2.822154998779297 + }, + { + "auxiliary_loss_clip": 0.01553146, + "auxiliary_loss_mlp": 0.0104766, + "balance_loss_clip": 1.34963107, + "balance_loss_mlp": 1.02472472, + "epoch": 0.3092138884713663, + "flos": 17029047185280.0, + "grad_norm": 2.705306223159886, + "language_loss": 0.7999056, + "learning_rate": 3.23642465389567e-06, + "loss": 0.82591361, + "num_input_tokens_seen": 110479385, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.22924805, + "step": 5143, + "time_per_iteration": 2.8427388668060303 + }, + { + "auxiliary_loss_clip": 0.01532706, + "auxiliary_loss_mlp": 0.0104382, + "balance_loss_clip": 1.33411789, + "balance_loss_mlp": 1.02056265, + "epoch": 0.3092740117240343, + "flos": 25020951004800.0, + "grad_norm": 3.8964071788514563, + "language_loss": 0.73031723, + "learning_rate": 3.236118509233055e-06, + "loss": 0.75608248, + "num_input_tokens_seen": 110499885, + "router_z_loss_clip": 1.98535156, + "router_z_loss_mlp": 0.2322998, + "step": 5144, + "time_per_iteration": 2.87309193611145 + }, + { + "auxiliary_loss_clip": 0.01547209, + "auxiliary_loss_mlp": 0.01050724, + "balance_loss_clip": 1.34349203, + "balance_loss_mlp": 1.02808666, + "epoch": 0.30933413497670226, + "flos": 25600656372480.0, + "grad_norm": 3.02704440295389, + "language_loss": 0.74580085, + "learning_rate": 3.235812317696702e-06, + "loss": 0.77178013, + "num_input_tokens_seen": 110519690, + "router_z_loss_clip": 2.03808594, + "router_z_loss_mlp": 0.22631836, + "step": 5145, + "time_per_iteration": 2.9030110836029053 + }, + { + "auxiliary_loss_clip": 0.01525864, + "auxiliary_loss_mlp": 0.01045792, + "balance_loss_clip": 1.32646704, + "balance_loss_mlp": 1.02198625, + "epoch": 0.3093942582293702, + "flos": 24400452810240.0, + "grad_norm": 1.712915366471327, + "language_loss": 0.76811993, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.79383641, + "num_input_tokens_seen": 110540520, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.23815918, + "step": 5146, + "time_per_iteration": 2.935161828994751 + }, + { + "auxiliary_loss_clip": 0.01533833, + "auxiliary_loss_mlp": 0.01035171, + "balance_loss_clip": 1.33596623, + "balance_loss_mlp": 1.01325989, + "epoch": 0.3094543814820382, + "flos": 19655977616640.0, + "grad_norm": 2.1561556727538225, + "language_loss": 0.67911488, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.7048049, + "num_input_tokens_seen": 110557950, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.21911621, + "step": 5147, + "time_per_iteration": 2.8600285053253174 + }, + { + "auxiliary_loss_clip": 0.01554172, + "auxiliary_loss_mlp": 0.01045186, + "balance_loss_clip": 1.35192049, + "balance_loss_mlp": 1.02290606, + "epoch": 0.30951450473470615, + "flos": 25674188451840.0, + "grad_norm": 2.724317064246959, + "language_loss": 0.76115531, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.78714895, + "num_input_tokens_seen": 110578215, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.22290039, + "step": 5148, + "time_per_iteration": 4.3097755908966064 + }, + { + "auxiliary_loss_clip": 0.0156764, + "auxiliary_loss_mlp": 0.01047594, + "balance_loss_clip": 1.35811794, + "balance_loss_mlp": 1.02368116, + "epoch": 0.3095746279873741, + "flos": 12027843141120.0, + "grad_norm": 2.0187723623022893, + "language_loss": 0.73697513, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.76312745, + "num_input_tokens_seen": 110592990, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.23913574, + "step": 5149, + "time_per_iteration": 2.8093461990356445 + }, + { + "auxiliary_loss_clip": 0.01550079, + "auxiliary_loss_mlp": 0.01045221, + "balance_loss_clip": 1.34458899, + "balance_loss_mlp": 1.02143896, + "epoch": 0.3096347512400421, + "flos": 23633523884160.0, + "grad_norm": 1.943822926546144, + "language_loss": 0.85785097, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.88380396, + "num_input_tokens_seen": 110612130, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.23791504, + "step": 5150, + "time_per_iteration": 2.9348647594451904 + }, + { + "auxiliary_loss_clip": 0.01537678, + "auxiliary_loss_mlp": 0.01039943, + "balance_loss_clip": 1.33914888, + "balance_loss_mlp": 1.01687646, + "epoch": 0.30969487449271005, + "flos": 22539727388160.0, + "grad_norm": 2.3868174092431294, + "language_loss": 0.79478151, + "learning_rate": 3.233974184780424e-06, + "loss": 0.82055771, + "num_input_tokens_seen": 110632045, + "router_z_loss_clip": 1.98535156, + "router_z_loss_mlp": 0.23059082, + "step": 5151, + "time_per_iteration": 2.9043610095977783 + }, + { + "auxiliary_loss_clip": 0.01546491, + "auxiliary_loss_mlp": 0.01041218, + "balance_loss_clip": 1.34413159, + "balance_loss_mlp": 1.01810312, + "epoch": 0.309754997745378, + "flos": 15276267152640.0, + "grad_norm": 2.320458237701512, + "language_loss": 0.67965043, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.70552754, + "num_input_tokens_seen": 110649340, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.23144531, + "step": 5152, + "time_per_iteration": 2.8461403846740723 + }, + { + "auxiliary_loss_clip": 0.01542695, + "auxiliary_loss_mlp": 0.01044079, + "balance_loss_clip": 1.3417486, + "balance_loss_mlp": 1.01956964, + "epoch": 0.309815120998046, + "flos": 26990526712320.0, + "grad_norm": 2.160614435426142, + "language_loss": 0.83397353, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.85984129, + "num_input_tokens_seen": 110668450, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.24523926, + "step": 5153, + "time_per_iteration": 2.9329991340637207 + }, + { + "auxiliary_loss_clip": 0.01540112, + "auxiliary_loss_mlp": 0.01045228, + "balance_loss_clip": 1.34035313, + "balance_loss_mlp": 1.0209806, + "epoch": 0.30987524425071394, + "flos": 21153295653120.0, + "grad_norm": 1.859146620208576, + "language_loss": 0.74863404, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.77448738, + "num_input_tokens_seen": 110689410, + "router_z_loss_clip": 1.99414062, + "router_z_loss_mlp": 0.24255371, + "step": 5154, + "time_per_iteration": 4.276312351226807 + }, + { + "auxiliary_loss_clip": 0.01541806, + "auxiliary_loss_mlp": 0.01045913, + "balance_loss_clip": 1.34461427, + "balance_loss_mlp": 1.02215505, + "epoch": 0.3099353675033819, + "flos": 15276719600640.0, + "grad_norm": 2.020028819122007, + "language_loss": 0.7723763, + "learning_rate": 3.232747826832858e-06, + "loss": 0.79825354, + "num_input_tokens_seen": 110707350, + "router_z_loss_clip": 1.97167969, + "router_z_loss_mlp": 0.2376709, + "step": 5155, + "time_per_iteration": 2.8350586891174316 + }, + { + "auxiliary_loss_clip": 0.01543558, + "auxiliary_loss_mlp": 0.01041512, + "balance_loss_clip": 1.34094453, + "balance_loss_mlp": 1.01736045, + "epoch": 0.30999549075604993, + "flos": 15422154946560.0, + "grad_norm": 1.8858272665506959, + "language_loss": 0.79840457, + "learning_rate": 3.232441120452094e-06, + "loss": 0.82425535, + "num_input_tokens_seen": 110724910, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.24169922, + "step": 5156, + "time_per_iteration": 2.904628276824951 + }, + { + "auxiliary_loss_clip": 0.0154662, + "auxiliary_loss_mlp": 0.01045932, + "balance_loss_clip": 1.3425107, + "balance_loss_mlp": 1.02170849, + "epoch": 0.3100556140087179, + "flos": 23194729382400.0, + "grad_norm": 2.092220494959802, + "language_loss": 0.75463319, + "learning_rate": 3.23213436733704e-06, + "loss": 0.78055871, + "num_input_tokens_seen": 110744010, + "router_z_loss_clip": 2.04101562, + "router_z_loss_mlp": 0.2421875, + "step": 5157, + "time_per_iteration": 5.64022970199585 + }, + { + "auxiliary_loss_clip": 0.01521826, + "auxiliary_loss_mlp": 0.01041533, + "balance_loss_clip": 1.32483804, + "balance_loss_mlp": 1.01739323, + "epoch": 0.31011573726138586, + "flos": 25753964313600.0, + "grad_norm": 1.6284773653250768, + "language_loss": 0.70344549, + "learning_rate": 3.231827567499327e-06, + "loss": 0.72907901, + "num_input_tokens_seen": 110765835, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.24133301, + "step": 5158, + "time_per_iteration": 2.91727876663208 + }, + { + "auxiliary_loss_clip": 0.01509119, + "auxiliary_loss_mlp": 0.01040497, + "balance_loss_clip": 1.31519365, + "balance_loss_mlp": 1.0178709, + "epoch": 0.3101758605140538, + "flos": 20020968570240.0, + "grad_norm": 2.203945764206072, + "language_loss": 0.85349101, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.87898719, + "num_input_tokens_seen": 110784655, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.22607422, + "step": 5159, + "time_per_iteration": 2.873427391052246 + }, + { + "auxiliary_loss_clip": 0.01538873, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.33817887, + "balance_loss_mlp": 1.01569486, + "epoch": 0.3102359837667218, + "flos": 19145098869120.0, + "grad_norm": 1.8295323207005212, + "language_loss": 0.85800552, + "learning_rate": 3.231213827702462e-06, + "loss": 0.88379353, + "num_input_tokens_seen": 110802545, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.24206543, + "step": 5160, + "time_per_iteration": 2.893089771270752 + }, + { + "auxiliary_loss_clip": 0.0152133, + "auxiliary_loss_mlp": 0.01039228, + "balance_loss_clip": 1.32574999, + "balance_loss_mlp": 1.01550555, + "epoch": 0.31029610701938976, + "flos": 22273542374400.0, + "grad_norm": 8.985555580697863, + "language_loss": 0.7698763, + "learning_rate": 3.230906887766584e-06, + "loss": 0.79548186, + "num_input_tokens_seen": 110820265, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.23718262, + "step": 5161, + "time_per_iteration": 2.8945724964141846 + }, + { + "auxiliary_loss_clip": 0.0154451, + "auxiliary_loss_mlp": 0.01046955, + "balance_loss_clip": 1.34044147, + "balance_loss_mlp": 1.02202868, + "epoch": 0.3103562302720577, + "flos": 20812945132800.0, + "grad_norm": 2.018752707764171, + "language_loss": 0.82452875, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.85044336, + "num_input_tokens_seen": 110836195, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.24914551, + "step": 5162, + "time_per_iteration": 3.060969591140747 + }, + { + "auxiliary_loss_clip": 0.01523506, + "auxiliary_loss_mlp": 0.01040424, + "balance_loss_clip": 1.32657146, + "balance_loss_mlp": 1.01678503, + "epoch": 0.3104163535247257, + "flos": 22353815928960.0, + "grad_norm": 1.5365247864117302, + "language_loss": 0.83326781, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.8589071, + "num_input_tokens_seen": 110856420, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.23657227, + "step": 5163, + "time_per_iteration": 2.8725178241729736 + }, + { + "auxiliary_loss_clip": 0.01533234, + "auxiliary_loss_mlp": 0.01039677, + "balance_loss_clip": 1.33479047, + "balance_loss_mlp": 1.01664615, + "epoch": 0.31047647677739365, + "flos": 21699356872320.0, + "grad_norm": 2.1596707371073185, + "language_loss": 0.76513338, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.79086256, + "num_input_tokens_seen": 110876650, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.23010254, + "step": 5164, + "time_per_iteration": 2.928584337234497 + }, + { + "auxiliary_loss_clip": 0.01530462, + "auxiliary_loss_mlp": 0.01039319, + "balance_loss_clip": 1.33321035, + "balance_loss_mlp": 1.0156796, + "epoch": 0.3105366000300616, + "flos": 18926764871040.0, + "grad_norm": 2.694951405418487, + "language_loss": 0.75826049, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.78395832, + "num_input_tokens_seen": 110894445, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.23632812, + "step": 5165, + "time_per_iteration": 2.8837180137634277 + }, + { + "auxiliary_loss_clip": 0.01532458, + "auxiliary_loss_mlp": 0.01039807, + "balance_loss_clip": 1.33581686, + "balance_loss_mlp": 1.01558375, + "epoch": 0.3105967232827296, + "flos": 18269681616000.0, + "grad_norm": 1.4771711477224014, + "language_loss": 0.76693726, + "learning_rate": 3.229371488178348e-06, + "loss": 0.79265988, + "num_input_tokens_seen": 110912855, + "router_z_loss_clip": 1.96191406, + "router_z_loss_mlp": 0.24230957, + "step": 5166, + "time_per_iteration": 2.8363800048828125 + }, + { + "auxiliary_loss_clip": 0.01533831, + "auxiliary_loss_mlp": 0.01039818, + "balance_loss_clip": 1.33545041, + "balance_loss_mlp": 1.01557124, + "epoch": 0.31065684653539755, + "flos": 17680112881920.0, + "grad_norm": 2.423809866077527, + "language_loss": 0.74884462, + "learning_rate": 3.229064268360444e-06, + "loss": 0.77458107, + "num_input_tokens_seen": 110928025, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.24230957, + "step": 5167, + "time_per_iteration": 2.82952880859375 + }, + { + "auxiliary_loss_clip": 0.01300186, + "auxiliary_loss_mlp": 0.01025721, + "balance_loss_clip": 1.18061972, + "balance_loss_mlp": 1.00111639, + "epoch": 0.3107169697880655, + "flos": 68562294917760.0, + "grad_norm": 0.7167578015775972, + "language_loss": 0.53039575, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.55365479, + "num_input_tokens_seen": 110992215, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.24511719, + "step": 5168, + "time_per_iteration": 3.4684948921203613 + }, + { + "auxiliary_loss_clip": 0.0153787, + "auxiliary_loss_mlp": 0.0103719, + "balance_loss_clip": 1.33435464, + "balance_loss_mlp": 1.01194191, + "epoch": 0.3107770930407335, + "flos": 13196664794880.0, + "grad_norm": 1.9002428924721744, + "language_loss": 0.80362558, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.82937622, + "num_input_tokens_seen": 111010400, + "router_z_loss_clip": 2.03417969, + "router_z_loss_mlp": 0.25244141, + "step": 5169, + "time_per_iteration": 2.9534802436828613 + }, + { + "auxiliary_loss_clip": 0.01539642, + "auxiliary_loss_mlp": 0.0104674, + "balance_loss_clip": 1.33621621, + "balance_loss_mlp": 1.02240908, + "epoch": 0.3108372162934015, + "flos": 31594407753600.0, + "grad_norm": 1.5891733678315627, + "language_loss": 0.65302378, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.67888761, + "num_input_tokens_seen": 111033960, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.2434082, + "step": 5170, + "time_per_iteration": 2.984879493713379 + }, + { + "auxiliary_loss_clip": 0.01535156, + "auxiliary_loss_mlp": 0.01041679, + "balance_loss_clip": 1.33716869, + "balance_loss_mlp": 1.01666927, + "epoch": 0.31089733954606946, + "flos": 28741315973760.0, + "grad_norm": 2.090131672071812, + "language_loss": 0.78241563, + "learning_rate": 3.22783492314295e-06, + "loss": 0.80818403, + "num_input_tokens_seen": 111053265, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.25, + "step": 5171, + "time_per_iteration": 2.9152045249938965 + }, + { + "auxiliary_loss_clip": 0.01535092, + "auxiliary_loss_mlp": 0.01045008, + "balance_loss_clip": 1.33353329, + "balance_loss_mlp": 1.01979542, + "epoch": 0.3109574627987374, + "flos": 19692879390720.0, + "grad_norm": 2.240237915990043, + "language_loss": 0.84203506, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.867836, + "num_input_tokens_seen": 111071130, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.25219727, + "step": 5172, + "time_per_iteration": 2.868312120437622 + }, + { + "auxiliary_loss_clip": 0.01532197, + "auxiliary_loss_mlp": 0.01042832, + "balance_loss_clip": 1.33007956, + "balance_loss_mlp": 1.01810789, + "epoch": 0.3110175860514054, + "flos": 14692127794560.0, + "grad_norm": 2.1136543169132187, + "language_loss": 0.85858458, + "learning_rate": 3.227219971129842e-06, + "loss": 0.8843348, + "num_input_tokens_seen": 111089560, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.24743652, + "step": 5173, + "time_per_iteration": 2.8316757678985596 + }, + { + "auxiliary_loss_clip": 0.01513991, + "auxiliary_loss_mlp": 0.01042858, + "balance_loss_clip": 1.31954801, + "balance_loss_mlp": 1.01909935, + "epoch": 0.31107770930407336, + "flos": 25750797177600.0, + "grad_norm": 1.604442620155975, + "language_loss": 0.83980739, + "learning_rate": 3.226912425313001e-06, + "loss": 0.86537588, + "num_input_tokens_seen": 111109960, + "router_z_loss_clip": 1.94335938, + "router_z_loss_mlp": 0.23754883, + "step": 5174, + "time_per_iteration": 2.933851718902588 + }, + { + "auxiliary_loss_clip": 0.01535713, + "auxiliary_loss_mlp": 0.01046278, + "balance_loss_clip": 1.33537483, + "balance_loss_mlp": 1.02222192, + "epoch": 0.3111378325567413, + "flos": 19217590318080.0, + "grad_norm": 1.8192178061925992, + "language_loss": 0.85455048, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.88037044, + "num_input_tokens_seen": 111127960, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.24060059, + "step": 5175, + "time_per_iteration": 2.8638949394226074 + }, + { + "auxiliary_loss_clip": 0.01527358, + "auxiliary_loss_mlp": 0.0104271, + "balance_loss_clip": 1.33233225, + "balance_loss_mlp": 1.01789093, + "epoch": 0.3111979558094093, + "flos": 23706965473920.0, + "grad_norm": 3.0842465003929003, + "language_loss": 0.85106754, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.87676823, + "num_input_tokens_seen": 111146730, + "router_z_loss_clip": 1.94921875, + "router_z_loss_mlp": 0.24853516, + "step": 5176, + "time_per_iteration": 2.95747971534729 + }, + { + "auxiliary_loss_clip": 0.01518882, + "auxiliary_loss_mlp": 0.01039166, + "balance_loss_clip": 1.31981564, + "balance_loss_mlp": 1.01435876, + "epoch": 0.31125807906207725, + "flos": 21042952289280.0, + "grad_norm": 1.946237020718645, + "language_loss": 0.82179785, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.84737825, + "num_input_tokens_seen": 111166295, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.24816895, + "step": 5177, + "time_per_iteration": 2.9012701511383057 + }, + { + "auxiliary_loss_clip": 0.01537647, + "auxiliary_loss_mlp": 0.01041331, + "balance_loss_clip": 1.33790851, + "balance_loss_mlp": 1.01655912, + "epoch": 0.3113182023147452, + "flos": 23087191196160.0, + "grad_norm": 1.858604024031705, + "language_loss": 0.81410795, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.83989775, + "num_input_tokens_seen": 111185665, + "router_z_loss_clip": 1.99511719, + "router_z_loss_mlp": 0.24768066, + "step": 5178, + "time_per_iteration": 2.870715618133545 + }, + { + "auxiliary_loss_clip": 0.01545534, + "auxiliary_loss_mlp": 0.01045719, + "balance_loss_clip": 1.3431747, + "balance_loss_mlp": 1.02159095, + "epoch": 0.3113783255674132, + "flos": 11846637141120.0, + "grad_norm": 2.4951595646321505, + "language_loss": 0.81802797, + "learning_rate": 3.225373998592471e-06, + "loss": 0.8439405, + "num_input_tokens_seen": 111201615, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.24133301, + "step": 5179, + "time_per_iteration": 2.836204767227173 + }, + { + "auxiliary_loss_clip": 0.0153006, + "auxiliary_loss_mlp": 0.01045085, + "balance_loss_clip": 1.33257258, + "balance_loss_mlp": 1.02123117, + "epoch": 0.31143844882008115, + "flos": 16298160382080.0, + "grad_norm": 3.7024828049705896, + "language_loss": 0.79567409, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.82142556, + "num_input_tokens_seen": 111220515, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.23840332, + "step": 5180, + "time_per_iteration": 2.8346080780029297 + }, + { + "auxiliary_loss_clip": 0.01543318, + "auxiliary_loss_mlp": 0.01039379, + "balance_loss_clip": 1.34290564, + "balance_loss_mlp": 1.01485753, + "epoch": 0.3114985720727491, + "flos": 23227151921280.0, + "grad_norm": 1.5754857422044288, + "language_loss": 0.84675384, + "learning_rate": 3.22475830255844e-06, + "loss": 0.87258077, + "num_input_tokens_seen": 111240395, + "router_z_loss_clip": 2.00585938, + "router_z_loss_mlp": 0.24536133, + "step": 5181, + "time_per_iteration": 2.8967292308807373 + }, + { + "auxiliary_loss_clip": 0.01527664, + "auxiliary_loss_mlp": 0.01044951, + "balance_loss_clip": 1.33137488, + "balance_loss_mlp": 1.02208662, + "epoch": 0.3115586953254171, + "flos": 30056794583040.0, + "grad_norm": 1.7967906947190966, + "language_loss": 0.75051314, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.77623928, + "num_input_tokens_seen": 111261100, + "router_z_loss_clip": 1.96289062, + "router_z_loss_mlp": 0.2286377, + "step": 5182, + "time_per_iteration": 2.9654948711395264 + }, + { + "auxiliary_loss_clip": 0.01560698, + "auxiliary_loss_mlp": 0.01045188, + "balance_loss_clip": 1.3558594, + "balance_loss_mlp": 1.02067876, + "epoch": 0.3116188185780851, + "flos": 25677355587840.0, + "grad_norm": 2.321260486877012, + "language_loss": 0.71566343, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.74172229, + "num_input_tokens_seen": 111281320, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.24511719, + "step": 5183, + "time_per_iteration": 4.323462009429932 + }, + { + "auxiliary_loss_clip": 0.01302694, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.18465185, + "balance_loss_mlp": 1.00948, + "epoch": 0.31167894183075306, + "flos": 69538780350720.0, + "grad_norm": 0.9620718447469093, + "language_loss": 0.59799278, + "learning_rate": 3.223834410214408e-06, + "loss": 0.62136823, + "num_input_tokens_seen": 111341405, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.25390625, + "step": 5184, + "time_per_iteration": 3.3900020122528076 + }, + { + "auxiliary_loss_clip": 0.01539732, + "auxiliary_loss_mlp": 0.01041314, + "balance_loss_clip": 1.34001231, + "balance_loss_mlp": 1.01780581, + "epoch": 0.31173906508342103, + "flos": 14948177973120.0, + "grad_norm": 2.879271923158347, + "language_loss": 0.71037436, + "learning_rate": 3.223526353268311e-06, + "loss": 0.73618484, + "num_input_tokens_seen": 111358975, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.23535156, + "step": 5185, + "time_per_iteration": 2.8665223121643066 + }, + { + "auxiliary_loss_clip": 0.0155297, + "auxiliary_loss_mlp": 0.01045544, + "balance_loss_clip": 1.34966826, + "balance_loss_mlp": 1.02207112, + "epoch": 0.311799188336089, + "flos": 16183564007040.0, + "grad_norm": 2.415996251650437, + "language_loss": 0.65247631, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.67846143, + "num_input_tokens_seen": 111375845, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.23474121, + "step": 5186, + "time_per_iteration": 2.881960868835449 + }, + { + "auxiliary_loss_clip": 0.0156456, + "auxiliary_loss_mlp": 0.01042944, + "balance_loss_clip": 1.35657191, + "balance_loss_mlp": 1.01805365, + "epoch": 0.31185931158875696, + "flos": 25020498556800.0, + "grad_norm": 2.7693634291331053, + "language_loss": 0.87538707, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.90146208, + "num_input_tokens_seen": 111394150, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.24902344, + "step": 5187, + "time_per_iteration": 2.8873414993286133 + }, + { + "auxiliary_loss_clip": 0.01554112, + "auxiliary_loss_mlp": 0.01049938, + "balance_loss_clip": 1.35333478, + "balance_loss_mlp": 1.02611971, + "epoch": 0.3119194348414249, + "flos": 37247084697600.0, + "grad_norm": 1.4504454595466472, + "language_loss": 0.63560283, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.66164333, + "num_input_tokens_seen": 111418355, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.23815918, + "step": 5188, + "time_per_iteration": 4.508891344070435 + }, + { + "auxiliary_loss_clip": 0.01553914, + "auxiliary_loss_mlp": 0.01045896, + "balance_loss_clip": 1.35317969, + "balance_loss_mlp": 1.02216172, + "epoch": 0.3119795580940929, + "flos": 15021121870080.0, + "grad_norm": 2.425076530004471, + "language_loss": 0.839746, + "learning_rate": 3.222293661638346e-06, + "loss": 0.86574411, + "num_input_tokens_seen": 111435445, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.23742676, + "step": 5189, + "time_per_iteration": 2.8208768367767334 + }, + { + "auxiliary_loss_clip": 0.01534394, + "auxiliary_loss_mlp": 0.0103779, + "balance_loss_clip": 1.33784556, + "balance_loss_mlp": 1.01429439, + "epoch": 0.31203968134676086, + "flos": 16006837242240.0, + "grad_norm": 2.172733196522863, + "language_loss": 0.80042249, + "learning_rate": 3.22198537282789e-06, + "loss": 0.82614434, + "num_input_tokens_seen": 111453430, + "router_z_loss_clip": 1.96289062, + "router_z_loss_mlp": 0.23498535, + "step": 5190, + "time_per_iteration": 2.8360445499420166 + }, + { + "auxiliary_loss_clip": 0.01552596, + "auxiliary_loss_mlp": 0.01047017, + "balance_loss_clip": 1.3516891, + "balance_loss_mlp": 1.02352071, + "epoch": 0.3120998045994288, + "flos": 23846699975040.0, + "grad_norm": 1.643102514499009, + "language_loss": 0.75625211, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.78224826, + "num_input_tokens_seen": 111475325, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.23486328, + "step": 5191, + "time_per_iteration": 4.412895202636719 + }, + { + "auxiliary_loss_clip": 0.01295506, + "auxiliary_loss_mlp": 0.01060607, + "balance_loss_clip": 1.17864168, + "balance_loss_mlp": 1.03752828, + "epoch": 0.3121599278520968, + "flos": 69213496348800.0, + "grad_norm": 0.8511461519675058, + "language_loss": 0.63940513, + "learning_rate": 3.221368656205247e-06, + "loss": 0.66296625, + "num_input_tokens_seen": 111533960, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.23046875, + "step": 5192, + "time_per_iteration": 4.823137044906616 + }, + { + "auxiliary_loss_clip": 0.01556034, + "auxiliary_loss_mlp": 0.01042002, + "balance_loss_clip": 1.3507638, + "balance_loss_mlp": 1.01755166, + "epoch": 0.31222005110476475, + "flos": 23817127858560.0, + "grad_norm": 1.8583944864877093, + "language_loss": 0.80945331, + "learning_rate": 3.221060228416446e-06, + "loss": 0.83543372, + "num_input_tokens_seen": 111554055, + "router_z_loss_clip": 2.0546875, + "router_z_loss_mlp": 0.24462891, + "step": 5193, + "time_per_iteration": 2.8800132274627686 + }, + { + "auxiliary_loss_clip": 0.0154647, + "auxiliary_loss_mlp": 0.01042051, + "balance_loss_clip": 1.3427031, + "balance_loss_mlp": 1.01975918, + "epoch": 0.3122801743574327, + "flos": 25236434580480.0, + "grad_norm": 2.633137300290936, + "language_loss": 0.72753894, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.75342417, + "num_input_tokens_seen": 111574305, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.22290039, + "step": 5194, + "time_per_iteration": 2.910816192626953 + }, + { + "auxiliary_loss_clip": 0.01548853, + "auxiliary_loss_mlp": 0.01042773, + "balance_loss_clip": 1.35042906, + "balance_loss_mlp": 1.01959836, + "epoch": 0.3123402976101007, + "flos": 22976847832320.0, + "grad_norm": 1.8223793779620374, + "language_loss": 0.77111173, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.79702801, + "num_input_tokens_seen": 111595680, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.23168945, + "step": 5195, + "time_per_iteration": 3.0503242015838623 + }, + { + "auxiliary_loss_clip": 0.01551176, + "auxiliary_loss_mlp": 0.01051757, + "balance_loss_clip": 1.34809196, + "balance_loss_mlp": 1.02816534, + "epoch": 0.3124004208627687, + "flos": 25203016656000.0, + "grad_norm": 3.9614249544780793, + "language_loss": 0.79404581, + "learning_rate": 3.220134667280476e-06, + "loss": 0.82007515, + "num_input_tokens_seen": 111618135, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.23620605, + "step": 5196, + "time_per_iteration": 3.0194263458251953 + }, + { + "auxiliary_loss_clip": 0.01292123, + "auxiliary_loss_mlp": 0.01025431, + "balance_loss_clip": 1.17694044, + "balance_loss_mlp": 1.00521314, + "epoch": 0.31246054411543667, + "flos": 67518113984640.0, + "grad_norm": 0.7820391511203064, + "language_loss": 0.54867625, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.57185173, + "num_input_tokens_seen": 111682220, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.20214844, + "step": 5197, + "time_per_iteration": 3.472892999649048 + }, + { + "auxiliary_loss_clip": 0.01520927, + "auxiliary_loss_mlp": 0.01040526, + "balance_loss_clip": 1.32513666, + "balance_loss_mlp": 1.01759028, + "epoch": 0.31252066736810463, + "flos": 17867562664320.0, + "grad_norm": 2.0505173880490664, + "language_loss": 0.6785484, + "learning_rate": 3.21951739516552e-06, + "loss": 0.70416296, + "num_input_tokens_seen": 111700815, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.22949219, + "step": 5198, + "time_per_iteration": 2.8578035831451416 + }, + { + "auxiliary_loss_clip": 0.01570537, + "auxiliary_loss_mlp": 0.01043529, + "balance_loss_clip": 1.36493611, + "balance_loss_mlp": 1.01969886, + "epoch": 0.3125807906207726, + "flos": 18483264910080.0, + "grad_norm": 13.875804938609175, + "language_loss": 0.70782948, + "learning_rate": 3.219208689735857e-06, + "loss": 0.73397005, + "num_input_tokens_seen": 111718195, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.23840332, + "step": 5199, + "time_per_iteration": 2.891808032989502 + }, + { + "auxiliary_loss_clip": 0.01541757, + "auxiliary_loss_mlp": 0.01046823, + "balance_loss_clip": 1.34142911, + "balance_loss_mlp": 1.02251601, + "epoch": 0.31264091387344056, + "flos": 18954165237120.0, + "grad_norm": 1.7099846599399227, + "language_loss": 0.79736131, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.82324713, + "num_input_tokens_seen": 111734440, + "router_z_loss_clip": 2.00195312, + "router_z_loss_mlp": 0.24279785, + "step": 5200, + "time_per_iteration": 2.8861944675445557 + }, + { + "auxiliary_loss_clip": 0.01525081, + "auxiliary_loss_mlp": 0.01045025, + "balance_loss_clip": 1.32905841, + "balance_loss_mlp": 1.02155304, + "epoch": 0.3127010371261085, + "flos": 21477946227840.0, + "grad_norm": 1.9848708334642413, + "language_loss": 0.8404057, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.86610675, + "num_input_tokens_seen": 111751960, + "router_z_loss_clip": 1.96191406, + "router_z_loss_mlp": 0.23486328, + "step": 5201, + "time_per_iteration": 2.872189521789551 + }, + { + "auxiliary_loss_clip": 0.01532403, + "auxiliary_loss_mlp": 0.01047324, + "balance_loss_clip": 1.33149314, + "balance_loss_mlp": 1.02304113, + "epoch": 0.3127611603787765, + "flos": 15343148246400.0, + "grad_norm": 2.6345413972506204, + "language_loss": 0.70283234, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.72862959, + "num_input_tokens_seen": 111769585, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.24279785, + "step": 5202, + "time_per_iteration": 2.877743721008301 + }, + { + "auxiliary_loss_clip": 0.01531369, + "auxiliary_loss_mlp": 0.01047609, + "balance_loss_clip": 1.32905948, + "balance_loss_mlp": 1.02543628, + "epoch": 0.31282128363144446, + "flos": 17611919688960.0, + "grad_norm": 1.78200571762854, + "language_loss": 0.85548514, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.88127494, + "num_input_tokens_seen": 111787880, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.22167969, + "step": 5203, + "time_per_iteration": 2.8283791542053223 + }, + { + "auxiliary_loss_clip": 0.01545477, + "auxiliary_loss_mlp": 0.01043185, + "balance_loss_clip": 1.3437655, + "balance_loss_mlp": 1.02007031, + "epoch": 0.3128814068841124, + "flos": 26766989562240.0, + "grad_norm": 2.2487768792208946, + "language_loss": 0.61977184, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.64565843, + "num_input_tokens_seen": 111805950, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.2310791, + "step": 5204, + "time_per_iteration": 2.943950891494751 + }, + { + "auxiliary_loss_clip": 0.01522349, + "auxiliary_loss_mlp": 0.01043736, + "balance_loss_clip": 1.3267808, + "balance_loss_mlp": 1.02126503, + "epoch": 0.3129415301367804, + "flos": 22282319865600.0, + "grad_norm": 2.1912709112254634, + "language_loss": 0.66353989, + "learning_rate": 3.217355486684887e-06, + "loss": 0.68920076, + "num_input_tokens_seen": 111826135, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.22460938, + "step": 5205, + "time_per_iteration": 2.9052481651306152 + }, + { + "auxiliary_loss_clip": 0.01530698, + "auxiliary_loss_mlp": 0.01044581, + "balance_loss_clip": 1.32987893, + "balance_loss_mlp": 1.02039385, + "epoch": 0.31300165338944835, + "flos": 26475078240000.0, + "grad_norm": 1.5315952957260235, + "language_loss": 0.77046096, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.79621375, + "num_input_tokens_seen": 111844700, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.24194336, + "step": 5206, + "time_per_iteration": 2.9199774265289307 + }, + { + "auxiliary_loss_clip": 0.01525754, + "auxiliary_loss_mlp": 0.01039929, + "balance_loss_clip": 1.32850194, + "balance_loss_mlp": 1.01732683, + "epoch": 0.3130617766421163, + "flos": 21954683134080.0, + "grad_norm": 1.9403421751906427, + "language_loss": 0.84218705, + "learning_rate": 3.216737382911672e-06, + "loss": 0.86784387, + "num_input_tokens_seen": 111861585, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.22607422, + "step": 5207, + "time_per_iteration": 2.8477680683135986 + }, + { + "auxiliary_loss_clip": 0.01516852, + "auxiliary_loss_mlp": 0.01044319, + "balance_loss_clip": 1.32095516, + "balance_loss_mlp": 1.02143085, + "epoch": 0.3131218998947843, + "flos": 23302674771840.0, + "grad_norm": 1.6862624622443259, + "language_loss": 0.72107327, + "learning_rate": 3.216428261810999e-06, + "loss": 0.74668503, + "num_input_tokens_seen": 111882950, + "router_z_loss_clip": 1.95996094, + "router_z_loss_mlp": 0.22912598, + "step": 5208, + "time_per_iteration": 2.883291006088257 + }, + { + "auxiliary_loss_clip": 0.01525261, + "auxiliary_loss_mlp": 0.01042408, + "balance_loss_clip": 1.32695103, + "balance_loss_mlp": 1.01991332, + "epoch": 0.3131820231474523, + "flos": 21148861662720.0, + "grad_norm": 1.7806045432004773, + "language_loss": 0.75351351, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.77919024, + "num_input_tokens_seen": 111901640, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.22485352, + "step": 5209, + "time_per_iteration": 2.9013309478759766 + }, + { + "auxiliary_loss_clip": 0.01516601, + "auxiliary_loss_mlp": 0.01042033, + "balance_loss_clip": 1.3187722, + "balance_loss_mlp": 1.02036047, + "epoch": 0.31324214640012027, + "flos": 23919100934400.0, + "grad_norm": 2.0464994826118423, + "language_loss": 0.77774125, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.80332756, + "num_input_tokens_seen": 111919615, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.21679688, + "step": 5210, + "time_per_iteration": 2.8622682094573975 + }, + { + "auxiliary_loss_clip": 0.01503822, + "auxiliary_loss_mlp": 0.01045073, + "balance_loss_clip": 1.31256413, + "balance_loss_mlp": 1.0216608, + "epoch": 0.31330226965278823, + "flos": 22247137393920.0, + "grad_norm": 1.826369790574698, + "language_loss": 0.79877794, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.82426691, + "num_input_tokens_seen": 111938485, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.23425293, + "step": 5211, + "time_per_iteration": 2.8524739742279053 + }, + { + "auxiliary_loss_clip": 0.0151192, + "auxiliary_loss_mlp": 0.01045783, + "balance_loss_clip": 1.31535625, + "balance_loss_mlp": 1.02355027, + "epoch": 0.3133623929054562, + "flos": 19763063354880.0, + "grad_norm": 1.6899115389517014, + "language_loss": 0.79842603, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.82400304, + "num_input_tokens_seen": 111956425, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.22229004, + "step": 5212, + "time_per_iteration": 2.846198081970215 + }, + { + "auxiliary_loss_clip": 0.01537863, + "auxiliary_loss_mlp": 0.0104548, + "balance_loss_clip": 1.33725739, + "balance_loss_mlp": 1.02333117, + "epoch": 0.31342251615812416, + "flos": 27173678238720.0, + "grad_norm": 2.4997708121336704, + "language_loss": 0.71576476, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.74159825, + "num_input_tokens_seen": 111975915, + "router_z_loss_clip": 2.00292969, + "router_z_loss_mlp": 0.22167969, + "step": 5213, + "time_per_iteration": 2.905486822128296 + }, + { + "auxiliary_loss_clip": 0.01534276, + "auxiliary_loss_mlp": 0.01046846, + "balance_loss_clip": 1.33282804, + "balance_loss_mlp": 1.02411246, + "epoch": 0.31348263941079213, + "flos": 20239393057920.0, + "grad_norm": 2.0502132787192813, + "language_loss": 0.78763318, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.8134445, + "num_input_tokens_seen": 111995055, + "router_z_loss_clip": 2.01757812, + "router_z_loss_mlp": 0.22729492, + "step": 5214, + "time_per_iteration": 2.8909809589385986 + }, + { + "auxiliary_loss_clip": 0.01513788, + "auxiliary_loss_mlp": 0.01050246, + "balance_loss_clip": 1.32107759, + "balance_loss_mlp": 1.02810907, + "epoch": 0.3135427626634601, + "flos": 24618243870720.0, + "grad_norm": 1.5611158660720477, + "language_loss": 0.83127725, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.85691762, + "num_input_tokens_seen": 112015830, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.22131348, + "step": 5215, + "time_per_iteration": 2.897444725036621 + }, + { + "auxiliary_loss_clip": 0.01519698, + "auxiliary_loss_mlp": 0.01045402, + "balance_loss_clip": 1.32207906, + "balance_loss_mlp": 1.02253783, + "epoch": 0.31360288591612806, + "flos": 20969736923520.0, + "grad_norm": 1.9357098955004386, + "language_loss": 0.80697727, + "learning_rate": 3.213953633415686e-06, + "loss": 0.83262825, + "num_input_tokens_seen": 112035065, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.2286377, + "step": 5216, + "time_per_iteration": 2.9133191108703613 + }, + { + "auxiliary_loss_clip": 0.01539794, + "auxiliary_loss_mlp": 0.01054179, + "balance_loss_clip": 1.33668387, + "balance_loss_mlp": 1.03068256, + "epoch": 0.313663009168796, + "flos": 26992065035520.0, + "grad_norm": 1.925736636615209, + "language_loss": 0.69326091, + "learning_rate": 3.213644097593477e-06, + "loss": 0.71920061, + "num_input_tokens_seen": 112058405, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.23510742, + "step": 5217, + "time_per_iteration": 2.9203834533691406 + }, + { + "auxiliary_loss_clip": 0.01520588, + "auxiliary_loss_mlp": 0.01047751, + "balance_loss_clip": 1.32344019, + "balance_loss_mlp": 1.02498174, + "epoch": 0.313723132421464, + "flos": 18049990273920.0, + "grad_norm": 1.6563119852808028, + "language_loss": 0.81557512, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.84125859, + "num_input_tokens_seen": 112076420, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.2277832, + "step": 5218, + "time_per_iteration": 2.884458541870117 + }, + { + "auxiliary_loss_clip": 0.01538551, + "auxiliary_loss_mlp": 0.01044363, + "balance_loss_clip": 1.33795702, + "balance_loss_mlp": 1.02077174, + "epoch": 0.31378325567413196, + "flos": 22498301134080.0, + "grad_norm": 2.499166988739463, + "language_loss": 0.6952045, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.72103363, + "num_input_tokens_seen": 112090775, + "router_z_loss_clip": 2.00488281, + "router_z_loss_mlp": 0.23583984, + "step": 5219, + "time_per_iteration": 4.353666543960571 + }, + { + "auxiliary_loss_clip": 0.01527516, + "auxiliary_loss_mlp": 0.01039845, + "balance_loss_clip": 1.32917547, + "balance_loss_mlp": 1.01806533, + "epoch": 0.3138433789267999, + "flos": 22429474513920.0, + "grad_norm": 2.336972247684757, + "language_loss": 0.80640525, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.83207887, + "num_input_tokens_seen": 112110980, + "router_z_loss_clip": 1.98535156, + "router_z_loss_mlp": 0.21777344, + "step": 5220, + "time_per_iteration": 2.9171934127807617 + }, + { + "auxiliary_loss_clip": 0.01529801, + "auxiliary_loss_mlp": 0.01046217, + "balance_loss_clip": 1.33190656, + "balance_loss_mlp": 1.02509296, + "epoch": 0.3139035021794679, + "flos": 13013694247680.0, + "grad_norm": 1.899963810663342, + "language_loss": 0.73692524, + "learning_rate": 3.212405494206986e-06, + "loss": 0.76268542, + "num_input_tokens_seen": 112129020, + "router_z_loss_clip": 1.98046875, + "router_z_loss_mlp": 0.21130371, + "step": 5221, + "time_per_iteration": 2.8489012718200684 + }, + { + "auxiliary_loss_clip": 0.01516594, + "auxiliary_loss_mlp": 0.0104182, + "balance_loss_clip": 1.32162786, + "balance_loss_mlp": 1.0196228, + "epoch": 0.31396362543213585, + "flos": 16954700699520.0, + "grad_norm": 2.183669911061567, + "language_loss": 0.82791519, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.85349941, + "num_input_tokens_seen": 112147865, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.2220459, + "step": 5222, + "time_per_iteration": 2.866429090499878 + }, + { + "auxiliary_loss_clip": 0.01528765, + "auxiliary_loss_mlp": 0.01043463, + "balance_loss_clip": 1.32723784, + "balance_loss_mlp": 1.0194428, + "epoch": 0.31402374868480387, + "flos": 20166268181760.0, + "grad_norm": 1.8265473903464173, + "language_loss": 0.71368992, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.73941219, + "num_input_tokens_seen": 112166745, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.24023438, + "step": 5223, + "time_per_iteration": 4.240664720535278 + }, + { + "auxiliary_loss_clip": 0.01507189, + "auxiliary_loss_mlp": 0.01040179, + "balance_loss_clip": 1.31287456, + "balance_loss_mlp": 1.01954424, + "epoch": 0.31408387193747184, + "flos": 21260833839360.0, + "grad_norm": 1.6077611834206444, + "language_loss": 0.81113374, + "learning_rate": 3.211476058893379e-06, + "loss": 0.83660734, + "num_input_tokens_seen": 112185895, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.2064209, + "step": 5224, + "time_per_iteration": 2.8568296432495117 + }, + { + "auxiliary_loss_clip": 0.01536881, + "auxiliary_loss_mlp": 0.01049339, + "balance_loss_clip": 1.3335681, + "balance_loss_mlp": 1.02570009, + "epoch": 0.3141439951901398, + "flos": 27494121047040.0, + "grad_norm": 2.2164222924673322, + "language_loss": 0.58971721, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.61557943, + "num_input_tokens_seen": 112204465, + "router_z_loss_clip": 2.03222656, + "router_z_loss_mlp": 0.2364502, + "step": 5225, + "time_per_iteration": 2.8776493072509766 + }, + { + "auxiliary_loss_clip": 0.01512103, + "auxiliary_loss_mlp": 0.01037878, + "balance_loss_clip": 1.31894815, + "balance_loss_mlp": 1.01657569, + "epoch": 0.31420411844280777, + "flos": 17859644824320.0, + "grad_norm": 2.8409601650015546, + "language_loss": 0.82417637, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.84967625, + "num_input_tokens_seen": 112221635, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.2130127, + "step": 5226, + "time_per_iteration": 4.236299514770508 + }, + { + "auxiliary_loss_clip": 0.01526903, + "auxiliary_loss_mlp": 0.01042291, + "balance_loss_clip": 1.32679033, + "balance_loss_mlp": 1.0193435, + "epoch": 0.31426424169547573, + "flos": 21627091647360.0, + "grad_norm": 1.9581441963060615, + "language_loss": 0.75195527, + "learning_rate": 3.210546210126141e-06, + "loss": 0.77764726, + "num_input_tokens_seen": 112241240, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.22937012, + "step": 5227, + "time_per_iteration": 4.316488742828369 + }, + { + "auxiliary_loss_clip": 0.01534796, + "auxiliary_loss_mlp": 0.01042448, + "balance_loss_clip": 1.33753991, + "balance_loss_mlp": 1.02002501, + "epoch": 0.3143243649481437, + "flos": 30933116732160.0, + "grad_norm": 2.085507737641461, + "language_loss": 0.69417882, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.71995127, + "num_input_tokens_seen": 112262350, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.22412109, + "step": 5228, + "time_per_iteration": 2.947946786880493 + }, + { + "auxiliary_loss_clip": 0.01526439, + "auxiliary_loss_mlp": 0.01042639, + "balance_loss_clip": 1.32943559, + "balance_loss_mlp": 1.02018023, + "epoch": 0.31438448820081166, + "flos": 22831955424000.0, + "grad_norm": 1.6756484783602597, + "language_loss": 0.80487251, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.83056325, + "num_input_tokens_seen": 112283710, + "router_z_loss_clip": 1.97167969, + "router_z_loss_mlp": 0.22460938, + "step": 5229, + "time_per_iteration": 2.8820884227752686 + }, + { + "auxiliary_loss_clip": 0.01524347, + "auxiliary_loss_mlp": 0.01037273, + "balance_loss_clip": 1.33006942, + "balance_loss_mlp": 1.0145992, + "epoch": 0.3144446114534796, + "flos": 23301860365440.0, + "grad_norm": 1.721774643312017, + "language_loss": 0.70787567, + "learning_rate": 3.209615948222611e-06, + "loss": 0.7334919, + "num_input_tokens_seen": 112304285, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.22668457, + "step": 5230, + "time_per_iteration": 2.879142999649048 + }, + { + "auxiliary_loss_clip": 0.01515786, + "auxiliary_loss_mlp": 0.01040236, + "balance_loss_clip": 1.32031965, + "balance_loss_mlp": 1.01777673, + "epoch": 0.3145047347061476, + "flos": 31367929691520.0, + "grad_norm": 1.5884728011171225, + "language_loss": 0.7987082, + "learning_rate": 3.209305769168239e-06, + "loss": 0.82426846, + "num_input_tokens_seen": 112325110, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.2244873, + "step": 5231, + "time_per_iteration": 2.9461941719055176 + }, + { + "auxiliary_loss_clip": 0.0152296, + "auxiliary_loss_mlp": 0.01044344, + "balance_loss_clip": 1.32691932, + "balance_loss_mlp": 1.02146721, + "epoch": 0.31456485795881556, + "flos": 10896737667840.0, + "grad_norm": 1.9667279456939306, + "language_loss": 0.86124188, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.88691491, + "num_input_tokens_seen": 112339855, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.22875977, + "step": 5232, + "time_per_iteration": 2.8304853439331055 + }, + { + "auxiliary_loss_clip": 0.01502462, + "auxiliary_loss_mlp": 0.01042192, + "balance_loss_clip": 1.31073952, + "balance_loss_mlp": 1.01926839, + "epoch": 0.3146249812114835, + "flos": 17101312410240.0, + "grad_norm": 1.7666017913665626, + "language_loss": 0.81087852, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.83632505, + "num_input_tokens_seen": 112358480, + "router_z_loss_clip": 1.91503906, + "router_z_loss_mlp": 0.22937012, + "step": 5233, + "time_per_iteration": 2.864776134490967 + }, + { + "auxiliary_loss_clip": 0.015243, + "auxiliary_loss_mlp": 0.0103912, + "balance_loss_clip": 1.32465732, + "balance_loss_mlp": 1.01601732, + "epoch": 0.3146851044641515, + "flos": 55309816022400.0, + "grad_norm": 1.7269902783318816, + "language_loss": 0.71928227, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.74491644, + "num_input_tokens_seen": 112382350, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.23083496, + "step": 5234, + "time_per_iteration": 3.2603299617767334 + }, + { + "auxiliary_loss_clip": 0.01531827, + "auxiliary_loss_mlp": 0.01037886, + "balance_loss_clip": 1.33189464, + "balance_loss_mlp": 1.01586771, + "epoch": 0.31474522771681945, + "flos": 27027157017600.0, + "grad_norm": 1.8672246072668386, + "language_loss": 0.73177409, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.7574712, + "num_input_tokens_seen": 112400260, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.22009277, + "step": 5235, + "time_per_iteration": 2.9153060913085938 + }, + { + "auxiliary_loss_clip": 0.01511877, + "auxiliary_loss_mlp": 0.01041486, + "balance_loss_clip": 1.31696773, + "balance_loss_mlp": 1.01971841, + "epoch": 0.3148053509694875, + "flos": 21261919714560.0, + "grad_norm": 2.0085273557243264, + "language_loss": 0.79497993, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.82051361, + "num_input_tokens_seen": 112419400, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.21765137, + "step": 5236, + "time_per_iteration": 2.8782529830932617 + }, + { + "auxiliary_loss_clip": 0.01524457, + "auxiliary_loss_mlp": 0.01043275, + "balance_loss_clip": 1.32378328, + "balance_loss_mlp": 1.02093482, + "epoch": 0.31486547422215544, + "flos": 31261160666880.0, + "grad_norm": 1.9275716802098413, + "language_loss": 0.7710222, + "learning_rate": 3.207443732256881e-06, + "loss": 0.79669946, + "num_input_tokens_seen": 112440825, + "router_z_loss_clip": 2.00488281, + "router_z_loss_mlp": 0.2232666, + "step": 5237, + "time_per_iteration": 2.949321985244751 + }, + { + "auxiliary_loss_clip": 0.01509525, + "auxiliary_loss_mlp": 0.01040507, + "balance_loss_clip": 1.31825674, + "balance_loss_mlp": 1.01887023, + "epoch": 0.3149255974748234, + "flos": 19838133757440.0, + "grad_norm": 4.3634303758261055, + "language_loss": 0.79835898, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.82385933, + "num_input_tokens_seen": 112459180, + "router_z_loss_clip": 1.91308594, + "router_z_loss_mlp": 0.21630859, + "step": 5238, + "time_per_iteration": 2.8863747119903564 + }, + { + "auxiliary_loss_clip": 0.01276868, + "auxiliary_loss_mlp": 0.01045551, + "balance_loss_clip": 1.15720272, + "balance_loss_mlp": 1.01751268, + "epoch": 0.31498572072749137, + "flos": 67711744321920.0, + "grad_norm": 0.8461327112452774, + "language_loss": 0.67941278, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.70263696, + "num_input_tokens_seen": 112516680, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.28125, + "step": 5239, + "time_per_iteration": 3.3732187747955322 + }, + { + "auxiliary_loss_clip": 0.01534579, + "auxiliary_loss_mlp": 0.01052522, + "balance_loss_clip": 1.33174729, + "balance_loss_mlp": 1.0282867, + "epoch": 0.31504584398015933, + "flos": 19802906040960.0, + "grad_norm": 2.1692149578045865, + "language_loss": 0.83550441, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.86137539, + "num_input_tokens_seen": 112535895, + "router_z_loss_clip": 2.02636719, + "router_z_loss_mlp": 0.2421875, + "step": 5240, + "time_per_iteration": 2.8989758491516113 + }, + { + "auxiliary_loss_clip": 0.01516456, + "auxiliary_loss_mlp": 0.01054737, + "balance_loss_clip": 1.32227397, + "balance_loss_mlp": 1.03140783, + "epoch": 0.3151059672328273, + "flos": 26626802613120.0, + "grad_norm": 1.6520242784610895, + "language_loss": 0.81553161, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.84124351, + "num_input_tokens_seen": 112557490, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.2331543, + "step": 5241, + "time_per_iteration": 2.9146387577056885 + }, + { + "auxiliary_loss_clip": 0.01504136, + "auxiliary_loss_mlp": 0.01047963, + "balance_loss_clip": 1.31421912, + "balance_loss_mlp": 1.02528954, + "epoch": 0.31516609048549526, + "flos": 24214450861440.0, + "grad_norm": 1.7965630009417497, + "language_loss": 0.74901938, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.77454031, + "num_input_tokens_seen": 112577075, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.22692871, + "step": 5242, + "time_per_iteration": 2.8918778896331787 + }, + { + "auxiliary_loss_clip": 0.01516417, + "auxiliary_loss_mlp": 0.01049539, + "balance_loss_clip": 1.32390368, + "balance_loss_mlp": 1.02641261, + "epoch": 0.31522621373816323, + "flos": 25969945582080.0, + "grad_norm": 1.7035995248804716, + "language_loss": 0.74588096, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.77154052, + "num_input_tokens_seen": 112597620, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.23132324, + "step": 5243, + "time_per_iteration": 2.924443244934082 + }, + { + "auxiliary_loss_clip": 0.01519849, + "auxiliary_loss_mlp": 0.01047114, + "balance_loss_clip": 1.32132244, + "balance_loss_mlp": 1.02351022, + "epoch": 0.3152863369908312, + "flos": 21919002969600.0, + "grad_norm": 1.8575271327684466, + "language_loss": 0.65283012, + "learning_rate": 3.205269272758513e-06, + "loss": 0.67849976, + "num_input_tokens_seen": 112617150, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.23608398, + "step": 5244, + "time_per_iteration": 2.858238935470581 + }, + { + "auxiliary_loss_clip": 0.01522177, + "auxiliary_loss_mlp": 0.01048859, + "balance_loss_clip": 1.32280707, + "balance_loss_mlp": 1.02588785, + "epoch": 0.31534646024349916, + "flos": 16287482609280.0, + "grad_norm": 2.6439472579696157, + "language_loss": 0.92164338, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.94735372, + "num_input_tokens_seen": 112631090, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.22961426, + "step": 5245, + "time_per_iteration": 2.82991361618042 + }, + { + "auxiliary_loss_clip": 0.01528163, + "auxiliary_loss_mlp": 0.01051156, + "balance_loss_clip": 1.33055472, + "balance_loss_mlp": 1.02724218, + "epoch": 0.3154065834961671, + "flos": 24727908562560.0, + "grad_norm": 2.055457313310126, + "language_loss": 0.75751138, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.78330463, + "num_input_tokens_seen": 112651220, + "router_z_loss_clip": 1.9765625, + "router_z_loss_mlp": 0.23901367, + "step": 5246, + "time_per_iteration": 2.9090206623077393 + }, + { + "auxiliary_loss_clip": 0.01514728, + "auxiliary_loss_mlp": 0.01051202, + "balance_loss_clip": 1.31849492, + "balance_loss_mlp": 1.0271101, + "epoch": 0.3154667067488351, + "flos": 35384594728320.0, + "grad_norm": 1.7688300449422854, + "language_loss": 0.62302488, + "learning_rate": 3.204336675750321e-06, + "loss": 0.64868414, + "num_input_tokens_seen": 112671560, + "router_z_loss_clip": 1.95996094, + "router_z_loss_mlp": 0.24084473, + "step": 5247, + "time_per_iteration": 2.994504690170288 + }, + { + "auxiliary_loss_clip": 0.01526312, + "auxiliary_loss_mlp": 0.0104899, + "balance_loss_clip": 1.32643867, + "balance_loss_mlp": 1.0252676, + "epoch": 0.31552683000150306, + "flos": 17465217488640.0, + "grad_norm": 2.221969753357131, + "language_loss": 0.8328104, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.85856342, + "num_input_tokens_seen": 112689790, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.23730469, + "step": 5248, + "time_per_iteration": 2.8469808101654053 + }, + { + "auxiliary_loss_clip": 0.01508548, + "auxiliary_loss_mlp": 0.01048736, + "balance_loss_clip": 1.31118441, + "balance_loss_mlp": 1.02464414, + "epoch": 0.3155869532541711, + "flos": 18414935982720.0, + "grad_norm": 1.7537379690476955, + "language_loss": 0.86409593, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.88966876, + "num_input_tokens_seen": 112708265, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.24108887, + "step": 5249, + "time_per_iteration": 2.822160243988037 + }, + { + "auxiliary_loss_clip": 0.01521658, + "auxiliary_loss_mlp": 0.01046359, + "balance_loss_clip": 1.32430625, + "balance_loss_mlp": 1.02217102, + "epoch": 0.31564707650683904, + "flos": 21589692180480.0, + "grad_norm": 1.764742168449224, + "language_loss": 0.87202895, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.89770913, + "num_input_tokens_seen": 112727820, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.24206543, + "step": 5250, + "time_per_iteration": 2.885922908782959 + }, + { + "auxiliary_loss_clip": 0.01510158, + "auxiliary_loss_mlp": 0.01050408, + "balance_loss_clip": 1.31560349, + "balance_loss_mlp": 1.02730489, + "epoch": 0.315707199759507, + "flos": 21040373335680.0, + "grad_norm": 2.4247903918457934, + "language_loss": 0.69794762, + "learning_rate": 3.203092573767835e-06, + "loss": 0.72355324, + "num_input_tokens_seen": 112743140, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.2310791, + "step": 5251, + "time_per_iteration": 2.8327720165252686 + }, + { + "auxiliary_loss_clip": 0.01511291, + "auxiliary_loss_mlp": 0.01048936, + "balance_loss_clip": 1.31653547, + "balance_loss_mlp": 1.02565479, + "epoch": 0.31576732301217497, + "flos": 26838892828800.0, + "grad_norm": 2.8575251066363547, + "language_loss": 0.79605997, + "learning_rate": 3.202781434189246e-06, + "loss": 0.82166219, + "num_input_tokens_seen": 112764705, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.23266602, + "step": 5252, + "time_per_iteration": 2.9043102264404297 + }, + { + "auxiliary_loss_clip": 0.01516913, + "auxiliary_loss_mlp": 0.01050886, + "balance_loss_clip": 1.32251132, + "balance_loss_mlp": 1.02833188, + "epoch": 0.31582744626484294, + "flos": 22721476325760.0, + "grad_norm": 1.7315355888827477, + "language_loss": 0.74690974, + "learning_rate": 3.202470249001066e-06, + "loss": 0.77258772, + "num_input_tokens_seen": 112785310, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.22546387, + "step": 5253, + "time_per_iteration": 2.8986923694610596 + }, + { + "auxiliary_loss_clip": 0.01529953, + "auxiliary_loss_mlp": 0.01044478, + "balance_loss_clip": 1.33179021, + "balance_loss_mlp": 1.02017117, + "epoch": 0.3158875695175109, + "flos": 23962608449280.0, + "grad_norm": 1.6872903560455994, + "language_loss": 0.74405801, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.76980233, + "num_input_tokens_seen": 112802905, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.24291992, + "step": 5254, + "time_per_iteration": 4.355388879776001 + }, + { + "auxiliary_loss_clip": 0.01527111, + "auxiliary_loss_mlp": 0.01054375, + "balance_loss_clip": 1.32715249, + "balance_loss_mlp": 1.03059292, + "epoch": 0.31594769277017887, + "flos": 13269880160640.0, + "grad_norm": 1.7944936812422003, + "language_loss": 0.79472506, + "learning_rate": 3.201847741843128e-06, + "loss": 0.82053995, + "num_input_tokens_seen": 112820305, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.23791504, + "step": 5255, + "time_per_iteration": 2.889650821685791 + }, + { + "auxiliary_loss_clip": 0.01511317, + "auxiliary_loss_mlp": 0.01053735, + "balance_loss_clip": 1.31614554, + "balance_loss_mlp": 1.02864194, + "epoch": 0.31600781602284683, + "flos": 23378831049600.0, + "grad_norm": 1.9118087741714693, + "language_loss": 0.79162896, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.81727946, + "num_input_tokens_seen": 112841185, + "router_z_loss_clip": 1.95410156, + "router_z_loss_mlp": 0.25109863, + "step": 5256, + "time_per_iteration": 2.9084343910217285 + }, + { + "auxiliary_loss_clip": 0.01486768, + "auxiliary_loss_mlp": 0.01050812, + "balance_loss_clip": 1.29965663, + "balance_loss_mlp": 1.02789974, + "epoch": 0.3160679392755148, + "flos": 19838088512640.0, + "grad_norm": 1.5054919922482648, + "language_loss": 0.72011101, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.74548686, + "num_input_tokens_seen": 112860570, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.22912598, + "step": 5257, + "time_per_iteration": 4.307504177093506 + }, + { + "auxiliary_loss_clip": 0.0151132, + "auxiliary_loss_mlp": 0.01051858, + "balance_loss_clip": 1.31656551, + "balance_loss_mlp": 1.02892244, + "epoch": 0.31612806252818276, + "flos": 20202762752640.0, + "grad_norm": 1.9553211225643465, + "language_loss": 0.77730095, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.80293274, + "num_input_tokens_seen": 112877975, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.22924805, + "step": 5258, + "time_per_iteration": 2.8762292861938477 + }, + { + "auxiliary_loss_clip": 0.01518873, + "auxiliary_loss_mlp": 0.0104603, + "balance_loss_clip": 1.32410932, + "balance_loss_mlp": 1.0233326, + "epoch": 0.31618818578085073, + "flos": 24244701649920.0, + "grad_norm": 2.1917291377371693, + "language_loss": 0.74058425, + "learning_rate": 3.200602180731467e-06, + "loss": 0.76623327, + "num_input_tokens_seen": 112896170, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.22680664, + "step": 5259, + "time_per_iteration": 2.916705846786499 + }, + { + "auxiliary_loss_clip": 0.015083, + "auxiliary_loss_mlp": 0.01050237, + "balance_loss_clip": 1.31198323, + "balance_loss_mlp": 1.02838635, + "epoch": 0.3162483090335187, + "flos": 25092537557760.0, + "grad_norm": 2.007742390705015, + "language_loss": 0.67014736, + "learning_rate": 3.20029067660664e-06, + "loss": 0.69573271, + "num_input_tokens_seen": 112916180, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.21862793, + "step": 5260, + "time_per_iteration": 2.881371259689331 + }, + { + "auxiliary_loss_clip": 0.01509499, + "auxiliary_loss_mlp": 0.01044134, + "balance_loss_clip": 1.31462896, + "balance_loss_mlp": 1.02038753, + "epoch": 0.31630843228618666, + "flos": 26334665066880.0, + "grad_norm": 1.7904469769722526, + "language_loss": 0.73279345, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.75832981, + "num_input_tokens_seen": 112936745, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.23742676, + "step": 5261, + "time_per_iteration": 4.309060096740723 + }, + { + "auxiliary_loss_clip": 0.01287893, + "auxiliary_loss_mlp": 0.01063661, + "balance_loss_clip": 1.16592026, + "balance_loss_mlp": 1.0230341, + "epoch": 0.3163685555388547, + "flos": 66791977885440.0, + "grad_norm": 0.7499843547675183, + "language_loss": 0.50696504, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.53048062, + "num_input_tokens_seen": 112994845, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.40625, + "step": 5262, + "time_per_iteration": 4.732795476913452 + }, + { + "auxiliary_loss_clip": 0.01510517, + "auxiliary_loss_mlp": 0.01043531, + "balance_loss_clip": 1.31520224, + "balance_loss_mlp": 1.02059567, + "epoch": 0.31642867879152264, + "flos": 26007028335360.0, + "grad_norm": 1.5625877479126398, + "language_loss": 0.8592869, + "learning_rate": 3.19935589118856e-06, + "loss": 0.88482738, + "num_input_tokens_seen": 113015125, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.22924805, + "step": 5263, + "time_per_iteration": 2.872580051422119 + }, + { + "auxiliary_loss_clip": 0.0149249, + "auxiliary_loss_mlp": 0.01041287, + "balance_loss_clip": 1.30498588, + "balance_loss_mlp": 1.01515687, + "epoch": 0.3164888020441906, + "flos": 25785798670080.0, + "grad_norm": 1.7156841462889296, + "language_loss": 0.82287186, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.84820962, + "num_input_tokens_seen": 113035535, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.26159668, + "step": 5264, + "time_per_iteration": 2.914870500564575 + }, + { + "auxiliary_loss_clip": 0.01523112, + "auxiliary_loss_mlp": 0.01043777, + "balance_loss_clip": 1.32450235, + "balance_loss_mlp": 1.02009058, + "epoch": 0.3165489252968586, + "flos": 19765868532480.0, + "grad_norm": 2.22237906194405, + "language_loss": 0.80272633, + "learning_rate": 3.19873247349167e-06, + "loss": 0.82839525, + "num_input_tokens_seen": 113052720, + "router_z_loss_clip": 1.98535156, + "router_z_loss_mlp": 0.23706055, + "step": 5265, + "time_per_iteration": 2.9567506313323975 + }, + { + "auxiliary_loss_clip": 0.01525378, + "auxiliary_loss_mlp": 0.0104685, + "balance_loss_clip": 1.32833397, + "balance_loss_mlp": 1.02299643, + "epoch": 0.31660904854952654, + "flos": 23193960220800.0, + "grad_norm": 1.5944691495450953, + "language_loss": 0.7548219, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.78054416, + "num_input_tokens_seen": 113071435, + "router_z_loss_clip": 1.96972656, + "router_z_loss_mlp": 0.23864746, + "step": 5266, + "time_per_iteration": 2.849513530731201 + }, + { + "auxiliary_loss_clip": 0.01532687, + "auxiliary_loss_mlp": 0.01045267, + "balance_loss_clip": 1.33441365, + "balance_loss_mlp": 1.02240241, + "epoch": 0.3166691718021945, + "flos": 20417160453120.0, + "grad_norm": 2.957081227760116, + "language_loss": 0.81195205, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.83773154, + "num_input_tokens_seen": 113088645, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.22851562, + "step": 5267, + "time_per_iteration": 2.877411127090454 + }, + { + "auxiliary_loss_clip": 0.01286289, + "auxiliary_loss_mlp": 0.01065405, + "balance_loss_clip": 1.16875172, + "balance_loss_mlp": 1.03469718, + "epoch": 0.31672929505486247, + "flos": 70177421710080.0, + "grad_norm": 0.7414074209008746, + "language_loss": 0.57926869, + "learning_rate": 3.197797006055478e-06, + "loss": 0.60278559, + "num_input_tokens_seen": 113152775, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.30664062, + "step": 5268, + "time_per_iteration": 3.3855483531951904 + }, + { + "auxiliary_loss_clip": 0.0151293, + "auxiliary_loss_mlp": 0.01044015, + "balance_loss_clip": 1.31633234, + "balance_loss_mlp": 1.02098417, + "epoch": 0.31678941830753043, + "flos": 14363857635840.0, + "grad_norm": 5.754239153416772, + "language_loss": 0.75286281, + "learning_rate": 3.197485092719815e-06, + "loss": 0.77843225, + "num_input_tokens_seen": 113171410, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.23034668, + "step": 5269, + "time_per_iteration": 2.853336811065674 + }, + { + "auxiliary_loss_clip": 0.01498405, + "auxiliary_loss_mlp": 0.01049458, + "balance_loss_clip": 1.30625451, + "balance_loss_mlp": 1.02653432, + "epoch": 0.3168495415601984, + "flos": 22758061386240.0, + "grad_norm": 2.1070560755882664, + "language_loss": 0.80516315, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.83064175, + "num_input_tokens_seen": 113189965, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.22924805, + "step": 5270, + "time_per_iteration": 2.905733346939087 + }, + { + "auxiliary_loss_clip": 0.01529818, + "auxiliary_loss_mlp": 0.01044668, + "balance_loss_clip": 1.33165526, + "balance_loss_mlp": 1.02158856, + "epoch": 0.31690966481286637, + "flos": 20123484583680.0, + "grad_norm": 14.986266530674218, + "language_loss": 0.80439854, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.83014345, + "num_input_tokens_seen": 113206355, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.23059082, + "step": 5271, + "time_per_iteration": 2.822153091430664 + }, + { + "auxiliary_loss_clip": 0.01522636, + "auxiliary_loss_mlp": 0.01046836, + "balance_loss_clip": 1.32689619, + "balance_loss_mlp": 1.02233815, + "epoch": 0.31696978806553433, + "flos": 21188885328000.0, + "grad_norm": 1.8573727606244586, + "language_loss": 0.74254405, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.76823872, + "num_input_tokens_seen": 113225440, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.24499512, + "step": 5272, + "time_per_iteration": 2.868091583251953 + }, + { + "auxiliary_loss_clip": 0.01523113, + "auxiliary_loss_mlp": 0.01041477, + "balance_loss_clip": 1.3245306, + "balance_loss_mlp": 1.01610899, + "epoch": 0.3170299113182023, + "flos": 43012548224640.0, + "grad_norm": 2.1259278891969013, + "language_loss": 0.70837665, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.73402256, + "num_input_tokens_seen": 113248840, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.25341797, + "step": 5273, + "time_per_iteration": 3.037597894668579 + }, + { + "auxiliary_loss_clip": 0.01519779, + "auxiliary_loss_mlp": 0.010447, + "balance_loss_clip": 1.32350123, + "balance_loss_mlp": 1.02092981, + "epoch": 0.31709003457087026, + "flos": 24470320060800.0, + "grad_norm": 1.8123731010184865, + "language_loss": 0.69346052, + "learning_rate": 3.195924845146795e-06, + "loss": 0.71910536, + "num_input_tokens_seen": 113269630, + "router_z_loss_clip": 1.96191406, + "router_z_loss_mlp": 0.2376709, + "step": 5274, + "time_per_iteration": 2.8730521202087402 + }, + { + "auxiliary_loss_clip": 0.01500674, + "auxiliary_loss_mlp": 0.01047746, + "balance_loss_clip": 1.31046975, + "balance_loss_mlp": 1.02488208, + "epoch": 0.3171501578235382, + "flos": 24146302913280.0, + "grad_norm": 1.4431414783523342, + "language_loss": 0.81149828, + "learning_rate": 3.195612659536081e-06, + "loss": 0.83698249, + "num_input_tokens_seen": 113291200, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.22875977, + "step": 5275, + "time_per_iteration": 2.931072950363159 + }, + { + "auxiliary_loss_clip": 0.01520139, + "auxiliary_loss_mlp": 0.01042046, + "balance_loss_clip": 1.32330358, + "balance_loss_mlp": 1.02018261, + "epoch": 0.31721028107620625, + "flos": 18888912956160.0, + "grad_norm": 1.7160727049883893, + "language_loss": 0.73538011, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.76100194, + "num_input_tokens_seen": 113310170, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.21862793, + "step": 5276, + "time_per_iteration": 2.853792428970337 + }, + { + "auxiliary_loss_clip": 0.01493787, + "auxiliary_loss_mlp": 0.01037137, + "balance_loss_clip": 1.30352449, + "balance_loss_mlp": 1.01510763, + "epoch": 0.3172704043288742, + "flos": 23158008587520.0, + "grad_norm": 1.4007174992429483, + "language_loss": 0.78867292, + "learning_rate": 3.194988152313236e-06, + "loss": 0.81398213, + "num_input_tokens_seen": 113331140, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.22033691, + "step": 5277, + "time_per_iteration": 3.05853533744812 + }, + { + "auxiliary_loss_clip": 0.01524903, + "auxiliary_loss_mlp": 0.01040798, + "balance_loss_clip": 1.32909417, + "balance_loss_mlp": 1.01849437, + "epoch": 0.3173305275815422, + "flos": 17867562664320.0, + "grad_norm": 3.0554214098193877, + "language_loss": 0.80961466, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.83527172, + "num_input_tokens_seen": 113350030, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.22302246, + "step": 5278, + "time_per_iteration": 2.8470168113708496 + }, + { + "auxiliary_loss_clip": 0.01274735, + "auxiliary_loss_mlp": 0.01035684, + "balance_loss_clip": 1.16251886, + "balance_loss_mlp": 1.01250923, + "epoch": 0.31739065083421014, + "flos": 59996142368640.0, + "grad_norm": 0.8875947285068659, + "language_loss": 0.62866551, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.6517697, + "num_input_tokens_seen": 113395820, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.23144531, + "step": 5279, + "time_per_iteration": 3.122891664505005 + }, + { + "auxiliary_loss_clip": 0.01521288, + "auxiliary_loss_mlp": 0.01046928, + "balance_loss_clip": 1.32407463, + "balance_loss_mlp": 1.0230031, + "epoch": 0.3174507740868781, + "flos": 23810341138560.0, + "grad_norm": 1.6260835789316455, + "language_loss": 0.82702506, + "learning_rate": 3.194051051653053e-06, + "loss": 0.85270727, + "num_input_tokens_seen": 113416835, + "router_z_loss_clip": 1.97167969, + "router_z_loss_mlp": 0.23925781, + "step": 5280, + "time_per_iteration": 2.8883888721466064 + }, + { + "auxiliary_loss_clip": 0.01503583, + "auxiliary_loss_mlp": 0.01053099, + "balance_loss_clip": 1.31151772, + "balance_loss_mlp": 1.02969873, + "epoch": 0.31751089733954607, + "flos": 27651048572160.0, + "grad_norm": 1.889970678277756, + "language_loss": 0.79182565, + "learning_rate": 3.19373859419346e-06, + "loss": 0.81739253, + "num_input_tokens_seen": 113440850, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.23425293, + "step": 5281, + "time_per_iteration": 2.9371800422668457 + }, + { + "auxiliary_loss_clip": 0.01501626, + "auxiliary_loss_mlp": 0.01044286, + "balance_loss_clip": 1.30857897, + "balance_loss_mlp": 1.02063513, + "epoch": 0.31757102059221404, + "flos": 23779864126080.0, + "grad_norm": 1.9306600025042602, + "language_loss": 0.78867471, + "learning_rate": 3.193426091467179e-06, + "loss": 0.81413382, + "num_input_tokens_seen": 113461000, + "router_z_loss_clip": 1.93066406, + "router_z_loss_mlp": 0.23657227, + "step": 5282, + "time_per_iteration": 2.9043967723846436 + }, + { + "auxiliary_loss_clip": 0.01530759, + "auxiliary_loss_mlp": 0.01048971, + "balance_loss_clip": 1.3319757, + "balance_loss_mlp": 1.0243187, + "epoch": 0.317631143844882, + "flos": 25275327125760.0, + "grad_norm": 1.878553608123145, + "language_loss": 0.68324184, + "learning_rate": 3.193113543486061e-06, + "loss": 0.70903909, + "num_input_tokens_seen": 113480820, + "router_z_loss_clip": 1.98535156, + "router_z_loss_mlp": 0.2467041, + "step": 5283, + "time_per_iteration": 2.8771603107452393 + }, + { + "auxiliary_loss_clip": 0.01284059, + "auxiliary_loss_mlp": 0.01035552, + "balance_loss_clip": 1.16536498, + "balance_loss_mlp": 1.00408041, + "epoch": 0.31769126709754997, + "flos": 55850601582720.0, + "grad_norm": 0.7404405987643154, + "language_loss": 0.52967417, + "learning_rate": 3.192800950261958e-06, + "loss": 0.55287027, + "num_input_tokens_seen": 113536910, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.31445312, + "step": 5284, + "time_per_iteration": 3.346916437149048 + }, + { + "auxiliary_loss_clip": 0.01535345, + "auxiliary_loss_mlp": 0.0105261, + "balance_loss_clip": 1.33354211, + "balance_loss_mlp": 1.03006721, + "epoch": 0.31775139035021793, + "flos": 16699193458560.0, + "grad_norm": 1.7349913712188814, + "language_loss": 0.71384311, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.73972267, + "num_input_tokens_seen": 113555480, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.2253418, + "step": 5285, + "time_per_iteration": 2.851264238357544 + }, + { + "auxiliary_loss_clip": 0.01282693, + "auxiliary_loss_mlp": 0.01035553, + "balance_loss_clip": 1.16454005, + "balance_loss_mlp": 1.00579834, + "epoch": 0.3178115136028859, + "flos": 64256568981120.0, + "grad_norm": 0.8183356448292595, + "language_loss": 0.60528338, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.62846583, + "num_input_tokens_seen": 113616790, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.296875, + "step": 5286, + "time_per_iteration": 3.3379528522491455 + }, + { + "auxiliary_loss_clip": 0.01519325, + "auxiliary_loss_mlp": 0.01049498, + "balance_loss_clip": 1.32145345, + "balance_loss_mlp": 1.02535772, + "epoch": 0.31787163685555386, + "flos": 18706666325760.0, + "grad_norm": 3.5314672190279617, + "language_loss": 0.73881066, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.76449889, + "num_input_tokens_seen": 113635320, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.24133301, + "step": 5287, + "time_per_iteration": 2.8710999488830566 + }, + { + "auxiliary_loss_clip": 0.01528145, + "auxiliary_loss_mlp": 0.01048708, + "balance_loss_clip": 1.32783937, + "balance_loss_mlp": 1.02441311, + "epoch": 0.31793176010822183, + "flos": 21334727877120.0, + "grad_norm": 2.2439237754214325, + "language_loss": 0.76906526, + "learning_rate": 3.191550125172792e-06, + "loss": 0.79483384, + "num_input_tokens_seen": 113654000, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.24291992, + "step": 5288, + "time_per_iteration": 2.9577395915985107 + }, + { + "auxiliary_loss_clip": 0.01494953, + "auxiliary_loss_mlp": 0.01045771, + "balance_loss_clip": 1.30304337, + "balance_loss_mlp": 1.02276373, + "epoch": 0.31799188336088985, + "flos": 20968243845120.0, + "grad_norm": 2.936952720531677, + "language_loss": 0.88601214, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.91141939, + "num_input_tokens_seen": 113672375, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.23022461, + "step": 5289, + "time_per_iteration": 4.465365171432495 + }, + { + "auxiliary_loss_clip": 0.01493681, + "auxiliary_loss_mlp": 0.01043231, + "balance_loss_clip": 1.30157328, + "balance_loss_mlp": 1.01959229, + "epoch": 0.3180520066135578, + "flos": 22502101697280.0, + "grad_norm": 2.3649445543953256, + "language_loss": 0.68690026, + "learning_rate": 3.190924441478572e-06, + "loss": 0.71226937, + "num_input_tokens_seen": 113692385, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.23632812, + "step": 5290, + "time_per_iteration": 2.905673027038574 + }, + { + "auxiliary_loss_clip": 0.01527473, + "auxiliary_loss_mlp": 0.01047425, + "balance_loss_clip": 1.32548666, + "balance_loss_mlp": 1.02328515, + "epoch": 0.3181121298662258, + "flos": 27246803114880.0, + "grad_norm": 1.8617056150926263, + "language_loss": 0.80182552, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.82757449, + "num_input_tokens_seen": 113712145, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.24121094, + "step": 5291, + "time_per_iteration": 2.8867008686065674 + }, + { + "auxiliary_loss_clip": 0.01513625, + "auxiliary_loss_mlp": 0.01041859, + "balance_loss_clip": 1.31694341, + "balance_loss_mlp": 1.01748037, + "epoch": 0.31817225311889374, + "flos": 23189209516800.0, + "grad_norm": 1.906523467256779, + "language_loss": 0.80571425, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.83126903, + "num_input_tokens_seen": 113731435, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.24389648, + "step": 5292, + "time_per_iteration": 4.377549648284912 + }, + { + "auxiliary_loss_clip": 0.0148209, + "auxiliary_loss_mlp": 0.01041149, + "balance_loss_clip": 1.29296076, + "balance_loss_mlp": 1.01731908, + "epoch": 0.3182323763715617, + "flos": 23269347336960.0, + "grad_norm": 1.650341908358965, + "language_loss": 0.75991225, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.78514469, + "num_input_tokens_seen": 113750825, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.23828125, + "step": 5293, + "time_per_iteration": 2.887141227722168 + }, + { + "auxiliary_loss_clip": 0.01509302, + "auxiliary_loss_mlp": 0.01044661, + "balance_loss_clip": 1.31486726, + "balance_loss_mlp": 1.02204669, + "epoch": 0.3182924996242297, + "flos": 29026938268800.0, + "grad_norm": 1.8377585286485858, + "language_loss": 0.75709283, + "learning_rate": 3.189672532265379e-06, + "loss": 0.78263247, + "num_input_tokens_seen": 113770010, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.22607422, + "step": 5294, + "time_per_iteration": 2.934217691421509 + }, + { + "auxiliary_loss_clip": 0.01501334, + "auxiliary_loss_mlp": 0.01040901, + "balance_loss_clip": 1.30652428, + "balance_loss_mlp": 1.01448417, + "epoch": 0.31835262287689764, + "flos": 20458948665600.0, + "grad_norm": 2.126492740590171, + "language_loss": 0.77536148, + "learning_rate": 3.189359442151152e-06, + "loss": 0.80078375, + "num_input_tokens_seen": 113788640, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.2644043, + "step": 5295, + "time_per_iteration": 3.0163979530334473 + }, + { + "auxiliary_loss_clip": 0.01530248, + "auxiliary_loss_mlp": 0.01050999, + "balance_loss_clip": 1.32961321, + "balance_loss_mlp": 1.02732432, + "epoch": 0.3184127461295656, + "flos": 25130479962240.0, + "grad_norm": 1.5525324505108404, + "language_loss": 0.7049666, + "learning_rate": 3.189046306936296e-06, + "loss": 0.73077905, + "num_input_tokens_seen": 113809515, + "router_z_loss_clip": 2.00390625, + "router_z_loss_mlp": 0.23681641, + "step": 5296, + "time_per_iteration": 4.373674154281616 + }, + { + "auxiliary_loss_clip": 0.01499281, + "auxiliary_loss_mlp": 0.01044929, + "balance_loss_clip": 1.30503559, + "balance_loss_mlp": 1.02065754, + "epoch": 0.31847286938223357, + "flos": 25560768441600.0, + "grad_norm": 2.137614688164464, + "language_loss": 0.78315407, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.80859613, + "num_input_tokens_seen": 113829770, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.24243164, + "step": 5297, + "time_per_iteration": 4.324610948562622 + }, + { + "auxiliary_loss_clip": 0.01498221, + "auxiliary_loss_mlp": 0.01040799, + "balance_loss_clip": 1.30472946, + "balance_loss_mlp": 1.01576471, + "epoch": 0.31853299263490154, + "flos": 27794040698880.0, + "grad_norm": 1.8989612074180957, + "language_loss": 0.80110586, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.82649601, + "num_input_tokens_seen": 113849320, + "router_z_loss_clip": 1.93457031, + "router_z_loss_mlp": 0.25012207, + "step": 5298, + "time_per_iteration": 2.925905466079712 + }, + { + "auxiliary_loss_clip": 0.01526982, + "auxiliary_loss_mlp": 0.0104122, + "balance_loss_clip": 1.326823, + "balance_loss_mlp": 1.01684129, + "epoch": 0.3185931158875695, + "flos": 22715911215360.0, + "grad_norm": 1.7419058099837923, + "language_loss": 0.75340676, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.77908874, + "num_input_tokens_seen": 113867860, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.24389648, + "step": 5299, + "time_per_iteration": 2.9240753650665283 + }, + { + "auxiliary_loss_clip": 0.01507897, + "auxiliary_loss_mlp": 0.01044789, + "balance_loss_clip": 1.30929267, + "balance_loss_mlp": 1.0193733, + "epoch": 0.31865323914023747, + "flos": 24582201747840.0, + "grad_norm": 4.567643443459848, + "language_loss": 0.79424596, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.81977284, + "num_input_tokens_seen": 113886375, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.25427246, + "step": 5300, + "time_per_iteration": 2.895050287246704 + }, + { + "auxiliary_loss_clip": 0.01505301, + "auxiliary_loss_mlp": 0.01042003, + "balance_loss_clip": 1.30979776, + "balance_loss_mlp": 1.01622987, + "epoch": 0.31871336239290543, + "flos": 18195606599040.0, + "grad_norm": 1.815257939279845, + "language_loss": 0.84755194, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.87302488, + "num_input_tokens_seen": 113904065, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.25793457, + "step": 5301, + "time_per_iteration": 2.849749803543091 + }, + { + "auxiliary_loss_clip": 0.01499102, + "auxiliary_loss_mlp": 0.01047193, + "balance_loss_clip": 1.30616534, + "balance_loss_mlp": 1.02172947, + "epoch": 0.31877348564557345, + "flos": 21834566893440.0, + "grad_norm": 3.0594244529928143, + "language_loss": 0.77962416, + "learning_rate": 3.187166549199015e-06, + "loss": 0.80508709, + "num_input_tokens_seen": 113918415, + "router_z_loss_clip": 1.92773438, + "router_z_loss_mlp": 0.25439453, + "step": 5302, + "time_per_iteration": 2.855435371398926 + }, + { + "auxiliary_loss_clip": 0.01480554, + "auxiliary_loss_mlp": 0.01039628, + "balance_loss_clip": 1.28964376, + "balance_loss_mlp": 1.01378393, + "epoch": 0.3188336088982414, + "flos": 22024686119040.0, + "grad_norm": 1.6729728799638008, + "language_loss": 0.80438745, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.82958925, + "num_input_tokens_seen": 113938135, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.25866699, + "step": 5303, + "time_per_iteration": 2.85546612739563 + }, + { + "auxiliary_loss_clip": 0.01529562, + "auxiliary_loss_mlp": 0.01046349, + "balance_loss_clip": 1.32491672, + "balance_loss_mlp": 1.01993251, + "epoch": 0.3188937321509094, + "flos": 20057553630720.0, + "grad_norm": 2.7398417132049055, + "language_loss": 0.74048752, + "learning_rate": 3.186539603020047e-06, + "loss": 0.76624668, + "num_input_tokens_seen": 113957125, + "router_z_loss_clip": 2.04589844, + "router_z_loss_mlp": 0.26416016, + "step": 5304, + "time_per_iteration": 2.8784852027893066 + }, + { + "auxiliary_loss_clip": 0.01499255, + "auxiliary_loss_mlp": 0.01039895, + "balance_loss_clip": 1.3081212, + "balance_loss_mlp": 1.01571929, + "epoch": 0.31895385540357735, + "flos": 25859014035840.0, + "grad_norm": 3.223280749264502, + "language_loss": 0.72905147, + "learning_rate": 3.186226062434068e-06, + "loss": 0.75444299, + "num_input_tokens_seen": 113974875, + "router_z_loss_clip": 1.91308594, + "router_z_loss_mlp": 0.24182129, + "step": 5305, + "time_per_iteration": 2.881859540939331 + }, + { + "auxiliary_loss_clip": 0.01507751, + "auxiliary_loss_mlp": 0.01040996, + "balance_loss_clip": 1.31210828, + "balance_loss_mlp": 1.01612926, + "epoch": 0.3190139786562453, + "flos": 23488314762240.0, + "grad_norm": 1.9060531785785426, + "language_loss": 0.64715576, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.6726433, + "num_input_tokens_seen": 113994450, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.2487793, + "step": 5306, + "time_per_iteration": 2.8981449604034424 + }, + { + "auxiliary_loss_clip": 0.01502421, + "auxiliary_loss_mlp": 0.0104481, + "balance_loss_clip": 1.30624664, + "balance_loss_mlp": 1.02013421, + "epoch": 0.3190741019089133, + "flos": 29107347557760.0, + "grad_norm": 7.353705614910065, + "language_loss": 0.80890465, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.83437699, + "num_input_tokens_seen": 114013945, + "router_z_loss_clip": 1.96191406, + "router_z_loss_mlp": 0.24682617, + "step": 5307, + "time_per_iteration": 2.9293360710144043 + }, + { + "auxiliary_loss_clip": 0.01490708, + "auxiliary_loss_mlp": 0.01036762, + "balance_loss_clip": 1.29910719, + "balance_loss_mlp": 1.01167989, + "epoch": 0.31913422516158124, + "flos": 17138576142720.0, + "grad_norm": 1.943762525310381, + "language_loss": 0.79073042, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.81600517, + "num_input_tokens_seen": 114031375, + "router_z_loss_clip": 1.91503906, + "router_z_loss_mlp": 0.25085449, + "step": 5308, + "time_per_iteration": 2.82450008392334 + }, + { + "auxiliary_loss_clip": 0.01537431, + "auxiliary_loss_mlp": 0.01043755, + "balance_loss_clip": 1.33141041, + "balance_loss_mlp": 1.01705205, + "epoch": 0.3191943484142492, + "flos": 16078197571200.0, + "grad_norm": 2.4926081454639437, + "language_loss": 0.75529838, + "learning_rate": 3.184971450390961e-06, + "loss": 0.78111023, + "num_input_tokens_seen": 114048465, + "router_z_loss_clip": 2.06445312, + "router_z_loss_mlp": 0.26733398, + "step": 5309, + "time_per_iteration": 2.9043562412261963 + }, + { + "auxiliary_loss_clip": 0.01509188, + "auxiliary_loss_mlp": 0.01041093, + "balance_loss_clip": 1.31470251, + "balance_loss_mlp": 1.0172987, + "epoch": 0.3192544716669172, + "flos": 22976440629120.0, + "grad_norm": 2.2235721323288473, + "language_loss": 0.83737171, + "learning_rate": 3.184657685014856e-06, + "loss": 0.86287451, + "num_input_tokens_seen": 114068415, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.23803711, + "step": 5310, + "time_per_iteration": 2.87599778175354 + }, + { + "auxiliary_loss_clip": 0.01494874, + "auxiliary_loss_mlp": 0.01043144, + "balance_loss_clip": 1.30101752, + "balance_loss_mlp": 1.01832438, + "epoch": 0.31931459491958514, + "flos": 26881676426880.0, + "grad_norm": 1.3560862230278141, + "language_loss": 0.7887913, + "learning_rate": 3.184343874716412e-06, + "loss": 0.81417143, + "num_input_tokens_seen": 114088565, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.24804688, + "step": 5311, + "time_per_iteration": 2.9371345043182373 + }, + { + "auxiliary_loss_clip": 0.0149422, + "auxiliary_loss_mlp": 0.01041294, + "balance_loss_clip": 1.30130279, + "balance_loss_mlp": 1.01704729, + "epoch": 0.3193747181722531, + "flos": 21846873479040.0, + "grad_norm": 1.6585465383861815, + "language_loss": 0.8481583, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.87351346, + "num_input_tokens_seen": 114107160, + "router_z_loss_clip": 1.93066406, + "router_z_loss_mlp": 0.24267578, + "step": 5312, + "time_per_iteration": 2.851501226425171 + }, + { + "auxiliary_loss_clip": 0.01519629, + "auxiliary_loss_mlp": 0.0104201, + "balance_loss_clip": 1.31977987, + "balance_loss_mlp": 1.01713061, + "epoch": 0.31943484142492107, + "flos": 18332128719360.0, + "grad_norm": 3.8154055635796147, + "language_loss": 0.80432582, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.82994223, + "num_input_tokens_seen": 114123420, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.24902344, + "step": 5313, + "time_per_iteration": 2.912537097930908 + }, + { + "auxiliary_loss_clip": 0.01509123, + "auxiliary_loss_mlp": 0.01045364, + "balance_loss_clip": 1.31395411, + "balance_loss_mlp": 1.02055681, + "epoch": 0.31949496467758903, + "flos": 21625643813760.0, + "grad_norm": 2.2916589673116188, + "language_loss": 0.877514, + "learning_rate": 3.183402174406057e-06, + "loss": 0.90305889, + "num_input_tokens_seen": 114139230, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.24780273, + "step": 5314, + "time_per_iteration": 2.8988044261932373 + }, + { + "auxiliary_loss_clip": 0.01503583, + "auxiliary_loss_mlp": 0.01049566, + "balance_loss_clip": 1.30901885, + "balance_loss_mlp": 1.02305388, + "epoch": 0.31955508793025705, + "flos": 21769676570880.0, + "grad_norm": 1.8344217502432987, + "language_loss": 0.80581528, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.83134675, + "num_input_tokens_seen": 114159290, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.26538086, + "step": 5315, + "time_per_iteration": 2.8809375762939453 + }, + { + "auxiliary_loss_clip": 0.01510424, + "auxiliary_loss_mlp": 0.01046538, + "balance_loss_clip": 1.31520784, + "balance_loss_mlp": 1.02199328, + "epoch": 0.319615211182925, + "flos": 17172944208000.0, + "grad_norm": 2.009734922160297, + "language_loss": 0.68688428, + "learning_rate": 3.18277414980567e-06, + "loss": 0.71245384, + "num_input_tokens_seen": 114177655, + "router_z_loss_clip": 1.94921875, + "router_z_loss_mlp": 0.24536133, + "step": 5316, + "time_per_iteration": 2.9032540321350098 + }, + { + "auxiliary_loss_clip": 0.01502258, + "auxiliary_loss_mlp": 0.01043002, + "balance_loss_clip": 1.31049979, + "balance_loss_mlp": 1.01900518, + "epoch": 0.319675334435593, + "flos": 28124980300800.0, + "grad_norm": 1.4697451563879496, + "language_loss": 0.70284879, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.72830141, + "num_input_tokens_seen": 114200880, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.2401123, + "step": 5317, + "time_per_iteration": 2.927469491958618 + }, + { + "auxiliary_loss_clip": 0.01279234, + "auxiliary_loss_mlp": 0.01053905, + "balance_loss_clip": 1.15940142, + "balance_loss_mlp": 1.02453196, + "epoch": 0.31973545768826095, + "flos": 69536988541440.0, + "grad_norm": 0.7358679920987604, + "language_loss": 0.53204483, + "learning_rate": 3.182145945801628e-06, + "loss": 0.55537623, + "num_input_tokens_seen": 114267145, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.29296875, + "step": 5318, + "time_per_iteration": 3.518836259841919 + }, + { + "auxiliary_loss_clip": 0.01495398, + "auxiliary_loss_mlp": 0.01047594, + "balance_loss_clip": 1.30451035, + "balance_loss_mlp": 1.02344251, + "epoch": 0.3197955809409289, + "flos": 13707181584000.0, + "grad_norm": 1.8950739840740365, + "language_loss": 0.85014379, + "learning_rate": 3.181831776553012e-06, + "loss": 0.87557364, + "num_input_tokens_seen": 114284630, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.24145508, + "step": 5319, + "time_per_iteration": 2.9108405113220215 + }, + { + "auxiliary_loss_clip": 0.01487587, + "auxiliary_loss_mlp": 0.01046387, + "balance_loss_clip": 1.29560065, + "balance_loss_mlp": 1.02167487, + "epoch": 0.3198557041935969, + "flos": 33230917353600.0, + "grad_norm": 1.6935531941561635, + "language_loss": 0.64388299, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.66922277, + "num_input_tokens_seen": 114305830, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.24731445, + "step": 5320, + "time_per_iteration": 2.966646909713745 + }, + { + "auxiliary_loss_clip": 0.01504953, + "auxiliary_loss_mlp": 0.0104249, + "balance_loss_clip": 1.30914879, + "balance_loss_mlp": 1.01811206, + "epoch": 0.31991582744626484, + "flos": 23742419414400.0, + "grad_norm": 1.981286110337951, + "language_loss": 0.71396059, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.73943508, + "num_input_tokens_seen": 114325165, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.24389648, + "step": 5321, + "time_per_iteration": 2.9343106746673584 + }, + { + "auxiliary_loss_clip": 0.01530181, + "auxiliary_loss_mlp": 0.0105166, + "balance_loss_clip": 1.32853031, + "balance_loss_mlp": 1.02604198, + "epoch": 0.3199759506989328, + "flos": 18559828391040.0, + "grad_norm": 2.6224458117916005, + "language_loss": 0.87034047, + "learning_rate": 3.180888999963749e-06, + "loss": 0.89615887, + "num_input_tokens_seen": 114341310, + "router_z_loss_clip": 2.01660156, + "router_z_loss_mlp": 0.25622559, + "step": 5322, + "time_per_iteration": 2.8440749645233154 + }, + { + "auxiliary_loss_clip": 0.01500771, + "auxiliary_loss_mlp": 0.01041034, + "balance_loss_clip": 1.30723047, + "balance_loss_mlp": 1.0174191, + "epoch": 0.3200360739516008, + "flos": 22429022065920.0, + "grad_norm": 1.7146123566441598, + "language_loss": 0.83791494, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.86333299, + "num_input_tokens_seen": 114360355, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.23596191, + "step": 5323, + "time_per_iteration": 2.9148483276367188 + }, + { + "auxiliary_loss_clip": 0.01488063, + "auxiliary_loss_mlp": 0.01039106, + "balance_loss_clip": 1.29705143, + "balance_loss_mlp": 1.01342797, + "epoch": 0.32009619720426874, + "flos": 20605062683520.0, + "grad_norm": 1.729706145607028, + "language_loss": 0.79316449, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.81843609, + "num_input_tokens_seen": 114379220, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.25671387, + "step": 5324, + "time_per_iteration": 2.8997793197631836 + }, + { + "auxiliary_loss_clip": 0.01507107, + "auxiliary_loss_mlp": 0.01045589, + "balance_loss_clip": 1.31325972, + "balance_loss_mlp": 1.02042389, + "epoch": 0.3201563204569367, + "flos": 18156035381760.0, + "grad_norm": 1.925002493463545, + "language_loss": 0.80968869, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.83521569, + "num_input_tokens_seen": 114396365, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.25146484, + "step": 5325, + "time_per_iteration": 4.305352449417114 + }, + { + "auxiliary_loss_clip": 0.0151269, + "auxiliary_loss_mlp": 0.01040029, + "balance_loss_clip": 1.31706464, + "balance_loss_mlp": 1.01628256, + "epoch": 0.32021644370960467, + "flos": 31696245095040.0, + "grad_norm": 1.5774943141256061, + "language_loss": 0.75823343, + "learning_rate": 3.179631337655037e-06, + "loss": 0.78376067, + "num_input_tokens_seen": 114416780, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.23754883, + "step": 5326, + "time_per_iteration": 2.9322025775909424 + }, + { + "auxiliary_loss_clip": 0.01493366, + "auxiliary_loss_mlp": 0.01044239, + "balance_loss_clip": 1.30160618, + "balance_loss_mlp": 1.01915693, + "epoch": 0.32027656696227264, + "flos": 26876111316480.0, + "grad_norm": 1.696775430110584, + "language_loss": 0.8155514, + "learning_rate": 3.179316810218701e-06, + "loss": 0.84092748, + "num_input_tokens_seen": 114437405, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.25097656, + "step": 5327, + "time_per_iteration": 4.351373672485352 + }, + { + "auxiliary_loss_clip": 0.01511899, + "auxiliary_loss_mlp": 0.01044179, + "balance_loss_clip": 1.31279826, + "balance_loss_mlp": 1.01993132, + "epoch": 0.32033669021494066, + "flos": 24180444754560.0, + "grad_norm": 2.0529185265331766, + "language_loss": 0.78308213, + "learning_rate": 3.179002238062554e-06, + "loss": 0.80864286, + "num_input_tokens_seen": 114458505, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.24267578, + "step": 5328, + "time_per_iteration": 2.9056758880615234 + }, + { + "auxiliary_loss_clip": 0.01514573, + "auxiliary_loss_mlp": 0.01045751, + "balance_loss_clip": 1.31734669, + "balance_loss_mlp": 1.02051449, + "epoch": 0.3203968134676086, + "flos": 24471089222400.0, + "grad_norm": 1.5627652265682492, + "language_loss": 0.74572384, + "learning_rate": 3.178687621198524e-06, + "loss": 0.77132702, + "num_input_tokens_seen": 114479050, + "router_z_loss_clip": 1.97460938, + "router_z_loss_mlp": 0.25256348, + "step": 5329, + "time_per_iteration": 2.897634267807007 + }, + { + "auxiliary_loss_clip": 0.01478875, + "auxiliary_loss_mlp": 0.01039848, + "balance_loss_clip": 1.29226398, + "balance_loss_mlp": 1.01617289, + "epoch": 0.3204569367202766, + "flos": 18013857661440.0, + "grad_norm": 1.6730705785783615, + "language_loss": 0.72461259, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.74979985, + "num_input_tokens_seen": 114497415, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.23669434, + "step": 5330, + "time_per_iteration": 2.849073886871338 + }, + { + "auxiliary_loss_clip": 0.01515442, + "auxiliary_loss_mlp": 0.01044581, + "balance_loss_clip": 1.3160038, + "balance_loss_mlp": 1.01872468, + "epoch": 0.32051705997294455, + "flos": 30601407968640.0, + "grad_norm": 1.7551329345286848, + "language_loss": 0.80693853, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.83253872, + "num_input_tokens_seen": 114518785, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.25866699, + "step": 5331, + "time_per_iteration": 4.464034795761108 + }, + { + "auxiliary_loss_clip": 0.01270828, + "auxiliary_loss_mlp": 0.01065461, + "balance_loss_clip": 1.15670526, + "balance_loss_mlp": 1.04066586, + "epoch": 0.3205771832256125, + "flos": 68447065115520.0, + "grad_norm": 0.8561338478182748, + "language_loss": 0.578354, + "learning_rate": 3.177743502478447e-06, + "loss": 0.60171694, + "num_input_tokens_seen": 114577710, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.24804688, + "step": 5332, + "time_per_iteration": 4.694497585296631 + }, + { + "auxiliary_loss_clip": 0.01514508, + "auxiliary_loss_mlp": 0.01042471, + "balance_loss_clip": 1.31447136, + "balance_loss_mlp": 1.01855707, + "epoch": 0.3206373064782805, + "flos": 30455701153920.0, + "grad_norm": 1.5448587946852224, + "language_loss": 0.73884761, + "learning_rate": 3.177428706902205e-06, + "loss": 0.76441741, + "num_input_tokens_seen": 114598640, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.23937988, + "step": 5333, + "time_per_iteration": 2.963329553604126 + }, + { + "auxiliary_loss_clip": 0.01507894, + "auxiliary_loss_mlp": 0.01042357, + "balance_loss_clip": 1.31236005, + "balance_loss_mlp": 1.01732326, + "epoch": 0.32069742973094845, + "flos": 22064664539520.0, + "grad_norm": 2.0280929843213804, + "language_loss": 0.71791255, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.74341512, + "num_input_tokens_seen": 114618780, + "router_z_loss_clip": 1.95507812, + "router_z_loss_mlp": 0.25048828, + "step": 5334, + "time_per_iteration": 2.9486796855926514 + }, + { + "auxiliary_loss_clip": 0.01496207, + "auxiliary_loss_mlp": 0.01043404, + "balance_loss_clip": 1.30273485, + "balance_loss_mlp": 1.01896608, + "epoch": 0.3207575529836164, + "flos": 22064031112320.0, + "grad_norm": 2.0313439405830236, + "language_loss": 0.78361452, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.80901057, + "num_input_tokens_seen": 114637525, + "router_z_loss_clip": 1.93359375, + "router_z_loss_mlp": 0.24450684, + "step": 5335, + "time_per_iteration": 2.906370162963867 + }, + { + "auxiliary_loss_clip": 0.01504746, + "auxiliary_loss_mlp": 0.01044838, + "balance_loss_clip": 1.31017196, + "balance_loss_mlp": 1.02079344, + "epoch": 0.3208176762362844, + "flos": 34070609197440.0, + "grad_norm": 1.5786535116271179, + "language_loss": 0.69107306, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.71656895, + "num_input_tokens_seen": 114659705, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.24047852, + "step": 5336, + "time_per_iteration": 2.991572856903076 + }, + { + "auxiliary_loss_clip": 0.01511328, + "auxiliary_loss_mlp": 0.01047128, + "balance_loss_clip": 1.31619787, + "balance_loss_mlp": 1.02249932, + "epoch": 0.32087779948895234, + "flos": 21808704850560.0, + "grad_norm": 1.988523406493965, + "language_loss": 0.79552644, + "learning_rate": 3.176169078234487e-06, + "loss": 0.82111096, + "num_input_tokens_seen": 114678340, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.24609375, + "step": 5337, + "time_per_iteration": 2.8606202602386475 + }, + { + "auxiliary_loss_clip": 0.01482128, + "auxiliary_loss_mlp": 0.0104389, + "balance_loss_clip": 1.29272532, + "balance_loss_mlp": 1.02077556, + "epoch": 0.3209379227416203, + "flos": 21444075855360.0, + "grad_norm": 1.8553450019254087, + "language_loss": 0.75340986, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.77867007, + "num_input_tokens_seen": 114696980, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.23120117, + "step": 5338, + "time_per_iteration": 2.8953824043273926 + }, + { + "auxiliary_loss_clip": 0.0150145, + "auxiliary_loss_mlp": 0.01046601, + "balance_loss_clip": 1.30416012, + "balance_loss_mlp": 1.02173352, + "epoch": 0.3209980459942883, + "flos": 25860235645440.0, + "grad_norm": 2.7753681460445536, + "language_loss": 0.63180691, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.65728736, + "num_input_tokens_seen": 114717330, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.24865723, + "step": 5339, + "time_per_iteration": 2.869962215423584 + }, + { + "auxiliary_loss_clip": 0.01513265, + "auxiliary_loss_mlp": 0.01049265, + "balance_loss_clip": 1.31550992, + "balance_loss_mlp": 1.02319348, + "epoch": 0.32105816924695624, + "flos": 19108559053440.0, + "grad_norm": 2.4968396419650647, + "language_loss": 0.82650727, + "learning_rate": 3.175223888387192e-06, + "loss": 0.85213256, + "num_input_tokens_seen": 114736320, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.26049805, + "step": 5340, + "time_per_iteration": 2.867220401763916 + }, + { + "auxiliary_loss_clip": 0.01499916, + "auxiliary_loss_mlp": 0.01041393, + "balance_loss_clip": 1.30556405, + "balance_loss_mlp": 1.01763427, + "epoch": 0.3211182924996242, + "flos": 16590659886720.0, + "grad_norm": 3.1508307507214273, + "language_loss": 0.7701298, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.7955429, + "num_input_tokens_seen": 114754575, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.23779297, + "step": 5341, + "time_per_iteration": 2.847330093383789 + }, + { + "auxiliary_loss_clip": 0.01491606, + "auxiliary_loss_mlp": 0.01047078, + "balance_loss_clip": 1.30107355, + "balance_loss_mlp": 1.02279544, + "epoch": 0.3211784157522922, + "flos": 22681859863680.0, + "grad_norm": 1.6465805022781819, + "language_loss": 0.79932237, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.82470924, + "num_input_tokens_seen": 114773590, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.24279785, + "step": 5342, + "time_per_iteration": 2.8411126136779785 + }, + { + "auxiliary_loss_clip": 0.01509201, + "auxiliary_loss_mlp": 0.01043377, + "balance_loss_clip": 1.31171179, + "balance_loss_mlp": 1.01904678, + "epoch": 0.3212385390049602, + "flos": 20568613357440.0, + "grad_norm": 4.038830366834268, + "language_loss": 0.76084775, + "learning_rate": 3.174278297458438e-06, + "loss": 0.7863735, + "num_input_tokens_seen": 114790775, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.24365234, + "step": 5343, + "time_per_iteration": 2.8592886924743652 + }, + { + "auxiliary_loss_clip": 0.01504663, + "auxiliary_loss_mlp": 0.01043492, + "balance_loss_clip": 1.30916917, + "balance_loss_mlp": 1.01911414, + "epoch": 0.32129866225762815, + "flos": 24802074069120.0, + "grad_norm": 1.5599903316441746, + "language_loss": 0.83774114, + "learning_rate": 3.173963011408748e-06, + "loss": 0.86322272, + "num_input_tokens_seen": 114809835, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.24389648, + "step": 5344, + "time_per_iteration": 2.98291277885437 + }, + { + "auxiliary_loss_clip": 0.01510296, + "auxiliary_loss_mlp": 0.01048345, + "balance_loss_clip": 1.31282878, + "balance_loss_mlp": 1.02403784, + "epoch": 0.3213587855102961, + "flos": 18375410010240.0, + "grad_norm": 2.3192202015701975, + "language_loss": 0.81287825, + "learning_rate": 3.173647680842262e-06, + "loss": 0.83846462, + "num_input_tokens_seen": 114826505, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.24316406, + "step": 5345, + "time_per_iteration": 2.874897003173828 + }, + { + "auxiliary_loss_clip": 0.01510307, + "auxiliary_loss_mlp": 0.01044189, + "balance_loss_clip": 1.3132931, + "balance_loss_mlp": 1.02079976, + "epoch": 0.3214189087629641, + "flos": 27027292752000.0, + "grad_norm": 1.970850704696115, + "language_loss": 0.83986872, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.86541367, + "num_input_tokens_seen": 114846140, + "router_z_loss_clip": 1.97167969, + "router_z_loss_mlp": 0.23413086, + "step": 5346, + "time_per_iteration": 2.9444947242736816 + }, + { + "auxiliary_loss_clip": 0.01526626, + "auxiliary_loss_mlp": 0.01047769, + "balance_loss_clip": 1.32687318, + "balance_loss_mlp": 1.02386737, + "epoch": 0.32147903201563205, + "flos": 23158506280320.0, + "grad_norm": 1.6772972679721327, + "language_loss": 0.82014918, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.84589314, + "num_input_tokens_seen": 114866660, + "router_z_loss_clip": 1.99511719, + "router_z_loss_mlp": 0.23913574, + "step": 5347, + "time_per_iteration": 2.8962972164154053 + }, + { + "auxiliary_loss_clip": 0.01504362, + "auxiliary_loss_mlp": 0.01049539, + "balance_loss_clip": 1.30958116, + "balance_loss_mlp": 1.02439737, + "epoch": 0.3215391552683, + "flos": 16589257297920.0, + "grad_norm": 2.1282630294847062, + "language_loss": 0.80272722, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.8282662, + "num_input_tokens_seen": 114882820, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.25134277, + "step": 5348, + "time_per_iteration": 2.931619167327881 + }, + { + "auxiliary_loss_clip": 0.01507107, + "auxiliary_loss_mlp": 0.01042901, + "balance_loss_clip": 1.31042099, + "balance_loss_mlp": 1.01863027, + "epoch": 0.321599278520968, + "flos": 17830796624640.0, + "grad_norm": 2.2070303587777174, + "language_loss": 0.85969353, + "learning_rate": 3.172385913647542e-06, + "loss": 0.88519359, + "num_input_tokens_seen": 114900745, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.24243164, + "step": 5349, + "time_per_iteration": 2.86352276802063 + }, + { + "auxiliary_loss_clip": 0.01513769, + "auxiliary_loss_mlp": 0.01044062, + "balance_loss_clip": 1.31608808, + "balance_loss_mlp": 1.02062523, + "epoch": 0.32165940177363594, + "flos": 16260036998400.0, + "grad_norm": 8.085624213604277, + "language_loss": 0.8072415, + "learning_rate": 3.172070360676475e-06, + "loss": 0.83281982, + "num_input_tokens_seen": 114917940, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.23425293, + "step": 5350, + "time_per_iteration": 2.876023292541504 + }, + { + "auxiliary_loss_clip": 0.0150585, + "auxiliary_loss_mlp": 0.0105078, + "balance_loss_clip": 1.31195569, + "balance_loss_mlp": 1.02807033, + "epoch": 0.3217195250263039, + "flos": 27611386865280.0, + "grad_norm": 1.827492185840312, + "language_loss": 0.80484056, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.83040684, + "num_input_tokens_seen": 114937735, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.22717285, + "step": 5351, + "time_per_iteration": 2.907456874847412 + }, + { + "auxiliary_loss_clip": 0.01502213, + "auxiliary_loss_mlp": 0.01048043, + "balance_loss_clip": 1.30654693, + "balance_loss_mlp": 1.02358174, + "epoch": 0.3217796482789719, + "flos": 21480570426240.0, + "grad_norm": 5.144025406040443, + "language_loss": 0.76512444, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.790627, + "num_input_tokens_seen": 114956630, + "router_z_loss_clip": 1.95703125, + "router_z_loss_mlp": 0.24475098, + "step": 5352, + "time_per_iteration": 2.88177752494812 + }, + { + "auxiliary_loss_clip": 0.01496438, + "auxiliary_loss_mlp": 0.01041626, + "balance_loss_clip": 1.30289221, + "balance_loss_mlp": 1.01749778, + "epoch": 0.32183977153163984, + "flos": 21225606122880.0, + "grad_norm": 1.8047148154779895, + "language_loss": 0.83215374, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.85753435, + "num_input_tokens_seen": 114976470, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.24108887, + "step": 5353, + "time_per_iteration": 2.8809335231781006 + }, + { + "auxiliary_loss_clip": 0.01499392, + "auxiliary_loss_mlp": 0.01046627, + "balance_loss_clip": 1.30732584, + "balance_loss_mlp": 1.02241588, + "epoch": 0.3218998947843078, + "flos": 24618922542720.0, + "grad_norm": 1.6116840527155312, + "language_loss": 0.73966312, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.76512325, + "num_input_tokens_seen": 114996710, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.24230957, + "step": 5354, + "time_per_iteration": 2.9160239696502686 + }, + { + "auxiliary_loss_clip": 0.01503047, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_clip": 1.30916739, + "balance_loss_mlp": 1.02225113, + "epoch": 0.3219600180369758, + "flos": 22280057625600.0, + "grad_norm": 1.722602069476886, + "language_loss": 0.84201694, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.86749697, + "num_input_tokens_seen": 115015775, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.22717285, + "step": 5355, + "time_per_iteration": 2.89809250831604 + }, + { + "auxiliary_loss_clip": 0.01534049, + "auxiliary_loss_mlp": 0.0104605, + "balance_loss_clip": 1.33693838, + "balance_loss_mlp": 1.02276862, + "epoch": 0.3220201412896438, + "flos": 14947273077120.0, + "grad_norm": 1.8353633040347797, + "language_loss": 0.7184478, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.74424875, + "num_input_tokens_seen": 115034265, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.23291016, + "step": 5356, + "time_per_iteration": 2.82808518409729 + }, + { + "auxiliary_loss_clip": 0.0154925, + "auxiliary_loss_mlp": 0.01050565, + "balance_loss_clip": 1.34406435, + "balance_loss_mlp": 1.02655578, + "epoch": 0.32208026454231176, + "flos": 22675797060480.0, + "grad_norm": 4.232243204612482, + "language_loss": 0.69794881, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.72394699, + "num_input_tokens_seen": 115051945, + "router_z_loss_clip": 2.05175781, + "router_z_loss_mlp": 0.23999023, + "step": 5357, + "time_per_iteration": 2.892834424972534 + }, + { + "auxiliary_loss_clip": 0.01277731, + "auxiliary_loss_mlp": 0.01038206, + "balance_loss_clip": 1.16160512, + "balance_loss_mlp": 1.01665306, + "epoch": 0.3221403877949797, + "flos": 64637938552320.0, + "grad_norm": 0.7138098629525257, + "language_loss": 0.58317238, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.60633177, + "num_input_tokens_seen": 115119090, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.21582031, + "step": 5358, + "time_per_iteration": 3.468287944793701 + }, + { + "auxiliary_loss_clip": 0.01520934, + "auxiliary_loss_mlp": 0.01045565, + "balance_loss_clip": 1.32382381, + "balance_loss_mlp": 1.02078092, + "epoch": 0.3222005110476477, + "flos": 20166630140160.0, + "grad_norm": 1.685544407835655, + "language_loss": 0.84127069, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.86693573, + "num_input_tokens_seen": 115137755, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.24804688, + "step": 5359, + "time_per_iteration": 2.8964905738830566 + }, + { + "auxiliary_loss_clip": 0.01508286, + "auxiliary_loss_mlp": 0.0104186, + "balance_loss_clip": 1.31334949, + "balance_loss_mlp": 1.01966286, + "epoch": 0.32226063430031565, + "flos": 22684348327680.0, + "grad_norm": 1.7210720576681828, + "language_loss": 0.80186486, + "learning_rate": 3.168912388464595e-06, + "loss": 0.82736635, + "num_input_tokens_seen": 115158150, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.2220459, + "step": 5360, + "time_per_iteration": 4.317646503448486 + }, + { + "auxiliary_loss_clip": 0.01272725, + "auxiliary_loss_mlp": 0.01020865, + "balance_loss_clip": 1.15493631, + "balance_loss_mlp": 1.00045609, + "epoch": 0.3223207575529836, + "flos": 63858431571840.0, + "grad_norm": 0.661459346900478, + "language_loss": 0.57128668, + "learning_rate": 3.168596347256737e-06, + "loss": 0.59422266, + "num_input_tokens_seen": 115212755, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.20410156, + "step": 5361, + "time_per_iteration": 3.202414035797119 + }, + { + "auxiliary_loss_clip": 0.01516588, + "auxiliary_loss_mlp": 0.01045312, + "balance_loss_clip": 1.32122874, + "balance_loss_mlp": 1.02143443, + "epoch": 0.3223808808056516, + "flos": 26881404958080.0, + "grad_norm": 2.68784670844084, + "language_loss": 0.71984565, + "learning_rate": 3.168280261735588e-06, + "loss": 0.74546462, + "num_input_tokens_seen": 115233090, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.23876953, + "step": 5362, + "time_per_iteration": 4.461483478546143 + }, + { + "auxiliary_loss_clip": 0.01513895, + "auxiliary_loss_mlp": 0.01050532, + "balance_loss_clip": 1.31731308, + "balance_loss_mlp": 1.02715516, + "epoch": 0.32244100405831955, + "flos": 26772780896640.0, + "grad_norm": 2.1685097266721978, + "language_loss": 0.74403387, + "learning_rate": 3.167964131913135e-06, + "loss": 0.76967812, + "num_input_tokens_seen": 115252645, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.23388672, + "step": 5363, + "time_per_iteration": 2.8971471786499023 + }, + { + "auxiliary_loss_clip": 0.01528394, + "auxiliary_loss_mlp": 0.01040313, + "balance_loss_clip": 1.3255055, + "balance_loss_mlp": 1.01637578, + "epoch": 0.3225011273109875, + "flos": 23812920092160.0, + "grad_norm": 6.571431689959631, + "language_loss": 0.76873839, + "learning_rate": 3.167647957801365e-06, + "loss": 0.79442549, + "num_input_tokens_seen": 115269085, + "router_z_loss_clip": 2.02734375, + "router_z_loss_mlp": 0.23925781, + "step": 5364, + "time_per_iteration": 2.9084885120391846 + }, + { + "auxiliary_loss_clip": 0.01514111, + "auxiliary_loss_mlp": 0.01044065, + "balance_loss_clip": 1.31708765, + "balance_loss_mlp": 1.02012801, + "epoch": 0.3225612505636555, + "flos": 17283332816640.0, + "grad_norm": 2.401875069986875, + "language_loss": 0.77783775, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.80341953, + "num_input_tokens_seen": 115286470, + "router_z_loss_clip": 1.97167969, + "router_z_loss_mlp": 0.23950195, + "step": 5365, + "time_per_iteration": 2.8234853744506836 + }, + { + "auxiliary_loss_clip": 0.01527863, + "auxiliary_loss_mlp": 0.01044444, + "balance_loss_clip": 1.32998908, + "balance_loss_mlp": 1.02075744, + "epoch": 0.32262137381632344, + "flos": 23376297340800.0, + "grad_norm": 2.1033965000500476, + "language_loss": 0.7708075, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.79653066, + "num_input_tokens_seen": 115307000, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.23706055, + "step": 5366, + "time_per_iteration": 4.297854423522949 + }, + { + "auxiliary_loss_clip": 0.01517798, + "auxiliary_loss_mlp": 0.01044232, + "balance_loss_clip": 1.32155716, + "balance_loss_mlp": 1.02186882, + "epoch": 0.3226814970689914, + "flos": 23269347336960.0, + "grad_norm": 1.883545976863215, + "language_loss": 0.72873425, + "learning_rate": 3.166699169850055e-06, + "loss": 0.7543546, + "num_input_tokens_seen": 115325925, + "router_z_loss_clip": 1.96289062, + "router_z_loss_mlp": 0.22375488, + "step": 5367, + "time_per_iteration": 4.2808732986450195 + }, + { + "auxiliary_loss_clip": 0.01510892, + "auxiliary_loss_mlp": 0.01040907, + "balance_loss_clip": 1.31582832, + "balance_loss_mlp": 1.01713657, + "epoch": 0.32274162032165943, + "flos": 16402848145920.0, + "grad_norm": 2.5983664018930663, + "language_loss": 0.75887817, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.78439617, + "num_input_tokens_seen": 115343705, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.23779297, + "step": 5368, + "time_per_iteration": 2.8614120483398438 + }, + { + "auxiliary_loss_clip": 0.01506221, + "auxiliary_loss_mlp": 0.01035968, + "balance_loss_clip": 1.3101908, + "balance_loss_mlp": 1.01337838, + "epoch": 0.3228017435743274, + "flos": 27866577392640.0, + "grad_norm": 2.2466874673525914, + "language_loss": 0.79037303, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.81579494, + "num_input_tokens_seen": 115364170, + "router_z_loss_clip": 1.95996094, + "router_z_loss_mlp": 0.22595215, + "step": 5369, + "time_per_iteration": 2.943206548690796 + }, + { + "auxiliary_loss_clip": 0.01508363, + "auxiliary_loss_mlp": 0.01036193, + "balance_loss_clip": 1.31682611, + "balance_loss_mlp": 1.01374602, + "epoch": 0.32286186682699536, + "flos": 19617989967360.0, + "grad_norm": 2.167055807126553, + "language_loss": 0.84190011, + "learning_rate": 3.16574998372661e-06, + "loss": 0.86734569, + "num_input_tokens_seen": 115382495, + "router_z_loss_clip": 1.91308594, + "router_z_loss_mlp": 0.2244873, + "step": 5370, + "time_per_iteration": 2.8974969387054443 + }, + { + "auxiliary_loss_clip": 0.01515835, + "auxiliary_loss_mlp": 0.01040302, + "balance_loss_clip": 1.31860387, + "balance_loss_mlp": 1.01603103, + "epoch": 0.3229219900796633, + "flos": 24144674100480.0, + "grad_norm": 1.9376631385325869, + "language_loss": 0.83726782, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.86282915, + "num_input_tokens_seen": 115399450, + "router_z_loss_clip": 1.97167969, + "router_z_loss_mlp": 0.24291992, + "step": 5371, + "time_per_iteration": 2.8881256580352783 + }, + { + "auxiliary_loss_clip": 0.01518775, + "auxiliary_loss_mlp": 0.01046093, + "balance_loss_clip": 1.31916738, + "balance_loss_mlp": 1.02244174, + "epoch": 0.3229821133323313, + "flos": 17757626503680.0, + "grad_norm": 2.0339266326739467, + "language_loss": 0.89071882, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.91636747, + "num_input_tokens_seen": 115417700, + "router_z_loss_clip": 1.99511719, + "router_z_loss_mlp": 0.2364502, + "step": 5372, + "time_per_iteration": 2.891850233078003 + }, + { + "auxiliary_loss_clip": 0.01531321, + "auxiliary_loss_mlp": 0.01040639, + "balance_loss_clip": 1.33305907, + "balance_loss_mlp": 1.01627243, + "epoch": 0.32304223658499925, + "flos": 22356168658560.0, + "grad_norm": 9.74871941757989, + "language_loss": 0.73097318, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.75669277, + "num_input_tokens_seen": 115435840, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.24389648, + "step": 5373, + "time_per_iteration": 3.0952396392822266 + }, + { + "auxiliary_loss_clip": 0.01498072, + "auxiliary_loss_mlp": 0.01037961, + "balance_loss_clip": 1.30681157, + "balance_loss_mlp": 1.01441729, + "epoch": 0.3231023598376672, + "flos": 18487382186880.0, + "grad_norm": 2.803537962686067, + "language_loss": 0.82782316, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.85318357, + "num_input_tokens_seen": 115454210, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.23571777, + "step": 5374, + "time_per_iteration": 2.9387032985687256 + }, + { + "auxiliary_loss_clip": 0.01496773, + "auxiliary_loss_mlp": 0.01038658, + "balance_loss_clip": 1.30612195, + "balance_loss_mlp": 1.01606727, + "epoch": 0.3231624830903352, + "flos": 27647790946560.0, + "grad_norm": 2.032993976782501, + "language_loss": 0.88115168, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.90650594, + "num_input_tokens_seen": 115471785, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.22595215, + "step": 5375, + "time_per_iteration": 2.9855504035949707 + }, + { + "auxiliary_loss_clip": 0.01536559, + "auxiliary_loss_mlp": 0.0104122, + "balance_loss_clip": 1.33447361, + "balance_loss_mlp": 1.01643634, + "epoch": 0.32322260634300315, + "flos": 21736575360000.0, + "grad_norm": 1.9769869346126496, + "language_loss": 0.76922601, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.79500377, + "num_input_tokens_seen": 115491405, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.24768066, + "step": 5376, + "time_per_iteration": 2.868070602416992 + }, + { + "auxiliary_loss_clip": 0.01518113, + "auxiliary_loss_mlp": 0.01037288, + "balance_loss_clip": 1.32150745, + "balance_loss_mlp": 1.01523411, + "epoch": 0.3232827295956711, + "flos": 22647808512000.0, + "grad_norm": 3.1691060160511553, + "language_loss": 0.67543817, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.70099217, + "num_input_tokens_seen": 115511555, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.22033691, + "step": 5377, + "time_per_iteration": 2.894752264022827 + }, + { + "auxiliary_loss_clip": 0.01514563, + "auxiliary_loss_mlp": 0.01047885, + "balance_loss_clip": 1.31879807, + "balance_loss_mlp": 1.02395964, + "epoch": 0.3233428528483391, + "flos": 26333669681280.0, + "grad_norm": 1.4524672168108606, + "language_loss": 0.73031807, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.75594252, + "num_input_tokens_seen": 115532860, + "router_z_loss_clip": 1.95898438, + "router_z_loss_mlp": 0.23962402, + "step": 5378, + "time_per_iteration": 2.9289238452911377 + }, + { + "auxiliary_loss_clip": 0.01514051, + "auxiliary_loss_mlp": 0.01037866, + "balance_loss_clip": 1.31655872, + "balance_loss_mlp": 1.01422715, + "epoch": 0.32340297610100704, + "flos": 28597011747840.0, + "grad_norm": 2.2146907655562114, + "language_loss": 0.8289699, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.85448909, + "num_input_tokens_seen": 115553850, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.2364502, + "step": 5379, + "time_per_iteration": 2.9225308895111084 + }, + { + "auxiliary_loss_clip": 0.01526866, + "auxiliary_loss_mlp": 0.01041992, + "balance_loss_clip": 1.32577193, + "balance_loss_mlp": 1.01846027, + "epoch": 0.323463099353675, + "flos": 30786007328640.0, + "grad_norm": 1.6727185990844444, + "language_loss": 0.79776061, + "learning_rate": 3.162583158454388e-06, + "loss": 0.82344919, + "num_input_tokens_seen": 115575530, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.23535156, + "step": 5380, + "time_per_iteration": 2.9653091430664062 + }, + { + "auxiliary_loss_clip": 0.01526042, + "auxiliary_loss_mlp": 0.01041481, + "balance_loss_clip": 1.32700777, + "balance_loss_mlp": 1.01934433, + "epoch": 0.32352322260634303, + "flos": 25239556471680.0, + "grad_norm": 1.6762360821354092, + "language_loss": 0.77739131, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.80306655, + "num_input_tokens_seen": 115594885, + "router_z_loss_clip": 1.99023438, + "router_z_loss_mlp": 0.22143555, + "step": 5381, + "time_per_iteration": 2.9014899730682373 + }, + { + "auxiliary_loss_clip": 0.0151049, + "auxiliary_loss_mlp": 0.01035175, + "balance_loss_clip": 1.31698537, + "balance_loss_mlp": 1.01359808, + "epoch": 0.323583345859011, + "flos": 23340707665920.0, + "grad_norm": 3.096196118615213, + "language_loss": 0.7271111, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.75256777, + "num_input_tokens_seen": 115614080, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.21569824, + "step": 5382, + "time_per_iteration": 2.891120433807373 + }, + { + "auxiliary_loss_clip": 0.01529083, + "auxiliary_loss_mlp": 0.010386, + "balance_loss_clip": 1.32845783, + "balance_loss_mlp": 1.01558042, + "epoch": 0.32364346911167896, + "flos": 26217082535040.0, + "grad_norm": 3.9847260796021025, + "language_loss": 0.71884263, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.74451947, + "num_input_tokens_seen": 115632820, + "router_z_loss_clip": 2.00488281, + "router_z_loss_mlp": 0.22998047, + "step": 5383, + "time_per_iteration": 2.9093077182769775 + }, + { + "auxiliary_loss_clip": 0.01502356, + "auxiliary_loss_mlp": 0.01038237, + "balance_loss_clip": 1.31077039, + "balance_loss_mlp": 1.01555109, + "epoch": 0.3237035923643469, + "flos": 23706196312320.0, + "grad_norm": 1.5790684465092832, + "language_loss": 0.78901964, + "learning_rate": 3.161315193285283e-06, + "loss": 0.81442553, + "num_input_tokens_seen": 115652860, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.22680664, + "step": 5384, + "time_per_iteration": 2.9115638732910156 + }, + { + "auxiliary_loss_clip": 0.015338, + "auxiliary_loss_mlp": 0.01043264, + "balance_loss_clip": 1.3319726, + "balance_loss_mlp": 1.01905274, + "epoch": 0.3237637156170149, + "flos": 14436846777600.0, + "grad_norm": 2.1490304291947733, + "language_loss": 0.76289022, + "learning_rate": 3.16099809186998e-06, + "loss": 0.78866088, + "num_input_tokens_seen": 115670940, + "router_z_loss_clip": 2.015625, + "router_z_loss_mlp": 0.2421875, + "step": 5385, + "time_per_iteration": 2.8328845500946045 + }, + { + "auxiliary_loss_clip": 0.01513529, + "auxiliary_loss_mlp": 0.01045579, + "balance_loss_clip": 1.31686437, + "balance_loss_mlp": 1.02224958, + "epoch": 0.32382383886968286, + "flos": 31074570535680.0, + "grad_norm": 2.9700903077917586, + "language_loss": 0.72294825, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.74853933, + "num_input_tokens_seen": 115691155, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.23339844, + "step": 5386, + "time_per_iteration": 2.93491530418396 + }, + { + "auxiliary_loss_clip": 0.0153301, + "auxiliary_loss_mlp": 0.01041185, + "balance_loss_clip": 1.33070588, + "balance_loss_mlp": 1.01722383, + "epoch": 0.3238839621223508, + "flos": 23266994607360.0, + "grad_norm": 2.973072016498827, + "language_loss": 0.9506619, + "learning_rate": 3.1603637569759095e-06, + "loss": 0.97640389, + "num_input_tokens_seen": 115710340, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.23962402, + "step": 5387, + "time_per_iteration": 2.8973398208618164 + }, + { + "auxiliary_loss_clip": 0.01548789, + "auxiliary_loss_mlp": 0.0104381, + "balance_loss_clip": 1.34677732, + "balance_loss_mlp": 1.02026629, + "epoch": 0.3239440853750188, + "flos": 22974540347520.0, + "grad_norm": 2.0904258095686896, + "language_loss": 0.78317177, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.80909777, + "num_input_tokens_seen": 115726745, + "router_z_loss_clip": 2.02148438, + "router_z_loss_mlp": 0.23547363, + "step": 5388, + "time_per_iteration": 2.8879318237304688 + }, + { + "auxiliary_loss_clip": 0.0152445, + "auxiliary_loss_mlp": 0.01043345, + "balance_loss_clip": 1.32763791, + "balance_loss_mlp": 1.01937234, + "epoch": 0.32400420862768675, + "flos": 36260238205440.0, + "grad_norm": 2.3318983243417777, + "language_loss": 0.72587085, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.75154877, + "num_input_tokens_seen": 115749385, + "router_z_loss_clip": 1.96679688, + "router_z_loss_mlp": 0.23986816, + "step": 5389, + "time_per_iteration": 3.063370943069458 + }, + { + "auxiliary_loss_clip": 0.01529827, + "auxiliary_loss_mlp": 0.01044082, + "balance_loss_clip": 1.33353567, + "balance_loss_mlp": 1.02029955, + "epoch": 0.3240643318803547, + "flos": 21626503464960.0, + "grad_norm": 1.9848507858877082, + "language_loss": 0.82194501, + "learning_rate": 3.159411924656557e-06, + "loss": 0.84768409, + "num_input_tokens_seen": 115768105, + "router_z_loss_clip": 1.96289062, + "router_z_loss_mlp": 0.23791504, + "step": 5390, + "time_per_iteration": 2.946657657623291 + }, + { + "auxiliary_loss_clip": 0.01543245, + "auxiliary_loss_mlp": 0.0103788, + "balance_loss_clip": 1.34481645, + "balance_loss_mlp": 1.01376402, + "epoch": 0.3241244551330227, + "flos": 23306249111040.0, + "grad_norm": 1.9009228989205331, + "language_loss": 0.74177372, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.76758504, + "num_input_tokens_seen": 115787340, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.24108887, + "step": 5391, + "time_per_iteration": 2.9430923461914062 + }, + { + "auxiliary_loss_clip": 0.01513652, + "auxiliary_loss_mlp": 0.01039032, + "balance_loss_clip": 1.31759286, + "balance_loss_mlp": 1.01551175, + "epoch": 0.32418457838569065, + "flos": 14104459342080.0, + "grad_norm": 1.5974815487084333, + "language_loss": 0.78075534, + "learning_rate": 3.158777149931855e-06, + "loss": 0.80628216, + "num_input_tokens_seen": 115805565, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.23522949, + "step": 5392, + "time_per_iteration": 2.8958423137664795 + }, + { + "auxiliary_loss_clip": 0.01532149, + "auxiliary_loss_mlp": 0.01037727, + "balance_loss_clip": 1.33083475, + "balance_loss_mlp": 1.01439738, + "epoch": 0.3242447016383586, + "flos": 29764747526400.0, + "grad_norm": 4.1697884738172535, + "language_loss": 0.64123982, + "learning_rate": 3.158459696652067e-06, + "loss": 0.6669386, + "num_input_tokens_seen": 115826725, + "router_z_loss_clip": 2.00976562, + "router_z_loss_mlp": 0.23327637, + "step": 5393, + "time_per_iteration": 2.932168483734131 + }, + { + "auxiliary_loss_clip": 0.01530201, + "auxiliary_loss_mlp": 0.01038961, + "balance_loss_clip": 1.33325398, + "balance_loss_mlp": 1.01575041, + "epoch": 0.3243048248910266, + "flos": 24361469775360.0, + "grad_norm": 2.0299057717223796, + "language_loss": 0.83642662, + "learning_rate": 3.158142199443371e-06, + "loss": 0.86211824, + "num_input_tokens_seen": 115846955, + "router_z_loss_clip": 1.97070312, + "router_z_loss_mlp": 0.23205566, + "step": 5394, + "time_per_iteration": 4.324660062789917 + }, + { + "auxiliary_loss_clip": 0.01502723, + "auxiliary_loss_mlp": 0.01046325, + "balance_loss_clip": 1.31305408, + "balance_loss_mlp": 1.02425885, + "epoch": 0.3243649481436946, + "flos": 24363777260160.0, + "grad_norm": 1.9099050328630327, + "language_loss": 0.82583505, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.85132551, + "num_input_tokens_seen": 115865975, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.2208252, + "step": 5395, + "time_per_iteration": 2.9064745903015137 + }, + { + "auxiliary_loss_clip": 0.01511629, + "auxiliary_loss_mlp": 0.01044172, + "balance_loss_clip": 1.31931937, + "balance_loss_mlp": 1.02117634, + "epoch": 0.32442507139636256, + "flos": 22934335703040.0, + "grad_norm": 2.764506831768972, + "language_loss": 0.84121019, + "learning_rate": 3.157507073287417e-06, + "loss": 0.86676812, + "num_input_tokens_seen": 115884950, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.23022461, + "step": 5396, + "time_per_iteration": 2.88105845451355 + }, + { + "auxiliary_loss_clip": 0.01539563, + "auxiliary_loss_mlp": 0.01046167, + "balance_loss_clip": 1.33820248, + "balance_loss_mlp": 1.02312422, + "epoch": 0.32448519464903053, + "flos": 22210326109440.0, + "grad_norm": 3.2639527019100276, + "language_loss": 0.77893758, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.80479491, + "num_input_tokens_seen": 115904170, + "router_z_loss_clip": 2.01074219, + "router_z_loss_mlp": 0.23034668, + "step": 5397, + "time_per_iteration": 4.277055501937866 + }, + { + "auxiliary_loss_clip": 0.01505481, + "auxiliary_loss_mlp": 0.01042464, + "balance_loss_clip": 1.31426561, + "balance_loss_mlp": 1.01921844, + "epoch": 0.3245453179016985, + "flos": 18846400826880.0, + "grad_norm": 4.194179064047444, + "language_loss": 0.68793637, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.7134158, + "num_input_tokens_seen": 115919255, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.23217773, + "step": 5398, + "time_per_iteration": 2.8372721672058105 + }, + { + "auxiliary_loss_clip": 0.0151075, + "auxiliary_loss_mlp": 0.01043472, + "balance_loss_clip": 1.31416726, + "balance_loss_mlp": 1.02022588, + "epoch": 0.32460544115436646, + "flos": 21188206656000.0, + "grad_norm": 1.370848530291454, + "language_loss": 0.73907667, + "learning_rate": 3.156554054887718e-06, + "loss": 0.76461887, + "num_input_tokens_seen": 115938535, + "router_z_loss_clip": 1.96386719, + "router_z_loss_mlp": 0.23254395, + "step": 5399, + "time_per_iteration": 2.8462655544281006 + }, + { + "auxiliary_loss_clip": 0.01508547, + "auxiliary_loss_mlp": 0.01041828, + "balance_loss_clip": 1.31321478, + "balance_loss_mlp": 1.01824808, + "epoch": 0.3246655644070344, + "flos": 21991177704960.0, + "grad_norm": 2.1924504187901697, + "language_loss": 0.72043198, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.74593568, + "num_input_tokens_seen": 115955005, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.23596191, + "step": 5400, + "time_per_iteration": 2.877751588821411 + }, + { + "auxiliary_loss_clip": 0.01528098, + "auxiliary_loss_mlp": 0.01041086, + "balance_loss_clip": 1.32948875, + "balance_loss_mlp": 1.01788807, + "epoch": 0.3247256876597024, + "flos": 32172167594880.0, + "grad_norm": 1.9622744951842566, + "language_loss": 0.80750036, + "learning_rate": 3.155918489984614e-06, + "loss": 0.83319217, + "num_input_tokens_seen": 115975305, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.23205566, + "step": 5401, + "time_per_iteration": 4.396793842315674 + }, + { + "auxiliary_loss_clip": 0.0151938, + "auxiliary_loss_mlp": 0.01037476, + "balance_loss_clip": 1.32154834, + "balance_loss_mlp": 1.01386082, + "epoch": 0.32478581091237035, + "flos": 21007724572800.0, + "grad_norm": 1.6356878734758173, + "language_loss": 0.87776065, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.90332919, + "num_input_tokens_seen": 115994810, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.23620605, + "step": 5402, + "time_per_iteration": 4.31572151184082 + }, + { + "auxiliary_loss_clip": 0.01506876, + "auxiliary_loss_mlp": 0.0104031, + "balance_loss_clip": 1.31603885, + "balance_loss_mlp": 1.01848292, + "epoch": 0.3248459341650383, + "flos": 17932362497280.0, + "grad_norm": 2.241465406127365, + "language_loss": 0.85958004, + "learning_rate": 3.155282749751332e-06, + "loss": 0.88505185, + "num_input_tokens_seen": 116011095, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.21838379, + "step": 5403, + "time_per_iteration": 3.007122278213501 + }, + { + "auxiliary_loss_clip": 0.01490861, + "auxiliary_loss_mlp": 0.01042183, + "balance_loss_clip": 1.3036325, + "balance_loss_mlp": 1.01971245, + "epoch": 0.3249060574177063, + "flos": 24546431093760.0, + "grad_norm": 2.070887558256594, + "language_loss": 0.88442528, + "learning_rate": 3.154964813916007e-06, + "loss": 0.90975571, + "num_input_tokens_seen": 116028805, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.22460938, + "step": 5404, + "time_per_iteration": 2.9390408992767334 + }, + { + "auxiliary_loss_clip": 0.01504359, + "auxiliary_loss_mlp": 0.0103636, + "balance_loss_clip": 1.31299937, + "balance_loss_mlp": 1.01386499, + "epoch": 0.32496618067037425, + "flos": 26005897215360.0, + "grad_norm": 1.9286427856326507, + "language_loss": 0.73587507, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.76128221, + "num_input_tokens_seen": 116047765, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.22497559, + "step": 5405, + "time_per_iteration": 2.898709535598755 + }, + { + "auxiliary_loss_clip": 0.01497266, + "auxiliary_loss_mlp": 0.01038177, + "balance_loss_clip": 1.30402541, + "balance_loss_mlp": 1.01505089, + "epoch": 0.3250263039230422, + "flos": 19583350433280.0, + "grad_norm": 1.7084522478444493, + "language_loss": 0.83571732, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.86107183, + "num_input_tokens_seen": 116068385, + "router_z_loss_clip": 1.93164062, + "router_z_loss_mlp": 0.23120117, + "step": 5406, + "time_per_iteration": 2.881152391433716 + }, + { + "auxiliary_loss_clip": 0.01507709, + "auxiliary_loss_mlp": 0.01038088, + "balance_loss_clip": 1.31499577, + "balance_loss_mlp": 1.01521158, + "epoch": 0.3250864271757102, + "flos": 16772318334720.0, + "grad_norm": 1.7457883557665272, + "language_loss": 0.8866089, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.91206688, + "num_input_tokens_seen": 116085350, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.22875977, + "step": 5407, + "time_per_iteration": 2.8570916652679443 + }, + { + "auxiliary_loss_clip": 0.01504686, + "auxiliary_loss_mlp": 0.01039427, + "balance_loss_clip": 1.31067848, + "balance_loss_mlp": 1.0162406, + "epoch": 0.3251465504283782, + "flos": 27831168696960.0, + "grad_norm": 1.6567588678314464, + "language_loss": 0.69680762, + "learning_rate": 3.153692632731479e-06, + "loss": 0.72224873, + "num_input_tokens_seen": 116107560, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.23193359, + "step": 5408, + "time_per_iteration": 2.935502052307129 + }, + { + "auxiliary_loss_clip": 0.01518973, + "auxiliary_loss_mlp": 0.01040057, + "balance_loss_clip": 1.31751931, + "balance_loss_mlp": 1.0176934, + "epoch": 0.32520667368104617, + "flos": 19072652664960.0, + "grad_norm": 2.1627715521182376, + "language_loss": 0.78579319, + "learning_rate": 3.153374478034841e-06, + "loss": 0.81138343, + "num_input_tokens_seen": 116125980, + "router_z_loss_clip": 2.01953125, + "router_z_loss_mlp": 0.22351074, + "step": 5409, + "time_per_iteration": 2.8858842849731445 + }, + { + "auxiliary_loss_clip": 0.01508636, + "auxiliary_loss_mlp": 0.01042997, + "balance_loss_clip": 1.31231618, + "balance_loss_mlp": 1.02069294, + "epoch": 0.32526679693371413, + "flos": 29392653139200.0, + "grad_norm": 2.1051456088247944, + "language_loss": 0.83724809, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.86276448, + "num_input_tokens_seen": 116146530, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.22314453, + "step": 5410, + "time_per_iteration": 2.906587600708008 + }, + { + "auxiliary_loss_clip": 0.0148019, + "auxiliary_loss_mlp": 0.01043225, + "balance_loss_clip": 1.29233122, + "balance_loss_mlp": 1.01934719, + "epoch": 0.3253269201863821, + "flos": 20714320172160.0, + "grad_norm": 1.5206381441725918, + "language_loss": 0.71883655, + "learning_rate": 3.152738037445405e-06, + "loss": 0.74407071, + "num_input_tokens_seen": 116165695, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.2388916, + "step": 5411, + "time_per_iteration": 2.872910737991333 + }, + { + "auxiliary_loss_clip": 0.01502282, + "auxiliary_loss_mlp": 0.01042721, + "balance_loss_clip": 1.30956817, + "balance_loss_mlp": 1.01963019, + "epoch": 0.32538704343905006, + "flos": 29105583010560.0, + "grad_norm": 1.4892604854537639, + "language_loss": 0.83559322, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.86104321, + "num_input_tokens_seen": 116185375, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.23059082, + "step": 5412, + "time_per_iteration": 2.9041829109191895 + }, + { + "auxiliary_loss_clip": 0.01516203, + "auxiliary_loss_mlp": 0.01040538, + "balance_loss_clip": 1.31906939, + "balance_loss_mlp": 1.016505, + "epoch": 0.325447166691718, + "flos": 24685577412480.0, + "grad_norm": 1.6937710269166464, + "language_loss": 0.81554866, + "learning_rate": 3.152101422008203e-06, + "loss": 0.84111607, + "num_input_tokens_seen": 116204335, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.24047852, + "step": 5413, + "time_per_iteration": 2.8988823890686035 + }, + { + "auxiliary_loss_clip": 0.01514556, + "auxiliary_loss_mlp": 0.01041942, + "balance_loss_clip": 1.32030785, + "balance_loss_mlp": 1.01821899, + "epoch": 0.325507289944386, + "flos": 21553016630400.0, + "grad_norm": 1.5735404177522316, + "language_loss": 0.77054203, + "learning_rate": 3.151783048751864e-06, + "loss": 0.79610705, + "num_input_tokens_seen": 116222840, + "router_z_loss_clip": 1.94140625, + "router_z_loss_mlp": 0.23742676, + "step": 5414, + "time_per_iteration": 2.9227371215820312 + }, + { + "auxiliary_loss_clip": 0.01286477, + "auxiliary_loss_mlp": 0.01056321, + "balance_loss_clip": 1.17123318, + "balance_loss_mlp": 1.03448153, + "epoch": 0.32556741319705396, + "flos": 71548470213120.0, + "grad_norm": 0.9256993340148384, + "language_loss": 0.64001912, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.66344708, + "num_input_tokens_seen": 116274940, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.21875, + "step": 5415, + "time_per_iteration": 3.2861709594726562 + }, + { + "auxiliary_loss_clip": 0.01512757, + "auxiliary_loss_mlp": 0.01042799, + "balance_loss_clip": 1.32005048, + "balance_loss_mlp": 1.01975608, + "epoch": 0.3256275364497219, + "flos": 23742917107200.0, + "grad_norm": 1.5035062971118203, + "language_loss": 0.75307345, + "learning_rate": 3.151146171224075e-06, + "loss": 0.77862906, + "num_input_tokens_seen": 116297300, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.23022461, + "step": 5416, + "time_per_iteration": 3.0393288135528564 + }, + { + "auxiliary_loss_clip": 0.01288095, + "auxiliary_loss_mlp": 0.01032641, + "balance_loss_clip": 1.17594934, + "balance_loss_mlp": 1.01280427, + "epoch": 0.3256876597023899, + "flos": 67318583840640.0, + "grad_norm": 0.791150301897351, + "language_loss": 0.57998109, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.6031884, + "num_input_tokens_seen": 116362370, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.19824219, + "step": 5417, + "time_per_iteration": 3.3712329864501953 + }, + { + "auxiliary_loss_clip": 0.01287764, + "auxiliary_loss_mlp": 0.01021408, + "balance_loss_clip": 1.17636013, + "balance_loss_mlp": 1.00405121, + "epoch": 0.32574778295505785, + "flos": 71316942716160.0, + "grad_norm": 0.8369639823568511, + "language_loss": 0.63527369, + "learning_rate": 3.150509119089975e-06, + "loss": 0.65836537, + "num_input_tokens_seen": 116430365, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.17382812, + "step": 5418, + "time_per_iteration": 3.399148941040039 + }, + { + "auxiliary_loss_clip": 0.01498001, + "auxiliary_loss_mlp": 0.01046619, + "balance_loss_clip": 1.30534244, + "balance_loss_mlp": 1.0242548, + "epoch": 0.3258079062077258, + "flos": 20785906725120.0, + "grad_norm": 1.927568903761663, + "language_loss": 0.70666611, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.73211235, + "num_input_tokens_seen": 116447525, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.22375488, + "step": 5419, + "time_per_iteration": 2.905410051345825 + }, + { + "auxiliary_loss_clip": 0.01518039, + "auxiliary_loss_mlp": 0.01047439, + "balance_loss_clip": 1.32302856, + "balance_loss_mlp": 1.02484894, + "epoch": 0.3258680294603938, + "flos": 22245282357120.0, + "grad_norm": 1.780609507826102, + "language_loss": 0.78221262, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.80786741, + "num_input_tokens_seen": 116466310, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.22583008, + "step": 5420, + "time_per_iteration": 2.8784520626068115 + }, + { + "auxiliary_loss_clip": 0.01520966, + "auxiliary_loss_mlp": 0.01052593, + "balance_loss_clip": 1.32495403, + "balance_loss_mlp": 1.0296576, + "epoch": 0.3259281527130618, + "flos": 26991386363520.0, + "grad_norm": 3.0329790543617534, + "language_loss": 0.8072837, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.83301932, + "num_input_tokens_seen": 116487825, + "router_z_loss_clip": 1.95898438, + "router_z_loss_mlp": 0.22949219, + "step": 5421, + "time_per_iteration": 2.945810079574585 + }, + { + "auxiliary_loss_clip": 0.01512607, + "auxiliary_loss_mlp": 0.01060058, + "balance_loss_clip": 1.32173634, + "balance_loss_mlp": 1.0393157, + "epoch": 0.32598827596572977, + "flos": 26225588557440.0, + "grad_norm": 1.602110414703409, + "language_loss": 0.7582258, + "learning_rate": 3.149234491389381e-06, + "loss": 0.78395247, + "num_input_tokens_seen": 116509950, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.20751953, + "step": 5422, + "time_per_iteration": 3.045612335205078 + }, + { + "auxiliary_loss_clip": 0.01527001, + "auxiliary_loss_mlp": 0.01056184, + "balance_loss_clip": 1.32902181, + "balance_loss_mlp": 1.03385639, + "epoch": 0.32604839921839773, + "flos": 17648776218240.0, + "grad_norm": 1.9634518307389681, + "language_loss": 0.63733274, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.66316462, + "num_input_tokens_seen": 116527695, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.22338867, + "step": 5423, + "time_per_iteration": 2.8406739234924316 + }, + { + "auxiliary_loss_clip": 0.01492574, + "auxiliary_loss_mlp": 0.01050395, + "balance_loss_clip": 1.30435622, + "balance_loss_mlp": 1.02797222, + "epoch": 0.3261085224710657, + "flos": 23633116680960.0, + "grad_norm": 1.6239986719920534, + "language_loss": 0.75408572, + "learning_rate": 3.148596916016224e-06, + "loss": 0.77951539, + "num_input_tokens_seen": 116547800, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.22399902, + "step": 5424, + "time_per_iteration": 2.950788974761963 + }, + { + "auxiliary_loss_clip": 0.01501307, + "auxiliary_loss_mlp": 0.01051248, + "balance_loss_clip": 1.31097031, + "balance_loss_mlp": 1.02902794, + "epoch": 0.32616864572373366, + "flos": 23271428597760.0, + "grad_norm": 1.684322419331074, + "language_loss": 0.77151203, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.7970376, + "num_input_tokens_seen": 116568460, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.22229004, + "step": 5425, + "time_per_iteration": 2.879028558731079 + }, + { + "auxiliary_loss_clip": 0.01519794, + "auxiliary_loss_mlp": 0.01058266, + "balance_loss_clip": 1.32137609, + "balance_loss_mlp": 1.03349447, + "epoch": 0.32622876897640163, + "flos": 25604456935680.0, + "grad_norm": 2.2629186364749097, + "language_loss": 0.79320002, + "learning_rate": 3.147959166423428e-06, + "loss": 0.81898069, + "num_input_tokens_seen": 116588705, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.24780273, + "step": 5426, + "time_per_iteration": 2.9384098052978516 + }, + { + "auxiliary_loss_clip": 0.01526748, + "auxiliary_loss_mlp": 0.01052975, + "balance_loss_clip": 1.33150578, + "balance_loss_mlp": 1.0299319, + "epoch": 0.3262888922290696, + "flos": 22429157800320.0, + "grad_norm": 1.8913530600317383, + "language_loss": 0.75063497, + "learning_rate": 3.147640226324893e-06, + "loss": 0.77643222, + "num_input_tokens_seen": 116608845, + "router_z_loss_clip": 1.95117188, + "router_z_loss_mlp": 0.23046875, + "step": 5427, + "time_per_iteration": 2.885321617126465 + }, + { + "auxiliary_loss_clip": 0.01527048, + "auxiliary_loss_mlp": 0.01053364, + "balance_loss_clip": 1.32871997, + "balance_loss_mlp": 1.03096437, + "epoch": 0.32634901548173756, + "flos": 19728333331200.0, + "grad_norm": 1.608216911416011, + "language_loss": 0.7967267, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.82253087, + "num_input_tokens_seen": 116628145, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.22412109, + "step": 5428, + "time_per_iteration": 2.9001705646514893 + }, + { + "auxiliary_loss_clip": 0.01502532, + "auxiliary_loss_mlp": 0.01045788, + "balance_loss_clip": 1.30979586, + "balance_loss_mlp": 1.02298355, + "epoch": 0.3264091387344055, + "flos": 16151322447360.0, + "grad_norm": 1.9879254913233098, + "language_loss": 0.71795195, + "learning_rate": 3.147002215584023e-06, + "loss": 0.74343514, + "num_input_tokens_seen": 116646920, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.22802734, + "step": 5429, + "time_per_iteration": 4.295518159866333 + }, + { + "auxiliary_loss_clip": 0.01510332, + "auxiliary_loss_mlp": 0.01050781, + "balance_loss_clip": 1.31622851, + "balance_loss_mlp": 1.02842879, + "epoch": 0.3264692619870735, + "flos": 16407734584320.0, + "grad_norm": 1.63556534076697, + "language_loss": 0.78877807, + "learning_rate": 3.146683144965881e-06, + "loss": 0.81438923, + "num_input_tokens_seen": 116665100, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.22351074, + "step": 5430, + "time_per_iteration": 2.854804754257202 + }, + { + "auxiliary_loss_clip": 0.01514989, + "auxiliary_loss_mlp": 0.01047581, + "balance_loss_clip": 1.31917715, + "balance_loss_mlp": 1.02313137, + "epoch": 0.32652938523974145, + "flos": 22392437005440.0, + "grad_norm": 2.1146935937881977, + "language_loss": 0.85597014, + "learning_rate": 3.146364030865399e-06, + "loss": 0.88159585, + "num_input_tokens_seen": 116682205, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.24462891, + "step": 5431, + "time_per_iteration": 2.8513412475585938 + }, + { + "auxiliary_loss_clip": 0.01503864, + "auxiliary_loss_mlp": 0.01044374, + "balance_loss_clip": 1.31322336, + "balance_loss_mlp": 1.02235615, + "epoch": 0.3265895084924094, + "flos": 21918052828800.0, + "grad_norm": 1.6319410798541392, + "language_loss": 0.7102111, + "learning_rate": 3.146044873294678e-06, + "loss": 0.73569351, + "num_input_tokens_seen": 116702575, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.22021484, + "step": 5432, + "time_per_iteration": 4.33033561706543 + }, + { + "auxiliary_loss_clip": 0.0151839, + "auxiliary_loss_mlp": 0.01036693, + "balance_loss_clip": 1.32270658, + "balance_loss_mlp": 1.01546228, + "epoch": 0.3266496317450774, + "flos": 16074804211200.0, + "grad_norm": 1.7629038126828265, + "language_loss": 0.84985924, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.87541002, + "num_input_tokens_seen": 116720885, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.21228027, + "step": 5433, + "time_per_iteration": 2.8461503982543945 + }, + { + "auxiliary_loss_clip": 0.01504068, + "auxiliary_loss_mlp": 0.01040748, + "balance_loss_clip": 1.31366193, + "balance_loss_mlp": 1.01807451, + "epoch": 0.3267097549977454, + "flos": 22538370044160.0, + "grad_norm": 1.5100930553673961, + "language_loss": 0.86545599, + "learning_rate": 3.145406427790931e-06, + "loss": 0.89090419, + "num_input_tokens_seen": 116740395, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.22668457, + "step": 5434, + "time_per_iteration": 2.934783458709717 + }, + { + "auxiliary_loss_clip": 0.01521272, + "auxiliary_loss_mlp": 0.01039962, + "balance_loss_clip": 1.32528985, + "balance_loss_mlp": 1.01653695, + "epoch": 0.32676987825041337, + "flos": 27281533138560.0, + "grad_norm": 1.7933847140754358, + "language_loss": 0.88738775, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.91300011, + "num_input_tokens_seen": 116758870, + "router_z_loss_clip": 1.95800781, + "router_z_loss_mlp": 0.234375, + "step": 5435, + "time_per_iteration": 2.9163033962249756 + }, + { + "auxiliary_loss_clip": 0.01520373, + "auxiliary_loss_mlp": 0.0104042, + "balance_loss_clip": 1.32493687, + "balance_loss_mlp": 1.01804447, + "epoch": 0.32683000150308134, + "flos": 11515199846400.0, + "grad_norm": 3.6365273095162003, + "language_loss": 0.77744055, + "learning_rate": 3.144767808551479e-06, + "loss": 0.80304843, + "num_input_tokens_seen": 116773440, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.22375488, + "step": 5436, + "time_per_iteration": 4.245404481887817 + }, + { + "auxiliary_loss_clip": 0.01510473, + "auxiliary_loss_mlp": 0.01036345, + "balance_loss_clip": 1.31742275, + "balance_loss_mlp": 1.01426721, + "epoch": 0.3268901247557493, + "flos": 25641313464960.0, + "grad_norm": 1.5135461994555524, + "language_loss": 0.72692013, + "learning_rate": 3.144448433811134e-06, + "loss": 0.75238824, + "num_input_tokens_seen": 116794375, + "router_z_loss_clip": 1.93164062, + "router_z_loss_mlp": 0.2208252, + "step": 5437, + "time_per_iteration": 4.313629627227783 + }, + { + "auxiliary_loss_clip": 0.01528894, + "auxiliary_loss_mlp": 0.01044663, + "balance_loss_clip": 1.33011603, + "balance_loss_mlp": 1.02113116, + "epoch": 0.32695024800841727, + "flos": 24870945934080.0, + "grad_norm": 1.5618424567682183, + "language_loss": 0.64318722, + "learning_rate": 3.144129015673189e-06, + "loss": 0.66892278, + "num_input_tokens_seen": 116815095, + "router_z_loss_clip": 1.98632812, + "router_z_loss_mlp": 0.23522949, + "step": 5438, + "time_per_iteration": 2.9271907806396484 + }, + { + "auxiliary_loss_clip": 0.01519702, + "auxiliary_loss_mlp": 0.01044101, + "balance_loss_clip": 1.3265264, + "balance_loss_mlp": 1.0216186, + "epoch": 0.32701037126108523, + "flos": 28850754441600.0, + "grad_norm": 1.8535277844475424, + "language_loss": 0.74656636, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.7722044, + "num_input_tokens_seen": 116836630, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.22509766, + "step": 5439, + "time_per_iteration": 2.9144742488861084 + }, + { + "auxiliary_loss_clip": 0.0152007, + "auxiliary_loss_mlp": 0.01047928, + "balance_loss_clip": 1.32566869, + "balance_loss_mlp": 1.02487302, + "epoch": 0.3270704945137532, + "flos": 27976739777280.0, + "grad_norm": 2.5940572640578567, + "language_loss": 0.75219041, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.77787036, + "num_input_tokens_seen": 116856880, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.23046875, + "step": 5440, + "time_per_iteration": 2.94856858253479 + }, + { + "auxiliary_loss_clip": 0.01505652, + "auxiliary_loss_mlp": 0.01047725, + "balance_loss_clip": 1.31516647, + "balance_loss_mlp": 1.02492011, + "epoch": 0.32713061776642116, + "flos": 23699816795520.0, + "grad_norm": 1.9647789462822023, + "language_loss": 0.85137582, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.87690961, + "num_input_tokens_seen": 116873770, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.22814941, + "step": 5441, + "time_per_iteration": 2.9243557453155518 + }, + { + "auxiliary_loss_clip": 0.01520946, + "auxiliary_loss_mlp": 0.01045682, + "balance_loss_clip": 1.32485998, + "balance_loss_mlp": 1.0229969, + "epoch": 0.3271907410190891, + "flos": 22465833350400.0, + "grad_norm": 1.915330956881272, + "language_loss": 0.87321818, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.89888448, + "num_input_tokens_seen": 116891225, + "router_z_loss_clip": 1.95996094, + "router_z_loss_mlp": 0.22705078, + "step": 5442, + "time_per_iteration": 2.8568413257598877 + }, + { + "auxiliary_loss_clip": 0.01531824, + "auxiliary_loss_mlp": 0.01039232, + "balance_loss_clip": 1.3354342, + "balance_loss_mlp": 1.01528323, + "epoch": 0.3272508642717571, + "flos": 22830326611200.0, + "grad_norm": 1.567801282241699, + "language_loss": 0.78079498, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.80650556, + "num_input_tokens_seen": 116912300, + "router_z_loss_clip": 1.96484375, + "router_z_loss_mlp": 0.23950195, + "step": 5443, + "time_per_iteration": 2.935483932495117 + }, + { + "auxiliary_loss_clip": 0.01529049, + "auxiliary_loss_mlp": 0.01048809, + "balance_loss_clip": 1.33062911, + "balance_loss_mlp": 1.02525353, + "epoch": 0.32731098752442506, + "flos": 11808694736640.0, + "grad_norm": 1.981174734856352, + "language_loss": 0.82005936, + "learning_rate": 3.142211596174343e-06, + "loss": 0.84583795, + "num_input_tokens_seen": 116929425, + "router_z_loss_clip": 1.98339844, + "router_z_loss_mlp": 0.2355957, + "step": 5444, + "time_per_iteration": 2.8623428344726562 + }, + { + "auxiliary_loss_clip": 0.015122, + "auxiliary_loss_mlp": 0.01043928, + "balance_loss_clip": 1.31907797, + "balance_loss_mlp": 1.02118278, + "epoch": 0.327371110777093, + "flos": 21036346548480.0, + "grad_norm": 2.404673561397496, + "language_loss": 0.5991118, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.62467307, + "num_input_tokens_seen": 116948255, + "router_z_loss_clip": 1.93066406, + "router_z_loss_mlp": 0.22741699, + "step": 5445, + "time_per_iteration": 2.920849084854126 + }, + { + "auxiliary_loss_clip": 0.01521031, + "auxiliary_loss_mlp": 0.0104425, + "balance_loss_clip": 1.32568717, + "balance_loss_mlp": 1.02105212, + "epoch": 0.327431234029761, + "flos": 19071340565760.0, + "grad_norm": 1.9971961738870043, + "language_loss": 0.8896848, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.91533762, + "num_input_tokens_seen": 116964905, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.23205566, + "step": 5446, + "time_per_iteration": 2.980774402618408 + }, + { + "auxiliary_loss_clip": 0.01551909, + "auxiliary_loss_mlp": 0.01048443, + "balance_loss_clip": 1.34970701, + "balance_loss_mlp": 1.01953423, + "epoch": 0.32749135728242895, + "flos": 25860416624640.0, + "grad_norm": 1.5782173473752859, + "language_loss": 0.80014902, + "learning_rate": 3.141252301538802e-06, + "loss": 0.8261525, + "num_input_tokens_seen": 116983650, + "router_z_loss_clip": 2.02246094, + "router_z_loss_mlp": 0.28894043, + "step": 5447, + "time_per_iteration": 2.9395992755889893 + }, + { + "auxiliary_loss_clip": 0.0150705, + "auxiliary_loss_mlp": 0.01042308, + "balance_loss_clip": 1.3133297, + "balance_loss_mlp": 1.02019501, + "epoch": 0.327551480535097, + "flos": 20130135569280.0, + "grad_norm": 2.9768169093366357, + "language_loss": 0.74274057, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.76823413, + "num_input_tokens_seen": 117003265, + "router_z_loss_clip": 1.93457031, + "router_z_loss_mlp": 0.22119141, + "step": 5448, + "time_per_iteration": 2.873718738555908 + }, + { + "auxiliary_loss_clip": 0.01503358, + "auxiliary_loss_mlp": 0.01040833, + "balance_loss_clip": 1.31264472, + "balance_loss_mlp": 1.01753902, + "epoch": 0.32761160378776494, + "flos": 28815255256320.0, + "grad_norm": 1.4624768167977689, + "language_loss": 0.67424893, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.69969082, + "num_input_tokens_seen": 117025370, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.23291016, + "step": 5449, + "time_per_iteration": 2.9469923973083496 + }, + { + "auxiliary_loss_clip": 0.01512642, + "auxiliary_loss_mlp": 0.01038559, + "balance_loss_clip": 1.31728983, + "balance_loss_mlp": 1.01515841, + "epoch": 0.3276717270404329, + "flos": 26947697869440.0, + "grad_norm": 3.1036360438189727, + "language_loss": 0.65988332, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.6853953, + "num_input_tokens_seen": 117044350, + "router_z_loss_clip": 1.95410156, + "router_z_loss_mlp": 0.23400879, + "step": 5450, + "time_per_iteration": 2.907425880432129 + }, + { + "auxiliary_loss_clip": 0.01506489, + "auxiliary_loss_mlp": 0.01042058, + "balance_loss_clip": 1.3136102, + "balance_loss_mlp": 1.01937294, + "epoch": 0.32773185029310087, + "flos": 25349492632320.0, + "grad_norm": 1.9546182005288313, + "language_loss": 0.78698468, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.81247014, + "num_input_tokens_seen": 117064450, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.22680664, + "step": 5451, + "time_per_iteration": 2.9047420024871826 + }, + { + "auxiliary_loss_clip": 0.01527214, + "auxiliary_loss_mlp": 0.01045056, + "balance_loss_clip": 1.3297863, + "balance_loss_mlp": 1.02008128, + "epoch": 0.32779197354576883, + "flos": 26401817629440.0, + "grad_norm": 2.3127574747600375, + "language_loss": 0.71180063, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.73752338, + "num_input_tokens_seen": 117083060, + "router_z_loss_clip": 1.97460938, + "router_z_loss_mlp": 0.24951172, + "step": 5452, + "time_per_iteration": 2.908872127532959 + }, + { + "auxiliary_loss_clip": 0.01495395, + "auxiliary_loss_mlp": 0.01041043, + "balance_loss_clip": 1.3053329, + "balance_loss_mlp": 1.01730847, + "epoch": 0.3278520967984368, + "flos": 24910019458560.0, + "grad_norm": 1.9028361869925718, + "language_loss": 0.79242647, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.81779087, + "num_input_tokens_seen": 117101860, + "router_z_loss_clip": 1.90136719, + "router_z_loss_mlp": 0.23718262, + "step": 5453, + "time_per_iteration": 2.9032766819000244 + }, + { + "auxiliary_loss_clip": 0.01516183, + "auxiliary_loss_mlp": 0.01044688, + "balance_loss_clip": 1.32224107, + "balance_loss_mlp": 1.02235961, + "epoch": 0.32791222005110476, + "flos": 29765154729600.0, + "grad_norm": 1.9077135469317346, + "language_loss": 0.75739741, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.78300613, + "num_input_tokens_seen": 117123100, + "router_z_loss_clip": 1.93945312, + "router_z_loss_mlp": 0.22314453, + "step": 5454, + "time_per_iteration": 2.9371445178985596 + }, + { + "auxiliary_loss_clip": 0.01495864, + "auxiliary_loss_mlp": 0.01044783, + "balance_loss_clip": 1.3081249, + "balance_loss_mlp": 1.02231252, + "epoch": 0.32797234330377273, + "flos": 16516630114560.0, + "grad_norm": 1.9186240681328222, + "language_loss": 0.7803148, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.80572128, + "num_input_tokens_seen": 117140515, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.22473145, + "step": 5455, + "time_per_iteration": 2.8630244731903076 + }, + { + "auxiliary_loss_clip": 0.0152219, + "auxiliary_loss_mlp": 0.0104877, + "balance_loss_clip": 1.32427609, + "balance_loss_mlp": 1.02196014, + "epoch": 0.3280324665564407, + "flos": 26589041187840.0, + "grad_norm": 7.485143526876971, + "language_loss": 0.74745423, + "learning_rate": 3.138372082016768e-06, + "loss": 0.7731638, + "num_input_tokens_seen": 117161485, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.26831055, + "step": 5456, + "time_per_iteration": 2.9126675128936768 + }, + { + "auxiliary_loss_clip": 0.01514285, + "auxiliary_loss_mlp": 0.01042175, + "balance_loss_clip": 1.31959224, + "balance_loss_mlp": 1.01814246, + "epoch": 0.32809258980910866, + "flos": 22940308016640.0, + "grad_norm": 1.4799863293730444, + "language_loss": 0.79074842, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.81631303, + "num_input_tokens_seen": 117181870, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.2401123, + "step": 5457, + "time_per_iteration": 2.8855936527252197 + }, + { + "auxiliary_loss_clip": 0.01520585, + "auxiliary_loss_mlp": 0.01041335, + "balance_loss_clip": 1.32125914, + "balance_loss_mlp": 1.01886427, + "epoch": 0.3281527130617766, + "flos": 22794103509120.0, + "grad_norm": 2.9214752654093457, + "language_loss": 0.79676545, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.8223846, + "num_input_tokens_seen": 117201380, + "router_z_loss_clip": 1.99316406, + "router_z_loss_mlp": 0.22485352, + "step": 5458, + "time_per_iteration": 2.9148786067962646 + }, + { + "auxiliary_loss_clip": 0.01496173, + "auxiliary_loss_mlp": 0.01040065, + "balance_loss_clip": 1.30323219, + "balance_loss_mlp": 1.01672435, + "epoch": 0.3282128363144446, + "flos": 21260336146560.0, + "grad_norm": 6.005699325526837, + "language_loss": 0.73631936, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.76168168, + "num_input_tokens_seen": 117221040, + "router_z_loss_clip": 1.92773438, + "router_z_loss_mlp": 0.23339844, + "step": 5459, + "time_per_iteration": 2.9444568157196045 + }, + { + "auxiliary_loss_clip": 0.01520764, + "auxiliary_loss_mlp": 0.01042311, + "balance_loss_clip": 1.32339096, + "balance_loss_mlp": 1.01918387, + "epoch": 0.32827295956711255, + "flos": 30854336256000.0, + "grad_norm": 2.421076521104659, + "language_loss": 0.84864265, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.87427342, + "num_input_tokens_seen": 117241395, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.23132324, + "step": 5460, + "time_per_iteration": 2.9539787769317627 + }, + { + "auxiliary_loss_clip": 0.01499815, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.30665016, + "balance_loss_mlp": 1.01603615, + "epoch": 0.3283330828197806, + "flos": 25924356806400.0, + "grad_norm": 3.2813644720771937, + "language_loss": 0.77711344, + "learning_rate": 3.136770448642288e-06, + "loss": 0.80249381, + "num_input_tokens_seen": 117259340, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.22192383, + "step": 5461, + "time_per_iteration": 2.8989076614379883 + }, + { + "auxiliary_loss_clip": 0.01500811, + "auxiliary_loss_mlp": 0.01044074, + "balance_loss_clip": 1.30826735, + "balance_loss_mlp": 1.01870608, + "epoch": 0.32839320607244854, + "flos": 38596388434560.0, + "grad_norm": 1.864001637360099, + "language_loss": 0.63040745, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.65585637, + "num_input_tokens_seen": 117282375, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.25366211, + "step": 5462, + "time_per_iteration": 3.0316312313079834 + }, + { + "auxiliary_loss_clip": 0.01479031, + "auxiliary_loss_mlp": 0.01038376, + "balance_loss_clip": 1.29065418, + "balance_loss_mlp": 1.01534486, + "epoch": 0.3284533293251165, + "flos": 26662030329600.0, + "grad_norm": 1.4646882953829832, + "language_loss": 0.79121625, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.81639034, + "num_input_tokens_seen": 117303830, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.23034668, + "step": 5463, + "time_per_iteration": 2.9362776279449463 + }, + { + "auxiliary_loss_clip": 0.01501626, + "auxiliary_loss_mlp": 0.01043327, + "balance_loss_clip": 1.30850768, + "balance_loss_mlp": 1.01978326, + "epoch": 0.32851345257778447, + "flos": 15312445009920.0, + "grad_norm": 2.008715148886521, + "language_loss": 0.70356953, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.72901905, + "num_input_tokens_seen": 117320665, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.2355957, + "step": 5464, + "time_per_iteration": 4.276487112045288 + }, + { + "auxiliary_loss_clip": 0.01486362, + "auxiliary_loss_mlp": 0.01044021, + "balance_loss_clip": 1.29821944, + "balance_loss_mlp": 1.02011919, + "epoch": 0.32857357583045244, + "flos": 23524221150720.0, + "grad_norm": 1.963871477864965, + "language_loss": 0.7294277, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.75473154, + "num_input_tokens_seen": 117339795, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.23913574, + "step": 5465, + "time_per_iteration": 2.8993887901306152 + }, + { + "auxiliary_loss_clip": 0.01499489, + "auxiliary_loss_mlp": 0.01045149, + "balance_loss_clip": 1.30689001, + "balance_loss_mlp": 1.01993608, + "epoch": 0.3286336990831204, + "flos": 21004783660800.0, + "grad_norm": 1.5148101675395698, + "language_loss": 0.8334291, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.85887551, + "num_input_tokens_seen": 117359525, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.2520752, + "step": 5466, + "time_per_iteration": 2.928316831588745 + }, + { + "auxiliary_loss_clip": 0.01500226, + "auxiliary_loss_mlp": 0.01039829, + "balance_loss_clip": 1.30840063, + "balance_loss_mlp": 1.01698804, + "epoch": 0.32869382233578837, + "flos": 23669113559040.0, + "grad_norm": 2.3779425129734744, + "language_loss": 0.79768419, + "learning_rate": 3.134847066213879e-06, + "loss": 0.82308471, + "num_input_tokens_seen": 117380320, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.22827148, + "step": 5467, + "time_per_iteration": 2.9820942878723145 + }, + { + "auxiliary_loss_clip": 0.01502633, + "auxiliary_loss_mlp": 0.01041579, + "balance_loss_clip": 1.30914712, + "balance_loss_mlp": 1.01787972, + "epoch": 0.32875394558845633, + "flos": 25347094657920.0, + "grad_norm": 1.6942416045187358, + "language_loss": 0.75067806, + "learning_rate": 3.134526351787587e-06, + "loss": 0.77612019, + "num_input_tokens_seen": 117400695, + "router_z_loss_clip": 1.93457031, + "router_z_loss_mlp": 0.23706055, + "step": 5468, + "time_per_iteration": 4.501052141189575 + }, + { + "auxiliary_loss_clip": 0.01514672, + "auxiliary_loss_mlp": 0.01049971, + "balance_loss_clip": 1.31634426, + "balance_loss_mlp": 1.02320814, + "epoch": 0.3288140688411243, + "flos": 14911366688640.0, + "grad_norm": 8.4800559162438, + "language_loss": 0.79307592, + "learning_rate": 3.134205594339942e-06, + "loss": 0.81872237, + "num_input_tokens_seen": 117418800, + "router_z_loss_clip": 1.984375, + "router_z_loss_mlp": 0.26757812, + "step": 5469, + "time_per_iteration": 2.8568835258483887 + }, + { + "auxiliary_loss_clip": 0.01495726, + "auxiliary_loss_mlp": 0.01043596, + "balance_loss_clip": 1.30409932, + "balance_loss_mlp": 1.02054048, + "epoch": 0.32887419209379226, + "flos": 18560461818240.0, + "grad_norm": 1.7941850164777267, + "language_loss": 0.82783198, + "learning_rate": 3.133884793883107e-06, + "loss": 0.85322529, + "num_input_tokens_seen": 117438220, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.23059082, + "step": 5470, + "time_per_iteration": 4.264109373092651 + }, + { + "auxiliary_loss_clip": 0.01487198, + "auxiliary_loss_mlp": 0.01045099, + "balance_loss_clip": 1.29602516, + "balance_loss_mlp": 1.02233005, + "epoch": 0.3289343153464602, + "flos": 48122602554240.0, + "grad_norm": 1.9066202395115286, + "language_loss": 0.68927217, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.71459508, + "num_input_tokens_seen": 117462560, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.22790527, + "step": 5471, + "time_per_iteration": 3.119304895401001 + }, + { + "auxiliary_loss_clip": 0.01518322, + "auxiliary_loss_mlp": 0.01051874, + "balance_loss_clip": 1.31959188, + "balance_loss_mlp": 1.02608931, + "epoch": 0.3289944385991282, + "flos": 27611884558080.0, + "grad_norm": 1.880925464627752, + "language_loss": 0.65712148, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.68282342, + "num_input_tokens_seen": 117483665, + "router_z_loss_clip": 1.98730469, + "router_z_loss_mlp": 0.25793457, + "step": 5472, + "time_per_iteration": 4.3394129276275635 + }, + { + "auxiliary_loss_clip": 0.01499208, + "auxiliary_loss_mlp": 0.01044078, + "balance_loss_clip": 1.30488253, + "balance_loss_mlp": 1.01876974, + "epoch": 0.32905456185179616, + "flos": 20129864100480.0, + "grad_norm": 1.7598214677053028, + "language_loss": 0.89464569, + "learning_rate": 3.13292213457912e-06, + "loss": 0.92007852, + "num_input_tokens_seen": 117503565, + "router_z_loss_clip": 1.94140625, + "router_z_loss_mlp": 0.25292969, + "step": 5473, + "time_per_iteration": 2.909392833709717 + }, + { + "auxiliary_loss_clip": 0.01500096, + "auxiliary_loss_mlp": 0.01047548, + "balance_loss_clip": 1.30540586, + "balance_loss_mlp": 1.0214529, + "epoch": 0.3291146851044642, + "flos": 23189300006400.0, + "grad_norm": 2.1565696749155565, + "language_loss": 0.79576689, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.82124329, + "num_input_tokens_seen": 117521460, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.2611084, + "step": 5474, + "time_per_iteration": 2.862248182296753 + }, + { + "auxiliary_loss_clip": 0.01286075, + "auxiliary_loss_mlp": 0.01105278, + "balance_loss_clip": 1.16967762, + "balance_loss_mlp": 1.07399774, + "epoch": 0.32917480835713214, + "flos": 67652509599360.0, + "grad_norm": 0.8428656893570078, + "language_loss": 0.60251576, + "learning_rate": 3.132280146886911e-06, + "loss": 0.62642932, + "num_input_tokens_seen": 117580550, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.3125, + "step": 5475, + "time_per_iteration": 3.3473572731018066 + }, + { + "auxiliary_loss_clip": 0.01510483, + "auxiliary_loss_mlp": 0.01046858, + "balance_loss_clip": 1.31252456, + "balance_loss_mlp": 1.02060783, + "epoch": 0.3292349316098001, + "flos": 27976061105280.0, + "grad_norm": 2.5338870786224694, + "language_loss": 0.77492034, + "learning_rate": 3.131959088630455e-06, + "loss": 0.80049384, + "num_input_tokens_seen": 117600645, + "router_z_loss_clip": 1.97753906, + "router_z_loss_mlp": 0.26257324, + "step": 5476, + "time_per_iteration": 2.91557240486145 + }, + { + "auxiliary_loss_clip": 0.01496337, + "auxiliary_loss_mlp": 0.01048888, + "balance_loss_clip": 1.30502999, + "balance_loss_mlp": 1.0246172, + "epoch": 0.3292950548624681, + "flos": 20272584758400.0, + "grad_norm": 1.9027148884680487, + "language_loss": 0.75443137, + "learning_rate": 3.131637987449997e-06, + "loss": 0.77988362, + "num_input_tokens_seen": 117618880, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.24279785, + "step": 5477, + "time_per_iteration": 2.89992356300354 + }, + { + "auxiliary_loss_clip": 0.01468334, + "auxiliary_loss_mlp": 0.01038691, + "balance_loss_clip": 1.28180289, + "balance_loss_mlp": 1.01506388, + "epoch": 0.32935517811513604, + "flos": 20822582275200.0, + "grad_norm": 2.620283962676017, + "language_loss": 0.76777864, + "learning_rate": 3.131316843357713e-06, + "loss": 0.79284889, + "num_input_tokens_seen": 117636445, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.23620605, + "step": 5478, + "time_per_iteration": 2.86607027053833 + }, + { + "auxiliary_loss_clip": 0.01483455, + "auxiliary_loss_mlp": 0.01038265, + "balance_loss_clip": 1.29631531, + "balance_loss_mlp": 1.0150193, + "epoch": 0.329415301367804, + "flos": 18450932860800.0, + "grad_norm": 1.732896632645341, + "language_loss": 0.81192327, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.83714044, + "num_input_tokens_seen": 117653105, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.23266602, + "step": 5479, + "time_per_iteration": 2.8352530002593994 + }, + { + "auxiliary_loss_clip": 0.01278669, + "auxiliary_loss_mlp": 0.0104596, + "balance_loss_clip": 1.16016567, + "balance_loss_mlp": 1.02249944, + "epoch": 0.32947542462047197, + "flos": 66357074436480.0, + "grad_norm": 0.7465146669616054, + "language_loss": 0.5656755, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.58892179, + "num_input_tokens_seen": 117719225, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.234375, + "step": 5480, + "time_per_iteration": 3.443718910217285 + }, + { + "auxiliary_loss_clip": 0.01489035, + "auxiliary_loss_mlp": 0.01048398, + "balance_loss_clip": 1.29775238, + "balance_loss_mlp": 1.02366245, + "epoch": 0.32953554787313993, + "flos": 23231947870080.0, + "grad_norm": 1.7748061346057895, + "language_loss": 0.78535223, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.81072664, + "num_input_tokens_seen": 117738725, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.24719238, + "step": 5481, + "time_per_iteration": 2.8862950801849365 + }, + { + "auxiliary_loss_clip": 0.01497996, + "auxiliary_loss_mlp": 0.01046617, + "balance_loss_clip": 1.30388677, + "balance_loss_mlp": 1.0228585, + "epoch": 0.3295956711258079, + "flos": 27019103443200.0, + "grad_norm": 1.7678688295761356, + "language_loss": 0.78698111, + "learning_rate": 3.130031838113899e-06, + "loss": 0.81242728, + "num_input_tokens_seen": 117757765, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.2376709, + "step": 5482, + "time_per_iteration": 2.9602763652801514 + }, + { + "auxiliary_loss_clip": 0.01507473, + "auxiliary_loss_mlp": 0.01051028, + "balance_loss_clip": 1.31193137, + "balance_loss_mlp": 1.02682805, + "epoch": 0.32965579437847586, + "flos": 19181005257600.0, + "grad_norm": 1.733154641005759, + "language_loss": 0.74982369, + "learning_rate": 3.129710479645185e-06, + "loss": 0.77540863, + "num_input_tokens_seen": 117776810, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.24206543, + "step": 5483, + "time_per_iteration": 2.959923028945923 + }, + { + "auxiliary_loss_clip": 0.01492424, + "auxiliary_loss_mlp": 0.01051287, + "balance_loss_clip": 1.3003366, + "balance_loss_mlp": 1.02705216, + "epoch": 0.32971591763114383, + "flos": 30494819923200.0, + "grad_norm": 1.6544760745359672, + "language_loss": 0.76297891, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.78841603, + "num_input_tokens_seen": 117797730, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.2421875, + "step": 5484, + "time_per_iteration": 2.946425437927246 + }, + { + "auxiliary_loss_clip": 0.01491026, + "auxiliary_loss_mlp": 0.01052601, + "balance_loss_clip": 1.29965031, + "balance_loss_mlp": 1.02802038, + "epoch": 0.3297760408838118, + "flos": 16298205626880.0, + "grad_norm": 1.9354436082571662, + "language_loss": 0.7288326, + "learning_rate": 3.129067634203742e-06, + "loss": 0.75426888, + "num_input_tokens_seen": 117815365, + "router_z_loss_clip": 1.91308594, + "router_z_loss_mlp": 0.24621582, + "step": 5485, + "time_per_iteration": 2.8884856700897217 + }, + { + "auxiliary_loss_clip": 0.01488261, + "auxiliary_loss_mlp": 0.01049583, + "balance_loss_clip": 1.3004024, + "balance_loss_mlp": 1.02609921, + "epoch": 0.32983616413647976, + "flos": 29542205761920.0, + "grad_norm": 1.6868574425659706, + "language_loss": 0.81224537, + "learning_rate": 3.128746147255388e-06, + "loss": 0.83762378, + "num_input_tokens_seen": 117836095, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.23498535, + "step": 5486, + "time_per_iteration": 2.9129762649536133 + }, + { + "auxiliary_loss_clip": 0.01502015, + "auxiliary_loss_mlp": 0.01051608, + "balance_loss_clip": 1.31204295, + "balance_loss_mlp": 1.02856517, + "epoch": 0.3298962873891478, + "flos": 20641150051200.0, + "grad_norm": 8.241651619497196, + "language_loss": 0.85246831, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.87800455, + "num_input_tokens_seen": 117854655, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.23046875, + "step": 5487, + "time_per_iteration": 2.8778603076934814 + }, + { + "auxiliary_loss_clip": 0.01510028, + "auxiliary_loss_mlp": 0.01056481, + "balance_loss_clip": 1.31499732, + "balance_loss_mlp": 1.03150654, + "epoch": 0.32995641064181574, + "flos": 14984265340800.0, + "grad_norm": 2.249790025290847, + "language_loss": 0.75569022, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.78135526, + "num_input_tokens_seen": 117873300, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.24987793, + "step": 5488, + "time_per_iteration": 3.04264235496521 + }, + { + "auxiliary_loss_clip": 0.01497044, + "auxiliary_loss_mlp": 0.01054813, + "balance_loss_clip": 1.30467916, + "balance_loss_mlp": 1.03016019, + "epoch": 0.3300165338944837, + "flos": 18670759937280.0, + "grad_norm": 2.43678389434333, + "language_loss": 0.72985041, + "learning_rate": 3.127781429646098e-06, + "loss": 0.75536901, + "num_input_tokens_seen": 117891540, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.2467041, + "step": 5489, + "time_per_iteration": 3.0880839824676514 + }, + { + "auxiliary_loss_clip": 0.01499688, + "auxiliary_loss_mlp": 0.01044579, + "balance_loss_clip": 1.30891347, + "balance_loss_mlp": 1.02104664, + "epoch": 0.3300766571471517, + "flos": 25592240839680.0, + "grad_norm": 4.789586345656304, + "language_loss": 0.90425789, + "learning_rate": 3.127459771562238e-06, + "loss": 0.92970061, + "num_input_tokens_seen": 117907690, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.23547363, + "step": 5490, + "time_per_iteration": 2.8745641708374023 + }, + { + "auxiliary_loss_clip": 0.01482434, + "auxiliary_loss_mlp": 0.01043888, + "balance_loss_clip": 1.29275775, + "balance_loss_mlp": 1.02077377, + "epoch": 0.33013678039981964, + "flos": 11370714641280.0, + "grad_norm": 2.1894375104112704, + "language_loss": 0.83969396, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.86495715, + "num_input_tokens_seen": 117925640, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.2310791, + "step": 5491, + "time_per_iteration": 2.864011287689209 + }, + { + "auxiliary_loss_clip": 0.01495683, + "auxiliary_loss_mlp": 0.01046875, + "balance_loss_clip": 1.30456567, + "balance_loss_mlp": 1.02271116, + "epoch": 0.3301969036524876, + "flos": 24830469820800.0, + "grad_norm": 2.083382881281837, + "language_loss": 0.77865267, + "learning_rate": 3.126816327146554e-06, + "loss": 0.80407822, + "num_input_tokens_seen": 117944525, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.24169922, + "step": 5492, + "time_per_iteration": 2.9445090293884277 + }, + { + "auxiliary_loss_clip": 0.01502609, + "auxiliary_loss_mlp": 0.01044995, + "balance_loss_clip": 1.30656552, + "balance_loss_mlp": 1.02084303, + "epoch": 0.33025702690515557, + "flos": 15969663999360.0, + "grad_norm": 3.5862048078061775, + "language_loss": 0.76565647, + "learning_rate": 3.12649454083913e-06, + "loss": 0.79113257, + "num_input_tokens_seen": 117962515, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.24133301, + "step": 5493, + "time_per_iteration": 2.899712085723877 + }, + { + "auxiliary_loss_clip": 0.01276001, + "auxiliary_loss_mlp": 0.01055001, + "balance_loss_clip": 1.16355443, + "balance_loss_mlp": 1.02372038, + "epoch": 0.33031715015782354, + "flos": 59446298568960.0, + "grad_norm": 0.7896118684436624, + "language_loss": 0.53931975, + "learning_rate": 3.12617271181492e-06, + "loss": 0.56262982, + "num_input_tokens_seen": 118018780, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.3125, + "step": 5494, + "time_per_iteration": 3.333669424057007 + }, + { + "auxiliary_loss_clip": 0.01505908, + "auxiliary_loss_mlp": 0.01045205, + "balance_loss_clip": 1.31280756, + "balance_loss_mlp": 1.021029, + "epoch": 0.3303772734104915, + "flos": 23194503158400.0, + "grad_norm": 1.880385432093056, + "language_loss": 0.87636709, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.90187824, + "num_input_tokens_seen": 118038610, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.24194336, + "step": 5495, + "time_per_iteration": 2.9754135608673096 + }, + { + "auxiliary_loss_clip": 0.01502848, + "auxiliary_loss_mlp": 0.01048063, + "balance_loss_clip": 1.30751669, + "balance_loss_mlp": 1.02097821, + "epoch": 0.33043739666315947, + "flos": 33084577111680.0, + "grad_norm": 1.9749240192229451, + "language_loss": 0.74816823, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.77367735, + "num_input_tokens_seen": 118055905, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.27099609, + "step": 5496, + "time_per_iteration": 2.9954674243927 + }, + { + "auxiliary_loss_clip": 0.01491699, + "auxiliary_loss_mlp": 0.01041808, + "balance_loss_clip": 1.30207264, + "balance_loss_mlp": 1.0175724, + "epoch": 0.33049751991582743, + "flos": 24905042530560.0, + "grad_norm": 2.180883293504036, + "language_loss": 0.73512149, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.76045656, + "num_input_tokens_seen": 118073695, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.2421875, + "step": 5497, + "time_per_iteration": 2.9547500610351562 + }, + { + "auxiliary_loss_clip": 0.0149804, + "auxiliary_loss_mlp": 0.01040791, + "balance_loss_clip": 1.30863428, + "balance_loss_mlp": 1.01723492, + "epoch": 0.3305576431684954, + "flos": 29472519490560.0, + "grad_norm": 2.2325434249205074, + "language_loss": 0.82092321, + "learning_rate": 3.124884968794321e-06, + "loss": 0.84631151, + "num_input_tokens_seen": 118094030, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.2355957, + "step": 5498, + "time_per_iteration": 2.9362800121307373 + }, + { + "auxiliary_loss_clip": 0.01501008, + "auxiliary_loss_mlp": 0.01042423, + "balance_loss_clip": 1.30696726, + "balance_loss_mlp": 1.01760364, + "epoch": 0.33061776642116336, + "flos": 22641067036800.0, + "grad_norm": 2.040336272019396, + "language_loss": 0.7666803, + "learning_rate": 3.12456292636927e-06, + "loss": 0.79211462, + "num_input_tokens_seen": 118111665, + "router_z_loss_clip": 1.94140625, + "router_z_loss_mlp": 0.24829102, + "step": 5499, + "time_per_iteration": 2.884650230407715 + }, + { + "auxiliary_loss_clip": 0.01494068, + "auxiliary_loss_mlp": 0.0103801, + "balance_loss_clip": 1.30261469, + "balance_loss_mlp": 1.01418042, + "epoch": 0.3306778896738313, + "flos": 25787517972480.0, + "grad_norm": 1.541483116706843, + "language_loss": 0.7957356, + "learning_rate": 3.124240841300681e-06, + "loss": 0.82105637, + "num_input_tokens_seen": 118132435, + "router_z_loss_clip": 1.91503906, + "router_z_loss_mlp": 0.23815918, + "step": 5500, + "time_per_iteration": 4.288754940032959 + }, + { + "auxiliary_loss_clip": 0.01505524, + "auxiliary_loss_mlp": 0.01040056, + "balance_loss_clip": 1.31191134, + "balance_loss_mlp": 1.01578498, + "epoch": 0.33073801292649935, + "flos": 36954539948160.0, + "grad_norm": 2.236103787474974, + "language_loss": 0.67284137, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.69829714, + "num_input_tokens_seen": 118155255, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.24279785, + "step": 5501, + "time_per_iteration": 3.0692596435546875 + }, + { + "auxiliary_loss_clip": 0.01501561, + "auxiliary_loss_mlp": 0.01049527, + "balance_loss_clip": 1.30891538, + "balance_loss_mlp": 1.02415919, + "epoch": 0.3307981361791673, + "flos": 12975073171200.0, + "grad_norm": 2.479691276403042, + "language_loss": 0.78425163, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.80976248, + "num_input_tokens_seen": 118169865, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.25390625, + "step": 5502, + "time_per_iteration": 4.217082977294922 + }, + { + "auxiliary_loss_clip": 0.01511523, + "auxiliary_loss_mlp": 0.01045956, + "balance_loss_clip": 1.31814432, + "balance_loss_mlp": 1.02086282, + "epoch": 0.3308582594318353, + "flos": 25385308531200.0, + "grad_norm": 1.7335875634098998, + "language_loss": 0.73224354, + "learning_rate": 3.123274330355824e-06, + "loss": 0.75781834, + "num_input_tokens_seen": 118190760, + "router_z_loss_clip": 1.93164062, + "router_z_loss_mlp": 0.25134277, + "step": 5503, + "time_per_iteration": 2.909001350402832 + }, + { + "auxiliary_loss_clip": 0.01486228, + "auxiliary_loss_mlp": 0.01043275, + "balance_loss_clip": 1.29525316, + "balance_loss_mlp": 1.01869452, + "epoch": 0.33091838268450324, + "flos": 26479693209600.0, + "grad_norm": 2.8003981361778303, + "language_loss": 0.75608897, + "learning_rate": 3.12295207483523e-06, + "loss": 0.78138399, + "num_input_tokens_seen": 118213620, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.24584961, + "step": 5504, + "time_per_iteration": 2.9777297973632812 + }, + { + "auxiliary_loss_clip": 0.0148874, + "auxiliary_loss_mlp": 0.01045823, + "balance_loss_clip": 1.29814315, + "balance_loss_mlp": 1.02324462, + "epoch": 0.3309785059371712, + "flos": 24981515521920.0, + "grad_norm": 1.6332632477629518, + "language_loss": 0.70753288, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.73287851, + "num_input_tokens_seen": 118235010, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.22595215, + "step": 5505, + "time_per_iteration": 4.339970827102661 + }, + { + "auxiliary_loss_clip": 0.01493398, + "auxiliary_loss_mlp": 0.01052199, + "balance_loss_clip": 1.30413222, + "balance_loss_mlp": 1.02866721, + "epoch": 0.3310386291898392, + "flos": 20455826774400.0, + "grad_norm": 1.8889255308854678, + "language_loss": 0.82462907, + "learning_rate": 3.122307436058899e-06, + "loss": 0.85008502, + "num_input_tokens_seen": 118255820, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.23522949, + "step": 5506, + "time_per_iteration": 2.9033901691436768 + }, + { + "auxiliary_loss_clip": 0.01496619, + "auxiliary_loss_mlp": 0.0104632, + "balance_loss_clip": 1.30488026, + "balance_loss_mlp": 1.02225208, + "epoch": 0.33109875244250714, + "flos": 23192376652800.0, + "grad_norm": 1.9358742727891696, + "language_loss": 0.79731125, + "learning_rate": 3.121985052827606e-06, + "loss": 0.82274067, + "num_input_tokens_seen": 118274160, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.24084473, + "step": 5507, + "time_per_iteration": 2.895332098007202 + }, + { + "auxiliary_loss_clip": 0.01491163, + "auxiliary_loss_mlp": 0.01049768, + "balance_loss_clip": 1.30039024, + "balance_loss_mlp": 1.02556884, + "epoch": 0.3311588756951751, + "flos": 24178499228160.0, + "grad_norm": 2.023297746423264, + "language_loss": 0.72484231, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.75025165, + "num_input_tokens_seen": 118294385, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.2421875, + "step": 5508, + "time_per_iteration": 4.273925065994263 + }, + { + "auxiliary_loss_clip": 0.01482953, + "auxiliary_loss_mlp": 0.01042083, + "balance_loss_clip": 1.29661322, + "balance_loss_mlp": 1.02001774, + "epoch": 0.33121899894784307, + "flos": 28156995636480.0, + "grad_norm": 2.379950796173046, + "language_loss": 0.72565031, + "learning_rate": 3.12134015873989e-06, + "loss": 0.75090075, + "num_input_tokens_seen": 118313105, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.22070312, + "step": 5509, + "time_per_iteration": 2.888230562210083 + }, + { + "auxiliary_loss_clip": 0.01488687, + "auxiliary_loss_mlp": 0.01041217, + "balance_loss_clip": 1.29951489, + "balance_loss_mlp": 1.01837599, + "epoch": 0.33127912220051103, + "flos": 29579017046400.0, + "grad_norm": 1.8181885874476666, + "language_loss": 0.73830944, + "learning_rate": 3.121017647907921e-06, + "loss": 0.76360852, + "num_input_tokens_seen": 118335250, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.2286377, + "step": 5510, + "time_per_iteration": 2.955430746078491 + }, + { + "auxiliary_loss_clip": 0.01481679, + "auxiliary_loss_mlp": 0.01046051, + "balance_loss_clip": 1.2925874, + "balance_loss_mlp": 1.02305532, + "epoch": 0.331339245453179, + "flos": 14436982512000.0, + "grad_norm": 2.0671870588529235, + "language_loss": 0.88013697, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.90541428, + "num_input_tokens_seen": 118351470, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.22998047, + "step": 5511, + "time_per_iteration": 2.811195135116577 + }, + { + "auxiliary_loss_clip": 0.01467956, + "auxiliary_loss_mlp": 0.01042937, + "balance_loss_clip": 1.28639698, + "balance_loss_mlp": 1.02053738, + "epoch": 0.33139936870584696, + "flos": 20897019250560.0, + "grad_norm": 1.7232927297574487, + "language_loss": 0.7416966, + "learning_rate": 3.12037249872891e-06, + "loss": 0.76680553, + "num_input_tokens_seen": 118370970, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.22387695, + "step": 5512, + "time_per_iteration": 2.9063351154327393 + }, + { + "auxiliary_loss_clip": 0.01472882, + "auxiliary_loss_mlp": 0.01046845, + "balance_loss_clip": 1.28642035, + "balance_loss_mlp": 1.02145314, + "epoch": 0.33145949195851493, + "flos": 36298316344320.0, + "grad_norm": 2.9305704900878404, + "language_loss": 0.74027205, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.76546925, + "num_input_tokens_seen": 118393125, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.25402832, + "step": 5513, + "time_per_iteration": 2.9898509979248047 + }, + { + "auxiliary_loss_clip": 0.01495163, + "auxiliary_loss_mlp": 0.01044604, + "balance_loss_clip": 1.30242133, + "balance_loss_mlp": 1.02088094, + "epoch": 0.33151961521118295, + "flos": 14287022686080.0, + "grad_norm": 1.8678627210777785, + "language_loss": 0.69892496, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.72432268, + "num_input_tokens_seen": 118410860, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.23730469, + "step": 5514, + "time_per_iteration": 2.8323986530303955 + }, + { + "auxiliary_loss_clip": 0.01497084, + "auxiliary_loss_mlp": 0.01054692, + "balance_loss_clip": 1.3045311, + "balance_loss_mlp": 1.02986073, + "epoch": 0.3315797384638509, + "flos": 20783327771520.0, + "grad_norm": 4.1184658548474795, + "language_loss": 0.67573577, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.70125353, + "num_input_tokens_seen": 118429570, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.24841309, + "step": 5515, + "time_per_iteration": 2.8454108238220215 + }, + { + "auxiliary_loss_clip": 0.01506806, + "auxiliary_loss_mlp": 0.0104311, + "balance_loss_clip": 1.31332612, + "balance_loss_mlp": 1.01919687, + "epoch": 0.3316398617165189, + "flos": 24689785178880.0, + "grad_norm": 3.5017898227395383, + "language_loss": 0.69713706, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.72263616, + "num_input_tokens_seen": 118450285, + "router_z_loss_clip": 1.93457031, + "router_z_loss_mlp": 0.23913574, + "step": 5516, + "time_per_iteration": 2.903982639312744 + }, + { + "auxiliary_loss_clip": 0.01499634, + "auxiliary_loss_mlp": 0.01046803, + "balance_loss_clip": 1.30555868, + "balance_loss_mlp": 1.02380705, + "epoch": 0.33169998496918685, + "flos": 18597137368320.0, + "grad_norm": 2.364697395287522, + "language_loss": 0.81609118, + "learning_rate": 3.118758882514359e-06, + "loss": 0.84155554, + "num_input_tokens_seen": 118468270, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.22998047, + "step": 5517, + "time_per_iteration": 2.854605197906494 + }, + { + "auxiliary_loss_clip": 0.01465222, + "auxiliary_loss_mlp": 0.01044385, + "balance_loss_clip": 1.2820487, + "balance_loss_mlp": 1.02169919, + "epoch": 0.3317601082218548, + "flos": 20203305690240.0, + "grad_norm": 1.66165754560778, + "language_loss": 0.75290138, + "learning_rate": 3.118436031952143e-06, + "loss": 0.77799743, + "num_input_tokens_seen": 118486615, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.22680664, + "step": 5518, + "time_per_iteration": 2.852267265319824 + }, + { + "auxiliary_loss_clip": 0.01287218, + "auxiliary_loss_mlp": 0.01041122, + "balance_loss_clip": 1.17178583, + "balance_loss_mlp": 1.01699376, + "epoch": 0.3318202314745228, + "flos": 69006383061120.0, + "grad_norm": 0.625001371018603, + "language_loss": 0.5432682, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.56655157, + "num_input_tokens_seen": 118553580, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.24121094, + "step": 5519, + "time_per_iteration": 3.5325875282287598 + }, + { + "auxiliary_loss_clip": 0.01488704, + "auxiliary_loss_mlp": 0.01041171, + "balance_loss_clip": 1.30038309, + "balance_loss_mlp": 1.01783013, + "epoch": 0.33188035472719074, + "flos": 21508287505920.0, + "grad_norm": 2.8777742518074074, + "language_loss": 0.7930547, + "learning_rate": 3.117790203606336e-06, + "loss": 0.81835347, + "num_input_tokens_seen": 118570280, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.23327637, + "step": 5520, + "time_per_iteration": 2.876889944076538 + }, + { + "auxiliary_loss_clip": 0.01478694, + "auxiliary_loss_mlp": 0.01043543, + "balance_loss_clip": 1.29201329, + "balance_loss_mlp": 1.0209409, + "epoch": 0.3319404779798587, + "flos": 28880643271680.0, + "grad_norm": 1.8334972481450111, + "language_loss": 0.76647294, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.79169536, + "num_input_tokens_seen": 118590455, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.22595215, + "step": 5521, + "time_per_iteration": 2.9918320178985596 + }, + { + "auxiliary_loss_clip": 0.01492576, + "auxiliary_loss_mlp": 0.01046591, + "balance_loss_clip": 1.30200219, + "balance_loss_mlp": 1.02220023, + "epoch": 0.33200060123252667, + "flos": 23087553154560.0, + "grad_norm": 2.1045297355758326, + "language_loss": 0.71247727, + "learning_rate": 3.117144205713664e-06, + "loss": 0.73786896, + "num_input_tokens_seen": 118609495, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.24365234, + "step": 5522, + "time_per_iteration": 3.0218424797058105 + }, + { + "auxiliary_loss_clip": 0.01480911, + "auxiliary_loss_mlp": 0.01047174, + "balance_loss_clip": 1.29337335, + "balance_loss_mlp": 1.02445269, + "epoch": 0.33206072448519464, + "flos": 21152436001920.0, + "grad_norm": 1.8519489448444226, + "language_loss": 0.74915779, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.77443862, + "num_input_tokens_seen": 118628720, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.22741699, + "step": 5523, + "time_per_iteration": 2.917095422744751 + }, + { + "auxiliary_loss_clip": 0.01479266, + "auxiliary_loss_mlp": 0.01041402, + "balance_loss_clip": 1.29230487, + "balance_loss_mlp": 1.01738131, + "epoch": 0.3321208477378626, + "flos": 13086954858240.0, + "grad_norm": 1.8375580380587146, + "language_loss": 0.82312369, + "learning_rate": 3.116498038372114e-06, + "loss": 0.84833038, + "num_input_tokens_seen": 118645955, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.2401123, + "step": 5524, + "time_per_iteration": 2.8747291564941406 + }, + { + "auxiliary_loss_clip": 0.01479688, + "auxiliary_loss_mlp": 0.01041446, + "balance_loss_clip": 1.29284048, + "balance_loss_mlp": 1.01771164, + "epoch": 0.33218097099053057, + "flos": 21225379898880.0, + "grad_norm": 1.5949591998226134, + "language_loss": 0.83633453, + "learning_rate": 3.116174891188636e-06, + "loss": 0.86154586, + "num_input_tokens_seen": 118665605, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.23730469, + "step": 5525, + "time_per_iteration": 2.865344762802124 + }, + { + "auxiliary_loss_clip": 0.01278135, + "auxiliary_loss_mlp": 0.01019595, + "balance_loss_clip": 1.16222978, + "balance_loss_mlp": 0.99775618, + "epoch": 0.33224109424319853, + "flos": 64381526415360.0, + "grad_norm": 0.7724121775373993, + "language_loss": 0.52711022, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.55008751, + "num_input_tokens_seen": 118728155, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.21875, + "step": 5526, + "time_per_iteration": 3.3488171100616455 + }, + { + "auxiliary_loss_clip": 0.01497189, + "auxiliary_loss_mlp": 0.01040799, + "balance_loss_clip": 1.3044039, + "balance_loss_mlp": 1.01777995, + "epoch": 0.33230121749586655, + "flos": 17354466921600.0, + "grad_norm": 2.061833455328111, + "language_loss": 0.78849041, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.81387031, + "num_input_tokens_seen": 118743955, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.23046875, + "step": 5527, + "time_per_iteration": 2.829340934753418 + }, + { + "auxiliary_loss_clip": 0.01482088, + "auxiliary_loss_mlp": 0.01043178, + "balance_loss_clip": 1.29509449, + "balance_loss_mlp": 1.01993227, + "epoch": 0.3323613407485345, + "flos": 21007091145600.0, + "grad_norm": 2.019821475984275, + "language_loss": 0.73710847, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.76236117, + "num_input_tokens_seen": 118763275, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.23266602, + "step": 5528, + "time_per_iteration": 2.8849806785583496 + }, + { + "auxiliary_loss_clip": 0.0148844, + "auxiliary_loss_mlp": 0.01041508, + "balance_loss_clip": 1.29845464, + "balance_loss_mlp": 1.01879835, + "epoch": 0.3324214640012025, + "flos": 13160532182400.0, + "grad_norm": 4.183396210935699, + "language_loss": 0.84072047, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.86601996, + "num_input_tokens_seen": 118781110, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.22705078, + "step": 5529, + "time_per_iteration": 2.87052059173584 + }, + { + "auxiliary_loss_clip": 0.01507956, + "auxiliary_loss_mlp": 0.01040789, + "balance_loss_clip": 1.31383991, + "balance_loss_mlp": 1.01750708, + "epoch": 0.33248158725387045, + "flos": 22283767699200.0, + "grad_norm": 1.9643627269463968, + "language_loss": 0.70735854, + "learning_rate": 3.114558520634423e-06, + "loss": 0.73284596, + "num_input_tokens_seen": 118800620, + "router_z_loss_clip": 1.94140625, + "router_z_loss_mlp": 0.23291016, + "step": 5530, + "time_per_iteration": 2.8942863941192627 + }, + { + "auxiliary_loss_clip": 0.01495373, + "auxiliary_loss_mlp": 0.01044463, + "balance_loss_clip": 1.30343556, + "balance_loss_mlp": 1.02107382, + "epoch": 0.3325417105065384, + "flos": 20750814743040.0, + "grad_norm": 2.522753677392257, + "language_loss": 0.76969457, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.795093, + "num_input_tokens_seen": 118818725, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.23400879, + "step": 5531, + "time_per_iteration": 2.943782091140747 + }, + { + "auxiliary_loss_clip": 0.01503949, + "auxiliary_loss_mlp": 0.01044439, + "balance_loss_clip": 1.31108522, + "balance_loss_mlp": 1.02063251, + "epoch": 0.3326018337592064, + "flos": 24801304907520.0, + "grad_norm": 1.9099913082059983, + "language_loss": 0.73881197, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.76429582, + "num_input_tokens_seen": 118839390, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.23803711, + "step": 5532, + "time_per_iteration": 2.975524663925171 + }, + { + "auxiliary_loss_clip": 0.01483334, + "auxiliary_loss_mlp": 0.01033069, + "balance_loss_clip": 1.29458642, + "balance_loss_mlp": 1.01165891, + "epoch": 0.33266195701187434, + "flos": 14509835919360.0, + "grad_norm": 1.868930217584913, + "language_loss": 0.67013073, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.69529474, + "num_input_tokens_seen": 118856275, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.21411133, + "step": 5533, + "time_per_iteration": 2.843625068664551 + }, + { + "auxiliary_loss_clip": 0.01486225, + "auxiliary_loss_mlp": 0.01038206, + "balance_loss_clip": 1.29763329, + "balance_loss_mlp": 1.01522255, + "epoch": 0.3327220802645423, + "flos": 15312761723520.0, + "grad_norm": 1.75535778742511, + "language_loss": 0.71995223, + "learning_rate": 3.113264663362451e-06, + "loss": 0.74519658, + "num_input_tokens_seen": 118873830, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.2298584, + "step": 5534, + "time_per_iteration": 4.282532215118408 + }, + { + "auxiliary_loss_clip": 0.01490755, + "auxiliary_loss_mlp": 0.01044204, + "balance_loss_clip": 1.30102086, + "balance_loss_mlp": 1.01947999, + "epoch": 0.3327822035172103, + "flos": 23488088538240.0, + "grad_norm": 2.043015297970604, + "language_loss": 0.67278391, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.69813347, + "num_input_tokens_seen": 118891560, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.24719238, + "step": 5535, + "time_per_iteration": 2.907738208770752 + }, + { + "auxiliary_loss_clip": 0.01488351, + "auxiliary_loss_mlp": 0.0103818, + "balance_loss_clip": 1.29771948, + "balance_loss_mlp": 1.01568508, + "epoch": 0.33284232676987824, + "flos": 25385489510400.0, + "grad_norm": 2.182666457450704, + "language_loss": 0.73486614, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.76013142, + "num_input_tokens_seen": 118910260, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.22509766, + "step": 5536, + "time_per_iteration": 2.896592378616333 + }, + { + "auxiliary_loss_clip": 0.01478651, + "auxiliary_loss_mlp": 0.01036449, + "balance_loss_clip": 1.28908837, + "balance_loss_mlp": 1.01353681, + "epoch": 0.3329024500225462, + "flos": 23704431765120.0, + "grad_norm": 1.5226506075550612, + "language_loss": 0.82123673, + "learning_rate": 3.112293827106917e-06, + "loss": 0.84638774, + "num_input_tokens_seen": 118929985, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.22900391, + "step": 5537, + "time_per_iteration": 4.2844343185424805 + }, + { + "auxiliary_loss_clip": 0.01497234, + "auxiliary_loss_mlp": 0.0103936, + "balance_loss_clip": 1.3044548, + "balance_loss_mlp": 1.01685321, + "epoch": 0.33296257327521417, + "flos": 31735816312320.0, + "grad_norm": 1.7630378477576538, + "language_loss": 0.72491419, + "learning_rate": 3.111970130648789e-06, + "loss": 0.75028014, + "num_input_tokens_seen": 118951355, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.22497559, + "step": 5538, + "time_per_iteration": 2.9599335193634033 + }, + { + "auxiliary_loss_clip": 0.0147901, + "auxiliary_loss_mlp": 0.01035492, + "balance_loss_clip": 1.29199791, + "balance_loss_mlp": 1.01229382, + "epoch": 0.33302269652788213, + "flos": 22754260823040.0, + "grad_norm": 1.7795640962343158, + "language_loss": 0.75359017, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.77873516, + "num_input_tokens_seen": 118970910, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.23205566, + "step": 5539, + "time_per_iteration": 2.8669655323028564 + }, + { + "auxiliary_loss_clip": 0.01511241, + "auxiliary_loss_mlp": 0.01044819, + "balance_loss_clip": 1.31547916, + "balance_loss_mlp": 1.02220452, + "epoch": 0.33308281978055015, + "flos": 11481193739520.0, + "grad_norm": 2.387810449503002, + "language_loss": 0.72154963, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.74711025, + "num_input_tokens_seen": 118989200, + "router_z_loss_clip": 1.95703125, + "router_z_loss_mlp": 0.22619629, + "step": 5540, + "time_per_iteration": 4.2471396923065186 + }, + { + "auxiliary_loss_clip": 0.01486203, + "auxiliary_loss_mlp": 0.01038128, + "balance_loss_clip": 1.29683423, + "balance_loss_mlp": 1.01606274, + "epoch": 0.3331429430332181, + "flos": 38227008735360.0, + "grad_norm": 1.6533121805799553, + "language_loss": 0.61735594, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.64259923, + "num_input_tokens_seen": 119011030, + "router_z_loss_clip": 1.89160156, + "router_z_loss_mlp": 0.22045898, + "step": 5541, + "time_per_iteration": 3.063832998275757 + }, + { + "auxiliary_loss_clip": 0.0149539, + "auxiliary_loss_mlp": 0.01044718, + "balance_loss_clip": 1.30460024, + "balance_loss_mlp": 1.02063739, + "epoch": 0.3332030662858861, + "flos": 22538912981760.0, + "grad_norm": 1.8274586290128774, + "language_loss": 0.69574612, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.72114718, + "num_input_tokens_seen": 119030620, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.24108887, + "step": 5542, + "time_per_iteration": 3.0555944442749023 + }, + { + "auxiliary_loss_clip": 0.01500524, + "auxiliary_loss_mlp": 0.0105525, + "balance_loss_clip": 1.30947137, + "balance_loss_mlp": 1.0315156, + "epoch": 0.33326318953855405, + "flos": 16006339549440.0, + "grad_norm": 1.6511774841213918, + "language_loss": 0.75772238, + "learning_rate": 3.110351016113414e-06, + "loss": 0.78328013, + "num_input_tokens_seen": 119048015, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.23742676, + "step": 5543, + "time_per_iteration": 4.235898733139038 + }, + { + "auxiliary_loss_clip": 0.01504277, + "auxiliary_loss_mlp": 0.01048608, + "balance_loss_clip": 1.31070852, + "balance_loss_mlp": 1.02645898, + "epoch": 0.333323312791222, + "flos": 25604592670080.0, + "grad_norm": 1.6575317564310843, + "language_loss": 0.76131165, + "learning_rate": 3.110027066843348e-06, + "loss": 0.78684056, + "num_input_tokens_seen": 119066280, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.22143555, + "step": 5544, + "time_per_iteration": 2.919095039367676 + }, + { + "auxiliary_loss_clip": 0.01489388, + "auxiliary_loss_mlp": 0.01046214, + "balance_loss_clip": 1.30156946, + "balance_loss_mlp": 1.02392173, + "epoch": 0.33338343604389, + "flos": 25130570451840.0, + "grad_norm": 1.8448209776438744, + "language_loss": 0.71702027, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.74237633, + "num_input_tokens_seen": 119087680, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.22302246, + "step": 5545, + "time_per_iteration": 2.882535934448242 + }, + { + "auxiliary_loss_clip": 0.01494443, + "auxiliary_loss_mlp": 0.01049145, + "balance_loss_clip": 1.30686164, + "balance_loss_mlp": 1.02706802, + "epoch": 0.33344355929655795, + "flos": 16956329512320.0, + "grad_norm": 2.7145273840593704, + "language_loss": 0.70071411, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.72614992, + "num_input_tokens_seen": 119105820, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.22058105, + "step": 5546, + "time_per_iteration": 2.845655918121338 + }, + { + "auxiliary_loss_clip": 0.0150527, + "auxiliary_loss_mlp": 0.01052443, + "balance_loss_clip": 1.31169844, + "balance_loss_mlp": 1.03055668, + "epoch": 0.3335036825492259, + "flos": 27900583499520.0, + "grad_norm": 2.459649323264281, + "language_loss": 0.65905219, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.68462932, + "num_input_tokens_seen": 119126630, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.21887207, + "step": 5547, + "time_per_iteration": 2.901700496673584 + }, + { + "auxiliary_loss_clip": 0.01499758, + "auxiliary_loss_mlp": 0.01042489, + "balance_loss_clip": 1.3097769, + "balance_loss_mlp": 1.0221045, + "epoch": 0.3335638058018939, + "flos": 16187997997440.0, + "grad_norm": 2.091477255296078, + "language_loss": 0.86510539, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.89052784, + "num_input_tokens_seen": 119143375, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.20385742, + "step": 5548, + "time_per_iteration": 2.8573436737060547 + }, + { + "auxiliary_loss_clip": 0.01501402, + "auxiliary_loss_mlp": 0.01052512, + "balance_loss_clip": 1.30882096, + "balance_loss_mlp": 1.02962375, + "epoch": 0.33362392905456184, + "flos": 39910916903040.0, + "grad_norm": 1.8280044371970818, + "language_loss": 0.74958384, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.77512294, + "num_input_tokens_seen": 119166450, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.22888184, + "step": 5549, + "time_per_iteration": 3.0063424110412598 + }, + { + "auxiliary_loss_clip": 0.01496851, + "auxiliary_loss_mlp": 0.01051362, + "balance_loss_clip": 1.30551779, + "balance_loss_mlp": 1.0285219, + "epoch": 0.3336840523072298, + "flos": 44286329111040.0, + "grad_norm": 1.9667321875173644, + "language_loss": 0.69643283, + "learning_rate": 3.108082487713921e-06, + "loss": 0.72191495, + "num_input_tokens_seen": 119189645, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.22814941, + "step": 5550, + "time_per_iteration": 3.0446786880493164 + }, + { + "auxiliary_loss_clip": 0.01506906, + "auxiliary_loss_mlp": 0.0104897, + "balance_loss_clip": 1.31551969, + "balance_loss_mlp": 1.02739275, + "epoch": 0.33374417555989777, + "flos": 15094020522240.0, + "grad_norm": 2.802139785023421, + "language_loss": 0.60778379, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.6333425, + "num_input_tokens_seen": 119208045, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.21582031, + "step": 5551, + "time_per_iteration": 2.8327903747558594 + }, + { + "auxiliary_loss_clip": 0.0148695, + "auxiliary_loss_mlp": 0.01041698, + "balance_loss_clip": 1.29869092, + "balance_loss_mlp": 1.01934636, + "epoch": 0.33380429881256574, + "flos": 15857013150720.0, + "grad_norm": 1.7387355789424657, + "language_loss": 0.71453953, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.73982608, + "num_input_tokens_seen": 119224910, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.22338867, + "step": 5552, + "time_per_iteration": 2.912032127380371 + }, + { + "auxiliary_loss_clip": 0.01498259, + "auxiliary_loss_mlp": 0.01042362, + "balance_loss_clip": 1.3096348, + "balance_loss_mlp": 1.02033186, + "epoch": 0.33386442206523376, + "flos": 13488259403520.0, + "grad_norm": 1.9444661959562608, + "language_loss": 0.83840346, + "learning_rate": 3.107109630732192e-06, + "loss": 0.86380965, + "num_input_tokens_seen": 119243290, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.22045898, + "step": 5553, + "time_per_iteration": 2.8933253288269043 + }, + { + "auxiliary_loss_clip": 0.01509694, + "auxiliary_loss_mlp": 0.01046874, + "balance_loss_clip": 1.31905437, + "balance_loss_mlp": 1.02421284, + "epoch": 0.3339245453179017, + "flos": 16699329192960.0, + "grad_norm": 11.370747445795025, + "language_loss": 0.82088095, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.84644669, + "num_input_tokens_seen": 119261195, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.22668457, + "step": 5554, + "time_per_iteration": 2.8687338829040527 + }, + { + "auxiliary_loss_clip": 0.01502762, + "auxiliary_loss_mlp": 0.0104202, + "balance_loss_clip": 1.31067967, + "balance_loss_mlp": 1.01982355, + "epoch": 0.3339846685705697, + "flos": 24621999189120.0, + "grad_norm": 1.5088948886781604, + "language_loss": 0.82665765, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.8521055, + "num_input_tokens_seen": 119282845, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.22192383, + "step": 5555, + "time_per_iteration": 2.9074978828430176 + }, + { + "auxiliary_loss_clip": 0.01497614, + "auxiliary_loss_mlp": 0.01042259, + "balance_loss_clip": 1.3073554, + "balance_loss_mlp": 1.01999044, + "epoch": 0.33404479182323765, + "flos": 30965448781440.0, + "grad_norm": 2.0141091886374647, + "language_loss": 0.75388616, + "learning_rate": 3.106136395915099e-06, + "loss": 0.77928489, + "num_input_tokens_seen": 119304430, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.22277832, + "step": 5556, + "time_per_iteration": 2.9525091648101807 + }, + { + "auxiliary_loss_clip": 0.01481267, + "auxiliary_loss_mlp": 0.0103418, + "balance_loss_clip": 1.29508436, + "balance_loss_mlp": 1.0135926, + "epoch": 0.3341049150759056, + "flos": 23523678213120.0, + "grad_norm": 1.897365274272503, + "language_loss": 0.83075196, + "learning_rate": 3.105811900403391e-06, + "loss": 0.85590643, + "num_input_tokens_seen": 119323830, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.20605469, + "step": 5557, + "time_per_iteration": 2.875737190246582 + }, + { + "auxiliary_loss_clip": 0.01501702, + "auxiliary_loss_mlp": 0.01039221, + "balance_loss_clip": 1.31147122, + "balance_loss_mlp": 1.01685715, + "epoch": 0.3341650383285736, + "flos": 24038040810240.0, + "grad_norm": 1.5438172767335567, + "language_loss": 0.81180692, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.83721608, + "num_input_tokens_seen": 119346340, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.22387695, + "step": 5558, + "time_per_iteration": 2.9527387619018555 + }, + { + "auxiliary_loss_clip": 0.01502775, + "auxiliary_loss_mlp": 0.01045181, + "balance_loss_clip": 1.31183112, + "balance_loss_mlp": 1.02406895, + "epoch": 0.33422516158124155, + "flos": 24913277084160.0, + "grad_norm": 2.179234405352386, + "language_loss": 0.82373852, + "learning_rate": 3.105162783594788e-06, + "loss": 0.84921813, + "num_input_tokens_seen": 119367285, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.21105957, + "step": 5559, + "time_per_iteration": 2.88466477394104 + }, + { + "auxiliary_loss_clip": 0.01488411, + "auxiliary_loss_mlp": 0.01044246, + "balance_loss_clip": 1.30213356, + "balance_loss_mlp": 1.02097607, + "epoch": 0.3342852848339095, + "flos": 18342716002560.0, + "grad_norm": 1.9147359598255298, + "language_loss": 0.72302079, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.74834728, + "num_input_tokens_seen": 119385370, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.23242188, + "step": 5560, + "time_per_iteration": 2.860154151916504 + }, + { + "auxiliary_loss_clip": 0.01512727, + "auxiliary_loss_mlp": 0.01044116, + "balance_loss_clip": 1.31968379, + "balance_loss_mlp": 1.02147758, + "epoch": 0.3343454080865775, + "flos": 30059192557440.0, + "grad_norm": 1.4406989913154584, + "language_loss": 0.75714958, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.782718, + "num_input_tokens_seen": 119409150, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.22644043, + "step": 5561, + "time_per_iteration": 3.025322198867798 + }, + { + "auxiliary_loss_clip": 0.01504826, + "auxiliary_loss_mlp": 0.01043419, + "balance_loss_clip": 1.31451297, + "balance_loss_mlp": 1.02099609, + "epoch": 0.33440553133924544, + "flos": 16407417870720.0, + "grad_norm": 1.7723786096291922, + "language_loss": 0.70091093, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.72639334, + "num_input_tokens_seen": 119426475, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.22436523, + "step": 5562, + "time_per_iteration": 2.8286075592041016 + }, + { + "auxiliary_loss_clip": 0.01494958, + "auxiliary_loss_mlp": 0.01043297, + "balance_loss_clip": 1.30516005, + "balance_loss_mlp": 1.02201784, + "epoch": 0.3344656545919134, + "flos": 24252393265920.0, + "grad_norm": 1.6690501622296605, + "language_loss": 0.66132474, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.68670726, + "num_input_tokens_seen": 119446900, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.21276855, + "step": 5563, + "time_per_iteration": 2.926013231277466 + }, + { + "auxiliary_loss_clip": 0.01507279, + "auxiliary_loss_mlp": 0.01047658, + "balance_loss_clip": 1.31448793, + "balance_loss_mlp": 1.0242337, + "epoch": 0.3345257778445814, + "flos": 52136824371840.0, + "grad_norm": 1.4829779258292959, + "language_loss": 0.74560082, + "learning_rate": 3.103539258400766e-06, + "loss": 0.77115023, + "num_input_tokens_seen": 119470945, + "router_z_loss_clip": 1.92773438, + "router_z_loss_mlp": 0.23413086, + "step": 5564, + "time_per_iteration": 3.1650631427764893 + }, + { + "auxiliary_loss_clip": 0.01279809, + "auxiliary_loss_mlp": 0.01051148, + "balance_loss_clip": 1.1628319, + "balance_loss_mlp": 1.02711546, + "epoch": 0.33458590109724934, + "flos": 68076418561920.0, + "grad_norm": 0.7957323507333102, + "language_loss": 0.55538666, + "learning_rate": 3.103214427773745e-06, + "loss": 0.57869625, + "num_input_tokens_seen": 119529925, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.24023438, + "step": 5565, + "time_per_iteration": 3.3232290744781494 + }, + { + "auxiliary_loss_clip": 0.01490349, + "auxiliary_loss_mlp": 0.01050566, + "balance_loss_clip": 1.30307245, + "balance_loss_mlp": 1.02652144, + "epoch": 0.3346460243499173, + "flos": 37428878880000.0, + "grad_norm": 1.8671753329327845, + "language_loss": 0.65621215, + "learning_rate": 3.102889555312721e-06, + "loss": 0.68162125, + "num_input_tokens_seen": 119550700, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.24047852, + "step": 5566, + "time_per_iteration": 3.0259647369384766 + }, + { + "auxiliary_loss_clip": 0.014908, + "auxiliary_loss_mlp": 0.01048957, + "balance_loss_clip": 1.30476046, + "balance_loss_mlp": 1.02528214, + "epoch": 0.3347061476025853, + "flos": 18706530591360.0, + "grad_norm": 1.816648156960533, + "language_loss": 0.78114271, + "learning_rate": 3.102564641030016e-06, + "loss": 0.80654037, + "num_input_tokens_seen": 119569295, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.23657227, + "step": 5567, + "time_per_iteration": 2.834923028945923 + }, + { + "auxiliary_loss_clip": 0.01497512, + "auxiliary_loss_mlp": 0.01046116, + "balance_loss_clip": 1.30657172, + "balance_loss_mlp": 1.02337086, + "epoch": 0.3347662708552533, + "flos": 13925787050880.0, + "grad_norm": 1.7976267539439292, + "language_loss": 0.77689457, + "learning_rate": 3.102239684937949e-06, + "loss": 0.80233085, + "num_input_tokens_seen": 119587375, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.22766113, + "step": 5568, + "time_per_iteration": 2.86480712890625 + }, + { + "auxiliary_loss_clip": 0.01500487, + "auxiliary_loss_mlp": 0.0104417, + "balance_loss_clip": 1.30942881, + "balance_loss_mlp": 1.02099562, + "epoch": 0.33482639410792125, + "flos": 19758538874880.0, + "grad_norm": 1.982322748473177, + "language_loss": 0.71599859, + "learning_rate": 3.101914687048842e-06, + "loss": 0.74144512, + "num_input_tokens_seen": 119604530, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.23181152, + "step": 5569, + "time_per_iteration": 4.278224468231201 + }, + { + "auxiliary_loss_clip": 0.01498276, + "auxiliary_loss_mlp": 0.01043065, + "balance_loss_clip": 1.30742288, + "balance_loss_mlp": 1.01956916, + "epoch": 0.3348865173605892, + "flos": 16110484375680.0, + "grad_norm": 2.8018263535646866, + "language_loss": 0.9070226, + "learning_rate": 3.10158964737502e-06, + "loss": 0.93243599, + "num_input_tokens_seen": 119621025, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.23498535, + "step": 5570, + "time_per_iteration": 2.877017021179199 + }, + { + "auxiliary_loss_clip": 0.01490519, + "auxiliary_loss_mlp": 0.01037497, + "balance_loss_clip": 1.30411994, + "balance_loss_mlp": 1.01527596, + "epoch": 0.3349466406132572, + "flos": 25019638905600.0, + "grad_norm": 1.790863179291985, + "language_loss": 0.80483353, + "learning_rate": 3.101264565928808e-06, + "loss": 0.83011371, + "num_input_tokens_seen": 119641725, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.22229004, + "step": 5571, + "time_per_iteration": 2.915003776550293 + }, + { + "auxiliary_loss_clip": 0.01284645, + "auxiliary_loss_mlp": 0.01025945, + "balance_loss_clip": 1.16647577, + "balance_loss_mlp": 1.00410545, + "epoch": 0.33500676386592515, + "flos": 54351129778560.0, + "grad_norm": 0.8989683165746324, + "language_loss": 0.56065637, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.58376229, + "num_input_tokens_seen": 119693560, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.21875, + "step": 5572, + "time_per_iteration": 3.3182129859924316 + }, + { + "auxiliary_loss_clip": 0.0147973, + "auxiliary_loss_mlp": 0.01043047, + "balance_loss_clip": 1.29359913, + "balance_loss_mlp": 1.02083862, + "epoch": 0.3350668871185931, + "flos": 26808642040320.0, + "grad_norm": 4.966680394649299, + "language_loss": 0.80082309, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.82605088, + "num_input_tokens_seen": 119712935, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.22216797, + "step": 5573, + "time_per_iteration": 4.45736289024353 + }, + { + "auxiliary_loss_clip": 0.0148784, + "auxiliary_loss_mlp": 0.0104301, + "balance_loss_clip": 1.29860139, + "balance_loss_mlp": 1.01879859, + "epoch": 0.3351270103712611, + "flos": 33523507347840.0, + "grad_norm": 5.349249583591129, + "language_loss": 0.73465735, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.7599659, + "num_input_tokens_seen": 119731680, + "router_z_loss_clip": 1.89160156, + "router_z_loss_mlp": 0.24169922, + "step": 5574, + "time_per_iteration": 2.9686388969421387 + }, + { + "auxiliary_loss_clip": 0.01470696, + "auxiliary_loss_mlp": 0.01040252, + "balance_loss_clip": 1.28623033, + "balance_loss_mlp": 1.01770949, + "epoch": 0.33518713362392905, + "flos": 26517680858880.0, + "grad_norm": 1.7010641810418943, + "language_loss": 0.89022529, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.9153347, + "num_input_tokens_seen": 119752155, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.22558594, + "step": 5575, + "time_per_iteration": 4.391767263412476 + }, + { + "auxiliary_loss_clip": 0.01521175, + "auxiliary_loss_mlp": 0.01050181, + "balance_loss_clip": 1.32244241, + "balance_loss_mlp": 1.02638674, + "epoch": 0.335247256876597, + "flos": 17239689567360.0, + "grad_norm": 2.188569298175893, + "language_loss": 0.83095932, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.85667294, + "num_input_tokens_seen": 119769195, + "router_z_loss_clip": 1.98828125, + "router_z_loss_mlp": 0.23815918, + "step": 5576, + "time_per_iteration": 2.916335344314575 + }, + { + "auxiliary_loss_clip": 0.01487603, + "auxiliary_loss_mlp": 0.01041762, + "balance_loss_clip": 1.29638505, + "balance_loss_mlp": 1.01875448, + "epoch": 0.335307380129265, + "flos": 25640272834560.0, + "grad_norm": 2.8362113850668984, + "language_loss": 0.75001848, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.77531213, + "num_input_tokens_seen": 119786810, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.2298584, + "step": 5577, + "time_per_iteration": 2.8779680728912354 + }, + { + "auxiliary_loss_clip": 0.01494204, + "auxiliary_loss_mlp": 0.01046706, + "balance_loss_clip": 1.30505824, + "balance_loss_mlp": 1.02316165, + "epoch": 0.33536750338193294, + "flos": 19688897848320.0, + "grad_norm": 7.830351104763126, + "language_loss": 0.82672703, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.85213608, + "num_input_tokens_seen": 119805395, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.23547363, + "step": 5578, + "time_per_iteration": 4.274037599563599 + }, + { + "auxiliary_loss_clip": 0.01478156, + "auxiliary_loss_mlp": 0.01039968, + "balance_loss_clip": 1.29402697, + "balance_loss_mlp": 1.01824808, + "epoch": 0.3354276266346009, + "flos": 18341720616960.0, + "grad_norm": 2.1696088496013446, + "language_loss": 0.72716665, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.75234783, + "num_input_tokens_seen": 119823135, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.21716309, + "step": 5579, + "time_per_iteration": 2.8392436504364014 + }, + { + "auxiliary_loss_clip": 0.01501882, + "auxiliary_loss_mlp": 0.01042604, + "balance_loss_clip": 1.31090307, + "balance_loss_mlp": 1.02007353, + "epoch": 0.3354877498872689, + "flos": 17867607909120.0, + "grad_norm": 2.425485373418438, + "language_loss": 0.82306349, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.8485083, + "num_input_tokens_seen": 119842265, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.2253418, + "step": 5580, + "time_per_iteration": 2.895878314971924 + }, + { + "auxiliary_loss_clip": 0.01493739, + "auxiliary_loss_mlp": 0.01042167, + "balance_loss_clip": 1.30304945, + "balance_loss_mlp": 1.0184207, + "epoch": 0.3355478731399369, + "flos": 24728541989760.0, + "grad_norm": 1.7185060167676829, + "language_loss": 0.77945316, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.80481219, + "num_input_tokens_seen": 119862500, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.23742676, + "step": 5581, + "time_per_iteration": 2.887403726577759 + }, + { + "auxiliary_loss_clip": 0.01511857, + "auxiliary_loss_mlp": 0.01047632, + "balance_loss_clip": 1.3156476, + "balance_loss_mlp": 1.02224004, + "epoch": 0.33560799639260486, + "flos": 16882797432960.0, + "grad_norm": 2.416711095709031, + "language_loss": 0.75195754, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.77755249, + "num_input_tokens_seen": 119880160, + "router_z_loss_clip": 1.96191406, + "router_z_loss_mlp": 0.25378418, + "step": 5582, + "time_per_iteration": 2.8443257808685303 + }, + { + "auxiliary_loss_clip": 0.01502871, + "auxiliary_loss_mlp": 0.01049492, + "balance_loss_clip": 1.30946231, + "balance_loss_mlp": 1.0261631, + "epoch": 0.3356681196452728, + "flos": 18342851736960.0, + "grad_norm": 1.579977905362513, + "language_loss": 0.83566988, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.86119348, + "num_input_tokens_seen": 119899040, + "router_z_loss_clip": 1.93164062, + "router_z_loss_mlp": 0.23327637, + "step": 5583, + "time_per_iteration": 2.854748010635376 + }, + { + "auxiliary_loss_clip": 0.01490166, + "auxiliary_loss_mlp": 0.01049413, + "balance_loss_clip": 1.30223227, + "balance_loss_mlp": 1.02645361, + "epoch": 0.3357282428979408, + "flos": 34763825064960.0, + "grad_norm": 2.142456765041388, + "language_loss": 0.7862941, + "learning_rate": 3.097034711451581e-06, + "loss": 0.81168985, + "num_input_tokens_seen": 119921120, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.22973633, + "step": 5584, + "time_per_iteration": 2.9743659496307373 + }, + { + "auxiliary_loss_clip": 0.01503821, + "auxiliary_loss_mlp": 0.0104594, + "balance_loss_clip": 1.31069756, + "balance_loss_mlp": 1.02295661, + "epoch": 0.33578836615060875, + "flos": 21590099383680.0, + "grad_norm": 1.5937990432554863, + "language_loss": 0.76548672, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.79098433, + "num_input_tokens_seen": 119940165, + "router_z_loss_clip": 1.93164062, + "router_z_loss_mlp": 0.22961426, + "step": 5585, + "time_per_iteration": 2.930433988571167 + }, + { + "auxiliary_loss_clip": 0.01477496, + "auxiliary_loss_mlp": 0.01044733, + "balance_loss_clip": 1.29048347, + "balance_loss_mlp": 1.02085507, + "epoch": 0.3358484894032767, + "flos": 24539734863360.0, + "grad_norm": 1.5980373630293525, + "language_loss": 0.78713608, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.81235838, + "num_input_tokens_seen": 119959730, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.2388916, + "step": 5586, + "time_per_iteration": 2.9004650115966797 + }, + { + "auxiliary_loss_clip": 0.01527319, + "auxiliary_loss_mlp": 0.01045494, + "balance_loss_clip": 1.32831383, + "balance_loss_mlp": 1.02173567, + "epoch": 0.3359086126559447, + "flos": 22465788105600.0, + "grad_norm": 1.8845980368162183, + "language_loss": 0.82255739, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.84828556, + "num_input_tokens_seen": 119979315, + "router_z_loss_clip": 1.99023438, + "router_z_loss_mlp": 0.23742676, + "step": 5587, + "time_per_iteration": 2.907565116882324 + }, + { + "auxiliary_loss_clip": 0.01487071, + "auxiliary_loss_mlp": 0.01053526, + "balance_loss_clip": 1.3029182, + "balance_loss_mlp": 1.03105521, + "epoch": 0.33596873590861265, + "flos": 16552038810240.0, + "grad_norm": 1.9687631794041052, + "language_loss": 0.6869905, + "learning_rate": 3.095731802118677e-06, + "loss": 0.71239638, + "num_input_tokens_seen": 119996140, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.22473145, + "step": 5588, + "time_per_iteration": 2.9055745601654053 + }, + { + "auxiliary_loss_clip": 0.01509781, + "auxiliary_loss_mlp": 0.01054223, + "balance_loss_clip": 1.31734848, + "balance_loss_mlp": 1.0306077, + "epoch": 0.3360288591612806, + "flos": 31188307259520.0, + "grad_norm": 1.8837376881026326, + "language_loss": 0.70731616, + "learning_rate": 3.095405970878919e-06, + "loss": 0.73295617, + "num_input_tokens_seen": 120017720, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.23596191, + "step": 5589, + "time_per_iteration": 2.9352684020996094 + }, + { + "auxiliary_loss_clip": 0.01500356, + "auxiliary_loss_mlp": 0.01049138, + "balance_loss_clip": 1.30754697, + "balance_loss_mlp": 1.02613091, + "epoch": 0.3360889824139486, + "flos": 23707644145920.0, + "grad_norm": 1.741791823278173, + "language_loss": 0.67536825, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.70086318, + "num_input_tokens_seen": 120036335, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.23010254, + "step": 5590, + "time_per_iteration": 2.86493182182312 + }, + { + "auxiliary_loss_clip": 0.01486797, + "auxiliary_loss_mlp": 0.01045921, + "balance_loss_clip": 1.30118465, + "balance_loss_mlp": 1.02234173, + "epoch": 0.33614910566661654, + "flos": 19327571723520.0, + "grad_norm": 2.2337028070713854, + "language_loss": 0.7460373, + "learning_rate": 3.094754183798047e-06, + "loss": 0.77136451, + "num_input_tokens_seen": 120056120, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.23583984, + "step": 5591, + "time_per_iteration": 2.8862602710723877 + }, + { + "auxiliary_loss_clip": 0.01485429, + "auxiliary_loss_mlp": 0.01047021, + "balance_loss_clip": 1.29823864, + "balance_loss_mlp": 1.02370346, + "epoch": 0.3362092289192845, + "flos": 16480995194880.0, + "grad_norm": 2.3026444583770753, + "language_loss": 0.70273882, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.72806334, + "num_input_tokens_seen": 120073650, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.2331543, + "step": 5592, + "time_per_iteration": 2.9333269596099854 + }, + { + "auxiliary_loss_clip": 0.01501391, + "auxiliary_loss_mlp": 0.01038467, + "balance_loss_clip": 1.31454587, + "balance_loss_mlp": 1.01660395, + "epoch": 0.33626935217195253, + "flos": 24254293547520.0, + "grad_norm": 2.1598410497734455, + "language_loss": 0.77670956, + "learning_rate": 3.094102230664423e-06, + "loss": 0.80210817, + "num_input_tokens_seen": 120093260, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.21862793, + "step": 5593, + "time_per_iteration": 2.9173765182495117 + }, + { + "auxiliary_loss_clip": 0.0151696, + "auxiliary_loss_mlp": 0.01048333, + "balance_loss_clip": 1.32037377, + "balance_loss_mlp": 1.0238837, + "epoch": 0.3363294754246205, + "flos": 19728152352000.0, + "grad_norm": 2.5693088034531866, + "language_loss": 0.72927207, + "learning_rate": 3.093776191858731e-06, + "loss": 0.75492501, + "num_input_tokens_seen": 120111830, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.24462891, + "step": 5594, + "time_per_iteration": 2.8508689403533936 + }, + { + "auxiliary_loss_clip": 0.01507923, + "auxiliary_loss_mlp": 0.01045552, + "balance_loss_clip": 1.31604946, + "balance_loss_mlp": 1.02126932, + "epoch": 0.33638959867728846, + "flos": 22605160648320.0, + "grad_norm": 2.4775397685689424, + "language_loss": 0.80401522, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.82954997, + "num_input_tokens_seen": 120130470, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.24279785, + "step": 5595, + "time_per_iteration": 2.902956247329712 + }, + { + "auxiliary_loss_clip": 0.01504889, + "auxiliary_loss_mlp": 0.01041085, + "balance_loss_clip": 1.31626105, + "balance_loss_mlp": 1.01895952, + "epoch": 0.3364497219299564, + "flos": 21004195478400.0, + "grad_norm": 1.7581381187443923, + "language_loss": 0.82669365, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.8521533, + "num_input_tokens_seen": 120150735, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.22131348, + "step": 5596, + "time_per_iteration": 2.921912908554077 + }, + { + "auxiliary_loss_clip": 0.01508034, + "auxiliary_loss_mlp": 0.01037505, + "balance_loss_clip": 1.31729031, + "balance_loss_mlp": 1.01518917, + "epoch": 0.3365098451826244, + "flos": 25239556471680.0, + "grad_norm": 1.706245805180351, + "language_loss": 0.76656747, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.79202294, + "num_input_tokens_seen": 120173230, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.2232666, + "step": 5597, + "time_per_iteration": 2.9426043033599854 + }, + { + "auxiliary_loss_clip": 0.01506685, + "auxiliary_loss_mlp": 0.01040379, + "balance_loss_clip": 1.31773317, + "balance_loss_mlp": 1.01719296, + "epoch": 0.33656996843529235, + "flos": 24582427971840.0, + "grad_norm": 1.8752354213523363, + "language_loss": 0.79172641, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.81719708, + "num_input_tokens_seen": 120191860, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.23205566, + "step": 5598, + "time_per_iteration": 2.900599956512451 + }, + { + "auxiliary_loss_clip": 0.0152873, + "auxiliary_loss_mlp": 0.01042237, + "balance_loss_clip": 1.32896376, + "balance_loss_mlp": 1.01919341, + "epoch": 0.3366300916879603, + "flos": 44106797168640.0, + "grad_norm": 1.4759902649897094, + "language_loss": 0.65109068, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.67680037, + "num_input_tokens_seen": 120219195, + "router_z_loss_clip": 1.99511719, + "router_z_loss_mlp": 0.23059082, + "step": 5599, + "time_per_iteration": 3.0955095291137695 + }, + { + "auxiliary_loss_clip": 0.0151804, + "auxiliary_loss_mlp": 0.01044441, + "balance_loss_clip": 1.32079124, + "balance_loss_mlp": 1.0199312, + "epoch": 0.3366902149406283, + "flos": 13887437443200.0, + "grad_norm": 2.4652502912247978, + "language_loss": 0.83111119, + "learning_rate": 3.091819088459249e-06, + "loss": 0.85673594, + "num_input_tokens_seen": 120232950, + "router_z_loss_clip": 1.97167969, + "router_z_loss_mlp": 0.24523926, + "step": 5600, + "time_per_iteration": 2.9089560508728027 + }, + { + "auxiliary_loss_clip": 0.01502937, + "auxiliary_loss_mlp": 0.01044787, + "balance_loss_clip": 1.3089509, + "balance_loss_mlp": 1.02028942, + "epoch": 0.33675033819329625, + "flos": 16261711056000.0, + "grad_norm": 2.177427231852065, + "language_loss": 0.84167743, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.8671546, + "num_input_tokens_seen": 120248865, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.24499512, + "step": 5601, + "time_per_iteration": 2.8733246326446533 + }, + { + "auxiliary_loss_clip": 0.01477478, + "auxiliary_loss_mlp": 0.01039067, + "balance_loss_clip": 1.29369664, + "balance_loss_mlp": 1.01640487, + "epoch": 0.3368104614459642, + "flos": 17064636860160.0, + "grad_norm": 1.590471910268448, + "language_loss": 0.84492207, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.8700875, + "num_input_tokens_seen": 120267820, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.22668457, + "step": 5602, + "time_per_iteration": 2.9372410774230957 + }, + { + "auxiliary_loss_clip": 0.01497032, + "auxiliary_loss_mlp": 0.0104018, + "balance_loss_clip": 1.30747414, + "balance_loss_mlp": 1.01682687, + "epoch": 0.3368705846986322, + "flos": 17867698398720.0, + "grad_norm": 1.7731344280748333, + "language_loss": 0.70652318, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.73189527, + "num_input_tokens_seen": 120286540, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.23352051, + "step": 5603, + "time_per_iteration": 2.8852651119232178 + }, + { + "auxiliary_loss_clip": 0.01513101, + "auxiliary_loss_mlp": 0.01040168, + "balance_loss_clip": 1.31960511, + "balance_loss_mlp": 1.01751876, + "epoch": 0.33693070795130015, + "flos": 22939719834240.0, + "grad_norm": 1.4936656255818428, + "language_loss": 0.83733654, + "learning_rate": 3.090513524656898e-06, + "loss": 0.86286926, + "num_input_tokens_seen": 120307305, + "router_z_loss_clip": 1.93457031, + "router_z_loss_mlp": 0.2265625, + "step": 5604, + "time_per_iteration": 4.25240159034729 + }, + { + "auxiliary_loss_clip": 0.01499992, + "auxiliary_loss_mlp": 0.01039591, + "balance_loss_clip": 1.3080039, + "balance_loss_mlp": 1.01579654, + "epoch": 0.3369908312039681, + "flos": 22027446051840.0, + "grad_norm": 1.5983134903781098, + "language_loss": 0.74555922, + "learning_rate": 3.090187030294409e-06, + "loss": 0.77095509, + "num_input_tokens_seen": 120327845, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.23815918, + "step": 5605, + "time_per_iteration": 2.8519127368927 + }, + { + "auxiliary_loss_clip": 0.01514965, + "auxiliary_loss_mlp": 0.01043141, + "balance_loss_clip": 1.31942999, + "balance_loss_mlp": 1.02000213, + "epoch": 0.33705095445663613, + "flos": 11809463898240.0, + "grad_norm": 3.0050129119680515, + "language_loss": 0.84639478, + "learning_rate": 3.089860494591919e-06, + "loss": 0.87197584, + "num_input_tokens_seen": 120343255, + "router_z_loss_clip": 1.95703125, + "router_z_loss_mlp": 0.23156738, + "step": 5606, + "time_per_iteration": 2.8195576667785645 + }, + { + "auxiliary_loss_clip": 0.01485581, + "auxiliary_loss_mlp": 0.01041106, + "balance_loss_clip": 1.2958647, + "balance_loss_mlp": 1.01800346, + "epoch": 0.3371110777093041, + "flos": 25057355086080.0, + "grad_norm": 1.5634217945434674, + "language_loss": 0.68605489, + "learning_rate": 3.089533917561809e-06, + "loss": 0.71132171, + "num_input_tokens_seen": 120361745, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.23095703, + "step": 5607, + "time_per_iteration": 2.8716025352478027 + }, + { + "auxiliary_loss_clip": 0.01513621, + "auxiliary_loss_mlp": 0.01042744, + "balance_loss_clip": 1.31767678, + "balance_loss_mlp": 1.01902127, + "epoch": 0.33717120096197206, + "flos": 26590443776640.0, + "grad_norm": 7.505445397663734, + "language_loss": 0.71994984, + "learning_rate": 3.089207299216464e-06, + "loss": 0.74551356, + "num_input_tokens_seen": 120380565, + "router_z_loss_clip": 1.95996094, + "router_z_loss_mlp": 0.23730469, + "step": 5608, + "time_per_iteration": 4.286521673202515 + }, + { + "auxiliary_loss_clip": 0.01490519, + "auxiliary_loss_mlp": 0.01046187, + "balance_loss_clip": 1.29944289, + "balance_loss_mlp": 1.02259517, + "epoch": 0.33723132421464, + "flos": 15166873929600.0, + "grad_norm": 1.7569239525437061, + "language_loss": 0.8027342, + "learning_rate": 3.088880639568269e-06, + "loss": 0.82810122, + "num_input_tokens_seen": 120399235, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.23608398, + "step": 5609, + "time_per_iteration": 2.8362104892730713 + }, + { + "auxiliary_loss_clip": 0.01507396, + "auxiliary_loss_mlp": 0.01050679, + "balance_loss_clip": 1.31552565, + "balance_loss_mlp": 1.02614594, + "epoch": 0.337291447467308, + "flos": 23445350184960.0, + "grad_norm": 1.608650980386966, + "language_loss": 0.82936156, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.85494232, + "num_input_tokens_seen": 120420095, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.24523926, + "step": 5610, + "time_per_iteration": 4.381227731704712 + }, + { + "auxiliary_loss_clip": 0.01465079, + "auxiliary_loss_mlp": 0.01040537, + "balance_loss_clip": 1.28113532, + "balance_loss_mlp": 1.01723194, + "epoch": 0.33735157071997596, + "flos": 17247019224960.0, + "grad_norm": 3.0665662529194644, + "language_loss": 0.83303165, + "learning_rate": 3.088227196412879e-06, + "loss": 0.85808784, + "num_input_tokens_seen": 120437690, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.23291016, + "step": 5611, + "time_per_iteration": 2.9353599548339844 + }, + { + "auxiliary_loss_clip": 0.01492578, + "auxiliary_loss_mlp": 0.01042922, + "balance_loss_clip": 1.30247784, + "balance_loss_mlp": 1.01764965, + "epoch": 0.3374116939726439, + "flos": 28269239281920.0, + "grad_norm": 1.602858353500534, + "language_loss": 0.80241299, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.82776809, + "num_input_tokens_seen": 120459240, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.25280762, + "step": 5612, + "time_per_iteration": 4.40021276473999 + }, + { + "auxiliary_loss_clip": 0.0147924, + "auxiliary_loss_mlp": 0.01043729, + "balance_loss_clip": 1.28967798, + "balance_loss_mlp": 1.02022076, + "epoch": 0.3374718172253119, + "flos": 35933234901120.0, + "grad_norm": 2.7126295468181274, + "language_loss": 0.71527076, + "learning_rate": 3.087573588194753e-06, + "loss": 0.74050045, + "num_input_tokens_seen": 120481090, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.23510742, + "step": 5613, + "time_per_iteration": 2.989490032196045 + }, + { + "auxiliary_loss_clip": 0.0149223, + "auxiliary_loss_mlp": 0.01037368, + "balance_loss_clip": 1.29990816, + "balance_loss_mlp": 1.01399064, + "epoch": 0.33753194047797985, + "flos": 18195742333440.0, + "grad_norm": 1.8217999154837716, + "language_loss": 0.80400467, + "learning_rate": 3.087246722218144e-06, + "loss": 0.82930064, + "num_input_tokens_seen": 120500045, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.23376465, + "step": 5614, + "time_per_iteration": 2.9208898544311523 + }, + { + "auxiliary_loss_clip": 0.0149086, + "auxiliary_loss_mlp": 0.01045155, + "balance_loss_clip": 1.29937661, + "balance_loss_mlp": 1.02025199, + "epoch": 0.3375920637306478, + "flos": 23159320686720.0, + "grad_norm": 1.8487572850533522, + "language_loss": 0.91431975, + "learning_rate": 3.086919815013031e-06, + "loss": 0.93967986, + "num_input_tokens_seen": 120521125, + "router_z_loss_clip": 1.91503906, + "router_z_loss_mlp": 0.24938965, + "step": 5615, + "time_per_iteration": 2.9683144092559814 + }, + { + "auxiliary_loss_clip": 0.01471147, + "auxiliary_loss_mlp": 0.01041552, + "balance_loss_clip": 1.28445148, + "balance_loss_mlp": 1.01825869, + "epoch": 0.3376521869833158, + "flos": 23122554647040.0, + "grad_norm": 1.6931177507054995, + "language_loss": 0.81725466, + "learning_rate": 3.086592866591809e-06, + "loss": 0.84238166, + "num_input_tokens_seen": 120539180, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.23291016, + "step": 5616, + "time_per_iteration": 2.874188184738159 + }, + { + "auxiliary_loss_clip": 0.01496783, + "auxiliary_loss_mlp": 0.01043235, + "balance_loss_clip": 1.30235064, + "balance_loss_mlp": 1.01872563, + "epoch": 0.33771231023598375, + "flos": 19282209171840.0, + "grad_norm": 1.9622051089556607, + "language_loss": 0.84608501, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.87148517, + "num_input_tokens_seen": 120556280, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.2454834, + "step": 5617, + "time_per_iteration": 2.849902629852295 + }, + { + "auxiliary_loss_clip": 0.01458358, + "auxiliary_loss_mlp": 0.0104092, + "balance_loss_clip": 1.27250612, + "balance_loss_mlp": 1.01619565, + "epoch": 0.3377724334886517, + "flos": 18159247762560.0, + "grad_norm": 4.501089883826148, + "language_loss": 0.8066563, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.83164907, + "num_input_tokens_seen": 120575395, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.24755859, + "step": 5618, + "time_per_iteration": 2.8378500938415527 + }, + { + "auxiliary_loss_clip": 0.0148243, + "auxiliary_loss_mlp": 0.0104124, + "balance_loss_clip": 1.29293394, + "balance_loss_mlp": 1.01750517, + "epoch": 0.3378325567413197, + "flos": 25787110769280.0, + "grad_norm": 1.6222809228721988, + "language_loss": 0.72674775, + "learning_rate": 3.085611774155481e-06, + "loss": 0.75198448, + "num_input_tokens_seen": 120596075, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.23730469, + "step": 5619, + "time_per_iteration": 2.8863916397094727 + }, + { + "auxiliary_loss_clip": 0.01471133, + "auxiliary_loss_mlp": 0.01040643, + "balance_loss_clip": 1.28056979, + "balance_loss_mlp": 1.01700377, + "epoch": 0.3378926799939877, + "flos": 21325090734720.0, + "grad_norm": 4.032453665593643, + "language_loss": 0.7101295, + "learning_rate": 3.085284660993821e-06, + "loss": 0.73524725, + "num_input_tokens_seen": 120614195, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.23632812, + "step": 5620, + "time_per_iteration": 2.8823471069335938 + }, + { + "auxiliary_loss_clip": 0.01466783, + "auxiliary_loss_mlp": 0.010416, + "balance_loss_clip": 1.28025532, + "balance_loss_mlp": 1.0183661, + "epoch": 0.33795280324665566, + "flos": 24910833864960.0, + "grad_norm": 2.8151348334880377, + "language_loss": 0.69108844, + "learning_rate": 3.084957506678058e-06, + "loss": 0.71617228, + "num_input_tokens_seen": 120634475, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.23242188, + "step": 5621, + "time_per_iteration": 2.8856420516967773 + }, + { + "auxiliary_loss_clip": 0.01463889, + "auxiliary_loss_mlp": 0.01039086, + "balance_loss_clip": 1.2798624, + "balance_loss_mlp": 1.01607919, + "epoch": 0.33801292649932363, + "flos": 24764674602240.0, + "grad_norm": 2.1091160503576325, + "language_loss": 0.8337574, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.85878718, + "num_input_tokens_seen": 120654980, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.22998047, + "step": 5622, + "time_per_iteration": 2.9631879329681396 + }, + { + "auxiliary_loss_clip": 0.01464445, + "auxiliary_loss_mlp": 0.01042521, + "balance_loss_clip": 1.28007114, + "balance_loss_mlp": 1.01903713, + "epoch": 0.3380730497519916, + "flos": 26735155205760.0, + "grad_norm": 1.5814974355175244, + "language_loss": 0.74435991, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.76942956, + "num_input_tokens_seen": 120676245, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.23474121, + "step": 5623, + "time_per_iteration": 2.903228759765625 + }, + { + "auxiliary_loss_clip": 0.01281748, + "auxiliary_loss_mlp": 0.01034841, + "balance_loss_clip": 1.16733217, + "balance_loss_mlp": 1.01138103, + "epoch": 0.33813317300465956, + "flos": 70068480935040.0, + "grad_norm": 0.7458005393977092, + "language_loss": 0.54982281, + "learning_rate": 3.083975796930215e-06, + "loss": 0.57298875, + "num_input_tokens_seen": 120741965, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.234375, + "step": 5624, + "time_per_iteration": 3.5268478393554688 + }, + { + "auxiliary_loss_clip": 0.01474638, + "auxiliary_loss_mlp": 0.01045825, + "balance_loss_clip": 1.28425026, + "balance_loss_mlp": 1.02088618, + "epoch": 0.3381932962573275, + "flos": 24107908060800.0, + "grad_norm": 2.0732333976514727, + "language_loss": 0.73591948, + "learning_rate": 3.083648478122111e-06, + "loss": 0.76112413, + "num_input_tokens_seen": 120760410, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.24938965, + "step": 5625, + "time_per_iteration": 2.8666694164276123 + }, + { + "auxiliary_loss_clip": 0.01486659, + "auxiliary_loss_mlp": 0.01044207, + "balance_loss_clip": 1.29174829, + "balance_loss_mlp": 1.01920879, + "epoch": 0.3382534195099955, + "flos": 19286824141440.0, + "grad_norm": 2.34659655416355, + "language_loss": 0.72163194, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.74694061, + "num_input_tokens_seen": 120777705, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.24975586, + "step": 5626, + "time_per_iteration": 2.873349905014038 + }, + { + "auxiliary_loss_clip": 0.01456606, + "auxiliary_loss_mlp": 0.010413, + "balance_loss_clip": 1.27292836, + "balance_loss_mlp": 1.01658821, + "epoch": 0.33831354276266346, + "flos": 25236570314880.0, + "grad_norm": 1.4486268979061603, + "language_loss": 0.81329215, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.83827114, + "num_input_tokens_seen": 120798660, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.24694824, + "step": 5627, + "time_per_iteration": 2.956404209136963 + }, + { + "auxiliary_loss_clip": 0.01478635, + "auxiliary_loss_mlp": 0.01049525, + "balance_loss_clip": 1.28847587, + "balance_loss_mlp": 1.02505136, + "epoch": 0.3383736660153314, + "flos": 23122283178240.0, + "grad_norm": 2.3106722815195706, + "language_loss": 0.81307137, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.83835292, + "num_input_tokens_seen": 120816705, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.24475098, + "step": 5628, + "time_per_iteration": 2.9071643352508545 + }, + { + "auxiliary_loss_clip": 0.01486933, + "auxiliary_loss_mlp": 0.01044473, + "balance_loss_clip": 1.29379845, + "balance_loss_mlp": 1.02051187, + "epoch": 0.3384337892679994, + "flos": 23487590845440.0, + "grad_norm": 2.34634782118513, + "language_loss": 0.7879926, + "learning_rate": 3.082338792093254e-06, + "loss": 0.81330669, + "num_input_tokens_seen": 120835375, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.23925781, + "step": 5629, + "time_per_iteration": 2.9267890453338623 + }, + { + "auxiliary_loss_clip": 0.01493533, + "auxiliary_loss_mlp": 0.01045698, + "balance_loss_clip": 1.29884434, + "balance_loss_mlp": 1.02046156, + "epoch": 0.33849391252066735, + "flos": 19434838440960.0, + "grad_norm": 2.201945194696313, + "language_loss": 0.85129207, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.87668431, + "num_input_tokens_seen": 120854260, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.25244141, + "step": 5630, + "time_per_iteration": 2.8542301654815674 + }, + { + "auxiliary_loss_clip": 0.01486898, + "auxiliary_loss_mlp": 0.01050109, + "balance_loss_clip": 1.29739773, + "balance_loss_mlp": 1.02576602, + "epoch": 0.3385540357733353, + "flos": 21073565036160.0, + "grad_norm": 3.8768844157239486, + "language_loss": 0.7199896, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.74535966, + "num_input_tokens_seen": 120871590, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.24328613, + "step": 5631, + "time_per_iteration": 2.848780632019043 + }, + { + "auxiliary_loss_clip": 0.01291091, + "auxiliary_loss_mlp": 0.01030784, + "balance_loss_clip": 1.17724133, + "balance_loss_mlp": 1.00751424, + "epoch": 0.3386141590260033, + "flos": 69234019505280.0, + "grad_norm": 0.8518126736593178, + "language_loss": 0.56114417, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.58436292, + "num_input_tokens_seen": 120925550, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.23242188, + "step": 5632, + "time_per_iteration": 3.456843137741089 + }, + { + "auxiliary_loss_clip": 0.01479207, + "auxiliary_loss_mlp": 0.01045803, + "balance_loss_clip": 1.28824103, + "balance_loss_mlp": 1.02199674, + "epoch": 0.3386742822786713, + "flos": 25530246184320.0, + "grad_norm": 7.3419413449353, + "language_loss": 0.80787325, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.83312333, + "num_input_tokens_seen": 120947620, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.23815918, + "step": 5633, + "time_per_iteration": 2.901444911956787 + }, + { + "auxiliary_loss_clip": 0.01479688, + "auxiliary_loss_mlp": 0.01053852, + "balance_loss_clip": 1.29046726, + "balance_loss_mlp": 1.029176, + "epoch": 0.33873440553133927, + "flos": 23633614373760.0, + "grad_norm": 2.195714893013156, + "language_loss": 0.60281575, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.62815112, + "num_input_tokens_seen": 120965205, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.24658203, + "step": 5634, + "time_per_iteration": 2.9059152603149414 + }, + { + "auxiliary_loss_clip": 0.01484593, + "auxiliary_loss_mlp": 0.01050703, + "balance_loss_clip": 1.29516017, + "balance_loss_mlp": 1.02675378, + "epoch": 0.33879452878400723, + "flos": 17096833175040.0, + "grad_norm": 1.682067570515668, + "language_loss": 0.93264091, + "learning_rate": 3.080373032026589e-06, + "loss": 0.95799387, + "num_input_tokens_seen": 120983560, + "router_z_loss_clip": 1.89160156, + "router_z_loss_mlp": 0.23950195, + "step": 5635, + "time_per_iteration": 2.8619163036346436 + }, + { + "auxiliary_loss_clip": 0.0144925, + "auxiliary_loss_mlp": 0.01040655, + "balance_loss_clip": 1.26659346, + "balance_loss_mlp": 1.01684928, + "epoch": 0.3388546520366752, + "flos": 15750470350080.0, + "grad_norm": 2.557706050168856, + "language_loss": 0.76617914, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.79107821, + "num_input_tokens_seen": 121001400, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.23803711, + "step": 5636, + "time_per_iteration": 2.856569528579712 + }, + { + "auxiliary_loss_clip": 0.01471316, + "auxiliary_loss_mlp": 0.01046032, + "balance_loss_clip": 1.28375208, + "balance_loss_mlp": 1.02178478, + "epoch": 0.33891477528934316, + "flos": 22428841086720.0, + "grad_norm": 1.5686089456504813, + "language_loss": 0.84516633, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.87033987, + "num_input_tokens_seen": 121021760, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.2421875, + "step": 5637, + "time_per_iteration": 2.914940118789673 + }, + { + "auxiliary_loss_clip": 0.014954, + "auxiliary_loss_mlp": 0.01041101, + "balance_loss_clip": 1.30302072, + "balance_loss_mlp": 1.01456463, + "epoch": 0.3389748985420111, + "flos": 17283740019840.0, + "grad_norm": 2.0673830989690254, + "language_loss": 0.71836853, + "learning_rate": 3.079389598759495e-06, + "loss": 0.74373353, + "num_input_tokens_seen": 121041070, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.265625, + "step": 5638, + "time_per_iteration": 2.827458143234253 + }, + { + "auxiliary_loss_clip": 0.0147359, + "auxiliary_loss_mlp": 0.01042664, + "balance_loss_clip": 1.28713298, + "balance_loss_mlp": 1.01817846, + "epoch": 0.3390350217946791, + "flos": 27756279273600.0, + "grad_norm": 1.8164592188503164, + "language_loss": 0.81532168, + "learning_rate": 3.079061705792765e-06, + "loss": 0.8404842, + "num_input_tokens_seen": 121060890, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.24487305, + "step": 5639, + "time_per_iteration": 4.323322057723999 + }, + { + "auxiliary_loss_clip": 0.01495055, + "auxiliary_loss_mlp": 0.01048195, + "balance_loss_clip": 1.29963672, + "balance_loss_mlp": 1.02381694, + "epoch": 0.33909514504734706, + "flos": 20349555442560.0, + "grad_norm": 2.2337803366822997, + "language_loss": 0.68309188, + "learning_rate": 3.078733771907907e-06, + "loss": 0.70852435, + "num_input_tokens_seen": 121079135, + "router_z_loss_clip": 1.95410156, + "router_z_loss_mlp": 0.24377441, + "step": 5640, + "time_per_iteration": 2.8533785343170166 + }, + { + "auxiliary_loss_clip": 0.01477558, + "auxiliary_loss_mlp": 0.01044215, + "balance_loss_clip": 1.28728306, + "balance_loss_mlp": 1.02003956, + "epoch": 0.339155268300015, + "flos": 14838241812480.0, + "grad_norm": 1.5757477659519648, + "language_loss": 0.7064532, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.73167098, + "num_input_tokens_seen": 121097685, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.24182129, + "step": 5641, + "time_per_iteration": 2.828672409057617 + }, + { + "auxiliary_loss_clip": 0.01488961, + "auxiliary_loss_mlp": 0.01043822, + "balance_loss_clip": 1.29821157, + "balance_loss_mlp": 1.01988459, + "epoch": 0.339215391552683, + "flos": 26079157825920.0, + "grad_norm": 3.2285427114005687, + "language_loss": 0.88904691, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.91437471, + "num_input_tokens_seen": 121115640, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.23937988, + "step": 5642, + "time_per_iteration": 4.379562139511108 + }, + { + "auxiliary_loss_clip": 0.01455236, + "auxiliary_loss_mlp": 0.01039451, + "balance_loss_clip": 1.27309465, + "balance_loss_mlp": 1.01681328, + "epoch": 0.33927551480535095, + "flos": 14582870305920.0, + "grad_norm": 1.7030647023084968, + "language_loss": 0.84499443, + "learning_rate": 3.077749724868924e-06, + "loss": 0.86994135, + "num_input_tokens_seen": 121132485, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.22644043, + "step": 5643, + "time_per_iteration": 2.8090620040893555 + }, + { + "auxiliary_loss_clip": 0.01457387, + "auxiliary_loss_mlp": 0.01047019, + "balance_loss_clip": 1.27053225, + "balance_loss_mlp": 1.02323651, + "epoch": 0.3393356380580189, + "flos": 23816132472960.0, + "grad_norm": 1.4568864701298074, + "language_loss": 0.78201097, + "learning_rate": 3.077421627435922e-06, + "loss": 0.807055, + "num_input_tokens_seen": 121152935, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.23803711, + "step": 5644, + "time_per_iteration": 2.845407247543335 + }, + { + "auxiliary_loss_clip": 0.01471214, + "auxiliary_loss_mlp": 0.01046016, + "balance_loss_clip": 1.28287673, + "balance_loss_mlp": 1.02222204, + "epoch": 0.3393957613106869, + "flos": 17356819651200.0, + "grad_norm": 3.527305927047716, + "language_loss": 0.64130557, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.66647792, + "num_input_tokens_seen": 121169835, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.23815918, + "step": 5645, + "time_per_iteration": 4.256519079208374 + }, + { + "auxiliary_loss_clip": 0.0146081, + "auxiliary_loss_mlp": 0.01042478, + "balance_loss_clip": 1.27494669, + "balance_loss_mlp": 1.01892221, + "epoch": 0.3394558845633549, + "flos": 28444337233920.0, + "grad_norm": 2.137041465473129, + "language_loss": 0.77758664, + "learning_rate": 3.076765310014552e-06, + "loss": 0.80261946, + "num_input_tokens_seen": 121190290, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.2355957, + "step": 5646, + "time_per_iteration": 2.9298095703125 + }, + { + "auxiliary_loss_clip": 0.01482706, + "auxiliary_loss_mlp": 0.01049273, + "balance_loss_clip": 1.29069662, + "balance_loss_mlp": 1.02509689, + "epoch": 0.33951600781602287, + "flos": 22096046448000.0, + "grad_norm": 2.1464963687433514, + "language_loss": 0.79856282, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.82388258, + "num_input_tokens_seen": 121209060, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.24194336, + "step": 5647, + "time_per_iteration": 4.312011480331421 + }, + { + "auxiliary_loss_clip": 0.01469594, + "auxiliary_loss_mlp": 0.01037461, + "balance_loss_clip": 1.28144574, + "balance_loss_mlp": 1.01382136, + "epoch": 0.33957613106869083, + "flos": 23888759656320.0, + "grad_norm": 3.1612707002397533, + "language_loss": 0.78149372, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.80656427, + "num_input_tokens_seen": 121227480, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.23632812, + "step": 5648, + "time_per_iteration": 2.882059097290039 + }, + { + "auxiliary_loss_clip": 0.0128101, + "auxiliary_loss_mlp": 0.01055305, + "balance_loss_clip": 1.16927004, + "balance_loss_mlp": 1.0303185, + "epoch": 0.3396362543213588, + "flos": 71276421358080.0, + "grad_norm": 0.791635093803938, + "language_loss": 0.56435156, + "learning_rate": 3.075780527680754e-06, + "loss": 0.58771473, + "num_input_tokens_seen": 121291305, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.25, + "step": 5649, + "time_per_iteration": 3.383288621902466 + }, + { + "auxiliary_loss_clip": 0.01482063, + "auxiliary_loss_mlp": 0.0104785, + "balance_loss_clip": 1.29292655, + "balance_loss_mlp": 1.02407932, + "epoch": 0.33969637757402676, + "flos": 25932681849600.0, + "grad_norm": 1.771498634179408, + "language_loss": 0.86186337, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.88716251, + "num_input_tokens_seen": 121312740, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.23779297, + "step": 5650, + "time_per_iteration": 2.8908934593200684 + }, + { + "auxiliary_loss_clip": 0.01473701, + "auxiliary_loss_mlp": 0.01038745, + "balance_loss_clip": 1.28701162, + "balance_loss_mlp": 1.01615441, + "epoch": 0.33975650082669473, + "flos": 35275156260480.0, + "grad_norm": 1.7491881525770792, + "language_loss": 0.71860164, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.74372607, + "num_input_tokens_seen": 121334220, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.22619629, + "step": 5651, + "time_per_iteration": 3.0218536853790283 + }, + { + "auxiliary_loss_clip": 0.01484725, + "auxiliary_loss_mlp": 0.01040968, + "balance_loss_clip": 1.29607177, + "balance_loss_mlp": 1.01654243, + "epoch": 0.3398166240793627, + "flos": 16653785662080.0, + "grad_norm": 1.8218861325226992, + "language_loss": 0.8234551, + "learning_rate": 3.074795378203616e-06, + "loss": 0.84871209, + "num_input_tokens_seen": 121351870, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.2442627, + "step": 5652, + "time_per_iteration": 2.8534984588623047 + }, + { + "auxiliary_loss_clip": 0.01494309, + "auxiliary_loss_mlp": 0.01042474, + "balance_loss_clip": 1.30264974, + "balance_loss_mlp": 1.01844144, + "epoch": 0.33987674733203066, + "flos": 24073132792320.0, + "grad_norm": 1.9158474929143987, + "language_loss": 0.78072107, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.80608892, + "num_input_tokens_seen": 121373400, + "router_z_loss_clip": 1.91601562, + "router_z_loss_mlp": 0.2401123, + "step": 5653, + "time_per_iteration": 2.889430522918701 + }, + { + "auxiliary_loss_clip": 0.01485058, + "auxiliary_loss_mlp": 0.01038848, + "balance_loss_clip": 1.29757595, + "balance_loss_mlp": 1.0155189, + "epoch": 0.3399368705846986, + "flos": 13255763783040.0, + "grad_norm": 2.664875844177617, + "language_loss": 0.86429167, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.88953066, + "num_input_tokens_seen": 121385225, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.23339844, + "step": 5654, + "time_per_iteration": 2.8123180866241455 + }, + { + "auxiliary_loss_clip": 0.01480181, + "auxiliary_loss_mlp": 0.01042881, + "balance_loss_clip": 1.29260087, + "balance_loss_mlp": 1.01784658, + "epoch": 0.3399969938373666, + "flos": 27023311209600.0, + "grad_norm": 3.1129361722910995, + "language_loss": 0.66014516, + "learning_rate": 3.073809861919351e-06, + "loss": 0.68537581, + "num_input_tokens_seen": 121404735, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.25048828, + "step": 5655, + "time_per_iteration": 2.930741310119629 + }, + { + "auxiliary_loss_clip": 0.0147668, + "auxiliary_loss_mlp": 0.01041621, + "balance_loss_clip": 1.28967953, + "balance_loss_mlp": 1.01875651, + "epoch": 0.34005711709003456, + "flos": 28562236479360.0, + "grad_norm": 1.6766612306817017, + "language_loss": 0.77004075, + "learning_rate": 3.073481275036697e-06, + "loss": 0.79522377, + "num_input_tokens_seen": 121426780, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.22875977, + "step": 5656, + "time_per_iteration": 2.9428625106811523 + }, + { + "auxiliary_loss_clip": 0.01492402, + "auxiliary_loss_mlp": 0.01042662, + "balance_loss_clip": 1.29797256, + "balance_loss_mlp": 1.0194757, + "epoch": 0.3401172403427025, + "flos": 21627136892160.0, + "grad_norm": 1.5790339475110198, + "language_loss": 0.83966804, + "learning_rate": 3.073152647447525e-06, + "loss": 0.86501873, + "num_input_tokens_seen": 121447245, + "router_z_loss_clip": 1.94335938, + "router_z_loss_mlp": 0.23205566, + "step": 5657, + "time_per_iteration": 2.966322183609009 + }, + { + "auxiliary_loss_clip": 0.01475755, + "auxiliary_loss_mlp": 0.01046593, + "balance_loss_clip": 1.28778434, + "balance_loss_mlp": 1.02256048, + "epoch": 0.3401773635953705, + "flos": 25897001685120.0, + "grad_norm": 1.7102921781068996, + "language_loss": 0.86354464, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.88876808, + "num_input_tokens_seen": 121468165, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.24035645, + "step": 5658, + "time_per_iteration": 2.9267985820770264 + }, + { + "auxiliary_loss_clip": 0.01281277, + "auxiliary_loss_mlp": 0.01057874, + "balance_loss_clip": 1.16242576, + "balance_loss_mlp": 1.03002703, + "epoch": 0.3402374868480385, + "flos": 65538629665920.0, + "grad_norm": 0.835436640682227, + "language_loss": 0.60187316, + "learning_rate": 3.072495270199477e-06, + "loss": 0.62526464, + "num_input_tokens_seen": 121523795, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.27929688, + "step": 5659, + "time_per_iteration": 3.405350685119629 + }, + { + "auxiliary_loss_clip": 0.01470867, + "auxiliary_loss_mlp": 0.01041521, + "balance_loss_clip": 1.28704429, + "balance_loss_mlp": 1.01842964, + "epoch": 0.34029761010070647, + "flos": 24071096776320.0, + "grad_norm": 2.0854100201600936, + "language_loss": 0.68604004, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.71116388, + "num_input_tokens_seen": 121542950, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.23083496, + "step": 5660, + "time_per_iteration": 2.847632646560669 + }, + { + "auxiliary_loss_clip": 0.01469714, + "auxiliary_loss_mlp": 0.01044098, + "balance_loss_clip": 1.2825048, + "balance_loss_mlp": 1.01822996, + "epoch": 0.34035773335337444, + "flos": 27611160641280.0, + "grad_norm": 1.6201914164856577, + "language_loss": 0.6807304, + "learning_rate": 3.071837730274918e-06, + "loss": 0.70586854, + "num_input_tokens_seen": 121562765, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.25866699, + "step": 5661, + "time_per_iteration": 2.9140677452087402 + }, + { + "auxiliary_loss_clip": 0.01460438, + "auxiliary_loss_mlp": 0.01040782, + "balance_loss_clip": 1.27607298, + "balance_loss_mlp": 1.01747692, + "epoch": 0.3404178566060424, + "flos": 20822310806400.0, + "grad_norm": 1.6077214553871748, + "language_loss": 0.8039422, + "learning_rate": 3.071508899340113e-06, + "loss": 0.82895446, + "num_input_tokens_seen": 121581610, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.23303223, + "step": 5662, + "time_per_iteration": 2.8768222332000732 + }, + { + "auxiliary_loss_clip": 0.01463926, + "auxiliary_loss_mlp": 0.01044226, + "balance_loss_clip": 1.27880287, + "balance_loss_mlp": 1.0183816, + "epoch": 0.34047797985871037, + "flos": 26844819897600.0, + "grad_norm": 6.659045946765524, + "language_loss": 0.74671102, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.77179253, + "num_input_tokens_seen": 121601885, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.25842285, + "step": 5663, + "time_per_iteration": 2.9183075428009033 + }, + { + "auxiliary_loss_clip": 0.01456881, + "auxiliary_loss_mlp": 0.0103685, + "balance_loss_clip": 1.27412581, + "balance_loss_mlp": 1.0137713, + "epoch": 0.34053810311137833, + "flos": 19692110229120.0, + "grad_norm": 1.6302768428766548, + "language_loss": 0.86977762, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.89471495, + "num_input_tokens_seen": 121621335, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.23071289, + "step": 5664, + "time_per_iteration": 2.8136227130889893 + }, + { + "auxiliary_loss_clip": 0.01486269, + "auxiliary_loss_mlp": 0.01042194, + "balance_loss_clip": 1.29762924, + "balance_loss_mlp": 1.01878095, + "epoch": 0.3405982263640463, + "flos": 21735715708800.0, + "grad_norm": 2.0449476594805196, + "language_loss": 0.69940042, + "learning_rate": 3.070522162795235e-06, + "loss": 0.72468507, + "num_input_tokens_seen": 121641310, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.23413086, + "step": 5665, + "time_per_iteration": 2.869515895843506 + }, + { + "auxiliary_loss_clip": 0.01486682, + "auxiliary_loss_mlp": 0.01044327, + "balance_loss_clip": 1.29728627, + "balance_loss_mlp": 1.01987743, + "epoch": 0.34065834961671426, + "flos": 18050895169920.0, + "grad_norm": 2.9142886150750584, + "language_loss": 0.74203491, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.76734495, + "num_input_tokens_seen": 121659625, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.2442627, + "step": 5666, + "time_per_iteration": 2.8744821548461914 + }, + { + "auxiliary_loss_clip": 0.01496518, + "auxiliary_loss_mlp": 0.01043648, + "balance_loss_clip": 1.30482244, + "balance_loss_mlp": 1.01944852, + "epoch": 0.3407184728693822, + "flos": 21407219326080.0, + "grad_norm": 1.4860344330328579, + "language_loss": 0.74012572, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.76552737, + "num_input_tokens_seen": 121679205, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.24182129, + "step": 5667, + "time_per_iteration": 2.89290189743042 + }, + { + "auxiliary_loss_clip": 0.01287877, + "auxiliary_loss_mlp": 0.01027863, + "balance_loss_clip": 1.17445886, + "balance_loss_mlp": 1.00020683, + "epoch": 0.3407785961220502, + "flos": 68721077479680.0, + "grad_norm": 0.843616422277839, + "language_loss": 0.63311768, + "learning_rate": 3.069535060901597e-06, + "loss": 0.65627503, + "num_input_tokens_seen": 121751085, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.27734375, + "step": 5668, + "time_per_iteration": 3.552546977996826 + }, + { + "auxiliary_loss_clip": 0.01472336, + "auxiliary_loss_mlp": 0.01042671, + "balance_loss_clip": 1.28427541, + "balance_loss_mlp": 1.01791131, + "epoch": 0.34083871937471816, + "flos": 14072534496000.0, + "grad_norm": 1.9886836776935644, + "language_loss": 0.73513985, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.76028991, + "num_input_tokens_seen": 121768565, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.24743652, + "step": 5669, + "time_per_iteration": 2.8623647689819336 + }, + { + "auxiliary_loss_clip": 0.01485754, + "auxiliary_loss_mlp": 0.0104077, + "balance_loss_clip": 1.29633951, + "balance_loss_mlp": 1.01605844, + "epoch": 0.3408988426273861, + "flos": 17092489674240.0, + "grad_norm": 1.6782448775393117, + "language_loss": 0.80917978, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.834445, + "num_input_tokens_seen": 121784925, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.24743652, + "step": 5670, + "time_per_iteration": 2.9192001819610596 + }, + { + "auxiliary_loss_clip": 0.01491173, + "auxiliary_loss_mlp": 0.01037784, + "balance_loss_clip": 1.2978456, + "balance_loss_mlp": 1.01382291, + "epoch": 0.3409589658800541, + "flos": 24035190387840.0, + "grad_norm": 1.626262312891787, + "language_loss": 0.7771191, + "learning_rate": 3.068547593996078e-06, + "loss": 0.80240864, + "num_input_tokens_seen": 121804425, + "router_z_loss_clip": 1.93261719, + "router_z_loss_mlp": 0.23974609, + "step": 5671, + "time_per_iteration": 2.915644645690918 + }, + { + "auxiliary_loss_clip": 0.01495552, + "auxiliary_loss_mlp": 0.01039148, + "balance_loss_clip": 1.30551636, + "balance_loss_mlp": 1.01436484, + "epoch": 0.34101908913272205, + "flos": 21151847819520.0, + "grad_norm": 1.7406258004641717, + "language_loss": 0.74744332, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.77279031, + "num_input_tokens_seen": 121825145, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.2479248, + "step": 5672, + "time_per_iteration": 2.8889925479888916 + }, + { + "auxiliary_loss_clip": 0.01488731, + "auxiliary_loss_mlp": 0.01037937, + "balance_loss_clip": 1.29881787, + "balance_loss_mlp": 1.01451182, + "epoch": 0.3410792123853901, + "flos": 15709496544000.0, + "grad_norm": 1.9063653933772846, + "language_loss": 0.74652445, + "learning_rate": 3.06788908010777e-06, + "loss": 0.7717911, + "num_input_tokens_seen": 121842185, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.234375, + "step": 5673, + "time_per_iteration": 2.843393087387085 + }, + { + "auxiliary_loss_clip": 0.01469866, + "auxiliary_loss_mlp": 0.0103863, + "balance_loss_clip": 1.28531384, + "balance_loss_mlp": 1.01481247, + "epoch": 0.34113933563805804, + "flos": 23045493473280.0, + "grad_norm": 1.7794753818839084, + "language_loss": 0.80820584, + "learning_rate": 3.067559762415682e-06, + "loss": 0.83329082, + "num_input_tokens_seen": 121862260, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.23803711, + "step": 5674, + "time_per_iteration": 2.862316370010376 + }, + { + "auxiliary_loss_clip": 0.01284063, + "auxiliary_loss_mlp": 0.01051494, + "balance_loss_clip": 1.17345786, + "balance_loss_mlp": 1.02593541, + "epoch": 0.341199458890726, + "flos": 69641613077760.0, + "grad_norm": 0.7913267172623748, + "language_loss": 0.56092203, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.58427751, + "num_input_tokens_seen": 121923560, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.25585938, + "step": 5675, + "time_per_iteration": 4.93659782409668 + }, + { + "auxiliary_loss_clip": 0.01459039, + "auxiliary_loss_mlp": 0.01041137, + "balance_loss_clip": 1.27603066, + "balance_loss_mlp": 1.01749802, + "epoch": 0.34125958214339397, + "flos": 22356575861760.0, + "grad_norm": 1.6167539201099466, + "language_loss": 0.80236822, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.82736999, + "num_input_tokens_seen": 121943515, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.23608398, + "step": 5676, + "time_per_iteration": 2.8936855792999268 + }, + { + "auxiliary_loss_clip": 0.01485225, + "auxiliary_loss_mlp": 0.01040371, + "balance_loss_clip": 1.29581547, + "balance_loss_mlp": 1.01632619, + "epoch": 0.34131970539606193, + "flos": 21882010705920.0, + "grad_norm": 1.7855992639328806, + "language_loss": 0.86369747, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.88895345, + "num_input_tokens_seen": 121962540, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.24047852, + "step": 5677, + "time_per_iteration": 2.8445372581481934 + }, + { + "auxiliary_loss_clip": 0.01483345, + "auxiliary_loss_mlp": 0.01038368, + "balance_loss_clip": 1.29563975, + "balance_loss_mlp": 1.01341748, + "epoch": 0.3413798286487299, + "flos": 24946333050240.0, + "grad_norm": 2.5599036953417236, + "language_loss": 0.80529404, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.83051115, + "num_input_tokens_seen": 121979830, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.24926758, + "step": 5678, + "time_per_iteration": 4.256150484085083 + }, + { + "auxiliary_loss_clip": 0.0148572, + "auxiliary_loss_mlp": 0.01039844, + "balance_loss_clip": 1.29695201, + "balance_loss_mlp": 1.01677752, + "epoch": 0.34143995190139786, + "flos": 25385308531200.0, + "grad_norm": 1.6765933931444794, + "language_loss": 0.76034236, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.78559798, + "num_input_tokens_seen": 121999055, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.23059082, + "step": 5679, + "time_per_iteration": 2.910964012145996 + }, + { + "auxiliary_loss_clip": 0.01272063, + "auxiliary_loss_mlp": 0.01025887, + "balance_loss_clip": 1.16572595, + "balance_loss_mlp": 1.00214064, + "epoch": 0.34150007515406583, + "flos": 67817038250880.0, + "grad_norm": 0.7245207366502291, + "language_loss": 0.59533644, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.61831594, + "num_input_tokens_seen": 122067015, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.23730469, + "step": 5680, + "time_per_iteration": 4.885948657989502 + }, + { + "auxiliary_loss_clip": 0.01476597, + "auxiliary_loss_mlp": 0.01040793, + "balance_loss_clip": 1.29057467, + "balance_loss_mlp": 1.01751173, + "epoch": 0.3415601984067338, + "flos": 20311929751680.0, + "grad_norm": 1.8683275951768128, + "language_loss": 0.73426342, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.75943732, + "num_input_tokens_seen": 122085295, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.23266602, + "step": 5681, + "time_per_iteration": 2.9201807975769043 + }, + { + "auxiliary_loss_clip": 0.01478079, + "auxiliary_loss_mlp": 0.01046591, + "balance_loss_clip": 1.29317081, + "balance_loss_mlp": 1.02321374, + "epoch": 0.34162032165940176, + "flos": 26042301296640.0, + "grad_norm": 2.324714334292036, + "language_loss": 0.72474849, + "learning_rate": 3.064923764577233e-06, + "loss": 0.74999517, + "num_input_tokens_seen": 122104020, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.23388672, + "step": 5682, + "time_per_iteration": 4.366451263427734 + }, + { + "auxiliary_loss_clip": 0.01486725, + "auxiliary_loss_mlp": 0.01039363, + "balance_loss_clip": 1.2999475, + "balance_loss_mlp": 1.01636744, + "epoch": 0.3416804449120697, + "flos": 28814983787520.0, + "grad_norm": 1.4587130835137663, + "language_loss": 0.8503682, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.87562907, + "num_input_tokens_seen": 122125080, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.22998047, + "step": 5683, + "time_per_iteration": 2.918466567993164 + }, + { + "auxiliary_loss_clip": 0.01494125, + "auxiliary_loss_mlp": 0.01040818, + "balance_loss_clip": 1.30516529, + "balance_loss_mlp": 1.01629686, + "epoch": 0.3417405681647377, + "flos": 22611856878720.0, + "grad_norm": 1.789634343145235, + "language_loss": 0.71960211, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.74495149, + "num_input_tokens_seen": 122146350, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.24511719, + "step": 5684, + "time_per_iteration": 2.905488967895508 + }, + { + "auxiliary_loss_clip": 0.01478977, + "auxiliary_loss_mlp": 0.01041508, + "balance_loss_clip": 1.29628515, + "balance_loss_mlp": 1.01859617, + "epoch": 0.34180069141740566, + "flos": 24726641708160.0, + "grad_norm": 3.963055919160837, + "language_loss": 0.76132274, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.78652763, + "num_input_tokens_seen": 122168085, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.22924805, + "step": 5685, + "time_per_iteration": 2.9006502628326416 + }, + { + "auxiliary_loss_clip": 0.01472371, + "auxiliary_loss_mlp": 0.01042117, + "balance_loss_clip": 1.29062247, + "balance_loss_mlp": 1.01976562, + "epoch": 0.3418608146700737, + "flos": 30530997780480.0, + "grad_norm": 1.6396355001175116, + "language_loss": 0.71397448, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.73911935, + "num_input_tokens_seen": 122191040, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.22363281, + "step": 5686, + "time_per_iteration": 2.9380719661712646 + }, + { + "auxiliary_loss_clip": 0.01492894, + "auxiliary_loss_mlp": 0.01044625, + "balance_loss_clip": 1.30431521, + "balance_loss_mlp": 1.02098608, + "epoch": 0.34192093792274164, + "flos": 15130741317120.0, + "grad_norm": 1.887302226930833, + "language_loss": 0.78336215, + "learning_rate": 3.06327495310661e-06, + "loss": 0.8087374, + "num_input_tokens_seen": 122209225, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.23632812, + "step": 5687, + "time_per_iteration": 2.882664442062378 + }, + { + "auxiliary_loss_clip": 0.01479658, + "auxiliary_loss_mlp": 0.01043132, + "balance_loss_clip": 1.29762793, + "balance_loss_mlp": 1.01838422, + "epoch": 0.3419810611754096, + "flos": 13195488430080.0, + "grad_norm": 2.265846173989185, + "language_loss": 0.88120115, + "learning_rate": 3.062945069803981e-06, + "loss": 0.90642905, + "num_input_tokens_seen": 122226160, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.24743652, + "step": 5688, + "time_per_iteration": 2.8851053714752197 + }, + { + "auxiliary_loss_clip": 0.014977, + "auxiliary_loss_mlp": 0.01047401, + "balance_loss_clip": 1.30654192, + "balance_loss_mlp": 1.02274895, + "epoch": 0.34204118442807757, + "flos": 19545589008000.0, + "grad_norm": 1.692954002554797, + "language_loss": 0.80701518, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.83246613, + "num_input_tokens_seen": 122243115, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.24658203, + "step": 5689, + "time_per_iteration": 2.8464603424072266 + }, + { + "auxiliary_loss_clip": 0.01494846, + "auxiliary_loss_mlp": 0.01045285, + "balance_loss_clip": 1.30701113, + "balance_loss_mlp": 1.02053714, + "epoch": 0.34210130768074554, + "flos": 15203142276480.0, + "grad_norm": 4.480820520340829, + "language_loss": 0.74462903, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.77003038, + "num_input_tokens_seen": 122261105, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.24719238, + "step": 5690, + "time_per_iteration": 2.8449182510375977 + }, + { + "auxiliary_loss_clip": 0.01492243, + "auxiliary_loss_mlp": 0.01040112, + "balance_loss_clip": 1.30490994, + "balance_loss_mlp": 1.01696134, + "epoch": 0.3421614309334135, + "flos": 24946966477440.0, + "grad_norm": 1.8252067677347612, + "language_loss": 0.7761569, + "learning_rate": 3.061955178104237e-06, + "loss": 0.80148047, + "num_input_tokens_seen": 122279995, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.23132324, + "step": 5691, + "time_per_iteration": 3.080873727798462 + }, + { + "auxiliary_loss_clip": 0.01473079, + "auxiliary_loss_mlp": 0.01044519, + "balance_loss_clip": 1.28906584, + "balance_loss_mlp": 1.02108216, + "epoch": 0.34222155418608147, + "flos": 21918776745600.0, + "grad_norm": 1.6027155699983473, + "language_loss": 0.69645917, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.72163516, + "num_input_tokens_seen": 122299070, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.234375, + "step": 5692, + "time_per_iteration": 2.885075092315674 + }, + { + "auxiliary_loss_clip": 0.0149018, + "auxiliary_loss_mlp": 0.01042684, + "balance_loss_clip": 1.30196989, + "balance_loss_mlp": 1.01822209, + "epoch": 0.34228167743874943, + "flos": 18123205639680.0, + "grad_norm": 2.1158466212015647, + "language_loss": 0.74178374, + "learning_rate": 3.06129504893632e-06, + "loss": 0.76711243, + "num_input_tokens_seen": 122316800, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.24475098, + "step": 5693, + "time_per_iteration": 2.8556039333343506 + }, + { + "auxiliary_loss_clip": 0.01473583, + "auxiliary_loss_mlp": 0.01047746, + "balance_loss_clip": 1.29139328, + "balance_loss_mlp": 1.02432108, + "epoch": 0.3423418006914174, + "flos": 21298640509440.0, + "grad_norm": 2.0834179777481503, + "language_loss": 0.77085865, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.79607195, + "num_input_tokens_seen": 122335275, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.23413086, + "step": 5694, + "time_per_iteration": 2.8845224380493164 + }, + { + "auxiliary_loss_clip": 0.01472868, + "auxiliary_loss_mlp": 0.01040135, + "balance_loss_clip": 1.29101706, + "balance_loss_mlp": 1.01756883, + "epoch": 0.34240192394408536, + "flos": 19831663751040.0, + "grad_norm": 1.8319180590360948, + "language_loss": 0.80729592, + "learning_rate": 3.060634758790747e-06, + "loss": 0.83242595, + "num_input_tokens_seen": 122353215, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.22570801, + "step": 5695, + "time_per_iteration": 2.9100561141967773 + }, + { + "auxiliary_loss_clip": 0.01476536, + "auxiliary_loss_mlp": 0.01041734, + "balance_loss_clip": 1.29110658, + "balance_loss_mlp": 1.01774943, + "epoch": 0.3424620471967533, + "flos": 24546340604160.0, + "grad_norm": 7.313005194273893, + "language_loss": 0.74529189, + "learning_rate": 3.060304553382635e-06, + "loss": 0.77047461, + "num_input_tokens_seen": 122372495, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.23999023, + "step": 5696, + "time_per_iteration": 2.968601703643799 + }, + { + "auxiliary_loss_clip": 0.0148179, + "auxiliary_loss_mlp": 0.01050972, + "balance_loss_clip": 1.29698968, + "balance_loss_mlp": 1.02736843, + "epoch": 0.3425221704494213, + "flos": 25860009421440.0, + "grad_norm": 1.8748157644889791, + "language_loss": 0.714957, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.74028462, + "num_input_tokens_seen": 122394600, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.23608398, + "step": 5697, + "time_per_iteration": 3.000431537628174 + }, + { + "auxiliary_loss_clip": 0.01480074, + "auxiliary_loss_mlp": 0.0103818, + "balance_loss_clip": 1.29667258, + "balance_loss_mlp": 1.01543498, + "epoch": 0.34258229370208926, + "flos": 21549713760000.0, + "grad_norm": 1.7608423269418494, + "language_loss": 0.82846022, + "learning_rate": 3.05964402195837e-06, + "loss": 0.8536427, + "num_input_tokens_seen": 122414700, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.22753906, + "step": 5698, + "time_per_iteration": 2.860071897506714 + }, + { + "auxiliary_loss_clip": 0.01494558, + "auxiliary_loss_mlp": 0.01047724, + "balance_loss_clip": 1.30705237, + "balance_loss_mlp": 1.02347636, + "epoch": 0.3426424169547573, + "flos": 23661512432640.0, + "grad_norm": 1.8674801584303693, + "language_loss": 0.70214844, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.72757125, + "num_input_tokens_seen": 122432760, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.24279785, + "step": 5699, + "time_per_iteration": 2.8833255767822266 + }, + { + "auxiliary_loss_clip": 0.01481973, + "auxiliary_loss_mlp": 0.01037753, + "balance_loss_clip": 1.2971468, + "balance_loss_mlp": 1.01513863, + "epoch": 0.34270254020742524, + "flos": 24655688582400.0, + "grad_norm": 2.2221592487681767, + "language_loss": 0.73987657, + "learning_rate": 3.058983329806877e-06, + "loss": 0.76507384, + "num_input_tokens_seen": 122449105, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.22607422, + "step": 5700, + "time_per_iteration": 2.952270984649658 + }, + { + "auxiliary_loss_clip": 0.01492169, + "auxiliary_loss_mlp": 0.01040976, + "balance_loss_clip": 1.30772364, + "balance_loss_mlp": 1.01784897, + "epoch": 0.3427626634600932, + "flos": 21006729187200.0, + "grad_norm": 1.7776962679820854, + "language_loss": 0.82386231, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.84919369, + "num_input_tokens_seen": 122468700, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.23120117, + "step": 5701, + "time_per_iteration": 2.8703997135162354 + }, + { + "auxiliary_loss_clip": 0.01486146, + "auxiliary_loss_mlp": 0.01044253, + "balance_loss_clip": 1.30032587, + "balance_loss_mlp": 1.02138877, + "epoch": 0.3428227867127612, + "flos": 21443668652160.0, + "grad_norm": 1.6643539192209136, + "language_loss": 0.72538102, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.75068498, + "num_input_tokens_seen": 122488160, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.2286377, + "step": 5702, + "time_per_iteration": 2.853252410888672 + }, + { + "auxiliary_loss_clip": 0.01251748, + "auxiliary_loss_mlp": 0.01020829, + "balance_loss_clip": 1.14721239, + "balance_loss_mlp": 1.00080156, + "epoch": 0.34288290996542914, + "flos": 55760392154880.0, + "grad_norm": 0.7799765185120751, + "language_loss": 0.57669407, + "learning_rate": 3.057991990435309e-06, + "loss": 0.59941989, + "num_input_tokens_seen": 122542890, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.20019531, + "step": 5703, + "time_per_iteration": 3.251143217086792 + }, + { + "auxiliary_loss_clip": 0.01493765, + "auxiliary_loss_mlp": 0.01043369, + "balance_loss_clip": 1.30653358, + "balance_loss_mlp": 1.01885962, + "epoch": 0.3429430332180971, + "flos": 20166358671360.0, + "grad_norm": 2.0198636519643944, + "language_loss": 0.76164848, + "learning_rate": 3.057661463723086e-06, + "loss": 0.78701979, + "num_input_tokens_seen": 122561770, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.24499512, + "step": 5704, + "time_per_iteration": 2.8523623943328857 + }, + { + "auxiliary_loss_clip": 0.01478742, + "auxiliary_loss_mlp": 0.01047902, + "balance_loss_clip": 1.29599118, + "balance_loss_mlp": 1.02428651, + "epoch": 0.34300315647076507, + "flos": 17974376933760.0, + "grad_norm": 2.0695864268058597, + "language_loss": 0.73681819, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.7620846, + "num_input_tokens_seen": 122580580, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.23632812, + "step": 5705, + "time_per_iteration": 2.860525131225586 + }, + { + "auxiliary_loss_clip": 0.01486183, + "auxiliary_loss_mlp": 0.01042138, + "balance_loss_clip": 1.30105817, + "balance_loss_mlp": 1.01749754, + "epoch": 0.34306327972343303, + "flos": 22095910713600.0, + "grad_norm": 2.044513158405664, + "language_loss": 0.80285633, + "learning_rate": 3.057000289991289e-06, + "loss": 0.82813954, + "num_input_tokens_seen": 122599810, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.24658203, + "step": 5706, + "time_per_iteration": 2.884566307067871 + }, + { + "auxiliary_loss_clip": 0.01493564, + "auxiliary_loss_mlp": 0.01041244, + "balance_loss_clip": 1.3038497, + "balance_loss_mlp": 1.01777148, + "epoch": 0.343123402976101, + "flos": 18451973491200.0, + "grad_norm": 1.8858513992844508, + "language_loss": 0.83709687, + "learning_rate": 3.056669642996787e-06, + "loss": 0.862445, + "num_input_tokens_seen": 122616035, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.23474121, + "step": 5707, + "time_per_iteration": 2.8469135761260986 + }, + { + "auxiliary_loss_clip": 0.01485376, + "auxiliary_loss_mlp": 0.01042524, + "balance_loss_clip": 1.30076635, + "balance_loss_mlp": 1.0189321, + "epoch": 0.34318352622876896, + "flos": 17172129801600.0, + "grad_norm": 1.500566844224646, + "language_loss": 0.75750196, + "learning_rate": 3.056338955933266e-06, + "loss": 0.78278095, + "num_input_tokens_seen": 122633785, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.23583984, + "step": 5708, + "time_per_iteration": 2.866607189178467 + }, + { + "auxiliary_loss_clip": 0.0147062, + "auxiliary_loss_mlp": 0.01040921, + "balance_loss_clip": 1.28808653, + "balance_loss_mlp": 1.01814008, + "epoch": 0.34324364948143693, + "flos": 26699520286080.0, + "grad_norm": 1.5235453183406196, + "language_loss": 0.81740677, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.84252214, + "num_input_tokens_seen": 122652100, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.22766113, + "step": 5709, + "time_per_iteration": 2.9027554988861084 + }, + { + "auxiliary_loss_clip": 0.01487335, + "auxiliary_loss_mlp": 0.01047784, + "balance_loss_clip": 1.3000505, + "balance_loss_mlp": 1.02284527, + "epoch": 0.3433037727341049, + "flos": 21261467266560.0, + "grad_norm": 1.9488103026208399, + "language_loss": 0.79678798, + "learning_rate": 3.055677461649329e-06, + "loss": 0.82213914, + "num_input_tokens_seen": 122669720, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.24951172, + "step": 5710, + "time_per_iteration": 4.2722327709198 + }, + { + "auxiliary_loss_clip": 0.01486733, + "auxiliary_loss_mlp": 0.01038251, + "balance_loss_clip": 1.29706621, + "balance_loss_mlp": 1.0135386, + "epoch": 0.34336389598677286, + "flos": 20638661587200.0, + "grad_norm": 1.7754777628390912, + "language_loss": 0.71572816, + "learning_rate": 3.055346654453996e-06, + "loss": 0.740978, + "num_input_tokens_seen": 122688715, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.24707031, + "step": 5711, + "time_per_iteration": 2.8505163192749023 + }, + { + "auxiliary_loss_clip": 0.01466748, + "auxiliary_loss_mlp": 0.0104221, + "balance_loss_clip": 1.28311801, + "balance_loss_mlp": 1.01811814, + "epoch": 0.3434240192394409, + "flos": 14546828183040.0, + "grad_norm": 1.7005521764814537, + "language_loss": 0.68399847, + "learning_rate": 3.055015807239812e-06, + "loss": 0.70908809, + "num_input_tokens_seen": 122706970, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.2409668, + "step": 5712, + "time_per_iteration": 2.9147815704345703 + }, + { + "auxiliary_loss_clip": 0.01261446, + "auxiliary_loss_mlp": 0.01053607, + "balance_loss_clip": 1.15592074, + "balance_loss_mlp": 1.03386617, + "epoch": 0.34348414249210885, + "flos": 58076770406400.0, + "grad_norm": 0.8506168200074447, + "language_loss": 0.58204055, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.60519111, + "num_input_tokens_seen": 122758095, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.19726562, + "step": 5713, + "time_per_iteration": 4.738008737564087 + }, + { + "auxiliary_loss_clip": 0.01487975, + "auxiliary_loss_mlp": 0.0104274, + "balance_loss_clip": 1.30220628, + "balance_loss_mlp": 1.02046025, + "epoch": 0.3435442657447768, + "flos": 20714229682560.0, + "grad_norm": 1.6290913914929215, + "language_loss": 0.81837648, + "learning_rate": 3.054353992805076e-06, + "loss": 0.8436836, + "num_input_tokens_seen": 122777815, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.22290039, + "step": 5714, + "time_per_iteration": 2.8812825679779053 + }, + { + "auxiliary_loss_clip": 0.01482389, + "auxiliary_loss_mlp": 0.01036236, + "balance_loss_clip": 1.29707336, + "balance_loss_mlp": 1.01432502, + "epoch": 0.3436043889974448, + "flos": 22940669975040.0, + "grad_norm": 2.3356131706779584, + "language_loss": 0.72924566, + "learning_rate": 3.05402302560962e-06, + "loss": 0.75443196, + "num_input_tokens_seen": 122797555, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.21923828, + "step": 5715, + "time_per_iteration": 2.8737645149230957 + }, + { + "auxiliary_loss_clip": 0.01258677, + "auxiliary_loss_mlp": 0.01027838, + "balance_loss_clip": 1.15124726, + "balance_loss_mlp": 1.00647545, + "epoch": 0.34366451225011274, + "flos": 58434675909120.0, + "grad_norm": 0.9101243748589947, + "language_loss": 0.66107607, + "learning_rate": 3.053692018445505e-06, + "loss": 0.68394125, + "num_input_tokens_seen": 122863955, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.21386719, + "step": 5716, + "time_per_iteration": 4.772834062576294 + }, + { + "auxiliary_loss_clip": 0.01458134, + "auxiliary_loss_mlp": 0.01041274, + "balance_loss_clip": 1.27784848, + "balance_loss_mlp": 1.01869535, + "epoch": 0.3437246355027807, + "flos": 15604718290560.0, + "grad_norm": 1.8812413620421808, + "language_loss": 0.74749094, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.77248502, + "num_input_tokens_seen": 122883000, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.22583008, + "step": 5717, + "time_per_iteration": 2.862823724746704 + }, + { + "auxiliary_loss_clip": 0.01475337, + "auxiliary_loss_mlp": 0.01036268, + "balance_loss_clip": 1.29166365, + "balance_loss_mlp": 1.01405931, + "epoch": 0.34378475875544867, + "flos": 27683606845440.0, + "grad_norm": 1.94888562481502, + "language_loss": 0.76475859, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.78987461, + "num_input_tokens_seen": 122903265, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.2220459, + "step": 5718, + "time_per_iteration": 4.343626976013184 + }, + { + "auxiliary_loss_clip": 0.01477319, + "auxiliary_loss_mlp": 0.01041697, + "balance_loss_clip": 1.2904973, + "balance_loss_mlp": 1.01816475, + "epoch": 0.34384488200811664, + "flos": 31444131214080.0, + "grad_norm": 1.9107268943222022, + "language_loss": 0.647008, + "learning_rate": 3.052698757266734e-06, + "loss": 0.67219818, + "num_input_tokens_seen": 122923860, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.23535156, + "step": 5719, + "time_per_iteration": 2.952838182449341 + }, + { + "auxiliary_loss_clip": 0.0147778, + "auxiliary_loss_mlp": 0.01040657, + "balance_loss_clip": 1.29100251, + "balance_loss_mlp": 1.01673198, + "epoch": 0.3439050052607846, + "flos": 24910698130560.0, + "grad_norm": 1.797083032639574, + "language_loss": 0.74605501, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.7712394, + "num_input_tokens_seen": 122945305, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.23937988, + "step": 5720, + "time_per_iteration": 2.915485382080078 + }, + { + "auxiliary_loss_clip": 0.01479269, + "auxiliary_loss_mlp": 0.01037501, + "balance_loss_clip": 1.29412198, + "balance_loss_mlp": 1.01481509, + "epoch": 0.34396512851345257, + "flos": 18159112028160.0, + "grad_norm": 1.6701827873180657, + "language_loss": 0.74628866, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.77145636, + "num_input_tokens_seen": 122962535, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.22692871, + "step": 5721, + "time_per_iteration": 2.8276572227478027 + }, + { + "auxiliary_loss_clip": 0.01479699, + "auxiliary_loss_mlp": 0.01044796, + "balance_loss_clip": 1.29534388, + "balance_loss_mlp": 1.02219427, + "epoch": 0.34402525176612053, + "flos": 16043060344320.0, + "grad_norm": 2.316545739366996, + "language_loss": 0.8126182, + "learning_rate": 3.051705136821992e-06, + "loss": 0.83786315, + "num_input_tokens_seen": 122979750, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.22619629, + "step": 5722, + "time_per_iteration": 2.846906900405884 + }, + { + "auxiliary_loss_clip": 0.01467925, + "auxiliary_loss_mlp": 0.01038837, + "balance_loss_clip": 1.28573656, + "balance_loss_mlp": 1.01754594, + "epoch": 0.3440853750187885, + "flos": 21188523369600.0, + "grad_norm": 1.7429046550709189, + "language_loss": 0.82292736, + "learning_rate": 3.051373850228801e-06, + "loss": 0.84799504, + "num_input_tokens_seen": 122998955, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.21313477, + "step": 5723, + "time_per_iteration": 2.8604414463043213 + }, + { + "auxiliary_loss_clip": 0.01476312, + "auxiliary_loss_mlp": 0.0104546, + "balance_loss_clip": 1.2917031, + "balance_loss_mlp": 1.01862597, + "epoch": 0.34414549827145646, + "flos": 12685650312960.0, + "grad_norm": 2.2805962533966566, + "language_loss": 0.82440972, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.84962749, + "num_input_tokens_seen": 123016165, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.26818848, + "step": 5724, + "time_per_iteration": 2.8361175060272217 + }, + { + "auxiliary_loss_clip": 0.01469977, + "auxiliary_loss_mlp": 0.01040462, + "balance_loss_clip": 1.28581595, + "balance_loss_mlp": 1.01781249, + "epoch": 0.3442056215241244, + "flos": 31296840831360.0, + "grad_norm": 1.8619518198957918, + "language_loss": 0.69955611, + "learning_rate": 3.05071115745038e-06, + "loss": 0.72466052, + "num_input_tokens_seen": 123036900, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.22631836, + "step": 5725, + "time_per_iteration": 2.9467291831970215 + }, + { + "auxiliary_loss_clip": 0.01498043, + "auxiliary_loss_mlp": 0.01043411, + "balance_loss_clip": 1.30954003, + "balance_loss_mlp": 1.01933074, + "epoch": 0.34426574477679245, + "flos": 23377609440000.0, + "grad_norm": 1.4170113584557122, + "language_loss": 0.70602894, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.73144346, + "num_input_tokens_seen": 123057480, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.24084473, + "step": 5726, + "time_per_iteration": 2.882237672805786 + }, + { + "auxiliary_loss_clip": 0.01491839, + "auxiliary_loss_mlp": 0.01044712, + "balance_loss_clip": 1.30476427, + "balance_loss_mlp": 1.02327776, + "epoch": 0.3443258680294604, + "flos": 24546204869760.0, + "grad_norm": 1.9208719122408142, + "language_loss": 0.73868763, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.76405317, + "num_input_tokens_seen": 123076890, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.21435547, + "step": 5727, + "time_per_iteration": 2.9142730236053467 + }, + { + "auxiliary_loss_clip": 0.01481104, + "auxiliary_loss_mlp": 0.01046624, + "balance_loss_clip": 1.29666114, + "balance_loss_mlp": 1.02490401, + "epoch": 0.3443859912821284, + "flos": 20239800261120.0, + "grad_norm": 2.0590785042808957, + "language_loss": 0.89226019, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.91753745, + "num_input_tokens_seen": 123092530, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.21740723, + "step": 5728, + "time_per_iteration": 2.8307361602783203 + }, + { + "auxiliary_loss_clip": 0.01479638, + "auxiliary_loss_mlp": 0.01047157, + "balance_loss_clip": 1.29584014, + "balance_loss_mlp": 1.02504396, + "epoch": 0.34444611453479634, + "flos": 24327237444480.0, + "grad_norm": 2.097829760254631, + "language_loss": 0.71353674, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.73880464, + "num_input_tokens_seen": 123110560, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.22119141, + "step": 5729, + "time_per_iteration": 2.8734922409057617 + }, + { + "auxiliary_loss_clip": 0.01472183, + "auxiliary_loss_mlp": 0.01043413, + "balance_loss_clip": 1.28995001, + "balance_loss_mlp": 1.02016675, + "epoch": 0.3445062377874643, + "flos": 16992371635200.0, + "grad_norm": 1.8736306421339783, + "language_loss": 0.75093538, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.77609134, + "num_input_tokens_seen": 123128655, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.23242188, + "step": 5730, + "time_per_iteration": 2.8149631023406982 + }, + { + "auxiliary_loss_clip": 0.01478406, + "auxiliary_loss_mlp": 0.0104519, + "balance_loss_clip": 1.29514623, + "balance_loss_mlp": 1.02171791, + "epoch": 0.3445663610401323, + "flos": 20312291710080.0, + "grad_norm": 3.7489692215942574, + "language_loss": 0.81116503, + "learning_rate": 3.048722123283578e-06, + "loss": 0.83640105, + "num_input_tokens_seen": 123145130, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.23486328, + "step": 5731, + "time_per_iteration": 2.9419095516204834 + }, + { + "auxiliary_loss_clip": 0.01487835, + "auxiliary_loss_mlp": 0.01040859, + "balance_loss_clip": 1.30267119, + "balance_loss_mlp": 1.01940119, + "epoch": 0.34462648429280024, + "flos": 15896584368000.0, + "grad_norm": 2.8000631531280664, + "language_loss": 0.79265344, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.81794035, + "num_input_tokens_seen": 123162265, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.21459961, + "step": 5732, + "time_per_iteration": 2.8550000190734863 + }, + { + "auxiliary_loss_clip": 0.01253171, + "auxiliary_loss_mlp": 0.01081038, + "balance_loss_clip": 1.14761519, + "balance_loss_mlp": 1.06043899, + "epoch": 0.3446866075454682, + "flos": 59341520315520.0, + "grad_norm": 0.7611188657171202, + "language_loss": 0.5359503, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.55929232, + "num_input_tokens_seen": 123218620, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.20605469, + "step": 5733, + "time_per_iteration": 3.372556686401367 + }, + { + "auxiliary_loss_clip": 0.01483317, + "auxiliary_loss_mlp": 0.01042522, + "balance_loss_clip": 1.29688585, + "balance_loss_mlp": 1.01862073, + "epoch": 0.34474673079813617, + "flos": 22353363480960.0, + "grad_norm": 2.6901644518049017, + "language_loss": 0.84003592, + "learning_rate": 3.047727069167207e-06, + "loss": 0.86529422, + "num_input_tokens_seen": 123237325, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.2388916, + "step": 5734, + "time_per_iteration": 2.871182441711426 + }, + { + "auxiliary_loss_clip": 0.01490691, + "auxiliary_loss_mlp": 0.01040991, + "balance_loss_clip": 1.30347383, + "balance_loss_mlp": 1.01719689, + "epoch": 0.34480685405080413, + "flos": 27680937402240.0, + "grad_norm": 1.7684844875088794, + "language_loss": 0.94077313, + "learning_rate": 3.0473953049851478e-06, + "loss": 0.9660899, + "num_input_tokens_seen": 123258650, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.23791504, + "step": 5735, + "time_per_iteration": 2.885958671569824 + }, + { + "auxiliary_loss_clip": 0.01497156, + "auxiliary_loss_mlp": 0.01038845, + "balance_loss_clip": 1.30963016, + "balance_loss_mlp": 1.01624274, + "epoch": 0.3448669773034721, + "flos": 22466331043200.0, + "grad_norm": 1.7490086777712723, + "language_loss": 0.7769556, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.80231559, + "num_input_tokens_seen": 123277155, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.22595215, + "step": 5736, + "time_per_iteration": 2.839071750640869 + }, + { + "auxiliary_loss_clip": 0.0148963, + "auxiliary_loss_mlp": 0.01038165, + "balance_loss_clip": 1.30303907, + "balance_loss_mlp": 1.01496649, + "epoch": 0.34492710055614006, + "flos": 24946921232640.0, + "grad_norm": 1.4812093884193605, + "language_loss": 0.79298025, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.81825823, + "num_input_tokens_seen": 123297640, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.23217773, + "step": 5737, + "time_per_iteration": 2.8906710147857666 + }, + { + "auxiliary_loss_clip": 0.0150445, + "auxiliary_loss_mlp": 0.01038833, + "balance_loss_clip": 1.31295419, + "balance_loss_mlp": 1.01543236, + "epoch": 0.34498722380880803, + "flos": 20130497527680.0, + "grad_norm": 1.961297394370401, + "language_loss": 0.72684181, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.75227463, + "num_input_tokens_seen": 123314370, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.23376465, + "step": 5738, + "time_per_iteration": 2.8215975761413574 + }, + { + "auxiliary_loss_clip": 0.01491585, + "auxiliary_loss_mlp": 0.01037108, + "balance_loss_clip": 1.30378127, + "balance_loss_mlp": 1.01437485, + "epoch": 0.34504734706147605, + "flos": 28449585630720.0, + "grad_norm": 1.7324288078210495, + "language_loss": 0.82168853, + "learning_rate": 3.046067851209389e-06, + "loss": 0.84697545, + "num_input_tokens_seen": 123336085, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.22753906, + "step": 5739, + "time_per_iteration": 2.896411895751953 + }, + { + "auxiliary_loss_clip": 0.0149123, + "auxiliary_loss_mlp": 0.01040675, + "balance_loss_clip": 1.30521858, + "balance_loss_mlp": 1.01683354, + "epoch": 0.345107470314144, + "flos": 22684710286080.0, + "grad_norm": 1.9173009141619501, + "language_loss": 0.84159964, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.86691868, + "num_input_tokens_seen": 123354460, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.23828125, + "step": 5740, + "time_per_iteration": 2.861753463745117 + }, + { + "auxiliary_loss_clip": 0.0149362, + "auxiliary_loss_mlp": 0.01038826, + "balance_loss_clip": 1.30902338, + "balance_loss_mlp": 1.0158906, + "epoch": 0.345167593566812, + "flos": 20640290400000.0, + "grad_norm": 1.9726559805583839, + "language_loss": 0.78051275, + "learning_rate": 3.045403886269181e-06, + "loss": 0.80583715, + "num_input_tokens_seen": 123373420, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.22924805, + "step": 5741, + "time_per_iteration": 2.8760039806365967 + }, + { + "auxiliary_loss_clip": 0.01493842, + "auxiliary_loss_mlp": 0.0103768, + "balance_loss_clip": 1.30385518, + "balance_loss_mlp": 1.0148989, + "epoch": 0.34522771681947995, + "flos": 26225724291840.0, + "grad_norm": 2.2623598862933076, + "language_loss": 0.77624166, + "learning_rate": 3.045071844330053e-06, + "loss": 0.80155694, + "num_input_tokens_seen": 123394730, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.22753906, + "step": 5742, + "time_per_iteration": 2.992497682571411 + }, + { + "auxiliary_loss_clip": 0.01480088, + "auxiliary_loss_mlp": 0.01038265, + "balance_loss_clip": 1.29656124, + "balance_loss_mlp": 1.01452994, + "epoch": 0.3452878400721479, + "flos": 19071657279360.0, + "grad_norm": 2.1069843493224414, + "language_loss": 0.7715745, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.79675806, + "num_input_tokens_seen": 123412895, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.23730469, + "step": 5743, + "time_per_iteration": 2.888165235519409 + }, + { + "auxiliary_loss_clip": 0.01484311, + "auxiliary_loss_mlp": 0.01042854, + "balance_loss_clip": 1.30026627, + "balance_loss_mlp": 1.01946545, + "epoch": 0.3453479633248159, + "flos": 27941693040000.0, + "grad_norm": 1.5291127370394664, + "language_loss": 0.71237713, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.73764884, + "num_input_tokens_seen": 123432320, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.23413086, + "step": 5744, + "time_per_iteration": 2.9155471324920654 + }, + { + "auxiliary_loss_clip": 0.01470856, + "auxiliary_loss_mlp": 0.0104009, + "balance_loss_clip": 1.28907025, + "balance_loss_mlp": 1.01735651, + "epoch": 0.34540808657748384, + "flos": 19614641852160.0, + "grad_norm": 1.6596394436697124, + "language_loss": 0.80640578, + "learning_rate": 3.044075480787665e-06, + "loss": 0.83151525, + "num_input_tokens_seen": 123450980, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.22729492, + "step": 5745, + "time_per_iteration": 4.210557699203491 + }, + { + "auxiliary_loss_clip": 0.01494639, + "auxiliary_loss_mlp": 0.01037636, + "balance_loss_clip": 1.30725682, + "balance_loss_mlp": 1.01472449, + "epoch": 0.3454682098301518, + "flos": 20421141995520.0, + "grad_norm": 1.8715411327252032, + "language_loss": 0.90165317, + "learning_rate": 3.043743280407182e-06, + "loss": 0.92697597, + "num_input_tokens_seen": 123469365, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.22900391, + "step": 5746, + "time_per_iteration": 2.834477186203003 + }, + { + "auxiliary_loss_clip": 0.01509693, + "auxiliary_loss_mlp": 0.01041619, + "balance_loss_clip": 1.31929672, + "balance_loss_mlp": 1.01764607, + "epoch": 0.34552833308281977, + "flos": 21335180325120.0, + "grad_norm": 1.9068009637882408, + "language_loss": 0.66481394, + "learning_rate": 3.043411040447849e-06, + "loss": 0.69032705, + "num_input_tokens_seen": 123489425, + "router_z_loss_clip": 1.90136719, + "router_z_loss_mlp": 0.23986816, + "step": 5747, + "time_per_iteration": 2.8847992420196533 + }, + { + "auxiliary_loss_clip": 0.01487599, + "auxiliary_loss_mlp": 0.01040896, + "balance_loss_clip": 1.30365694, + "balance_loss_mlp": 1.018628, + "epoch": 0.34558845633548774, + "flos": 36256482887040.0, + "grad_norm": 1.4837471102806254, + "language_loss": 0.73322713, + "learning_rate": 3.043078760922264e-06, + "loss": 0.75851202, + "num_input_tokens_seen": 123509970, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.22277832, + "step": 5748, + "time_per_iteration": 4.436941385269165 + }, + { + "auxiliary_loss_clip": 0.01471668, + "auxiliary_loss_mlp": 0.01034874, + "balance_loss_clip": 1.29126644, + "balance_loss_mlp": 1.01324904, + "epoch": 0.3456485795881557, + "flos": 22459453833600.0, + "grad_norm": 1.5430070985842557, + "language_loss": 0.76527661, + "learning_rate": 3.042746441843029e-06, + "loss": 0.79034197, + "num_input_tokens_seen": 123531055, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.21630859, + "step": 5749, + "time_per_iteration": 2.8982269763946533 + }, + { + "auxiliary_loss_clip": 0.01255633, + "auxiliary_loss_mlp": 0.01022735, + "balance_loss_clip": 1.14954615, + "balance_loss_mlp": 1.00328052, + "epoch": 0.34570870284082367, + "flos": 62033748272640.0, + "grad_norm": 0.881660689215939, + "language_loss": 0.62774605, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.65052974, + "num_input_tokens_seen": 123584720, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.19433594, + "step": 5750, + "time_per_iteration": 4.687304496765137 + }, + { + "auxiliary_loss_clip": 0.01471462, + "auxiliary_loss_mlp": 0.01039599, + "balance_loss_clip": 1.29214263, + "balance_loss_mlp": 1.01663947, + "epoch": 0.34576882609349163, + "flos": 22792022248320.0, + "grad_norm": 1.6810878772500837, + "language_loss": 0.81789094, + "learning_rate": 3.042081685074012e-06, + "loss": 0.84300154, + "num_input_tokens_seen": 123604465, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.22961426, + "step": 5751, + "time_per_iteration": 2.9845314025878906 + }, + { + "auxiliary_loss_clip": 0.01478686, + "auxiliary_loss_mlp": 0.01039919, + "balance_loss_clip": 1.29653907, + "balance_loss_mlp": 1.01655436, + "epoch": 0.34582894934615965, + "flos": 12356339523840.0, + "grad_norm": 2.202883925539081, + "language_loss": 0.85703117, + "learning_rate": 3.041749247409439e-06, + "loss": 0.88221729, + "num_input_tokens_seen": 123622320, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.23352051, + "step": 5752, + "time_per_iteration": 2.8346107006073 + }, + { + "auxiliary_loss_clip": 0.01252876, + "auxiliary_loss_mlp": 0.01022806, + "balance_loss_clip": 1.14784169, + "balance_loss_mlp": 1.00239706, + "epoch": 0.3458890725988276, + "flos": 70196813746560.0, + "grad_norm": 0.7377490357669844, + "language_loss": 0.63167304, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.65442985, + "num_input_tokens_seen": 123678010, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.20410156, + "step": 5753, + "time_per_iteration": 4.633998155593872 + }, + { + "auxiliary_loss_clip": 0.01481393, + "auxiliary_loss_mlp": 0.01036236, + "balance_loss_clip": 1.29796886, + "balance_loss_mlp": 1.01384842, + "epoch": 0.3459491958514956, + "flos": 17101493389440.0, + "grad_norm": 1.7617481685003882, + "language_loss": 0.72330743, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.74848366, + "num_input_tokens_seen": 123696830, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.22375488, + "step": 5754, + "time_per_iteration": 2.9359426498413086 + }, + { + "auxiliary_loss_clip": 0.01509707, + "auxiliary_loss_mlp": 0.01038837, + "balance_loss_clip": 1.31998038, + "balance_loss_mlp": 1.01600814, + "epoch": 0.34600931910416355, + "flos": 16658762590080.0, + "grad_norm": 1.7876717986096986, + "language_loss": 0.73788029, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.76336575, + "num_input_tokens_seen": 123714360, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.22814941, + "step": 5755, + "time_per_iteration": 2.8450236320495605 + }, + { + "auxiliary_loss_clip": 0.01487707, + "auxiliary_loss_mlp": 0.01042558, + "balance_loss_clip": 1.30407655, + "balance_loss_mlp": 1.01956201, + "epoch": 0.3460694423568315, + "flos": 38560663025280.0, + "grad_norm": 2.1304945593061726, + "language_loss": 0.72793311, + "learning_rate": 3.040419101844869e-06, + "loss": 0.7532357, + "num_input_tokens_seen": 123739250, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.2298584, + "step": 5756, + "time_per_iteration": 3.019719362258911 + }, + { + "auxiliary_loss_clip": 0.01254974, + "auxiliary_loss_mlp": 0.0102118, + "balance_loss_clip": 1.14960492, + "balance_loss_mlp": 0.99934065, + "epoch": 0.3461295656094995, + "flos": 72115389285120.0, + "grad_norm": 0.7590716851070733, + "language_loss": 0.62631851, + "learning_rate": 3.040086466790207e-06, + "loss": 0.6490801, + "num_input_tokens_seen": 123802845, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.21875, + "step": 5757, + "time_per_iteration": 3.3244504928588867 + }, + { + "auxiliary_loss_clip": 0.01256386, + "auxiliary_loss_mlp": 0.01033862, + "balance_loss_clip": 1.14938974, + "balance_loss_mlp": 1.01326299, + "epoch": 0.34618968886216744, + "flos": 65487747248640.0, + "grad_norm": 0.8260900918288759, + "language_loss": 0.59337348, + "learning_rate": 3.039753792295362e-06, + "loss": 0.61627597, + "num_input_tokens_seen": 123861805, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.20605469, + "step": 5758, + "time_per_iteration": 3.332646131515503 + }, + { + "auxiliary_loss_clip": 0.01473741, + "auxiliary_loss_mlp": 0.01036873, + "balance_loss_clip": 1.2941525, + "balance_loss_mlp": 1.01484311, + "epoch": 0.3462498121148354, + "flos": 23481799511040.0, + "grad_norm": 1.6515240148667716, + "language_loss": 0.72738576, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.75249183, + "num_input_tokens_seen": 123881820, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.22009277, + "step": 5759, + "time_per_iteration": 3.0097854137420654 + }, + { + "auxiliary_loss_clip": 0.01477409, + "auxiliary_loss_mlp": 0.01041788, + "balance_loss_clip": 1.29610896, + "balance_loss_mlp": 1.01839876, + "epoch": 0.3463099353675034, + "flos": 24181530629760.0, + "grad_norm": 1.6936561280311544, + "language_loss": 0.84536099, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.87055296, + "num_input_tokens_seen": 123903700, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.23376465, + "step": 5760, + "time_per_iteration": 3.0449624061584473 + }, + { + "auxiliary_loss_clip": 0.01255624, + "auxiliary_loss_mlp": 0.01020015, + "balance_loss_clip": 1.14809585, + "balance_loss_mlp": 0.99884397, + "epoch": 0.34637005862017134, + "flos": 63725692032000.0, + "grad_norm": 0.8393070658327962, + "language_loss": 0.56715119, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.58990759, + "num_input_tokens_seen": 123960075, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.21191406, + "step": 5761, + "time_per_iteration": 3.3759799003601074 + }, + { + "auxiliary_loss_clip": 0.01468697, + "auxiliary_loss_mlp": 0.01041337, + "balance_loss_clip": 1.28823292, + "balance_loss_mlp": 1.01795959, + "epoch": 0.3464301818728393, + "flos": 13151709446400.0, + "grad_norm": 2.4827338971348127, + "language_loss": 0.95889151, + "learning_rate": 3.038422700166474e-06, + "loss": 0.98399192, + "num_input_tokens_seen": 123975805, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.23364258, + "step": 5762, + "time_per_iteration": 2.887610912322998 + }, + { + "auxiliary_loss_clip": 0.01495655, + "auxiliary_loss_mlp": 0.01040201, + "balance_loss_clip": 1.30729938, + "balance_loss_mlp": 1.01727724, + "epoch": 0.34649030512550727, + "flos": 29327219879040.0, + "grad_norm": 1.5401867044969124, + "language_loss": 0.70618534, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.7315439, + "num_input_tokens_seen": 123997530, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.22912598, + "step": 5763, + "time_per_iteration": 2.9568610191345215 + }, + { + "auxiliary_loss_clip": 0.01504963, + "auxiliary_loss_mlp": 0.01043207, + "balance_loss_clip": 1.31436515, + "balance_loss_mlp": 1.01783943, + "epoch": 0.34655042837817523, + "flos": 23740971580800.0, + "grad_norm": 2.727962863978151, + "language_loss": 0.84262002, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.86810178, + "num_input_tokens_seen": 124016375, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.2532959, + "step": 5764, + "time_per_iteration": 2.9098241329193115 + }, + { + "auxiliary_loss_clip": 0.01475918, + "auxiliary_loss_mlp": 0.01037434, + "balance_loss_clip": 1.29402208, + "balance_loss_mlp": 1.01442695, + "epoch": 0.34661055163084326, + "flos": 22064302581120.0, + "grad_norm": 2.5936655139975104, + "language_loss": 0.69419324, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.71932667, + "num_input_tokens_seen": 124033975, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.23034668, + "step": 5765, + "time_per_iteration": 2.8823161125183105 + }, + { + "auxiliary_loss_clip": 0.01493383, + "auxiliary_loss_mlp": 0.01042762, + "balance_loss_clip": 1.31128132, + "balance_loss_mlp": 1.01906347, + "epoch": 0.3466706748835112, + "flos": 21809338277760.0, + "grad_norm": 2.036136529250617, + "language_loss": 0.77542233, + "learning_rate": 3.03709097800413e-06, + "loss": 0.80078375, + "num_input_tokens_seen": 124051930, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.23718262, + "step": 5766, + "time_per_iteration": 2.872681140899658 + }, + { + "auxiliary_loss_clip": 0.01482722, + "auxiliary_loss_mlp": 0.01043207, + "balance_loss_clip": 1.30078423, + "balance_loss_mlp": 1.02074814, + "epoch": 0.3467307981361792, + "flos": 19470563850240.0, + "grad_norm": 2.2476449288716096, + "language_loss": 0.7443248, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.76958406, + "num_input_tokens_seen": 124071220, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.2244873, + "step": 5767, + "time_per_iteration": 2.876159191131592 + }, + { + "auxiliary_loss_clip": 0.01483492, + "auxiliary_loss_mlp": 0.01046324, + "balance_loss_clip": 1.30066299, + "balance_loss_mlp": 1.02195799, + "epoch": 0.34679092138884715, + "flos": 24838342416000.0, + "grad_norm": 1.9472699982290949, + "language_loss": 0.79156113, + "learning_rate": 3.036424880912893e-06, + "loss": 0.81685925, + "num_input_tokens_seen": 124090140, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.24365234, + "step": 5768, + "time_per_iteration": 2.9208943843841553 + }, + { + "auxiliary_loss_clip": 0.01257133, + "auxiliary_loss_mlp": 0.01033426, + "balance_loss_clip": 1.15064418, + "balance_loss_mlp": 1.01378059, + "epoch": 0.3468510446415151, + "flos": 63263569196160.0, + "grad_norm": 0.7752184876031226, + "language_loss": 0.57570904, + "learning_rate": 3.036091773408956e-06, + "loss": 0.59861463, + "num_input_tokens_seen": 124152025, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.19628906, + "step": 5769, + "time_per_iteration": 3.504361391067505 + }, + { + "auxiliary_loss_clip": 0.01522073, + "auxiliary_loss_mlp": 0.01044683, + "balance_loss_clip": 1.32983327, + "balance_loss_mlp": 1.01989913, + "epoch": 0.3469111678941831, + "flos": 12125789429760.0, + "grad_norm": 3.1377386917143704, + "language_loss": 0.88190985, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.9075774, + "num_input_tokens_seen": 124165795, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.2479248, + "step": 5770, + "time_per_iteration": 2.9669764041900635 + }, + { + "auxiliary_loss_clip": 0.01267378, + "auxiliary_loss_mlp": 0.0103698, + "balance_loss_clip": 1.15830183, + "balance_loss_mlp": 1.0112313, + "epoch": 0.34697129114685105, + "flos": 65961814711680.0, + "grad_norm": 0.7661166109854397, + "language_loss": 0.59807628, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.62111986, + "num_input_tokens_seen": 124222925, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.2578125, + "step": 5771, + "time_per_iteration": 3.1558072566986084 + }, + { + "auxiliary_loss_clip": 0.0149202, + "auxiliary_loss_mlp": 0.01048643, + "balance_loss_clip": 1.3084197, + "balance_loss_mlp": 1.02440763, + "epoch": 0.347031414399519, + "flos": 34466167653120.0, + "grad_norm": 1.901584474939687, + "language_loss": 0.7296192, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.7550258, + "num_input_tokens_seen": 124240915, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.2421875, + "step": 5772, + "time_per_iteration": 2.9714596271514893 + }, + { + "auxiliary_loss_clip": 0.01480477, + "auxiliary_loss_mlp": 0.01042887, + "balance_loss_clip": 1.29645634, + "balance_loss_mlp": 1.01719737, + "epoch": 0.347091537652187, + "flos": 26955706199040.0, + "grad_norm": 1.5316851163332588, + "language_loss": 0.77748036, + "learning_rate": 3.034758950632507e-06, + "loss": 0.80271399, + "num_input_tokens_seen": 124262770, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.25720215, + "step": 5773, + "time_per_iteration": 2.9201712608337402 + }, + { + "auxiliary_loss_clip": 0.0148869, + "auxiliary_loss_mlp": 0.01046321, + "balance_loss_clip": 1.30181086, + "balance_loss_mlp": 1.02134633, + "epoch": 0.34715166090485494, + "flos": 21152119288320.0, + "grad_norm": 2.558925385116559, + "language_loss": 0.71099335, + "learning_rate": 3.034425646811396e-06, + "loss": 0.73634338, + "num_input_tokens_seen": 124280950, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.24963379, + "step": 5774, + "time_per_iteration": 2.860607147216797 + }, + { + "auxiliary_loss_clip": 0.01480174, + "auxiliary_loss_mlp": 0.01049925, + "balance_loss_clip": 1.29843497, + "balance_loss_mlp": 1.02423525, + "epoch": 0.3472117841575229, + "flos": 23488676720640.0, + "grad_norm": 1.6686673785489854, + "language_loss": 0.77330315, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.79860413, + "num_input_tokens_seen": 124299540, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.25683594, + "step": 5775, + "time_per_iteration": 2.8855786323547363 + }, + { + "auxiliary_loss_clip": 0.01495811, + "auxiliary_loss_mlp": 0.01045644, + "balance_loss_clip": 1.3060503, + "balance_loss_mlp": 1.02069342, + "epoch": 0.34727190741019087, + "flos": 17501531080320.0, + "grad_norm": 2.039570194456537, + "language_loss": 0.80256343, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.82797796, + "num_input_tokens_seen": 124316285, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.24963379, + "step": 5776, + "time_per_iteration": 2.824976682662964 + }, + { + "auxiliary_loss_clip": 0.01268954, + "auxiliary_loss_mlp": 0.01048512, + "balance_loss_clip": 1.15978956, + "balance_loss_mlp": 1.01780379, + "epoch": 0.34733203066285884, + "flos": 65299410552960.0, + "grad_norm": 0.8452380062567858, + "language_loss": 0.63427079, + "learning_rate": 3.033425500045478e-06, + "loss": 0.65744543, + "num_input_tokens_seen": 124376650, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.30664062, + "step": 5777, + "time_per_iteration": 3.3874659538269043 + }, + { + "auxiliary_loss_clip": 0.01496779, + "auxiliary_loss_mlp": 0.01046212, + "balance_loss_clip": 1.30945683, + "balance_loss_mlp": 1.02115452, + "epoch": 0.3473921539155268, + "flos": 28670679561600.0, + "grad_norm": 1.7539914791184454, + "language_loss": 0.65548873, + "learning_rate": 3.033092039398119e-06, + "loss": 0.68091869, + "num_input_tokens_seen": 124396475, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.25048828, + "step": 5778, + "time_per_iteration": 2.90110182762146 + }, + { + "auxiliary_loss_clip": 0.01490535, + "auxiliary_loss_mlp": 0.01046025, + "balance_loss_clip": 1.3016398, + "balance_loss_mlp": 1.02143192, + "epoch": 0.3474522771681948, + "flos": 40849912379520.0, + "grad_norm": 1.7736775741039934, + "language_loss": 0.73450303, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.75986862, + "num_input_tokens_seen": 124416480, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.24572754, + "step": 5779, + "time_per_iteration": 4.484851837158203 + }, + { + "auxiliary_loss_clip": 0.01500397, + "auxiliary_loss_mlp": 0.01047412, + "balance_loss_clip": 1.31059682, + "balance_loss_mlp": 1.02209198, + "epoch": 0.3475124004208628, + "flos": 24619420235520.0, + "grad_norm": 32.956691510034204, + "language_loss": 0.63438809, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.65986621, + "num_input_tokens_seen": 124435950, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.2532959, + "step": 5780, + "time_per_iteration": 2.9484384059906006 + }, + { + "auxiliary_loss_clip": 0.0149508, + "auxiliary_loss_mlp": 0.01040457, + "balance_loss_clip": 1.30964625, + "balance_loss_mlp": 1.01660347, + "epoch": 0.34757252367353075, + "flos": 22721657304960.0, + "grad_norm": 1.6981513809640765, + "language_loss": 0.72209764, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.74745303, + "num_input_tokens_seen": 124455410, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.23876953, + "step": 5781, + "time_per_iteration": 2.9209048748016357 + }, + { + "auxiliary_loss_clip": 0.01483963, + "auxiliary_loss_mlp": 0.01039828, + "balance_loss_clip": 1.29662383, + "balance_loss_mlp": 1.0136373, + "epoch": 0.3476326469261987, + "flos": 19837590819840.0, + "grad_norm": 1.9442807314373183, + "language_loss": 0.78121793, + "learning_rate": 3.031757805185612e-06, + "loss": 0.80645579, + "num_input_tokens_seen": 124474870, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.2623291, + "step": 5782, + "time_per_iteration": 2.8724257946014404 + }, + { + "auxiliary_loss_clip": 0.0146792, + "auxiliary_loss_mlp": 0.0104068, + "balance_loss_clip": 1.28341043, + "balance_loss_mlp": 1.01549149, + "epoch": 0.3476927701788667, + "flos": 19947662714880.0, + "grad_norm": 2.377371544962961, + "language_loss": 0.6413368, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.66642278, + "num_input_tokens_seen": 124494105, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.2520752, + "step": 5783, + "time_per_iteration": 2.863662004470825 + }, + { + "auxiliary_loss_clip": 0.01464458, + "auxiliary_loss_mlp": 0.01039716, + "balance_loss_clip": 1.28381038, + "balance_loss_mlp": 1.01596951, + "epoch": 0.34775289343153465, + "flos": 20743394595840.0, + "grad_norm": 1.9806170003858385, + "language_loss": 0.88741064, + "learning_rate": 3.031090453282605e-06, + "loss": 0.9124524, + "num_input_tokens_seen": 124512030, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.23742676, + "step": 5784, + "time_per_iteration": 4.320150852203369 + }, + { + "auxiliary_loss_clip": 0.01471972, + "auxiliary_loss_mlp": 0.01037895, + "balance_loss_clip": 1.28986144, + "balance_loss_mlp": 1.01274228, + "epoch": 0.3478130166842026, + "flos": 19364383008000.0, + "grad_norm": 1.5676845720706745, + "language_loss": 0.82079482, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.8458935, + "num_input_tokens_seen": 124530980, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.25134277, + "step": 5785, + "time_per_iteration": 4.486806392669678 + }, + { + "auxiliary_loss_clip": 0.014731, + "auxiliary_loss_mlp": 0.0104366, + "balance_loss_clip": 1.28898692, + "balance_loss_mlp": 1.01903188, + "epoch": 0.3478731399368706, + "flos": 22060502017920.0, + "grad_norm": 1.9797706371229755, + "language_loss": 0.81560671, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.8407743, + "num_input_tokens_seen": 124549330, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.24645996, + "step": 5786, + "time_per_iteration": 2.8967173099517822 + }, + { + "auxiliary_loss_clip": 0.01464623, + "auxiliary_loss_mlp": 0.01041694, + "balance_loss_clip": 1.28421998, + "balance_loss_mlp": 1.01568222, + "epoch": 0.34793326318953854, + "flos": 18050668945920.0, + "grad_norm": 1.5498976683645185, + "language_loss": 0.75718582, + "learning_rate": 3.030089132216836e-06, + "loss": 0.78224897, + "num_input_tokens_seen": 124567200, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.2598877, + "step": 5787, + "time_per_iteration": 2.828270196914673 + }, + { + "auxiliary_loss_clip": 0.01478903, + "auxiliary_loss_mlp": 0.01037697, + "balance_loss_clip": 1.29383111, + "balance_loss_mlp": 1.01323485, + "epoch": 0.3479933864422065, + "flos": 29326586451840.0, + "grad_norm": 1.694649206176931, + "language_loss": 0.82095355, + "learning_rate": 3.029755280389203e-06, + "loss": 0.84611952, + "num_input_tokens_seen": 124587025, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.24438477, + "step": 5788, + "time_per_iteration": 4.362186908721924 + }, + { + "auxiliary_loss_clip": 0.01514335, + "auxiliary_loss_mlp": 0.01044075, + "balance_loss_clip": 1.32314348, + "balance_loss_mlp": 1.0197562, + "epoch": 0.3480535096948745, + "flos": 20130361793280.0, + "grad_norm": 1.8760531155038436, + "language_loss": 0.8639853, + "learning_rate": 3.029421389513147e-06, + "loss": 0.8895694, + "num_input_tokens_seen": 124605860, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.24316406, + "step": 5789, + "time_per_iteration": 2.8584489822387695 + }, + { + "auxiliary_loss_clip": 0.01493981, + "auxiliary_loss_mlp": 0.0104168, + "balance_loss_clip": 1.30630028, + "balance_loss_mlp": 1.017802, + "epoch": 0.34811363294754244, + "flos": 18557475661440.0, + "grad_norm": 1.6973234789267944, + "language_loss": 0.85362184, + "learning_rate": 3.029087459601328e-06, + "loss": 0.87897849, + "num_input_tokens_seen": 124624270, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.23864746, + "step": 5790, + "time_per_iteration": 2.85018253326416 + }, + { + "auxiliary_loss_clip": 0.01487966, + "auxiliary_loss_mlp": 0.01040977, + "balance_loss_clip": 1.30260825, + "balance_loss_mlp": 1.01547849, + "epoch": 0.3481737562002104, + "flos": 26881631182080.0, + "grad_norm": 1.9785380606081096, + "language_loss": 0.82511878, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.8504082, + "num_input_tokens_seen": 124644005, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.25512695, + "step": 5791, + "time_per_iteration": 2.8835055828094482 + }, + { + "auxiliary_loss_clip": 0.01500249, + "auxiliary_loss_mlp": 0.01041456, + "balance_loss_clip": 1.30972266, + "balance_loss_mlp": 1.01695848, + "epoch": 0.3482338794528784, + "flos": 28919264348160.0, + "grad_norm": 1.6887311194472383, + "language_loss": 0.78363359, + "learning_rate": 3.028419482721056e-06, + "loss": 0.80905068, + "num_input_tokens_seen": 124663020, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.24499512, + "step": 5792, + "time_per_iteration": 2.9002573490142822 + }, + { + "auxiliary_loss_clip": 0.01493428, + "auxiliary_loss_mlp": 0.01035852, + "balance_loss_clip": 1.30720329, + "balance_loss_mlp": 1.01246309, + "epoch": 0.3482940027055464, + "flos": 22211050026240.0, + "grad_norm": 1.5549442868245533, + "language_loss": 0.82656598, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.85185885, + "num_input_tokens_seen": 124682975, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.23400879, + "step": 5793, + "time_per_iteration": 2.848363161087036 + }, + { + "auxiliary_loss_clip": 0.01493271, + "auxiliary_loss_mlp": 0.01051171, + "balance_loss_clip": 1.3034327, + "balance_loss_mlp": 1.02633929, + "epoch": 0.34835412595821436, + "flos": 20312291710080.0, + "grad_norm": 1.7874065512553536, + "language_loss": 0.77206039, + "learning_rate": 3.027751349849706e-06, + "loss": 0.79750484, + "num_input_tokens_seen": 124701340, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.24829102, + "step": 5794, + "time_per_iteration": 2.9665229320526123 + }, + { + "auxiliary_loss_clip": 0.01479203, + "auxiliary_loss_mlp": 0.0104186, + "balance_loss_clip": 1.2937237, + "balance_loss_mlp": 1.01779163, + "epoch": 0.3484142492108823, + "flos": 20459582092800.0, + "grad_norm": 1.8032035988739517, + "language_loss": 0.58212912, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.60733974, + "num_input_tokens_seen": 124719165, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.24084473, + "step": 5795, + "time_per_iteration": 2.8366358280181885 + }, + { + "auxiliary_loss_clip": 0.01486186, + "auxiliary_loss_mlp": 0.01044407, + "balance_loss_clip": 1.30176485, + "balance_loss_mlp": 1.02207971, + "epoch": 0.3484743724635503, + "flos": 24363641525760.0, + "grad_norm": 1.9301709124779443, + "language_loss": 0.83530235, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.86060822, + "num_input_tokens_seen": 124738670, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.22314453, + "step": 5796, + "time_per_iteration": 2.9322516918182373 + }, + { + "auxiliary_loss_clip": 0.0147506, + "auxiliary_loss_mlp": 0.0104234, + "balance_loss_clip": 1.29412782, + "balance_loss_mlp": 1.01828325, + "epoch": 0.34853449571621825, + "flos": 24363732015360.0, + "grad_norm": 1.596500129474699, + "language_loss": 0.84293962, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.86811364, + "num_input_tokens_seen": 124758760, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.24060059, + "step": 5797, + "time_per_iteration": 2.8796439170837402 + }, + { + "auxiliary_loss_clip": 0.0147896, + "auxiliary_loss_mlp": 0.01043451, + "balance_loss_clip": 1.29545295, + "balance_loss_mlp": 1.01937103, + "epoch": 0.3485946189688862, + "flos": 27278094533760.0, + "grad_norm": 1.7835352488005007, + "language_loss": 0.74103093, + "learning_rate": 3.026414616539167e-06, + "loss": 0.76625502, + "num_input_tokens_seen": 124777765, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.24072266, + "step": 5798, + "time_per_iteration": 2.9132137298583984 + }, + { + "auxiliary_loss_clip": 0.0149655, + "auxiliary_loss_mlp": 0.01044543, + "balance_loss_clip": 1.30791879, + "balance_loss_mlp": 1.02093935, + "epoch": 0.3486547422215542, + "flos": 20166222936960.0, + "grad_norm": 1.9918282048014941, + "language_loss": 0.76994634, + "learning_rate": 3.026080335875485e-06, + "loss": 0.79535723, + "num_input_tokens_seen": 124796775, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.23583984, + "step": 5799, + "time_per_iteration": 2.8885583877563477 + }, + { + "auxiliary_loss_clip": 0.01496371, + "auxiliary_loss_mlp": 0.0104079, + "balance_loss_clip": 1.30955791, + "balance_loss_mlp": 1.01713884, + "epoch": 0.34871486547422215, + "flos": 20240071729920.0, + "grad_norm": 1.63881968724741, + "language_loss": 0.76706201, + "learning_rate": 3.025746016302734e-06, + "loss": 0.79243362, + "num_input_tokens_seen": 124815825, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.2364502, + "step": 5800, + "time_per_iteration": 2.8590195178985596 + }, + { + "auxiliary_loss_clip": 0.01498623, + "auxiliary_loss_mlp": 0.01040377, + "balance_loss_clip": 1.30994153, + "balance_loss_mlp": 1.01536691, + "epoch": 0.3487749887268901, + "flos": 44068538050560.0, + "grad_norm": 1.7612909833439905, + "language_loss": 0.6818192, + "learning_rate": 3.025411657833591e-06, + "loss": 0.70720923, + "num_input_tokens_seen": 124838420, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.25036621, + "step": 5801, + "time_per_iteration": 3.0323290824890137 + }, + { + "auxiliary_loss_clip": 0.01482374, + "auxiliary_loss_mlp": 0.01042476, + "balance_loss_clip": 1.29918075, + "balance_loss_mlp": 1.01881242, + "epoch": 0.3488351119795581, + "flos": 23305660928640.0, + "grad_norm": 1.9521044323633399, + "language_loss": 0.77486813, + "learning_rate": 3.025077260480735e-06, + "loss": 0.80011666, + "num_input_tokens_seen": 124857320, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.23657227, + "step": 5802, + "time_per_iteration": 2.830148458480835 + }, + { + "auxiliary_loss_clip": 0.01469966, + "auxiliary_loss_mlp": 0.01044489, + "balance_loss_clip": 1.29152417, + "balance_loss_mlp": 1.0206356, + "epoch": 0.34889523523222604, + "flos": 19943545438080.0, + "grad_norm": 2.3019190832190857, + "language_loss": 0.79316026, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.81830484, + "num_input_tokens_seen": 124875685, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.23852539, + "step": 5803, + "time_per_iteration": 2.8416848182678223 + }, + { + "auxiliary_loss_clip": 0.01500886, + "auxiliary_loss_mlp": 0.01041407, + "balance_loss_clip": 1.31144166, + "balance_loss_mlp": 1.01785135, + "epoch": 0.348955358484894, + "flos": 30458506331520.0, + "grad_norm": 3.6035080563630553, + "language_loss": 0.6936658, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.71908879, + "num_input_tokens_seen": 124895960, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.2355957, + "step": 5804, + "time_per_iteration": 2.9415807723999023 + }, + { + "auxiliary_loss_clip": 0.01473141, + "auxiliary_loss_mlp": 0.01047198, + "balance_loss_clip": 1.29288268, + "balance_loss_mlp": 1.0233444, + "epoch": 0.349015481737562, + "flos": 18007885347840.0, + "grad_norm": 1.7682332235216662, + "language_loss": 0.7746222, + "learning_rate": 3.024073835246702e-06, + "loss": 0.79982555, + "num_input_tokens_seen": 124914140, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.23852539, + "step": 5805, + "time_per_iteration": 2.7975432872772217 + }, + { + "auxiliary_loss_clip": 0.01494502, + "auxiliary_loss_mlp": 0.01046769, + "balance_loss_clip": 1.30901206, + "balance_loss_mlp": 1.02198565, + "epoch": 0.34907560499023, + "flos": 27209856096000.0, + "grad_norm": 2.0578394169969014, + "language_loss": 0.68848068, + "learning_rate": 3.023739282485814e-06, + "loss": 0.71389341, + "num_input_tokens_seen": 124934180, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.24780273, + "step": 5806, + "time_per_iteration": 2.9179153442382812 + }, + { + "auxiliary_loss_clip": 0.01488631, + "auxiliary_loss_mlp": 0.0104295, + "balance_loss_clip": 1.30316353, + "balance_loss_mlp": 1.01845288, + "epoch": 0.34913572824289796, + "flos": 30239041213440.0, + "grad_norm": 1.6596084849070676, + "language_loss": 0.72746509, + "learning_rate": 3.023404690904629e-06, + "loss": 0.75278091, + "num_input_tokens_seen": 124956060, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.24511719, + "step": 5807, + "time_per_iteration": 2.912523031234741 + }, + { + "auxiliary_loss_clip": 0.01501593, + "auxiliary_loss_mlp": 0.01041588, + "balance_loss_clip": 1.31259882, + "balance_loss_mlp": 1.01703084, + "epoch": 0.3491958514955659, + "flos": 29984122154880.0, + "grad_norm": 2.051775504452474, + "language_loss": 0.75620878, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.78164065, + "num_input_tokens_seen": 124976070, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.24560547, + "step": 5808, + "time_per_iteration": 2.985081434249878 + }, + { + "auxiliary_loss_clip": 0.01474004, + "auxiliary_loss_mlp": 0.0104301, + "balance_loss_clip": 1.29572821, + "balance_loss_mlp": 1.02067018, + "epoch": 0.3492559747482339, + "flos": 22793379592320.0, + "grad_norm": 1.715538195900244, + "language_loss": 0.84631151, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.87148166, + "num_input_tokens_seen": 124996995, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.22338867, + "step": 5809, + "time_per_iteration": 2.8417210578918457 + }, + { + "auxiliary_loss_clip": 0.01464614, + "auxiliary_loss_mlp": 0.01038759, + "balance_loss_clip": 1.28584957, + "balance_loss_mlp": 1.01509571, + "epoch": 0.34931609800090185, + "flos": 26079248315520.0, + "grad_norm": 2.895152867168358, + "language_loss": 0.82106841, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.84610212, + "num_input_tokens_seen": 125015600, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.2364502, + "step": 5810, + "time_per_iteration": 2.8990931510925293 + }, + { + "auxiliary_loss_clip": 0.01479292, + "auxiliary_loss_mlp": 0.01040569, + "balance_loss_clip": 1.29439056, + "balance_loss_mlp": 1.01613069, + "epoch": 0.3493762212535698, + "flos": 29254004513280.0, + "grad_norm": 3.343551757252438, + "language_loss": 0.76501614, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.79021478, + "num_input_tokens_seen": 125035290, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.24462891, + "step": 5811, + "time_per_iteration": 2.9344804286956787 + }, + { + "auxiliary_loss_clip": 0.01480274, + "auxiliary_loss_mlp": 0.01042218, + "balance_loss_clip": 1.29622912, + "balance_loss_mlp": 1.01631355, + "epoch": 0.3494363445062378, + "flos": 27137545626240.0, + "grad_norm": 1.6274871532126767, + "language_loss": 0.80928564, + "learning_rate": 3.021731151138386e-06, + "loss": 0.83451056, + "num_input_tokens_seen": 125057130, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.25927734, + "step": 5812, + "time_per_iteration": 2.873048782348633 + }, + { + "auxiliary_loss_clip": 0.01483859, + "auxiliary_loss_mlp": 0.0104091, + "balance_loss_clip": 1.29849386, + "balance_loss_mlp": 1.01656711, + "epoch": 0.34949646775890575, + "flos": 12283802830080.0, + "grad_norm": 2.0223834592284087, + "language_loss": 0.70383334, + "learning_rate": 3.021396326901918e-06, + "loss": 0.72908103, + "num_input_tokens_seen": 125073720, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.24328613, + "step": 5813, + "time_per_iteration": 2.833826780319214 + }, + { + "auxiliary_loss_clip": 0.01460506, + "auxiliary_loss_mlp": 0.01039383, + "balance_loss_clip": 1.27924597, + "balance_loss_mlp": 1.01482606, + "epoch": 0.3495565910115737, + "flos": 17174437286400.0, + "grad_norm": 5.260107837109985, + "language_loss": 0.77650338, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.80150229, + "num_input_tokens_seen": 125090635, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.2454834, + "step": 5814, + "time_per_iteration": 4.304182291030884 + }, + { + "auxiliary_loss_clip": 0.01483242, + "auxiliary_loss_mlp": 0.0104634, + "balance_loss_clip": 1.29707599, + "balance_loss_mlp": 1.01997113, + "epoch": 0.3496167142642417, + "flos": 26476028380800.0, + "grad_norm": 1.4970898519533578, + "language_loss": 0.85383606, + "learning_rate": 3.020726562247328e-06, + "loss": 0.87913179, + "num_input_tokens_seen": 125110070, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.26342773, + "step": 5815, + "time_per_iteration": 2.866560220718384 + }, + { + "auxiliary_loss_clip": 0.01482687, + "auxiliary_loss_mlp": 0.01043538, + "balance_loss_clip": 1.29687619, + "balance_loss_mlp": 1.01907635, + "epoch": 0.34967683751690964, + "flos": 17422252911360.0, + "grad_norm": 1.9954745982641804, + "language_loss": 0.77968359, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.80494583, + "num_input_tokens_seen": 125125730, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.24450684, + "step": 5816, + "time_per_iteration": 2.821442127227783 + }, + { + "auxiliary_loss_clip": 0.0149818, + "auxiliary_loss_mlp": 0.01046592, + "balance_loss_clip": 1.30964196, + "balance_loss_mlp": 1.02195108, + "epoch": 0.3497369607695776, + "flos": 22609866107520.0, + "grad_norm": 2.436867859210501, + "language_loss": 0.60047019, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.62591791, + "num_input_tokens_seen": 125146195, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.24633789, + "step": 5817, + "time_per_iteration": 2.8496599197387695 + }, + { + "auxiliary_loss_clip": 0.01258697, + "auxiliary_loss_mlp": 0.01053349, + "balance_loss_clip": 1.15017128, + "balance_loss_mlp": 1.02855384, + "epoch": 0.34979708402224563, + "flos": 68558992047360.0, + "grad_norm": 0.8881255823635612, + "language_loss": 0.59858012, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.62170064, + "num_input_tokens_seen": 125207790, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.24707031, + "step": 5818, + "time_per_iteration": 3.3916971683502197 + }, + { + "auxiliary_loss_clip": 0.01476813, + "auxiliary_loss_mlp": 0.01043411, + "balance_loss_clip": 1.29482925, + "balance_loss_mlp": 1.01648188, + "epoch": 0.3498572072749136, + "flos": 18998984851200.0, + "grad_norm": 1.7159791302729104, + "language_loss": 0.84057033, + "learning_rate": 3.019386568567123e-06, + "loss": 0.8657726, + "num_input_tokens_seen": 125226220, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.26928711, + "step": 5819, + "time_per_iteration": 4.2695276737213135 + }, + { + "auxiliary_loss_clip": 0.01483072, + "auxiliary_loss_mlp": 0.01041391, + "balance_loss_clip": 1.29838967, + "balance_loss_mlp": 1.0163089, + "epoch": 0.34991733052758156, + "flos": 27830444780160.0, + "grad_norm": 4.371674132950581, + "language_loss": 0.71639347, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.74163806, + "num_input_tokens_seen": 125247485, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.25097656, + "step": 5820, + "time_per_iteration": 4.367142200469971 + }, + { + "auxiliary_loss_clip": 0.01483584, + "auxiliary_loss_mlp": 0.01037264, + "balance_loss_clip": 1.2975173, + "balance_loss_mlp": 1.01304078, + "epoch": 0.3499774537802495, + "flos": 33597401385600.0, + "grad_norm": 1.5898853106826576, + "language_loss": 0.70515692, + "learning_rate": 3.018716339744759e-06, + "loss": 0.7303654, + "num_input_tokens_seen": 125268625, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.24230957, + "step": 5821, + "time_per_iteration": 2.9403603076934814 + }, + { + "auxiliary_loss_clip": 0.01487364, + "auxiliary_loss_mlp": 0.01041227, + "balance_loss_clip": 1.298931, + "balance_loss_mlp": 1.01545393, + "epoch": 0.3500375770329175, + "flos": 23487093152640.0, + "grad_norm": 2.0849954271644107, + "language_loss": 0.74694699, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.77223295, + "num_input_tokens_seen": 125287530, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.25793457, + "step": 5822, + "time_per_iteration": 2.8512659072875977 + }, + { + "auxiliary_loss_clip": 0.01480565, + "auxiliary_loss_mlp": 0.01045298, + "balance_loss_clip": 1.29296947, + "balance_loss_mlp": 1.01939428, + "epoch": 0.35009770028558546, + "flos": 19035388932480.0, + "grad_norm": 1.670891566374546, + "language_loss": 0.78921872, + "learning_rate": 3.018045956403094e-06, + "loss": 0.81447732, + "num_input_tokens_seen": 125307020, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.25915527, + "step": 5823, + "time_per_iteration": 4.203794717788696 + }, + { + "auxiliary_loss_clip": 0.01265647, + "auxiliary_loss_mlp": 0.01046057, + "balance_loss_clip": 1.15590394, + "balance_loss_mlp": 1.01420426, + "epoch": 0.3501578235382534, + "flos": 68382717730560.0, + "grad_norm": 0.7157619799083322, + "language_loss": 0.59281552, + "learning_rate": 3.017710706819298e-06, + "loss": 0.61593252, + "num_input_tokens_seen": 125370445, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.31835938, + "step": 5824, + "time_per_iteration": 3.379009485244751 + }, + { + "auxiliary_loss_clip": 0.01474157, + "auxiliary_loss_mlp": 0.01047685, + "balance_loss_clip": 1.28828335, + "balance_loss_mlp": 1.02019548, + "epoch": 0.3502179467909214, + "flos": 21260607615360.0, + "grad_norm": 1.8142939836322582, + "language_loss": 0.85409707, + "learning_rate": 3.017375418643811e-06, + "loss": 0.87931556, + "num_input_tokens_seen": 125388900, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.27514648, + "step": 5825, + "time_per_iteration": 2.9218318462371826 + }, + { + "auxiliary_loss_clip": 0.01482748, + "auxiliary_loss_mlp": 0.01046928, + "balance_loss_clip": 1.29733169, + "balance_loss_mlp": 1.01993871, + "epoch": 0.35027807004358935, + "flos": 11949198399360.0, + "grad_norm": 3.3022007426212667, + "language_loss": 0.84061795, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.8659147, + "num_input_tokens_seen": 125402675, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.27001953, + "step": 5826, + "time_per_iteration": 2.9781434535980225 + }, + { + "auxiliary_loss_clip": 0.01489121, + "auxiliary_loss_mlp": 0.01045607, + "balance_loss_clip": 1.30203605, + "balance_loss_mlp": 1.0175209, + "epoch": 0.3503381932962573, + "flos": 21480977629440.0, + "grad_norm": 2.0039370448866283, + "language_loss": 0.81436312, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.83971041, + "num_input_tokens_seen": 125421360, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.28063965, + "step": 5827, + "time_per_iteration": 2.879667043685913 + }, + { + "auxiliary_loss_clip": 0.01481975, + "auxiliary_loss_mlp": 0.01047294, + "balance_loss_clip": 1.29873323, + "balance_loss_mlp": 1.02096033, + "epoch": 0.3503983165489253, + "flos": 21260879084160.0, + "grad_norm": 1.9744332845597816, + "language_loss": 0.71906567, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.74435842, + "num_input_tokens_seen": 125440000, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.26318359, + "step": 5828, + "time_per_iteration": 2.867096424102783 + }, + { + "auxiliary_loss_clip": 0.01485485, + "auxiliary_loss_mlp": 0.010508, + "balance_loss_clip": 1.29890347, + "balance_loss_mlp": 1.02135491, + "epoch": 0.35045843980159325, + "flos": 27826463237760.0, + "grad_norm": 1.8491423749502007, + "language_loss": 0.80347228, + "learning_rate": 3.016033880279248e-06, + "loss": 0.82883507, + "num_input_tokens_seen": 125460390, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.29467773, + "step": 5829, + "time_per_iteration": 2.9218502044677734 + }, + { + "auxiliary_loss_clip": 0.01491771, + "auxiliary_loss_mlp": 0.01047084, + "balance_loss_clip": 1.30132842, + "balance_loss_mlp": 1.01847351, + "epoch": 0.3505185630542612, + "flos": 25932093667200.0, + "grad_norm": 2.0167690593499983, + "language_loss": 0.72743106, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.7528196, + "num_input_tokens_seen": 125478410, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.28637695, + "step": 5830, + "time_per_iteration": 2.939810276031494 + }, + { + "auxiliary_loss_clip": 0.01474814, + "auxiliary_loss_mlp": 0.01048712, + "balance_loss_clip": 1.2900331, + "balance_loss_mlp": 1.02048349, + "epoch": 0.35057868630692923, + "flos": 20531440114560.0, + "grad_norm": 3.050356908586686, + "language_loss": 0.89690924, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.92214447, + "num_input_tokens_seen": 125495975, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.2824707, + "step": 5831, + "time_per_iteration": 2.828636646270752 + }, + { + "auxiliary_loss_clip": 0.01486628, + "auxiliary_loss_mlp": 0.0104428, + "balance_loss_clip": 1.30059195, + "balance_loss_mlp": 1.01891232, + "epoch": 0.3506388095595972, + "flos": 20458269993600.0, + "grad_norm": 4.087055671865954, + "language_loss": 0.79430199, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.81961101, + "num_input_tokens_seen": 125515035, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.25378418, + "step": 5832, + "time_per_iteration": 2.856583833694458 + }, + { + "auxiliary_loss_clip": 0.01494621, + "auxiliary_loss_mlp": 0.01045174, + "balance_loss_clip": 1.30464816, + "balance_loss_mlp": 1.01885295, + "epoch": 0.35069893281226516, + "flos": 23119342266240.0, + "grad_norm": 3.0278512129681956, + "language_loss": 0.71996754, + "learning_rate": 3.014691725465008e-06, + "loss": 0.7453655, + "num_input_tokens_seen": 125535555, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.26342773, + "step": 5833, + "time_per_iteration": 2.853951930999756 + }, + { + "auxiliary_loss_clip": 0.01466049, + "auxiliary_loss_mlp": 0.01041642, + "balance_loss_clip": 1.28521836, + "balance_loss_mlp": 1.01608336, + "epoch": 0.35075905606493313, + "flos": 27283523909760.0, + "grad_norm": 1.4390089222419462, + "language_loss": 0.81538808, + "learning_rate": 3.014356090536606e-06, + "loss": 0.84046507, + "num_input_tokens_seen": 125558195, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.25561523, + "step": 5834, + "time_per_iteration": 2.890451431274414 + }, + { + "auxiliary_loss_clip": 0.0147622, + "auxiliary_loss_mlp": 0.01045197, + "balance_loss_clip": 1.28947341, + "balance_loss_mlp": 1.01736212, + "epoch": 0.3508191793176011, + "flos": 19136185643520.0, + "grad_norm": 2.946875217344705, + "language_loss": 0.84043133, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.86564553, + "num_input_tokens_seen": 125575375, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.27832031, + "step": 5835, + "time_per_iteration": 2.836390733718872 + }, + { + "auxiliary_loss_clip": 0.01474219, + "auxiliary_loss_mlp": 0.01043773, + "balance_loss_clip": 1.28975415, + "balance_loss_mlp": 1.01788032, + "epoch": 0.35087930257026906, + "flos": 25568776771200.0, + "grad_norm": 2.5390373631838283, + "language_loss": 0.77832675, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.80350661, + "num_input_tokens_seen": 125596745, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.2590332, + "step": 5836, + "time_per_iteration": 2.87524676322937 + }, + { + "auxiliary_loss_clip": 0.01470562, + "auxiliary_loss_mlp": 0.0103751, + "balance_loss_clip": 1.28864169, + "balance_loss_mlp": 1.01112843, + "epoch": 0.350939425822937, + "flos": 18013269479040.0, + "grad_norm": 2.1148938738832506, + "language_loss": 0.78671002, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.81179076, + "num_input_tokens_seen": 125613980, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.26403809, + "step": 5837, + "time_per_iteration": 2.8404996395111084 + }, + { + "auxiliary_loss_clip": 0.01472947, + "auxiliary_loss_mlp": 0.01043339, + "balance_loss_clip": 1.28831744, + "balance_loss_mlp": 1.01704192, + "epoch": 0.350999549075605, + "flos": 22283360496000.0, + "grad_norm": 2.0071055511052713, + "language_loss": 0.68766999, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.71283281, + "num_input_tokens_seen": 125632100, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.26306152, + "step": 5838, + "time_per_iteration": 2.8751027584075928 + }, + { + "auxiliary_loss_clip": 0.01470116, + "auxiliary_loss_mlp": 0.01044397, + "balance_loss_clip": 1.28693306, + "balance_loss_mlp": 1.01728916, + "epoch": 0.35105967232827295, + "flos": 14400397451520.0, + "grad_norm": 2.519553506472713, + "language_loss": 0.84130007, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.86644518, + "num_input_tokens_seen": 125649190, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.27111816, + "step": 5839, + "time_per_iteration": 2.918013334274292 + }, + { + "auxiliary_loss_clip": 0.01483034, + "auxiliary_loss_mlp": 0.0104553, + "balance_loss_clip": 1.29301751, + "balance_loss_mlp": 1.01925635, + "epoch": 0.3511197955809409, + "flos": 25093170984960.0, + "grad_norm": 1.6951662623978128, + "language_loss": 0.6016863, + "learning_rate": 3.012341473657572e-06, + "loss": 0.62697196, + "num_input_tokens_seen": 125668680, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.26257324, + "step": 5840, + "time_per_iteration": 2.962437868118286 + }, + { + "auxiliary_loss_clip": 0.01485443, + "auxiliary_loss_mlp": 0.01044976, + "balance_loss_clip": 1.29742503, + "balance_loss_mlp": 1.02059734, + "epoch": 0.3511799188336089, + "flos": 25894965669120.0, + "grad_norm": 2.164954471130695, + "language_loss": 0.89077485, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.91607904, + "num_input_tokens_seen": 125686935, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.24401855, + "step": 5841, + "time_per_iteration": 2.928478240966797 + }, + { + "auxiliary_loss_clip": 0.01501013, + "auxiliary_loss_mlp": 0.01048324, + "balance_loss_clip": 1.30873382, + "balance_loss_mlp": 1.02042866, + "epoch": 0.35124004208627685, + "flos": 20093233795200.0, + "grad_norm": 2.289754685816453, + "language_loss": 0.76500303, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.79049635, + "num_input_tokens_seen": 125707180, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.27905273, + "step": 5842, + "time_per_iteration": 2.9654524326324463 + }, + { + "auxiliary_loss_clip": 0.01488667, + "auxiliary_loss_mlp": 0.01041194, + "balance_loss_clip": 1.3003006, + "balance_loss_mlp": 1.0152657, + "epoch": 0.3513001653389448, + "flos": 17791904079360.0, + "grad_norm": 3.014944404441442, + "language_loss": 0.70112145, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.72642004, + "num_input_tokens_seen": 125722780, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.25927734, + "step": 5843, + "time_per_iteration": 2.831038475036621 + }, + { + "auxiliary_loss_clip": 0.01488446, + "auxiliary_loss_mlp": 0.01045825, + "balance_loss_clip": 1.30245638, + "balance_loss_mlp": 1.01961112, + "epoch": 0.3513602885916128, + "flos": 29398942166400.0, + "grad_norm": 2.0044682392108526, + "language_loss": 0.66020375, + "learning_rate": 3.010997627806655e-06, + "loss": 0.68554646, + "num_input_tokens_seen": 125742110, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.26220703, + "step": 5844, + "time_per_iteration": 2.8809878826141357 + }, + { + "auxiliary_loss_clip": 0.014896, + "auxiliary_loss_mlp": 0.01055587, + "balance_loss_clip": 1.30238056, + "balance_loss_mlp": 1.03021884, + "epoch": 0.3514204118442808, + "flos": 16188405200640.0, + "grad_norm": 4.163975009452507, + "language_loss": 0.76056373, + "learning_rate": 3.010661570469245e-06, + "loss": 0.78601551, + "num_input_tokens_seen": 125759980, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.25354004, + "step": 5845, + "time_per_iteration": 2.8514504432678223 + }, + { + "auxiliary_loss_clip": 0.01470979, + "auxiliary_loss_mlp": 0.0105628, + "balance_loss_clip": 1.28844428, + "balance_loss_mlp": 1.03073359, + "epoch": 0.35148053509694877, + "flos": 23843306615040.0, + "grad_norm": 3.1248103295757184, + "language_loss": 0.73524076, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.76051337, + "num_input_tokens_seen": 125772660, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.25549316, + "step": 5846, + "time_per_iteration": 2.817150115966797 + }, + { + "auxiliary_loss_clip": 0.01491046, + "auxiliary_loss_mlp": 0.0105785, + "balance_loss_clip": 1.30248725, + "balance_loss_mlp": 1.03237522, + "epoch": 0.35154065834961673, + "flos": 20999851977600.0, + "grad_norm": 1.9776035468846587, + "language_loss": 0.76152563, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.7870146, + "num_input_tokens_seen": 125791935, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.25500488, + "step": 5847, + "time_per_iteration": 2.8636062145233154 + }, + { + "auxiliary_loss_clip": 0.01493066, + "auxiliary_loss_mlp": 0.010554, + "balance_loss_clip": 1.30288398, + "balance_loss_mlp": 1.03022325, + "epoch": 0.3516007816022847, + "flos": 33268497799680.0, + "grad_norm": 2.0901566579614648, + "language_loss": 0.733239, + "learning_rate": 3.009653168561666e-06, + "loss": 0.75872368, + "num_input_tokens_seen": 125813455, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.25195312, + "step": 5848, + "time_per_iteration": 2.9602010250091553 + }, + { + "auxiliary_loss_clip": 0.01487957, + "auxiliary_loss_mlp": 0.01059564, + "balance_loss_clip": 1.29860187, + "balance_loss_mlp": 1.03180003, + "epoch": 0.35166090485495266, + "flos": 11733895802880.0, + "grad_norm": 2.26898933247515, + "language_loss": 0.91494161, + "learning_rate": 3.009316958003178e-06, + "loss": 0.94041681, + "num_input_tokens_seen": 125827660, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.27746582, + "step": 5849, + "time_per_iteration": 4.188556671142578 + }, + { + "auxiliary_loss_clip": 0.01476453, + "auxiliary_loss_mlp": 0.01061684, + "balance_loss_clip": 1.29036713, + "balance_loss_mlp": 1.03668582, + "epoch": 0.3517210281076206, + "flos": 22648803897600.0, + "grad_norm": 1.9500409910933438, + "language_loss": 0.75355315, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.77893448, + "num_input_tokens_seen": 125846655, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.25012207, + "step": 5850, + "time_per_iteration": 2.8783211708068848 + }, + { + "auxiliary_loss_clip": 0.01474134, + "auxiliary_loss_mlp": 0.01055576, + "balance_loss_clip": 1.28929281, + "balance_loss_mlp": 1.0298028, + "epoch": 0.3517811513602886, + "flos": 21332194168320.0, + "grad_norm": 6.339216106173975, + "language_loss": 0.77091318, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.79621029, + "num_input_tokens_seen": 125866290, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.25793457, + "step": 5851, + "time_per_iteration": 2.8392467498779297 + }, + { + "auxiliary_loss_clip": 0.01491996, + "auxiliary_loss_mlp": 0.01055511, + "balance_loss_clip": 1.30592203, + "balance_loss_mlp": 1.02904677, + "epoch": 0.35184127461295656, + "flos": 21042771310080.0, + "grad_norm": 1.7769904412893267, + "language_loss": 0.87896705, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.90444207, + "num_input_tokens_seen": 125884620, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.26452637, + "step": 5852, + "time_per_iteration": 2.835381269454956 + }, + { + "auxiliary_loss_clip": 0.01481575, + "auxiliary_loss_mlp": 0.01050353, + "balance_loss_clip": 1.29583383, + "balance_loss_mlp": 1.02450824, + "epoch": 0.3519013978656245, + "flos": 22465380902400.0, + "grad_norm": 2.5184283918732833, + "language_loss": 0.6821894, + "learning_rate": 3.007971733162737e-06, + "loss": 0.70750868, + "num_input_tokens_seen": 125902430, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.25854492, + "step": 5853, + "time_per_iteration": 2.8290586471557617 + }, + { + "auxiliary_loss_clip": 0.01488291, + "auxiliary_loss_mlp": 0.01052472, + "balance_loss_clip": 1.30046427, + "balance_loss_mlp": 1.02591181, + "epoch": 0.3519615211182925, + "flos": 13123087470720.0, + "grad_norm": 1.8600908742861302, + "language_loss": 0.82190526, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.84731293, + "num_input_tokens_seen": 125920570, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.265625, + "step": 5854, + "time_per_iteration": 4.218945741653442 + }, + { + "auxiliary_loss_clip": 0.01468745, + "auxiliary_loss_mlp": 0.01050159, + "balance_loss_clip": 1.28625667, + "balance_loss_mlp": 1.02542305, + "epoch": 0.35202164437096045, + "flos": 19144420197120.0, + "grad_norm": 1.4757812721698564, + "language_loss": 0.74183887, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.76702791, + "num_input_tokens_seen": 125939800, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.24755859, + "step": 5855, + "time_per_iteration": 4.312297582626343 + }, + { + "auxiliary_loss_clip": 0.01479109, + "auxiliary_loss_mlp": 0.01049026, + "balance_loss_clip": 1.29403973, + "balance_loss_mlp": 1.02468324, + "epoch": 0.3520817676236284, + "flos": 26553496757760.0, + "grad_norm": 2.065417367006445, + "language_loss": 0.72163904, + "learning_rate": 3.006962413152691e-06, + "loss": 0.74692035, + "num_input_tokens_seen": 125958720, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.24353027, + "step": 5856, + "time_per_iteration": 2.8791329860687256 + }, + { + "auxiliary_loss_clip": 0.01504131, + "auxiliary_loss_mlp": 0.01051872, + "balance_loss_clip": 1.31385112, + "balance_loss_mlp": 1.02500224, + "epoch": 0.3521418908762964, + "flos": 44909270524800.0, + "grad_norm": 1.7172873592265194, + "language_loss": 0.62498695, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.65054697, + "num_input_tokens_seen": 125984310, + "router_z_loss_clip": 1.90136719, + "router_z_loss_mlp": 0.26855469, + "step": 5857, + "time_per_iteration": 3.0565097332000732 + }, + { + "auxiliary_loss_clip": 0.01487765, + "auxiliary_loss_mlp": 0.01047073, + "balance_loss_clip": 1.30164599, + "balance_loss_mlp": 1.02045405, + "epoch": 0.3522020141289644, + "flos": 20195433095040.0, + "grad_norm": 1.700694967385964, + "language_loss": 0.73884118, + "learning_rate": 3.006289342204152e-06, + "loss": 0.76418954, + "num_input_tokens_seen": 126002410, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.26647949, + "step": 5858, + "time_per_iteration": 4.250205755233765 + }, + { + "auxiliary_loss_clip": 0.01489972, + "auxiliary_loss_mlp": 0.01045369, + "balance_loss_clip": 1.30420291, + "balance_loss_mlp": 1.02044284, + "epoch": 0.35226213738163237, + "flos": 27575525721600.0, + "grad_norm": 1.61345897219296, + "language_loss": 0.77023566, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.79558909, + "num_input_tokens_seen": 126022490, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.24926758, + "step": 5859, + "time_per_iteration": 2.9078209400177 + }, + { + "auxiliary_loss_clip": 0.01519147, + "auxiliary_loss_mlp": 0.01052499, + "balance_loss_clip": 1.32589364, + "balance_loss_mlp": 1.02518821, + "epoch": 0.35232226063430033, + "flos": 22976621608320.0, + "grad_norm": 1.8097286990554327, + "language_loss": 0.72204751, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.74776393, + "num_input_tokens_seen": 126042895, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.27331543, + "step": 5860, + "time_per_iteration": 2.863739252090454 + }, + { + "auxiliary_loss_clip": 0.01517028, + "auxiliary_loss_mlp": 0.01051637, + "balance_loss_clip": 1.32302904, + "balance_loss_mlp": 1.02470732, + "epoch": 0.3523823838869683, + "flos": 19176707001600.0, + "grad_norm": 2.3405483741150124, + "language_loss": 0.67910755, + "learning_rate": 3.005279449623811e-06, + "loss": 0.70479417, + "num_input_tokens_seen": 126060130, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.26916504, + "step": 5861, + "time_per_iteration": 2.8162944316864014 + }, + { + "auxiliary_loss_clip": 0.01492347, + "auxiliary_loss_mlp": 0.01054637, + "balance_loss_clip": 1.30743694, + "balance_loss_mlp": 1.02897096, + "epoch": 0.35244250713963626, + "flos": 17939918378880.0, + "grad_norm": 1.8419124172669643, + "language_loss": 0.67037773, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.69584757, + "num_input_tokens_seen": 126077850, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.25695801, + "step": 5862, + "time_per_iteration": 2.8115737438201904 + }, + { + "auxiliary_loss_clip": 0.01498538, + "auxiliary_loss_mlp": 0.01054354, + "balance_loss_clip": 1.30960906, + "balance_loss_mlp": 1.02854538, + "epoch": 0.35250263039230423, + "flos": 21442085084160.0, + "grad_norm": 1.9489105762344714, + "language_loss": 0.78352082, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.80904973, + "num_input_tokens_seen": 126095985, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.25830078, + "step": 5863, + "time_per_iteration": 2.825699806213379 + }, + { + "auxiliary_loss_clip": 0.01493111, + "auxiliary_loss_mlp": 0.0104537, + "balance_loss_clip": 1.30479622, + "balance_loss_mlp": 1.02108693, + "epoch": 0.3525627536449722, + "flos": 27428144849280.0, + "grad_norm": 2.26425635605047, + "language_loss": 0.762312, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.78769684, + "num_input_tokens_seen": 126116070, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.24304199, + "step": 5864, + "time_per_iteration": 2.909738540649414 + }, + { + "auxiliary_loss_clip": 0.01490113, + "auxiliary_loss_mlp": 0.01058731, + "balance_loss_clip": 1.30311036, + "balance_loss_mlp": 1.03416228, + "epoch": 0.35262287689764016, + "flos": 24800400011520.0, + "grad_norm": 2.3225588354543243, + "language_loss": 0.80402017, + "learning_rate": 3.003932392558793e-06, + "loss": 0.8295086, + "num_input_tokens_seen": 126135205, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.24572754, + "step": 5865, + "time_per_iteration": 2.9233973026275635 + }, + { + "auxiliary_loss_clip": 0.01514893, + "auxiliary_loss_mlp": 0.01045269, + "balance_loss_clip": 1.32198393, + "balance_loss_mlp": 1.01950812, + "epoch": 0.3526830001503081, + "flos": 17830525155840.0, + "grad_norm": 2.0839971332904743, + "language_loss": 0.81974858, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.84535015, + "num_input_tokens_seen": 126151895, + "router_z_loss_clip": 1.92773438, + "router_z_loss_mlp": 0.25769043, + "step": 5866, + "time_per_iteration": 2.8572540283203125 + }, + { + "auxiliary_loss_clip": 0.01524369, + "auxiliary_loss_mlp": 0.01050689, + "balance_loss_clip": 1.32804823, + "balance_loss_mlp": 1.02433181, + "epoch": 0.3527431234029761, + "flos": 18087389740800.0, + "grad_norm": 3.1920081057223384, + "language_loss": 0.85226274, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.87801331, + "num_input_tokens_seen": 126168515, + "router_z_loss_clip": 1.96191406, + "router_z_loss_mlp": 0.26367188, + "step": 5867, + "time_per_iteration": 2.804725408554077 + }, + { + "auxiliary_loss_clip": 0.01509994, + "auxiliary_loss_mlp": 0.01059592, + "balance_loss_clip": 1.3196156, + "balance_loss_mlp": 1.0316968, + "epoch": 0.35280324665564405, + "flos": 19436783967360.0, + "grad_norm": 2.1584447880871735, + "language_loss": 0.74672031, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.77241617, + "num_input_tokens_seen": 126186460, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.27929688, + "step": 5868, + "time_per_iteration": 2.848109245300293 + }, + { + "auxiliary_loss_clip": 0.01518176, + "auxiliary_loss_mlp": 0.01064291, + "balance_loss_clip": 1.32666194, + "balance_loss_mlp": 1.03781426, + "epoch": 0.352863369908312, + "flos": 21513219189120.0, + "grad_norm": 1.5754360744581988, + "language_loss": 0.623514, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.64933872, + "num_input_tokens_seen": 126206170, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.26501465, + "step": 5869, + "time_per_iteration": 2.8394765853881836 + }, + { + "auxiliary_loss_clip": 0.01509613, + "auxiliary_loss_mlp": 0.01053167, + "balance_loss_clip": 1.31973147, + "balance_loss_mlp": 1.02789474, + "epoch": 0.35292349316098, + "flos": 22319628842880.0, + "grad_norm": 1.7835937249507423, + "language_loss": 0.75319719, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.77882493, + "num_input_tokens_seen": 126225605, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.25280762, + "step": 5870, + "time_per_iteration": 2.90159273147583 + }, + { + "auxiliary_loss_clip": 0.01500058, + "auxiliary_loss_mlp": 0.01054378, + "balance_loss_clip": 1.31302094, + "balance_loss_mlp": 1.02835476, + "epoch": 0.352983616413648, + "flos": 33122790984960.0, + "grad_norm": 1.3666863348004208, + "language_loss": 0.72364175, + "learning_rate": 3.001910665140316e-06, + "loss": 0.7491861, + "num_input_tokens_seen": 126250230, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.26037598, + "step": 5871, + "time_per_iteration": 2.9779415130615234 + }, + { + "auxiliary_loss_clip": 0.01479638, + "auxiliary_loss_mlp": 0.01050555, + "balance_loss_clip": 1.29764462, + "balance_loss_mlp": 1.02659416, + "epoch": 0.35304373966631597, + "flos": 18705580450560.0, + "grad_norm": 2.2035402299621305, + "language_loss": 0.74318624, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.76848817, + "num_input_tokens_seen": 126268315, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.23962402, + "step": 5872, + "time_per_iteration": 2.835054874420166 + }, + { + "auxiliary_loss_clip": 0.01504122, + "auxiliary_loss_mlp": 0.01055548, + "balance_loss_clip": 1.31952739, + "balance_loss_mlp": 1.03078771, + "epoch": 0.35310386291898394, + "flos": 23374668528000.0, + "grad_norm": 1.7587036871253636, + "language_loss": 0.83342397, + "learning_rate": 3.001236451924089e-06, + "loss": 0.85902071, + "num_input_tokens_seen": 126288390, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.24768066, + "step": 5873, + "time_per_iteration": 2.859938621520996 + }, + { + "auxiliary_loss_clip": 0.01522028, + "auxiliary_loss_mlp": 0.01056643, + "balance_loss_clip": 1.32966447, + "balance_loss_mlp": 1.03034532, + "epoch": 0.3531639861716519, + "flos": 24472582300800.0, + "grad_norm": 2.963680412476417, + "language_loss": 0.67558604, + "learning_rate": 3.000899288359104e-06, + "loss": 0.70137274, + "num_input_tokens_seen": 126305750, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.26318359, + "step": 5874, + "time_per_iteration": 2.8979969024658203 + }, + { + "auxiliary_loss_clip": 0.01283499, + "auxiliary_loss_mlp": 0.01107543, + "balance_loss_clip": 1.17316484, + "balance_loss_mlp": 1.07492769, + "epoch": 0.35322410942431987, + "flos": 70341298951680.0, + "grad_norm": 0.793463254094825, + "language_loss": 0.61495793, + "learning_rate": 3.000562086839917e-06, + "loss": 0.63886833, + "num_input_tokens_seen": 126362495, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.32617188, + "step": 5875, + "time_per_iteration": 3.24753999710083 + }, + { + "auxiliary_loss_clip": 0.01497028, + "auxiliary_loss_mlp": 0.01050032, + "balance_loss_clip": 1.31091356, + "balance_loss_mlp": 1.02512968, + "epoch": 0.35328423267698783, + "flos": 19828406125440.0, + "grad_norm": 1.7777694925622165, + "language_loss": 0.80684143, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.83231205, + "num_input_tokens_seen": 126378320, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.24890137, + "step": 5876, + "time_per_iteration": 2.849835157394409 + }, + { + "auxiliary_loss_clip": 0.01280479, + "auxiliary_loss_mlp": 0.01057711, + "balance_loss_clip": 1.1708988, + "balance_loss_mlp": 1.02414155, + "epoch": 0.3533443559296558, + "flos": 60852801012480.0, + "grad_norm": 0.6813134684430068, + "language_loss": 0.56816685, + "learning_rate": 2.999887569990088e-06, + "loss": 0.59154874, + "num_input_tokens_seen": 126442735, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.3359375, + "step": 5877, + "time_per_iteration": 3.3602020740509033 + }, + { + "auxiliary_loss_clip": 0.01505947, + "auxiliary_loss_mlp": 0.01046182, + "balance_loss_clip": 1.31864405, + "balance_loss_mlp": 1.02037358, + "epoch": 0.35340447918232376, + "flos": 24766982087040.0, + "grad_norm": 1.5498905983008981, + "language_loss": 0.72994941, + "learning_rate": 2.999550254685024e-06, + "loss": 0.75547069, + "num_input_tokens_seen": 126463090, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.25793457, + "step": 5878, + "time_per_iteration": 2.9558303356170654 + }, + { + "auxiliary_loss_clip": 0.01497737, + "auxiliary_loss_mlp": 0.01044718, + "balance_loss_clip": 1.31081486, + "balance_loss_mlp": 1.019243, + "epoch": 0.3534646024349917, + "flos": 21805673448960.0, + "grad_norm": 2.2759176422691447, + "language_loss": 0.79269099, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.81811559, + "num_input_tokens_seen": 126482105, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.25463867, + "step": 5879, + "time_per_iteration": 2.880070209503174 + }, + { + "auxiliary_loss_clip": 0.01528937, + "auxiliary_loss_mlp": 0.01048714, + "balance_loss_clip": 1.33589935, + "balance_loss_mlp": 1.02360892, + "epoch": 0.3535247256876597, + "flos": 20021963955840.0, + "grad_norm": 2.0691336969778384, + "language_loss": 0.64097035, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.66674691, + "num_input_tokens_seen": 126502125, + "router_z_loss_clip": 1.93164062, + "router_z_loss_mlp": 0.2512207, + "step": 5880, + "time_per_iteration": 2.918210506439209 + }, + { + "auxiliary_loss_clip": 0.01501981, + "auxiliary_loss_mlp": 0.0105583, + "balance_loss_clip": 1.31214261, + "balance_loss_mlp": 1.02977109, + "epoch": 0.35358484894032766, + "flos": 18197325901440.0, + "grad_norm": 2.4463970431712947, + "language_loss": 0.66168326, + "learning_rate": 2.998538081402727e-06, + "loss": 0.6872614, + "num_input_tokens_seen": 126521950, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.26086426, + "step": 5881, + "time_per_iteration": 2.9047420024871826 + }, + { + "auxiliary_loss_clip": 0.01483541, + "auxiliary_loss_mlp": 0.01055478, + "balance_loss_clip": 1.30229783, + "balance_loss_mlp": 1.02959752, + "epoch": 0.3536449721929956, + "flos": 22830643324800.0, + "grad_norm": 1.4363513776401182, + "language_loss": 0.76675379, + "learning_rate": 2.998200614562239e-06, + "loss": 0.79214406, + "num_input_tokens_seen": 126542445, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.25878906, + "step": 5882, + "time_per_iteration": 2.9823551177978516 + }, + { + "auxiliary_loss_clip": 0.01504032, + "auxiliary_loss_mlp": 0.01065144, + "balance_loss_clip": 1.3155576, + "balance_loss_mlp": 1.03777373, + "epoch": 0.3537050954456636, + "flos": 26443515352320.0, + "grad_norm": 2.0686311879512327, + "language_loss": 0.71367866, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.73937041, + "num_input_tokens_seen": 126560690, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.27404785, + "step": 5883, + "time_per_iteration": 2.927515983581543 + }, + { + "auxiliary_loss_clip": 0.01507916, + "auxiliary_loss_mlp": 0.01054144, + "balance_loss_clip": 1.31574082, + "balance_loss_mlp": 1.02958667, + "epoch": 0.3537652186983316, + "flos": 17204823809280.0, + "grad_norm": 1.770351041889775, + "language_loss": 0.79293251, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.81855309, + "num_input_tokens_seen": 126577620, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.2454834, + "step": 5884, + "time_per_iteration": 4.264991521835327 + }, + { + "auxiliary_loss_clip": 0.01490552, + "auxiliary_loss_mlp": 0.01056544, + "balance_loss_clip": 1.30623221, + "balance_loss_mlp": 1.03267801, + "epoch": 0.3538253419509996, + "flos": 19546222435200.0, + "grad_norm": 2.844431623998467, + "language_loss": 0.75971007, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.78518111, + "num_input_tokens_seen": 126596235, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.23864746, + "step": 5885, + "time_per_iteration": 2.8482789993286133 + }, + { + "auxiliary_loss_clip": 0.01498821, + "auxiliary_loss_mlp": 0.01049477, + "balance_loss_clip": 1.3098712, + "balance_loss_mlp": 1.02471745, + "epoch": 0.35388546520366754, + "flos": 12135698040960.0, + "grad_norm": 4.037867910935463, + "language_loss": 0.85051203, + "learning_rate": 2.996850368809606e-06, + "loss": 0.87599504, + "num_input_tokens_seen": 126612830, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.24780273, + "step": 5886, + "time_per_iteration": 2.843576669692993 + }, + { + "auxiliary_loss_clip": 0.01473085, + "auxiliary_loss_mlp": 0.01053318, + "balance_loss_clip": 1.28962338, + "balance_loss_mlp": 1.02705598, + "epoch": 0.3539455884563355, + "flos": 19686454629120.0, + "grad_norm": 2.13003910941335, + "language_loss": 0.79395592, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.81921995, + "num_input_tokens_seen": 126630910, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.26293945, + "step": 5887, + "time_per_iteration": 2.831603765487671 + }, + { + "auxiliary_loss_clip": 0.01483764, + "auxiliary_loss_mlp": 0.01051009, + "balance_loss_clip": 1.29829979, + "balance_loss_mlp": 1.02539146, + "epoch": 0.35400571170900347, + "flos": 18079743369600.0, + "grad_norm": 1.7324122832185167, + "language_loss": 0.66553766, + "learning_rate": 2.996175019078089e-06, + "loss": 0.69088537, + "num_input_tokens_seen": 126648365, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.25610352, + "step": 5888, + "time_per_iteration": 2.8590457439422607 + }, + { + "auxiliary_loss_clip": 0.01496206, + "auxiliary_loss_mlp": 0.01046937, + "balance_loss_clip": 1.31059289, + "balance_loss_mlp": 1.02243972, + "epoch": 0.35406583496167143, + "flos": 26079293560320.0, + "grad_norm": 1.7020038476034263, + "language_loss": 0.77402514, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.79945654, + "num_input_tokens_seen": 126667500, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.24487305, + "step": 5889, + "time_per_iteration": 4.297436714172363 + }, + { + "auxiliary_loss_clip": 0.01499354, + "auxiliary_loss_mlp": 0.0104746, + "balance_loss_clip": 1.31512165, + "balance_loss_mlp": 1.02310562, + "epoch": 0.3541259582143394, + "flos": 19802046389760.0, + "grad_norm": 1.9173843034413605, + "language_loss": 0.82130742, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.84677553, + "num_input_tokens_seen": 126686820, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.24365234, + "step": 5890, + "time_per_iteration": 4.271352291107178 + }, + { + "auxiliary_loss_clip": 0.01479438, + "auxiliary_loss_mlp": 0.01041242, + "balance_loss_clip": 1.29597616, + "balance_loss_mlp": 1.01743543, + "epoch": 0.35418608146700736, + "flos": 24032068496640.0, + "grad_norm": 1.6837926980078042, + "language_loss": 0.8031795, + "learning_rate": 2.99516171119991e-06, + "loss": 0.82838631, + "num_input_tokens_seen": 126706965, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.23803711, + "step": 5891, + "time_per_iteration": 2.8578197956085205 + }, + { + "auxiliary_loss_clip": 0.01486467, + "auxiliary_loss_mlp": 0.01049432, + "balance_loss_clip": 1.30254412, + "balance_loss_mlp": 1.02317011, + "epoch": 0.35424620471967533, + "flos": 12393422277120.0, + "grad_norm": 1.872007149987236, + "language_loss": 0.74067748, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.76603645, + "num_input_tokens_seen": 126724015, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.26269531, + "step": 5892, + "time_per_iteration": 2.869631290435791 + }, + { + "auxiliary_loss_clip": 0.01489939, + "auxiliary_loss_mlp": 0.01042338, + "balance_loss_clip": 1.30521214, + "balance_loss_mlp": 1.01761436, + "epoch": 0.3543063279723433, + "flos": 19680753784320.0, + "grad_norm": 2.0776294752082127, + "language_loss": 0.67133772, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.69666052, + "num_input_tokens_seen": 126737565, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.24731445, + "step": 5893, + "time_per_iteration": 4.252424478530884 + }, + { + "auxiliary_loss_clip": 0.01488806, + "auxiliary_loss_mlp": 0.01046957, + "balance_loss_clip": 1.30468106, + "balance_loss_mlp": 1.02186394, + "epoch": 0.35436645122501126, + "flos": 21919500662400.0, + "grad_norm": 9.169872654723694, + "language_loss": 0.7042048, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.72956252, + "num_input_tokens_seen": 126756095, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.25085449, + "step": 5894, + "time_per_iteration": 2.8282227516174316 + }, + { + "auxiliary_loss_clip": 0.0149021, + "auxiliary_loss_mlp": 0.01042554, + "balance_loss_clip": 1.30797732, + "balance_loss_mlp": 1.01924872, + "epoch": 0.3544265744776792, + "flos": 21727888358400.0, + "grad_norm": 4.437390629711494, + "language_loss": 0.74903846, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.77436614, + "num_input_tokens_seen": 126775455, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.23303223, + "step": 5895, + "time_per_iteration": 2.868025541305542 + }, + { + "auxiliary_loss_clip": 0.01488294, + "auxiliary_loss_mlp": 0.01046054, + "balance_loss_clip": 1.30522513, + "balance_loss_mlp": 1.02191448, + "epoch": 0.3544866977303472, + "flos": 21222619966080.0, + "grad_norm": 1.721417544074894, + "language_loss": 0.84947592, + "learning_rate": 2.993472110174491e-06, + "loss": 0.8748194, + "num_input_tokens_seen": 126792320, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.24121094, + "step": 5896, + "time_per_iteration": 2.841447591781616 + }, + { + "auxiliary_loss_clip": 0.01487409, + "auxiliary_loss_mlp": 0.0105497, + "balance_loss_clip": 1.30417311, + "balance_loss_mlp": 1.02925706, + "epoch": 0.35454682098301515, + "flos": 29322469175040.0, + "grad_norm": 1.6643706111055712, + "language_loss": 0.71523643, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.74066019, + "num_input_tokens_seen": 126813680, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.25708008, + "step": 5897, + "time_per_iteration": 2.9384734630584717 + }, + { + "auxiliary_loss_clip": 0.01482238, + "auxiliary_loss_mlp": 0.01050259, + "balance_loss_clip": 1.29811811, + "balance_loss_mlp": 1.02560699, + "epoch": 0.3546069442356832, + "flos": 24327327934080.0, + "grad_norm": 2.284167427759106, + "language_loss": 0.81890249, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.84422743, + "num_input_tokens_seen": 126834395, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.24658203, + "step": 5898, + "time_per_iteration": 2.8800716400146484 + }, + { + "auxiliary_loss_clip": 0.01472294, + "auxiliary_loss_mlp": 0.01050779, + "balance_loss_clip": 1.29169273, + "balance_loss_mlp": 1.02715182, + "epoch": 0.35466706748835114, + "flos": 22867680833280.0, + "grad_norm": 3.1403215947964114, + "language_loss": 0.74938047, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.77461118, + "num_input_tokens_seen": 126855145, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.2364502, + "step": 5899, + "time_per_iteration": 2.8744659423828125 + }, + { + "auxiliary_loss_clip": 0.01491676, + "auxiliary_loss_mlp": 0.01040056, + "balance_loss_clip": 1.3045609, + "balance_loss_mlp": 1.01610637, + "epoch": 0.3547271907410191, + "flos": 28341911710080.0, + "grad_norm": 1.6556085375803948, + "language_loss": 0.80288851, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.82820582, + "num_input_tokens_seen": 126873790, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.23962402, + "step": 5900, + "time_per_iteration": 2.9583418369293213 + }, + { + "auxiliary_loss_clip": 0.01487031, + "auxiliary_loss_mlp": 0.0104772, + "balance_loss_clip": 1.30242682, + "balance_loss_mlp": 1.02271008, + "epoch": 0.35478731399368707, + "flos": 23524673598720.0, + "grad_norm": 1.814098865853592, + "language_loss": 0.82971072, + "learning_rate": 2.991781567335093e-06, + "loss": 0.85505825, + "num_input_tokens_seen": 126892865, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.25024414, + "step": 5901, + "time_per_iteration": 2.8718202114105225 + }, + { + "auxiliary_loss_clip": 0.01495756, + "auxiliary_loss_mlp": 0.0104149, + "balance_loss_clip": 1.30826449, + "balance_loss_mlp": 1.01737368, + "epoch": 0.35484743724635504, + "flos": 18633450960000.0, + "grad_norm": 1.7676723276022392, + "language_loss": 0.76795542, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.79332793, + "num_input_tokens_seen": 126911935, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.24108887, + "step": 5902, + "time_per_iteration": 2.8588809967041016 + }, + { + "auxiliary_loss_clip": 0.01489991, + "auxiliary_loss_mlp": 0.01044549, + "balance_loss_clip": 1.30486929, + "balance_loss_mlp": 1.0220542, + "epoch": 0.354907560499023, + "flos": 17393495201280.0, + "grad_norm": 1.8421894349485655, + "language_loss": 0.7232731, + "learning_rate": 2.991105086850381e-06, + "loss": 0.74861848, + "num_input_tokens_seen": 126930040, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.22473145, + "step": 5903, + "time_per_iteration": 2.883305072784424 + }, + { + "auxiliary_loss_clip": 0.01497753, + "auxiliary_loss_mlp": 0.01034766, + "balance_loss_clip": 1.30822682, + "balance_loss_mlp": 1.01104307, + "epoch": 0.35496768375169097, + "flos": 19217952276480.0, + "grad_norm": 2.3379969520385884, + "language_loss": 0.74530089, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.77062607, + "num_input_tokens_seen": 126948390, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.23706055, + "step": 5904, + "time_per_iteration": 2.8320400714874268 + }, + { + "auxiliary_loss_clip": 0.0150183, + "auxiliary_loss_mlp": 0.01043945, + "balance_loss_clip": 1.31425881, + "balance_loss_mlp": 1.0212239, + "epoch": 0.35502780700435893, + "flos": 18341946840960.0, + "grad_norm": 2.7787966889918296, + "language_loss": 0.79144704, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.81690478, + "num_input_tokens_seen": 126964905, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.22717285, + "step": 5905, + "time_per_iteration": 2.9491379261016846 + }, + { + "auxiliary_loss_clip": 0.01460841, + "auxiliary_loss_mlp": 0.0103783, + "balance_loss_clip": 1.2855742, + "balance_loss_mlp": 1.01534748, + "epoch": 0.3550879302570269, + "flos": 15456477767040.0, + "grad_norm": 1.9159593793637681, + "language_loss": 0.73243093, + "learning_rate": 2.990090084284356e-06, + "loss": 0.75741756, + "num_input_tokens_seen": 126982000, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.22497559, + "step": 5906, + "time_per_iteration": 2.8441784381866455 + }, + { + "auxiliary_loss_clip": 0.01495123, + "auxiliary_loss_mlp": 0.01034604, + "balance_loss_clip": 1.30722153, + "balance_loss_mlp": 1.01202607, + "epoch": 0.35514805350969486, + "flos": 21988734485760.0, + "grad_norm": 1.9711710997907057, + "language_loss": 0.76146978, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.78676701, + "num_input_tokens_seen": 126998390, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.22570801, + "step": 5907, + "time_per_iteration": 2.856736660003662 + }, + { + "auxiliary_loss_clip": 0.01491189, + "auxiliary_loss_mlp": 0.01042063, + "balance_loss_clip": 1.30665731, + "balance_loss_mlp": 1.01836443, + "epoch": 0.3552081767623628, + "flos": 29874005015040.0, + "grad_norm": 2.1991219501888657, + "language_loss": 0.76448536, + "learning_rate": 2.989413228164047e-06, + "loss": 0.78981787, + "num_input_tokens_seen": 127020220, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.23681641, + "step": 5908, + "time_per_iteration": 2.90158748626709 + }, + { + "auxiliary_loss_clip": 0.01484715, + "auxiliary_loss_mlp": 0.01035698, + "balance_loss_clip": 1.30006981, + "balance_loss_mlp": 1.01387048, + "epoch": 0.3552683000150308, + "flos": 26443424862720.0, + "grad_norm": 2.051483521906333, + "language_loss": 0.69092464, + "learning_rate": 2.989074743819502e-06, + "loss": 0.71612883, + "num_input_tokens_seen": 127038585, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.21838379, + "step": 5909, + "time_per_iteration": 2.878732204437256 + }, + { + "auxiliary_loss_clip": 0.01471291, + "auxiliary_loss_mlp": 0.01040004, + "balance_loss_clip": 1.29473257, + "balance_loss_mlp": 1.0172112, + "epoch": 0.35532842326769876, + "flos": 19793902325760.0, + "grad_norm": 2.6026918679261013, + "language_loss": 0.79087538, + "learning_rate": 2.988736221969144e-06, + "loss": 0.8159883, + "num_input_tokens_seen": 127056215, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.22790527, + "step": 5910, + "time_per_iteration": 2.8257815837860107 + }, + { + "auxiliary_loss_clip": 0.01500003, + "auxiliary_loss_mlp": 0.0103719, + "balance_loss_clip": 1.31033671, + "balance_loss_mlp": 1.01290739, + "epoch": 0.3553885465203668, + "flos": 17248421813760.0, + "grad_norm": 2.1628292399089024, + "language_loss": 0.71833539, + "learning_rate": 2.98839766262581e-06, + "loss": 0.7437073, + "num_input_tokens_seen": 127075825, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.24316406, + "step": 5911, + "time_per_iteration": 2.8181076049804688 + }, + { + "auxiliary_loss_clip": 0.01473868, + "auxiliary_loss_mlp": 0.01039624, + "balance_loss_clip": 1.29352283, + "balance_loss_mlp": 1.01708162, + "epoch": 0.35544866977303474, + "flos": 14941255518720.0, + "grad_norm": 2.277521934796158, + "language_loss": 0.87831986, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.90345472, + "num_input_tokens_seen": 127091205, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.22546387, + "step": 5912, + "time_per_iteration": 2.8057937622070312 + }, + { + "auxiliary_loss_clip": 0.01490641, + "auxiliary_loss_mlp": 0.01040251, + "balance_loss_clip": 1.30704141, + "balance_loss_mlp": 1.01714826, + "epoch": 0.3555087930257027, + "flos": 19765642308480.0, + "grad_norm": 2.1198469082448823, + "language_loss": 0.78188682, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.80719578, + "num_input_tokens_seen": 127109210, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.2310791, + "step": 5913, + "time_per_iteration": 2.921937942504883 + }, + { + "auxiliary_loss_clip": 0.01479531, + "auxiliary_loss_mlp": 0.01041288, + "balance_loss_clip": 1.29816782, + "balance_loss_mlp": 1.01762486, + "epoch": 0.3555689162783707, + "flos": 21077863292160.0, + "grad_norm": 1.628324258800963, + "language_loss": 0.83311939, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.85832763, + "num_input_tokens_seen": 127128400, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.23681641, + "step": 5914, + "time_per_iteration": 2.872138738632202 + }, + { + "auxiliary_loss_clip": 0.01504103, + "auxiliary_loss_mlp": 0.01042565, + "balance_loss_clip": 1.31717002, + "balance_loss_mlp": 1.01914036, + "epoch": 0.35562903953103864, + "flos": 33081726689280.0, + "grad_norm": 2.3834601736837246, + "language_loss": 0.70938003, + "learning_rate": 2.98704305057949e-06, + "loss": 0.73484671, + "num_input_tokens_seen": 127149965, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.23425293, + "step": 5915, + "time_per_iteration": 2.9198110103607178 + }, + { + "auxiliary_loss_clip": 0.01480528, + "auxiliary_loss_mlp": 0.01038757, + "balance_loss_clip": 1.29671001, + "balance_loss_mlp": 1.01586938, + "epoch": 0.3556891627837066, + "flos": 20567436992640.0, + "grad_norm": 1.6802350603370486, + "language_loss": 0.76923537, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.79442823, + "num_input_tokens_seen": 127169865, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.22888184, + "step": 5916, + "time_per_iteration": 2.8342831134796143 + }, + { + "auxiliary_loss_clip": 0.01488882, + "auxiliary_loss_mlp": 0.01037689, + "balance_loss_clip": 1.30312181, + "balance_loss_mlp": 1.01605213, + "epoch": 0.35574928603637457, + "flos": 20712827093760.0, + "grad_norm": 4.928583385049272, + "language_loss": 0.88884342, + "learning_rate": 2.986365519932332e-06, + "loss": 0.91410911, + "num_input_tokens_seen": 127188075, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.21655273, + "step": 5917, + "time_per_iteration": 2.8293392658233643 + }, + { + "auxiliary_loss_clip": 0.01485884, + "auxiliary_loss_mlp": 0.01035799, + "balance_loss_clip": 1.30149853, + "balance_loss_mlp": 1.01399553, + "epoch": 0.35580940928904253, + "flos": 15202825562880.0, + "grad_norm": 2.5556823901701287, + "language_loss": 0.76184261, + "learning_rate": 2.98602669849771e-06, + "loss": 0.78705949, + "num_input_tokens_seen": 127206065, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.21801758, + "step": 5918, + "time_per_iteration": 4.227437257766724 + }, + { + "auxiliary_loss_clip": 0.01261132, + "auxiliary_loss_mlp": 0.01027029, + "balance_loss_clip": 1.152179, + "balance_loss_mlp": 1.00204301, + "epoch": 0.3558695325417105, + "flos": 58665524734080.0, + "grad_norm": 0.9167161447229929, + "language_loss": 0.63940626, + "learning_rate": 2.985687839672857e-06, + "loss": 0.66228789, + "num_input_tokens_seen": 127257885, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.25, + "step": 5919, + "time_per_iteration": 3.1191210746765137 + }, + { + "auxiliary_loss_clip": 0.01502784, + "auxiliary_loss_mlp": 0.01041148, + "balance_loss_clip": 1.31262481, + "balance_loss_mlp": 1.01961899, + "epoch": 0.35592965579437846, + "flos": 22028441437440.0, + "grad_norm": 3.346441450886939, + "language_loss": 0.74437094, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.7698102, + "num_input_tokens_seen": 127275550, + "router_z_loss_clip": 1.90136719, + "router_z_loss_mlp": 0.21520996, + "step": 5920, + "time_per_iteration": 2.8364038467407227 + }, + { + "auxiliary_loss_clip": 0.01500167, + "auxiliary_loss_mlp": 0.01040673, + "balance_loss_clip": 1.31489301, + "balance_loss_mlp": 1.01791608, + "epoch": 0.35598977904704643, + "flos": 23378107132800.0, + "grad_norm": 1.913080206465589, + "language_loss": 0.78585702, + "learning_rate": 2.985010009903857e-06, + "loss": 0.81126541, + "num_input_tokens_seen": 127295110, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.22766113, + "step": 5921, + "time_per_iteration": 2.9208571910858154 + }, + { + "auxiliary_loss_clip": 0.01496714, + "auxiliary_loss_mlp": 0.01043424, + "balance_loss_clip": 1.30879152, + "balance_loss_mlp": 1.02132285, + "epoch": 0.3560499022997144, + "flos": 17794302053760.0, + "grad_norm": 2.0833746316153636, + "language_loss": 0.68443751, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.70983887, + "num_input_tokens_seen": 127312865, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.22106934, + "step": 5922, + "time_per_iteration": 2.834113121032715 + }, + { + "auxiliary_loss_clip": 0.01494551, + "auxiliary_loss_mlp": 0.01038639, + "balance_loss_clip": 1.30949008, + "balance_loss_mlp": 1.01670408, + "epoch": 0.35611002555238236, + "flos": 20750136071040.0, + "grad_norm": 1.9121156687299066, + "language_loss": 0.80188262, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.8272146, + "num_input_tokens_seen": 127331710, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.21923828, + "step": 5923, + "time_per_iteration": 2.836017370223999 + }, + { + "auxiliary_loss_clip": 0.01491713, + "auxiliary_loss_mlp": 0.01041844, + "balance_loss_clip": 1.30623889, + "balance_loss_mlp": 1.01933718, + "epoch": 0.3561701488050504, + "flos": 19471242522240.0, + "grad_norm": 4.451716274599384, + "language_loss": 0.85741365, + "learning_rate": 2.983992985144908e-06, + "loss": 0.8827492, + "num_input_tokens_seen": 127350950, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.22509766, + "step": 5924, + "time_per_iteration": 4.217506408691406 + }, + { + "auxiliary_loss_clip": 0.0149385, + "auxiliary_loss_mlp": 0.01042157, + "balance_loss_clip": 1.30822492, + "balance_loss_mlp": 1.02032971, + "epoch": 0.35623027205771834, + "flos": 30787455162240.0, + "grad_norm": 3.9600483476531676, + "language_loss": 0.77972496, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.805085, + "num_input_tokens_seen": 127369385, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.21826172, + "step": 5925, + "time_per_iteration": 4.381460189819336 + }, + { + "auxiliary_loss_clip": 0.0148603, + "auxiliary_loss_mlp": 0.01041264, + "balance_loss_clip": 1.30037475, + "balance_loss_mlp": 1.01882887, + "epoch": 0.3562903953103863, + "flos": 16989430723200.0, + "grad_norm": 2.0368867902616046, + "language_loss": 0.77653539, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.80180836, + "num_input_tokens_seen": 127386965, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.22424316, + "step": 5926, + "time_per_iteration": 2.798799991607666 + }, + { + "auxiliary_loss_clip": 0.01505815, + "auxiliary_loss_mlp": 0.01042633, + "balance_loss_clip": 1.31581306, + "balance_loss_mlp": 1.01993537, + "epoch": 0.3563505185630543, + "flos": 23849821866240.0, + "grad_norm": 2.042205538791556, + "language_loss": 0.70062435, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.72610885, + "num_input_tokens_seen": 127406075, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.22692871, + "step": 5927, + "time_per_iteration": 2.8645682334899902 + }, + { + "auxiliary_loss_clip": 0.01487209, + "auxiliary_loss_mlp": 0.01049667, + "balance_loss_clip": 1.30420089, + "balance_loss_mlp": 1.02806592, + "epoch": 0.35641064181572224, + "flos": 22283677209600.0, + "grad_norm": 2.3450775708416076, + "language_loss": 0.80593276, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.83130145, + "num_input_tokens_seen": 127425350, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.21618652, + "step": 5928, + "time_per_iteration": 4.197360277175903 + }, + { + "auxiliary_loss_clip": 0.01480179, + "auxiliary_loss_mlp": 0.01045547, + "balance_loss_clip": 1.29539418, + "balance_loss_mlp": 1.02314758, + "epoch": 0.3564707650683902, + "flos": 23011170652800.0, + "grad_norm": 1.3791347546277852, + "language_loss": 0.82401001, + "learning_rate": 2.982297197789215e-06, + "loss": 0.84926724, + "num_input_tokens_seen": 127446335, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.22399902, + "step": 5929, + "time_per_iteration": 2.8482091426849365 + }, + { + "auxiliary_loss_clip": 0.01467279, + "auxiliary_loss_mlp": 0.01040045, + "balance_loss_clip": 1.28688848, + "balance_loss_mlp": 1.01805067, + "epoch": 0.35653088832105817, + "flos": 14692489752960.0, + "grad_norm": 2.4558171581397055, + "language_loss": 0.71307117, + "learning_rate": 2.981957928520201e-06, + "loss": 0.73814446, + "num_input_tokens_seen": 127462795, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.2199707, + "step": 5930, + "time_per_iteration": 2.8549370765686035 + }, + { + "auxiliary_loss_clip": 0.01489352, + "auxiliary_loss_mlp": 0.01043873, + "balance_loss_clip": 1.30110312, + "balance_loss_mlp": 1.02168798, + "epoch": 0.35659101157372614, + "flos": 23487500355840.0, + "grad_norm": 2.3432339328826925, + "language_loss": 0.69077611, + "learning_rate": 2.981618622015244e-06, + "loss": 0.71610844, + "num_input_tokens_seen": 127482675, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.22192383, + "step": 5931, + "time_per_iteration": 2.849738121032715 + }, + { + "auxiliary_loss_clip": 0.01470148, + "auxiliary_loss_mlp": 0.01048192, + "balance_loss_clip": 1.28859377, + "balance_loss_mlp": 1.02507782, + "epoch": 0.3566511348263941, + "flos": 26589991328640.0, + "grad_norm": 1.6084686599236901, + "language_loss": 0.68441129, + "learning_rate": 2.981279278287211e-06, + "loss": 0.70959473, + "num_input_tokens_seen": 127502275, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.23132324, + "step": 5932, + "time_per_iteration": 2.902076482772827 + }, + { + "auxiliary_loss_clip": 0.01475153, + "auxiliary_loss_mlp": 0.0104095, + "balance_loss_clip": 1.2937088, + "balance_loss_mlp": 1.01940882, + "epoch": 0.35671125807906207, + "flos": 13122725512320.0, + "grad_norm": 2.5205406756795, + "language_loss": 0.80427086, + "learning_rate": 2.980939897348969e-06, + "loss": 0.82943189, + "num_input_tokens_seen": 127520195, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.21533203, + "step": 5933, + "time_per_iteration": 2.853135824203491 + }, + { + "auxiliary_loss_clip": 0.0148536, + "auxiliary_loss_mlp": 0.01046872, + "balance_loss_clip": 1.29931355, + "balance_loss_mlp": 1.02304244, + "epoch": 0.35677138133173003, + "flos": 33013669230720.0, + "grad_norm": 1.659074760625499, + "language_loss": 0.70133281, + "learning_rate": 2.980600479213388e-06, + "loss": 0.72665519, + "num_input_tokens_seen": 127544495, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.23815918, + "step": 5934, + "time_per_iteration": 2.976716995239258 + }, + { + "auxiliary_loss_clip": 0.01504508, + "auxiliary_loss_mlp": 0.01041441, + "balance_loss_clip": 1.31167364, + "balance_loss_mlp": 1.01796877, + "epoch": 0.356831504584398, + "flos": 20787761761920.0, + "grad_norm": 1.8157790590120326, + "language_loss": 0.72107339, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.74653292, + "num_input_tokens_seen": 127563810, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.23474121, + "step": 5935, + "time_per_iteration": 2.9108774662017822 + }, + { + "auxiliary_loss_clip": 0.01470488, + "auxiliary_loss_mlp": 0.01038054, + "balance_loss_clip": 1.28655457, + "balance_loss_mlp": 1.0156188, + "epoch": 0.35689162783706596, + "flos": 12172826039040.0, + "grad_norm": 2.032077837321027, + "language_loss": 0.78488827, + "learning_rate": 2.979921531401692e-06, + "loss": 0.80997366, + "num_input_tokens_seen": 127579065, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.22460938, + "step": 5936, + "time_per_iteration": 2.816220998764038 + }, + { + "auxiliary_loss_clip": 0.014721, + "auxiliary_loss_mlp": 0.01040693, + "balance_loss_clip": 1.29011512, + "balance_loss_mlp": 1.01846027, + "epoch": 0.356951751089734, + "flos": 23851903127040.0, + "grad_norm": 1.4297885648484168, + "language_loss": 0.65164179, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.67676973, + "num_input_tokens_seen": 127599105, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.22216797, + "step": 5937, + "time_per_iteration": 2.8711416721343994 + }, + { + "auxiliary_loss_clip": 0.01483739, + "auxiliary_loss_mlp": 0.01037057, + "balance_loss_clip": 1.29760134, + "balance_loss_mlp": 1.01433527, + "epoch": 0.35701187434240195, + "flos": 11727878244480.0, + "grad_norm": 2.2070968330277503, + "language_loss": 0.79530442, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.82051241, + "num_input_tokens_seen": 127614940, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.22729492, + "step": 5938, + "time_per_iteration": 2.8140807151794434 + }, + { + "auxiliary_loss_clip": 0.01477665, + "auxiliary_loss_mlp": 0.01042353, + "balance_loss_clip": 1.29407287, + "balance_loss_mlp": 1.02002501, + "epoch": 0.3570719975950699, + "flos": 24909386031360.0, + "grad_norm": 1.5429947458018223, + "language_loss": 0.81119919, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.83639932, + "num_input_tokens_seen": 127634960, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.2232666, + "step": 5939, + "time_per_iteration": 2.888073444366455 + }, + { + "auxiliary_loss_clip": 0.01505286, + "auxiliary_loss_mlp": 0.01038023, + "balance_loss_clip": 1.31143785, + "balance_loss_mlp": 1.01552784, + "epoch": 0.3571321208477379, + "flos": 26005987704960.0, + "grad_norm": 1.7576658151324345, + "language_loss": 0.80104977, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.82648289, + "num_input_tokens_seen": 127654545, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.22485352, + "step": 5940, + "time_per_iteration": 2.878488779067993 + }, + { + "auxiliary_loss_clip": 0.01478578, + "auxiliary_loss_mlp": 0.01038628, + "balance_loss_clip": 1.29047501, + "balance_loss_mlp": 1.01629996, + "epoch": 0.35719224410040584, + "flos": 14509790674560.0, + "grad_norm": 2.30464604201849, + "language_loss": 0.73329687, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.75846893, + "num_input_tokens_seen": 127672320, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.22314453, + "step": 5941, + "time_per_iteration": 2.835662603378296 + }, + { + "auxiliary_loss_clip": 0.01493737, + "auxiliary_loss_mlp": 0.01041081, + "balance_loss_clip": 1.30876851, + "balance_loss_mlp": 1.01872909, + "epoch": 0.3572523673530738, + "flos": 31187854811520.0, + "grad_norm": 2.891491454387655, + "language_loss": 0.65288311, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.6782313, + "num_input_tokens_seen": 127693315, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.22338867, + "step": 5942, + "time_per_iteration": 2.934222936630249 + }, + { + "auxiliary_loss_clip": 0.01480825, + "auxiliary_loss_mlp": 0.0104255, + "balance_loss_clip": 1.2948699, + "balance_loss_mlp": 1.01897025, + "epoch": 0.3573124906057418, + "flos": 15860542245120.0, + "grad_norm": 1.77306445642961, + "language_loss": 0.74614733, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.77138108, + "num_input_tokens_seen": 127711570, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.23571777, + "step": 5943, + "time_per_iteration": 2.8524742126464844 + }, + { + "auxiliary_loss_clip": 0.01283448, + "auxiliary_loss_mlp": 0.01053642, + "balance_loss_clip": 1.1767211, + "balance_loss_mlp": 1.03409219, + "epoch": 0.35737261385840974, + "flos": 60848231287680.0, + "grad_norm": 0.8156437494343916, + "language_loss": 0.60808945, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.63146043, + "num_input_tokens_seen": 127772475, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.1953125, + "step": 5944, + "time_per_iteration": 3.4341330528259277 + }, + { + "auxiliary_loss_clip": 0.01475728, + "auxiliary_loss_mlp": 0.01045444, + "balance_loss_clip": 1.29140449, + "balance_loss_mlp": 1.0224489, + "epoch": 0.3574327371110777, + "flos": 18853097057280.0, + "grad_norm": 1.6608294610844703, + "language_loss": 0.73252904, + "learning_rate": 2.976864428379655e-06, + "loss": 0.75774074, + "num_input_tokens_seen": 127790940, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.22998047, + "step": 5945, + "time_per_iteration": 2.8418240547180176 + }, + { + "auxiliary_loss_clip": 0.01469253, + "auxiliary_loss_mlp": 0.01042381, + "balance_loss_clip": 1.28532958, + "balance_loss_mlp": 1.01845574, + "epoch": 0.35749286036374567, + "flos": 23560037049600.0, + "grad_norm": 2.206165827475419, + "language_loss": 0.81428039, + "learning_rate": 2.976524564880326e-06, + "loss": 0.83939672, + "num_input_tokens_seen": 127808275, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.23937988, + "step": 5946, + "time_per_iteration": 2.8385822772979736 + }, + { + "auxiliary_loss_clip": 0.01504571, + "auxiliary_loss_mlp": 0.01039617, + "balance_loss_clip": 1.31681025, + "balance_loss_mlp": 1.01764655, + "epoch": 0.35755298361641363, + "flos": 21115308003840.0, + "grad_norm": 1.3715599140902572, + "language_loss": 0.69977987, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.72522175, + "num_input_tokens_seen": 127828840, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.21960449, + "step": 5947, + "time_per_iteration": 2.902299404144287 + }, + { + "auxiliary_loss_clip": 0.01466021, + "auxiliary_loss_mlp": 0.0104447, + "balance_loss_clip": 1.28617311, + "balance_loss_mlp": 1.02246356, + "epoch": 0.3576131068690816, + "flos": 19254446847360.0, + "grad_norm": 2.1595837366012147, + "language_loss": 0.76357025, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.78867513, + "num_input_tokens_seen": 127846240, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.2199707, + "step": 5948, + "time_per_iteration": 2.8406879901885986 + }, + { + "auxiliary_loss_clip": 0.01486047, + "auxiliary_loss_mlp": 0.01044466, + "balance_loss_clip": 1.30057752, + "balance_loss_mlp": 1.02231646, + "epoch": 0.35767323012174956, + "flos": 28665612144000.0, + "grad_norm": 1.7538443878487264, + "language_loss": 0.71686298, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.74216807, + "num_input_tokens_seen": 127866880, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.22143555, + "step": 5949, + "time_per_iteration": 2.9003255367279053 + }, + { + "auxiliary_loss_clip": 0.01485913, + "auxiliary_loss_mlp": 0.01044079, + "balance_loss_clip": 1.30041456, + "balance_loss_mlp": 1.02337205, + "epoch": 0.35773335337441753, + "flos": 17092399184640.0, + "grad_norm": 1.9660650844725822, + "language_loss": 0.78762442, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.81292427, + "num_input_tokens_seen": 127883560, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.20715332, + "step": 5950, + "time_per_iteration": 2.8292899131774902 + }, + { + "auxiliary_loss_clip": 0.01493292, + "auxiliary_loss_mlp": 0.01043798, + "balance_loss_clip": 1.30627346, + "balance_loss_mlp": 1.02175641, + "epoch": 0.35779347662708555, + "flos": 15897217795200.0, + "grad_norm": 1.8773067881355587, + "language_loss": 0.7350589, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.7604298, + "num_input_tokens_seen": 127902330, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.22045898, + "step": 5951, + "time_per_iteration": 2.880941390991211 + }, + { + "auxiliary_loss_clip": 0.01510751, + "auxiliary_loss_mlp": 0.01045402, + "balance_loss_clip": 1.31962538, + "balance_loss_mlp": 1.02339649, + "epoch": 0.3578535998797535, + "flos": 28670815296000.0, + "grad_norm": 2.1354967458706433, + "language_loss": 0.70975113, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.7353127, + "num_input_tokens_seen": 127922325, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.21984863, + "step": 5952, + "time_per_iteration": 2.9058597087860107 + }, + { + "auxiliary_loss_clip": 0.01484844, + "auxiliary_loss_mlp": 0.01047392, + "balance_loss_clip": 1.30174279, + "balance_loss_mlp": 1.02537429, + "epoch": 0.3579137231324215, + "flos": 37866587506560.0, + "grad_norm": 1.7194579061305273, + "language_loss": 0.70728195, + "learning_rate": 2.974144484269449e-06, + "loss": 0.73260432, + "num_input_tokens_seen": 127942635, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.22033691, + "step": 5953, + "time_per_iteration": 4.3763508796691895 + }, + { + "auxiliary_loss_clip": 0.01489007, + "auxiliary_loss_mlp": 0.01044437, + "balance_loss_clip": 1.30340576, + "balance_loss_mlp": 1.02241945, + "epoch": 0.35797384638508944, + "flos": 22357209288960.0, + "grad_norm": 1.5181384774718423, + "language_loss": 0.67377681, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.69911122, + "num_input_tokens_seen": 127962520, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.22009277, + "step": 5954, + "time_per_iteration": 2.866511106491089 + }, + { + "auxiliary_loss_clip": 0.01491888, + "auxiliary_loss_mlp": 0.01048703, + "balance_loss_clip": 1.31073666, + "balance_loss_mlp": 1.02661395, + "epoch": 0.3580339696377574, + "flos": 13597652626560.0, + "grad_norm": 2.2414895295217048, + "language_loss": 0.76282048, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.78822643, + "num_input_tokens_seen": 127981180, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.2208252, + "step": 5955, + "time_per_iteration": 2.860464096069336 + }, + { + "auxiliary_loss_clip": 0.01482986, + "auxiliary_loss_mlp": 0.01047739, + "balance_loss_clip": 1.30202663, + "balance_loss_mlp": 1.02614975, + "epoch": 0.3580940928904254, + "flos": 23778552026880.0, + "grad_norm": 2.584515097728016, + "language_loss": 0.76368213, + "learning_rate": 2.973123895369182e-06, + "loss": 0.78898931, + "num_input_tokens_seen": 127999725, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.21594238, + "step": 5956, + "time_per_iteration": 2.9444873332977295 + }, + { + "auxiliary_loss_clip": 0.01479622, + "auxiliary_loss_mlp": 0.01047208, + "balance_loss_clip": 1.30131876, + "balance_loss_mlp": 1.02574992, + "epoch": 0.35815421614309334, + "flos": 19473323783040.0, + "grad_norm": 1.889453265257825, + "language_loss": 0.74332869, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.76859701, + "num_input_tokens_seen": 128018885, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.21472168, + "step": 5957, + "time_per_iteration": 2.8872735500335693 + }, + { + "auxiliary_loss_clip": 0.01496449, + "auxiliary_loss_mlp": 0.0105164, + "balance_loss_clip": 1.31120825, + "balance_loss_mlp": 1.03115988, + "epoch": 0.3582143393957613, + "flos": 23378740560000.0, + "grad_norm": 2.009572949899705, + "language_loss": 0.72166222, + "learning_rate": 2.972443318242726e-06, + "loss": 0.74714309, + "num_input_tokens_seen": 128037875, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.20483398, + "step": 5958, + "time_per_iteration": 2.9787826538085938 + }, + { + "auxiliary_loss_clip": 0.01483385, + "auxiliary_loss_mlp": 0.01046636, + "balance_loss_clip": 1.30329835, + "balance_loss_mlp": 1.0245229, + "epoch": 0.35827446264842927, + "flos": 26334529332480.0, + "grad_norm": 1.8203149964298313, + "language_loss": 0.89421296, + "learning_rate": 2.972102974360324e-06, + "loss": 0.91951311, + "num_input_tokens_seen": 128056045, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.22106934, + "step": 5959, + "time_per_iteration": 4.326797723770142 + }, + { + "auxiliary_loss_clip": 0.01492559, + "auxiliary_loss_mlp": 0.01046098, + "balance_loss_clip": 1.31028676, + "balance_loss_mlp": 1.02475905, + "epoch": 0.35833458590109724, + "flos": 30459049269120.0, + "grad_norm": 1.6103273068722281, + "language_loss": 0.59376442, + "learning_rate": 2.971762593615679e-06, + "loss": 0.619151, + "num_input_tokens_seen": 128077815, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.21337891, + "step": 5960, + "time_per_iteration": 4.470661163330078 + }, + { + "auxiliary_loss_clip": 0.01486271, + "auxiliary_loss_mlp": 0.01047095, + "balance_loss_clip": 1.30306721, + "balance_loss_mlp": 1.02477884, + "epoch": 0.3583947091537652, + "flos": 14838060833280.0, + "grad_norm": 1.880272712120269, + "language_loss": 0.77433181, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.79966545, + "num_input_tokens_seen": 128095460, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.22314453, + "step": 5961, + "time_per_iteration": 2.894744873046875 + }, + { + "auxiliary_loss_clip": 0.0149072, + "auxiliary_loss_mlp": 0.01043114, + "balance_loss_clip": 1.30680418, + "balance_loss_mlp": 1.02100062, + "epoch": 0.35845483240643317, + "flos": 34253941703040.0, + "grad_norm": 2.024029564631576, + "language_loss": 0.71140575, + "learning_rate": 2.971081721591294e-06, + "loss": 0.73674411, + "num_input_tokens_seen": 128118605, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.22119141, + "step": 5962, + "time_per_iteration": 3.031308889389038 + }, + { + "auxiliary_loss_clip": 0.01486048, + "auxiliary_loss_mlp": 0.01039945, + "balance_loss_clip": 1.30493164, + "balance_loss_mlp": 1.01902378, + "epoch": 0.35851495565910113, + "flos": 20970008392320.0, + "grad_norm": 1.5701342569419792, + "language_loss": 0.75035942, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.77561939, + "num_input_tokens_seen": 128139205, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.20910645, + "step": 5963, + "time_per_iteration": 4.249804496765137 + }, + { + "auxiliary_loss_clip": 0.01499251, + "auxiliary_loss_mlp": 0.01046962, + "balance_loss_clip": 1.31642973, + "balance_loss_mlp": 1.02452683, + "epoch": 0.35857507891176915, + "flos": 22320081290880.0, + "grad_norm": 1.5903193486383551, + "language_loss": 0.79304427, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.81850642, + "num_input_tokens_seen": 128158765, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.22436523, + "step": 5964, + "time_per_iteration": 2.8704819679260254 + }, + { + "auxiliary_loss_clip": 0.01492649, + "auxiliary_loss_mlp": 0.01041172, + "balance_loss_clip": 1.30660701, + "balance_loss_mlp": 1.01905894, + "epoch": 0.3586352021644371, + "flos": 23378333356800.0, + "grad_norm": 2.7079500986767817, + "language_loss": 0.67707086, + "learning_rate": 2.970060137410626e-06, + "loss": 0.70240903, + "num_input_tokens_seen": 128177850, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.22119141, + "step": 5965, + "time_per_iteration": 2.8442797660827637 + }, + { + "auxiliary_loss_clip": 0.01487041, + "auxiliary_loss_mlp": 0.01041203, + "balance_loss_clip": 1.30531955, + "balance_loss_mlp": 1.01823115, + "epoch": 0.3586953254171051, + "flos": 27859202490240.0, + "grad_norm": 1.572995959585258, + "language_loss": 0.80110067, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.82638311, + "num_input_tokens_seen": 128196925, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.22961426, + "step": 5966, + "time_per_iteration": 2.9373557567596436 + }, + { + "auxiliary_loss_clip": 0.01490374, + "auxiliary_loss_mlp": 0.01044635, + "balance_loss_clip": 1.30628479, + "balance_loss_mlp": 1.02181816, + "epoch": 0.35875544866977305, + "flos": 19509999333120.0, + "grad_norm": 2.407760084263722, + "language_loss": 0.92467141, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.95002151, + "num_input_tokens_seen": 128213955, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.22827148, + "step": 5967, + "time_per_iteration": 2.9103174209594727 + }, + { + "auxiliary_loss_clip": 0.0148151, + "auxiliary_loss_mlp": 0.01039066, + "balance_loss_clip": 1.29564428, + "balance_loss_mlp": 1.01613045, + "epoch": 0.358815571922441, + "flos": 21481249098240.0, + "grad_norm": 1.6960847891276656, + "language_loss": 0.80918187, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.83438766, + "num_input_tokens_seen": 128232980, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.22949219, + "step": 5968, + "time_per_iteration": 2.828022003173828 + }, + { + "auxiliary_loss_clip": 0.01496921, + "auxiliary_loss_mlp": 0.01049277, + "balance_loss_clip": 1.31130219, + "balance_loss_mlp": 1.02544701, + "epoch": 0.358875695175109, + "flos": 21845470890240.0, + "grad_norm": 2.0794903990954823, + "language_loss": 0.85847002, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.88393199, + "num_input_tokens_seen": 128252795, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.23828125, + "step": 5969, + "time_per_iteration": 2.87953782081604 + }, + { + "auxiliary_loss_clip": 0.01490783, + "auxiliary_loss_mlp": 0.01041304, + "balance_loss_clip": 1.30901122, + "balance_loss_mlp": 1.02006125, + "epoch": 0.35893581842777694, + "flos": 32023474623360.0, + "grad_norm": 2.1749556858516534, + "language_loss": 0.73019499, + "learning_rate": 2.968356761586202e-06, + "loss": 0.75551587, + "num_input_tokens_seen": 128273115, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.21240234, + "step": 5970, + "time_per_iteration": 2.987529993057251 + }, + { + "auxiliary_loss_clip": 0.01488511, + "auxiliary_loss_mlp": 0.01040108, + "balance_loss_clip": 1.30528069, + "balance_loss_mlp": 1.0182693, + "epoch": 0.3589959416804449, + "flos": 20495624215680.0, + "grad_norm": 1.6739271661964252, + "language_loss": 0.80937016, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.83465636, + "num_input_tokens_seen": 128292220, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.21838379, + "step": 5971, + "time_per_iteration": 2.8597004413604736 + }, + { + "auxiliary_loss_clip": 0.01500039, + "auxiliary_loss_mlp": 0.01041085, + "balance_loss_clip": 1.31184816, + "balance_loss_mlp": 1.01800561, + "epoch": 0.3590560649331129, + "flos": 16189400586240.0, + "grad_norm": 1.7113892545255982, + "language_loss": 0.79579908, + "learning_rate": 2.967675154124696e-06, + "loss": 0.82121027, + "num_input_tokens_seen": 128310305, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.23071289, + "step": 5972, + "time_per_iteration": 2.821305751800537 + }, + { + "auxiliary_loss_clip": 0.01496178, + "auxiliary_loss_mlp": 0.01042794, + "balance_loss_clip": 1.31036615, + "balance_loss_mlp": 1.02121711, + "epoch": 0.35911618818578084, + "flos": 20384918893440.0, + "grad_norm": 2.0982417181111592, + "language_loss": 0.8193934, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.84478313, + "num_input_tokens_seen": 128328305, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.21594238, + "step": 5973, + "time_per_iteration": 2.855057716369629 + }, + { + "auxiliary_loss_clip": 0.01274594, + "auxiliary_loss_mlp": 0.01065437, + "balance_loss_clip": 1.16429627, + "balance_loss_mlp": 1.04150009, + "epoch": 0.3591763114384488, + "flos": 41258881278720.0, + "grad_norm": 0.937385527100441, + "language_loss": 0.56788468, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.59128499, + "num_input_tokens_seen": 128378380, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.23925781, + "step": 5974, + "time_per_iteration": 3.221466064453125 + }, + { + "auxiliary_loss_clip": 0.01490101, + "auxiliary_loss_mlp": 0.01043675, + "balance_loss_clip": 1.30669689, + "balance_loss_mlp": 1.02221704, + "epoch": 0.35923643469111677, + "flos": 18704132616960.0, + "grad_norm": 2.4374672598232894, + "language_loss": 0.702465, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.72780281, + "num_input_tokens_seen": 128394315, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.21484375, + "step": 5975, + "time_per_iteration": 2.8233208656311035 + }, + { + "auxiliary_loss_clip": 0.0148847, + "auxiliary_loss_mlp": 0.01041877, + "balance_loss_clip": 1.30470824, + "balance_loss_mlp": 1.01943016, + "epoch": 0.35929655794378473, + "flos": 25020905760000.0, + "grad_norm": 2.1039657682169013, + "language_loss": 0.8082428, + "learning_rate": 2.96631149897303e-06, + "loss": 0.83354628, + "num_input_tokens_seen": 128414515, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.22460938, + "step": 5976, + "time_per_iteration": 2.92484188079834 + }, + { + "auxiliary_loss_clip": 0.01477252, + "auxiliary_loss_mlp": 0.01046077, + "balance_loss_clip": 1.29442537, + "balance_loss_mlp": 1.02219987, + "epoch": 0.35935668119645275, + "flos": 14983722403200.0, + "grad_norm": 2.0275011794583984, + "language_loss": 0.80002618, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.82525945, + "num_input_tokens_seen": 128430615, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.23852539, + "step": 5977, + "time_per_iteration": 2.812204122543335 + }, + { + "auxiliary_loss_clip": 0.01472746, + "auxiliary_loss_mlp": 0.01037536, + "balance_loss_clip": 1.29199219, + "balance_loss_mlp": 1.01504135, + "epoch": 0.3594168044491207, + "flos": 21187889942400.0, + "grad_norm": 1.8909056252313554, + "language_loss": 0.81374747, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.83885026, + "num_input_tokens_seen": 128449480, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.22497559, + "step": 5978, + "time_per_iteration": 2.888421058654785 + }, + { + "auxiliary_loss_clip": 0.01497265, + "auxiliary_loss_mlp": 0.01041053, + "balance_loss_clip": 1.31067848, + "balance_loss_mlp": 1.01893973, + "epoch": 0.3594769277017887, + "flos": 27683471111040.0, + "grad_norm": 1.81120802292484, + "language_loss": 0.6855104, + "learning_rate": 2.965288372816436e-06, + "loss": 0.71089357, + "num_input_tokens_seen": 128471465, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.22106934, + "step": 5979, + "time_per_iteration": 2.988886833190918 + }, + { + "auxiliary_loss_clip": 0.01486913, + "auxiliary_loss_mlp": 0.01043987, + "balance_loss_clip": 1.30287552, + "balance_loss_mlp": 1.02186203, + "epoch": 0.35953705095445665, + "flos": 23012527996800.0, + "grad_norm": 2.413972486205155, + "language_loss": 0.68391323, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.70922226, + "num_input_tokens_seen": 128490645, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.22131348, + "step": 5980, + "time_per_iteration": 2.897665023803711 + }, + { + "auxiliary_loss_clip": 0.01519686, + "auxiliary_loss_mlp": 0.01053067, + "balance_loss_clip": 1.32744837, + "balance_loss_mlp": 1.02949905, + "epoch": 0.3595971742071246, + "flos": 25523323729920.0, + "grad_norm": 1.8092605196932077, + "language_loss": 0.71995735, + "learning_rate": 2.964606105671327e-06, + "loss": 0.74568498, + "num_input_tokens_seen": 128510225, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.23571777, + "step": 5981, + "time_per_iteration": 2.962467670440674 + }, + { + "auxiliary_loss_clip": 0.01502061, + "auxiliary_loss_mlp": 0.01051471, + "balance_loss_clip": 1.3135556, + "balance_loss_mlp": 1.02765322, + "epoch": 0.3596572974597926, + "flos": 29874774176640.0, + "grad_norm": 1.7206835342983149, + "language_loss": 0.71864879, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.74418414, + "num_input_tokens_seen": 128530195, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.23815918, + "step": 5982, + "time_per_iteration": 3.0216333866119385 + }, + { + "auxiliary_loss_clip": 0.01477365, + "auxiliary_loss_mlp": 0.01048331, + "balance_loss_clip": 1.2980938, + "balance_loss_mlp": 1.02614617, + "epoch": 0.35971742071246054, + "flos": 23122961850240.0, + "grad_norm": 1.662769111284471, + "language_loss": 0.76723337, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.79249036, + "num_input_tokens_seen": 128549990, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.22167969, + "step": 5983, + "time_per_iteration": 2.878126859664917 + }, + { + "auxiliary_loss_clip": 0.01511149, + "auxiliary_loss_mlp": 0.01054675, + "balance_loss_clip": 1.32289028, + "balance_loss_mlp": 1.03140569, + "epoch": 0.3597775439651285, + "flos": 16733561523840.0, + "grad_norm": 2.117915875455009, + "language_loss": 0.7658453, + "learning_rate": 2.96358243065131e-06, + "loss": 0.79150355, + "num_input_tokens_seen": 128567925, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.23278809, + "step": 5984, + "time_per_iteration": 2.840381383895874 + }, + { + "auxiliary_loss_clip": 0.01485698, + "auxiliary_loss_mlp": 0.01048311, + "balance_loss_clip": 1.30466104, + "balance_loss_mlp": 1.02706838, + "epoch": 0.3598376672177965, + "flos": 19729057248000.0, + "grad_norm": 1.865716805996135, + "language_loss": 0.87485266, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.90019274, + "num_input_tokens_seen": 128585655, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.21228027, + "step": 5985, + "time_per_iteration": 2.8272838592529297 + }, + { + "auxiliary_loss_clip": 0.01489688, + "auxiliary_loss_mlp": 0.01045294, + "balance_loss_clip": 1.3085382, + "balance_loss_mlp": 1.02405071, + "epoch": 0.35989779047046444, + "flos": 17320506059520.0, + "grad_norm": 1.4576524482883215, + "language_loss": 0.7358681, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.76121795, + "num_input_tokens_seen": 128604820, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.21252441, + "step": 5986, + "time_per_iteration": 2.8298404216766357 + }, + { + "auxiliary_loss_clip": 0.01520025, + "auxiliary_loss_mlp": 0.01053291, + "balance_loss_clip": 1.32899809, + "balance_loss_mlp": 1.03128493, + "epoch": 0.3599579137231324, + "flos": 22721566815360.0, + "grad_norm": 1.8413381032419178, + "language_loss": 0.74763393, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.77336705, + "num_input_tokens_seen": 128623070, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.22009277, + "step": 5987, + "time_per_iteration": 2.85229754447937 + }, + { + "auxiliary_loss_clip": 0.01499747, + "auxiliary_loss_mlp": 0.01049338, + "balance_loss_clip": 1.31347215, + "balance_loss_mlp": 1.02792764, + "epoch": 0.36001803697580037, + "flos": 20969736923520.0, + "grad_norm": 5.557195787198832, + "language_loss": 0.70809519, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.73358607, + "num_input_tokens_seen": 128642430, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.21411133, + "step": 5988, + "time_per_iteration": 4.283057689666748 + }, + { + "auxiliary_loss_clip": 0.01503994, + "auxiliary_loss_mlp": 0.01054621, + "balance_loss_clip": 1.31612587, + "balance_loss_mlp": 1.0320431, + "epoch": 0.36007816022846834, + "flos": 20495307502080.0, + "grad_norm": 1.7321591849179958, + "language_loss": 0.73957825, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.76516438, + "num_input_tokens_seen": 128661285, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.22583008, + "step": 5989, + "time_per_iteration": 2.8868801593780518 + }, + { + "auxiliary_loss_clip": 0.01491715, + "auxiliary_loss_mlp": 0.01047326, + "balance_loss_clip": 1.30843019, + "balance_loss_mlp": 1.02617824, + "epoch": 0.36013828348113636, + "flos": 28012827144960.0, + "grad_norm": 1.5499160553451723, + "language_loss": 0.80542225, + "learning_rate": 2.961534094403931e-06, + "loss": 0.83081269, + "num_input_tokens_seen": 128682210, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.21142578, + "step": 5990, + "time_per_iteration": 2.911135673522949 + }, + { + "auxiliary_loss_clip": 0.01489874, + "auxiliary_loss_mlp": 0.01049376, + "balance_loss_clip": 1.30538452, + "balance_loss_mlp": 1.02745295, + "epoch": 0.3601984067338043, + "flos": 20091288268800.0, + "grad_norm": 1.5957317966349969, + "language_loss": 0.84539437, + "learning_rate": 2.961192577338698e-06, + "loss": 0.87078679, + "num_input_tokens_seen": 128700445, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.21923828, + "step": 5991, + "time_per_iteration": 2.8812973499298096 + }, + { + "auxiliary_loss_clip": 0.01498956, + "auxiliary_loss_mlp": 0.01049322, + "balance_loss_clip": 1.31004298, + "balance_loss_mlp": 1.02792442, + "epoch": 0.3602585299864723, + "flos": 18624990182400.0, + "grad_norm": 1.8219980675124852, + "language_loss": 0.76301718, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.78849995, + "num_input_tokens_seen": 128716855, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.21398926, + "step": 5992, + "time_per_iteration": 2.8238253593444824 + }, + { + "auxiliary_loss_clip": 0.01492937, + "auxiliary_loss_mlp": 0.01052207, + "balance_loss_clip": 1.30932975, + "balance_loss_mlp": 1.029212, + "epoch": 0.36031865323914025, + "flos": 19582400292480.0, + "grad_norm": 1.9451218457664603, + "language_loss": 0.78169882, + "learning_rate": 2.960509433875627e-06, + "loss": 0.8071503, + "num_input_tokens_seen": 128735835, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.23010254, + "step": 5993, + "time_per_iteration": 2.854663848876953 + }, + { + "auxiliary_loss_clip": 0.01487543, + "auxiliary_loss_mlp": 0.01049163, + "balance_loss_clip": 1.30133951, + "balance_loss_mlp": 1.02638245, + "epoch": 0.3603787764918082, + "flos": 17498771147520.0, + "grad_norm": 3.679889935585441, + "language_loss": 0.75882024, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.78418732, + "num_input_tokens_seen": 128752465, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.22766113, + "step": 5994, + "time_per_iteration": 4.268512964248657 + }, + { + "auxiliary_loss_clip": 0.01499258, + "auxiliary_loss_mlp": 0.01042932, + "balance_loss_clip": 1.31139731, + "balance_loss_mlp": 1.02102149, + "epoch": 0.3604388997444762, + "flos": 15531819638400.0, + "grad_norm": 2.0562246084855267, + "language_loss": 0.70488656, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.73030853, + "num_input_tokens_seen": 128770865, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.21911621, + "step": 5995, + "time_per_iteration": 4.23672080039978 + }, + { + "auxiliary_loss_clip": 0.0150898, + "auxiliary_loss_mlp": 0.01051889, + "balance_loss_clip": 1.32044816, + "balance_loss_mlp": 1.02995491, + "epoch": 0.36049902299714415, + "flos": 17319691653120.0, + "grad_norm": 2.5461852925319537, + "language_loss": 0.82792377, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.85353243, + "num_input_tokens_seen": 128789730, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.21948242, + "step": 5996, + "time_per_iteration": 2.8254098892211914 + }, + { + "auxiliary_loss_clip": 0.01499573, + "auxiliary_loss_mlp": 0.01044571, + "balance_loss_clip": 1.31444621, + "balance_loss_mlp": 1.02270854, + "epoch": 0.3605591462498121, + "flos": 17064908328960.0, + "grad_norm": 1.5472151844505635, + "language_loss": 0.73631191, + "learning_rate": 2.959142709981763e-06, + "loss": 0.76175344, + "num_input_tokens_seen": 128806610, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.21862793, + "step": 5997, + "time_per_iteration": 2.9526751041412354 + }, + { + "auxiliary_loss_clip": 0.01476889, + "auxiliary_loss_mlp": 0.01042143, + "balance_loss_clip": 1.29508293, + "balance_loss_mlp": 1.0204711, + "epoch": 0.3606192695024801, + "flos": 16845624190080.0, + "grad_norm": 2.4186230848539205, + "language_loss": 0.70902014, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.73421043, + "num_input_tokens_seen": 128824830, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.21679688, + "step": 5998, + "time_per_iteration": 4.274650573730469 + }, + { + "auxiliary_loss_clip": 0.0147316, + "auxiliary_loss_mlp": 0.01037681, + "balance_loss_clip": 1.29001379, + "balance_loss_mlp": 1.01474512, + "epoch": 0.36067939275514804, + "flos": 12137055384960.0, + "grad_norm": 2.651976858957742, + "language_loss": 0.78432447, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.80943286, + "num_input_tokens_seen": 128838170, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.22937012, + "step": 5999, + "time_per_iteration": 2.7896034717559814 + }, + { + "auxiliary_loss_clip": 0.01497739, + "auxiliary_loss_mlp": 0.01041361, + "balance_loss_clip": 1.31211114, + "balance_loss_mlp": 1.01904464, + "epoch": 0.360739516007816, + "flos": 18050668945920.0, + "grad_norm": 5.085688751135163, + "language_loss": 0.78998792, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.81537902, + "num_input_tokens_seen": 128855625, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.22302246, + "step": 6000, + "time_per_iteration": 2.8993637561798096 + }, + { + "auxiliary_loss_clip": 0.01479879, + "auxiliary_loss_mlp": 0.01040198, + "balance_loss_clip": 1.29653084, + "balance_loss_mlp": 1.01637948, + "epoch": 0.360799639260484, + "flos": 18558516291840.0, + "grad_norm": 1.7224950187403487, + "language_loss": 0.79688394, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.82208467, + "num_input_tokens_seen": 128873540, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.23828125, + "step": 6001, + "time_per_iteration": 2.889944314956665 + }, + { + "auxiliary_loss_clip": 0.01469869, + "auxiliary_loss_mlp": 0.01038817, + "balance_loss_clip": 1.28965604, + "balance_loss_mlp": 1.01660872, + "epoch": 0.36085976251315194, + "flos": 19691341067520.0, + "grad_norm": 1.9769917908939385, + "language_loss": 0.8401531, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.86523998, + "num_input_tokens_seen": 128889925, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.22216797, + "step": 6002, + "time_per_iteration": 2.840131998062134 + }, + { + "auxiliary_loss_clip": 0.01451063, + "auxiliary_loss_mlp": 0.01033771, + "balance_loss_clip": 1.27608585, + "balance_loss_mlp": 1.01194382, + "epoch": 0.3609198857658199, + "flos": 24207528407040.0, + "grad_norm": 2.1826438725327506, + "language_loss": 0.92234683, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.94719517, + "num_input_tokens_seen": 128906890, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.21838379, + "step": 6003, + "time_per_iteration": 2.8664186000823975 + }, + { + "auxiliary_loss_clip": 0.01255104, + "auxiliary_loss_mlp": 0.01031325, + "balance_loss_clip": 1.1455977, + "balance_loss_mlp": 1.01139283, + "epoch": 0.3609800090184879, + "flos": 57144380670720.0, + "grad_norm": 0.869269092296618, + "language_loss": 0.5346365, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.55750084, + "num_input_tokens_seen": 128965940, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.19921875, + "step": 6004, + "time_per_iteration": 3.2506120204925537 + }, + { + "auxiliary_loss_clip": 0.01480038, + "auxiliary_loss_mlp": 0.01042632, + "balance_loss_clip": 1.29481912, + "balance_loss_mlp": 1.01821768, + "epoch": 0.3610401322711559, + "flos": 20820365280000.0, + "grad_norm": 2.1921643610064003, + "language_loss": 0.7850076, + "learning_rate": 2.956407517225883e-06, + "loss": 0.81023431, + "num_input_tokens_seen": 128985835, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.2442627, + "step": 6005, + "time_per_iteration": 2.88390851020813 + }, + { + "auxiliary_loss_clip": 0.01467112, + "auxiliary_loss_mlp": 0.01039648, + "balance_loss_clip": 1.28617859, + "balance_loss_mlp": 1.01630664, + "epoch": 0.36110025552382385, + "flos": 13707136339200.0, + "grad_norm": 1.9836770032587356, + "language_loss": 0.79772711, + "learning_rate": 2.956065454793429e-06, + "loss": 0.82279468, + "num_input_tokens_seen": 129003120, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.2331543, + "step": 6006, + "time_per_iteration": 2.838115692138672 + }, + { + "auxiliary_loss_clip": 0.01478501, + "auxiliary_loss_mlp": 0.01039729, + "balance_loss_clip": 1.29485846, + "balance_loss_mlp": 1.01530337, + "epoch": 0.3611603787764918, + "flos": 22465064188800.0, + "grad_norm": 1.8919396841344198, + "language_loss": 0.85496676, + "learning_rate": 2.955723356106876e-06, + "loss": 0.88014907, + "num_input_tokens_seen": 129021645, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.24414062, + "step": 6007, + "time_per_iteration": 2.8564505577087402 + }, + { + "auxiliary_loss_clip": 0.01504027, + "auxiliary_loss_mlp": 0.01041349, + "balance_loss_clip": 1.31156743, + "balance_loss_mlp": 1.01805568, + "epoch": 0.3612205020291598, + "flos": 20896431068160.0, + "grad_norm": 2.018110748395329, + "language_loss": 0.73032677, + "learning_rate": 2.955381221179198e-06, + "loss": 0.75578058, + "num_input_tokens_seen": 129038375, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.23291016, + "step": 6008, + "time_per_iteration": 2.826467514038086 + }, + { + "auxiliary_loss_clip": 0.01485481, + "auxiliary_loss_mlp": 0.01041115, + "balance_loss_clip": 1.3007201, + "balance_loss_mlp": 1.01850057, + "epoch": 0.36128062528182775, + "flos": 15750379860480.0, + "grad_norm": 2.361592914507676, + "language_loss": 0.84362662, + "learning_rate": 2.955039050023368e-06, + "loss": 0.86889255, + "num_input_tokens_seen": 129056235, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.22631836, + "step": 6009, + "time_per_iteration": 2.840806245803833 + }, + { + "auxiliary_loss_clip": 0.01483452, + "auxiliary_loss_mlp": 0.01042579, + "balance_loss_clip": 1.29866838, + "balance_loss_mlp": 1.01880908, + "epoch": 0.3613407485344957, + "flos": 16773404209920.0, + "grad_norm": 2.460389416092129, + "language_loss": 0.76953375, + "learning_rate": 2.954696842652362e-06, + "loss": 0.79479408, + "num_input_tokens_seen": 129072405, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.2376709, + "step": 6010, + "time_per_iteration": 2.8667469024658203 + }, + { + "auxiliary_loss_clip": 0.01485212, + "auxiliary_loss_mlp": 0.01043363, + "balance_loss_clip": 1.30084503, + "balance_loss_mlp": 1.0200336, + "epoch": 0.3614008717871637, + "flos": 20379625251840.0, + "grad_norm": 5.359851133144061, + "language_loss": 0.835504, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.86078978, + "num_input_tokens_seen": 129090225, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.23327637, + "step": 6011, + "time_per_iteration": 2.8594775199890137 + }, + { + "auxiliary_loss_clip": 0.01499994, + "auxiliary_loss_mlp": 0.01050077, + "balance_loss_clip": 1.30966067, + "balance_loss_mlp": 1.02457809, + "epoch": 0.36146099503983165, + "flos": 22785054549120.0, + "grad_norm": 2.0202564358939483, + "language_loss": 0.62913543, + "learning_rate": 2.954012319316727e-06, + "loss": 0.6546362, + "num_input_tokens_seen": 129107685, + "router_z_loss_clip": 1.90136719, + "router_z_loss_mlp": 0.25488281, + "step": 6012, + "time_per_iteration": 2.8527724742889404 + }, + { + "auxiliary_loss_clip": 0.01462264, + "auxiliary_loss_mlp": 0.01041049, + "balance_loss_clip": 1.28174961, + "balance_loss_mlp": 1.01850629, + "epoch": 0.3615211182924996, + "flos": 23005379318400.0, + "grad_norm": 1.7624527857848, + "language_loss": 0.84157717, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.86661029, + "num_input_tokens_seen": 129125315, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.22558594, + "step": 6013, + "time_per_iteration": 2.863112449645996 + }, + { + "auxiliary_loss_clip": 0.01466984, + "auxiliary_loss_mlp": 0.01039737, + "balance_loss_clip": 1.28336537, + "balance_loss_mlp": 1.0165503, + "epoch": 0.3615812415451676, + "flos": 16655143006080.0, + "grad_norm": 2.242357097041817, + "language_loss": 0.92338067, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.94844782, + "num_input_tokens_seen": 129141600, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.23181152, + "step": 6014, + "time_per_iteration": 2.818052291870117 + }, + { + "auxiliary_loss_clip": 0.01462698, + "auxiliary_loss_mlp": 0.01043967, + "balance_loss_clip": 1.28275335, + "balance_loss_mlp": 1.02101874, + "epoch": 0.36164136479783554, + "flos": 21328846053120.0, + "grad_norm": 1.6116860931682282, + "language_loss": 0.74907541, + "learning_rate": 2.95298526302391e-06, + "loss": 0.77414203, + "num_input_tokens_seen": 129160665, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.22961426, + "step": 6015, + "time_per_iteration": 2.875652551651001 + }, + { + "auxiliary_loss_clip": 0.01486155, + "auxiliary_loss_mlp": 0.01038997, + "balance_loss_clip": 1.30130613, + "balance_loss_mlp": 1.01507151, + "epoch": 0.3617014880505035, + "flos": 24179992306560.0, + "grad_norm": 3.44407887045406, + "language_loss": 0.66040599, + "learning_rate": 2.9526428386344e-06, + "loss": 0.6856575, + "num_input_tokens_seen": 129179220, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.23901367, + "step": 6016, + "time_per_iteration": 2.93022084236145 + }, + { + "auxiliary_loss_clip": 0.01481329, + "auxiliary_loss_mlp": 0.01040032, + "balance_loss_clip": 1.29680371, + "balance_loss_mlp": 1.01579642, + "epoch": 0.3617616113031715, + "flos": 39027943768320.0, + "grad_norm": 1.7281043797984248, + "language_loss": 0.7265287, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.7517423, + "num_input_tokens_seen": 129200385, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.2421875, + "step": 6017, + "time_per_iteration": 3.027055025100708 + }, + { + "auxiliary_loss_clip": 0.01478899, + "auxiliary_loss_mlp": 0.01036759, + "balance_loss_clip": 1.2924211, + "balance_loss_mlp": 1.01402569, + "epoch": 0.3618217345558395, + "flos": 12138548463360.0, + "grad_norm": 1.984576165815701, + "language_loss": 0.74565673, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.77081323, + "num_input_tokens_seen": 129217395, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.22741699, + "step": 6018, + "time_per_iteration": 2.8565731048583984 + }, + { + "auxiliary_loss_clip": 0.0144867, + "auxiliary_loss_mlp": 0.0103884, + "balance_loss_clip": 1.27234447, + "balance_loss_mlp": 1.01478374, + "epoch": 0.36188185780850746, + "flos": 24945428154240.0, + "grad_norm": 1.5514589197786195, + "language_loss": 0.6952678, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.72014296, + "num_input_tokens_seen": 129238940, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.24072266, + "step": 6019, + "time_per_iteration": 2.9547243118286133 + }, + { + "auxiliary_loss_clip": 0.01486761, + "auxiliary_loss_mlp": 0.01037617, + "balance_loss_clip": 1.30085254, + "balance_loss_mlp": 1.01502705, + "epoch": 0.3619419810611754, + "flos": 20968424824320.0, + "grad_norm": 1.8075555641827235, + "language_loss": 0.7680167, + "learning_rate": 2.95127277996311e-06, + "loss": 0.79326046, + "num_input_tokens_seen": 129258240, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.22595215, + "step": 6020, + "time_per_iteration": 2.8417208194732666 + }, + { + "auxiliary_loss_clip": 0.01477576, + "auxiliary_loss_mlp": 0.01039736, + "balance_loss_clip": 1.29336619, + "balance_loss_mlp": 1.01662099, + "epoch": 0.3620021043138434, + "flos": 22539184450560.0, + "grad_norm": 2.3496219508852967, + "language_loss": 0.74396271, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.76913577, + "num_input_tokens_seen": 129279040, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.23132324, + "step": 6021, + "time_per_iteration": 2.923877716064453 + }, + { + "auxiliary_loss_clip": 0.01465787, + "auxiliary_loss_mlp": 0.01041691, + "balance_loss_clip": 1.28388858, + "balance_loss_mlp": 1.01863623, + "epoch": 0.36206222756651135, + "flos": 15605215983360.0, + "grad_norm": 1.6613944900577262, + "language_loss": 0.82073897, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.84581381, + "num_input_tokens_seen": 129295415, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.23071289, + "step": 6022, + "time_per_iteration": 2.842090129852295 + }, + { + "auxiliary_loss_clip": 0.01454331, + "auxiliary_loss_mlp": 0.01036274, + "balance_loss_clip": 1.27783966, + "balance_loss_mlp": 1.01318312, + "epoch": 0.3621223508191793, + "flos": 23597798474880.0, + "grad_norm": 1.7398921930124356, + "language_loss": 0.82129693, + "learning_rate": 2.950244857154417e-06, + "loss": 0.84620303, + "num_input_tokens_seen": 129312620, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.2310791, + "step": 6023, + "time_per_iteration": 2.857262134552002 + }, + { + "auxiliary_loss_clip": 0.01481871, + "auxiliary_loss_mlp": 0.01037246, + "balance_loss_clip": 1.29528308, + "balance_loss_mlp": 1.01322508, + "epoch": 0.3621824740718473, + "flos": 22320126535680.0, + "grad_norm": 1.6578131460468306, + "language_loss": 0.80939245, + "learning_rate": 2.9499021441341e-06, + "loss": 0.83458358, + "num_input_tokens_seen": 129331825, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.24035645, + "step": 6024, + "time_per_iteration": 4.258174896240234 + }, + { + "auxiliary_loss_clip": 0.01450895, + "auxiliary_loss_mlp": 0.01037087, + "balance_loss_clip": 1.27328265, + "balance_loss_mlp": 1.01411557, + "epoch": 0.36224259732451525, + "flos": 16772318334720.0, + "grad_norm": 1.8344432753428062, + "language_loss": 0.75758427, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.78246403, + "num_input_tokens_seen": 129350400, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.22998047, + "step": 6025, + "time_per_iteration": 2.849942922592163 + }, + { + "auxiliary_loss_clip": 0.01462465, + "auxiliary_loss_mlp": 0.01039728, + "balance_loss_clip": 1.28141975, + "balance_loss_mlp": 1.01581466, + "epoch": 0.3623027205771832, + "flos": 23160361317120.0, + "grad_norm": 1.6778386719154663, + "language_loss": 0.73606706, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.76108897, + "num_input_tokens_seen": 129371155, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.23937988, + "step": 6026, + "time_per_iteration": 2.8670058250427246 + }, + { + "auxiliary_loss_clip": 0.01487545, + "auxiliary_loss_mlp": 0.01045057, + "balance_loss_clip": 1.29971075, + "balance_loss_mlp": 1.02052343, + "epoch": 0.3623628438298512, + "flos": 28561060114560.0, + "grad_norm": 1.9549438957194902, + "language_loss": 0.79277408, + "learning_rate": 2.948873789002833e-06, + "loss": 0.81810009, + "num_input_tokens_seen": 129391230, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.24511719, + "step": 6027, + "time_per_iteration": 2.9297337532043457 + }, + { + "auxiliary_loss_clip": 0.01463469, + "auxiliary_loss_mlp": 0.01042362, + "balance_loss_clip": 1.28014421, + "balance_loss_mlp": 1.0183053, + "epoch": 0.36242296708251914, + "flos": 25496194832640.0, + "grad_norm": 1.8062734011625254, + "language_loss": 0.68703246, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.71209073, + "num_input_tokens_seen": 129410065, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.24072266, + "step": 6028, + "time_per_iteration": 2.9004640579223633 + }, + { + "auxiliary_loss_clip": 0.01468405, + "auxiliary_loss_mlp": 0.01039363, + "balance_loss_clip": 1.28793144, + "balance_loss_mlp": 1.01643872, + "epoch": 0.3624830903351871, + "flos": 16299201012480.0, + "grad_norm": 1.7769836831604713, + "language_loss": 0.86191285, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.88699049, + "num_input_tokens_seen": 129428655, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.22937012, + "step": 6029, + "time_per_iteration": 4.284532308578491 + }, + { + "auxiliary_loss_clip": 0.01464272, + "auxiliary_loss_mlp": 0.01040228, + "balance_loss_clip": 1.28487945, + "balance_loss_mlp": 1.0168035, + "epoch": 0.36254321358785513, + "flos": 18305452270080.0, + "grad_norm": 1.6361069531614483, + "language_loss": 0.73559022, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.7606352, + "num_input_tokens_seen": 129447845, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.23413086, + "step": 6030, + "time_per_iteration": 4.263043403625488 + }, + { + "auxiliary_loss_clip": 0.01483187, + "auxiliary_loss_mlp": 0.01042347, + "balance_loss_clip": 1.29708982, + "balance_loss_mlp": 1.017766, + "epoch": 0.3626033368405231, + "flos": 14873198060160.0, + "grad_norm": 2.1758758588396123, + "language_loss": 0.75513828, + "learning_rate": 2.94750214514905e-06, + "loss": 0.7803936, + "num_input_tokens_seen": 129463275, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.24597168, + "step": 6031, + "time_per_iteration": 2.9086179733276367 + }, + { + "auxiliary_loss_clip": 0.01463608, + "auxiliary_loss_mlp": 0.01039067, + "balance_loss_clip": 1.28213191, + "balance_loss_mlp": 1.01567864, + "epoch": 0.36266346009319106, + "flos": 22315964014080.0, + "grad_norm": 1.687661347278102, + "language_loss": 0.74576586, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.77079266, + "num_input_tokens_seen": 129483205, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.23388672, + "step": 6032, + "time_per_iteration": 2.852872610092163 + }, + { + "auxiliary_loss_clip": 0.01480281, + "auxiliary_loss_mlp": 0.01044397, + "balance_loss_clip": 1.29647541, + "balance_loss_mlp": 1.02167547, + "epoch": 0.362723583345859, + "flos": 18231196273920.0, + "grad_norm": 1.9718548503373239, + "language_loss": 0.78449929, + "learning_rate": 2.946816107593884e-06, + "loss": 0.80974609, + "num_input_tokens_seen": 129499885, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.22741699, + "step": 6033, + "time_per_iteration": 4.240483045578003 + }, + { + "auxiliary_loss_clip": 0.01276045, + "auxiliary_loss_mlp": 0.0103464, + "balance_loss_clip": 1.168185, + "balance_loss_mlp": 1.01356399, + "epoch": 0.362783706598527, + "flos": 68530415316480.0, + "grad_norm": 0.7963741240708653, + "language_loss": 0.64847732, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.67158413, + "num_input_tokens_seen": 129561885, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.2109375, + "step": 6034, + "time_per_iteration": 3.4088032245635986 + }, + { + "auxiliary_loss_clip": 0.01458953, + "auxiliary_loss_mlp": 0.0104086, + "balance_loss_clip": 1.28095245, + "balance_loss_mlp": 1.01852059, + "epoch": 0.36284382985119495, + "flos": 26587367130240.0, + "grad_norm": 1.4877683545041624, + "language_loss": 0.90268493, + "learning_rate": 2.946129926425273e-06, + "loss": 0.92768306, + "num_input_tokens_seen": 129582325, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.22314453, + "step": 6035, + "time_per_iteration": 2.8808484077453613 + }, + { + "auxiliary_loss_clip": 0.01480802, + "auxiliary_loss_mlp": 0.01049265, + "balance_loss_clip": 1.29475784, + "balance_loss_mlp": 1.02630568, + "epoch": 0.3629039531038629, + "flos": 20166313426560.0, + "grad_norm": 1.870229554374374, + "language_loss": 0.74795854, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.77325922, + "num_input_tokens_seen": 129600350, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.22961426, + "step": 6036, + "time_per_iteration": 2.8843064308166504 + }, + { + "auxiliary_loss_clip": 0.01479912, + "auxiliary_loss_mlp": 0.01043526, + "balance_loss_clip": 1.29221439, + "balance_loss_mlp": 1.02082837, + "epoch": 0.3629640763565309, + "flos": 18635215507200.0, + "grad_norm": 2.419788597548329, + "language_loss": 0.76860595, + "learning_rate": 2.945443601747297e-06, + "loss": 0.79384029, + "num_input_tokens_seen": 129618425, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.22705078, + "step": 6037, + "time_per_iteration": 2.8315038681030273 + }, + { + "auxiliary_loss_clip": 0.01463974, + "auxiliary_loss_mlp": 0.01045092, + "balance_loss_clip": 1.28535855, + "balance_loss_mlp": 1.02234674, + "epoch": 0.36302419960919885, + "flos": 19580454766080.0, + "grad_norm": 1.5647508732641637, + "language_loss": 0.79273653, + "learning_rate": 2.945100385624828e-06, + "loss": 0.81782722, + "num_input_tokens_seen": 129636750, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.22741699, + "step": 6038, + "time_per_iteration": 2.8738584518432617 + }, + { + "auxiliary_loss_clip": 0.01266466, + "auxiliary_loss_mlp": 0.01020096, + "balance_loss_clip": 1.15898192, + "balance_loss_mlp": 0.99968714, + "epoch": 0.3630843228618668, + "flos": 63828949944960.0, + "grad_norm": 0.8698185729433199, + "language_loss": 0.63379496, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.65666056, + "num_input_tokens_seen": 129699030, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.20410156, + "step": 6039, + "time_per_iteration": 3.445794105529785 + }, + { + "auxiliary_loss_clip": 0.01462844, + "auxiliary_loss_mlp": 0.01048094, + "balance_loss_clip": 1.28222537, + "balance_loss_mlp": 1.02516985, + "epoch": 0.3631444461145348, + "flos": 21845154176640.0, + "grad_norm": 2.1198778475037785, + "language_loss": 0.72425091, + "learning_rate": 2.944413845878002e-06, + "loss": 0.74936026, + "num_input_tokens_seen": 129717135, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.22924805, + "step": 6040, + "time_per_iteration": 2.8730623722076416 + }, + { + "auxiliary_loss_clip": 0.01488327, + "auxiliary_loss_mlp": 0.01044376, + "balance_loss_clip": 1.30243993, + "balance_loss_mlp": 1.02179754, + "epoch": 0.36320456936720275, + "flos": 21731055494400.0, + "grad_norm": 1.712448659894353, + "language_loss": 0.82057816, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.84590518, + "num_input_tokens_seen": 129735940, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.22583008, + "step": 6041, + "time_per_iteration": 2.981795072555542 + }, + { + "auxiliary_loss_clip": 0.01480383, + "auxiliary_loss_mlp": 0.0104475, + "balance_loss_clip": 1.29464853, + "balance_loss_mlp": 1.02018118, + "epoch": 0.3632646926198707, + "flos": 17027825575680.0, + "grad_norm": 1.9466610298853078, + "language_loss": 0.85515106, + "learning_rate": 2.943727162882107e-06, + "loss": 0.88040245, + "num_input_tokens_seen": 129752790, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.24560547, + "step": 6042, + "time_per_iteration": 2.8340020179748535 + }, + { + "auxiliary_loss_clip": 0.01468734, + "auxiliary_loss_mlp": 0.01050413, + "balance_loss_clip": 1.28815436, + "balance_loss_mlp": 1.02748883, + "epoch": 0.36332481587253873, + "flos": 23341567317120.0, + "grad_norm": 1.5911113367821486, + "language_loss": 0.78785408, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.8130455, + "num_input_tokens_seen": 129773655, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.22900391, + "step": 6043, + "time_per_iteration": 2.879533052444458 + }, + { + "auxiliary_loss_clip": 0.01471594, + "auxiliary_loss_mlp": 0.01045321, + "balance_loss_clip": 1.29185247, + "balance_loss_mlp": 1.02165794, + "epoch": 0.3633849391252067, + "flos": 10750668894720.0, + "grad_norm": 3.199068266606343, + "language_loss": 0.66350454, + "learning_rate": 2.943040336741298e-06, + "loss": 0.68867373, + "num_input_tokens_seen": 129791605, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.23657227, + "step": 6044, + "time_per_iteration": 2.8355069160461426 + }, + { + "auxiliary_loss_clip": 0.01472464, + "auxiliary_loss_mlp": 0.01038679, + "balance_loss_clip": 1.29114687, + "balance_loss_mlp": 1.01555276, + "epoch": 0.36344506237787466, + "flos": 25860326135040.0, + "grad_norm": 1.7399331878188895, + "language_loss": 0.81384593, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.83895737, + "num_input_tokens_seen": 129811075, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.23132324, + "step": 6045, + "time_per_iteration": 2.866396188735962 + }, + { + "auxiliary_loss_clip": 0.01473581, + "auxiliary_loss_mlp": 0.01045489, + "balance_loss_clip": 1.29105449, + "balance_loss_mlp": 1.02257657, + "epoch": 0.3635051856305426, + "flos": 30166233050880.0, + "grad_norm": 2.0290333141777825, + "language_loss": 0.65480602, + "learning_rate": 2.942353367559755e-06, + "loss": 0.67999673, + "num_input_tokens_seen": 129833755, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.22912598, + "step": 6046, + "time_per_iteration": 2.9372034072875977 + }, + { + "auxiliary_loss_clip": 0.0147493, + "auxiliary_loss_mlp": 0.01042848, + "balance_loss_clip": 1.29199815, + "balance_loss_mlp": 1.0191493, + "epoch": 0.3635653088832106, + "flos": 22208063869440.0, + "grad_norm": 1.515138129407602, + "language_loss": 0.78014731, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.80532503, + "num_input_tokens_seen": 129854475, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.23706055, + "step": 6047, + "time_per_iteration": 2.8611257076263428 + }, + { + "auxiliary_loss_clip": 0.0148731, + "auxiliary_loss_mlp": 0.01044595, + "balance_loss_clip": 1.29734409, + "balance_loss_mlp": 1.02100325, + "epoch": 0.36362543213587856, + "flos": 24796916161920.0, + "grad_norm": 1.567852203170704, + "language_loss": 0.80343771, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.82875681, + "num_input_tokens_seen": 129873530, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.23596191, + "step": 6048, + "time_per_iteration": 2.8817338943481445 + }, + { + "auxiliary_loss_clip": 0.01243168, + "auxiliary_loss_mlp": 0.01041516, + "balance_loss_clip": 1.13901424, + "balance_loss_mlp": 1.0227288, + "epoch": 0.3636855553885465, + "flos": 62558879132160.0, + "grad_norm": 0.7782096075010976, + "language_loss": 0.52622092, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.54906774, + "num_input_tokens_seen": 129940400, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.1875, + "step": 6049, + "time_per_iteration": 3.418602466583252 + }, + { + "auxiliary_loss_clip": 0.01482058, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.29695344, + "balance_loss_mlp": 1.01595163, + "epoch": 0.3637456786412145, + "flos": 24071187265920.0, + "grad_norm": 1.8279309815169458, + "language_loss": 0.87224412, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.89745915, + "num_input_tokens_seen": 129958635, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.23510742, + "step": 6050, + "time_per_iteration": 2.865732431411743 + }, + { + "auxiliary_loss_clip": 0.01465641, + "auxiliary_loss_mlp": 0.01043711, + "balance_loss_clip": 1.28501678, + "balance_loss_mlp": 1.02104974, + "epoch": 0.36380580189388245, + "flos": 16700369823360.0, + "grad_norm": 1.8590703131379729, + "language_loss": 0.79378641, + "learning_rate": 2.940635319486546e-06, + "loss": 0.81887996, + "num_input_tokens_seen": 129977685, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.22668457, + "step": 6051, + "time_per_iteration": 2.904627561569214 + }, + { + "auxiliary_loss_clip": 0.01473583, + "auxiliary_loss_mlp": 0.01041157, + "balance_loss_clip": 1.29010475, + "balance_loss_mlp": 1.01810217, + "epoch": 0.3638659251465504, + "flos": 25123693242240.0, + "grad_norm": 2.057356218906122, + "language_loss": 0.83085883, + "learning_rate": 2.940291602812822e-06, + "loss": 0.85600621, + "num_input_tokens_seen": 129997530, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.23059082, + "step": 6052, + "time_per_iteration": 2.8706858158111572 + }, + { + "auxiliary_loss_clip": 0.01464847, + "auxiliary_loss_mlp": 0.01042602, + "balance_loss_clip": 1.287678, + "balance_loss_mlp": 1.01980948, + "epoch": 0.3639260483992184, + "flos": 23013387648000.0, + "grad_norm": 1.5533857229964654, + "language_loss": 0.72903085, + "learning_rate": 2.939947850483145e-06, + "loss": 0.75410533, + "num_input_tokens_seen": 130017955, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.22802734, + "step": 6053, + "time_per_iteration": 2.9413135051727295 + }, + { + "auxiliary_loss_clip": 0.01240773, + "auxiliary_loss_mlp": 0.01017751, + "balance_loss_clip": 1.13818884, + "balance_loss_mlp": 1.00163364, + "epoch": 0.36398617165188635, + "flos": 70745544408960.0, + "grad_norm": 0.766787507091847, + "language_loss": 0.6124922, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.63507736, + "num_input_tokens_seen": 130074275, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.16113281, + "step": 6054, + "time_per_iteration": 3.3966116905212402 + }, + { + "auxiliary_loss_clip": 0.01484942, + "auxiliary_loss_mlp": 0.01055558, + "balance_loss_clip": 1.29997361, + "balance_loss_mlp": 1.03077459, + "epoch": 0.3640462949045543, + "flos": 22245282357120.0, + "grad_norm": 3.511688798022756, + "language_loss": 0.76344198, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.78884697, + "num_input_tokens_seen": 130091375, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.24829102, + "step": 6055, + "time_per_iteration": 2.8474414348602295 + }, + { + "auxiliary_loss_clip": 0.0148898, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_clip": 1.30562568, + "balance_loss_mlp": 1.02603662, + "epoch": 0.3641064181572223, + "flos": 21553514323200.0, + "grad_norm": 1.744908593797914, + "language_loss": 0.75914299, + "learning_rate": 2.938916379688765e-06, + "loss": 0.78452682, + "num_input_tokens_seen": 130111595, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.23364258, + "step": 6056, + "time_per_iteration": 2.900407552719116 + }, + { + "auxiliary_loss_clip": 0.01477012, + "auxiliary_loss_mlp": 0.01046731, + "balance_loss_clip": 1.29593289, + "balance_loss_mlp": 1.02312744, + "epoch": 0.3641665414098903, + "flos": 22283496230400.0, + "grad_norm": 1.8468259289296993, + "language_loss": 0.81692141, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.84215879, + "num_input_tokens_seen": 130131440, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.23596191, + "step": 6057, + "time_per_iteration": 2.869539737701416 + }, + { + "auxiliary_loss_clip": 0.01468295, + "auxiliary_loss_mlp": 0.01056062, + "balance_loss_clip": 1.28940392, + "balance_loss_mlp": 1.03222072, + "epoch": 0.36422666466255826, + "flos": 28341368772480.0, + "grad_norm": 2.0067268963520113, + "language_loss": 0.81259072, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.83783424, + "num_input_tokens_seen": 130151375, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.23840332, + "step": 6058, + "time_per_iteration": 2.921931028366089 + }, + { + "auxiliary_loss_clip": 0.014817, + "auxiliary_loss_mlp": 0.01049996, + "balance_loss_clip": 1.29790759, + "balance_loss_mlp": 1.02715492, + "epoch": 0.36428678791522623, + "flos": 24181168671360.0, + "grad_norm": 1.790003326347346, + "language_loss": 0.86089849, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.88621545, + "num_input_tokens_seen": 130169960, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.22827148, + "step": 6059, + "time_per_iteration": 4.396906614303589 + }, + { + "auxiliary_loss_clip": 0.01479142, + "auxiliary_loss_mlp": 0.01050503, + "balance_loss_clip": 1.29467463, + "balance_loss_mlp": 1.02590966, + "epoch": 0.3643469111678942, + "flos": 22538641512960.0, + "grad_norm": 1.5274394792978823, + "language_loss": 0.88725948, + "learning_rate": 2.937540586903884e-06, + "loss": 0.91255593, + "num_input_tokens_seen": 130189800, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.24597168, + "step": 6060, + "time_per_iteration": 2.855309247970581 + }, + { + "auxiliary_loss_clip": 0.01487539, + "auxiliary_loss_mlp": 0.01051685, + "balance_loss_clip": 1.30134571, + "balance_loss_mlp": 1.02712798, + "epoch": 0.36440703442056216, + "flos": 19435833826560.0, + "grad_norm": 1.9514311047955741, + "language_loss": 0.67698783, + "learning_rate": 2.937196549795971e-06, + "loss": 0.70238006, + "num_input_tokens_seen": 130206370, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.24572754, + "step": 6061, + "time_per_iteration": 2.874579429626465 + }, + { + "auxiliary_loss_clip": 0.01503455, + "auxiliary_loss_mlp": 0.01047914, + "balance_loss_clip": 1.31636405, + "balance_loss_mlp": 1.02475214, + "epoch": 0.3644671576732301, + "flos": 18049492581120.0, + "grad_norm": 2.2317472470628297, + "language_loss": 0.77517521, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.80068892, + "num_input_tokens_seen": 130224445, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.23168945, + "step": 6062, + "time_per_iteration": 2.8331878185272217 + }, + { + "auxiliary_loss_clip": 0.0147259, + "auxiliary_loss_mlp": 0.01045129, + "balance_loss_clip": 1.29266512, + "balance_loss_mlp": 1.02243209, + "epoch": 0.3645272809258981, + "flos": 21552609427200.0, + "grad_norm": 1.6750971410909206, + "language_loss": 0.73876131, + "learning_rate": 2.936508368977432e-06, + "loss": 0.76393843, + "num_input_tokens_seen": 130245380, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.22692871, + "step": 6063, + "time_per_iteration": 2.895822048187256 + }, + { + "auxiliary_loss_clip": 0.01461647, + "auxiliary_loss_mlp": 0.01043731, + "balance_loss_clip": 1.2826947, + "balance_loss_mlp": 1.02053308, + "epoch": 0.36458740417856605, + "flos": 22756975511040.0, + "grad_norm": 1.957146427272074, + "language_loss": 0.68836403, + "learning_rate": 2.936164225292901e-06, + "loss": 0.71341777, + "num_input_tokens_seen": 130265575, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.23193359, + "step": 6064, + "time_per_iteration": 4.294676303863525 + }, + { + "auxiliary_loss_clip": 0.01486788, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.30218863, + "balance_loss_mlp": 1.02315736, + "epoch": 0.364647527431234, + "flos": 26151830254080.0, + "grad_norm": 11.46844714887566, + "language_loss": 0.74899793, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.77432233, + "num_input_tokens_seen": 130286195, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.22497559, + "step": 6065, + "time_per_iteration": 4.2820234298706055 + }, + { + "auxiliary_loss_clip": 0.01486595, + "auxiliary_loss_mlp": 0.01046513, + "balance_loss_clip": 1.29962635, + "balance_loss_mlp": 1.02239728, + "epoch": 0.364707650683902, + "flos": 31042374220800.0, + "grad_norm": 1.8957278342203059, + "language_loss": 0.75540525, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.78073633, + "num_input_tokens_seen": 130306095, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.2409668, + "step": 6066, + "time_per_iteration": 2.895617961883545 + }, + { + "auxiliary_loss_clip": 0.01471715, + "auxiliary_loss_mlp": 0.01042832, + "balance_loss_clip": 1.2917465, + "balance_loss_mlp": 1.01872814, + "epoch": 0.36476777393656995, + "flos": 19582264558080.0, + "grad_norm": 2.145310713997748, + "language_loss": 0.77669394, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.80183941, + "num_input_tokens_seen": 130324685, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.2409668, + "step": 6067, + "time_per_iteration": 2.8569114208221436 + }, + { + "auxiliary_loss_clip": 0.01472778, + "auxiliary_loss_mlp": 0.01043682, + "balance_loss_clip": 1.29296219, + "balance_loss_mlp": 1.02103233, + "epoch": 0.3648278971892379, + "flos": 17757807482880.0, + "grad_norm": 2.3654277471745, + "language_loss": 0.727579, + "learning_rate": 2.934787295690886e-06, + "loss": 0.7527436, + "num_input_tokens_seen": 130343855, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.22631836, + "step": 6068, + "time_per_iteration": 4.258959531784058 + }, + { + "auxiliary_loss_clip": 0.01479125, + "auxiliary_loss_mlp": 0.0104269, + "balance_loss_clip": 1.2944541, + "balance_loss_mlp": 1.019122, + "epoch": 0.3648880204419059, + "flos": 17940144602880.0, + "grad_norm": 2.2156814521262786, + "language_loss": 0.75082046, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.77603865, + "num_input_tokens_seen": 130362320, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.23583984, + "step": 6069, + "time_per_iteration": 2.811143636703491 + }, + { + "auxiliary_loss_clip": 0.01476056, + "auxiliary_loss_mlp": 0.01043196, + "balance_loss_clip": 1.29200244, + "balance_loss_mlp": 1.01937842, + "epoch": 0.3649481436945739, + "flos": 22648577673600.0, + "grad_norm": 3.3868239640756355, + "language_loss": 0.67693484, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.70212734, + "num_input_tokens_seen": 130383165, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.23840332, + "step": 6070, + "time_per_iteration": 2.929018020629883 + }, + { + "auxiliary_loss_clip": 0.01458485, + "auxiliary_loss_mlp": 0.01038948, + "balance_loss_clip": 1.28013706, + "balance_loss_mlp": 1.01597595, + "epoch": 0.36500826694724187, + "flos": 21589556446080.0, + "grad_norm": 1.6406391542435967, + "language_loss": 0.75150836, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.77648264, + "num_input_tokens_seen": 130402425, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.22961426, + "step": 6071, + "time_per_iteration": 2.8490374088287354 + }, + { + "auxiliary_loss_clip": 0.0147129, + "auxiliary_loss_mlp": 0.01036383, + "balance_loss_clip": 1.28992307, + "balance_loss_mlp": 1.01285148, + "epoch": 0.36506839019990983, + "flos": 13780623173760.0, + "grad_norm": 1.650595082254123, + "language_loss": 0.89443535, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.91951203, + "num_input_tokens_seen": 130419440, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.2355957, + "step": 6072, + "time_per_iteration": 2.826725959777832 + }, + { + "auxiliary_loss_clip": 0.01464296, + "auxiliary_loss_mlp": 0.0103853, + "balance_loss_clip": 1.28349233, + "balance_loss_mlp": 1.01611853, + "epoch": 0.3651285134525778, + "flos": 17283920999040.0, + "grad_norm": 2.0714057345681622, + "language_loss": 0.74107492, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.76610321, + "num_input_tokens_seen": 130438495, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.22399902, + "step": 6073, + "time_per_iteration": 2.8510191440582275 + }, + { + "auxiliary_loss_clip": 0.0149006, + "auxiliary_loss_mlp": 0.01047065, + "balance_loss_clip": 1.30467105, + "balance_loss_mlp": 1.02195978, + "epoch": 0.36518863670524576, + "flos": 21917962339200.0, + "grad_norm": 6.355038488664238, + "language_loss": 0.67629021, + "learning_rate": 2.932720838132236e-06, + "loss": 0.70166147, + "num_input_tokens_seen": 130455575, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.25109863, + "step": 6074, + "time_per_iteration": 2.8272197246551514 + }, + { + "auxiliary_loss_clip": 0.01465034, + "auxiliary_loss_mlp": 0.0103786, + "balance_loss_clip": 1.28464282, + "balance_loss_mlp": 1.01500738, + "epoch": 0.3652487599579137, + "flos": 27132659187840.0, + "grad_norm": 1.6526563472596172, + "language_loss": 0.73647147, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.76150042, + "num_input_tokens_seen": 130476385, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.22851562, + "step": 6075, + "time_per_iteration": 2.906740188598633 + }, + { + "auxiliary_loss_clip": 0.0148401, + "auxiliary_loss_mlp": 0.01044115, + "balance_loss_clip": 1.29781389, + "balance_loss_mlp": 1.02023721, + "epoch": 0.3653088832105817, + "flos": 19765054126080.0, + "grad_norm": 2.6624526058001883, + "language_loss": 0.9053483, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.93062955, + "num_input_tokens_seen": 130493630, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.23876953, + "step": 6076, + "time_per_iteration": 2.8243727684020996 + }, + { + "auxiliary_loss_clip": 0.01467436, + "auxiliary_loss_mlp": 0.01037456, + "balance_loss_clip": 1.28710794, + "balance_loss_mlp": 1.01456833, + "epoch": 0.36536900646324966, + "flos": 13122725512320.0, + "grad_norm": 1.8572592422497634, + "language_loss": 0.71577537, + "learning_rate": 2.931687131696872e-06, + "loss": 0.74082422, + "num_input_tokens_seen": 130510735, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.22900391, + "step": 6077, + "time_per_iteration": 2.8374552726745605 + }, + { + "auxiliary_loss_clip": 0.0124919, + "auxiliary_loss_mlp": 0.01023426, + "balance_loss_clip": 1.14271927, + "balance_loss_mlp": 1.00559199, + "epoch": 0.3654291297159176, + "flos": 71135193778560.0, + "grad_norm": 0.7393010657666114, + "language_loss": 0.61834848, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.64107466, + "num_input_tokens_seen": 130577050, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.17871094, + "step": 6078, + "time_per_iteration": 3.4477672576904297 + }, + { + "auxiliary_loss_clip": 0.014695, + "auxiliary_loss_mlp": 0.01037812, + "balance_loss_clip": 1.28728557, + "balance_loss_mlp": 1.01470923, + "epoch": 0.3654892529685856, + "flos": 23626510940160.0, + "grad_norm": 2.0505189947443094, + "language_loss": 0.79122794, + "learning_rate": 2.930997817403173e-06, + "loss": 0.81630111, + "num_input_tokens_seen": 130593780, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.23095703, + "step": 6079, + "time_per_iteration": 2.837953805923462 + }, + { + "auxiliary_loss_clip": 0.01480574, + "auxiliary_loss_mlp": 0.01039535, + "balance_loss_clip": 1.29653478, + "balance_loss_mlp": 1.01628971, + "epoch": 0.36554937622125355, + "flos": 43486298974080.0, + "grad_norm": 2.3256870484318033, + "language_loss": 0.63427025, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.65947139, + "num_input_tokens_seen": 130615510, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.2322998, + "step": 6080, + "time_per_iteration": 3.0391178131103516 + }, + { + "auxiliary_loss_clip": 0.01491874, + "auxiliary_loss_mlp": 0.01044809, + "balance_loss_clip": 1.3063395, + "balance_loss_mlp": 1.01998997, + "epoch": 0.3656094994739215, + "flos": 23305027501440.0, + "grad_norm": 2.1714781034600503, + "language_loss": 0.69401622, + "learning_rate": 2.930308361895352e-06, + "loss": 0.71938312, + "num_input_tokens_seen": 130635410, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.24816895, + "step": 6081, + "time_per_iteration": 2.8528311252593994 + }, + { + "auxiliary_loss_clip": 0.01506484, + "auxiliary_loss_mlp": 0.01047743, + "balance_loss_clip": 1.3152777, + "balance_loss_mlp": 1.02405643, + "epoch": 0.3656696227265895, + "flos": 24582789930240.0, + "grad_norm": 2.277096047306174, + "language_loss": 0.75547802, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.78102028, + "num_input_tokens_seen": 130657725, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.23706055, + "step": 6082, + "time_per_iteration": 2.9119865894317627 + }, + { + "auxiliary_loss_clip": 0.01481483, + "auxiliary_loss_mlp": 0.01041515, + "balance_loss_clip": 1.29745471, + "balance_loss_mlp": 1.01948535, + "epoch": 0.3657297459792575, + "flos": 27939973737600.0, + "grad_norm": 1.648029347850574, + "language_loss": 0.83408499, + "learning_rate": 2.929618765277987e-06, + "loss": 0.85931504, + "num_input_tokens_seen": 130678360, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.22009277, + "step": 6083, + "time_per_iteration": 2.8939313888549805 + }, + { + "auxiliary_loss_clip": 0.01254201, + "auxiliary_loss_mlp": 0.01026665, + "balance_loss_clip": 1.15137053, + "balance_loss_mlp": 1.01121545, + "epoch": 0.36578986923192547, + "flos": 67420466426880.0, + "grad_norm": 0.8191703717760468, + "language_loss": 0.59444797, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.61725658, + "num_input_tokens_seen": 130742110, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.15429688, + "step": 6084, + "time_per_iteration": 3.4789204597473145 + }, + { + "auxiliary_loss_clip": 0.01471946, + "auxiliary_loss_mlp": 0.01039456, + "balance_loss_clip": 1.29062939, + "balance_loss_mlp": 1.01710427, + "epoch": 0.36584999248459343, + "flos": 20236497390720.0, + "grad_norm": 2.013644125550603, + "language_loss": 0.73285282, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.75796688, + "num_input_tokens_seen": 130759870, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.22338867, + "step": 6085, + "time_per_iteration": 2.8460867404937744 + }, + { + "auxiliary_loss_clip": 0.01481511, + "auxiliary_loss_mlp": 0.01043025, + "balance_loss_clip": 1.29709053, + "balance_loss_mlp": 1.0207926, + "epoch": 0.3659101157372614, + "flos": 19071793013760.0, + "grad_norm": 1.8604244251289053, + "language_loss": 0.79172313, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.81696844, + "num_input_tokens_seen": 130778510, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.22229004, + "step": 6086, + "time_per_iteration": 2.817906618118286 + }, + { + "auxiliary_loss_clip": 0.01464198, + "auxiliary_loss_mlp": 0.0104073, + "balance_loss_clip": 1.28603935, + "balance_loss_mlp": 1.01900983, + "epoch": 0.36597023898992936, + "flos": 30823044837120.0, + "grad_norm": 2.3537324566808495, + "language_loss": 0.78043139, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.8054806, + "num_input_tokens_seen": 130798535, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.21716309, + "step": 6087, + "time_per_iteration": 2.9251058101654053 + }, + { + "auxiliary_loss_clip": 0.01483786, + "auxiliary_loss_mlp": 0.01041496, + "balance_loss_clip": 1.29643345, + "balance_loss_mlp": 1.01821446, + "epoch": 0.36603036224259733, + "flos": 20531485359360.0, + "grad_norm": 2.043838675917153, + "language_loss": 0.71894681, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.74419963, + "num_input_tokens_seen": 130816655, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.23291016, + "step": 6088, + "time_per_iteration": 2.8366916179656982 + }, + { + "auxiliary_loss_clip": 0.01509634, + "auxiliary_loss_mlp": 0.01041359, + "balance_loss_clip": 1.31551826, + "balance_loss_mlp": 1.01733875, + "epoch": 0.3660904854952653, + "flos": 38344365043200.0, + "grad_norm": 1.5843145128747842, + "language_loss": 0.80418748, + "learning_rate": 2.92754912981472e-06, + "loss": 0.82969737, + "num_input_tokens_seen": 130841225, + "router_z_loss_clip": 1.94238281, + "router_z_loss_mlp": 0.24023438, + "step": 6089, + "time_per_iteration": 3.0262606143951416 + }, + { + "auxiliary_loss_clip": 0.01468672, + "auxiliary_loss_mlp": 0.01038018, + "balance_loss_clip": 1.28809226, + "balance_loss_mlp": 1.01566613, + "epoch": 0.36615060874793326, + "flos": 21845651869440.0, + "grad_norm": 1.7674714646462293, + "language_loss": 0.71823144, + "learning_rate": 2.927204067389884e-06, + "loss": 0.74329835, + "num_input_tokens_seen": 130861050, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.22351074, + "step": 6090, + "time_per_iteration": 2.874279499053955 + }, + { + "auxiliary_loss_clip": 0.01467134, + "auxiliary_loss_mlp": 0.01043369, + "balance_loss_clip": 1.28846288, + "balance_loss_mlp": 1.02163768, + "epoch": 0.3662107320006012, + "flos": 16590252683520.0, + "grad_norm": 2.1299366959225514, + "language_loss": 0.74878818, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.77389318, + "num_input_tokens_seen": 130879775, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.21728516, + "step": 6091, + "time_per_iteration": 2.820748805999756 + }, + { + "auxiliary_loss_clip": 0.01490347, + "auxiliary_loss_mlp": 0.01041478, + "balance_loss_clip": 1.3062166, + "balance_loss_mlp": 1.01800525, + "epoch": 0.3662708552532692, + "flos": 20967836641920.0, + "grad_norm": 1.922906075692372, + "language_loss": 0.74037707, + "learning_rate": 2.926513837074284e-06, + "loss": 0.76569533, + "num_input_tokens_seen": 130898070, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.23498535, + "step": 6092, + "time_per_iteration": 2.8433773517608643 + }, + { + "auxiliary_loss_clip": 0.01487833, + "auxiliary_loss_mlp": 0.01045885, + "balance_loss_clip": 1.30095446, + "balance_loss_mlp": 1.02297354, + "epoch": 0.36633097850593715, + "flos": 21911311353600.0, + "grad_norm": 2.5271108205858575, + "language_loss": 0.79188573, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.81722295, + "num_input_tokens_seen": 130915250, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.22912598, + "step": 6093, + "time_per_iteration": 2.818272590637207 + }, + { + "auxiliary_loss_clip": 0.01483586, + "auxiliary_loss_mlp": 0.01042205, + "balance_loss_clip": 1.29771066, + "balance_loss_mlp": 1.0184468, + "epoch": 0.3663911017586051, + "flos": 32867057520000.0, + "grad_norm": 4.814153986388782, + "language_loss": 0.75186938, + "learning_rate": 2.925823466224696e-06, + "loss": 0.77712727, + "num_input_tokens_seen": 130936995, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.23754883, + "step": 6094, + "time_per_iteration": 4.345716714859009 + }, + { + "auxiliary_loss_clip": 0.0147772, + "auxiliary_loss_mlp": 0.01040756, + "balance_loss_clip": 1.29248476, + "balance_loss_mlp": 1.01641309, + "epoch": 0.3664512250112731, + "flos": 27283207196160.0, + "grad_norm": 1.8627425238640913, + "language_loss": 0.80273056, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.82791531, + "num_input_tokens_seen": 130957970, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.24353027, + "step": 6095, + "time_per_iteration": 2.905019998550415 + }, + { + "auxiliary_loss_clip": 0.01498429, + "auxiliary_loss_mlp": 0.01040843, + "balance_loss_clip": 1.31050277, + "balance_loss_mlp": 1.01617873, + "epoch": 0.3665113482639411, + "flos": 17793170933760.0, + "grad_norm": 2.9713807511714267, + "language_loss": 0.74826097, + "learning_rate": 2.925132954945834e-06, + "loss": 0.77365375, + "num_input_tokens_seen": 130974915, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.24682617, + "step": 6096, + "time_per_iteration": 2.9319705963134766 + }, + { + "auxiliary_loss_clip": 0.01482053, + "auxiliary_loss_mlp": 0.0103868, + "balance_loss_clip": 1.29476893, + "balance_loss_mlp": 1.01579225, + "epoch": 0.36657147151660907, + "flos": 27865989210240.0, + "grad_norm": 2.8295043118273093, + "language_loss": 0.68195665, + "learning_rate": 2.924787646678155e-06, + "loss": 0.70716405, + "num_input_tokens_seen": 130995745, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.22888184, + "step": 6097, + "time_per_iteration": 2.8986992835998535 + }, + { + "auxiliary_loss_clip": 0.01486401, + "auxiliary_loss_mlp": 0.01041115, + "balance_loss_clip": 1.30197597, + "balance_loss_mlp": 1.01680827, + "epoch": 0.36663159476927704, + "flos": 25384856083200.0, + "grad_norm": 1.664446240658449, + "language_loss": 0.78559554, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.81087065, + "num_input_tokens_seen": 131015545, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.24291992, + "step": 6098, + "time_per_iteration": 2.922243118286133 + }, + { + "auxiliary_loss_clip": 0.0147617, + "auxiliary_loss_mlp": 0.01038658, + "balance_loss_clip": 1.29466474, + "balance_loss_mlp": 1.01602042, + "epoch": 0.366691718021945, + "flos": 21366155030400.0, + "grad_norm": 3.9109827466188194, + "language_loss": 0.74436277, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.76951104, + "num_input_tokens_seen": 131033990, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.22631836, + "step": 6099, + "time_per_iteration": 4.374940395355225 + }, + { + "auxiliary_loss_clip": 0.01466271, + "auxiliary_loss_mlp": 0.01040775, + "balance_loss_clip": 1.28643334, + "balance_loss_mlp": 1.01768398, + "epoch": 0.36675184127461297, + "flos": 16809129619200.0, + "grad_norm": 1.777181724547024, + "language_loss": 0.85892975, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.88400024, + "num_input_tokens_seen": 131050710, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.23071289, + "step": 6100, + "time_per_iteration": 4.255775690078735 + }, + { + "auxiliary_loss_clip": 0.01499709, + "auxiliary_loss_mlp": 0.01038082, + "balance_loss_clip": 1.31027234, + "balance_loss_mlp": 1.0159806, + "epoch": 0.36681196452728093, + "flos": 21916107302400.0, + "grad_norm": 1.5962772499557765, + "language_loss": 0.7202062, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.74558413, + "num_input_tokens_seen": 131071435, + "router_z_loss_clip": 1.89160156, + "router_z_loss_mlp": 0.22106934, + "step": 6101, + "time_per_iteration": 2.855139970779419 + }, + { + "auxiliary_loss_clip": 0.01482895, + "auxiliary_loss_mlp": 0.0104219, + "balance_loss_clip": 1.29658461, + "balance_loss_mlp": 1.01831269, + "epoch": 0.3668720877799489, + "flos": 17721403401600.0, + "grad_norm": 2.8292541672192533, + "language_loss": 0.77291501, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.79816586, + "num_input_tokens_seen": 131088775, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.23876953, + "step": 6102, + "time_per_iteration": 2.7939679622650146 + }, + { + "auxiliary_loss_clip": 0.01504232, + "auxiliary_loss_mlp": 0.01045694, + "balance_loss_clip": 1.3138901, + "balance_loss_mlp": 1.02144659, + "epoch": 0.36693221103261686, + "flos": 47061635800320.0, + "grad_norm": 1.474537139716108, + "language_loss": 0.71099162, + "learning_rate": 2.922715061101625e-06, + "loss": 0.73649085, + "num_input_tokens_seen": 131112800, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.24243164, + "step": 6103, + "time_per_iteration": 4.444256544113159 + }, + { + "auxiliary_loss_clip": 0.01481409, + "auxiliary_loss_mlp": 0.01043468, + "balance_loss_clip": 1.29537439, + "balance_loss_mlp": 1.01966178, + "epoch": 0.3669923342852848, + "flos": 15969392530560.0, + "grad_norm": 1.9420866401388122, + "language_loss": 0.72913063, + "learning_rate": 2.922369507632716e-06, + "loss": 0.75437945, + "num_input_tokens_seen": 131131150, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.23791504, + "step": 6104, + "time_per_iteration": 2.8346498012542725 + }, + { + "auxiliary_loss_clip": 0.01479757, + "auxiliary_loss_mlp": 0.01042546, + "balance_loss_clip": 1.29479742, + "balance_loss_mlp": 1.01887107, + "epoch": 0.3670524575379528, + "flos": 19984021551360.0, + "grad_norm": 2.1009134905568656, + "language_loss": 0.82646203, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.85168505, + "num_input_tokens_seen": 131150365, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.23681641, + "step": 6105, + "time_per_iteration": 2.835418939590454 + }, + { + "auxiliary_loss_clip": 0.01502148, + "auxiliary_loss_mlp": 0.01044722, + "balance_loss_clip": 1.31082094, + "balance_loss_mlp": 1.02041507, + "epoch": 0.36711258079062076, + "flos": 25714031137920.0, + "grad_norm": 5.173844524264317, + "language_loss": 0.81124353, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.83671224, + "num_input_tokens_seen": 131169310, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.24316406, + "step": 6106, + "time_per_iteration": 2.888395071029663 + }, + { + "auxiliary_loss_clip": 0.01254585, + "auxiliary_loss_mlp": 0.01035053, + "balance_loss_clip": 1.14627671, + "balance_loss_mlp": 1.01712406, + "epoch": 0.3671727040432887, + "flos": 60804198817920.0, + "grad_norm": 0.6906588824839662, + "language_loss": 0.59263891, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.61553532, + "num_input_tokens_seen": 131232900, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.1796875, + "step": 6107, + "time_per_iteration": 3.437012195587158 + }, + { + "auxiliary_loss_clip": 0.01471422, + "auxiliary_loss_mlp": 0.01039877, + "balance_loss_clip": 1.29004145, + "balance_loss_mlp": 1.01773953, + "epoch": 0.3672328272959567, + "flos": 18670578958080.0, + "grad_norm": 1.5179904414639913, + "language_loss": 0.75278032, + "learning_rate": 2.92098694412469e-06, + "loss": 0.7778933, + "num_input_tokens_seen": 131250920, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.22131348, + "step": 6108, + "time_per_iteration": 2.881842851638794 + }, + { + "auxiliary_loss_clip": 0.01493263, + "auxiliary_loss_mlp": 0.01040638, + "balance_loss_clip": 1.30627275, + "balance_loss_mlp": 1.0179646, + "epoch": 0.3672929505486247, + "flos": 15057344972160.0, + "grad_norm": 2.3436450650063643, + "language_loss": 0.74807531, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.77341437, + "num_input_tokens_seen": 131267910, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.2265625, + "step": 6109, + "time_per_iteration": 2.8985915184020996 + }, + { + "auxiliary_loss_clip": 0.01475288, + "auxiliary_loss_mlp": 0.01036743, + "balance_loss_clip": 1.2921952, + "balance_loss_mlp": 1.01516628, + "epoch": 0.3673530738012927, + "flos": 20597642536320.0, + "grad_norm": 2.7194641725695208, + "language_loss": 0.53773522, + "learning_rate": 2.920295452774744e-06, + "loss": 0.56285554, + "num_input_tokens_seen": 131287150, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.21582031, + "step": 6110, + "time_per_iteration": 2.9379734992980957 + }, + { + "auxiliary_loss_clip": 0.01476347, + "auxiliary_loss_mlp": 0.01046857, + "balance_loss_clip": 1.29558563, + "balance_loss_mlp": 1.02314615, + "epoch": 0.36741319705396064, + "flos": 21699945054720.0, + "grad_norm": 1.5529081773761995, + "language_loss": 0.81270838, + "learning_rate": 2.919949654746672e-06, + "loss": 0.83794045, + "num_input_tokens_seen": 131308225, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.23718262, + "step": 6111, + "time_per_iteration": 2.8541338443756104 + }, + { + "auxiliary_loss_clip": 0.01468314, + "auxiliary_loss_mlp": 0.01040414, + "balance_loss_clip": 1.28768396, + "balance_loss_mlp": 1.01808631, + "epoch": 0.3674733203066286, + "flos": 29874321728640.0, + "grad_norm": 1.6617354929515382, + "language_loss": 0.72951168, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.75459892, + "num_input_tokens_seen": 131332115, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.2232666, + "step": 6112, + "time_per_iteration": 2.961596727371216 + }, + { + "auxiliary_loss_clip": 0.01468343, + "auxiliary_loss_mlp": 0.01040522, + "balance_loss_clip": 1.2868365, + "balance_loss_mlp": 1.01783609, + "epoch": 0.36753344355929657, + "flos": 18265428604800.0, + "grad_norm": 1.8096496584819008, + "language_loss": 0.85382366, + "learning_rate": 2.919257954049892e-06, + "loss": 0.87891233, + "num_input_tokens_seen": 131351885, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.22680664, + "step": 6113, + "time_per_iteration": 2.918994903564453 + }, + { + "auxiliary_loss_clip": 0.01484352, + "auxiliary_loss_mlp": 0.0103855, + "balance_loss_clip": 1.29831648, + "balance_loss_mlp": 1.01520908, + "epoch": 0.36759356681196453, + "flos": 25312002675840.0, + "grad_norm": 1.7822425853438681, + "language_loss": 0.79698771, + "learning_rate": 2.918912051407413e-06, + "loss": 0.82221675, + "num_input_tokens_seen": 131370245, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.23352051, + "step": 6114, + "time_per_iteration": 2.889345169067383 + }, + { + "auxiliary_loss_clip": 0.01488052, + "auxiliary_loss_mlp": 0.01043073, + "balance_loss_clip": 1.29990172, + "balance_loss_mlp": 1.01833725, + "epoch": 0.3676536900646325, + "flos": 21042861799680.0, + "grad_norm": 1.8163666979231712, + "language_loss": 0.67991114, + "learning_rate": 2.918566113919698e-06, + "loss": 0.70522237, + "num_input_tokens_seen": 131388115, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.24719238, + "step": 6115, + "time_per_iteration": 2.8165950775146484 + }, + { + "auxiliary_loss_clip": 0.0146637, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.28630054, + "balance_loss_mlp": 1.01466596, + "epoch": 0.36771381331730046, + "flos": 16296848282880.0, + "grad_norm": 2.564451650831288, + "language_loss": 0.77520978, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.80024987, + "num_input_tokens_seen": 131404595, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.22949219, + "step": 6116, + "time_per_iteration": 2.849198818206787 + }, + { + "auxiliary_loss_clip": 0.01482487, + "auxiliary_loss_mlp": 0.01050984, + "balance_loss_clip": 1.29676878, + "balance_loss_mlp": 1.02832294, + "epoch": 0.36777393656996843, + "flos": 22320217025280.0, + "grad_norm": 1.9326075307191937, + "language_loss": 0.63541031, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.66074502, + "num_input_tokens_seen": 131423760, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.22668457, + "step": 6117, + "time_per_iteration": 2.817072868347168 + }, + { + "auxiliary_loss_clip": 0.01469643, + "auxiliary_loss_mlp": 0.01042799, + "balance_loss_clip": 1.28871989, + "balance_loss_mlp": 1.02075672, + "epoch": 0.3678340598226364, + "flos": 26845453324800.0, + "grad_norm": 1.7988275151676472, + "language_loss": 0.74282086, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.76794529, + "num_input_tokens_seen": 131444955, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.22033691, + "step": 6118, + "time_per_iteration": 2.87461519241333 + }, + { + "auxiliary_loss_clip": 0.01492968, + "auxiliary_loss_mlp": 0.01046769, + "balance_loss_clip": 1.3037976, + "balance_loss_mlp": 1.02214074, + "epoch": 0.36789418307530436, + "flos": 21771576852480.0, + "grad_norm": 1.6055185409422474, + "language_loss": 0.7311554, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.75655276, + "num_input_tokens_seen": 131465720, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.24633789, + "step": 6119, + "time_per_iteration": 2.8904404640197754 + }, + { + "auxiliary_loss_clip": 0.01481443, + "auxiliary_loss_mlp": 0.0104511, + "balance_loss_clip": 1.29847813, + "balance_loss_mlp": 1.02199507, + "epoch": 0.3679543063279723, + "flos": 15932581246080.0, + "grad_norm": 1.7860672992474638, + "language_loss": 0.81593716, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.84120274, + "num_input_tokens_seen": 131483080, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.23120117, + "step": 6120, + "time_per_iteration": 2.80971622467041 + }, + { + "auxiliary_loss_clip": 0.01477064, + "auxiliary_loss_mlp": 0.01051191, + "balance_loss_clip": 1.29390478, + "balance_loss_mlp": 1.02855277, + "epoch": 0.3680144295806403, + "flos": 24285087273600.0, + "grad_norm": 2.5607043876292632, + "language_loss": 0.65166867, + "learning_rate": 2.916489757978126e-06, + "loss": 0.67695117, + "num_input_tokens_seen": 131502545, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.22631836, + "step": 6121, + "time_per_iteration": 2.896505355834961 + }, + { + "auxiliary_loss_clip": 0.01476395, + "auxiliary_loss_mlp": 0.0105236, + "balance_loss_clip": 1.29389739, + "balance_loss_mlp": 1.02954304, + "epoch": 0.36807455283330826, + "flos": 26115290438400.0, + "grad_norm": 2.094244634220413, + "language_loss": 0.72165358, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.74694109, + "num_input_tokens_seen": 131522155, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.22827148, + "step": 6122, + "time_per_iteration": 2.8586266040802 + }, + { + "auxiliary_loss_clip": 0.01461592, + "auxiliary_loss_mlp": 0.01048838, + "balance_loss_clip": 1.28337455, + "balance_loss_mlp": 1.02631903, + "epoch": 0.3681346760859763, + "flos": 24655824316800.0, + "grad_norm": 1.7472504122657813, + "language_loss": 0.70330942, + "learning_rate": 2.915797361163875e-06, + "loss": 0.7284137, + "num_input_tokens_seen": 131543865, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.22521973, + "step": 6123, + "time_per_iteration": 2.8912243843078613 + }, + { + "auxiliary_loss_clip": 0.01496385, + "auxiliary_loss_mlp": 0.0105226, + "balance_loss_clip": 1.3077116, + "balance_loss_mlp": 1.0286448, + "epoch": 0.36819479933864424, + "flos": 23888895390720.0, + "grad_norm": 3.3791018998367566, + "language_loss": 0.75047755, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.77596402, + "num_input_tokens_seen": 131562155, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.23620605, + "step": 6124, + "time_per_iteration": 2.8506994247436523 + }, + { + "auxiliary_loss_clip": 0.01482104, + "auxiliary_loss_mlp": 0.01055036, + "balance_loss_clip": 1.29677916, + "balance_loss_mlp": 1.03106332, + "epoch": 0.3682549225913122, + "flos": 25564252291200.0, + "grad_norm": 2.1740603969701957, + "language_loss": 0.75427401, + "learning_rate": 2.915104825441114e-06, + "loss": 0.77964544, + "num_input_tokens_seen": 131581695, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.23986816, + "step": 6125, + "time_per_iteration": 2.8641669750213623 + }, + { + "auxiliary_loss_clip": 0.01499679, + "auxiliary_loss_mlp": 0.01050808, + "balance_loss_clip": 1.31134629, + "balance_loss_mlp": 1.02589321, + "epoch": 0.36831504584398017, + "flos": 16955469861120.0, + "grad_norm": 1.8557380661578575, + "language_loss": 0.7941553, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.81966019, + "num_input_tokens_seen": 131599465, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.24914551, + "step": 6126, + "time_per_iteration": 2.821234941482544 + }, + { + "auxiliary_loss_clip": 0.01495913, + "auxiliary_loss_mlp": 0.01048656, + "balance_loss_clip": 1.30525744, + "balance_loss_mlp": 1.02402711, + "epoch": 0.36837516909664814, + "flos": 19874583083520.0, + "grad_norm": 2.287705119854965, + "language_loss": 0.67334235, + "learning_rate": 2.914412150914888e-06, + "loss": 0.69878805, + "num_input_tokens_seen": 131618330, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.24645996, + "step": 6127, + "time_per_iteration": 2.9021427631378174 + }, + { + "auxiliary_loss_clip": 0.01492813, + "auxiliary_loss_mlp": 0.01051451, + "balance_loss_clip": 1.30528641, + "balance_loss_mlp": 1.02657187, + "epoch": 0.3684352923493161, + "flos": 37639385527680.0, + "grad_norm": 2.0236871871878908, + "language_loss": 0.71451163, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.73995429, + "num_input_tokens_seen": 131638960, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.2487793, + "step": 6128, + "time_per_iteration": 3.0019876956939697 + }, + { + "auxiliary_loss_clip": 0.01473872, + "auxiliary_loss_mlp": 0.01046315, + "balance_loss_clip": 1.28975415, + "balance_loss_mlp": 1.02217543, + "epoch": 0.36849541560198407, + "flos": 14473703306880.0, + "grad_norm": 1.9062124232330893, + "language_loss": 0.76246905, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.78767091, + "num_input_tokens_seen": 131657440, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.24133301, + "step": 6129, + "time_per_iteration": 4.250041484832764 + }, + { + "auxiliary_loss_clip": 0.01483275, + "auxiliary_loss_mlp": 0.01041212, + "balance_loss_clip": 1.29745889, + "balance_loss_mlp": 1.01697719, + "epoch": 0.36855553885465203, + "flos": 25780505028480.0, + "grad_norm": 1.573525590842421, + "language_loss": 0.85355967, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.87880456, + "num_input_tokens_seen": 131678035, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.24230957, + "step": 6130, + "time_per_iteration": 2.8736064434051514 + }, + { + "auxiliary_loss_clip": 0.01280268, + "auxiliary_loss_mlp": 0.01040689, + "balance_loss_clip": 1.16259313, + "balance_loss_mlp": 1.01770592, + "epoch": 0.36861566210732, + "flos": 65084017466880.0, + "grad_norm": 0.810176330522543, + "language_loss": 0.60288447, + "learning_rate": 2.913026385872321e-06, + "loss": 0.62609404, + "num_input_tokens_seen": 131742470, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.22949219, + "step": 6131, + "time_per_iteration": 3.465580463409424 + }, + { + "auxiliary_loss_clip": 0.01467133, + "auxiliary_loss_mlp": 0.01040503, + "balance_loss_clip": 1.28456676, + "balance_loss_mlp": 1.01700711, + "epoch": 0.36867578535998796, + "flos": 30966308432640.0, + "grad_norm": 1.7846892854947753, + "language_loss": 0.7381863, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.76326269, + "num_input_tokens_seen": 131764570, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.23510742, + "step": 6132, + "time_per_iteration": 2.931349754333496 + }, + { + "auxiliary_loss_clip": 0.0151051, + "auxiliary_loss_mlp": 0.0105053, + "balance_loss_clip": 1.31787062, + "balance_loss_mlp": 1.02648592, + "epoch": 0.3687359086126559, + "flos": 28849125628800.0, + "grad_norm": 1.956673245535813, + "language_loss": 0.74875855, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.774369, + "num_input_tokens_seen": 131785720, + "router_z_loss_clip": 1.92675781, + "router_z_loss_mlp": 0.24072266, + "step": 6133, + "time_per_iteration": 2.906123161315918 + }, + { + "auxiliary_loss_clip": 0.01460194, + "auxiliary_loss_mlp": 0.01044494, + "balance_loss_clip": 1.28021598, + "balance_loss_mlp": 1.02048564, + "epoch": 0.3687960318653239, + "flos": 21406540654080.0, + "grad_norm": 2.737547403012392, + "language_loss": 0.72600615, + "learning_rate": 2.911986698512874e-06, + "loss": 0.75105304, + "num_input_tokens_seen": 131804430, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.24023438, + "step": 6134, + "time_per_iteration": 2.922503709793091 + }, + { + "auxiliary_loss_clip": 0.01477209, + "auxiliary_loss_mlp": 0.01044624, + "balance_loss_clip": 1.29377532, + "balance_loss_mlp": 1.02164102, + "epoch": 0.36885615511799186, + "flos": 20275570915200.0, + "grad_norm": 41.4552009670501, + "language_loss": 0.75773656, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.78295493, + "num_input_tokens_seen": 131822060, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.22973633, + "step": 6135, + "time_per_iteration": 5.69055438041687 + }, + { + "auxiliary_loss_clip": 0.01286783, + "auxiliary_loss_mlp": 0.01025564, + "balance_loss_clip": 1.17026424, + "balance_loss_mlp": 1.00486934, + "epoch": 0.3689162783706599, + "flos": 63115962099840.0, + "grad_norm": 0.8316487855638516, + "language_loss": 0.58884859, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.61197203, + "num_input_tokens_seen": 131880715, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.20703125, + "step": 6136, + "time_per_iteration": 3.3092172145843506 + }, + { + "auxiliary_loss_clip": 0.01470881, + "auxiliary_loss_mlp": 0.01050511, + "balance_loss_clip": 1.28740335, + "balance_loss_mlp": 1.0260613, + "epoch": 0.36897640162332784, + "flos": 10969048137600.0, + "grad_norm": 2.150763428173469, + "language_loss": 0.80162644, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.8268404, + "num_input_tokens_seen": 131895850, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.24462891, + "step": 6137, + "time_per_iteration": 2.8111350536346436 + }, + { + "auxiliary_loss_clip": 0.01473273, + "auxiliary_loss_mlp": 0.01050751, + "balance_loss_clip": 1.29058719, + "balance_loss_mlp": 1.02708769, + "epoch": 0.3690365248759958, + "flos": 20714048703360.0, + "grad_norm": 1.876863393325964, + "language_loss": 0.75020713, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.77544743, + "num_input_tokens_seen": 131915775, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.23657227, + "step": 6138, + "time_per_iteration": 4.339171648025513 + }, + { + "auxiliary_loss_clip": 0.01480315, + "auxiliary_loss_mlp": 0.01051351, + "balance_loss_clip": 1.29255569, + "balance_loss_mlp": 1.0272944, + "epoch": 0.3690966481286638, + "flos": 31837427429760.0, + "grad_norm": 2.4379030167686486, + "language_loss": 0.66788483, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.69320154, + "num_input_tokens_seen": 131935715, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.24072266, + "step": 6139, + "time_per_iteration": 2.9106504917144775 + }, + { + "auxiliary_loss_clip": 0.01460106, + "auxiliary_loss_mlp": 0.010508, + "balance_loss_clip": 1.27983975, + "balance_loss_mlp": 1.02619529, + "epoch": 0.36915677138133174, + "flos": 13122635022720.0, + "grad_norm": 2.014060410433049, + "language_loss": 0.71779168, + "learning_rate": 2.909906390418006e-06, + "loss": 0.74290079, + "num_input_tokens_seen": 131954120, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.24597168, + "step": 6140, + "time_per_iteration": 2.9279351234436035 + }, + { + "auxiliary_loss_clip": 0.01282059, + "auxiliary_loss_mlp": 0.01048542, + "balance_loss_clip": 1.1667881, + "balance_loss_mlp": 1.02574873, + "epoch": 0.3692168946339997, + "flos": 68719539156480.0, + "grad_norm": 0.7533367682662114, + "language_loss": 0.59386629, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.6171723, + "num_input_tokens_seen": 132017485, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.22753906, + "step": 6141, + "time_per_iteration": 3.431361198425293 + }, + { + "auxiliary_loss_clip": 0.01479791, + "auxiliary_loss_mlp": 0.0104183, + "balance_loss_clip": 1.29563761, + "balance_loss_mlp": 1.01839328, + "epoch": 0.36927701788666767, + "flos": 22027988989440.0, + "grad_norm": 1.6000270253544508, + "language_loss": 0.76054627, + "learning_rate": 2.909212678216192e-06, + "loss": 0.78576249, + "num_input_tokens_seen": 132036760, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.234375, + "step": 6142, + "time_per_iteration": 2.8373348712921143 + }, + { + "auxiliary_loss_clip": 0.01461456, + "auxiliary_loss_mlp": 0.01046594, + "balance_loss_clip": 1.27944684, + "balance_loss_mlp": 1.02358699, + "epoch": 0.36933714113933563, + "flos": 21845697114240.0, + "grad_norm": 1.8338464152562803, + "language_loss": 0.77267879, + "learning_rate": 2.908865770392555e-06, + "loss": 0.79775929, + "num_input_tokens_seen": 132056935, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.23010254, + "step": 6143, + "time_per_iteration": 2.8635764122009277 + }, + { + "auxiliary_loss_clip": 0.01469924, + "auxiliary_loss_mlp": 0.01039166, + "balance_loss_clip": 1.2875278, + "balance_loss_mlp": 1.01631391, + "epoch": 0.3693972643920036, + "flos": 23701083649920.0, + "grad_norm": 1.9196779693834203, + "language_loss": 0.82645273, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.85154355, + "num_input_tokens_seen": 132077285, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.22839355, + "step": 6144, + "time_per_iteration": 2.866054058074951 + }, + { + "auxiliary_loss_clip": 0.01472124, + "auxiliary_loss_mlp": 0.01046233, + "balance_loss_clip": 1.28674436, + "balance_loss_mlp": 1.0229634, + "epoch": 0.36945738764467156, + "flos": 22867002161280.0, + "grad_norm": 2.1112319876498615, + "language_loss": 0.7868005, + "learning_rate": 2.908171851365593e-06, + "loss": 0.81198412, + "num_input_tokens_seen": 132095520, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.23266602, + "step": 6145, + "time_per_iteration": 2.8703463077545166 + }, + { + "auxiliary_loss_clip": 0.01470397, + "auxiliary_loss_mlp": 0.01043061, + "balance_loss_clip": 1.28414559, + "balance_loss_mlp": 1.01802659, + "epoch": 0.36951751089733953, + "flos": 16624123056000.0, + "grad_norm": 2.3622876489875466, + "language_loss": 0.77778411, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.80291867, + "num_input_tokens_seen": 132112810, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.25036621, + "step": 6146, + "time_per_iteration": 2.8842873573303223 + }, + { + "auxiliary_loss_clip": 0.0148833, + "auxiliary_loss_mlp": 0.01052409, + "balance_loss_clip": 1.3007381, + "balance_loss_mlp": 1.02832842, + "epoch": 0.3695776341500075, + "flos": 18923326266240.0, + "grad_norm": 1.6700099551924126, + "language_loss": 0.81471956, + "learning_rate": 2.907477794586761e-06, + "loss": 0.84012699, + "num_input_tokens_seen": 132131615, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.24084473, + "step": 6147, + "time_per_iteration": 2.8260788917541504 + }, + { + "auxiliary_loss_clip": 0.01477842, + "auxiliary_loss_mlp": 0.0104697, + "balance_loss_clip": 1.29044998, + "balance_loss_mlp": 1.02199578, + "epoch": 0.36963775740267546, + "flos": 20816881430400.0, + "grad_norm": 1.7917875964956451, + "language_loss": 0.84251684, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.86776495, + "num_input_tokens_seen": 132149585, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.25, + "step": 6148, + "time_per_iteration": 2.8560986518859863 + }, + { + "auxiliary_loss_clip": 0.01479921, + "auxiliary_loss_mlp": 0.01059501, + "balance_loss_clip": 1.29667211, + "balance_loss_mlp": 1.03521836, + "epoch": 0.3696978806553435, + "flos": 26071511454720.0, + "grad_norm": 1.9430812109723135, + "language_loss": 0.75107992, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.77647412, + "num_input_tokens_seen": 132165555, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.24291992, + "step": 6149, + "time_per_iteration": 2.877223014831543 + }, + { + "auxiliary_loss_clip": 0.01488239, + "auxiliary_loss_mlp": 0.01061173, + "balance_loss_clip": 1.30025649, + "balance_loss_mlp": 1.03413677, + "epoch": 0.36975800390801145, + "flos": 26845272345600.0, + "grad_norm": 12.049238490939379, + "language_loss": 0.72102016, + "learning_rate": 2.906436451364054e-06, + "loss": 0.74651432, + "num_input_tokens_seen": 132185100, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.27050781, + "step": 6150, + "time_per_iteration": 2.9266157150268555 + }, + { + "auxiliary_loss_clip": 0.01484547, + "auxiliary_loss_mlp": 0.01049723, + "balance_loss_clip": 1.29911494, + "balance_loss_mlp": 1.02643013, + "epoch": 0.3698181271606794, + "flos": 21152616981120.0, + "grad_norm": 1.5273973720671126, + "language_loss": 0.82401466, + "learning_rate": 2.906089268194611e-06, + "loss": 0.84935737, + "num_input_tokens_seen": 132203930, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.23278809, + "step": 6151, + "time_per_iteration": 2.834444046020508 + }, + { + "auxiliary_loss_clip": 0.01265911, + "auxiliary_loss_mlp": 0.0104613, + "balance_loss_clip": 1.15208375, + "balance_loss_mlp": 1.02476811, + "epoch": 0.3698782504133474, + "flos": 66772070173440.0, + "grad_norm": 0.792460423193385, + "language_loss": 0.63163978, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.65476024, + "num_input_tokens_seen": 132263845, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.21386719, + "step": 6152, + "time_per_iteration": 3.4524428844451904 + }, + { + "auxiliary_loss_clip": 0.01464254, + "auxiliary_loss_mlp": 0.0104403, + "balance_loss_clip": 1.28494549, + "balance_loss_mlp": 1.02118945, + "epoch": 0.36993837366601534, + "flos": 24320948417280.0, + "grad_norm": 2.8054195003551627, + "language_loss": 0.7078284, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.73291123, + "num_input_tokens_seen": 132282350, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.22839355, + "step": 6153, + "time_per_iteration": 2.901646137237549 + }, + { + "auxiliary_loss_clip": 0.01487248, + "auxiliary_loss_mlp": 0.01042297, + "balance_loss_clip": 1.29969525, + "balance_loss_mlp": 1.01834846, + "epoch": 0.3699984969186833, + "flos": 24358936066560.0, + "grad_norm": 1.7397056766040822, + "language_loss": 0.73307765, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.75837314, + "num_input_tokens_seen": 132301930, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.23962402, + "step": 6154, + "time_per_iteration": 2.888070821762085 + }, + { + "auxiliary_loss_clip": 0.01491631, + "auxiliary_loss_mlp": 0.01042152, + "balance_loss_clip": 1.30499113, + "balance_loss_mlp": 1.0187993, + "epoch": 0.37005862017135127, + "flos": 19838812429440.0, + "grad_norm": 2.1119159318500547, + "language_loss": 0.68424183, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.70957971, + "num_input_tokens_seen": 132320915, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.23376465, + "step": 6155, + "time_per_iteration": 2.8335988521575928 + }, + { + "auxiliary_loss_clip": 0.01484952, + "auxiliary_loss_mlp": 0.01040362, + "balance_loss_clip": 1.29896903, + "balance_loss_mlp": 1.01708007, + "epoch": 0.37011874342401924, + "flos": 19583395678080.0, + "grad_norm": 1.8246113844097485, + "language_loss": 0.68883437, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.71408749, + "num_input_tokens_seen": 132340415, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.23278809, + "step": 6156, + "time_per_iteration": 2.865536689758301 + }, + { + "auxiliary_loss_clip": 0.0147291, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.29064059, + "balance_loss_mlp": 1.01558423, + "epoch": 0.3701788666766872, + "flos": 20383606794240.0, + "grad_norm": 2.109264771324506, + "language_loss": 0.82412326, + "learning_rate": 2.904005448099916e-06, + "loss": 0.84923792, + "num_input_tokens_seen": 132358600, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.22961426, + "step": 6157, + "time_per_iteration": 2.8589301109313965 + }, + { + "auxiliary_loss_clip": 0.01500458, + "auxiliary_loss_mlp": 0.01047792, + "balance_loss_clip": 1.30767083, + "balance_loss_mlp": 1.02328277, + "epoch": 0.37023898992935517, + "flos": 15349301539200.0, + "grad_norm": 2.7438730371875044, + "language_loss": 0.77665997, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.80214244, + "num_input_tokens_seen": 132373160, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.24499512, + "step": 6158, + "time_per_iteration": 2.820526123046875 + }, + { + "auxiliary_loss_clip": 0.01506114, + "auxiliary_loss_mlp": 0.01041208, + "balance_loss_clip": 1.31476188, + "balance_loss_mlp": 1.01693702, + "epoch": 0.37029911318202313, + "flos": 19583576657280.0, + "grad_norm": 5.490514350651382, + "language_loss": 0.69832599, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.72379923, + "num_input_tokens_seen": 132392345, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.24291992, + "step": 6159, + "time_per_iteration": 2.830812931060791 + }, + { + "auxiliary_loss_clip": 0.01482631, + "auxiliary_loss_mlp": 0.01045011, + "balance_loss_clip": 1.29644871, + "balance_loss_mlp": 1.02159834, + "epoch": 0.3703592364346911, + "flos": 26224457437440.0, + "grad_norm": 3.109444447070503, + "language_loss": 0.71774, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.74301642, + "num_input_tokens_seen": 132412620, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.23425293, + "step": 6160, + "time_per_iteration": 2.927227735519409 + }, + { + "auxiliary_loss_clip": 0.01470143, + "auxiliary_loss_mlp": 0.01037938, + "balance_loss_clip": 1.28902268, + "balance_loss_mlp": 1.01494288, + "epoch": 0.37041935968735906, + "flos": 20058096568320.0, + "grad_norm": 1.7020736677870327, + "language_loss": 0.79883558, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.82391632, + "num_input_tokens_seen": 132431570, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.23010254, + "step": 6161, + "time_per_iteration": 2.844646692276001 + }, + { + "auxiliary_loss_clip": 0.01485571, + "auxiliary_loss_mlp": 0.01043153, + "balance_loss_clip": 1.29836345, + "balance_loss_mlp": 1.01956105, + "epoch": 0.3704794829400271, + "flos": 24144402631680.0, + "grad_norm": 1.6523075584211366, + "language_loss": 0.79853642, + "learning_rate": 2.902267988534295e-06, + "loss": 0.82382369, + "num_input_tokens_seen": 132451525, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.23608398, + "step": 6162, + "time_per_iteration": 2.913203001022339 + }, + { + "auxiliary_loss_clip": 0.01485157, + "auxiliary_loss_mlp": 0.01043856, + "balance_loss_clip": 1.29939342, + "balance_loss_mlp": 1.02050257, + "epoch": 0.37053960619269505, + "flos": 14875188831360.0, + "grad_norm": 2.1644467725274, + "language_loss": 0.80256826, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.82785839, + "num_input_tokens_seen": 132469875, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.23376465, + "step": 6163, + "time_per_iteration": 2.8338193893432617 + }, + { + "auxiliary_loss_clip": 0.01491985, + "auxiliary_loss_mlp": 0.0103825, + "balance_loss_clip": 1.30505979, + "balance_loss_mlp": 1.01580262, + "epoch": 0.370599729445363, + "flos": 21371539161600.0, + "grad_norm": 1.818074604580883, + "language_loss": 0.69128847, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.71659076, + "num_input_tokens_seen": 132488360, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.22436523, + "step": 6164, + "time_per_iteration": 4.2895588874816895 + }, + { + "auxiliary_loss_clip": 0.01490231, + "auxiliary_loss_mlp": 0.01042409, + "balance_loss_clip": 1.30309319, + "balance_loss_mlp": 1.01904416, + "epoch": 0.370659852698031, + "flos": 26839119052800.0, + "grad_norm": 2.1376517024201416, + "language_loss": 0.83792478, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.86325121, + "num_input_tokens_seen": 132508630, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.23376465, + "step": 6165, + "time_per_iteration": 2.9309298992156982 + }, + { + "auxiliary_loss_clip": 0.01499703, + "auxiliary_loss_mlp": 0.0104329, + "balance_loss_clip": 1.30883145, + "balance_loss_mlp": 1.01823187, + "epoch": 0.37071997595069894, + "flos": 19108378074240.0, + "grad_norm": 1.9851063981824921, + "language_loss": 0.69938815, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.72481811, + "num_input_tokens_seen": 132527465, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.25073242, + "step": 6166, + "time_per_iteration": 2.8714940547943115 + }, + { + "auxiliary_loss_clip": 0.01267523, + "auxiliary_loss_mlp": 0.01054599, + "balance_loss_clip": 1.16007531, + "balance_loss_mlp": 1.03275955, + "epoch": 0.3707800992033669, + "flos": 52202366104320.0, + "grad_norm": 0.7975691393019588, + "language_loss": 0.57127386, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.59449506, + "num_input_tokens_seen": 132579940, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.21875, + "step": 6167, + "time_per_iteration": 3.1985366344451904 + }, + { + "auxiliary_loss_clip": 0.01481138, + "auxiliary_loss_mlp": 0.01038059, + "balance_loss_clip": 1.29744291, + "balance_loss_mlp": 1.0148375, + "epoch": 0.3708402224560349, + "flos": 19911439612800.0, + "grad_norm": 3.279379565318279, + "language_loss": 0.7625128, + "learning_rate": 2.900181908135584e-06, + "loss": 0.78770483, + "num_input_tokens_seen": 132598390, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.23217773, + "step": 6168, + "time_per_iteration": 2.889181613922119 + }, + { + "auxiliary_loss_clip": 0.01489708, + "auxiliary_loss_mlp": 0.01039111, + "balance_loss_clip": 1.30341697, + "balance_loss_mlp": 1.01576996, + "epoch": 0.37090034570870284, + "flos": 20016308355840.0, + "grad_norm": 1.7352618232866452, + "language_loss": 0.74910963, + "learning_rate": 2.899834108519755e-06, + "loss": 0.77439779, + "num_input_tokens_seen": 132616920, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.23339844, + "step": 6169, + "time_per_iteration": 2.951138734817505 + }, + { + "auxiliary_loss_clip": 0.01483888, + "auxiliary_loss_mlp": 0.01041237, + "balance_loss_clip": 1.30023098, + "balance_loss_mlp": 1.01799083, + "epoch": 0.3709604689613708, + "flos": 24145533751680.0, + "grad_norm": 1.8149309976035821, + "language_loss": 0.80442917, + "learning_rate": 2.899486274782127e-06, + "loss": 0.82968044, + "num_input_tokens_seen": 132637660, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.23242188, + "step": 6170, + "time_per_iteration": 5.730466842651367 + }, + { + "auxiliary_loss_clip": 0.0148506, + "auxiliary_loss_mlp": 0.01043157, + "balance_loss_clip": 1.29716897, + "balance_loss_mlp": 1.01941109, + "epoch": 0.37102059221403877, + "flos": 23885999723520.0, + "grad_norm": 2.9327156642793653, + "language_loss": 0.77551508, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.80079722, + "num_input_tokens_seen": 132657635, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.23754883, + "step": 6171, + "time_per_iteration": 2.887183904647827 + }, + { + "auxiliary_loss_clip": 0.0149158, + "auxiliary_loss_mlp": 0.0104182, + "balance_loss_clip": 1.30446005, + "balance_loss_mlp": 1.0183835, + "epoch": 0.37108071546670673, + "flos": 14509292981760.0, + "grad_norm": 2.6305826895437696, + "language_loss": 0.81225383, + "learning_rate": 2.898790504994232e-06, + "loss": 0.83758783, + "num_input_tokens_seen": 132674455, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.23449707, + "step": 6172, + "time_per_iteration": 2.834150552749634 + }, + { + "auxiliary_loss_clip": 0.01506019, + "auxiliary_loss_mlp": 0.01044323, + "balance_loss_clip": 1.31650424, + "balance_loss_mlp": 1.0211606, + "epoch": 0.3711408387193747, + "flos": 34574339266560.0, + "grad_norm": 2.4048301281959006, + "language_loss": 0.60245776, + "learning_rate": 2.89844256897035e-06, + "loss": 0.62796116, + "num_input_tokens_seen": 132695140, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.23168945, + "step": 6173, + "time_per_iteration": 4.437655448913574 + }, + { + "auxiliary_loss_clip": 0.01498127, + "auxiliary_loss_mlp": 0.01049834, + "balance_loss_clip": 1.30935049, + "balance_loss_mlp": 1.02601576, + "epoch": 0.37120096197204266, + "flos": 17319465429120.0, + "grad_norm": 1.9950311302324908, + "language_loss": 0.81942016, + "learning_rate": 2.898094598877435e-06, + "loss": 0.84489977, + "num_input_tokens_seen": 132712470, + "router_z_loss_clip": 1.89160156, + "router_z_loss_mlp": 0.23840332, + "step": 6174, + "time_per_iteration": 2.8204665184020996 + }, + { + "auxiliary_loss_clip": 0.01486478, + "auxiliary_loss_mlp": 0.01043692, + "balance_loss_clip": 1.30335474, + "balance_loss_mlp": 1.02154303, + "epoch": 0.37126108522471063, + "flos": 30676161657600.0, + "grad_norm": 2.2234879232751212, + "language_loss": 0.80441749, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.82971919, + "num_input_tokens_seen": 132732945, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.22155762, + "step": 6175, + "time_per_iteration": 2.929279327392578 + }, + { + "auxiliary_loss_clip": 0.01490717, + "auxiliary_loss_mlp": 0.01047888, + "balance_loss_clip": 1.3054899, + "balance_loss_mlp": 1.02520227, + "epoch": 0.37132120847737865, + "flos": 25166567329920.0, + "grad_norm": 2.1870697543388817, + "language_loss": 0.89341241, + "learning_rate": 2.89739855653729e-06, + "loss": 0.91879851, + "num_input_tokens_seen": 132752470, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.22680664, + "step": 6176, + "time_per_iteration": 2.886245012283325 + }, + { + "auxiliary_loss_clip": 0.01498121, + "auxiliary_loss_mlp": 0.01051239, + "balance_loss_clip": 1.31185138, + "balance_loss_mlp": 1.02841032, + "epoch": 0.3713813317300466, + "flos": 21222936679680.0, + "grad_norm": 1.5024782705148505, + "language_loss": 0.74299872, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.76849228, + "num_input_tokens_seen": 132771485, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.22839355, + "step": 6177, + "time_per_iteration": 2.861433982849121 + }, + { + "auxiliary_loss_clip": 0.0149371, + "auxiliary_loss_mlp": 0.01054588, + "balance_loss_clip": 1.30800986, + "balance_loss_mlp": 1.03278422, + "epoch": 0.3714414549827146, + "flos": 21626684444160.0, + "grad_norm": 2.2456372938563907, + "language_loss": 0.76167989, + "learning_rate": 2.896702378079374e-06, + "loss": 0.78716284, + "num_input_tokens_seen": 132791465, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.21789551, + "step": 6178, + "time_per_iteration": 2.826676845550537 + }, + { + "auxiliary_loss_clip": 0.01514105, + "auxiliary_loss_mlp": 0.01054395, + "balance_loss_clip": 1.32756138, + "balance_loss_mlp": 1.03112507, + "epoch": 0.37150157823538255, + "flos": 19981261618560.0, + "grad_norm": 2.412703778219655, + "language_loss": 0.72449785, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.75018287, + "num_input_tokens_seen": 132810160, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.23278809, + "step": 6179, + "time_per_iteration": 2.8697168827056885 + }, + { + "auxiliary_loss_clip": 0.01508593, + "auxiliary_loss_mlp": 0.0105248, + "balance_loss_clip": 1.31964016, + "balance_loss_mlp": 1.02886486, + "epoch": 0.3715617014880505, + "flos": 24870583975680.0, + "grad_norm": 1.8345234204854466, + "language_loss": 0.7087447, + "learning_rate": 2.896006063609283e-06, + "loss": 0.73435545, + "num_input_tokens_seen": 132831265, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.23620605, + "step": 6180, + "time_per_iteration": 2.9819300174713135 + }, + { + "auxiliary_loss_clip": 0.01501812, + "auxiliary_loss_mlp": 0.01050072, + "balance_loss_clip": 1.31676054, + "balance_loss_mlp": 1.02766109, + "epoch": 0.3716218247407185, + "flos": 20458812931200.0, + "grad_norm": 1.8431508703197537, + "language_loss": 0.78725576, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.81277466, + "num_input_tokens_seen": 132850005, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.22424316, + "step": 6181, + "time_per_iteration": 2.8477768898010254 + }, + { + "auxiliary_loss_clip": 0.01491423, + "auxiliary_loss_mlp": 0.01049735, + "balance_loss_clip": 1.30702055, + "balance_loss_mlp": 1.02721632, + "epoch": 0.37168194799338644, + "flos": 24143995428480.0, + "grad_norm": 3.0170805478235048, + "language_loss": 0.79933149, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.82474303, + "num_input_tokens_seen": 132865790, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.22497559, + "step": 6182, + "time_per_iteration": 2.894883155822754 + }, + { + "auxiliary_loss_clip": 0.01288046, + "auxiliary_loss_mlp": 0.01034451, + "balance_loss_clip": 1.17472088, + "balance_loss_mlp": 1.01366067, + "epoch": 0.3717420712460544, + "flos": 67441007566080.0, + "grad_norm": 0.7818530899177816, + "language_loss": 0.57529604, + "learning_rate": 2.894961337112362e-06, + "loss": 0.59852105, + "num_input_tokens_seen": 132921775, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.20800781, + "step": 6183, + "time_per_iteration": 3.324918031692505 + }, + { + "auxiliary_loss_clip": 0.01523308, + "auxiliary_loss_mlp": 0.0105369, + "balance_loss_clip": 1.32811081, + "balance_loss_mlp": 1.02968097, + "epoch": 0.37180219449872237, + "flos": 22385605040640.0, + "grad_norm": 2.3164009933047267, + "language_loss": 0.77247167, + "learning_rate": 2.894613027055066e-06, + "loss": 0.79824167, + "num_input_tokens_seen": 132941060, + "router_z_loss_clip": 1.94921875, + "router_z_loss_mlp": 0.23999023, + "step": 6184, + "time_per_iteration": 2.8766307830810547 + }, + { + "auxiliary_loss_clip": 0.01484881, + "auxiliary_loss_mlp": 0.01044189, + "balance_loss_clip": 1.30089152, + "balance_loss_mlp": 1.02099061, + "epoch": 0.37186231775139034, + "flos": 21879431752320.0, + "grad_norm": 1.8049261052586905, + "language_loss": 0.72855425, + "learning_rate": 2.894264683073954e-06, + "loss": 0.75384492, + "num_input_tokens_seen": 132961850, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.23205566, + "step": 6185, + "time_per_iteration": 2.86906099319458 + }, + { + "auxiliary_loss_clip": 0.01486031, + "auxiliary_loss_mlp": 0.01041625, + "balance_loss_clip": 1.3030138, + "balance_loss_mlp": 1.01783061, + "epoch": 0.3719224410040583, + "flos": 22424407096320.0, + "grad_norm": 1.5510360307707731, + "language_loss": 0.77777874, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.80305529, + "num_input_tokens_seen": 132981625, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.23803711, + "step": 6186, + "time_per_iteration": 2.868987560272217 + }, + { + "auxiliary_loss_clip": 0.01497186, + "auxiliary_loss_mlp": 0.01042714, + "balance_loss_clip": 1.30568421, + "balance_loss_mlp": 1.01845527, + "epoch": 0.37198256425672627, + "flos": 25161002219520.0, + "grad_norm": 1.7543539281375307, + "language_loss": 0.8449136, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.87031269, + "num_input_tokens_seen": 133001225, + "router_z_loss_clip": 1.91308594, + "router_z_loss_mlp": 0.24230957, + "step": 6187, + "time_per_iteration": 2.9073140621185303 + }, + { + "auxiliary_loss_clip": 0.01480248, + "auxiliary_loss_mlp": 0.01039647, + "balance_loss_clip": 1.29668283, + "balance_loss_mlp": 1.01643729, + "epoch": 0.37204268750939423, + "flos": 21147187605120.0, + "grad_norm": 1.8633987288567233, + "language_loss": 0.84850496, + "learning_rate": 2.893219447719824e-06, + "loss": 0.87370396, + "num_input_tokens_seen": 133018820, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.23193359, + "step": 6188, + "time_per_iteration": 2.8571369647979736 + }, + { + "auxiliary_loss_clip": 0.01490443, + "auxiliary_loss_mlp": 0.0104058, + "balance_loss_clip": 1.30440712, + "balance_loss_mlp": 1.01619017, + "epoch": 0.37210281076206225, + "flos": 21516748283520.0, + "grad_norm": 1.8776773539198146, + "language_loss": 0.66719586, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.69250607, + "num_input_tokens_seen": 133040205, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.24389648, + "step": 6189, + "time_per_iteration": 2.926896095275879 + }, + { + "auxiliary_loss_clip": 0.0148734, + "auxiliary_loss_mlp": 0.01040276, + "balance_loss_clip": 1.30027604, + "balance_loss_mlp": 1.01590919, + "epoch": 0.3721629340147302, + "flos": 17356774406400.0, + "grad_norm": 1.9844421944634887, + "language_loss": 0.85042906, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.87570518, + "num_input_tokens_seen": 133058095, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.24389648, + "step": 6190, + "time_per_iteration": 2.87680983543396 + }, + { + "auxiliary_loss_clip": 0.01495853, + "auxiliary_loss_mlp": 0.01038667, + "balance_loss_clip": 1.30411828, + "balance_loss_mlp": 1.01412201, + "epoch": 0.3722230572673982, + "flos": 16440247612800.0, + "grad_norm": 2.54891119821353, + "language_loss": 0.89768422, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.92302948, + "num_input_tokens_seen": 133071530, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.2454834, + "step": 6191, + "time_per_iteration": 2.867191791534424 + }, + { + "auxiliary_loss_clip": 0.01499773, + "auxiliary_loss_mlp": 0.01042235, + "balance_loss_clip": 1.30757999, + "balance_loss_mlp": 1.01637852, + "epoch": 0.37228318052006615, + "flos": 22685026999680.0, + "grad_norm": 2.3957749309522356, + "language_loss": 0.74188596, + "learning_rate": 2.891825326449073e-06, + "loss": 0.76730597, + "num_input_tokens_seen": 133091410, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.25842285, + "step": 6192, + "time_per_iteration": 2.9041192531585693 + }, + { + "auxiliary_loss_clip": 0.0148693, + "auxiliary_loss_mlp": 0.01038385, + "balance_loss_clip": 1.29999709, + "balance_loss_mlp": 1.0156281, + "epoch": 0.3723433037727341, + "flos": 25276548735360.0, + "grad_norm": 2.2795738233590326, + "language_loss": 0.80363429, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.82888734, + "num_input_tokens_seen": 133110365, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.22741699, + "step": 6193, + "time_per_iteration": 2.911593437194824 + }, + { + "auxiliary_loss_clip": 0.01484774, + "auxiliary_loss_mlp": 0.010385, + "balance_loss_clip": 1.29517698, + "balance_loss_mlp": 1.01539731, + "epoch": 0.3724034270254021, + "flos": 10531746714240.0, + "grad_norm": 2.318332380573544, + "language_loss": 0.85841548, + "learning_rate": 2.891128062852194e-06, + "loss": 0.88364822, + "num_input_tokens_seen": 133128255, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.23120117, + "step": 6194, + "time_per_iteration": 2.828993558883667 + }, + { + "auxiliary_loss_clip": 0.01490587, + "auxiliary_loss_mlp": 0.01040554, + "balance_loss_clip": 1.30170703, + "balance_loss_mlp": 1.01715279, + "epoch": 0.37246355027807004, + "flos": 20275797139200.0, + "grad_norm": 16.940796464073316, + "language_loss": 0.784567, + "learning_rate": 2.890779380359646e-06, + "loss": 0.80987835, + "num_input_tokens_seen": 133143975, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.23388672, + "step": 6195, + "time_per_iteration": 2.81355357170105 + }, + { + "auxiliary_loss_clip": 0.01475465, + "auxiliary_loss_mlp": 0.01041069, + "balance_loss_clip": 1.29079998, + "balance_loss_mlp": 1.01589227, + "epoch": 0.372523673530738, + "flos": 19510089822720.0, + "grad_norm": 1.5750324391620143, + "language_loss": 0.79839784, + "learning_rate": 2.890430664088655e-06, + "loss": 0.82356322, + "num_input_tokens_seen": 133162935, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.25195312, + "step": 6196, + "time_per_iteration": 2.8635315895080566 + }, + { + "auxiliary_loss_clip": 0.01477139, + "auxiliary_loss_mlp": 0.01042338, + "balance_loss_clip": 1.29207027, + "balance_loss_mlp": 1.01948595, + "epoch": 0.372583796783406, + "flos": 16773087496320.0, + "grad_norm": 2.1439339302685174, + "language_loss": 0.84947181, + "learning_rate": 2.890081914052443e-06, + "loss": 0.87466669, + "num_input_tokens_seen": 133181180, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.22851562, + "step": 6197, + "time_per_iteration": 2.8197293281555176 + }, + { + "auxiliary_loss_clip": 0.01459691, + "auxiliary_loss_mlp": 0.01046297, + "balance_loss_clip": 1.27647388, + "balance_loss_mlp": 1.02069068, + "epoch": 0.37264392003607394, + "flos": 22648215715200.0, + "grad_norm": 1.5685881664075663, + "language_loss": 0.65433162, + "learning_rate": 2.889733130264237e-06, + "loss": 0.6793915, + "num_input_tokens_seen": 133199615, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.25585938, + "step": 6198, + "time_per_iteration": 2.844416618347168 + }, + { + "auxiliary_loss_clip": 0.01472627, + "auxiliary_loss_mlp": 0.01043974, + "balance_loss_clip": 1.28849363, + "balance_loss_mlp": 1.01942921, + "epoch": 0.3727040432887419, + "flos": 19982302248960.0, + "grad_norm": 1.45337022416199, + "language_loss": 0.74561405, + "learning_rate": 2.889384312737261e-06, + "loss": 0.77078009, + "num_input_tokens_seen": 133219650, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.24511719, + "step": 6199, + "time_per_iteration": 4.26497745513916 + }, + { + "auxiliary_loss_clip": 0.01475511, + "auxiliary_loss_mlp": 0.01037677, + "balance_loss_clip": 1.29194403, + "balance_loss_mlp": 1.01465738, + "epoch": 0.37276416654140987, + "flos": 63918960410880.0, + "grad_norm": 1.7318776150680975, + "language_loss": 0.81691313, + "learning_rate": 2.889035461484742e-06, + "loss": 0.84204495, + "num_input_tokens_seen": 133245675, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.23010254, + "step": 6200, + "time_per_iteration": 3.2291831970214844 + }, + { + "auxiliary_loss_clip": 0.01478486, + "auxiliary_loss_mlp": 0.01042859, + "balance_loss_clip": 1.29250979, + "balance_loss_mlp": 1.01795673, + "epoch": 0.37282428979407783, + "flos": 39800573539200.0, + "grad_norm": 2.0999174373101592, + "language_loss": 0.6114282, + "learning_rate": 2.88868657651991e-06, + "loss": 0.63664168, + "num_input_tokens_seen": 133266905, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.24914551, + "step": 6201, + "time_per_iteration": 3.011847496032715 + }, + { + "auxiliary_loss_clip": 0.01499207, + "auxiliary_loss_mlp": 0.01039121, + "balance_loss_clip": 1.30978966, + "balance_loss_mlp": 1.015136, + "epoch": 0.37288441304674586, + "flos": 22718806882560.0, + "grad_norm": 1.802365694864444, + "language_loss": 0.73977643, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.76515973, + "num_input_tokens_seen": 133286865, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.23999023, + "step": 6202, + "time_per_iteration": 2.8908324241638184 + }, + { + "auxiliary_loss_clip": 0.01463767, + "auxiliary_loss_mlp": 0.01041757, + "balance_loss_clip": 1.28001642, + "balance_loss_mlp": 1.01709223, + "epoch": 0.3729445362994138, + "flos": 18779474488320.0, + "grad_norm": 2.0396676359201433, + "language_loss": 0.7423718, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.76742703, + "num_input_tokens_seen": 133305295, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.24645996, + "step": 6203, + "time_per_iteration": 2.894284725189209 + }, + { + "auxiliary_loss_clip": 0.01474079, + "auxiliary_loss_mlp": 0.01038308, + "balance_loss_clip": 1.29001999, + "balance_loss_mlp": 1.01631391, + "epoch": 0.3730046595520818, + "flos": 22466557267200.0, + "grad_norm": 1.7494041016272246, + "language_loss": 0.82961446, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.85473835, + "num_input_tokens_seen": 133324625, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.2199707, + "step": 6204, + "time_per_iteration": 2.902602195739746 + }, + { + "auxiliary_loss_clip": 0.0148227, + "auxiliary_loss_mlp": 0.01041071, + "balance_loss_clip": 1.29434276, + "balance_loss_mlp": 1.01784873, + "epoch": 0.37306478280474975, + "flos": 24327056465280.0, + "grad_norm": 1.75052083102906, + "language_loss": 0.75189584, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.77712929, + "num_input_tokens_seen": 133344625, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.2322998, + "step": 6205, + "time_per_iteration": 4.295569658279419 + }, + { + "auxiliary_loss_clip": 0.01471258, + "auxiliary_loss_mlp": 0.01039604, + "balance_loss_clip": 1.2858882, + "balance_loss_mlp": 1.01578593, + "epoch": 0.3731249060574177, + "flos": 15823640471040.0, + "grad_norm": 3.06495370469463, + "language_loss": 0.78434873, + "learning_rate": 2.886941646474128e-06, + "loss": 0.80945742, + "num_input_tokens_seen": 133363605, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.23791504, + "step": 6206, + "time_per_iteration": 4.268855810165405 + }, + { + "auxiliary_loss_clip": 0.01481774, + "auxiliary_loss_mlp": 0.01033679, + "balance_loss_clip": 1.29566431, + "balance_loss_mlp": 1.01001644, + "epoch": 0.3731850293100857, + "flos": 19837455085440.0, + "grad_norm": 2.097872523146228, + "language_loss": 0.93687749, + "learning_rate": 2.886592559513283e-06, + "loss": 0.96203208, + "num_input_tokens_seen": 133379405, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.23669434, + "step": 6207, + "time_per_iteration": 2.8502519130706787 + }, + { + "auxiliary_loss_clip": 0.01490504, + "auxiliary_loss_mlp": 0.01036594, + "balance_loss_clip": 1.3005631, + "balance_loss_mlp": 1.01428974, + "epoch": 0.37324515256275365, + "flos": 19071657279360.0, + "grad_norm": 2.118253559430879, + "language_loss": 0.84525496, + "learning_rate": 2.886243438932759e-06, + "loss": 0.87052596, + "num_input_tokens_seen": 133397585, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.22302246, + "step": 6208, + "time_per_iteration": 4.269124984741211 + }, + { + "auxiliary_loss_clip": 0.01474283, + "auxiliary_loss_mlp": 0.01039224, + "balance_loss_clip": 1.28673291, + "balance_loss_mlp": 1.0155611, + "epoch": 0.3733052758154216, + "flos": 20714093948160.0, + "grad_norm": 7.338840638390395, + "language_loss": 0.74009168, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.76522672, + "num_input_tokens_seen": 133415365, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.2364502, + "step": 6209, + "time_per_iteration": 2.8707008361816406 + }, + { + "auxiliary_loss_clip": 0.0148704, + "auxiliary_loss_mlp": 0.01043158, + "balance_loss_clip": 1.30142415, + "balance_loss_mlp": 1.01848161, + "epoch": 0.3733653990680896, + "flos": 20203124711040.0, + "grad_norm": 3.4950351420881454, + "language_loss": 0.71758181, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.7428838, + "num_input_tokens_seen": 133435700, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.24682617, + "step": 6210, + "time_per_iteration": 2.9039204120635986 + }, + { + "auxiliary_loss_clip": 0.01492708, + "auxiliary_loss_mlp": 0.01040811, + "balance_loss_clip": 1.30502534, + "balance_loss_mlp": 1.01726723, + "epoch": 0.37342552232075754, + "flos": 20349374463360.0, + "grad_norm": 2.0542228386393298, + "language_loss": 0.7838124, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.80914766, + "num_input_tokens_seen": 133455180, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.23547363, + "step": 6211, + "time_per_iteration": 3.0203518867492676 + }, + { + "auxiliary_loss_clip": 0.01488506, + "auxiliary_loss_mlp": 0.01043951, + "balance_loss_clip": 1.30033886, + "balance_loss_mlp": 1.02015734, + "epoch": 0.3734856455734255, + "flos": 35531568397440.0, + "grad_norm": 1.5075568308864071, + "language_loss": 0.74231577, + "learning_rate": 2.884846620678668e-06, + "loss": 0.76764035, + "num_input_tokens_seen": 133476715, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.23791504, + "step": 6212, + "time_per_iteration": 2.991227626800537 + }, + { + "auxiliary_loss_clip": 0.01528496, + "auxiliary_loss_mlp": 0.01050183, + "balance_loss_clip": 1.33062077, + "balance_loss_mlp": 1.02550626, + "epoch": 0.37354576882609347, + "flos": 21152209777920.0, + "grad_norm": 1.8949435571362097, + "language_loss": 0.82338262, + "learning_rate": 2.884497332198356e-06, + "loss": 0.84916937, + "num_input_tokens_seen": 133494550, + "router_z_loss_clip": 1.97851562, + "router_z_loss_mlp": 0.24694824, + "step": 6213, + "time_per_iteration": 2.8980672359466553 + }, + { + "auxiliary_loss_clip": 0.01484715, + "auxiliary_loss_mlp": 0.01043489, + "balance_loss_clip": 1.29688668, + "balance_loss_mlp": 1.01921844, + "epoch": 0.37360589207876144, + "flos": 21516703038720.0, + "grad_norm": 5.064910822873359, + "language_loss": 0.79535091, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.82063293, + "num_input_tokens_seen": 133512640, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.24279785, + "step": 6214, + "time_per_iteration": 2.9481146335601807 + }, + { + "auxiliary_loss_clip": 0.01481096, + "auxiliary_loss_mlp": 0.01041289, + "balance_loss_clip": 1.2959069, + "balance_loss_mlp": 1.01859188, + "epoch": 0.37366601533142946, + "flos": 38450002947840.0, + "grad_norm": 1.8001624781589776, + "language_loss": 0.85406125, + "learning_rate": 2.883798654630296e-06, + "loss": 0.87928522, + "num_input_tokens_seen": 133535540, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.22705078, + "step": 6215, + "time_per_iteration": 3.0597071647644043 + }, + { + "auxiliary_loss_clip": 0.01497893, + "auxiliary_loss_mlp": 0.01044977, + "balance_loss_clip": 1.3079505, + "balance_loss_mlp": 1.01984775, + "epoch": 0.3737261385840974, + "flos": 18450073209600.0, + "grad_norm": 2.0636303279241686, + "language_loss": 0.68660545, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.71203417, + "num_input_tokens_seen": 133555795, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.25134277, + "step": 6216, + "time_per_iteration": 2.9039924144744873 + }, + { + "auxiliary_loss_clip": 0.01483401, + "auxiliary_loss_mlp": 0.01041104, + "balance_loss_clip": 1.29542851, + "balance_loss_mlp": 1.01746464, + "epoch": 0.3737862618367654, + "flos": 22940172282240.0, + "grad_norm": 2.027560576425042, + "language_loss": 0.67462504, + "learning_rate": 2.883099843007303e-06, + "loss": 0.69987011, + "num_input_tokens_seen": 133575905, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.2364502, + "step": 6217, + "time_per_iteration": 2.8553824424743652 + }, + { + "auxiliary_loss_clip": 0.0150187, + "auxiliary_loss_mlp": 0.01040638, + "balance_loss_clip": 1.31146801, + "balance_loss_mlp": 1.01786947, + "epoch": 0.37384638508943335, + "flos": 15416951794560.0, + "grad_norm": 1.7549830352504139, + "language_loss": 0.8159157, + "learning_rate": 2.88275038695833e-06, + "loss": 0.84134078, + "num_input_tokens_seen": 133592585, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.2277832, + "step": 6218, + "time_per_iteration": 2.8806140422821045 + }, + { + "auxiliary_loss_clip": 0.01482164, + "auxiliary_loss_mlp": 0.01043763, + "balance_loss_clip": 1.29900408, + "balance_loss_mlp": 1.02006471, + "epoch": 0.3739065083421013, + "flos": 24291738259200.0, + "grad_norm": 1.4583770779151362, + "language_loss": 0.79183143, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.81709075, + "num_input_tokens_seen": 133615070, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.23706055, + "step": 6219, + "time_per_iteration": 2.908015251159668 + }, + { + "auxiliary_loss_clip": 0.0148554, + "auxiliary_loss_mlp": 0.01042987, + "balance_loss_clip": 1.30171561, + "balance_loss_mlp": 1.0188117, + "epoch": 0.3739666315947693, + "flos": 23013161424000.0, + "grad_norm": 3.174996278980663, + "language_loss": 0.77798331, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.80326861, + "num_input_tokens_seen": 133633490, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.24157715, + "step": 6220, + "time_per_iteration": 2.8773181438446045 + }, + { + "auxiliary_loss_clip": 0.01498513, + "auxiliary_loss_mlp": 0.01040253, + "balance_loss_clip": 1.30928981, + "balance_loss_mlp": 1.0153501, + "epoch": 0.37402675484743725, + "flos": 19400606110080.0, + "grad_norm": 2.8349866279381297, + "language_loss": 0.83631879, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.86170644, + "num_input_tokens_seen": 133653425, + "router_z_loss_clip": 1.89160156, + "router_z_loss_mlp": 0.24926758, + "step": 6221, + "time_per_iteration": 2.8724052906036377 + }, + { + "auxiliary_loss_clip": 0.01484065, + "auxiliary_loss_mlp": 0.01040995, + "balance_loss_clip": 1.29875433, + "balance_loss_mlp": 1.01716506, + "epoch": 0.3740868781001052, + "flos": 17134323131520.0, + "grad_norm": 1.8033213631844327, + "language_loss": 0.7736702, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.79892075, + "num_input_tokens_seen": 133670220, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.23828125, + "step": 6222, + "time_per_iteration": 2.8508198261260986 + }, + { + "auxiliary_loss_clip": 0.0149281, + "auxiliary_loss_mlp": 0.01043741, + "balance_loss_clip": 1.30653572, + "balance_loss_mlp": 1.01948237, + "epoch": 0.3741470013527732, + "flos": 20052033765120.0, + "grad_norm": 2.0498855903716535, + "language_loss": 0.71708632, + "learning_rate": 2.881002604868789e-06, + "loss": 0.74245185, + "num_input_tokens_seen": 133688910, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.24243164, + "step": 6223, + "time_per_iteration": 2.962596893310547 + }, + { + "auxiliary_loss_clip": 0.01488459, + "auxiliary_loss_mlp": 0.01041531, + "balance_loss_clip": 1.30395222, + "balance_loss_mlp": 1.01768959, + "epoch": 0.37420712460544114, + "flos": 36909494110080.0, + "grad_norm": 2.4537101037388247, + "language_loss": 0.69817924, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.72347915, + "num_input_tokens_seen": 133708690, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.23852539, + "step": 6224, + "time_per_iteration": 2.985929250717163 + }, + { + "auxiliary_loss_clip": 0.01475389, + "auxiliary_loss_mlp": 0.01041787, + "balance_loss_clip": 1.29258728, + "balance_loss_mlp": 1.01804078, + "epoch": 0.3742672478581091, + "flos": 22211592963840.0, + "grad_norm": 1.8919437232950167, + "language_loss": 0.70679736, + "learning_rate": 2.880303258086228e-06, + "loss": 0.73196912, + "num_input_tokens_seen": 133728095, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.23730469, + "step": 6225, + "time_per_iteration": 2.8942956924438477 + }, + { + "auxiliary_loss_clip": 0.01476523, + "auxiliary_loss_mlp": 0.01041735, + "balance_loss_clip": 1.29387724, + "balance_loss_mlp": 1.01763058, + "epoch": 0.3743273711107771, + "flos": 24692409377280.0, + "grad_norm": 2.062999437648166, + "language_loss": 0.80222988, + "learning_rate": 2.879953534616536e-06, + "loss": 0.82741249, + "num_input_tokens_seen": 133745590, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.2409668, + "step": 6226, + "time_per_iteration": 2.8829362392425537 + }, + { + "auxiliary_loss_clip": 0.01487916, + "auxiliary_loss_mlp": 0.01039042, + "balance_loss_clip": 1.30160117, + "balance_loss_mlp": 1.01562893, + "epoch": 0.37438749436344504, + "flos": 24469641388800.0, + "grad_norm": 1.9190242268320272, + "language_loss": 0.68350285, + "learning_rate": 2.879603777778917e-06, + "loss": 0.70877242, + "num_input_tokens_seen": 133766155, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.23425293, + "step": 6227, + "time_per_iteration": 2.9140465259552 + }, + { + "auxiliary_loss_clip": 0.01481889, + "auxiliary_loss_mlp": 0.01039819, + "balance_loss_clip": 1.29735494, + "balance_loss_mlp": 1.01600134, + "epoch": 0.374447617616113, + "flos": 21808750095360.0, + "grad_norm": 1.7461725242324144, + "language_loss": 0.84199232, + "learning_rate": 2.879253987586635e-06, + "loss": 0.86720943, + "num_input_tokens_seen": 133783185, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.23828125, + "step": 6228, + "time_per_iteration": 2.94978404045105 + }, + { + "auxiliary_loss_clip": 0.01477908, + "auxiliary_loss_mlp": 0.01045695, + "balance_loss_clip": 1.29480124, + "balance_loss_mlp": 1.02166212, + "epoch": 0.374507740868781, + "flos": 17977317845760.0, + "grad_norm": 1.6777159827182282, + "language_loss": 0.75395256, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.77918857, + "num_input_tokens_seen": 133800975, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.24023438, + "step": 6229, + "time_per_iteration": 2.8197741508483887 + }, + { + "auxiliary_loss_clip": 0.01489671, + "auxiliary_loss_mlp": 0.0104517, + "balance_loss_clip": 1.30456829, + "balance_loss_mlp": 1.01995754, + "epoch": 0.374567864121449, + "flos": 16114058714880.0, + "grad_norm": 1.9001972606036852, + "language_loss": 0.84526777, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.87061614, + "num_input_tokens_seen": 133818020, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.25231934, + "step": 6230, + "time_per_iteration": 2.8428471088409424 + }, + { + "auxiliary_loss_clip": 0.01498524, + "auxiliary_loss_mlp": 0.01048427, + "balance_loss_clip": 1.31126952, + "balance_loss_mlp": 1.02305877, + "epoch": 0.37462798737411696, + "flos": 25783536430080.0, + "grad_norm": 1.7185989442059706, + "language_loss": 0.73995811, + "learning_rate": 2.878204417014456e-06, + "loss": 0.76542765, + "num_input_tokens_seen": 133840690, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.25378418, + "step": 6231, + "time_per_iteration": 2.946723699569702 + }, + { + "auxiliary_loss_clip": 0.01495537, + "auxiliary_loss_mlp": 0.01045596, + "balance_loss_clip": 1.30821323, + "balance_loss_mlp": 1.02050209, + "epoch": 0.3746881106267849, + "flos": 16663241825280.0, + "grad_norm": 2.499735754780122, + "language_loss": 0.7457273, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.77113867, + "num_input_tokens_seen": 133858350, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.25085449, + "step": 6232, + "time_per_iteration": 2.8472650051116943 + }, + { + "auxiliary_loss_clip": 0.01497983, + "auxiliary_loss_mlp": 0.01042059, + "balance_loss_clip": 1.31055832, + "balance_loss_mlp": 1.01782417, + "epoch": 0.3747482338794529, + "flos": 26189365455360.0, + "grad_norm": 1.6224972946686627, + "language_loss": 0.77717704, + "learning_rate": 2.877504536769561e-06, + "loss": 0.8025775, + "num_input_tokens_seen": 133879775, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.24243164, + "step": 6233, + "time_per_iteration": 2.962507486343384 + }, + { + "auxiliary_loss_clip": 0.0148819, + "auxiliary_loss_mlp": 0.01045067, + "balance_loss_clip": 1.30305636, + "balance_loss_mlp": 1.02087998, + "epoch": 0.37480835713212085, + "flos": 12028657547520.0, + "grad_norm": 1.9004702594580773, + "language_loss": 0.69540024, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.72073281, + "num_input_tokens_seen": 133898295, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.24194336, + "step": 6234, + "time_per_iteration": 4.338593244552612 + }, + { + "auxiliary_loss_clip": 0.01490764, + "auxiliary_loss_mlp": 0.01045019, + "balance_loss_clip": 1.30716193, + "balance_loss_mlp": 1.02136838, + "epoch": 0.3748684803847888, + "flos": 19687857217920.0, + "grad_norm": 2.0653449654108336, + "language_loss": 0.83401132, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.85936916, + "num_input_tokens_seen": 133915230, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.23657227, + "step": 6235, + "time_per_iteration": 2.8565175533294678 + }, + { + "auxiliary_loss_clip": 0.01492903, + "auxiliary_loss_mlp": 0.01045305, + "balance_loss_clip": 1.30625033, + "balance_loss_mlp": 1.02137995, + "epoch": 0.3749286036374568, + "flos": 20530716197760.0, + "grad_norm": 2.060620757842387, + "language_loss": 0.78777492, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.81315696, + "num_input_tokens_seen": 133934110, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.23950195, + "step": 6236, + "time_per_iteration": 2.879055976867676 + }, + { + "auxiliary_loss_clip": 0.01513617, + "auxiliary_loss_mlp": 0.0104952, + "balance_loss_clip": 1.32349658, + "balance_loss_mlp": 1.02497447, + "epoch": 0.37498872689012475, + "flos": 20714455906560.0, + "grad_norm": 2.4011591754247026, + "language_loss": 0.7453987, + "learning_rate": 2.876104377085234e-06, + "loss": 0.77103007, + "num_input_tokens_seen": 133952395, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.24560547, + "step": 6237, + "time_per_iteration": 2.830744743347168 + }, + { + "auxiliary_loss_clip": 0.015017, + "auxiliary_loss_mlp": 0.01043055, + "balance_loss_clip": 1.31204534, + "balance_loss_mlp": 1.01922488, + "epoch": 0.3750488501427927, + "flos": 21583493642880.0, + "grad_norm": 1.9397113197964924, + "language_loss": 0.93801355, + "learning_rate": 2.8757542540760508e-06, + "loss": 0.96346116, + "num_input_tokens_seen": 133969635, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.23852539, + "step": 6238, + "time_per_iteration": 2.913250684738159 + }, + { + "auxiliary_loss_clip": 0.01490315, + "auxiliary_loss_mlp": 0.01041288, + "balance_loss_clip": 1.30350804, + "balance_loss_mlp": 1.01689756, + "epoch": 0.3751089733954607, + "flos": 15932174042880.0, + "grad_norm": 2.051205418480865, + "language_loss": 0.71657622, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.74189222, + "num_input_tokens_seen": 133987215, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.2442627, + "step": 6239, + "time_per_iteration": 2.826160192489624 + }, + { + "auxiliary_loss_clip": 0.01511967, + "auxiliary_loss_mlp": 0.01048306, + "balance_loss_clip": 1.3225944, + "balance_loss_mlp": 1.02320027, + "epoch": 0.37516909664812864, + "flos": 36298678302720.0, + "grad_norm": 1.6390202687710338, + "language_loss": 0.66141242, + "learning_rate": 2.875053908444895e-06, + "loss": 0.68701512, + "num_input_tokens_seen": 134009250, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.25109863, + "step": 6240, + "time_per_iteration": 4.402130365371704 + }, + { + "auxiliary_loss_clip": 0.01496166, + "auxiliary_loss_mlp": 0.01036799, + "balance_loss_clip": 1.30910492, + "balance_loss_mlp": 1.01262379, + "epoch": 0.3752292199007966, + "flos": 13523622854400.0, + "grad_norm": 2.1455124300871535, + "language_loss": 0.77361584, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.79894549, + "num_input_tokens_seen": 134026875, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.24182129, + "step": 6241, + "time_per_iteration": 4.242900133132935 + }, + { + "auxiliary_loss_clip": 0.01502923, + "auxiliary_loss_mlp": 0.01042853, + "balance_loss_clip": 1.31410551, + "balance_loss_mlp": 1.01793838, + "epoch": 0.3752893431534646, + "flos": 27209358403200.0, + "grad_norm": 9.088576073310552, + "language_loss": 0.84297776, + "learning_rate": 2.874353430085213e-06, + "loss": 0.8684355, + "num_input_tokens_seen": 134047185, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.24902344, + "step": 6242, + "time_per_iteration": 2.929429531097412 + }, + { + "auxiliary_loss_clip": 0.01499395, + "auxiliary_loss_mlp": 0.01046086, + "balance_loss_clip": 1.31042981, + "balance_loss_mlp": 1.02247047, + "epoch": 0.3753494664061326, + "flos": 30019711829760.0, + "grad_norm": 2.5693555863303756, + "language_loss": 0.68382072, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.70927548, + "num_input_tokens_seen": 134067330, + "router_z_loss_clip": 1.89160156, + "router_z_loss_mlp": 0.23596191, + "step": 6243, + "time_per_iteration": 4.361286163330078 + }, + { + "auxiliary_loss_clip": 0.01499037, + "auxiliary_loss_mlp": 0.01045716, + "balance_loss_clip": 1.31063199, + "balance_loss_mlp": 1.01997864, + "epoch": 0.37540958965880056, + "flos": 24472582300800.0, + "grad_norm": 2.037182300578338, + "language_loss": 0.84641457, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.87186205, + "num_input_tokens_seen": 134085525, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.25756836, + "step": 6244, + "time_per_iteration": 2.8807318210601807 + }, + { + "auxiliary_loss_clip": 0.01480778, + "auxiliary_loss_mlp": 0.01035362, + "balance_loss_clip": 1.29865932, + "balance_loss_mlp": 1.01248634, + "epoch": 0.3754697129114685, + "flos": 16517037317760.0, + "grad_norm": 2.366047557475479, + "language_loss": 0.83522654, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.86038798, + "num_input_tokens_seen": 134101855, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.22888184, + "step": 6245, + "time_per_iteration": 2.8794474601745605 + }, + { + "auxiliary_loss_clip": 0.01499212, + "auxiliary_loss_mlp": 0.01043775, + "balance_loss_clip": 1.31103277, + "balance_loss_mlp": 1.01745319, + "epoch": 0.3755298361641365, + "flos": 19400334641280.0, + "grad_norm": 2.176140213940743, + "language_loss": 0.64547455, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.6709044, + "num_input_tokens_seen": 134119360, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.26342773, + "step": 6246, + "time_per_iteration": 2.835742712020874 + }, + { + "auxiliary_loss_clip": 0.01508084, + "auxiliary_loss_mlp": 0.01041571, + "balance_loss_clip": 1.31708026, + "balance_loss_mlp": 1.01676393, + "epoch": 0.37558995941680445, + "flos": 14728531875840.0, + "grad_norm": 1.9616735000033112, + "language_loss": 0.75814724, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.78364378, + "num_input_tokens_seen": 134137475, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.24780273, + "step": 6247, + "time_per_iteration": 2.856764078140259 + }, + { + "auxiliary_loss_clip": 0.01501114, + "auxiliary_loss_mlp": 0.01044327, + "balance_loss_clip": 1.31228328, + "balance_loss_mlp": 1.01938808, + "epoch": 0.3756500826694724, + "flos": 21700035544320.0, + "grad_norm": 2.9936173171274967, + "language_loss": 0.56940192, + "learning_rate": 2.872251199697598e-06, + "loss": 0.59485626, + "num_input_tokens_seen": 134154580, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.24951172, + "step": 6248, + "time_per_iteration": 2.8775525093078613 + }, + { + "auxiliary_loss_clip": 0.01492458, + "auxiliary_loss_mlp": 0.01042469, + "balance_loss_clip": 1.30584836, + "balance_loss_mlp": 1.0180074, + "epoch": 0.3757102059221404, + "flos": 26516956942080.0, + "grad_norm": 2.841584551611176, + "language_loss": 0.84677184, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.8721211, + "num_input_tokens_seen": 134174285, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.24450684, + "step": 6249, + "time_per_iteration": 2.968430995941162 + }, + { + "auxiliary_loss_clip": 0.01505676, + "auxiliary_loss_mlp": 0.01042261, + "balance_loss_clip": 1.31769633, + "balance_loss_mlp": 1.01670289, + "epoch": 0.37577032917480835, + "flos": 37351184279040.0, + "grad_norm": 1.5449061588489856, + "language_loss": 0.69005388, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.7155332, + "num_input_tokens_seen": 134195940, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.2557373, + "step": 6250, + "time_per_iteration": 3.0259289741516113 + }, + { + "auxiliary_loss_clip": 0.01499781, + "auxiliary_loss_mlp": 0.01042906, + "balance_loss_clip": 1.31186461, + "balance_loss_mlp": 1.01923096, + "epoch": 0.3758304524274763, + "flos": 21918731500800.0, + "grad_norm": 1.9476038553374257, + "language_loss": 0.78868824, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.81411511, + "num_input_tokens_seen": 134212235, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.23681641, + "step": 6251, + "time_per_iteration": 2.9016366004943848 + }, + { + "auxiliary_loss_clip": 0.01492817, + "auxiliary_loss_mlp": 0.01045424, + "balance_loss_clip": 1.30726039, + "balance_loss_mlp": 1.02117658, + "epoch": 0.3758905756801443, + "flos": 36581585909760.0, + "grad_norm": 4.087154238135045, + "language_loss": 0.5859949, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.6113773, + "num_input_tokens_seen": 134233810, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.24255371, + "step": 6252, + "time_per_iteration": 3.012312173843384 + }, + { + "auxiliary_loss_clip": 0.01508212, + "auxiliary_loss_mlp": 0.01041944, + "balance_loss_clip": 1.31755543, + "balance_loss_mlp": 1.0177443, + "epoch": 0.37595069893281224, + "flos": 24537834581760.0, + "grad_norm": 1.7310930017170976, + "language_loss": 0.89677149, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.92227304, + "num_input_tokens_seen": 134252020, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.24194336, + "step": 6253, + "time_per_iteration": 2.8733460903167725 + }, + { + "auxiliary_loss_clip": 0.01495845, + "auxiliary_loss_mlp": 0.01041677, + "balance_loss_clip": 1.31232488, + "balance_loss_mlp": 1.01732278, + "epoch": 0.3760108221854802, + "flos": 16443550483200.0, + "grad_norm": 2.636902154729331, + "language_loss": 0.7733866, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.79876184, + "num_input_tokens_seen": 134269495, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.24365234, + "step": 6254, + "time_per_iteration": 2.963459014892578 + }, + { + "auxiliary_loss_clip": 0.0150739, + "auxiliary_loss_mlp": 0.01042593, + "balance_loss_clip": 1.31858468, + "balance_loss_mlp": 1.0186795, + "epoch": 0.37607094543814823, + "flos": 13779627788160.0, + "grad_norm": 2.3303713474248915, + "language_loss": 0.63347876, + "learning_rate": 2.869797092829169e-06, + "loss": 0.65897858, + "num_input_tokens_seen": 134287035, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.23937988, + "step": 6255, + "time_per_iteration": 2.8558382987976074 + }, + { + "auxiliary_loss_clip": 0.01513555, + "auxiliary_loss_mlp": 0.01048509, + "balance_loss_clip": 1.32242918, + "balance_loss_mlp": 1.0233916, + "epoch": 0.3761310686908162, + "flos": 19865579368320.0, + "grad_norm": 2.725237920363147, + "language_loss": 0.7519393, + "learning_rate": 2.869446374096135e-06, + "loss": 0.77755994, + "num_input_tokens_seen": 134304840, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.25097656, + "step": 6256, + "time_per_iteration": 2.8765554428100586 + }, + { + "auxiliary_loss_clip": 0.01503184, + "auxiliary_loss_mlp": 0.01046039, + "balance_loss_clip": 1.31324291, + "balance_loss_mlp": 1.02028954, + "epoch": 0.37619119194348416, + "flos": 12758594209920.0, + "grad_norm": 1.7616335621484713, + "language_loss": 0.72244978, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.74794197, + "num_input_tokens_seen": 134323180, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.25769043, + "step": 6257, + "time_per_iteration": 2.8745737075805664 + }, + { + "auxiliary_loss_clip": 0.01495588, + "auxiliary_loss_mlp": 0.01041748, + "balance_loss_clip": 1.30758643, + "balance_loss_mlp": 1.01839519, + "epoch": 0.3762513151961521, + "flos": 17539609219200.0, + "grad_norm": 1.6800473014014758, + "language_loss": 0.85162795, + "learning_rate": 2.868744837734889e-06, + "loss": 0.87700129, + "num_input_tokens_seen": 134341390, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.23364258, + "step": 6258, + "time_per_iteration": 2.8813345432281494 + }, + { + "auxiliary_loss_clip": 0.01501426, + "auxiliary_loss_mlp": 0.01045768, + "balance_loss_clip": 1.31591177, + "balance_loss_mlp": 1.02218878, + "epoch": 0.3763114384488201, + "flos": 23626420450560.0, + "grad_norm": 1.424318213160496, + "language_loss": 0.81498766, + "learning_rate": 2.868394020133277e-06, + "loss": 0.84045964, + "num_input_tokens_seen": 134360425, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.23608398, + "step": 6259, + "time_per_iteration": 2.8640594482421875 + }, + { + "auxiliary_loss_clip": 0.01505292, + "auxiliary_loss_mlp": 0.01051701, + "balance_loss_clip": 1.31283426, + "balance_loss_mlp": 1.02656019, + "epoch": 0.37637156170148806, + "flos": 25416916663680.0, + "grad_norm": 2.4918341229501424, + "language_loss": 0.72519898, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.7507689, + "num_input_tokens_seen": 134379775, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.25109863, + "step": 6260, + "time_per_iteration": 2.957181453704834 + }, + { + "auxiliary_loss_clip": 0.01519347, + "auxiliary_loss_mlp": 0.01048629, + "balance_loss_clip": 1.32627857, + "balance_loss_mlp": 1.02462053, + "epoch": 0.376431684954156, + "flos": 23451412988160.0, + "grad_norm": 1.7077504175954468, + "language_loss": 0.79048055, + "learning_rate": 2.867692286154594e-06, + "loss": 0.81616032, + "num_input_tokens_seen": 134400315, + "router_z_loss_clip": 1.93066406, + "router_z_loss_mlp": 0.23999023, + "step": 6261, + "time_per_iteration": 2.8986258506774902 + }, + { + "auxiliary_loss_clip": 0.01514263, + "auxiliary_loss_mlp": 0.01053642, + "balance_loss_clip": 1.32284045, + "balance_loss_mlp": 1.0292871, + "epoch": 0.376491808206824, + "flos": 34217854335360.0, + "grad_norm": 1.8695750093158916, + "language_loss": 0.81270945, + "learning_rate": 2.867341369804132e-06, + "loss": 0.83838856, + "num_input_tokens_seen": 134422875, + "router_z_loss_clip": 1.91308594, + "router_z_loss_mlp": 0.24353027, + "step": 6262, + "time_per_iteration": 2.973497152328491 + }, + { + "auxiliary_loss_clip": 0.01494072, + "auxiliary_loss_mlp": 0.01045392, + "balance_loss_clip": 1.30748057, + "balance_loss_mlp": 1.02201557, + "epoch": 0.37655193145949195, + "flos": 35198276065920.0, + "grad_norm": 1.850487327649705, + "language_loss": 0.81328011, + "learning_rate": 2.866990420563998e-06, + "loss": 0.83867478, + "num_input_tokens_seen": 134443025, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.23376465, + "step": 6263, + "time_per_iteration": 2.972407579421997 + }, + { + "auxiliary_loss_clip": 0.01506651, + "auxiliary_loss_mlp": 0.01054547, + "balance_loss_clip": 1.31859446, + "balance_loss_mlp": 1.03149199, + "epoch": 0.3766120547121599, + "flos": 16770056094720.0, + "grad_norm": 1.7327716063353824, + "language_loss": 0.80566013, + "learning_rate": 2.866639438447501e-06, + "loss": 0.83127213, + "num_input_tokens_seen": 134460945, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.23059082, + "step": 6264, + "time_per_iteration": 2.863201856613159 + }, + { + "auxiliary_loss_clip": 0.01492386, + "auxiliary_loss_mlp": 0.01052342, + "balance_loss_clip": 1.30497432, + "balance_loss_mlp": 1.02878618, + "epoch": 0.3766721779648279, + "flos": 23560896700800.0, + "grad_norm": 2.4227128787627223, + "language_loss": 0.74706584, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.77251315, + "num_input_tokens_seen": 134480440, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.2355957, + "step": 6265, + "time_per_iteration": 2.8631815910339355 + }, + { + "auxiliary_loss_clip": 0.01486504, + "auxiliary_loss_mlp": 0.01056211, + "balance_loss_clip": 1.30424809, + "balance_loss_mlp": 1.03239298, + "epoch": 0.37673230121749585, + "flos": 29140267789440.0, + "grad_norm": 1.7320405620081623, + "language_loss": 0.68791664, + "learning_rate": 2.865937375638654e-06, + "loss": 0.7133438, + "num_input_tokens_seen": 134501110, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.23840332, + "step": 6266, + "time_per_iteration": 2.9923810958862305 + }, + { + "auxiliary_loss_clip": 0.01523977, + "auxiliary_loss_mlp": 0.01058111, + "balance_loss_clip": 1.32972884, + "balance_loss_mlp": 1.03311324, + "epoch": 0.3767924244701638, + "flos": 28158307735680.0, + "grad_norm": 2.840935502202564, + "language_loss": 0.63225144, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.65807235, + "num_input_tokens_seen": 134522460, + "router_z_loss_clip": 1.94433594, + "router_z_loss_mlp": 0.25, + "step": 6267, + "time_per_iteration": 2.9897868633270264 + }, + { + "auxiliary_loss_clip": 0.01293687, + "auxiliary_loss_mlp": 0.01035066, + "balance_loss_clip": 1.17374337, + "balance_loss_mlp": 1.01398969, + "epoch": 0.37685254772283183, + "flos": 60825536380800.0, + "grad_norm": 0.7390293407275279, + "language_loss": 0.58926851, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.61255604, + "num_input_tokens_seen": 134589545, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.2109375, + "step": 6268, + "time_per_iteration": 3.495623826980591 + }, + { + "auxiliary_loss_clip": 0.01498132, + "auxiliary_loss_mlp": 0.01042953, + "balance_loss_clip": 1.30958843, + "balance_loss_mlp": 1.01903975, + "epoch": 0.3769126709754998, + "flos": 26044020599040.0, + "grad_norm": 1.5895486098531133, + "language_loss": 0.65256011, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.67797101, + "num_input_tokens_seen": 134610550, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.23925781, + "step": 6269, + "time_per_iteration": 4.404542684555054 + }, + { + "auxiliary_loss_clip": 0.0149016, + "auxiliary_loss_mlp": 0.01049225, + "balance_loss_clip": 1.3066113, + "balance_loss_mlp": 1.02506089, + "epoch": 0.37697279422816776, + "flos": 23588613780480.0, + "grad_norm": 1.7153717506849646, + "language_loss": 0.71803874, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.74343258, + "num_input_tokens_seen": 134630485, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.24169922, + "step": 6270, + "time_per_iteration": 3.0313608646392822 + }, + { + "auxiliary_loss_clip": 0.01299885, + "auxiliary_loss_mlp": 0.01038664, + "balance_loss_clip": 1.17728877, + "balance_loss_mlp": 1.01577556, + "epoch": 0.3770329174808357, + "flos": 64777446829440.0, + "grad_norm": 1.0412935320573684, + "language_loss": 0.56066954, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.58405501, + "num_input_tokens_seen": 134693510, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.22851562, + "step": 6271, + "time_per_iteration": 3.3044159412384033 + }, + { + "auxiliary_loss_clip": 0.01486381, + "auxiliary_loss_mlp": 0.01047268, + "balance_loss_clip": 1.30220902, + "balance_loss_mlp": 1.02280641, + "epoch": 0.3770930407335037, + "flos": 21845335155840.0, + "grad_norm": 1.737235804307271, + "language_loss": 0.8045187, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.8298552, + "num_input_tokens_seen": 134713115, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.24475098, + "step": 6272, + "time_per_iteration": 2.9144816398620605 + }, + { + "auxiliary_loss_clip": 0.0148178, + "auxiliary_loss_mlp": 0.0104436, + "balance_loss_clip": 1.29897988, + "balance_loss_mlp": 1.02153158, + "epoch": 0.37715316398617166, + "flos": 22758287610240.0, + "grad_norm": 1.863581001924852, + "language_loss": 0.74959183, + "learning_rate": 2.863479122159103e-06, + "loss": 0.77485323, + "num_input_tokens_seen": 134732635, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.22839355, + "step": 6273, + "time_per_iteration": 2.974905014038086 + }, + { + "auxiliary_loss_clip": 0.01505338, + "auxiliary_loss_mlp": 0.01052399, + "balance_loss_clip": 1.32060742, + "balance_loss_mlp": 1.02935612, + "epoch": 0.3772132872388396, + "flos": 18923371511040.0, + "grad_norm": 1.9128324775568435, + "language_loss": 0.72273326, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.74831057, + "num_input_tokens_seen": 134750695, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.23010254, + "step": 6274, + "time_per_iteration": 2.8780627250671387 + }, + { + "auxiliary_loss_clip": 0.01501047, + "auxiliary_loss_mlp": 0.01045013, + "balance_loss_clip": 1.31187022, + "balance_loss_mlp": 1.02201772, + "epoch": 0.3772734104915076, + "flos": 17354738390400.0, + "grad_norm": 2.0471710890898214, + "language_loss": 0.84758896, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.8730495, + "num_input_tokens_seen": 134768935, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.22998047, + "step": 6275, + "time_per_iteration": 4.233761310577393 + }, + { + "auxiliary_loss_clip": 0.01476912, + "auxiliary_loss_mlp": 0.01041837, + "balance_loss_clip": 1.2964282, + "balance_loss_mlp": 1.01830506, + "epoch": 0.37733353374417555, + "flos": 32354911918080.0, + "grad_norm": 1.6634433540517763, + "language_loss": 0.76099259, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.78618008, + "num_input_tokens_seen": 134791260, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.23547363, + "step": 6276, + "time_per_iteration": 4.4118475914001465 + }, + { + "auxiliary_loss_clip": 0.01509437, + "auxiliary_loss_mlp": 0.01044416, + "balance_loss_clip": 1.32188106, + "balance_loss_mlp": 1.01973939, + "epoch": 0.3773936569968435, + "flos": 23369284396800.0, + "grad_norm": 1.9498564572269774, + "language_loss": 0.8634991, + "learning_rate": 2.862073685241366e-06, + "loss": 0.88903761, + "num_input_tokens_seen": 134808350, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.2467041, + "step": 6277, + "time_per_iteration": 2.8479421138763428 + }, + { + "auxiliary_loss_clip": 0.01488072, + "auxiliary_loss_mlp": 0.01045693, + "balance_loss_clip": 1.30706799, + "balance_loss_mlp": 1.0218873, + "epoch": 0.3774537802495115, + "flos": 21475774477440.0, + "grad_norm": 2.150628373204088, + "language_loss": 0.78836995, + "learning_rate": 2.861722244253818e-06, + "loss": 0.81370759, + "num_input_tokens_seen": 134826005, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.23803711, + "step": 6278, + "time_per_iteration": 4.2657694816589355 + }, + { + "auxiliary_loss_clip": 0.01512862, + "auxiliary_loss_mlp": 0.01047577, + "balance_loss_clip": 1.32320523, + "balance_loss_mlp": 1.02323401, + "epoch": 0.37751390350217945, + "flos": 24984727902720.0, + "grad_norm": 2.4849684982907547, + "language_loss": 0.83440816, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.86001253, + "num_input_tokens_seen": 134844995, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.2434082, + "step": 6279, + "time_per_iteration": 2.95794677734375 + }, + { + "auxiliary_loss_clip": 0.0149884, + "auxiliary_loss_mlp": 0.01046018, + "balance_loss_clip": 1.31184363, + "balance_loss_mlp": 1.02371395, + "epoch": 0.3775740267548474, + "flos": 27830716248960.0, + "grad_norm": 1.7448450091451106, + "language_loss": 0.75343734, + "learning_rate": 2.861019264262269e-06, + "loss": 0.77888596, + "num_input_tokens_seen": 134865285, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.22302246, + "step": 6280, + "time_per_iteration": 2.9447779655456543 + }, + { + "auxiliary_loss_clip": 0.01489106, + "auxiliary_loss_mlp": 0.0104295, + "balance_loss_clip": 1.30833101, + "balance_loss_mlp": 1.02021646, + "epoch": 0.3776341500075154, + "flos": 22575498042240.0, + "grad_norm": 1.3958221733913045, + "language_loss": 0.76732111, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.79264164, + "num_input_tokens_seen": 134886535, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.22729492, + "step": 6281, + "time_per_iteration": 2.971968173980713 + }, + { + "auxiliary_loss_clip": 0.01484246, + "auxiliary_loss_mlp": 0.01037478, + "balance_loss_clip": 1.29872, + "balance_loss_mlp": 1.01392221, + "epoch": 0.3776942732601834, + "flos": 23087960357760.0, + "grad_norm": 1.9573894619322283, + "language_loss": 0.84761548, + "learning_rate": 2.860316153670974e-06, + "loss": 0.87283266, + "num_input_tokens_seen": 134907435, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.23535156, + "step": 6282, + "time_per_iteration": 2.904425859451294 + }, + { + "auxiliary_loss_clip": 0.01478819, + "auxiliary_loss_mlp": 0.01039027, + "balance_loss_clip": 1.29811502, + "balance_loss_mlp": 1.01557899, + "epoch": 0.37775439651285136, + "flos": 21734041651200.0, + "grad_norm": 1.6829077469904408, + "language_loss": 0.70802069, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.73319912, + "num_input_tokens_seen": 134925360, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.23461914, + "step": 6283, + "time_per_iteration": 2.870537519454956 + }, + { + "auxiliary_loss_clip": 0.01491939, + "auxiliary_loss_mlp": 0.01041097, + "balance_loss_clip": 1.30774665, + "balance_loss_mlp": 1.01752901, + "epoch": 0.37781451976551933, + "flos": 23998288613760.0, + "grad_norm": 3.0054314339758585, + "language_loss": 0.7769891, + "learning_rate": 2.859612912586581e-06, + "loss": 0.80231947, + "num_input_tokens_seen": 134944205, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.23583984, + "step": 6284, + "time_per_iteration": 2.9045557975769043 + }, + { + "auxiliary_loss_clip": 0.01506584, + "auxiliary_loss_mlp": 0.0104137, + "balance_loss_clip": 1.31633782, + "balance_loss_mlp": 1.01581132, + "epoch": 0.3778746430181873, + "flos": 13733948522880.0, + "grad_norm": 2.108059479660192, + "language_loss": 0.86797404, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.8934536, + "num_input_tokens_seen": 134960255, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.2557373, + "step": 6285, + "time_per_iteration": 2.838348627090454 + }, + { + "auxiliary_loss_clip": 0.01503446, + "auxiliary_loss_mlp": 0.01039494, + "balance_loss_clip": 1.31564879, + "balance_loss_mlp": 1.01450801, + "epoch": 0.37793476627085526, + "flos": 19469070771840.0, + "grad_norm": 1.83376686348102, + "language_loss": 0.8514328, + "learning_rate": 2.858909541115758e-06, + "loss": 0.87686223, + "num_input_tokens_seen": 134978605, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.24975586, + "step": 6286, + "time_per_iteration": 2.849339246749878 + }, + { + "auxiliary_loss_clip": 0.01500518, + "auxiliary_loss_mlp": 0.0104328, + "balance_loss_clip": 1.31412864, + "balance_loss_mlp": 1.01943803, + "epoch": 0.3779948895235232, + "flos": 10714129079040.0, + "grad_norm": 2.2508748612862925, + "language_loss": 0.8267011, + "learning_rate": 2.858557806518775e-06, + "loss": 0.85213906, + "num_input_tokens_seen": 134995020, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.23864746, + "step": 6287, + "time_per_iteration": 2.904369592666626 + }, + { + "auxiliary_loss_clip": 0.01493647, + "auxiliary_loss_mlp": 0.01040109, + "balance_loss_clip": 1.30965114, + "balance_loss_mlp": 1.01687515, + "epoch": 0.3780550127761912, + "flos": 22320262270080.0, + "grad_norm": 2.523627155133825, + "language_loss": 0.7459048, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.77124238, + "num_input_tokens_seen": 135012620, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.23217773, + "step": 6288, + "time_per_iteration": 2.9245071411132812 + }, + { + "auxiliary_loss_clip": 0.01492659, + "auxiliary_loss_mlp": 0.01040577, + "balance_loss_clip": 1.30845451, + "balance_loss_mlp": 1.01768887, + "epoch": 0.37811513602885916, + "flos": 28962681373440.0, + "grad_norm": 3.101994682723379, + "language_loss": 0.76464105, + "learning_rate": 2.857854239668352e-06, + "loss": 0.78997338, + "num_input_tokens_seen": 135033365, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.22912598, + "step": 6289, + "time_per_iteration": 2.9749672412872314 + }, + { + "auxiliary_loss_clip": 0.01492654, + "auxiliary_loss_mlp": 0.01038132, + "balance_loss_clip": 1.30865836, + "balance_loss_mlp": 1.0149101, + "epoch": 0.3781752592815271, + "flos": 23123459543040.0, + "grad_norm": 1.9083296580109173, + "language_loss": 0.75318021, + "learning_rate": 2.857502407441593e-06, + "loss": 0.77848804, + "num_input_tokens_seen": 135052185, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.23217773, + "step": 6290, + "time_per_iteration": 2.930063486099243 + }, + { + "auxiliary_loss_clip": 0.01511063, + "auxiliary_loss_mlp": 0.01044292, + "balance_loss_clip": 1.32038283, + "balance_loss_mlp": 1.01928186, + "epoch": 0.3782353825341951, + "flos": 19765506574080.0, + "grad_norm": 2.378515784087982, + "language_loss": 0.81079257, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.83634615, + "num_input_tokens_seen": 135070425, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.24987793, + "step": 6291, + "time_per_iteration": 2.836290121078491 + }, + { + "auxiliary_loss_clip": 0.01498644, + "auxiliary_loss_mlp": 0.01040227, + "balance_loss_clip": 1.31072056, + "balance_loss_mlp": 1.01639724, + "epoch": 0.37829550578686305, + "flos": 22060049569920.0, + "grad_norm": 1.7275238160964965, + "language_loss": 0.77105832, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.79644704, + "num_input_tokens_seen": 135090525, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.23828125, + "step": 6292, + "time_per_iteration": 2.865901231765747 + }, + { + "auxiliary_loss_clip": 0.01489913, + "auxiliary_loss_mlp": 0.01044535, + "balance_loss_clip": 1.30530286, + "balance_loss_mlp": 1.02052605, + "epoch": 0.378355629039531, + "flos": 16478597220480.0, + "grad_norm": 2.6144260879290506, + "language_loss": 0.71318614, + "learning_rate": 2.856446715715224e-06, + "loss": 0.73853058, + "num_input_tokens_seen": 135109575, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.24023438, + "step": 6293, + "time_per_iteration": 2.8297388553619385 + }, + { + "auxiliary_loss_clip": 0.01479412, + "auxiliary_loss_mlp": 0.01041685, + "balance_loss_clip": 1.29755437, + "balance_loss_mlp": 1.0184747, + "epoch": 0.378415752292199, + "flos": 19984473999360.0, + "grad_norm": 2.77373987588886, + "language_loss": 0.71909189, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.74430287, + "num_input_tokens_seen": 135127000, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.23217773, + "step": 6294, + "time_per_iteration": 2.8517003059387207 + }, + { + "auxiliary_loss_clip": 0.01508409, + "auxiliary_loss_mlp": 0.01039868, + "balance_loss_clip": 1.31623173, + "balance_loss_mlp": 1.01641965, + "epoch": 0.378475875544867, + "flos": 14655497489280.0, + "grad_norm": 2.379615641316863, + "language_loss": 0.84536791, + "learning_rate": 2.855742758826011e-06, + "loss": 0.87085068, + "num_input_tokens_seen": 135145285, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.23449707, + "step": 6295, + "time_per_iteration": 2.9297478199005127 + }, + { + "auxiliary_loss_clip": 0.01486794, + "auxiliary_loss_mlp": 0.01038907, + "balance_loss_clip": 1.30196357, + "balance_loss_mlp": 1.0160898, + "epoch": 0.37853599879753497, + "flos": 26662166064000.0, + "grad_norm": 1.851365068893925, + "language_loss": 0.72114676, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.74640375, + "num_input_tokens_seen": 135165240, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.22790527, + "step": 6296, + "time_per_iteration": 2.9938693046569824 + }, + { + "auxiliary_loss_clip": 0.01471678, + "auxiliary_loss_mlp": 0.01047245, + "balance_loss_clip": 1.29386497, + "balance_loss_mlp": 1.02290213, + "epoch": 0.37859612205020293, + "flos": 17321048997120.0, + "grad_norm": 1.7775700553814144, + "language_loss": 0.77849269, + "learning_rate": 2.855038672137396e-06, + "loss": 0.80368185, + "num_input_tokens_seen": 135184045, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.2434082, + "step": 6297, + "time_per_iteration": 2.8632941246032715 + }, + { + "auxiliary_loss_clip": 0.01480756, + "auxiliary_loss_mlp": 0.01047803, + "balance_loss_clip": 1.296556, + "balance_loss_mlp": 1.02416444, + "epoch": 0.3786562453028709, + "flos": 18228753054720.0, + "grad_norm": 1.7659371238271087, + "language_loss": 0.80089307, + "learning_rate": 2.854686580151684e-06, + "loss": 0.82617855, + "num_input_tokens_seen": 135202365, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.23632812, + "step": 6298, + "time_per_iteration": 2.848825454711914 + }, + { + "auxiliary_loss_clip": 0.01476184, + "auxiliary_loss_mlp": 0.01043691, + "balance_loss_clip": 1.29573655, + "balance_loss_mlp": 1.02135086, + "epoch": 0.37871636855553886, + "flos": 21224701226880.0, + "grad_norm": 1.76346846229619, + "language_loss": 0.85384357, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.87904227, + "num_input_tokens_seen": 135220955, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.22338867, + "step": 6299, + "time_per_iteration": 2.8439624309539795 + }, + { + "auxiliary_loss_clip": 0.01489216, + "auxiliary_loss_mlp": 0.0104883, + "balance_loss_clip": 1.30369997, + "balance_loss_mlp": 1.02470231, + "epoch": 0.3787764918082068, + "flos": 20960733208320.0, + "grad_norm": 2.0882033785166025, + "language_loss": 0.76774424, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.79312468, + "num_input_tokens_seen": 135239715, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.24121094, + "step": 6300, + "time_per_iteration": 2.8400354385375977 + }, + { + "auxiliary_loss_clip": 0.01522195, + "auxiliary_loss_mlp": 0.01053403, + "balance_loss_clip": 1.33129478, + "balance_loss_mlp": 1.02776134, + "epoch": 0.3788366150608748, + "flos": 17316072069120.0, + "grad_norm": 1.9763783903963708, + "language_loss": 0.83410978, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.85986567, + "num_input_tokens_seen": 135257035, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.25646973, + "step": 6301, + "time_per_iteration": 2.8780391216278076 + }, + { + "auxiliary_loss_clip": 0.0148689, + "auxiliary_loss_mlp": 0.0104994, + "balance_loss_clip": 1.30301666, + "balance_loss_mlp": 1.02485788, + "epoch": 0.37889673831354276, + "flos": 24320405479680.0, + "grad_norm": 1.846665985209703, + "language_loss": 0.68111312, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.70648146, + "num_input_tokens_seen": 135275720, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.25085449, + "step": 6302, + "time_per_iteration": 2.8968801498413086 + }, + { + "auxiliary_loss_clip": 0.01486866, + "auxiliary_loss_mlp": 0.01051439, + "balance_loss_clip": 1.30341995, + "balance_loss_mlp": 1.02704859, + "epoch": 0.3789568615662107, + "flos": 26693774196480.0, + "grad_norm": 1.837946965443948, + "language_loss": 0.693416, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.718799, + "num_input_tokens_seen": 135294140, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.24401855, + "step": 6303, + "time_per_iteration": 2.9194233417510986 + }, + { + "auxiliary_loss_clip": 0.01496156, + "auxiliary_loss_mlp": 0.01049258, + "balance_loss_clip": 1.3100872, + "balance_loss_mlp": 1.02621496, + "epoch": 0.3790169848188787, + "flos": 23595310010880.0, + "grad_norm": 1.6210615702942242, + "language_loss": 0.78282309, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.80827725, + "num_input_tokens_seen": 135314845, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.23034668, + "step": 6304, + "time_per_iteration": 4.380652189254761 + }, + { + "auxiliary_loss_clip": 0.01508922, + "auxiliary_loss_mlp": 0.01052566, + "balance_loss_clip": 1.32090342, + "balance_loss_mlp": 1.02840185, + "epoch": 0.37907710807154665, + "flos": 18445367750400.0, + "grad_norm": 2.3062887680939808, + "language_loss": 0.81303567, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.83865052, + "num_input_tokens_seen": 135333055, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.24169922, + "step": 6305, + "time_per_iteration": 2.8929603099823 + }, + { + "auxiliary_loss_clip": 0.01301115, + "auxiliary_loss_mlp": 0.01036951, + "balance_loss_clip": 1.18528533, + "balance_loss_mlp": 1.01330018, + "epoch": 0.3791372313242146, + "flos": 50132808092160.0, + "grad_norm": 0.9779997288408186, + "language_loss": 0.64549124, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.66887194, + "num_input_tokens_seen": 135387865, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.23632812, + "step": 6306, + "time_per_iteration": 3.35331654548645 + }, + { + "auxiliary_loss_clip": 0.01483497, + "auxiliary_loss_mlp": 0.01050713, + "balance_loss_clip": 1.29993582, + "balance_loss_mlp": 1.02638268, + "epoch": 0.3791973545768826, + "flos": 24327282689280.0, + "grad_norm": 1.6372744533336887, + "language_loss": 0.74138665, + "learning_rate": 2.851516295441817e-06, + "loss": 0.76672882, + "num_input_tokens_seen": 135409095, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.24316406, + "step": 6307, + "time_per_iteration": 2.974104404449463 + }, + { + "auxiliary_loss_clip": 0.01497025, + "auxiliary_loss_mlp": 0.01046581, + "balance_loss_clip": 1.31253469, + "balance_loss_mlp": 1.02245307, + "epoch": 0.3792574778295506, + "flos": 21589873159680.0, + "grad_norm": 1.5694221928728536, + "language_loss": 0.78825802, + "learning_rate": 2.851163879959112e-06, + "loss": 0.81369412, + "num_input_tokens_seen": 135429585, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.24133301, + "step": 6308, + "time_per_iteration": 2.964582681655884 + }, + { + "auxiliary_loss_clip": 0.01479545, + "auxiliary_loss_mlp": 0.01045129, + "balance_loss_clip": 1.29441297, + "balance_loss_mlp": 1.02163315, + "epoch": 0.37931760108221857, + "flos": 22282772313600.0, + "grad_norm": 2.151287477825613, + "language_loss": 0.73140073, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.75664753, + "num_input_tokens_seen": 135446320, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.23510742, + "step": 6309, + "time_per_iteration": 2.9911539554595947 + }, + { + "auxiliary_loss_clip": 0.0148185, + "auxiliary_loss_mlp": 0.01044617, + "balance_loss_clip": 1.29965687, + "balance_loss_mlp": 1.02135921, + "epoch": 0.37937772433488653, + "flos": 19692562677120.0, + "grad_norm": 1.4037327349070574, + "language_loss": 0.79247475, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.81773943, + "num_input_tokens_seen": 135465720, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.23242188, + "step": 6310, + "time_per_iteration": 4.351290941238403 + }, + { + "auxiliary_loss_clip": 0.0148257, + "auxiliary_loss_mlp": 0.01043283, + "balance_loss_clip": 1.29979253, + "balance_loss_mlp": 1.01948869, + "epoch": 0.3794378475875545, + "flos": 19108830522240.0, + "grad_norm": 1.7913933566749083, + "language_loss": 0.77620691, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.80146545, + "num_input_tokens_seen": 135485155, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.23791504, + "step": 6311, + "time_per_iteration": 4.268390655517578 + }, + { + "auxiliary_loss_clip": 0.01476927, + "auxiliary_loss_mlp": 0.01042721, + "balance_loss_clip": 1.29531479, + "balance_loss_mlp": 1.02009523, + "epoch": 0.37949797084022246, + "flos": 20349374463360.0, + "grad_norm": 1.5414059300766683, + "language_loss": 0.7104162, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.73561263, + "num_input_tokens_seen": 135502675, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.22607422, + "step": 6312, + "time_per_iteration": 2.8651866912841797 + }, + { + "auxiliary_loss_clip": 0.01292442, + "auxiliary_loss_mlp": 0.01030988, + "balance_loss_clip": 1.17888284, + "balance_loss_mlp": 1.00466657, + "epoch": 0.37955809409289043, + "flos": 64001559432960.0, + "grad_norm": 0.7779416941232136, + "language_loss": 0.56191075, + "learning_rate": 2.849401318669608e-06, + "loss": 0.58514506, + "num_input_tokens_seen": 135562005, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.26367188, + "step": 6313, + "time_per_iteration": 4.712616443634033 + }, + { + "auxiliary_loss_clip": 0.01481284, + "auxiliary_loss_mlp": 0.01043183, + "balance_loss_clip": 1.2986573, + "balance_loss_mlp": 1.01990175, + "epoch": 0.3796182173455584, + "flos": 31553252968320.0, + "grad_norm": 2.69652168432199, + "language_loss": 0.72442245, + "learning_rate": 2.849048709730083e-06, + "loss": 0.74966711, + "num_input_tokens_seen": 135582600, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.23266602, + "step": 6314, + "time_per_iteration": 2.931257486343384 + }, + { + "auxiliary_loss_clip": 0.01485669, + "auxiliary_loss_mlp": 0.01042285, + "balance_loss_clip": 1.29875124, + "balance_loss_mlp": 1.01882434, + "epoch": 0.37967834059822636, + "flos": 12138503218560.0, + "grad_norm": 3.9492798174339825, + "language_loss": 0.74301887, + "learning_rate": 2.848696068594545e-06, + "loss": 0.76829839, + "num_input_tokens_seen": 135600280, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.23461914, + "step": 6315, + "time_per_iteration": 2.827350616455078 + }, + { + "auxiliary_loss_clip": 0.0148523, + "auxiliary_loss_mlp": 0.01043911, + "balance_loss_clip": 1.30392504, + "balance_loss_mlp": 1.020594, + "epoch": 0.3797384638508943, + "flos": 39363181626240.0, + "grad_norm": 1.880523513036837, + "language_loss": 0.71885109, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.74414253, + "num_input_tokens_seen": 135621560, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.2331543, + "step": 6316, + "time_per_iteration": 3.0200612545013428 + }, + { + "auxiliary_loss_clip": 0.01476502, + "auxiliary_loss_mlp": 0.01036798, + "balance_loss_clip": 1.29441857, + "balance_loss_mlp": 1.01422024, + "epoch": 0.3797985871035623, + "flos": 34066175207040.0, + "grad_norm": 1.8390991751123245, + "language_loss": 0.66421998, + "learning_rate": 2.847990689788923e-06, + "loss": 0.68935299, + "num_input_tokens_seen": 135641745, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.22570801, + "step": 6317, + "time_per_iteration": 2.9777965545654297 + }, + { + "auxiliary_loss_clip": 0.01462691, + "auxiliary_loss_mlp": 0.01038939, + "balance_loss_clip": 1.28376007, + "balance_loss_mlp": 1.01627719, + "epoch": 0.37985871035623026, + "flos": 23232671786880.0, + "grad_norm": 2.6903485652199457, + "language_loss": 0.87492704, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.89994335, + "num_input_tokens_seen": 135660650, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.22631836, + "step": 6318, + "time_per_iteration": 2.8897392749786377 + }, + { + "auxiliary_loss_clip": 0.01475997, + "auxiliary_loss_mlp": 0.01047835, + "balance_loss_clip": 1.29273415, + "balance_loss_mlp": 1.02508998, + "epoch": 0.3799188336088982, + "flos": 18124336759680.0, + "grad_norm": 2.0673632539962887, + "language_loss": 0.77809173, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.80333006, + "num_input_tokens_seen": 135679980, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.22741699, + "step": 6319, + "time_per_iteration": 2.8912525177001953 + }, + { + "auxiliary_loss_clip": 0.01473587, + "auxiliary_loss_mlp": 0.01044529, + "balance_loss_clip": 1.29311597, + "balance_loss_mlp": 1.0221417, + "epoch": 0.3799789568615662, + "flos": 21881965461120.0, + "grad_norm": 1.8209029079271601, + "language_loss": 0.64310622, + "learning_rate": 2.846932380444744e-06, + "loss": 0.66828746, + "num_input_tokens_seen": 135699400, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.22387695, + "step": 6320, + "time_per_iteration": 2.887160062789917 + }, + { + "auxiliary_loss_clip": 0.01460315, + "auxiliary_loss_mlp": 0.01039148, + "balance_loss_clip": 1.28033614, + "balance_loss_mlp": 1.01665354, + "epoch": 0.3800390801142342, + "flos": 32975726826240.0, + "grad_norm": 2.061216799892788, + "language_loss": 0.72133195, + "learning_rate": 2.846579546413992e-06, + "loss": 0.74632657, + "num_input_tokens_seen": 135723455, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.22485352, + "step": 6321, + "time_per_iteration": 2.987090587615967 + }, + { + "auxiliary_loss_clip": 0.01478408, + "auxiliary_loss_mlp": 0.01041925, + "balance_loss_clip": 1.29300857, + "balance_loss_mlp": 1.01783299, + "epoch": 0.38009920336690217, + "flos": 26918487711360.0, + "grad_norm": 2.0401838505336145, + "language_loss": 0.75894856, + "learning_rate": 2.846226680280859e-06, + "loss": 0.78415191, + "num_input_tokens_seen": 135744335, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.24108887, + "step": 6322, + "time_per_iteration": 2.9191226959228516 + }, + { + "auxiliary_loss_clip": 0.01468667, + "auxiliary_loss_mlp": 0.01042628, + "balance_loss_clip": 1.28786731, + "balance_loss_mlp": 1.02075326, + "epoch": 0.38015932661957014, + "flos": 22498301134080.0, + "grad_norm": 5.517411947282556, + "language_loss": 0.86031461, + "learning_rate": 2.845873782058725e-06, + "loss": 0.88542753, + "num_input_tokens_seen": 135761440, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.21875, + "step": 6323, + "time_per_iteration": 2.8930017948150635 + }, + { + "auxiliary_loss_clip": 0.01468551, + "auxiliary_loss_mlp": 0.010479, + "balance_loss_clip": 1.28498495, + "balance_loss_mlp": 1.023808, + "epoch": 0.3802194498722381, + "flos": 21990996725760.0, + "grad_norm": 2.841858588498901, + "language_loss": 0.73756182, + "learning_rate": 2.845520851760973e-06, + "loss": 0.76272631, + "num_input_tokens_seen": 135779955, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.2409668, + "step": 6324, + "time_per_iteration": 2.852933645248413 + }, + { + "auxiliary_loss_clip": 0.01476405, + "auxiliary_loss_mlp": 0.01041803, + "balance_loss_clip": 1.29211807, + "balance_loss_mlp": 1.01933169, + "epoch": 0.38027957312490607, + "flos": 21334863611520.0, + "grad_norm": 1.7910219206002573, + "language_loss": 0.85399032, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.87917244, + "num_input_tokens_seen": 135799840, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.22460938, + "step": 6325, + "time_per_iteration": 2.886261463165283 + }, + { + "auxiliary_loss_clip": 0.01467986, + "auxiliary_loss_mlp": 0.01044429, + "balance_loss_clip": 1.28766572, + "balance_loss_mlp": 1.02168345, + "epoch": 0.38033969637757403, + "flos": 16700053109760.0, + "grad_norm": 1.6237585432195174, + "language_loss": 0.80420876, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.82933295, + "num_input_tokens_seen": 135817880, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.22753906, + "step": 6326, + "time_per_iteration": 2.8814303874969482 + }, + { + "auxiliary_loss_clip": 0.01460226, + "auxiliary_loss_mlp": 0.01045291, + "balance_loss_clip": 1.28150141, + "balance_loss_mlp": 1.02234352, + "epoch": 0.380399819630242, + "flos": 36224648530560.0, + "grad_norm": 1.878557043865977, + "language_loss": 0.74190414, + "learning_rate": 2.844461868547842e-06, + "loss": 0.76695931, + "num_input_tokens_seen": 135838940, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.22937012, + "step": 6327, + "time_per_iteration": 2.97017765045166 + }, + { + "auxiliary_loss_clip": 0.01464805, + "auxiliary_loss_mlp": 0.01048025, + "balance_loss_clip": 1.28400671, + "balance_loss_mlp": 1.02457643, + "epoch": 0.38045994288290996, + "flos": 21298957223040.0, + "grad_norm": 1.4578241424633058, + "language_loss": 0.8362661, + "learning_rate": 2.844108810081459e-06, + "loss": 0.86139441, + "num_input_tokens_seen": 135858325, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.23449707, + "step": 6328, + "time_per_iteration": 2.909496784210205 + }, + { + "auxiliary_loss_clip": 0.01453916, + "auxiliary_loss_mlp": 0.01047132, + "balance_loss_clip": 1.27340674, + "balance_loss_mlp": 1.02427959, + "epoch": 0.38052006613557793, + "flos": 20932608925440.0, + "grad_norm": 2.160231016989018, + "language_loss": 0.62428194, + "learning_rate": 2.843755719606385e-06, + "loss": 0.64929247, + "num_input_tokens_seen": 135878430, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.2286377, + "step": 6329, + "time_per_iteration": 2.859618902206421 + }, + { + "auxiliary_loss_clip": 0.01463386, + "auxiliary_loss_mlp": 0.01045756, + "balance_loss_clip": 1.28376389, + "balance_loss_mlp": 1.02326107, + "epoch": 0.3805801893882459, + "flos": 20999128060800.0, + "grad_norm": 1.9589070342352326, + "language_loss": 0.56691611, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.59200746, + "num_input_tokens_seen": 135894755, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.22497559, + "step": 6330, + "time_per_iteration": 2.896533489227295 + }, + { + "auxiliary_loss_clip": 0.01448263, + "auxiliary_loss_mlp": 0.01039317, + "balance_loss_clip": 1.27367938, + "balance_loss_mlp": 1.0186218, + "epoch": 0.38064031264091386, + "flos": 25570043625600.0, + "grad_norm": 1.9569099947078032, + "language_loss": 0.66493446, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.68981028, + "num_input_tokens_seen": 135918275, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.20703125, + "step": 6331, + "time_per_iteration": 2.9298412799835205 + }, + { + "auxiliary_loss_clip": 0.01473406, + "auxiliary_loss_mlp": 0.01048753, + "balance_loss_clip": 1.29232454, + "balance_loss_mlp": 1.02569818, + "epoch": 0.3807004358935818, + "flos": 15094246746240.0, + "grad_norm": 1.6508784327364194, + "language_loss": 0.77519333, + "learning_rate": 2.842696256262919e-06, + "loss": 0.80041486, + "num_input_tokens_seen": 135937430, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.23046875, + "step": 6332, + "time_per_iteration": 2.8665952682495117 + }, + { + "auxiliary_loss_clip": 0.01473626, + "auxiliary_loss_mlp": 0.01047247, + "balance_loss_clip": 1.29052782, + "balance_loss_mlp": 1.02277291, + "epoch": 0.3807605591462498, + "flos": 16408141787520.0, + "grad_norm": 1.7719742929967748, + "language_loss": 0.82400548, + "learning_rate": 2.842343037886987e-06, + "loss": 0.8492142, + "num_input_tokens_seen": 135954210, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.24499512, + "step": 6333, + "time_per_iteration": 2.8203916549682617 + }, + { + "auxiliary_loss_clip": 0.01462649, + "auxiliary_loss_mlp": 0.01040205, + "balance_loss_clip": 1.28264165, + "balance_loss_mlp": 1.01827097, + "epoch": 0.3808206823989178, + "flos": 29068364522880.0, + "grad_norm": 1.5720893709163775, + "language_loss": 0.86965901, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.89468753, + "num_input_tokens_seen": 135974425, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.21936035, + "step": 6334, + "time_per_iteration": 2.9206113815307617 + }, + { + "auxiliary_loss_clip": 0.01464243, + "auxiliary_loss_mlp": 0.01041918, + "balance_loss_clip": 1.28375411, + "balance_loss_mlp": 1.02038836, + "epoch": 0.3808808056515858, + "flos": 15714337737600.0, + "grad_norm": 1.8470292245351432, + "language_loss": 0.80183828, + "learning_rate": 2.841636505323321e-06, + "loss": 0.82689989, + "num_input_tokens_seen": 135991985, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.21520996, + "step": 6335, + "time_per_iteration": 2.8583874702453613 + }, + { + "auxiliary_loss_clip": 0.01474972, + "auxiliary_loss_mlp": 0.01043141, + "balance_loss_clip": 1.29245949, + "balance_loss_mlp": 1.01878679, + "epoch": 0.38094092890425374, + "flos": 20714410661760.0, + "grad_norm": 2.4029274401277623, + "language_loss": 0.73110998, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.75629109, + "num_input_tokens_seen": 136010015, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.24353027, + "step": 6336, + "time_per_iteration": 2.844435930252075 + }, + { + "auxiliary_loss_clip": 0.01466313, + "auxiliary_loss_mlp": 0.01040899, + "balance_loss_clip": 1.28693366, + "balance_loss_mlp": 1.0199182, + "epoch": 0.3810010521569217, + "flos": 20677735111680.0, + "grad_norm": 2.0467374791921062, + "language_loss": 0.70442438, + "learning_rate": 2.840929845099894e-06, + "loss": 0.72949648, + "num_input_tokens_seen": 136028440, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.20983887, + "step": 6337, + "time_per_iteration": 2.8668885231018066 + }, + { + "auxiliary_loss_clip": 0.01462534, + "auxiliary_loss_mlp": 0.01037856, + "balance_loss_clip": 1.28285766, + "balance_loss_mlp": 1.01538563, + "epoch": 0.38106117540958967, + "flos": 31839010997760.0, + "grad_norm": 2.492931007081314, + "language_loss": 0.64546967, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.67047358, + "num_input_tokens_seen": 136048360, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.22473145, + "step": 6338, + "time_per_iteration": 2.9405808448791504 + }, + { + "auxiliary_loss_clip": 0.01481611, + "auxiliary_loss_mlp": 0.01043117, + "balance_loss_clip": 1.29696095, + "balance_loss_mlp": 1.02045548, + "epoch": 0.38112129866225763, + "flos": 16909700106240.0, + "grad_norm": 2.1070997019757636, + "language_loss": 0.70236492, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.7276122, + "num_input_tokens_seen": 136065500, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.22644043, + "step": 6339, + "time_per_iteration": 4.353609085083008 + }, + { + "auxiliary_loss_clip": 0.01461085, + "auxiliary_loss_mlp": 0.01038311, + "balance_loss_clip": 1.28233004, + "balance_loss_mlp": 1.0164361, + "epoch": 0.3811814219149256, + "flos": 20897154984960.0, + "grad_norm": 2.596607238801787, + "language_loss": 0.69215888, + "learning_rate": 2.839869615637177e-06, + "loss": 0.71715283, + "num_input_tokens_seen": 136084060, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.21875, + "step": 6340, + "time_per_iteration": 2.8762898445129395 + }, + { + "auxiliary_loss_clip": 0.01466505, + "auxiliary_loss_mlp": 0.01036916, + "balance_loss_clip": 1.28342152, + "balance_loss_mlp": 1.01477933, + "epoch": 0.38124154516759357, + "flos": 16699510172160.0, + "grad_norm": 1.8116489750737765, + "language_loss": 0.90523756, + "learning_rate": 2.839516142102522e-06, + "loss": 0.93027186, + "num_input_tokens_seen": 136102310, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.22143555, + "step": 6341, + "time_per_iteration": 2.870163679122925 + }, + { + "auxiliary_loss_clip": 0.01477731, + "auxiliary_loss_mlp": 0.01040606, + "balance_loss_clip": 1.29344785, + "balance_loss_mlp": 1.01776552, + "epoch": 0.38130166842026153, + "flos": 19691250577920.0, + "grad_norm": 1.6369134641032932, + "language_loss": 0.75738603, + "learning_rate": 2.83916263673333e-06, + "loss": 0.78256941, + "num_input_tokens_seen": 136120725, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.22839355, + "step": 6342, + "time_per_iteration": 2.8650383949279785 + }, + { + "auxiliary_loss_clip": 0.01460929, + "auxiliary_loss_mlp": 0.01034512, + "balance_loss_clip": 1.27946889, + "balance_loss_mlp": 1.0130899, + "epoch": 0.3813617916729295, + "flos": 22208109114240.0, + "grad_norm": 1.6826225159628223, + "language_loss": 0.84288824, + "learning_rate": 2.838809099543007e-06, + "loss": 0.86784261, + "num_input_tokens_seen": 136139105, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.2142334, + "step": 6343, + "time_per_iteration": 2.8838319778442383 + }, + { + "auxiliary_loss_clip": 0.01474394, + "auxiliary_loss_mlp": 0.01041496, + "balance_loss_clip": 1.29084575, + "balance_loss_mlp": 1.01947832, + "epoch": 0.38142191492559746, + "flos": 19105708631040.0, + "grad_norm": 1.6514449183137576, + "language_loss": 0.78218079, + "learning_rate": 2.838455530544959e-06, + "loss": 0.80733967, + "num_input_tokens_seen": 136158265, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.22009277, + "step": 6344, + "time_per_iteration": 2.835404872894287 + }, + { + "auxiliary_loss_clip": 0.01465735, + "auxiliary_loss_mlp": 0.010451, + "balance_loss_clip": 1.28465557, + "balance_loss_mlp": 1.02332056, + "epoch": 0.3814820381782654, + "flos": 24108450998400.0, + "grad_norm": 2.0604302654532365, + "language_loss": 0.74195874, + "learning_rate": 2.838101929752593e-06, + "loss": 0.76706713, + "num_input_tokens_seen": 136176100, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.21801758, + "step": 6345, + "time_per_iteration": 2.880025863647461 + }, + { + "auxiliary_loss_clip": 0.01471216, + "auxiliary_loss_mlp": 0.01045327, + "balance_loss_clip": 1.29380429, + "balance_loss_mlp": 1.02285624, + "epoch": 0.3815421614309334, + "flos": 15786919676160.0, + "grad_norm": 2.154503130461109, + "language_loss": 0.70894259, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.73410797, + "num_input_tokens_seen": 136195125, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.22473145, + "step": 6346, + "time_per_iteration": 4.262303352355957 + }, + { + "auxiliary_loss_clip": 0.01482377, + "auxiliary_loss_mlp": 0.01038341, + "balance_loss_clip": 1.29789686, + "balance_loss_mlp": 1.01639438, + "epoch": 0.38160228468360136, + "flos": 19908815414400.0, + "grad_norm": 2.043776113635593, + "language_loss": 0.76333809, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.78854531, + "num_input_tokens_seen": 136213885, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.21960449, + "step": 6347, + "time_per_iteration": 4.265958070755005 + }, + { + "auxiliary_loss_clip": 0.01462471, + "auxiliary_loss_mlp": 0.01034335, + "balance_loss_clip": 1.2802335, + "balance_loss_mlp": 1.01292467, + "epoch": 0.3816624079362694, + "flos": 19290353235840.0, + "grad_norm": 1.5027810037054412, + "language_loss": 0.75407803, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.77904612, + "num_input_tokens_seen": 136232700, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.21398926, + "step": 6348, + "time_per_iteration": 4.3215861320495605 + }, + { + "auxiliary_loss_clip": 0.01477944, + "auxiliary_loss_mlp": 0.01037777, + "balance_loss_clip": 1.292539, + "balance_loss_mlp": 1.0164268, + "epoch": 0.38172253118893734, + "flos": 21187166025600.0, + "grad_norm": 1.8083599871118465, + "language_loss": 0.88211155, + "learning_rate": 2.836687208908142e-06, + "loss": 0.90726876, + "num_input_tokens_seen": 136248975, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.21350098, + "step": 6349, + "time_per_iteration": 2.8707215785980225 + }, + { + "auxiliary_loss_clip": 0.01471971, + "auxiliary_loss_mlp": 0.01043299, + "balance_loss_clip": 1.28897691, + "balance_loss_mlp": 1.02079201, + "epoch": 0.3817826544416053, + "flos": 17537935161600.0, + "grad_norm": 2.0622007583519593, + "language_loss": 0.77618313, + "learning_rate": 2.836333449345341e-06, + "loss": 0.80133581, + "num_input_tokens_seen": 136266710, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.22485352, + "step": 6350, + "time_per_iteration": 2.9012835025787354 + }, + { + "auxiliary_loss_clip": 0.01467086, + "auxiliary_loss_mlp": 0.0103579, + "balance_loss_clip": 1.28642046, + "balance_loss_mlp": 1.01204348, + "epoch": 0.38184277769427327, + "flos": 16335288380160.0, + "grad_norm": 2.420915241355584, + "language_loss": 0.77453232, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.79956108, + "num_input_tokens_seen": 136284445, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.23730469, + "step": 6351, + "time_per_iteration": 2.8255538940429688 + }, + { + "auxiliary_loss_clip": 0.01475522, + "auxiliary_loss_mlp": 0.01042356, + "balance_loss_clip": 1.29215169, + "balance_loss_mlp": 1.01927733, + "epoch": 0.38190290094694124, + "flos": 30455067726720.0, + "grad_norm": 1.7615366536542985, + "language_loss": 0.74813509, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.77331388, + "num_input_tokens_seen": 136305730, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.23095703, + "step": 6352, + "time_per_iteration": 2.9712560176849365 + }, + { + "auxiliary_loss_clip": 0.01468406, + "auxiliary_loss_mlp": 0.0103423, + "balance_loss_clip": 1.28897178, + "balance_loss_mlp": 1.01396465, + "epoch": 0.3819630241996092, + "flos": 14218015086720.0, + "grad_norm": 3.0205535118185796, + "language_loss": 0.64851499, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.67354131, + "num_input_tokens_seen": 136323850, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.20263672, + "step": 6353, + "time_per_iteration": 2.8374369144439697 + }, + { + "auxiliary_loss_clip": 0.01475284, + "auxiliary_loss_mlp": 0.01044414, + "balance_loss_clip": 1.29307485, + "balance_loss_mlp": 1.0191412, + "epoch": 0.38202314745227717, + "flos": 25020589046400.0, + "grad_norm": 1.6824461027623239, + "language_loss": 0.84072638, + "learning_rate": 2.834918094089816e-06, + "loss": 0.86592335, + "num_input_tokens_seen": 136344880, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.25280762, + "step": 6354, + "time_per_iteration": 2.897923707962036 + }, + { + "auxiliary_loss_clip": 0.01462945, + "auxiliary_loss_mlp": 0.01039474, + "balance_loss_clip": 1.28588033, + "balance_loss_mlp": 1.0182426, + "epoch": 0.38208327070494513, + "flos": 20824482556800.0, + "grad_norm": 1.6516875772862567, + "language_loss": 0.81248164, + "learning_rate": 2.834564176091943e-06, + "loss": 0.83750576, + "num_input_tokens_seen": 136366060, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.21252441, + "step": 6355, + "time_per_iteration": 2.885236978530884 + }, + { + "auxiliary_loss_clip": 0.01473151, + "auxiliary_loss_mlp": 0.01042139, + "balance_loss_clip": 1.29184294, + "balance_loss_mlp": 1.02081203, + "epoch": 0.3821433939576131, + "flos": 22647899001600.0, + "grad_norm": 1.8765918338451968, + "language_loss": 0.76048672, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.78563964, + "num_input_tokens_seen": 136385625, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.21337891, + "step": 6356, + "time_per_iteration": 2.8902339935302734 + }, + { + "auxiliary_loss_clip": 0.01475655, + "auxiliary_loss_mlp": 0.01041776, + "balance_loss_clip": 1.293257, + "balance_loss_mlp": 1.0205332, + "epoch": 0.38220351721028106, + "flos": 26881088244480.0, + "grad_norm": 1.7237475732783842, + "language_loss": 0.82314515, + "learning_rate": 2.833856245169348e-06, + "loss": 0.84831941, + "num_input_tokens_seen": 136405750, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.21252441, + "step": 6357, + "time_per_iteration": 2.9008283615112305 + }, + { + "auxiliary_loss_clip": 0.01486109, + "auxiliary_loss_mlp": 0.01038336, + "balance_loss_clip": 1.30160284, + "balance_loss_mlp": 1.01539969, + "epoch": 0.38226364046294903, + "flos": 23377835664000.0, + "grad_norm": 1.6804313389302816, + "language_loss": 0.78944242, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.81468689, + "num_input_tokens_seen": 136426085, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.22912598, + "step": 6358, + "time_per_iteration": 2.863264799118042 + }, + { + "auxiliary_loss_clip": 0.01489531, + "auxiliary_loss_mlp": 0.01038069, + "balance_loss_clip": 1.3028338, + "balance_loss_mlp": 1.01657534, + "epoch": 0.382323763715617, + "flos": 19655706147840.0, + "grad_norm": 8.292777523418867, + "language_loss": 0.79915798, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.82443398, + "num_input_tokens_seen": 136442670, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.21496582, + "step": 6359, + "time_per_iteration": 2.905219078063965 + }, + { + "auxiliary_loss_clip": 0.01474437, + "auxiliary_loss_mlp": 0.01037165, + "balance_loss_clip": 1.2943716, + "balance_loss_mlp": 1.01486135, + "epoch": 0.38238388696828496, + "flos": 54143030384640.0, + "grad_norm": 1.81840829472201, + "language_loss": 0.70293808, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.72805411, + "num_input_tokens_seen": 136465730, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.22302246, + "step": 6360, + "time_per_iteration": 3.1570260524749756 + }, + { + "auxiliary_loss_clip": 0.01473441, + "auxiliary_loss_mlp": 0.01035378, + "balance_loss_clip": 1.2932837, + "balance_loss_mlp": 1.01455212, + "epoch": 0.382444010220953, + "flos": 24946921232640.0, + "grad_norm": 1.811804490322789, + "language_loss": 0.79069668, + "learning_rate": 2.83244000399261e-06, + "loss": 0.81578487, + "num_input_tokens_seen": 136487215, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.20849609, + "step": 6361, + "time_per_iteration": 2.950392723083496 + }, + { + "auxiliary_loss_clip": 0.01456551, + "auxiliary_loss_mlp": 0.01034325, + "balance_loss_clip": 1.27816916, + "balance_loss_mlp": 1.01389229, + "epoch": 0.38250413347362094, + "flos": 42351935875200.0, + "grad_norm": 2.0255515638387136, + "language_loss": 0.65992773, + "learning_rate": 2.832085864749337e-06, + "loss": 0.68483651, + "num_input_tokens_seen": 136510365, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.2043457, + "step": 6362, + "time_per_iteration": 3.0583200454711914 + }, + { + "auxiliary_loss_clip": 0.01489736, + "auxiliary_loss_mlp": 0.01040367, + "balance_loss_clip": 1.30497468, + "balance_loss_mlp": 1.01837277, + "epoch": 0.3825642567262889, + "flos": 16297753178880.0, + "grad_norm": 1.680242374311346, + "language_loss": 0.82882106, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.85412216, + "num_input_tokens_seen": 136527100, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.21984863, + "step": 6363, + "time_per_iteration": 2.8550002574920654 + }, + { + "auxiliary_loss_clip": 0.01475338, + "auxiliary_loss_mlp": 0.01042659, + "balance_loss_clip": 1.29662347, + "balance_loss_mlp": 1.02053392, + "epoch": 0.3826243799789569, + "flos": 45669503220480.0, + "grad_norm": 1.653008496440779, + "language_loss": 0.59863764, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.62381762, + "num_input_tokens_seen": 136550870, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.22131348, + "step": 6364, + "time_per_iteration": 3.1291422843933105 + }, + { + "auxiliary_loss_clip": 0.01492701, + "auxiliary_loss_mlp": 0.0104136, + "balance_loss_clip": 1.30532014, + "balance_loss_mlp": 1.0212729, + "epoch": 0.38268450323162484, + "flos": 25312545613440.0, + "grad_norm": 1.9906214832280595, + "language_loss": 0.70165265, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.72699326, + "num_input_tokens_seen": 136569895, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.20080566, + "step": 6365, + "time_per_iteration": 2.9093146324157715 + }, + { + "auxiliary_loss_clip": 0.01495504, + "auxiliary_loss_mlp": 0.01039124, + "balance_loss_clip": 1.30513847, + "balance_loss_mlp": 1.01730847, + "epoch": 0.3827446264842928, + "flos": 21846240051840.0, + "grad_norm": 1.7433582491962958, + "language_loss": 0.74365854, + "learning_rate": 2.830668992382758e-06, + "loss": 0.76900476, + "num_input_tokens_seen": 136588585, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.21801758, + "step": 6366, + "time_per_iteration": 2.8879058361053467 + }, + { + "auxiliary_loss_clip": 0.01491589, + "auxiliary_loss_mlp": 0.01042596, + "balance_loss_clip": 1.30520916, + "balance_loss_mlp": 1.02169871, + "epoch": 0.38280474973696077, + "flos": 25744417660800.0, + "grad_norm": 2.0890637451863263, + "language_loss": 0.69884437, + "learning_rate": 2.830314695509902e-06, + "loss": 0.72418618, + "num_input_tokens_seen": 136606640, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.2088623, + "step": 6367, + "time_per_iteration": 2.9425206184387207 + }, + { + "auxiliary_loss_clip": 0.01462811, + "auxiliary_loss_mlp": 0.01036461, + "balance_loss_clip": 1.28536391, + "balance_loss_mlp": 1.01505113, + "epoch": 0.38286487298962874, + "flos": 24905675957760.0, + "grad_norm": 2.17103859511807, + "language_loss": 0.65749955, + "learning_rate": 2.82996036715143e-06, + "loss": 0.68249226, + "num_input_tokens_seen": 136624940, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.21411133, + "step": 6368, + "time_per_iteration": 2.8961455821990967 + }, + { + "auxiliary_loss_clip": 0.01474303, + "auxiliary_loss_mlp": 0.01044118, + "balance_loss_clip": 1.2930057, + "balance_loss_mlp": 1.02289832, + "epoch": 0.3829249962422967, + "flos": 28554182904960.0, + "grad_norm": 1.3500827258370098, + "language_loss": 0.68742144, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.71260566, + "num_input_tokens_seen": 136645540, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.21228027, + "step": 6369, + "time_per_iteration": 2.949284791946411 + }, + { + "auxiliary_loss_clip": 0.0147638, + "auxiliary_loss_mlp": 0.01038137, + "balance_loss_clip": 1.29476738, + "balance_loss_mlp": 1.01698923, + "epoch": 0.38298511949496467, + "flos": 21481339587840.0, + "grad_norm": 2.405714402298814, + "language_loss": 0.78738487, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.81253004, + "num_input_tokens_seen": 136664530, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.21130371, + "step": 6370, + "time_per_iteration": 2.885477066040039 + }, + { + "auxiliary_loss_clip": 0.01488802, + "auxiliary_loss_mlp": 0.01037018, + "balance_loss_clip": 1.30485296, + "balance_loss_mlp": 1.01608479, + "epoch": 0.38304524274763263, + "flos": 31690906208640.0, + "grad_norm": 3.775614122888807, + "language_loss": 0.65018928, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.67544746, + "num_input_tokens_seen": 136682315, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.20922852, + "step": 6371, + "time_per_iteration": 2.9398391246795654 + }, + { + "auxiliary_loss_clip": 0.01503975, + "auxiliary_loss_mlp": 0.01043041, + "balance_loss_clip": 1.31298292, + "balance_loss_mlp": 1.02054596, + "epoch": 0.3831053660003006, + "flos": 25086203285760.0, + "grad_norm": 1.8248677146460808, + "language_loss": 0.73462999, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.76010013, + "num_input_tokens_seen": 136701185, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.22485352, + "step": 6372, + "time_per_iteration": 2.884978771209717 + }, + { + "auxiliary_loss_clip": 0.01485398, + "auxiliary_loss_mlp": 0.01036742, + "balance_loss_clip": 1.298226, + "balance_loss_mlp": 1.01620245, + "epoch": 0.38316548925296856, + "flos": 23269618805760.0, + "grad_norm": 1.9728709977103525, + "language_loss": 0.85973251, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.88495398, + "num_input_tokens_seen": 136721265, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.20544434, + "step": 6373, + "time_per_iteration": 2.8574302196502686 + }, + { + "auxiliary_loss_clip": 0.01482395, + "auxiliary_loss_mlp": 0.01039765, + "balance_loss_clip": 1.29651833, + "balance_loss_mlp": 1.01854563, + "epoch": 0.3832256125056366, + "flos": 34436912250240.0, + "grad_norm": 5.050226645035196, + "language_loss": 0.75684094, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.78206253, + "num_input_tokens_seen": 136741885, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.21228027, + "step": 6374, + "time_per_iteration": 4.462501764297485 + }, + { + "auxiliary_loss_clip": 0.01480882, + "auxiliary_loss_mlp": 0.01038968, + "balance_loss_clip": 1.29431736, + "balance_loss_mlp": 1.01848757, + "epoch": 0.38328573575830455, + "flos": 21772888951680.0, + "grad_norm": 2.67722952043786, + "language_loss": 0.76968169, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.79488021, + "num_input_tokens_seen": 136760905, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.20471191, + "step": 6375, + "time_per_iteration": 2.8899521827697754 + }, + { + "auxiliary_loss_clip": 0.01488062, + "auxiliary_loss_mlp": 0.01038662, + "balance_loss_clip": 1.3023392, + "balance_loss_mlp": 1.01739478, + "epoch": 0.3833458590109725, + "flos": 17387839601280.0, + "grad_norm": 2.08649993253791, + "language_loss": 0.74226201, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.76752937, + "num_input_tokens_seen": 136777240, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.21276855, + "step": 6376, + "time_per_iteration": 2.8692827224731445 + }, + { + "auxiliary_loss_clip": 0.01483432, + "auxiliary_loss_mlp": 0.01040467, + "balance_loss_clip": 1.30110979, + "balance_loss_mlp": 1.01798415, + "epoch": 0.3834059822636405, + "flos": 29436613102080.0, + "grad_norm": 2.02066135701997, + "language_loss": 0.68934357, + "learning_rate": 2.826769997289796e-06, + "loss": 0.71458256, + "num_input_tokens_seen": 136801040, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.22473145, + "step": 6377, + "time_per_iteration": 2.9613442420959473 + }, + { + "auxiliary_loss_clip": 0.01495446, + "auxiliary_loss_mlp": 0.010359, + "balance_loss_clip": 1.30729866, + "balance_loss_mlp": 1.01460958, + "epoch": 0.38346610551630844, + "flos": 21480751405440.0, + "grad_norm": 2.1675152170038685, + "language_loss": 0.74037856, + "learning_rate": 2.826415354814344e-06, + "loss": 0.765692, + "num_input_tokens_seen": 136819495, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.21289062, + "step": 6378, + "time_per_iteration": 2.869659185409546 + }, + { + "auxiliary_loss_clip": 0.01484316, + "auxiliary_loss_mlp": 0.01043577, + "balance_loss_clip": 1.29913902, + "balance_loss_mlp": 1.02222693, + "epoch": 0.3835262287689764, + "flos": 27572132361600.0, + "grad_norm": 1.7491966164074595, + "language_loss": 0.69906712, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.72434604, + "num_input_tokens_seen": 136838840, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.21362305, + "step": 6379, + "time_per_iteration": 3.0107345581054688 + }, + { + "auxiliary_loss_clip": 0.01470977, + "auxiliary_loss_mlp": 0.01039392, + "balance_loss_clip": 1.29042792, + "balance_loss_mlp": 1.01872087, + "epoch": 0.3835863520216444, + "flos": 15532226841600.0, + "grad_norm": 1.9180832646828803, + "language_loss": 0.83871889, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.86382258, + "num_input_tokens_seen": 136854425, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.20678711, + "step": 6380, + "time_per_iteration": 2.867462158203125 + }, + { + "auxiliary_loss_clip": 0.01481472, + "auxiliary_loss_mlp": 0.01038301, + "balance_loss_clip": 1.29960668, + "balance_loss_mlp": 1.01716506, + "epoch": 0.38364647527431234, + "flos": 21914116531200.0, + "grad_norm": 1.5021757788203036, + "language_loss": 0.81776273, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.84296048, + "num_input_tokens_seen": 136874355, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.21142578, + "step": 6381, + "time_per_iteration": 4.306679010391235 + }, + { + "auxiliary_loss_clip": 0.01277233, + "auxiliary_loss_mlp": 0.01018558, + "balance_loss_clip": 1.17606103, + "balance_loss_mlp": 1.00196409, + "epoch": 0.3837065985269803, + "flos": 65563089120000.0, + "grad_norm": 0.8005831697450573, + "language_loss": 0.60620368, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.6291616, + "num_input_tokens_seen": 136937475, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.16601562, + "step": 6382, + "time_per_iteration": 4.752147674560547 + }, + { + "auxiliary_loss_clip": 0.0149424, + "auxiliary_loss_mlp": 0.01037666, + "balance_loss_clip": 1.30550528, + "balance_loss_mlp": 1.01624417, + "epoch": 0.38376672177964827, + "flos": 28268243896320.0, + "grad_norm": 16.06141120732858, + "language_loss": 0.68488938, + "learning_rate": 2.824641672639794e-06, + "loss": 0.71020842, + "num_input_tokens_seen": 136955805, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.21411133, + "step": 6383, + "time_per_iteration": 4.2904627323150635 + }, + { + "auxiliary_loss_clip": 0.01494766, + "auxiliary_loss_mlp": 0.01042895, + "balance_loss_clip": 1.30922294, + "balance_loss_mlp": 1.02147305, + "epoch": 0.38382684503231623, + "flos": 20641104806400.0, + "grad_norm": 1.7034770266698918, + "language_loss": 0.74829996, + "learning_rate": 2.824286842339587e-06, + "loss": 0.77367663, + "num_input_tokens_seen": 136975240, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.21411133, + "step": 6384, + "time_per_iteration": 2.8940584659576416 + }, + { + "auxiliary_loss_clip": 0.01486878, + "auxiliary_loss_mlp": 0.01041339, + "balance_loss_clip": 1.30405641, + "balance_loss_mlp": 1.02042973, + "epoch": 0.3838869682849842, + "flos": 19614279893760.0, + "grad_norm": 1.3806273808939558, + "language_loss": 0.76592529, + "learning_rate": 2.823931980782341e-06, + "loss": 0.79120749, + "num_input_tokens_seen": 136994985, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.20898438, + "step": 6385, + "time_per_iteration": 2.9040303230285645 + }, + { + "auxiliary_loss_clip": 0.01272182, + "auxiliary_loss_mlp": 0.01021681, + "balance_loss_clip": 1.17353058, + "balance_loss_mlp": 1.00461042, + "epoch": 0.38394709153765216, + "flos": 56581081182720.0, + "grad_norm": 0.8932560006944583, + "language_loss": 0.67099303, + "learning_rate": 2.82357708798151e-06, + "loss": 0.6939317, + "num_input_tokens_seen": 137046290, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.17089844, + "step": 6386, + "time_per_iteration": 3.2035484313964844 + }, + { + "auxiliary_loss_clip": 0.0147972, + "auxiliary_loss_mlp": 0.01040199, + "balance_loss_clip": 1.29776537, + "balance_loss_mlp": 1.01983857, + "epoch": 0.3840072147903202, + "flos": 15897398774400.0, + "grad_norm": 1.7286873706384767, + "language_loss": 0.73764527, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.76284444, + "num_input_tokens_seen": 137064725, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.20373535, + "step": 6387, + "time_per_iteration": 2.8605823516845703 + }, + { + "auxiliary_loss_clip": 0.01464733, + "auxiliary_loss_mlp": 0.01048822, + "balance_loss_clip": 1.28585386, + "balance_loss_mlp": 1.02726889, + "epoch": 0.38406733804298815, + "flos": 28229125127040.0, + "grad_norm": 1.5945548895896784, + "language_loss": 0.8176989, + "learning_rate": 2.822867208702932e-06, + "loss": 0.84283447, + "num_input_tokens_seen": 137086030, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.21557617, + "step": 6388, + "time_per_iteration": 2.9260332584381104 + }, + { + "auxiliary_loss_clip": 0.01470352, + "auxiliary_loss_mlp": 0.01046604, + "balance_loss_clip": 1.28783333, + "balance_loss_mlp": 1.02635086, + "epoch": 0.3841274612956561, + "flos": 18232779841920.0, + "grad_norm": 2.015058797881041, + "language_loss": 0.77203172, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.79720128, + "num_input_tokens_seen": 137105400, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.20263672, + "step": 6389, + "time_per_iteration": 2.9214928150177 + }, + { + "auxiliary_loss_clip": 0.0149465, + "auxiliary_loss_mlp": 0.01048808, + "balance_loss_clip": 1.30683494, + "balance_loss_mlp": 1.02674246, + "epoch": 0.3841875845483241, + "flos": 19802589327360.0, + "grad_norm": 3.0266721421039446, + "language_loss": 0.77224058, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.79767513, + "num_input_tokens_seen": 137124985, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.22058105, + "step": 6390, + "time_per_iteration": 2.8680434226989746 + }, + { + "auxiliary_loss_clip": 0.01492606, + "auxiliary_loss_mlp": 0.01050198, + "balance_loss_clip": 1.30423748, + "balance_loss_mlp": 1.02813232, + "epoch": 0.38424770780099204, + "flos": 29910997278720.0, + "grad_norm": 1.5965653904454025, + "language_loss": 0.70734018, + "learning_rate": 2.821802155794668e-06, + "loss": 0.73276818, + "num_input_tokens_seen": 137146745, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.2208252, + "step": 6391, + "time_per_iteration": 2.9442131519317627 + }, + { + "auxiliary_loss_clip": 0.01487042, + "auxiliary_loss_mlp": 0.01047873, + "balance_loss_clip": 1.30125988, + "balance_loss_mlp": 1.02647495, + "epoch": 0.38430783105366, + "flos": 20823487171200.0, + "grad_norm": 2.21306105392205, + "language_loss": 0.84963465, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.87498373, + "num_input_tokens_seen": 137163195, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.21398926, + "step": 6392, + "time_per_iteration": 2.8224666118621826 + }, + { + "auxiliary_loss_clip": 0.01486384, + "auxiliary_loss_mlp": 0.01043035, + "balance_loss_clip": 1.3004477, + "balance_loss_mlp": 1.0218755, + "epoch": 0.384367954306328, + "flos": 11005678442880.0, + "grad_norm": 2.16649539158351, + "language_loss": 0.6267308, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.65202504, + "num_input_tokens_seen": 137179330, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.21166992, + "step": 6393, + "time_per_iteration": 2.863619804382324 + }, + { + "auxiliary_loss_clip": 0.01495852, + "auxiliary_loss_mlp": 0.01043709, + "balance_loss_clip": 1.30570877, + "balance_loss_mlp": 1.02232325, + "epoch": 0.38442807755899594, + "flos": 25348994939520.0, + "grad_norm": 3.347236496506489, + "language_loss": 0.72067714, + "learning_rate": 2.820736822421029e-06, + "loss": 0.74607271, + "num_input_tokens_seen": 137198655, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.21398926, + "step": 6394, + "time_per_iteration": 2.871183156967163 + }, + { + "auxiliary_loss_clip": 0.01495665, + "auxiliary_loss_mlp": 0.01045626, + "balance_loss_clip": 1.30557239, + "balance_loss_mlp": 1.02367949, + "epoch": 0.3844882008116639, + "flos": 21079763573760.0, + "grad_norm": 2.2096223223753033, + "language_loss": 0.82889557, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.85430849, + "num_input_tokens_seen": 137217120, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.21960449, + "step": 6395, + "time_per_iteration": 2.865556001663208 + }, + { + "auxiliary_loss_clip": 0.01484459, + "auxiliary_loss_mlp": 0.01042022, + "balance_loss_clip": 1.29866266, + "balance_loss_mlp": 1.02077854, + "epoch": 0.38454832406433187, + "flos": 17971074063360.0, + "grad_norm": 2.4723800507032534, + "language_loss": 0.71483266, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.7400974, + "num_input_tokens_seen": 137234410, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.21240234, + "step": 6396, + "time_per_iteration": 2.826753616333008 + }, + { + "auxiliary_loss_clip": 0.01264789, + "auxiliary_loss_mlp": 0.01038708, + "balance_loss_clip": 1.16112924, + "balance_loss_mlp": 1.01820374, + "epoch": 0.38460844731699984, + "flos": 67958582544000.0, + "grad_norm": 0.905978046724653, + "language_loss": 0.59718275, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.62021774, + "num_input_tokens_seen": 137294940, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.20507812, + "step": 6397, + "time_per_iteration": 3.411128282546997 + }, + { + "auxiliary_loss_clip": 0.01473044, + "auxiliary_loss_mlp": 0.01040814, + "balance_loss_clip": 1.2935921, + "balance_loss_mlp": 1.01874804, + "epoch": 0.3846685705696678, + "flos": 25860235645440.0, + "grad_norm": 1.7890644238007474, + "language_loss": 0.85283506, + "learning_rate": 2.819315942271794e-06, + "loss": 0.87797362, + "num_input_tokens_seen": 137315035, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.22094727, + "step": 6398, + "time_per_iteration": 2.905503988265991 + }, + { + "auxiliary_loss_clip": 0.01474371, + "auxiliary_loss_mlp": 0.01037697, + "balance_loss_clip": 1.29311275, + "balance_loss_mlp": 1.01662087, + "epoch": 0.38472869382233577, + "flos": 16298974788480.0, + "grad_norm": 2.310695778619935, + "language_loss": 0.80692118, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.83204186, + "num_input_tokens_seen": 137333155, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.21081543, + "step": 6399, + "time_per_iteration": 2.8995730876922607 + }, + { + "auxiliary_loss_clip": 0.01479613, + "auxiliary_loss_mlp": 0.01036508, + "balance_loss_clip": 1.29436743, + "balance_loss_mlp": 1.01423943, + "epoch": 0.38478881707500373, + "flos": 19362211257600.0, + "grad_norm": 1.7904550922867954, + "language_loss": 0.68989146, + "learning_rate": 2.818605315732038e-06, + "loss": 0.7150526, + "num_input_tokens_seen": 137351515, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.22253418, + "step": 6400, + "time_per_iteration": 2.8898890018463135 + }, + { + "auxiliary_loss_clip": 0.01499457, + "auxiliary_loss_mlp": 0.01040739, + "balance_loss_clip": 1.31277633, + "balance_loss_mlp": 1.01925778, + "epoch": 0.38484894032767175, + "flos": 24870945934080.0, + "grad_norm": 1.924636558672334, + "language_loss": 0.73872423, + "learning_rate": 2.81824995589303e-06, + "loss": 0.76412624, + "num_input_tokens_seen": 137371255, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.21484375, + "step": 6401, + "time_per_iteration": 2.943347454071045 + }, + { + "auxiliary_loss_clip": 0.01475964, + "auxiliary_loss_mlp": 0.01038858, + "balance_loss_clip": 1.28969836, + "balance_loss_mlp": 1.01706636, + "epoch": 0.3849090635803397, + "flos": 14509971653760.0, + "grad_norm": 2.2330143138180767, + "language_loss": 0.73316121, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.75830942, + "num_input_tokens_seen": 137388980, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.21801758, + "step": 6402, + "time_per_iteration": 2.840742826461792 + }, + { + "auxiliary_loss_clip": 0.01470295, + "auxiliary_loss_mlp": 0.01034224, + "balance_loss_clip": 1.28916001, + "balance_loss_mlp": 1.01408958, + "epoch": 0.3849691868330077, + "flos": 18525234101760.0, + "grad_norm": 2.0501697592489925, + "language_loss": 0.84400964, + "learning_rate": 2.817539143144128e-06, + "loss": 0.86905485, + "num_input_tokens_seen": 137406885, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.20117188, + "step": 6403, + "time_per_iteration": 2.828883171081543 + }, + { + "auxiliary_loss_clip": 0.0146918, + "auxiliary_loss_mlp": 0.01037963, + "balance_loss_clip": 1.2881546, + "balance_loss_mlp": 1.01687443, + "epoch": 0.38502931008567565, + "flos": 21626367730560.0, + "grad_norm": 2.0556815213181987, + "language_loss": 0.83827353, + "learning_rate": 2.817183690261189e-06, + "loss": 0.86334491, + "num_input_tokens_seen": 137425535, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.21081543, + "step": 6404, + "time_per_iteration": 2.8430607318878174 + }, + { + "auxiliary_loss_clip": 0.01477119, + "auxiliary_loss_mlp": 0.01035037, + "balance_loss_clip": 1.29161692, + "balance_loss_mlp": 1.01365066, + "epoch": 0.3850894333383436, + "flos": 25427458702080.0, + "grad_norm": 1.6604351362397647, + "language_loss": 0.70625466, + "learning_rate": 2.816828206390563e-06, + "loss": 0.73137617, + "num_input_tokens_seen": 137447700, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.21374512, + "step": 6405, + "time_per_iteration": 2.957427978515625 + }, + { + "auxiliary_loss_clip": 0.01465562, + "auxiliary_loss_mlp": 0.0103513, + "balance_loss_clip": 1.28691435, + "balance_loss_mlp": 1.01422071, + "epoch": 0.3851495565910116, + "flos": 20236949838720.0, + "grad_norm": 7.750699712934738, + "language_loss": 0.80087835, + "learning_rate": 2.816472691545729e-06, + "loss": 0.82588524, + "num_input_tokens_seen": 137462245, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.20898438, + "step": 6406, + "time_per_iteration": 2.8318138122558594 + }, + { + "auxiliary_loss_clip": 0.0147823, + "auxiliary_loss_mlp": 0.01038767, + "balance_loss_clip": 1.29527712, + "balance_loss_mlp": 1.01748812, + "epoch": 0.38520967984367954, + "flos": 16517127807360.0, + "grad_norm": 2.2339889523404746, + "language_loss": 0.8527959, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.87796593, + "num_input_tokens_seen": 137476455, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.21264648, + "step": 6407, + "time_per_iteration": 2.796774387359619 + }, + { + "auxiliary_loss_clip": 0.01273419, + "auxiliary_loss_mlp": 0.01031399, + "balance_loss_clip": 1.16713905, + "balance_loss_mlp": 1.01327932, + "epoch": 0.3852698030963475, + "flos": 61343744785920.0, + "grad_norm": 0.8711012872483209, + "language_loss": 0.65034127, + "learning_rate": 2.815761568987365e-06, + "loss": 0.67338943, + "num_input_tokens_seen": 137539845, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.18164062, + "step": 6408, + "time_per_iteration": 3.391498327255249 + }, + { + "auxiliary_loss_clip": 0.01465939, + "auxiliary_loss_mlp": 0.01041551, + "balance_loss_clip": 1.28284502, + "balance_loss_mlp": 1.02005732, + "epoch": 0.3853299263490155, + "flos": 22903225263360.0, + "grad_norm": 1.5121160488118142, + "language_loss": 0.74127185, + "learning_rate": 2.8154059613008e-06, + "loss": 0.76634675, + "num_input_tokens_seen": 137559880, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.21496582, + "step": 6409, + "time_per_iteration": 4.253864765167236 + }, + { + "auxiliary_loss_clip": 0.01496298, + "auxiliary_loss_mlp": 0.01041802, + "balance_loss_clip": 1.30494666, + "balance_loss_mlp": 1.01987982, + "epoch": 0.38539004960168344, + "flos": 20057055937920.0, + "grad_norm": 3.029212081743284, + "language_loss": 0.72221071, + "learning_rate": 2.81505032269396e-06, + "loss": 0.74759173, + "num_input_tokens_seen": 137578225, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.21936035, + "step": 6410, + "time_per_iteration": 2.837019205093384 + }, + { + "auxiliary_loss_clip": 0.01268011, + "auxiliary_loss_mlp": 0.01025794, + "balance_loss_clip": 1.16594756, + "balance_loss_mlp": 1.00881898, + "epoch": 0.3854501728543514, + "flos": 68765263666560.0, + "grad_norm": 0.6853781857179113, + "language_loss": 0.60369277, + "learning_rate": 2.81469465318033e-06, + "loss": 0.6266309, + "num_input_tokens_seen": 137645770, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.16992188, + "step": 6411, + "time_per_iteration": 3.4090776443481445 + }, + { + "auxiliary_loss_clip": 0.01481759, + "auxiliary_loss_mlp": 0.01036455, + "balance_loss_clip": 1.29769957, + "balance_loss_mlp": 1.01555717, + "epoch": 0.38551029610701937, + "flos": 20494628830080.0, + "grad_norm": 1.8030010635662714, + "language_loss": 0.78187984, + "learning_rate": 2.814338952773397e-06, + "loss": 0.80706191, + "num_input_tokens_seen": 137664090, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.20874023, + "step": 6412, + "time_per_iteration": 2.851182222366333 + }, + { + "auxiliary_loss_clip": 0.01493418, + "auxiliary_loss_mlp": 0.01041505, + "balance_loss_clip": 1.30618358, + "balance_loss_mlp": 1.01833081, + "epoch": 0.38557041935968733, + "flos": 23480849370240.0, + "grad_norm": 1.741526109935132, + "language_loss": 0.78794944, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.81329864, + "num_input_tokens_seen": 137683190, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.23156738, + "step": 6413, + "time_per_iteration": 2.8615612983703613 + }, + { + "auxiliary_loss_clip": 0.01268687, + "auxiliary_loss_mlp": 0.01029118, + "balance_loss_clip": 1.1639086, + "balance_loss_mlp": 1.0054673, + "epoch": 0.38563054261235535, + "flos": 63994546488960.0, + "grad_norm": 0.808082180624448, + "language_loss": 0.61353862, + "learning_rate": 2.813627459333576e-06, + "loss": 0.63651657, + "num_input_tokens_seen": 137737315, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.23632812, + "step": 6414, + "time_per_iteration": 3.15305495262146 + }, + { + "auxiliary_loss_clip": 0.01497018, + "auxiliary_loss_mlp": 0.0103882, + "balance_loss_clip": 1.31028819, + "balance_loss_mlp": 1.01769578, + "epoch": 0.3856906658650233, + "flos": 23998333858560.0, + "grad_norm": 1.9997209864550294, + "language_loss": 0.78261054, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.80796891, + "num_input_tokens_seen": 137753535, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.21142578, + "step": 6415, + "time_per_iteration": 2.8667171001434326 + }, + { + "auxiliary_loss_clip": 0.01472682, + "auxiliary_loss_mlp": 0.01032647, + "balance_loss_clip": 1.29573774, + "balance_loss_mlp": 1.01281011, + "epoch": 0.3857507891176913, + "flos": 25017738624000.0, + "grad_norm": 1.6405510485679282, + "language_loss": 0.80620372, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.8312571, + "num_input_tokens_seen": 137773405, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.19836426, + "step": 6416, + "time_per_iteration": 5.788764238357544 + }, + { + "auxiliary_loss_clip": 0.01470687, + "auxiliary_loss_mlp": 0.01039661, + "balance_loss_clip": 1.28910518, + "balance_loss_mlp": 1.01945448, + "epoch": 0.38581091237035925, + "flos": 21545008300800.0, + "grad_norm": 2.1913133322661817, + "language_loss": 0.80297852, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.82808197, + "num_input_tokens_seen": 137790810, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.20202637, + "step": 6417, + "time_per_iteration": 2.8774495124816895 + }, + { + "auxiliary_loss_clip": 0.01475952, + "auxiliary_loss_mlp": 0.01048634, + "balance_loss_clip": 1.29567242, + "balance_loss_mlp": 1.02829742, + "epoch": 0.3858710356230272, + "flos": 17392726039680.0, + "grad_norm": 2.3703855328507473, + "language_loss": 0.81162536, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.83687127, + "num_input_tokens_seen": 137810265, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.20349121, + "step": 6418, + "time_per_iteration": 2.8629744052886963 + }, + { + "auxiliary_loss_clip": 0.01467859, + "auxiliary_loss_mlp": 0.0104244, + "balance_loss_clip": 1.28882205, + "balance_loss_mlp": 1.02260363, + "epoch": 0.3859311588756952, + "flos": 20349283973760.0, + "grad_norm": 1.7139350076482205, + "language_loss": 0.811517, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.83662003, + "num_input_tokens_seen": 137828580, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.19848633, + "step": 6419, + "time_per_iteration": 4.2436363697052 + }, + { + "auxiliary_loss_clip": 0.01467196, + "auxiliary_loss_mlp": 0.01041701, + "balance_loss_clip": 1.28735328, + "balance_loss_mlp": 1.02004123, + "epoch": 0.38599128212836314, + "flos": 26331859889280.0, + "grad_norm": 2.4743976699248873, + "language_loss": 0.68580019, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.71088916, + "num_input_tokens_seen": 137846145, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.21655273, + "step": 6420, + "time_per_iteration": 2.894181728363037 + }, + { + "auxiliary_loss_clip": 0.01467594, + "auxiliary_loss_mlp": 0.01052363, + "balance_loss_clip": 1.29165626, + "balance_loss_mlp": 1.03256202, + "epoch": 0.3860514053810311, + "flos": 13561927217280.0, + "grad_norm": 1.8505806163753193, + "language_loss": 0.81345087, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.83865047, + "num_input_tokens_seen": 137863705, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.19812012, + "step": 6421, + "time_per_iteration": 2.83227801322937 + }, + { + "auxiliary_loss_clip": 0.01472555, + "auxiliary_loss_mlp": 0.01043713, + "balance_loss_clip": 1.29076171, + "balance_loss_mlp": 1.02248192, + "epoch": 0.3861115286336991, + "flos": 20962678734720.0, + "grad_norm": 2.1295929851740936, + "language_loss": 0.73341775, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.75858039, + "num_input_tokens_seen": 137880285, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.21240234, + "step": 6422, + "time_per_iteration": 2.8425204753875732 + }, + { + "auxiliary_loss_clip": 0.01447738, + "auxiliary_loss_mlp": 0.01042561, + "balance_loss_clip": 1.27344131, + "balance_loss_mlp": 1.02076983, + "epoch": 0.38617165188636704, + "flos": 16371375747840.0, + "grad_norm": 1.6019010590450193, + "language_loss": 0.6753881, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.70029116, + "num_input_tokens_seen": 137898335, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.21801758, + "step": 6423, + "time_per_iteration": 2.968427896499634 + }, + { + "auxiliary_loss_clip": 0.01483089, + "auxiliary_loss_mlp": 0.01046953, + "balance_loss_clip": 1.3000555, + "balance_loss_mlp": 1.02613902, + "epoch": 0.386231775139035, + "flos": 34800545859840.0, + "grad_norm": 2.108289612882626, + "language_loss": 0.70426214, + "learning_rate": 2.810068143123449e-06, + "loss": 0.72956258, + "num_input_tokens_seen": 137918605, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.20800781, + "step": 6424, + "time_per_iteration": 2.97904896736145 + }, + { + "auxiliary_loss_clip": 0.0146359, + "auxiliary_loss_mlp": 0.01046369, + "balance_loss_clip": 1.28626776, + "balance_loss_mlp": 1.02476835, + "epoch": 0.38629189839170297, + "flos": 21736258646400.0, + "grad_norm": 1.5996756437961153, + "language_loss": 0.73315668, + "learning_rate": 2.809712042331429e-06, + "loss": 0.75825632, + "num_input_tokens_seen": 137938245, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.21618652, + "step": 6425, + "time_per_iteration": 2.872779607772827 + }, + { + "auxiliary_loss_clip": 0.01482738, + "auxiliary_loss_mlp": 0.01047667, + "balance_loss_clip": 1.29841113, + "balance_loss_mlp": 1.02671039, + "epoch": 0.38635202164437094, + "flos": 27934001424000.0, + "grad_norm": 2.1870348304321117, + "language_loss": 0.81823647, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.84354043, + "num_input_tokens_seen": 137956770, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.20947266, + "step": 6426, + "time_per_iteration": 2.888880729675293 + }, + { + "auxiliary_loss_clip": 0.01478724, + "auxiliary_loss_mlp": 0.01049546, + "balance_loss_clip": 1.29671085, + "balance_loss_mlp": 1.02774262, + "epoch": 0.38641214489703896, + "flos": 23597119802880.0, + "grad_norm": 2.2207847060981103, + "language_loss": 0.76010597, + "learning_rate": 2.80899974864781e-06, + "loss": 0.78538871, + "num_input_tokens_seen": 137977040, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.21813965, + "step": 6427, + "time_per_iteration": 2.8851401805877686 + }, + { + "auxiliary_loss_clip": 0.01468154, + "auxiliary_loss_mlp": 0.01050359, + "balance_loss_clip": 1.28958833, + "balance_loss_mlp": 1.0288893, + "epoch": 0.3864722681497069, + "flos": 12648974762880.0, + "grad_norm": 2.293264408205655, + "language_loss": 0.70690644, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.73209155, + "num_input_tokens_seen": 137993545, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.21472168, + "step": 6428, + "time_per_iteration": 2.886946439743042 + }, + { + "auxiliary_loss_clip": 0.01463613, + "auxiliary_loss_mlp": 0.01048967, + "balance_loss_clip": 1.28399777, + "balance_loss_mlp": 1.02752101, + "epoch": 0.3865323914023749, + "flos": 17606807026560.0, + "grad_norm": 2.4469024319812784, + "language_loss": 0.85138118, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.87650698, + "num_input_tokens_seen": 138010140, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.21447754, + "step": 6429, + "time_per_iteration": 2.8583502769470215 + }, + { + "auxiliary_loss_clip": 0.01473688, + "auxiliary_loss_mlp": 0.01046, + "balance_loss_clip": 1.29339623, + "balance_loss_mlp": 1.0241251, + "epoch": 0.38659251465504285, + "flos": 18488196593280.0, + "grad_norm": 1.9011376055613178, + "language_loss": 0.82138145, + "learning_rate": 2.807931078076015e-06, + "loss": 0.8465783, + "num_input_tokens_seen": 138028880, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.21875, + "step": 6430, + "time_per_iteration": 2.8387465476989746 + }, + { + "auxiliary_loss_clip": 0.0127948, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.16495252, + "balance_loss_mlp": 1.01131022, + "epoch": 0.3866526379077108, + "flos": 64198012930560.0, + "grad_norm": 0.7260824743711669, + "language_loss": 0.58921963, + "learning_rate": 2.807574793260416e-06, + "loss": 0.61236215, + "num_input_tokens_seen": 138098090, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.234375, + "step": 6431, + "time_per_iteration": 3.388615131378174 + }, + { + "auxiliary_loss_clip": 0.01473144, + "auxiliary_loss_mlp": 0.01042897, + "balance_loss_clip": 1.28821278, + "balance_loss_mlp": 1.0212245, + "epoch": 0.3867127611603788, + "flos": 14395375278720.0, + "grad_norm": 2.5628167451923765, + "language_loss": 0.80467409, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.82983446, + "num_input_tokens_seen": 138114735, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.2166748, + "step": 6432, + "time_per_iteration": 2.843686580657959 + }, + { + "auxiliary_loss_clip": 0.01487968, + "auxiliary_loss_mlp": 0.01047551, + "balance_loss_clip": 1.30038512, + "balance_loss_mlp": 1.02506781, + "epoch": 0.38677288441304675, + "flos": 20020244653440.0, + "grad_norm": 3.332195892725776, + "language_loss": 0.82287371, + "learning_rate": 2.806862131772779e-06, + "loss": 0.84822887, + "num_input_tokens_seen": 138130480, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.22485352, + "step": 6433, + "time_per_iteration": 2.8364133834838867 + }, + { + "auxiliary_loss_clip": 0.01476982, + "auxiliary_loss_mlp": 0.01043994, + "balance_loss_clip": 1.2938385, + "balance_loss_mlp": 1.02109385, + "epoch": 0.3868330076657147, + "flos": 22247092149120.0, + "grad_norm": 2.216064351870715, + "language_loss": 0.71774495, + "learning_rate": 2.806505755127765e-06, + "loss": 0.74295473, + "num_input_tokens_seen": 138150640, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.22888184, + "step": 6434, + "time_per_iteration": 2.8981528282165527 + }, + { + "auxiliary_loss_clip": 0.01487767, + "auxiliary_loss_mlp": 0.0103965, + "balance_loss_clip": 1.29846489, + "balance_loss_mlp": 1.01758432, + "epoch": 0.3868931309183827, + "flos": 16736185722240.0, + "grad_norm": 2.028682790579075, + "language_loss": 0.78933549, + "learning_rate": 2.806149347899972e-06, + "loss": 0.81460965, + "num_input_tokens_seen": 138169700, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.22070312, + "step": 6435, + "time_per_iteration": 2.8888416290283203 + }, + { + "auxiliary_loss_clip": 0.01458023, + "auxiliary_loss_mlp": 0.01035678, + "balance_loss_clip": 1.27911603, + "balance_loss_mlp": 1.01332593, + "epoch": 0.38695325417105064, + "flos": 22685026999680.0, + "grad_norm": 2.0275445829562564, + "language_loss": 0.79927707, + "learning_rate": 2.805792910102915e-06, + "loss": 0.8242141, + "num_input_tokens_seen": 138185835, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.22363281, + "step": 6436, + "time_per_iteration": 2.90470027923584 + }, + { + "auxiliary_loss_clip": 0.01457309, + "auxiliary_loss_mlp": 0.01036166, + "balance_loss_clip": 1.28068256, + "balance_loss_mlp": 1.01480377, + "epoch": 0.3870133774237186, + "flos": 23122328423040.0, + "grad_norm": 1.640191650980483, + "language_loss": 0.77635419, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.80128896, + "num_input_tokens_seen": 138204080, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.21374512, + "step": 6437, + "time_per_iteration": 2.8718676567077637 + }, + { + "auxiliary_loss_clip": 0.01463732, + "auxiliary_loss_mlp": 0.01035988, + "balance_loss_clip": 1.28614783, + "balance_loss_mlp": 1.01419663, + "epoch": 0.3870735006763866, + "flos": 17684592117120.0, + "grad_norm": 2.1851073485926094, + "language_loss": 0.8260628, + "learning_rate": 2.805079942855074e-06, + "loss": 0.85106003, + "num_input_tokens_seen": 138220710, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.21777344, + "step": 6438, + "time_per_iteration": 2.82076358795166 + }, + { + "auxiliary_loss_clip": 0.01476631, + "auxiliary_loss_mlp": 0.01040454, + "balance_loss_clip": 1.29487753, + "balance_loss_mlp": 1.01727939, + "epoch": 0.38713362392905454, + "flos": 23306475335040.0, + "grad_norm": 1.4521267372192577, + "language_loss": 0.76461482, + "learning_rate": 2.804723413431326e-06, + "loss": 0.78978568, + "num_input_tokens_seen": 138241720, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.23181152, + "step": 6439, + "time_per_iteration": 2.933702230453491 + }, + { + "auxiliary_loss_clip": 0.01455923, + "auxiliary_loss_mlp": 0.01039541, + "balance_loss_clip": 1.28051472, + "balance_loss_mlp": 1.01727295, + "epoch": 0.38719374718172256, + "flos": 21040644804480.0, + "grad_norm": 1.6393280996453956, + "language_loss": 0.74339986, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.76835454, + "num_input_tokens_seen": 138261885, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.22265625, + "step": 6440, + "time_per_iteration": 2.860747814178467 + }, + { + "auxiliary_loss_clip": 0.01478727, + "auxiliary_loss_mlp": 0.01040457, + "balance_loss_clip": 1.29251587, + "balance_loss_mlp": 1.01702034, + "epoch": 0.3872538704343905, + "flos": 19619256821760.0, + "grad_norm": 1.9689011216711991, + "language_loss": 0.82791698, + "learning_rate": 2.804010263051774e-06, + "loss": 0.85310876, + "num_input_tokens_seen": 138280255, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.234375, + "step": 6441, + "time_per_iteration": 2.8553576469421387 + }, + { + "auxiliary_loss_clip": 0.0145344, + "auxiliary_loss_mlp": 0.01047035, + "balance_loss_clip": 1.275002, + "balance_loss_mlp": 1.0253861, + "epoch": 0.3873139936870585, + "flos": 17538975792000.0, + "grad_norm": 2.6065766299653146, + "language_loss": 0.8242296, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.8492344, + "num_input_tokens_seen": 138296675, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.21655273, + "step": 6442, + "time_per_iteration": 2.860387086868286 + }, + { + "auxiliary_loss_clip": 0.01464416, + "auxiliary_loss_mlp": 0.0103933, + "balance_loss_clip": 1.28522718, + "balance_loss_mlp": 1.01726389, + "epoch": 0.38737411693972645, + "flos": 17795840376960.0, + "grad_norm": 1.7481945186990149, + "language_loss": 0.85071969, + "learning_rate": 2.803296990719624e-06, + "loss": 0.8757571, + "num_input_tokens_seen": 138314985, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.22070312, + "step": 6443, + "time_per_iteration": 2.882420539855957 + }, + { + "auxiliary_loss_clip": 0.01291873, + "auxiliary_loss_mlp": 0.01048852, + "balance_loss_clip": 1.17577314, + "balance_loss_mlp": 1.02577293, + "epoch": 0.3874342401923944, + "flos": 58329336735360.0, + "grad_norm": 0.7771330312485016, + "language_loss": 0.50312662, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.52653384, + "num_input_tokens_seen": 138373275, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.23046875, + "step": 6444, + "time_per_iteration": 4.82112455368042 + }, + { + "auxiliary_loss_clip": 0.01450054, + "auxiliary_loss_mlp": 0.01038676, + "balance_loss_clip": 1.27650774, + "balance_loss_mlp": 1.0175643, + "epoch": 0.3874943634450624, + "flos": 17720769974400.0, + "grad_norm": 2.322886185799783, + "language_loss": 0.7956928, + "learning_rate": 2.802583596543065e-06, + "loss": 0.82058012, + "num_input_tokens_seen": 138391145, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.21118164, + "step": 6445, + "time_per_iteration": 2.856750011444092 + }, + { + "auxiliary_loss_clip": 0.01468063, + "auxiliary_loss_mlp": 0.01039607, + "balance_loss_clip": 1.29000449, + "balance_loss_mlp": 1.01781547, + "epoch": 0.38755448669773035, + "flos": 19254220623360.0, + "grad_norm": 2.7073919445990993, + "language_loss": 0.82057106, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.84564781, + "num_input_tokens_seen": 138409875, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.21789551, + "step": 6446, + "time_per_iteration": 2.900074005126953 + }, + { + "auxiliary_loss_clip": 0.01465203, + "auxiliary_loss_mlp": 0.01035893, + "balance_loss_clip": 1.28638029, + "balance_loss_mlp": 1.01369643, + "epoch": 0.3876146099503983, + "flos": 20603841073920.0, + "grad_norm": 1.9318680785826423, + "language_loss": 0.77886713, + "learning_rate": 2.801870080630306e-06, + "loss": 0.80387813, + "num_input_tokens_seen": 138428965, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.22180176, + "step": 6447, + "time_per_iteration": 2.8529326915740967 + }, + { + "auxiliary_loss_clip": 0.01449531, + "auxiliary_loss_mlp": 0.01035807, + "balance_loss_clip": 1.27591097, + "balance_loss_mlp": 1.0150764, + "epoch": 0.3876747332030663, + "flos": 19290443725440.0, + "grad_norm": 1.4745814281482696, + "language_loss": 0.76953954, + "learning_rate": 2.801513277056671e-06, + "loss": 0.79439294, + "num_input_tokens_seen": 138448090, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.20715332, + "step": 6448, + "time_per_iteration": 2.842885732650757 + }, + { + "auxiliary_loss_clip": 0.01480814, + "auxiliary_loss_mlp": 0.01034734, + "balance_loss_clip": 1.30317831, + "balance_loss_mlp": 1.01419389, + "epoch": 0.38773485645573424, + "flos": 18953350830720.0, + "grad_norm": 2.1134305871008094, + "language_loss": 0.76715136, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.79230678, + "num_input_tokens_seen": 138466105, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.20544434, + "step": 6449, + "time_per_iteration": 2.8640687465667725 + }, + { + "auxiliary_loss_clip": 0.01473864, + "auxiliary_loss_mlp": 0.01037877, + "balance_loss_clip": 1.28976667, + "balance_loss_mlp": 1.01572824, + "epoch": 0.3877949797084022, + "flos": 23080540210560.0, + "grad_norm": 1.7698323328407812, + "language_loss": 0.7871573, + "learning_rate": 2.800799578742542e-06, + "loss": 0.81227469, + "num_input_tokens_seen": 138485160, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.22143555, + "step": 6450, + "time_per_iteration": 2.935988187789917 + }, + { + "auxiliary_loss_clip": 0.01491692, + "auxiliary_loss_mlp": 0.01041437, + "balance_loss_clip": 1.30458879, + "balance_loss_mlp": 1.0198009, + "epoch": 0.3878551029610702, + "flos": 29107347557760.0, + "grad_norm": 2.212877986935538, + "language_loss": 0.78377247, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.80910373, + "num_input_tokens_seen": 138504135, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.21643066, + "step": 6451, + "time_per_iteration": 4.314770698547363 + }, + { + "auxiliary_loss_clip": 0.01451492, + "auxiliary_loss_mlp": 0.01033167, + "balance_loss_clip": 1.27736223, + "balance_loss_mlp": 1.0110898, + "epoch": 0.38791522621373814, + "flos": 21006276739200.0, + "grad_norm": 3.698004062761702, + "language_loss": 0.77440166, + "learning_rate": 2.800085758962812e-06, + "loss": 0.79924822, + "num_input_tokens_seen": 138523955, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.22070312, + "step": 6452, + "time_per_iteration": 2.869380474090576 + }, + { + "auxiliary_loss_clip": 0.0146, + "auxiliary_loss_mlp": 0.01038986, + "balance_loss_clip": 1.28212309, + "balance_loss_mlp": 1.01779008, + "epoch": 0.3879753494664061, + "flos": 15495234577920.0, + "grad_norm": 1.7316537615798895, + "language_loss": 0.80512798, + "learning_rate": 2.799728803557182e-06, + "loss": 0.83011782, + "num_input_tokens_seen": 138541655, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.21203613, + "step": 6453, + "time_per_iteration": 4.285577297210693 + }, + { + "auxiliary_loss_clip": 0.01480769, + "auxiliary_loss_mlp": 0.01034825, + "balance_loss_clip": 1.29570472, + "balance_loss_mlp": 1.01373744, + "epoch": 0.3880354727190741, + "flos": 22063985867520.0, + "grad_norm": 2.1151465641588554, + "language_loss": 0.72580135, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.75095737, + "num_input_tokens_seen": 138560860, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.21081543, + "step": 6454, + "time_per_iteration": 2.8691494464874268 + }, + { + "auxiliary_loss_clip": 0.01481033, + "auxiliary_loss_mlp": 0.01036983, + "balance_loss_clip": 1.29784381, + "balance_loss_mlp": 1.01492906, + "epoch": 0.3880955959717421, + "flos": 20350188869760.0, + "grad_norm": 1.9676575668173608, + "language_loss": 0.78671861, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.81189871, + "num_input_tokens_seen": 138580200, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.22058105, + "step": 6455, + "time_per_iteration": 2.8559696674346924 + }, + { + "auxiliary_loss_clip": 0.01473937, + "auxiliary_loss_mlp": 0.01034416, + "balance_loss_clip": 1.29308701, + "balance_loss_mlp": 1.01246953, + "epoch": 0.38815571922441006, + "flos": 23085652872960.0, + "grad_norm": 2.1914610183734173, + "language_loss": 0.76643538, + "learning_rate": 2.798657755439662e-06, + "loss": 0.79151893, + "num_input_tokens_seen": 138598315, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.21960449, + "step": 6456, + "time_per_iteration": 2.8931961059570312 + }, + { + "auxiliary_loss_clip": 0.01475755, + "auxiliary_loss_mlp": 0.01035306, + "balance_loss_clip": 1.29378259, + "balance_loss_mlp": 1.01439679, + "epoch": 0.388215842477078, + "flos": 20786630641920.0, + "grad_norm": 2.2015591730440103, + "language_loss": 0.61973786, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.64484847, + "num_input_tokens_seen": 138615695, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.20922852, + "step": 6457, + "time_per_iteration": 2.852195978164673 + }, + { + "auxiliary_loss_clip": 0.01473364, + "auxiliary_loss_mlp": 0.01037595, + "balance_loss_clip": 1.29011035, + "balance_loss_mlp": 1.01494551, + "epoch": 0.388275965729746, + "flos": 20457681811200.0, + "grad_norm": 2.436070967150388, + "language_loss": 0.80303782, + "learning_rate": 2.797943571912841e-06, + "loss": 0.82814735, + "num_input_tokens_seen": 138633180, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.2265625, + "step": 6458, + "time_per_iteration": 2.8481314182281494 + }, + { + "auxiliary_loss_clip": 0.01473268, + "auxiliary_loss_mlp": 0.01038365, + "balance_loss_clip": 1.29112697, + "balance_loss_mlp": 1.01625156, + "epoch": 0.38833608898241395, + "flos": 27904112593920.0, + "grad_norm": 1.81315455275797, + "language_loss": 0.82588696, + "learning_rate": 2.797586434755509e-06, + "loss": 0.85100329, + "num_input_tokens_seen": 138654785, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.22119141, + "step": 6459, + "time_per_iteration": 2.9705734252929688 + }, + { + "auxiliary_loss_clip": 0.01455555, + "auxiliary_loss_mlp": 0.01032584, + "balance_loss_clip": 1.27912235, + "balance_loss_mlp": 1.0121038, + "epoch": 0.3883962122350819, + "flos": 18085217990400.0, + "grad_norm": 1.8099352372326558, + "language_loss": 0.6368261, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.66170746, + "num_input_tokens_seen": 138673330, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.20471191, + "step": 6460, + "time_per_iteration": 2.839108943939209 + }, + { + "auxiliary_loss_clip": 0.01463363, + "auxiliary_loss_mlp": 0.01036676, + "balance_loss_clip": 1.2850225, + "balance_loss_mlp": 1.01524258, + "epoch": 0.3884563354877499, + "flos": 23632528498560.0, + "grad_norm": 3.7580611129221584, + "language_loss": 0.86891162, + "learning_rate": 2.796872069720717e-06, + "loss": 0.89391202, + "num_input_tokens_seen": 138694185, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.2142334, + "step": 6461, + "time_per_iteration": 2.8754096031188965 + }, + { + "auxiliary_loss_clip": 0.01473467, + "auxiliary_loss_mlp": 0.01039121, + "balance_loss_clip": 1.29247034, + "balance_loss_mlp": 1.01732922, + "epoch": 0.38851645874041785, + "flos": 27464865644160.0, + "grad_norm": 2.2714153902233365, + "language_loss": 0.72698379, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.75210965, + "num_input_tokens_seen": 138714625, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.21777344, + "step": 6462, + "time_per_iteration": 2.8975532054901123 + }, + { + "auxiliary_loss_clip": 0.01470805, + "auxiliary_loss_mlp": 0.01038024, + "balance_loss_clip": 1.28996825, + "balance_loss_mlp": 1.01568377, + "epoch": 0.3885765819930858, + "flos": 25238877799680.0, + "grad_norm": 6.6917940898368125, + "language_loss": 0.77156401, + "learning_rate": 2.796157583816052e-06, + "loss": 0.79665226, + "num_input_tokens_seen": 138733585, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.22351074, + "step": 6463, + "time_per_iteration": 2.922329902648926 + }, + { + "auxiliary_loss_clip": 0.01489555, + "auxiliary_loss_mlp": 0.01044015, + "balance_loss_clip": 1.30257404, + "balance_loss_mlp": 1.02100778, + "epoch": 0.3886367052457538, + "flos": 16955469861120.0, + "grad_norm": 1.8702766843642906, + "language_loss": 0.71651065, + "learning_rate": 2.795800295571382e-06, + "loss": 0.74184632, + "num_input_tokens_seen": 138752335, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.23034668, + "step": 6464, + "time_per_iteration": 2.909059762954712 + }, + { + "auxiliary_loss_clip": 0.01463859, + "auxiliary_loss_mlp": 0.01030682, + "balance_loss_clip": 1.28506041, + "balance_loss_mlp": 1.00924802, + "epoch": 0.38869682849842174, + "flos": 27163452913920.0, + "grad_norm": 2.018986641119819, + "language_loss": 0.70529342, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.7302388, + "num_input_tokens_seen": 138768450, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.21435547, + "step": 6465, + "time_per_iteration": 2.8694539070129395 + }, + { + "auxiliary_loss_clip": 0.0146733, + "auxiliary_loss_mlp": 0.01042124, + "balance_loss_clip": 1.2856704, + "balance_loss_mlp": 1.01874709, + "epoch": 0.3887569517510897, + "flos": 21072026712960.0, + "grad_norm": 2.6850146818168303, + "language_loss": 0.78791839, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.81301296, + "num_input_tokens_seen": 138786775, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.23376465, + "step": 6466, + "time_per_iteration": 2.83375883102417 + }, + { + "auxiliary_loss_clip": 0.01474671, + "auxiliary_loss_mlp": 0.01038525, + "balance_loss_clip": 1.29096472, + "balance_loss_mlp": 1.01636422, + "epoch": 0.38881707500375773, + "flos": 29509421264640.0, + "grad_norm": 1.7837472509113252, + "language_loss": 0.69781858, + "learning_rate": 2.794728249830611e-06, + "loss": 0.72295052, + "num_input_tokens_seen": 138810100, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.22167969, + "step": 6467, + "time_per_iteration": 3.0356523990631104 + }, + { + "auxiliary_loss_clip": 0.01473144, + "auxiliary_loss_mlp": 0.01046356, + "balance_loss_clip": 1.289891, + "balance_loss_mlp": 1.02405155, + "epoch": 0.3888771982564257, + "flos": 17495830235520.0, + "grad_norm": 2.6897972993320693, + "language_loss": 0.84835529, + "learning_rate": 2.794370840959936e-06, + "loss": 0.8735503, + "num_input_tokens_seen": 138825140, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.22302246, + "step": 6468, + "time_per_iteration": 3.009885787963867 + }, + { + "auxiliary_loss_clip": 0.01467082, + "auxiliary_loss_mlp": 0.01043319, + "balance_loss_clip": 1.28749359, + "balance_loss_mlp": 1.02245724, + "epoch": 0.38893732150909366, + "flos": 21951877956480.0, + "grad_norm": 2.052947770134962, + "language_loss": 0.85111868, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.87622261, + "num_input_tokens_seen": 138844115, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.20861816, + "step": 6469, + "time_per_iteration": 2.87727952003479 + }, + { + "auxiliary_loss_clip": 0.01471598, + "auxiliary_loss_mlp": 0.0104336, + "balance_loss_clip": 1.2905519, + "balance_loss_mlp": 1.02041197, + "epoch": 0.3889974447617616, + "flos": 24286127904000.0, + "grad_norm": 1.6731880781646202, + "language_loss": 0.75288224, + "learning_rate": 2.793655932864273e-06, + "loss": 0.77803177, + "num_input_tokens_seen": 138860860, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.22949219, + "step": 6470, + "time_per_iteration": 2.922835350036621 + }, + { + "auxiliary_loss_clip": 0.01463586, + "auxiliary_loss_mlp": 0.01042444, + "balance_loss_clip": 1.28213191, + "balance_loss_mlp": 1.02011669, + "epoch": 0.3890575680144296, + "flos": 25678350973440.0, + "grad_norm": 1.6386236340461162, + "language_loss": 0.75468874, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.77974904, + "num_input_tokens_seen": 138881910, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.2232666, + "step": 6471, + "time_per_iteration": 2.895298957824707 + }, + { + "auxiliary_loss_clip": 0.01473805, + "auxiliary_loss_mlp": 0.01046171, + "balance_loss_clip": 1.29270124, + "balance_loss_mlp": 1.02445054, + "epoch": 0.38911769126709755, + "flos": 22865147124480.0, + "grad_norm": 2.2776537363550586, + "language_loss": 0.68737334, + "learning_rate": 2.792940904386562e-06, + "loss": 0.71257311, + "num_input_tokens_seen": 138900975, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.21728516, + "step": 6472, + "time_per_iteration": 2.8691658973693848 + }, + { + "auxiliary_loss_clip": 0.01471348, + "auxiliary_loss_mlp": 0.01049088, + "balance_loss_clip": 1.29006004, + "balance_loss_mlp": 1.02658105, + "epoch": 0.3891778145197655, + "flos": 25458523896960.0, + "grad_norm": 1.6502567750175128, + "language_loss": 0.76723725, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.79244161, + "num_input_tokens_seen": 138920795, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.22521973, + "step": 6473, + "time_per_iteration": 2.877331018447876 + }, + { + "auxiliary_loss_clip": 0.01479081, + "auxiliary_loss_mlp": 0.01042696, + "balance_loss_clip": 1.29568315, + "balance_loss_mlp": 1.01939034, + "epoch": 0.3892379377724335, + "flos": 14036130414720.0, + "grad_norm": 3.1481225693589185, + "language_loss": 0.72411156, + "learning_rate": 2.792225755635257e-06, + "loss": 0.74932933, + "num_input_tokens_seen": 138938770, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.23278809, + "step": 6474, + "time_per_iteration": 2.826059341430664 + }, + { + "auxiliary_loss_clip": 0.01467027, + "auxiliary_loss_mlp": 0.01041696, + "balance_loss_clip": 1.28581154, + "balance_loss_mlp": 1.0200479, + "epoch": 0.38929806102510145, + "flos": 20167173077760.0, + "grad_norm": 1.5594234756638636, + "language_loss": 0.69724584, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.72233307, + "num_input_tokens_seen": 138958880, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.21630859, + "step": 6475, + "time_per_iteration": 2.8886568546295166 + }, + { + "auxiliary_loss_clip": 0.01484171, + "auxiliary_loss_mlp": 0.01047089, + "balance_loss_clip": 1.29821014, + "balance_loss_mlp": 1.02486825, + "epoch": 0.3893581842777694, + "flos": 22174148252160.0, + "grad_norm": 3.4826287002597947, + "language_loss": 0.76736051, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.79267311, + "num_input_tokens_seen": 138977240, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.22216797, + "step": 6476, + "time_per_iteration": 2.8204667568206787 + }, + { + "auxiliary_loss_clip": 0.01269792, + "auxiliary_loss_mlp": 0.01026206, + "balance_loss_clip": 1.15640378, + "balance_loss_mlp": 1.00188696, + "epoch": 0.3894183075304374, + "flos": 67334555255040.0, + "grad_norm": 0.8198489608675659, + "language_loss": 0.58249038, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.60545039, + "num_input_tokens_seen": 139039035, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.24316406, + "step": 6477, + "time_per_iteration": 3.3603296279907227 + }, + { + "auxiliary_loss_clip": 0.01479656, + "auxiliary_loss_mlp": 0.01038733, + "balance_loss_clip": 1.2941792, + "balance_loss_mlp": 1.01441455, + "epoch": 0.38947843078310534, + "flos": 18555620624640.0, + "grad_norm": 1.9362290964443059, + "language_loss": 0.78700054, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.81218445, + "num_input_tokens_seen": 139055560, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.24328613, + "step": 6478, + "time_per_iteration": 2.8588738441467285 + }, + { + "auxiliary_loss_clip": 0.01453724, + "auxiliary_loss_mlp": 0.0103653, + "balance_loss_clip": 1.27361417, + "balance_loss_mlp": 1.01469135, + "epoch": 0.3895385540357733, + "flos": 14612668646400.0, + "grad_norm": 20.425634629816475, + "language_loss": 0.83245409, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.85735661, + "num_input_tokens_seen": 139071865, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.21838379, + "step": 6479, + "time_per_iteration": 4.239387273788452 + }, + { + "auxiliary_loss_clip": 0.01459382, + "auxiliary_loss_mlp": 0.01038367, + "balance_loss_clip": 1.28029537, + "balance_loss_mlp": 1.01574111, + "epoch": 0.38959867728844133, + "flos": 19984926447360.0, + "grad_norm": 1.8188356817462135, + "language_loss": 0.81275952, + "learning_rate": 2.790079588824617e-06, + "loss": 0.83773696, + "num_input_tokens_seen": 139089640, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.22631836, + "step": 6480, + "time_per_iteration": 2.9282755851745605 + }, + { + "auxiliary_loss_clip": 0.01452399, + "auxiliary_loss_mlp": 0.01033278, + "balance_loss_clip": 1.27439296, + "balance_loss_mlp": 1.01161742, + "epoch": 0.3896588005411093, + "flos": 22681769374080.0, + "grad_norm": 1.6642284044167857, + "language_loss": 0.83807117, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.86292791, + "num_input_tokens_seen": 139109365, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.21655273, + "step": 6481, + "time_per_iteration": 2.979628324508667 + }, + { + "auxiliary_loss_clip": 0.01452093, + "auxiliary_loss_mlp": 0.01037139, + "balance_loss_clip": 1.27674007, + "balance_loss_mlp": 1.01597905, + "epoch": 0.38971892379377726, + "flos": 21005824291200.0, + "grad_norm": 1.8126122118030907, + "language_loss": 0.76101524, + "learning_rate": 2.789363960063863e-06, + "loss": 0.78590763, + "num_input_tokens_seen": 139128260, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.21166992, + "step": 6482, + "time_per_iteration": 2.911851644515991 + }, + { + "auxiliary_loss_clip": 0.01464236, + "auxiliary_loss_mlp": 0.01041872, + "balance_loss_clip": 1.28133214, + "balance_loss_mlp": 1.02085507, + "epoch": 0.3897790470464452, + "flos": 22538686757760.0, + "grad_norm": 2.3329985177127925, + "language_loss": 0.79808515, + "learning_rate": 2.78900610077756e-06, + "loss": 0.82314622, + "num_input_tokens_seen": 139147315, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.21008301, + "step": 6483, + "time_per_iteration": 2.8610291481018066 + }, + { + "auxiliary_loss_clip": 0.01451322, + "auxiliary_loss_mlp": 0.01028988, + "balance_loss_clip": 1.27089453, + "balance_loss_mlp": 1.0071609, + "epoch": 0.3898391702991132, + "flos": 26220113936640.0, + "grad_norm": 1.6891186744655102, + "language_loss": 0.80901259, + "learning_rate": 2.788648211572067e-06, + "loss": 0.83381569, + "num_input_tokens_seen": 139167270, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.21850586, + "step": 6484, + "time_per_iteration": 2.884709596633911 + }, + { + "auxiliary_loss_clip": 0.01466922, + "auxiliary_loss_mlp": 0.01041446, + "balance_loss_clip": 1.28651452, + "balance_loss_mlp": 1.01874852, + "epoch": 0.38989929355178116, + "flos": 21074469932160.0, + "grad_norm": 1.5690804142810402, + "language_loss": 0.78686631, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.81194997, + "num_input_tokens_seen": 139185970, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.22705078, + "step": 6485, + "time_per_iteration": 2.8519203662872314 + }, + { + "auxiliary_loss_clip": 0.01465191, + "auxiliary_loss_mlp": 0.01037081, + "balance_loss_clip": 1.28090131, + "balance_loss_mlp": 1.01503956, + "epoch": 0.3899594168044491, + "flos": 25495154202240.0, + "grad_norm": 2.938284277479139, + "language_loss": 0.86117804, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.88620079, + "num_input_tokens_seen": 139203730, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.22045898, + "step": 6486, + "time_per_iteration": 5.717637538909912 + }, + { + "auxiliary_loss_clip": 0.01471592, + "auxiliary_loss_mlp": 0.01033354, + "balance_loss_clip": 1.28603804, + "balance_loss_mlp": 1.011217, + "epoch": 0.3900195400571171, + "flos": 31151857933440.0, + "grad_norm": 2.5520437927577055, + "language_loss": 0.8616637, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.88671315, + "num_input_tokens_seen": 139222560, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.22143555, + "step": 6487, + "time_per_iteration": 2.8867156505584717 + }, + { + "auxiliary_loss_clip": 0.01446172, + "auxiliary_loss_mlp": 0.01035316, + "balance_loss_clip": 1.26829076, + "balance_loss_mlp": 1.01286888, + "epoch": 0.39007966330978505, + "flos": 20239528792320.0, + "grad_norm": 1.6136897087334714, + "language_loss": 0.74004883, + "learning_rate": 2.787216355829633e-06, + "loss": 0.76486373, + "num_input_tokens_seen": 139242165, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.22460938, + "step": 6488, + "time_per_iteration": 4.255650997161865 + }, + { + "auxiliary_loss_clip": 0.0146968, + "auxiliary_loss_mlp": 0.01040041, + "balance_loss_clip": 1.28528798, + "balance_loss_mlp": 1.01846433, + "epoch": 0.390139786562453, + "flos": 22539003471360.0, + "grad_norm": 1.6973016367126375, + "language_loss": 0.68697822, + "learning_rate": 2.786858317231779e-06, + "loss": 0.71207547, + "num_input_tokens_seen": 139262525, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.21582031, + "step": 6489, + "time_per_iteration": 2.8851261138916016 + }, + { + "auxiliary_loss_clip": 0.01449179, + "auxiliary_loss_mlp": 0.0103702, + "balance_loss_clip": 1.27143812, + "balance_loss_mlp": 1.01566935, + "epoch": 0.390199909815121, + "flos": 26444239269120.0, + "grad_norm": 1.6614095441877768, + "language_loss": 0.81629241, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.8411544, + "num_input_tokens_seen": 139282835, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.21337891, + "step": 6490, + "time_per_iteration": 2.861380100250244 + }, + { + "auxiliary_loss_clip": 0.01462105, + "auxiliary_loss_mlp": 0.01038832, + "balance_loss_clip": 1.28049338, + "balance_loss_mlp": 1.0168972, + "epoch": 0.39026003306778895, + "flos": 17283423306240.0, + "grad_norm": 2.6496128451993566, + "language_loss": 0.90749961, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.93250895, + "num_input_tokens_seen": 139299490, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.21923828, + "step": 6491, + "time_per_iteration": 2.8403480052948 + }, + { + "auxiliary_loss_clip": 0.01467062, + "auxiliary_loss_mlp": 0.01044725, + "balance_loss_clip": 1.28364277, + "balance_loss_mlp": 1.02226639, + "epoch": 0.3903201563204569, + "flos": 24542947244160.0, + "grad_norm": 1.8984510525372758, + "language_loss": 0.79254913, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.81766701, + "num_input_tokens_seen": 139317865, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.22460938, + "step": 6492, + "time_per_iteration": 2.9087772369384766 + }, + { + "auxiliary_loss_clip": 0.01454749, + "auxiliary_loss_mlp": 0.01037222, + "balance_loss_clip": 1.27522135, + "balance_loss_mlp": 1.01439393, + "epoch": 0.39038027957312493, + "flos": 23778190068480.0, + "grad_norm": 2.4098980113312622, + "language_loss": 0.74625432, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.77117395, + "num_input_tokens_seen": 139339840, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.22839355, + "step": 6493, + "time_per_iteration": 2.86013126373291 + }, + { + "auxiliary_loss_clip": 0.01494065, + "auxiliary_loss_mlp": 0.01040747, + "balance_loss_clip": 1.30416441, + "balance_loss_mlp": 1.01772761, + "epoch": 0.3904404028257929, + "flos": 14108712353280.0, + "grad_norm": 12.707937005054411, + "language_loss": 0.77330816, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.79865628, + "num_input_tokens_seen": 139357555, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.22998047, + "step": 6494, + "time_per_iteration": 2.8610501289367676 + }, + { + "auxiliary_loss_clip": 0.01495229, + "auxiliary_loss_mlp": 0.01047392, + "balance_loss_clip": 1.30292869, + "balance_loss_mlp": 1.02425337, + "epoch": 0.39050052607846086, + "flos": 16918884800640.0, + "grad_norm": 2.837665955404574, + "language_loss": 0.75787115, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.78329742, + "num_input_tokens_seen": 139374455, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.23144531, + "step": 6495, + "time_per_iteration": 2.8506293296813965 + }, + { + "auxiliary_loss_clip": 0.01463353, + "auxiliary_loss_mlp": 0.01041209, + "balance_loss_clip": 1.28470898, + "balance_loss_mlp": 1.01910806, + "epoch": 0.39056064933112883, + "flos": 25925306947200.0, + "grad_norm": 1.5961995683225885, + "language_loss": 0.68052977, + "learning_rate": 2.784351212350352e-06, + "loss": 0.70557535, + "num_input_tokens_seen": 139394770, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.22106934, + "step": 6496, + "time_per_iteration": 2.9213039875030518 + }, + { + "auxiliary_loss_clip": 0.01246077, + "auxiliary_loss_mlp": 0.01022988, + "balance_loss_clip": 1.1387701, + "balance_loss_mlp": 1.00086308, + "epoch": 0.3906207725837968, + "flos": 60055892766720.0, + "grad_norm": 0.6614577911319535, + "language_loss": 0.54061711, + "learning_rate": 2.783992935430775e-06, + "loss": 0.56330776, + "num_input_tokens_seen": 139454760, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.22167969, + "step": 6497, + "time_per_iteration": 3.4821202754974365 + }, + { + "auxiliary_loss_clip": 0.0147616, + "auxiliary_loss_mlp": 0.01038603, + "balance_loss_clip": 1.29337895, + "balance_loss_mlp": 1.01647735, + "epoch": 0.39068089583646476, + "flos": 21078406229760.0, + "grad_norm": 2.0237171771950404, + "language_loss": 0.70241982, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.72756743, + "num_input_tokens_seen": 139472645, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.22143555, + "step": 6498, + "time_per_iteration": 2.9563822746276855 + }, + { + "auxiliary_loss_clip": 0.01245281, + "auxiliary_loss_mlp": 0.01020022, + "balance_loss_clip": 1.13839436, + "balance_loss_mlp": 1.00247467, + "epoch": 0.3907410190891327, + "flos": 70480734721920.0, + "grad_norm": 0.7328468323146559, + "language_loss": 0.51780653, + "learning_rate": 2.783276292417936e-06, + "loss": 0.54045951, + "num_input_tokens_seen": 139536730, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.17578125, + "step": 6499, + "time_per_iteration": 3.4432950019836426 + }, + { + "auxiliary_loss_clip": 0.01477106, + "auxiliary_loss_mlp": 0.01042215, + "balance_loss_clip": 1.29118872, + "balance_loss_mlp": 1.01857555, + "epoch": 0.3908011423418007, + "flos": 27973436906880.0, + "grad_norm": 1.6003017893856029, + "language_loss": 0.74879336, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.77398658, + "num_input_tokens_seen": 139557540, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.23657227, + "step": 6500, + "time_per_iteration": 2.979611396789551 + }, + { + "auxiliary_loss_clip": 0.01479284, + "auxiliary_loss_mlp": 0.01045425, + "balance_loss_clip": 1.29217124, + "balance_loss_mlp": 1.02232242, + "epoch": 0.39086126559446865, + "flos": 24472763280000.0, + "grad_norm": 1.8510884974050967, + "language_loss": 0.6942327, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.7194798, + "num_input_tokens_seen": 139576875, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.2310791, + "step": 6501, + "time_per_iteration": 2.8669726848602295 + }, + { + "auxiliary_loss_clip": 0.01469837, + "auxiliary_loss_mlp": 0.01043365, + "balance_loss_clip": 1.2880547, + "balance_loss_mlp": 1.02168059, + "epoch": 0.3909213888471366, + "flos": 16949090344320.0, + "grad_norm": 1.720408861246598, + "language_loss": 0.79274988, + "learning_rate": 2.782201105168287e-06, + "loss": 0.81788188, + "num_input_tokens_seen": 139594295, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.21716309, + "step": 6502, + "time_per_iteration": 2.853104591369629 + }, + { + "auxiliary_loss_clip": 0.01454593, + "auxiliary_loss_mlp": 0.01038286, + "balance_loss_clip": 1.2766819, + "balance_loss_mlp": 1.01669765, + "epoch": 0.3909815120998046, + "flos": 29290363349760.0, + "grad_norm": 2.2260472964275695, + "language_loss": 0.80701578, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.83194458, + "num_input_tokens_seen": 139614080, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.21582031, + "step": 6503, + "time_per_iteration": 2.8915133476257324 + }, + { + "auxiliary_loss_clip": 0.01458157, + "auxiliary_loss_mlp": 0.01040399, + "balance_loss_clip": 1.28059912, + "balance_loss_mlp": 1.01816654, + "epoch": 0.39104163535247255, + "flos": 18960092305920.0, + "grad_norm": 2.1948605446885585, + "language_loss": 0.72240686, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.74739242, + "num_input_tokens_seen": 139632755, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.22229004, + "step": 6504, + "time_per_iteration": 2.865107536315918 + }, + { + "auxiliary_loss_clip": 0.01463038, + "auxiliary_loss_mlp": 0.01036781, + "balance_loss_clip": 1.28204083, + "balance_loss_mlp": 1.01395261, + "epoch": 0.3911017586051405, + "flos": 26334619822080.0, + "grad_norm": 1.6587255387775324, + "language_loss": 0.83796978, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.86296797, + "num_input_tokens_seen": 139654205, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.22814941, + "step": 6505, + "time_per_iteration": 2.8531925678253174 + }, + { + "auxiliary_loss_clip": 0.01466117, + "auxiliary_loss_mlp": 0.01038941, + "balance_loss_clip": 1.28561759, + "balance_loss_mlp": 1.0145154, + "epoch": 0.3911618818578085, + "flos": 21845923338240.0, + "grad_norm": 2.908049162175065, + "language_loss": 0.72020316, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.74525368, + "num_input_tokens_seen": 139673595, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.24414062, + "step": 6506, + "time_per_iteration": 2.8197293281555176 + }, + { + "auxiliary_loss_clip": 0.0145306, + "auxiliary_loss_mlp": 0.01039433, + "balance_loss_clip": 1.27710593, + "balance_loss_mlp": 1.0170337, + "epoch": 0.3912220051104765, + "flos": 16367575184640.0, + "grad_norm": 1.8452941956639932, + "language_loss": 0.76196462, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.78688949, + "num_input_tokens_seen": 139690565, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.22399902, + "step": 6507, + "time_per_iteration": 2.800933599472046 + }, + { + "auxiliary_loss_clip": 0.01246192, + "auxiliary_loss_mlp": 0.01028565, + "balance_loss_clip": 1.13445079, + "balance_loss_mlp": 1.00186241, + "epoch": 0.39128212836314447, + "flos": 71082320590080.0, + "grad_norm": 0.7557618663356298, + "language_loss": 0.56528008, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.58802766, + "num_input_tokens_seen": 139749420, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.26757812, + "step": 6508, + "time_per_iteration": 3.4869067668914795 + }, + { + "auxiliary_loss_clip": 0.0145485, + "auxiliary_loss_mlp": 0.01038118, + "balance_loss_clip": 1.27594733, + "balance_loss_mlp": 1.01568258, + "epoch": 0.39134225161581243, + "flos": 20339692076160.0, + "grad_norm": 2.1393188476974334, + "language_loss": 0.77129066, + "learning_rate": 2.779691297413471e-06, + "loss": 0.79622036, + "num_input_tokens_seen": 139766265, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.2244873, + "step": 6509, + "time_per_iteration": 2.8593108654022217 + }, + { + "auxiliary_loss_clip": 0.01479067, + "auxiliary_loss_mlp": 0.01041819, + "balance_loss_clip": 1.29573584, + "balance_loss_mlp": 1.01810861, + "epoch": 0.3914023748684804, + "flos": 17026965924480.0, + "grad_norm": 3.617374419375828, + "language_loss": 0.83670205, + "learning_rate": 2.779332635075825e-06, + "loss": 0.86191094, + "num_input_tokens_seen": 139782400, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.23693848, + "step": 6510, + "time_per_iteration": 2.982682466506958 + }, + { + "auxiliary_loss_clip": 0.01479391, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.29542947, + "balance_loss_mlp": 1.01335347, + "epoch": 0.39146249812114836, + "flos": 18414257310720.0, + "grad_norm": 4.242789570282186, + "language_loss": 0.77992827, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.80508196, + "num_input_tokens_seen": 139801435, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.22631836, + "step": 6511, + "time_per_iteration": 2.846534013748169 + }, + { + "auxiliary_loss_clip": 0.01248754, + "auxiliary_loss_mlp": 0.01023101, + "balance_loss_clip": 1.13660741, + "balance_loss_mlp": 0.99964011, + "epoch": 0.3915226213738163, + "flos": 67671964863360.0, + "grad_norm": 0.7173514472524468, + "language_loss": 0.57781255, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.60053116, + "num_input_tokens_seen": 139869700, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.234375, + "step": 6512, + "time_per_iteration": 3.3698952198028564 + }, + { + "auxiliary_loss_clip": 0.01473038, + "auxiliary_loss_mlp": 0.01037559, + "balance_loss_clip": 1.29186249, + "balance_loss_mlp": 1.01352644, + "epoch": 0.3915827446264843, + "flos": 26370480965760.0, + "grad_norm": 1.6755171546132923, + "language_loss": 0.70543802, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.73054397, + "num_input_tokens_seen": 139890140, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.24023438, + "step": 6513, + "time_per_iteration": 2.8852343559265137 + }, + { + "auxiliary_loss_clip": 0.01490079, + "auxiliary_loss_mlp": 0.01038175, + "balance_loss_clip": 1.30126214, + "balance_loss_mlp": 1.01559663, + "epoch": 0.39164286787915226, + "flos": 21953732993280.0, + "grad_norm": 6.981320307495923, + "language_loss": 0.76992071, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.79520321, + "num_input_tokens_seen": 139908020, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.22583008, + "step": 6514, + "time_per_iteration": 4.2583982944488525 + }, + { + "auxiliary_loss_clip": 0.01462652, + "auxiliary_loss_mlp": 0.01037655, + "balance_loss_clip": 1.28275394, + "balance_loss_mlp": 1.01501727, + "epoch": 0.3917029911318202, + "flos": 16408684725120.0, + "grad_norm": 1.792733649073772, + "language_loss": 0.78699082, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.81199384, + "num_input_tokens_seen": 139926180, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.22631836, + "step": 6515, + "time_per_iteration": 2.8541603088378906 + }, + { + "auxiliary_loss_clip": 0.01454579, + "auxiliary_loss_mlp": 0.01036503, + "balance_loss_clip": 1.277246, + "balance_loss_mlp": 1.01483083, + "epoch": 0.3917631143844882, + "flos": 26222376176640.0, + "grad_norm": 1.369738973240342, + "language_loss": 0.80450237, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.82941318, + "num_input_tokens_seen": 139947420, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.21679688, + "step": 6516, + "time_per_iteration": 2.874918222427368 + }, + { + "auxiliary_loss_clip": 0.01460763, + "auxiliary_loss_mlp": 0.01039018, + "balance_loss_clip": 1.27860594, + "balance_loss_mlp": 1.01611829, + "epoch": 0.39182323763715615, + "flos": 18556706499840.0, + "grad_norm": 2.15663250984783, + "language_loss": 0.71346545, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.73846322, + "num_input_tokens_seen": 139965800, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.22888184, + "step": 6517, + "time_per_iteration": 2.857529878616333 + }, + { + "auxiliary_loss_clip": 0.01477906, + "auxiliary_loss_mlp": 0.01040674, + "balance_loss_clip": 1.29398489, + "balance_loss_mlp": 1.01871586, + "epoch": 0.3918833608898241, + "flos": 34326840355200.0, + "grad_norm": 1.6455553102558327, + "language_loss": 0.72382867, + "learning_rate": 2.776462273631956e-06, + "loss": 0.7490145, + "num_input_tokens_seen": 139988140, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.21960449, + "step": 6518, + "time_per_iteration": 2.9520936012268066 + }, + { + "auxiliary_loss_clip": 0.01482304, + "auxiliary_loss_mlp": 0.01042854, + "balance_loss_clip": 1.29638338, + "balance_loss_mlp": 1.01960862, + "epoch": 0.3919434841424921, + "flos": 36953318338560.0, + "grad_norm": 1.753587491936517, + "language_loss": 0.6210888, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.64634037, + "num_input_tokens_seen": 140010060, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.2322998, + "step": 6519, + "time_per_iteration": 2.9504072666168213 + }, + { + "auxiliary_loss_clip": 0.01504056, + "auxiliary_loss_mlp": 0.01041148, + "balance_loss_clip": 1.31318772, + "balance_loss_mlp": 1.01837873, + "epoch": 0.3920036073951601, + "flos": 23518203592320.0, + "grad_norm": 8.15818327060903, + "language_loss": 0.68571562, + "learning_rate": 2.775744388563563e-06, + "loss": 0.71116763, + "num_input_tokens_seen": 140029400, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.22766113, + "step": 6520, + "time_per_iteration": 2.9840166568756104 + }, + { + "auxiliary_loss_clip": 0.01478665, + "auxiliary_loss_mlp": 0.01038354, + "balance_loss_clip": 1.29522336, + "balance_loss_mlp": 1.01605034, + "epoch": 0.39206373064782807, + "flos": 18415343185920.0, + "grad_norm": 1.712994880313037, + "language_loss": 0.79395235, + "learning_rate": 2.775385401898104e-06, + "loss": 0.81912255, + "num_input_tokens_seen": 140048940, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.22314453, + "step": 6521, + "time_per_iteration": 4.256750583648682 + }, + { + "auxiliary_loss_clip": 0.01491442, + "auxiliary_loss_mlp": 0.01041033, + "balance_loss_clip": 1.30240369, + "balance_loss_mlp": 1.01689339, + "epoch": 0.39212385390049603, + "flos": 12320749848960.0, + "grad_norm": 2.2838888590056157, + "language_loss": 0.71719515, + "learning_rate": 2.775026385829952e-06, + "loss": 0.74251986, + "num_input_tokens_seen": 140066380, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.24145508, + "step": 6522, + "time_per_iteration": 4.262572288513184 + }, + { + "auxiliary_loss_clip": 0.01478958, + "auxiliary_loss_mlp": 0.01039915, + "balance_loss_clip": 1.29362535, + "balance_loss_mlp": 1.01813519, + "epoch": 0.392183977153164, + "flos": 19728650044800.0, + "grad_norm": 5.776711946182707, + "language_loss": 0.78119576, + "learning_rate": 2.774667340372722e-06, + "loss": 0.80638444, + "num_input_tokens_seen": 140085275, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.21789551, + "step": 6523, + "time_per_iteration": 4.235956192016602 + }, + { + "auxiliary_loss_clip": 0.01486248, + "auxiliary_loss_mlp": 0.01040997, + "balance_loss_clip": 1.29976964, + "balance_loss_mlp": 1.01912212, + "epoch": 0.39224410040583196, + "flos": 33157611498240.0, + "grad_norm": 2.144101962473702, + "language_loss": 0.62729132, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.65256381, + "num_input_tokens_seen": 140105105, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.21887207, + "step": 6524, + "time_per_iteration": 2.944766044616699 + }, + { + "auxiliary_loss_clip": 0.01483324, + "auxiliary_loss_mlp": 0.01036263, + "balance_loss_clip": 1.29438162, + "balance_loss_mlp": 1.01293349, + "epoch": 0.39230422365849993, + "flos": 27793995454080.0, + "grad_norm": 3.153439739620719, + "language_loss": 0.74745744, + "learning_rate": 2.773949161345489e-06, + "loss": 0.77265334, + "num_input_tokens_seen": 140125645, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.23327637, + "step": 6525, + "time_per_iteration": 2.863583564758301 + }, + { + "auxiliary_loss_clip": 0.01487576, + "auxiliary_loss_mlp": 0.01039342, + "balance_loss_clip": 1.29956925, + "balance_loss_mlp": 1.01732385, + "epoch": 0.3923643469111679, + "flos": 17940732785280.0, + "grad_norm": 2.4825014791628206, + "language_loss": 0.81652153, + "learning_rate": 2.773590027802719e-06, + "loss": 0.84179074, + "num_input_tokens_seen": 140141925, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.22033691, + "step": 6526, + "time_per_iteration": 2.8307366371154785 + }, + { + "auxiliary_loss_clip": 0.01507555, + "auxiliary_loss_mlp": 0.01039732, + "balance_loss_clip": 1.31958461, + "balance_loss_mlp": 1.01857233, + "epoch": 0.39242447016383586, + "flos": 24069784677120.0, + "grad_norm": 2.068573661272539, + "language_loss": 0.70861399, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.73408675, + "num_input_tokens_seen": 140160965, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.21179199, + "step": 6527, + "time_per_iteration": 2.8415517807006836 + }, + { + "auxiliary_loss_clip": 0.0148445, + "auxiliary_loss_mlp": 0.0103822, + "balance_loss_clip": 1.30081272, + "balance_loss_mlp": 1.01624942, + "epoch": 0.3924845934165038, + "flos": 10669490444160.0, + "grad_norm": 2.353052875278004, + "language_loss": 0.83746958, + "learning_rate": 2.772871672726965e-06, + "loss": 0.86269635, + "num_input_tokens_seen": 140177780, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.21972656, + "step": 6528, + "time_per_iteration": 2.810439348220825 + }, + { + "auxiliary_loss_clip": 0.01479589, + "auxiliary_loss_mlp": 0.01040048, + "balance_loss_clip": 1.29753983, + "balance_loss_mlp": 1.01751733, + "epoch": 0.3925447166691718, + "flos": 31257676817280.0, + "grad_norm": 1.5860588978559498, + "language_loss": 0.69345576, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.71865213, + "num_input_tokens_seen": 140201660, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.22521973, + "step": 6529, + "time_per_iteration": 2.9746310710906982 + }, + { + "auxiliary_loss_clip": 0.01485064, + "auxiliary_loss_mlp": 0.010436, + "balance_loss_clip": 1.29735506, + "balance_loss_mlp": 1.02068818, + "epoch": 0.39260483992183975, + "flos": 29424623230080.0, + "grad_norm": 2.632745526743396, + "language_loss": 0.81275624, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.83804286, + "num_input_tokens_seen": 140218585, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.22912598, + "step": 6530, + "time_per_iteration": 2.8779969215393066 + }, + { + "auxiliary_loss_clip": 0.01479276, + "auxiliary_loss_mlp": 0.01042567, + "balance_loss_clip": 1.29565322, + "balance_loss_mlp": 1.01944017, + "epoch": 0.3926649631745077, + "flos": 22868314260480.0, + "grad_norm": 1.9031338238280056, + "language_loss": 0.76709366, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.79231209, + "num_input_tokens_seen": 140239905, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.23120117, + "step": 6531, + "time_per_iteration": 2.8764700889587402 + }, + { + "auxiliary_loss_clip": 0.01252326, + "auxiliary_loss_mlp": 0.0104516, + "balance_loss_clip": 1.14402032, + "balance_loss_mlp": 1.02541852, + "epoch": 0.3927250864271757, + "flos": 63921828816000.0, + "grad_norm": 0.8297735564892434, + "language_loss": 0.60433906, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.62731391, + "num_input_tokens_seen": 140293820, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.19726562, + "step": 6532, + "time_per_iteration": 3.200252056121826 + }, + { + "auxiliary_loss_clip": 0.01256339, + "auxiliary_loss_mlp": 0.01027999, + "balance_loss_clip": 1.14762747, + "balance_loss_mlp": 1.00539672, + "epoch": 0.3927852096798437, + "flos": 68943483509760.0, + "grad_norm": 0.7844141063688981, + "language_loss": 0.556095, + "learning_rate": 2.771075272396981e-06, + "loss": 0.57893836, + "num_input_tokens_seen": 140360420, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.22558594, + "step": 6533, + "time_per_iteration": 3.3633668422698975 + }, + { + "auxiliary_loss_clip": 0.01497272, + "auxiliary_loss_mlp": 0.01037545, + "balance_loss_clip": 1.31052828, + "balance_loss_mlp": 1.01634967, + "epoch": 0.39284533293251167, + "flos": 29727981486720.0, + "grad_norm": 1.9367288234962818, + "language_loss": 0.77596992, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.80131805, + "num_input_tokens_seen": 140381950, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.21191406, + "step": 6534, + "time_per_iteration": 2.9465172290802 + }, + { + "auxiliary_loss_clip": 0.01496993, + "auxiliary_loss_mlp": 0.01038887, + "balance_loss_clip": 1.30748987, + "balance_loss_mlp": 1.01665449, + "epoch": 0.39290545618517964, + "flos": 18561321469440.0, + "grad_norm": 2.0171849265588944, + "language_loss": 0.79380453, + "learning_rate": 2.770356507494851e-06, + "loss": 0.81916326, + "num_input_tokens_seen": 140399410, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.22241211, + "step": 6535, + "time_per_iteration": 2.8136367797851562 + }, + { + "auxiliary_loss_clip": 0.0147328, + "auxiliary_loss_mlp": 0.01040236, + "balance_loss_clip": 1.2912364, + "balance_loss_mlp": 1.01834917, + "epoch": 0.3929655794378476, + "flos": 26260499560320.0, + "grad_norm": 1.605071914258273, + "language_loss": 0.68932575, + "learning_rate": 2.769997081218978e-06, + "loss": 0.71446097, + "num_input_tokens_seen": 140419055, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.21875, + "step": 6536, + "time_per_iteration": 2.9005329608917236 + }, + { + "auxiliary_loss_clip": 0.01462204, + "auxiliary_loss_mlp": 0.01038015, + "balance_loss_clip": 1.28379941, + "balance_loss_mlp": 1.01648605, + "epoch": 0.39302570269051557, + "flos": 29289775167360.0, + "grad_norm": 1.9099635530738206, + "language_loss": 0.70077354, + "learning_rate": 2.769637625744738e-06, + "loss": 0.72577572, + "num_input_tokens_seen": 140438800, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.21533203, + "step": 6537, + "time_per_iteration": 2.8891079425811768 + }, + { + "auxiliary_loss_clip": 0.01478646, + "auxiliary_loss_mlp": 0.0104368, + "balance_loss_clip": 1.29367328, + "balance_loss_mlp": 1.01980281, + "epoch": 0.39308582594318353, + "flos": 17356276713600.0, + "grad_norm": 2.301179986908904, + "language_loss": 0.79670823, + "learning_rate": 2.769278141085763e-06, + "loss": 0.82193154, + "num_input_tokens_seen": 140456880, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.2388916, + "step": 6538, + "time_per_iteration": 2.834627866744995 + }, + { + "auxiliary_loss_clip": 0.01248717, + "auxiliary_loss_mlp": 0.01021196, + "balance_loss_clip": 1.13858294, + "balance_loss_mlp": 0.99897557, + "epoch": 0.3931459491958515, + "flos": 61033147361280.0, + "grad_norm": 0.8052640312663555, + "language_loss": 0.61967766, + "learning_rate": 2.768918627255683e-06, + "loss": 0.64237678, + "num_input_tokens_seen": 140507510, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.22265625, + "step": 6539, + "time_per_iteration": 3.1424150466918945 + }, + { + "auxiliary_loss_clip": 0.01483646, + "auxiliary_loss_mlp": 0.01040554, + "balance_loss_clip": 1.30007553, + "balance_loss_mlp": 1.0181669, + "epoch": 0.39320607244851946, + "flos": 39029436846720.0, + "grad_norm": 2.0953723427176567, + "language_loss": 0.68625259, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.71149457, + "num_input_tokens_seen": 140528740, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.22387695, + "step": 6540, + "time_per_iteration": 2.96734881401062 + }, + { + "auxiliary_loss_clip": 0.01484796, + "auxiliary_loss_mlp": 0.01045853, + "balance_loss_clip": 1.30104339, + "balance_loss_mlp": 1.02252352, + "epoch": 0.3932661957011874, + "flos": 24690147137280.0, + "grad_norm": 1.7120149454324094, + "language_loss": 0.72967541, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.75498188, + "num_input_tokens_seen": 140547560, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.23303223, + "step": 6541, + "time_per_iteration": 2.8768720626831055 + }, + { + "auxiliary_loss_clip": 0.01252999, + "auxiliary_loss_mlp": 0.01025621, + "balance_loss_clip": 1.14407766, + "balance_loss_mlp": 1.00263786, + "epoch": 0.3933263189538554, + "flos": 70126132089600.0, + "grad_norm": 0.8248936924620905, + "language_loss": 0.60389388, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.62668002, + "num_input_tokens_seen": 140601175, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.22949219, + "step": 6542, + "time_per_iteration": 3.129950523376465 + }, + { + "auxiliary_loss_clip": 0.01479956, + "auxiliary_loss_mlp": 0.01044746, + "balance_loss_clip": 1.29591346, + "balance_loss_mlp": 1.02140474, + "epoch": 0.39338644220652336, + "flos": 22939176896640.0, + "grad_norm": 3.1261540717692604, + "language_loss": 0.83070922, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.85595626, + "num_input_tokens_seen": 140622200, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.23303223, + "step": 6543, + "time_per_iteration": 2.8621015548706055 + }, + { + "auxiliary_loss_clip": 0.01469356, + "auxiliary_loss_mlp": 0.01044965, + "balance_loss_clip": 1.28551626, + "balance_loss_mlp": 1.02132618, + "epoch": 0.3934465654591913, + "flos": 30860761017600.0, + "grad_norm": 1.896485214834873, + "language_loss": 0.69794786, + "learning_rate": 2.767120621015908e-06, + "loss": 0.72309113, + "num_input_tokens_seen": 140643125, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.23657227, + "step": 6544, + "time_per_iteration": 2.91471266746521 + }, + { + "auxiliary_loss_clip": 0.0148786, + "auxiliary_loss_mlp": 0.01046209, + "balance_loss_clip": 1.29942107, + "balance_loss_mlp": 1.02167594, + "epoch": 0.3935066887118593, + "flos": 29247082058880.0, + "grad_norm": 2.109915128070669, + "language_loss": 0.7639848, + "learning_rate": 2.76676093244553e-06, + "loss": 0.78932548, + "num_input_tokens_seen": 140662500, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.24536133, + "step": 6545, + "time_per_iteration": 2.9296715259552 + }, + { + "auxiliary_loss_clip": 0.01459401, + "auxiliary_loss_mlp": 0.01045994, + "balance_loss_clip": 1.28283834, + "balance_loss_mlp": 1.02370214, + "epoch": 0.3935668119645273, + "flos": 19144601176320.0, + "grad_norm": 1.5157993198059068, + "language_loss": 0.75144851, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.77650249, + "num_input_tokens_seen": 140681960, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.22302246, + "step": 6546, + "time_per_iteration": 2.8275420665740967 + }, + { + "auxiliary_loss_clip": 0.0150677, + "auxiliary_loss_mlp": 0.01042938, + "balance_loss_clip": 1.31472278, + "balance_loss_mlp": 1.01906037, + "epoch": 0.3936269352171953, + "flos": 18525550815360.0, + "grad_norm": 1.9973043510342527, + "language_loss": 0.82351792, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.849015, + "num_input_tokens_seen": 140699170, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.2388916, + "step": 6547, + "time_per_iteration": 2.799750804901123 + }, + { + "auxiliary_loss_clip": 0.01486583, + "auxiliary_loss_mlp": 0.0104182, + "balance_loss_clip": 1.3019762, + "balance_loss_mlp": 1.01869321, + "epoch": 0.39368705846986324, + "flos": 15641258106240.0, + "grad_norm": 2.0218766220773423, + "language_loss": 0.8451618, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.87044573, + "num_input_tokens_seen": 140714920, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.23132324, + "step": 6548, + "time_per_iteration": 2.858471155166626 + }, + { + "auxiliary_loss_clip": 0.01471623, + "auxiliary_loss_mlp": 0.01038332, + "balance_loss_clip": 1.2897172, + "balance_loss_mlp": 1.01729107, + "epoch": 0.3937471817225312, + "flos": 21336085221120.0, + "grad_norm": 1.7513204856908495, + "language_loss": 0.73388767, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.75898719, + "num_input_tokens_seen": 140734595, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.21044922, + "step": 6549, + "time_per_iteration": 4.360385179519653 + }, + { + "auxiliary_loss_clip": 0.01500762, + "auxiliary_loss_mlp": 0.0104547, + "balance_loss_clip": 1.31430566, + "balance_loss_mlp": 1.02162838, + "epoch": 0.39380730497519917, + "flos": 20786313928320.0, + "grad_norm": 1.5107616511517212, + "language_loss": 0.78474641, + "learning_rate": 2.764962053731699e-06, + "loss": 0.81020874, + "num_input_tokens_seen": 140754050, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.23852539, + "step": 6550, + "time_per_iteration": 2.8533008098602295 + }, + { + "auxiliary_loss_clip": 0.01480528, + "auxiliary_loss_mlp": 0.01038924, + "balance_loss_clip": 1.29686213, + "balance_loss_mlp": 1.01629782, + "epoch": 0.39386742822786713, + "flos": 21618042687360.0, + "grad_norm": 2.0100223811213067, + "language_loss": 0.81656879, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.84176332, + "num_input_tokens_seen": 140771440, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.22619629, + "step": 6551, + "time_per_iteration": 2.8873531818389893 + }, + { + "auxiliary_loss_clip": 0.01469733, + "auxiliary_loss_mlp": 0.0104033, + "balance_loss_clip": 1.28538704, + "balance_loss_mlp": 1.01772761, + "epoch": 0.3939275514805351, + "flos": 12420324950400.0, + "grad_norm": 2.1774937406382877, + "language_loss": 0.80593526, + "learning_rate": 2.764242299098596e-06, + "loss": 0.83103597, + "num_input_tokens_seen": 140786715, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.22607422, + "step": 6552, + "time_per_iteration": 2.9269766807556152 + }, + { + "auxiliary_loss_clip": 0.01491072, + "auxiliary_loss_mlp": 0.01046622, + "balance_loss_clip": 1.30464625, + "balance_loss_mlp": 1.02447248, + "epoch": 0.39398767473320306, + "flos": 18561321469440.0, + "grad_norm": 2.154720883761838, + "language_loss": 0.71704704, + "learning_rate": 2.763882378305003e-06, + "loss": 0.74242395, + "num_input_tokens_seen": 140804950, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.22167969, + "step": 6553, + "time_per_iteration": 2.889822244644165 + }, + { + "auxiliary_loss_clip": 0.01467518, + "auxiliary_loss_mlp": 0.010431, + "balance_loss_clip": 1.28547549, + "balance_loss_mlp": 1.02029526, + "epoch": 0.39404779798587103, + "flos": 29319302039040.0, + "grad_norm": 2.0590786757959294, + "language_loss": 0.6506564, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.67576253, + "num_input_tokens_seen": 140822800, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.22802734, + "step": 6554, + "time_per_iteration": 2.925755739212036 + }, + { + "auxiliary_loss_clip": 0.01472356, + "auxiliary_loss_mlp": 0.01042834, + "balance_loss_clip": 1.28873515, + "balance_loss_mlp": 1.02030349, + "epoch": 0.394107921238539, + "flos": 34910436775680.0, + "grad_norm": 23.005876682661086, + "language_loss": 0.80254412, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.82769597, + "num_input_tokens_seen": 140842940, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.2253418, + "step": 6555, + "time_per_iteration": 2.9472358226776123 + }, + { + "auxiliary_loss_clip": 0.01486856, + "auxiliary_loss_mlp": 0.01038624, + "balance_loss_clip": 1.30029416, + "balance_loss_mlp": 1.01481807, + "epoch": 0.39416804449120696, + "flos": 25091994620160.0, + "grad_norm": 1.630655908738108, + "language_loss": 0.72679985, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.75205469, + "num_input_tokens_seen": 140863060, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.23815918, + "step": 6556, + "time_per_iteration": 4.28685736656189 + }, + { + "auxiliary_loss_clip": 0.01475483, + "auxiliary_loss_mlp": 0.01035065, + "balance_loss_clip": 1.29115808, + "balance_loss_mlp": 1.01276064, + "epoch": 0.3942281677438749, + "flos": 32319367488000.0, + "grad_norm": 2.3214319040599323, + "language_loss": 0.83904105, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.86414647, + "num_input_tokens_seen": 140883795, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.22302246, + "step": 6557, + "time_per_iteration": 4.37511134147644 + }, + { + "auxiliary_loss_clip": 0.01481856, + "auxiliary_loss_mlp": 0.01039075, + "balance_loss_clip": 1.29715168, + "balance_loss_mlp": 1.0165925, + "epoch": 0.3942882909965429, + "flos": 24947464170240.0, + "grad_norm": 2.064544376955438, + "language_loss": 0.81300175, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.83821112, + "num_input_tokens_seen": 140903055, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.22473145, + "step": 6558, + "time_per_iteration": 4.268579721450806 + }, + { + "auxiliary_loss_clip": 0.0147978, + "auxiliary_loss_mlp": 0.01036629, + "balance_loss_clip": 1.29711282, + "balance_loss_mlp": 1.01409864, + "epoch": 0.39434841424921085, + "flos": 11882272060800.0, + "grad_norm": 2.2100435787577952, + "language_loss": 0.7218855, + "learning_rate": 2.761722245724792e-06, + "loss": 0.74704957, + "num_input_tokens_seen": 140920685, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.2253418, + "step": 6559, + "time_per_iteration": 2.7939467430114746 + }, + { + "auxiliary_loss_clip": 0.01503229, + "auxiliary_loss_mlp": 0.01040983, + "balance_loss_clip": 1.3119272, + "balance_loss_mlp": 1.01830983, + "epoch": 0.3944085375018789, + "flos": 16370018403840.0, + "grad_norm": 1.9738544126592652, + "language_loss": 0.80983186, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.83527398, + "num_input_tokens_seen": 140937320, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.22668457, + "step": 6560, + "time_per_iteration": 2.799424648284912 + }, + { + "auxiliary_loss_clip": 0.01487102, + "auxiliary_loss_mlp": 0.01040618, + "balance_loss_clip": 1.30070114, + "balance_loss_mlp": 1.01745558, + "epoch": 0.39446866075454684, + "flos": 10640280286080.0, + "grad_norm": 1.994206091552577, + "language_loss": 0.8379246, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.8632018, + "num_input_tokens_seen": 140954855, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.23156738, + "step": 6561, + "time_per_iteration": 2.846052646636963 + }, + { + "auxiliary_loss_clip": 0.01468628, + "auxiliary_loss_mlp": 0.01035069, + "balance_loss_clip": 1.28457093, + "balance_loss_mlp": 1.01251507, + "epoch": 0.3945287840072148, + "flos": 18196782963840.0, + "grad_norm": 2.422890216631079, + "language_loss": 0.81424081, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.83927774, + "num_input_tokens_seen": 140973250, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.22546387, + "step": 6562, + "time_per_iteration": 2.8391945362091064 + }, + { + "auxiliary_loss_clip": 0.01468561, + "auxiliary_loss_mlp": 0.01039566, + "balance_loss_clip": 1.28732133, + "balance_loss_mlp": 1.01691651, + "epoch": 0.39458890725988277, + "flos": 23050379911680.0, + "grad_norm": 1.5404717174373483, + "language_loss": 0.82061183, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.84569311, + "num_input_tokens_seen": 140993050, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.22644043, + "step": 6563, + "time_per_iteration": 2.9049787521362305 + }, + { + "auxiliary_loss_clip": 0.01475575, + "auxiliary_loss_mlp": 0.01038833, + "balance_loss_clip": 1.29013693, + "balance_loss_mlp": 1.01649344, + "epoch": 0.39464903051255074, + "flos": 17166293222400.0, + "grad_norm": 2.4617212188495645, + "language_loss": 0.70996672, + "learning_rate": 2.759921340790127e-06, + "loss": 0.73511082, + "num_input_tokens_seen": 141010815, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.2232666, + "step": 6564, + "time_per_iteration": 2.8074395656585693 + }, + { + "auxiliary_loss_clip": 0.01481578, + "auxiliary_loss_mlp": 0.01038625, + "balance_loss_clip": 1.29474294, + "balance_loss_mlp": 1.01568913, + "epoch": 0.3947091537652187, + "flos": 15897715488000.0, + "grad_norm": 2.6602540727915747, + "language_loss": 0.84789193, + "learning_rate": 2.759561073299676e-06, + "loss": 0.87309396, + "num_input_tokens_seen": 141028720, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.22937012, + "step": 6565, + "time_per_iteration": 2.837442636489868 + }, + { + "auxiliary_loss_clip": 0.014725, + "auxiliary_loss_mlp": 0.01035268, + "balance_loss_clip": 1.28869128, + "balance_loss_mlp": 1.01319051, + "epoch": 0.39476927701788667, + "flos": 18553086915840.0, + "grad_norm": 1.7655644726715483, + "language_loss": 0.84606266, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.87114036, + "num_input_tokens_seen": 141046025, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.22058105, + "step": 6566, + "time_per_iteration": 2.793774127960205 + }, + { + "auxiliary_loss_clip": 0.01501007, + "auxiliary_loss_mlp": 0.01040933, + "balance_loss_clip": 1.30888116, + "balance_loss_mlp": 1.01918876, + "epoch": 0.39482940027055463, + "flos": 22286527632000.0, + "grad_norm": 3.186992962660655, + "language_loss": 0.78571385, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.81113321, + "num_input_tokens_seen": 141066865, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.21740723, + "step": 6567, + "time_per_iteration": 2.861283540725708 + }, + { + "auxiliary_loss_clip": 0.01457389, + "auxiliary_loss_mlp": 0.01039224, + "balance_loss_clip": 1.27805948, + "balance_loss_mlp": 1.01761198, + "epoch": 0.3948895235232226, + "flos": 14765976587520.0, + "grad_norm": 2.446334283916939, + "language_loss": 0.80780041, + "learning_rate": 2.758480098067182e-06, + "loss": 0.83276647, + "num_input_tokens_seen": 141084210, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.21618652, + "step": 6568, + "time_per_iteration": 2.8019163608551025 + }, + { + "auxiliary_loss_clip": 0.01473127, + "auxiliary_loss_mlp": 0.01039294, + "balance_loss_clip": 1.2886132, + "balance_loss_mlp": 1.0169065, + "epoch": 0.39494964677589056, + "flos": 22576176714240.0, + "grad_norm": 1.5783561122097896, + "language_loss": 0.85738629, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.88251048, + "num_input_tokens_seen": 141103895, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.22387695, + "step": 6569, + "time_per_iteration": 2.871307134628296 + }, + { + "auxiliary_loss_clip": 0.01473724, + "auxiliary_loss_mlp": 0.0104082, + "balance_loss_clip": 1.29112279, + "balance_loss_mlp": 1.01869512, + "epoch": 0.3950097700285585, + "flos": 22972685310720.0, + "grad_norm": 1.8149179879007349, + "language_loss": 0.75317466, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.77832013, + "num_input_tokens_seen": 141124000, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.22131348, + "step": 6570, + "time_per_iteration": 2.866025447845459 + }, + { + "auxiliary_loss_clip": 0.01470264, + "auxiliary_loss_mlp": 0.01038107, + "balance_loss_clip": 1.28562343, + "balance_loss_mlp": 1.01659012, + "epoch": 0.3950698932812265, + "flos": 20605062683520.0, + "grad_norm": 1.5865369347785425, + "language_loss": 0.80531251, + "learning_rate": 2.757398863979922e-06, + "loss": 0.83039629, + "num_input_tokens_seen": 141142535, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.21520996, + "step": 6571, + "time_per_iteration": 2.835641384124756 + }, + { + "auxiliary_loss_clip": 0.01470822, + "auxiliary_loss_mlp": 0.01038733, + "balance_loss_clip": 1.28788424, + "balance_loss_mlp": 1.01684654, + "epoch": 0.39513001653389446, + "flos": 20385416586240.0, + "grad_norm": 2.0891886773397683, + "language_loss": 0.78874087, + "learning_rate": 2.757038395157997e-06, + "loss": 0.81383646, + "num_input_tokens_seen": 141161575, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.21887207, + "step": 6572, + "time_per_iteration": 2.8703367710113525 + }, + { + "auxiliary_loss_clip": 0.0147982, + "auxiliary_loss_mlp": 0.01037033, + "balance_loss_clip": 1.29361761, + "balance_loss_mlp": 1.01428771, + "epoch": 0.3951901397865625, + "flos": 26473494672000.0, + "grad_norm": 2.1750907042735776, + "language_loss": 0.7514416, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.7766102, + "num_input_tokens_seen": 141181150, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.22729492, + "step": 6573, + "time_per_iteration": 2.8805041313171387 + }, + { + "auxiliary_loss_clip": 0.01479869, + "auxiliary_loss_mlp": 0.01035381, + "balance_loss_clip": 1.29626048, + "balance_loss_mlp": 1.01453102, + "epoch": 0.39525026303923044, + "flos": 43854638042880.0, + "grad_norm": 1.593476149063084, + "language_loss": 0.68314344, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.70829594, + "num_input_tokens_seen": 141206310, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.20849609, + "step": 6574, + "time_per_iteration": 3.124318838119507 + }, + { + "auxiliary_loss_clip": 0.01477643, + "auxiliary_loss_mlp": 0.01035414, + "balance_loss_clip": 1.29043925, + "balance_loss_mlp": 1.0130024, + "epoch": 0.3953103862918984, + "flos": 18049854539520.0, + "grad_norm": 2.3150344167708927, + "language_loss": 0.7323705, + "learning_rate": 2.755956816505072e-06, + "loss": 0.75750107, + "num_input_tokens_seen": 141223925, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.22412109, + "step": 6575, + "time_per_iteration": 2.8557560443878174 + }, + { + "auxiliary_loss_clip": 0.0149352, + "auxiliary_loss_mlp": 0.01042708, + "balance_loss_clip": 1.30404568, + "balance_loss_mlp": 1.01971316, + "epoch": 0.3953705095445664, + "flos": 16983051206400.0, + "grad_norm": 3.0166831190399694, + "language_loss": 0.74060446, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.76596677, + "num_input_tokens_seen": 141239010, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.22998047, + "step": 6576, + "time_per_iteration": 2.907925605773926 + }, + { + "auxiliary_loss_clip": 0.01474956, + "auxiliary_loss_mlp": 0.01037567, + "balance_loss_clip": 1.28951383, + "balance_loss_mlp": 1.01694441, + "epoch": 0.39543063279723434, + "flos": 17417592696960.0, + "grad_norm": 2.33763481752221, + "language_loss": 0.8438313, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.86895657, + "num_input_tokens_seen": 141252255, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.20629883, + "step": 6577, + "time_per_iteration": 2.895383358001709 + }, + { + "auxiliary_loss_clip": 0.01482899, + "auxiliary_loss_mlp": 0.01040367, + "balance_loss_clip": 1.29922807, + "balance_loss_mlp": 1.01761007, + "epoch": 0.3954907560499023, + "flos": 22794555957120.0, + "grad_norm": 2.4946180715393633, + "language_loss": 0.91537964, + "learning_rate": 2.75487497985853e-06, + "loss": 0.94061226, + "num_input_tokens_seen": 141269325, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.22753906, + "step": 6578, + "time_per_iteration": 2.9213907718658447 + }, + { + "auxiliary_loss_clip": 0.01488697, + "auxiliary_loss_mlp": 0.01042235, + "balance_loss_clip": 1.29864359, + "balance_loss_mlp": 1.01853585, + "epoch": 0.39555087930257027, + "flos": 21954366420480.0, + "grad_norm": 1.9331173304684939, + "language_loss": 0.79063666, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.81594592, + "num_input_tokens_seen": 141288505, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.23718262, + "step": 6579, + "time_per_iteration": 2.952899694442749 + }, + { + "auxiliary_loss_clip": 0.01492287, + "auxiliary_loss_mlp": 0.01037414, + "balance_loss_clip": 1.30243576, + "balance_loss_mlp": 1.0136317, + "epoch": 0.39561100255523823, + "flos": 20412274014720.0, + "grad_norm": 2.25203564265753, + "language_loss": 0.69400918, + "learning_rate": 2.754153612280037e-06, + "loss": 0.71930623, + "num_input_tokens_seen": 141303680, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.23803711, + "step": 6580, + "time_per_iteration": 2.8576860427856445 + }, + { + "auxiliary_loss_clip": 0.01472771, + "auxiliary_loss_mlp": 0.01037126, + "balance_loss_clip": 1.28704071, + "balance_loss_mlp": 1.01515555, + "epoch": 0.3956711258079062, + "flos": 27976106350080.0, + "grad_norm": 1.929766114834269, + "language_loss": 0.59708238, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.62218136, + "num_input_tokens_seen": 141324090, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.21960449, + "step": 6581, + "time_per_iteration": 2.9328601360321045 + }, + { + "auxiliary_loss_clip": 0.01480082, + "auxiliary_loss_mlp": 0.01038704, + "balance_loss_clip": 1.29438698, + "balance_loss_mlp": 1.01595938, + "epoch": 0.39573124906057416, + "flos": 14436303840000.0, + "grad_norm": 2.257470849691793, + "language_loss": 0.7048713, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.73005921, + "num_input_tokens_seen": 141342235, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.22753906, + "step": 6582, + "time_per_iteration": 2.936013698577881 + }, + { + "auxiliary_loss_clip": 0.01484426, + "auxiliary_loss_mlp": 0.01037796, + "balance_loss_clip": 1.29732001, + "balance_loss_mlp": 1.01636255, + "epoch": 0.39579137231324213, + "flos": 18742844183040.0, + "grad_norm": 4.850254105393422, + "language_loss": 0.77062416, + "learning_rate": 2.753071346464642e-06, + "loss": 0.79584634, + "num_input_tokens_seen": 141361195, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.2142334, + "step": 6583, + "time_per_iteration": 2.8234238624572754 + }, + { + "auxiliary_loss_clip": 0.01477311, + "auxiliary_loss_mlp": 0.01034862, + "balance_loss_clip": 1.29092383, + "balance_loss_mlp": 1.01446509, + "epoch": 0.3958514955659101, + "flos": 17685497013120.0, + "grad_norm": 1.4997864119641209, + "language_loss": 0.66754079, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.69266254, + "num_input_tokens_seen": 141378275, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.20410156, + "step": 6584, + "time_per_iteration": 2.836475372314453 + }, + { + "auxiliary_loss_clip": 0.01496704, + "auxiliary_loss_mlp": 0.01045441, + "balance_loss_clip": 1.30636358, + "balance_loss_mlp": 1.02332783, + "epoch": 0.39591161881857806, + "flos": 29319573507840.0, + "grad_norm": 2.2869198702366065, + "language_loss": 0.7339831, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.75940454, + "num_input_tokens_seen": 141396960, + "router_z_loss_clip": 1.90136719, + "router_z_loss_mlp": 0.22106934, + "step": 6585, + "time_per_iteration": 4.310010671615601 + }, + { + "auxiliary_loss_clip": 0.01474366, + "auxiliary_loss_mlp": 0.01036081, + "balance_loss_clip": 1.28697908, + "balance_loss_mlp": 1.01495767, + "epoch": 0.3959717420712461, + "flos": 25781862372480.0, + "grad_norm": 2.756985163476723, + "language_loss": 0.742046, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.76715052, + "num_input_tokens_seen": 141417320, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.21118164, + "step": 6586, + "time_per_iteration": 2.9039525985717773 + }, + { + "auxiliary_loss_clip": 0.01484702, + "auxiliary_loss_mlp": 0.01039289, + "balance_loss_clip": 1.29637241, + "balance_loss_mlp": 1.01690173, + "epoch": 0.39603186532391405, + "flos": 20933740045440.0, + "grad_norm": 1.6122831067467862, + "language_loss": 0.72106218, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.74630213, + "num_input_tokens_seen": 141435985, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.22399902, + "step": 6587, + "time_per_iteration": 2.8939497470855713 + }, + { + "auxiliary_loss_clip": 0.01242106, + "auxiliary_loss_mlp": 0.01024492, + "balance_loss_clip": 1.13783884, + "balance_loss_mlp": 1.00475109, + "epoch": 0.396091988576582, + "flos": 54906357709440.0, + "grad_norm": 0.8740181847071061, + "language_loss": 0.61278713, + "learning_rate": 2.751266999157285e-06, + "loss": 0.6354531, + "num_input_tokens_seen": 141486075, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.19726562, + "step": 6588, + "time_per_iteration": 3.2782557010650635 + }, + { + "auxiliary_loss_clip": 0.01493082, + "auxiliary_loss_mlp": 0.01039621, + "balance_loss_clip": 1.30390477, + "balance_loss_mlp": 1.01823509, + "epoch": 0.39615211182925, + "flos": 20712284156160.0, + "grad_norm": 1.6979532649248734, + "language_loss": 0.81773698, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.84306395, + "num_input_tokens_seen": 141505280, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.21374512, + "step": 6589, + "time_per_iteration": 2.9053702354431152 + }, + { + "auxiliary_loss_clip": 0.01490885, + "auxiliary_loss_mlp": 0.01037302, + "balance_loss_clip": 1.30111742, + "balance_loss_mlp": 1.01491439, + "epoch": 0.39621223508191794, + "flos": 21003652540800.0, + "grad_norm": 2.645348759630068, + "language_loss": 0.70878124, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.73406315, + "num_input_tokens_seen": 141523930, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.22375488, + "step": 6590, + "time_per_iteration": 2.954557418823242 + }, + { + "auxiliary_loss_clip": 0.01477142, + "auxiliary_loss_mlp": 0.01037903, + "balance_loss_clip": 1.29195988, + "balance_loss_mlp": 1.01655281, + "epoch": 0.3962723583345859, + "flos": 23379554966400.0, + "grad_norm": 1.7765237536725684, + "language_loss": 0.75894034, + "learning_rate": 2.750184048805956e-06, + "loss": 0.78409082, + "num_input_tokens_seen": 141541320, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.21350098, + "step": 6591, + "time_per_iteration": 2.9496283531188965 + }, + { + "auxiliary_loss_clip": 0.0148147, + "auxiliary_loss_mlp": 0.01039473, + "balance_loss_clip": 1.2946527, + "balance_loss_mlp": 1.0171212, + "epoch": 0.39633248158725387, + "flos": 25125593523840.0, + "grad_norm": 2.2327032530817634, + "language_loss": 0.7880711, + "learning_rate": 2.749823008443152e-06, + "loss": 0.81328052, + "num_input_tokens_seen": 141561880, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.22338867, + "step": 6592, + "time_per_iteration": 5.74885368347168 + }, + { + "auxiliary_loss_clip": 0.01467612, + "auxiliary_loss_mlp": 0.01035935, + "balance_loss_clip": 1.28506815, + "balance_loss_mlp": 1.01444125, + "epoch": 0.39639260483992184, + "flos": 39802654800000.0, + "grad_norm": 2.0519178569021395, + "language_loss": 0.69530213, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.72033763, + "num_input_tokens_seen": 141586460, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.21459961, + "step": 6593, + "time_per_iteration": 4.4713521003723145 + }, + { + "auxiliary_loss_clip": 0.01486005, + "auxiliary_loss_mlp": 0.01044371, + "balance_loss_clip": 1.29662621, + "balance_loss_mlp": 1.02234125, + "epoch": 0.3964527280925898, + "flos": 17355779020800.0, + "grad_norm": 2.6506204407076286, + "language_loss": 0.78044224, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.80574596, + "num_input_tokens_seen": 141605955, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.22033691, + "step": 6594, + "time_per_iteration": 2.94380784034729 + }, + { + "auxiliary_loss_clip": 0.01245823, + "auxiliary_loss_mlp": 0.01034177, + "balance_loss_clip": 1.13871956, + "balance_loss_mlp": 1.016343, + "epoch": 0.39651285134525777, + "flos": 71751981899520.0, + "grad_norm": 1.1493737908206458, + "language_loss": 0.63134992, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.65414989, + "num_input_tokens_seen": 141673140, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.17871094, + "step": 6595, + "time_per_iteration": 3.4127001762390137 + }, + { + "auxiliary_loss_clip": 0.01497954, + "auxiliary_loss_mlp": 0.01041919, + "balance_loss_clip": 1.30718112, + "balance_loss_mlp": 1.01986563, + "epoch": 0.39657297459792573, + "flos": 25786748810880.0, + "grad_norm": 2.0759823633828485, + "language_loss": 0.64069873, + "learning_rate": 2.748378562795223e-06, + "loss": 0.66609746, + "num_input_tokens_seen": 141692955, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.22070312, + "step": 6596, + "time_per_iteration": 2.916390895843506 + }, + { + "auxiliary_loss_clip": 0.01457862, + "auxiliary_loss_mlp": 0.01038727, + "balance_loss_clip": 1.27720165, + "balance_loss_mlp": 1.01804376, + "epoch": 0.3966330978505937, + "flos": 20275661404800.0, + "grad_norm": 1.7861893724144253, + "language_loss": 0.79292023, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.81788617, + "num_input_tokens_seen": 141710680, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.20666504, + "step": 6597, + "time_per_iteration": 2.9172351360321045 + }, + { + "auxiliary_loss_clip": 0.01495618, + "auxiliary_loss_mlp": 0.01040225, + "balance_loss_clip": 1.30506778, + "balance_loss_mlp": 1.01798081, + "epoch": 0.39669322110326166, + "flos": 20640878582400.0, + "grad_norm": 2.263292847172142, + "language_loss": 0.68497074, + "learning_rate": 2.747656169644941e-06, + "loss": 0.71032917, + "num_input_tokens_seen": 141729860, + "router_z_loss_clip": 1.90625, + "router_z_loss_mlp": 0.22253418, + "step": 6598, + "time_per_iteration": 2.8455846309661865 + }, + { + "auxiliary_loss_clip": 0.01482165, + "auxiliary_loss_mlp": 0.01039547, + "balance_loss_clip": 1.29377866, + "balance_loss_mlp": 1.01869726, + "epoch": 0.3967533443559297, + "flos": 21736122912000.0, + "grad_norm": 2.6715068887304625, + "language_loss": 0.79484105, + "learning_rate": 2.747294930536157e-06, + "loss": 0.82005817, + "num_input_tokens_seen": 141749060, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.20837402, + "step": 6599, + "time_per_iteration": 2.9381017684936523 + }, + { + "auxiliary_loss_clip": 0.0148589, + "auxiliary_loss_mlp": 0.01039234, + "balance_loss_clip": 1.29776335, + "balance_loss_mlp": 1.01604748, + "epoch": 0.39681346760859765, + "flos": 25495199447040.0, + "grad_norm": 2.2958424263427366, + "language_loss": 0.73252112, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.75777233, + "num_input_tokens_seen": 141769860, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.23193359, + "step": 6600, + "time_per_iteration": 2.885343551635742 + }, + { + "auxiliary_loss_clip": 0.0147746, + "auxiliary_loss_mlp": 0.0103688, + "balance_loss_clip": 1.28987539, + "balance_loss_mlp": 1.01558876, + "epoch": 0.3968735908612656, + "flos": 20969374965120.0, + "grad_norm": 2.210982761197086, + "language_loss": 0.86426091, + "learning_rate": 2.746572367319791e-06, + "loss": 0.8894043, + "num_input_tokens_seen": 141788465, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.2130127, + "step": 6601, + "time_per_iteration": 2.85606050491333 + }, + { + "auxiliary_loss_clip": 0.01494752, + "auxiliary_loss_mlp": 0.01044533, + "balance_loss_clip": 1.30188775, + "balance_loss_mlp": 1.02116823, + "epoch": 0.3969337141139336, + "flos": 10714219568640.0, + "grad_norm": 2.438144760481022, + "language_loss": 0.70808136, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.73347425, + "num_input_tokens_seen": 141804955, + "router_z_loss_clip": 1.9296875, + "router_z_loss_mlp": 0.23376465, + "step": 6602, + "time_per_iteration": 2.796848773956299 + }, + { + "auxiliary_loss_clip": 0.01485633, + "auxiliary_loss_mlp": 0.01045793, + "balance_loss_clip": 1.29501867, + "balance_loss_mlp": 1.02397752, + "epoch": 0.39699383736660154, + "flos": 17600246530560.0, + "grad_norm": 3.0046105078852, + "language_loss": 0.85038489, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.87569916, + "num_input_tokens_seen": 141820025, + "router_z_loss_clip": 1.90722656, + "router_z_loss_mlp": 0.21826172, + "step": 6603, + "time_per_iteration": 2.8304295539855957 + }, + { + "auxiliary_loss_clip": 0.0148068, + "auxiliary_loss_mlp": 0.01038193, + "balance_loss_clip": 1.29388535, + "balance_loss_mlp": 1.01693797, + "epoch": 0.3970539606192695, + "flos": 17795071215360.0, + "grad_norm": 1.8195238822985256, + "language_loss": 0.7349658, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.7601546, + "num_input_tokens_seen": 141838735, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.21264648, + "step": 6604, + "time_per_iteration": 2.843843698501587 + }, + { + "auxiliary_loss_clip": 0.01464191, + "auxiliary_loss_mlp": 0.0104122, + "balance_loss_clip": 1.28384614, + "balance_loss_mlp": 1.02029848, + "epoch": 0.3971140838719375, + "flos": 24799992808320.0, + "grad_norm": 1.7586239919664313, + "language_loss": 0.82954371, + "learning_rate": 2.745126901275491e-06, + "loss": 0.85459781, + "num_input_tokens_seen": 141858090, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.20910645, + "step": 6605, + "time_per_iteration": 2.861524820327759 + }, + { + "auxiliary_loss_clip": 0.01470834, + "auxiliary_loss_mlp": 0.01036659, + "balance_loss_clip": 1.28614497, + "balance_loss_mlp": 1.01495075, + "epoch": 0.39717420712460544, + "flos": 24254293547520.0, + "grad_norm": 2.778611059543577, + "language_loss": 0.7488209, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.77389586, + "num_input_tokens_seen": 141877540, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.21716309, + "step": 6606, + "time_per_iteration": 2.990558385848999 + }, + { + "auxiliary_loss_clip": 0.01490021, + "auxiliary_loss_mlp": 0.01035966, + "balance_loss_clip": 1.29993939, + "balance_loss_mlp": 1.01491332, + "epoch": 0.3972343303772734, + "flos": 25895644341120.0, + "grad_norm": 1.832985078083235, + "language_loss": 0.74589497, + "learning_rate": 2.744403998666805e-06, + "loss": 0.77115488, + "num_input_tokens_seen": 141897315, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.21069336, + "step": 6607, + "time_per_iteration": 2.870893716812134 + }, + { + "auxiliary_loss_clip": 0.01489745, + "auxiliary_loss_mlp": 0.01041019, + "balance_loss_clip": 1.30014467, + "balance_loss_mlp": 1.01951337, + "epoch": 0.39729445362994137, + "flos": 45639840614400.0, + "grad_norm": 1.4567900588699736, + "language_loss": 0.68663007, + "learning_rate": 2.744042505013797e-06, + "loss": 0.71193773, + "num_input_tokens_seen": 141919580, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.21508789, + "step": 6608, + "time_per_iteration": 3.0299103260040283 + }, + { + "auxiliary_loss_clip": 0.01492896, + "auxiliary_loss_mlp": 0.0104249, + "balance_loss_clip": 1.30121541, + "balance_loss_mlp": 1.02005506, + "epoch": 0.39735457688260933, + "flos": 20203712893440.0, + "grad_norm": 1.9873138562668455, + "language_loss": 0.75270557, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.77805942, + "num_input_tokens_seen": 141937045, + "router_z_loss_clip": 1.91503906, + "router_z_loss_mlp": 0.22412109, + "step": 6609, + "time_per_iteration": 2.878965139389038 + }, + { + "auxiliary_loss_clip": 0.01496928, + "auxiliary_loss_mlp": 0.01045528, + "balance_loss_clip": 1.30827749, + "balance_loss_mlp": 1.0234983, + "epoch": 0.3974147001352773, + "flos": 23341567317120.0, + "grad_norm": 1.9057705259741453, + "language_loss": 0.7240485, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.74947309, + "num_input_tokens_seen": 141956695, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.22021484, + "step": 6610, + "time_per_iteration": 2.8512966632843018 + }, + { + "auxiliary_loss_clip": 0.01467685, + "auxiliary_loss_mlp": 0.01041876, + "balance_loss_clip": 1.2845794, + "balance_loss_mlp": 1.01976228, + "epoch": 0.39747482338794526, + "flos": 21698451976320.0, + "grad_norm": 1.6649936205138955, + "language_loss": 0.79294044, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.81803608, + "num_input_tokens_seen": 141975935, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.22106934, + "step": 6611, + "time_per_iteration": 2.8537490367889404 + }, + { + "auxiliary_loss_clip": 0.01483943, + "auxiliary_loss_mlp": 0.01045236, + "balance_loss_clip": 1.29548335, + "balance_loss_mlp": 1.0237422, + "epoch": 0.3975349466406133, + "flos": 30999952581120.0, + "grad_norm": 1.8694837516679714, + "language_loss": 0.7991693, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.8244611, + "num_input_tokens_seen": 141995750, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.21484375, + "step": 6612, + "time_per_iteration": 2.9016566276550293 + }, + { + "auxiliary_loss_clip": 0.01229635, + "auxiliary_loss_mlp": 0.01040026, + "balance_loss_clip": 1.12363505, + "balance_loss_mlp": 1.02018976, + "epoch": 0.39759506989328125, + "flos": 63714172590720.0, + "grad_norm": 0.8613701226834495, + "language_loss": 0.65054369, + "learning_rate": 2.742234613810459e-06, + "loss": 0.6732403, + "num_input_tokens_seen": 142057655, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.19824219, + "step": 6613, + "time_per_iteration": 3.2614665031433105 + }, + { + "auxiliary_loss_clip": 0.01483428, + "auxiliary_loss_mlp": 0.01041106, + "balance_loss_clip": 1.29706907, + "balance_loss_mlp": 1.0191592, + "epoch": 0.3976551931459492, + "flos": 23706286801920.0, + "grad_norm": 7.858383440013012, + "language_loss": 0.72740179, + "learning_rate": 2.741872951078109e-06, + "loss": 0.75264716, + "num_input_tokens_seen": 142076020, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.21936035, + "step": 6614, + "time_per_iteration": 2.8597917556762695 + }, + { + "auxiliary_loss_clip": 0.01480504, + "auxiliary_loss_mlp": 0.01040293, + "balance_loss_clip": 1.29509568, + "balance_loss_mlp": 1.01825142, + "epoch": 0.3977153163986172, + "flos": 15678205125120.0, + "grad_norm": 1.85362720576193, + "language_loss": 0.82140237, + "learning_rate": 2.741511260213862e-06, + "loss": 0.84661031, + "num_input_tokens_seen": 142093790, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.22033691, + "step": 6615, + "time_per_iteration": 2.944305896759033 + }, + { + "auxiliary_loss_clip": 0.01480675, + "auxiliary_loss_mlp": 0.01033509, + "balance_loss_clip": 1.29292452, + "balance_loss_mlp": 1.01274276, + "epoch": 0.39777543965128515, + "flos": 14072941699200.0, + "grad_norm": 1.9785460549099283, + "language_loss": 0.68180203, + "learning_rate": 2.741149541231434e-06, + "loss": 0.70694387, + "num_input_tokens_seen": 142110545, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.20776367, + "step": 6616, + "time_per_iteration": 2.892889976501465 + }, + { + "auxiliary_loss_clip": 0.01501956, + "auxiliary_loss_mlp": 0.01040161, + "balance_loss_clip": 1.30936933, + "balance_loss_mlp": 1.01858413, + "epoch": 0.3978355629039531, + "flos": 23377835664000.0, + "grad_norm": 2.8706099509194205, + "language_loss": 0.8475318, + "learning_rate": 2.740787794144541e-06, + "loss": 0.87295288, + "num_input_tokens_seen": 142128695, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.21582031, + "step": 6617, + "time_per_iteration": 2.959622383117676 + }, + { + "auxiliary_loss_clip": 0.01476434, + "auxiliary_loss_mlp": 0.01039946, + "balance_loss_clip": 1.29456091, + "balance_loss_mlp": 1.01972795, + "epoch": 0.3978956861566211, + "flos": 19072200216960.0, + "grad_norm": 1.7105717562858966, + "language_loss": 0.72671384, + "learning_rate": 2.7404260189669e-06, + "loss": 0.75187761, + "num_input_tokens_seen": 142148375, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.20214844, + "step": 6618, + "time_per_iteration": 2.8881735801696777 + }, + { + "auxiliary_loss_clip": 0.01488423, + "auxiliary_loss_mlp": 0.01037668, + "balance_loss_clip": 1.30045462, + "balance_loss_mlp": 1.01400518, + "epoch": 0.39795580940928904, + "flos": 30240081843840.0, + "grad_norm": 1.790417154422876, + "language_loss": 0.66451579, + "learning_rate": 2.740064215712231e-06, + "loss": 0.68977672, + "num_input_tokens_seen": 142169735, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.23681641, + "step": 6619, + "time_per_iteration": 4.334418058395386 + }, + { + "auxiliary_loss_clip": 0.01230265, + "auxiliary_loss_mlp": 0.01042204, + "balance_loss_clip": 1.12538528, + "balance_loss_mlp": 1.02284455, + "epoch": 0.398015932661957, + "flos": 69878316464640.0, + "grad_norm": 0.777801330185096, + "language_loss": 0.58349729, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.60622203, + "num_input_tokens_seen": 142229520, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.19335938, + "step": 6620, + "time_per_iteration": 3.320657730102539 + }, + { + "auxiliary_loss_clip": 0.01474988, + "auxiliary_loss_mlp": 0.01034624, + "balance_loss_clip": 1.29166722, + "balance_loss_mlp": 1.01425147, + "epoch": 0.39807605591462497, + "flos": 20167580280960.0, + "grad_norm": 1.6852235727119147, + "language_loss": 0.79696846, + "learning_rate": 2.739340525026686e-06, + "loss": 0.82206458, + "num_input_tokens_seen": 142247660, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.20373535, + "step": 6621, + "time_per_iteration": 2.863731861114502 + }, + { + "auxiliary_loss_clip": 0.01479312, + "auxiliary_loss_mlp": 0.01035406, + "balance_loss_clip": 1.29639149, + "balance_loss_mlp": 1.01398361, + "epoch": 0.39813617916729294, + "flos": 21151531105920.0, + "grad_norm": 6.062456004448127, + "language_loss": 0.78872299, + "learning_rate": 2.738978637623252e-06, + "loss": 0.81387019, + "num_input_tokens_seen": 142266990, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.21411133, + "step": 6622, + "time_per_iteration": 2.8547496795654297 + }, + { + "auxiliary_loss_clip": 0.01475272, + "auxiliary_loss_mlp": 0.010381, + "balance_loss_clip": 1.29009533, + "balance_loss_mlp": 1.01556993, + "epoch": 0.3981963024199609, + "flos": 18997537017600.0, + "grad_norm": 1.5576856212578405, + "language_loss": 0.7594732, + "learning_rate": 2.738616722197674e-06, + "loss": 0.78460693, + "num_input_tokens_seen": 142287170, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.2253418, + "step": 6623, + "time_per_iteration": 2.8832640647888184 + }, + { + "auxiliary_loss_clip": 0.0148873, + "auxiliary_loss_mlp": 0.01042453, + "balance_loss_clip": 1.30217016, + "balance_loss_mlp": 1.02080488, + "epoch": 0.39825642567262887, + "flos": 16582153864320.0, + "grad_norm": 2.184057595019245, + "language_loss": 0.80569506, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.83100688, + "num_input_tokens_seen": 142305405, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.21630859, + "step": 6624, + "time_per_iteration": 2.844261407852173 + }, + { + "auxiliary_loss_clip": 0.01504825, + "auxiliary_loss_mlp": 0.0104033, + "balance_loss_clip": 1.31328404, + "balance_loss_mlp": 1.01800215, + "epoch": 0.39831654892529683, + "flos": 22209873661440.0, + "grad_norm": 1.9848074252639605, + "language_loss": 0.8485114, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.87396294, + "num_input_tokens_seen": 142322710, + "router_z_loss_clip": 1.91503906, + "router_z_loss_mlp": 0.2232666, + "step": 6625, + "time_per_iteration": 2.832756757736206 + }, + { + "auxiliary_loss_clip": 0.0148327, + "auxiliary_loss_mlp": 0.01045963, + "balance_loss_clip": 1.29913759, + "balance_loss_mlp": 1.02312231, + "epoch": 0.39837667217796485, + "flos": 10494980674560.0, + "grad_norm": 2.378175900333542, + "language_loss": 0.87445927, + "learning_rate": 2.737530807925321e-06, + "loss": 0.89975166, + "num_input_tokens_seen": 142338535, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.22827148, + "step": 6626, + "time_per_iteration": 4.214043378829956 + }, + { + "auxiliary_loss_clip": 0.01476268, + "auxiliary_loss_mlp": 0.01039374, + "balance_loss_clip": 1.28993678, + "balance_loss_mlp": 1.01819015, + "epoch": 0.3984367954306328, + "flos": 17973653016960.0, + "grad_norm": 2.5889033414007265, + "language_loss": 0.84414101, + "learning_rate": 2.737168780548417e-06, + "loss": 0.86929744, + "num_input_tokens_seen": 142354570, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.21203613, + "step": 6627, + "time_per_iteration": 4.299873352050781 + }, + { + "auxiliary_loss_clip": 0.01465167, + "auxiliary_loss_mlp": 0.01038857, + "balance_loss_clip": 1.28178573, + "balance_loss_mlp": 1.01810312, + "epoch": 0.3984969186833008, + "flos": 22721340591360.0, + "grad_norm": 1.7826922125057385, + "language_loss": 0.83824605, + "learning_rate": 2.736806725217998e-06, + "loss": 0.86328626, + "num_input_tokens_seen": 142374395, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.20751953, + "step": 6628, + "time_per_iteration": 2.887967348098755 + }, + { + "auxiliary_loss_clip": 0.0149903, + "auxiliary_loss_mlp": 0.0104256, + "balance_loss_clip": 1.31046557, + "balance_loss_mlp": 1.02041054, + "epoch": 0.39855704193596875, + "flos": 23416637719680.0, + "grad_norm": 1.6635885480747885, + "language_loss": 0.72011828, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.74553418, + "num_input_tokens_seen": 142396040, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.22131348, + "step": 6629, + "time_per_iteration": 4.316423654556274 + }, + { + "auxiliary_loss_clip": 0.01455903, + "auxiliary_loss_mlp": 0.01042552, + "balance_loss_clip": 1.27737856, + "balance_loss_mlp": 1.02090323, + "epoch": 0.3986171651886367, + "flos": 21261919714560.0, + "grad_norm": 1.8646172401037568, + "language_loss": 0.81034625, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.83533078, + "num_input_tokens_seen": 142415495, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.21643066, + "step": 6630, + "time_per_iteration": 2.8564646244049072 + }, + { + "auxiliary_loss_clip": 0.01473077, + "auxiliary_loss_mlp": 0.01036653, + "balance_loss_clip": 1.28734756, + "balance_loss_mlp": 1.01463461, + "epoch": 0.3986772884413047, + "flos": 12466366174080.0, + "grad_norm": 3.164461314559046, + "language_loss": 0.76318049, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.78827775, + "num_input_tokens_seen": 142431865, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.22009277, + "step": 6631, + "time_per_iteration": 2.8109452724456787 + }, + { + "auxiliary_loss_clip": 0.01471353, + "auxiliary_loss_mlp": 0.01041347, + "balance_loss_clip": 1.28606606, + "balance_loss_mlp": 1.01931727, + "epoch": 0.39873741169397264, + "flos": 19655570413440.0, + "grad_norm": 1.765926667132695, + "language_loss": 0.72772241, + "learning_rate": 2.735358224635783e-06, + "loss": 0.75284946, + "num_input_tokens_seen": 142450595, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.22045898, + "step": 6632, + "time_per_iteration": 2.8458011150360107 + }, + { + "auxiliary_loss_clip": 0.01455904, + "auxiliary_loss_mlp": 0.01041559, + "balance_loss_clip": 1.27507925, + "balance_loss_mlp": 1.01946974, + "epoch": 0.3987975349466406, + "flos": 21693565537920.0, + "grad_norm": 2.317838278960125, + "language_loss": 0.75535572, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.78033036, + "num_input_tokens_seen": 142466650, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.22106934, + "step": 6633, + "time_per_iteration": 2.82204532623291 + }, + { + "auxiliary_loss_clip": 0.01471947, + "auxiliary_loss_mlp": 0.01041115, + "balance_loss_clip": 1.28611743, + "balance_loss_mlp": 1.0189898, + "epoch": 0.3988576581993086, + "flos": 23924394576000.0, + "grad_norm": 1.7783172905490863, + "language_loss": 0.81580746, + "learning_rate": 2.7346338069806e-06, + "loss": 0.84093815, + "num_input_tokens_seen": 142486165, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.22131348, + "step": 6634, + "time_per_iteration": 2.8431789875030518 + }, + { + "auxiliary_loss_clip": 0.01483419, + "auxiliary_loss_mlp": 0.01038761, + "balance_loss_clip": 1.29750264, + "balance_loss_mlp": 1.01694584, + "epoch": 0.39891778145197654, + "flos": 18158885804160.0, + "grad_norm": 1.9091389275848272, + "language_loss": 0.76069069, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.78591245, + "num_input_tokens_seen": 142505035, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.21826172, + "step": 6635, + "time_per_iteration": 2.867730140686035 + }, + { + "auxiliary_loss_clip": 0.01504446, + "auxiliary_loss_mlp": 0.01045434, + "balance_loss_clip": 1.31119013, + "balance_loss_mlp": 1.02178264, + "epoch": 0.3989779047046445, + "flos": 22603893793920.0, + "grad_norm": 4.217272985903355, + "language_loss": 0.66805291, + "learning_rate": 2.733909277895868e-06, + "loss": 0.69355178, + "num_input_tokens_seen": 142521870, + "router_z_loss_clip": 1.93359375, + "router_z_loss_mlp": 0.23669434, + "step": 6636, + "time_per_iteration": 2.83748459815979 + }, + { + "auxiliary_loss_clip": 0.01468044, + "auxiliary_loss_mlp": 0.01042482, + "balance_loss_clip": 1.28586817, + "balance_loss_mlp": 1.02054739, + "epoch": 0.39903802795731247, + "flos": 18086258620800.0, + "grad_norm": 1.7708653506718106, + "language_loss": 0.82918596, + "learning_rate": 2.733546971601763e-06, + "loss": 0.85429126, + "num_input_tokens_seen": 142540455, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.21948242, + "step": 6637, + "time_per_iteration": 2.8742995262145996 + }, + { + "auxiliary_loss_clip": 0.01238473, + "auxiliary_loss_mlp": 0.01054747, + "balance_loss_clip": 1.12651479, + "balance_loss_mlp": 1.02937937, + "epoch": 0.39909815120998043, + "flos": 70475305345920.0, + "grad_norm": 0.7382101704866447, + "language_loss": 0.53240436, + "learning_rate": 2.733184637491484e-06, + "loss": 0.55533653, + "num_input_tokens_seen": 142599665, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.25390625, + "step": 6638, + "time_per_iteration": 3.4348180294036865 + }, + { + "auxiliary_loss_clip": 0.01470872, + "auxiliary_loss_mlp": 0.01039576, + "balance_loss_clip": 1.28621352, + "balance_loss_mlp": 1.01752186, + "epoch": 0.39915827446264845, + "flos": 18557837619840.0, + "grad_norm": 1.6269036989430905, + "language_loss": 0.75785476, + "learning_rate": 2.732822275578769e-06, + "loss": 0.78295922, + "num_input_tokens_seen": 142618845, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.22058105, + "step": 6639, + "time_per_iteration": 2.853752613067627 + }, + { + "auxiliary_loss_clip": 0.01457374, + "auxiliary_loss_mlp": 0.01038541, + "balance_loss_clip": 1.27832484, + "balance_loss_mlp": 1.01677322, + "epoch": 0.3992183977153164, + "flos": 29909006507520.0, + "grad_norm": 1.6659626558589975, + "language_loss": 0.76696181, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.7919209, + "num_input_tokens_seen": 142640885, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.21777344, + "step": 6640, + "time_per_iteration": 2.9235072135925293 + }, + { + "auxiliary_loss_clip": 0.01470857, + "auxiliary_loss_mlp": 0.01038716, + "balance_loss_clip": 1.28719008, + "balance_loss_mlp": 1.01687729, + "epoch": 0.3992785209679844, + "flos": 22575317063040.0, + "grad_norm": 2.182998791851487, + "language_loss": 0.82788908, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.85298479, + "num_input_tokens_seen": 142659340, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.21850586, + "step": 6641, + "time_per_iteration": 2.8876326084136963 + }, + { + "auxiliary_loss_clip": 0.01480655, + "auxiliary_loss_mlp": 0.01038383, + "balance_loss_clip": 1.29634559, + "balance_loss_mlp": 1.01584029, + "epoch": 0.39933864422065235, + "flos": 19692155473920.0, + "grad_norm": 1.8584846787098261, + "language_loss": 0.77461982, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.79981011, + "num_input_tokens_seen": 142677085, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.22546387, + "step": 6642, + "time_per_iteration": 2.834116220474243 + }, + { + "auxiliary_loss_clip": 0.01491198, + "auxiliary_loss_mlp": 0.01038962, + "balance_loss_clip": 1.30318165, + "balance_loss_mlp": 1.01708698, + "epoch": 0.3993987674733203, + "flos": 23048615364480.0, + "grad_norm": 2.2755161144694322, + "language_loss": 0.72729278, + "learning_rate": 2.731372550178393e-06, + "loss": 0.75259435, + "num_input_tokens_seen": 142694595, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.21862793, + "step": 6643, + "time_per_iteration": 2.8666434288024902 + }, + { + "auxiliary_loss_clip": 0.01489156, + "auxiliary_loss_mlp": 0.0103866, + "balance_loss_clip": 1.30207253, + "balance_loss_mlp": 1.01660669, + "epoch": 0.3994588907259883, + "flos": 19400244151680.0, + "grad_norm": 1.6609068848290167, + "language_loss": 0.66779613, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.69307423, + "num_input_tokens_seen": 142714175, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.22058105, + "step": 6644, + "time_per_iteration": 2.8829734325408936 + }, + { + "auxiliary_loss_clip": 0.0147528, + "auxiliary_loss_mlp": 0.01043043, + "balance_loss_clip": 1.2902534, + "balance_loss_mlp": 1.02054799, + "epoch": 0.39951901397865625, + "flos": 13742047342080.0, + "grad_norm": 3.161573040176594, + "language_loss": 0.79085034, + "learning_rate": 2.730647521020907e-06, + "loss": 0.8160336, + "num_input_tokens_seen": 142730955, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.22497559, + "step": 6645, + "time_per_iteration": 2.843592882156372 + }, + { + "auxiliary_loss_clip": 0.01486723, + "auxiliary_loss_mlp": 0.01039779, + "balance_loss_clip": 1.29988909, + "balance_loss_mlp": 1.01739109, + "epoch": 0.3995791372313242, + "flos": 23596893578880.0, + "grad_norm": 1.569574497980234, + "language_loss": 0.70484215, + "learning_rate": 2.73028496487595e-06, + "loss": 0.73010719, + "num_input_tokens_seen": 142751200, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.22363281, + "step": 6646, + "time_per_iteration": 2.985799551010132 + }, + { + "auxiliary_loss_clip": 0.01474731, + "auxiliary_loss_mlp": 0.01036126, + "balance_loss_clip": 1.2890234, + "balance_loss_mlp": 1.01502609, + "epoch": 0.3996392604839922, + "flos": 21364480972800.0, + "grad_norm": 2.6458993570767095, + "language_loss": 0.72376704, + "learning_rate": 2.729922381038513e-06, + "loss": 0.74887574, + "num_input_tokens_seen": 142770170, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.2109375, + "step": 6647, + "time_per_iteration": 2.90041184425354 + }, + { + "auxiliary_loss_clip": 0.01457853, + "auxiliary_loss_mlp": 0.01042173, + "balance_loss_clip": 1.27904749, + "balance_loss_mlp": 1.0205363, + "epoch": 0.39969938373666014, + "flos": 26043658640640.0, + "grad_norm": 1.3933508052905788, + "language_loss": 0.74451929, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.76951957, + "num_input_tokens_seen": 142792680, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.21630859, + "step": 6648, + "time_per_iteration": 2.8953137397766113 + }, + { + "auxiliary_loss_clip": 0.01463434, + "auxiliary_loss_mlp": 0.01037507, + "balance_loss_clip": 1.28053904, + "balance_loss_mlp": 1.01507223, + "epoch": 0.3997595069893281, + "flos": 20124887172480.0, + "grad_norm": 2.7597341609715156, + "language_loss": 0.67365932, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.69866872, + "num_input_tokens_seen": 142810510, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.2244873, + "step": 6649, + "time_per_iteration": 2.9237916469573975 + }, + { + "auxiliary_loss_clip": 0.01486922, + "auxiliary_loss_mlp": 0.01037723, + "balance_loss_clip": 1.30365992, + "balance_loss_mlp": 1.01634872, + "epoch": 0.39981963024199607, + "flos": 27795217063680.0, + "grad_norm": 1.7225191059592717, + "language_loss": 0.76483428, + "learning_rate": 2.728834463508826e-06, + "loss": 0.79008073, + "num_input_tokens_seen": 142832455, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.21374512, + "step": 6650, + "time_per_iteration": 2.905372381210327 + }, + { + "auxiliary_loss_clip": 0.01479961, + "auxiliary_loss_mlp": 0.01041207, + "balance_loss_clip": 1.29598284, + "balance_loss_mlp": 1.02021396, + "epoch": 0.39987975349466404, + "flos": 21954411665280.0, + "grad_norm": 1.632673247646959, + "language_loss": 0.72037703, + "learning_rate": 2.728471769038975e-06, + "loss": 0.74558866, + "num_input_tokens_seen": 142852590, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.21008301, + "step": 6651, + "time_per_iteration": 2.8894877433776855 + }, + { + "auxiliary_loss_clip": 0.01484936, + "auxiliary_loss_mlp": 0.01035006, + "balance_loss_clip": 1.29770029, + "balance_loss_mlp": 1.01382232, + "epoch": 0.39993987674733206, + "flos": 20714184437760.0, + "grad_norm": 1.8464898260405884, + "language_loss": 0.7432183, + "learning_rate": 2.728109046945403e-06, + "loss": 0.76841778, + "num_input_tokens_seen": 142870595, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.21191406, + "step": 6652, + "time_per_iteration": 2.845848798751831 + }, + { + "auxiliary_loss_clip": 0.01242901, + "auxiliary_loss_mlp": 0.01018492, + "balance_loss_clip": 1.13411665, + "balance_loss_mlp": 1.00037241, + "epoch": 0.4, + "flos": 61553572761600.0, + "grad_norm": 0.8554467543259106, + "language_loss": 0.60712844, + "learning_rate": 2.727746297241862e-06, + "loss": 0.62974238, + "num_input_tokens_seen": 142925805, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.18164062, + "step": 6653, + "time_per_iteration": 3.268691301345825 + }, + { + "auxiliary_loss_clip": 0.01465693, + "auxiliary_loss_mlp": 0.01037599, + "balance_loss_clip": 1.28840423, + "balance_loss_mlp": 1.01520002, + "epoch": 0.400060123252668, + "flos": 14510605080960.0, + "grad_norm": 2.1058417312825664, + "language_loss": 0.67326003, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.69829297, + "num_input_tokens_seen": 142943145, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.22387695, + "step": 6654, + "time_per_iteration": 4.27107572555542 + }, + { + "auxiliary_loss_clip": 0.01470751, + "auxiliary_loss_mlp": 0.01039282, + "balance_loss_clip": 1.2883507, + "balance_loss_mlp": 1.01925468, + "epoch": 0.40012024650533595, + "flos": 19101184151040.0, + "grad_norm": 2.1866879259848067, + "language_loss": 0.90245938, + "learning_rate": 2.7270207150599e-06, + "loss": 0.92755973, + "num_input_tokens_seen": 142956925, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.20031738, + "step": 6655, + "time_per_iteration": 2.8469367027282715 + }, + { + "auxiliary_loss_clip": 0.01459647, + "auxiliary_loss_mlp": 0.01034178, + "balance_loss_clip": 1.28270578, + "balance_loss_mlp": 1.01356649, + "epoch": 0.4001803697580039, + "flos": 29362900043520.0, + "grad_norm": 1.8283643725139043, + "language_loss": 0.73775256, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.76269084, + "num_input_tokens_seen": 142978040, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.20605469, + "step": 6656, + "time_per_iteration": 3.0052499771118164 + }, + { + "auxiliary_loss_clip": 0.01476339, + "auxiliary_loss_mlp": 0.01037252, + "balance_loss_clip": 1.29233885, + "balance_loss_mlp": 1.01592505, + "epoch": 0.4002404930106719, + "flos": 20929034586240.0, + "grad_norm": 1.5939524436443075, + "language_loss": 0.74558878, + "learning_rate": 2.726295022603144e-06, + "loss": 0.77072465, + "num_input_tokens_seen": 142998390, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.21325684, + "step": 6657, + "time_per_iteration": 3.0338175296783447 + }, + { + "auxiliary_loss_clip": 0.01478657, + "auxiliary_loss_mlp": 0.01047558, + "balance_loss_clip": 1.2950654, + "balance_loss_mlp": 1.02569461, + "epoch": 0.40030061626333985, + "flos": 28418248967040.0, + "grad_norm": 1.9719515102529566, + "language_loss": 0.79962915, + "learning_rate": 2.725932135056117e-06, + "loss": 0.82489133, + "num_input_tokens_seen": 143021505, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.21875, + "step": 6658, + "time_per_iteration": 2.9869658946990967 + }, + { + "auxiliary_loss_clip": 0.01490082, + "auxiliary_loss_mlp": 0.01036195, + "balance_loss_clip": 1.30514145, + "balance_loss_mlp": 1.01517892, + "epoch": 0.4003607395160078, + "flos": 25932681849600.0, + "grad_norm": 1.9022246397826297, + "language_loss": 0.78404963, + "learning_rate": 2.72556921998167e-06, + "loss": 0.80931234, + "num_input_tokens_seen": 143041375, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.21020508, + "step": 6659, + "time_per_iteration": 2.893192768096924 + }, + { + "auxiliary_loss_clip": 0.01451068, + "auxiliary_loss_mlp": 0.01036747, + "balance_loss_clip": 1.2757709, + "balance_loss_mlp": 1.0172683, + "epoch": 0.4004208627686758, + "flos": 20776179093120.0, + "grad_norm": 1.7262817845156733, + "language_loss": 0.73602653, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.76090467, + "num_input_tokens_seen": 143058725, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.19470215, + "step": 6660, + "time_per_iteration": 2.802809476852417 + }, + { + "auxiliary_loss_clip": 0.01472411, + "auxiliary_loss_mlp": 0.01039863, + "balance_loss_clip": 1.29066014, + "balance_loss_mlp": 1.01957345, + "epoch": 0.40048098602134374, + "flos": 24692002174080.0, + "grad_norm": 2.1694569915491693, + "language_loss": 0.71975219, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.74487495, + "num_input_tokens_seen": 143076995, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.20275879, + "step": 6661, + "time_per_iteration": 5.795670986175537 + }, + { + "auxiliary_loss_clip": 0.01480144, + "auxiliary_loss_mlp": 0.01042465, + "balance_loss_clip": 1.29481411, + "balance_loss_mlp": 1.02123356, + "epoch": 0.4005411092740117, + "flos": 23196177216000.0, + "grad_norm": 1.7070350022625265, + "language_loss": 0.76172233, + "learning_rate": 2.724480309731437e-06, + "loss": 0.78694844, + "num_input_tokens_seen": 143096780, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.21228027, + "step": 6662, + "time_per_iteration": 2.8437774181365967 + }, + { + "auxiliary_loss_clip": 0.014788, + "auxiliary_loss_mlp": 0.01041321, + "balance_loss_clip": 1.29248118, + "balance_loss_mlp": 1.01937509, + "epoch": 0.4006012325266797, + "flos": 17529248160000.0, + "grad_norm": 10.065590615343613, + "language_loss": 0.67108524, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.6962865, + "num_input_tokens_seen": 143112590, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.21948242, + "step": 6663, + "time_per_iteration": 4.187743663787842 + }, + { + "auxiliary_loss_clip": 0.01481331, + "auxiliary_loss_mlp": 0.01041742, + "balance_loss_clip": 1.29712796, + "balance_loss_mlp": 1.02158356, + "epoch": 0.40066135577934764, + "flos": 19864674472320.0, + "grad_norm": 1.9451263806523116, + "language_loss": 0.86974078, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.89497149, + "num_input_tokens_seen": 143130220, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.20153809, + "step": 6664, + "time_per_iteration": 2.8048341274261475 + }, + { + "auxiliary_loss_clip": 0.01462833, + "auxiliary_loss_mlp": 0.01042005, + "balance_loss_clip": 1.27993488, + "balance_loss_mlp": 1.02086926, + "epoch": 0.40072147903201566, + "flos": 18159021538560.0, + "grad_norm": 2.2835635234663014, + "language_loss": 0.84999514, + "learning_rate": 2.723391152229917e-06, + "loss": 0.87504351, + "num_input_tokens_seen": 143147160, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.21142578, + "step": 6665, + "time_per_iteration": 2.8077855110168457 + }, + { + "auxiliary_loss_clip": 0.01474227, + "auxiliary_loss_mlp": 0.01042582, + "balance_loss_clip": 1.29024017, + "balance_loss_mlp": 1.02152967, + "epoch": 0.4007816022846836, + "flos": 18670533713280.0, + "grad_norm": 1.5993297310547436, + "language_loss": 0.79036885, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.81553692, + "num_input_tokens_seen": 143164605, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.21057129, + "step": 6666, + "time_per_iteration": 2.8053057193756104 + }, + { + "auxiliary_loss_clip": 0.01483709, + "auxiliary_loss_mlp": 0.01043593, + "balance_loss_clip": 1.30019951, + "balance_loss_mlp": 1.02288675, + "epoch": 0.4008417255373516, + "flos": 25714302606720.0, + "grad_norm": 2.129987297038019, + "language_loss": 0.74658793, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.77186096, + "num_input_tokens_seen": 143183965, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.20690918, + "step": 6667, + "time_per_iteration": 2.9194271564483643 + }, + { + "auxiliary_loss_clip": 0.01484188, + "auxiliary_loss_mlp": 0.01042442, + "balance_loss_clip": 1.29836512, + "balance_loss_mlp": 1.02007842, + "epoch": 0.40090184879001955, + "flos": 22869354890880.0, + "grad_norm": 1.4785743934221591, + "language_loss": 0.75959212, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.78485847, + "num_input_tokens_seen": 143204965, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.22363281, + "step": 6668, + "time_per_iteration": 2.92448091506958 + }, + { + "auxiliary_loss_clip": 0.01473914, + "auxiliary_loss_mlp": 0.01045146, + "balance_loss_clip": 1.29526401, + "balance_loss_mlp": 1.0250597, + "epoch": 0.4009619720426875, + "flos": 29071848372480.0, + "grad_norm": 1.9384936607106849, + "language_loss": 0.83097982, + "learning_rate": 2.721938558257248e-06, + "loss": 0.85617042, + "num_input_tokens_seen": 143225015, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.20092773, + "step": 6669, + "time_per_iteration": 2.9238102436065674 + }, + { + "auxiliary_loss_clip": 0.01232523, + "auxiliary_loss_mlp": 0.01057885, + "balance_loss_clip": 1.12750137, + "balance_loss_mlp": 1.03709543, + "epoch": 0.4010220952953555, + "flos": 66091703829120.0, + "grad_norm": 0.7126103941881856, + "language_loss": 0.53425461, + "learning_rate": 2.721575341289695e-06, + "loss": 0.55715871, + "num_input_tokens_seen": 143294925, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.20800781, + "step": 6670, + "time_per_iteration": 3.5269203186035156 + }, + { + "auxiliary_loss_clip": 0.01460311, + "auxiliary_loss_mlp": 0.01035511, + "balance_loss_clip": 1.28039575, + "balance_loss_mlp": 1.01517344, + "epoch": 0.40108221854802345, + "flos": 29654766120960.0, + "grad_norm": 2.464883817648517, + "language_loss": 0.8889159, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.91387409, + "num_input_tokens_seen": 143314170, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.20361328, + "step": 6671, + "time_per_iteration": 2.9420790672302246 + }, + { + "auxiliary_loss_clip": 0.01479199, + "auxiliary_loss_mlp": 0.01040644, + "balance_loss_clip": 1.29518342, + "balance_loss_mlp": 1.01919854, + "epoch": 0.4011423418006914, + "flos": 19936894452480.0, + "grad_norm": 1.8761896874970936, + "language_loss": 0.79331154, + "learning_rate": 2.720848825281736e-06, + "loss": 0.81851, + "num_input_tokens_seen": 143330050, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.21435547, + "step": 6672, + "time_per_iteration": 2.798285722732544 + }, + { + "auxiliary_loss_clip": 0.01474744, + "auxiliary_loss_mlp": 0.01038885, + "balance_loss_clip": 1.29322028, + "balance_loss_mlp": 1.01827383, + "epoch": 0.4012024650533594, + "flos": 20093957712000.0, + "grad_norm": 2.6273031015903894, + "language_loss": 0.64092982, + "learning_rate": 2.72048552626888e-06, + "loss": 0.66606605, + "num_input_tokens_seen": 143348650, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.20617676, + "step": 6673, + "time_per_iteration": 2.8736982345581055 + }, + { + "auxiliary_loss_clip": 0.01477865, + "auxiliary_loss_mlp": 0.01043172, + "balance_loss_clip": 1.29435158, + "balance_loss_mlp": 1.02202439, + "epoch": 0.40126258830602735, + "flos": 21707365201920.0, + "grad_norm": 1.4972153984140504, + "language_loss": 0.80677742, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.8319878, + "num_input_tokens_seen": 143370275, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.21154785, + "step": 6674, + "time_per_iteration": 3.0009572505950928 + }, + { + "auxiliary_loss_clip": 0.01500717, + "auxiliary_loss_mlp": 0.01044141, + "balance_loss_clip": 1.31118917, + "balance_loss_mlp": 1.0232439, + "epoch": 0.4013227115586953, + "flos": 12027797896320.0, + "grad_norm": 2.7085134546218037, + "language_loss": 0.82937866, + "learning_rate": 2.719758846294294e-06, + "loss": 0.85482728, + "num_input_tokens_seen": 143385390, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.20898438, + "step": 6675, + "time_per_iteration": 2.880319356918335 + }, + { + "auxiliary_loss_clip": 0.01489832, + "auxiliary_loss_mlp": 0.01047198, + "balance_loss_clip": 1.30657399, + "balance_loss_mlp": 1.02665806, + "epoch": 0.4013828348113633, + "flos": 25458523896960.0, + "grad_norm": 1.8761928932883938, + "language_loss": 0.94483048, + "learning_rate": 2.71939546536012e-06, + "loss": 0.97020072, + "num_input_tokens_seen": 143404215, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.20532227, + "step": 6676, + "time_per_iteration": 2.866199493408203 + }, + { + "auxiliary_loss_clip": 0.01503532, + "auxiliary_loss_mlp": 0.01042733, + "balance_loss_clip": 1.31468689, + "balance_loss_mlp": 1.02091765, + "epoch": 0.40144295806403124, + "flos": 18590803096320.0, + "grad_norm": 2.5962251281424855, + "language_loss": 0.8073945, + "learning_rate": 2.719032057146399e-06, + "loss": 0.83285713, + "num_input_tokens_seen": 143422245, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.21801758, + "step": 6677, + "time_per_iteration": 2.8110363483428955 + }, + { + "auxiliary_loss_clip": 0.01489705, + "auxiliary_loss_mlp": 0.01046134, + "balance_loss_clip": 1.30779505, + "balance_loss_mlp": 1.02608323, + "epoch": 0.4015030813166992, + "flos": 22940624730240.0, + "grad_norm": 2.4053656652233273, + "language_loss": 0.83951962, + "learning_rate": 2.71866862166691e-06, + "loss": 0.86487806, + "num_input_tokens_seen": 143443130, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.20043945, + "step": 6678, + "time_per_iteration": 2.8696792125701904 + }, + { + "auxiliary_loss_clip": 0.01482476, + "auxiliary_loss_mlp": 0.01044203, + "balance_loss_clip": 1.30265594, + "balance_loss_mlp": 1.02481937, + "epoch": 0.4015632045693672, + "flos": 20604836459520.0, + "grad_norm": 2.1416901898409537, + "language_loss": 0.64882517, + "learning_rate": 2.718305158935434e-06, + "loss": 0.67409194, + "num_input_tokens_seen": 143461385, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.19384766, + "step": 6679, + "time_per_iteration": 2.80134654045105 + }, + { + "auxiliary_loss_clip": 0.01487941, + "auxiliary_loss_mlp": 0.01042258, + "balance_loss_clip": 1.30878448, + "balance_loss_mlp": 1.02217078, + "epoch": 0.4016233278220352, + "flos": 23448924524160.0, + "grad_norm": 1.8991285917949285, + "language_loss": 0.79431272, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.81961471, + "num_input_tokens_seen": 143481750, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.20080566, + "step": 6680, + "time_per_iteration": 2.865013599395752 + }, + { + "auxiliary_loss_clip": 0.01510541, + "auxiliary_loss_mlp": 0.01050589, + "balance_loss_clip": 1.32144356, + "balance_loss_mlp": 1.03040683, + "epoch": 0.40168345107470316, + "flos": 21440230047360.0, + "grad_norm": 1.607129267713683, + "language_loss": 0.76416707, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.78977841, + "num_input_tokens_seen": 143501540, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.20178223, + "step": 6681, + "time_per_iteration": 2.8391401767730713 + }, + { + "auxiliary_loss_clip": 0.01520511, + "auxiliary_loss_mlp": 0.01044141, + "balance_loss_clip": 1.33288074, + "balance_loss_mlp": 1.02426922, + "epoch": 0.4017435743273711, + "flos": 22867590343680.0, + "grad_norm": 1.8243811582227518, + "language_loss": 0.65037274, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.67601931, + "num_input_tokens_seen": 143520530, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.1986084, + "step": 6682, + "time_per_iteration": 2.8541107177734375 + }, + { + "auxiliary_loss_clip": 0.01491764, + "auxiliary_loss_mlp": 0.01041927, + "balance_loss_clip": 1.30697632, + "balance_loss_mlp": 1.02259123, + "epoch": 0.4018036975800391, + "flos": 28634275480320.0, + "grad_norm": 1.7882211876797014, + "language_loss": 0.74197483, + "learning_rate": 2.716851035765337e-06, + "loss": 0.76731169, + "num_input_tokens_seen": 143540210, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.19311523, + "step": 6683, + "time_per_iteration": 2.883040189743042 + }, + { + "auxiliary_loss_clip": 0.01493136, + "auxiliary_loss_mlp": 0.01049566, + "balance_loss_clip": 1.30928719, + "balance_loss_mlp": 1.02889478, + "epoch": 0.40186382083270705, + "flos": 26662437532800.0, + "grad_norm": 1.7217613311229025, + "language_loss": 0.74187088, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.76729786, + "num_input_tokens_seen": 143560940, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.20678711, + "step": 6684, + "time_per_iteration": 2.8782949447631836 + }, + { + "auxiliary_loss_clip": 0.01263456, + "auxiliary_loss_mlp": 0.01031622, + "balance_loss_clip": 1.15838552, + "balance_loss_mlp": 1.01312065, + "epoch": 0.401923944085375, + "flos": 59286656355840.0, + "grad_norm": 0.8122807106893122, + "language_loss": 0.60465515, + "learning_rate": 2.716123811026767e-06, + "loss": 0.62760592, + "num_input_tokens_seen": 143624015, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.18457031, + "step": 6685, + "time_per_iteration": 3.4571034908294678 + }, + { + "auxiliary_loss_clip": 0.0152232, + "auxiliary_loss_mlp": 0.01040787, + "balance_loss_clip": 1.33183217, + "balance_loss_mlp": 1.02030659, + "epoch": 0.401984067338043, + "flos": 16991602473600.0, + "grad_norm": 1.7309799682477942, + "language_loss": 0.70801795, + "learning_rate": 2.715760157917357e-06, + "loss": 0.73364902, + "num_input_tokens_seen": 143642750, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.20495605, + "step": 6686, + "time_per_iteration": 2.996462106704712 + }, + { + "auxiliary_loss_clip": 0.01504838, + "auxiliary_loss_mlp": 0.01040069, + "balance_loss_clip": 1.32230091, + "balance_loss_mlp": 1.01998186, + "epoch": 0.40204419059071095, + "flos": 24983189579520.0, + "grad_norm": 1.9347437369316967, + "language_loss": 0.7523343, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.77778333, + "num_input_tokens_seen": 143664515, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.20092773, + "step": 6687, + "time_per_iteration": 2.928790807723999 + }, + { + "auxiliary_loss_clip": 0.01514428, + "auxiliary_loss_mlp": 0.01038332, + "balance_loss_clip": 1.32829285, + "balance_loss_mlp": 1.01776826, + "epoch": 0.4021043138433789, + "flos": 23487998048640.0, + "grad_norm": 2.2130038946116937, + "language_loss": 0.72032773, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.74585533, + "num_input_tokens_seen": 143683135, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.20544434, + "step": 6688, + "time_per_iteration": 2.874418020248413 + }, + { + "auxiliary_loss_clip": 0.0152194, + "auxiliary_loss_mlp": 0.01040583, + "balance_loss_clip": 1.3314085, + "balance_loss_mlp": 1.01984024, + "epoch": 0.4021644370960469, + "flos": 26007209314560.0, + "grad_norm": 1.9118396420392236, + "language_loss": 0.65859687, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.6842221, + "num_input_tokens_seen": 143703985, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.20739746, + "step": 6689, + "time_per_iteration": 4.382193565368652 + }, + { + "auxiliary_loss_clip": 0.01507731, + "auxiliary_loss_mlp": 0.01035366, + "balance_loss_clip": 1.31916797, + "balance_loss_mlp": 1.01488554, + "epoch": 0.40222456034871484, + "flos": 13595480876160.0, + "grad_norm": 2.3653991643020778, + "language_loss": 0.74260378, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.76803476, + "num_input_tokens_seen": 143719245, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.20471191, + "step": 6690, + "time_per_iteration": 2.806926965713501 + }, + { + "auxiliary_loss_clip": 0.01497973, + "auxiliary_loss_mlp": 0.01035037, + "balance_loss_clip": 1.3151238, + "balance_loss_mlp": 1.01498616, + "epoch": 0.4022846836013828, + "flos": 24288209164800.0, + "grad_norm": 2.0921031246078425, + "language_loss": 0.75374281, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.77907294, + "num_input_tokens_seen": 143739575, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.20056152, + "step": 6691, + "time_per_iteration": 2.8615708351135254 + }, + { + "auxiliary_loss_clip": 0.01528521, + "auxiliary_loss_mlp": 0.01041171, + "balance_loss_clip": 1.34095466, + "balance_loss_mlp": 1.02059531, + "epoch": 0.40234480685405083, + "flos": 20159798175360.0, + "grad_norm": 1.9068642676576706, + "language_loss": 0.73176038, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.75745726, + "num_input_tokens_seen": 143758515, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.20568848, + "step": 6692, + "time_per_iteration": 2.874814748764038 + }, + { + "auxiliary_loss_clip": 0.01496864, + "auxiliary_loss_mlp": 0.01037159, + "balance_loss_clip": 1.31299973, + "balance_loss_mlp": 1.01712012, + "epoch": 0.4024049301067188, + "flos": 22940624730240.0, + "grad_norm": 1.7046168766509167, + "language_loss": 0.84624511, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.87158537, + "num_input_tokens_seen": 143776770, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.20031738, + "step": 6693, + "time_per_iteration": 2.9002187252044678 + }, + { + "auxiliary_loss_clip": 0.0151063, + "auxiliary_loss_mlp": 0.01037656, + "balance_loss_clip": 1.32593131, + "balance_loss_mlp": 1.01687765, + "epoch": 0.40246505335938676, + "flos": 36042175676160.0, + "grad_norm": 1.9663378410614787, + "language_loss": 0.72012538, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.74560821, + "num_input_tokens_seen": 143798450, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.20788574, + "step": 6694, + "time_per_iteration": 2.986011505126953 + }, + { + "auxiliary_loss_clip": 0.01502912, + "auxiliary_loss_mlp": 0.01038564, + "balance_loss_clip": 1.31861818, + "balance_loss_mlp": 1.01662946, + "epoch": 0.4025251766120547, + "flos": 20603886318720.0, + "grad_norm": 2.3877834560454163, + "language_loss": 0.68744278, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.71285748, + "num_input_tokens_seen": 143816995, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.21948242, + "step": 6695, + "time_per_iteration": 2.924715757369995 + }, + { + "auxiliary_loss_clip": 0.01507525, + "auxiliary_loss_mlp": 0.01035872, + "balance_loss_clip": 1.3247149, + "balance_loss_mlp": 1.01660776, + "epoch": 0.4025852998647227, + "flos": 64545114205440.0, + "grad_norm": 1.94391292170846, + "language_loss": 0.80385906, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.82929301, + "num_input_tokens_seen": 143842090, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.19262695, + "step": 6696, + "time_per_iteration": 6.218676567077637 + }, + { + "auxiliary_loss_clip": 0.0151408, + "auxiliary_loss_mlp": 0.01042582, + "balance_loss_clip": 1.32894182, + "balance_loss_mlp": 1.02147031, + "epoch": 0.40264542311739066, + "flos": 20895978620160.0, + "grad_norm": 1.6927536094756572, + "language_loss": 0.71477532, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.7403419, + "num_input_tokens_seen": 143860800, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.21105957, + "step": 6697, + "time_per_iteration": 2.9198110103607178 + }, + { + "auxiliary_loss_clip": 0.01486751, + "auxiliary_loss_mlp": 0.01038917, + "balance_loss_clip": 1.30635405, + "balance_loss_mlp": 1.01899695, + "epoch": 0.4027055463700586, + "flos": 26261404456320.0, + "grad_norm": 2.264080722303738, + "language_loss": 0.62005782, + "learning_rate": 2.711394207496984e-06, + "loss": 0.64531457, + "num_input_tokens_seen": 143878950, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.19921875, + "step": 6698, + "time_per_iteration": 4.397491693496704 + }, + { + "auxiliary_loss_clip": 0.01496079, + "auxiliary_loss_mlp": 0.01032036, + "balance_loss_clip": 1.31332994, + "balance_loss_mlp": 1.01153207, + "epoch": 0.4027656696227266, + "flos": 20641150051200.0, + "grad_norm": 2.117889960337718, + "language_loss": 0.78013569, + "learning_rate": 2.711030202621491e-06, + "loss": 0.80541682, + "num_input_tokens_seen": 143898385, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.20495605, + "step": 6699, + "time_per_iteration": 2.857994318008423 + }, + { + "auxiliary_loss_clip": 0.01479958, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.30006766, + "balance_loss_mlp": 1.01304305, + "epoch": 0.40282579287539455, + "flos": 22356485372160.0, + "grad_norm": 1.733222213363098, + "language_loss": 0.81216168, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.83729136, + "num_input_tokens_seen": 143918795, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.19958496, + "step": 6700, + "time_per_iteration": 2.90022349357605 + }, + { + "auxiliary_loss_clip": 0.01505971, + "auxiliary_loss_mlp": 0.01040936, + "balance_loss_clip": 1.31798613, + "balance_loss_mlp": 1.01802373, + "epoch": 0.4028859161280625, + "flos": 29286065093760.0, + "grad_norm": 2.0265453675938527, + "language_loss": 0.75361019, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.77907926, + "num_input_tokens_seen": 143938245, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.22924805, + "step": 6701, + "time_per_iteration": 2.9520859718322754 + }, + { + "auxiliary_loss_clip": 0.01488979, + "auxiliary_loss_mlp": 0.01031459, + "balance_loss_clip": 1.30735683, + "balance_loss_mlp": 1.01139641, + "epoch": 0.4029460393807305, + "flos": 28634275480320.0, + "grad_norm": 1.7118110110637654, + "language_loss": 0.66908085, + "learning_rate": 2.709938026276208e-06, + "loss": 0.69428521, + "num_input_tokens_seen": 143960995, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.20080566, + "step": 6702, + "time_per_iteration": 3.04244065284729 + }, + { + "auxiliary_loss_clip": 0.01504075, + "auxiliary_loss_mlp": 0.01037807, + "balance_loss_clip": 1.31788278, + "balance_loss_mlp": 1.01693344, + "epoch": 0.40300616263339845, + "flos": 22612264081920.0, + "grad_norm": 1.643798663481284, + "language_loss": 0.66851985, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.69393861, + "num_input_tokens_seen": 143979910, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.20874023, + "step": 6703, + "time_per_iteration": 2.9070193767547607 + }, + { + "auxiliary_loss_clip": 0.01499332, + "auxiliary_loss_mlp": 0.0103278, + "balance_loss_clip": 1.31293154, + "balance_loss_mlp": 1.01336074, + "epoch": 0.4030662858860664, + "flos": 25531286814720.0, + "grad_norm": 1.868803690830443, + "language_loss": 0.82643807, + "learning_rate": 2.709209774085071e-06, + "loss": 0.8517592, + "num_input_tokens_seen": 144000095, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.19421387, + "step": 6704, + "time_per_iteration": 2.9339048862457275 + }, + { + "auxiliary_loss_clip": 0.01505079, + "auxiliary_loss_mlp": 0.01034406, + "balance_loss_clip": 1.3184793, + "balance_loss_mlp": 1.01316261, + "epoch": 0.40312640913873443, + "flos": 23597210292480.0, + "grad_norm": 1.552998315858976, + "language_loss": 0.73951441, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.76490927, + "num_input_tokens_seen": 144019695, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.21228027, + "step": 6705, + "time_per_iteration": 2.897552728652954 + }, + { + "auxiliary_loss_clip": 0.01477393, + "auxiliary_loss_mlp": 0.01032192, + "balance_loss_clip": 1.29708087, + "balance_loss_mlp": 1.01236749, + "epoch": 0.4031865323914024, + "flos": 20020606611840.0, + "grad_norm": 2.0357857555587096, + "language_loss": 0.67760682, + "learning_rate": 2.708481414320713e-06, + "loss": 0.70270264, + "num_input_tokens_seen": 144038525, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.19836426, + "step": 6706, + "time_per_iteration": 2.82255482673645 + }, + { + "auxiliary_loss_clip": 0.01490063, + "auxiliary_loss_mlp": 0.01035829, + "balance_loss_clip": 1.30751455, + "balance_loss_mlp": 1.01494396, + "epoch": 0.40324665564407036, + "flos": 21881377278720.0, + "grad_norm": 1.3861484092173149, + "language_loss": 0.72182566, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.74708462, + "num_input_tokens_seen": 144059485, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.2088623, + "step": 6707, + "time_per_iteration": 2.8816840648651123 + }, + { + "auxiliary_loss_clip": 0.01459056, + "auxiliary_loss_mlp": 0.01032201, + "balance_loss_clip": 1.28193796, + "balance_loss_mlp": 1.01210189, + "epoch": 0.4033067788967383, + "flos": 23889347838720.0, + "grad_norm": 2.171447909053771, + "language_loss": 0.80779952, + "learning_rate": 2.707752947093611e-06, + "loss": 0.83271205, + "num_input_tokens_seen": 144080265, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.2010498, + "step": 6708, + "time_per_iteration": 2.9098479747772217 + }, + { + "auxiliary_loss_clip": 0.01509472, + "auxiliary_loss_mlp": 0.01040059, + "balance_loss_clip": 1.31946659, + "balance_loss_mlp": 1.01944816, + "epoch": 0.4033669021494063, + "flos": 17428632428160.0, + "grad_norm": 2.0399004864085324, + "language_loss": 0.8442266, + "learning_rate": 2.70738867321606e-06, + "loss": 0.86972189, + "num_input_tokens_seen": 144098040, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.20617676, + "step": 6709, + "time_per_iteration": 2.839867115020752 + }, + { + "auxiliary_loss_clip": 0.01505262, + "auxiliary_loss_mlp": 0.01037863, + "balance_loss_clip": 1.31799674, + "balance_loss_mlp": 1.01659632, + "epoch": 0.40342702540207426, + "flos": 29610987137280.0, + "grad_norm": 3.428393547213459, + "language_loss": 0.72043562, + "learning_rate": 2.70702437251426e-06, + "loss": 0.74586689, + "num_input_tokens_seen": 144118265, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.21264648, + "step": 6710, + "time_per_iteration": 2.909968137741089 + }, + { + "auxiliary_loss_clip": 0.0147812, + "auxiliary_loss_mlp": 0.01040301, + "balance_loss_clip": 1.29633093, + "balance_loss_mlp": 1.01934457, + "epoch": 0.4034871486547422, + "flos": 11290260107520.0, + "grad_norm": 2.386636094677762, + "language_loss": 0.85584092, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.8810252, + "num_input_tokens_seen": 144133865, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.20959473, + "step": 6711, + "time_per_iteration": 2.8466875553131104 + }, + { + "auxiliary_loss_clip": 0.01492097, + "auxiliary_loss_mlp": 0.01037997, + "balance_loss_clip": 1.30768001, + "balance_loss_mlp": 1.01829147, + "epoch": 0.4035472719074102, + "flos": 15559446228480.0, + "grad_norm": 2.279021120891494, + "language_loss": 0.77628732, + "learning_rate": 2.706295690693168e-06, + "loss": 0.8015883, + "num_input_tokens_seen": 144150125, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.19726562, + "step": 6712, + "time_per_iteration": 2.837409019470215 + }, + { + "auxiliary_loss_clip": 0.01496185, + "auxiliary_loss_mlp": 0.01038514, + "balance_loss_clip": 1.31230211, + "balance_loss_mlp": 1.01756883, + "epoch": 0.40360739516007815, + "flos": 24683360417280.0, + "grad_norm": 2.309401599293232, + "language_loss": 0.8030926, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.82843959, + "num_input_tokens_seen": 144169295, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.20947266, + "step": 6713, + "time_per_iteration": 2.8454666137695312 + }, + { + "auxiliary_loss_clip": 0.01491532, + "auxiliary_loss_mlp": 0.01039182, + "balance_loss_clip": 1.30470204, + "balance_loss_mlp": 1.01874912, + "epoch": 0.4036675184127461, + "flos": 17311140385920.0, + "grad_norm": 2.021421285954685, + "language_loss": 0.88883716, + "learning_rate": 2.705566901740865e-06, + "loss": 0.91414428, + "num_input_tokens_seen": 144185790, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.2043457, + "step": 6714, + "time_per_iteration": 2.870880126953125 + }, + { + "auxiliary_loss_clip": 0.01494954, + "auxiliary_loss_mlp": 0.01043557, + "balance_loss_clip": 1.31054211, + "balance_loss_mlp": 1.02249312, + "epoch": 0.4037276416654141, + "flos": 19873270984320.0, + "grad_norm": 1.6773431017631308, + "language_loss": 0.69963062, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.7250157, + "num_input_tokens_seen": 144205190, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.21069336, + "step": 6715, + "time_per_iteration": 2.831733465194702 + }, + { + "auxiliary_loss_clip": 0.01512253, + "auxiliary_loss_mlp": 0.01044162, + "balance_loss_clip": 1.32259965, + "balance_loss_mlp": 1.02383637, + "epoch": 0.40378776491808205, + "flos": 18305407025280.0, + "grad_norm": 2.094132714670534, + "language_loss": 0.78566146, + "learning_rate": 2.704838005767892e-06, + "loss": 0.81122559, + "num_input_tokens_seen": 144222705, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.20336914, + "step": 6716, + "time_per_iteration": 2.827209949493408 + }, + { + "auxiliary_loss_clip": 0.01471514, + "auxiliary_loss_mlp": 0.01036169, + "balance_loss_clip": 1.29162538, + "balance_loss_mlp": 1.01489043, + "epoch": 0.40384788817075, + "flos": 15057752175360.0, + "grad_norm": 4.567862922241594, + "language_loss": 0.77239263, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.7974695, + "num_input_tokens_seen": 144239545, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.21289062, + "step": 6717, + "time_per_iteration": 2.8132240772247314 + }, + { + "auxiliary_loss_clip": 0.01246347, + "auxiliary_loss_mlp": 0.01011477, + "balance_loss_clip": 1.13859808, + "balance_loss_mlp": 0.99669474, + "epoch": 0.40390801142341803, + "flos": 61958994583680.0, + "grad_norm": 0.9271795436925444, + "language_loss": 0.60867655, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.63125479, + "num_input_tokens_seen": 144288145, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.14746094, + "step": 6718, + "time_per_iteration": 3.2303478717803955 + }, + { + "auxiliary_loss_clip": 0.01498769, + "auxiliary_loss_mlp": 0.01042162, + "balance_loss_clip": 1.30922151, + "balance_loss_mlp": 1.02034688, + "epoch": 0.403968134676086, + "flos": 22746795431040.0, + "grad_norm": 9.076436700008227, + "language_loss": 0.75332224, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.77873158, + "num_input_tokens_seen": 144302315, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.21826172, + "step": 6719, + "time_per_iteration": 2.8524045944213867 + }, + { + "auxiliary_loss_clip": 0.01483662, + "auxiliary_loss_mlp": 0.01046369, + "balance_loss_clip": 1.29939008, + "balance_loss_mlp": 1.02509046, + "epoch": 0.40402825792875396, + "flos": 19791911554560.0, + "grad_norm": 2.0482028573759616, + "language_loss": 0.82989693, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.85519731, + "num_input_tokens_seen": 144318990, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.21276855, + "step": 6720, + "time_per_iteration": 2.8408243656158447 + }, + { + "auxiliary_loss_clip": 0.0149436, + "auxiliary_loss_mlp": 0.010379, + "balance_loss_clip": 1.30759621, + "balance_loss_mlp": 1.01782513, + "epoch": 0.40408838118142193, + "flos": 19618623394560.0, + "grad_norm": 1.7663266169740917, + "language_loss": 0.77478904, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.80011159, + "num_input_tokens_seen": 144335765, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.20080566, + "step": 6721, + "time_per_iteration": 2.846968412399292 + }, + { + "auxiliary_loss_clip": 0.01480271, + "auxiliary_loss_mlp": 0.01037502, + "balance_loss_clip": 1.2986455, + "balance_loss_mlp": 1.01797557, + "epoch": 0.4041485044340899, + "flos": 24436811646720.0, + "grad_norm": 1.6093296873367648, + "language_loss": 0.7367419, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.76191962, + "num_input_tokens_seen": 144355825, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.19519043, + "step": 6722, + "time_per_iteration": 2.8775830268859863 + }, + { + "auxiliary_loss_clip": 0.01478763, + "auxiliary_loss_mlp": 0.01037845, + "balance_loss_clip": 1.29600883, + "balance_loss_mlp": 1.01794934, + "epoch": 0.40420862768675786, + "flos": 16767974833920.0, + "grad_norm": 1.6462765185109878, + "language_loss": 0.66342562, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.68859172, + "num_input_tokens_seen": 144374320, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.19897461, + "step": 6723, + "time_per_iteration": 2.8542542457580566 + }, + { + "auxiliary_loss_clip": 0.01501765, + "auxiliary_loss_mlp": 0.01044873, + "balance_loss_clip": 1.31349874, + "balance_loss_mlp": 1.02367759, + "epoch": 0.4042687509394258, + "flos": 22501649249280.0, + "grad_norm": 1.655902329675024, + "language_loss": 0.74067044, + "learning_rate": 2.701921353880734e-06, + "loss": 0.76613683, + "num_input_tokens_seen": 144394325, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.21203613, + "step": 6724, + "time_per_iteration": 4.310163974761963 + }, + { + "auxiliary_loss_clip": 0.01464737, + "auxiliary_loss_mlp": 0.01033076, + "balance_loss_clip": 1.28632331, + "balance_loss_mlp": 1.01263213, + "epoch": 0.4043288741920938, + "flos": 30348298702080.0, + "grad_norm": 1.786290430657111, + "language_loss": 0.75983286, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.7848109, + "num_input_tokens_seen": 144412765, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.20446777, + "step": 6725, + "time_per_iteration": 2.880164384841919 + }, + { + "auxiliary_loss_clip": 0.01481448, + "auxiliary_loss_mlp": 0.01036134, + "balance_loss_clip": 1.29882216, + "balance_loss_mlp": 1.01461685, + "epoch": 0.40438899744476176, + "flos": 46363307270400.0, + "grad_norm": 1.6466407225544473, + "language_loss": 0.772246, + "learning_rate": 2.701191924463126e-06, + "loss": 0.79742181, + "num_input_tokens_seen": 144435400, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.21508789, + "step": 6726, + "time_per_iteration": 3.0462794303894043 + }, + { + "auxiliary_loss_clip": 0.01489664, + "auxiliary_loss_mlp": 0.01036738, + "balance_loss_clip": 1.30309725, + "balance_loss_mlp": 1.01607907, + "epoch": 0.4044491206974297, + "flos": 13341195244800.0, + "grad_norm": 2.22729347487987, + "language_loss": 0.82476091, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.850025, + "num_input_tokens_seen": 144452925, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.20654297, + "step": 6727, + "time_per_iteration": 2.8585100173950195 + }, + { + "auxiliary_loss_clip": 0.0149793, + "auxiliary_loss_mlp": 0.01035344, + "balance_loss_clip": 1.31081116, + "balance_loss_mlp": 1.0139935, + "epoch": 0.4045092439500977, + "flos": 12101284730880.0, + "grad_norm": 2.312974936843572, + "language_loss": 0.86404842, + "learning_rate": 2.700462388688447e-06, + "loss": 0.88938123, + "num_input_tokens_seen": 144470195, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.21350098, + "step": 6728, + "time_per_iteration": 3.0446431636810303 + }, + { + "auxiliary_loss_clip": 0.01485786, + "auxiliary_loss_mlp": 0.0103627, + "balance_loss_clip": 1.30358601, + "balance_loss_mlp": 1.01609945, + "epoch": 0.40456936720276565, + "flos": 21189835468800.0, + "grad_norm": 1.7206042194596154, + "language_loss": 0.82164228, + "learning_rate": 2.700097580951786e-06, + "loss": 0.84686285, + "num_input_tokens_seen": 144490320, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.20178223, + "step": 6729, + "time_per_iteration": 2.896275281906128 + }, + { + "auxiliary_loss_clip": 0.01497027, + "auxiliary_loss_mlp": 0.0104118, + "balance_loss_clip": 1.31238794, + "balance_loss_mlp": 1.02093816, + "epoch": 0.4046294904554336, + "flos": 23925932899200.0, + "grad_norm": 6.016488673940383, + "language_loss": 0.74610668, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.77148873, + "num_input_tokens_seen": 144508990, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.20251465, + "step": 6730, + "time_per_iteration": 2.8608593940734863 + }, + { + "auxiliary_loss_clip": 0.01482433, + "auxiliary_loss_mlp": 0.01037093, + "balance_loss_clip": 1.29969084, + "balance_loss_mlp": 1.01652992, + "epoch": 0.4046896137081016, + "flos": 38085419197440.0, + "grad_norm": 1.8244550718450188, + "language_loss": 0.68340588, + "learning_rate": 2.699367885848985e-06, + "loss": 0.70860112, + "num_input_tokens_seen": 144529550, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.20556641, + "step": 6731, + "time_per_iteration": 5.786974191665649 + }, + { + "auxiliary_loss_clip": 0.01481109, + "auxiliary_loss_mlp": 0.01039632, + "balance_loss_clip": 1.29589748, + "balance_loss_mlp": 1.01884174, + "epoch": 0.4047497369607696, + "flos": 23626239471360.0, + "grad_norm": 1.4580730815884706, + "language_loss": 0.74910045, + "learning_rate": 2.699002998510517e-06, + "loss": 0.77430785, + "num_input_tokens_seen": 144549310, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.20788574, + "step": 6732, + "time_per_iteration": 2.8549928665161133 + }, + { + "auxiliary_loss_clip": 0.01480576, + "auxiliary_loss_mlp": 0.0103588, + "balance_loss_clip": 1.29789424, + "balance_loss_mlp": 1.01582932, + "epoch": 0.40480986021343757, + "flos": 12831357127680.0, + "grad_norm": 1.752196811025774, + "language_loss": 0.77904254, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.80420715, + "num_input_tokens_seen": 144567430, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.20031738, + "step": 6733, + "time_per_iteration": 4.261539459228516 + }, + { + "auxiliary_loss_clip": 0.01510631, + "auxiliary_loss_mlp": 0.01046318, + "balance_loss_clip": 1.32017875, + "balance_loss_mlp": 1.02415729, + "epoch": 0.40486998346610553, + "flos": 23779230698880.0, + "grad_norm": 1.741353442442142, + "language_loss": 0.77588373, + "learning_rate": 2.698273144328627e-06, + "loss": 0.80145323, + "num_input_tokens_seen": 144585975, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.22167969, + "step": 6734, + "time_per_iteration": 2.8466339111328125 + }, + { + "auxiliary_loss_clip": 0.01502551, + "auxiliary_loss_mlp": 0.01044433, + "balance_loss_clip": 1.31121838, + "balance_loss_mlp": 1.02315462, + "epoch": 0.4049301067187735, + "flos": 22867092650880.0, + "grad_norm": 2.2067013384939878, + "language_loss": 0.65904164, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.68451142, + "num_input_tokens_seen": 144605225, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.21276855, + "step": 6735, + "time_per_iteration": 2.8863890171051025 + }, + { + "auxiliary_loss_clip": 0.01481395, + "auxiliary_loss_mlp": 0.01042955, + "balance_loss_clip": 1.29627132, + "balance_loss_mlp": 1.02168787, + "epoch": 0.40499022997144146, + "flos": 22794284488320.0, + "grad_norm": 1.6740819987225546, + "language_loss": 0.8362478, + "learning_rate": 2.697543184232387e-06, + "loss": 0.86149132, + "num_input_tokens_seen": 144624145, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.21264648, + "step": 6736, + "time_per_iteration": 2.8403313159942627 + }, + { + "auxiliary_loss_clip": 0.01494128, + "auxiliary_loss_mlp": 0.01050994, + "balance_loss_clip": 1.30760956, + "balance_loss_mlp": 1.02901161, + "epoch": 0.4050503532241094, + "flos": 23049701239680.0, + "grad_norm": 1.5676897038642668, + "language_loss": 0.75900376, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.784455, + "num_input_tokens_seen": 144644470, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.21972656, + "step": 6737, + "time_per_iteration": 2.8292407989501953 + }, + { + "auxiliary_loss_clip": 0.01481615, + "auxiliary_loss_mlp": 0.01045525, + "balance_loss_clip": 1.29747462, + "balance_loss_mlp": 1.025069, + "epoch": 0.4051104764767774, + "flos": 16654735802880.0, + "grad_norm": 1.903095956796571, + "language_loss": 0.72516513, + "learning_rate": 2.696813118332519e-06, + "loss": 0.75043654, + "num_input_tokens_seen": 144661055, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.20446777, + "step": 6738, + "time_per_iteration": 2.902050018310547 + }, + { + "auxiliary_loss_clip": 0.01478278, + "auxiliary_loss_mlp": 0.01053455, + "balance_loss_clip": 1.29489172, + "balance_loss_mlp": 1.03225982, + "epoch": 0.40517059972944536, + "flos": 16366670288640.0, + "grad_norm": 2.0180020728183354, + "language_loss": 0.76003098, + "learning_rate": 2.696448045740828e-06, + "loss": 0.7853483, + "num_input_tokens_seen": 144677935, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.21203613, + "step": 6739, + "time_per_iteration": 2.815553903579712 + }, + { + "auxiliary_loss_clip": 0.01481042, + "auxiliary_loss_mlp": 0.01044903, + "balance_loss_clip": 1.29598069, + "balance_loss_mlp": 1.0223608, + "epoch": 0.4052307229821133, + "flos": 28815029032320.0, + "grad_norm": 2.24615667945766, + "language_loss": 0.75428122, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.77954066, + "num_input_tokens_seen": 144697725, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.22521973, + "step": 6740, + "time_per_iteration": 2.866610050201416 + }, + { + "auxiliary_loss_clip": 0.0147099, + "auxiliary_loss_mlp": 0.01043652, + "balance_loss_clip": 1.29019511, + "balance_loss_mlp": 1.02249265, + "epoch": 0.4052908462347813, + "flos": 21407897998080.0, + "grad_norm": 1.5410561086780905, + "language_loss": 0.77613658, + "learning_rate": 2.695717821343153e-06, + "loss": 0.801283, + "num_input_tokens_seen": 144718805, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.21166992, + "step": 6741, + "time_per_iteration": 2.854022264480591 + }, + { + "auxiliary_loss_clip": 0.01479877, + "auxiliary_loss_mlp": 0.01051204, + "balance_loss_clip": 1.29510045, + "balance_loss_mlp": 1.02776742, + "epoch": 0.40535096948744925, + "flos": 22429248289920.0, + "grad_norm": 1.9634866035373009, + "language_loss": 0.72352469, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.74883556, + "num_input_tokens_seen": 144737105, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.23449707, + "step": 6742, + "time_per_iteration": 2.812648057937622 + }, + { + "auxiliary_loss_clip": 0.01491324, + "auxiliary_loss_mlp": 0.01039286, + "balance_loss_clip": 1.30450833, + "balance_loss_mlp": 1.01774454, + "epoch": 0.4054110927401172, + "flos": 17017374026880.0, + "grad_norm": 2.230126803084672, + "language_loss": 0.73575515, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.76106119, + "num_input_tokens_seen": 144751350, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.21533203, + "step": 6743, + "time_per_iteration": 2.8082680702209473 + }, + { + "auxiliary_loss_clip": 0.01500983, + "auxiliary_loss_mlp": 0.01041134, + "balance_loss_clip": 1.31134868, + "balance_loss_mlp": 1.01924729, + "epoch": 0.4054712159927852, + "flos": 21624422204160.0, + "grad_norm": 2.3099385217168282, + "language_loss": 0.72120035, + "learning_rate": 2.694622286918588e-06, + "loss": 0.74662155, + "num_input_tokens_seen": 144770030, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.21899414, + "step": 6744, + "time_per_iteration": 2.820746660232544 + }, + { + "auxiliary_loss_clip": 0.01483545, + "auxiliary_loss_mlp": 0.01042007, + "balance_loss_clip": 1.30034256, + "balance_loss_mlp": 1.02088284, + "epoch": 0.4055313392454532, + "flos": 25823695829760.0, + "grad_norm": 1.567338843173594, + "language_loss": 0.80565536, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.83091086, + "num_input_tokens_seen": 144790965, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.21130371, + "step": 6745, + "time_per_iteration": 2.882887125015259 + }, + { + "auxiliary_loss_clip": 0.01480782, + "auxiliary_loss_mlp": 0.01041642, + "balance_loss_clip": 1.29850984, + "balance_loss_mlp": 1.01851559, + "epoch": 0.40559146249812117, + "flos": 14145206924160.0, + "grad_norm": 2.3927765922330893, + "language_loss": 0.67554724, + "learning_rate": 2.693891798911731e-06, + "loss": 0.70077145, + "num_input_tokens_seen": 144807755, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.23144531, + "step": 6746, + "time_per_iteration": 2.7916104793548584 + }, + { + "auxiliary_loss_clip": 0.01489812, + "auxiliary_loss_mlp": 0.01036102, + "balance_loss_clip": 1.30465984, + "balance_loss_mlp": 1.01533592, + "epoch": 0.40565158575078913, + "flos": 41370654493440.0, + "grad_norm": 3.039729581353956, + "language_loss": 0.57424986, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.599509, + "num_input_tokens_seen": 144832405, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.2076416, + "step": 6747, + "time_per_iteration": 2.9790310859680176 + }, + { + "auxiliary_loss_clip": 0.01489452, + "auxiliary_loss_mlp": 0.01042688, + "balance_loss_clip": 1.30388331, + "balance_loss_mlp": 1.02082467, + "epoch": 0.4057117090034571, + "flos": 28555223535360.0, + "grad_norm": 1.7838519785105682, + "language_loss": 0.8507942, + "learning_rate": 2.693161205655089e-06, + "loss": 0.87611556, + "num_input_tokens_seen": 144853890, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.21850586, + "step": 6748, + "time_per_iteration": 2.9651427268981934 + }, + { + "auxiliary_loss_clip": 0.01493812, + "auxiliary_loss_mlp": 0.01038778, + "balance_loss_clip": 1.30680954, + "balance_loss_mlp": 1.01704657, + "epoch": 0.40577183225612506, + "flos": 18013043255040.0, + "grad_norm": 1.7721534484483183, + "language_loss": 0.82416886, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.84949481, + "num_input_tokens_seen": 144871395, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.2175293, + "step": 6749, + "time_per_iteration": 2.818758010864258 + }, + { + "auxiliary_loss_clip": 0.01481935, + "auxiliary_loss_mlp": 0.0104115, + "balance_loss_clip": 1.29864001, + "balance_loss_mlp": 1.01953709, + "epoch": 0.40583195550879303, + "flos": 19546177190400.0, + "grad_norm": 1.7075615182241006, + "language_loss": 0.76137376, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.78660458, + "num_input_tokens_seen": 144890975, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.21606445, + "step": 6750, + "time_per_iteration": 2.901683807373047 + }, + { + "auxiliary_loss_clip": 0.01507307, + "auxiliary_loss_mlp": 0.01038379, + "balance_loss_clip": 1.31444967, + "balance_loss_mlp": 1.01656342, + "epoch": 0.405892078761461, + "flos": 22319583598080.0, + "grad_norm": 2.8634228434763105, + "language_loss": 0.74315983, + "learning_rate": 2.692065118669195e-06, + "loss": 0.76861674, + "num_input_tokens_seen": 144908170, + "router_z_loss_clip": 1.92773438, + "router_z_loss_mlp": 0.21801758, + "step": 6751, + "time_per_iteration": 2.8803458213806152 + }, + { + "auxiliary_loss_clip": 0.01500197, + "auxiliary_loss_mlp": 0.01045534, + "balance_loss_clip": 1.31293082, + "balance_loss_mlp": 1.02320647, + "epoch": 0.40595220201412896, + "flos": 25495516160640.0, + "grad_norm": 3.509388972995175, + "language_loss": 0.68113577, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.70659304, + "num_input_tokens_seen": 144928020, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.2232666, + "step": 6752, + "time_per_iteration": 2.8837597370147705 + }, + { + "auxiliary_loss_clip": 0.01504579, + "auxiliary_loss_mlp": 0.01050287, + "balance_loss_clip": 1.31399071, + "balance_loss_mlp": 1.02599192, + "epoch": 0.4060123252667969, + "flos": 49873210836480.0, + "grad_norm": 1.9053288334291985, + "language_loss": 0.71629131, + "learning_rate": 2.691334262772948e-06, + "loss": 0.74184, + "num_input_tokens_seen": 144951240, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.24267578, + "step": 6753, + "time_per_iteration": 3.0923516750335693 + }, + { + "auxiliary_loss_clip": 0.01498965, + "auxiliary_loss_mlp": 0.01044041, + "balance_loss_clip": 1.31068087, + "balance_loss_mlp": 1.02214265, + "epoch": 0.4060724485194649, + "flos": 21143613265920.0, + "grad_norm": 1.9156957595945474, + "language_loss": 0.7268995, + "learning_rate": 2.690968795494699e-06, + "loss": 0.75232959, + "num_input_tokens_seen": 144969100, + "router_z_loss_clip": 1.88085938, + "router_z_loss_mlp": 0.21899414, + "step": 6754, + "time_per_iteration": 2.8273680210113525 + }, + { + "auxiliary_loss_clip": 0.01500183, + "auxiliary_loss_mlp": 0.01043594, + "balance_loss_clip": 1.31096637, + "balance_loss_mlp": 1.02143335, + "epoch": 0.40613257177213286, + "flos": 21766961882880.0, + "grad_norm": 1.7513738502730913, + "language_loss": 0.83836019, + "learning_rate": 2.690603302014844e-06, + "loss": 0.8637979, + "num_input_tokens_seen": 144987065, + "router_z_loss_clip": 1.89160156, + "router_z_loss_mlp": 0.22155762, + "step": 6755, + "time_per_iteration": 2.846447229385376 + }, + { + "auxiliary_loss_clip": 0.01505582, + "auxiliary_loss_mlp": 0.01043637, + "balance_loss_clip": 1.31474209, + "balance_loss_mlp": 1.02120161, + "epoch": 0.4061926950248008, + "flos": 25565609635200.0, + "grad_norm": 1.8814678908606506, + "language_loss": 0.71788645, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.7433787, + "num_input_tokens_seen": 145007310, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.22424316, + "step": 6756, + "time_per_iteration": 2.8939061164855957 + }, + { + "auxiliary_loss_clip": 0.01498267, + "auxiliary_loss_mlp": 0.01042578, + "balance_loss_clip": 1.30926275, + "balance_loss_mlp": 1.02056038, + "epoch": 0.4062528182774688, + "flos": 23706241557120.0, + "grad_norm": 1.7879777894184403, + "language_loss": 0.80273223, + "learning_rate": 2.689872236505755e-06, + "loss": 0.82814062, + "num_input_tokens_seen": 145026210, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.22021484, + "step": 6757, + "time_per_iteration": 2.879807472229004 + }, + { + "auxiliary_loss_clip": 0.01493668, + "auxiliary_loss_mlp": 0.01039446, + "balance_loss_clip": 1.30697358, + "balance_loss_mlp": 1.017869, + "epoch": 0.4063129415301368, + "flos": 21736077667200.0, + "grad_norm": 2.04128193174796, + "language_loss": 0.79379487, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.81912595, + "num_input_tokens_seen": 145045475, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.21569824, + "step": 6758, + "time_per_iteration": 4.262715101242065 + }, + { + "auxiliary_loss_clip": 0.01485111, + "auxiliary_loss_mlp": 0.01034403, + "balance_loss_clip": 1.30163288, + "balance_loss_mlp": 1.01311255, + "epoch": 0.40637306478280477, + "flos": 12795450739200.0, + "grad_norm": 3.701213696809869, + "language_loss": 0.9024868, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.92768192, + "num_input_tokens_seen": 145062260, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.2130127, + "step": 6759, + "time_per_iteration": 2.795884132385254 + }, + { + "auxiliary_loss_clip": 0.01506824, + "auxiliary_loss_mlp": 0.01036006, + "balance_loss_clip": 1.31751966, + "balance_loss_mlp": 1.0147512, + "epoch": 0.40643318803547274, + "flos": 24035099898240.0, + "grad_norm": 1.9535453080169018, + "language_loss": 0.65187275, + "learning_rate": 2.688775442076598e-06, + "loss": 0.67730099, + "num_input_tokens_seen": 145082470, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.21252441, + "step": 6760, + "time_per_iteration": 2.893871784210205 + }, + { + "auxiliary_loss_clip": 0.0151538, + "auxiliary_loss_mlp": 0.01042784, + "balance_loss_clip": 1.32619905, + "balance_loss_mlp": 1.0213623, + "epoch": 0.4064933112881407, + "flos": 25603190081280.0, + "grad_norm": 1.5124254279120817, + "language_loss": 0.75462019, + "learning_rate": 2.688409791678193e-06, + "loss": 0.78020185, + "num_input_tokens_seen": 145105685, + "router_z_loss_clip": 1.89160156, + "router_z_loss_mlp": 0.2142334, + "step": 6761, + "time_per_iteration": 2.8730552196502686 + }, + { + "auxiliary_loss_clip": 0.01479957, + "auxiliary_loss_mlp": 0.01034791, + "balance_loss_clip": 1.29891288, + "balance_loss_mlp": 1.01342833, + "epoch": 0.40655343454080867, + "flos": 22064438315520.0, + "grad_norm": 1.6038825883396162, + "language_loss": 0.70640051, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.73154795, + "num_input_tokens_seen": 145125590, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.21362305, + "step": 6762, + "time_per_iteration": 2.8499209880828857 + }, + { + "auxiliary_loss_clip": 0.01497903, + "auxiliary_loss_mlp": 0.01038926, + "balance_loss_clip": 1.31134212, + "balance_loss_mlp": 1.01709914, + "epoch": 0.40661355779347663, + "flos": 26480100412800.0, + "grad_norm": 3.071849889144561, + "language_loss": 0.73591775, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.76128608, + "num_input_tokens_seen": 145146810, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.21813965, + "step": 6763, + "time_per_iteration": 2.8786327838897705 + }, + { + "auxiliary_loss_clip": 0.01505166, + "auxiliary_loss_mlp": 0.01035514, + "balance_loss_clip": 1.31643355, + "balance_loss_mlp": 1.01382995, + "epoch": 0.4066736810461446, + "flos": 13268930019840.0, + "grad_norm": 1.826199436080857, + "language_loss": 0.69959033, + "learning_rate": 2.687312683911033e-06, + "loss": 0.7249971, + "num_input_tokens_seen": 145163130, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.21679688, + "step": 6764, + "time_per_iteration": 2.8155317306518555 + }, + { + "auxiliary_loss_clip": 0.01513126, + "auxiliary_loss_mlp": 0.0104521, + "balance_loss_clip": 1.32214952, + "balance_loss_mlp": 1.02263141, + "epoch": 0.40673380429881256, + "flos": 28815345745920.0, + "grad_norm": 2.0253790188856553, + "language_loss": 0.91884613, + "learning_rate": 2.686946929177557e-06, + "loss": 0.94442952, + "num_input_tokens_seen": 145181420, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.22583008, + "step": 6765, + "time_per_iteration": 2.9646458625793457 + }, + { + "auxiliary_loss_clip": 0.01522106, + "auxiliary_loss_mlp": 0.01041383, + "balance_loss_clip": 1.32866335, + "balance_loss_mlp": 1.01882815, + "epoch": 0.4067939275514805, + "flos": 12503041724160.0, + "grad_norm": 2.3745151206344697, + "language_loss": 0.7989018, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.82453668, + "num_input_tokens_seen": 145198545, + "router_z_loss_clip": 1.93164062, + "router_z_loss_mlp": 0.22546387, + "step": 6766, + "time_per_iteration": 4.2178428173065186 + }, + { + "auxiliary_loss_clip": 0.01502237, + "auxiliary_loss_mlp": 0.01040207, + "balance_loss_clip": 1.31193852, + "balance_loss_mlp": 1.01866555, + "epoch": 0.4068540508041485, + "flos": 18779745957120.0, + "grad_norm": 2.0248636514704272, + "language_loss": 0.76903498, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.79445946, + "num_input_tokens_seen": 145215835, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.2154541, + "step": 6767, + "time_per_iteration": 4.224372863769531 + }, + { + "auxiliary_loss_clip": 0.01498637, + "auxiliary_loss_mlp": 0.01040348, + "balance_loss_clip": 1.31138873, + "balance_loss_mlp": 1.01875937, + "epoch": 0.40691417405681646, + "flos": 28524746522880.0, + "grad_norm": 1.689511459844688, + "language_loss": 0.78754687, + "learning_rate": 2.685849508738034e-06, + "loss": 0.81293672, + "num_input_tokens_seen": 145236555, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.21582031, + "step": 6768, + "time_per_iteration": 4.3601367473602295 + }, + { + "auxiliary_loss_clip": 0.01506432, + "auxiliary_loss_mlp": 0.01040369, + "balance_loss_clip": 1.31850135, + "balance_loss_mlp": 1.01911378, + "epoch": 0.4069742973094844, + "flos": 20823894374400.0, + "grad_norm": 2.116282490586452, + "language_loss": 0.88735867, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.91282666, + "num_input_tokens_seen": 145254595, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.21240234, + "step": 6769, + "time_per_iteration": 2.839622735977173 + }, + { + "auxiliary_loss_clip": 0.01495614, + "auxiliary_loss_mlp": 0.01038922, + "balance_loss_clip": 1.31124675, + "balance_loss_mlp": 1.01754785, + "epoch": 0.4070344205621524, + "flos": 21480163223040.0, + "grad_norm": 2.362055096940512, + "language_loss": 0.81727624, + "learning_rate": 2.685117765051156e-06, + "loss": 0.84262156, + "num_input_tokens_seen": 145274005, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.21386719, + "step": 6770, + "time_per_iteration": 2.8860654830932617 + }, + { + "auxiliary_loss_clip": 0.01510779, + "auxiliary_loss_mlp": 0.01038933, + "balance_loss_clip": 1.31923819, + "balance_loss_mlp": 1.01602101, + "epoch": 0.4070945438148204, + "flos": 26840385907200.0, + "grad_norm": 1.8600695440618467, + "language_loss": 0.80788243, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.83337957, + "num_input_tokens_seen": 145294850, + "router_z_loss_clip": 1.9140625, + "router_z_loss_mlp": 0.22900391, + "step": 6771, + "time_per_iteration": 2.940472364425659 + }, + { + "auxiliary_loss_clip": 0.01501195, + "auxiliary_loss_mlp": 0.01041202, + "balance_loss_clip": 1.31487727, + "balance_loss_mlp": 1.02019751, + "epoch": 0.4071546670674884, + "flos": 26363965714560.0, + "grad_norm": 1.34303116939065, + "language_loss": 0.76587629, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.7913003, + "num_input_tokens_seen": 145317050, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.20996094, + "step": 6772, + "time_per_iteration": 2.8977255821228027 + }, + { + "auxiliary_loss_clip": 0.01506316, + "auxiliary_loss_mlp": 0.01042446, + "balance_loss_clip": 1.31656754, + "balance_loss_mlp": 1.01979637, + "epoch": 0.40721479032015634, + "flos": 17904554928000.0, + "grad_norm": 1.6857588973799869, + "language_loss": 0.82059205, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.84607965, + "num_input_tokens_seen": 145334480, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.22644043, + "step": 6773, + "time_per_iteration": 2.785315990447998 + }, + { + "auxiliary_loss_clip": 0.0124419, + "auxiliary_loss_mlp": 0.01019855, + "balance_loss_clip": 1.13817155, + "balance_loss_mlp": 0.99830163, + "epoch": 0.4072749135728243, + "flos": 49880875190400.0, + "grad_norm": 0.8170587861039549, + "language_loss": 0.6436764, + "learning_rate": 2.683653966031597e-06, + "loss": 0.66631687, + "num_input_tokens_seen": 145388695, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.21582031, + "step": 6774, + "time_per_iteration": 3.239579439163208 + }, + { + "auxiliary_loss_clip": 0.01522091, + "auxiliary_loss_mlp": 0.01037108, + "balance_loss_clip": 1.33061099, + "balance_loss_mlp": 1.0154711, + "epoch": 0.40733503682549227, + "flos": 27575163763200.0, + "grad_norm": 1.764698141055891, + "language_loss": 0.72997159, + "learning_rate": 2.683287951431446e-06, + "loss": 0.75556356, + "num_input_tokens_seen": 145408240, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.21630859, + "step": 6775, + "time_per_iteration": 2.8959832191467285 + }, + { + "auxiliary_loss_clip": 0.01512677, + "auxiliary_loss_mlp": 0.01043815, + "balance_loss_clip": 1.32377017, + "balance_loss_mlp": 1.02279854, + "epoch": 0.40739516007816023, + "flos": 22137020254080.0, + "grad_norm": 1.4389810988648113, + "language_loss": 0.78140962, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.80697453, + "num_input_tokens_seen": 145428395, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.21008301, + "step": 6776, + "time_per_iteration": 2.8857216835021973 + }, + { + "auxiliary_loss_clip": 0.01534337, + "auxiliary_loss_mlp": 0.01045309, + "balance_loss_clip": 1.34031522, + "balance_loss_mlp": 1.02366078, + "epoch": 0.4074552833308282, + "flos": 23852853267840.0, + "grad_norm": 2.448134985495422, + "language_loss": 0.80370343, + "learning_rate": 2.682555844513981e-06, + "loss": 0.82949996, + "num_input_tokens_seen": 145448290, + "router_z_loss_clip": 1.93945312, + "router_z_loss_mlp": 0.21643066, + "step": 6777, + "time_per_iteration": 2.85481858253479 + }, + { + "auxiliary_loss_clip": 0.01238603, + "auxiliary_loss_mlp": 0.01016566, + "balance_loss_clip": 1.13483799, + "balance_loss_mlp": 0.99806517, + "epoch": 0.40751540658349616, + "flos": 58030792410240.0, + "grad_norm": 0.692172142210136, + "language_loss": 0.53244609, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.5549978, + "num_input_tokens_seen": 145509785, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.18457031, + "step": 6778, + "time_per_iteration": 3.309422254562378 + }, + { + "auxiliary_loss_clip": 0.01526536, + "auxiliary_loss_mlp": 0.01048891, + "balance_loss_clip": 1.33761549, + "balance_loss_mlp": 1.02624071, + "epoch": 0.40757552983616413, + "flos": 21224429758080.0, + "grad_norm": 1.8914929413390473, + "language_loss": 0.8313604, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.85711467, + "num_input_tokens_seen": 145528620, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.2265625, + "step": 6779, + "time_per_iteration": 2.8551979064941406 + }, + { + "auxiliary_loss_clip": 0.0151916, + "auxiliary_loss_mlp": 0.01043933, + "balance_loss_clip": 1.33016646, + "balance_loss_mlp": 1.02175987, + "epoch": 0.4076356530888321, + "flos": 26844774652800.0, + "grad_norm": 1.4916315013375414, + "language_loss": 0.76629949, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.79193044, + "num_input_tokens_seen": 145547775, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.22167969, + "step": 6780, + "time_per_iteration": 2.864501476287842 + }, + { + "auxiliary_loss_clip": 0.01495092, + "auxiliary_loss_mlp": 0.01039344, + "balance_loss_clip": 1.31175828, + "balance_loss_mlp": 1.01872098, + "epoch": 0.40769577634150006, + "flos": 12210542219520.0, + "grad_norm": 2.1816618515084634, + "language_loss": 0.67393291, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.69927728, + "num_input_tokens_seen": 145564465, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.2064209, + "step": 6781, + "time_per_iteration": 2.8452322483062744 + }, + { + "auxiliary_loss_clip": 0.01487614, + "auxiliary_loss_mlp": 0.0104199, + "balance_loss_clip": 1.30254507, + "balance_loss_mlp": 1.0204246, + "epoch": 0.407755899594168, + "flos": 33667992552960.0, + "grad_norm": 1.636403228965355, + "language_loss": 0.71666121, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.74195731, + "num_input_tokens_seen": 145585965, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.21569824, + "step": 6782, + "time_per_iteration": 2.9384114742279053 + }, + { + "auxiliary_loss_clip": 0.01506425, + "auxiliary_loss_mlp": 0.01038976, + "balance_loss_clip": 1.31794143, + "balance_loss_mlp": 1.01761425, + "epoch": 0.407816022846836, + "flos": 20167037343360.0, + "grad_norm": 1.7112461992755006, + "language_loss": 0.82632935, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.85178334, + "num_input_tokens_seen": 145605000, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.21374512, + "step": 6783, + "time_per_iteration": 2.8324053287506104 + }, + { + "auxiliary_loss_clip": 0.01509502, + "auxiliary_loss_mlp": 0.01045544, + "balance_loss_clip": 1.32200682, + "balance_loss_mlp": 1.02370453, + "epoch": 0.40787614609950396, + "flos": 21188749593600.0, + "grad_norm": 1.4459681415758028, + "language_loss": 0.81486452, + "learning_rate": 2.679992655730283e-06, + "loss": 0.840415, + "num_input_tokens_seen": 145623740, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.21801758, + "step": 6784, + "time_per_iteration": 2.854062795639038 + }, + { + "auxiliary_loss_clip": 0.01515838, + "auxiliary_loss_mlp": 0.0104271, + "balance_loss_clip": 1.32147264, + "balance_loss_mlp": 1.01959503, + "epoch": 0.407936269352172, + "flos": 20530037525760.0, + "grad_norm": 1.5914690018963948, + "language_loss": 0.66649199, + "learning_rate": 2.679626382651386e-06, + "loss": 0.69207752, + "num_input_tokens_seen": 145643515, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.2310791, + "step": 6785, + "time_per_iteration": 2.858283281326294 + }, + { + "auxiliary_loss_clip": 0.01497412, + "auxiliary_loss_mlp": 0.01040118, + "balance_loss_clip": 1.30997503, + "balance_loss_mlp": 1.01863658, + "epoch": 0.40799639260483994, + "flos": 20127963818880.0, + "grad_norm": 2.019470580090676, + "language_loss": 0.80050027, + "learning_rate": 2.679260083800989e-06, + "loss": 0.82587552, + "num_input_tokens_seen": 145660890, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.21472168, + "step": 6786, + "time_per_iteration": 2.829864501953125 + }, + { + "auxiliary_loss_clip": 0.01494072, + "auxiliary_loss_mlp": 0.01041882, + "balance_loss_clip": 1.30791211, + "balance_loss_mlp": 1.02044845, + "epoch": 0.4080565158575079, + "flos": 21007453104000.0, + "grad_norm": 1.8742375181532813, + "language_loss": 0.82121503, + "learning_rate": 2.678893759192982e-06, + "loss": 0.84657454, + "num_input_tokens_seen": 145680070, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.2142334, + "step": 6787, + "time_per_iteration": 2.943730592727661 + }, + { + "auxiliary_loss_clip": 0.01504192, + "auxiliary_loss_mlp": 0.01038753, + "balance_loss_clip": 1.31738925, + "balance_loss_mlp": 1.0168426, + "epoch": 0.40811663911017587, + "flos": 19327074030720.0, + "grad_norm": 1.8359308509936134, + "language_loss": 0.68442965, + "learning_rate": 2.678527408841255e-06, + "loss": 0.70985913, + "num_input_tokens_seen": 145698010, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.21911621, + "step": 6788, + "time_per_iteration": 2.9029886722564697 + }, + { + "auxiliary_loss_clip": 0.01493592, + "auxiliary_loss_mlp": 0.01043203, + "balance_loss_clip": 1.30512905, + "balance_loss_mlp": 1.02124476, + "epoch": 0.40817676236284384, + "flos": 40640265383040.0, + "grad_norm": 2.1780425832183745, + "language_loss": 0.66931713, + "learning_rate": 2.678161032759701e-06, + "loss": 0.69468504, + "num_input_tokens_seen": 145722215, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.21960449, + "step": 6789, + "time_per_iteration": 3.0607638359069824 + }, + { + "auxiliary_loss_clip": 0.01483505, + "auxiliary_loss_mlp": 0.01039844, + "balance_loss_clip": 1.29796588, + "balance_loss_mlp": 1.01452422, + "epoch": 0.4082368856155118, + "flos": 20531847317760.0, + "grad_norm": 1.7983495064251784, + "language_loss": 0.61893183, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.64416528, + "num_input_tokens_seen": 145741090, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.25317383, + "step": 6790, + "time_per_iteration": 2.9165382385253906 + }, + { + "auxiliary_loss_clip": 0.01487055, + "auxiliary_loss_mlp": 0.01042716, + "balance_loss_clip": 1.30252981, + "balance_loss_mlp": 1.02031696, + "epoch": 0.40829700886817977, + "flos": 11432709296640.0, + "grad_norm": 3.060044978260564, + "language_loss": 0.71191627, + "learning_rate": 2.677428203462683e-06, + "loss": 0.73721397, + "num_input_tokens_seen": 145754985, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.22412109, + "step": 6791, + "time_per_iteration": 2.8610119819641113 + }, + { + "auxiliary_loss_clip": 0.01252357, + "auxiliary_loss_mlp": 0.01045421, + "balance_loss_clip": 1.14046538, + "balance_loss_mlp": 1.02043521, + "epoch": 0.40835713212084773, + "flos": 67361367438720.0, + "grad_norm": 0.7536062885560001, + "language_loss": 0.59745848, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.62043631, + "num_input_tokens_seen": 145815260, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.24902344, + "step": 6792, + "time_per_iteration": 3.352713108062744 + }, + { + "auxiliary_loss_clip": 0.01499891, + "auxiliary_loss_mlp": 0.01041598, + "balance_loss_clip": 1.31022573, + "balance_loss_mlp": 1.01797128, + "epoch": 0.4084172553735157, + "flos": 21771938810880.0, + "grad_norm": 1.8653781861215921, + "language_loss": 0.80707496, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.83248985, + "num_input_tokens_seen": 145832665, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.23632812, + "step": 6793, + "time_per_iteration": 2.8783676624298096 + }, + { + "auxiliary_loss_clip": 0.01493381, + "auxiliary_loss_mlp": 0.01038392, + "balance_loss_clip": 1.30660594, + "balance_loss_mlp": 1.01679111, + "epoch": 0.40847737862618366, + "flos": 27428642542080.0, + "grad_norm": 1.9974359165490705, + "language_loss": 0.85615277, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.88147044, + "num_input_tokens_seen": 145850240, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.21594238, + "step": 6794, + "time_per_iteration": 4.288780212402344 + }, + { + "auxiliary_loss_clip": 0.01493482, + "auxiliary_loss_mlp": 0.01040888, + "balance_loss_clip": 1.30561614, + "balance_loss_mlp": 1.01740408, + "epoch": 0.4085375018788516, + "flos": 18596232472320.0, + "grad_norm": 1.5422245037601559, + "language_loss": 0.80157834, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.826922, + "num_input_tokens_seen": 145869545, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.23461914, + "step": 6795, + "time_per_iteration": 2.8388419151306152 + }, + { + "auxiliary_loss_clip": 0.0149902, + "auxiliary_loss_mlp": 0.01041003, + "balance_loss_clip": 1.30688858, + "balance_loss_mlp": 1.01764941, + "epoch": 0.4085976251315196, + "flos": 15419621237760.0, + "grad_norm": 2.4412738476014684, + "language_loss": 0.70090044, + "learning_rate": 2.675595680920792e-06, + "loss": 0.7263006, + "num_input_tokens_seen": 145884025, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.23339844, + "step": 6796, + "time_per_iteration": 2.824941635131836 + }, + { + "auxiliary_loss_clip": 0.01486853, + "auxiliary_loss_mlp": 0.01040598, + "balance_loss_clip": 1.30021429, + "balance_loss_mlp": 1.0183301, + "epoch": 0.40865774838418756, + "flos": 21262281672960.0, + "grad_norm": 1.8967021489499423, + "language_loss": 0.78455925, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.80983377, + "num_input_tokens_seen": 145903210, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.22253418, + "step": 6797, + "time_per_iteration": 2.8418192863464355 + }, + { + "auxiliary_loss_clip": 0.0149284, + "auxiliary_loss_mlp": 0.01042823, + "balance_loss_clip": 1.30499482, + "balance_loss_mlp": 1.02099633, + "epoch": 0.4087178716368556, + "flos": 13780261215360.0, + "grad_norm": 2.1150561590414276, + "language_loss": 0.86518002, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.89053667, + "num_input_tokens_seen": 145920985, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.21838379, + "step": 6798, + "time_per_iteration": 2.8100321292877197 + }, + { + "auxiliary_loss_clip": 0.01484658, + "auxiliary_loss_mlp": 0.01038949, + "balance_loss_clip": 1.30071568, + "balance_loss_mlp": 1.0171814, + "epoch": 0.40877799488952354, + "flos": 23631895071360.0, + "grad_norm": 1.47193709529708, + "language_loss": 0.84837723, + "learning_rate": 2.674495859860601e-06, + "loss": 0.87361336, + "num_input_tokens_seen": 145940350, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.21765137, + "step": 6799, + "time_per_iteration": 2.8991024494171143 + }, + { + "auxiliary_loss_clip": 0.01489024, + "auxiliary_loss_mlp": 0.01038378, + "balance_loss_clip": 1.30325031, + "balance_loss_mlp": 1.01529884, + "epoch": 0.4088381181421915, + "flos": 20927451018240.0, + "grad_norm": 5.279358679648146, + "language_loss": 0.84361291, + "learning_rate": 2.6741292016681e-06, + "loss": 0.86888695, + "num_input_tokens_seen": 145957460, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.23083496, + "step": 6800, + "time_per_iteration": 2.802234411239624 + }, + { + "auxiliary_loss_clip": 0.01490728, + "auxiliary_loss_mlp": 0.0104033, + "balance_loss_clip": 1.30240321, + "balance_loss_mlp": 1.01681042, + "epoch": 0.4088982413948595, + "flos": 13305379345920.0, + "grad_norm": 2.3157314609689266, + "language_loss": 0.74797857, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.7732892, + "num_input_tokens_seen": 145975285, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.23522949, + "step": 6801, + "time_per_iteration": 4.206390857696533 + }, + { + "auxiliary_loss_clip": 0.01494594, + "auxiliary_loss_mlp": 0.01041421, + "balance_loss_clip": 1.30775046, + "balance_loss_mlp": 1.01902127, + "epoch": 0.40895836464752744, + "flos": 15275950439040.0, + "grad_norm": 3.0700035496478493, + "language_loss": 0.805556, + "learning_rate": 2.673395808607861e-06, + "loss": 0.83091617, + "num_input_tokens_seen": 145989150, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.22387695, + "step": 6802, + "time_per_iteration": 4.210785150527954 + }, + { + "auxiliary_loss_clip": 0.01496015, + "auxiliary_loss_mlp": 0.01038704, + "balance_loss_clip": 1.30617678, + "balance_loss_mlp": 1.01548207, + "epoch": 0.4090184879001954, + "flos": 14509473960960.0, + "grad_norm": 6.844553471056186, + "language_loss": 0.77048451, + "learning_rate": 2.673029073767934e-06, + "loss": 0.79583168, + "num_input_tokens_seen": 146006980, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.2322998, + "step": 6803, + "time_per_iteration": 4.302134275436401 + }, + { + "auxiliary_loss_clip": 0.01470979, + "auxiliary_loss_mlp": 0.01037964, + "balance_loss_clip": 1.28506875, + "balance_loss_mlp": 1.01558864, + "epoch": 0.40907861115286337, + "flos": 13889609193600.0, + "grad_norm": 1.8426000799444895, + "language_loss": 0.80055064, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.82564002, + "num_input_tokens_seen": 146025125, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.22387695, + "step": 6804, + "time_per_iteration": 2.8246982097625732 + }, + { + "auxiliary_loss_clip": 0.01504447, + "auxiliary_loss_mlp": 0.01042971, + "balance_loss_clip": 1.31174302, + "balance_loss_mlp": 1.02076197, + "epoch": 0.40913873440553133, + "flos": 28049185981440.0, + "grad_norm": 1.7528053405160642, + "language_loss": 0.75975156, + "learning_rate": 2.672295527537998e-06, + "loss": 0.78522569, + "num_input_tokens_seen": 146044990, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.22192383, + "step": 6805, + "time_per_iteration": 2.885532855987549 + }, + { + "auxiliary_loss_clip": 0.01507801, + "auxiliary_loss_mlp": 0.01042145, + "balance_loss_clip": 1.31771517, + "balance_loss_mlp": 1.01943529, + "epoch": 0.4091988576581993, + "flos": 21628629970560.0, + "grad_norm": 1.584174310941899, + "language_loss": 0.80180711, + "learning_rate": 2.671928716175804e-06, + "loss": 0.82730657, + "num_input_tokens_seen": 146066045, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.22717285, + "step": 6806, + "time_per_iteration": 2.8513293266296387 + }, + { + "auxiliary_loss_clip": 0.01502775, + "auxiliary_loss_mlp": 0.01035675, + "balance_loss_clip": 1.31216526, + "balance_loss_mlp": 1.01341891, + "epoch": 0.40925898091086726, + "flos": 25233855626880.0, + "grad_norm": 2.1597590798744815, + "language_loss": 0.73273909, + "learning_rate": 2.671561879334007e-06, + "loss": 0.75812358, + "num_input_tokens_seen": 146086280, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.22253418, + "step": 6807, + "time_per_iteration": 2.8295576572418213 + }, + { + "auxiliary_loss_clip": 0.01250856, + "auxiliary_loss_mlp": 0.01018235, + "balance_loss_clip": 1.13962078, + "balance_loss_mlp": 0.99467963, + "epoch": 0.40931910416353523, + "flos": 68960251347840.0, + "grad_norm": 0.8323223525576464, + "language_loss": 0.58875138, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.61144233, + "num_input_tokens_seen": 146148840, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.23535156, + "step": 6808, + "time_per_iteration": 3.506718873977661 + }, + { + "auxiliary_loss_clip": 0.01486683, + "auxiliary_loss_mlp": 0.01040073, + "balance_loss_clip": 1.30107307, + "balance_loss_mlp": 1.01739979, + "epoch": 0.4093792274162032, + "flos": 20198147783040.0, + "grad_norm": 1.9207606173927545, + "language_loss": 0.55419219, + "learning_rate": 2.670828129267242e-06, + "loss": 0.57945973, + "num_input_tokens_seen": 146166195, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.22668457, + "step": 6809, + "time_per_iteration": 2.8651437759399414 + }, + { + "auxiliary_loss_clip": 0.01487406, + "auxiliary_loss_mlp": 0.01034912, + "balance_loss_clip": 1.3009733, + "balance_loss_mlp": 1.01240528, + "epoch": 0.40943935066887116, + "flos": 25239737450880.0, + "grad_norm": 1.8486966128463767, + "language_loss": 0.83430088, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.85952407, + "num_input_tokens_seen": 146185045, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.22497559, + "step": 6810, + "time_per_iteration": 2.88028621673584 + }, + { + "auxiliary_loss_clip": 0.01504021, + "auxiliary_loss_mlp": 0.01038819, + "balance_loss_clip": 1.31372058, + "balance_loss_mlp": 1.01459539, + "epoch": 0.4094994739215392, + "flos": 23264958591360.0, + "grad_norm": 2.614185002132003, + "language_loss": 0.78697407, + "learning_rate": 2.670094277448999e-06, + "loss": 0.81240243, + "num_input_tokens_seen": 146204655, + "router_z_loss_clip": 1.90332031, + "router_z_loss_mlp": 0.24230957, + "step": 6811, + "time_per_iteration": 2.8286502361297607 + }, + { + "auxiliary_loss_clip": 0.01492006, + "auxiliary_loss_mlp": 0.01039884, + "balance_loss_clip": 1.30228519, + "balance_loss_mlp": 1.01474297, + "epoch": 0.40955959717420715, + "flos": 17390554289280.0, + "grad_norm": 2.261377418804439, + "language_loss": 0.71083051, + "learning_rate": 2.669727313417857e-06, + "loss": 0.73614943, + "num_input_tokens_seen": 146222000, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.25158691, + "step": 6812, + "time_per_iteration": 2.8374407291412354 + }, + { + "auxiliary_loss_clip": 0.01486445, + "auxiliary_loss_mlp": 0.01038296, + "balance_loss_clip": 1.29963136, + "balance_loss_mlp": 1.01346445, + "epoch": 0.4096197204268751, + "flos": 25093261474560.0, + "grad_norm": 1.6067315606072514, + "language_loss": 0.67658532, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.70183265, + "num_input_tokens_seen": 146242630, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.24853516, + "step": 6813, + "time_per_iteration": 2.9135401248931885 + }, + { + "auxiliary_loss_clip": 0.01479257, + "auxiliary_loss_mlp": 0.01035584, + "balance_loss_clip": 1.29398894, + "balance_loss_mlp": 1.01213515, + "epoch": 0.4096798436795431, + "flos": 30597743139840.0, + "grad_norm": 2.156529779707446, + "language_loss": 0.74802125, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.77316964, + "num_input_tokens_seen": 146263070, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.23449707, + "step": 6814, + "time_per_iteration": 2.8980822563171387 + }, + { + "auxiliary_loss_clip": 0.01505553, + "auxiliary_loss_mlp": 0.01040271, + "balance_loss_clip": 1.31304431, + "balance_loss_mlp": 1.01732278, + "epoch": 0.40973996693221104, + "flos": 24144176407680.0, + "grad_norm": 2.3429798223434033, + "language_loss": 0.67325747, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.69871569, + "num_input_tokens_seen": 146282890, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.22937012, + "step": 6815, + "time_per_iteration": 2.834437370300293 + }, + { + "auxiliary_loss_clip": 0.01472733, + "auxiliary_loss_mlp": 0.01042777, + "balance_loss_clip": 1.29061389, + "balance_loss_mlp": 1.02056789, + "epoch": 0.409800090184879, + "flos": 23999826936960.0, + "grad_norm": 2.0371055127326, + "language_loss": 0.77519882, + "learning_rate": 2.668259203471188e-06, + "loss": 0.80035388, + "num_input_tokens_seen": 146301755, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.22216797, + "step": 6816, + "time_per_iteration": 2.8439950942993164 + }, + { + "auxiliary_loss_clip": 0.01477481, + "auxiliary_loss_mlp": 0.01039297, + "balance_loss_clip": 1.29079771, + "balance_loss_mlp": 1.01552653, + "epoch": 0.40986021343754697, + "flos": 16152272588160.0, + "grad_norm": 2.1315652271517016, + "language_loss": 0.82718444, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.8523522, + "num_input_tokens_seen": 146316835, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.23791504, + "step": 6817, + "time_per_iteration": 2.8048946857452393 + }, + { + "auxiliary_loss_clip": 0.01511903, + "auxiliary_loss_mlp": 0.01040801, + "balance_loss_clip": 1.31659222, + "balance_loss_mlp": 1.01680434, + "epoch": 0.40992033669021494, + "flos": 24801666865920.0, + "grad_norm": 1.6946449395947591, + "language_loss": 0.8141312, + "learning_rate": 2.667524996399444e-06, + "loss": 0.8396582, + "num_input_tokens_seen": 146336650, + "router_z_loss_clip": 1.95117188, + "router_z_loss_mlp": 0.23986816, + "step": 6818, + "time_per_iteration": 2.8447911739349365 + }, + { + "auxiliary_loss_clip": 0.01468693, + "auxiliary_loss_mlp": 0.01041651, + "balance_loss_clip": 1.2835083, + "balance_loss_mlp": 1.01786876, + "epoch": 0.4099804599428829, + "flos": 29653589756160.0, + "grad_norm": 1.705287654418305, + "language_loss": 0.66896433, + "learning_rate": 2.66715785488769e-06, + "loss": 0.69406772, + "num_input_tokens_seen": 146357640, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.23779297, + "step": 6819, + "time_per_iteration": 2.8805065155029297 + }, + { + "auxiliary_loss_clip": 0.01508421, + "auxiliary_loss_mlp": 0.01040623, + "balance_loss_clip": 1.31330132, + "balance_loss_mlp": 1.01731753, + "epoch": 0.41004058319555087, + "flos": 24837256540800.0, + "grad_norm": 1.7244506237584396, + "language_loss": 0.86118644, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.88667685, + "num_input_tokens_seen": 146379325, + "router_z_loss_clip": 1.94726562, + "router_z_loss_mlp": 0.2331543, + "step": 6820, + "time_per_iteration": 2.897613525390625 + }, + { + "auxiliary_loss_clip": 0.01477886, + "auxiliary_loss_mlp": 0.01044924, + "balance_loss_clip": 1.29430628, + "balance_loss_mlp": 1.02178538, + "epoch": 0.41010070644821883, + "flos": 25748037244800.0, + "grad_norm": 1.7921826800197256, + "language_loss": 0.72247565, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.74770379, + "num_input_tokens_seen": 146398635, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.23144531, + "step": 6821, + "time_per_iteration": 2.9182918071746826 + }, + { + "auxiliary_loss_clip": 0.01491708, + "auxiliary_loss_mlp": 0.01045046, + "balance_loss_clip": 1.30392683, + "balance_loss_mlp": 1.02191973, + "epoch": 0.4101608297008868, + "flos": 22356213903360.0, + "grad_norm": 2.3698047690408894, + "language_loss": 0.74807107, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.77343863, + "num_input_tokens_seen": 146417585, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.23132324, + "step": 6822, + "time_per_iteration": 2.881303310394287 + }, + { + "auxiliary_loss_clip": 0.0149689, + "auxiliary_loss_mlp": 0.01040206, + "balance_loss_clip": 1.30860281, + "balance_loss_mlp": 1.01679361, + "epoch": 0.41022095295355476, + "flos": 21955226071680.0, + "grad_norm": 2.4246625147081526, + "language_loss": 0.76946557, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.79483652, + "num_input_tokens_seen": 146437035, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.23413086, + "step": 6823, + "time_per_iteration": 2.8137426376342773 + }, + { + "auxiliary_loss_clip": 0.01520089, + "auxiliary_loss_mlp": 0.01048429, + "balance_loss_clip": 1.32630479, + "balance_loss_mlp": 1.02393138, + "epoch": 0.4102810762062228, + "flos": 27461065080960.0, + "grad_norm": 1.633870564991092, + "language_loss": 0.74113715, + "learning_rate": 2.665321768127001e-06, + "loss": 0.76682234, + "num_input_tokens_seen": 146457370, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.24511719, + "step": 6824, + "time_per_iteration": 2.8842718601226807 + }, + { + "auxiliary_loss_clip": 0.01510394, + "auxiliary_loss_mlp": 0.0104056, + "balance_loss_clip": 1.31646633, + "balance_loss_mlp": 1.01703978, + "epoch": 0.41034119945889075, + "flos": 24510117502080.0, + "grad_norm": 1.8911254934102826, + "language_loss": 0.72604531, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.75155485, + "num_input_tokens_seen": 146478105, + "router_z_loss_clip": 1.93847656, + "router_z_loss_mlp": 0.23510742, + "step": 6825, + "time_per_iteration": 2.95027494430542 + }, + { + "auxiliary_loss_clip": 0.0149451, + "auxiliary_loss_mlp": 0.01047178, + "balance_loss_clip": 1.30716169, + "balance_loss_mlp": 1.02476633, + "epoch": 0.4104013227115587, + "flos": 24363279567360.0, + "grad_norm": 2.0859158041072257, + "language_loss": 0.85443997, + "learning_rate": 2.664587156721768e-06, + "loss": 0.87985682, + "num_input_tokens_seen": 146497835, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.22412109, + "step": 6826, + "time_per_iteration": 2.861497402191162 + }, + { + "auxiliary_loss_clip": 0.01476708, + "auxiliary_loss_mlp": 0.01048106, + "balance_loss_clip": 1.29345107, + "balance_loss_mlp": 1.02464545, + "epoch": 0.4104614459642267, + "flos": 23739026054400.0, + "grad_norm": 1.8082822228329583, + "language_loss": 0.67315394, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.69840205, + "num_input_tokens_seen": 146517735, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.23461914, + "step": 6827, + "time_per_iteration": 2.861844778060913 + }, + { + "auxiliary_loss_clip": 0.0147627, + "auxiliary_loss_mlp": 0.01043715, + "balance_loss_clip": 1.29196548, + "balance_loss_mlp": 1.01977801, + "epoch": 0.41052156921689464, + "flos": 22138287108480.0, + "grad_norm": 1.5900784303265159, + "language_loss": 0.73109925, + "learning_rate": 2.663852444511689e-06, + "loss": 0.75629914, + "num_input_tokens_seen": 146537640, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.23901367, + "step": 6828, + "time_per_iteration": 4.270995616912842 + }, + { + "auxiliary_loss_clip": 0.0152184, + "auxiliary_loss_mlp": 0.01048246, + "balance_loss_clip": 1.32719588, + "balance_loss_mlp": 1.02540517, + "epoch": 0.4105816924695626, + "flos": 20094048201600.0, + "grad_norm": 2.4314770035356115, + "language_loss": 0.8462798, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.87198067, + "num_input_tokens_seen": 146554695, + "router_z_loss_clip": 1.9453125, + "router_z_loss_mlp": 0.22839355, + "step": 6829, + "time_per_iteration": 2.891573429107666 + }, + { + "auxiliary_loss_clip": 0.01486569, + "auxiliary_loss_mlp": 0.01044864, + "balance_loss_clip": 1.30024505, + "balance_loss_mlp": 1.0210464, + "epoch": 0.4106418157222306, + "flos": 18085715683200.0, + "grad_norm": 2.5933670443359844, + "language_loss": 0.90433812, + "learning_rate": 2.663117631608206e-06, + "loss": 0.92965245, + "num_input_tokens_seen": 146573740, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.23840332, + "step": 6830, + "time_per_iteration": 2.858628034591675 + }, + { + "auxiliary_loss_clip": 0.01493384, + "auxiliary_loss_mlp": 0.01046007, + "balance_loss_clip": 1.30623245, + "balance_loss_mlp": 1.02373886, + "epoch": 0.41070193897489854, + "flos": 21656528029440.0, + "grad_norm": 2.420352675989182, + "language_loss": 0.66549397, + "learning_rate": 2.662750187431268e-06, + "loss": 0.69088787, + "num_input_tokens_seen": 146592885, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.22265625, + "step": 6831, + "time_per_iteration": 2.845128297805786 + }, + { + "auxiliary_loss_clip": 0.01486334, + "auxiliary_loss_mlp": 0.01043197, + "balance_loss_clip": 1.30071735, + "balance_loss_mlp": 1.01987958, + "epoch": 0.4107620622275665, + "flos": 26658636969600.0, + "grad_norm": 1.8522413161824247, + "language_loss": 0.70325738, + "learning_rate": 2.662382718122776e-06, + "loss": 0.7285527, + "num_input_tokens_seen": 146611995, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.2331543, + "step": 6832, + "time_per_iteration": 2.9097986221313477 + }, + { + "auxiliary_loss_clip": 0.01485424, + "auxiliary_loss_mlp": 0.01047003, + "balance_loss_clip": 1.29830289, + "balance_loss_mlp": 1.02466285, + "epoch": 0.41082218548023447, + "flos": 18743613344640.0, + "grad_norm": 2.3914154827605802, + "language_loss": 0.74307233, + "learning_rate": 2.662015223696666e-06, + "loss": 0.76839662, + "num_input_tokens_seen": 146628045, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.22338867, + "step": 6833, + "time_per_iteration": 2.894451379776001 + }, + { + "auxiliary_loss_clip": 0.01507902, + "auxiliary_loss_mlp": 0.01049967, + "balance_loss_clip": 1.3156383, + "balance_loss_mlp": 1.02461171, + "epoch": 0.41088230873290243, + "flos": 22904175404160.0, + "grad_norm": 1.5968088300560257, + "language_loss": 0.73362124, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.75919992, + "num_input_tokens_seen": 146648355, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.25366211, + "step": 6834, + "time_per_iteration": 2.868238925933838 + }, + { + "auxiliary_loss_clip": 0.01502374, + "auxiliary_loss_mlp": 0.01048249, + "balance_loss_clip": 1.31007266, + "balance_loss_mlp": 1.02492011, + "epoch": 0.4109424319855704, + "flos": 24286897065600.0, + "grad_norm": 2.793194308429929, + "language_loss": 0.72230422, + "learning_rate": 2.661280159547329e-06, + "loss": 0.74781042, + "num_input_tokens_seen": 146668370, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.23352051, + "step": 6835, + "time_per_iteration": 4.229032039642334 + }, + { + "auxiliary_loss_clip": 0.01495309, + "auxiliary_loss_mlp": 0.01047519, + "balance_loss_clip": 1.30688, + "balance_loss_mlp": 1.02200806, + "epoch": 0.41100255523823837, + "flos": 12976837718400.0, + "grad_norm": 1.988134279925879, + "language_loss": 0.87960798, + "learning_rate": 2.660912589851978e-06, + "loss": 0.90503627, + "num_input_tokens_seen": 146686665, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.25512695, + "step": 6836, + "time_per_iteration": 2.8039655685424805 + }, + { + "auxiliary_loss_clip": 0.01468965, + "auxiliary_loss_mlp": 0.01044016, + "balance_loss_clip": 1.28607905, + "balance_loss_mlp": 1.01846933, + "epoch": 0.4110626784909064, + "flos": 23155203409920.0, + "grad_norm": 1.900427424252544, + "language_loss": 0.69257081, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.7177006, + "num_input_tokens_seen": 146706570, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.25549316, + "step": 6837, + "time_per_iteration": 4.246564865112305 + }, + { + "auxiliary_loss_clip": 0.01488475, + "auxiliary_loss_mlp": 0.01045209, + "balance_loss_clip": 1.29932785, + "balance_loss_mlp": 1.02158177, + "epoch": 0.41112280174357435, + "flos": 22757654183040.0, + "grad_norm": 2.207777278216685, + "language_loss": 0.75998211, + "learning_rate": 2.660177375289599e-06, + "loss": 0.78531897, + "num_input_tokens_seen": 146723425, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.23620605, + "step": 6838, + "time_per_iteration": 4.2281334400177 + }, + { + "auxiliary_loss_clip": 0.01491048, + "auxiliary_loss_mlp": 0.01040173, + "balance_loss_clip": 1.30416203, + "balance_loss_mlp": 1.01580644, + "epoch": 0.4111829249962423, + "flos": 21111416951040.0, + "grad_norm": 2.04915714002317, + "language_loss": 0.8284483, + "learning_rate": 2.659809730450451e-06, + "loss": 0.85376054, + "num_input_tokens_seen": 146741640, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.24365234, + "step": 6839, + "time_per_iteration": 2.841240167617798 + }, + { + "auxiliary_loss_clip": 0.01488941, + "auxiliary_loss_mlp": 0.01039187, + "balance_loss_clip": 1.30232918, + "balance_loss_mlp": 1.01589346, + "epoch": 0.4112430482489103, + "flos": 21515436184320.0, + "grad_norm": 1.9429800355147544, + "language_loss": 0.81462586, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.83990717, + "num_input_tokens_seen": 146759195, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.23303223, + "step": 6840, + "time_per_iteration": 2.8043906688690186 + }, + { + "auxiliary_loss_clip": 0.01472532, + "auxiliary_loss_mlp": 0.01038142, + "balance_loss_clip": 1.28852987, + "balance_loss_mlp": 1.01434779, + "epoch": 0.41130317150157825, + "flos": 19578644974080.0, + "grad_norm": 3.022319855016868, + "language_loss": 0.68263793, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.7077446, + "num_input_tokens_seen": 146774990, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.23791504, + "step": 6841, + "time_per_iteration": 2.845144033432007 + }, + { + "auxiliary_loss_clip": 0.01261095, + "auxiliary_loss_mlp": 0.01087266, + "balance_loss_clip": 1.14619362, + "balance_loss_mlp": 1.05713022, + "epoch": 0.4113632947542462, + "flos": 62416339453440.0, + "grad_norm": 0.7778926550012077, + "language_loss": 0.59730232, + "learning_rate": 2.65870664586847e-06, + "loss": 0.62078595, + "num_input_tokens_seen": 146839610, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.30078125, + "step": 6842, + "time_per_iteration": 3.4109702110290527 + }, + { + "auxiliary_loss_clip": 0.01469282, + "auxiliary_loss_mlp": 0.01044337, + "balance_loss_clip": 1.28772473, + "balance_loss_mlp": 1.02056646, + "epoch": 0.4114234180069142, + "flos": 13926782436480.0, + "grad_norm": 2.1895254012432757, + "language_loss": 0.70712823, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.7322644, + "num_input_tokens_seen": 146857360, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.2376709, + "step": 6843, + "time_per_iteration": 2.851491928100586 + }, + { + "auxiliary_loss_clip": 0.01255814, + "auxiliary_loss_mlp": 0.01060661, + "balance_loss_clip": 1.14188027, + "balance_loss_mlp": 1.03357625, + "epoch": 0.41148354125958214, + "flos": 64960100663040.0, + "grad_norm": 0.7407055049932092, + "language_loss": 0.5368247, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.55998945, + "num_input_tokens_seen": 146917055, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.27148438, + "step": 6844, + "time_per_iteration": 3.3211963176727295 + }, + { + "auxiliary_loss_clip": 0.01484423, + "auxiliary_loss_mlp": 0.01039385, + "balance_loss_clip": 1.29965639, + "balance_loss_mlp": 1.01615095, + "epoch": 0.4115436645122501, + "flos": 18736645645440.0, + "grad_norm": 1.599714065018392, + "language_loss": 0.66596085, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.69119895, + "num_input_tokens_seen": 146935215, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.23205566, + "step": 6845, + "time_per_iteration": 2.844075918197632 + }, + { + "auxiliary_loss_clip": 0.01495056, + "auxiliary_loss_mlp": 0.01040003, + "balance_loss_clip": 1.30832243, + "balance_loss_mlp": 1.01639962, + "epoch": 0.41160378776491807, + "flos": 16261484832000.0, + "grad_norm": 1.8545358261424179, + "language_loss": 0.70765793, + "learning_rate": 2.657235516795808e-06, + "loss": 0.73300844, + "num_input_tokens_seen": 146951970, + "router_z_loss_clip": 1.86621094, + "router_z_loss_mlp": 0.23583984, + "step": 6846, + "time_per_iteration": 2.847303628921509 + }, + { + "auxiliary_loss_clip": 0.01474085, + "auxiliary_loss_mlp": 0.01040416, + "balance_loss_clip": 1.29006851, + "balance_loss_mlp": 1.01668191, + "epoch": 0.41166391101758604, + "flos": 27982983559680.0, + "grad_norm": 2.0740972363976016, + "language_loss": 0.65966374, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.68480873, + "num_input_tokens_seen": 146975615, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.23742676, + "step": 6847, + "time_per_iteration": 2.9432668685913086 + }, + { + "auxiliary_loss_clip": 0.01468088, + "auxiliary_loss_mlp": 0.01042867, + "balance_loss_clip": 1.28329289, + "balance_loss_mlp": 1.01803613, + "epoch": 0.411724034270254, + "flos": 34144593724800.0, + "grad_norm": 1.4479413651102355, + "language_loss": 0.71447742, + "learning_rate": 2.656499802669069e-06, + "loss": 0.73958701, + "num_input_tokens_seen": 146998855, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.24853516, + "step": 6848, + "time_per_iteration": 2.954068183898926 + }, + { + "auxiliary_loss_clip": 0.01255129, + "auxiliary_loss_mlp": 0.01028373, + "balance_loss_clip": 1.14034271, + "balance_loss_mlp": 1.00186062, + "epoch": 0.41178415752292197, + "flos": 67956908486400.0, + "grad_norm": 0.8955491248080992, + "language_loss": 0.56309438, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.58592939, + "num_input_tokens_seen": 147062710, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.265625, + "step": 6849, + "time_per_iteration": 3.441833019256592 + }, + { + "auxiliary_loss_clip": 0.01469596, + "auxiliary_loss_mlp": 0.01040829, + "balance_loss_clip": 1.28840089, + "balance_loss_mlp": 1.01656985, + "epoch": 0.41184428077558993, + "flos": 34327338048000.0, + "grad_norm": 2.4875529014082907, + "language_loss": 0.76922721, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.79433143, + "num_input_tokens_seen": 147086075, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.24267578, + "step": 6850, + "time_per_iteration": 2.91916823387146 + }, + { + "auxiliary_loss_clip": 0.01476821, + "auxiliary_loss_mlp": 0.01043392, + "balance_loss_clip": 1.29365945, + "balance_loss_mlp": 1.01842976, + "epoch": 0.41190440402825795, + "flos": 35457583870080.0, + "grad_norm": 1.6599565790335393, + "language_loss": 0.68368983, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.70889199, + "num_input_tokens_seen": 147107590, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.24975586, + "step": 6851, + "time_per_iteration": 2.9478774070739746 + }, + { + "auxiliary_loss_clip": 0.01489035, + "auxiliary_loss_mlp": 0.01045399, + "balance_loss_clip": 1.30061626, + "balance_loss_mlp": 1.01916122, + "epoch": 0.4119645272809259, + "flos": 20859574538880.0, + "grad_norm": 4.093361043007292, + "language_loss": 0.80608428, + "learning_rate": 2.655028075792743e-06, + "loss": 0.83142859, + "num_input_tokens_seen": 147123715, + "router_z_loss_clip": 1.88671875, + "router_z_loss_mlp": 0.26257324, + "step": 6852, + "time_per_iteration": 2.798630714416504 + }, + { + "auxiliary_loss_clip": 0.01501884, + "auxiliary_loss_mlp": 0.01039983, + "balance_loss_clip": 1.31094241, + "balance_loss_mlp": 1.01536632, + "epoch": 0.4120246505335939, + "flos": 27573172992000.0, + "grad_norm": 2.7528250641062053, + "language_loss": 0.78394663, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.80936527, + "num_input_tokens_seen": 147144290, + "router_z_loss_clip": 1.91015625, + "router_z_loss_mlp": 0.24621582, + "step": 6853, + "time_per_iteration": 2.8998210430145264 + }, + { + "auxiliary_loss_clip": 0.0151055, + "auxiliary_loss_mlp": 0.01040347, + "balance_loss_clip": 1.31571579, + "balance_loss_mlp": 1.01556325, + "epoch": 0.41208477378626185, + "flos": 37829459508480.0, + "grad_norm": 1.7192068076063796, + "language_loss": 0.66882527, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.69433427, + "num_input_tokens_seen": 147166340, + "router_z_loss_clip": 1.94628906, + "router_z_loss_mlp": 0.2479248, + "step": 6854, + "time_per_iteration": 2.9895408153533936 + }, + { + "auxiliary_loss_clip": 0.01489289, + "auxiliary_loss_mlp": 0.01038929, + "balance_loss_clip": 1.30173576, + "balance_loss_mlp": 1.01483643, + "epoch": 0.4121448970389298, + "flos": 23451232008960.0, + "grad_norm": 1.8294741119445719, + "language_loss": 0.8467797, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.87206185, + "num_input_tokens_seen": 147184025, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.24108887, + "step": 6855, + "time_per_iteration": 2.825977325439453 + }, + { + "auxiliary_loss_clip": 0.0146181, + "auxiliary_loss_mlp": 0.01042846, + "balance_loss_clip": 1.28019762, + "balance_loss_mlp": 1.01973104, + "epoch": 0.4122050202915978, + "flos": 21335406549120.0, + "grad_norm": 1.9175771881984625, + "language_loss": 0.79817897, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.8232255, + "num_input_tokens_seen": 147202730, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.23120117, + "step": 6856, + "time_per_iteration": 2.9382219314575195 + }, + { + "auxiliary_loss_clip": 0.01481002, + "auxiliary_loss_mlp": 0.01039472, + "balance_loss_clip": 1.29526663, + "balance_loss_mlp": 1.01665521, + "epoch": 0.41226514354426574, + "flos": 17313900318720.0, + "grad_norm": 2.3882862688394773, + "language_loss": 0.80672741, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.83193213, + "num_input_tokens_seen": 147215315, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.22814941, + "step": 6857, + "time_per_iteration": 2.8730032444000244 + }, + { + "auxiliary_loss_clip": 0.01485696, + "auxiliary_loss_mlp": 0.01040995, + "balance_loss_clip": 1.29840446, + "balance_loss_mlp": 1.01771307, + "epoch": 0.4123252667969337, + "flos": 17647645098240.0, + "grad_norm": 4.477133480406094, + "language_loss": 0.71517229, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.74043924, + "num_input_tokens_seen": 147233330, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.23291016, + "step": 6858, + "time_per_iteration": 2.845611333847046 + }, + { + "auxiliary_loss_clip": 0.01481609, + "auxiliary_loss_mlp": 0.01045615, + "balance_loss_clip": 1.2970283, + "balance_loss_mlp": 1.02092671, + "epoch": 0.4123853900496017, + "flos": 46440730402560.0, + "grad_norm": 1.5771376995442332, + "language_loss": 0.59834838, + "learning_rate": 2.652451598005391e-06, + "loss": 0.62362063, + "num_input_tokens_seen": 147257780, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.24682617, + "step": 6859, + "time_per_iteration": 3.0960280895233154 + }, + { + "auxiliary_loss_clip": 0.01488418, + "auxiliary_loss_mlp": 0.01042702, + "balance_loss_clip": 1.2987653, + "balance_loss_mlp": 1.01918197, + "epoch": 0.41244551330226964, + "flos": 17683913445120.0, + "grad_norm": 1.9663359051327043, + "language_loss": 0.74357039, + "learning_rate": 2.652083430674264e-06, + "loss": 0.76888162, + "num_input_tokens_seen": 147276055, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.23522949, + "step": 6860, + "time_per_iteration": 2.842412233352661 + }, + { + "auxiliary_loss_clip": 0.01472462, + "auxiliary_loss_mlp": 0.01040404, + "balance_loss_clip": 1.28814697, + "balance_loss_mlp": 1.01788545, + "epoch": 0.4125056365549376, + "flos": 18701960866560.0, + "grad_norm": 1.6949722307987445, + "language_loss": 0.75003755, + "learning_rate": 2.651715238616068e-06, + "loss": 0.77516615, + "num_input_tokens_seen": 147293200, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.22509766, + "step": 6861, + "time_per_iteration": 2.8212027549743652 + }, + { + "auxiliary_loss_clip": 0.01470888, + "auxiliary_loss_mlp": 0.01041382, + "balance_loss_clip": 1.28818917, + "balance_loss_mlp": 1.01910138, + "epoch": 0.41256575980760557, + "flos": 17904419193600.0, + "grad_norm": 2.7392445868276383, + "language_loss": 0.8079555, + "learning_rate": 2.651347021844765e-06, + "loss": 0.83307821, + "num_input_tokens_seen": 147310640, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.22290039, + "step": 6862, + "time_per_iteration": 2.795100450515747 + }, + { + "auxiliary_loss_clip": 0.01479343, + "auxiliary_loss_mlp": 0.01040407, + "balance_loss_clip": 1.29515588, + "balance_loss_mlp": 1.01840115, + "epoch": 0.41262588306027354, + "flos": 21991403928960.0, + "grad_norm": 1.7378878457055444, + "language_loss": 0.77036387, + "learning_rate": 2.650978780374318e-06, + "loss": 0.79556143, + "num_input_tokens_seen": 147329435, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.22009277, + "step": 6863, + "time_per_iteration": 4.253396034240723 + }, + { + "auxiliary_loss_clip": 0.0125764, + "auxiliary_loss_mlp": 0.01052216, + "balance_loss_clip": 1.1467793, + "balance_loss_mlp": 1.02904177, + "epoch": 0.41268600631294156, + "flos": 53375350279680.0, + "grad_norm": 0.7120610851283444, + "language_loss": 0.52716279, + "learning_rate": 2.650610514218691e-06, + "loss": 0.55026138, + "num_input_tokens_seen": 147385805, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.23144531, + "step": 6864, + "time_per_iteration": 3.327979326248169 + }, + { + "auxiliary_loss_clip": 0.01497462, + "auxiliary_loss_mlp": 0.01040797, + "balance_loss_clip": 1.30744147, + "balance_loss_mlp": 1.01784968, + "epoch": 0.4127461295656095, + "flos": 24395023434240.0, + "grad_norm": 1.7255539229402597, + "language_loss": 0.73271179, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.75809443, + "num_input_tokens_seen": 147405160, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.22961426, + "step": 6865, + "time_per_iteration": 2.866166353225708 + }, + { + "auxiliary_loss_clip": 0.01252854, + "auxiliary_loss_mlp": 0.01034512, + "balance_loss_clip": 1.14252031, + "balance_loss_mlp": 1.00590169, + "epoch": 0.4128062528182775, + "flos": 71736100974720.0, + "grad_norm": 0.9256410644446629, + "language_loss": 0.66651034, + "learning_rate": 2.649873907907753e-06, + "loss": 0.68938398, + "num_input_tokens_seen": 147460245, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.28515625, + "step": 6866, + "time_per_iteration": 3.1921327114105225 + }, + { + "auxiliary_loss_clip": 0.01466348, + "auxiliary_loss_mlp": 0.01041662, + "balance_loss_clip": 1.28318715, + "balance_loss_mlp": 1.01933408, + "epoch": 0.41286637607094545, + "flos": 17855708526720.0, + "grad_norm": 2.442182167673794, + "language_loss": 0.82400918, + "learning_rate": 2.649505567780375e-06, + "loss": 0.84908921, + "num_input_tokens_seen": 147476200, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.22338867, + "step": 6867, + "time_per_iteration": 2.9046554565429688 + }, + { + "auxiliary_loss_clip": 0.01492797, + "auxiliary_loss_mlp": 0.01050075, + "balance_loss_clip": 1.30393386, + "balance_loss_mlp": 1.02612615, + "epoch": 0.4129264993236134, + "flos": 25558641936000.0, + "grad_norm": 1.9904360713065918, + "language_loss": 0.78160602, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.80703473, + "num_input_tokens_seen": 147494315, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.23950195, + "step": 6868, + "time_per_iteration": 2.9100170135498047 + }, + { + "auxiliary_loss_clip": 0.01246431, + "auxiliary_loss_mlp": 0.01043415, + "balance_loss_clip": 1.13838422, + "balance_loss_mlp": 1.0170939, + "epoch": 0.4129866225762814, + "flos": 65439715253760.0, + "grad_norm": 0.877080018897909, + "language_loss": 0.57936358, + "learning_rate": 2.64876881365164e-06, + "loss": 0.60226202, + "num_input_tokens_seen": 147543665, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.26367188, + "step": 6869, + "time_per_iteration": 3.06510591506958 + }, + { + "auxiliary_loss_clip": 0.01473092, + "auxiliary_loss_mlp": 0.01047068, + "balance_loss_clip": 1.29077935, + "balance_loss_mlp": 1.02311838, + "epoch": 0.41304674582894935, + "flos": 28888832580480.0, + "grad_norm": 1.7110666604344338, + "language_loss": 0.75332403, + "learning_rate": 2.64840039967822e-06, + "loss": 0.77852559, + "num_input_tokens_seen": 147564870, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.23937988, + "step": 6870, + "time_per_iteration": 4.305445194244385 + }, + { + "auxiliary_loss_clip": 0.01478413, + "auxiliary_loss_mlp": 0.01052422, + "balance_loss_clip": 1.29201114, + "balance_loss_mlp": 1.02972436, + "epoch": 0.4131068690816173, + "flos": 22901913164160.0, + "grad_norm": 2.344985545419534, + "language_loss": 0.84253383, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.8678422, + "num_input_tokens_seen": 147584840, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.22680664, + "step": 6871, + "time_per_iteration": 2.8669683933258057 + }, + { + "auxiliary_loss_clip": 0.01492147, + "auxiliary_loss_mlp": 0.01050594, + "balance_loss_clip": 1.30457711, + "balance_loss_mlp": 1.02690744, + "epoch": 0.4131669923342853, + "flos": 26075673976320.0, + "grad_norm": 1.9893437650736632, + "language_loss": 0.69348121, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.71890855, + "num_input_tokens_seen": 147604635, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.23669434, + "step": 6872, + "time_per_iteration": 4.267404556274414 + }, + { + "auxiliary_loss_clip": 0.01490902, + "auxiliary_loss_mlp": 0.01047025, + "balance_loss_clip": 1.30498171, + "balance_loss_mlp": 1.02367187, + "epoch": 0.41322711558695324, + "flos": 19253858664960.0, + "grad_norm": 1.963146595009174, + "language_loss": 0.76522815, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.79060739, + "num_input_tokens_seen": 147620700, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.23376465, + "step": 6873, + "time_per_iteration": 4.210752010345459 + }, + { + "auxiliary_loss_clip": 0.01492228, + "auxiliary_loss_mlp": 0.01046203, + "balance_loss_clip": 1.30374336, + "balance_loss_mlp": 1.02380359, + "epoch": 0.4132872388396212, + "flos": 22684710286080.0, + "grad_norm": 2.278591653059523, + "language_loss": 0.8436712, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.86905551, + "num_input_tokens_seen": 147639490, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.22387695, + "step": 6874, + "time_per_iteration": 2.830596685409546 + }, + { + "auxiliary_loss_clip": 0.01491447, + "auxiliary_loss_mlp": 0.01049935, + "balance_loss_clip": 1.30343723, + "balance_loss_mlp": 1.02653468, + "epoch": 0.4133473620922892, + "flos": 20158033628160.0, + "grad_norm": 1.958956667522632, + "language_loss": 0.72196603, + "learning_rate": 2.646557961279436e-06, + "loss": 0.74737984, + "num_input_tokens_seen": 147657205, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.23413086, + "step": 6875, + "time_per_iteration": 2.830190420150757 + }, + { + "auxiliary_loss_clip": 0.01460331, + "auxiliary_loss_mlp": 0.01051017, + "balance_loss_clip": 1.28374803, + "balance_loss_mlp": 1.03002429, + "epoch": 0.41340748534495714, + "flos": 24253252917120.0, + "grad_norm": 1.531374672648363, + "language_loss": 0.83347952, + "learning_rate": 2.646189399991154e-06, + "loss": 0.85859305, + "num_input_tokens_seen": 147677005, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.20996094, + "step": 6876, + "time_per_iteration": 2.872391939163208 + }, + { + "auxiliary_loss_clip": 0.01493477, + "auxiliary_loss_mlp": 0.01043458, + "balance_loss_clip": 1.30371583, + "balance_loss_mlp": 1.02099931, + "epoch": 0.41346760859762516, + "flos": 14400261717120.0, + "grad_norm": 2.3666888319883275, + "language_loss": 0.67204446, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.6974138, + "num_input_tokens_seen": 147693435, + "router_z_loss_clip": 1.89648438, + "router_z_loss_mlp": 0.22460938, + "step": 6877, + "time_per_iteration": 2.8142096996307373 + }, + { + "auxiliary_loss_clip": 0.0148267, + "auxiliary_loss_mlp": 0.01041894, + "balance_loss_clip": 1.29741311, + "balance_loss_mlp": 1.01955414, + "epoch": 0.4135277318502931, + "flos": 22502192186880.0, + "grad_norm": 2.151678750815433, + "language_loss": 0.77510905, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.8003546, + "num_input_tokens_seen": 147714000, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.22338867, + "step": 6878, + "time_per_iteration": 2.972215414047241 + }, + { + "auxiliary_loss_clip": 0.0147639, + "auxiliary_loss_mlp": 0.0104313, + "balance_loss_clip": 1.29042625, + "balance_loss_mlp": 1.02071917, + "epoch": 0.4135878551029611, + "flos": 22428841086720.0, + "grad_norm": 2.1443786362273833, + "language_loss": 0.8104406, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.83563578, + "num_input_tokens_seen": 147731010, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.22399902, + "step": 6879, + "time_per_iteration": 2.9080541133880615 + }, + { + "auxiliary_loss_clip": 0.01482621, + "auxiliary_loss_mlp": 0.01035603, + "balance_loss_clip": 1.29728162, + "balance_loss_mlp": 1.0139426, + "epoch": 0.41364797835562905, + "flos": 27064737463680.0, + "grad_norm": 1.6899336145076975, + "language_loss": 0.8556205, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.88080275, + "num_input_tokens_seen": 147750880, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.21679688, + "step": 6880, + "time_per_iteration": 2.9099271297454834 + }, + { + "auxiliary_loss_clip": 0.01497324, + "auxiliary_loss_mlp": 0.01032968, + "balance_loss_clip": 1.31036496, + "balance_loss_mlp": 1.01216626, + "epoch": 0.413708101608297, + "flos": 22978250421120.0, + "grad_norm": 1.6306094437850742, + "language_loss": 0.71441436, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.73971725, + "num_input_tokens_seen": 147771360, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.20776367, + "step": 6881, + "time_per_iteration": 2.8861281871795654 + }, + { + "auxiliary_loss_clip": 0.01472447, + "auxiliary_loss_mlp": 0.01041115, + "balance_loss_clip": 1.29374957, + "balance_loss_mlp": 1.01932311, + "epoch": 0.413768224860965, + "flos": 13341783427200.0, + "grad_norm": 1.9254513433758464, + "language_loss": 0.82200646, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.84714204, + "num_input_tokens_seen": 147787440, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.21789551, + "step": 6882, + "time_per_iteration": 2.8194236755371094 + }, + { + "auxiliary_loss_clip": 0.01506984, + "auxiliary_loss_mlp": 0.01040388, + "balance_loss_clip": 1.31642306, + "balance_loss_mlp": 1.0172137, + "epoch": 0.41382834811363295, + "flos": 20823803884800.0, + "grad_norm": 2.39409409242022, + "language_loss": 0.70973873, + "learning_rate": 2.643608785656077e-06, + "loss": 0.73521245, + "num_input_tokens_seen": 147805720, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.23168945, + "step": 6883, + "time_per_iteration": 2.849210500717163 + }, + { + "auxiliary_loss_clip": 0.01488981, + "auxiliary_loss_mlp": 0.01036234, + "balance_loss_clip": 1.30304372, + "balance_loss_mlp": 1.01460969, + "epoch": 0.4138884713663009, + "flos": 20676694481280.0, + "grad_norm": 2.2724040049877807, + "language_loss": 0.7598722, + "learning_rate": 2.643240028730663e-06, + "loss": 0.78512436, + "num_input_tokens_seen": 147824605, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.21618652, + "step": 6884, + "time_per_iteration": 2.847496271133423 + }, + { + "auxiliary_loss_clip": 0.0149666, + "auxiliary_loss_mlp": 0.01041158, + "balance_loss_clip": 1.30699754, + "balance_loss_mlp": 1.01798332, + "epoch": 0.4139485946189689, + "flos": 29067776340480.0, + "grad_norm": 1.4649851508786178, + "language_loss": 0.76812267, + "learning_rate": 2.642871247413523e-06, + "loss": 0.7935009, + "num_input_tokens_seen": 147845445, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.23181152, + "step": 6885, + "time_per_iteration": 2.9419925212860107 + }, + { + "auxiliary_loss_clip": 0.01480984, + "auxiliary_loss_mlp": 0.01039089, + "balance_loss_clip": 1.29431677, + "balance_loss_mlp": 1.01791775, + "epoch": 0.41400871787163684, + "flos": 24436404443520.0, + "grad_norm": 2.2244528028441044, + "language_loss": 0.70695424, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.73215491, + "num_input_tokens_seen": 147865580, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.21154785, + "step": 6886, + "time_per_iteration": 2.8810081481933594 + }, + { + "auxiliary_loss_clip": 0.01500476, + "auxiliary_loss_mlp": 0.01035708, + "balance_loss_clip": 1.31193352, + "balance_loss_mlp": 1.01361871, + "epoch": 0.4140688411243048, + "flos": 19473550007040.0, + "grad_norm": 1.5063437157537314, + "language_loss": 0.76156199, + "learning_rate": 2.642133611660002e-06, + "loss": 0.78692383, + "num_input_tokens_seen": 147885230, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.22094727, + "step": 6887, + "time_per_iteration": 2.8531312942504883 + }, + { + "auxiliary_loss_clip": 0.0147587, + "auxiliary_loss_mlp": 0.0103867, + "balance_loss_clip": 1.29134655, + "balance_loss_mlp": 1.01680744, + "epoch": 0.4141289643769728, + "flos": 19321961368320.0, + "grad_norm": 2.0862045365132196, + "language_loss": 0.71131694, + "learning_rate": 2.641764757251592e-06, + "loss": 0.73646235, + "num_input_tokens_seen": 147903035, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.21850586, + "step": 6888, + "time_per_iteration": 2.7990520000457764 + }, + { + "auxiliary_loss_clip": 0.01479406, + "auxiliary_loss_mlp": 0.01044267, + "balance_loss_clip": 1.29449272, + "balance_loss_mlp": 1.02128303, + "epoch": 0.41418908762964074, + "flos": 16735507050240.0, + "grad_norm": 1.8321557567320765, + "language_loss": 0.76738489, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.79262161, + "num_input_tokens_seen": 147918745, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.22998047, + "step": 6889, + "time_per_iteration": 2.7860748767852783 + }, + { + "auxiliary_loss_clip": 0.01482644, + "auxiliary_loss_mlp": 0.01041093, + "balance_loss_clip": 1.29830194, + "balance_loss_mlp": 1.01754928, + "epoch": 0.41424921088230876, + "flos": 25306573299840.0, + "grad_norm": 1.6306885033708345, + "language_loss": 0.80409825, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.82933569, + "num_input_tokens_seen": 147938265, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.23547363, + "step": 6890, + "time_per_iteration": 2.8611912727355957 + }, + { + "auxiliary_loss_clip": 0.01465247, + "auxiliary_loss_mlp": 0.01041765, + "balance_loss_clip": 1.28518569, + "balance_loss_mlp": 1.01912737, + "epoch": 0.4143093341349767, + "flos": 20970551329920.0, + "grad_norm": 1.521467134705875, + "language_loss": 0.74476725, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.76983738, + "num_input_tokens_seen": 147957320, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.22631836, + "step": 6891, + "time_per_iteration": 2.8638806343078613 + }, + { + "auxiliary_loss_clip": 0.01494331, + "auxiliary_loss_mlp": 0.0104316, + "balance_loss_clip": 1.30581307, + "balance_loss_mlp": 1.01942563, + "epoch": 0.4143694573876447, + "flos": 22027762765440.0, + "grad_norm": 2.244979056049151, + "language_loss": 0.84873593, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.87411082, + "num_input_tokens_seen": 147977045, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.23754883, + "step": 6892, + "time_per_iteration": 2.874753713607788 + }, + { + "auxiliary_loss_clip": 0.01465113, + "auxiliary_loss_mlp": 0.01038307, + "balance_loss_clip": 1.28558075, + "balance_loss_mlp": 1.01570463, + "epoch": 0.41442958064031266, + "flos": 35710376423040.0, + "grad_norm": 2.068802017375974, + "language_loss": 0.70582175, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.73085594, + "num_input_tokens_seen": 147996905, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.22607422, + "step": 6893, + "time_per_iteration": 2.9320008754730225 + }, + { + "auxiliary_loss_clip": 0.01469798, + "auxiliary_loss_mlp": 0.01039285, + "balance_loss_clip": 1.28716445, + "balance_loss_mlp": 1.01627803, + "epoch": 0.4144897038929806, + "flos": 28305598118400.0, + "grad_norm": 2.1881171949293177, + "language_loss": 0.73484409, + "learning_rate": 2.639551120239279e-06, + "loss": 0.7599349, + "num_input_tokens_seen": 148017875, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.23010254, + "step": 6894, + "time_per_iteration": 2.874009370803833 + }, + { + "auxiliary_loss_clip": 0.01483006, + "auxiliary_loss_mlp": 0.01037487, + "balance_loss_clip": 1.29888237, + "balance_loss_mlp": 1.01533818, + "epoch": 0.4145498271456486, + "flos": 11653486513920.0, + "grad_norm": 2.467994921179865, + "language_loss": 0.63668305, + "learning_rate": 2.63918209577416e-06, + "loss": 0.66188794, + "num_input_tokens_seen": 148032300, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.22143555, + "step": 6895, + "time_per_iteration": 2.771549701690674 + }, + { + "auxiliary_loss_clip": 0.01459426, + "auxiliary_loss_mlp": 0.01037018, + "balance_loss_clip": 1.27930975, + "balance_loss_mlp": 1.01560819, + "epoch": 0.41460995039831655, + "flos": 27247165073280.0, + "grad_norm": 1.4727994012119314, + "language_loss": 0.71484256, + "learning_rate": 2.638813047071192e-06, + "loss": 0.73980701, + "num_input_tokens_seen": 148053260, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.2142334, + "step": 6896, + "time_per_iteration": 2.8818492889404297 + }, + { + "auxiliary_loss_clip": 0.0148333, + "auxiliary_loss_mlp": 0.01044976, + "balance_loss_clip": 1.29839325, + "balance_loss_mlp": 1.02280307, + "epoch": 0.4146700736509845, + "flos": 25933631990400.0, + "grad_norm": 1.6286665016432573, + "language_loss": 0.73116541, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.75644845, + "num_input_tokens_seen": 148072965, + "router_z_loss_clip": 1.84960938, + "router_z_loss_mlp": 0.22167969, + "step": 6897, + "time_per_iteration": 2.890639543533325 + }, + { + "auxiliary_loss_clip": 0.01482771, + "auxiliary_loss_mlp": 0.01040744, + "balance_loss_clip": 1.29896235, + "balance_loss_mlp": 1.01883316, + "epoch": 0.4147301969036525, + "flos": 26844412694400.0, + "grad_norm": 1.6545732595967189, + "language_loss": 0.84912288, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.874358, + "num_input_tokens_seen": 148093240, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.21911621, + "step": 6898, + "time_per_iteration": 4.335132360458374 + }, + { + "auxiliary_loss_clip": 0.01477783, + "auxiliary_loss_mlp": 0.01042309, + "balance_loss_clip": 1.29238033, + "balance_loss_mlp": 1.02005291, + "epoch": 0.41479032015632045, + "flos": 20306726599680.0, + "grad_norm": 2.344241084866195, + "language_loss": 0.75035512, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.77555597, + "num_input_tokens_seen": 148110925, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.22241211, + "step": 6899, + "time_per_iteration": 2.9439637660980225 + }, + { + "auxiliary_loss_clip": 0.01482776, + "auxiliary_loss_mlp": 0.01037831, + "balance_loss_clip": 1.29513884, + "balance_loss_mlp": 1.01559889, + "epoch": 0.4148504434089884, + "flos": 25275915308160.0, + "grad_norm": 1.6920190875141417, + "language_loss": 0.76585865, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.79106474, + "num_input_tokens_seen": 148130670, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.22216797, + "step": 6900, + "time_per_iteration": 2.947110652923584 + }, + { + "auxiliary_loss_clip": 0.01458399, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.27707088, + "balance_loss_mlp": 1.01271605, + "epoch": 0.4149105666616564, + "flos": 12829728314880.0, + "grad_norm": 2.70312924849219, + "language_loss": 0.82069004, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.84562099, + "num_input_tokens_seen": 148148350, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.2199707, + "step": 6901, + "time_per_iteration": 2.816500663757324 + }, + { + "auxiliary_loss_clip": 0.01452396, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.27164912, + "balance_loss_mlp": 1.01165962, + "epoch": 0.41497068991432434, + "flos": 16772635048320.0, + "grad_norm": 1.6240311830311718, + "language_loss": 0.70454144, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.72940129, + "num_input_tokens_seen": 148167550, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.21960449, + "step": 6902, + "time_per_iteration": 2.8383026123046875 + }, + { + "auxiliary_loss_clip": 0.01458264, + "auxiliary_loss_mlp": 0.01037114, + "balance_loss_clip": 1.28006101, + "balance_loss_mlp": 1.01651514, + "epoch": 0.4150308131669923, + "flos": 18009061712640.0, + "grad_norm": 1.6322678504652326, + "language_loss": 0.84044623, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.86540002, + "num_input_tokens_seen": 148184740, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.20605469, + "step": 6903, + "time_per_iteration": 2.825498104095459 + }, + { + "auxiliary_loss_clip": 0.01477709, + "auxiliary_loss_mlp": 0.01035113, + "balance_loss_clip": 1.29021764, + "balance_loss_mlp": 1.01184368, + "epoch": 0.41509093641966033, + "flos": 30056975562240.0, + "grad_norm": 2.099364021373867, + "language_loss": 0.68739021, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.71251845, + "num_input_tokens_seen": 148204605, + "router_z_loss_clip": 1.87402344, + "router_z_loss_mlp": 0.23254395, + "step": 6904, + "time_per_iteration": 2.927365303039551 + }, + { + "auxiliary_loss_clip": 0.01477477, + "auxiliary_loss_mlp": 0.01033915, + "balance_loss_clip": 1.29093623, + "balance_loss_mlp": 1.01308894, + "epoch": 0.4151510596723283, + "flos": 24290697628800.0, + "grad_norm": 1.5856569311533761, + "language_loss": 0.78396785, + "learning_rate": 2.635490520350643e-06, + "loss": 0.80908179, + "num_input_tokens_seen": 148224675, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.20825195, + "step": 6905, + "time_per_iteration": 4.293271541595459 + }, + { + "auxiliary_loss_clip": 0.0148135, + "auxiliary_loss_mlp": 0.01034829, + "balance_loss_clip": 1.29516649, + "balance_loss_mlp": 1.01338279, + "epoch": 0.41521118292499626, + "flos": 23486414480640.0, + "grad_norm": 2.2893323212753716, + "language_loss": 0.69413459, + "learning_rate": 2.635121230039025e-06, + "loss": 0.71929634, + "num_input_tokens_seen": 148243375, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.21447754, + "step": 6906, + "time_per_iteration": 4.284117937088013 + }, + { + "auxiliary_loss_clip": 0.0147293, + "auxiliary_loss_mlp": 0.0103805, + "balance_loss_clip": 1.29129636, + "balance_loss_mlp": 1.0170933, + "epoch": 0.4152713061776642, + "flos": 22135165217280.0, + "grad_norm": 2.3240208409198133, + "language_loss": 0.68714958, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.71225935, + "num_input_tokens_seen": 148261140, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.20959473, + "step": 6907, + "time_per_iteration": 2.804215669631958 + }, + { + "auxiliary_loss_clip": 0.01484526, + "auxiliary_loss_mlp": 0.01036051, + "balance_loss_clip": 1.29960179, + "balance_loss_mlp": 1.0150826, + "epoch": 0.4153314294303322, + "flos": 21261241042560.0, + "grad_norm": 1.6852922140142488, + "language_loss": 0.77666736, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.80187309, + "num_input_tokens_seen": 148279655, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.2097168, + "step": 6908, + "time_per_iteration": 4.2858452796936035 + }, + { + "auxiliary_loss_clip": 0.01252653, + "auxiliary_loss_mlp": 0.01028903, + "balance_loss_clip": 1.14677954, + "balance_loss_mlp": 1.00143743, + "epoch": 0.41539155268300015, + "flos": 57949794938880.0, + "grad_norm": 0.7771906642726056, + "language_loss": 0.64911181, + "learning_rate": 2.634013214657026e-06, + "loss": 0.67192733, + "num_input_tokens_seen": 148339005, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.27539062, + "step": 6909, + "time_per_iteration": 3.394258737564087 + }, + { + "auxiliary_loss_clip": 0.01461123, + "auxiliary_loss_mlp": 0.01037713, + "balance_loss_clip": 1.28090119, + "balance_loss_mlp": 1.01681566, + "epoch": 0.4154516759356681, + "flos": 21912532963200.0, + "grad_norm": 1.4477332482717022, + "language_loss": 0.87779993, + "learning_rate": 2.633643828093996e-06, + "loss": 0.90278828, + "num_input_tokens_seen": 148358715, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.20898438, + "step": 6910, + "time_per_iteration": 2.868013858795166 + }, + { + "auxiliary_loss_clip": 0.01255988, + "auxiliary_loss_mlp": 0.01048283, + "balance_loss_clip": 1.14746332, + "balance_loss_mlp": 1.02158022, + "epoch": 0.4155117991883361, + "flos": 67862473309440.0, + "grad_norm": 0.8458552621962913, + "language_loss": 0.62203991, + "learning_rate": 2.633274417503128e-06, + "loss": 0.64508259, + "num_input_tokens_seen": 148417280, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.26757812, + "step": 6911, + "time_per_iteration": 3.2872982025146484 + }, + { + "auxiliary_loss_clip": 0.0151258, + "auxiliary_loss_mlp": 0.01043159, + "balance_loss_clip": 1.3214916, + "balance_loss_mlp": 1.02245259, + "epoch": 0.41557192244100405, + "flos": 14290732759680.0, + "grad_norm": 2.671473350303384, + "language_loss": 0.88278198, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.90833938, + "num_input_tokens_seen": 148432610, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.20727539, + "step": 6912, + "time_per_iteration": 2.8015835285186768 + }, + { + "auxiliary_loss_clip": 0.01486849, + "auxiliary_loss_mlp": 0.0104264, + "balance_loss_clip": 1.3021158, + "balance_loss_mlp": 1.0230062, + "epoch": 0.415632045693672, + "flos": 24472582300800.0, + "grad_norm": 1.9934335804515464, + "language_loss": 0.64108634, + "learning_rate": 2.632535524293914e-06, + "loss": 0.66638124, + "num_input_tokens_seen": 148451510, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.19628906, + "step": 6913, + "time_per_iteration": 2.8591151237487793 + }, + { + "auxiliary_loss_clip": 0.01472271, + "auxiliary_loss_mlp": 0.01046728, + "balance_loss_clip": 1.29105484, + "balance_loss_mlp": 1.02592635, + "epoch": 0.41569216894634, + "flos": 20123529828480.0, + "grad_norm": 1.702501783458402, + "language_loss": 0.76291573, + "learning_rate": 2.632166041703586e-06, + "loss": 0.78810573, + "num_input_tokens_seen": 148469945, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.20812988, + "step": 6914, + "time_per_iteration": 2.86930251121521 + }, + { + "auxiliary_loss_clip": 0.01481192, + "auxiliary_loss_mlp": 0.01054675, + "balance_loss_clip": 1.29648399, + "balance_loss_mlp": 1.03314602, + "epoch": 0.41575229219900794, + "flos": 23807626450560.0, + "grad_norm": 51.10460125866058, + "language_loss": 0.88122994, + "learning_rate": 2.631796535141458e-06, + "loss": 0.90658867, + "num_input_tokens_seen": 148486655, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.2154541, + "step": 6915, + "time_per_iteration": 2.8269529342651367 + }, + { + "auxiliary_loss_clip": 0.01477652, + "auxiliary_loss_mlp": 0.0105588, + "balance_loss_clip": 1.29412389, + "balance_loss_mlp": 1.0346489, + "epoch": 0.4158124154516759, + "flos": 23117532474240.0, + "grad_norm": 1.8723635601343491, + "language_loss": 0.72166598, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.74700129, + "num_input_tokens_seen": 148505035, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.21228027, + "step": 6916, + "time_per_iteration": 2.872739553451538 + }, + { + "auxiliary_loss_clip": 0.01487386, + "auxiliary_loss_mlp": 0.01067389, + "balance_loss_clip": 1.30050254, + "balance_loss_mlp": 1.0457046, + "epoch": 0.41587253870434393, + "flos": 24253524385920.0, + "grad_norm": 1.9128601953372484, + "language_loss": 0.73042518, + "learning_rate": 2.631057450157852e-06, + "loss": 0.75597292, + "num_input_tokens_seen": 148525575, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.21691895, + "step": 6917, + "time_per_iteration": 2.8505101203918457 + }, + { + "auxiliary_loss_clip": 0.01475226, + "auxiliary_loss_mlp": 0.01064792, + "balance_loss_clip": 1.29219627, + "balance_loss_mlp": 1.04331017, + "epoch": 0.4159326619570119, + "flos": 23892967422720.0, + "grad_norm": 1.4375623847442527, + "language_loss": 0.8136549, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.83905512, + "num_input_tokens_seen": 148547270, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.21472168, + "step": 6918, + "time_per_iteration": 2.9246771335601807 + }, + { + "auxiliary_loss_clip": 0.01487141, + "auxiliary_loss_mlp": 0.01062799, + "balance_loss_clip": 1.29998672, + "balance_loss_mlp": 1.04072142, + "epoch": 0.41599278520967986, + "flos": 40641713216640.0, + "grad_norm": 1.3686734367103763, + "language_loss": 0.70739472, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.73289412, + "num_input_tokens_seen": 148572100, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.2208252, + "step": 6919, + "time_per_iteration": 3.016178607940674 + }, + { + "auxiliary_loss_clip": 0.01474998, + "auxiliary_loss_mlp": 0.01057138, + "balance_loss_clip": 1.2909143, + "balance_loss_mlp": 1.03491759, + "epoch": 0.4160529084623478, + "flos": 18231965435520.0, + "grad_norm": 2.386813292630589, + "language_loss": 0.82383597, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.84915739, + "num_input_tokens_seen": 148591245, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.22229004, + "step": 6920, + "time_per_iteration": 2.8413913249969482 + }, + { + "auxiliary_loss_clip": 0.01484151, + "auxiliary_loss_mlp": 0.01056841, + "balance_loss_clip": 1.29764628, + "balance_loss_mlp": 1.03550267, + "epoch": 0.4161130317150158, + "flos": 13669736872320.0, + "grad_norm": 2.044282077591011, + "language_loss": 0.66650283, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.69191277, + "num_input_tokens_seen": 148607980, + "router_z_loss_clip": 1.86425781, + "router_z_loss_mlp": 0.21350098, + "step": 6921, + "time_per_iteration": 2.8346188068389893 + }, + { + "auxiliary_loss_clip": 0.01474566, + "auxiliary_loss_mlp": 0.01061958, + "balance_loss_clip": 1.2917937, + "balance_loss_mlp": 1.04145384, + "epoch": 0.41617315496768376, + "flos": 16186685898240.0, + "grad_norm": 2.1138902007449554, + "language_loss": 0.81754172, + "learning_rate": 2.629209319173274e-06, + "loss": 0.84290695, + "num_input_tokens_seen": 148624490, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.20507812, + "step": 6922, + "time_per_iteration": 2.802933931350708 + }, + { + "auxiliary_loss_clip": 0.01483414, + "auxiliary_loss_mlp": 0.01056405, + "balance_loss_clip": 1.29789782, + "balance_loss_mlp": 1.03493512, + "epoch": 0.4162332782203517, + "flos": 26224412192640.0, + "grad_norm": 1.7558697755764596, + "language_loss": 0.67999846, + "learning_rate": 2.628839621341247e-06, + "loss": 0.70539665, + "num_input_tokens_seen": 148646490, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.21484375, + "step": 6923, + "time_per_iteration": 2.9095559120178223 + }, + { + "auxiliary_loss_clip": 0.01477531, + "auxiliary_loss_mlp": 0.0105999, + "balance_loss_clip": 1.29531956, + "balance_loss_mlp": 1.03823423, + "epoch": 0.4162934014730197, + "flos": 28195707202560.0, + "grad_norm": 2.285550320669461, + "language_loss": 0.76605278, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.79142797, + "num_input_tokens_seen": 148668580, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.2175293, + "step": 6924, + "time_per_iteration": 2.9351460933685303 + }, + { + "auxiliary_loss_clip": 0.01478628, + "auxiliary_loss_mlp": 0.01051322, + "balance_loss_clip": 1.29323554, + "balance_loss_mlp": 1.02992415, + "epoch": 0.41635352472568765, + "flos": 19874899797120.0, + "grad_norm": 1.6550886337975523, + "language_loss": 0.74146307, + "learning_rate": 2.62810015415423e-06, + "loss": 0.76676255, + "num_input_tokens_seen": 148688410, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.21386719, + "step": 6925, + "time_per_iteration": 2.865053176879883 + }, + { + "auxiliary_loss_clip": 0.01462434, + "auxiliary_loss_mlp": 0.01044292, + "balance_loss_clip": 1.27908087, + "balance_loss_mlp": 1.02294183, + "epoch": 0.4164136479783556, + "flos": 14942522373120.0, + "grad_norm": 1.909725537250393, + "language_loss": 0.85100877, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.87607598, + "num_input_tokens_seen": 148704855, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.21337891, + "step": 6926, + "time_per_iteration": 2.852017879486084 + }, + { + "auxiliary_loss_clip": 0.01453261, + "auxiliary_loss_mlp": 0.01040208, + "balance_loss_clip": 1.27353692, + "balance_loss_mlp": 1.01923871, + "epoch": 0.4164737712310236, + "flos": 21766373700480.0, + "grad_norm": 1.5604733655339171, + "language_loss": 0.86753726, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.89247197, + "num_input_tokens_seen": 148723065, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.2097168, + "step": 6927, + "time_per_iteration": 2.854295253753662 + }, + { + "auxiliary_loss_clip": 0.0146678, + "auxiliary_loss_mlp": 0.01040521, + "balance_loss_clip": 1.2844975, + "balance_loss_mlp": 1.01826477, + "epoch": 0.41653389448369155, + "flos": 20749728867840.0, + "grad_norm": 2.233191990431906, + "language_loss": 0.73933947, + "learning_rate": 2.626990774776604e-06, + "loss": 0.76441246, + "num_input_tokens_seen": 148741780, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.22277832, + "step": 6928, + "time_per_iteration": 2.8461570739746094 + }, + { + "auxiliary_loss_clip": 0.0146041, + "auxiliary_loss_mlp": 0.01039763, + "balance_loss_clip": 1.27751517, + "balance_loss_mlp": 1.01884186, + "epoch": 0.4165940177363595, + "flos": 24983687272320.0, + "grad_norm": 1.9142636850439767, + "language_loss": 0.79323101, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.81823277, + "num_input_tokens_seen": 148759795, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.20922852, + "step": 6929, + "time_per_iteration": 2.8861517906188965 + }, + { + "auxiliary_loss_clip": 0.0145235, + "auxiliary_loss_mlp": 0.01035197, + "balance_loss_clip": 1.27221715, + "balance_loss_mlp": 1.01437068, + "epoch": 0.41665414098902753, + "flos": 20531394869760.0, + "grad_norm": 1.865926894726875, + "language_loss": 0.71662635, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.74150181, + "num_input_tokens_seen": 148778680, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.20812988, + "step": 6930, + "time_per_iteration": 2.8529043197631836 + }, + { + "auxiliary_loss_clip": 0.01474129, + "auxiliary_loss_mlp": 0.01040283, + "balance_loss_clip": 1.28860402, + "balance_loss_mlp": 1.01898062, + "epoch": 0.4167142642416955, + "flos": 19692472187520.0, + "grad_norm": 1.8546738627967763, + "language_loss": 0.82375193, + "learning_rate": 2.625881181419007e-06, + "loss": 0.84889603, + "num_input_tokens_seen": 148796470, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.21313477, + "step": 6931, + "time_per_iteration": 2.8160171508789062 + }, + { + "auxiliary_loss_clip": 0.01444953, + "auxiliary_loss_mlp": 0.01035498, + "balance_loss_clip": 1.26467752, + "balance_loss_mlp": 1.01499426, + "epoch": 0.41677438749436346, + "flos": 23772896426880.0, + "grad_norm": 2.161705083668155, + "language_loss": 0.79546475, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.82026929, + "num_input_tokens_seen": 148815300, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.20495605, + "step": 6932, + "time_per_iteration": 2.835158586502075 + }, + { + "auxiliary_loss_clip": 0.014636, + "auxiliary_loss_mlp": 0.01037273, + "balance_loss_clip": 1.27967048, + "balance_loss_mlp": 1.0153985, + "epoch": 0.41683451074703143, + "flos": 30422464208640.0, + "grad_norm": 20.986902689540873, + "language_loss": 0.83089423, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.85590291, + "num_input_tokens_seen": 148834315, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.21887207, + "step": 6933, + "time_per_iteration": 4.368880987167358 + }, + { + "auxiliary_loss_clip": 0.01484847, + "auxiliary_loss_mlp": 0.01034173, + "balance_loss_clip": 1.29705071, + "balance_loss_mlp": 1.01315594, + "epoch": 0.4168946339996994, + "flos": 21516657793920.0, + "grad_norm": 1.9328715218891803, + "language_loss": 0.78258759, + "learning_rate": 2.624771374460121e-06, + "loss": 0.80777776, + "num_input_tokens_seen": 148852420, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.21008301, + "step": 6934, + "time_per_iteration": 2.8357813358306885 + }, + { + "auxiliary_loss_clip": 0.01470189, + "auxiliary_loss_mlp": 0.01036224, + "balance_loss_clip": 1.28683448, + "balance_loss_mlp": 1.01508796, + "epoch": 0.41695475725236736, + "flos": 17647418874240.0, + "grad_norm": 2.2453886195172106, + "language_loss": 0.67586637, + "learning_rate": 2.624401391405668e-06, + "loss": 0.70093048, + "num_input_tokens_seen": 148869305, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.21130371, + "step": 6935, + "time_per_iteration": 2.8146512508392334 + }, + { + "auxiliary_loss_clip": 0.01454667, + "auxiliary_loss_mlp": 0.01042065, + "balance_loss_clip": 1.27430534, + "balance_loss_mlp": 1.02077413, + "epoch": 0.4170148805050353, + "flos": 15677254984320.0, + "grad_norm": 2.0788519493053337, + "language_loss": 0.74133217, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.76629949, + "num_input_tokens_seen": 148886395, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.21289062, + "step": 6936, + "time_per_iteration": 2.8332698345184326 + }, + { + "auxiliary_loss_clip": 0.01449274, + "auxiliary_loss_mlp": 0.0103712, + "balance_loss_clip": 1.27065301, + "balance_loss_mlp": 1.01611495, + "epoch": 0.4170750037577033, + "flos": 15167643091200.0, + "grad_norm": 2.1428575130020566, + "language_loss": 0.74944675, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.77431071, + "num_input_tokens_seen": 148905235, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.21020508, + "step": 6937, + "time_per_iteration": 2.845737934112549 + }, + { + "auxiliary_loss_clip": 0.01467997, + "auxiliary_loss_mlp": 0.01042055, + "balance_loss_clip": 1.28589821, + "balance_loss_mlp": 1.0216701, + "epoch": 0.41713512701037125, + "flos": 28780299008640.0, + "grad_norm": 1.4530718571695742, + "language_loss": 0.84988558, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.87498617, + "num_input_tokens_seen": 148928130, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.20373535, + "step": 6938, + "time_per_iteration": 2.948174238204956 + }, + { + "auxiliary_loss_clip": 0.01478745, + "auxiliary_loss_mlp": 0.01036367, + "balance_loss_clip": 1.29159498, + "balance_loss_mlp": 1.01473045, + "epoch": 0.4171952502630392, + "flos": 28268741589120.0, + "grad_norm": 1.8487419512897938, + "language_loss": 0.7521016, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.77725267, + "num_input_tokens_seen": 148948790, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.21630859, + "step": 6939, + "time_per_iteration": 2.9203481674194336 + }, + { + "auxiliary_loss_clip": 0.01467879, + "auxiliary_loss_mlp": 0.01039927, + "balance_loss_clip": 1.28714681, + "balance_loss_mlp": 1.01871967, + "epoch": 0.4172553735157072, + "flos": 24582427971840.0, + "grad_norm": 2.7700366780817016, + "language_loss": 0.76045078, + "learning_rate": 2.622551121253579e-06, + "loss": 0.78552884, + "num_input_tokens_seen": 148967690, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.21203613, + "step": 6940, + "time_per_iteration": 4.357048273086548 + }, + { + "auxiliary_loss_clip": 0.01481827, + "auxiliary_loss_mlp": 0.01039044, + "balance_loss_clip": 1.29697824, + "balance_loss_mlp": 1.01782513, + "epoch": 0.41731549676837515, + "flos": 27055371790080.0, + "grad_norm": 1.8408681868401786, + "language_loss": 0.72253007, + "learning_rate": 2.622180996345424e-06, + "loss": 0.74773878, + "num_input_tokens_seen": 148987150, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.21203613, + "step": 6941, + "time_per_iteration": 2.895787477493286 + }, + { + "auxiliary_loss_clip": 0.01476224, + "auxiliary_loss_mlp": 0.01039877, + "balance_loss_clip": 1.2907238, + "balance_loss_mlp": 1.01886046, + "epoch": 0.4173756200210431, + "flos": 28403544407040.0, + "grad_norm": 2.013371769581459, + "language_loss": 0.75155663, + "learning_rate": 2.621810847844104e-06, + "loss": 0.77671766, + "num_input_tokens_seen": 149004895, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.21032715, + "step": 6942, + "time_per_iteration": 4.377228021621704 + }, + { + "auxiliary_loss_clip": 0.0148712, + "auxiliary_loss_mlp": 0.01041561, + "balance_loss_clip": 1.29919624, + "balance_loss_mlp": 1.01863718, + "epoch": 0.41743574327371114, + "flos": 22529954511360.0, + "grad_norm": 1.956096788048457, + "language_loss": 0.73660433, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.76189113, + "num_input_tokens_seen": 149020970, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.22937012, + "step": 6943, + "time_per_iteration": 4.1981213092803955 + }, + { + "auxiliary_loss_clip": 0.0148145, + "auxiliary_loss_mlp": 0.0104029, + "balance_loss_clip": 1.29667068, + "balance_loss_mlp": 1.0184387, + "epoch": 0.4174958665263791, + "flos": 30124535328000.0, + "grad_norm": 1.7126117407783177, + "language_loss": 0.64420986, + "learning_rate": 2.621070480118111e-06, + "loss": 0.66942728, + "num_input_tokens_seen": 149041795, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.21838379, + "step": 6944, + "time_per_iteration": 2.9117696285247803 + }, + { + "auxiliary_loss_clip": 0.01470877, + "auxiliary_loss_mlp": 0.01035753, + "balance_loss_clip": 1.28672457, + "balance_loss_mlp": 1.01507044, + "epoch": 0.41755598977904707, + "flos": 25273969781760.0, + "grad_norm": 1.4298131554761693, + "language_loss": 0.70252073, + "learning_rate": 2.620700260921513e-06, + "loss": 0.72758704, + "num_input_tokens_seen": 149063700, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.20678711, + "step": 6945, + "time_per_iteration": 2.8991613388061523 + }, + { + "auxiliary_loss_clip": 0.01469244, + "auxiliary_loss_mlp": 0.01039206, + "balance_loss_clip": 1.28652608, + "balance_loss_mlp": 1.01653218, + "epoch": 0.41761611303171503, + "flos": 19838088512640.0, + "grad_norm": 2.349867726684987, + "language_loss": 0.8195315, + "learning_rate": 2.620330018187899e-06, + "loss": 0.844616, + "num_input_tokens_seen": 149082410, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.22692871, + "step": 6946, + "time_per_iteration": 2.826840400695801 + }, + { + "auxiliary_loss_clip": 0.01466835, + "auxiliary_loss_mlp": 0.01032692, + "balance_loss_clip": 1.28528857, + "balance_loss_mlp": 1.01265335, + "epoch": 0.417676236284383, + "flos": 15531638659200.0, + "grad_norm": 2.2101053592946225, + "language_loss": 0.78553551, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.81053078, + "num_input_tokens_seen": 149098745, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.20031738, + "step": 6947, + "time_per_iteration": 2.814042091369629 + }, + { + "auxiliary_loss_clip": 0.01483151, + "auxiliary_loss_mlp": 0.01041072, + "balance_loss_clip": 1.30036557, + "balance_loss_mlp": 1.01996052, + "epoch": 0.41773635953705096, + "flos": 32536977569280.0, + "grad_norm": 1.9046167915130512, + "language_loss": 0.72204638, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.74728864, + "num_input_tokens_seen": 149122255, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.21130371, + "step": 6948, + "time_per_iteration": 2.9695961475372314 + }, + { + "auxiliary_loss_clip": 0.01463306, + "auxiliary_loss_mlp": 0.01033928, + "balance_loss_clip": 1.28247499, + "balance_loss_mlp": 1.0124588, + "epoch": 0.4177964827897189, + "flos": 23451458232960.0, + "grad_norm": 1.4846338326293698, + "language_loss": 0.77570128, + "learning_rate": 2.619219148905362e-06, + "loss": 0.80067354, + "num_input_tokens_seen": 149142845, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.21459961, + "step": 6949, + "time_per_iteration": 2.9076483249664307 + }, + { + "auxiliary_loss_clip": 0.01489484, + "auxiliary_loss_mlp": 0.01040472, + "balance_loss_clip": 1.3012712, + "balance_loss_mlp": 1.01815557, + "epoch": 0.4178566060423869, + "flos": 22759554464640.0, + "grad_norm": 1.5961192244066145, + "language_loss": 0.82606208, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.85136175, + "num_input_tokens_seen": 149163375, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.22314453, + "step": 6950, + "time_per_iteration": 2.868067979812622 + }, + { + "auxiliary_loss_clip": 0.01461476, + "auxiliary_loss_mlp": 0.01037403, + "balance_loss_clip": 1.2837944, + "balance_loss_mlp": 1.01698232, + "epoch": 0.41791672929505486, + "flos": 26044337312640.0, + "grad_norm": 1.2995413972626246, + "language_loss": 0.76720595, + "learning_rate": 2.618478451956007e-06, + "loss": 0.79219472, + "num_input_tokens_seen": 149185610, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.20410156, + "step": 6951, + "time_per_iteration": 2.9133458137512207 + }, + { + "auxiliary_loss_clip": 0.01496025, + "auxiliary_loss_mlp": 0.01036162, + "balance_loss_clip": 1.30617118, + "balance_loss_mlp": 1.01491904, + "epoch": 0.4179768525477228, + "flos": 19576970916480.0, + "grad_norm": 2.072155769672736, + "language_loss": 0.73949558, + "learning_rate": 2.61810806829516e-06, + "loss": 0.76481748, + "num_input_tokens_seen": 149203990, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.21240234, + "step": 6952, + "time_per_iteration": 2.913940668106079 + }, + { + "auxiliary_loss_clip": 0.01488186, + "auxiliary_loss_mlp": 0.01040312, + "balance_loss_clip": 1.30328619, + "balance_loss_mlp": 1.01903367, + "epoch": 0.4180369758003908, + "flos": 17792401772160.0, + "grad_norm": 3.633599836318094, + "language_loss": 0.72552836, + "learning_rate": 2.617737661195593e-06, + "loss": 0.75081336, + "num_input_tokens_seen": 149221385, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.21276855, + "step": 6953, + "time_per_iteration": 2.840087652206421 + }, + { + "auxiliary_loss_clip": 0.01463346, + "auxiliary_loss_mlp": 0.01043201, + "balance_loss_clip": 1.28504133, + "balance_loss_mlp": 1.02133775, + "epoch": 0.41809709905305875, + "flos": 20970732309120.0, + "grad_norm": 1.6945976652244814, + "language_loss": 0.76557839, + "learning_rate": 2.617367230671353e-06, + "loss": 0.79064387, + "num_input_tokens_seen": 149241175, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.21875, + "step": 6954, + "time_per_iteration": 2.8516314029693604 + }, + { + "auxiliary_loss_clip": 0.01477502, + "auxiliary_loss_mlp": 0.01038084, + "balance_loss_clip": 1.29374623, + "balance_loss_mlp": 1.01498151, + "epoch": 0.4181572223057267, + "flos": 22027672275840.0, + "grad_norm": 9.045867496618138, + "language_loss": 0.8536846, + "learning_rate": 2.616996776736485e-06, + "loss": 0.87884045, + "num_input_tokens_seen": 149259115, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.23120117, + "step": 6955, + "time_per_iteration": 2.841114044189453 + }, + { + "auxiliary_loss_clip": 0.01472283, + "auxiliary_loss_mlp": 0.01041824, + "balance_loss_clip": 1.29160571, + "balance_loss_mlp": 1.02068806, + "epoch": 0.4182173455583947, + "flos": 26255522632320.0, + "grad_norm": 1.4754475660360569, + "language_loss": 0.83965337, + "learning_rate": 2.616626299405037e-06, + "loss": 0.86479449, + "num_input_tokens_seen": 149278705, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.21142578, + "step": 6956, + "time_per_iteration": 2.875835418701172 + }, + { + "auxiliary_loss_clip": 0.01483582, + "auxiliary_loss_mlp": 0.01047647, + "balance_loss_clip": 1.2974354, + "balance_loss_mlp": 1.02544975, + "epoch": 0.4182774688110627, + "flos": 14799304022400.0, + "grad_norm": 1.9607425705864332, + "language_loss": 0.72399294, + "learning_rate": 2.616255798691059e-06, + "loss": 0.74930525, + "num_input_tokens_seen": 149294040, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.22192383, + "step": 6957, + "time_per_iteration": 2.8217484951019287 + }, + { + "auxiliary_loss_clip": 0.01483802, + "auxiliary_loss_mlp": 0.01047625, + "balance_loss_clip": 1.29910171, + "balance_loss_mlp": 1.0275147, + "epoch": 0.41833759206373067, + "flos": 20421594443520.0, + "grad_norm": 2.3926090424247364, + "language_loss": 0.76508433, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.7903986, + "num_input_tokens_seen": 149310385, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.20117188, + "step": 6958, + "time_per_iteration": 2.840210199356079 + }, + { + "auxiliary_loss_clip": 0.01475072, + "auxiliary_loss_mlp": 0.01038185, + "balance_loss_clip": 1.29263234, + "balance_loss_mlp": 1.01634574, + "epoch": 0.41839771531639863, + "flos": 23665991667840.0, + "grad_norm": 2.17587133155173, + "language_loss": 0.77637887, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.80151141, + "num_input_tokens_seen": 149328235, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.21862793, + "step": 6959, + "time_per_iteration": 2.891871929168701 + }, + { + "auxiliary_loss_clip": 0.0146624, + "auxiliary_loss_mlp": 0.01045702, + "balance_loss_clip": 1.28369713, + "balance_loss_mlp": 1.02308774, + "epoch": 0.4184578385690666, + "flos": 19763244334080.0, + "grad_norm": 4.293041062202975, + "language_loss": 0.77613914, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.80125856, + "num_input_tokens_seen": 149347465, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.22595215, + "step": 6960, + "time_per_iteration": 2.8494880199432373 + }, + { + "auxiliary_loss_clip": 0.01452702, + "auxiliary_loss_mlp": 0.01036228, + "balance_loss_clip": 1.27725089, + "balance_loss_mlp": 1.01596236, + "epoch": 0.41851796182173456, + "flos": 20202988976640.0, + "grad_norm": 1.9026062030936706, + "language_loss": 0.76240593, + "learning_rate": 2.614773562290835e-06, + "loss": 0.78729528, + "num_input_tokens_seen": 149366685, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.20263672, + "step": 6961, + "time_per_iteration": 2.892587661743164 + }, + { + "auxiliary_loss_clip": 0.01258455, + "auxiliary_loss_mlp": 0.01042979, + "balance_loss_clip": 1.14956069, + "balance_loss_mlp": 1.02276075, + "epoch": 0.41857808507440253, + "flos": 59049111300480.0, + "grad_norm": 0.8063364173127168, + "language_loss": 0.54660928, + "learning_rate": 2.61440294487496e-06, + "loss": 0.56962359, + "num_input_tokens_seen": 149422925, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.20214844, + "step": 6962, + "time_per_iteration": 3.2956666946411133 + }, + { + "auxiliary_loss_clip": 0.01481467, + "auxiliary_loss_mlp": 0.01044099, + "balance_loss_clip": 1.29758716, + "balance_loss_mlp": 1.02214122, + "epoch": 0.4186382083270705, + "flos": 18488015614080.0, + "grad_norm": 2.2766321197361776, + "language_loss": 0.86465979, + "learning_rate": 2.614032304160864e-06, + "loss": 0.88991541, + "num_input_tokens_seen": 149440820, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.21972656, + "step": 6963, + "time_per_iteration": 2.810253143310547 + }, + { + "auxiliary_loss_clip": 0.01472497, + "auxiliary_loss_mlp": 0.01040866, + "balance_loss_clip": 1.29071426, + "balance_loss_mlp": 1.01959896, + "epoch": 0.41869833157973846, + "flos": 21588425326080.0, + "grad_norm": 1.4629381714428389, + "language_loss": 0.70801353, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.73314714, + "num_input_tokens_seen": 149461060, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.21264648, + "step": 6964, + "time_per_iteration": 2.879701852798462 + }, + { + "auxiliary_loss_clip": 0.01465373, + "auxiliary_loss_mlp": 0.01045158, + "balance_loss_clip": 1.28472662, + "balance_loss_mlp": 1.02435565, + "epoch": 0.4187584548324064, + "flos": 35530980215040.0, + "grad_norm": 1.4481141809320326, + "language_loss": 0.7142176, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.7393229, + "num_input_tokens_seen": 149483115, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.20800781, + "step": 6965, + "time_per_iteration": 2.9654433727264404 + }, + { + "auxiliary_loss_clip": 0.01462367, + "auxiliary_loss_mlp": 0.01040844, + "balance_loss_clip": 1.28465247, + "balance_loss_mlp": 1.02035201, + "epoch": 0.4188185780850744, + "flos": 18663746993280.0, + "grad_norm": 1.5963621322425618, + "language_loss": 0.72639203, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.75142413, + "num_input_tokens_seen": 149501495, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.20483398, + "step": 6966, + "time_per_iteration": 2.860440969467163 + }, + { + "auxiliary_loss_clip": 0.01489192, + "auxiliary_loss_mlp": 0.01040701, + "balance_loss_clip": 1.30237877, + "balance_loss_mlp": 1.02062654, + "epoch": 0.41887870133774235, + "flos": 40348218326400.0, + "grad_norm": 2.1020135896272722, + "language_loss": 0.71813226, + "learning_rate": 2.612549508603375e-06, + "loss": 0.74343115, + "num_input_tokens_seen": 149523170, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.20056152, + "step": 6967, + "time_per_iteration": 2.9962122440338135 + }, + { + "auxiliary_loss_clip": 0.01260775, + "auxiliary_loss_mlp": 0.0106169, + "balance_loss_clip": 1.15326071, + "balance_loss_mlp": 1.0415678, + "epoch": 0.4189388245904103, + "flos": 61397323908480.0, + "grad_norm": 0.6927624031303851, + "language_loss": 0.46415129, + "learning_rate": 2.612178751609011e-06, + "loss": 0.48737594, + "num_input_tokens_seen": 149583955, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.20117188, + "step": 6968, + "time_per_iteration": 4.790131568908691 + }, + { + "auxiliary_loss_clip": 0.01491045, + "auxiliary_loss_mlp": 0.01042645, + "balance_loss_clip": 1.30477428, + "balance_loss_mlp": 1.02118754, + "epoch": 0.4189989478430783, + "flos": 28226093725440.0, + "grad_norm": 4.011384829411986, + "language_loss": 0.7577309, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.78306782, + "num_input_tokens_seen": 149604440, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.21472168, + "step": 6969, + "time_per_iteration": 2.972778081893921 + }, + { + "auxiliary_loss_clip": 0.0147837, + "auxiliary_loss_mlp": 0.01037639, + "balance_loss_clip": 1.29505086, + "balance_loss_mlp": 1.01767135, + "epoch": 0.4190590710957463, + "flos": 24575098314240.0, + "grad_norm": 1.7767477510818215, + "language_loss": 0.81700528, + "learning_rate": 2.611437167992705e-06, + "loss": 0.84216535, + "num_input_tokens_seen": 149623745, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.19970703, + "step": 6970, + "time_per_iteration": 2.8886237144470215 + }, + { + "auxiliary_loss_clip": 0.01479819, + "auxiliary_loss_mlp": 0.01038399, + "balance_loss_clip": 1.29820085, + "balance_loss_mlp": 1.0171324, + "epoch": 0.41911919434841427, + "flos": 21736077667200.0, + "grad_norm": 2.0101854926999514, + "language_loss": 0.83935928, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.86454153, + "num_input_tokens_seen": 149643025, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.21276855, + "step": 6971, + "time_per_iteration": 2.8384807109832764 + }, + { + "auxiliary_loss_clip": 0.01464827, + "auxiliary_loss_mlp": 0.010379, + "balance_loss_clip": 1.28759885, + "balance_loss_mlp": 1.01688385, + "epoch": 0.41917931760108224, + "flos": 17610109896960.0, + "grad_norm": 2.0602187282357254, + "language_loss": 0.76149315, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.78652036, + "num_input_tokens_seen": 149660695, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.21020508, + "step": 6972, + "time_per_iteration": 2.8807787895202637 + }, + { + "auxiliary_loss_clip": 0.01474144, + "auxiliary_loss_mlp": 0.01034161, + "balance_loss_clip": 1.29151726, + "balance_loss_mlp": 1.01301336, + "epoch": 0.4192394408537502, + "flos": 37831043076480.0, + "grad_norm": 2.068725423560535, + "language_loss": 0.74054348, + "learning_rate": 2.610324618710212e-06, + "loss": 0.76562655, + "num_input_tokens_seen": 149682040, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.21142578, + "step": 6973, + "time_per_iteration": 2.9747424125671387 + }, + { + "auxiliary_loss_clip": 0.01495943, + "auxiliary_loss_mlp": 0.01051953, + "balance_loss_clip": 1.30737257, + "balance_loss_mlp": 1.03033996, + "epoch": 0.41929956410641817, + "flos": 23116899047040.0, + "grad_norm": 2.207100165315424, + "language_loss": 0.75391716, + "learning_rate": 2.609953722643489e-06, + "loss": 0.77939612, + "num_input_tokens_seen": 149700855, + "router_z_loss_clip": 1.88769531, + "router_z_loss_mlp": 0.21606445, + "step": 6974, + "time_per_iteration": 2.900017023086548 + }, + { + "auxiliary_loss_clip": 0.01475565, + "auxiliary_loss_mlp": 0.01034976, + "balance_loss_clip": 1.29237962, + "balance_loss_mlp": 1.01286292, + "epoch": 0.41935968735908613, + "flos": 22533709829760.0, + "grad_norm": 1.8605671930701877, + "language_loss": 0.73611575, + "learning_rate": 2.609582803447259e-06, + "loss": 0.76122117, + "num_input_tokens_seen": 149717360, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.22131348, + "step": 6975, + "time_per_iteration": 2.8255467414855957 + }, + { + "auxiliary_loss_clip": 0.01477884, + "auxiliary_loss_mlp": 0.01041602, + "balance_loss_clip": 1.29735482, + "balance_loss_mlp": 1.02058494, + "epoch": 0.4194198106117541, + "flos": 26881812161280.0, + "grad_norm": 1.5106497502758174, + "language_loss": 0.8132267, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.83842152, + "num_input_tokens_seen": 149738975, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.21020508, + "step": 6976, + "time_per_iteration": 4.2981579303741455 + }, + { + "auxiliary_loss_clip": 0.01474775, + "auxiliary_loss_mlp": 0.01033588, + "balance_loss_clip": 1.29079533, + "balance_loss_mlp": 1.01307237, + "epoch": 0.41947993386442206, + "flos": 19912163529600.0, + "grad_norm": 2.8812566517915332, + "language_loss": 0.68811631, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.71319997, + "num_input_tokens_seen": 149757055, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.20532227, + "step": 6977, + "time_per_iteration": 4.289769172668457 + }, + { + "auxiliary_loss_clip": 0.01494509, + "auxiliary_loss_mlp": 0.01037467, + "balance_loss_clip": 1.3093555, + "balance_loss_mlp": 1.01696265, + "epoch": 0.41954005711709, + "flos": 17392726039680.0, + "grad_norm": 2.9154316667675504, + "language_loss": 0.81678128, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.84210104, + "num_input_tokens_seen": 149772885, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.20507812, + "step": 6978, + "time_per_iteration": 4.249008893966675 + }, + { + "auxiliary_loss_clip": 0.01490809, + "auxiliary_loss_mlp": 0.01039361, + "balance_loss_clip": 1.30266118, + "balance_loss_mlp": 1.01808226, + "epoch": 0.419600180369758, + "flos": 25012535472000.0, + "grad_norm": 2.108662572138678, + "language_loss": 0.83634329, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.86164498, + "num_input_tokens_seen": 149791515, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.21276855, + "step": 6979, + "time_per_iteration": 2.860628843307495 + }, + { + "auxiliary_loss_clip": 0.01470422, + "auxiliary_loss_mlp": 0.01037111, + "balance_loss_clip": 1.28805983, + "balance_loss_mlp": 1.01661909, + "epoch": 0.41966030362242596, + "flos": 17392590305280.0, + "grad_norm": 2.0394303736265296, + "language_loss": 0.84031326, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.86538851, + "num_input_tokens_seen": 149807250, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.20495605, + "step": 6980, + "time_per_iteration": 2.8149101734161377 + }, + { + "auxiliary_loss_clip": 0.01493048, + "auxiliary_loss_mlp": 0.01042595, + "balance_loss_clip": 1.30573094, + "balance_loss_mlp": 1.02124429, + "epoch": 0.4197204268750939, + "flos": 22164330130560.0, + "grad_norm": 3.7157782841751343, + "language_loss": 0.78961658, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.814973, + "num_input_tokens_seen": 149821640, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.21350098, + "step": 6981, + "time_per_iteration": 2.8314263820648193 + }, + { + "auxiliary_loss_clip": 0.01464816, + "auxiliary_loss_mlp": 0.01033454, + "balance_loss_clip": 1.28449512, + "balance_loss_mlp": 1.01331997, + "epoch": 0.4197805501277619, + "flos": 22092517353600.0, + "grad_norm": 1.659330205870222, + "language_loss": 0.84737492, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.87235761, + "num_input_tokens_seen": 149840545, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.20129395, + "step": 6982, + "time_per_iteration": 2.8203184604644775 + }, + { + "auxiliary_loss_clip": 0.01492462, + "auxiliary_loss_mlp": 0.01041528, + "balance_loss_clip": 1.30451131, + "balance_loss_mlp": 1.0191288, + "epoch": 0.4198406733804299, + "flos": 26443334373120.0, + "grad_norm": 2.3397612971467714, + "language_loss": 0.57896996, + "learning_rate": 2.606614618903214e-06, + "loss": 0.6043098, + "num_input_tokens_seen": 149860375, + "router_z_loss_clip": 1.87792969, + "router_z_loss_mlp": 0.22399902, + "step": 6983, + "time_per_iteration": 2.866570472717285 + }, + { + "auxiliary_loss_clip": 0.01469857, + "auxiliary_loss_mlp": 0.01037726, + "balance_loss_clip": 1.28894353, + "balance_loss_mlp": 1.01688838, + "epoch": 0.4199007966330979, + "flos": 12538540909440.0, + "grad_norm": 1.9110522756041868, + "language_loss": 0.82976985, + "learning_rate": 2.606243492174471e-06, + "loss": 0.85484564, + "num_input_tokens_seen": 149877850, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.20837402, + "step": 6984, + "time_per_iteration": 2.877274513244629 + }, + { + "auxiliary_loss_clip": 0.01472555, + "auxiliary_loss_mlp": 0.01035902, + "balance_loss_clip": 1.29003048, + "balance_loss_mlp": 1.01448059, + "epoch": 0.41996091988576584, + "flos": 21773115175680.0, + "grad_norm": 3.029549289545675, + "language_loss": 0.80100471, + "learning_rate": 2.605872342456914e-06, + "loss": 0.82608926, + "num_input_tokens_seen": 149896110, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.2142334, + "step": 6985, + "time_per_iteration": 2.882906436920166 + }, + { + "auxiliary_loss_clip": 0.01492519, + "auxiliary_loss_mlp": 0.01039031, + "balance_loss_clip": 1.30203116, + "balance_loss_mlp": 1.01702535, + "epoch": 0.4200210431384338, + "flos": 26553180044160.0, + "grad_norm": 1.6346867830224485, + "language_loss": 0.78715229, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.81246775, + "num_input_tokens_seen": 149916495, + "router_z_loss_clip": 1.90527344, + "router_z_loss_mlp": 0.2199707, + "step": 6986, + "time_per_iteration": 2.8790605068206787 + }, + { + "auxiliary_loss_clip": 0.01456653, + "auxiliary_loss_mlp": 0.01040019, + "balance_loss_clip": 1.27918267, + "balance_loss_mlp": 1.01976514, + "epoch": 0.42008116639110177, + "flos": 26806741758720.0, + "grad_norm": 1.5749747318342266, + "language_loss": 0.72929549, + "learning_rate": 2.605129974111655e-06, + "loss": 0.75426221, + "num_input_tokens_seen": 149936445, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.20251465, + "step": 6987, + "time_per_iteration": 2.9014296531677246 + }, + { + "auxiliary_loss_clip": 0.01489899, + "auxiliary_loss_mlp": 0.01042678, + "balance_loss_clip": 1.30509841, + "balance_loss_mlp": 1.02000487, + "epoch": 0.42014128964376973, + "flos": 32099902369920.0, + "grad_norm": 1.4226539290910547, + "language_loss": 0.75629282, + "learning_rate": 2.604758755512104e-06, + "loss": 0.78161865, + "num_input_tokens_seen": 149959430, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.22680664, + "step": 6988, + "time_per_iteration": 2.96189546585083 + }, + { + "auxiliary_loss_clip": 0.01489384, + "auxiliary_loss_mlp": 0.01039472, + "balance_loss_clip": 1.30319643, + "balance_loss_mlp": 1.01793051, + "epoch": 0.4202014128964377, + "flos": 26477883417600.0, + "grad_norm": 1.610340073058688, + "language_loss": 0.74859422, + "learning_rate": 2.60438751398004e-06, + "loss": 0.77388275, + "num_input_tokens_seen": 149980365, + "router_z_loss_clip": 1.86230469, + "router_z_loss_mlp": 0.2154541, + "step": 6989, + "time_per_iteration": 2.8682408332824707 + }, + { + "auxiliary_loss_clip": 0.01486926, + "auxiliary_loss_mlp": 0.01036394, + "balance_loss_clip": 1.30002093, + "balance_loss_mlp": 1.01387572, + "epoch": 0.42026153614910566, + "flos": 13407533400960.0, + "grad_norm": 2.1342757526053613, + "language_loss": 0.72604561, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.75127888, + "num_input_tokens_seen": 149997375, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.2253418, + "step": 6990, + "time_per_iteration": 2.8554816246032715 + }, + { + "auxiliary_loss_clip": 0.0123708, + "auxiliary_loss_mlp": 0.01067683, + "balance_loss_clip": 1.13369608, + "balance_loss_mlp": 1.04584372, + "epoch": 0.42032165940177363, + "flos": 60278796489600.0, + "grad_norm": 0.8340442210893206, + "language_loss": 0.60463494, + "learning_rate": 2.603644962174685e-06, + "loss": 0.62768257, + "num_input_tokens_seen": 150051230, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.21875, + "step": 6991, + "time_per_iteration": 3.216486930847168 + }, + { + "auxiliary_loss_clip": 0.01487786, + "auxiliary_loss_mlp": 0.01040797, + "balance_loss_clip": 1.30170858, + "balance_loss_mlp": 1.01958942, + "epoch": 0.4203817826544416, + "flos": 24545933400960.0, + "grad_norm": 1.7116787687561292, + "language_loss": 0.83772886, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.8630147, + "num_input_tokens_seen": 150071135, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.2121582, + "step": 6992, + "time_per_iteration": 2.9229466915130615 + }, + { + "auxiliary_loss_clip": 0.01249257, + "auxiliary_loss_mlp": 0.01028689, + "balance_loss_clip": 1.13891459, + "balance_loss_mlp": 1.00255859, + "epoch": 0.42044190590710956, + "flos": 58847002202880.0, + "grad_norm": 0.8220601391908872, + "language_loss": 0.65514743, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.6779269, + "num_input_tokens_seen": 150125220, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.26171875, + "step": 6993, + "time_per_iteration": 3.2792551517486572 + }, + { + "auxiliary_loss_clip": 0.01505107, + "auxiliary_loss_mlp": 0.01044729, + "balance_loss_clip": 1.31368935, + "balance_loss_mlp": 1.02253175, + "epoch": 0.4205020291597775, + "flos": 16444862582400.0, + "grad_norm": 2.292474609036427, + "language_loss": 0.84994435, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.87544274, + "num_input_tokens_seen": 150142300, + "router_z_loss_clip": 1.91308594, + "router_z_loss_mlp": 0.2220459, + "step": 6994, + "time_per_iteration": 2.8137123584747314 + }, + { + "auxiliary_loss_clip": 0.01465952, + "auxiliary_loss_mlp": 0.01044228, + "balance_loss_clip": 1.28613949, + "balance_loss_mlp": 1.02260303, + "epoch": 0.4205621524124455, + "flos": 18414800248320.0, + "grad_norm": 1.9413686532627623, + "language_loss": 0.78479773, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.80989957, + "num_input_tokens_seen": 150161345, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.21606445, + "step": 6995, + "time_per_iteration": 2.958728313446045 + }, + { + "auxiliary_loss_clip": 0.01467161, + "auxiliary_loss_mlp": 0.01038398, + "balance_loss_clip": 1.28860712, + "balance_loss_mlp": 1.01668978, + "epoch": 0.4206222756651135, + "flos": 25531015345920.0, + "grad_norm": 1.684865431290551, + "language_loss": 0.80347037, + "learning_rate": 2.60178818232786e-06, + "loss": 0.8285259, + "num_input_tokens_seen": 150182420, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.21728516, + "step": 6996, + "time_per_iteration": 2.919257640838623 + }, + { + "auxiliary_loss_clip": 0.01482464, + "auxiliary_loss_mlp": 0.01036655, + "balance_loss_clip": 1.29870546, + "balance_loss_mlp": 1.01570988, + "epoch": 0.4206823989177815, + "flos": 15312671233920.0, + "grad_norm": 2.1422672535636496, + "language_loss": 0.76564932, + "learning_rate": 2.601416757842559e-06, + "loss": 0.79084051, + "num_input_tokens_seen": 150200175, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.20947266, + "step": 6997, + "time_per_iteration": 2.831120729446411 + }, + { + "auxiliary_loss_clip": 0.01473797, + "auxiliary_loss_mlp": 0.01040035, + "balance_loss_clip": 1.28897905, + "balance_loss_mlp": 1.01786232, + "epoch": 0.42074252217044944, + "flos": 15561029796480.0, + "grad_norm": 1.9669222648243108, + "language_loss": 0.76430142, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.78943974, + "num_input_tokens_seen": 150217100, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.22167969, + "step": 6998, + "time_per_iteration": 2.884909152984619 + }, + { + "auxiliary_loss_clip": 0.01496699, + "auxiliary_loss_mlp": 0.01043216, + "balance_loss_clip": 1.30747998, + "balance_loss_mlp": 1.01985097, + "epoch": 0.4208026454231174, + "flos": 26158390750080.0, + "grad_norm": 1.8825937142846703, + "language_loss": 0.76726645, + "learning_rate": 2.60067384046869e-06, + "loss": 0.7926656, + "num_input_tokens_seen": 150239830, + "router_z_loss_clip": 1.89257812, + "router_z_loss_mlp": 0.23364258, + "step": 6999, + "time_per_iteration": 2.9460692405700684 + }, + { + "auxiliary_loss_clip": 0.01485037, + "auxiliary_loss_mlp": 0.0103942, + "balance_loss_clip": 1.30255699, + "balance_loss_mlp": 1.01715159, + "epoch": 0.42086276867578537, + "flos": 23560579987200.0, + "grad_norm": 2.1086304538172906, + "language_loss": 0.64846218, + "learning_rate": 2.600302347608295e-06, + "loss": 0.67370677, + "num_input_tokens_seen": 150260690, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.22277832, + "step": 7000, + "time_per_iteration": 2.876311779022217 + }, + { + "auxiliary_loss_clip": 0.01505175, + "auxiliary_loss_mlp": 0.01041527, + "balance_loss_clip": 1.31818974, + "balance_loss_mlp": 1.01946127, + "epoch": 0.42092289192845334, + "flos": 18122391233280.0, + "grad_norm": 2.7971499855369766, + "language_loss": 0.76809978, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.79356682, + "num_input_tokens_seen": 150279885, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.22058105, + "step": 7001, + "time_per_iteration": 2.8683078289031982 + }, + { + "auxiliary_loss_clip": 0.01493342, + "auxiliary_loss_mlp": 0.0104237, + "balance_loss_clip": 1.3107537, + "balance_loss_mlp": 1.02037644, + "epoch": 0.4209830151811213, + "flos": 20014996256640.0, + "grad_norm": 1.4713260996602677, + "language_loss": 0.87539077, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.9007479, + "num_input_tokens_seen": 150297390, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.2199707, + "step": 7002, + "time_per_iteration": 2.821183919906616 + }, + { + "auxiliary_loss_clip": 0.01487936, + "auxiliary_loss_mlp": 0.01039949, + "balance_loss_clip": 1.3047483, + "balance_loss_mlp": 1.01812136, + "epoch": 0.42104313843378927, + "flos": 21988417772160.0, + "grad_norm": 2.74588773827634, + "language_loss": 0.69099939, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.71627831, + "num_input_tokens_seen": 150317390, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.21813965, + "step": 7003, + "time_per_iteration": 4.3195414543151855 + }, + { + "auxiliary_loss_clip": 0.0150112, + "auxiliary_loss_mlp": 0.01045068, + "balance_loss_clip": 1.31310153, + "balance_loss_mlp": 1.02269268, + "epoch": 0.42110326168645723, + "flos": 25454044661760.0, + "grad_norm": 2.8304112830688326, + "language_loss": 0.78699446, + "learning_rate": 2.598816148672344e-06, + "loss": 0.81245637, + "num_input_tokens_seen": 150337455, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.22387695, + "step": 7004, + "time_per_iteration": 2.850839376449585 + }, + { + "auxiliary_loss_clip": 0.01473248, + "auxiliary_loss_mlp": 0.01043284, + "balance_loss_clip": 1.29325962, + "balance_loss_mlp": 1.02086091, + "epoch": 0.4211633849391252, + "flos": 17831746765440.0, + "grad_norm": 1.5485617199741195, + "language_loss": 0.68767118, + "learning_rate": 2.59844454213521e-06, + "loss": 0.7128365, + "num_input_tokens_seen": 150355385, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.22424316, + "step": 7005, + "time_per_iteration": 2.827394723892212 + }, + { + "auxiliary_loss_clip": 0.01488992, + "auxiliary_loss_mlp": 0.01039569, + "balance_loss_clip": 1.30399156, + "balance_loss_mlp": 1.01826632, + "epoch": 0.42122350819179316, + "flos": 16289201911680.0, + "grad_norm": 3.3379986156692643, + "language_loss": 0.73598015, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.7612657, + "num_input_tokens_seen": 150371750, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.21289062, + "step": 7006, + "time_per_iteration": 2.8123152256011963 + }, + { + "auxiliary_loss_clip": 0.01491863, + "auxiliary_loss_mlp": 0.01045648, + "balance_loss_clip": 1.3057667, + "balance_loss_mlp": 1.02357054, + "epoch": 0.4212836314444611, + "flos": 19655253699840.0, + "grad_norm": 1.7690035117024923, + "language_loss": 0.72057617, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.74595124, + "num_input_tokens_seen": 150389955, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.22058105, + "step": 7007, + "time_per_iteration": 2.8527493476867676 + }, + { + "auxiliary_loss_clip": 0.01493062, + "auxiliary_loss_mlp": 0.01039276, + "balance_loss_clip": 1.30781615, + "balance_loss_mlp": 1.01815176, + "epoch": 0.4213437546971291, + "flos": 18378124698240.0, + "grad_norm": 1.90739813346141, + "language_loss": 0.8351171, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.86044049, + "num_input_tokens_seen": 150405780, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.21105957, + "step": 7008, + "time_per_iteration": 2.82950758934021 + }, + { + "auxiliary_loss_clip": 0.01499595, + "auxiliary_loss_mlp": 0.01043111, + "balance_loss_clip": 1.31433821, + "balance_loss_mlp": 1.02193987, + "epoch": 0.42140387794979706, + "flos": 27714762529920.0, + "grad_norm": 6.601080614480888, + "language_loss": 0.72483325, + "learning_rate": 2.596957889196831e-06, + "loss": 0.75026035, + "num_input_tokens_seen": 150425615, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.21179199, + "step": 7009, + "time_per_iteration": 2.8988163471221924 + }, + { + "auxiliary_loss_clip": 0.01494499, + "auxiliary_loss_mlp": 0.01041598, + "balance_loss_clip": 1.30674088, + "balance_loss_mlp": 1.01994991, + "epoch": 0.4214640012024651, + "flos": 28158669694080.0, + "grad_norm": 2.679354477922483, + "language_loss": 0.6721642, + "learning_rate": 2.596586169335243e-06, + "loss": 0.69752508, + "num_input_tokens_seen": 150445765, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.21643066, + "step": 7010, + "time_per_iteration": 4.312077045440674 + }, + { + "auxiliary_loss_clip": 0.01483473, + "auxiliary_loss_mlp": 0.01041964, + "balance_loss_clip": 1.30011582, + "balance_loss_mlp": 1.01983857, + "epoch": 0.42152412445513304, + "flos": 23006148480000.0, + "grad_norm": 1.5982273158777511, + "language_loss": 0.73336387, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.75861824, + "num_input_tokens_seen": 150464405, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.22119141, + "step": 7011, + "time_per_iteration": 2.858715534210205 + }, + { + "auxiliary_loss_clip": 0.01258668, + "auxiliary_loss_mlp": 0.01058152, + "balance_loss_clip": 1.14398456, + "balance_loss_mlp": 1.03431058, + "epoch": 0.421584247707801, + "flos": 63777931793280.0, + "grad_norm": 0.8037487176202057, + "language_loss": 0.54360449, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.5667727, + "num_input_tokens_seen": 150520430, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.23828125, + "step": 7012, + "time_per_iteration": 4.65428614616394 + }, + { + "auxiliary_loss_clip": 0.01496618, + "auxiliary_loss_mlp": 0.01044578, + "balance_loss_clip": 1.31094682, + "balance_loss_mlp": 1.02234519, + "epoch": 0.421644370960469, + "flos": 24324839470080.0, + "grad_norm": 1.3460731999058508, + "language_loss": 0.79333448, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.81874645, + "num_input_tokens_seen": 150542610, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.22253418, + "step": 7013, + "time_per_iteration": 2.8751492500305176 + }, + { + "auxiliary_loss_clip": 0.0149669, + "auxiliary_loss_mlp": 0.01039269, + "balance_loss_clip": 1.31057549, + "balance_loss_mlp": 1.01698899, + "epoch": 0.42170449421313694, + "flos": 23451141519360.0, + "grad_norm": 1.7597739343981018, + "language_loss": 0.81911528, + "learning_rate": 2.595099063803787e-06, + "loss": 0.84447491, + "num_input_tokens_seen": 150560970, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.22302246, + "step": 7014, + "time_per_iteration": 4.251984596252441 + }, + { + "auxiliary_loss_clip": 0.01486161, + "auxiliary_loss_mlp": 0.0104053, + "balance_loss_clip": 1.30335951, + "balance_loss_mlp": 1.01937079, + "epoch": 0.4217646174658049, + "flos": 23705834353920.0, + "grad_norm": 1.5276312301688555, + "language_loss": 0.78386289, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.80912977, + "num_input_tokens_seen": 150582615, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.21154785, + "step": 7015, + "time_per_iteration": 2.941279649734497 + }, + { + "auxiliary_loss_clip": 0.0150906, + "auxiliary_loss_mlp": 0.01045807, + "balance_loss_clip": 1.32228625, + "balance_loss_mlp": 1.02363384, + "epoch": 0.42182474071847287, + "flos": 24982013214720.0, + "grad_norm": 1.3006194860310722, + "language_loss": 0.82479608, + "learning_rate": 2.594355375584368e-06, + "loss": 0.85034478, + "num_input_tokens_seen": 150603640, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.22192383, + "step": 7016, + "time_per_iteration": 2.8927063941955566 + }, + { + "auxiliary_loss_clip": 0.01481851, + "auxiliary_loss_mlp": 0.01043386, + "balance_loss_clip": 1.29755306, + "balance_loss_mlp": 1.02102208, + "epoch": 0.42188486397114083, + "flos": 22866866426880.0, + "grad_norm": 2.0216304957056566, + "language_loss": 0.68283767, + "learning_rate": 2.593983497660586e-06, + "loss": 0.70809001, + "num_input_tokens_seen": 150622490, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.22351074, + "step": 7017, + "time_per_iteration": 2.8800899982452393 + }, + { + "auxiliary_loss_clip": 0.01259009, + "auxiliary_loss_mlp": 0.01042215, + "balance_loss_clip": 1.14233446, + "balance_loss_mlp": 1.01036251, + "epoch": 0.4219449872238088, + "flos": 67008864049920.0, + "grad_norm": 0.6999819402752542, + "language_loss": 0.59535193, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.61836421, + "num_input_tokens_seen": 150689545, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.31835938, + "step": 7018, + "time_per_iteration": 3.4934003353118896 + }, + { + "auxiliary_loss_clip": 0.01500266, + "auxiliary_loss_mlp": 0.01042261, + "balance_loss_clip": 1.31235552, + "balance_loss_mlp": 1.01971877, + "epoch": 0.42200511047647676, + "flos": 13123177960320.0, + "grad_norm": 1.7399984355213636, + "language_loss": 0.7625314, + "learning_rate": 2.593239674255382e-06, + "loss": 0.7879566, + "num_input_tokens_seen": 150707610, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.22546387, + "step": 7019, + "time_per_iteration": 2.8802733421325684 + }, + { + "auxiliary_loss_clip": 0.01490527, + "auxiliary_loss_mlp": 0.01045892, + "balance_loss_clip": 1.30478728, + "balance_loss_mlp": 1.02301598, + "epoch": 0.42206523372914473, + "flos": 13999273885440.0, + "grad_norm": 1.9190720975663869, + "language_loss": 0.70470041, + "learning_rate": 2.592867728802166e-06, + "loss": 0.73006463, + "num_input_tokens_seen": 150724530, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.22888184, + "step": 7020, + "time_per_iteration": 2.9077346324920654 + }, + { + "auxiliary_loss_clip": 0.01466212, + "auxiliary_loss_mlp": 0.01043984, + "balance_loss_clip": 1.28972268, + "balance_loss_mlp": 1.02275324, + "epoch": 0.4221253569818127, + "flos": 21951742222080.0, + "grad_norm": 1.6088626705174531, + "language_loss": 0.81768298, + "learning_rate": 2.592495760867347e-06, + "loss": 0.842785, + "num_input_tokens_seen": 150742870, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.21240234, + "step": 7021, + "time_per_iteration": 2.8436410427093506 + }, + { + "auxiliary_loss_clip": 0.01494564, + "auxiliary_loss_mlp": 0.01039033, + "balance_loss_clip": 1.3097105, + "balance_loss_mlp": 1.01739645, + "epoch": 0.42218548023448066, + "flos": 32204092440960.0, + "grad_norm": 2.590373004056052, + "language_loss": 0.70636308, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.73169911, + "num_input_tokens_seen": 150765500, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.21643066, + "step": 7022, + "time_per_iteration": 2.9064908027648926 + }, + { + "auxiliary_loss_clip": 0.01469825, + "auxiliary_loss_mlp": 0.01042911, + "balance_loss_clip": 1.29404545, + "balance_loss_mlp": 1.02263391, + "epoch": 0.4222456034871487, + "flos": 30131503027200.0, + "grad_norm": 1.8180980027926745, + "language_loss": 0.68076485, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.70589221, + "num_input_tokens_seen": 150784945, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.20263672, + "step": 7023, + "time_per_iteration": 2.888573408126831 + }, + { + "auxiliary_loss_clip": 0.01470825, + "auxiliary_loss_mlp": 0.01047326, + "balance_loss_clip": 1.29392958, + "balance_loss_mlp": 1.02469993, + "epoch": 0.42230572673981664, + "flos": 22138196618880.0, + "grad_norm": 86.62136877134265, + "language_loss": 0.70155597, + "learning_rate": 2.591379722314322e-06, + "loss": 0.72673744, + "num_input_tokens_seen": 150803120, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.22619629, + "step": 7024, + "time_per_iteration": 2.8557558059692383 + }, + { + "auxiliary_loss_clip": 0.01491888, + "auxiliary_loss_mlp": 0.01046972, + "balance_loss_clip": 1.30906272, + "balance_loss_mlp": 1.02444136, + "epoch": 0.4223658499924846, + "flos": 22065388456320.0, + "grad_norm": 1.5703480439390414, + "language_loss": 0.77760351, + "learning_rate": 2.591007664594147e-06, + "loss": 0.80299211, + "num_input_tokens_seen": 150823135, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.2253418, + "step": 7025, + "time_per_iteration": 2.845501184463501 + }, + { + "auxiliary_loss_clip": 0.01477506, + "auxiliary_loss_mlp": 0.01041958, + "balance_loss_clip": 1.2980237, + "balance_loss_mlp": 1.02072692, + "epoch": 0.4224259732451526, + "flos": 20419965630720.0, + "grad_norm": 2.34086604854161, + "language_loss": 0.80604005, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.83123469, + "num_input_tokens_seen": 150842070, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.2121582, + "step": 7026, + "time_per_iteration": 2.8462634086608887 + }, + { + "auxiliary_loss_clip": 0.01249087, + "auxiliary_loss_mlp": 0.01023345, + "balance_loss_clip": 1.1385057, + "balance_loss_mlp": 0.99740553, + "epoch": 0.42248609649782054, + "flos": 62877150190080.0, + "grad_norm": 0.7446523305026598, + "language_loss": 0.62018424, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.64290857, + "num_input_tokens_seen": 150907450, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.25976562, + "step": 7027, + "time_per_iteration": 3.4912173748016357 + }, + { + "auxiliary_loss_clip": 0.01470628, + "auxiliary_loss_mlp": 0.01042727, + "balance_loss_clip": 1.29159629, + "balance_loss_mlp": 1.02131736, + "epoch": 0.4225462197504885, + "flos": 26261087742720.0, + "grad_norm": 3.8807768588098766, + "language_loss": 0.72196579, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.74709934, + "num_input_tokens_seen": 150928040, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.21398926, + "step": 7028, + "time_per_iteration": 2.9317526817321777 + }, + { + "auxiliary_loss_clip": 0.01485412, + "auxiliary_loss_mlp": 0.01039563, + "balance_loss_clip": 1.301687, + "balance_loss_mlp": 1.01823652, + "epoch": 0.42260634300315647, + "flos": 20531666338560.0, + "grad_norm": 5.4552553355504525, + "language_loss": 0.83495855, + "learning_rate": 2.589519209743846e-06, + "loss": 0.86020827, + "num_input_tokens_seen": 150945760, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.21325684, + "step": 7029, + "time_per_iteration": 2.8726391792297363 + }, + { + "auxiliary_loss_clip": 0.0149303, + "auxiliary_loss_mlp": 0.01045643, + "balance_loss_clip": 1.30818379, + "balance_loss_mlp": 1.02336264, + "epoch": 0.42266646625582444, + "flos": 24327508913280.0, + "grad_norm": 2.0756842603538708, + "language_loss": 0.7603488, + "learning_rate": 2.589147040109424e-06, + "loss": 0.78573549, + "num_input_tokens_seen": 150965665, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.22277832, + "step": 7030, + "time_per_iteration": 2.8746137619018555 + }, + { + "auxiliary_loss_clip": 0.01468579, + "auxiliary_loss_mlp": 0.01043195, + "balance_loss_clip": 1.28731465, + "balance_loss_mlp": 1.02067685, + "epoch": 0.4227265895084924, + "flos": 24214088903040.0, + "grad_norm": 2.08951110867092, + "language_loss": 0.87615341, + "learning_rate": 2.588774848134486e-06, + "loss": 0.9012711, + "num_input_tokens_seen": 150982260, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.22509766, + "step": 7031, + "time_per_iteration": 2.881432056427002 + }, + { + "auxiliary_loss_clip": 0.01481282, + "auxiliary_loss_mlp": 0.01040047, + "balance_loss_clip": 1.29958189, + "balance_loss_mlp": 1.01813626, + "epoch": 0.42278671276116037, + "flos": 16918522842240.0, + "grad_norm": 2.656614747530965, + "language_loss": 0.74310148, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.76831478, + "num_input_tokens_seen": 150999990, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.21923828, + "step": 7032, + "time_per_iteration": 2.881918430328369 + }, + { + "auxiliary_loss_clip": 0.01484819, + "auxiliary_loss_mlp": 0.01039618, + "balance_loss_clip": 1.3003087, + "balance_loss_mlp": 1.01872063, + "epoch": 0.42284683601382833, + "flos": 25422074570880.0, + "grad_norm": 1.888868450620308, + "language_loss": 0.71466637, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.73991072, + "num_input_tokens_seen": 151021105, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.20898438, + "step": 7033, + "time_per_iteration": 2.883237361907959 + }, + { + "auxiliary_loss_clip": 0.01491012, + "auxiliary_loss_mlp": 0.01041097, + "balance_loss_clip": 1.30583572, + "balance_loss_mlp": 1.01963902, + "epoch": 0.4229069592664963, + "flos": 23050922849280.0, + "grad_norm": 2.042455612043299, + "language_loss": 0.90749627, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.93281734, + "num_input_tokens_seen": 151040665, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.21447754, + "step": 7034, + "time_per_iteration": 2.8584516048431396 + }, + { + "auxiliary_loss_clip": 0.01462662, + "auxiliary_loss_mlp": 0.01046785, + "balance_loss_clip": 1.28473949, + "balance_loss_mlp": 1.02593565, + "epoch": 0.42296708251916426, + "flos": 26078524398720.0, + "grad_norm": 1.6007280110117255, + "language_loss": 0.77524281, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.80033731, + "num_input_tokens_seen": 151061240, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.20837402, + "step": 7035, + "time_per_iteration": 2.8498620986938477 + }, + { + "auxiliary_loss_clip": 0.01482387, + "auxiliary_loss_mlp": 0.01048949, + "balance_loss_clip": 1.29946375, + "balance_loss_mlp": 1.02670455, + "epoch": 0.4230272057718323, + "flos": 19466401328640.0, + "grad_norm": 1.9194140636193495, + "language_loss": 0.83340186, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.85871518, + "num_input_tokens_seen": 151076870, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.22241211, + "step": 7036, + "time_per_iteration": 2.782806873321533 + }, + { + "auxiliary_loss_clip": 0.01466723, + "auxiliary_loss_mlp": 0.01034881, + "balance_loss_clip": 1.28933072, + "balance_loss_mlp": 1.01362586, + "epoch": 0.42308732902450025, + "flos": 22393658615040.0, + "grad_norm": 1.7187036248296264, + "language_loss": 0.70867074, + "learning_rate": 2.58654122792447e-06, + "loss": 0.7336868, + "num_input_tokens_seen": 151095110, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.21252441, + "step": 7037, + "time_per_iteration": 2.8160901069641113 + }, + { + "auxiliary_loss_clip": 0.01469409, + "auxiliary_loss_mlp": 0.0103918, + "balance_loss_clip": 1.28837764, + "balance_loss_mlp": 1.01760268, + "epoch": 0.4231474522771682, + "flos": 21005055129600.0, + "grad_norm": 1.7101236323065594, + "language_loss": 0.78157759, + "learning_rate": 2.586168879961155e-06, + "loss": 0.80666339, + "num_input_tokens_seen": 151114355, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.21594238, + "step": 7038, + "time_per_iteration": 2.8704192638397217 + }, + { + "auxiliary_loss_clip": 0.01483933, + "auxiliary_loss_mlp": 0.01046777, + "balance_loss_clip": 1.29793704, + "balance_loss_mlp": 1.02399611, + "epoch": 0.4232075755298362, + "flos": 14984084361600.0, + "grad_norm": 1.9811879588500545, + "language_loss": 0.67932767, + "learning_rate": 2.585796509770259e-06, + "loss": 0.70463479, + "num_input_tokens_seen": 151131505, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.22766113, + "step": 7039, + "time_per_iteration": 4.259572505950928 + }, + { + "auxiliary_loss_clip": 0.01488543, + "auxiliary_loss_mlp": 0.01045491, + "balance_loss_clip": 1.30125248, + "balance_loss_mlp": 1.02465332, + "epoch": 0.42326769878250414, + "flos": 24542721020160.0, + "grad_norm": 1.57980839223373, + "language_loss": 0.76103163, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.78637195, + "num_input_tokens_seen": 151151555, + "router_z_loss_clip": 1.87207031, + "router_z_loss_mlp": 0.20849609, + "step": 7040, + "time_per_iteration": 2.868297815322876 + }, + { + "auxiliary_loss_clip": 0.01471628, + "auxiliary_loss_mlp": 0.01039375, + "balance_loss_clip": 1.28978705, + "balance_loss_mlp": 1.01723814, + "epoch": 0.4233278220351721, + "flos": 26881631182080.0, + "grad_norm": 1.7125138574712695, + "language_loss": 0.66394711, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.68905723, + "num_input_tokens_seen": 151172385, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.22143555, + "step": 7041, + "time_per_iteration": 2.8884286880493164 + }, + { + "auxiliary_loss_clip": 0.01474239, + "auxiliary_loss_mlp": 0.0103796, + "balance_loss_clip": 1.29169416, + "balance_loss_mlp": 1.01573944, + "epoch": 0.4233879452878401, + "flos": 42830482573440.0, + "grad_norm": 1.727720612260796, + "language_loss": 0.74959695, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.77471888, + "num_input_tokens_seen": 151194930, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.22229004, + "step": 7042, + "time_per_iteration": 3.06581711769104 + }, + { + "auxiliary_loss_clip": 0.0145834, + "auxiliary_loss_mlp": 0.01040946, + "balance_loss_clip": 1.28024983, + "balance_loss_mlp": 1.01953626, + "epoch": 0.42344806854050804, + "flos": 25240054164480.0, + "grad_norm": 1.4245072379942034, + "language_loss": 0.82359463, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.84858751, + "num_input_tokens_seen": 151217905, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.21398926, + "step": 7043, + "time_per_iteration": 2.897966146469116 + }, + { + "auxiliary_loss_clip": 0.01468253, + "auxiliary_loss_mlp": 0.01042565, + "balance_loss_clip": 1.28841209, + "balance_loss_mlp": 1.01989102, + "epoch": 0.423508191793176, + "flos": 22787769237120.0, + "grad_norm": 2.351533716102935, + "language_loss": 0.65819108, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.6832993, + "num_input_tokens_seen": 151234580, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.22680664, + "step": 7044, + "time_per_iteration": 2.874068260192871 + }, + { + "auxiliary_loss_clip": 0.01494059, + "auxiliary_loss_mlp": 0.01044749, + "balance_loss_clip": 1.30813003, + "balance_loss_mlp": 1.02097845, + "epoch": 0.42356831504584397, + "flos": 34649726382720.0, + "grad_norm": 1.7147354669374402, + "language_loss": 0.75606638, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.7814545, + "num_input_tokens_seen": 151254765, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.23803711, + "step": 7045, + "time_per_iteration": 2.9598186016082764 + }, + { + "auxiliary_loss_clip": 0.01460399, + "auxiliary_loss_mlp": 0.01038161, + "balance_loss_clip": 1.2821368, + "balance_loss_mlp": 1.0165962, + "epoch": 0.42362843829851193, + "flos": 17604092338560.0, + "grad_norm": 2.3333162447133016, + "language_loss": 0.81399447, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.83898008, + "num_input_tokens_seen": 151269045, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.21582031, + "step": 7046, + "time_per_iteration": 4.205596923828125 + }, + { + "auxiliary_loss_clip": 0.01471902, + "auxiliary_loss_mlp": 0.01039421, + "balance_loss_clip": 1.2881422, + "balance_loss_mlp": 1.01661611, + "epoch": 0.4236885615511799, + "flos": 22576086224640.0, + "grad_norm": 1.707772585384806, + "language_loss": 0.7761538, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.80126703, + "num_input_tokens_seen": 151287530, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.22814941, + "step": 7047, + "time_per_iteration": 4.248044013977051 + }, + { + "auxiliary_loss_clip": 0.01459142, + "auxiliary_loss_mlp": 0.010374, + "balance_loss_clip": 1.28160167, + "balance_loss_mlp": 1.01565588, + "epoch": 0.42374868480384786, + "flos": 26480507616000.0, + "grad_norm": 1.8804945001585132, + "language_loss": 0.68869686, + "learning_rate": 2.582444180141098e-06, + "loss": 0.71366221, + "num_input_tokens_seen": 151308905, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.21728516, + "step": 7048, + "time_per_iteration": 2.9048032760620117 + }, + { + "auxiliary_loss_clip": 0.01481243, + "auxiliary_loss_mlp": 0.01035475, + "balance_loss_clip": 1.29727268, + "balance_loss_mlp": 1.01190722, + "epoch": 0.4238088080565159, + "flos": 20378403642240.0, + "grad_norm": 1.9349392738265394, + "language_loss": 0.78834534, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.8135125, + "num_input_tokens_seen": 151326525, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.23571777, + "step": 7049, + "time_per_iteration": 4.268215894699097 + }, + { + "auxiliary_loss_clip": 0.01472598, + "auxiliary_loss_mlp": 0.01042061, + "balance_loss_clip": 1.28891802, + "balance_loss_mlp": 1.01912475, + "epoch": 0.42386893130918385, + "flos": 21180288816000.0, + "grad_norm": 2.2114978032616883, + "language_loss": 0.83840024, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.86354679, + "num_input_tokens_seen": 151344675, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.22924805, + "step": 7050, + "time_per_iteration": 2.8113133907318115 + }, + { + "auxiliary_loss_clip": 0.01468392, + "auxiliary_loss_mlp": 0.01034463, + "balance_loss_clip": 1.28779149, + "balance_loss_mlp": 1.0120163, + "epoch": 0.4239290545618518, + "flos": 17684501627520.0, + "grad_norm": 2.0907358284418827, + "language_loss": 0.74133122, + "learning_rate": 2.581326338868687e-06, + "loss": 0.76635975, + "num_input_tokens_seen": 151360730, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.22436523, + "step": 7051, + "time_per_iteration": 2.807770013809204 + }, + { + "auxiliary_loss_clip": 0.01471982, + "auxiliary_loss_mlp": 0.01035548, + "balance_loss_clip": 1.29141307, + "balance_loss_mlp": 1.01270771, + "epoch": 0.4239891778145198, + "flos": 24324703735680.0, + "grad_norm": 2.1156588994241927, + "language_loss": 0.8685503, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.89362562, + "num_input_tokens_seen": 151380445, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.22851562, + "step": 7052, + "time_per_iteration": 2.8544981479644775 + }, + { + "auxiliary_loss_clip": 0.01471036, + "auxiliary_loss_mlp": 0.01041357, + "balance_loss_clip": 1.28795481, + "balance_loss_mlp": 1.01829004, + "epoch": 0.42404930106718774, + "flos": 20568160909440.0, + "grad_norm": 1.4240513186670427, + "language_loss": 0.73340666, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.75853062, + "num_input_tokens_seen": 151399325, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.23071289, + "step": 7053, + "time_per_iteration": 2.8438780307769775 + }, + { + "auxiliary_loss_clip": 0.01467737, + "auxiliary_loss_mlp": 0.01037659, + "balance_loss_clip": 1.28689921, + "balance_loss_mlp": 1.01449656, + "epoch": 0.4241094243198557, + "flos": 22317819050880.0, + "grad_norm": 2.106624462462933, + "language_loss": 0.83029902, + "learning_rate": 2.580208299200704e-06, + "loss": 0.85535294, + "num_input_tokens_seen": 151417240, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.23156738, + "step": 7054, + "time_per_iteration": 2.8378076553344727 + }, + { + "auxiliary_loss_clip": 0.01251872, + "auxiliary_loss_mlp": 0.01029274, + "balance_loss_clip": 1.13854742, + "balance_loss_mlp": 1.00180852, + "epoch": 0.4241695475725237, + "flos": 70643480843520.0, + "grad_norm": 0.7922161520724251, + "language_loss": 0.60566294, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.62847435, + "num_input_tokens_seen": 151476015, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.27539062, + "step": 7055, + "time_per_iteration": 3.304468870162964 + }, + { + "auxiliary_loss_clip": 0.0147128, + "auxiliary_loss_mlp": 0.01039026, + "balance_loss_clip": 1.28785634, + "balance_loss_mlp": 1.01510096, + "epoch": 0.42422967082519164, + "flos": 14035496987520.0, + "grad_norm": 2.3412738264596022, + "language_loss": 0.77340734, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.79851037, + "num_input_tokens_seen": 151492035, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.23937988, + "step": 7056, + "time_per_iteration": 2.8280317783355713 + }, + { + "auxiliary_loss_clip": 0.01478435, + "auxiliary_loss_mlp": 0.01041031, + "balance_loss_clip": 1.29188156, + "balance_loss_mlp": 1.01726127, + "epoch": 0.4242897940778596, + "flos": 22355535231360.0, + "grad_norm": 1.8557269389019724, + "language_loss": 0.84869325, + "learning_rate": 2.579090061518714e-06, + "loss": 0.8738879, + "num_input_tokens_seen": 151508970, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.23779297, + "step": 7057, + "time_per_iteration": 2.8505892753601074 + }, + { + "auxiliary_loss_clip": 0.01475061, + "auxiliary_loss_mlp": 0.01036488, + "balance_loss_clip": 1.29005933, + "balance_loss_mlp": 1.01355267, + "epoch": 0.42434991733052757, + "flos": 22604617710720.0, + "grad_norm": 2.656094765801848, + "language_loss": 0.83719397, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.86230946, + "num_input_tokens_seen": 151525295, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.22937012, + "step": 7058, + "time_per_iteration": 2.8491175174713135 + }, + { + "auxiliary_loss_clip": 0.01445298, + "auxiliary_loss_mlp": 0.01036546, + "balance_loss_clip": 1.26852858, + "balance_loss_mlp": 1.01494563, + "epoch": 0.42441004058319554, + "flos": 20021104304640.0, + "grad_norm": 1.7190487857609122, + "language_loss": 0.81154054, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.83635902, + "num_input_tokens_seen": 151544435, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.21618652, + "step": 7059, + "time_per_iteration": 2.8908252716064453 + }, + { + "auxiliary_loss_clip": 0.014734, + "auxiliary_loss_mlp": 0.0104191, + "balance_loss_clip": 1.28856611, + "balance_loss_mlp": 1.01898623, + "epoch": 0.4244701638358635, + "flos": 11152244908800.0, + "grad_norm": 14.270175961006737, + "language_loss": 0.70953333, + "learning_rate": 2.57797162620435e-06, + "loss": 0.73468643, + "num_input_tokens_seen": 151559520, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.22912598, + "step": 7060, + "time_per_iteration": 2.822981834411621 + }, + { + "auxiliary_loss_clip": 0.01476756, + "auxiliary_loss_mlp": 0.0103776, + "balance_loss_clip": 1.29251516, + "balance_loss_mlp": 1.01539683, + "epoch": 0.42453028708853147, + "flos": 23998288613760.0, + "grad_norm": 1.6085195089812157, + "language_loss": 0.76445198, + "learning_rate": 2.577598770580562e-06, + "loss": 0.78959709, + "num_input_tokens_seen": 151579790, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.22338867, + "step": 7061, + "time_per_iteration": 2.8544015884399414 + }, + { + "auxiliary_loss_clip": 0.01477706, + "auxiliary_loss_mlp": 0.01040714, + "balance_loss_clip": 1.29438961, + "balance_loss_mlp": 1.01686025, + "epoch": 0.42459041034119943, + "flos": 18415659899520.0, + "grad_norm": 2.0292926244870015, + "language_loss": 0.74150854, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.76669276, + "num_input_tokens_seen": 151598285, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.23864746, + "step": 7062, + "time_per_iteration": 2.822336196899414 + }, + { + "auxiliary_loss_clip": 0.0146176, + "auxiliary_loss_mlp": 0.01044475, + "balance_loss_clip": 1.27942777, + "balance_loss_mlp": 1.02249336, + "epoch": 0.42465053359386745, + "flos": 20967384193920.0, + "grad_norm": 1.6569015020194375, + "language_loss": 0.67177546, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.69683778, + "num_input_tokens_seen": 151615430, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.21972656, + "step": 7063, + "time_per_iteration": 2.825423002243042 + }, + { + "auxiliary_loss_clip": 0.01457156, + "auxiliary_loss_mlp": 0.01038561, + "balance_loss_clip": 1.28017068, + "balance_loss_mlp": 1.0166502, + "epoch": 0.4247106568465354, + "flos": 33118311749760.0, + "grad_norm": 1.6302609094980036, + "language_loss": 0.79077524, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.81573236, + "num_input_tokens_seen": 151637030, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.21911621, + "step": 7064, + "time_per_iteration": 2.9116926193237305 + }, + { + "auxiliary_loss_clip": 0.01481129, + "auxiliary_loss_mlp": 0.01036149, + "balance_loss_clip": 1.29585266, + "balance_loss_mlp": 1.01339173, + "epoch": 0.4247707800992034, + "flos": 20056332021120.0, + "grad_norm": 1.946044571825641, + "language_loss": 0.76434374, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.78951645, + "num_input_tokens_seen": 151655745, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.22741699, + "step": 7065, + "time_per_iteration": 2.8624989986419678 + }, + { + "auxiliary_loss_clip": 0.01478094, + "auxiliary_loss_mlp": 0.01040103, + "balance_loss_clip": 1.29725456, + "balance_loss_mlp": 1.01722646, + "epoch": 0.42483090335187135, + "flos": 22395966099840.0, + "grad_norm": 1.6431681808649956, + "language_loss": 0.72996271, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.75514472, + "num_input_tokens_seen": 151678040, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.22888184, + "step": 7066, + "time_per_iteration": 2.889441967010498 + }, + { + "auxiliary_loss_clip": 0.01474853, + "auxiliary_loss_mlp": 0.01040096, + "balance_loss_clip": 1.28954279, + "balance_loss_mlp": 1.01561093, + "epoch": 0.4248910266045393, + "flos": 21365657337600.0, + "grad_norm": 2.2401426937799434, + "language_loss": 0.80679452, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.83194405, + "num_input_tokens_seen": 151696410, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.24475098, + "step": 7067, + "time_per_iteration": 2.8291900157928467 + }, + { + "auxiliary_loss_clip": 0.01243727, + "auxiliary_loss_mlp": 0.01037827, + "balance_loss_clip": 1.137308, + "balance_loss_mlp": 1.0034945, + "epoch": 0.4249511498572073, + "flos": 64037330087040.0, + "grad_norm": 0.9128119802923499, + "language_loss": 0.63540232, + "learning_rate": 2.574988168733022e-06, + "loss": 0.65821785, + "num_input_tokens_seen": 151756365, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.34375, + "step": 7068, + "time_per_iteration": 3.2698163986206055 + }, + { + "auxiliary_loss_clip": 0.01474036, + "auxiliary_loss_mlp": 0.01038896, + "balance_loss_clip": 1.29031348, + "balance_loss_mlp": 1.0149821, + "epoch": 0.42501127310987524, + "flos": 19615818216960.0, + "grad_norm": 1.6712598376359682, + "language_loss": 0.72958124, + "learning_rate": 2.574615138284361e-06, + "loss": 0.75471056, + "num_input_tokens_seen": 151775165, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.23913574, + "step": 7069, + "time_per_iteration": 2.837414503097534 + }, + { + "auxiliary_loss_clip": 0.01474219, + "auxiliary_loss_mlp": 0.01037251, + "balance_loss_clip": 1.28884935, + "balance_loss_mlp": 1.01425552, + "epoch": 0.4250713963625432, + "flos": 19471378256640.0, + "grad_norm": 2.3883478489198278, + "language_loss": 0.79959911, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.82471383, + "num_input_tokens_seen": 151792620, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.2298584, + "step": 7070, + "time_per_iteration": 2.87648868560791 + }, + { + "auxiliary_loss_clip": 0.01470175, + "auxiliary_loss_mlp": 0.01038631, + "balance_loss_clip": 1.2881341, + "balance_loss_mlp": 1.01531327, + "epoch": 0.4251315196152112, + "flos": 25348587736320.0, + "grad_norm": 1.757575169507403, + "language_loss": 0.708009, + "learning_rate": 2.573869012032795e-06, + "loss": 0.73309708, + "num_input_tokens_seen": 151812850, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.23327637, + "step": 7071, + "time_per_iteration": 2.989698886871338 + }, + { + "auxiliary_loss_clip": 0.01467, + "auxiliary_loss_mlp": 0.0103679, + "balance_loss_clip": 1.28439081, + "balance_loss_mlp": 1.01505816, + "epoch": 0.42519164286787914, + "flos": 26370299986560.0, + "grad_norm": 3.079234544391467, + "language_loss": 0.72624779, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.75128567, + "num_input_tokens_seen": 151831785, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.21740723, + "step": 7072, + "time_per_iteration": 2.8998169898986816 + }, + { + "auxiliary_loss_clip": 0.0147643, + "auxiliary_loss_mlp": 0.01041417, + "balance_loss_clip": 1.28997517, + "balance_loss_mlp": 1.01851654, + "epoch": 0.4252517661205471, + "flos": 26042165562240.0, + "grad_norm": 1.801806560297093, + "language_loss": 0.82587892, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.85105741, + "num_input_tokens_seen": 151853885, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.22888184, + "step": 7073, + "time_per_iteration": 4.302968263626099 + }, + { + "auxiliary_loss_clip": 0.01462803, + "auxiliary_loss_mlp": 0.01041189, + "balance_loss_clip": 1.28333497, + "balance_loss_mlp": 1.01921892, + "epoch": 0.42531188937321507, + "flos": 12721239987840.0, + "grad_norm": 2.526784660928835, + "language_loss": 0.92226684, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.94730675, + "num_input_tokens_seen": 151871780, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.21972656, + "step": 7074, + "time_per_iteration": 2.8009183406829834 + }, + { + "auxiliary_loss_clip": 0.01474724, + "auxiliary_loss_mlp": 0.01043956, + "balance_loss_clip": 1.28715611, + "balance_loss_mlp": 1.01989985, + "epoch": 0.42537201262588303, + "flos": 22102018761600.0, + "grad_norm": 1.664867803326938, + "language_loss": 0.65047055, + "learning_rate": 2.572376498508805e-06, + "loss": 0.67565733, + "num_input_tokens_seen": 151891600, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.24047852, + "step": 7075, + "time_per_iteration": 2.84133243560791 + }, + { + "auxiliary_loss_clip": 0.01461465, + "auxiliary_loss_mlp": 0.01041701, + "balance_loss_clip": 1.28327394, + "balance_loss_mlp": 1.02024293, + "epoch": 0.42543213587855105, + "flos": 23013523382400.0, + "grad_norm": 1.836824499285344, + "language_loss": 0.7511422, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.77617383, + "num_input_tokens_seen": 151911330, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.21459961, + "step": 7076, + "time_per_iteration": 2.844604969024658 + }, + { + "auxiliary_loss_clip": 0.0148134, + "auxiliary_loss_mlp": 0.01043813, + "balance_loss_clip": 1.29581809, + "balance_loss_mlp": 1.02133036, + "epoch": 0.425492259131219, + "flos": 25093306719360.0, + "grad_norm": 2.1293913860918527, + "language_loss": 0.79453564, + "learning_rate": 2.571630111462766e-06, + "loss": 0.81978714, + "num_input_tokens_seen": 151930355, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.22473145, + "step": 7077, + "time_per_iteration": 2.9756767749786377 + }, + { + "auxiliary_loss_clip": 0.01457901, + "auxiliary_loss_mlp": 0.01037841, + "balance_loss_clip": 1.2822144, + "balance_loss_mlp": 1.01665723, + "epoch": 0.425552382383887, + "flos": 22826209334400.0, + "grad_norm": 1.6854221612056146, + "language_loss": 0.73800761, + "learning_rate": 2.571256885418265e-06, + "loss": 0.76296496, + "num_input_tokens_seen": 151949695, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.21191406, + "step": 7078, + "time_per_iteration": 2.9292047023773193 + }, + { + "auxiliary_loss_clip": 0.01463414, + "auxiliary_loss_mlp": 0.01045266, + "balance_loss_clip": 1.28474426, + "balance_loss_mlp": 1.02244973, + "epoch": 0.42561250563655495, + "flos": 13561610503680.0, + "grad_norm": 1.6984132203027067, + "language_loss": 0.81002343, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.83511019, + "num_input_tokens_seen": 151967640, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.22802734, + "step": 7079, + "time_per_iteration": 2.7873973846435547 + }, + { + "auxiliary_loss_clip": 0.0146893, + "auxiliary_loss_mlp": 0.01043286, + "balance_loss_clip": 1.28887808, + "balance_loss_mlp": 1.0218997, + "epoch": 0.4256726288892229, + "flos": 46995976316160.0, + "grad_norm": 1.509967960694907, + "language_loss": 0.72754961, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.75267172, + "num_input_tokens_seen": 151994020, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.21386719, + "step": 7080, + "time_per_iteration": 3.0949535369873047 + }, + { + "auxiliary_loss_clip": 0.01461318, + "auxiliary_loss_mlp": 0.01042871, + "balance_loss_clip": 1.28008914, + "balance_loss_mlp": 1.02092409, + "epoch": 0.4257327521418909, + "flos": 23596893578880.0, + "grad_norm": 2.7475255756402905, + "language_loss": 0.81404763, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.83908951, + "num_input_tokens_seen": 152013415, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.21948242, + "step": 7081, + "time_per_iteration": 4.316623210906982 + }, + { + "auxiliary_loss_clip": 0.01458734, + "auxiliary_loss_mlp": 0.01047012, + "balance_loss_clip": 1.28271258, + "balance_loss_mlp": 1.0245527, + "epoch": 0.42579287539455885, + "flos": 18999437299200.0, + "grad_norm": 1.6086405719613108, + "language_loss": 0.81806117, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.84311861, + "num_input_tokens_seen": 152030860, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.22460938, + "step": 7082, + "time_per_iteration": 4.196643352508545 + }, + { + "auxiliary_loss_clip": 0.01473346, + "auxiliary_loss_mlp": 0.01046157, + "balance_loss_clip": 1.29193783, + "balance_loss_mlp": 1.02444851, + "epoch": 0.4258529986472268, + "flos": 25202790432000.0, + "grad_norm": 1.7702021216620596, + "language_loss": 0.70296693, + "learning_rate": 2.569390430547065e-06, + "loss": 0.72816199, + "num_input_tokens_seen": 152050395, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.21704102, + "step": 7083, + "time_per_iteration": 2.8531947135925293 + }, + { + "auxiliary_loss_clip": 0.01243944, + "auxiliary_loss_mlp": 0.01045938, + "balance_loss_clip": 1.13867402, + "balance_loss_mlp": 1.01961637, + "epoch": 0.4259131218998948, + "flos": 70002414247680.0, + "grad_norm": 0.8987582894866127, + "language_loss": 0.67095327, + "learning_rate": 2.569017074742173e-06, + "loss": 0.69385207, + "num_input_tokens_seen": 152113555, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.26367188, + "step": 7084, + "time_per_iteration": 4.859258651733398 + }, + { + "auxiliary_loss_clip": 0.01467491, + "auxiliary_loss_mlp": 0.01049991, + "balance_loss_clip": 1.28674662, + "balance_loss_mlp": 1.02636373, + "epoch": 0.42597324515256274, + "flos": 18014174375040.0, + "grad_norm": 2.073264252772804, + "language_loss": 0.79048955, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.81566429, + "num_input_tokens_seen": 152131575, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.23620605, + "step": 7085, + "time_per_iteration": 2.827221393585205 + }, + { + "auxiliary_loss_clip": 0.01495705, + "auxiliary_loss_mlp": 0.01056485, + "balance_loss_clip": 1.30613577, + "balance_loss_mlp": 1.03285789, + "epoch": 0.4260333684052307, + "flos": 15167190643200.0, + "grad_norm": 2.438441834141738, + "language_loss": 0.77774489, + "learning_rate": 2.568270298414995e-06, + "loss": 0.80326676, + "num_input_tokens_seen": 152149435, + "router_z_loss_clip": 1.89550781, + "router_z_loss_mlp": 0.23632812, + "step": 7086, + "time_per_iteration": 2.7935128211975098 + }, + { + "auxiliary_loss_clip": 0.01473715, + "auxiliary_loss_mlp": 0.01042254, + "balance_loss_clip": 1.29409981, + "balance_loss_mlp": 1.02034283, + "epoch": 0.42609349165789867, + "flos": 14947544545920.0, + "grad_norm": 1.8130101783300696, + "language_loss": 0.81090742, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.83606708, + "num_input_tokens_seen": 152166860, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.21911621, + "step": 7087, + "time_per_iteration": 2.8359174728393555 + }, + { + "auxiliary_loss_clip": 0.01477646, + "auxiliary_loss_mlp": 0.01044611, + "balance_loss_clip": 1.29562593, + "balance_loss_mlp": 1.02150822, + "epoch": 0.42615361491056664, + "flos": 23742328924800.0, + "grad_norm": 1.6207979807323898, + "language_loss": 0.66610831, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.69133091, + "num_input_tokens_seen": 152187475, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.2310791, + "step": 7088, + "time_per_iteration": 2.8723244667053223 + }, + { + "auxiliary_loss_clip": 0.01494108, + "auxiliary_loss_mlp": 0.01050189, + "balance_loss_clip": 1.30900097, + "balance_loss_mlp": 1.02768207, + "epoch": 0.42621373816323466, + "flos": 24947237946240.0, + "grad_norm": 2.1818213667225255, + "language_loss": 0.69553363, + "learning_rate": 2.56714997234313e-06, + "loss": 0.72097659, + "num_input_tokens_seen": 152207235, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.22509766, + "step": 7089, + "time_per_iteration": 2.9019615650177 + }, + { + "auxiliary_loss_clip": 0.0148952, + "auxiliary_loss_mlp": 0.01042486, + "balance_loss_clip": 1.30398643, + "balance_loss_mlp": 1.02133834, + "epoch": 0.4262738614159026, + "flos": 13560841342080.0, + "grad_norm": 3.9077710905916443, + "language_loss": 0.7450248, + "learning_rate": 2.566776487287525e-06, + "loss": 0.77034485, + "num_input_tokens_seen": 152224240, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.21142578, + "step": 7090, + "time_per_iteration": 2.7999441623687744 + }, + { + "auxiliary_loss_clip": 0.01491988, + "auxiliary_loss_mlp": 0.01048336, + "balance_loss_clip": 1.30647242, + "balance_loss_mlp": 1.02643681, + "epoch": 0.4263339846685706, + "flos": 29759861088000.0, + "grad_norm": 2.24574039437063, + "language_loss": 0.75631404, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.7817173, + "num_input_tokens_seen": 152242595, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.21911621, + "step": 7091, + "time_per_iteration": 2.8859994411468506 + }, + { + "auxiliary_loss_clip": 0.01459933, + "auxiliary_loss_mlp": 0.01039425, + "balance_loss_clip": 1.28428388, + "balance_loss_mlp": 1.01845634, + "epoch": 0.42639410792123855, + "flos": 16842276074880.0, + "grad_norm": 1.8615750119415506, + "language_loss": 0.83412373, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.85911733, + "num_input_tokens_seen": 152260840, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.2097168, + "step": 7092, + "time_per_iteration": 2.8245270252227783 + }, + { + "auxiliary_loss_clip": 0.0150104, + "auxiliary_loss_mlp": 0.01044675, + "balance_loss_clip": 1.31262004, + "balance_loss_mlp": 1.02226329, + "epoch": 0.4264542311739065, + "flos": 28773783757440.0, + "grad_norm": 1.5284073933417415, + "language_loss": 0.74306208, + "learning_rate": 2.565655903224038e-06, + "loss": 0.76851928, + "num_input_tokens_seen": 152280580, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.22412109, + "step": 7093, + "time_per_iteration": 2.867769241333008 + }, + { + "auxiliary_loss_clip": 0.01482669, + "auxiliary_loss_mlp": 0.01037738, + "balance_loss_clip": 1.30091023, + "balance_loss_mlp": 1.01549339, + "epoch": 0.4265143544265745, + "flos": 24723338837760.0, + "grad_norm": 2.202005778297535, + "language_loss": 0.71069527, + "learning_rate": 2.565282332284532e-06, + "loss": 0.73589933, + "num_input_tokens_seen": 152298455, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.22241211, + "step": 7094, + "time_per_iteration": 2.866036891937256 + }, + { + "auxiliary_loss_clip": 0.01493678, + "auxiliary_loss_mlp": 0.01044628, + "balance_loss_clip": 1.31031775, + "balance_loss_mlp": 1.02060771, + "epoch": 0.42657447767924245, + "flos": 21874500069120.0, + "grad_norm": 1.972071129020264, + "language_loss": 0.82513475, + "learning_rate": 2.564908739909464e-06, + "loss": 0.85051787, + "num_input_tokens_seen": 152316995, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.24047852, + "step": 7095, + "time_per_iteration": 2.799010992050171 + }, + { + "auxiliary_loss_clip": 0.01498013, + "auxiliary_loss_mlp": 0.01047034, + "balance_loss_clip": 1.31170344, + "balance_loss_mlp": 1.02357328, + "epoch": 0.4266346009319104, + "flos": 21480117978240.0, + "grad_norm": 1.736118968724066, + "language_loss": 0.81419003, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.8396405, + "num_input_tokens_seen": 152334800, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.23461914, + "step": 7096, + "time_per_iteration": 2.941101551055908 + }, + { + "auxiliary_loss_clip": 0.01501692, + "auxiliary_loss_mlp": 0.01040685, + "balance_loss_clip": 1.31312776, + "balance_loss_mlp": 1.0183568, + "epoch": 0.4266947241845784, + "flos": 25530110449920.0, + "grad_norm": 1.9558958838763105, + "language_loss": 0.66535032, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.69077408, + "num_input_tokens_seen": 152355175, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.2232666, + "step": 7097, + "time_per_iteration": 2.8706889152526855 + }, + { + "auxiliary_loss_clip": 0.01480272, + "auxiliary_loss_mlp": 0.0103301, + "balance_loss_clip": 1.29937196, + "balance_loss_mlp": 1.01089644, + "epoch": 0.42675484743724634, + "flos": 26552275148160.0, + "grad_norm": 1.7336567320563723, + "language_loss": 0.74958587, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.7747187, + "num_input_tokens_seen": 152377245, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.22094727, + "step": 7098, + "time_per_iteration": 2.888233184814453 + }, + { + "auxiliary_loss_clip": 0.01467765, + "auxiliary_loss_mlp": 0.01040603, + "balance_loss_clip": 1.28886056, + "balance_loss_mlp": 1.01822782, + "epoch": 0.4268149706899143, + "flos": 23123188074240.0, + "grad_norm": 1.626764291897448, + "language_loss": 0.75957477, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.78465849, + "num_input_tokens_seen": 152396985, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.22387695, + "step": 7099, + "time_per_iteration": 2.8786673545837402 + }, + { + "auxiliary_loss_clip": 0.01494132, + "auxiliary_loss_mlp": 0.01038632, + "balance_loss_clip": 1.30865347, + "balance_loss_mlp": 1.01583898, + "epoch": 0.4268750939425823, + "flos": 22715639746560.0, + "grad_norm": 2.0155925480813606, + "language_loss": 0.83379269, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.85912037, + "num_input_tokens_seen": 152415590, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.2277832, + "step": 7100, + "time_per_iteration": 2.825925827026367 + }, + { + "auxiliary_loss_clip": 0.01484212, + "auxiliary_loss_mlp": 0.01039347, + "balance_loss_clip": 1.30007625, + "balance_loss_mlp": 1.01651835, + "epoch": 0.42693521719525024, + "flos": 25385941958400.0, + "grad_norm": 1.38420099557558, + "language_loss": 0.82477719, + "learning_rate": 2.562666736305627e-06, + "loss": 0.85001278, + "num_input_tokens_seen": 152436735, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.22814941, + "step": 7101, + "time_per_iteration": 2.8799118995666504 + }, + { + "auxiliary_loss_clip": 0.0149098, + "auxiliary_loss_mlp": 0.01037449, + "balance_loss_clip": 1.30534983, + "balance_loss_mlp": 1.01379836, + "epoch": 0.42699534044791826, + "flos": 18159881189760.0, + "grad_norm": 2.4175712516481362, + "language_loss": 0.73272902, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.75801331, + "num_input_tokens_seen": 152455685, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.2364502, + "step": 7102, + "time_per_iteration": 2.828179121017456 + }, + { + "auxiliary_loss_clip": 0.01469827, + "auxiliary_loss_mlp": 0.01036653, + "balance_loss_clip": 1.29176378, + "balance_loss_mlp": 1.01471829, + "epoch": 0.4270554637005862, + "flos": 13706186198400.0, + "grad_norm": 2.173044595388497, + "language_loss": 0.83969367, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.86475849, + "num_input_tokens_seen": 152473500, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.21936035, + "step": 7103, + "time_per_iteration": 2.798736810684204 + }, + { + "auxiliary_loss_clip": 0.01495216, + "auxiliary_loss_mlp": 0.01036049, + "balance_loss_clip": 1.30886197, + "balance_loss_mlp": 1.01348233, + "epoch": 0.4271155869532542, + "flos": 17502526465920.0, + "grad_norm": 2.0836487852502095, + "language_loss": 0.75479543, + "learning_rate": 2.561545446271294e-06, + "loss": 0.78010803, + "num_input_tokens_seen": 152491320, + "router_z_loss_clip": 1.86328125, + "router_z_loss_mlp": 0.22583008, + "step": 7104, + "time_per_iteration": 2.8120079040527344 + }, + { + "auxiliary_loss_clip": 0.01474399, + "auxiliary_loss_mlp": 0.01037463, + "balance_loss_clip": 1.29226017, + "balance_loss_mlp": 1.01528978, + "epoch": 0.42717571020592215, + "flos": 32464983813120.0, + "grad_norm": 2.5076280588491175, + "language_loss": 0.76446128, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.78957987, + "num_input_tokens_seen": 152511970, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.22155762, + "step": 7105, + "time_per_iteration": 2.8916070461273193 + }, + { + "auxiliary_loss_clip": 0.01502075, + "auxiliary_loss_mlp": 0.01034304, + "balance_loss_clip": 1.31800914, + "balance_loss_mlp": 1.01291788, + "epoch": 0.4272358334585901, + "flos": 16261665811200.0, + "grad_norm": 2.07420057156611, + "language_loss": 0.79114771, + "learning_rate": 2.560797813088819e-06, + "loss": 0.81651151, + "num_input_tokens_seen": 152530515, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.21374512, + "step": 7106, + "time_per_iteration": 2.792191505432129 + }, + { + "auxiliary_loss_clip": 0.0147742, + "auxiliary_loss_mlp": 0.01039423, + "balance_loss_clip": 1.29602563, + "balance_loss_mlp": 1.0169158, + "epoch": 0.4272959567112581, + "flos": 24208976240640.0, + "grad_norm": 2.7794575296211894, + "language_loss": 0.81298923, + "learning_rate": 2.560423964592229e-06, + "loss": 0.83815765, + "num_input_tokens_seen": 152549295, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.22521973, + "step": 7107, + "time_per_iteration": 2.8419156074523926 + }, + { + "auxiliary_loss_clip": 0.01475607, + "auxiliary_loss_mlp": 0.01036427, + "balance_loss_clip": 1.29452145, + "balance_loss_mlp": 1.01402736, + "epoch": 0.42735607996392605, + "flos": 27974749006080.0, + "grad_norm": 1.5731997885386806, + "language_loss": 0.68677485, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.71189523, + "num_input_tokens_seen": 152570725, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.22399902, + "step": 7108, + "time_per_iteration": 4.306857109069824 + }, + { + "auxiliary_loss_clip": 0.01482864, + "auxiliary_loss_mlp": 0.01041066, + "balance_loss_clip": 1.30096769, + "balance_loss_mlp": 1.01913142, + "epoch": 0.427416203216594, + "flos": 20303333239680.0, + "grad_norm": 1.8770057054858285, + "language_loss": 0.72345471, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.748694, + "num_input_tokens_seen": 152588950, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.21923828, + "step": 7109, + "time_per_iteration": 2.880417823791504 + }, + { + "auxiliary_loss_clip": 0.01478671, + "auxiliary_loss_mlp": 0.01038646, + "balance_loss_clip": 1.29602146, + "balance_loss_mlp": 1.01516223, + "epoch": 0.427476326469262, + "flos": 26955479975040.0, + "grad_norm": 1.7429405070441886, + "language_loss": 0.65401232, + "learning_rate": 2.559302291651174e-06, + "loss": 0.67918551, + "num_input_tokens_seen": 152608965, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.23474121, + "step": 7110, + "time_per_iteration": 2.881152629852295 + }, + { + "auxiliary_loss_clip": 0.01478838, + "auxiliary_loss_mlp": 0.01037728, + "balance_loss_clip": 1.29659534, + "balance_loss_mlp": 1.01474428, + "epoch": 0.42753644972192995, + "flos": 25713895403520.0, + "grad_norm": 1.7312647067920897, + "language_loss": 0.77229345, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.79745913, + "num_input_tokens_seen": 152630220, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.22961426, + "step": 7111, + "time_per_iteration": 2.877923011779785 + }, + { + "auxiliary_loss_clip": 0.014828, + "auxiliary_loss_mlp": 0.01038356, + "balance_loss_clip": 1.2995224, + "balance_loss_mlp": 1.0164094, + "epoch": 0.4275965729745979, + "flos": 18775854904320.0, + "grad_norm": 1.7590635471110536, + "language_loss": 0.74340713, + "learning_rate": 2.558554403622845e-06, + "loss": 0.76861864, + "num_input_tokens_seen": 152648835, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.21948242, + "step": 7112, + "time_per_iteration": 2.8184444904327393 + }, + { + "auxiliary_loss_clip": 0.01465311, + "auxiliary_loss_mlp": 0.01035682, + "balance_loss_clip": 1.28628302, + "balance_loss_mlp": 1.01390254, + "epoch": 0.4276566962272659, + "flos": 23773756078080.0, + "grad_norm": 1.6137174139734443, + "language_loss": 0.72004783, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.74505776, + "num_input_tokens_seen": 152668375, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.21777344, + "step": 7113, + "time_per_iteration": 2.9155595302581787 + }, + { + "auxiliary_loss_clip": 0.01488841, + "auxiliary_loss_mlp": 0.0103994, + "balance_loss_clip": 1.30405569, + "balance_loss_mlp": 1.01776743, + "epoch": 0.42771681947993384, + "flos": 22502825614080.0, + "grad_norm": 1.5418724869165017, + "language_loss": 0.61934996, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.64463782, + "num_input_tokens_seen": 152689725, + "router_z_loss_clip": 1.84863281, + "router_z_loss_mlp": 0.22167969, + "step": 7114, + "time_per_iteration": 2.980602741241455 + }, + { + "auxiliary_loss_clip": 0.01492758, + "auxiliary_loss_mlp": 0.01041735, + "balance_loss_clip": 1.30317116, + "balance_loss_mlp": 1.01776218, + "epoch": 0.42777694273260186, + "flos": 25055228580480.0, + "grad_norm": 1.9043519815471994, + "language_loss": 0.65778196, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.68312687, + "num_input_tokens_seen": 152709375, + "router_z_loss_clip": 1.89453125, + "router_z_loss_mlp": 0.23986816, + "step": 7115, + "time_per_iteration": 4.265331745147705 + }, + { + "auxiliary_loss_clip": 0.01475668, + "auxiliary_loss_mlp": 0.01033401, + "balance_loss_clip": 1.29388404, + "balance_loss_mlp": 1.01162112, + "epoch": 0.4278370659852698, + "flos": 18670850426880.0, + "grad_norm": 1.6471052860346147, + "language_loss": 0.73823857, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.76332921, + "num_input_tokens_seen": 152727510, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.21777344, + "step": 7116, + "time_per_iteration": 2.8323047161102295 + }, + { + "auxiliary_loss_clip": 0.01455817, + "auxiliary_loss_mlp": 0.0103525, + "balance_loss_clip": 1.27829587, + "balance_loss_mlp": 1.01324427, + "epoch": 0.4278971892379378, + "flos": 27319747011840.0, + "grad_norm": 1.739070112535789, + "language_loss": 0.70102179, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.72593248, + "num_input_tokens_seen": 152746670, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.22009277, + "step": 7117, + "time_per_iteration": 4.315077066421509 + }, + { + "auxiliary_loss_clip": 0.0147499, + "auxiliary_loss_mlp": 0.01041741, + "balance_loss_clip": 1.29439211, + "balance_loss_mlp": 1.01880503, + "epoch": 0.42795731249060576, + "flos": 12894437658240.0, + "grad_norm": 2.7473168055614994, + "language_loss": 0.71541446, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.74058175, + "num_input_tokens_seen": 152760545, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.22937012, + "step": 7118, + "time_per_iteration": 4.215929985046387 + }, + { + "auxiliary_loss_clip": 0.01467739, + "auxiliary_loss_mlp": 0.01041512, + "balance_loss_clip": 1.28759253, + "balance_loss_mlp": 1.01887369, + "epoch": 0.4280174357432737, + "flos": 33414521328000.0, + "grad_norm": 1.8753448909109025, + "language_loss": 0.75403285, + "learning_rate": 2.55593612908444e-06, + "loss": 0.77912533, + "num_input_tokens_seen": 152780970, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.22631836, + "step": 7119, + "time_per_iteration": 2.9512226581573486 + }, + { + "auxiliary_loss_clip": 0.01462949, + "auxiliary_loss_mlp": 0.01033247, + "balance_loss_clip": 1.28370535, + "balance_loss_mlp": 1.01028776, + "epoch": 0.4280775589959417, + "flos": 18268052803200.0, + "grad_norm": 2.0033717169538625, + "language_loss": 0.74837142, + "learning_rate": 2.555562005426573e-06, + "loss": 0.77333337, + "num_input_tokens_seen": 152798475, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.22973633, + "step": 7120, + "time_per_iteration": 2.8159942626953125 + }, + { + "auxiliary_loss_clip": 0.01475702, + "auxiliary_loss_mlp": 0.01039756, + "balance_loss_clip": 1.29457951, + "balance_loss_mlp": 1.01688004, + "epoch": 0.42813768224860965, + "flos": 21481430077440.0, + "grad_norm": 1.9672518694679362, + "language_loss": 0.77832901, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.80348361, + "num_input_tokens_seen": 152817555, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.2286377, + "step": 7121, + "time_per_iteration": 2.8971588611602783 + }, + { + "auxiliary_loss_clip": 0.01450676, + "auxiliary_loss_mlp": 0.01039622, + "balance_loss_clip": 1.27390432, + "balance_loss_mlp": 1.01625741, + "epoch": 0.4281978055012776, + "flos": 15677707432320.0, + "grad_norm": 1.731754953481099, + "language_loss": 0.86312211, + "learning_rate": 2.554813694924126e-06, + "loss": 0.88802516, + "num_input_tokens_seen": 152836295, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.23364258, + "step": 7122, + "time_per_iteration": 2.8223047256469727 + }, + { + "auxiliary_loss_clip": 0.01462025, + "auxiliary_loss_mlp": 0.01037242, + "balance_loss_clip": 1.28247547, + "balance_loss_mlp": 1.01491368, + "epoch": 0.4282579287539456, + "flos": 17720860464000.0, + "grad_norm": 5.382276948508483, + "language_loss": 0.8200078, + "learning_rate": 2.554439508107921e-06, + "loss": 0.84500051, + "num_input_tokens_seen": 152854950, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.22338867, + "step": 7123, + "time_per_iteration": 2.8408985137939453 + }, + { + "auxiliary_loss_clip": 0.01459769, + "auxiliary_loss_mlp": 0.01035041, + "balance_loss_clip": 1.28149426, + "balance_loss_mlp": 1.01316607, + "epoch": 0.42831805200661355, + "flos": 19290488970240.0, + "grad_norm": 1.5818329185226463, + "language_loss": 0.81430256, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.83925062, + "num_input_tokens_seen": 152873995, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.21862793, + "step": 7124, + "time_per_iteration": 2.876183271408081 + }, + { + "auxiliary_loss_clip": 0.01461212, + "auxiliary_loss_mlp": 0.01042102, + "balance_loss_clip": 1.28026378, + "balance_loss_mlp": 1.01808143, + "epoch": 0.4283781752592815, + "flos": 19802272613760.0, + "grad_norm": 1.7350061051988042, + "language_loss": 0.80686855, + "learning_rate": 2.553691071416498e-06, + "loss": 0.83190167, + "num_input_tokens_seen": 152892925, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.2401123, + "step": 7125, + "time_per_iteration": 2.871673583984375 + }, + { + "auxiliary_loss_clip": 0.01448779, + "auxiliary_loss_mlp": 0.01037301, + "balance_loss_clip": 1.27112043, + "balance_loss_mlp": 1.01523519, + "epoch": 0.4284382985119495, + "flos": 16516630114560.0, + "grad_norm": 2.2128455848264434, + "language_loss": 0.75949377, + "learning_rate": 2.553316821569659e-06, + "loss": 0.78435457, + "num_input_tokens_seen": 152910935, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.22058105, + "step": 7126, + "time_per_iteration": 2.8393664360046387 + }, + { + "auxiliary_loss_clip": 0.01468541, + "auxiliary_loss_mlp": 0.01038388, + "balance_loss_clip": 1.2870419, + "balance_loss_mlp": 1.01557136, + "epoch": 0.42849842176461744, + "flos": 23341069624320.0, + "grad_norm": 1.630653779326275, + "language_loss": 0.82348311, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.84855235, + "num_input_tokens_seen": 152931030, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.22802734, + "step": 7127, + "time_per_iteration": 2.885897397994995 + }, + { + "auxiliary_loss_clip": 0.01467856, + "auxiliary_loss_mlp": 0.01040244, + "balance_loss_clip": 1.28433836, + "balance_loss_mlp": 1.01697433, + "epoch": 0.4285585450172854, + "flos": 17283242327040.0, + "grad_norm": 2.5322786049146337, + "language_loss": 0.76694316, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.79202414, + "num_input_tokens_seen": 152948085, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.23278809, + "step": 7128, + "time_per_iteration": 2.9343371391296387 + }, + { + "auxiliary_loss_clip": 0.01468104, + "auxiliary_loss_mlp": 0.01039169, + "balance_loss_clip": 1.28341174, + "balance_loss_mlp": 1.01616192, + "epoch": 0.42861866826995343, + "flos": 24290199936000.0, + "grad_norm": 2.5158771956410613, + "language_loss": 0.74472439, + "learning_rate": 2.552193946194937e-06, + "loss": 0.76979715, + "num_input_tokens_seen": 152966265, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.22998047, + "step": 7129, + "time_per_iteration": 2.8772947788238525 + }, + { + "auxiliary_loss_clip": 0.01462086, + "auxiliary_loss_mlp": 0.01036492, + "balance_loss_clip": 1.28205895, + "balance_loss_mlp": 1.01398551, + "epoch": 0.4286787915226214, + "flos": 24363867749760.0, + "grad_norm": 2.8738693730627185, + "language_loss": 0.78946614, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.81445193, + "num_input_tokens_seen": 152986775, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.22509766, + "step": 7130, + "time_per_iteration": 2.908754587173462 + }, + { + "auxiliary_loss_clip": 0.01476166, + "auxiliary_loss_mlp": 0.01042858, + "balance_loss_clip": 1.29233265, + "balance_loss_mlp": 1.01918304, + "epoch": 0.42873891477528936, + "flos": 15458287559040.0, + "grad_norm": 2.2713513156266187, + "language_loss": 0.74427652, + "learning_rate": 2.551445257891886e-06, + "loss": 0.76946676, + "num_input_tokens_seen": 153003595, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.23693848, + "step": 7131, + "time_per_iteration": 2.8172099590301514 + }, + { + "auxiliary_loss_clip": 0.01477301, + "auxiliary_loss_mlp": 0.01044301, + "balance_loss_clip": 1.29410875, + "balance_loss_mlp": 1.02147198, + "epoch": 0.4287990380279573, + "flos": 17648549994240.0, + "grad_norm": 2.217428585682831, + "language_loss": 0.78136009, + "learning_rate": 2.551070882366973e-06, + "loss": 0.80657613, + "num_input_tokens_seen": 153021960, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.22827148, + "step": 7132, + "time_per_iteration": 2.828075647354126 + }, + { + "auxiliary_loss_clip": 0.01472214, + "auxiliary_loss_mlp": 0.01044373, + "balance_loss_clip": 1.29017997, + "balance_loss_mlp": 1.02150846, + "epoch": 0.4288591612806253, + "flos": 27173768728320.0, + "grad_norm": 1.6607455826577207, + "language_loss": 0.79082, + "learning_rate": 2.550696485945397e-06, + "loss": 0.81598592, + "num_input_tokens_seen": 153042110, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.2286377, + "step": 7133, + "time_per_iteration": 2.8752410411834717 + }, + { + "auxiliary_loss_clip": 0.01476844, + "auxiliary_loss_mlp": 0.010479, + "balance_loss_clip": 1.29389429, + "balance_loss_mlp": 1.02517915, + "epoch": 0.42891928453329325, + "flos": 17171496374400.0, + "grad_norm": 1.9387117446939677, + "language_loss": 0.755247, + "learning_rate": 2.550322068641355e-06, + "loss": 0.78049445, + "num_input_tokens_seen": 153058925, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.22705078, + "step": 7134, + "time_per_iteration": 2.8343286514282227 + }, + { + "auxiliary_loss_clip": 0.01450531, + "auxiliary_loss_mlp": 0.01039046, + "balance_loss_clip": 1.27201891, + "balance_loss_mlp": 1.01683748, + "epoch": 0.4289794077859612, + "flos": 18196194781440.0, + "grad_norm": 1.8323264884324404, + "language_loss": 0.8498317, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.87472749, + "num_input_tokens_seen": 153078070, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.22192383, + "step": 7135, + "time_per_iteration": 2.8087246417999268 + }, + { + "auxiliary_loss_clip": 0.01450484, + "auxiliary_loss_mlp": 0.01040393, + "balance_loss_clip": 1.2734195, + "balance_loss_mlp": 1.01861346, + "epoch": 0.4290395310386292, + "flos": 28268605854720.0, + "grad_norm": 1.9514951797259767, + "language_loss": 0.76305223, + "learning_rate": 2.549573171442666e-06, + "loss": 0.78796095, + "num_input_tokens_seen": 153096680, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.2175293, + "step": 7136, + "time_per_iteration": 2.8528990745544434 + }, + { + "auxiliary_loss_clip": 0.01474231, + "auxiliary_loss_mlp": 0.01037455, + "balance_loss_clip": 1.28996634, + "balance_loss_mlp": 1.01505589, + "epoch": 0.42909965429129715, + "flos": 16224537813120.0, + "grad_norm": 2.9091493296078665, + "language_loss": 0.80258286, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.82769972, + "num_input_tokens_seen": 153113305, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.22387695, + "step": 7137, + "time_per_iteration": 2.7956466674804688 + }, + { + "auxiliary_loss_clip": 0.01470202, + "auxiliary_loss_mlp": 0.01041902, + "balance_loss_clip": 1.28831506, + "balance_loss_mlp": 1.01971722, + "epoch": 0.4291597775439651, + "flos": 23123369053440.0, + "grad_norm": 1.8113042222793359, + "language_loss": 0.77374321, + "learning_rate": 2.548824190884499e-06, + "loss": 0.79886425, + "num_input_tokens_seen": 153132735, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.22192383, + "step": 7138, + "time_per_iteration": 2.881227731704712 + }, + { + "auxiliary_loss_clip": 0.01263668, + "auxiliary_loss_mlp": 0.01065081, + "balance_loss_clip": 1.15347588, + "balance_loss_mlp": 1.0353266, + "epoch": 0.4292199007966331, + "flos": 67576081852800.0, + "grad_norm": 0.7923938618595718, + "language_loss": 0.5628981, + "learning_rate": 2.548449669381113e-06, + "loss": 0.58618557, + "num_input_tokens_seen": 153187925, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.296875, + "step": 7139, + "time_per_iteration": 3.2196671962738037 + }, + { + "auxiliary_loss_clip": 0.01446694, + "auxiliary_loss_mlp": 0.01040665, + "balance_loss_clip": 1.27130878, + "balance_loss_mlp": 1.0189327, + "epoch": 0.42928002404930105, + "flos": 23009406105600.0, + "grad_norm": 1.79417403514838, + "language_loss": 0.81421208, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.8390857, + "num_input_tokens_seen": 153206990, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.21740723, + "step": 7140, + "time_per_iteration": 2.902254819869995 + }, + { + "auxiliary_loss_clip": 0.01479072, + "auxiliary_loss_mlp": 0.01042046, + "balance_loss_clip": 1.295017, + "balance_loss_mlp": 1.01797783, + "epoch": 0.429340147301969, + "flos": 11551920641280.0, + "grad_norm": 1.7983262637180595, + "language_loss": 0.82850361, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.85371482, + "num_input_tokens_seen": 153222345, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.24072266, + "step": 7141, + "time_per_iteration": 2.810974359512329 + }, + { + "auxiliary_loss_clip": 0.01470063, + "auxiliary_loss_mlp": 0.01045623, + "balance_loss_clip": 1.28582716, + "balance_loss_mlp": 1.02259135, + "epoch": 0.42940027055463703, + "flos": 25275915308160.0, + "grad_norm": 1.6274572684533348, + "language_loss": 0.87276649, + "learning_rate": 2.547325980144166e-06, + "loss": 0.89792341, + "num_input_tokens_seen": 153240570, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.23034668, + "step": 7142, + "time_per_iteration": 2.8382935523986816 + }, + { + "auxiliary_loss_clip": 0.0146293, + "auxiliary_loss_mlp": 0.01045689, + "balance_loss_clip": 1.28620136, + "balance_loss_mlp": 1.02303886, + "epoch": 0.429460393807305, + "flos": 23815408556160.0, + "grad_norm": 1.9195663566826497, + "language_loss": 0.78756875, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.81265497, + "num_input_tokens_seen": 153259575, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.2265625, + "step": 7143, + "time_per_iteration": 4.24007248878479 + }, + { + "auxiliary_loss_clip": 0.01482616, + "auxiliary_loss_mlp": 0.01039728, + "balance_loss_clip": 1.30109906, + "balance_loss_mlp": 1.01956964, + "epoch": 0.42952051705997296, + "flos": 13926375233280.0, + "grad_norm": 2.108181469895643, + "language_loss": 0.78330338, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.80852687, + "num_input_tokens_seen": 153276650, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.20166016, + "step": 7144, + "time_per_iteration": 2.800973892211914 + }, + { + "auxiliary_loss_clip": 0.01467771, + "auxiliary_loss_mlp": 0.010416, + "balance_loss_clip": 1.28640389, + "balance_loss_mlp": 1.01867628, + "epoch": 0.4295806403126409, + "flos": 26771559287040.0, + "grad_norm": 3.0085630947510675, + "language_loss": 0.74766803, + "learning_rate": 2.54620210411532e-06, + "loss": 0.7727617, + "num_input_tokens_seen": 153298025, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.22924805, + "step": 7145, + "time_per_iteration": 2.873518943786621 + }, + { + "auxiliary_loss_clip": 0.0146648, + "auxiliary_loss_mlp": 0.01041336, + "balance_loss_clip": 1.28429127, + "balance_loss_mlp": 1.01931846, + "epoch": 0.4296407635653089, + "flos": 20960597473920.0, + "grad_norm": 2.0135328580120184, + "language_loss": 0.80101967, + "learning_rate": 2.545827437329352e-06, + "loss": 0.82609785, + "num_input_tokens_seen": 153315775, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.22009277, + "step": 7146, + "time_per_iteration": 2.8143060207366943 + }, + { + "auxiliary_loss_clip": 0.01460524, + "auxiliary_loss_mlp": 0.0103969, + "balance_loss_clip": 1.28193033, + "balance_loss_mlp": 1.01845837, + "epoch": 0.42970088681797686, + "flos": 15860994693120.0, + "grad_norm": 3.320080098259512, + "language_loss": 0.84389567, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.8688978, + "num_input_tokens_seen": 153332765, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.21228027, + "step": 7147, + "time_per_iteration": 2.8341891765594482 + }, + { + "auxiliary_loss_clip": 0.0146631, + "auxiliary_loss_mlp": 0.01043344, + "balance_loss_clip": 1.28674984, + "balance_loss_mlp": 1.01982355, + "epoch": 0.4297610100706448, + "flos": 22392798963840.0, + "grad_norm": 2.0797655236318056, + "language_loss": 0.8778978, + "learning_rate": 2.545078041678131e-06, + "loss": 0.90299428, + "num_input_tokens_seen": 153350760, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.23522949, + "step": 7148, + "time_per_iteration": 2.866269588470459 + }, + { + "auxiliary_loss_clip": 0.01456839, + "auxiliary_loss_mlp": 0.01040163, + "balance_loss_clip": 1.27749038, + "balance_loss_mlp": 1.0170722, + "epoch": 0.4298211333233128, + "flos": 27936580377600.0, + "grad_norm": 2.126902888038868, + "language_loss": 0.78705764, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.81202769, + "num_input_tokens_seen": 153370765, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.23083496, + "step": 7149, + "time_per_iteration": 2.9194178581237793 + }, + { + "auxiliary_loss_clip": 0.01455679, + "auxiliary_loss_mlp": 0.0104129, + "balance_loss_clip": 1.2777282, + "balance_loss_mlp": 1.01879513, + "epoch": 0.42988125657598075, + "flos": 24436268709120.0, + "grad_norm": 1.7111751858188546, + "language_loss": 0.8078301, + "learning_rate": 2.544328563349256e-06, + "loss": 0.83279979, + "num_input_tokens_seen": 153390725, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.22485352, + "step": 7150, + "time_per_iteration": 4.293681383132935 + }, + { + "auxiliary_loss_clip": 0.01481611, + "auxiliary_loss_mlp": 0.01042886, + "balance_loss_clip": 1.29455614, + "balance_loss_mlp": 1.01863885, + "epoch": 0.4299413798286487, + "flos": 15858280005120.0, + "grad_norm": 1.8430121824220336, + "language_loss": 0.75959623, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.78484124, + "num_input_tokens_seen": 153408010, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.24267578, + "step": 7151, + "time_per_iteration": 4.211425542831421 + }, + { + "auxiliary_loss_clip": 0.01481874, + "auxiliary_loss_mlp": 0.01041705, + "balance_loss_clip": 1.29631197, + "balance_loss_mlp": 1.01938868, + "epoch": 0.4300015030813167, + "flos": 22319628842880.0, + "grad_norm": 1.7565109148156308, + "language_loss": 0.7116462, + "learning_rate": 2.543579002456406e-06, + "loss": 0.73688197, + "num_input_tokens_seen": 153426865, + "router_z_loss_clip": 1.85449219, + "router_z_loss_mlp": 0.2232666, + "step": 7152, + "time_per_iteration": 2.8337037563323975 + }, + { + "auxiliary_loss_clip": 0.01460169, + "auxiliary_loss_mlp": 0.01035889, + "balance_loss_clip": 1.27846849, + "balance_loss_mlp": 1.01363266, + "epoch": 0.43006162633398465, + "flos": 34911205937280.0, + "grad_norm": 1.5812781101012474, + "language_loss": 0.72112632, + "learning_rate": 2.54320419108402e-06, + "loss": 0.7460869, + "num_input_tokens_seen": 153449410, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.22265625, + "step": 7153, + "time_per_iteration": 4.3419976234436035 + }, + { + "auxiliary_loss_clip": 0.01455982, + "auxiliary_loss_mlp": 0.0103756, + "balance_loss_clip": 1.27578235, + "balance_loss_mlp": 1.01518428, + "epoch": 0.4301217495866526, + "flos": 15970206936960.0, + "grad_norm": 1.867013879317763, + "language_loss": 0.79134244, + "learning_rate": 2.542829359113276e-06, + "loss": 0.81627786, + "num_input_tokens_seen": 153467910, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.22375488, + "step": 7154, + "time_per_iteration": 2.8343729972839355 + }, + { + "auxiliary_loss_clip": 0.01457227, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.27785957, + "balance_loss_mlp": 1.01412618, + "epoch": 0.43018187283932063, + "flos": 18779564977920.0, + "grad_norm": 1.5379068748133056, + "language_loss": 0.79804289, + "learning_rate": 2.542454506558389e-06, + "loss": 0.82296419, + "num_input_tokens_seen": 153487100, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.20776367, + "step": 7155, + "time_per_iteration": 2.791686773300171 + }, + { + "auxiliary_loss_clip": 0.0144536, + "auxiliary_loss_mlp": 0.01038001, + "balance_loss_clip": 1.26919436, + "balance_loss_mlp": 1.01568556, + "epoch": 0.4302419960919886, + "flos": 20159933909760.0, + "grad_norm": 1.7634509702636543, + "language_loss": 0.89600283, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.92083645, + "num_input_tokens_seen": 153505565, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.22314453, + "step": 7156, + "time_per_iteration": 2.9325857162475586 + }, + { + "auxiliary_loss_clip": 0.01472789, + "auxiliary_loss_mlp": 0.01039502, + "balance_loss_clip": 1.28958607, + "balance_loss_mlp": 1.01574373, + "epoch": 0.43030211934465656, + "flos": 26444013045120.0, + "grad_norm": 3.549259225605503, + "language_loss": 0.84542787, + "learning_rate": 2.541704739753042e-06, + "loss": 0.87055081, + "num_input_tokens_seen": 153526130, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.23779297, + "step": 7157, + "time_per_iteration": 2.8458869457244873 + }, + { + "auxiliary_loss_clip": 0.01490471, + "auxiliary_loss_mlp": 0.01041475, + "balance_loss_clip": 1.3033973, + "balance_loss_mlp": 1.01964784, + "epoch": 0.43036224259732453, + "flos": 24399909872640.0, + "grad_norm": 1.6488935123692399, + "language_loss": 0.72828031, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.75359976, + "num_input_tokens_seen": 153546370, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.21826172, + "step": 7158, + "time_per_iteration": 2.8423197269439697 + }, + { + "auxiliary_loss_clip": 0.01464972, + "auxiliary_loss_mlp": 0.01045117, + "balance_loss_clip": 1.28315854, + "balance_loss_mlp": 1.02240765, + "epoch": 0.4304223658499925, + "flos": 17210569898880.0, + "grad_norm": 1.915088365114028, + "language_loss": 0.83956301, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.8646639, + "num_input_tokens_seen": 153562800, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.22729492, + "step": 7159, + "time_per_iteration": 2.8590245246887207 + }, + { + "auxiliary_loss_clip": 0.0146295, + "auxiliary_loss_mlp": 0.01035636, + "balance_loss_clip": 1.28078723, + "balance_loss_mlp": 1.01359403, + "epoch": 0.43048248910266046, + "flos": 14911230954240.0, + "grad_norm": 2.6413441178456045, + "language_loss": 0.84140277, + "learning_rate": 2.54057993551933e-06, + "loss": 0.86638862, + "num_input_tokens_seen": 153578395, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.22033691, + "step": 7160, + "time_per_iteration": 2.873429536819458 + }, + { + "auxiliary_loss_clip": 0.01488972, + "auxiliary_loss_mlp": 0.01040947, + "balance_loss_clip": 1.30187678, + "balance_loss_mlp": 1.01777315, + "epoch": 0.4305426123553284, + "flos": 21589782670080.0, + "grad_norm": 1.955304100498663, + "language_loss": 0.78322709, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.80852628, + "num_input_tokens_seen": 153596880, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.23193359, + "step": 7161, + "time_per_iteration": 2.826483964920044 + }, + { + "auxiliary_loss_clip": 0.01464497, + "auxiliary_loss_mlp": 0.01035755, + "balance_loss_clip": 1.28136802, + "balance_loss_mlp": 1.01399946, + "epoch": 0.4306027356079964, + "flos": 22611449675520.0, + "grad_norm": 2.06949680006272, + "language_loss": 0.73901868, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.76402128, + "num_input_tokens_seen": 153616570, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.2175293, + "step": 7162, + "time_per_iteration": 2.827371835708618 + }, + { + "auxiliary_loss_clip": 0.01255705, + "auxiliary_loss_mlp": 0.01034909, + "balance_loss_clip": 1.14774299, + "balance_loss_mlp": 1.00763345, + "epoch": 0.43066285886066435, + "flos": 70699684164480.0, + "grad_norm": 0.8087336402041331, + "language_loss": 0.59115815, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.61406422, + "num_input_tokens_seen": 153671450, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.2734375, + "step": 7163, + "time_per_iteration": 3.2481682300567627 + }, + { + "auxiliary_loss_clip": 0.01448287, + "auxiliary_loss_mlp": 0.01033998, + "balance_loss_clip": 1.26974463, + "balance_loss_mlp": 1.01253986, + "epoch": 0.4307229821133323, + "flos": 26730902194560.0, + "grad_norm": 30.537524032787132, + "language_loss": 0.80107653, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.82589936, + "num_input_tokens_seen": 153691405, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.21459961, + "step": 7164, + "time_per_iteration": 2.8913636207580566 + }, + { + "auxiliary_loss_clip": 0.01476977, + "auxiliary_loss_mlp": 0.01036499, + "balance_loss_clip": 1.29100204, + "balance_loss_mlp": 1.01482654, + "epoch": 0.4307831053660003, + "flos": 26188279580160.0, + "grad_norm": 1.8661804097467254, + "language_loss": 0.69345951, + "learning_rate": 2.538704852009177e-06, + "loss": 0.71859425, + "num_input_tokens_seen": 153711555, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.21679688, + "step": 7165, + "time_per_iteration": 2.899526834487915 + }, + { + "auxiliary_loss_clip": 0.01457735, + "auxiliary_loss_mlp": 0.01043104, + "balance_loss_clip": 1.27562296, + "balance_loss_mlp": 1.02013206, + "epoch": 0.43084322861866825, + "flos": 18918032624640.0, + "grad_norm": 2.766069566420504, + "language_loss": 0.75847745, + "learning_rate": 2.538329773967034e-06, + "loss": 0.78348589, + "num_input_tokens_seen": 153730095, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.2298584, + "step": 7166, + "time_per_iteration": 2.824721336364746 + }, + { + "auxiliary_loss_clip": 0.01452843, + "auxiliary_loss_mlp": 0.01037435, + "balance_loss_clip": 1.27408016, + "balance_loss_mlp": 1.01490402, + "epoch": 0.4309033518713362, + "flos": 26444239269120.0, + "grad_norm": 1.6760417028195063, + "language_loss": 0.7287361, + "learning_rate": 2.537954675511372e-06, + "loss": 0.75363886, + "num_input_tokens_seen": 153749320, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.2253418, + "step": 7167, + "time_per_iteration": 3.0053017139434814 + }, + { + "auxiliary_loss_clip": 0.01447732, + "auxiliary_loss_mlp": 0.01036728, + "balance_loss_clip": 1.27048302, + "balance_loss_mlp": 1.01404214, + "epoch": 0.43096347512400424, + "flos": 21222710455680.0, + "grad_norm": 1.6685022092669415, + "language_loss": 0.79352212, + "learning_rate": 2.537579556656414e-06, + "loss": 0.81836677, + "num_input_tokens_seen": 153767825, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.22692871, + "step": 7168, + "time_per_iteration": 2.860227346420288 + }, + { + "auxiliary_loss_clip": 0.01450032, + "auxiliary_loss_mlp": 0.01040372, + "balance_loss_clip": 1.27068424, + "balance_loss_mlp": 1.01765013, + "epoch": 0.4310235983766722, + "flos": 16547559575040.0, + "grad_norm": 2.110619810157183, + "language_loss": 0.8340044, + "learning_rate": 2.537204417416387e-06, + "loss": 0.85890841, + "num_input_tokens_seen": 153785350, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.22729492, + "step": 7169, + "time_per_iteration": 2.8525123596191406 + }, + { + "auxiliary_loss_clip": 0.01252354, + "auxiliary_loss_mlp": 0.01029171, + "balance_loss_clip": 1.14466441, + "balance_loss_mlp": 1.00304019, + "epoch": 0.43108372162934017, + "flos": 64805480622720.0, + "grad_norm": 0.6714268412775781, + "language_loss": 0.60833567, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.6311509, + "num_input_tokens_seen": 153856400, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.26171875, + "step": 7170, + "time_per_iteration": 3.502706527709961 + }, + { + "auxiliary_loss_clip": 0.01444287, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.26516986, + "balance_loss_mlp": 1.01257837, + "epoch": 0.43114384488200813, + "flos": 13452081546240.0, + "grad_norm": 3.677506397564554, + "language_loss": 0.77064282, + "learning_rate": 2.536454077838021e-06, + "loss": 0.79543322, + "num_input_tokens_seen": 153875230, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.22155762, + "step": 7171, + "time_per_iteration": 2.8230485916137695 + }, + { + "auxiliary_loss_clip": 0.01444536, + "auxiliary_loss_mlp": 0.0103707, + "balance_loss_clip": 1.26610088, + "balance_loss_mlp": 1.01456285, + "epoch": 0.4312039681346761, + "flos": 26297899027200.0, + "grad_norm": 1.6313889753851782, + "language_loss": 0.77950728, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.80432332, + "num_input_tokens_seen": 153894740, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.22509766, + "step": 7172, + "time_per_iteration": 2.898340940475464 + }, + { + "auxiliary_loss_clip": 0.01456524, + "auxiliary_loss_mlp": 0.01040366, + "balance_loss_clip": 1.27372837, + "balance_loss_mlp": 1.01560652, + "epoch": 0.43126409138734406, + "flos": 20386547706240.0, + "grad_norm": 2.8694199877664524, + "language_loss": 0.77817655, + "learning_rate": 2.535703656890086e-06, + "loss": 0.80314541, + "num_input_tokens_seen": 153913230, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.2479248, + "step": 7173, + "time_per_iteration": 2.8805859088897705 + }, + { + "auxiliary_loss_clip": 0.01449976, + "auxiliary_loss_mlp": 0.01035821, + "balance_loss_clip": 1.27055526, + "balance_loss_mlp": 1.01225305, + "epoch": 0.431324214640012, + "flos": 22132133815680.0, + "grad_norm": 1.5039131823283207, + "language_loss": 0.77074385, + "learning_rate": 2.5353284159381e-06, + "loss": 0.79560179, + "num_input_tokens_seen": 153933250, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.23571777, + "step": 7174, + "time_per_iteration": 2.9182989597320557 + }, + { + "auxiliary_loss_clip": 0.01459536, + "auxiliary_loss_mlp": 0.0103647, + "balance_loss_clip": 1.27693129, + "balance_loss_mlp": 1.01272357, + "epoch": 0.43138433789268, + "flos": 15239229644160.0, + "grad_norm": 1.4520126121497234, + "language_loss": 0.82924777, + "learning_rate": 2.534953154686407e-06, + "loss": 0.85420781, + "num_input_tokens_seen": 153951325, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.23742676, + "step": 7175, + "time_per_iteration": 2.865137815475464 + }, + { + "auxiliary_loss_clip": 0.01472945, + "auxiliary_loss_mlp": 0.01039908, + "balance_loss_clip": 1.28433192, + "balance_loss_mlp": 1.01584029, + "epoch": 0.43144446114534796, + "flos": 18159383496960.0, + "grad_norm": 2.8337191227874823, + "language_loss": 0.75042897, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.77555752, + "num_input_tokens_seen": 153966975, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.24084473, + "step": 7176, + "time_per_iteration": 2.7703304290771484 + }, + { + "auxiliary_loss_clip": 0.01463936, + "auxiliary_loss_mlp": 0.01037465, + "balance_loss_clip": 1.27945364, + "balance_loss_mlp": 1.01423073, + "epoch": 0.4315045843980159, + "flos": 22940172282240.0, + "grad_norm": 1.7622965810288402, + "language_loss": 0.74251199, + "learning_rate": 2.534202571340819e-06, + "loss": 0.76752603, + "num_input_tokens_seen": 153986695, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.2322998, + "step": 7177, + "time_per_iteration": 2.8688673973083496 + }, + { + "auxiliary_loss_clip": 0.01493475, + "auxiliary_loss_mlp": 0.01042067, + "balance_loss_clip": 1.30081487, + "balance_loss_mlp": 1.01739049, + "epoch": 0.4315647076506839, + "flos": 22136884519680.0, + "grad_norm": 2.059031562814478, + "language_loss": 0.82185721, + "learning_rate": 2.533827249275387e-06, + "loss": 0.84721261, + "num_input_tokens_seen": 154004710, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.24682617, + "step": 7178, + "time_per_iteration": 4.238221645355225 + }, + { + "auxiliary_loss_clip": 0.01444324, + "auxiliary_loss_mlp": 0.01043032, + "balance_loss_clip": 1.26918948, + "balance_loss_mlp": 1.02008367, + "epoch": 0.43162483090335185, + "flos": 26882671812480.0, + "grad_norm": 2.1142113328536887, + "language_loss": 0.84566486, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.87053841, + "num_input_tokens_seen": 154024320, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.22937012, + "step": 7179, + "time_per_iteration": 2.863978862762451 + }, + { + "auxiliary_loss_clip": 0.01456261, + "auxiliary_loss_mlp": 0.01039582, + "balance_loss_clip": 1.27540207, + "balance_loss_mlp": 1.01626468, + "epoch": 0.4316849541560198, + "flos": 13919859982080.0, + "grad_norm": 1.79060715275152, + "language_loss": 0.76440936, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.78936779, + "num_input_tokens_seen": 154041755, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.23303223, + "step": 7180, + "time_per_iteration": 2.8634438514709473 + }, + { + "auxiliary_loss_clip": 0.01463008, + "auxiliary_loss_mlp": 0.01041333, + "balance_loss_clip": 1.27842546, + "balance_loss_mlp": 1.01764643, + "epoch": 0.4317450774086878, + "flos": 16443731462400.0, + "grad_norm": 1.9332059848182244, + "language_loss": 0.82280654, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.84784997, + "num_input_tokens_seen": 154056775, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.23681641, + "step": 7181, + "time_per_iteration": 2.765810251235962 + }, + { + "auxiliary_loss_clip": 0.01465641, + "auxiliary_loss_mlp": 0.01045243, + "balance_loss_clip": 1.28181672, + "balance_loss_mlp": 1.02167523, + "epoch": 0.4318052006613558, + "flos": 20563817408640.0, + "grad_norm": 1.7265897895892273, + "language_loss": 0.89590466, + "learning_rate": 2.532325758728165e-06, + "loss": 0.92101353, + "num_input_tokens_seen": 154075015, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.23583984, + "step": 7182, + "time_per_iteration": 2.823988914489746 + }, + { + "auxiliary_loss_clip": 0.01450608, + "auxiliary_loss_mlp": 0.01037528, + "balance_loss_clip": 1.2712152, + "balance_loss_mlp": 1.01543868, + "epoch": 0.43186532391402377, + "flos": 22830236121600.0, + "grad_norm": 1.7457539844124343, + "language_loss": 0.76805341, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.79293478, + "num_input_tokens_seen": 154095170, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.2208252, + "step": 7183, + "time_per_iteration": 2.8317618370056152 + }, + { + "auxiliary_loss_clip": 0.01462922, + "auxiliary_loss_mlp": 0.01038047, + "balance_loss_clip": 1.2789557, + "balance_loss_mlp": 1.0155884, + "epoch": 0.43192544716669173, + "flos": 25567509916800.0, + "grad_norm": 1.903530138002767, + "language_loss": 0.78173125, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.80674088, + "num_input_tokens_seen": 154116895, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.2244873, + "step": 7184, + "time_per_iteration": 2.870814085006714 + }, + { + "auxiliary_loss_clip": 0.01437387, + "auxiliary_loss_mlp": 0.01036788, + "balance_loss_clip": 1.26252174, + "balance_loss_mlp": 1.01436472, + "epoch": 0.4319855704193597, + "flos": 30966534656640.0, + "grad_norm": 1.532799240372351, + "language_loss": 0.73850513, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.76324683, + "num_input_tokens_seen": 154138395, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.22412109, + "step": 7185, + "time_per_iteration": 4.3542585372924805 + }, + { + "auxiliary_loss_clip": 0.01459857, + "auxiliary_loss_mlp": 0.01043507, + "balance_loss_clip": 1.27512765, + "balance_loss_mlp": 1.01865208, + "epoch": 0.43204569367202766, + "flos": 24248185499520.0, + "grad_norm": 2.348599429974186, + "language_loss": 0.76920176, + "learning_rate": 2.530823945207421e-06, + "loss": 0.79423541, + "num_input_tokens_seen": 154156775, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.2487793, + "step": 7186, + "time_per_iteration": 4.242719650268555 + }, + { + "auxiliary_loss_clip": 0.01457729, + "auxiliary_loss_mlp": 0.01038497, + "balance_loss_clip": 1.27617514, + "balance_loss_mlp": 1.0164907, + "epoch": 0.43210581692469563, + "flos": 18416429061120.0, + "grad_norm": 2.6814462154406886, + "language_loss": 0.76762307, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.79258537, + "num_input_tokens_seen": 154177500, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.22009277, + "step": 7187, + "time_per_iteration": 2.8473987579345703 + }, + { + "auxiliary_loss_clip": 0.01258641, + "auxiliary_loss_mlp": 0.01055631, + "balance_loss_clip": 1.15123272, + "balance_loss_mlp": 1.03045392, + "epoch": 0.4321659401773636, + "flos": 49862388049920.0, + "grad_norm": 0.85608708718504, + "language_loss": 0.68325245, + "learning_rate": 2.530072917616714e-06, + "loss": 0.70639515, + "num_input_tokens_seen": 154237110, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.25195312, + "step": 7188, + "time_per_iteration": 3.3641107082366943 + }, + { + "auxiliary_loss_clip": 0.01440386, + "auxiliary_loss_mlp": 0.01037243, + "balance_loss_clip": 1.26400781, + "balance_loss_mlp": 1.01564193, + "epoch": 0.43222606343003156, + "flos": 17137354533120.0, + "grad_norm": 1.9049255396570928, + "language_loss": 0.78654552, + "learning_rate": 2.529697373663614e-06, + "loss": 0.81132174, + "num_input_tokens_seen": 154253910, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.21594238, + "step": 7189, + "time_per_iteration": 4.241934299468994 + }, + { + "auxiliary_loss_clip": 0.01476422, + "auxiliary_loss_mlp": 0.01042819, + "balance_loss_clip": 1.28833687, + "balance_loss_mlp": 1.01970482, + "epoch": 0.4322861866826995, + "flos": 22760866563840.0, + "grad_norm": 1.7220154486559482, + "language_loss": 0.72133297, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.74652535, + "num_input_tokens_seen": 154274770, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.23120117, + "step": 7190, + "time_per_iteration": 2.858563184738159 + }, + { + "auxiliary_loss_clip": 0.01438597, + "auxiliary_loss_mlp": 0.01038433, + "balance_loss_clip": 1.26054597, + "balance_loss_mlp": 1.01552129, + "epoch": 0.4323463099353675, + "flos": 27903524411520.0, + "grad_norm": 1.4549260364841998, + "language_loss": 0.80601561, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.83078593, + "num_input_tokens_seen": 154295035, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.22900391, + "step": 7191, + "time_per_iteration": 2.900874137878418 + }, + { + "auxiliary_loss_clip": 0.01435928, + "auxiliary_loss_mlp": 0.01040313, + "balance_loss_clip": 1.25794876, + "balance_loss_mlp": 1.01740098, + "epoch": 0.43240643318803546, + "flos": 21624286469760.0, + "grad_norm": 1.7165561907429043, + "language_loss": 0.75745469, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.78221709, + "num_input_tokens_seen": 154314905, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.22912598, + "step": 7192, + "time_per_iteration": 2.8999364376068115 + }, + { + "auxiliary_loss_clip": 0.0144474, + "auxiliary_loss_mlp": 0.01034972, + "balance_loss_clip": 1.26535034, + "balance_loss_mlp": 1.01158261, + "epoch": 0.4324665564407034, + "flos": 17565199793280.0, + "grad_norm": 1.8506994345949868, + "language_loss": 0.7957589, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.82055604, + "num_input_tokens_seen": 154331740, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.23400879, + "step": 7193, + "time_per_iteration": 2.8175084590911865 + }, + { + "auxiliary_loss_clip": 0.01443494, + "auxiliary_loss_mlp": 0.01035443, + "balance_loss_clip": 1.26212478, + "balance_loss_mlp": 1.01211429, + "epoch": 0.4325266796933714, + "flos": 18410728216320.0, + "grad_norm": 1.7975023968493737, + "language_loss": 0.76384318, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.78863257, + "num_input_tokens_seen": 154348740, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.23339844, + "step": 7194, + "time_per_iteration": 2.894209623336792 + }, + { + "auxiliary_loss_clip": 0.01456527, + "auxiliary_loss_mlp": 0.01041161, + "balance_loss_clip": 1.27612448, + "balance_loss_mlp": 1.01789153, + "epoch": 0.4325868029460394, + "flos": 22574955104640.0, + "grad_norm": 2.3518663570300467, + "language_loss": 0.60848236, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.63345921, + "num_input_tokens_seen": 154368835, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.23278809, + "step": 7195, + "time_per_iteration": 2.8486146926879883 + }, + { + "auxiliary_loss_clip": 0.01469873, + "auxiliary_loss_mlp": 0.01041712, + "balance_loss_clip": 1.28532648, + "balance_loss_mlp": 1.01626134, + "epoch": 0.43264692619870737, + "flos": 14612216198400.0, + "grad_norm": 2.09796925399397, + "language_loss": 0.6568501, + "learning_rate": 2.527068004376515e-06, + "loss": 0.68196595, + "num_input_tokens_seen": 154384620, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.25463867, + "step": 7196, + "time_per_iteration": 2.8022654056549072 + }, + { + "auxiliary_loss_clip": 0.01464764, + "auxiliary_loss_mlp": 0.01043092, + "balance_loss_clip": 1.28025246, + "balance_loss_mlp": 1.01971519, + "epoch": 0.43270704945137534, + "flos": 21510233032320.0, + "grad_norm": 2.159735741265863, + "language_loss": 0.72921002, + "learning_rate": 2.526692300132797e-06, + "loss": 0.75428855, + "num_input_tokens_seen": 154402865, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.23400879, + "step": 7197, + "time_per_iteration": 2.8162379264831543 + }, + { + "auxiliary_loss_clip": 0.01434823, + "auxiliary_loss_mlp": 0.01039102, + "balance_loss_clip": 1.25820196, + "balance_loss_mlp": 1.01475942, + "epoch": 0.4327671727040433, + "flos": 25166884043520.0, + "grad_norm": 1.4278633306108317, + "language_loss": 0.73723245, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.76197165, + "num_input_tokens_seen": 154423625, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.2434082, + "step": 7198, + "time_per_iteration": 2.8758440017700195 + }, + { + "auxiliary_loss_clip": 0.01443324, + "auxiliary_loss_mlp": 0.01035828, + "balance_loss_clip": 1.26490593, + "balance_loss_mlp": 1.0117712, + "epoch": 0.43282729595671127, + "flos": 25458071448960.0, + "grad_norm": 1.757944557374797, + "language_loss": 0.81556863, + "learning_rate": 2.525940831742934e-06, + "loss": 0.84036016, + "num_input_tokens_seen": 154444775, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.24072266, + "step": 7199, + "time_per_iteration": 2.927938461303711 + }, + { + "auxiliary_loss_clip": 0.01452992, + "auxiliary_loss_mlp": 0.01039977, + "balance_loss_clip": 1.27408946, + "balance_loss_mlp": 1.01841247, + "epoch": 0.43288741920937923, + "flos": 24134720244480.0, + "grad_norm": 2.991178389092289, + "language_loss": 0.69793463, + "learning_rate": 2.525565067625286e-06, + "loss": 0.72286439, + "num_input_tokens_seen": 154460815, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.21557617, + "step": 7200, + "time_per_iteration": 2.863464832305908 + }, + { + "auxiliary_loss_clip": 0.01454647, + "auxiliary_loss_mlp": 0.01041809, + "balance_loss_clip": 1.27336419, + "balance_loss_mlp": 1.01938581, + "epoch": 0.4329475424620472, + "flos": 19213427796480.0, + "grad_norm": 2.0269881694742207, + "language_loss": 0.87923634, + "learning_rate": 2.525189283578157e-06, + "loss": 0.90420085, + "num_input_tokens_seen": 154479145, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.22424316, + "step": 7201, + "time_per_iteration": 2.8250744342803955 + }, + { + "auxiliary_loss_clip": 0.0148457, + "auxiliary_loss_mlp": 0.01036931, + "balance_loss_clip": 1.29628325, + "balance_loss_mlp": 1.01350641, + "epoch": 0.43300766571471516, + "flos": 22648713408000.0, + "grad_norm": 2.0582828518352656, + "language_loss": 0.65718716, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.68240219, + "num_input_tokens_seen": 154498905, + "router_z_loss_clip": 1.88183594, + "router_z_loss_mlp": 0.23413086, + "step": 7202, + "time_per_iteration": 2.822941541671753 + }, + { + "auxiliary_loss_clip": 0.01438551, + "auxiliary_loss_mlp": 0.01035626, + "balance_loss_clip": 1.25939155, + "balance_loss_mlp": 1.01416802, + "epoch": 0.4330677889673831, + "flos": 22129600106880.0, + "grad_norm": 1.9070012782941839, + "language_loss": 0.8280524, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.85279417, + "num_input_tokens_seen": 154517270, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.21447754, + "step": 7203, + "time_per_iteration": 2.858931064605713 + }, + { + "auxiliary_loss_clip": 0.01467585, + "auxiliary_loss_mlp": 0.01036938, + "balance_loss_clip": 1.2797749, + "balance_loss_mlp": 1.01466942, + "epoch": 0.4331279122200511, + "flos": 23231857380480.0, + "grad_norm": 2.017487532064701, + "language_loss": 0.82657558, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.85162079, + "num_input_tokens_seen": 154535945, + "router_z_loss_clip": 1.87890625, + "router_z_loss_mlp": 0.22290039, + "step": 7204, + "time_per_iteration": 2.825761079788208 + }, + { + "auxiliary_loss_clip": 0.01451241, + "auxiliary_loss_mlp": 0.01035203, + "balance_loss_clip": 1.27050924, + "balance_loss_mlp": 1.01256537, + "epoch": 0.43318803547271906, + "flos": 18268867209600.0, + "grad_norm": 2.987585517509372, + "language_loss": 0.74934691, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.77421135, + "num_input_tokens_seen": 154554935, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.22631836, + "step": 7205, + "time_per_iteration": 2.825655937194824 + }, + { + "auxiliary_loss_clip": 0.01437118, + "auxiliary_loss_mlp": 0.01034339, + "balance_loss_clip": 1.26254189, + "balance_loss_mlp": 1.01168919, + "epoch": 0.433248158725387, + "flos": 27430090375680.0, + "grad_norm": 2.4538834552919213, + "language_loss": 0.75683594, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.78155053, + "num_input_tokens_seen": 154576065, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.22631836, + "step": 7206, + "time_per_iteration": 2.922534465789795 + }, + { + "auxiliary_loss_clip": 0.01442411, + "auxiliary_loss_mlp": 0.01037551, + "balance_loss_clip": 1.26399314, + "balance_loss_mlp": 1.01485312, + "epoch": 0.433308281978055, + "flos": 23227423390080.0, + "grad_norm": 1.746616035759977, + "language_loss": 0.79804528, + "learning_rate": 2.522934161574342e-06, + "loss": 0.82284486, + "num_input_tokens_seen": 154595110, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.22717285, + "step": 7207, + "time_per_iteration": 2.8545446395874023 + }, + { + "auxiliary_loss_clip": 0.01470715, + "auxiliary_loss_mlp": 0.01041447, + "balance_loss_clip": 1.28476906, + "balance_loss_mlp": 1.01853538, + "epoch": 0.433368405230723, + "flos": 15861085182720.0, + "grad_norm": 1.7049487443937106, + "language_loss": 0.81350589, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.83862746, + "num_input_tokens_seen": 154612255, + "router_z_loss_clip": 1.85742188, + "router_z_loss_mlp": 0.22900391, + "step": 7208, + "time_per_iteration": 2.8545920848846436 + }, + { + "auxiliary_loss_clip": 0.0145201, + "auxiliary_loss_mlp": 0.01036187, + "balance_loss_clip": 1.27243829, + "balance_loss_mlp": 1.01458645, + "epoch": 0.433428528483391, + "flos": 19035434177280.0, + "grad_norm": 2.2432331947440143, + "language_loss": 0.7178297, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.74271166, + "num_input_tokens_seen": 154630440, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.21606445, + "step": 7209, + "time_per_iteration": 2.801804780960083 + }, + { + "auxiliary_loss_clip": 0.01445148, + "auxiliary_loss_mlp": 0.01039357, + "balance_loss_clip": 1.26599038, + "balance_loss_mlp": 1.01607502, + "epoch": 0.43348865173605894, + "flos": 24729175416960.0, + "grad_norm": 1.4243047574813827, + "language_loss": 0.82213163, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.8469767, + "num_input_tokens_seen": 154652515, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.23291016, + "step": 7210, + "time_per_iteration": 2.9203689098358154 + }, + { + "auxiliary_loss_clip": 0.01458705, + "auxiliary_loss_mlp": 0.01040375, + "balance_loss_clip": 1.27802467, + "balance_loss_mlp": 1.01947725, + "epoch": 0.4335487749887269, + "flos": 22100751907200.0, + "grad_norm": 2.1004804772070695, + "language_loss": 0.82723498, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.85222578, + "num_input_tokens_seen": 154670965, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.2088623, + "step": 7211, + "time_per_iteration": 2.960676431655884 + }, + { + "auxiliary_loss_clip": 0.01460535, + "auxiliary_loss_mlp": 0.01036238, + "balance_loss_clip": 1.27759957, + "balance_loss_mlp": 1.01505446, + "epoch": 0.43360889824139487, + "flos": 22393206167040.0, + "grad_norm": 1.8669610794742044, + "language_loss": 0.76423126, + "learning_rate": 2.521054347790029e-06, + "loss": 0.78919899, + "num_input_tokens_seen": 154689980, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.21179199, + "step": 7212, + "time_per_iteration": 2.8220133781433105 + }, + { + "auxiliary_loss_clip": 0.01454946, + "auxiliary_loss_mlp": 0.01037481, + "balance_loss_clip": 1.27569985, + "balance_loss_mlp": 1.01685786, + "epoch": 0.43366902149406283, + "flos": 17536758796800.0, + "grad_norm": 2.3307574793894266, + "language_loss": 0.77331734, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.79824167, + "num_input_tokens_seen": 154706570, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.20629883, + "step": 7213, + "time_per_iteration": 4.290258884429932 + }, + { + "auxiliary_loss_clip": 0.01452492, + "auxiliary_loss_mlp": 0.01038169, + "balance_loss_clip": 1.27082276, + "balance_loss_mlp": 1.017331, + "epoch": 0.4337291447467308, + "flos": 19031135921280.0, + "grad_norm": 2.653131165822232, + "language_loss": 0.65330589, + "learning_rate": 2.520302283867471e-06, + "loss": 0.67821252, + "num_input_tokens_seen": 154725210, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.20849609, + "step": 7214, + "time_per_iteration": 2.8240954875946045 + }, + { + "auxiliary_loss_clip": 0.01439044, + "auxiliary_loss_mlp": 0.01040349, + "balance_loss_clip": 1.26443791, + "balance_loss_mlp": 1.01910579, + "epoch": 0.43378926799939876, + "flos": 27245174302080.0, + "grad_norm": 1.5769218407146535, + "language_loss": 0.71788669, + "learning_rate": 2.519926222304191e-06, + "loss": 0.74268061, + "num_input_tokens_seen": 154745945, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.21240234, + "step": 7215, + "time_per_iteration": 2.8849449157714844 + }, + { + "auxiliary_loss_clip": 0.014502, + "auxiliary_loss_mlp": 0.01041405, + "balance_loss_clip": 1.27251077, + "balance_loss_mlp": 1.01956582, + "epoch": 0.43384939125206673, + "flos": 15969618754560.0, + "grad_norm": 1.9391746025960457, + "language_loss": 0.76006842, + "learning_rate": 2.519550141025255e-06, + "loss": 0.78498441, + "num_input_tokens_seen": 154763580, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.21838379, + "step": 7216, + "time_per_iteration": 2.803990125656128 + }, + { + "auxiliary_loss_clip": 0.01480728, + "auxiliary_loss_mlp": 0.01039856, + "balance_loss_clip": 1.29299068, + "balance_loss_mlp": 1.01619291, + "epoch": 0.4339095145047347, + "flos": 21801782396160.0, + "grad_norm": 2.530977357112367, + "language_loss": 0.77407193, + "learning_rate": 2.519174040044927e-06, + "loss": 0.79927784, + "num_input_tokens_seen": 154776825, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.23681641, + "step": 7217, + "time_per_iteration": 2.7782344818115234 + }, + { + "auxiliary_loss_clip": 0.01458834, + "auxiliary_loss_mlp": 0.0103871, + "balance_loss_clip": 1.27726948, + "balance_loss_mlp": 1.01714468, + "epoch": 0.43396963775740266, + "flos": 14217924597120.0, + "grad_norm": 2.380846073570895, + "language_loss": 0.74773538, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.7727108, + "num_input_tokens_seen": 154794025, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.21569824, + "step": 7218, + "time_per_iteration": 2.8044447898864746 + }, + { + "auxiliary_loss_clip": 0.01457949, + "auxiliary_loss_mlp": 0.01034487, + "balance_loss_clip": 1.27574635, + "balance_loss_mlp": 1.01317215, + "epoch": 0.4340297610100706, + "flos": 19728514310400.0, + "grad_norm": 2.012487734199626, + "language_loss": 0.70626867, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.73119301, + "num_input_tokens_seen": 154813105, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.2130127, + "step": 7219, + "time_per_iteration": 2.814913511276245 + }, + { + "auxiliary_loss_clip": 0.01447138, + "auxiliary_loss_mlp": 0.01036357, + "balance_loss_clip": 1.2692132, + "balance_loss_mlp": 1.01584125, + "epoch": 0.4340898842627386, + "flos": 18962942728320.0, + "grad_norm": 1.8173886012126734, + "language_loss": 0.78339785, + "learning_rate": 2.518045619038202e-06, + "loss": 0.80823278, + "num_input_tokens_seen": 154833525, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.2052002, + "step": 7220, + "time_per_iteration": 4.40529727935791 + }, + { + "auxiliary_loss_clip": 0.01446832, + "auxiliary_loss_mlp": 0.01036792, + "balance_loss_clip": 1.26829064, + "balance_loss_mlp": 1.0159179, + "epoch": 0.4341500075154066, + "flos": 22028531927040.0, + "grad_norm": 2.1977913592598384, + "language_loss": 0.70491791, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.72975421, + "num_input_tokens_seen": 154853090, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.20874023, + "step": 7221, + "time_per_iteration": 4.273157835006714 + }, + { + "auxiliary_loss_clip": 0.0145886, + "auxiliary_loss_mlp": 0.01039039, + "balance_loss_clip": 1.27627766, + "balance_loss_mlp": 1.01804662, + "epoch": 0.4342101307680746, + "flos": 23591871406080.0, + "grad_norm": 4.127077782461437, + "language_loss": 0.65580767, + "learning_rate": 2.51729324012157e-06, + "loss": 0.68078661, + "num_input_tokens_seen": 154872055, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.20983887, + "step": 7222, + "time_per_iteration": 2.876605749130249 + }, + { + "auxiliary_loss_clip": 0.01460395, + "auxiliary_loss_mlp": 0.01039732, + "balance_loss_clip": 1.28013754, + "balance_loss_mlp": 1.0177145, + "epoch": 0.43427025402074254, + "flos": 17977046376960.0, + "grad_norm": 2.3577912232778067, + "language_loss": 0.73944706, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.76444829, + "num_input_tokens_seen": 154886645, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.22021484, + "step": 7223, + "time_per_iteration": 4.165122985839844 + }, + { + "auxiliary_loss_clip": 0.01466478, + "auxiliary_loss_mlp": 0.01036615, + "balance_loss_clip": 1.28325582, + "balance_loss_mlp": 1.01505017, + "epoch": 0.4343303772734105, + "flos": 26297537068800.0, + "grad_norm": 2.7004079351282733, + "language_loss": 0.94515079, + "learning_rate": 2.516540782741694e-06, + "loss": 0.9701817, + "num_input_tokens_seen": 154906775, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.21557617, + "step": 7224, + "time_per_iteration": 2.850823402404785 + }, + { + "auxiliary_loss_clip": 0.01447113, + "auxiliary_loss_mlp": 0.01037567, + "balance_loss_clip": 1.26829445, + "balance_loss_mlp": 1.01733732, + "epoch": 0.43439050052607847, + "flos": 26845317590400.0, + "grad_norm": 1.4472559503670315, + "language_loss": 0.62238097, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.6472277, + "num_input_tokens_seen": 154926990, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.20239258, + "step": 7225, + "time_per_iteration": 2.9088900089263916 + }, + { + "auxiliary_loss_clip": 0.0145778, + "auxiliary_loss_mlp": 0.01040632, + "balance_loss_clip": 1.27742028, + "balance_loss_mlp": 1.01980639, + "epoch": 0.43445062377874644, + "flos": 21407626529280.0, + "grad_norm": 3.0722759939224438, + "language_loss": 0.78985721, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.81484139, + "num_input_tokens_seen": 154946210, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.20837402, + "step": 7226, + "time_per_iteration": 2.8238272666931152 + }, + { + "auxiliary_loss_clip": 0.0143923, + "auxiliary_loss_mlp": 0.01034114, + "balance_loss_clip": 1.26445699, + "balance_loss_mlp": 1.01338375, + "epoch": 0.4345107470314144, + "flos": 19911575347200.0, + "grad_norm": 1.641214487040912, + "language_loss": 0.85938835, + "learning_rate": 2.515411949802964e-06, + "loss": 0.88412178, + "num_input_tokens_seen": 154964995, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.20715332, + "step": 7227, + "time_per_iteration": 2.8198015689849854 + }, + { + "auxiliary_loss_clip": 0.01449876, + "auxiliary_loss_mlp": 0.01035933, + "balance_loss_clip": 1.27221274, + "balance_loss_mlp": 1.01448727, + "epoch": 0.43457087028408237, + "flos": 26443877310720.0, + "grad_norm": 1.8572637256085816, + "language_loss": 0.77458096, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.79943907, + "num_input_tokens_seen": 154984775, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.21459961, + "step": 7228, + "time_per_iteration": 2.858193874359131 + }, + { + "auxiliary_loss_clip": 0.01443238, + "auxiliary_loss_mlp": 0.01036147, + "balance_loss_clip": 1.26723194, + "balance_loss_mlp": 1.01458263, + "epoch": 0.43463099353675033, + "flos": 31881432637440.0, + "grad_norm": 1.598486377591358, + "language_loss": 0.81332064, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.8381145, + "num_input_tokens_seen": 155008125, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.21557617, + "step": 7229, + "time_per_iteration": 2.9410006999969482 + }, + { + "auxiliary_loss_clip": 0.01456452, + "auxiliary_loss_mlp": 0.01038828, + "balance_loss_clip": 1.27736306, + "balance_loss_mlp": 1.0178709, + "epoch": 0.4346911167894183, + "flos": 24581885034240.0, + "grad_norm": 1.9523271890975424, + "language_loss": 0.83005172, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.85500449, + "num_input_tokens_seen": 155027885, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.20947266, + "step": 7230, + "time_per_iteration": 2.950418710708618 + }, + { + "auxiliary_loss_clip": 0.01475328, + "auxiliary_loss_mlp": 0.01042569, + "balance_loss_clip": 1.29018688, + "balance_loss_mlp": 1.02020526, + "epoch": 0.43475124004208626, + "flos": 17099095415040.0, + "grad_norm": 2.3080174564885683, + "language_loss": 0.78290355, + "learning_rate": 2.513906565661973e-06, + "loss": 0.80808258, + "num_input_tokens_seen": 155043375, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.22351074, + "step": 7231, + "time_per_iteration": 2.8339085578918457 + }, + { + "auxiliary_loss_clip": 0.01440279, + "auxiliary_loss_mlp": 0.01032494, + "balance_loss_clip": 1.26504278, + "balance_loss_mlp": 1.01269388, + "epoch": 0.4348113632947542, + "flos": 26115878620800.0, + "grad_norm": 1.4851648702897655, + "language_loss": 0.69339693, + "learning_rate": 2.513530170872575e-06, + "loss": 0.71812469, + "num_input_tokens_seen": 155062930, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.19799805, + "step": 7232, + "time_per_iteration": 2.903183937072754 + }, + { + "auxiliary_loss_clip": 0.0146563, + "auxiliary_loss_mlp": 0.01037998, + "balance_loss_clip": 1.28291607, + "balance_loss_mlp": 1.01502657, + "epoch": 0.4348714865474222, + "flos": 34212832162560.0, + "grad_norm": 1.9060791084376187, + "language_loss": 0.7248354, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.74987167, + "num_input_tokens_seen": 155084980, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.22973633, + "step": 7233, + "time_per_iteration": 3.0739660263061523 + }, + { + "auxiliary_loss_clip": 0.0147048, + "auxiliary_loss_mlp": 0.01036184, + "balance_loss_clip": 1.28678191, + "balance_loss_mlp": 1.01497674, + "epoch": 0.43493160980009016, + "flos": 31549226181120.0, + "grad_norm": 1.652442601171842, + "language_loss": 0.75136667, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.77643329, + "num_input_tokens_seen": 155107260, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.21203613, + "step": 7234, + "time_per_iteration": 2.9157984256744385 + }, + { + "auxiliary_loss_clip": 0.01486707, + "auxiliary_loss_mlp": 0.01041315, + "balance_loss_clip": 1.29803896, + "balance_loss_mlp": 1.01928544, + "epoch": 0.4349917330527582, + "flos": 24072046917120.0, + "grad_norm": 1.9536236744841642, + "language_loss": 0.59731674, + "learning_rate": 2.512400869722782e-06, + "loss": 0.62259698, + "num_input_tokens_seen": 155126720, + "router_z_loss_clip": 1.88867188, + "router_z_loss_mlp": 0.22009277, + "step": 7235, + "time_per_iteration": 2.8367245197296143 + }, + { + "auxiliary_loss_clip": 0.01464457, + "auxiliary_loss_mlp": 0.01034557, + "balance_loss_clip": 1.28320837, + "balance_loss_mlp": 1.01302791, + "epoch": 0.43505185630542614, + "flos": 30531721697280.0, + "grad_norm": 1.8539179955552163, + "language_loss": 0.77747208, + "learning_rate": 2.512024397126566e-06, + "loss": 0.80246222, + "num_input_tokens_seen": 155148640, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.21533203, + "step": 7236, + "time_per_iteration": 2.8806731700897217 + }, + { + "auxiliary_loss_clip": 0.01446353, + "auxiliary_loss_mlp": 0.01034594, + "balance_loss_clip": 1.26887369, + "balance_loss_mlp": 1.01401889, + "epoch": 0.4351119795580941, + "flos": 15741783348480.0, + "grad_norm": 2.0889779575662355, + "language_loss": 0.81650746, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.84131688, + "num_input_tokens_seen": 155165870, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.20568848, + "step": 7237, + "time_per_iteration": 2.795853614807129 + }, + { + "auxiliary_loss_clip": 0.0145068, + "auxiliary_loss_mlp": 0.01038572, + "balance_loss_clip": 1.2724973, + "balance_loss_mlp": 1.01761532, + "epoch": 0.4351721028107621, + "flos": 18740400963840.0, + "grad_norm": 1.4709505934868143, + "language_loss": 0.63940883, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.6643014, + "num_input_tokens_seen": 155185315, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.20947266, + "step": 7238, + "time_per_iteration": 2.8485748767852783 + }, + { + "auxiliary_loss_clip": 0.01452674, + "auxiliary_loss_mlp": 0.01040147, + "balance_loss_clip": 1.27638865, + "balance_loss_mlp": 1.01970279, + "epoch": 0.43523222606343004, + "flos": 25237384721280.0, + "grad_norm": 1.709029408492606, + "language_loss": 0.85933375, + "learning_rate": 2.510894862898928e-06, + "loss": 0.88426197, + "num_input_tokens_seen": 155205790, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.2043457, + "step": 7239, + "time_per_iteration": 2.8579494953155518 + }, + { + "auxiliary_loss_clip": 0.01463901, + "auxiliary_loss_mlp": 0.01033016, + "balance_loss_clip": 1.28305161, + "balance_loss_mlp": 1.01266658, + "epoch": 0.435292349316098, + "flos": 22718987861760.0, + "grad_norm": 1.5776932938268091, + "language_loss": 0.73314905, + "learning_rate": 2.510518312724309e-06, + "loss": 0.75811827, + "num_input_tokens_seen": 155226475, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.20349121, + "step": 7240, + "time_per_iteration": 2.8270397186279297 + }, + { + "auxiliary_loss_clip": 0.01460926, + "auxiliary_loss_mlp": 0.01036159, + "balance_loss_clip": 1.27850711, + "balance_loss_mlp": 1.01496339, + "epoch": 0.43535247256876597, + "flos": 25786975034880.0, + "grad_norm": 2.454316610087799, + "language_loss": 0.83157694, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.85654783, + "num_input_tokens_seen": 155247110, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.21191406, + "step": 7241, + "time_per_iteration": 2.861924648284912 + }, + { + "auxiliary_loss_clip": 0.01468563, + "auxiliary_loss_mlp": 0.01033857, + "balance_loss_clip": 1.28219187, + "balance_loss_mlp": 1.01267374, + "epoch": 0.43541259582143393, + "flos": 17536577817600.0, + "grad_norm": 2.764376800027208, + "language_loss": 0.81178153, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.8368057, + "num_input_tokens_seen": 155261335, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.21191406, + "step": 7242, + "time_per_iteration": 2.7843289375305176 + }, + { + "auxiliary_loss_clip": 0.01468604, + "auxiliary_loss_mlp": 0.01033902, + "balance_loss_clip": 1.28300798, + "balance_loss_mlp": 1.01366019, + "epoch": 0.4354727190741019, + "flos": 15203594724480.0, + "grad_norm": 2.126107484002281, + "language_loss": 0.70104694, + "learning_rate": 2.509388546104138e-06, + "loss": 0.72607207, + "num_input_tokens_seen": 155278510, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.20251465, + "step": 7243, + "time_per_iteration": 2.9414398670196533 + }, + { + "auxiliary_loss_clip": 0.01453386, + "auxiliary_loss_mlp": 0.01034555, + "balance_loss_clip": 1.27645934, + "balance_loss_mlp": 1.01488531, + "epoch": 0.43553284232676986, + "flos": 16656636084480.0, + "grad_norm": 1.8080228502890152, + "language_loss": 0.81729364, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.84217304, + "num_input_tokens_seen": 155296450, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.19665527, + "step": 7244, + "time_per_iteration": 2.857069492340088 + }, + { + "auxiliary_loss_clip": 0.01452023, + "auxiliary_loss_mlp": 0.01031019, + "balance_loss_clip": 1.2736423, + "balance_loss_mlp": 1.01117039, + "epoch": 0.43559296557943783, + "flos": 23410439182080.0, + "grad_norm": 1.7147545265074993, + "language_loss": 0.73888588, + "learning_rate": 2.508635271753234e-06, + "loss": 0.76371622, + "num_input_tokens_seen": 155316080, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.19836426, + "step": 7245, + "time_per_iteration": 2.860724449157715 + }, + { + "auxiliary_loss_clip": 0.01464699, + "auxiliary_loss_mlp": 0.01039925, + "balance_loss_clip": 1.28405309, + "balance_loss_mlp": 1.01964712, + "epoch": 0.4356530888321058, + "flos": 22429248289920.0, + "grad_norm": 1.6192488494429365, + "language_loss": 0.78495371, + "learning_rate": 2.508258605639389e-06, + "loss": 0.80999994, + "num_input_tokens_seen": 155336765, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.20288086, + "step": 7246, + "time_per_iteration": 2.907010793685913 + }, + { + "auxiliary_loss_clip": 0.01457153, + "auxiliary_loss_mlp": 0.01037282, + "balance_loss_clip": 1.27800512, + "balance_loss_mlp": 1.0160625, + "epoch": 0.43571321208477376, + "flos": 21626051016960.0, + "grad_norm": 1.865678192965393, + "language_loss": 0.86513329, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.89007759, + "num_input_tokens_seen": 155356440, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.2121582, + "step": 7247, + "time_per_iteration": 2.8645999431610107 + }, + { + "auxiliary_loss_clip": 0.01468441, + "auxiliary_loss_mlp": 0.01037833, + "balance_loss_clip": 1.28749323, + "balance_loss_mlp": 1.01668584, + "epoch": 0.4357733353374418, + "flos": 23997790920960.0, + "grad_norm": 1.7112499758663562, + "language_loss": 0.7293877, + "learning_rate": 2.507505215606333e-06, + "loss": 0.75445044, + "num_input_tokens_seen": 155377070, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.21154785, + "step": 7248, + "time_per_iteration": 4.282192945480347 + }, + { + "auxiliary_loss_clip": 0.01465149, + "auxiliary_loss_mlp": 0.01033981, + "balance_loss_clip": 1.2860955, + "balance_loss_mlp": 1.01385832, + "epoch": 0.43583345859010975, + "flos": 25275598594560.0, + "grad_norm": 1.516199566134212, + "language_loss": 0.87611353, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.90110481, + "num_input_tokens_seen": 155398415, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.20117188, + "step": 7249, + "time_per_iteration": 2.8870763778686523 + }, + { + "auxiliary_loss_clip": 0.0147458, + "auxiliary_loss_mlp": 0.0104093, + "balance_loss_clip": 1.29069901, + "balance_loss_mlp": 1.02052176, + "epoch": 0.4358935818427777, + "flos": 23706874984320.0, + "grad_norm": 1.864732020260216, + "language_loss": 0.82488108, + "learning_rate": 2.506751748594683e-06, + "loss": 0.8500362, + "num_input_tokens_seen": 155415625, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.20410156, + "step": 7250, + "time_per_iteration": 2.8323020935058594 + }, + { + "auxiliary_loss_clip": 0.01476222, + "auxiliary_loss_mlp": 0.01034889, + "balance_loss_clip": 1.29469824, + "balance_loss_mlp": 1.01326442, + "epoch": 0.4359537050954457, + "flos": 29544603736320.0, + "grad_norm": 2.0649875466068472, + "language_loss": 0.86009431, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.88520539, + "num_input_tokens_seen": 155435505, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.21618652, + "step": 7251, + "time_per_iteration": 2.909106731414795 + }, + { + "auxiliary_loss_clip": 0.01461672, + "auxiliary_loss_mlp": 0.01040471, + "balance_loss_clip": 1.28179586, + "balance_loss_mlp": 1.01785696, + "epoch": 0.43601382834811364, + "flos": 22721612060160.0, + "grad_norm": 1.844581942199752, + "language_loss": 0.69988358, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.72490501, + "num_input_tokens_seen": 155455425, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.22595215, + "step": 7252, + "time_per_iteration": 2.886782646179199 + }, + { + "auxiliary_loss_clip": 0.01447263, + "auxiliary_loss_mlp": 0.01035582, + "balance_loss_clip": 1.27173221, + "balance_loss_mlp": 1.01566219, + "epoch": 0.4360739516007816, + "flos": 19108016115840.0, + "grad_norm": 1.6588738386835842, + "language_loss": 0.84477925, + "learning_rate": 2.505621403992348e-06, + "loss": 0.86960769, + "num_input_tokens_seen": 155474250, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.19909668, + "step": 7253, + "time_per_iteration": 2.805887460708618 + }, + { + "auxiliary_loss_clip": 0.01463839, + "auxiliary_loss_mlp": 0.01035717, + "balance_loss_clip": 1.28419495, + "balance_loss_mlp": 1.01440239, + "epoch": 0.43613407485344957, + "flos": 23414918417280.0, + "grad_norm": 1.5708163300614775, + "language_loss": 0.71460873, + "learning_rate": 2.505244584092757e-06, + "loss": 0.73960423, + "num_input_tokens_seen": 155494685, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.21325684, + "step": 7254, + "time_per_iteration": 2.9536285400390625 + }, + { + "auxiliary_loss_clip": 0.01448596, + "auxiliary_loss_mlp": 0.01039337, + "balance_loss_clip": 1.27267587, + "balance_loss_mlp": 1.01859498, + "epoch": 0.43619419810611754, + "flos": 22647989491200.0, + "grad_norm": 1.7666580781687167, + "language_loss": 0.8227337, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.8476131, + "num_input_tokens_seen": 155513040, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.20739746, + "step": 7255, + "time_per_iteration": 2.8537089824676514 + }, + { + "auxiliary_loss_clip": 0.01472285, + "auxiliary_loss_mlp": 0.0103694, + "balance_loss_clip": 1.28967476, + "balance_loss_mlp": 1.01685357, + "epoch": 0.4362543213587855, + "flos": 20057644120320.0, + "grad_norm": 2.558512632804761, + "language_loss": 0.77823347, + "learning_rate": 2.504490886831089e-06, + "loss": 0.80332577, + "num_input_tokens_seen": 155530100, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.20080566, + "step": 7256, + "time_per_iteration": 4.330026865005493 + }, + { + "auxiliary_loss_clip": 0.01445765, + "auxiliary_loss_mlp": 0.01033227, + "balance_loss_clip": 1.27001202, + "balance_loss_mlp": 1.01175761, + "epoch": 0.43631444461145347, + "flos": 21371312937600.0, + "grad_norm": 1.511188513504407, + "language_loss": 0.76617748, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.7909674, + "num_input_tokens_seen": 155549375, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.21459961, + "step": 7257, + "time_per_iteration": 2.840121269226074 + }, + { + "auxiliary_loss_clip": 0.01460746, + "auxiliary_loss_mlp": 0.01039905, + "balance_loss_clip": 1.27919793, + "balance_loss_mlp": 1.0179466, + "epoch": 0.43637456786412143, + "flos": 22428569617920.0, + "grad_norm": 1.7307968248687207, + "language_loss": 0.73704803, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.76205444, + "num_input_tokens_seen": 155569395, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.21960449, + "step": 7258, + "time_per_iteration": 4.202048063278198 + }, + { + "auxiliary_loss_clip": 0.01467685, + "auxiliary_loss_mlp": 0.01041294, + "balance_loss_clip": 1.28495336, + "balance_loss_mlp": 1.02065933, + "epoch": 0.4364346911167894, + "flos": 28560652911360.0, + "grad_norm": 1.8865507813133124, + "language_loss": 0.77485031, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.79994011, + "num_input_tokens_seen": 155589090, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.20654297, + "step": 7259, + "time_per_iteration": 2.8654587268829346 + }, + { + "auxiliary_loss_clip": 0.01246906, + "auxiliary_loss_mlp": 0.01019478, + "balance_loss_clip": 1.14452434, + "balance_loss_mlp": 0.99668515, + "epoch": 0.43649481436945736, + "flos": 62688524042880.0, + "grad_norm": 0.7438014998269435, + "language_loss": 0.57024562, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.59290946, + "num_input_tokens_seen": 155648660, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.22753906, + "step": 7260, + "time_per_iteration": 3.3056180477142334 + }, + { + "auxiliary_loss_clip": 0.01444548, + "auxiliary_loss_mlp": 0.01040747, + "balance_loss_clip": 1.26485288, + "balance_loss_mlp": 1.01892018, + "epoch": 0.4365549376221254, + "flos": 30604484615040.0, + "grad_norm": 2.2879742526547937, + "language_loss": 0.72427654, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.74912953, + "num_input_tokens_seen": 155669945, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.21826172, + "step": 7261, + "time_per_iteration": 2.9165055751800537 + }, + { + "auxiliary_loss_clip": 0.01448917, + "auxiliary_loss_mlp": 0.01039875, + "balance_loss_clip": 1.2699945, + "balance_loss_mlp": 1.01884604, + "epoch": 0.43661506087479335, + "flos": 17174889734400.0, + "grad_norm": 1.8958679492158117, + "language_loss": 0.70138067, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.72626853, + "num_input_tokens_seen": 155688555, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.21020508, + "step": 7262, + "time_per_iteration": 2.850949764251709 + }, + { + "auxiliary_loss_clip": 0.01421375, + "auxiliary_loss_mlp": 0.01039101, + "balance_loss_clip": 1.25014138, + "balance_loss_mlp": 1.01946735, + "epoch": 0.4366751841274613, + "flos": 22055841803520.0, + "grad_norm": 1.7601594815285206, + "language_loss": 0.80813116, + "learning_rate": 2.501852344559726e-06, + "loss": 0.8327359, + "num_input_tokens_seen": 155705370, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.19628906, + "step": 7263, + "time_per_iteration": 2.893087148666382 + }, + { + "auxiliary_loss_clip": 0.0143975, + "auxiliary_loss_mlp": 0.01042743, + "balance_loss_clip": 1.26303291, + "balance_loss_mlp": 1.02215552, + "epoch": 0.4367353073801293, + "flos": 16005751367040.0, + "grad_norm": 2.003533480308807, + "language_loss": 0.76256841, + "learning_rate": 2.50147533371401e-06, + "loss": 0.78739333, + "num_input_tokens_seen": 155721890, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.20568848, + "step": 7264, + "time_per_iteration": 2.8973236083984375 + }, + { + "auxiliary_loss_clip": 0.01443187, + "auxiliary_loss_mlp": 0.0103892, + "balance_loss_clip": 1.26626992, + "balance_loss_mlp": 1.01767683, + "epoch": 0.43679543063279724, + "flos": 38231849928960.0, + "grad_norm": 1.7820417260261963, + "language_loss": 0.62453628, + "learning_rate": 2.501098303852298e-06, + "loss": 0.64935732, + "num_input_tokens_seen": 155743970, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.21240234, + "step": 7265, + "time_per_iteration": 3.0061168670654297 + }, + { + "auxiliary_loss_clip": 0.01441734, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.26464212, + "balance_loss_mlp": 1.01214933, + "epoch": 0.4368555538854652, + "flos": 15201106260480.0, + "grad_norm": 2.3847629309021814, + "language_loss": 0.73773187, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.76248288, + "num_input_tokens_seen": 155761830, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.2121582, + "step": 7266, + "time_per_iteration": 2.8127048015594482 + }, + { + "auxiliary_loss_clip": 0.01446746, + "auxiliary_loss_mlp": 0.01040862, + "balance_loss_clip": 1.26863027, + "balance_loss_mlp": 1.01972651, + "epoch": 0.4369156771381332, + "flos": 23077689788160.0, + "grad_norm": 2.04715496559634, + "language_loss": 0.83503985, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.85991591, + "num_input_tokens_seen": 155779610, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.21154785, + "step": 7267, + "time_per_iteration": 2.8533895015716553 + }, + { + "auxiliary_loss_clip": 0.01433706, + "auxiliary_loss_mlp": 0.01036016, + "balance_loss_clip": 1.25986218, + "balance_loss_mlp": 1.01528549, + "epoch": 0.43697580039080114, + "flos": 23451639212160.0, + "grad_norm": 2.723722911866533, + "language_loss": 0.75097692, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.77567416, + "num_input_tokens_seen": 155798765, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.20727539, + "step": 7268, + "time_per_iteration": 2.84023118019104 + }, + { + "auxiliary_loss_clip": 0.01445875, + "auxiliary_loss_mlp": 0.01036897, + "balance_loss_clip": 1.26695943, + "balance_loss_mlp": 1.0155468, + "epoch": 0.4370359236434691, + "flos": 18523741023360.0, + "grad_norm": 2.1991217589814838, + "language_loss": 0.80503309, + "learning_rate": 2.499589994531454e-06, + "loss": 0.82986087, + "num_input_tokens_seen": 155817750, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.21350098, + "step": 7269, + "time_per_iteration": 2.81695556640625 + }, + { + "auxiliary_loss_clip": 0.01441429, + "auxiliary_loss_mlp": 0.01037454, + "balance_loss_clip": 1.26530099, + "balance_loss_mlp": 1.0155437, + "epoch": 0.43709604689613707, + "flos": 23233078990080.0, + "grad_norm": 2.4589123448996832, + "language_loss": 0.75513119, + "learning_rate": 2.499212869804237e-06, + "loss": 0.77991998, + "num_input_tokens_seen": 155836490, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.21899414, + "step": 7270, + "time_per_iteration": 2.8459506034851074 + }, + { + "auxiliary_loss_clip": 0.01453849, + "auxiliary_loss_mlp": 0.01035044, + "balance_loss_clip": 1.27427697, + "balance_loss_mlp": 1.01384878, + "epoch": 0.43715617014880503, + "flos": 23813779743360.0, + "grad_norm": 2.0771524980857374, + "language_loss": 0.79803479, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.82292378, + "num_input_tokens_seen": 155856225, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.21203613, + "step": 7271, + "time_per_iteration": 2.8872764110565186 + }, + { + "auxiliary_loss_clip": 0.01240625, + "auxiliary_loss_mlp": 0.01030707, + "balance_loss_clip": 1.14078259, + "balance_loss_mlp": 1.01106119, + "epoch": 0.437216293401473, + "flos": 61973201450880.0, + "grad_norm": 0.7016885431934072, + "language_loss": 0.54911906, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.57183236, + "num_input_tokens_seen": 155916770, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.19628906, + "step": 7272, + "time_per_iteration": 3.451115608215332 + }, + { + "auxiliary_loss_clip": 0.01456745, + "auxiliary_loss_mlp": 0.01039572, + "balance_loss_clip": 1.27627611, + "balance_loss_mlp": 1.01781678, + "epoch": 0.43727641665414096, + "flos": 21992580293760.0, + "grad_norm": 1.6920170506978562, + "language_loss": 0.71058053, + "learning_rate": 2.498081382098581e-06, + "loss": 0.73554367, + "num_input_tokens_seen": 155936490, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.21740723, + "step": 7273, + "time_per_iteration": 2.8276493549346924 + }, + { + "auxiliary_loss_clip": 0.01450508, + "auxiliary_loss_mlp": 0.01036576, + "balance_loss_clip": 1.27047706, + "balance_loss_mlp": 1.01527345, + "epoch": 0.437336539906809, + "flos": 39545473501440.0, + "grad_norm": 2.123069428058822, + "language_loss": 0.76993978, + "learning_rate": 2.497704181736367e-06, + "loss": 0.79481065, + "num_input_tokens_seen": 155957595, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.21313477, + "step": 7274, + "time_per_iteration": 3.0094635486602783 + }, + { + "auxiliary_loss_clip": 0.0142809, + "auxiliary_loss_mlp": 0.01030821, + "balance_loss_clip": 1.25227571, + "balance_loss_mlp": 1.01028109, + "epoch": 0.43739666315947695, + "flos": 17466077139840.0, + "grad_norm": 1.9046044488873413, + "language_loss": 0.80896145, + "learning_rate": 2.49732696250116e-06, + "loss": 0.83355051, + "num_input_tokens_seen": 155975710, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.2052002, + "step": 7275, + "time_per_iteration": 2.8493292331695557 + }, + { + "auxiliary_loss_clip": 0.01444054, + "auxiliary_loss_mlp": 0.01036343, + "balance_loss_clip": 1.26820946, + "balance_loss_mlp": 1.01414585, + "epoch": 0.4374567864121449, + "flos": 16366398819840.0, + "grad_norm": 1.9982148612882, + "language_loss": 0.81189704, + "learning_rate": 2.496949724407266e-06, + "loss": 0.83670092, + "num_input_tokens_seen": 155993090, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.22180176, + "step": 7276, + "time_per_iteration": 2.816157579421997 + }, + { + "auxiliary_loss_clip": 0.01464957, + "auxiliary_loss_mlp": 0.01033939, + "balance_loss_clip": 1.28028202, + "balance_loss_mlp": 1.01221871, + "epoch": 0.4375169096648129, + "flos": 30598693280640.0, + "grad_norm": 1.8326460505798894, + "language_loss": 0.7356683, + "learning_rate": 2.496572467468988e-06, + "loss": 0.76065725, + "num_input_tokens_seen": 156013685, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.21716309, + "step": 7277, + "time_per_iteration": 2.893683433532715 + }, + { + "auxiliary_loss_clip": 0.01440424, + "auxiliary_loss_mlp": 0.01036765, + "balance_loss_clip": 1.26277375, + "balance_loss_mlp": 1.01509297, + "epoch": 0.43757703291748085, + "flos": 30567944799360.0, + "grad_norm": 1.9383331165489974, + "language_loss": 0.73458445, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.75935638, + "num_input_tokens_seen": 156034300, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.2166748, + "step": 7278, + "time_per_iteration": 2.916102170944214 + }, + { + "auxiliary_loss_clip": 0.01439827, + "auxiliary_loss_mlp": 0.01035981, + "balance_loss_clip": 1.26402068, + "balance_loss_mlp": 1.01511931, + "epoch": 0.4376371561701488, + "flos": 21407264570880.0, + "grad_norm": 2.9668028761150618, + "language_loss": 0.66841936, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.69317734, + "num_input_tokens_seen": 156053805, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.20861816, + "step": 7279, + "time_per_iteration": 2.8302624225616455 + }, + { + "auxiliary_loss_clip": 0.01468451, + "auxiliary_loss_mlp": 0.01044569, + "balance_loss_clip": 1.2837975, + "balance_loss_mlp": 1.02221751, + "epoch": 0.4376972794228168, + "flos": 23414873172480.0, + "grad_norm": 2.6564791931036282, + "language_loss": 0.83113384, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.856264, + "num_input_tokens_seen": 156073295, + "router_z_loss_clip": 1.84667969, + "router_z_loss_mlp": 0.22363281, + "step": 7280, + "time_per_iteration": 2.839761257171631 + }, + { + "auxiliary_loss_clip": 0.01430095, + "auxiliary_loss_mlp": 0.01032436, + "balance_loss_clip": 1.25744009, + "balance_loss_mlp": 1.01079965, + "epoch": 0.43775740267548474, + "flos": 22903134773760.0, + "grad_norm": 1.6559866092433078, + "language_loss": 0.77784175, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.80246711, + "num_input_tokens_seen": 156094540, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.21618652, + "step": 7281, + "time_per_iteration": 2.8494272232055664 + }, + { + "auxiliary_loss_clip": 0.01449329, + "auxiliary_loss_mlp": 0.01034996, + "balance_loss_clip": 1.27014136, + "balance_loss_mlp": 1.01426601, + "epoch": 0.4378175259281527, + "flos": 23304710787840.0, + "grad_norm": 1.9429374273030813, + "language_loss": 0.77531928, + "learning_rate": 2.494685900612569e-06, + "loss": 0.80016249, + "num_input_tokens_seen": 156114070, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.20727539, + "step": 7282, + "time_per_iteration": 2.8460309505462646 + }, + { + "auxiliary_loss_clip": 0.01453597, + "auxiliary_loss_mlp": 0.01036564, + "balance_loss_clip": 1.27240682, + "balance_loss_mlp": 1.01492763, + "epoch": 0.43787764918082067, + "flos": 23887040353920.0, + "grad_norm": 2.501787870014843, + "language_loss": 0.85920596, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.88410753, + "num_input_tokens_seen": 156132130, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.21630859, + "step": 7283, + "time_per_iteration": 4.259234189987183 + }, + { + "auxiliary_loss_clip": 0.01464104, + "auxiliary_loss_mlp": 0.0103824, + "balance_loss_clip": 1.27974939, + "balance_loss_mlp": 1.01588809, + "epoch": 0.43793777243348864, + "flos": 23998786306560.0, + "grad_norm": 1.9467636863732452, + "language_loss": 0.81549209, + "learning_rate": 2.49393114246007e-06, + "loss": 0.84051555, + "num_input_tokens_seen": 156150820, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.22338867, + "step": 7284, + "time_per_iteration": 2.8802502155303955 + }, + { + "auxiliary_loss_clip": 0.01444316, + "auxiliary_loss_mlp": 0.01040931, + "balance_loss_clip": 1.26632595, + "balance_loss_mlp": 1.01786399, + "epoch": 0.4379978956861566, + "flos": 18633134246400.0, + "grad_norm": 2.43226106828972, + "language_loss": 0.81770694, + "learning_rate": 2.493553735281787e-06, + "loss": 0.84255946, + "num_input_tokens_seen": 156170125, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.23059082, + "step": 7285, + "time_per_iteration": 2.8176732063293457 + }, + { + "auxiliary_loss_clip": 0.0145666, + "auxiliary_loss_mlp": 0.01032907, + "balance_loss_clip": 1.27622366, + "balance_loss_mlp": 1.01198554, + "epoch": 0.43805801893882457, + "flos": 21991494418560.0, + "grad_norm": 2.101682044578097, + "language_loss": 0.75498533, + "learning_rate": 2.493176309387897e-06, + "loss": 0.779881, + "num_input_tokens_seen": 156187320, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.20935059, + "step": 7286, + "time_per_iteration": 2.9041709899902344 + }, + { + "auxiliary_loss_clip": 0.0145771, + "auxiliary_loss_mlp": 0.01036274, + "balance_loss_clip": 1.27436054, + "balance_loss_mlp": 1.01463771, + "epoch": 0.43811814219149253, + "flos": 26403853645440.0, + "grad_norm": 2.079649814246176, + "language_loss": 0.74573505, + "learning_rate": 2.492798864792712e-06, + "loss": 0.77067494, + "num_input_tokens_seen": 156207455, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.21643066, + "step": 7287, + "time_per_iteration": 2.957794427871704 + }, + { + "auxiliary_loss_clip": 0.01458136, + "auxiliary_loss_mlp": 0.01039796, + "balance_loss_clip": 1.27682793, + "balance_loss_mlp": 1.01771832, + "epoch": 0.43817826544416055, + "flos": 17502164507520.0, + "grad_norm": 1.9479759123595193, + "language_loss": 0.83205479, + "learning_rate": 2.492421401510545e-06, + "loss": 0.85703409, + "num_input_tokens_seen": 156226560, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.22058105, + "step": 7288, + "time_per_iteration": 2.815713882446289 + }, + { + "auxiliary_loss_clip": 0.01463383, + "auxiliary_loss_mlp": 0.01034575, + "balance_loss_clip": 1.27918434, + "balance_loss_mlp": 1.0137248, + "epoch": 0.4382383886968285, + "flos": 21591275748480.0, + "grad_norm": 1.5743034588301685, + "language_loss": 0.84749836, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.87247801, + "num_input_tokens_seen": 156246740, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.20825195, + "step": 7289, + "time_per_iteration": 2.9344100952148438 + }, + { + "auxiliary_loss_clip": 0.01484325, + "auxiliary_loss_mlp": 0.01037717, + "balance_loss_clip": 1.29832649, + "balance_loss_mlp": 1.01660538, + "epoch": 0.4382985119494965, + "flos": 27934137158400.0, + "grad_norm": 1.5192340254738526, + "language_loss": 0.78852707, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.81374753, + "num_input_tokens_seen": 156266440, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.21130371, + "step": 7290, + "time_per_iteration": 2.8868682384490967 + }, + { + "auxiliary_loss_clip": 0.01455022, + "auxiliary_loss_mlp": 0.01038524, + "balance_loss_clip": 1.27491403, + "balance_loss_mlp": 1.01772201, + "epoch": 0.43835863520216445, + "flos": 24947735639040.0, + "grad_norm": 1.9745493325659746, + "language_loss": 0.78523862, + "learning_rate": 2.491288899685288e-06, + "loss": 0.81017411, + "num_input_tokens_seen": 156286900, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.20800781, + "step": 7291, + "time_per_iteration": 5.8035078048706055 + }, + { + "auxiliary_loss_clip": 0.01460133, + "auxiliary_loss_mlp": 0.01035756, + "balance_loss_clip": 1.27908885, + "balance_loss_mlp": 1.01515639, + "epoch": 0.4384187584548324, + "flos": 33523235879040.0, + "grad_norm": 1.653094262246065, + "language_loss": 0.65559983, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.68055868, + "num_input_tokens_seen": 156307690, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.20593262, + "step": 7292, + "time_per_iteration": 2.9235785007476807 + }, + { + "auxiliary_loss_clip": 0.01473805, + "auxiliary_loss_mlp": 0.01033172, + "balance_loss_clip": 1.29024601, + "balance_loss_mlp": 1.01149964, + "epoch": 0.4384788817075004, + "flos": 23961522574080.0, + "grad_norm": 1.5140609048946356, + "language_loss": 0.7550754, + "learning_rate": 2.49053380529597e-06, + "loss": 0.78014517, + "num_input_tokens_seen": 156326620, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.21679688, + "step": 7293, + "time_per_iteration": 4.248810768127441 + }, + { + "auxiliary_loss_clip": 0.01465963, + "auxiliary_loss_mlp": 0.01038112, + "balance_loss_clip": 1.28584588, + "balance_loss_mlp": 1.01667857, + "epoch": 0.43853900496016834, + "flos": 19107744647040.0, + "grad_norm": 1.885246907312398, + "language_loss": 0.80201018, + "learning_rate": 2.490156230192516e-06, + "loss": 0.82705092, + "num_input_tokens_seen": 156345495, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.2142334, + "step": 7294, + "time_per_iteration": 2.803452968597412 + }, + { + "auxiliary_loss_clip": 0.01467752, + "auxiliary_loss_mlp": 0.01039755, + "balance_loss_clip": 1.28625619, + "balance_loss_mlp": 1.01904869, + "epoch": 0.4385991282128363, + "flos": 13233340344960.0, + "grad_norm": 1.6500728933971545, + "language_loss": 0.73784626, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.76292133, + "num_input_tokens_seen": 156363155, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.20703125, + "step": 7295, + "time_per_iteration": 2.803194761276245 + }, + { + "auxiliary_loss_clip": 0.01479362, + "auxiliary_loss_mlp": 0.01049906, + "balance_loss_clip": 1.29499483, + "balance_loss_mlp": 1.02757835, + "epoch": 0.4386592514655043, + "flos": 14328132226560.0, + "grad_norm": 1.825078750184844, + "language_loss": 0.76134253, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.78663528, + "num_input_tokens_seen": 156380940, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.2232666, + "step": 7296, + "time_per_iteration": 2.878452777862549 + }, + { + "auxiliary_loss_clip": 0.01460175, + "auxiliary_loss_mlp": 0.01036465, + "balance_loss_clip": 1.28013599, + "balance_loss_mlp": 1.01532936, + "epoch": 0.43871937471817224, + "flos": 22794827425920.0, + "grad_norm": 1.5750689633217083, + "language_loss": 0.69699705, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.72196352, + "num_input_tokens_seen": 156400415, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.21154785, + "step": 7297, + "time_per_iteration": 2.8298516273498535 + }, + { + "auxiliary_loss_clip": 0.01458994, + "auxiliary_loss_mlp": 0.01039017, + "balance_loss_clip": 1.28016448, + "balance_loss_mlp": 1.01814318, + "epoch": 0.4387794979708402, + "flos": 28083418312320.0, + "grad_norm": 1.5003086472264369, + "language_loss": 0.70694721, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.73192739, + "num_input_tokens_seen": 156421120, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.2088623, + "step": 7298, + "time_per_iteration": 2.866270065307617 + }, + { + "auxiliary_loss_clip": 0.01455592, + "auxiliary_loss_mlp": 0.01036079, + "balance_loss_clip": 1.27733231, + "balance_loss_mlp": 1.01381063, + "epoch": 0.43883962122350817, + "flos": 26260590049920.0, + "grad_norm": 1.5087363188883787, + "language_loss": 0.7286793, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.75359607, + "num_input_tokens_seen": 156441535, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.22290039, + "step": 7299, + "time_per_iteration": 2.8651773929595947 + }, + { + "auxiliary_loss_clip": 0.01468831, + "auxiliary_loss_mlp": 0.01039228, + "balance_loss_clip": 1.28519011, + "balance_loss_mlp": 1.01642346, + "epoch": 0.43889974447617613, + "flos": 25894467976320.0, + "grad_norm": 1.692831211773142, + "language_loss": 0.77580523, + "learning_rate": 2.487890389750719e-06, + "loss": 0.8008858, + "num_input_tokens_seen": 156462015, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.22790527, + "step": 7300, + "time_per_iteration": 2.889347553253174 + }, + { + "auxiliary_loss_clip": 0.01461362, + "auxiliary_loss_mlp": 0.01039071, + "balance_loss_clip": 1.28041196, + "balance_loss_mlp": 1.0179944, + "epoch": 0.43895986772884416, + "flos": 25057626554880.0, + "grad_norm": 3.6764799913728217, + "language_loss": 0.71890789, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.74391222, + "num_input_tokens_seen": 156482165, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.21081543, + "step": 7301, + "time_per_iteration": 2.967463731765747 + }, + { + "auxiliary_loss_clip": 0.01478256, + "auxiliary_loss_mlp": 0.01042865, + "balance_loss_clip": 1.29572654, + "balance_loss_mlp": 1.02089489, + "epoch": 0.4390199909815121, + "flos": 26005670991360.0, + "grad_norm": 2.208914690990745, + "language_loss": 0.71561551, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.74082673, + "num_input_tokens_seen": 156503170, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.21960449, + "step": 7302, + "time_per_iteration": 2.8947505950927734 + }, + { + "auxiliary_loss_clip": 0.01458356, + "auxiliary_loss_mlp": 0.01043419, + "balance_loss_clip": 1.27897322, + "balance_loss_mlp": 1.02190161, + "epoch": 0.4390801142341801, + "flos": 29033951212800.0, + "grad_norm": 1.6064428657511671, + "language_loss": 0.82506627, + "learning_rate": 2.486757219574983e-06, + "loss": 0.85008395, + "num_input_tokens_seen": 156523005, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.21520996, + "step": 7303, + "time_per_iteration": 2.948883295059204 + }, + { + "auxiliary_loss_clip": 0.01482098, + "auxiliary_loss_mlp": 0.0104069, + "balance_loss_clip": 1.29528975, + "balance_loss_mlp": 1.01985252, + "epoch": 0.43914023748684805, + "flos": 33451785060480.0, + "grad_norm": 2.134186872731007, + "language_loss": 0.6980226, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.72325051, + "num_input_tokens_seen": 156544440, + "router_z_loss_clip": 1.86816406, + "router_z_loss_mlp": 0.20861816, + "step": 7304, + "time_per_iteration": 2.9262404441833496 + }, + { + "auxiliary_loss_clip": 0.01449861, + "auxiliary_loss_mlp": 0.01038219, + "balance_loss_clip": 1.27302098, + "balance_loss_mlp": 1.01720262, + "epoch": 0.439200360739516, + "flos": 34545038618880.0, + "grad_norm": 1.3595587217385068, + "language_loss": 0.78708184, + "learning_rate": 2.486001680477873e-06, + "loss": 0.81196272, + "num_input_tokens_seen": 156565410, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.21020508, + "step": 7305, + "time_per_iteration": 2.9577412605285645 + }, + { + "auxiliary_loss_clip": 0.01465125, + "auxiliary_loss_mlp": 0.01040627, + "balance_loss_clip": 1.28498924, + "balance_loss_mlp": 1.01884699, + "epoch": 0.439260483992184, + "flos": 21917781360000.0, + "grad_norm": 1.7414874662980093, + "language_loss": 0.69169647, + "learning_rate": 2.485623883278308e-06, + "loss": 0.71675396, + "num_input_tokens_seen": 156584210, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.21777344, + "step": 7306, + "time_per_iteration": 2.837794065475464 + }, + { + "auxiliary_loss_clip": 0.01460824, + "auxiliary_loss_mlp": 0.01042766, + "balance_loss_clip": 1.2793982, + "balance_loss_mlp": 1.02020013, + "epoch": 0.43932060724485195, + "flos": 21006321984000.0, + "grad_norm": 1.8126321483119852, + "language_loss": 0.63474822, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.65978414, + "num_input_tokens_seen": 156602730, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.22558594, + "step": 7307, + "time_per_iteration": 2.81889009475708 + }, + { + "auxiliary_loss_clip": 0.01470991, + "auxiliary_loss_mlp": 0.01041673, + "balance_loss_clip": 1.28688431, + "balance_loss_mlp": 1.01996481, + "epoch": 0.4393807304975199, + "flos": 17755545242880.0, + "grad_norm": 1.9648452593338537, + "language_loss": 0.72768021, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.75280684, + "num_input_tokens_seen": 156619405, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.21704102, + "step": 7308, + "time_per_iteration": 2.7916765213012695 + }, + { + "auxiliary_loss_clip": 0.01461368, + "auxiliary_loss_mlp": 0.01041255, + "balance_loss_clip": 1.27679992, + "balance_loss_mlp": 1.01933253, + "epoch": 0.4394408537501879, + "flos": 22538732002560.0, + "grad_norm": 2.1683236417889615, + "language_loss": 0.77347863, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.79850489, + "num_input_tokens_seen": 156638165, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.21936035, + "step": 7309, + "time_per_iteration": 2.8534719944000244 + }, + { + "auxiliary_loss_clip": 0.01442829, + "auxiliary_loss_mlp": 0.0103622, + "balance_loss_clip": 1.26755834, + "balance_loss_mlp": 1.01427317, + "epoch": 0.43950097700285584, + "flos": 23451096274560.0, + "grad_norm": 1.806805410759518, + "language_loss": 0.71513438, + "learning_rate": 2.484112510474251e-06, + "loss": 0.73992485, + "num_input_tokens_seen": 156658845, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.21960449, + "step": 7310, + "time_per_iteration": 2.8820126056671143 + }, + { + "auxiliary_loss_clip": 0.01479943, + "auxiliary_loss_mlp": 0.01039007, + "balance_loss_clip": 1.2947576, + "balance_loss_mlp": 1.01760888, + "epoch": 0.4395611002555238, + "flos": 23189888188800.0, + "grad_norm": 1.9953616556712859, + "language_loss": 0.76691341, + "learning_rate": 2.483734621343429e-06, + "loss": 0.79210293, + "num_input_tokens_seen": 156677275, + "router_z_loss_clip": 1.8515625, + "router_z_loss_mlp": 0.21386719, + "step": 7311, + "time_per_iteration": 2.828118085861206 + }, + { + "auxiliary_loss_clip": 0.01487476, + "auxiliary_loss_mlp": 0.01045195, + "balance_loss_clip": 1.30167198, + "balance_loss_mlp": 1.02435684, + "epoch": 0.43962122350819177, + "flos": 22137517946880.0, + "grad_norm": 2.106678331738435, + "language_loss": 0.82711422, + "learning_rate": 2.483356713869341e-06, + "loss": 0.85244095, + "num_input_tokens_seen": 156695815, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.20837402, + "step": 7312, + "time_per_iteration": 2.913433790206909 + }, + { + "auxiliary_loss_clip": 0.01460607, + "auxiliary_loss_mlp": 0.01038114, + "balance_loss_clip": 1.28130078, + "balance_loss_mlp": 1.01749039, + "epoch": 0.43968134676085974, + "flos": 17429446834560.0, + "grad_norm": 3.1319490910996497, + "language_loss": 0.86618239, + "learning_rate": 2.482978788066318e-06, + "loss": 0.89116955, + "num_input_tokens_seen": 156714385, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.20629883, + "step": 7313, + "time_per_iteration": 2.8134238719940186 + }, + { + "auxiliary_loss_clip": 0.01471337, + "auxiliary_loss_mlp": 0.01043333, + "balance_loss_clip": 1.28675854, + "balance_loss_mlp": 1.02206564, + "epoch": 0.43974147001352776, + "flos": 18961856853120.0, + "grad_norm": 1.851388154912592, + "language_loss": 0.6853385, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.71048516, + "num_input_tokens_seen": 156732615, + "router_z_loss_clip": 1.84277344, + "router_z_loss_mlp": 0.21252441, + "step": 7314, + "time_per_iteration": 2.8271877765655518 + }, + { + "auxiliary_loss_clip": 0.01470993, + "auxiliary_loss_mlp": 0.01043784, + "balance_loss_clip": 1.28542328, + "balance_loss_mlp": 1.02142024, + "epoch": 0.4398015932661957, + "flos": 18962942728320.0, + "grad_norm": 1.8980336477172999, + "language_loss": 0.77855361, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.8037014, + "num_input_tokens_seen": 156750920, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.22363281, + "step": 7315, + "time_per_iteration": 2.813729763031006 + }, + { + "auxiliary_loss_clip": 0.01453105, + "auxiliary_loss_mlp": 0.01041909, + "balance_loss_clip": 1.27316523, + "balance_loss_mlp": 1.02178645, + "epoch": 0.4398617165188637, + "flos": 24208750016640.0, + "grad_norm": 2.6134878651206637, + "language_loss": 0.75027937, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.77522951, + "num_input_tokens_seen": 156768520, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.2010498, + "step": 7316, + "time_per_iteration": 2.878239393234253 + }, + { + "auxiliary_loss_clip": 0.01461036, + "auxiliary_loss_mlp": 0.01043728, + "balance_loss_clip": 1.28076005, + "balance_loss_mlp": 1.02191234, + "epoch": 0.43992183977153165, + "flos": 22246820680320.0, + "grad_norm": 2.8219878672510434, + "language_loss": 0.66170168, + "learning_rate": 2.481466901851506e-06, + "loss": 0.68674934, + "num_input_tokens_seen": 156788700, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.21813965, + "step": 7317, + "time_per_iteration": 2.8454365730285645 + }, + { + "auxiliary_loss_clip": 0.01475159, + "auxiliary_loss_mlp": 0.01042557, + "balance_loss_clip": 1.29151237, + "balance_loss_mlp": 1.02093256, + "epoch": 0.4399819630241996, + "flos": 18706666325760.0, + "grad_norm": 2.047013528197896, + "language_loss": 0.80749702, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.83267415, + "num_input_tokens_seen": 156806470, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.21643066, + "step": 7318, + "time_per_iteration": 4.281556129455566 + }, + { + "auxiliary_loss_clip": 0.0148374, + "auxiliary_loss_mlp": 0.01044934, + "balance_loss_clip": 1.29771817, + "balance_loss_mlp": 1.0234288, + "epoch": 0.4400420862768676, + "flos": 23890162245120.0, + "grad_norm": 1.510794318563486, + "language_loss": 0.80283248, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.82811916, + "num_input_tokens_seen": 156825895, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.21520996, + "step": 7319, + "time_per_iteration": 2.855647563934326 + }, + { + "auxiliary_loss_clip": 0.01464952, + "auxiliary_loss_mlp": 0.01048759, + "balance_loss_clip": 1.28410316, + "balance_loss_mlp": 1.02646661, + "epoch": 0.44010220952953555, + "flos": 28049185981440.0, + "grad_norm": 1.835760150995832, + "language_loss": 0.81003159, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.83516872, + "num_input_tokens_seen": 156845990, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.22277832, + "step": 7320, + "time_per_iteration": 2.897409200668335 + }, + { + "auxiliary_loss_clip": 0.01462606, + "auxiliary_loss_mlp": 0.01044273, + "balance_loss_clip": 1.28107083, + "balance_loss_mlp": 1.02301836, + "epoch": 0.4401623327822035, + "flos": 23779502167680.0, + "grad_norm": 1.605227256998024, + "language_loss": 0.70353973, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.72860849, + "num_input_tokens_seen": 156866685, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.21276855, + "step": 7321, + "time_per_iteration": 2.8361306190490723 + }, + { + "auxiliary_loss_clip": 0.01269605, + "auxiliary_loss_mlp": 0.01037455, + "balance_loss_clip": 1.16526961, + "balance_loss_mlp": 1.00922585, + "epoch": 0.4402224560348715, + "flos": 70809864531840.0, + "grad_norm": 0.8849412763665592, + "language_loss": 0.56970537, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.59277594, + "num_input_tokens_seen": 156923450, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.28320312, + "step": 7322, + "time_per_iteration": 3.4432075023651123 + }, + { + "auxiliary_loss_clip": 0.01466172, + "auxiliary_loss_mlp": 0.01046783, + "balance_loss_clip": 1.2867651, + "balance_loss_mlp": 1.02706611, + "epoch": 0.44028257928753944, + "flos": 22901686940160.0, + "grad_norm": 2.552910071692215, + "language_loss": 0.77165258, + "learning_rate": 2.479198525097822e-06, + "loss": 0.79678214, + "num_input_tokens_seen": 156944795, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.19714355, + "step": 7323, + "time_per_iteration": 2.875600814819336 + }, + { + "auxiliary_loss_clip": 0.01470956, + "auxiliary_loss_mlp": 0.01047383, + "balance_loss_clip": 1.28863275, + "balance_loss_mlp": 1.02630687, + "epoch": 0.4403427025402074, + "flos": 17905007376000.0, + "grad_norm": 1.6179824207700244, + "language_loss": 0.81295425, + "learning_rate": 2.478820398622511e-06, + "loss": 0.83813763, + "num_input_tokens_seen": 156962755, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.21057129, + "step": 7324, + "time_per_iteration": 2.8752851486206055 + }, + { + "auxiliary_loss_clip": 0.01280319, + "auxiliary_loss_mlp": 0.01048531, + "balance_loss_clip": 1.17387342, + "balance_loss_mlp": 1.01820374, + "epoch": 0.4404028257928754, + "flos": 69595408857600.0, + "grad_norm": 0.6679495935086173, + "language_loss": 0.54596478, + "learning_rate": 2.478442253990283e-06, + "loss": 0.56925333, + "num_input_tokens_seen": 157028095, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.30273438, + "step": 7325, + "time_per_iteration": 3.31561541557312 + }, + { + "auxiliary_loss_clip": 0.0146834, + "auxiliary_loss_mlp": 0.01040773, + "balance_loss_clip": 1.28991163, + "balance_loss_mlp": 1.02142537, + "epoch": 0.44046294904554334, + "flos": 20933604311040.0, + "grad_norm": 1.6526656648177898, + "language_loss": 0.70498526, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.73007637, + "num_input_tokens_seen": 157048365, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.19348145, + "step": 7326, + "time_per_iteration": 4.273205518722534 + }, + { + "auxiliary_loss_clip": 0.01457671, + "auxiliary_loss_mlp": 0.0104606, + "balance_loss_clip": 1.27975714, + "balance_loss_mlp": 1.02456617, + "epoch": 0.44052307229821136, + "flos": 23634066821760.0, + "grad_norm": 1.6635810824539783, + "language_loss": 0.77039981, + "learning_rate": 2.477685910312432e-06, + "loss": 0.79543722, + "num_input_tokens_seen": 157069130, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.21496582, + "step": 7327, + "time_per_iteration": 2.86287260055542 + }, + { + "auxiliary_loss_clip": 0.014645, + "auxiliary_loss_mlp": 0.01043833, + "balance_loss_clip": 1.28644288, + "balance_loss_mlp": 1.02324533, + "epoch": 0.4405831955508793, + "flos": 17605223458560.0, + "grad_norm": 1.9642939962902053, + "language_loss": 0.84449703, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.86958033, + "num_input_tokens_seen": 157084940, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.20593262, + "step": 7328, + "time_per_iteration": 4.242962598800659 + }, + { + "auxiliary_loss_clip": 0.01464424, + "auxiliary_loss_mlp": 0.0103852, + "balance_loss_clip": 1.28637505, + "balance_loss_mlp": 1.01734841, + "epoch": 0.4406433188035473, + "flos": 21471476221440.0, + "grad_norm": 3.5023239265499373, + "language_loss": 0.784356, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.80938542, + "num_input_tokens_seen": 157102770, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.21166992, + "step": 7329, + "time_per_iteration": 3.0050745010375977 + }, + { + "auxiliary_loss_clip": 0.01474705, + "auxiliary_loss_mlp": 0.01039162, + "balance_loss_clip": 1.29075146, + "balance_loss_mlp": 1.01816869, + "epoch": 0.44070344205621526, + "flos": 22683579166080.0, + "grad_norm": 2.507990785580898, + "language_loss": 0.74544358, + "learning_rate": 2.476551258977278e-06, + "loss": 0.7705822, + "num_input_tokens_seen": 157122035, + "router_z_loss_clip": 1.84082031, + "router_z_loss_mlp": 0.20996094, + "step": 7330, + "time_per_iteration": 2.815157651901245 + }, + { + "auxiliary_loss_clip": 0.01463145, + "auxiliary_loss_mlp": 0.01037253, + "balance_loss_clip": 1.28338706, + "balance_loss_mlp": 1.01686835, + "epoch": 0.4407635653088832, + "flos": 23451774946560.0, + "grad_norm": 1.9215610659451428, + "language_loss": 0.75111032, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.77611434, + "num_input_tokens_seen": 157142800, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.20385742, + "step": 7331, + "time_per_iteration": 2.881594657897949 + }, + { + "auxiliary_loss_clip": 0.01455959, + "auxiliary_loss_mlp": 0.0104305, + "balance_loss_clip": 1.27834189, + "balance_loss_mlp": 1.02235508, + "epoch": 0.4408236885615512, + "flos": 24031073111040.0, + "grad_norm": 1.6134338705405693, + "language_loss": 0.76716685, + "learning_rate": 2.475794734375581e-06, + "loss": 0.79215693, + "num_input_tokens_seen": 157163295, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.20690918, + "step": 7332, + "time_per_iteration": 2.8717565536499023 + }, + { + "auxiliary_loss_clip": 0.0145772, + "auxiliary_loss_mlp": 0.01041772, + "balance_loss_clip": 1.27849817, + "balance_loss_mlp": 1.02051747, + "epoch": 0.44088381181421915, + "flos": 12684338213760.0, + "grad_norm": 1.6550566805881763, + "language_loss": 0.74055439, + "learning_rate": 2.475416445004285e-06, + "loss": 0.7655493, + "num_input_tokens_seen": 157180890, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.21252441, + "step": 7333, + "time_per_iteration": 2.7910070419311523 + }, + { + "auxiliary_loss_clip": 0.01447035, + "auxiliary_loss_mlp": 0.01039805, + "balance_loss_clip": 1.27346575, + "balance_loss_mlp": 1.01893115, + "epoch": 0.4409439350668871, + "flos": 24580120487040.0, + "grad_norm": 1.6907011660081197, + "language_loss": 0.80304903, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.82791746, + "num_input_tokens_seen": 157200580, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.20874023, + "step": 7334, + "time_per_iteration": 2.852411985397339 + }, + { + "auxiliary_loss_clip": 0.01489043, + "auxiliary_loss_mlp": 0.01044839, + "balance_loss_clip": 1.29795182, + "balance_loss_mlp": 1.02148545, + "epoch": 0.4410040583195551, + "flos": 22677290138880.0, + "grad_norm": 1.996714422901873, + "language_loss": 0.75871956, + "learning_rate": 2.47465981219252e-06, + "loss": 0.78405839, + "num_input_tokens_seen": 157218345, + "router_z_loss_clip": 1.91210938, + "router_z_loss_mlp": 0.23352051, + "step": 7335, + "time_per_iteration": 2.8505704402923584 + }, + { + "auxiliary_loss_clip": 0.01454022, + "auxiliary_loss_mlp": 0.01040767, + "balance_loss_clip": 1.27384269, + "balance_loss_mlp": 1.01941633, + "epoch": 0.44106418157222305, + "flos": 10859564424960.0, + "grad_norm": 1.856549313641576, + "language_loss": 0.73291951, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.7578674, + "num_input_tokens_seen": 157234395, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.21337891, + "step": 7336, + "time_per_iteration": 2.83628511428833 + }, + { + "auxiliary_loss_clip": 0.01458408, + "auxiliary_loss_mlp": 0.01045877, + "balance_loss_clip": 1.27514219, + "balance_loss_mlp": 1.02480078, + "epoch": 0.441124304824891, + "flos": 21736937318400.0, + "grad_norm": 2.6059143752219054, + "language_loss": 0.64886463, + "learning_rate": 2.473903107384165e-06, + "loss": 0.67390746, + "num_input_tokens_seen": 157254805, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.21081543, + "step": 7337, + "time_per_iteration": 2.810410499572754 + }, + { + "auxiliary_loss_clip": 0.01257303, + "auxiliary_loss_mlp": 0.01041655, + "balance_loss_clip": 1.15471673, + "balance_loss_mlp": 1.02363062, + "epoch": 0.441184428077559, + "flos": 63253407098880.0, + "grad_norm": 0.7472164689826325, + "language_loss": 0.5272482, + "learning_rate": 2.473524728017134e-06, + "loss": 0.55023777, + "num_input_tokens_seen": 157317870, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.18066406, + "step": 7338, + "time_per_iteration": 3.3686940670013428 + }, + { + "auxiliary_loss_clip": 0.01475106, + "auxiliary_loss_mlp": 0.01049788, + "balance_loss_clip": 1.28745985, + "balance_loss_mlp": 1.0269711, + "epoch": 0.44124455133022694, + "flos": 21187663718400.0, + "grad_norm": 2.0496133249125594, + "language_loss": 0.71574116, + "learning_rate": 2.473146330693997e-06, + "loss": 0.7409901, + "num_input_tokens_seen": 157336505, + "router_z_loss_clip": 1.87597656, + "router_z_loss_mlp": 0.22814941, + "step": 7339, + "time_per_iteration": 2.831730842590332 + }, + { + "auxiliary_loss_clip": 0.01438831, + "auxiliary_loss_mlp": 0.0104328, + "balance_loss_clip": 1.26572466, + "balance_loss_mlp": 1.02263284, + "epoch": 0.4413046745828949, + "flos": 17466982035840.0, + "grad_norm": 1.5021063426106052, + "language_loss": 0.70219815, + "learning_rate": 2.472767915429105e-06, + "loss": 0.72701931, + "num_input_tokens_seen": 157354995, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.20654297, + "step": 7340, + "time_per_iteration": 2.883413076400757 + }, + { + "auxiliary_loss_clip": 0.0126265, + "auxiliary_loss_mlp": 0.01041146, + "balance_loss_clip": 1.15931106, + "balance_loss_mlp": 1.02254987, + "epoch": 0.4413647978355629, + "flos": 61611133426560.0, + "grad_norm": 0.8983563546049733, + "language_loss": 0.64051318, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.66355109, + "num_input_tokens_seen": 157404260, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.18554688, + "step": 7341, + "time_per_iteration": 3.124307155609131 + }, + { + "auxiliary_loss_clip": 0.01452173, + "auxiliary_loss_mlp": 0.01037093, + "balance_loss_clip": 1.27094555, + "balance_loss_mlp": 1.01565969, + "epoch": 0.4414249210882309, + "flos": 27538488213120.0, + "grad_norm": 2.125804942496564, + "language_loss": 0.74457479, + "learning_rate": 2.47201103113145e-06, + "loss": 0.76946747, + "num_input_tokens_seen": 157423045, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.21435547, + "step": 7342, + "time_per_iteration": 2.953056812286377 + }, + { + "auxiliary_loss_clip": 0.01443644, + "auxiliary_loss_mlp": 0.01041507, + "balance_loss_clip": 1.26534212, + "balance_loss_mlp": 1.02007353, + "epoch": 0.44148504434089886, + "flos": 23524537864320.0, + "grad_norm": 1.845314145359759, + "language_loss": 0.81025469, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.83510619, + "num_input_tokens_seen": 157441815, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.2142334, + "step": 7343, + "time_per_iteration": 2.91367506980896 + }, + { + "auxiliary_loss_clip": 0.01449109, + "auxiliary_loss_mlp": 0.01037599, + "balance_loss_clip": 1.26960182, + "balance_loss_mlp": 1.01653421, + "epoch": 0.4415451675935668, + "flos": 21590732810880.0, + "grad_norm": 8.218235463187373, + "language_loss": 0.77606988, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.80093694, + "num_input_tokens_seen": 157460470, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.21069336, + "step": 7344, + "time_per_iteration": 2.851857900619507 + }, + { + "auxiliary_loss_clip": 0.01266231, + "auxiliary_loss_mlp": 0.01024753, + "balance_loss_clip": 1.16291738, + "balance_loss_mlp": 1.00720561, + "epoch": 0.4416052908462348, + "flos": 59033456599680.0, + "grad_norm": 0.7911190885519438, + "language_loss": 0.63819629, + "learning_rate": 2.470875570480556e-06, + "loss": 0.66110611, + "num_input_tokens_seen": 157512655, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.17578125, + "step": 7345, + "time_per_iteration": 3.040727138519287 + }, + { + "auxiliary_loss_clip": 0.01452566, + "auxiliary_loss_mlp": 0.01037279, + "balance_loss_clip": 1.27200794, + "balance_loss_mlp": 1.01465261, + "epoch": 0.44166541409890275, + "flos": 26368354460160.0, + "grad_norm": 1.7188739811738007, + "language_loss": 0.86124289, + "learning_rate": 2.470497047866489e-06, + "loss": 0.88614142, + "num_input_tokens_seen": 157533700, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.22619629, + "step": 7346, + "time_per_iteration": 2.9145097732543945 + }, + { + "auxiliary_loss_clip": 0.0146037, + "auxiliary_loss_mlp": 0.01038932, + "balance_loss_clip": 1.27898324, + "balance_loss_mlp": 1.01654434, + "epoch": 0.4417255373515707, + "flos": 20202084080640.0, + "grad_norm": 1.6548115070186007, + "language_loss": 0.80643398, + "learning_rate": 2.470118507411128e-06, + "loss": 0.83142698, + "num_input_tokens_seen": 157551105, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.22387695, + "step": 7347, + "time_per_iteration": 2.8458476066589355 + }, + { + "auxiliary_loss_clip": 0.014561, + "auxiliary_loss_mlp": 0.01037647, + "balance_loss_clip": 1.27440739, + "balance_loss_mlp": 1.01452017, + "epoch": 0.4417856606042387, + "flos": 17895098764800.0, + "grad_norm": 1.8212717925722746, + "language_loss": 0.83972061, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.86465812, + "num_input_tokens_seen": 157568285, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.2310791, + "step": 7348, + "time_per_iteration": 2.8834269046783447 + }, + { + "auxiliary_loss_clip": 0.01472188, + "auxiliary_loss_mlp": 0.01041202, + "balance_loss_clip": 1.2882005, + "balance_loss_mlp": 1.01854026, + "epoch": 0.44184578385690665, + "flos": 27975427678080.0, + "grad_norm": 1.7492855049391907, + "language_loss": 0.71505284, + "learning_rate": 2.469361373033938e-06, + "loss": 0.74018669, + "num_input_tokens_seen": 157590405, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.2265625, + "step": 7349, + "time_per_iteration": 2.9200048446655273 + }, + { + "auxiliary_loss_clip": 0.01461134, + "auxiliary_loss_mlp": 0.01037383, + "balance_loss_clip": 1.27941549, + "balance_loss_mlp": 1.01520967, + "epoch": 0.4419059071095746, + "flos": 23378378601600.0, + "grad_norm": 1.8278860292305237, + "language_loss": 0.75336695, + "learning_rate": 2.468982779140819e-06, + "loss": 0.77835214, + "num_input_tokens_seen": 157607420, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.22167969, + "step": 7350, + "time_per_iteration": 2.899661064147949 + }, + { + "auxiliary_loss_clip": 0.01460248, + "auxiliary_loss_mlp": 0.01043885, + "balance_loss_clip": 1.27958548, + "balance_loss_mlp": 1.02093697, + "epoch": 0.4419660303622426, + "flos": 15020352708480.0, + "grad_norm": 2.245050682533336, + "language_loss": 0.82610518, + "learning_rate": 2.468604167463827e-06, + "loss": 0.85114658, + "num_input_tokens_seen": 157624990, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.22961426, + "step": 7351, + "time_per_iteration": 2.8251559734344482 + }, + { + "auxiliary_loss_clip": 0.01439036, + "auxiliary_loss_mlp": 0.01034755, + "balance_loss_clip": 1.26537824, + "balance_loss_mlp": 1.01350021, + "epoch": 0.44202615361491054, + "flos": 25382005660800.0, + "grad_norm": 1.6173226283515463, + "language_loss": 0.73927057, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.7640084, + "num_input_tokens_seen": 157645300, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.21252441, + "step": 7352, + "time_per_iteration": 2.898984909057617 + }, + { + "auxiliary_loss_clip": 0.01470095, + "auxiliary_loss_mlp": 0.01038485, + "balance_loss_clip": 1.28961325, + "balance_loss_mlp": 1.0167774, + "epoch": 0.4420862768675785, + "flos": 24691866439680.0, + "grad_norm": 1.8726812446777494, + "language_loss": 0.87981451, + "learning_rate": 2.467846890815649e-06, + "loss": 0.90490031, + "num_input_tokens_seen": 157664060, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.21704102, + "step": 7353, + "time_per_iteration": 4.32417893409729 + }, + { + "auxiliary_loss_clip": 0.01466424, + "auxiliary_loss_mlp": 0.01038067, + "balance_loss_clip": 1.28474295, + "balance_loss_mlp": 1.01695478, + "epoch": 0.44214640012024653, + "flos": 19535725641600.0, + "grad_norm": 2.7001572308273265, + "language_loss": 0.77278095, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.79782587, + "num_input_tokens_seen": 157680905, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.21105957, + "step": 7354, + "time_per_iteration": 2.8378207683563232 + }, + { + "auxiliary_loss_clip": 0.01449626, + "auxiliary_loss_mlp": 0.01041185, + "balance_loss_clip": 1.2744031, + "balance_loss_mlp": 1.01913118, + "epoch": 0.4422065233729145, + "flos": 47574052871040.0, + "grad_norm": 1.9778980420011554, + "language_loss": 0.65880609, + "learning_rate": 2.467089543204268e-06, + "loss": 0.68371427, + "num_input_tokens_seen": 157701980, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.22070312, + "step": 7355, + "time_per_iteration": 3.060670852661133 + }, + { + "auxiliary_loss_clip": 0.01474169, + "auxiliary_loss_mlp": 0.01036876, + "balance_loss_clip": 1.2880398, + "balance_loss_mlp": 1.01497722, + "epoch": 0.44226664662558246, + "flos": 19290353235840.0, + "grad_norm": 1.7829739777635252, + "language_loss": 0.78827536, + "learning_rate": 2.466710842823274e-06, + "loss": 0.81338573, + "num_input_tokens_seen": 157720555, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 0.21911621, + "step": 7356, + "time_per_iteration": 2.8754546642303467 + }, + { + "auxiliary_loss_clip": 0.01468701, + "auxiliary_loss_mlp": 0.01038356, + "balance_loss_clip": 1.28466749, + "balance_loss_mlp": 1.01550305, + "epoch": 0.4423267698782504, + "flos": 17830796624640.0, + "grad_norm": 2.0546082264404393, + "language_loss": 0.78200567, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.80707616, + "num_input_tokens_seen": 157739160, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.22851562, + "step": 7357, + "time_per_iteration": 2.8315320014953613 + }, + { + "auxiliary_loss_clip": 0.01465888, + "auxiliary_loss_mlp": 0.01043469, + "balance_loss_clip": 1.28540373, + "balance_loss_mlp": 1.02158177, + "epoch": 0.4423868931309184, + "flos": 29216107353600.0, + "grad_norm": 1.5107866084595705, + "language_loss": 0.74008155, + "learning_rate": 2.465953388982481e-06, + "loss": 0.7651751, + "num_input_tokens_seen": 157760020, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.21911621, + "step": 7358, + "time_per_iteration": 2.9500234127044678 + }, + { + "auxiliary_loss_clip": 0.01471075, + "auxiliary_loss_mlp": 0.01047788, + "balance_loss_clip": 1.28953481, + "balance_loss_mlp": 1.02432728, + "epoch": 0.44244701638358636, + "flos": 29724407147520.0, + "grad_norm": 1.6005916053442713, + "language_loss": 0.76076102, + "learning_rate": 2.465574635551405e-06, + "loss": 0.78594965, + "num_input_tokens_seen": 157780435, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.23449707, + "step": 7359, + "time_per_iteration": 2.8910272121429443 + }, + { + "auxiliary_loss_clip": 0.01462637, + "auxiliary_loss_mlp": 0.01042039, + "balance_loss_clip": 1.28419518, + "balance_loss_mlp": 1.01978266, + "epoch": 0.4425071396362543, + "flos": 22940398506240.0, + "grad_norm": 1.6986971435865195, + "language_loss": 0.70742065, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.73246741, + "num_input_tokens_seen": 157799420, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.22253418, + "step": 7360, + "time_per_iteration": 2.872999429702759 + }, + { + "auxiliary_loss_clip": 0.01472784, + "auxiliary_loss_mlp": 0.01042628, + "balance_loss_clip": 1.29071569, + "balance_loss_mlp": 1.02014494, + "epoch": 0.4425672628889223, + "flos": 19801910655360.0, + "grad_norm": 2.228508446786744, + "language_loss": 0.70888543, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.73403955, + "num_input_tokens_seen": 157817025, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.22485352, + "step": 7361, + "time_per_iteration": 5.574120283126831 + }, + { + "auxiliary_loss_clip": 0.01463017, + "auxiliary_loss_mlp": 0.01038606, + "balance_loss_clip": 1.28110409, + "balance_loss_mlp": 1.01586056, + "epoch": 0.44262738614159025, + "flos": 13670053585920.0, + "grad_norm": 2.024176807928691, + "language_loss": 0.83204174, + "learning_rate": 2.464438269387809e-06, + "loss": 0.85705799, + "num_input_tokens_seen": 157834345, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.22753906, + "step": 7362, + "time_per_iteration": 2.895024061203003 + }, + { + "auxiliary_loss_clip": 0.01478986, + "auxiliary_loss_mlp": 0.01042545, + "balance_loss_clip": 1.29130363, + "balance_loss_mlp": 1.02013338, + "epoch": 0.4426875093942582, + "flos": 14218377045120.0, + "grad_norm": 1.7485533923596728, + "language_loss": 0.75234842, + "learning_rate": 2.464059445424366e-06, + "loss": 0.77756375, + "num_input_tokens_seen": 157852290, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.22412109, + "step": 7363, + "time_per_iteration": 4.201362609863281 + }, + { + "auxiliary_loss_clip": 0.01269106, + "auxiliary_loss_mlp": 0.01023492, + "balance_loss_clip": 1.16477394, + "balance_loss_mlp": 1.00127113, + "epoch": 0.4427476326469262, + "flos": 70152555052800.0, + "grad_norm": 0.6853232657375016, + "language_loss": 0.55740917, + "learning_rate": 2.463680603863743e-06, + "loss": 0.58033514, + "num_input_tokens_seen": 157923060, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.22265625, + "step": 7364, + "time_per_iteration": 3.434938430786133 + }, + { + "auxiliary_loss_clip": 0.01449067, + "auxiliary_loss_mlp": 0.0103839, + "balance_loss_clip": 1.27170217, + "balance_loss_mlp": 1.01632428, + "epoch": 0.44280775589959415, + "flos": 25455447250560.0, + "grad_norm": 1.641036338652434, + "language_loss": 0.75919253, + "learning_rate": 2.463301744720305e-06, + "loss": 0.78406709, + "num_input_tokens_seen": 157944110, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.22070312, + "step": 7365, + "time_per_iteration": 2.9378321170806885 + }, + { + "auxiliary_loss_clip": 0.01451925, + "auxiliary_loss_mlp": 0.01038344, + "balance_loss_clip": 1.27342212, + "balance_loss_mlp": 1.01545608, + "epoch": 0.4428678791522621, + "flos": 22867590343680.0, + "grad_norm": 1.7031800191827613, + "language_loss": 0.74966252, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.77456522, + "num_input_tokens_seen": 157964295, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.22888184, + "step": 7366, + "time_per_iteration": 2.8703179359436035 + }, + { + "auxiliary_loss_clip": 0.01464791, + "auxiliary_loss_mlp": 0.01039493, + "balance_loss_clip": 1.28482521, + "balance_loss_mlp": 1.01695085, + "epoch": 0.44292800240493013, + "flos": 25823560095360.0, + "grad_norm": 4.8376980688938245, + "language_loss": 0.7413125, + "learning_rate": 2.46254397374245e-06, + "loss": 0.7663554, + "num_input_tokens_seen": 157983970, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.22546387, + "step": 7367, + "time_per_iteration": 2.900465726852417 + }, + { + "auxiliary_loss_clip": 0.01453887, + "auxiliary_loss_mlp": 0.01046054, + "balance_loss_clip": 1.27389717, + "balance_loss_mlp": 1.02423906, + "epoch": 0.4429881256575981, + "flos": 32429846586240.0, + "grad_norm": 1.4116551147886955, + "language_loss": 0.74712372, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.7721231, + "num_input_tokens_seen": 158006515, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.21813965, + "step": 7368, + "time_per_iteration": 2.933173418045044 + }, + { + "auxiliary_loss_clip": 0.01460443, + "auxiliary_loss_mlp": 0.01043436, + "balance_loss_clip": 1.28170919, + "balance_loss_mlp": 1.02138209, + "epoch": 0.44304824891026606, + "flos": 22173831538560.0, + "grad_norm": 1.581221177181286, + "language_loss": 0.80540216, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.830441, + "num_input_tokens_seen": 158025565, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.22058105, + "step": 7369, + "time_per_iteration": 2.8809752464294434 + }, + { + "auxiliary_loss_clip": 0.01449686, + "auxiliary_loss_mlp": 0.01038096, + "balance_loss_clip": 1.27189767, + "balance_loss_mlp": 1.01682854, + "epoch": 0.443108372162934, + "flos": 25349673611520.0, + "grad_norm": 1.8809713624867983, + "language_loss": 0.73437309, + "learning_rate": 2.461407185763737e-06, + "loss": 0.75925088, + "num_input_tokens_seen": 158045620, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.21240234, + "step": 7370, + "time_per_iteration": 2.9148974418640137 + }, + { + "auxiliary_loss_clip": 0.01445296, + "auxiliary_loss_mlp": 0.01041548, + "balance_loss_clip": 1.26748645, + "balance_loss_mlp": 1.01916099, + "epoch": 0.443168495415602, + "flos": 23341295848320.0, + "grad_norm": 1.7838724679952747, + "language_loss": 0.71681035, + "learning_rate": 2.461028221425126e-06, + "loss": 0.74167883, + "num_input_tokens_seen": 158063505, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.22387695, + "step": 7371, + "time_per_iteration": 2.878207206726074 + }, + { + "auxiliary_loss_clip": 0.01438947, + "auxiliary_loss_mlp": 0.01040373, + "balance_loss_clip": 1.26162624, + "balance_loss_mlp": 1.01886725, + "epoch": 0.44322861866826996, + "flos": 21881467768320.0, + "grad_norm": 2.8137954312282005, + "language_loss": 0.68804896, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.71284217, + "num_input_tokens_seen": 158080335, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.21508789, + "step": 7372, + "time_per_iteration": 2.9813220500946045 + }, + { + "auxiliary_loss_clip": 0.01456589, + "auxiliary_loss_mlp": 0.01044754, + "balance_loss_clip": 1.27399468, + "balance_loss_mlp": 1.02147269, + "epoch": 0.4432887419209379, + "flos": 20094093446400.0, + "grad_norm": 1.8783385678742621, + "language_loss": 0.84658051, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.87159395, + "num_input_tokens_seen": 158098955, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.23291016, + "step": 7373, + "time_per_iteration": 2.8384690284729004 + }, + { + "auxiliary_loss_clip": 0.01261389, + "auxiliary_loss_mlp": 0.01033054, + "balance_loss_clip": 1.1561656, + "balance_loss_mlp": 1.01111948, + "epoch": 0.4433488651736059, + "flos": 70068209466240.0, + "grad_norm": 0.763783441164014, + "language_loss": 0.55189943, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.57484388, + "num_input_tokens_seen": 158164110, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.21972656, + "step": 7374, + "time_per_iteration": 3.3976738452911377 + }, + { + "auxiliary_loss_clip": 0.0143587, + "auxiliary_loss_mlp": 0.01048689, + "balance_loss_clip": 1.26059723, + "balance_loss_mlp": 1.02481174, + "epoch": 0.44340898842627385, + "flos": 16289247156480.0, + "grad_norm": 3.881543859124447, + "language_loss": 0.83737659, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.86222225, + "num_input_tokens_seen": 158179850, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.2388916, + "step": 7375, + "time_per_iteration": 2.8469104766845703 + }, + { + "auxiliary_loss_clip": 0.01450379, + "auxiliary_loss_mlp": 0.01049148, + "balance_loss_clip": 1.26920366, + "balance_loss_mlp": 1.02676034, + "epoch": 0.4434691116789418, + "flos": 16619146128000.0, + "grad_norm": 1.734864088494111, + "language_loss": 0.84093511, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.86593032, + "num_input_tokens_seen": 158196590, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.22399902, + "step": 7376, + "time_per_iteration": 2.8033030033111572 + }, + { + "auxiliary_loss_clip": 0.01441014, + "auxiliary_loss_mlp": 0.0104565, + "balance_loss_clip": 1.26331496, + "balance_loss_mlp": 1.02277398, + "epoch": 0.4435292349316098, + "flos": 19072788399360.0, + "grad_norm": 2.166263573238394, + "language_loss": 0.78296113, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.80782783, + "num_input_tokens_seen": 158216355, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.22875977, + "step": 7377, + "time_per_iteration": 2.876875400543213 + }, + { + "auxiliary_loss_clip": 0.01423809, + "auxiliary_loss_mlp": 0.0104448, + "balance_loss_clip": 1.25099683, + "balance_loss_mlp": 1.02143717, + "epoch": 0.44358935818427775, + "flos": 21261150552960.0, + "grad_norm": 2.0478718842527024, + "language_loss": 0.76581621, + "learning_rate": 2.458374982357057e-06, + "loss": 0.79049909, + "num_input_tokens_seen": 158235825, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.23022461, + "step": 7378, + "time_per_iteration": 2.8658711910247803 + }, + { + "auxiliary_loss_clip": 0.01441165, + "auxiliary_loss_mlp": 0.01044692, + "balance_loss_clip": 1.26387477, + "balance_loss_mlp": 1.02166021, + "epoch": 0.4436494814369457, + "flos": 12502996479360.0, + "grad_norm": 2.204863750738569, + "language_loss": 0.70482981, + "learning_rate": 2.457995878562982e-06, + "loss": 0.72968835, + "num_input_tokens_seen": 158254230, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.23046875, + "step": 7379, + "time_per_iteration": 2.8294684886932373 + }, + { + "auxiliary_loss_clip": 0.01444101, + "auxiliary_loss_mlp": 0.01043735, + "balance_loss_clip": 1.26488769, + "balance_loss_mlp": 1.02077484, + "epoch": 0.44370960468961373, + "flos": 23670425658240.0, + "grad_norm": 1.5596536074553278, + "language_loss": 0.73983073, + "learning_rate": 2.457616757401656e-06, + "loss": 0.76470912, + "num_input_tokens_seen": 158273400, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.22973633, + "step": 7380, + "time_per_iteration": 2.9124960899353027 + }, + { + "auxiliary_loss_clip": 0.01441963, + "auxiliary_loss_mlp": 0.01037296, + "balance_loss_clip": 1.26393056, + "balance_loss_mlp": 1.01469398, + "epoch": 0.4437697279422817, + "flos": 32429801341440.0, + "grad_norm": 1.6455631833965363, + "language_loss": 0.6607011, + "learning_rate": 2.457237618887458e-06, + "loss": 0.68549371, + "num_input_tokens_seen": 158296840, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.22619629, + "step": 7381, + "time_per_iteration": 2.9507193565368652 + }, + { + "auxiliary_loss_clip": 0.01455676, + "auxiliary_loss_mlp": 0.01041859, + "balance_loss_clip": 1.27638817, + "balance_loss_mlp": 1.01985335, + "epoch": 0.44382985119494966, + "flos": 18121712561280.0, + "grad_norm": 1.9553415652525692, + "language_loss": 0.80827928, + "learning_rate": 2.456858463034763e-06, + "loss": 0.83325464, + "num_input_tokens_seen": 158314935, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.2199707, + "step": 7382, + "time_per_iteration": 2.9868087768554688 + }, + { + "auxiliary_loss_clip": 0.01442893, + "auxiliary_loss_mlp": 0.01043824, + "balance_loss_clip": 1.26531577, + "balance_loss_mlp": 1.0217104, + "epoch": 0.44388997444761763, + "flos": 30786459776640.0, + "grad_norm": 1.9916160195569759, + "language_loss": 0.66341698, + "learning_rate": 2.456479289857949e-06, + "loss": 0.68828416, + "num_input_tokens_seen": 158334620, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.22119141, + "step": 7383, + "time_per_iteration": 3.0478219985961914 + }, + { + "auxiliary_loss_clip": 0.01462705, + "auxiliary_loss_mlp": 0.0103843, + "balance_loss_clip": 1.28014529, + "balance_loss_mlp": 1.01556551, + "epoch": 0.4439500977002856, + "flos": 20348741036160.0, + "grad_norm": 3.1575316080845184, + "language_loss": 0.77333879, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.79835016, + "num_input_tokens_seen": 158350550, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.2286377, + "step": 7384, + "time_per_iteration": 2.8994600772857666 + }, + { + "auxiliary_loss_clip": 0.01450654, + "auxiliary_loss_mlp": 0.01039903, + "balance_loss_clip": 1.26915812, + "balance_loss_mlp": 1.0170269, + "epoch": 0.44401022095295356, + "flos": 20379670496640.0, + "grad_norm": 1.8417188201139187, + "language_loss": 0.8189171, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.84382266, + "num_input_tokens_seen": 158369555, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.22875977, + "step": 7385, + "time_per_iteration": 2.8501672744750977 + }, + { + "auxiliary_loss_clip": 0.01436129, + "auxiliary_loss_mlp": 0.01042262, + "balance_loss_clip": 1.25789857, + "balance_loss_mlp": 1.01745462, + "epoch": 0.4440703442056215, + "flos": 20240433688320.0, + "grad_norm": 1.857966187326479, + "language_loss": 0.82384014, + "learning_rate": 2.455341666526582e-06, + "loss": 0.84862411, + "num_input_tokens_seen": 158388045, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.24816895, + "step": 7386, + "time_per_iteration": 2.9048900604248047 + }, + { + "auxiliary_loss_clip": 0.01472334, + "auxiliary_loss_mlp": 0.01041557, + "balance_loss_clip": 1.28656948, + "balance_loss_mlp": 1.01786983, + "epoch": 0.4441304674582895, + "flos": 39509386133760.0, + "grad_norm": 1.8522555422396316, + "language_loss": 0.70896506, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.73410398, + "num_input_tokens_seen": 158410115, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.23706055, + "step": 7387, + "time_per_iteration": 3.0481314659118652 + }, + { + "auxiliary_loss_clip": 0.01446137, + "auxiliary_loss_mlp": 0.01042578, + "balance_loss_clip": 1.26700258, + "balance_loss_mlp": 1.0192486, + "epoch": 0.44419059071095746, + "flos": 14837789364480.0, + "grad_norm": 2.0039629691655008, + "language_loss": 0.7249999, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.74988711, + "num_input_tokens_seen": 158427765, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.23327637, + "step": 7388, + "time_per_iteration": 4.2812511920928955 + }, + { + "auxiliary_loss_clip": 0.0146583, + "auxiliary_loss_mlp": 0.0104269, + "balance_loss_clip": 1.28333759, + "balance_loss_mlp": 1.01944411, + "epoch": 0.4442507139636254, + "flos": 22648034736000.0, + "grad_norm": 1.7161102493226876, + "language_loss": 0.6996575, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.72474277, + "num_input_tokens_seen": 158446375, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.23242188, + "step": 7389, + "time_per_iteration": 2.885082721710205 + }, + { + "auxiliary_loss_clip": 0.01450466, + "auxiliary_loss_mlp": 0.01039211, + "balance_loss_clip": 1.27092457, + "balance_loss_mlp": 1.01718104, + "epoch": 0.4443108372162934, + "flos": 38305110539520.0, + "grad_norm": 1.8053246020246982, + "language_loss": 0.75472224, + "learning_rate": 2.453824593752788e-06, + "loss": 0.77961898, + "num_input_tokens_seen": 158467260, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.22033691, + "step": 7390, + "time_per_iteration": 3.0353541374206543 + }, + { + "auxiliary_loss_clip": 0.01436769, + "auxiliary_loss_mlp": 0.01039047, + "balance_loss_clip": 1.25940013, + "balance_loss_mlp": 1.01634991, + "epoch": 0.44437096046896135, + "flos": 17757988462080.0, + "grad_norm": 3.4356276996605093, + "language_loss": 0.83178324, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.8565414, + "num_input_tokens_seen": 158486720, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.22705078, + "step": 7391, + "time_per_iteration": 2.834217071533203 + }, + { + "auxiliary_loss_clip": 0.01441962, + "auxiliary_loss_mlp": 0.01041663, + "balance_loss_clip": 1.26572704, + "balance_loss_mlp": 1.01900172, + "epoch": 0.4444310837216293, + "flos": 13739468388480.0, + "grad_norm": 1.985887724286699, + "language_loss": 0.74030519, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.76514143, + "num_input_tokens_seen": 158502530, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.22668457, + "step": 7392, + "time_per_iteration": 2.888111114501953 + }, + { + "auxiliary_loss_clip": 0.01440431, + "auxiliary_loss_mlp": 0.01040356, + "balance_loss_clip": 1.26318145, + "balance_loss_mlp": 1.01658583, + "epoch": 0.44449120697429734, + "flos": 25021267718400.0, + "grad_norm": 1.5511765945606866, + "language_loss": 0.80474615, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.82955408, + "num_input_tokens_seen": 158522715, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.23754883, + "step": 7393, + "time_per_iteration": 2.9204657077789307 + }, + { + "auxiliary_loss_clip": 0.01453645, + "auxiliary_loss_mlp": 0.01036071, + "balance_loss_clip": 1.27097535, + "balance_loss_mlp": 1.01288509, + "epoch": 0.4445513302269653, + "flos": 32684584665600.0, + "grad_norm": 2.093923085182415, + "language_loss": 0.81647635, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.84137356, + "num_input_tokens_seen": 158543615, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.23205566, + "step": 7394, + "time_per_iteration": 2.9567296504974365 + }, + { + "auxiliary_loss_clip": 0.01441573, + "auxiliary_loss_mlp": 0.01038609, + "balance_loss_clip": 1.2650969, + "balance_loss_mlp": 1.01711524, + "epoch": 0.44461145347963327, + "flos": 11663214145920.0, + "grad_norm": 3.5223726039522676, + "language_loss": 0.80970204, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.83450389, + "num_input_tokens_seen": 158560330, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.21508789, + "step": 7395, + "time_per_iteration": 4.3942649364471436 + }, + { + "auxiliary_loss_clip": 0.01439653, + "auxiliary_loss_mlp": 0.01036817, + "balance_loss_clip": 1.26204324, + "balance_loss_mlp": 1.01433372, + "epoch": 0.44467157673230123, + "flos": 20896566802560.0, + "grad_norm": 4.997689268312886, + "language_loss": 0.69699371, + "learning_rate": 2.451548468607584e-06, + "loss": 0.72175837, + "num_input_tokens_seen": 158579735, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.22485352, + "step": 7396, + "time_per_iteration": 4.307459592819214 + }, + { + "auxiliary_loss_clip": 0.01461293, + "auxiliary_loss_mlp": 0.01042508, + "balance_loss_clip": 1.28020847, + "balance_loss_mlp": 1.02033532, + "epoch": 0.4447316999849692, + "flos": 18553901322240.0, + "grad_norm": 2.3613004869760643, + "language_loss": 0.81086469, + "learning_rate": 2.451169054403126e-06, + "loss": 0.83590269, + "num_input_tokens_seen": 158597075, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.22180176, + "step": 7397, + "time_per_iteration": 2.8469433784484863 + }, + { + "auxiliary_loss_clip": 0.01444539, + "auxiliary_loss_mlp": 0.01038679, + "balance_loss_clip": 1.26732135, + "balance_loss_mlp": 1.01654172, + "epoch": 0.44479182323763716, + "flos": 23779592657280.0, + "grad_norm": 4.886182626371294, + "language_loss": 0.67990667, + "learning_rate": 2.450789623090293e-06, + "loss": 0.70473886, + "num_input_tokens_seen": 158616650, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.22131348, + "step": 7398, + "time_per_iteration": 4.290961980819702 + }, + { + "auxiliary_loss_clip": 0.01440936, + "auxiliary_loss_mlp": 0.01045754, + "balance_loss_clip": 1.26557159, + "balance_loss_mlp": 1.02260327, + "epoch": 0.44485194649030513, + "flos": 16552219789440.0, + "grad_norm": 1.6545339587584185, + "language_loss": 0.70389485, + "learning_rate": 2.450410174683472e-06, + "loss": 0.72876173, + "num_input_tokens_seen": 158634515, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.23144531, + "step": 7399, + "time_per_iteration": 2.886300563812256 + }, + { + "auxiliary_loss_clip": 0.01430934, + "auxiliary_loss_mlp": 0.01038543, + "balance_loss_clip": 1.25591338, + "balance_loss_mlp": 1.01687074, + "epoch": 0.4449120697429731, + "flos": 22611042472320.0, + "grad_norm": 2.3428234498568767, + "language_loss": 0.73851383, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.76320857, + "num_input_tokens_seen": 158653760, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.21679688, + "step": 7400, + "time_per_iteration": 2.861452341079712 + }, + { + "auxiliary_loss_clip": 0.01442094, + "auxiliary_loss_mlp": 0.01037703, + "balance_loss_clip": 1.26577663, + "balance_loss_mlp": 1.01545823, + "epoch": 0.44497219299564106, + "flos": 20012824506240.0, + "grad_norm": 1.765920301810085, + "language_loss": 0.8591184, + "learning_rate": 2.449651226645422e-06, + "loss": 0.88391638, + "num_input_tokens_seen": 158672190, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.22241211, + "step": 7401, + "time_per_iteration": 2.9218664169311523 + }, + { + "auxiliary_loss_clip": 0.01424731, + "auxiliary_loss_mlp": 0.01039485, + "balance_loss_clip": 1.25201511, + "balance_loss_mlp": 1.01910043, + "epoch": 0.445032316248309, + "flos": 25605497566080.0, + "grad_norm": 2.8968338758233627, + "language_loss": 0.83859777, + "learning_rate": 2.449271727042973e-06, + "loss": 0.86323988, + "num_input_tokens_seen": 158694115, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.20373535, + "step": 7402, + "time_per_iteration": 2.987872362136841 + }, + { + "auxiliary_loss_clip": 0.01447583, + "auxiliary_loss_mlp": 0.01036437, + "balance_loss_clip": 1.26891422, + "balance_loss_mlp": 1.01412094, + "epoch": 0.445092439500977, + "flos": 21260019432960.0, + "grad_norm": 1.668879653057871, + "language_loss": 0.7770583, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.80189848, + "num_input_tokens_seen": 158711000, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.22338867, + "step": 7403, + "time_per_iteration": 2.842527151107788 + }, + { + "auxiliary_loss_clip": 0.0127959, + "auxiliary_loss_mlp": 0.01039956, + "balance_loss_clip": 1.17077112, + "balance_loss_mlp": 1.01020169, + "epoch": 0.44515256275364495, + "flos": 57791419580160.0, + "grad_norm": 0.755599175813721, + "language_loss": 0.60129452, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.62449002, + "num_input_tokens_seen": 158769675, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.296875, + "step": 7404, + "time_per_iteration": 3.335784435272217 + }, + { + "auxiliary_loss_clip": 0.01454546, + "auxiliary_loss_mlp": 0.01039398, + "balance_loss_clip": 1.27246714, + "balance_loss_mlp": 1.01611686, + "epoch": 0.4452126860063129, + "flos": 15604356332160.0, + "grad_norm": 2.785212712894443, + "language_loss": 0.82704085, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.85198033, + "num_input_tokens_seen": 158788215, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.23278809, + "step": 7405, + "time_per_iteration": 2.908973455429077 + }, + { + "auxiliary_loss_clip": 0.01440032, + "auxiliary_loss_mlp": 0.01039545, + "balance_loss_clip": 1.26297033, + "balance_loss_mlp": 1.01712167, + "epoch": 0.4452728092589809, + "flos": 21627634584960.0, + "grad_norm": 2.3445138758323876, + "language_loss": 0.7616722, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.78646797, + "num_input_tokens_seen": 158809090, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.22424316, + "step": 7406, + "time_per_iteration": 2.912193775177002 + }, + { + "auxiliary_loss_clip": 0.01423333, + "auxiliary_loss_mlp": 0.01038013, + "balance_loss_clip": 1.25186694, + "balance_loss_mlp": 1.01595974, + "epoch": 0.4453329325116489, + "flos": 29509556999040.0, + "grad_norm": 1.994226512384518, + "language_loss": 0.66030717, + "learning_rate": 2.447373973772129e-06, + "loss": 0.68492055, + "num_input_tokens_seen": 158828320, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.22058105, + "step": 7407, + "time_per_iteration": 2.950221300125122 + }, + { + "auxiliary_loss_clip": 0.01456876, + "auxiliary_loss_mlp": 0.01040268, + "balance_loss_clip": 1.2769022, + "balance_loss_mlp": 1.01810706, + "epoch": 0.44539305576431687, + "flos": 21371086713600.0, + "grad_norm": 1.5637316022641572, + "language_loss": 0.6884799, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.71345139, + "num_input_tokens_seen": 158847040, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.22155762, + "step": 7408, + "time_per_iteration": 2.88632869720459 + }, + { + "auxiliary_loss_clip": 0.01450657, + "auxiliary_loss_mlp": 0.01041924, + "balance_loss_clip": 1.2705667, + "balance_loss_mlp": 1.01923895, + "epoch": 0.44545317901698483, + "flos": 41442603004800.0, + "grad_norm": 1.5293316373602237, + "language_loss": 0.72653002, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.75145578, + "num_input_tokens_seen": 158870490, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.22692871, + "step": 7409, + "time_per_iteration": 3.0382041931152344 + }, + { + "auxiliary_loss_clip": 0.01436041, + "auxiliary_loss_mlp": 0.01042243, + "balance_loss_clip": 1.25567138, + "balance_loss_mlp": 1.01878273, + "epoch": 0.4455133022696528, + "flos": 22065297966720.0, + "grad_norm": 1.646120503224584, + "language_loss": 0.65726554, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.68204844, + "num_input_tokens_seen": 158889920, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.23486328, + "step": 7410, + "time_per_iteration": 2.8450193405151367 + }, + { + "auxiliary_loss_clip": 0.01469416, + "auxiliary_loss_mlp": 0.01040869, + "balance_loss_clip": 1.28445172, + "balance_loss_mlp": 1.01801658, + "epoch": 0.44557342552232077, + "flos": 23487364621440.0, + "grad_norm": 3.051768397041201, + "language_loss": 0.75742185, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.7825247, + "num_input_tokens_seen": 158909580, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.22875977, + "step": 7411, + "time_per_iteration": 2.899341106414795 + }, + { + "auxiliary_loss_clip": 0.01436688, + "auxiliary_loss_mlp": 0.01041145, + "balance_loss_clip": 1.26358128, + "balance_loss_mlp": 1.01832843, + "epoch": 0.44563354877498873, + "flos": 19143832014720.0, + "grad_norm": 2.3973266305124725, + "language_loss": 0.79954088, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.82431918, + "num_input_tokens_seen": 158924600, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.22802734, + "step": 7412, + "time_per_iteration": 2.869169235229492 + }, + { + "auxiliary_loss_clip": 0.0145249, + "auxiliary_loss_mlp": 0.01038349, + "balance_loss_clip": 1.26932025, + "balance_loss_mlp": 1.01654601, + "epoch": 0.4456936720276567, + "flos": 13628129639040.0, + "grad_norm": 1.8953164093632644, + "language_loss": 0.81423485, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.83914316, + "num_input_tokens_seen": 158939345, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.21801758, + "step": 7413, + "time_per_iteration": 2.9330315589904785 + }, + { + "auxiliary_loss_clip": 0.01432085, + "auxiliary_loss_mlp": 0.01037552, + "balance_loss_clip": 1.25696087, + "balance_loss_mlp": 1.01520038, + "epoch": 0.44575379528032466, + "flos": 14720116343040.0, + "grad_norm": 2.780380955918839, + "language_loss": 0.77257192, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.79726827, + "num_input_tokens_seen": 158955855, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.22338867, + "step": 7414, + "time_per_iteration": 2.8842039108276367 + }, + { + "auxiliary_loss_clip": 0.01434298, + "auxiliary_loss_mlp": 0.0104092, + "balance_loss_clip": 1.25765157, + "balance_loss_mlp": 1.01713753, + "epoch": 0.4458139185329926, + "flos": 24181394895360.0, + "grad_norm": 1.5280197409697842, + "language_loss": 0.83941936, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.86417156, + "num_input_tokens_seen": 158976315, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.23779297, + "step": 7415, + "time_per_iteration": 2.9058585166931152 + }, + { + "auxiliary_loss_clip": 0.01432121, + "auxiliary_loss_mlp": 0.01037453, + "balance_loss_clip": 1.25377429, + "balance_loss_mlp": 1.01499438, + "epoch": 0.4458740417856606, + "flos": 21772210279680.0, + "grad_norm": 1.5036259728586137, + "language_loss": 0.84695339, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.87164915, + "num_input_tokens_seen": 158996725, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.22460938, + "step": 7416, + "time_per_iteration": 2.8872830867767334 + }, + { + "auxiliary_loss_clip": 0.01440609, + "auxiliary_loss_mlp": 0.01040204, + "balance_loss_clip": 1.26038313, + "balance_loss_mlp": 1.01724482, + "epoch": 0.44593416503832856, + "flos": 21078541964160.0, + "grad_norm": 2.8492087917878095, + "language_loss": 0.81606579, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.8408739, + "num_input_tokens_seen": 159017255, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.22961426, + "step": 7417, + "time_per_iteration": 2.8885910511016846 + }, + { + "auxiliary_loss_clip": 0.01444495, + "auxiliary_loss_mlp": 0.01050134, + "balance_loss_clip": 1.26522017, + "balance_loss_mlp": 1.02768719, + "epoch": 0.4459942882909965, + "flos": 22610454289920.0, + "grad_norm": 2.031991653407147, + "language_loss": 0.81842172, + "learning_rate": 2.443197426237077e-06, + "loss": 0.84336805, + "num_input_tokens_seen": 159035010, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.2244873, + "step": 7418, + "time_per_iteration": 2.8903253078460693 + }, + { + "auxiliary_loss_clip": 0.01440605, + "auxiliary_loss_mlp": 0.01041122, + "balance_loss_clip": 1.26014507, + "balance_loss_mlp": 1.01896071, + "epoch": 0.4460544115436645, + "flos": 26516730718080.0, + "grad_norm": 1.6497730536291957, + "language_loss": 0.77996194, + "learning_rate": 2.442817638972991e-06, + "loss": 0.80477917, + "num_input_tokens_seen": 159055345, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.22167969, + "step": 7419, + "time_per_iteration": 2.8701987266540527 + }, + { + "auxiliary_loss_clip": 0.01441623, + "auxiliary_loss_mlp": 0.01036361, + "balance_loss_clip": 1.26449609, + "balance_loss_mlp": 1.01445055, + "epoch": 0.4461145347963325, + "flos": 17613231788160.0, + "grad_norm": 1.637108654947106, + "language_loss": 0.73344535, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.7582252, + "num_input_tokens_seen": 159074225, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.21911621, + "step": 7420, + "time_per_iteration": 2.839426040649414 + }, + { + "auxiliary_loss_clip": 0.01423441, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.24983132, + "balance_loss_mlp": 1.01509881, + "epoch": 0.44617465804900047, + "flos": 27278727960960.0, + "grad_norm": 2.0909378820074713, + "language_loss": 0.75501621, + "learning_rate": 2.442058014084156e-06, + "loss": 0.77963287, + "num_input_tokens_seen": 159095415, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.23132324, + "step": 7421, + "time_per_iteration": 2.9565625190734863 + }, + { + "auxiliary_loss_clip": 0.01423213, + "auxiliary_loss_mlp": 0.01041077, + "balance_loss_clip": 1.24992466, + "balance_loss_mlp": 1.01824808, + "epoch": 0.44623478130166844, + "flos": 17795659397760.0, + "grad_norm": 1.9253807718136415, + "language_loss": 0.76716918, + "learning_rate": 2.44167817648821e-06, + "loss": 0.79181206, + "num_input_tokens_seen": 159114615, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.22827148, + "step": 7422, + "time_per_iteration": 2.921449661254883 + }, + { + "auxiliary_loss_clip": 0.01443432, + "auxiliary_loss_mlp": 0.01043289, + "balance_loss_clip": 1.26497591, + "balance_loss_mlp": 1.02054417, + "epoch": 0.4462949045543364, + "flos": 23013387648000.0, + "grad_norm": 5.274105488387539, + "language_loss": 0.66193402, + "learning_rate": 2.441298322143784e-06, + "loss": 0.68680131, + "num_input_tokens_seen": 159134370, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.22741699, + "step": 7423, + "time_per_iteration": 4.3420021533966064 + }, + { + "auxiliary_loss_clip": 0.014122, + "auxiliary_loss_mlp": 0.01039112, + "balance_loss_clip": 1.24142373, + "balance_loss_mlp": 1.01639104, + "epoch": 0.44635502780700437, + "flos": 17828624874240.0, + "grad_norm": 1.4949073873160796, + "language_loss": 0.8053019, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.82981503, + "num_input_tokens_seen": 159152540, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.22717285, + "step": 7424, + "time_per_iteration": 2.8859431743621826 + }, + { + "auxiliary_loss_clip": 0.01415983, + "auxiliary_loss_mlp": 0.01036814, + "balance_loss_clip": 1.24519396, + "balance_loss_mlp": 1.01582158, + "epoch": 0.44641515105967233, + "flos": 26699339306880.0, + "grad_norm": 1.3958955526127363, + "language_loss": 0.80781186, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.83233976, + "num_input_tokens_seen": 159173425, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.21008301, + "step": 7425, + "time_per_iteration": 2.982287883758545 + }, + { + "auxiliary_loss_clip": 0.0142577, + "auxiliary_loss_mlp": 0.01038866, + "balance_loss_clip": 1.25116634, + "balance_loss_mlp": 1.01807594, + "epoch": 0.4464752743123403, + "flos": 18921787943040.0, + "grad_norm": 1.4697205323069449, + "language_loss": 0.7802006, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.804847, + "num_input_tokens_seen": 159191210, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.20788574, + "step": 7426, + "time_per_iteration": 2.8496246337890625 + }, + { + "auxiliary_loss_clip": 0.01437178, + "auxiliary_loss_mlp": 0.01037899, + "balance_loss_clip": 1.25839484, + "balance_loss_mlp": 1.01523781, + "epoch": 0.44653539756500826, + "flos": 29582365161600.0, + "grad_norm": 1.734032213497307, + "language_loss": 0.65685761, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.68160844, + "num_input_tokens_seen": 159211755, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.22668457, + "step": 7427, + "time_per_iteration": 2.8883659839630127 + }, + { + "auxiliary_loss_clip": 0.01424946, + "auxiliary_loss_mlp": 0.01035483, + "balance_loss_clip": 1.25230825, + "balance_loss_mlp": 1.013942, + "epoch": 0.44659552081767623, + "flos": 21478534410240.0, + "grad_norm": 1.8068959268247908, + "language_loss": 0.76587546, + "learning_rate": 2.439398799698608e-06, + "loss": 0.79047978, + "num_input_tokens_seen": 159230315, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.21533203, + "step": 7428, + "time_per_iteration": 2.8329596519470215 + }, + { + "auxiliary_loss_clip": 0.01430354, + "auxiliary_loss_mlp": 0.0103529, + "balance_loss_clip": 1.2559855, + "balance_loss_mlp": 1.01260483, + "epoch": 0.4466556440703442, + "flos": 17941049498880.0, + "grad_norm": 1.9080483747007464, + "language_loss": 0.78647584, + "learning_rate": 2.439018845165806e-06, + "loss": 0.81113219, + "num_input_tokens_seen": 159249810, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.22705078, + "step": 7429, + "time_per_iteration": 2.8403327465057373 + }, + { + "auxiliary_loss_clip": 0.01449595, + "auxiliary_loss_mlp": 0.01039257, + "balance_loss_clip": 1.27122378, + "balance_loss_mlp": 1.0175128, + "epoch": 0.44671576732301216, + "flos": 21117751223040.0, + "grad_norm": 2.0755426463919235, + "language_loss": 0.91502219, + "learning_rate": 2.438638873985366e-06, + "loss": 0.93991065, + "num_input_tokens_seen": 159271715, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.21728516, + "step": 7430, + "time_per_iteration": 4.496643781661987 + }, + { + "auxiliary_loss_clip": 0.01460811, + "auxiliary_loss_mlp": 0.01039179, + "balance_loss_clip": 1.27621233, + "balance_loss_mlp": 1.01627874, + "epoch": 0.4467758905756801, + "flos": 23518158347520.0, + "grad_norm": 1.5754898465535359, + "language_loss": 0.8050459, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.83004582, + "num_input_tokens_seen": 159290690, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.22924805, + "step": 7431, + "time_per_iteration": 4.363646030426025 + }, + { + "auxiliary_loss_clip": 0.01444532, + "auxiliary_loss_mlp": 0.01039897, + "balance_loss_clip": 1.26508236, + "balance_loss_mlp": 1.01824892, + "epoch": 0.4468360138283481, + "flos": 18743251386240.0, + "grad_norm": 3.5089398880747296, + "language_loss": 0.81010115, + "learning_rate": 2.437878881739204e-06, + "loss": 0.83494544, + "num_input_tokens_seen": 159309400, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.21643066, + "step": 7432, + "time_per_iteration": 2.879289388656616 + }, + { + "auxiliary_loss_clip": 0.01447123, + "auxiliary_loss_mlp": 0.01039639, + "balance_loss_clip": 1.26615644, + "balance_loss_mlp": 1.01828837, + "epoch": 0.4468961370810161, + "flos": 23487590845440.0, + "grad_norm": 1.77111467861156, + "language_loss": 0.77973545, + "learning_rate": 2.437498860702301e-06, + "loss": 0.80460304, + "num_input_tokens_seen": 159327425, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.21374512, + "step": 7433, + "time_per_iteration": 2.8578684329986572 + }, + { + "auxiliary_loss_clip": 0.01423718, + "auxiliary_loss_mlp": 0.01038352, + "balance_loss_clip": 1.25319302, + "balance_loss_mlp": 1.01821804, + "epoch": 0.4469562603336841, + "flos": 30085642782720.0, + "grad_norm": 1.7043136022113377, + "language_loss": 0.78062773, + "learning_rate": 2.437118823075398e-06, + "loss": 0.80524838, + "num_input_tokens_seen": 159345805, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.20141602, + "step": 7434, + "time_per_iteration": 4.327753305435181 + }, + { + "auxiliary_loss_clip": 0.01452626, + "auxiliary_loss_mlp": 0.01040741, + "balance_loss_clip": 1.27384901, + "balance_loss_mlp": 1.01915264, + "epoch": 0.44701638358635204, + "flos": 22466828736000.0, + "grad_norm": 1.9180184739191748, + "language_loss": 0.65146065, + "learning_rate": 2.436738768872905e-06, + "loss": 0.67639434, + "num_input_tokens_seen": 159364595, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.21582031, + "step": 7435, + "time_per_iteration": 2.891143321990967 + }, + { + "auxiliary_loss_clip": 0.01452013, + "auxiliary_loss_mlp": 0.01045517, + "balance_loss_clip": 1.27589869, + "balance_loss_mlp": 1.02359462, + "epoch": 0.44707650683902, + "flos": 24067658171520.0, + "grad_norm": 1.7525394662809408, + "language_loss": 0.84327221, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.86824751, + "num_input_tokens_seen": 159385265, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.21899414, + "step": 7436, + "time_per_iteration": 2.952116012573242 + }, + { + "auxiliary_loss_clip": 0.01476006, + "auxiliary_loss_mlp": 0.0104901, + "balance_loss_clip": 1.29444933, + "balance_loss_mlp": 1.02558541, + "epoch": 0.44713663009168797, + "flos": 23776923214080.0, + "grad_norm": 1.7127076417866558, + "language_loss": 0.80001497, + "learning_rate": 2.435978610798798e-06, + "loss": 0.82526517, + "num_input_tokens_seen": 159405080, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.23413086, + "step": 7437, + "time_per_iteration": 2.9006364345550537 + }, + { + "auxiliary_loss_clip": 0.01447198, + "auxiliary_loss_mlp": 0.01046993, + "balance_loss_clip": 1.27050757, + "balance_loss_mlp": 1.0259645, + "epoch": 0.44719675334435594, + "flos": 24510117502080.0, + "grad_norm": 1.7145968723486968, + "language_loss": 0.72269893, + "learning_rate": 2.435598506956009e-06, + "loss": 0.74764085, + "num_input_tokens_seen": 159424595, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.21044922, + "step": 7438, + "time_per_iteration": 2.9519965648651123 + }, + { + "auxiliary_loss_clip": 0.01465774, + "auxiliary_loss_mlp": 0.0105505, + "balance_loss_clip": 1.28555, + "balance_loss_mlp": 1.03259051, + "epoch": 0.4472568765970239, + "flos": 29791921668480.0, + "grad_norm": 1.7410615138782424, + "language_loss": 0.67471051, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.69991875, + "num_input_tokens_seen": 159443865, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.22473145, + "step": 7439, + "time_per_iteration": 2.9677162170410156 + }, + { + "auxiliary_loss_clip": 0.01464016, + "auxiliary_loss_mlp": 0.01051448, + "balance_loss_clip": 1.28570533, + "balance_loss_mlp": 1.028512, + "epoch": 0.44731699984969187, + "flos": 24653471587200.0, + "grad_norm": 2.338812884941249, + "language_loss": 0.7453652, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.77051985, + "num_input_tokens_seen": 159464525, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.22924805, + "step": 7440, + "time_per_iteration": 2.968296766281128 + }, + { + "auxiliary_loss_clip": 0.01450001, + "auxiliary_loss_mlp": 0.01056124, + "balance_loss_clip": 1.2745893, + "balance_loss_mlp": 1.03432012, + "epoch": 0.44737712310235983, + "flos": 29466366197760.0, + "grad_norm": 1.6401665789515534, + "language_loss": 0.74672604, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.77178729, + "num_input_tokens_seen": 159486385, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.21801758, + "step": 7441, + "time_per_iteration": 2.9644076824188232 + }, + { + "auxiliary_loss_clip": 0.01463826, + "auxiliary_loss_mlp": 0.01052088, + "balance_loss_clip": 1.28539276, + "balance_loss_mlp": 1.03035581, + "epoch": 0.4474372463550278, + "flos": 24906807077760.0, + "grad_norm": 2.5266767964216155, + "language_loss": 0.75366676, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.77882588, + "num_input_tokens_seen": 159503880, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.21740723, + "step": 7442, + "time_per_iteration": 2.9294800758361816 + }, + { + "auxiliary_loss_clip": 0.01466714, + "auxiliary_loss_mlp": 0.01053616, + "balance_loss_clip": 1.28352773, + "balance_loss_mlp": 1.03137124, + "epoch": 0.44749736960769576, + "flos": 33194377537920.0, + "grad_norm": 2.5163550588924246, + "language_loss": 0.74902058, + "learning_rate": 2.433697740261273e-06, + "loss": 0.7742238, + "num_input_tokens_seen": 159522980, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.22241211, + "step": 7443, + "time_per_iteration": 2.999361038208008 + }, + { + "auxiliary_loss_clip": 0.01452148, + "auxiliary_loss_mlp": 0.01044902, + "balance_loss_clip": 1.27550721, + "balance_loss_mlp": 1.02189469, + "epoch": 0.4475574928603637, + "flos": 21082342527360.0, + "grad_norm": 1.6593195287990052, + "language_loss": 0.78182542, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.8067959, + "num_input_tokens_seen": 159543340, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.23022461, + "step": 7444, + "time_per_iteration": 2.8841001987457275 + }, + { + "auxiliary_loss_clip": 0.0145058, + "auxiliary_loss_mlp": 0.01051158, + "balance_loss_clip": 1.27687073, + "balance_loss_mlp": 1.02956939, + "epoch": 0.4476176161130317, + "flos": 21870156568320.0, + "grad_norm": 3.515577080400395, + "language_loss": 0.84704757, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.87206495, + "num_input_tokens_seen": 159558210, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.21594238, + "step": 7445, + "time_per_iteration": 2.887338876724243 + }, + { + "auxiliary_loss_clip": 0.01459187, + "auxiliary_loss_mlp": 0.01043256, + "balance_loss_clip": 1.27976322, + "balance_loss_mlp": 1.02111936, + "epoch": 0.4476777393656997, + "flos": 22538912981760.0, + "grad_norm": 1.8969104639756431, + "language_loss": 0.65332878, + "learning_rate": 2.432557082778765e-06, + "loss": 0.67835319, + "num_input_tokens_seen": 159577920, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.22143555, + "step": 7446, + "time_per_iteration": 2.9013113975524902 + }, + { + "auxiliary_loss_clip": 0.01286659, + "auxiliary_loss_mlp": 0.0102307, + "balance_loss_clip": 1.18097556, + "balance_loss_mlp": 1.00189805, + "epoch": 0.4477378626183677, + "flos": 49043671810560.0, + "grad_norm": 0.7475661504257273, + "language_loss": 0.50419843, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.52729571, + "num_input_tokens_seen": 159632295, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.21191406, + "step": 7447, + "time_per_iteration": 3.246619701385498 + }, + { + "auxiliary_loss_clip": 0.01287431, + "auxiliary_loss_mlp": 0.01033138, + "balance_loss_clip": 1.1807673, + "balance_loss_mlp": 1.00853336, + "epoch": 0.44779798587103564, + "flos": 56573000346240.0, + "grad_norm": 0.7609810048795373, + "language_loss": 0.59391403, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.61711979, + "num_input_tokens_seen": 159698435, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.24511719, + "step": 7448, + "time_per_iteration": 3.391934871673584 + }, + { + "auxiliary_loss_clip": 0.01447031, + "auxiliary_loss_mlp": 0.01040338, + "balance_loss_clip": 1.27139318, + "balance_loss_mlp": 1.01930952, + "epoch": 0.4478581091237036, + "flos": 46514669685120.0, + "grad_norm": 1.6320106180525467, + "language_loss": 0.59870529, + "learning_rate": 2.431416277672789e-06, + "loss": 0.62357903, + "num_input_tokens_seen": 159722150, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.21032715, + "step": 7449, + "time_per_iteration": 3.109038829803467 + }, + { + "auxiliary_loss_clip": 0.01459293, + "auxiliary_loss_mlp": 0.01036507, + "balance_loss_clip": 1.28178048, + "balance_loss_mlp": 1.01509678, + "epoch": 0.4479182323763716, + "flos": 20824301577600.0, + "grad_norm": 2.1445812559509285, + "language_loss": 0.81117511, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.83613312, + "num_input_tokens_seen": 159740550, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.21386719, + "step": 7450, + "time_per_iteration": 2.895526885986328 + }, + { + "auxiliary_loss_clip": 0.01458907, + "auxiliary_loss_mlp": 0.01038347, + "balance_loss_clip": 1.28208041, + "balance_loss_mlp": 1.01703238, + "epoch": 0.44797835562903954, + "flos": 14253966720000.0, + "grad_norm": 5.982480744081226, + "language_loss": 0.80821794, + "learning_rate": 2.430655659114697e-06, + "loss": 0.8331905, + "num_input_tokens_seen": 159758245, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.2130127, + "step": 7451, + "time_per_iteration": 2.861950397491455 + }, + { + "auxiliary_loss_clip": 0.01286749, + "auxiliary_loss_mlp": 0.01038385, + "balance_loss_clip": 1.17840767, + "balance_loss_mlp": 1.01225424, + "epoch": 0.4480384788817075, + "flos": 63563850806400.0, + "grad_norm": 0.832111098264223, + "language_loss": 0.62842786, + "learning_rate": 2.430275325332681e-06, + "loss": 0.65167916, + "num_input_tokens_seen": 159826790, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.26171875, + "step": 7452, + "time_per_iteration": 3.4893178939819336 + }, + { + "auxiliary_loss_clip": 0.01452983, + "auxiliary_loss_mlp": 0.01039144, + "balance_loss_clip": 1.27742028, + "balance_loss_mlp": 1.01760292, + "epoch": 0.44809860213437547, + "flos": 21662545587840.0, + "grad_norm": 1.8349565943458381, + "language_loss": 0.63530737, + "learning_rate": 2.429894975234582e-06, + "loss": 0.66022861, + "num_input_tokens_seen": 159845805, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.2154541, + "step": 7453, + "time_per_iteration": 2.8635528087615967 + }, + { + "auxiliary_loss_clip": 0.01290015, + "auxiliary_loss_mlp": 0.01052756, + "balance_loss_clip": 1.18426311, + "balance_loss_mlp": 1.0290091, + "epoch": 0.44815872538704343, + "flos": 69221595168000.0, + "grad_norm": 0.7659197684080481, + "language_loss": 0.57060099, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.59402871, + "num_input_tokens_seen": 159898860, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.23730469, + "step": 7454, + "time_per_iteration": 3.209456443786621 + }, + { + "auxiliary_loss_clip": 0.01457489, + "auxiliary_loss_mlp": 0.01034207, + "balance_loss_clip": 1.28057146, + "balance_loss_mlp": 1.01388192, + "epoch": 0.4482188486397114, + "flos": 12604336128000.0, + "grad_norm": 2.0588887404233205, + "language_loss": 0.76192296, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.7868399, + "num_input_tokens_seen": 159911555, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.20324707, + "step": 7455, + "time_per_iteration": 2.846205472946167 + }, + { + "auxiliary_loss_clip": 0.01446942, + "auxiliary_loss_mlp": 0.0103713, + "balance_loss_clip": 1.27270889, + "balance_loss_mlp": 1.0163281, + "epoch": 0.44827897189237936, + "flos": 34071423603840.0, + "grad_norm": 2.9217376758997613, + "language_loss": 0.77103794, + "learning_rate": 2.428753827188016e-06, + "loss": 0.79587865, + "num_input_tokens_seen": 159931470, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.20800781, + "step": 7456, + "time_per_iteration": 2.9845423698425293 + }, + { + "auxiliary_loss_clip": 0.01432573, + "auxiliary_loss_mlp": 0.01036503, + "balance_loss_clip": 1.26248276, + "balance_loss_mlp": 1.01537967, + "epoch": 0.44833909514504733, + "flos": 25156206270720.0, + "grad_norm": 2.2425335206271293, + "language_loss": 0.7721318, + "learning_rate": 2.428373411969818e-06, + "loss": 0.79682255, + "num_input_tokens_seen": 159946115, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.21130371, + "step": 7457, + "time_per_iteration": 2.99906325340271 + }, + { + "auxiliary_loss_clip": 0.0146291, + "auxiliary_loss_mlp": 0.01038569, + "balance_loss_clip": 1.28563952, + "balance_loss_mlp": 1.01698017, + "epoch": 0.4483992183977153, + "flos": 16188540935040.0, + "grad_norm": 2.189848730968719, + "language_loss": 0.6884315, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.71344626, + "num_input_tokens_seen": 159963915, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.21606445, + "step": 7458, + "time_per_iteration": 2.9102818965911865 + }, + { + "auxiliary_loss_clip": 0.01459034, + "auxiliary_loss_mlp": 0.01040997, + "balance_loss_clip": 1.28060472, + "balance_loss_mlp": 1.01894331, + "epoch": 0.44845934165038326, + "flos": 17754278388480.0, + "grad_norm": 3.2145301421326864, + "language_loss": 0.72478652, + "learning_rate": 2.427612532815961e-06, + "loss": 0.74978685, + "num_input_tokens_seen": 159982140, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.22070312, + "step": 7459, + "time_per_iteration": 4.28303599357605 + }, + { + "auxiliary_loss_clip": 0.01442549, + "auxiliary_loss_mlp": 0.01036276, + "balance_loss_clip": 1.26732481, + "balance_loss_mlp": 1.01461601, + "epoch": 0.4485194649030513, + "flos": 21846104317440.0, + "grad_norm": 3.6388965441206076, + "language_loss": 0.69917035, + "learning_rate": 2.427232068909154e-06, + "loss": 0.72395855, + "num_input_tokens_seen": 160002280, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.2166748, + "step": 7460, + "time_per_iteration": 2.8555076122283936 + }, + { + "auxiliary_loss_clip": 0.01455118, + "auxiliary_loss_mlp": 0.01035277, + "balance_loss_clip": 1.2781688, + "balance_loss_mlp": 1.01442778, + "epoch": 0.44857958815571924, + "flos": 20094229180800.0, + "grad_norm": 2.06958834698811, + "language_loss": 0.78792644, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.81283033, + "num_input_tokens_seen": 160020260, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.20849609, + "step": 7461, + "time_per_iteration": 2.8912193775177 + }, + { + "auxiliary_loss_clip": 0.01445945, + "auxiliary_loss_mlp": 0.01036703, + "balance_loss_clip": 1.27021456, + "balance_loss_mlp": 1.01522112, + "epoch": 0.4486397114083872, + "flos": 27065099422080.0, + "grad_norm": 1.6910666250632658, + "language_loss": 0.69510877, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.71993518, + "num_input_tokens_seen": 160040240, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.21496582, + "step": 7462, + "time_per_iteration": 2.9360218048095703 + }, + { + "auxiliary_loss_clip": 0.01267722, + "auxiliary_loss_mlp": 0.0105431, + "balance_loss_clip": 1.16260195, + "balance_loss_mlp": 1.02264833, + "epoch": 0.4486998346610552, + "flos": 67349422811520.0, + "grad_norm": 0.7564686833911273, + "language_loss": 0.54421496, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.56743526, + "num_input_tokens_seen": 160093865, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.31640625, + "step": 7463, + "time_per_iteration": 3.366792678833008 + }, + { + "auxiliary_loss_clip": 0.01432669, + "auxiliary_loss_mlp": 0.01035985, + "balance_loss_clip": 1.25962377, + "balance_loss_mlp": 1.0152421, + "epoch": 0.44875995791372314, + "flos": 27648424373760.0, + "grad_norm": 2.409198490817067, + "language_loss": 0.76929086, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.79397738, + "num_input_tokens_seen": 160113590, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.20727539, + "step": 7464, + "time_per_iteration": 2.929297924041748 + }, + { + "auxiliary_loss_clip": 0.01437866, + "auxiliary_loss_mlp": 0.01037925, + "balance_loss_clip": 1.26538408, + "balance_loss_mlp": 1.01700401, + "epoch": 0.4488200811663911, + "flos": 13013739492480.0, + "grad_norm": 1.8836759764309163, + "language_loss": 0.75280702, + "learning_rate": 2.425329506653441e-06, + "loss": 0.777565, + "num_input_tokens_seen": 160131795, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.20922852, + "step": 7465, + "time_per_iteration": 4.273135423660278 + }, + { + "auxiliary_loss_clip": 0.01458881, + "auxiliary_loss_mlp": 0.01039361, + "balance_loss_clip": 1.27751434, + "balance_loss_mlp": 1.01671088, + "epoch": 0.44888020441905907, + "flos": 27501314970240.0, + "grad_norm": 2.482491294351102, + "language_loss": 0.81031549, + "learning_rate": 2.424948945758966e-06, + "loss": 0.83529788, + "num_input_tokens_seen": 160150635, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.22668457, + "step": 7466, + "time_per_iteration": 4.428941488265991 + }, + { + "auxiliary_loss_clip": 0.01447325, + "auxiliary_loss_mlp": 0.01035596, + "balance_loss_clip": 1.26972961, + "balance_loss_mlp": 1.01457918, + "epoch": 0.44894032767172704, + "flos": 18268776720000.0, + "grad_norm": 2.428443136768659, + "language_loss": 0.8183164, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.84314561, + "num_input_tokens_seen": 160168615, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.21020508, + "step": 7467, + "time_per_iteration": 2.840721607208252 + }, + { + "auxiliary_loss_clip": 0.01426668, + "auxiliary_loss_mlp": 0.01036393, + "balance_loss_clip": 1.25890791, + "balance_loss_mlp": 1.0162344, + "epoch": 0.449000450924395, + "flos": 21589782670080.0, + "grad_norm": 1.8721465214090531, + "language_loss": 0.76436865, + "learning_rate": 2.424187775642129e-06, + "loss": 0.78899926, + "num_input_tokens_seen": 160187295, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.20153809, + "step": 7468, + "time_per_iteration": 4.246603965759277 + }, + { + "auxiliary_loss_clip": 0.01440446, + "auxiliary_loss_mlp": 0.0103197, + "balance_loss_clip": 1.2667017, + "balance_loss_mlp": 1.01261044, + "epoch": 0.44906057417706297, + "flos": 17976955887360.0, + "grad_norm": 1.7031083381399021, + "language_loss": 0.72086513, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.74558926, + "num_input_tokens_seen": 160205115, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.19348145, + "step": 7469, + "time_per_iteration": 2.835515022277832 + }, + { + "auxiliary_loss_clip": 0.01447459, + "auxiliary_loss_mlp": 0.01037814, + "balance_loss_clip": 1.27132416, + "balance_loss_mlp": 1.01731014, + "epoch": 0.44912069742973093, + "flos": 20056874958720.0, + "grad_norm": 1.7276932145301216, + "language_loss": 0.72479188, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.7496447, + "num_input_tokens_seen": 160222580, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.20483398, + "step": 7470, + "time_per_iteration": 2.8744468688964844 + }, + { + "auxiliary_loss_clip": 0.01437768, + "auxiliary_loss_mlp": 0.01035765, + "balance_loss_clip": 1.26199126, + "balance_loss_mlp": 1.01497459, + "epoch": 0.4491808206823989, + "flos": 21043585716480.0, + "grad_norm": 1.903701256449809, + "language_loss": 0.77645975, + "learning_rate": 2.423045899863634e-06, + "loss": 0.80119514, + "num_input_tokens_seen": 160241520, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.20776367, + "step": 7471, + "time_per_iteration": 2.870345115661621 + }, + { + "auxiliary_loss_clip": 0.01437462, + "auxiliary_loss_mlp": 0.0104025, + "balance_loss_clip": 1.26434135, + "balance_loss_mlp": 1.01912582, + "epoch": 0.44924094393506686, + "flos": 22977662238720.0, + "grad_norm": 1.8140971684540312, + "language_loss": 0.71987677, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.74465388, + "num_input_tokens_seen": 160261815, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.21130371, + "step": 7472, + "time_per_iteration": 2.8750712871551514 + }, + { + "auxiliary_loss_clip": 0.01257142, + "auxiliary_loss_mlp": 0.01040605, + "balance_loss_clip": 1.15401399, + "balance_loss_mlp": 1.02191305, + "epoch": 0.4493010671877349, + "flos": 59262739839360.0, + "grad_norm": 0.7421910229573018, + "language_loss": 0.61744261, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.64042008, + "num_input_tokens_seen": 160317070, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.18652344, + "step": 7473, + "time_per_iteration": 3.3580939769744873 + }, + { + "auxiliary_loss_clip": 0.01436203, + "auxiliary_loss_mlp": 0.01043472, + "balance_loss_clip": 1.26181078, + "balance_loss_mlp": 1.02174044, + "epoch": 0.44936119044040285, + "flos": 18014083885440.0, + "grad_norm": 1.8782165883417057, + "language_loss": 0.79247683, + "learning_rate": 2.421903879707657e-06, + "loss": 0.81727356, + "num_input_tokens_seen": 160334980, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.21728516, + "step": 7474, + "time_per_iteration": 2.8627736568450928 + }, + { + "auxiliary_loss_clip": 0.01415184, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.24726772, + "balance_loss_mlp": 1.01634765, + "epoch": 0.4494213136930708, + "flos": 21261783980160.0, + "grad_norm": 1.585421670394442, + "language_loss": 0.72920746, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.75374484, + "num_input_tokens_seen": 160354500, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.2220459, + "step": 7475, + "time_per_iteration": 2.8839316368103027 + }, + { + "auxiliary_loss_clip": 0.01430269, + "auxiliary_loss_mlp": 0.0103322, + "balance_loss_clip": 1.25745916, + "balance_loss_mlp": 1.01274037, + "epoch": 0.4494814369457388, + "flos": 27430452334080.0, + "grad_norm": 2.101113807612924, + "language_loss": 0.77648485, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.80111974, + "num_input_tokens_seen": 160373650, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.20483398, + "step": 7476, + "time_per_iteration": 2.9087917804718018 + }, + { + "auxiliary_loss_clip": 0.0146182, + "auxiliary_loss_mlp": 0.01043384, + "balance_loss_clip": 1.28153658, + "balance_loss_mlp": 1.02195013, + "epoch": 0.44954156019840674, + "flos": 22863608801280.0, + "grad_norm": 5.374391106002564, + "language_loss": 0.72408384, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.74913585, + "num_input_tokens_seen": 160393430, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.21435547, + "step": 7477, + "time_per_iteration": 2.8622586727142334 + }, + { + "auxiliary_loss_clip": 0.01458536, + "auxiliary_loss_mlp": 0.0104638, + "balance_loss_clip": 1.28026235, + "balance_loss_mlp": 1.02414703, + "epoch": 0.4496016834510747, + "flos": 17210388919680.0, + "grad_norm": 2.4223964625195307, + "language_loss": 0.69714642, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.72219563, + "num_input_tokens_seen": 160410545, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.22229004, + "step": 7478, + "time_per_iteration": 2.8638124465942383 + }, + { + "auxiliary_loss_clip": 0.01422922, + "auxiliary_loss_mlp": 0.01038628, + "balance_loss_clip": 1.25216126, + "balance_loss_mlp": 1.01829112, + "epoch": 0.4496618067037427, + "flos": 18925950464640.0, + "grad_norm": 1.8862380383256183, + "language_loss": 0.89958709, + "learning_rate": 2.420000193000779e-06, + "loss": 0.92420256, + "num_input_tokens_seen": 160428105, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.20324707, + "step": 7479, + "time_per_iteration": 2.8214612007141113 + }, + { + "auxiliary_loss_clip": 0.01426496, + "auxiliary_loss_mlp": 0.01045291, + "balance_loss_clip": 1.25413918, + "balance_loss_mlp": 1.02296293, + "epoch": 0.44972192995641064, + "flos": 21041232986880.0, + "grad_norm": 1.8549416353174877, + "language_loss": 0.76747483, + "learning_rate": 2.419619407822302e-06, + "loss": 0.7921927, + "num_input_tokens_seen": 160448815, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.2232666, + "step": 7480, + "time_per_iteration": 2.8730101585388184 + }, + { + "auxiliary_loss_clip": 0.0145, + "auxiliary_loss_mlp": 0.01043372, + "balance_loss_clip": 1.27159882, + "balance_loss_mlp": 1.0213542, + "epoch": 0.4497820532090786, + "flos": 20786811621120.0, + "grad_norm": 3.138225724701109, + "language_loss": 0.8145957, + "learning_rate": 2.419238606731815e-06, + "loss": 0.83952945, + "num_input_tokens_seen": 160465940, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.22033691, + "step": 7481, + "time_per_iteration": 2.9008326530456543 + }, + { + "auxiliary_loss_clip": 0.0142317, + "auxiliary_loss_mlp": 0.010368, + "balance_loss_clip": 1.2541945, + "balance_loss_mlp": 1.01559234, + "epoch": 0.44984217646174657, + "flos": 33815735383680.0, + "grad_norm": 1.8723500932952248, + "language_loss": 0.69202185, + "learning_rate": 2.418857789743758e-06, + "loss": 0.71662158, + "num_input_tokens_seen": 160486710, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.21203613, + "step": 7482, + "time_per_iteration": 2.960660696029663 + }, + { + "auxiliary_loss_clip": 0.01449054, + "auxiliary_loss_mlp": 0.01040692, + "balance_loss_clip": 1.27348053, + "balance_loss_mlp": 1.01957989, + "epoch": 0.44990229971441453, + "flos": 15525485366400.0, + "grad_norm": 2.118701762017151, + "language_loss": 0.86115086, + "learning_rate": 2.418476956872571e-06, + "loss": 0.88604832, + "num_input_tokens_seen": 160503405, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.21118164, + "step": 7483, + "time_per_iteration": 2.8056747913360596 + }, + { + "auxiliary_loss_clip": 0.01438623, + "auxiliary_loss_mlp": 0.01044715, + "balance_loss_clip": 1.26432467, + "balance_loss_mlp": 1.02243495, + "epoch": 0.4499624229670825, + "flos": 29873959770240.0, + "grad_norm": 1.8870778615743897, + "language_loss": 0.81847215, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.84330559, + "num_input_tokens_seen": 160525080, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.22277832, + "step": 7484, + "time_per_iteration": 2.903496026992798 + }, + { + "auxiliary_loss_clip": 0.01448123, + "auxiliary_loss_mlp": 0.01041787, + "balance_loss_clip": 1.26934886, + "balance_loss_mlp": 1.01989985, + "epoch": 0.45002254621975046, + "flos": 18522474168960.0, + "grad_norm": 3.2706173741177706, + "language_loss": 0.76401174, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.78891087, + "num_input_tokens_seen": 160540895, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.21887207, + "step": 7485, + "time_per_iteration": 2.8279008865356445 + }, + { + "auxiliary_loss_clip": 0.01261981, + "auxiliary_loss_mlp": 0.01026651, + "balance_loss_clip": 1.15856123, + "balance_loss_mlp": 1.00786352, + "epoch": 0.4500826694724185, + "flos": 70448882382720.0, + "grad_norm": 0.8017417792601544, + "language_loss": 0.58808845, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.61097473, + "num_input_tokens_seen": 160598270, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.1875, + "step": 7486, + "time_per_iteration": 3.3784050941467285 + }, + { + "auxiliary_loss_clip": 0.0143746, + "auxiliary_loss_mlp": 0.01036493, + "balance_loss_clip": 1.26482177, + "balance_loss_mlp": 1.01583433, + "epoch": 0.45014279272508645, + "flos": 15787417368960.0, + "grad_norm": 2.670169288068218, + "language_loss": 0.84285939, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.86759889, + "num_input_tokens_seen": 160614720, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.20654297, + "step": 7487, + "time_per_iteration": 2.8163774013519287 + }, + { + "auxiliary_loss_clip": 0.01422901, + "auxiliary_loss_mlp": 0.01039106, + "balance_loss_clip": 1.2509985, + "balance_loss_mlp": 1.01776791, + "epoch": 0.4502029159777544, + "flos": 21809790725760.0, + "grad_norm": 6.331304448059537, + "language_loss": 0.78283858, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.80745864, + "num_input_tokens_seen": 160635170, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.21325684, + "step": 7488, + "time_per_iteration": 2.8894307613372803 + }, + { + "auxiliary_loss_clip": 0.014591, + "auxiliary_loss_mlp": 0.01038077, + "balance_loss_clip": 1.2772063, + "balance_loss_mlp": 1.01683402, + "epoch": 0.4502630392304224, + "flos": 28779891805440.0, + "grad_norm": 2.110019595220429, + "language_loss": 0.7311337, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.75610542, + "num_input_tokens_seen": 160654490, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.21252441, + "step": 7489, + "time_per_iteration": 2.910886764526367 + }, + { + "auxiliary_loss_clip": 0.01445539, + "auxiliary_loss_mlp": 0.01040496, + "balance_loss_clip": 1.27057791, + "balance_loss_mlp": 1.01829934, + "epoch": 0.45032316248309034, + "flos": 15850724123520.0, + "grad_norm": 2.7029009370559227, + "language_loss": 0.70521963, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.73008001, + "num_input_tokens_seen": 160669400, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.22216797, + "step": 7490, + "time_per_iteration": 2.837088108062744 + }, + { + "auxiliary_loss_clip": 0.01248996, + "auxiliary_loss_mlp": 0.01022275, + "balance_loss_clip": 1.14706624, + "balance_loss_mlp": 1.00129414, + "epoch": 0.4503832857357583, + "flos": 57884225944320.0, + "grad_norm": 0.7209523010576697, + "language_loss": 0.56685042, + "learning_rate": 2.415429723843495e-06, + "loss": 0.58956313, + "num_input_tokens_seen": 160733820, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.20996094, + "step": 7491, + "time_per_iteration": 3.344919443130493 + }, + { + "auxiliary_loss_clip": 0.01424593, + "auxiliary_loss_mlp": 0.01034498, + "balance_loss_clip": 1.25431013, + "balance_loss_mlp": 1.01293302, + "epoch": 0.4504434089884263, + "flos": 23888533432320.0, + "grad_norm": 1.889586817734969, + "language_loss": 0.80400872, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.82859963, + "num_input_tokens_seen": 160753175, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.2154541, + "step": 7492, + "time_per_iteration": 2.886906147003174 + }, + { + "auxiliary_loss_clip": 0.01459423, + "auxiliary_loss_mlp": 0.01036819, + "balance_loss_clip": 1.2780664, + "balance_loss_mlp": 1.01538491, + "epoch": 0.45050353224109424, + "flos": 17793985340160.0, + "grad_norm": 2.3645741983422393, + "language_loss": 0.93161088, + "learning_rate": 2.4146677577659573e-06, + "loss": 0.95657325, + "num_input_tokens_seen": 160768310, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.2142334, + "step": 7493, + "time_per_iteration": 2.795440673828125 + }, + { + "auxiliary_loss_clip": 0.01248709, + "auxiliary_loss_mlp": 0.01031111, + "balance_loss_clip": 1.14543366, + "balance_loss_mlp": 1.00383615, + "epoch": 0.4505636554937622, + "flos": 65092188792960.0, + "grad_norm": 0.7967103253162479, + "language_loss": 0.62853253, + "learning_rate": 2.4142867511336e-06, + "loss": 0.65133083, + "num_input_tokens_seen": 160827370, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.2734375, + "step": 7494, + "time_per_iteration": 4.698879241943359 + }, + { + "auxiliary_loss_clip": 0.01430094, + "auxiliary_loss_mlp": 0.01038126, + "balance_loss_clip": 1.25814438, + "balance_loss_mlp": 1.01666856, + "epoch": 0.45062377874643017, + "flos": 22210190375040.0, + "grad_norm": 1.454704036865865, + "language_loss": 0.82430857, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.8489908, + "num_input_tokens_seen": 160849140, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.21447754, + "step": 7495, + "time_per_iteration": 2.8762097358703613 + }, + { + "auxiliary_loss_clip": 0.01442721, + "auxiliary_loss_mlp": 0.01037317, + "balance_loss_clip": 1.26807511, + "balance_loss_mlp": 1.0153352, + "epoch": 0.45068390199909814, + "flos": 37684476610560.0, + "grad_norm": 2.452226608069599, + "language_loss": 0.86793792, + "learning_rate": 2.41352469075395e-06, + "loss": 0.89273822, + "num_input_tokens_seen": 160871280, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.21972656, + "step": 7496, + "time_per_iteration": 2.9730031490325928 + }, + { + "auxiliary_loss_clip": 0.01441125, + "auxiliary_loss_mlp": 0.0103478, + "balance_loss_clip": 1.26536024, + "balance_loss_mlp": 1.01322746, + "epoch": 0.4507440252517661, + "flos": 22310806106880.0, + "grad_norm": 1.9838029426468118, + "language_loss": 0.76244855, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.7872076, + "num_input_tokens_seen": 160888625, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.21557617, + "step": 7497, + "time_per_iteration": 2.8641481399536133 + }, + { + "auxiliary_loss_clip": 0.01453662, + "auxiliary_loss_mlp": 0.01042167, + "balance_loss_clip": 1.27555704, + "balance_loss_mlp": 1.01789641, + "epoch": 0.45080414850443407, + "flos": 13196212346880.0, + "grad_norm": 1.8824213227830577, + "language_loss": 0.75531942, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.78027773, + "num_input_tokens_seen": 160907040, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.24279785, + "step": 7498, + "time_per_iteration": 2.8473942279815674 + }, + { + "auxiliary_loss_clip": 0.01445746, + "auxiliary_loss_mlp": 0.01036305, + "balance_loss_clip": 1.26861322, + "balance_loss_mlp": 1.01490676, + "epoch": 0.4508642717571021, + "flos": 21955045092480.0, + "grad_norm": 3.4704450637633006, + "language_loss": 0.71558595, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.74040645, + "num_input_tokens_seen": 160927115, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.21398926, + "step": 7499, + "time_per_iteration": 2.9410297870635986 + }, + { + "auxiliary_loss_clip": 0.01456927, + "auxiliary_loss_mlp": 0.01036562, + "balance_loss_clip": 1.27760077, + "balance_loss_mlp": 1.01475835, + "epoch": 0.45092439500977005, + "flos": 23377835664000.0, + "grad_norm": 2.185712495579001, + "language_loss": 0.78303993, + "learning_rate": 2.412000381939477e-06, + "loss": 0.80797487, + "num_input_tokens_seen": 160944405, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.21813965, + "step": 7500, + "time_per_iteration": 4.355379104614258 + }, + { + "auxiliary_loss_clip": 0.01444863, + "auxiliary_loss_mlp": 0.01039606, + "balance_loss_clip": 1.27034068, + "balance_loss_mlp": 1.01843452, + "epoch": 0.450984518262438, + "flos": 20781608469120.0, + "grad_norm": 1.7795001522087839, + "language_loss": 0.63499457, + "learning_rate": 2.411619265641992e-06, + "loss": 0.65983927, + "num_input_tokens_seen": 160961345, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.21154785, + "step": 7501, + "time_per_iteration": 4.305271148681641 + }, + { + "auxiliary_loss_clip": 0.01449767, + "auxiliary_loss_mlp": 0.01037959, + "balance_loss_clip": 1.27252269, + "balance_loss_mlp": 1.0168947, + "epoch": 0.451044641515106, + "flos": 17715023884800.0, + "grad_norm": 5.33127561482403, + "language_loss": 0.85737801, + "learning_rate": 2.411238133735863e-06, + "loss": 0.88225532, + "num_input_tokens_seen": 160977330, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.21069336, + "step": 7502, + "time_per_iteration": 2.8434741497039795 + }, + { + "auxiliary_loss_clip": 0.01435536, + "auxiliary_loss_mlp": 0.01033121, + "balance_loss_clip": 1.26301837, + "balance_loss_mlp": 1.01215184, + "epoch": 0.45110476476777395, + "flos": 20604384011520.0, + "grad_norm": 1.2985438445518493, + "language_loss": 0.79659235, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.82127887, + "num_input_tokens_seen": 160997280, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.2097168, + "step": 7503, + "time_per_iteration": 2.8235387802124023 + }, + { + "auxiliary_loss_clip": 0.01435912, + "auxiliary_loss_mlp": 0.0103575, + "balance_loss_clip": 1.26547575, + "balance_loss_mlp": 1.01547265, + "epoch": 0.4511648880204419, + "flos": 16042562651520.0, + "grad_norm": 1.7746088312129782, + "language_loss": 0.81402564, + "learning_rate": 2.410475823155484e-06, + "loss": 0.83874226, + "num_input_tokens_seen": 161014235, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.20263672, + "step": 7504, + "time_per_iteration": 4.22543740272522 + }, + { + "auxiliary_loss_clip": 0.01429615, + "auxiliary_loss_mlp": 0.01036319, + "balance_loss_clip": 1.25814629, + "balance_loss_mlp": 1.01489758, + "epoch": 0.4512250112731099, + "flos": 23987022658560.0, + "grad_norm": 1.7609570283478744, + "language_loss": 0.64074606, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.66540539, + "num_input_tokens_seen": 161032360, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.21435547, + "step": 7505, + "time_per_iteration": 2.8386025428771973 + }, + { + "auxiliary_loss_clip": 0.01248181, + "auxiliary_loss_mlp": 0.01036055, + "balance_loss_clip": 1.14251101, + "balance_loss_mlp": 1.01249933, + "epoch": 0.45128513452577784, + "flos": 71496338924160.0, + "grad_norm": 0.8398333130912533, + "language_loss": 0.58992809, + "learning_rate": 2.409713450313968e-06, + "loss": 0.61277044, + "num_input_tokens_seen": 161091360, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.23535156, + "step": 7506, + "time_per_iteration": 3.3831698894500732 + }, + { + "auxiliary_loss_clip": 0.0143649, + "auxiliary_loss_mlp": 0.01032696, + "balance_loss_clip": 1.26494968, + "balance_loss_mlp": 1.01114357, + "epoch": 0.4513452577784458, + "flos": 22100978131200.0, + "grad_norm": 1.6372141295113214, + "language_loss": 0.79684627, + "learning_rate": 2.40933224058142e-06, + "loss": 0.82153809, + "num_input_tokens_seen": 161110825, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.21533203, + "step": 7507, + "time_per_iteration": 2.891263008117676 + }, + { + "auxiliary_loss_clip": 0.01444575, + "auxiliary_loss_mlp": 0.01039665, + "balance_loss_clip": 1.26988423, + "balance_loss_mlp": 1.01610923, + "epoch": 0.4514053810311138, + "flos": 24286173148800.0, + "grad_norm": 1.6563284639354048, + "language_loss": 0.74030221, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.76514459, + "num_input_tokens_seen": 161130685, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.23571777, + "step": 7508, + "time_per_iteration": 2.889578104019165 + }, + { + "auxiliary_loss_clip": 0.01435001, + "auxiliary_loss_mlp": 0.01036936, + "balance_loss_clip": 1.26435709, + "balance_loss_mlp": 1.01630116, + "epoch": 0.45146550428378174, + "flos": 17893696176000.0, + "grad_norm": 2.0395573046248705, + "language_loss": 0.80340934, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.8281287, + "num_input_tokens_seen": 161147555, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.2064209, + "step": 7509, + "time_per_iteration": 2.8748972415924072 + }, + { + "auxiliary_loss_clip": 0.01432107, + "auxiliary_loss_mlp": 0.010323, + "balance_loss_clip": 1.26123118, + "balance_loss_mlp": 1.01102138, + "epoch": 0.4515256275364497, + "flos": 24254112568320.0, + "grad_norm": 1.945381163595029, + "language_loss": 0.74407256, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.76871663, + "num_input_tokens_seen": 161166255, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.21264648, + "step": 7510, + "time_per_iteration": 2.8546926975250244 + }, + { + "auxiliary_loss_clip": 0.01451483, + "auxiliary_loss_mlp": 0.01037194, + "balance_loss_clip": 1.27562249, + "balance_loss_mlp": 1.01574802, + "epoch": 0.45158575078911767, + "flos": 20640697603200.0, + "grad_norm": 2.6323798126588067, + "language_loss": 0.77695966, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.80184644, + "num_input_tokens_seen": 161184720, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.21447754, + "step": 7511, + "time_per_iteration": 2.8518483638763428 + }, + { + "auxiliary_loss_clip": 0.01451993, + "auxiliary_loss_mlp": 0.01036825, + "balance_loss_clip": 1.27621281, + "balance_loss_mlp": 1.01487875, + "epoch": 0.45164587404178563, + "flos": 23337269061120.0, + "grad_norm": 1.8363495730815926, + "language_loss": 0.79831094, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.82319915, + "num_input_tokens_seen": 161204360, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.21948242, + "step": 7512, + "time_per_iteration": 2.858537197113037 + }, + { + "auxiliary_loss_clip": 0.0146753, + "auxiliary_loss_mlp": 0.01038641, + "balance_loss_clip": 1.285707, + "balance_loss_mlp": 1.01642025, + "epoch": 0.45170599729445365, + "flos": 23816630165760.0, + "grad_norm": 2.2233736709960734, + "language_loss": 0.88486713, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.9099288, + "num_input_tokens_seen": 161223575, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.22216797, + "step": 7513, + "time_per_iteration": 2.869544506072998 + }, + { + "auxiliary_loss_clip": 0.01423503, + "auxiliary_loss_mlp": 0.0103671, + "balance_loss_clip": 1.25698113, + "balance_loss_mlp": 1.01556206, + "epoch": 0.4517661205471216, + "flos": 23523225765120.0, + "grad_norm": 1.4950478906898914, + "language_loss": 0.67995822, + "learning_rate": 2.406663338649419e-06, + "loss": 0.70456028, + "num_input_tokens_seen": 161243805, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.21142578, + "step": 7514, + "time_per_iteration": 2.889476776123047 + }, + { + "auxiliary_loss_clip": 0.01463059, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.28647804, + "balance_loss_mlp": 1.01160216, + "epoch": 0.4518262437997896, + "flos": 23524492619520.0, + "grad_norm": 2.3330860922871537, + "language_loss": 0.70272928, + "learning_rate": 2.406282005146318e-06, + "loss": 0.72770143, + "num_input_tokens_seen": 161261450, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.2253418, + "step": 7515, + "time_per_iteration": 2.9836409091949463 + }, + { + "auxiliary_loss_clip": 0.01454787, + "auxiliary_loss_mlp": 0.01039733, + "balance_loss_clip": 1.2758671, + "balance_loss_mlp": 1.0169636, + "epoch": 0.45188636705245755, + "flos": 14574500017920.0, + "grad_norm": 4.675468633979767, + "language_loss": 0.83237004, + "learning_rate": 2.405900656236963e-06, + "loss": 0.85731518, + "num_input_tokens_seen": 161276965, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.22766113, + "step": 7516, + "time_per_iteration": 2.825336217880249 + }, + { + "auxiliary_loss_clip": 0.01441017, + "auxiliary_loss_mlp": 0.01034533, + "balance_loss_clip": 1.26922405, + "balance_loss_mlp": 1.01326585, + "epoch": 0.4519464903051255, + "flos": 19911077654400.0, + "grad_norm": 1.673857177327844, + "language_loss": 0.6668725, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.69162798, + "num_input_tokens_seen": 161295375, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.21264648, + "step": 7517, + "time_per_iteration": 2.8684167861938477 + }, + { + "auxiliary_loss_clip": 0.01439839, + "auxiliary_loss_mlp": 0.01037001, + "balance_loss_clip": 1.26845467, + "balance_loss_mlp": 1.01610327, + "epoch": 0.4520066135577935, + "flos": 18853866218880.0, + "grad_norm": 1.7979159188346792, + "language_loss": 0.64002788, + "learning_rate": 2.405137912257333e-06, + "loss": 0.66479623, + "num_input_tokens_seen": 161313010, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.20898438, + "step": 7518, + "time_per_iteration": 2.830096960067749 + }, + { + "auxiliary_loss_clip": 0.01443735, + "auxiliary_loss_mlp": 0.01033724, + "balance_loss_clip": 1.27062142, + "balance_loss_mlp": 1.01269519, + "epoch": 0.45206673681046144, + "flos": 48231271860480.0, + "grad_norm": 1.4703578945787348, + "language_loss": 0.60327619, + "learning_rate": 2.404756517215982e-06, + "loss": 0.6280508, + "num_input_tokens_seen": 161336690, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.21008301, + "step": 7519, + "time_per_iteration": 3.0923566818237305 + }, + { + "auxiliary_loss_clip": 0.01456659, + "auxiliary_loss_mlp": 0.01037209, + "balance_loss_clip": 1.28059196, + "balance_loss_mlp": 1.01557279, + "epoch": 0.4521268600631294, + "flos": 23852762778240.0, + "grad_norm": 1.5429550887359644, + "language_loss": 0.72990692, + "learning_rate": 2.404375106826223e-06, + "loss": 0.75484556, + "num_input_tokens_seen": 161357845, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.21643066, + "step": 7520, + "time_per_iteration": 2.8903303146362305 + }, + { + "auxiliary_loss_clip": 0.01441898, + "auxiliary_loss_mlp": 0.01035202, + "balance_loss_clip": 1.26612663, + "balance_loss_mlp": 1.01395845, + "epoch": 0.4521869833157974, + "flos": 18852916078080.0, + "grad_norm": 3.0471213762301983, + "language_loss": 0.76349354, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.78826451, + "num_input_tokens_seen": 161375160, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.21240234, + "step": 7521, + "time_per_iteration": 2.8569300174713135 + }, + { + "auxiliary_loss_clip": 0.01463324, + "auxiliary_loss_mlp": 0.01035645, + "balance_loss_clip": 1.28357923, + "balance_loss_mlp": 1.0142467, + "epoch": 0.45224710656846534, + "flos": 19796526524160.0, + "grad_norm": 3.1840144745983285, + "language_loss": 0.68132985, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.70631957, + "num_input_tokens_seen": 161393690, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.2142334, + "step": 7522, + "time_per_iteration": 2.8193488121032715 + }, + { + "auxiliary_loss_clip": 0.01442578, + "auxiliary_loss_mlp": 0.01033601, + "balance_loss_clip": 1.27051318, + "balance_loss_mlp": 1.01208329, + "epoch": 0.4523072298211333, + "flos": 28267655713920.0, + "grad_norm": 1.5035068912827847, + "language_loss": 0.61344445, + "learning_rate": 2.403230783711134e-06, + "loss": 0.63820624, + "num_input_tokens_seen": 161415015, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.21508789, + "step": 7523, + "time_per_iteration": 2.892474412918091 + }, + { + "auxiliary_loss_clip": 0.01454555, + "auxiliary_loss_mlp": 0.01034415, + "balance_loss_clip": 1.27641201, + "balance_loss_mlp": 1.01358914, + "epoch": 0.45236735307380127, + "flos": 11188196542080.0, + "grad_norm": 1.9423573896392052, + "language_loss": 0.79059923, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.81548893, + "num_input_tokens_seen": 161432940, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.20837402, + "step": 7524, + "time_per_iteration": 2.817948341369629 + }, + { + "auxiliary_loss_clip": 0.01449666, + "auxiliary_loss_mlp": 0.01033621, + "balance_loss_clip": 1.27684116, + "balance_loss_mlp": 1.01254487, + "epoch": 0.45242747632646924, + "flos": 22611585409920.0, + "grad_norm": 2.8430086199352176, + "language_loss": 0.64152706, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.6663599, + "num_input_tokens_seen": 161452215, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.21081543, + "step": 7525, + "time_per_iteration": 2.8133604526519775 + }, + { + "auxiliary_loss_clip": 0.01440028, + "auxiliary_loss_mlp": 0.01034697, + "balance_loss_clip": 1.26871562, + "balance_loss_mlp": 1.01365614, + "epoch": 0.45248759957913726, + "flos": 18264478464000.0, + "grad_norm": 3.312750475421455, + "language_loss": 0.801054, + "learning_rate": 2.402086322981083e-06, + "loss": 0.82580125, + "num_input_tokens_seen": 161469520, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.21032715, + "step": 7526, + "time_per_iteration": 2.8109474182128906 + }, + { + "auxiliary_loss_clip": 0.01439537, + "auxiliary_loss_mlp": 0.0103478, + "balance_loss_clip": 1.26854277, + "balance_loss_mlp": 1.01416886, + "epoch": 0.4525477228318052, + "flos": 22458865651200.0, + "grad_norm": 1.7779127042978262, + "language_loss": 0.82333934, + "learning_rate": 2.40170480555747e-06, + "loss": 0.84808248, + "num_input_tokens_seen": 161487335, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.20629883, + "step": 7527, + "time_per_iteration": 2.8305063247680664 + }, + { + "auxiliary_loss_clip": 0.01444048, + "auxiliary_loss_mlp": 0.01035449, + "balance_loss_clip": 1.27126765, + "balance_loss_mlp": 1.01372957, + "epoch": 0.4526078460844732, + "flos": 29657345074560.0, + "grad_norm": 1.6919029761316504, + "language_loss": 0.66164792, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.68644285, + "num_input_tokens_seen": 161510095, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.21740723, + "step": 7528, + "time_per_iteration": 2.9238178730010986 + }, + { + "auxiliary_loss_clip": 0.01437383, + "auxiliary_loss_mlp": 0.01035092, + "balance_loss_clip": 1.26697946, + "balance_loss_mlp": 1.01401556, + "epoch": 0.45266796933714115, + "flos": 23050198932480.0, + "grad_norm": 1.5496654167426727, + "language_loss": 0.76085961, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.78558433, + "num_input_tokens_seen": 161528725, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.21081543, + "step": 7529, + "time_per_iteration": 4.328989505767822 + }, + { + "auxiliary_loss_clip": 0.01450387, + "auxiliary_loss_mlp": 0.01040172, + "balance_loss_clip": 1.27534938, + "balance_loss_mlp": 1.01914334, + "epoch": 0.4527280925898091, + "flos": 14436439574400.0, + "grad_norm": 2.9625870458893053, + "language_loss": 0.74024177, + "learning_rate": 2.400560161948384e-06, + "loss": 0.76514739, + "num_input_tokens_seen": 161547195, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.21020508, + "step": 7530, + "time_per_iteration": 2.8579118251800537 + }, + { + "auxiliary_loss_clip": 0.01446265, + "auxiliary_loss_mlp": 0.01035124, + "balance_loss_clip": 1.27284193, + "balance_loss_mlp": 1.01415467, + "epoch": 0.4527882158424771, + "flos": 22934697661440.0, + "grad_norm": 1.8724647519210782, + "language_loss": 0.77437466, + "learning_rate": 2.400178583680834e-06, + "loss": 0.79918849, + "num_input_tokens_seen": 161565565, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.20959473, + "step": 7531, + "time_per_iteration": 2.918513298034668 + }, + { + "auxiliary_loss_clip": 0.01441096, + "auxiliary_loss_mlp": 0.01035003, + "balance_loss_clip": 1.27138329, + "balance_loss_mlp": 1.01392722, + "epoch": 0.45284833909514505, + "flos": 25565926348800.0, + "grad_norm": 2.5394848168783253, + "language_loss": 0.67720497, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.70196599, + "num_input_tokens_seen": 161586630, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.21069336, + "step": 7532, + "time_per_iteration": 2.905796527862549 + }, + { + "auxiliary_loss_clip": 0.01443196, + "auxiliary_loss_mlp": 0.01035689, + "balance_loss_clip": 1.2721622, + "balance_loss_mlp": 1.01507735, + "epoch": 0.452908462347813, + "flos": 18159021538560.0, + "grad_norm": 2.2172270115341637, + "language_loss": 0.79222119, + "learning_rate": 2.399415381635768e-06, + "loss": 0.81701005, + "num_input_tokens_seen": 161603815, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.20605469, + "step": 7533, + "time_per_iteration": 2.824237108230591 + }, + { + "auxiliary_loss_clip": 0.01465636, + "auxiliary_loss_mlp": 0.01036118, + "balance_loss_clip": 1.28271782, + "balance_loss_mlp": 1.01390958, + "epoch": 0.452968585600481, + "flos": 19072154972160.0, + "grad_norm": 2.0128232789563048, + "language_loss": 0.84328318, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.86830074, + "num_input_tokens_seen": 161622900, + "router_z_loss_clip": 1.83007812, + "router_z_loss_mlp": 0.2220459, + "step": 7534, + "time_per_iteration": 2.858508348464966 + }, + { + "auxiliary_loss_clip": 0.01447967, + "auxiliary_loss_mlp": 0.01037272, + "balance_loss_clip": 1.27403069, + "balance_loss_mlp": 1.01616001, + "epoch": 0.45302870885314894, + "flos": 22061180689920.0, + "grad_norm": 3.658325286393308, + "language_loss": 0.77439439, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.79924679, + "num_input_tokens_seen": 161641700, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.21130371, + "step": 7535, + "time_per_iteration": 4.349774360656738 + }, + { + "auxiliary_loss_clip": 0.01446202, + "auxiliary_loss_mlp": 0.01034801, + "balance_loss_clip": 1.27328396, + "balance_loss_mlp": 1.01471424, + "epoch": 0.4530888321058169, + "flos": 20385959523840.0, + "grad_norm": 1.568198138846674, + "language_loss": 0.81764126, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.84245133, + "num_input_tokens_seen": 161661955, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.20080566, + "step": 7536, + "time_per_iteration": 2.8531036376953125 + }, + { + "auxiliary_loss_clip": 0.0146027, + "auxiliary_loss_mlp": 0.01035114, + "balance_loss_clip": 1.28320265, + "balance_loss_mlp": 1.0144788, + "epoch": 0.4531489553584849, + "flos": 14838739505280.0, + "grad_norm": 2.0854179894787896, + "language_loss": 0.76751143, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.79246521, + "num_input_tokens_seen": 161679245, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.20629883, + "step": 7537, + "time_per_iteration": 4.214274883270264 + }, + { + "auxiliary_loss_clip": 0.01448719, + "auxiliary_loss_mlp": 0.01030841, + "balance_loss_clip": 1.2737875, + "balance_loss_mlp": 1.01053929, + "epoch": 0.45320907861115284, + "flos": 21955090337280.0, + "grad_norm": 1.8957156611338983, + "language_loss": 0.76172811, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.7865237, + "num_input_tokens_seen": 161698795, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.20300293, + "step": 7538, + "time_per_iteration": 2.8562347888946533 + }, + { + "auxiliary_loss_clip": 0.01249209, + "auxiliary_loss_mlp": 0.01020612, + "balance_loss_clip": 1.14379907, + "balance_loss_mlp": 0.99734223, + "epoch": 0.45326920186382086, + "flos": 66285849841920.0, + "grad_norm": 0.8239588358585198, + "language_loss": 0.62378681, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.64648503, + "num_input_tokens_seen": 161761980, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.23242188, + "step": 7539, + "time_per_iteration": 4.736531496047974 + }, + { + "auxiliary_loss_clip": 0.01446929, + "auxiliary_loss_mlp": 0.01041127, + "balance_loss_clip": 1.27439952, + "balance_loss_mlp": 1.02011025, + "epoch": 0.4533293251164888, + "flos": 14692444508160.0, + "grad_norm": 1.8537091325693893, + "language_loss": 0.66714627, + "learning_rate": 2.396743698142872e-06, + "loss": 0.69202685, + "num_input_tokens_seen": 161779455, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.21008301, + "step": 7540, + "time_per_iteration": 2.8608927726745605 + }, + { + "auxiliary_loss_clip": 0.01479251, + "auxiliary_loss_mlp": 0.01039469, + "balance_loss_clip": 1.295784, + "balance_loss_mlp": 1.01783252, + "epoch": 0.4533894483691568, + "flos": 22611359185920.0, + "grad_norm": 1.8450953314400171, + "language_loss": 0.85760534, + "learning_rate": 2.396361968778424e-06, + "loss": 0.88279259, + "num_input_tokens_seen": 161798980, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.21630859, + "step": 7541, + "time_per_iteration": 3.0485739707946777 + }, + { + "auxiliary_loss_clip": 0.01450253, + "auxiliary_loss_mlp": 0.01034749, + "balance_loss_clip": 1.27531767, + "balance_loss_mlp": 1.01466203, + "epoch": 0.45344957162182475, + "flos": 34765091919360.0, + "grad_norm": 2.9142560545188307, + "language_loss": 0.7736128, + "learning_rate": 2.395980224383889e-06, + "loss": 0.79846281, + "num_input_tokens_seen": 161819745, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.20092773, + "step": 7542, + "time_per_iteration": 2.9551656246185303 + }, + { + "auxiliary_loss_clip": 0.01457109, + "auxiliary_loss_mlp": 0.01032611, + "balance_loss_clip": 1.28214061, + "balance_loss_mlp": 1.01066446, + "epoch": 0.4535096948744927, + "flos": 23560715721600.0, + "grad_norm": 1.6525995216020746, + "language_loss": 0.81075853, + "learning_rate": 2.395598464973746e-06, + "loss": 0.83565569, + "num_input_tokens_seen": 161838575, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.21948242, + "step": 7543, + "time_per_iteration": 2.863460063934326 + }, + { + "auxiliary_loss_clip": 0.0145673, + "auxiliary_loss_mlp": 0.01040192, + "balance_loss_clip": 1.28101873, + "balance_loss_mlp": 1.01946163, + "epoch": 0.4535698181271607, + "flos": 25568731526400.0, + "grad_norm": 1.9060538144553325, + "language_loss": 0.77129781, + "learning_rate": 2.395216690562469e-06, + "loss": 0.79626697, + "num_input_tokens_seen": 161858590, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.20727539, + "step": 7544, + "time_per_iteration": 2.852433443069458 + }, + { + "auxiliary_loss_clip": 0.01469002, + "auxiliary_loss_mlp": 0.01033996, + "balance_loss_clip": 1.29114306, + "balance_loss_mlp": 1.01377809, + "epoch": 0.45362994137982865, + "flos": 24875063210880.0, + "grad_norm": 1.7532497011283266, + "language_loss": 0.76162183, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.78665185, + "num_input_tokens_seen": 161878390, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.20227051, + "step": 7545, + "time_per_iteration": 2.848027467727661 + }, + { + "auxiliary_loss_clip": 0.01462777, + "auxiliary_loss_mlp": 0.01038201, + "balance_loss_clip": 1.28767943, + "balance_loss_mlp": 1.01788795, + "epoch": 0.4536900646324966, + "flos": 30818113153920.0, + "grad_norm": 2.4828327428289954, + "language_loss": 0.72775596, + "learning_rate": 2.394453096794423e-06, + "loss": 0.75276577, + "num_input_tokens_seen": 161898610, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.20300293, + "step": 7546, + "time_per_iteration": 2.893627166748047 + }, + { + "auxiliary_loss_clip": 0.01483038, + "auxiliary_loss_mlp": 0.01038335, + "balance_loss_clip": 1.30031276, + "balance_loss_mlp": 1.01738954, + "epoch": 0.4537501878851646, + "flos": 23414782682880.0, + "grad_norm": 1.5697198342730723, + "language_loss": 0.76530933, + "learning_rate": 2.394071277466609e-06, + "loss": 0.79052317, + "num_input_tokens_seen": 161918210, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.20947266, + "step": 7547, + "time_per_iteration": 2.832987070083618 + }, + { + "auxiliary_loss_clip": 0.01456459, + "auxiliary_loss_mlp": 0.01036632, + "balance_loss_clip": 1.2794683, + "balance_loss_mlp": 1.01511478, + "epoch": 0.45381031113783254, + "flos": 18158116642560.0, + "grad_norm": 5.05534316149341, + "language_loss": 0.70368606, + "learning_rate": 2.393689443195573e-06, + "loss": 0.72861695, + "num_input_tokens_seen": 161936950, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.21520996, + "step": 7548, + "time_per_iteration": 2.8207101821899414 + }, + { + "auxiliary_loss_clip": 0.01436285, + "auxiliary_loss_mlp": 0.01035873, + "balance_loss_clip": 1.2639271, + "balance_loss_mlp": 1.01516652, + "epoch": 0.4538704343905005, + "flos": 25347275637120.0, + "grad_norm": 2.1418994688866713, + "language_loss": 0.74046648, + "learning_rate": 2.393307593995794e-06, + "loss": 0.7651881, + "num_input_tokens_seen": 161955550, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.20690918, + "step": 7549, + "time_per_iteration": 2.854933500289917 + }, + { + "auxiliary_loss_clip": 0.01443068, + "auxiliary_loss_mlp": 0.0103227, + "balance_loss_clip": 1.27014446, + "balance_loss_mlp": 1.0123744, + "epoch": 0.4539305576431685, + "flos": 28743351989760.0, + "grad_norm": 1.5991919392273304, + "language_loss": 0.65820652, + "learning_rate": 2.392925729881751e-06, + "loss": 0.68295991, + "num_input_tokens_seen": 161976760, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.19897461, + "step": 7550, + "time_per_iteration": 2.934553861618042 + }, + { + "auxiliary_loss_clip": 0.01439402, + "auxiliary_loss_mlp": 0.01034132, + "balance_loss_clip": 1.26796651, + "balance_loss_mlp": 1.01399767, + "epoch": 0.45399068089583644, + "flos": 22502689879680.0, + "grad_norm": 1.6003325085560562, + "language_loss": 0.69228423, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.71701956, + "num_input_tokens_seen": 161996120, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.20129395, + "step": 7551, + "time_per_iteration": 2.920234203338623 + }, + { + "auxiliary_loss_clip": 0.014518, + "auxiliary_loss_mlp": 0.01034277, + "balance_loss_clip": 1.27381015, + "balance_loss_mlp": 1.01371336, + "epoch": 0.45405080414850446, + "flos": 12900364727040.0, + "grad_norm": 2.413844958644645, + "language_loss": 0.7994386, + "learning_rate": 2.392161956968798e-06, + "loss": 0.82429934, + "num_input_tokens_seen": 162011125, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.20556641, + "step": 7552, + "time_per_iteration": 3.029266834259033 + }, + { + "auxiliary_loss_clip": 0.01238088, + "auxiliary_loss_mlp": 0.01026707, + "balance_loss_clip": 1.13647985, + "balance_loss_mlp": 1.00725198, + "epoch": 0.4541109274011724, + "flos": 59792739154560.0, + "grad_norm": 0.8169863969579341, + "language_loss": 0.57792127, + "learning_rate": 2.39178004819885e-06, + "loss": 0.60056925, + "num_input_tokens_seen": 162068705, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.19433594, + "step": 7553, + "time_per_iteration": 3.3373911380767822 + }, + { + "auxiliary_loss_clip": 0.0143803, + "auxiliary_loss_mlp": 0.01038126, + "balance_loss_clip": 1.26663494, + "balance_loss_mlp": 1.01859927, + "epoch": 0.4541710506538404, + "flos": 28523389178880.0, + "grad_norm": 1.3895489379159014, + "language_loss": 0.77419794, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.79895949, + "num_input_tokens_seen": 162089655, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.1953125, + "step": 7554, + "time_per_iteration": 2.911853790283203 + }, + { + "auxiliary_loss_clip": 0.01458209, + "auxiliary_loss_mlp": 0.01034646, + "balance_loss_clip": 1.28070104, + "balance_loss_mlp": 1.01291394, + "epoch": 0.45423117390650836, + "flos": 17684456382720.0, + "grad_norm": 3.6058234650716776, + "language_loss": 0.78107023, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.8059988, + "num_input_tokens_seen": 162108465, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.21728516, + "step": 7555, + "time_per_iteration": 2.8864855766296387 + }, + { + "auxiliary_loss_clip": 0.01438451, + "auxiliary_loss_mlp": 0.01031226, + "balance_loss_clip": 1.26466727, + "balance_loss_mlp": 1.01035225, + "epoch": 0.4542912971591763, + "flos": 28083735025920.0, + "grad_norm": 1.5842650249395722, + "language_loss": 0.73078811, + "learning_rate": 2.390634232808903e-06, + "loss": 0.75548482, + "num_input_tokens_seen": 162129910, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.20874023, + "step": 7556, + "time_per_iteration": 2.96071195602417 + }, + { + "auxiliary_loss_clip": 0.0146072, + "auxiliary_loss_mlp": 0.01039444, + "balance_loss_clip": 1.28212595, + "balance_loss_mlp": 1.01734281, + "epoch": 0.4543514204118443, + "flos": 22681316926080.0, + "grad_norm": 2.45368288432121, + "language_loss": 0.6471734, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.67217505, + "num_input_tokens_seen": 162148840, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.22106934, + "step": 7557, + "time_per_iteration": 2.910444974899292 + }, + { + "auxiliary_loss_clip": 0.01245334, + "auxiliary_loss_mlp": 0.0103194, + "balance_loss_clip": 1.14205742, + "balance_loss_mlp": 1.01134074, + "epoch": 0.45441154366451225, + "flos": 58246773678720.0, + "grad_norm": 0.6952756895395331, + "language_loss": 0.57673365, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.59950638, + "num_input_tokens_seen": 162208500, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.20605469, + "step": 7558, + "time_per_iteration": 3.285207748413086 + }, + { + "auxiliary_loss_clip": 0.01446834, + "auxiliary_loss_mlp": 0.01034834, + "balance_loss_clip": 1.26888442, + "balance_loss_mlp": 1.01292372, + "epoch": 0.4544716669171802, + "flos": 16773132741120.0, + "grad_norm": 2.5117343893454986, + "language_loss": 0.58298039, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.60779715, + "num_input_tokens_seen": 162224650, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.21923828, + "step": 7559, + "time_per_iteration": 2.814863920211792 + }, + { + "auxiliary_loss_clip": 0.01436948, + "auxiliary_loss_mlp": 0.01035659, + "balance_loss_clip": 1.26365411, + "balance_loss_mlp": 1.01373601, + "epoch": 0.4545317901698482, + "flos": 15933893345280.0, + "grad_norm": 1.9080769478285835, + "language_loss": 0.72724855, + "learning_rate": 2.389106271642792e-06, + "loss": 0.75197458, + "num_input_tokens_seen": 162242930, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.21899414, + "step": 7560, + "time_per_iteration": 2.8332369327545166 + }, + { + "auxiliary_loss_clip": 0.01464225, + "auxiliary_loss_mlp": 0.01035741, + "balance_loss_clip": 1.28395259, + "balance_loss_mlp": 1.01425958, + "epoch": 0.45459191342251615, + "flos": 17648776218240.0, + "grad_norm": 8.98567723893478, + "language_loss": 0.70624834, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.73124802, + "num_input_tokens_seen": 162261455, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.21484375, + "step": 7561, + "time_per_iteration": 2.899129629135132 + }, + { + "auxiliary_loss_clip": 0.01438982, + "auxiliary_loss_mlp": 0.01032943, + "balance_loss_clip": 1.26907551, + "balance_loss_mlp": 1.01309454, + "epoch": 0.4546520366751841, + "flos": 16184287923840.0, + "grad_norm": 2.597827250449868, + "language_loss": 0.86279541, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.88751471, + "num_input_tokens_seen": 162279725, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.19836426, + "step": 7562, + "time_per_iteration": 2.9800360202789307 + }, + { + "auxiliary_loss_clip": 0.01432064, + "auxiliary_loss_mlp": 0.01041132, + "balance_loss_clip": 1.26209497, + "balance_loss_mlp": 1.02044916, + "epoch": 0.4547121599278521, + "flos": 19759850974080.0, + "grad_norm": 1.9094296854488118, + "language_loss": 0.89894557, + "learning_rate": 2.38796014579055e-06, + "loss": 0.92367756, + "num_input_tokens_seen": 162297865, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.20678711, + "step": 7563, + "time_per_iteration": 2.866267681121826 + }, + { + "auxiliary_loss_clip": 0.01449666, + "auxiliary_loss_mlp": 0.01035786, + "balance_loss_clip": 1.27346337, + "balance_loss_mlp": 1.01420927, + "epoch": 0.45477228318052004, + "flos": 19946848308480.0, + "grad_norm": 8.793927179852368, + "language_loss": 0.72725838, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.75211287, + "num_input_tokens_seen": 162316010, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.21557617, + "step": 7564, + "time_per_iteration": 4.202565431594849 + }, + { + "auxiliary_loss_clip": 0.0144334, + "auxiliary_loss_mlp": 0.01036167, + "balance_loss_clip": 1.26771319, + "balance_loss_mlp": 1.01473296, + "epoch": 0.454832406433188, + "flos": 21298278551040.0, + "grad_norm": 2.16034561099508, + "language_loss": 0.69059789, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.71539307, + "num_input_tokens_seen": 162336115, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.21447754, + "step": 7565, + "time_per_iteration": 2.8455355167388916 + }, + { + "auxiliary_loss_clip": 0.01436771, + "auxiliary_loss_mlp": 0.01034079, + "balance_loss_clip": 1.26393414, + "balance_loss_mlp": 1.01282406, + "epoch": 0.45489252968585603, + "flos": 24509438830080.0, + "grad_norm": 1.7616022733784116, + "language_loss": 0.80892491, + "learning_rate": 2.386813887534922e-06, + "loss": 0.83363348, + "num_input_tokens_seen": 162355705, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.21252441, + "step": 7566, + "time_per_iteration": 2.8623225688934326 + }, + { + "auxiliary_loss_clip": 0.0143339, + "auxiliary_loss_mlp": 0.01035061, + "balance_loss_clip": 1.25822997, + "balance_loss_mlp": 1.01352024, + "epoch": 0.454952652938524, + "flos": 17101357655040.0, + "grad_norm": 1.6263523332211243, + "language_loss": 0.74332327, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.76800776, + "num_input_tokens_seen": 162374055, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.21508789, + "step": 7567, + "time_per_iteration": 2.8396365642547607 + }, + { + "auxiliary_loss_clip": 0.01445049, + "auxiliary_loss_mlp": 0.01036544, + "balance_loss_clip": 1.27018213, + "balance_loss_mlp": 1.0153482, + "epoch": 0.45501277619119196, + "flos": 27640325554560.0, + "grad_norm": 1.505000048665826, + "language_loss": 0.81605512, + "learning_rate": 2.386049642000249e-06, + "loss": 0.8408711, + "num_input_tokens_seen": 162393560, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.2121582, + "step": 7568, + "time_per_iteration": 2.9185948371887207 + }, + { + "auxiliary_loss_clip": 0.01463406, + "auxiliary_loss_mlp": 0.01036244, + "balance_loss_clip": 1.28281927, + "balance_loss_mlp": 1.01351106, + "epoch": 0.4550728994438599, + "flos": 19984247775360.0, + "grad_norm": 2.6853624663203743, + "language_loss": 0.80904257, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.83403903, + "num_input_tokens_seen": 162413170, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.22753906, + "step": 7569, + "time_per_iteration": 2.8405261039733887 + }, + { + "auxiliary_loss_clip": 0.01449743, + "auxiliary_loss_mlp": 0.01036716, + "balance_loss_clip": 1.27161968, + "balance_loss_mlp": 1.01533031, + "epoch": 0.4551330226965279, + "flos": 26077800481920.0, + "grad_norm": 1.5025613665284998, + "language_loss": 0.75490654, + "learning_rate": 2.385285337909412e-06, + "loss": 0.77977109, + "num_input_tokens_seen": 162434080, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.21398926, + "step": 7570, + "time_per_iteration": 4.338579416275024 + }, + { + "auxiliary_loss_clip": 0.01436434, + "auxiliary_loss_mlp": 0.01042181, + "balance_loss_clip": 1.26517856, + "balance_loss_mlp": 1.01928043, + "epoch": 0.45519314594919585, + "flos": 32793977888640.0, + "grad_norm": 1.8464120244707356, + "language_loss": 0.75242269, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.77720881, + "num_input_tokens_seen": 162455445, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.22900391, + "step": 7571, + "time_per_iteration": 2.9665868282318115 + }, + { + "auxiliary_loss_clip": 0.01426166, + "auxiliary_loss_mlp": 0.01031028, + "balance_loss_clip": 1.2587136, + "balance_loss_mlp": 1.01119101, + "epoch": 0.4552532692018638, + "flos": 19182226867200.0, + "grad_norm": 1.447537095009797, + "language_loss": 0.81807041, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.84264237, + "num_input_tokens_seen": 162474940, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.19836426, + "step": 7572, + "time_per_iteration": 4.227491140365601 + }, + { + "auxiliary_loss_clip": 0.01462793, + "auxiliary_loss_mlp": 0.0103705, + "balance_loss_clip": 1.28391314, + "balance_loss_mlp": 1.0146265, + "epoch": 0.4553133924545318, + "flos": 26037233879040.0, + "grad_norm": 2.1151203355951176, + "language_loss": 0.7341994, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.75919783, + "num_input_tokens_seen": 162493340, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.22412109, + "step": 7573, + "time_per_iteration": 2.8964827060699463 + }, + { + "auxiliary_loss_clip": 0.01460676, + "auxiliary_loss_mlp": 0.0104104, + "balance_loss_clip": 1.28253913, + "balance_loss_mlp": 1.01792538, + "epoch": 0.45537351570719975, + "flos": 30672994521600.0, + "grad_norm": 8.922118262640486, + "language_loss": 0.74788111, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.77289832, + "num_input_tokens_seen": 162514360, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.23132324, + "step": 7574, + "time_per_iteration": 4.384619235992432 + }, + { + "auxiliary_loss_clip": 0.01453033, + "auxiliary_loss_mlp": 0.01034675, + "balance_loss_clip": 1.2769835, + "balance_loss_mlp": 1.01322937, + "epoch": 0.4554336389598677, + "flos": 24364229708160.0, + "grad_norm": 1.6915863172451424, + "language_loss": 0.72359967, + "learning_rate": 2.383374322259915e-06, + "loss": 0.74847674, + "num_input_tokens_seen": 162535240, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.21447754, + "step": 7575, + "time_per_iteration": 2.9089901447296143 + }, + { + "auxiliary_loss_clip": 0.01439679, + "auxiliary_loss_mlp": 0.010371, + "balance_loss_clip": 1.26483953, + "balance_loss_mlp": 1.01542759, + "epoch": 0.4554937622125357, + "flos": 20567527482240.0, + "grad_norm": 1.8383230564333382, + "language_loss": 0.74455279, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.76932055, + "num_input_tokens_seen": 162553880, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.21679688, + "step": 7576, + "time_per_iteration": 2.8489561080932617 + }, + { + "auxiliary_loss_clip": 0.01434245, + "auxiliary_loss_mlp": 0.01040904, + "balance_loss_clip": 1.26350284, + "balance_loss_mlp": 1.0195061, + "epoch": 0.45555388546520365, + "flos": 22831141017600.0, + "grad_norm": 1.706013871154292, + "language_loss": 0.6719833, + "learning_rate": 2.382609814135511e-06, + "loss": 0.69673479, + "num_input_tokens_seen": 162574485, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.21386719, + "step": 7577, + "time_per_iteration": 2.8618245124816895 + }, + { + "auxiliary_loss_clip": 0.01444015, + "auxiliary_loss_mlp": 0.01047905, + "balance_loss_clip": 1.27028966, + "balance_loss_mlp": 1.02479017, + "epoch": 0.4556140087178716, + "flos": 21736032422400.0, + "grad_norm": 1.8003414044926744, + "language_loss": 0.75002128, + "learning_rate": 2.382227538303157e-06, + "loss": 0.77494049, + "num_input_tokens_seen": 162595130, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.23120117, + "step": 7578, + "time_per_iteration": 2.883845329284668 + }, + { + "auxiliary_loss_clip": 0.01434239, + "auxiliary_loss_mlp": 0.01039793, + "balance_loss_clip": 1.26084661, + "balance_loss_mlp": 1.01828718, + "epoch": 0.45567413197053963, + "flos": 26005580501760.0, + "grad_norm": 1.8526359869111961, + "language_loss": 0.71722507, + "learning_rate": 2.381845247976697e-06, + "loss": 0.74196541, + "num_input_tokens_seen": 162615720, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.21508789, + "step": 7579, + "time_per_iteration": 2.889981985092163 + }, + { + "auxiliary_loss_clip": 0.01441936, + "auxiliary_loss_mlp": 0.01037955, + "balance_loss_clip": 1.26905036, + "balance_loss_mlp": 1.01792765, + "epoch": 0.4557342552232076, + "flos": 21546048931200.0, + "grad_norm": 1.9089140279441101, + "language_loss": 0.79688966, + "learning_rate": 2.381462943170627e-06, + "loss": 0.82168853, + "num_input_tokens_seen": 162635825, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.20031738, + "step": 7580, + "time_per_iteration": 2.860614776611328 + }, + { + "auxiliary_loss_clip": 0.01439235, + "auxiliary_loss_mlp": 0.01039478, + "balance_loss_clip": 1.26762819, + "balance_loss_mlp": 1.01771092, + "epoch": 0.45579437847587556, + "flos": 40015876135680.0, + "grad_norm": 1.950181382572291, + "language_loss": 0.68972373, + "learning_rate": 2.381080623899444e-06, + "loss": 0.71451092, + "num_input_tokens_seen": 162659130, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.21777344, + "step": 7581, + "time_per_iteration": 3.020888566970825 + }, + { + "auxiliary_loss_clip": 0.01435621, + "auxiliary_loss_mlp": 0.01038144, + "balance_loss_clip": 1.2664696, + "balance_loss_mlp": 1.01742554, + "epoch": 0.4558545017285435, + "flos": 31150772058240.0, + "grad_norm": 1.7444092383859002, + "language_loss": 0.73943371, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.76417136, + "num_input_tokens_seen": 162681665, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.20715332, + "step": 7582, + "time_per_iteration": 2.9284143447875977 + }, + { + "auxiliary_loss_clip": 0.01455487, + "auxiliary_loss_mlp": 0.01047559, + "balance_loss_clip": 1.27940774, + "balance_loss_mlp": 1.02502894, + "epoch": 0.4559146249812115, + "flos": 21735760953600.0, + "grad_norm": 1.6435245149878732, + "language_loss": 0.73481107, + "learning_rate": 2.380315942019729e-06, + "loss": 0.75984144, + "num_input_tokens_seen": 162702040, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.2253418, + "step": 7583, + "time_per_iteration": 2.8274807929992676 + }, + { + "auxiliary_loss_clip": 0.01450039, + "auxiliary_loss_mlp": 0.01035103, + "balance_loss_clip": 1.27267289, + "balance_loss_mlp": 1.01444459, + "epoch": 0.45597474823387946, + "flos": 23816630165760.0, + "grad_norm": 1.6987208491479704, + "language_loss": 0.73012036, + "learning_rate": 2.379933579440195e-06, + "loss": 0.7549718, + "num_input_tokens_seen": 162722375, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.20666504, + "step": 7584, + "time_per_iteration": 2.976654529571533 + }, + { + "auxiliary_loss_clip": 0.0145073, + "auxiliary_loss_mlp": 0.0103386, + "balance_loss_clip": 1.27759147, + "balance_loss_mlp": 1.01192498, + "epoch": 0.4560348714865474, + "flos": 31918379656320.0, + "grad_norm": 1.8794146153920301, + "language_loss": 0.6854108, + "learning_rate": 2.379551202453541e-06, + "loss": 0.7102567, + "num_input_tokens_seen": 162746095, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.21948242, + "step": 7585, + "time_per_iteration": 2.9197587966918945 + }, + { + "auxiliary_loss_clip": 0.01454096, + "auxiliary_loss_mlp": 0.01037148, + "balance_loss_clip": 1.27896333, + "balance_loss_mlp": 1.01626277, + "epoch": 0.4560949947392154, + "flos": 22057515861120.0, + "grad_norm": 1.6174023188146225, + "language_loss": 0.7709403, + "learning_rate": 2.379168811074267e-06, + "loss": 0.79585272, + "num_input_tokens_seen": 162766330, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.2088623, + "step": 7586, + "time_per_iteration": 2.847783088684082 + }, + { + "auxiliary_loss_clip": 0.01440956, + "auxiliary_loss_mlp": 0.01036033, + "balance_loss_clip": 1.26877499, + "balance_loss_mlp": 1.01527822, + "epoch": 0.45615511799188335, + "flos": 24582563706240.0, + "grad_norm": 1.9455488369331304, + "language_loss": 0.78947526, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.8142451, + "num_input_tokens_seen": 162784755, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.20751953, + "step": 7587, + "time_per_iteration": 2.9298770427703857 + }, + { + "auxiliary_loss_clip": 0.01474189, + "auxiliary_loss_mlp": 0.01043037, + "balance_loss_clip": 1.29167795, + "balance_loss_mlp": 1.02179432, + "epoch": 0.4562152412445513, + "flos": 18339322642560.0, + "grad_norm": 1.9002097418180268, + "language_loss": 0.69483757, + "learning_rate": 2.378403985195863e-06, + "loss": 0.72000986, + "num_input_tokens_seen": 162803850, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.21264648, + "step": 7588, + "time_per_iteration": 2.8587167263031006 + }, + { + "auxiliary_loss_clip": 0.01436177, + "auxiliary_loss_mlp": 0.01040905, + "balance_loss_clip": 1.26659036, + "balance_loss_mlp": 1.0190661, + "epoch": 0.4562753644972193, + "flos": 13524392016000.0, + "grad_norm": 2.0219648214081474, + "language_loss": 0.7979728, + "learning_rate": 2.378021550725735e-06, + "loss": 0.82274365, + "num_input_tokens_seen": 162820775, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.21838379, + "step": 7589, + "time_per_iteration": 2.867971420288086 + }, + { + "auxiliary_loss_clip": 0.01444354, + "auxiliary_loss_mlp": 0.01039787, + "balance_loss_clip": 1.27054203, + "balance_loss_mlp": 1.01700604, + "epoch": 0.45633548774988725, + "flos": 29650829823360.0, + "grad_norm": 2.2688258761597644, + "language_loss": 0.63853067, + "learning_rate": 2.377639101920992e-06, + "loss": 0.66337204, + "num_input_tokens_seen": 162839695, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.22790527, + "step": 7590, + "time_per_iteration": 2.909085273742676 + }, + { + "auxiliary_loss_clip": 0.01451568, + "auxiliary_loss_mlp": 0.0103661, + "balance_loss_clip": 1.276214, + "balance_loss_mlp": 1.01591563, + "epoch": 0.4563956110025552, + "flos": 22242703403520.0, + "grad_norm": 2.6151672286298555, + "language_loss": 0.7296077, + "learning_rate": 2.377256638796135e-06, + "loss": 0.75448954, + "num_input_tokens_seen": 162856095, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.20703125, + "step": 7591, + "time_per_iteration": 2.8758602142333984 + }, + { + "auxiliary_loss_clip": 0.01444244, + "auxiliary_loss_mlp": 0.01045374, + "balance_loss_clip": 1.27019811, + "balance_loss_mlp": 1.02316523, + "epoch": 0.45645573425522323, + "flos": 17100814717440.0, + "grad_norm": 2.2599114657751347, + "language_loss": 0.78497398, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.80987012, + "num_input_tokens_seen": 162874070, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.22216797, + "step": 7592, + "time_per_iteration": 2.849686622619629 + }, + { + "auxiliary_loss_clip": 0.01449198, + "auxiliary_loss_mlp": 0.01038465, + "balance_loss_clip": 1.2736963, + "balance_loss_mlp": 1.01822352, + "epoch": 0.4565158575078912, + "flos": 20340144524160.0, + "grad_norm": 2.277720968368235, + "language_loss": 0.70817959, + "learning_rate": 2.376491669644098e-06, + "loss": 0.73305625, + "num_input_tokens_seen": 162891000, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.20239258, + "step": 7593, + "time_per_iteration": 2.9023921489715576 + }, + { + "auxiliary_loss_clip": 0.01432356, + "auxiliary_loss_mlp": 0.01036493, + "balance_loss_clip": 1.26216722, + "balance_loss_mlp": 1.01546443, + "epoch": 0.45657598076055916, + "flos": 23992316300160.0, + "grad_norm": 1.9392908925271055, + "language_loss": 0.84623253, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.87092102, + "num_input_tokens_seen": 162910120, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.21020508, + "step": 7594, + "time_per_iteration": 2.9609975814819336 + }, + { + "auxiliary_loss_clip": 0.01258472, + "auxiliary_loss_mlp": 0.01047196, + "balance_loss_clip": 1.15381527, + "balance_loss_mlp": 1.01686907, + "epoch": 0.45663610401322713, + "flos": 69393028291200.0, + "grad_norm": 0.8017831952116005, + "language_loss": 0.52821457, + "learning_rate": 2.375726643385654e-06, + "loss": 0.5512712, + "num_input_tokens_seen": 162963720, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.30273438, + "step": 7595, + "time_per_iteration": 3.4565269947052 + }, + { + "auxiliary_loss_clip": 0.01455675, + "auxiliary_loss_mlp": 0.01035446, + "balance_loss_clip": 1.2759428, + "balance_loss_mlp": 1.01421535, + "epoch": 0.4566962272658951, + "flos": 15154974547200.0, + "grad_norm": 2.8190747279946824, + "language_loss": 0.87717551, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.90208673, + "num_input_tokens_seen": 162975760, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.21228027, + "step": 7596, + "time_per_iteration": 2.8314874172210693 + }, + { + "auxiliary_loss_clip": 0.01461636, + "auxiliary_loss_mlp": 0.01043433, + "balance_loss_clip": 1.2860297, + "balance_loss_mlp": 1.02209473, + "epoch": 0.45675635051856306, + "flos": 18706666325760.0, + "grad_norm": 1.5216803503314862, + "language_loss": 0.77822578, + "learning_rate": 2.374961560136843e-06, + "loss": 0.80327648, + "num_input_tokens_seen": 162994865, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.21350098, + "step": 7597, + "time_per_iteration": 2.80583119392395 + }, + { + "auxiliary_loss_clip": 0.01451962, + "auxiliary_loss_mlp": 0.01037521, + "balance_loss_clip": 1.27596998, + "balance_loss_mlp": 1.01539564, + "epoch": 0.456816473771231, + "flos": 19107473178240.0, + "grad_norm": 1.662337096903005, + "language_loss": 0.79174864, + "learning_rate": 2.374578997177314e-06, + "loss": 0.81664348, + "num_input_tokens_seen": 163014730, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.22131348, + "step": 7598, + "time_per_iteration": 2.847041368484497 + }, + { + "auxiliary_loss_clip": 0.0143436, + "auxiliary_loss_mlp": 0.01033127, + "balance_loss_clip": 1.26242721, + "balance_loss_mlp": 1.01212215, + "epoch": 0.456876597023899, + "flos": 28961957456640.0, + "grad_norm": 2.5000267652049186, + "language_loss": 0.72607541, + "learning_rate": 2.374196420013712e-06, + "loss": 0.7507503, + "num_input_tokens_seen": 163033405, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.20996094, + "step": 7599, + "time_per_iteration": 4.2605321407318115 + }, + { + "auxiliary_loss_clip": 0.01436548, + "auxiliary_loss_mlp": 0.0103774, + "balance_loss_clip": 1.26444328, + "balance_loss_mlp": 1.01623464, + "epoch": 0.45693672027656695, + "flos": 23298512250240.0, + "grad_norm": 2.4036527224821387, + "language_loss": 0.70541847, + "learning_rate": 2.373813828660544e-06, + "loss": 0.73016131, + "num_input_tokens_seen": 163051400, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.21484375, + "step": 7600, + "time_per_iteration": 2.8635849952697754 + }, + { + "auxiliary_loss_clip": 0.01454535, + "auxiliary_loss_mlp": 0.01038596, + "balance_loss_clip": 1.27878165, + "balance_loss_mlp": 1.01670897, + "epoch": 0.4569968435292349, + "flos": 20567889440640.0, + "grad_norm": 2.2593700131700745, + "language_loss": 0.79884666, + "learning_rate": 2.373431223132319e-06, + "loss": 0.82377791, + "num_input_tokens_seen": 163069250, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.21875, + "step": 7601, + "time_per_iteration": 2.8804054260253906 + }, + { + "auxiliary_loss_clip": 0.01456886, + "auxiliary_loss_mlp": 0.01036999, + "balance_loss_clip": 1.28018165, + "balance_loss_mlp": 1.01705503, + "epoch": 0.4570569667819029, + "flos": 41297801086080.0, + "grad_norm": 2.9982826855095244, + "language_loss": 0.72501063, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.74994946, + "num_input_tokens_seen": 163091755, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.19946289, + "step": 7602, + "time_per_iteration": 3.0245790481567383 + }, + { + "auxiliary_loss_clip": 0.01453868, + "auxiliary_loss_mlp": 0.01037181, + "balance_loss_clip": 1.27768707, + "balance_loss_mlp": 1.01488888, + "epoch": 0.45711709003457085, + "flos": 26042798989440.0, + "grad_norm": 2.024513082480885, + "language_loss": 0.74407864, + "learning_rate": 2.372665969608729e-06, + "loss": 0.76898909, + "num_input_tokens_seen": 163111600, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.22290039, + "step": 7603, + "time_per_iteration": 2.8460190296173096 + }, + { + "auxiliary_loss_clip": 0.01446665, + "auxiliary_loss_mlp": 0.01037175, + "balance_loss_clip": 1.27253616, + "balance_loss_mlp": 1.01569343, + "epoch": 0.4571772132872388, + "flos": 22166728104960.0, + "grad_norm": 1.7710762089831429, + "language_loss": 0.83452547, + "learning_rate": 2.372283321642383e-06, + "loss": 0.85936385, + "num_input_tokens_seen": 163127350, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.21472168, + "step": 7604, + "time_per_iteration": 2.859025001525879 + }, + { + "auxiliary_loss_clip": 0.01468538, + "auxiliary_loss_mlp": 0.01042878, + "balance_loss_clip": 1.28734159, + "balance_loss_mlp": 1.02034724, + "epoch": 0.45723733653990684, + "flos": 23889528817920.0, + "grad_norm": 1.8576968278892065, + "language_loss": 0.87070775, + "learning_rate": 2.371900659559016e-06, + "loss": 0.89582193, + "num_input_tokens_seen": 163145855, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.22521973, + "step": 7605, + "time_per_iteration": 2.939392566680908 + }, + { + "auxiliary_loss_clip": 0.01472591, + "auxiliary_loss_mlp": 0.01037646, + "balance_loss_clip": 1.2938484, + "balance_loss_mlp": 1.01628411, + "epoch": 0.4572974597925748, + "flos": 16880173234560.0, + "grad_norm": 1.945181262612315, + "language_loss": 0.74755496, + "learning_rate": 2.371517983373138e-06, + "loss": 0.77265733, + "num_input_tokens_seen": 163163830, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.21374512, + "step": 7606, + "time_per_iteration": 5.642092227935791 + }, + { + "auxiliary_loss_clip": 0.01463893, + "auxiliary_loss_mlp": 0.01038215, + "balance_loss_clip": 1.28380358, + "balance_loss_mlp": 1.01625633, + "epoch": 0.45735758304524277, + "flos": 13779311074560.0, + "grad_norm": 2.7528073491231124, + "language_loss": 0.81181383, + "learning_rate": 2.371135293099262e-06, + "loss": 0.83683491, + "num_input_tokens_seen": 163180700, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.21972656, + "step": 7607, + "time_per_iteration": 2.847872734069824 + }, + { + "auxiliary_loss_clip": 0.01459627, + "auxiliary_loss_mlp": 0.01039583, + "balance_loss_clip": 1.28273177, + "balance_loss_mlp": 1.01803052, + "epoch": 0.45741770629791073, + "flos": 21109335690240.0, + "grad_norm": 2.8447925849831415, + "language_loss": 0.81191951, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.83691168, + "num_input_tokens_seen": 163199450, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.2154541, + "step": 7608, + "time_per_iteration": 4.293159008026123 + }, + { + "auxiliary_loss_clip": 0.01443923, + "auxiliary_loss_mlp": 0.01034316, + "balance_loss_clip": 1.26896775, + "balance_loss_mlp": 1.01329947, + "epoch": 0.4574778295505787, + "flos": 23123414298240.0, + "grad_norm": 1.783088245684045, + "language_loss": 0.69084692, + "learning_rate": 2.370369870345559e-06, + "loss": 0.71562934, + "num_input_tokens_seen": 163217875, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.21020508, + "step": 7609, + "time_per_iteration": 2.8407235145568848 + }, + { + "auxiliary_loss_clip": 0.01447172, + "auxiliary_loss_mlp": 0.01036216, + "balance_loss_clip": 1.27084959, + "balance_loss_mlp": 1.01579583, + "epoch": 0.45753795280324666, + "flos": 24363641525760.0, + "grad_norm": 1.9463402177206985, + "language_loss": 0.8190971, + "learning_rate": 2.369987137894757e-06, + "loss": 0.84393096, + "num_input_tokens_seen": 163237430, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.20422363, + "step": 7610, + "time_per_iteration": 2.9315028190612793 + }, + { + "auxiliary_loss_clip": 0.01469481, + "auxiliary_loss_mlp": 0.01038719, + "balance_loss_clip": 1.29037023, + "balance_loss_mlp": 1.01711798, + "epoch": 0.4575980760559146, + "flos": 16662336929280.0, + "grad_norm": 2.4640997193383254, + "language_loss": 0.83330953, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.85839158, + "num_input_tokens_seen": 163253905, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.21606445, + "step": 7611, + "time_per_iteration": 2.894212007522583 + }, + { + "auxiliary_loss_clip": 0.01453259, + "auxiliary_loss_mlp": 0.01038898, + "balance_loss_clip": 1.27775788, + "balance_loss_mlp": 1.01691604, + "epoch": 0.4576581993085826, + "flos": 35923190555520.0, + "grad_norm": 1.7801134098653644, + "language_loss": 0.74344808, + "learning_rate": 2.369221630917819e-06, + "loss": 0.76836962, + "num_input_tokens_seen": 163274285, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.2199707, + "step": 7612, + "time_per_iteration": 2.9535319805145264 + }, + { + "auxiliary_loss_clip": 0.01434086, + "auxiliary_loss_mlp": 0.01034433, + "balance_loss_clip": 1.26087904, + "balance_loss_mlp": 1.01311851, + "epoch": 0.45771832256125056, + "flos": 20089071273600.0, + "grad_norm": 1.5740971651509195, + "language_loss": 0.85576648, + "learning_rate": 2.368838856420711e-06, + "loss": 0.88045168, + "num_input_tokens_seen": 163293150, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.21313477, + "step": 7613, + "time_per_iteration": 2.872124195098877 + }, + { + "auxiliary_loss_clip": 0.01450439, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.27505064, + "balance_loss_mlp": 1.01220095, + "epoch": 0.4577784458139185, + "flos": 10751664280320.0, + "grad_norm": 1.9405372042217766, + "language_loss": 0.7622081, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.78705573, + "num_input_tokens_seen": 163310065, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.22119141, + "step": 7614, + "time_per_iteration": 2.8006951808929443 + }, + { + "auxiliary_loss_clip": 0.01444563, + "auxiliary_loss_mlp": 0.01039102, + "balance_loss_clip": 1.27201819, + "balance_loss_mlp": 1.01830029, + "epoch": 0.4578385690665865, + "flos": 21917057443200.0, + "grad_norm": 1.5676807958794956, + "language_loss": 0.75239646, + "learning_rate": 2.368073265481791e-06, + "loss": 0.77723312, + "num_input_tokens_seen": 163329415, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.20788574, + "step": 7615, + "time_per_iteration": 2.8754522800445557 + }, + { + "auxiliary_loss_clip": 0.01265543, + "auxiliary_loss_mlp": 0.01022911, + "balance_loss_clip": 1.16435456, + "balance_loss_mlp": 1.00116754, + "epoch": 0.45789869231925445, + "flos": 64783536894720.0, + "grad_norm": 0.7805034422521424, + "language_loss": 0.57637572, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.59926027, + "num_input_tokens_seen": 163385875, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.21777344, + "step": 7616, + "time_per_iteration": 3.3128390312194824 + }, + { + "auxiliary_loss_clip": 0.01441806, + "auxiliary_loss_mlp": 0.01036464, + "balance_loss_clip": 1.26735723, + "balance_loss_mlp": 1.0152328, + "epoch": 0.4579588155719224, + "flos": 16152679791360.0, + "grad_norm": 1.8139678018668783, + "language_loss": 0.7173878, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.74217051, + "num_input_tokens_seen": 163405170, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.21252441, + "step": 7617, + "time_per_iteration": 2.8204562664031982 + }, + { + "auxiliary_loss_clip": 0.01442577, + "auxiliary_loss_mlp": 0.01031953, + "balance_loss_clip": 1.26913333, + "balance_loss_mlp": 1.00994706, + "epoch": 0.45801893882459044, + "flos": 21404957086080.0, + "grad_norm": 3.1054508614221827, + "language_loss": 0.77347016, + "learning_rate": 2.36692477442939e-06, + "loss": 0.79821545, + "num_input_tokens_seen": 163423155, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.22033691, + "step": 7618, + "time_per_iteration": 2.878480911254883 + }, + { + "auxiliary_loss_clip": 0.01444722, + "auxiliary_loss_mlp": 0.01035965, + "balance_loss_clip": 1.26887989, + "balance_loss_mlp": 1.01577115, + "epoch": 0.4580790620772584, + "flos": 19545950966400.0, + "grad_norm": 1.7298875576255415, + "language_loss": 0.77835459, + "learning_rate": 2.366541916231585e-06, + "loss": 0.8031615, + "num_input_tokens_seen": 163442450, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.2019043, + "step": 7619, + "time_per_iteration": 2.8444936275482178 + }, + { + "auxiliary_loss_clip": 0.01441188, + "auxiliary_loss_mlp": 0.01035814, + "balance_loss_clip": 1.26782274, + "balance_loss_mlp": 1.01439261, + "epoch": 0.45813918532992637, + "flos": 16589664501120.0, + "grad_norm": 1.833564788943475, + "language_loss": 0.72443897, + "learning_rate": 2.366159044134473e-06, + "loss": 0.74920905, + "num_input_tokens_seen": 163459810, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.21435547, + "step": 7620, + "time_per_iteration": 2.8428006172180176 + }, + { + "auxiliary_loss_clip": 0.01437895, + "auxiliary_loss_mlp": 0.01035644, + "balance_loss_clip": 1.26625896, + "balance_loss_mlp": 1.01509261, + "epoch": 0.45819930858259433, + "flos": 42245483564160.0, + "grad_norm": 1.6898954212866257, + "language_loss": 0.78622794, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.81096339, + "num_input_tokens_seen": 163482970, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.20532227, + "step": 7621, + "time_per_iteration": 3.044480323791504 + }, + { + "auxiliary_loss_clip": 0.01255442, + "auxiliary_loss_mlp": 0.01042213, + "balance_loss_clip": 1.1562438, + "balance_loss_mlp": 1.0188477, + "epoch": 0.4582594318352623, + "flos": 63743563728000.0, + "grad_norm": 0.7966720418377881, + "language_loss": 0.65052855, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.67350507, + "num_input_tokens_seen": 163545330, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.23339844, + "step": 7622, + "time_per_iteration": 3.3473803997039795 + }, + { + "auxiliary_loss_clip": 0.01451906, + "auxiliary_loss_mlp": 0.01036023, + "balance_loss_clip": 1.27527559, + "balance_loss_mlp": 1.01433873, + "epoch": 0.45831955508793026, + "flos": 26881223978880.0, + "grad_norm": 1.826219463501508, + "language_loss": 0.80503082, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.82991016, + "num_input_tokens_seen": 163564620, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.21679688, + "step": 7623, + "time_per_iteration": 2.919713258743286 + }, + { + "auxiliary_loss_clip": 0.01451138, + "auxiliary_loss_mlp": 0.01037454, + "balance_loss_clip": 1.27390838, + "balance_loss_mlp": 1.01635361, + "epoch": 0.45837967834059823, + "flos": 18743160896640.0, + "grad_norm": 1.855306994690235, + "language_loss": 0.71358734, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.7384733, + "num_input_tokens_seen": 163581010, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.21105957, + "step": 7624, + "time_per_iteration": 2.826080799102783 + }, + { + "auxiliary_loss_clip": 0.01448601, + "auxiliary_loss_mlp": 0.01035142, + "balance_loss_clip": 1.27169895, + "balance_loss_mlp": 1.01368415, + "epoch": 0.4584398015932662, + "flos": 21187844697600.0, + "grad_norm": 2.092687039943946, + "language_loss": 0.74421299, + "learning_rate": 2.364244475667491e-06, + "loss": 0.76905048, + "num_input_tokens_seen": 163599955, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.21472168, + "step": 7625, + "time_per_iteration": 2.877474069595337 + }, + { + "auxiliary_loss_clip": 0.01447589, + "auxiliary_loss_mlp": 0.01035871, + "balance_loss_clip": 1.27317691, + "balance_loss_mlp": 1.01472282, + "epoch": 0.45849992484593416, + "flos": 19799014988160.0, + "grad_norm": 1.8338179323685642, + "language_loss": 0.78851712, + "learning_rate": 2.363861520479451e-06, + "loss": 0.81335175, + "num_input_tokens_seen": 163618545, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.21154785, + "step": 7626, + "time_per_iteration": 2.8163959980010986 + }, + { + "auxiliary_loss_clip": 0.01463627, + "auxiliary_loss_mlp": 0.01041852, + "balance_loss_clip": 1.28491211, + "balance_loss_mlp": 1.01961946, + "epoch": 0.4585600480986021, + "flos": 18232372638720.0, + "grad_norm": 1.5236407240422436, + "language_loss": 0.85862732, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.88368213, + "num_input_tokens_seen": 163636055, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.22229004, + "step": 7627, + "time_per_iteration": 2.942054510116577 + }, + { + "auxiliary_loss_clip": 0.01462333, + "auxiliary_loss_mlp": 0.01038416, + "balance_loss_clip": 1.2820822, + "balance_loss_mlp": 1.0160284, + "epoch": 0.4586201713512701, + "flos": 29033905968000.0, + "grad_norm": 1.541489355677873, + "language_loss": 0.70170277, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.72671026, + "num_input_tokens_seen": 163657485, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.22387695, + "step": 7628, + "time_per_iteration": 2.9766438007354736 + }, + { + "auxiliary_loss_clip": 0.01443636, + "auxiliary_loss_mlp": 0.01036489, + "balance_loss_clip": 1.27142024, + "balance_loss_mlp": 1.01386297, + "epoch": 0.45868029460393805, + "flos": 23415144641280.0, + "grad_norm": 2.5945032934010097, + "language_loss": 0.78458142, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.80938274, + "num_input_tokens_seen": 163676030, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.22619629, + "step": 7629, + "time_per_iteration": 2.8186886310577393 + }, + { + "auxiliary_loss_clip": 0.01458025, + "auxiliary_loss_mlp": 0.01041868, + "balance_loss_clip": 1.27614975, + "balance_loss_mlp": 1.01882482, + "epoch": 0.458740417856606, + "flos": 18230743825920.0, + "grad_norm": 2.008948356658365, + "language_loss": 0.80213606, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.82713503, + "num_input_tokens_seen": 163694490, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.23046875, + "step": 7630, + "time_per_iteration": 2.838806629180908 + }, + { + "auxiliary_loss_clip": 0.01465385, + "auxiliary_loss_mlp": 0.01042753, + "balance_loss_clip": 1.28524804, + "balance_loss_mlp": 1.02165341, + "epoch": 0.458800541109274, + "flos": 34582076127360.0, + "grad_norm": 1.7535591605332883, + "language_loss": 0.73029542, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.75537682, + "num_input_tokens_seen": 163717035, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.21105957, + "step": 7631, + "time_per_iteration": 2.9863345623016357 + }, + { + "auxiliary_loss_clip": 0.01462474, + "auxiliary_loss_mlp": 0.01042789, + "balance_loss_clip": 1.28492987, + "balance_loss_mlp": 1.01996064, + "epoch": 0.458860664361942, + "flos": 17720996198400.0, + "grad_norm": 2.05679462207589, + "language_loss": 0.72971213, + "learning_rate": 2.361563500108531e-06, + "loss": 0.75476474, + "num_input_tokens_seen": 163734525, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.22839355, + "step": 7632, + "time_per_iteration": 2.81466007232666 + }, + { + "auxiliary_loss_clip": 0.01474903, + "auxiliary_loss_mlp": 0.01042416, + "balance_loss_clip": 1.29229641, + "balance_loss_mlp": 1.01833606, + "epoch": 0.45892078761460997, + "flos": 18450978105600.0, + "grad_norm": 2.199155467915187, + "language_loss": 0.70089376, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.72606695, + "num_input_tokens_seen": 163752860, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.24072266, + "step": 7633, + "time_per_iteration": 4.26861047744751 + }, + { + "auxiliary_loss_clip": 0.01454475, + "auxiliary_loss_mlp": 0.01039497, + "balance_loss_clip": 1.27600884, + "balance_loss_mlp": 1.0174675, + "epoch": 0.45898091086727794, + "flos": 22682810004480.0, + "grad_norm": 3.8819057151676013, + "language_loss": 0.81969655, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.84463626, + "num_input_tokens_seen": 163772495, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.22045898, + "step": 7634, + "time_per_iteration": 2.8431358337402344 + }, + { + "auxiliary_loss_clip": 0.01490367, + "auxiliary_loss_mlp": 0.01043056, + "balance_loss_clip": 1.30512094, + "balance_loss_mlp": 1.0200721, + "epoch": 0.4590410341199459, + "flos": 21662952791040.0, + "grad_norm": 1.7840993401334628, + "language_loss": 0.8269186, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.85225284, + "num_input_tokens_seen": 163791475, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.2298584, + "step": 7635, + "time_per_iteration": 2.9180312156677246 + }, + { + "auxiliary_loss_clip": 0.01447949, + "auxiliary_loss_mlp": 0.01043487, + "balance_loss_clip": 1.27163482, + "balance_loss_mlp": 1.01991916, + "epoch": 0.45910115737261387, + "flos": 36548258474880.0, + "grad_norm": 1.4998411648664247, + "language_loss": 0.65661186, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.68152618, + "num_input_tokens_seen": 163812995, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.2355957, + "step": 7636, + "time_per_iteration": 2.9770870208740234 + }, + { + "auxiliary_loss_clip": 0.01444012, + "auxiliary_loss_mlp": 0.01040083, + "balance_loss_clip": 1.27207828, + "balance_loss_mlp": 1.01799321, + "epoch": 0.45916128062528183, + "flos": 24429029541120.0, + "grad_norm": 1.6915345909796045, + "language_loss": 0.8117699, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.83661091, + "num_input_tokens_seen": 163833945, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.22070312, + "step": 7637, + "time_per_iteration": 2.910545825958252 + }, + { + "auxiliary_loss_clip": 0.01459197, + "auxiliary_loss_mlp": 0.01043461, + "balance_loss_clip": 1.27988911, + "balance_loss_mlp": 1.0200367, + "epoch": 0.4592214038779498, + "flos": 23232400318080.0, + "grad_norm": 1.5564394269942672, + "language_loss": 0.76046091, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.78548753, + "num_input_tokens_seen": 163853885, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.23449707, + "step": 7638, + "time_per_iteration": 2.901210308074951 + }, + { + "auxiliary_loss_clip": 0.01441778, + "auxiliary_loss_mlp": 0.01043102, + "balance_loss_clip": 1.26843882, + "balance_loss_mlp": 1.0199275, + "epoch": 0.45928152713061776, + "flos": 19181548195200.0, + "grad_norm": 2.51148534065164, + "language_loss": 0.74782205, + "learning_rate": 2.358881852733989e-06, + "loss": 0.77267087, + "num_input_tokens_seen": 163871855, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.23156738, + "step": 7639, + "time_per_iteration": 2.7946150302886963 + }, + { + "auxiliary_loss_clip": 0.01452829, + "auxiliary_loss_mlp": 0.01039263, + "balance_loss_clip": 1.27615523, + "balance_loss_mlp": 1.01712584, + "epoch": 0.4593416503832857, + "flos": 22423728424320.0, + "grad_norm": 1.6104863636814424, + "language_loss": 0.69365633, + "learning_rate": 2.358498705700346e-06, + "loss": 0.71857727, + "num_input_tokens_seen": 163891450, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.22119141, + "step": 7640, + "time_per_iteration": 2.851330041885376 + }, + { + "auxiliary_loss_clip": 0.01468674, + "auxiliary_loss_mlp": 0.01042635, + "balance_loss_clip": 1.28825688, + "balance_loss_mlp": 1.0204618, + "epoch": 0.4594017736359537, + "flos": 18889546383360.0, + "grad_norm": 1.6101901312517073, + "language_loss": 0.76666886, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.79178196, + "num_input_tokens_seen": 163909345, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.22180176, + "step": 7641, + "time_per_iteration": 4.344481706619263 + }, + { + "auxiliary_loss_clip": 0.01458644, + "auxiliary_loss_mlp": 0.01047381, + "balance_loss_clip": 1.27939129, + "balance_loss_mlp": 1.02420616, + "epoch": 0.45946189688862166, + "flos": 20527413327360.0, + "grad_norm": 1.6032145181305488, + "language_loss": 0.74876487, + "learning_rate": 2.357732370864668e-06, + "loss": 0.77382517, + "num_input_tokens_seen": 163926940, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.23168945, + "step": 7642, + "time_per_iteration": 4.264801740646362 + }, + { + "auxiliary_loss_clip": 0.01255846, + "auxiliary_loss_mlp": 0.01062565, + "balance_loss_clip": 1.15119696, + "balance_loss_mlp": 1.03958118, + "epoch": 0.4595220201412896, + "flos": 61431510994560.0, + "grad_norm": 0.8596463334591588, + "language_loss": 0.58175772, + "learning_rate": 2.357349183091694e-06, + "loss": 0.60494179, + "num_input_tokens_seen": 163977785, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.22949219, + "step": 7643, + "time_per_iteration": 4.44842791557312 + }, + { + "auxiliary_loss_clip": 0.01471033, + "auxiliary_loss_mlp": 0.01039326, + "balance_loss_clip": 1.28700459, + "balance_loss_mlp": 1.01726007, + "epoch": 0.4595821433939576, + "flos": 23341295848320.0, + "grad_norm": 1.578175317639982, + "language_loss": 0.9331857, + "learning_rate": 2.3569659817680016e-06, + "loss": 0.95828933, + "num_input_tokens_seen": 163996630, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.22070312, + "step": 7644, + "time_per_iteration": 2.870189666748047 + }, + { + "auxiliary_loss_clip": 0.01459562, + "auxiliary_loss_mlp": 0.01039925, + "balance_loss_clip": 1.27972162, + "balance_loss_mlp": 1.01733494, + "epoch": 0.4596422666466256, + "flos": 14290325556480.0, + "grad_norm": 3.2271704884911823, + "language_loss": 0.84058654, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.86558139, + "num_input_tokens_seen": 164013190, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.22595215, + "step": 7645, + "time_per_iteration": 2.854024887084961 + }, + { + "auxiliary_loss_clip": 0.01261487, + "auxiliary_loss_mlp": 0.01043292, + "balance_loss_clip": 1.1572001, + "balance_loss_mlp": 1.01754236, + "epoch": 0.4597023898992936, + "flos": 65758140028800.0, + "grad_norm": 0.7625435082316689, + "language_loss": 0.59964979, + "learning_rate": 2.356199538526593e-06, + "loss": 0.62269747, + "num_input_tokens_seen": 164074030, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.2578125, + "step": 7646, + "time_per_iteration": 3.236727476119995 + }, + { + "auxiliary_loss_clip": 0.01451851, + "auxiliary_loss_mlp": 0.01039168, + "balance_loss_clip": 1.27397609, + "balance_loss_mlp": 1.01685214, + "epoch": 0.45976251315196154, + "flos": 26918487711360.0, + "grad_norm": 1.6360056184487939, + "language_loss": 0.73559272, + "learning_rate": 2.355816296637939e-06, + "loss": 0.76050293, + "num_input_tokens_seen": 164095515, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.2232666, + "step": 7647, + "time_per_iteration": 2.9498050212860107 + }, + { + "auxiliary_loss_clip": 0.01468636, + "auxiliary_loss_mlp": 0.0103989, + "balance_loss_clip": 1.28690827, + "balance_loss_mlp": 1.01769304, + "epoch": 0.4598226364046295, + "flos": 26630150728320.0, + "grad_norm": 1.8851471503796926, + "language_loss": 0.67468166, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.69976699, + "num_input_tokens_seen": 164117270, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.2220459, + "step": 7648, + "time_per_iteration": 2.923333168029785 + }, + { + "auxiliary_loss_clip": 0.01453288, + "auxiliary_loss_mlp": 0.01039691, + "balance_loss_clip": 1.27463198, + "balance_loss_mlp": 1.01661229, + "epoch": 0.45988275965729747, + "flos": 24398054835840.0, + "grad_norm": 1.412856328751868, + "language_loss": 0.7928651, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.81779492, + "num_input_tokens_seen": 164137850, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.23083496, + "step": 7649, + "time_per_iteration": 2.889042377471924 + }, + { + "auxiliary_loss_clip": 0.01444424, + "auxiliary_loss_mlp": 0.01043287, + "balance_loss_clip": 1.26792753, + "balance_loss_mlp": 1.02075672, + "epoch": 0.45994288290996543, + "flos": 24546702562560.0, + "grad_norm": 2.2982195803748966, + "language_loss": 0.70390218, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.72877932, + "num_input_tokens_seen": 164157960, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.22546387, + "step": 7650, + "time_per_iteration": 2.917297601699829 + }, + { + "auxiliary_loss_clip": 0.01463919, + "auxiliary_loss_mlp": 0.0104052, + "balance_loss_clip": 1.28052545, + "balance_loss_mlp": 1.01664221, + "epoch": 0.4600030061626334, + "flos": 14838513281280.0, + "grad_norm": 1.938361188275399, + "language_loss": 0.85015368, + "learning_rate": 2.354283194302761e-06, + "loss": 0.87519807, + "num_input_tokens_seen": 164174590, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.2388916, + "step": 7651, + "time_per_iteration": 2.8470563888549805 + }, + { + "auxiliary_loss_clip": 0.01449534, + "auxiliary_loss_mlp": 0.01046224, + "balance_loss_clip": 1.27428436, + "balance_loss_mlp": 1.02335978, + "epoch": 0.46006312941530136, + "flos": 18122934170880.0, + "grad_norm": 1.8645703272280578, + "language_loss": 0.7600041, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.7849617, + "num_input_tokens_seen": 164192935, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.2286377, + "step": 7652, + "time_per_iteration": 2.8457212448120117 + }, + { + "auxiliary_loss_clip": 0.01457657, + "auxiliary_loss_mlp": 0.01037846, + "balance_loss_clip": 1.27710938, + "balance_loss_mlp": 1.0151006, + "epoch": 0.46012325266796933, + "flos": 21985295880960.0, + "grad_norm": 1.9922473450657596, + "language_loss": 0.76580691, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.79076195, + "num_input_tokens_seen": 164213160, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.22741699, + "step": 7653, + "time_per_iteration": 2.8733856678009033 + }, + { + "auxiliary_loss_clip": 0.015058, + "auxiliary_loss_mlp": 0.01043351, + "balance_loss_clip": 1.31642175, + "balance_loss_mlp": 1.01944971, + "epoch": 0.4601833759206373, + "flos": 15276448131840.0, + "grad_norm": 2.0839339911837507, + "language_loss": 0.66852272, + "learning_rate": 2.353133226438741e-06, + "loss": 0.69401425, + "num_input_tokens_seen": 164229330, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.23913574, + "step": 7654, + "time_per_iteration": 2.805328130722046 + }, + { + "auxiliary_loss_clip": 0.01455106, + "auxiliary_loss_mlp": 0.01036698, + "balance_loss_clip": 1.2777276, + "balance_loss_mlp": 1.01544309, + "epoch": 0.46024349917330526, + "flos": 27100327138560.0, + "grad_norm": 1.6160190238407148, + "language_loss": 0.79954708, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.82446516, + "num_input_tokens_seen": 164248240, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.21252441, + "step": 7655, + "time_per_iteration": 2.8690006732940674 + }, + { + "auxiliary_loss_clip": 0.01452817, + "auxiliary_loss_mlp": 0.01036123, + "balance_loss_clip": 1.27859044, + "balance_loss_mlp": 1.01486802, + "epoch": 0.4603036224259732, + "flos": 24473803910400.0, + "grad_norm": 1.6723703007031339, + "language_loss": 0.68301916, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.70790851, + "num_input_tokens_seen": 164268020, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.21264648, + "step": 7656, + "time_per_iteration": 2.8582396507263184 + }, + { + "auxiliary_loss_clip": 0.01459991, + "auxiliary_loss_mlp": 0.01040048, + "balance_loss_clip": 1.2795285, + "balance_loss_mlp": 1.01584816, + "epoch": 0.4603637456786412, + "flos": 28120546310400.0, + "grad_norm": 1.66160198700632, + "language_loss": 0.82239389, + "learning_rate": 2.351983138057098e-06, + "loss": 0.84739423, + "num_input_tokens_seen": 164287305, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.24194336, + "step": 7657, + "time_per_iteration": 2.882878541946411 + }, + { + "auxiliary_loss_clip": 0.01458924, + "auxiliary_loss_mlp": 0.01037103, + "balance_loss_clip": 1.28060055, + "balance_loss_mlp": 1.01470411, + "epoch": 0.4604238689313092, + "flos": 24358619352960.0, + "grad_norm": 2.2811860066437126, + "language_loss": 0.71773565, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.74269593, + "num_input_tokens_seen": 164306835, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.22399902, + "step": 7658, + "time_per_iteration": 2.863060235977173 + }, + { + "auxiliary_loss_clip": 0.01248624, + "auxiliary_loss_mlp": 0.01035491, + "balance_loss_clip": 1.14827847, + "balance_loss_mlp": 1.00993276, + "epoch": 0.4604839921839772, + "flos": 53631852906240.0, + "grad_norm": 0.981469992117087, + "language_loss": 0.62197721, + "learning_rate": 2.351216345708928e-06, + "loss": 0.64481831, + "num_input_tokens_seen": 164367095, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.25585938, + "step": 7659, + "time_per_iteration": 3.4133360385894775 + }, + { + "auxiliary_loss_clip": 0.01454666, + "auxiliary_loss_mlp": 0.01038645, + "balance_loss_clip": 1.27858782, + "balance_loss_mlp": 1.01528013, + "epoch": 0.46054411543664514, + "flos": 31260843953280.0, + "grad_norm": 2.1375396244492753, + "language_loss": 0.69336587, + "learning_rate": 2.350832929550336e-06, + "loss": 0.71829891, + "num_input_tokens_seen": 164388895, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.23376465, + "step": 7660, + "time_per_iteration": 2.907419204711914 + }, + { + "auxiliary_loss_clip": 0.01470488, + "auxiliary_loss_mlp": 0.01040371, + "balance_loss_clip": 1.28911471, + "balance_loss_mlp": 1.01699424, + "epoch": 0.4606042386893131, + "flos": 24102297705600.0, + "grad_norm": 1.9545580605731985, + "language_loss": 0.77932233, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.8044309, + "num_input_tokens_seen": 164409080, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.23388672, + "step": 7661, + "time_per_iteration": 2.895961284637451 + }, + { + "auxiliary_loss_clip": 0.01463447, + "auxiliary_loss_mlp": 0.01040121, + "balance_loss_clip": 1.28873563, + "balance_loss_mlp": 1.01769793, + "epoch": 0.46066436194198107, + "flos": 26589719859840.0, + "grad_norm": 1.9473382129393224, + "language_loss": 0.75489354, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.77992922, + "num_input_tokens_seen": 164427585, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.22412109, + "step": 7662, + "time_per_iteration": 2.9006452560424805 + }, + { + "auxiliary_loss_clip": 0.01499356, + "auxiliary_loss_mlp": 0.01042674, + "balance_loss_clip": 1.31234562, + "balance_loss_mlp": 1.01988125, + "epoch": 0.46072448519464904, + "flos": 17782040712960.0, + "grad_norm": 2.905502127402866, + "language_loss": 0.81387842, + "learning_rate": 2.349682601310998e-06, + "loss": 0.83929873, + "num_input_tokens_seen": 164438455, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.22814941, + "step": 7663, + "time_per_iteration": 2.80364990234375 + }, + { + "auxiliary_loss_clip": 0.01456792, + "auxiliary_loss_mlp": 0.01037231, + "balance_loss_clip": 1.28175795, + "balance_loss_mlp": 1.01533186, + "epoch": 0.460784608447317, + "flos": 15094563459840.0, + "grad_norm": 2.0495288063324804, + "language_loss": 0.74272954, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.7676698, + "num_input_tokens_seen": 164456830, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.21899414, + "step": 7664, + "time_per_iteration": 2.837287187576294 + }, + { + "auxiliary_loss_clip": 0.01476722, + "auxiliary_loss_mlp": 0.01042757, + "balance_loss_clip": 1.29636168, + "balance_loss_mlp": 1.01938045, + "epoch": 0.46084473169998497, + "flos": 18597499326720.0, + "grad_norm": 2.3572935485616213, + "language_loss": 0.73576784, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.76096267, + "num_input_tokens_seen": 164475375, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.23388672, + "step": 7665, + "time_per_iteration": 2.806087017059326 + }, + { + "auxiliary_loss_clip": 0.01480909, + "auxiliary_loss_mlp": 0.01040955, + "balance_loss_clip": 1.30047429, + "balance_loss_mlp": 1.01862741, + "epoch": 0.46090485495265293, + "flos": 19502805409920.0, + "grad_norm": 1.9041755607085937, + "language_loss": 0.78880095, + "learning_rate": 2.348532153731669e-06, + "loss": 0.81401956, + "num_input_tokens_seen": 164492040, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.22338867, + "step": 7666, + "time_per_iteration": 2.86143159866333 + }, + { + "auxiliary_loss_clip": 0.01468965, + "auxiliary_loss_mlp": 0.01047178, + "balance_loss_clip": 1.29213047, + "balance_loss_mlp": 1.02426565, + "epoch": 0.4609649782053209, + "flos": 33377981512320.0, + "grad_norm": 1.492689878875838, + "language_loss": 0.74794912, + "learning_rate": 2.348148644753088e-06, + "loss": 0.77311051, + "num_input_tokens_seen": 164513665, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.22900391, + "step": 7667, + "time_per_iteration": 2.9449093341827393 + }, + { + "auxiliary_loss_clip": 0.01469546, + "auxiliary_loss_mlp": 0.01047107, + "balance_loss_clip": 1.29039729, + "balance_loss_mlp": 1.02492261, + "epoch": 0.46102510145798886, + "flos": 23779728391680.0, + "grad_norm": 1.5472101588568568, + "language_loss": 0.76667464, + "learning_rate": 2.347765122572676e-06, + "loss": 0.79184115, + "num_input_tokens_seen": 164533890, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.2220459, + "step": 7668, + "time_per_iteration": 4.272247791290283 + }, + { + "auxiliary_loss_clip": 0.01460633, + "auxiliary_loss_mlp": 0.01047417, + "balance_loss_clip": 1.28737569, + "balance_loss_mlp": 1.02582848, + "epoch": 0.4610852247106568, + "flos": 23305253725440.0, + "grad_norm": 1.5418460270269305, + "language_loss": 0.78454971, + "learning_rate": 2.347381587204975e-06, + "loss": 0.80963016, + "num_input_tokens_seen": 164553815, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.21569824, + "step": 7669, + "time_per_iteration": 2.9483301639556885 + }, + { + "auxiliary_loss_clip": 0.01475282, + "auxiliary_loss_mlp": 0.01046303, + "balance_loss_clip": 1.29592896, + "balance_loss_mlp": 1.02485776, + "epoch": 0.4611453479633248, + "flos": 25458433407360.0, + "grad_norm": 1.9426543796752684, + "language_loss": 0.83513814, + "learning_rate": 2.34699803866453e-06, + "loss": 0.86035395, + "num_input_tokens_seen": 164573125, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.21447754, + "step": 7670, + "time_per_iteration": 2.954928398132324 + }, + { + "auxiliary_loss_clip": 0.01462845, + "auxiliary_loss_mlp": 0.01055064, + "balance_loss_clip": 1.28739405, + "balance_loss_mlp": 1.03315306, + "epoch": 0.4612054712159928, + "flos": 21148816417920.0, + "grad_norm": 1.5639319224879435, + "language_loss": 0.64491916, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.6700983, + "num_input_tokens_seen": 164592575, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.21911621, + "step": 7671, + "time_per_iteration": 2.8849852085113525 + }, + { + "auxiliary_loss_clip": 0.01240997, + "auxiliary_loss_mlp": 0.01037652, + "balance_loss_clip": 1.14135361, + "balance_loss_mlp": 1.01114023, + "epoch": 0.4612655944686608, + "flos": 69991057802880.0, + "grad_norm": 0.7085719865318233, + "language_loss": 0.55859709, + "learning_rate": 2.346230902123583e-06, + "loss": 0.58138359, + "num_input_tokens_seen": 164659795, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.265625, + "step": 7672, + "time_per_iteration": 3.4475631713867188 + }, + { + "auxiliary_loss_clip": 0.01474484, + "auxiliary_loss_mlp": 0.01052642, + "balance_loss_clip": 1.29409194, + "balance_loss_mlp": 1.02968216, + "epoch": 0.46132571772132874, + "flos": 16845986148480.0, + "grad_norm": 1.9219157164943914, + "language_loss": 0.73209375, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.75736499, + "num_input_tokens_seen": 164678735, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.22949219, + "step": 7673, + "time_per_iteration": 2.793931007385254 + }, + { + "auxiliary_loss_clip": 0.01464096, + "auxiliary_loss_mlp": 0.01051446, + "balance_loss_clip": 1.28754342, + "balance_loss_mlp": 1.02873623, + "epoch": 0.4613858409739967, + "flos": 35822484334080.0, + "grad_norm": 1.8349664255165943, + "language_loss": 0.71349543, + "learning_rate": 2.345463713066195e-06, + "loss": 0.73865086, + "num_input_tokens_seen": 164700885, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.22705078, + "step": 7674, + "time_per_iteration": 2.9761016368865967 + }, + { + "auxiliary_loss_clip": 0.01473136, + "auxiliary_loss_mlp": 0.01051087, + "balance_loss_clip": 1.29566932, + "balance_loss_mlp": 1.02881896, + "epoch": 0.4614459642266647, + "flos": 35281490532480.0, + "grad_norm": 2.010265208768684, + "language_loss": 0.6601128, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.68535501, + "num_input_tokens_seen": 164726960, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.22277832, + "step": 7675, + "time_per_iteration": 2.985412359237671 + }, + { + "auxiliary_loss_clip": 0.01247983, + "auxiliary_loss_mlp": 0.01033886, + "balance_loss_clip": 1.14666152, + "balance_loss_mlp": 1.00813663, + "epoch": 0.46150608747933264, + "flos": 66733630076160.0, + "grad_norm": 0.769923117974297, + "language_loss": 0.5868119, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.60963058, + "num_input_tokens_seen": 164788525, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.2578125, + "step": 7676, + "time_per_iteration": 4.68008828163147 + }, + { + "auxiliary_loss_clip": 0.01245349, + "auxiliary_loss_mlp": 0.01025308, + "balance_loss_clip": 1.14294028, + "balance_loss_mlp": 1.00213385, + "epoch": 0.4615662107320006, + "flos": 55855940469120.0, + "grad_norm": 0.7920299197790727, + "language_loss": 0.62728584, + "learning_rate": 2.344312831266341e-06, + "loss": 0.64999247, + "num_input_tokens_seen": 164843525, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.23144531, + "step": 7677, + "time_per_iteration": 4.620277166366577 + }, + { + "auxiliary_loss_clip": 0.01466207, + "auxiliary_loss_mlp": 0.01044139, + "balance_loss_clip": 1.29058313, + "balance_loss_mlp": 1.02275312, + "epoch": 0.46162633398466857, + "flos": 15490710097920.0, + "grad_norm": 2.421068755391075, + "language_loss": 0.77595103, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.80105448, + "num_input_tokens_seen": 164859895, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.21398926, + "step": 7678, + "time_per_iteration": 4.186838150024414 + }, + { + "auxiliary_loss_clip": 0.0148412, + "auxiliary_loss_mlp": 0.01041479, + "balance_loss_clip": 1.30460799, + "balance_loss_mlp": 1.02086806, + "epoch": 0.46168645723733653, + "flos": 20021104304640.0, + "grad_norm": 1.8429472415840022, + "language_loss": 0.67890763, + "learning_rate": 2.343545511426974e-06, + "loss": 0.70416361, + "num_input_tokens_seen": 164878030, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.20605469, + "step": 7679, + "time_per_iteration": 2.9124860763549805 + }, + { + "auxiliary_loss_clip": 0.0147857, + "auxiliary_loss_mlp": 0.01042701, + "balance_loss_clip": 1.30073071, + "balance_loss_mlp": 1.02232838, + "epoch": 0.4617465804900045, + "flos": 20307043313280.0, + "grad_norm": 2.11976664704184, + "language_loss": 0.71017897, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.73539162, + "num_input_tokens_seen": 164895710, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.20361328, + "step": 7680, + "time_per_iteration": 2.8527047634124756 + }, + { + "auxiliary_loss_clip": 0.01495587, + "auxiliary_loss_mlp": 0.01045407, + "balance_loss_clip": 1.31292152, + "balance_loss_mlp": 1.02399707, + "epoch": 0.46180670374267246, + "flos": 22356530616960.0, + "grad_norm": 1.8260058811276356, + "language_loss": 0.64802623, + "learning_rate": 2.342778139478487e-06, + "loss": 0.67343616, + "num_input_tokens_seen": 164913365, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.21386719, + "step": 7681, + "time_per_iteration": 2.851130962371826 + }, + { + "auxiliary_loss_clip": 0.01463861, + "auxiliary_loss_mlp": 0.01041373, + "balance_loss_clip": 1.29016697, + "balance_loss_mlp": 1.01999879, + "epoch": 0.46186682699534043, + "flos": 19903702752000.0, + "grad_norm": 1.952491508205418, + "language_loss": 0.67846048, + "learning_rate": 2.342394433999697e-06, + "loss": 0.70351285, + "num_input_tokens_seen": 164931620, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.21386719, + "step": 7682, + "time_per_iteration": 2.832533597946167 + }, + { + "auxiliary_loss_clip": 0.01494305, + "auxiliary_loss_mlp": 0.01038767, + "balance_loss_clip": 1.31328905, + "balance_loss_mlp": 1.01758325, + "epoch": 0.4619269502480084, + "flos": 31515989235840.0, + "grad_norm": 3.770754265522648, + "language_loss": 0.75775039, + "learning_rate": 2.342010715537275e-06, + "loss": 0.78308105, + "num_input_tokens_seen": 164950905, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.21166992, + "step": 7683, + "time_per_iteration": 2.8976147174835205 + }, + { + "auxiliary_loss_clip": 0.01471941, + "auxiliary_loss_mlp": 0.01041308, + "balance_loss_clip": 1.29552722, + "balance_loss_mlp": 1.02033877, + "epoch": 0.46198707350067636, + "flos": 25020317577600.0, + "grad_norm": 1.992006314022462, + "language_loss": 0.77803153, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.80316406, + "num_input_tokens_seen": 164970950, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.20959473, + "step": 7684, + "time_per_iteration": 2.919167995452881 + }, + { + "auxiliary_loss_clip": 0.01502326, + "auxiliary_loss_mlp": 0.01042188, + "balance_loss_clip": 1.31629336, + "balance_loss_mlp": 1.02130258, + "epoch": 0.4620471967533444, + "flos": 18300475342080.0, + "grad_norm": 1.8090460354710214, + "language_loss": 0.8023181, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.82776332, + "num_input_tokens_seen": 164989855, + "router_z_loss_clip": 1.86035156, + "router_z_loss_mlp": 0.2088623, + "step": 7685, + "time_per_iteration": 2.8620898723602295 + }, + { + "auxiliary_loss_clip": 0.01463469, + "auxiliary_loss_mlp": 0.01048607, + "balance_loss_clip": 1.28998268, + "balance_loss_mlp": 1.02742314, + "epoch": 0.46210732000601235, + "flos": 33998298727680.0, + "grad_norm": 1.9639234274070774, + "language_loss": 0.67728496, + "learning_rate": 2.340859482393731e-06, + "loss": 0.70240569, + "num_input_tokens_seen": 165012290, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.21203613, + "step": 7686, + "time_per_iteration": 3.042912244796753 + }, + { + "auxiliary_loss_clip": 0.01484323, + "auxiliary_loss_mlp": 0.01040539, + "balance_loss_clip": 1.30349338, + "balance_loss_mlp": 1.01921248, + "epoch": 0.4621674432586803, + "flos": 25020227088000.0, + "grad_norm": 2.1253343470363855, + "language_loss": 0.74022388, + "learning_rate": 2.340475712142296e-06, + "loss": 0.76547253, + "num_input_tokens_seen": 165030810, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.21313477, + "step": 7687, + "time_per_iteration": 2.947704553604126 + }, + { + "auxiliary_loss_clip": 0.01471661, + "auxiliary_loss_mlp": 0.01040169, + "balance_loss_clip": 1.29443061, + "balance_loss_mlp": 1.01863992, + "epoch": 0.4622275665113483, + "flos": 22023147795840.0, + "grad_norm": 2.3023469485964614, + "language_loss": 0.76032853, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.78544682, + "num_input_tokens_seen": 165050205, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.21520996, + "step": 7688, + "time_per_iteration": 2.927866220474243 + }, + { + "auxiliary_loss_clip": 0.01474889, + "auxiliary_loss_mlp": 0.01040146, + "balance_loss_clip": 1.29726112, + "balance_loss_mlp": 1.01960588, + "epoch": 0.46228768976401624, + "flos": 24068879781120.0, + "grad_norm": 1.7932758192765126, + "language_loss": 0.79931915, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.82446957, + "num_input_tokens_seen": 165069370, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.20532227, + "step": 7689, + "time_per_iteration": 2.9059464931488037 + }, + { + "auxiliary_loss_clip": 0.01483767, + "auxiliary_loss_mlp": 0.01044094, + "balance_loss_clip": 1.30191684, + "balance_loss_mlp": 1.02304173, + "epoch": 0.4623478130166842, + "flos": 26662663756800.0, + "grad_norm": 2.483227058988167, + "language_loss": 0.57718992, + "learning_rate": 2.339324323980964e-06, + "loss": 0.60246849, + "num_input_tokens_seen": 165089610, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.21044922, + "step": 7690, + "time_per_iteration": 2.946136951446533 + }, + { + "auxiliary_loss_clip": 0.01479847, + "auxiliary_loss_mlp": 0.01045782, + "balance_loss_clip": 1.298033, + "balance_loss_mlp": 1.0241102, + "epoch": 0.46240793626935217, + "flos": 20568160909440.0, + "grad_norm": 8.433342810712363, + "language_loss": 0.8327269, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.85798323, + "num_input_tokens_seen": 165109050, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.21679688, + "step": 7691, + "time_per_iteration": 2.862374782562256 + }, + { + "auxiliary_loss_clip": 0.01473282, + "auxiliary_loss_mlp": 0.01044025, + "balance_loss_clip": 1.29348755, + "balance_loss_mlp": 1.02348554, + "epoch": 0.46246805952202014, + "flos": 22466376288000.0, + "grad_norm": 1.453414556777515, + "language_loss": 0.7608422, + "learning_rate": 2.338556667513091e-06, + "loss": 0.78601539, + "num_input_tokens_seen": 165130130, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.20532227, + "step": 7692, + "time_per_iteration": 2.8665647506713867 + }, + { + "auxiliary_loss_clip": 0.01462578, + "auxiliary_loss_mlp": 0.01046903, + "balance_loss_clip": 1.28318787, + "balance_loss_mlp": 1.02473044, + "epoch": 0.4625281827746881, + "flos": 35054288553600.0, + "grad_norm": 1.9563614231570285, + "language_loss": 0.74958766, + "learning_rate": 2.338172820014723e-06, + "loss": 0.77468246, + "num_input_tokens_seen": 165152685, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.22155762, + "step": 7693, + "time_per_iteration": 2.9809470176696777 + }, + { + "auxiliary_loss_clip": 0.01465053, + "auxiliary_loss_mlp": 0.01052159, + "balance_loss_clip": 1.2886095, + "balance_loss_mlp": 1.03028429, + "epoch": 0.46258830602735607, + "flos": 21078360984960.0, + "grad_norm": 1.7317118223301362, + "language_loss": 0.85905278, + "learning_rate": 2.337788959692808e-06, + "loss": 0.88422489, + "num_input_tokens_seen": 165173315, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.21862793, + "step": 7694, + "time_per_iteration": 2.84395694732666 + }, + { + "auxiliary_loss_clip": 0.01470177, + "auxiliary_loss_mlp": 0.01051757, + "balance_loss_clip": 1.28907502, + "balance_loss_mlp": 1.03081203, + "epoch": 0.46264842928002403, + "flos": 26188415314560.0, + "grad_norm": 2.0574135414648507, + "language_loss": 0.80418372, + "learning_rate": 2.337405086561902e-06, + "loss": 0.82940304, + "num_input_tokens_seen": 165192395, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.20947266, + "step": 7695, + "time_per_iteration": 2.888993740081787 + }, + { + "auxiliary_loss_clip": 0.01459441, + "auxiliary_loss_mlp": 0.01050132, + "balance_loss_clip": 1.28309822, + "balance_loss_mlp": 1.02978325, + "epoch": 0.462708552532692, + "flos": 16773042251520.0, + "grad_norm": 1.7149788925478255, + "language_loss": 0.72865188, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.75374758, + "num_input_tokens_seen": 165211355, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.20349121, + "step": 7696, + "time_per_iteration": 2.8278751373291016 + }, + { + "auxiliary_loss_clip": 0.01466675, + "auxiliary_loss_mlp": 0.0104626, + "balance_loss_clip": 1.28605556, + "balance_loss_mlp": 1.02532649, + "epoch": 0.46276867578535996, + "flos": 15569038126080.0, + "grad_norm": 1.557160509195556, + "language_loss": 0.70277202, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.72790134, + "num_input_tokens_seen": 165229380, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.20947266, + "step": 7697, + "time_per_iteration": 2.7915966510772705 + }, + { + "auxiliary_loss_clip": 0.01460617, + "auxiliary_loss_mlp": 0.01053314, + "balance_loss_clip": 1.2820015, + "balance_loss_mlp": 1.03141546, + "epoch": 0.462828799038028, + "flos": 22424633320320.0, + "grad_norm": 2.2742491726599847, + "language_loss": 0.85609663, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.88123596, + "num_input_tokens_seen": 165247200, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.21899414, + "step": 7698, + "time_per_iteration": 2.8370280265808105 + }, + { + "auxiliary_loss_clip": 0.01453938, + "auxiliary_loss_mlp": 0.01048361, + "balance_loss_clip": 1.27650034, + "balance_loss_mlp": 1.0272367, + "epoch": 0.46288892229069595, + "flos": 21079537349760.0, + "grad_norm": 1.7939736543825515, + "language_loss": 0.72152114, + "learning_rate": 2.335869466239502e-06, + "loss": 0.74654418, + "num_input_tokens_seen": 165265825, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.21118164, + "step": 7699, + "time_per_iteration": 2.8161075115203857 + }, + { + "auxiliary_loss_clip": 0.01472144, + "auxiliary_loss_mlp": 0.01046431, + "balance_loss_clip": 1.2892226, + "balance_loss_mlp": 1.02052689, + "epoch": 0.4629490455433639, + "flos": 23196177216000.0, + "grad_norm": 4.930155981968964, + "language_loss": 0.71936387, + "learning_rate": 2.335485529281996e-06, + "loss": 0.74454963, + "num_input_tokens_seen": 165284380, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.25915527, + "step": 7700, + "time_per_iteration": 2.862619161605835 + }, + { + "auxiliary_loss_clip": 0.01451055, + "auxiliary_loss_mlp": 0.01045681, + "balance_loss_clip": 1.27548301, + "balance_loss_mlp": 1.02348459, + "epoch": 0.4630091687960319, + "flos": 18842735998080.0, + "grad_norm": 2.308297658293401, + "language_loss": 0.73571998, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.76068735, + "num_input_tokens_seen": 165300320, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.2220459, + "step": 7701, + "time_per_iteration": 3.010960817337036 + }, + { + "auxiliary_loss_clip": 0.01467586, + "auxiliary_loss_mlp": 0.01041801, + "balance_loss_clip": 1.28491831, + "balance_loss_mlp": 1.01919913, + "epoch": 0.46306929204869984, + "flos": 38920541316480.0, + "grad_norm": 2.1524418939452863, + "language_loss": 0.65981722, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.68491107, + "num_input_tokens_seen": 165318130, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.22595215, + "step": 7702, + "time_per_iteration": 3.1165611743927 + }, + { + "auxiliary_loss_clip": 0.01451094, + "auxiliary_loss_mlp": 0.01037324, + "balance_loss_clip": 1.27410543, + "balance_loss_mlp": 1.01490104, + "epoch": 0.4631294153013678, + "flos": 19653127194240.0, + "grad_norm": 2.078042993062129, + "language_loss": 0.73644948, + "learning_rate": 2.33433364213785e-06, + "loss": 0.76133364, + "num_input_tokens_seen": 165336225, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.2244873, + "step": 7703, + "time_per_iteration": 4.363089323043823 + }, + { + "auxiliary_loss_clip": 0.01475514, + "auxiliary_loss_mlp": 0.01048402, + "balance_loss_clip": 1.29197788, + "balance_loss_mlp": 1.02504897, + "epoch": 0.4631895385540358, + "flos": 24619013032320.0, + "grad_norm": 1.8023291433112174, + "language_loss": 0.69816756, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.72340679, + "num_input_tokens_seen": 165355005, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.23339844, + "step": 7704, + "time_per_iteration": 2.8878016471862793 + }, + { + "auxiliary_loss_clip": 0.01464058, + "auxiliary_loss_mlp": 0.01041071, + "balance_loss_clip": 1.28292608, + "balance_loss_mlp": 1.01740789, + "epoch": 0.46324966180670374, + "flos": 26330547790080.0, + "grad_norm": 1.8730763207769465, + "language_loss": 0.81953818, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.84458947, + "num_input_tokens_seen": 165374910, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.23669434, + "step": 7705, + "time_per_iteration": 2.951017141342163 + }, + { + "auxiliary_loss_clip": 0.01468274, + "auxiliary_loss_mlp": 0.01044843, + "balance_loss_clip": 1.28707647, + "balance_loss_mlp": 1.02404106, + "epoch": 0.4633097850593717, + "flos": 19248746002560.0, + "grad_norm": 2.150755367227443, + "language_loss": 0.78897738, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.81410855, + "num_input_tokens_seen": 165392590, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.20800781, + "step": 7706, + "time_per_iteration": 2.823875665664673 + }, + { + "auxiliary_loss_clip": 0.01440747, + "auxiliary_loss_mlp": 0.01041823, + "balance_loss_clip": 1.26720452, + "balance_loss_mlp": 1.01999545, + "epoch": 0.46336990831203967, + "flos": 22793243857920.0, + "grad_norm": 1.8434156853553338, + "language_loss": 0.71102089, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.73584652, + "num_input_tokens_seen": 165411195, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.21813965, + "step": 7707, + "time_per_iteration": 2.8452210426330566 + }, + { + "auxiliary_loss_clip": 0.01459048, + "auxiliary_loss_mlp": 0.01045371, + "balance_loss_clip": 1.27800703, + "balance_loss_mlp": 1.02114761, + "epoch": 0.46343003156470763, + "flos": 38223117682560.0, + "grad_norm": 1.7984126188762937, + "language_loss": 0.61930633, + "learning_rate": 2.332413576865791e-06, + "loss": 0.64435053, + "num_input_tokens_seen": 165430150, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.2421875, + "step": 7708, + "time_per_iteration": 2.981297492980957 + }, + { + "auxiliary_loss_clip": 0.01450719, + "auxiliary_loss_mlp": 0.01045258, + "balance_loss_clip": 1.27383637, + "balance_loss_mlp": 1.02289426, + "epoch": 0.4634901548173756, + "flos": 31950349747200.0, + "grad_norm": 1.9375090887252284, + "language_loss": 0.77821952, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.80317932, + "num_input_tokens_seen": 165450595, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.22375488, + "step": 7709, + "time_per_iteration": 2.906395435333252 + }, + { + "auxiliary_loss_clip": 0.0146118, + "auxiliary_loss_mlp": 0.0104163, + "balance_loss_clip": 1.28089786, + "balance_loss_mlp": 1.01849127, + "epoch": 0.46355027807004356, + "flos": 20091740716800.0, + "grad_norm": 1.6951391427451175, + "language_loss": 0.77777267, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.80280083, + "num_input_tokens_seen": 165469515, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.23144531, + "step": 7710, + "time_per_iteration": 2.8789968490600586 + }, + { + "auxiliary_loss_clip": 0.01465951, + "auxiliary_loss_mlp": 0.01039602, + "balance_loss_clip": 1.28387046, + "balance_loss_mlp": 1.01622474, + "epoch": 0.4636104013227116, + "flos": 24072092161920.0, + "grad_norm": 1.9366513295867969, + "language_loss": 0.74170327, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.76675874, + "num_input_tokens_seen": 165488125, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.23376465, + "step": 7711, + "time_per_iteration": 4.3217933177948 + }, + { + "auxiliary_loss_clip": 0.01461473, + "auxiliary_loss_mlp": 0.01052555, + "balance_loss_clip": 1.28369451, + "balance_loss_mlp": 1.02833116, + "epoch": 0.46367052457537955, + "flos": 23925118492800.0, + "grad_norm": 1.3376632639661397, + "language_loss": 0.72199255, + "learning_rate": 2.33087729766797e-06, + "loss": 0.7471329, + "num_input_tokens_seen": 165509225, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.24206543, + "step": 7712, + "time_per_iteration": 4.355421543121338 + }, + { + "auxiliary_loss_clip": 0.01477079, + "auxiliary_loss_mlp": 0.01045923, + "balance_loss_clip": 1.29253101, + "balance_loss_mlp": 1.02165246, + "epoch": 0.4637306478280475, + "flos": 26407880432640.0, + "grad_norm": 1.7933232084817345, + "language_loss": 0.7410289, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.76625896, + "num_input_tokens_seen": 165529945, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.24267578, + "step": 7713, + "time_per_iteration": 2.8958425521850586 + }, + { + "auxiliary_loss_clip": 0.01469687, + "auxiliary_loss_mlp": 0.0104328, + "balance_loss_clip": 1.28695107, + "balance_loss_mlp": 1.02035594, + "epoch": 0.4637907710807155, + "flos": 21990634767360.0, + "grad_norm": 1.626554367690889, + "language_loss": 0.59631681, + "learning_rate": 2.3301090827294e-06, + "loss": 0.62144649, + "num_input_tokens_seen": 165550690, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.22924805, + "step": 7714, + "time_per_iteration": 2.8916175365448 + }, + { + "auxiliary_loss_clip": 0.01449198, + "auxiliary_loss_mlp": 0.01043153, + "balance_loss_clip": 1.27187729, + "balance_loss_mlp": 1.01985908, + "epoch": 0.46385089433338345, + "flos": 12429962092800.0, + "grad_norm": 3.0411747174715513, + "language_loss": 0.7085005, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.73342395, + "num_input_tokens_seen": 165567775, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.23303223, + "step": 7715, + "time_per_iteration": 2.838351249694824 + }, + { + "auxiliary_loss_clip": 0.01474037, + "auxiliary_loss_mlp": 0.01042341, + "balance_loss_clip": 1.28855884, + "balance_loss_mlp": 1.01948833, + "epoch": 0.4639110175860514, + "flos": 23926521081600.0, + "grad_norm": 2.290578391935896, + "language_loss": 0.68912673, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.71429056, + "num_input_tokens_seen": 165587010, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.22839355, + "step": 7716, + "time_per_iteration": 2.868561029434204 + }, + { + "auxiliary_loss_clip": 0.01468594, + "auxiliary_loss_mlp": 0.01044333, + "balance_loss_clip": 1.28777885, + "balance_loss_mlp": 1.02101576, + "epoch": 0.4639711408387194, + "flos": 25311052535040.0, + "grad_norm": 2.6903441097223055, + "language_loss": 0.81858528, + "learning_rate": 2.328956666474691e-06, + "loss": 0.84371454, + "num_input_tokens_seen": 165607850, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.2331543, + "step": 7717, + "time_per_iteration": 2.886523962020874 + }, + { + "auxiliary_loss_clip": 0.01462163, + "auxiliary_loss_mlp": 0.01043251, + "balance_loss_clip": 1.28116095, + "balance_loss_mlp": 1.01965928, + "epoch": 0.46403126409138734, + "flos": 21220855418880.0, + "grad_norm": 1.65208207784816, + "language_loss": 0.74045211, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.76550627, + "num_input_tokens_seen": 165627175, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.23596191, + "step": 7718, + "time_per_iteration": 2.8619933128356934 + }, + { + "auxiliary_loss_clip": 0.01451183, + "auxiliary_loss_mlp": 0.01049656, + "balance_loss_clip": 1.27223945, + "balance_loss_mlp": 1.02605271, + "epoch": 0.4640913873440553, + "flos": 35859159884160.0, + "grad_norm": 1.7000422084717037, + "language_loss": 0.71253431, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.73754269, + "num_input_tokens_seen": 165648340, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.23608398, + "step": 7719, + "time_per_iteration": 2.9574482440948486 + }, + { + "auxiliary_loss_clip": 0.01458284, + "auxiliary_loss_mlp": 0.01049362, + "balance_loss_clip": 1.27658141, + "balance_loss_mlp": 1.02425611, + "epoch": 0.46415151059672327, + "flos": 19174082803200.0, + "grad_norm": 2.5720929570713795, + "language_loss": 0.87536484, + "learning_rate": 2.327804137953357e-06, + "loss": 0.90044129, + "num_input_tokens_seen": 165667195, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.2512207, + "step": 7720, + "time_per_iteration": 2.8423311710357666 + }, + { + "auxiliary_loss_clip": 0.01290013, + "auxiliary_loss_mlp": 0.01065469, + "balance_loss_clip": 1.19002986, + "balance_loss_mlp": 1.03876591, + "epoch": 0.46421163384939124, + "flos": 58943925843840.0, + "grad_norm": 0.7280230534931796, + "language_loss": 0.55097175, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.57452655, + "num_input_tokens_seen": 165726760, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.26757812, + "step": 7721, + "time_per_iteration": 3.3859004974365234 + }, + { + "auxiliary_loss_clip": 0.01454135, + "auxiliary_loss_mlp": 0.01051985, + "balance_loss_clip": 1.27794611, + "balance_loss_mlp": 1.0275588, + "epoch": 0.4642717571020592, + "flos": 20167037343360.0, + "grad_norm": 1.9056384243362798, + "language_loss": 0.81085271, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.8359139, + "num_input_tokens_seen": 165745005, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.24414062, + "step": 7722, + "time_per_iteration": 2.944897413253784 + }, + { + "auxiliary_loss_clip": 0.01471307, + "auxiliary_loss_mlp": 0.01037102, + "balance_loss_clip": 1.28795564, + "balance_loss_mlp": 1.01386809, + "epoch": 0.46433188035472717, + "flos": 25057083617280.0, + "grad_norm": 1.6707507502593257, + "language_loss": 0.78485227, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.8099364, + "num_input_tokens_seen": 165765750, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.23242188, + "step": 7723, + "time_per_iteration": 2.8801467418670654 + }, + { + "auxiliary_loss_clip": 0.01448334, + "auxiliary_loss_mlp": 0.01039697, + "balance_loss_clip": 1.27228165, + "balance_loss_mlp": 1.01637983, + "epoch": 0.4643920036073952, + "flos": 28087626078720.0, + "grad_norm": 2.082014698211516, + "language_loss": 0.69026804, + "learning_rate": 2.326267259301118e-06, + "loss": 0.71514833, + "num_input_tokens_seen": 165787515, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.23303223, + "step": 7724, + "time_per_iteration": 2.9307503700256348 + }, + { + "auxiliary_loss_clip": 0.01449813, + "auxiliary_loss_mlp": 0.01040912, + "balance_loss_clip": 1.27219129, + "balance_loss_mlp": 1.01702297, + "epoch": 0.46445212686006315, + "flos": 18378531901440.0, + "grad_norm": 2.2347163990468566, + "language_loss": 0.68297553, + "learning_rate": 2.325883008671415e-06, + "loss": 0.70788276, + "num_input_tokens_seen": 165806675, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.2388916, + "step": 7725, + "time_per_iteration": 2.821589469909668 + }, + { + "auxiliary_loss_clip": 0.01435262, + "auxiliary_loss_mlp": 0.01040885, + "balance_loss_clip": 1.26333141, + "balance_loss_mlp": 1.01618505, + "epoch": 0.4645122501127311, + "flos": 31733870785920.0, + "grad_norm": 1.7327649895604826, + "language_loss": 0.65732539, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.68208683, + "num_input_tokens_seen": 165829835, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.24707031, + "step": 7726, + "time_per_iteration": 2.9924604892730713 + }, + { + "auxiliary_loss_clip": 0.0145266, + "auxiliary_loss_mlp": 0.01037298, + "balance_loss_clip": 1.27536249, + "balance_loss_mlp": 1.01461291, + "epoch": 0.4645723733653991, + "flos": 23779275943680.0, + "grad_norm": 1.8060572466889955, + "language_loss": 0.75640053, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.78130013, + "num_input_tokens_seen": 165849380, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.22692871, + "step": 7727, + "time_per_iteration": 2.908284902572632 + }, + { + "auxiliary_loss_clip": 0.0145495, + "auxiliary_loss_mlp": 0.01041606, + "balance_loss_clip": 1.27635193, + "balance_loss_mlp": 1.01784754, + "epoch": 0.46463249661806705, + "flos": 33158380659840.0, + "grad_norm": 1.815982588360759, + "language_loss": 0.78907669, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.81404227, + "num_input_tokens_seen": 165868620, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.23754883, + "step": 7728, + "time_per_iteration": 2.971658706665039 + }, + { + "auxiliary_loss_clip": 0.01463619, + "auxiliary_loss_mlp": 0.01034654, + "balance_loss_clip": 1.28348386, + "balance_loss_mlp": 1.01168227, + "epoch": 0.464692619870735, + "flos": 18305090311680.0, + "grad_norm": 2.0359939928415662, + "language_loss": 0.76650953, + "learning_rate": 2.324345882723155e-06, + "loss": 0.79149222, + "num_input_tokens_seen": 165885915, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.2298584, + "step": 7729, + "time_per_iteration": 2.895003080368042 + }, + { + "auxiliary_loss_clip": 0.01446907, + "auxiliary_loss_mlp": 0.01036048, + "balance_loss_clip": 1.2701211, + "balance_loss_mlp": 1.01237285, + "epoch": 0.464752743123403, + "flos": 22648306204800.0, + "grad_norm": 1.7254804959246097, + "language_loss": 0.808433, + "learning_rate": 2.323961570451588e-06, + "loss": 0.83326256, + "num_input_tokens_seen": 165905465, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.23693848, + "step": 7730, + "time_per_iteration": 2.846914291381836 + }, + { + "auxiliary_loss_clip": 0.01447155, + "auxiliary_loss_mlp": 0.01040048, + "balance_loss_clip": 1.27066338, + "balance_loss_mlp": 1.01676667, + "epoch": 0.46481286637607094, + "flos": 20421730177920.0, + "grad_norm": 3.163985828298907, + "language_loss": 0.78187859, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.80675066, + "num_input_tokens_seen": 165924640, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.23266602, + "step": 7731, + "time_per_iteration": 2.8764257431030273 + }, + { + "auxiliary_loss_clip": 0.01443584, + "auxiliary_loss_mlp": 0.01034854, + "balance_loss_clip": 1.26808953, + "balance_loss_mlp": 1.01171589, + "epoch": 0.4648729896287389, + "flos": 34288264523520.0, + "grad_norm": 6.52787957180364, + "language_loss": 0.66027224, + "learning_rate": 2.323192909069061e-06, + "loss": 0.68505669, + "num_input_tokens_seen": 165945765, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.23132324, + "step": 7732, + "time_per_iteration": 2.935483455657959 + }, + { + "auxiliary_loss_clip": 0.01465251, + "auxiliary_loss_mlp": 0.01046922, + "balance_loss_clip": 1.28238237, + "balance_loss_mlp": 1.02158976, + "epoch": 0.4649331128814069, + "flos": 21330972558720.0, + "grad_norm": 2.4757436415295992, + "language_loss": 0.73775566, + "learning_rate": 2.32280855998725e-06, + "loss": 0.7628774, + "num_input_tokens_seen": 165964025, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.25341797, + "step": 7733, + "time_per_iteration": 2.8409266471862793 + }, + { + "auxiliary_loss_clip": 0.01274518, + "auxiliary_loss_mlp": 0.01056719, + "balance_loss_clip": 1.17386246, + "balance_loss_mlp": 1.02639186, + "epoch": 0.46499323613407484, + "flos": 58334223173760.0, + "grad_norm": 1.2316066411508926, + "language_loss": 0.51962274, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.54293513, + "num_input_tokens_seen": 166021950, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.30273438, + "step": 7734, + "time_per_iteration": 3.2833995819091797 + }, + { + "auxiliary_loss_clip": 0.01457664, + "auxiliary_loss_mlp": 0.0103787, + "balance_loss_clip": 1.27870202, + "balance_loss_mlp": 1.01445746, + "epoch": 0.4650533593867428, + "flos": 10896873402240.0, + "grad_norm": 2.3686845254026605, + "language_loss": 0.76397306, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.78892839, + "num_input_tokens_seen": 166039675, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.23425293, + "step": 7735, + "time_per_iteration": 2.840210437774658 + }, + { + "auxiliary_loss_clip": 0.01446316, + "auxiliary_loss_mlp": 0.01044473, + "balance_loss_clip": 1.27075672, + "balance_loss_mlp": 1.02033329, + "epoch": 0.46511348263941077, + "flos": 19984157285760.0, + "grad_norm": 7.523662309823855, + "language_loss": 0.70556885, + "learning_rate": 2.321655439354519e-06, + "loss": 0.73047674, + "num_input_tokens_seen": 166057745, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.24133301, + "step": 7736, + "time_per_iteration": 2.9055418968200684 + }, + { + "auxiliary_loss_clip": 0.01436736, + "auxiliary_loss_mlp": 0.01034735, + "balance_loss_clip": 1.26337123, + "balance_loss_mlp": 1.010607, + "epoch": 0.46517360589207873, + "flos": 19686771342720.0, + "grad_norm": 1.6987821331134665, + "language_loss": 0.73073918, + "learning_rate": 2.321271041396427e-06, + "loss": 0.75545382, + "num_input_tokens_seen": 166076440, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.24121094, + "step": 7737, + "time_per_iteration": 2.830345630645752 + }, + { + "auxiliary_loss_clip": 0.01464695, + "auxiliary_loss_mlp": 0.0103697, + "balance_loss_clip": 1.28554893, + "balance_loss_mlp": 1.01330674, + "epoch": 0.46523372914474675, + "flos": 16881259109760.0, + "grad_norm": 2.1427199503492185, + "language_loss": 0.84554875, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.87056541, + "num_input_tokens_seen": 166092520, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.23669434, + "step": 7738, + "time_per_iteration": 4.324207782745361 + }, + { + "auxiliary_loss_clip": 0.01268355, + "auxiliary_loss_mlp": 0.0104337, + "balance_loss_clip": 1.1697135, + "balance_loss_mlp": 1.01800191, + "epoch": 0.4652938523974147, + "flos": 53468699581440.0, + "grad_norm": 0.7841903451665665, + "language_loss": 0.57967955, + "learning_rate": 2.320502208946932e-06, + "loss": 0.60279679, + "num_input_tokens_seen": 166156285, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.25390625, + "step": 7739, + "time_per_iteration": 3.4321415424346924 + }, + { + "auxiliary_loss_clip": 0.01458817, + "auxiliary_loss_mlp": 0.01043657, + "balance_loss_clip": 1.27982616, + "balance_loss_mlp": 1.02032804, + "epoch": 0.4653539756500827, + "flos": 15239591602560.0, + "grad_norm": 1.7685700234155608, + "language_loss": 0.85815644, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.88318115, + "num_input_tokens_seen": 166173455, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.2331543, + "step": 7740, + "time_per_iteration": 2.8683462142944336 + }, + { + "auxiliary_loss_clip": 0.01450251, + "auxiliary_loss_mlp": 0.01044715, + "balance_loss_clip": 1.27567184, + "balance_loss_mlp": 1.02094471, + "epoch": 0.46541409890275065, + "flos": 23742690883200.0, + "grad_norm": 1.4867486481983532, + "language_loss": 0.76132101, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.78627068, + "num_input_tokens_seen": 166194370, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.23791504, + "step": 7741, + "time_per_iteration": 2.917785882949829 + }, + { + "auxiliary_loss_clip": 0.01474025, + "auxiliary_loss_mlp": 0.01043475, + "balance_loss_clip": 1.29195905, + "balance_loss_mlp": 1.0209204, + "epoch": 0.4654742221554186, + "flos": 20856362158080.0, + "grad_norm": 1.7989287701058376, + "language_loss": 0.8139236, + "learning_rate": 2.319348869158064e-06, + "loss": 0.83909857, + "num_input_tokens_seen": 166213195, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.22546387, + "step": 7742, + "time_per_iteration": 2.9295310974121094 + }, + { + "auxiliary_loss_clip": 0.01453904, + "auxiliary_loss_mlp": 0.0104252, + "balance_loss_clip": 1.27441049, + "balance_loss_mlp": 1.01910782, + "epoch": 0.4655343454080866, + "flos": 20714727375360.0, + "grad_norm": 1.715048983217858, + "language_loss": 0.73633319, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.7612974, + "num_input_tokens_seen": 166231350, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.23413086, + "step": 7743, + "time_per_iteration": 2.848858118057251 + }, + { + "auxiliary_loss_clip": 0.01459384, + "auxiliary_loss_mlp": 0.01040553, + "balance_loss_clip": 1.28086567, + "balance_loss_mlp": 1.01638889, + "epoch": 0.46559446866075455, + "flos": 18999256320000.0, + "grad_norm": 2.481771428746751, + "language_loss": 0.7234149, + "learning_rate": 2.318579915392483e-06, + "loss": 0.74841428, + "num_input_tokens_seen": 166250530, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.24157715, + "step": 7744, + "time_per_iteration": 2.8956353664398193 + }, + { + "auxiliary_loss_clip": 0.01446884, + "auxiliary_loss_mlp": 0.01037524, + "balance_loss_clip": 1.27178741, + "balance_loss_mlp": 1.01408696, + "epoch": 0.4656545919134225, + "flos": 34509810902400.0, + "grad_norm": 2.591972612225464, + "language_loss": 0.85855019, + "learning_rate": 2.31819542038153e-06, + "loss": 0.88339424, + "num_input_tokens_seen": 166272545, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.234375, + "step": 7745, + "time_per_iteration": 2.9566757678985596 + }, + { + "auxiliary_loss_clip": 0.01439627, + "auxiliary_loss_mlp": 0.01042842, + "balance_loss_clip": 1.26432097, + "balance_loss_mlp": 1.01987052, + "epoch": 0.4657147151660905, + "flos": 24319726807680.0, + "grad_norm": 1.4905374709692187, + "language_loss": 0.7336477, + "learning_rate": 2.317810913304574e-06, + "loss": 0.75847238, + "num_input_tokens_seen": 166292135, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.22961426, + "step": 7746, + "time_per_iteration": 4.342200994491577 + }, + { + "auxiliary_loss_clip": 0.01438066, + "auxiliary_loss_mlp": 0.01043489, + "balance_loss_clip": 1.26488543, + "balance_loss_mlp": 1.0208751, + "epoch": 0.46577483841875844, + "flos": 58814697415680.0, + "grad_norm": 1.486868359582299, + "language_loss": 0.70790726, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.73272276, + "num_input_tokens_seen": 166316710, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.22607422, + "step": 7747, + "time_per_iteration": 4.635173320770264 + }, + { + "auxiliary_loss_clip": 0.0145482, + "auxiliary_loss_mlp": 0.01037776, + "balance_loss_clip": 1.27895141, + "balance_loss_mlp": 1.01535308, + "epoch": 0.4658349616714264, + "flos": 31334557011840.0, + "grad_norm": 1.7341595062304458, + "language_loss": 0.68287826, + "learning_rate": 2.317041863010978e-06, + "loss": 0.7078042, + "num_input_tokens_seen": 166338535, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.22424316, + "step": 7748, + "time_per_iteration": 2.9657742977142334 + }, + { + "auxiliary_loss_clip": 0.01466496, + "auxiliary_loss_mlp": 0.01038301, + "balance_loss_clip": 1.28481126, + "balance_loss_mlp": 1.01519787, + "epoch": 0.46589508492409437, + "flos": 14866954277760.0, + "grad_norm": 1.9596727707578627, + "language_loss": 0.65822107, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.68326902, + "num_input_tokens_seen": 166355540, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.23095703, + "step": 7749, + "time_per_iteration": 2.80350923538208 + }, + { + "auxiliary_loss_clip": 0.01477011, + "auxiliary_loss_mlp": 0.01046794, + "balance_loss_clip": 1.29642487, + "balance_loss_mlp": 1.02336943, + "epoch": 0.46595520817676234, + "flos": 12903622352640.0, + "grad_norm": 2.3367972089632674, + "language_loss": 0.74379903, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.76903701, + "num_input_tokens_seen": 166372635, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.23425293, + "step": 7750, + "time_per_iteration": 2.87186598777771 + }, + { + "auxiliary_loss_clip": 0.01467396, + "auxiliary_loss_mlp": 0.01035519, + "balance_loss_clip": 1.28738272, + "balance_loss_mlp": 1.01180792, + "epoch": 0.46601533142943036, + "flos": 32867102764800.0, + "grad_norm": 1.8135778067664545, + "language_loss": 0.74904871, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.77407789, + "num_input_tokens_seen": 166393175, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.23730469, + "step": 7751, + "time_per_iteration": 2.968078851699829 + }, + { + "auxiliary_loss_clip": 0.01465479, + "auxiliary_loss_mlp": 0.01044859, + "balance_loss_clip": 1.2827183, + "balance_loss_mlp": 1.02093375, + "epoch": 0.4660754546820983, + "flos": 19975244060160.0, + "grad_norm": 1.776971704630873, + "language_loss": 0.73788834, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.76299167, + "num_input_tokens_seen": 166408630, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.23925781, + "step": 7752, + "time_per_iteration": 2.8290586471557617 + }, + { + "auxiliary_loss_clip": 0.01470458, + "auxiliary_loss_mlp": 0.01043006, + "balance_loss_clip": 1.28890622, + "balance_loss_mlp": 1.01952159, + "epoch": 0.4661355779347663, + "flos": 26699520286080.0, + "grad_norm": 2.155426009934996, + "language_loss": 0.69726419, + "learning_rate": 2.315119027142644e-06, + "loss": 0.72239882, + "num_input_tokens_seen": 166428170, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.23461914, + "step": 7753, + "time_per_iteration": 2.922971725463867 + }, + { + "auxiliary_loss_clip": 0.01450836, + "auxiliary_loss_mlp": 0.01035588, + "balance_loss_clip": 1.27548385, + "balance_loss_mlp": 1.01225865, + "epoch": 0.46619570118743425, + "flos": 20969193985920.0, + "grad_norm": 1.721509261262011, + "language_loss": 0.73656654, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.76143074, + "num_input_tokens_seen": 166446705, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.23339844, + "step": 7754, + "time_per_iteration": 2.9297406673431396 + }, + { + "auxiliary_loss_clip": 0.01461223, + "auxiliary_loss_mlp": 0.01040131, + "balance_loss_clip": 1.28119779, + "balance_loss_mlp": 1.01720691, + "epoch": 0.4662558244401022, + "flos": 24436856891520.0, + "grad_norm": 1.7304423913917106, + "language_loss": 0.79369122, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.81870478, + "num_input_tokens_seen": 166466750, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.22924805, + "step": 7755, + "time_per_iteration": 2.912994146347046 + }, + { + "auxiliary_loss_clip": 0.01437336, + "auxiliary_loss_mlp": 0.01042215, + "balance_loss_clip": 1.26363611, + "balance_loss_mlp": 1.01966035, + "epoch": 0.4663159476927702, + "flos": 20605334152320.0, + "grad_norm": 1.6705440285554565, + "language_loss": 0.7358824, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.76067793, + "num_input_tokens_seen": 166485400, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.22570801, + "step": 7756, + "time_per_iteration": 2.8729231357574463 + }, + { + "auxiliary_loss_clip": 0.01442018, + "auxiliary_loss_mlp": 0.01040723, + "balance_loss_clip": 1.26777363, + "balance_loss_mlp": 1.01737022, + "epoch": 0.46637607094543815, + "flos": 25671745232640.0, + "grad_norm": 1.709709463394274, + "language_loss": 0.78756142, + "learning_rate": 2.313580543272274e-06, + "loss": 0.81238884, + "num_input_tokens_seen": 166505730, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.23364258, + "step": 7757, + "time_per_iteration": 2.877902030944824 + }, + { + "auxiliary_loss_clip": 0.01452158, + "auxiliary_loss_mlp": 0.01032777, + "balance_loss_clip": 1.27576101, + "balance_loss_mlp": 1.0115335, + "epoch": 0.4664361941981061, + "flos": 24282960768000.0, + "grad_norm": 2.172393648766265, + "language_loss": 0.6721217, + "learning_rate": 2.313195892540705e-06, + "loss": 0.69697106, + "num_input_tokens_seen": 166523770, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.21252441, + "step": 7758, + "time_per_iteration": 2.9181931018829346 + }, + { + "auxiliary_loss_clip": 0.01456554, + "auxiliary_loss_mlp": 0.01039626, + "balance_loss_clip": 1.2802012, + "balance_loss_mlp": 1.0170598, + "epoch": 0.4664963174507741, + "flos": 18415071717120.0, + "grad_norm": 1.741751361813492, + "language_loss": 0.75791776, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.78287959, + "num_input_tokens_seen": 166542935, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.22570801, + "step": 7759, + "time_per_iteration": 2.8382821083068848 + }, + { + "auxiliary_loss_clip": 0.01459257, + "auxiliary_loss_mlp": 0.01041086, + "balance_loss_clip": 1.28354692, + "balance_loss_mlp": 1.01867437, + "epoch": 0.46655644070344204, + "flos": 22465199923200.0, + "grad_norm": 1.5327456154332444, + "language_loss": 0.7774961, + "learning_rate": 2.312426555462893e-06, + "loss": 0.80249953, + "num_input_tokens_seen": 166563935, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.22412109, + "step": 7760, + "time_per_iteration": 2.8832669258117676 + }, + { + "auxiliary_loss_clip": 0.01444831, + "auxiliary_loss_mlp": 0.01044619, + "balance_loss_clip": 1.27091575, + "balance_loss_mlp": 1.0222671, + "epoch": 0.46661656395611, + "flos": 13816348583040.0, + "grad_norm": 1.7297299094069096, + "language_loss": 0.74897438, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.7738688, + "num_input_tokens_seen": 166582175, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.22351074, + "step": 7761, + "time_per_iteration": 2.8712756633758545 + }, + { + "auxiliary_loss_clip": 0.01477935, + "auxiliary_loss_mlp": 0.01045877, + "balance_loss_clip": 1.29701936, + "balance_loss_mlp": 1.02123713, + "epoch": 0.466676687208778, + "flos": 21661957405440.0, + "grad_norm": 1.5493029345338603, + "language_loss": 0.7944628, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.81970096, + "num_input_tokens_seen": 166601870, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.24645996, + "step": 7762, + "time_per_iteration": 2.872647285461426 + }, + { + "auxiliary_loss_clip": 0.01247654, + "auxiliary_loss_mlp": 0.01048839, + "balance_loss_clip": 1.14730597, + "balance_loss_mlp": 1.01946604, + "epoch": 0.46673681046144594, + "flos": 68565145340160.0, + "grad_norm": 0.8155508127491906, + "language_loss": 0.59864938, + "learning_rate": 2.311272461028297e-06, + "loss": 0.62161428, + "num_input_tokens_seen": 166668960, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.29296875, + "step": 7763, + "time_per_iteration": 3.436049461364746 + }, + { + "auxiliary_loss_clip": 0.01466608, + "auxiliary_loss_mlp": 0.0104357, + "balance_loss_clip": 1.28489578, + "balance_loss_mlp": 1.01977539, + "epoch": 0.46679693371411396, + "flos": 15822735575040.0, + "grad_norm": 2.1462561655029786, + "language_loss": 0.7943126, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.81941438, + "num_input_tokens_seen": 166686110, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.23803711, + "step": 7764, + "time_per_iteration": 2.810814380645752 + }, + { + "auxiliary_loss_clip": 0.01458002, + "auxiliary_loss_mlp": 0.01045319, + "balance_loss_clip": 1.28324461, + "balance_loss_mlp": 1.02278829, + "epoch": 0.4668570569667819, + "flos": 18523605288960.0, + "grad_norm": 1.8929049609779096, + "language_loss": 0.73463857, + "learning_rate": 2.310503005696839e-06, + "loss": 0.75967181, + "num_input_tokens_seen": 166703930, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.22509766, + "step": 7765, + "time_per_iteration": 2.8739616870880127 + }, + { + "auxiliary_loss_clip": 0.01467854, + "auxiliary_loss_mlp": 0.01045325, + "balance_loss_clip": 1.28898942, + "balance_loss_mlp": 1.02259159, + "epoch": 0.4669171802194499, + "flos": 19215554302080.0, + "grad_norm": 2.3548115022829648, + "language_loss": 0.78712451, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.81225622, + "num_input_tokens_seen": 166719940, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.22729492, + "step": 7766, + "time_per_iteration": 2.935147523880005 + }, + { + "auxiliary_loss_clip": 0.01457093, + "auxiliary_loss_mlp": 0.01040211, + "balance_loss_clip": 1.28139865, + "balance_loss_mlp": 1.01857424, + "epoch": 0.46697730347211786, + "flos": 12283712340480.0, + "grad_norm": 4.640353268551855, + "language_loss": 0.66232812, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.68730116, + "num_input_tokens_seen": 166738285, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.21643066, + "step": 7767, + "time_per_iteration": 2.8599228858947754 + }, + { + "auxiliary_loss_clip": 0.01463348, + "auxiliary_loss_mlp": 0.010463, + "balance_loss_clip": 1.2857306, + "balance_loss_mlp": 1.02368617, + "epoch": 0.4670374267247858, + "flos": 23597255537280.0, + "grad_norm": 1.9821133767787407, + "language_loss": 0.75540257, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.78049898, + "num_input_tokens_seen": 166758170, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.22607422, + "step": 7768, + "time_per_iteration": 2.8706419467926025 + }, + { + "auxiliary_loss_clip": 0.01455881, + "auxiliary_loss_mlp": 0.01047182, + "balance_loss_clip": 1.27774239, + "balance_loss_mlp": 1.02517605, + "epoch": 0.4670975499774538, + "flos": 15997924016640.0, + "grad_norm": 1.6213766050273777, + "language_loss": 0.71301925, + "learning_rate": 2.308963953858982e-06, + "loss": 0.73804986, + "num_input_tokens_seen": 166775750, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.2199707, + "step": 7769, + "time_per_iteration": 2.871973752975464 + }, + { + "auxiliary_loss_clip": 0.01466626, + "auxiliary_loss_mlp": 0.01040644, + "balance_loss_clip": 1.2873919, + "balance_loss_mlp": 1.01782739, + "epoch": 0.46715767323012175, + "flos": 15386022334080.0, + "grad_norm": 2.1237013647979186, + "language_loss": 0.82153803, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.84661067, + "num_input_tokens_seen": 166791720, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.22802734, + "step": 7770, + "time_per_iteration": 2.841442823410034 + }, + { + "auxiliary_loss_clip": 0.01248082, + "auxiliary_loss_mlp": 0.01028948, + "balance_loss_clip": 1.14804125, + "balance_loss_mlp": 1.00129151, + "epoch": 0.4672177964827897, + "flos": 60281013484800.0, + "grad_norm": 0.8648783717561113, + "language_loss": 0.55666459, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.57943487, + "num_input_tokens_seen": 166856360, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.27734375, + "step": 7771, + "time_per_iteration": 3.4392666816711426 + }, + { + "auxiliary_loss_clip": 0.0145727, + "auxiliary_loss_mlp": 0.01044023, + "balance_loss_clip": 1.28215218, + "balance_loss_mlp": 1.02179074, + "epoch": 0.4672779197354577, + "flos": 27647564722560.0, + "grad_norm": 2.9354985649339875, + "language_loss": 0.66069746, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.68571043, + "num_input_tokens_seen": 166875925, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.22216797, + "step": 7772, + "time_per_iteration": 2.9237897396087646 + }, + { + "auxiliary_loss_clip": 0.01462196, + "auxiliary_loss_mlp": 0.01039306, + "balance_loss_clip": 1.28638244, + "balance_loss_mlp": 1.0176096, + "epoch": 0.46733804298812565, + "flos": 31406686502400.0, + "grad_norm": 2.2021678298027845, + "language_loss": 0.64753932, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.67255431, + "num_input_tokens_seen": 166896520, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.21679688, + "step": 7773, + "time_per_iteration": 4.337916612625122 + }, + { + "auxiliary_loss_clip": 0.01456129, + "auxiliary_loss_mlp": 0.01041528, + "balance_loss_clip": 1.27808022, + "balance_loss_mlp": 1.01892591, + "epoch": 0.4673981662407936, + "flos": 19510135067520.0, + "grad_norm": 2.1223566643519307, + "language_loss": 0.80429977, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.82927632, + "num_input_tokens_seen": 166915370, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.22570801, + "step": 7774, + "time_per_iteration": 2.847783088684082 + }, + { + "auxiliary_loss_clip": 0.01462986, + "auxiliary_loss_mlp": 0.01040135, + "balance_loss_clip": 1.28304553, + "balance_loss_mlp": 1.01835561, + "epoch": 0.4674582894934616, + "flos": 20531530604160.0, + "grad_norm": 1.7264233916950047, + "language_loss": 0.79163373, + "learning_rate": 2.306655024915726e-06, + "loss": 0.81666493, + "num_input_tokens_seen": 166934875, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.21789551, + "step": 7775, + "time_per_iteration": 2.8886308670043945 + }, + { + "auxiliary_loss_clip": 0.0145233, + "auxiliary_loss_mlp": 0.01035808, + "balance_loss_clip": 1.27652788, + "balance_loss_mlp": 1.01412368, + "epoch": 0.46751841274612954, + "flos": 22101113865600.0, + "grad_norm": 2.0266299626234274, + "language_loss": 0.70380569, + "learning_rate": 2.306270162640694e-06, + "loss": 0.72868711, + "num_input_tokens_seen": 166954285, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.21691895, + "step": 7776, + "time_per_iteration": 2.873547077178955 + }, + { + "auxiliary_loss_clip": 0.01452846, + "auxiliary_loss_mlp": 0.01036605, + "balance_loss_clip": 1.27631664, + "balance_loss_mlp": 1.0152185, + "epoch": 0.46757853599879756, + "flos": 26991522097920.0, + "grad_norm": 1.8634497707474054, + "language_loss": 0.74115914, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.76605368, + "num_input_tokens_seen": 166975975, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.21398926, + "step": 7777, + "time_per_iteration": 2.9397499561309814 + }, + { + "auxiliary_loss_clip": 0.01465381, + "auxiliary_loss_mlp": 0.01039537, + "balance_loss_clip": 1.28572893, + "balance_loss_mlp": 1.01790094, + "epoch": 0.4676386592514655, + "flos": 24144719345280.0, + "grad_norm": 6.61439344617977, + "language_loss": 0.70751655, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.7325657, + "num_input_tokens_seen": 166996140, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.21630859, + "step": 7778, + "time_per_iteration": 2.911848306655884 + }, + { + "auxiliary_loss_clip": 0.01472656, + "auxiliary_loss_mlp": 0.01040095, + "balance_loss_clip": 1.29146683, + "balance_loss_mlp": 1.01825547, + "epoch": 0.4676987825041335, + "flos": 25494656509440.0, + "grad_norm": 2.2576013081586557, + "language_loss": 0.74121583, + "learning_rate": 2.305115506191206e-06, + "loss": 0.76634336, + "num_input_tokens_seen": 167016105, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.21838379, + "step": 7779, + "time_per_iteration": 2.897386074066162 + }, + { + "auxiliary_loss_clip": 0.01443896, + "auxiliary_loss_mlp": 0.01042423, + "balance_loss_clip": 1.26953268, + "balance_loss_mlp": 1.02047634, + "epoch": 0.46775890575680146, + "flos": 21955497540480.0, + "grad_norm": 2.434361575367322, + "language_loss": 0.72948003, + "learning_rate": 2.304730597548562e-06, + "loss": 0.75434327, + "num_input_tokens_seen": 167036185, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.21948242, + "step": 7780, + "time_per_iteration": 2.901930809020996 + }, + { + "auxiliary_loss_clip": 0.01484651, + "auxiliary_loss_mlp": 0.01042253, + "balance_loss_clip": 1.29915869, + "balance_loss_mlp": 1.02055717, + "epoch": 0.4678190290094694, + "flos": 25239285002880.0, + "grad_norm": 1.8217064057666252, + "language_loss": 0.7454983, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.77076733, + "num_input_tokens_seen": 167054515, + "router_z_loss_clip": 1.85253906, + "router_z_loss_mlp": 0.21704102, + "step": 7781, + "time_per_iteration": 4.40968132019043 + }, + { + "auxiliary_loss_clip": 0.01478606, + "auxiliary_loss_mlp": 0.01041811, + "balance_loss_clip": 1.29650533, + "balance_loss_mlp": 1.01907814, + "epoch": 0.4678791522621374, + "flos": 32280610677120.0, + "grad_norm": 1.686313850230819, + "language_loss": 0.63127172, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.6564759, + "num_input_tokens_seen": 167077245, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.22753906, + "step": 7782, + "time_per_iteration": 4.437561750411987 + }, + { + "auxiliary_loss_clip": 0.01471635, + "auxiliary_loss_mlp": 0.01037913, + "balance_loss_clip": 1.2900548, + "balance_loss_mlp": 1.01655054, + "epoch": 0.46793927551480535, + "flos": 27056231441280.0, + "grad_norm": 2.093456807640064, + "language_loss": 0.63547397, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.66056943, + "num_input_tokens_seen": 167097235, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.21362305, + "step": 7783, + "time_per_iteration": 4.292320966720581 + }, + { + "auxiliary_loss_clip": 0.0148646, + "auxiliary_loss_mlp": 0.01042454, + "balance_loss_clip": 1.29947686, + "balance_loss_mlp": 1.01976824, + "epoch": 0.4679993987674733, + "flos": 17466393853440.0, + "grad_norm": 2.8066610323514922, + "language_loss": 0.68540865, + "learning_rate": 2.303190847569801e-06, + "loss": 0.71069777, + "num_input_tokens_seen": 167113155, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.22680664, + "step": 7784, + "time_per_iteration": 2.880420207977295 + }, + { + "auxiliary_loss_clip": 0.01461677, + "auxiliary_loss_mlp": 0.01039595, + "balance_loss_clip": 1.28419805, + "balance_loss_mlp": 1.01837587, + "epoch": 0.4680595220201413, + "flos": 17173939593600.0, + "grad_norm": 3.1240682353637212, + "language_loss": 0.85588259, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.88089532, + "num_input_tokens_seen": 167131765, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.2121582, + "step": 7785, + "time_per_iteration": 2.8619790077209473 + }, + { + "auxiliary_loss_clip": 0.01464067, + "auxiliary_loss_mlp": 0.01042024, + "balance_loss_clip": 1.28458309, + "balance_loss_mlp": 1.01906419, + "epoch": 0.46811964527280925, + "flos": 11334943987200.0, + "grad_norm": 3.101091957656719, + "language_loss": 0.78012633, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.80518723, + "num_input_tokens_seen": 167149030, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.22937012, + "step": 7786, + "time_per_iteration": 2.813344717025757 + }, + { + "auxiliary_loss_clip": 0.01449612, + "auxiliary_loss_mlp": 0.01033175, + "balance_loss_clip": 1.27538788, + "balance_loss_mlp": 1.01250446, + "epoch": 0.4681797685254772, + "flos": 24289204550400.0, + "grad_norm": 2.460105322903489, + "language_loss": 0.75006241, + "learning_rate": 2.302035914315856e-06, + "loss": 0.77489024, + "num_input_tokens_seen": 167167375, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.20666504, + "step": 7787, + "time_per_iteration": 2.8822031021118164 + }, + { + "auxiliary_loss_clip": 0.01473976, + "auxiliary_loss_mlp": 0.0104244, + "balance_loss_clip": 1.29414427, + "balance_loss_mlp": 1.01958752, + "epoch": 0.4682398917781452, + "flos": 31663053394560.0, + "grad_norm": 1.636886613708989, + "language_loss": 0.66125584, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.68641996, + "num_input_tokens_seen": 167188065, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.22851562, + "step": 7788, + "time_per_iteration": 2.9383580684661865 + }, + { + "auxiliary_loss_clip": 0.01468229, + "auxiliary_loss_mlp": 0.01031464, + "balance_loss_clip": 1.29083395, + "balance_loss_mlp": 1.0109601, + "epoch": 0.46830001503081314, + "flos": 28122401347200.0, + "grad_norm": 2.093833991661311, + "language_loss": 0.64904666, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.67404366, + "num_input_tokens_seen": 167209675, + "router_z_loss_clip": 1.77539062, + "router_z_loss_mlp": 0.20507812, + "step": 7789, + "time_per_iteration": 2.9214930534362793 + }, + { + "auxiliary_loss_clip": 0.01249193, + "auxiliary_loss_mlp": 0.01023541, + "balance_loss_clip": 1.14703476, + "balance_loss_mlp": 1.00160646, + "epoch": 0.4683601382834811, + "flos": 57910178476800.0, + "grad_norm": 0.7419934930033778, + "language_loss": 0.61923254, + "learning_rate": 2.300880877982825e-06, + "loss": 0.64195991, + "num_input_tokens_seen": 167273940, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.21972656, + "step": 7790, + "time_per_iteration": 3.4815359115600586 + }, + { + "auxiliary_loss_clip": 0.01466685, + "auxiliary_loss_mlp": 0.01040698, + "balance_loss_clip": 1.29022551, + "balance_loss_mlp": 1.01915669, + "epoch": 0.46842026153614913, + "flos": 21882010705920.0, + "grad_norm": 1.7641774248931008, + "language_loss": 0.79334438, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.8184182, + "num_input_tokens_seen": 167292730, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.21533203, + "step": 7791, + "time_per_iteration": 2.871277093887329 + }, + { + "auxiliary_loss_clip": 0.01460662, + "auxiliary_loss_mlp": 0.01040108, + "balance_loss_clip": 1.28444588, + "balance_loss_mlp": 1.01828074, + "epoch": 0.4684803847888171, + "flos": 24911829250560.0, + "grad_norm": 1.5101446446404527, + "language_loss": 0.75520283, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.78021049, + "num_input_tokens_seen": 167313460, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.21838379, + "step": 7792, + "time_per_iteration": 2.9237585067749023 + }, + { + "auxiliary_loss_clip": 0.01453348, + "auxiliary_loss_mlp": 0.01038101, + "balance_loss_clip": 1.27958024, + "balance_loss_mlp": 1.0175252, + "epoch": 0.46854050804148506, + "flos": 26263259493120.0, + "grad_norm": 1.569305039042321, + "language_loss": 0.68550348, + "learning_rate": 2.299725738964898e-06, + "loss": 0.71041799, + "num_input_tokens_seen": 167335385, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.20568848, + "step": 7793, + "time_per_iteration": 2.9084737300872803 + }, + { + "auxiliary_loss_clip": 0.01461058, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.28563738, + "balance_loss_mlp": 1.01486659, + "epoch": 0.468600631294153, + "flos": 21589737425280.0, + "grad_norm": 1.612128752491402, + "language_loss": 0.74312907, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.76810116, + "num_input_tokens_seen": 167353625, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.21289062, + "step": 7794, + "time_per_iteration": 2.8693981170654297 + }, + { + "auxiliary_loss_clip": 0.01471599, + "auxiliary_loss_mlp": 0.01041371, + "balance_loss_clip": 1.29467499, + "balance_loss_mlp": 1.01950848, + "epoch": 0.468660754546821, + "flos": 25896820705920.0, + "grad_norm": 1.5543804471693339, + "language_loss": 0.64732748, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.67245722, + "num_input_tokens_seen": 167374565, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.21850586, + "step": 7795, + "time_per_iteration": 2.91186785697937 + }, + { + "auxiliary_loss_clip": 0.01470685, + "auxiliary_loss_mlp": 0.01036221, + "balance_loss_clip": 1.29536247, + "balance_loss_mlp": 1.01596737, + "epoch": 0.46872087779948896, + "flos": 35487020252160.0, + "grad_norm": 2.2239675471800218, + "language_loss": 0.69416094, + "learning_rate": 2.298570497656304e-06, + "loss": 0.71923, + "num_input_tokens_seen": 167395010, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.20251465, + "step": 7796, + "time_per_iteration": 3.0099494457244873 + }, + { + "auxiliary_loss_clip": 0.01472959, + "auxiliary_loss_mlp": 0.01039303, + "balance_loss_clip": 1.29321802, + "balance_loss_mlp": 1.01819062, + "epoch": 0.4687810010521569, + "flos": 26407744698240.0, + "grad_norm": 1.7875485151369395, + "language_loss": 0.71962047, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.74474311, + "num_input_tokens_seen": 167415285, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.21105957, + "step": 7797, + "time_per_iteration": 3.064347505569458 + }, + { + "auxiliary_loss_clip": 0.01500298, + "auxiliary_loss_mlp": 0.01039021, + "balance_loss_clip": 1.31849575, + "balance_loss_mlp": 1.01748037, + "epoch": 0.4688411243048249, + "flos": 19981849800960.0, + "grad_norm": 1.9146898263250078, + "language_loss": 0.68150985, + "learning_rate": 2.297800280150454e-06, + "loss": 0.70690298, + "num_input_tokens_seen": 167432405, + "router_z_loss_clip": 1.81640625, + "router_z_loss_mlp": 0.21533203, + "step": 7798, + "time_per_iteration": 2.833261013031006 + }, + { + "auxiliary_loss_clip": 0.01260737, + "auxiliary_loss_mlp": 0.01032417, + "balance_loss_clip": 1.15929401, + "balance_loss_mlp": 1.00628614, + "epoch": 0.46890124755749285, + "flos": 64007622236160.0, + "grad_norm": 0.9289949272670465, + "language_loss": 0.64611316, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.66904473, + "num_input_tokens_seen": 167499365, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.26171875, + "step": 7799, + "time_per_iteration": 3.5514254570007324 + }, + { + "auxiliary_loss_clip": 0.01469718, + "auxiliary_loss_mlp": 0.0103536, + "balance_loss_clip": 1.29051244, + "balance_loss_mlp": 1.0146296, + "epoch": 0.4689613708101608, + "flos": 23779502167680.0, + "grad_norm": 1.5075276318152695, + "language_loss": 0.73318988, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.75824064, + "num_input_tokens_seen": 167520390, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.20727539, + "step": 7800, + "time_per_iteration": 2.91387939453125 + }, + { + "auxiliary_loss_clip": 0.01481853, + "auxiliary_loss_mlp": 0.01040643, + "balance_loss_clip": 1.30502319, + "balance_loss_mlp": 1.02116382, + "epoch": 0.4690214940628288, + "flos": 24798997422720.0, + "grad_norm": 2.2490006203645074, + "language_loss": 0.73739249, + "learning_rate": 2.296644869233568e-06, + "loss": 0.76261747, + "num_input_tokens_seen": 167539865, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.19470215, + "step": 7801, + "time_per_iteration": 2.9136178493499756 + }, + { + "auxiliary_loss_clip": 0.01486086, + "auxiliary_loss_mlp": 0.01042696, + "balance_loss_clip": 1.30216217, + "balance_loss_mlp": 1.02126169, + "epoch": 0.46908161731549675, + "flos": 18086394355200.0, + "grad_norm": 2.5117133562382756, + "language_loss": 0.64280307, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.66809094, + "num_input_tokens_seen": 167558190, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.21435547, + "step": 7802, + "time_per_iteration": 2.8764374256134033 + }, + { + "auxiliary_loss_clip": 0.01491123, + "auxiliary_loss_mlp": 0.01039463, + "balance_loss_clip": 1.31052065, + "balance_loss_mlp": 1.01876783, + "epoch": 0.4691417405681647, + "flos": 25714528830720.0, + "grad_norm": 1.7111735555453922, + "language_loss": 0.74289227, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.76819807, + "num_input_tokens_seen": 167577685, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.20703125, + "step": 7803, + "time_per_iteration": 2.919506311416626 + }, + { + "auxiliary_loss_clip": 0.0147496, + "auxiliary_loss_mlp": 0.01044552, + "balance_loss_clip": 1.29624987, + "balance_loss_mlp": 1.02372646, + "epoch": 0.46920186382083273, + "flos": 17465941405440.0, + "grad_norm": 1.6018488155008592, + "language_loss": 0.77750731, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.80270243, + "num_input_tokens_seen": 167596390, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.20825195, + "step": 7804, + "time_per_iteration": 2.8506152629852295 + }, + { + "auxiliary_loss_clip": 0.01471433, + "auxiliary_loss_mlp": 0.01039168, + "balance_loss_clip": 1.29616737, + "balance_loss_mlp": 1.01738858, + "epoch": 0.4692619870735007, + "flos": 20349102994560.0, + "grad_norm": 1.6855101613326242, + "language_loss": 0.77824265, + "learning_rate": 2.295104163929305e-06, + "loss": 0.80334866, + "num_input_tokens_seen": 167614980, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.21801758, + "step": 7805, + "time_per_iteration": 2.8661677837371826 + }, + { + "auxiliary_loss_clip": 0.01490508, + "auxiliary_loss_mlp": 0.01044446, + "balance_loss_clip": 1.30569816, + "balance_loss_mlp": 1.02279735, + "epoch": 0.46932211032616866, + "flos": 29508380634240.0, + "grad_norm": 1.528944816733302, + "language_loss": 0.83899462, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.86434412, + "num_input_tokens_seen": 167635895, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.21655273, + "step": 7806, + "time_per_iteration": 2.913583755493164 + }, + { + "auxiliary_loss_clip": 0.01485637, + "auxiliary_loss_mlp": 0.01043155, + "balance_loss_clip": 1.30454922, + "balance_loss_mlp": 1.02201939, + "epoch": 0.4693822335788366, + "flos": 36225146223360.0, + "grad_norm": 2.001180856885291, + "language_loss": 0.78222942, + "learning_rate": 2.294333744076472e-06, + "loss": 0.80751735, + "num_input_tokens_seen": 167657440, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.21130371, + "step": 7807, + "time_per_iteration": 3.01485276222229 + }, + { + "auxiliary_loss_clip": 0.01494708, + "auxiliary_loss_mlp": 0.01038036, + "balance_loss_clip": 1.31445813, + "balance_loss_mlp": 1.01653039, + "epoch": 0.4694423568315046, + "flos": 20348514812160.0, + "grad_norm": 2.3734718760396993, + "language_loss": 0.52023339, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.54556084, + "num_input_tokens_seen": 167675025, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.21520996, + "step": 7808, + "time_per_iteration": 4.304847002029419 + }, + { + "auxiliary_loss_clip": 0.01259979, + "auxiliary_loss_mlp": 0.01023494, + "balance_loss_clip": 1.16038191, + "balance_loss_mlp": 1.00022435, + "epoch": 0.46950248008417256, + "flos": 64353022191360.0, + "grad_norm": 0.782187776037097, + "language_loss": 0.57843632, + "learning_rate": 2.293563279578978e-06, + "loss": 0.60127103, + "num_input_tokens_seen": 167729635, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.23242188, + "step": 7809, + "time_per_iteration": 3.2456390857696533 + }, + { + "auxiliary_loss_clip": 0.01483452, + "auxiliary_loss_mlp": 0.01042584, + "balance_loss_clip": 1.30134118, + "balance_loss_mlp": 1.02070916, + "epoch": 0.4695626033368405, + "flos": 19207319748480.0, + "grad_norm": 2.142945239007722, + "language_loss": 0.72492272, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.75018311, + "num_input_tokens_seen": 167745135, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.21875, + "step": 7810, + "time_per_iteration": 2.9011523723602295 + }, + { + "auxiliary_loss_clip": 0.01478844, + "auxiliary_loss_mlp": 0.01039907, + "balance_loss_clip": 1.29854155, + "balance_loss_mlp": 1.01838994, + "epoch": 0.4696227265895085, + "flos": 23012663731200.0, + "grad_norm": 1.8374945381940766, + "language_loss": 0.82175028, + "learning_rate": 2.29279277055369e-06, + "loss": 0.84693778, + "num_input_tokens_seen": 167763875, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.21520996, + "step": 7811, + "time_per_iteration": 2.8768444061279297 + }, + { + "auxiliary_loss_clip": 0.01480574, + "auxiliary_loss_mlp": 0.01039235, + "balance_loss_clip": 1.3004998, + "balance_loss_mlp": 1.01752722, + "epoch": 0.46968284984217645, + "flos": 21880653361920.0, + "grad_norm": 1.6401858213468052, + "language_loss": 0.81001389, + "learning_rate": 2.292407499379644e-06, + "loss": 0.83521199, + "num_input_tokens_seen": 167784895, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.21716309, + "step": 7812, + "time_per_iteration": 2.8815088272094727 + }, + { + "auxiliary_loss_clip": 0.01463394, + "auxiliary_loss_mlp": 0.01040914, + "balance_loss_clip": 1.28846431, + "balance_loss_mlp": 1.01955163, + "epoch": 0.4697429730948444, + "flos": 19984654978560.0, + "grad_norm": 1.6410024830192649, + "language_loss": 0.75020576, + "learning_rate": 2.292022217117477e-06, + "loss": 0.77524883, + "num_input_tokens_seen": 167803185, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.21362305, + "step": 7813, + "time_per_iteration": 2.913576126098633 + }, + { + "auxiliary_loss_clip": 0.01470667, + "auxiliary_loss_mlp": 0.01036623, + "balance_loss_clip": 1.29104733, + "balance_loss_mlp": 1.01453304, + "epoch": 0.4698030963475124, + "flos": 15163933017600.0, + "grad_norm": 2.15004620440249, + "language_loss": 0.85116082, + "learning_rate": 2.291636923781798e-06, + "loss": 0.8762337, + "num_input_tokens_seen": 167816550, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.22094727, + "step": 7814, + "time_per_iteration": 2.831911325454712 + }, + { + "auxiliary_loss_clip": 0.01450643, + "auxiliary_loss_mlp": 0.01045528, + "balance_loss_clip": 1.27605915, + "balance_loss_mlp": 1.02440381, + "epoch": 0.46986321960018035, + "flos": 15157598745600.0, + "grad_norm": 2.290244990236686, + "language_loss": 0.82353669, + "learning_rate": 2.291251619387217e-06, + "loss": 0.8484984, + "num_input_tokens_seen": 167831845, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.21130371, + "step": 7815, + "time_per_iteration": 2.8504645824432373 + }, + { + "auxiliary_loss_clip": 0.01457627, + "auxiliary_loss_mlp": 0.0103559, + "balance_loss_clip": 1.28010654, + "balance_loss_mlp": 1.01384628, + "epoch": 0.4699233428528483, + "flos": 23118392125440.0, + "grad_norm": 2.0359092749536383, + "language_loss": 0.78001314, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.80494529, + "num_input_tokens_seen": 167850360, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.21740723, + "step": 7816, + "time_per_iteration": 4.305444002151489 + }, + { + "auxiliary_loss_clip": 0.01253574, + "auxiliary_loss_mlp": 0.0102889, + "balance_loss_clip": 1.15262461, + "balance_loss_mlp": 1.0054301, + "epoch": 0.46998346610551633, + "flos": 68138702668800.0, + "grad_norm": 0.8349083415262301, + "language_loss": 0.59058952, + "learning_rate": 2.290480977479796e-06, + "loss": 0.61341417, + "num_input_tokens_seen": 167908660, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.234375, + "step": 7817, + "time_per_iteration": 4.785445213317871 + }, + { + "auxiliary_loss_clip": 0.01451632, + "auxiliary_loss_mlp": 0.01036875, + "balance_loss_clip": 1.27947927, + "balance_loss_mlp": 1.01607347, + "epoch": 0.4700435893581843, + "flos": 24138928010880.0, + "grad_norm": 1.6890186170951793, + "language_loss": 0.80063552, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.82552063, + "num_input_tokens_seen": 167927905, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.20825195, + "step": 7818, + "time_per_iteration": 4.355440616607666 + }, + { + "auxiliary_loss_clip": 0.014605, + "auxiliary_loss_mlp": 0.01033348, + "balance_loss_clip": 1.28285146, + "balance_loss_mlp": 1.01301074, + "epoch": 0.47010371261085226, + "flos": 20158440831360.0, + "grad_norm": 1.874728934131144, + "language_loss": 0.84375691, + "learning_rate": 2.289710291512104e-06, + "loss": 0.86869532, + "num_input_tokens_seen": 167945995, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.20336914, + "step": 7819, + "time_per_iteration": 2.9428608417510986 + }, + { + "auxiliary_loss_clip": 0.01466979, + "auxiliary_loss_mlp": 0.01038391, + "balance_loss_clip": 1.28756261, + "balance_loss_mlp": 1.01608729, + "epoch": 0.47016383586352023, + "flos": 15130560337920.0, + "grad_norm": 2.8465941419502623, + "language_loss": 0.77319783, + "learning_rate": 2.289324932042186e-06, + "loss": 0.79825151, + "num_input_tokens_seen": 167963380, + "router_z_loss_clip": 1.79492188, + "router_z_loss_mlp": 0.22302246, + "step": 7820, + "time_per_iteration": 2.8793141841888428 + }, + { + "auxiliary_loss_clip": 0.01456593, + "auxiliary_loss_mlp": 0.01036752, + "balance_loss_clip": 1.28267908, + "balance_loss_mlp": 1.01651084, + "epoch": 0.4702239591161882, + "flos": 13560841342080.0, + "grad_norm": 1.9837884291164707, + "language_loss": 0.74586463, + "learning_rate": 2.288939561601039e-06, + "loss": 0.77079809, + "num_input_tokens_seen": 167981740, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.20227051, + "step": 7821, + "time_per_iteration": 2.900174856185913 + }, + { + "auxiliary_loss_clip": 0.01453181, + "auxiliary_loss_mlp": 0.01035835, + "balance_loss_clip": 1.27872109, + "balance_loss_mlp": 1.01554513, + "epoch": 0.47028408236885616, + "flos": 24286308883200.0, + "grad_norm": 1.8033370683402354, + "language_loss": 0.8917166, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.91660672, + "num_input_tokens_seen": 167999380, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.20288086, + "step": 7822, + "time_per_iteration": 2.898970603942871 + }, + { + "auxiliary_loss_clip": 0.01471444, + "auxiliary_loss_mlp": 0.01035681, + "balance_loss_clip": 1.29434514, + "balance_loss_mlp": 1.01512957, + "epoch": 0.4703442056215241, + "flos": 22867228385280.0, + "grad_norm": 1.5446502409283804, + "language_loss": 0.80584037, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.83091164, + "num_input_tokens_seen": 168018395, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.20532227, + "step": 7823, + "time_per_iteration": 2.842829942703247 + }, + { + "auxiliary_loss_clip": 0.01249799, + "auxiliary_loss_mlp": 0.01043991, + "balance_loss_clip": 1.14823866, + "balance_loss_mlp": 1.02215183, + "epoch": 0.4704043288741921, + "flos": 69274604090880.0, + "grad_norm": 0.7087614436710279, + "language_loss": 0.56746227, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.59040016, + "num_input_tokens_seen": 168084080, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.21875, + "step": 7824, + "time_per_iteration": 3.43638014793396 + }, + { + "auxiliary_loss_clip": 0.01469268, + "auxiliary_loss_mlp": 0.01038668, + "balance_loss_clip": 1.29015899, + "balance_loss_mlp": 1.01626873, + "epoch": 0.47046445212686006, + "flos": 18050216497920.0, + "grad_norm": 1.704437578773318, + "language_loss": 0.81644452, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.84152389, + "num_input_tokens_seen": 168101555, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.22412109, + "step": 7825, + "time_per_iteration": 2.8232266902923584 + }, + { + "auxiliary_loss_clip": 0.01468259, + "auxiliary_loss_mlp": 0.01040784, + "balance_loss_clip": 1.28833401, + "balance_loss_mlp": 1.01859975, + "epoch": 0.470524575379528, + "flos": 23962020266880.0, + "grad_norm": 1.6854491696583402, + "language_loss": 0.67473251, + "learning_rate": 2.287012545338324e-06, + "loss": 0.6998229, + "num_input_tokens_seen": 168121530, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.22180176, + "step": 7826, + "time_per_iteration": 2.9147801399230957 + }, + { + "auxiliary_loss_clip": 0.0146393, + "auxiliary_loss_mlp": 0.01039445, + "balance_loss_clip": 1.28512299, + "balance_loss_mlp": 1.01826119, + "epoch": 0.470584698632196, + "flos": 18122572212480.0, + "grad_norm": 1.7331574855348737, + "language_loss": 0.84073484, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.86576855, + "num_input_tokens_seen": 168140335, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.21191406, + "step": 7827, + "time_per_iteration": 2.8743977546691895 + }, + { + "auxiliary_loss_clip": 0.01253659, + "auxiliary_loss_mlp": 0.01032365, + "balance_loss_clip": 1.15188205, + "balance_loss_mlp": 1.00680661, + "epoch": 0.47064482188486395, + "flos": 57277735655040.0, + "grad_norm": 0.8008181991902088, + "language_loss": 0.55612749, + "learning_rate": 2.286241662546122e-06, + "loss": 0.57898772, + "num_input_tokens_seen": 168200535, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.25585938, + "step": 7828, + "time_per_iteration": 3.284156560897827 + }, + { + "auxiliary_loss_clip": 0.01454124, + "auxiliary_loss_mlp": 0.01035277, + "balance_loss_clip": 1.27779698, + "balance_loss_mlp": 1.01503563, + "epoch": 0.4707049451375319, + "flos": 17904147724800.0, + "grad_norm": 1.8946849734897582, + "language_loss": 0.82127982, + "learning_rate": 2.285856204861245e-06, + "loss": 0.84617382, + "num_input_tokens_seen": 168219610, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.20239258, + "step": 7829, + "time_per_iteration": 2.8616549968719482 + }, + { + "auxiliary_loss_clip": 0.01457314, + "auxiliary_loss_mlp": 0.01037847, + "balance_loss_clip": 1.28147697, + "balance_loss_mlp": 1.01650906, + "epoch": 0.47076506839019994, + "flos": 25244669134080.0, + "grad_norm": 1.4220725499870086, + "language_loss": 0.76318157, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.78813314, + "num_input_tokens_seen": 168242505, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.21337891, + "step": 7830, + "time_per_iteration": 2.9515786170959473 + }, + { + "auxiliary_loss_clip": 0.0146302, + "auxiliary_loss_mlp": 0.01034513, + "balance_loss_clip": 1.28772914, + "balance_loss_mlp": 1.0125314, + "epoch": 0.4708251916428679, + "flos": 13487444997120.0, + "grad_norm": 2.496806263472205, + "language_loss": 0.79466593, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.81964123, + "num_input_tokens_seen": 168260220, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.2199707, + "step": 7831, + "time_per_iteration": 2.8606808185577393 + }, + { + "auxiliary_loss_clip": 0.01494509, + "auxiliary_loss_mlp": 0.01038008, + "balance_loss_clip": 1.30913651, + "balance_loss_mlp": 1.01649046, + "epoch": 0.47088531489553587, + "flos": 30158405700480.0, + "grad_norm": 1.674181010050731, + "language_loss": 0.76316768, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.7884928, + "num_input_tokens_seen": 168277360, + "router_z_loss_clip": 1.85644531, + "router_z_loss_mlp": 0.21533203, + "step": 7832, + "time_per_iteration": 2.920445680618286 + }, + { + "auxiliary_loss_clip": 0.01458191, + "auxiliary_loss_mlp": 0.01033753, + "balance_loss_clip": 1.28283715, + "balance_loss_mlp": 1.01292706, + "epoch": 0.47094543814820383, + "flos": 21808161912960.0, + "grad_norm": 1.4769851532734934, + "language_loss": 0.75408977, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.77900922, + "num_input_tokens_seen": 168296605, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.20837402, + "step": 7833, + "time_per_iteration": 2.887096881866455 + }, + { + "auxiliary_loss_clip": 0.01464233, + "auxiliary_loss_mlp": 0.01040117, + "balance_loss_clip": 1.28715324, + "balance_loss_mlp": 1.01962495, + "epoch": 0.4710055614008718, + "flos": 23013387648000.0, + "grad_norm": 1.6741331104421786, + "language_loss": 0.76524794, + "learning_rate": 2.283928754133762e-06, + "loss": 0.79029143, + "num_input_tokens_seen": 168316205, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.20495605, + "step": 7834, + "time_per_iteration": 2.891139268875122 + }, + { + "auxiliary_loss_clip": 0.01458826, + "auxiliary_loss_mlp": 0.01037648, + "balance_loss_clip": 1.2839762, + "balance_loss_mlp": 1.01680994, + "epoch": 0.47106568465353976, + "flos": 42756724270080.0, + "grad_norm": 1.5334048843906214, + "language_loss": 0.67330229, + "learning_rate": 2.283543231629972e-06, + "loss": 0.69826698, + "num_input_tokens_seen": 168338935, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.20849609, + "step": 7835, + "time_per_iteration": 3.07818341255188 + }, + { + "auxiliary_loss_clip": 0.01255208, + "auxiliary_loss_mlp": 0.01053279, + "balance_loss_clip": 1.14560843, + "balance_loss_mlp": 1.02352452, + "epoch": 0.4711258079062077, + "flos": 68580555834240.0, + "grad_norm": 0.8785818952814886, + "language_loss": 0.6222533, + "learning_rate": 2.283157698374194e-06, + "loss": 0.64533818, + "num_input_tokens_seen": 168392800, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.296875, + "step": 7836, + "time_per_iteration": 3.3395493030548096 + }, + { + "auxiliary_loss_clip": 0.01490665, + "auxiliary_loss_mlp": 0.01037672, + "balance_loss_clip": 1.30507672, + "balance_loss_mlp": 1.01723909, + "epoch": 0.4711859311588757, + "flos": 25457347532160.0, + "grad_norm": 1.9455392564938272, + "language_loss": 0.70281118, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.72809458, + "num_input_tokens_seen": 168412940, + "router_z_loss_clip": 1.85546875, + "router_z_loss_mlp": 0.2043457, + "step": 7837, + "time_per_iteration": 2.92879581451416 + }, + { + "auxiliary_loss_clip": 0.0147429, + "auxiliary_loss_mlp": 0.01044866, + "balance_loss_clip": 1.29559255, + "balance_loss_mlp": 1.0229311, + "epoch": 0.47124605441154366, + "flos": 21992127845760.0, + "grad_norm": 1.8358517601885067, + "language_loss": 0.66894317, + "learning_rate": 2.282386599665153e-06, + "loss": 0.69413471, + "num_input_tokens_seen": 168431995, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.21936035, + "step": 7838, + "time_per_iteration": 2.8693721294403076 + }, + { + "auxiliary_loss_clip": 0.01483333, + "auxiliary_loss_mlp": 0.01037337, + "balance_loss_clip": 1.30039167, + "balance_loss_mlp": 1.01609349, + "epoch": 0.4713061776642116, + "flos": 25424110586880.0, + "grad_norm": 2.307480758739275, + "language_loss": 0.78752005, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.8127268, + "num_input_tokens_seen": 168454585, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.21240234, + "step": 7839, + "time_per_iteration": 3.012094497680664 + }, + { + "auxiliary_loss_clip": 0.01451558, + "auxiliary_loss_mlp": 0.01040555, + "balance_loss_clip": 1.27638173, + "balance_loss_mlp": 1.0194912, + "epoch": 0.4713663009168796, + "flos": 26553587247360.0, + "grad_norm": 1.837168159703692, + "language_loss": 0.73329735, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.75821841, + "num_input_tokens_seen": 168471265, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.21069336, + "step": 7840, + "time_per_iteration": 3.0048975944519043 + }, + { + "auxiliary_loss_clip": 0.01460464, + "auxiliary_loss_mlp": 0.01038364, + "balance_loss_clip": 1.28310275, + "balance_loss_mlp": 1.0179913, + "epoch": 0.47142642416954755, + "flos": 23634202556160.0, + "grad_norm": 1.973271585402389, + "language_loss": 0.76117682, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.78616512, + "num_input_tokens_seen": 168491360, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.20373535, + "step": 7841, + "time_per_iteration": 2.9037413597106934 + }, + { + "auxiliary_loss_clip": 0.01459689, + "auxiliary_loss_mlp": 0.01039675, + "balance_loss_clip": 1.28170085, + "balance_loss_mlp": 1.01869416, + "epoch": 0.4714865474222155, + "flos": 22320443249280.0, + "grad_norm": 1.650317573945922, + "language_loss": 0.70567644, + "learning_rate": 2.280844273866501e-06, + "loss": 0.73067003, + "num_input_tokens_seen": 168511335, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.20996094, + "step": 7842, + "time_per_iteration": 2.8694963455200195 + }, + { + "auxiliary_loss_clip": 0.01458587, + "auxiliary_loss_mlp": 0.01037749, + "balance_loss_clip": 1.28056216, + "balance_loss_mlp": 1.01599348, + "epoch": 0.4715466706748835, + "flos": 17831565786240.0, + "grad_norm": 1.8913768813448193, + "language_loss": 0.79765463, + "learning_rate": 2.280458665756177e-06, + "loss": 0.82261801, + "num_input_tokens_seen": 168529920, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.21728516, + "step": 7843, + "time_per_iteration": 4.255522012710571 + }, + { + "auxiliary_loss_clip": 0.01461756, + "auxiliary_loss_mlp": 0.010415, + "balance_loss_clip": 1.28347254, + "balance_loss_mlp": 1.02066207, + "epoch": 0.4716067939275515, + "flos": 23670018455040.0, + "grad_norm": 1.8639463398720095, + "language_loss": 0.74755329, + "learning_rate": 2.280073047010832e-06, + "loss": 0.77258581, + "num_input_tokens_seen": 168550595, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.20837402, + "step": 7844, + "time_per_iteration": 2.8645012378692627 + }, + { + "auxiliary_loss_clip": 0.01442409, + "auxiliary_loss_mlp": 0.01041263, + "balance_loss_clip": 1.26777339, + "balance_loss_mlp": 1.01982892, + "epoch": 0.47166691718021947, + "flos": 17938877748480.0, + "grad_norm": 1.793064000970921, + "language_loss": 0.7941916, + "learning_rate": 2.279687417645088e-06, + "loss": 0.81902838, + "num_input_tokens_seen": 168569765, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.2142334, + "step": 7845, + "time_per_iteration": 2.842479705810547 + }, + { + "auxiliary_loss_clip": 0.0144826, + "auxiliary_loss_mlp": 0.01035909, + "balance_loss_clip": 1.27224171, + "balance_loss_mlp": 1.01464176, + "epoch": 0.47172704043288743, + "flos": 26625761982720.0, + "grad_norm": 1.401184361210611, + "language_loss": 0.73972797, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.76456958, + "num_input_tokens_seen": 168591525, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.21252441, + "step": 7846, + "time_per_iteration": 2.916719913482666 + }, + { + "auxiliary_loss_clip": 0.01435932, + "auxiliary_loss_mlp": 0.01037947, + "balance_loss_clip": 1.26316524, + "balance_loss_mlp": 1.01663291, + "epoch": 0.4717871636855554, + "flos": 27932825059200.0, + "grad_norm": 1.3922680085700205, + "language_loss": 0.75336832, + "learning_rate": 2.2789161271109e-06, + "loss": 0.77810711, + "num_input_tokens_seen": 168611235, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.21325684, + "step": 7847, + "time_per_iteration": 2.918854236602783 + }, + { + "auxiliary_loss_clip": 0.0144397, + "auxiliary_loss_mlp": 0.01037103, + "balance_loss_clip": 1.26886129, + "balance_loss_mlp": 1.01663494, + "epoch": 0.47184728693822336, + "flos": 14510514591360.0, + "grad_norm": 1.7183308672731796, + "language_loss": 0.81597966, + "learning_rate": 2.278530465971703e-06, + "loss": 0.84079039, + "num_input_tokens_seen": 168628710, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.20471191, + "step": 7848, + "time_per_iteration": 2.8308908939361572 + }, + { + "auxiliary_loss_clip": 0.01465003, + "auxiliary_loss_mlp": 0.01035469, + "balance_loss_clip": 1.28665221, + "balance_loss_mlp": 1.01409507, + "epoch": 0.47190741019089133, + "flos": 17865074200320.0, + "grad_norm": 6.370362858939226, + "language_loss": 0.70873439, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.73373914, + "num_input_tokens_seen": 168645645, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.21386719, + "step": 7849, + "time_per_iteration": 2.848738193511963 + }, + { + "auxiliary_loss_clip": 0.01462421, + "auxiliary_loss_mlp": 0.0103937, + "balance_loss_clip": 1.28122306, + "balance_loss_mlp": 1.01790071, + "epoch": 0.4719675334435593, + "flos": 17904645417600.0, + "grad_norm": 2.2195777818397717, + "language_loss": 0.70986992, + "learning_rate": 2.277759112022224e-06, + "loss": 0.73488784, + "num_input_tokens_seen": 168664165, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.21459961, + "step": 7850, + "time_per_iteration": 2.9214279651641846 + }, + { + "auxiliary_loss_clip": 0.0145751, + "auxiliary_loss_mlp": 0.01036851, + "balance_loss_clip": 1.27901661, + "balance_loss_mlp": 1.0154295, + "epoch": 0.47202765669622726, + "flos": 20714139192960.0, + "grad_norm": 2.474970264276976, + "language_loss": 0.75611514, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.78105879, + "num_input_tokens_seen": 168681940, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.21435547, + "step": 7851, + "time_per_iteration": 2.8645052909851074 + }, + { + "auxiliary_loss_clip": 0.01449633, + "auxiliary_loss_mlp": 0.01033693, + "balance_loss_clip": 1.27105772, + "balance_loss_mlp": 1.01204479, + "epoch": 0.4720877799488952, + "flos": 16368299101440.0, + "grad_norm": 1.8767499447808493, + "language_loss": 0.77674288, + "learning_rate": 2.276987715942132e-06, + "loss": 0.80157614, + "num_input_tokens_seen": 168698830, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.21655273, + "step": 7852, + "time_per_iteration": 4.342456102371216 + }, + { + "auxiliary_loss_clip": 0.01441278, + "auxiliary_loss_mlp": 0.01041119, + "balance_loss_clip": 1.26557589, + "balance_loss_mlp": 1.01631141, + "epoch": 0.4721479032015632, + "flos": 20678006580480.0, + "grad_norm": 1.5188824052618932, + "language_loss": 0.69626069, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.72108471, + "num_input_tokens_seen": 168718305, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.24816895, + "step": 7853, + "time_per_iteration": 4.253943681716919 + }, + { + "auxiliary_loss_clip": 0.01250261, + "auxiliary_loss_mlp": 0.01026376, + "balance_loss_clip": 1.14952064, + "balance_loss_mlp": 0.99757463, + "epoch": 0.47220802645423116, + "flos": 67786018300800.0, + "grad_norm": 0.7067996931114588, + "language_loss": 0.50181437, + "learning_rate": 2.276216277848432e-06, + "loss": 0.52458072, + "num_input_tokens_seen": 168782365, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.28710938, + "step": 7854, + "time_per_iteration": 3.4705193042755127 + }, + { + "auxiliary_loss_clip": 0.01461464, + "auxiliary_loss_mlp": 0.01034324, + "balance_loss_clip": 1.28264916, + "balance_loss_mlp": 1.01225877, + "epoch": 0.4722681497068991, + "flos": 20930799133440.0, + "grad_norm": 3.1038921284818803, + "language_loss": 0.64488685, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.66984475, + "num_input_tokens_seen": 168800485, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.2208252, + "step": 7855, + "time_per_iteration": 2.8398942947387695 + }, + { + "auxiliary_loss_clip": 0.01438504, + "auxiliary_loss_mlp": 0.01037369, + "balance_loss_clip": 1.26277947, + "balance_loss_mlp": 1.01600635, + "epoch": 0.4723282729595671, + "flos": 28304738467200.0, + "grad_norm": 1.859403075528418, + "language_loss": 0.76770616, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.79246485, + "num_input_tokens_seen": 168818965, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.21362305, + "step": 7856, + "time_per_iteration": 2.909841537475586 + }, + { + "auxiliary_loss_clip": 0.01450925, + "auxiliary_loss_mlp": 0.01036533, + "balance_loss_clip": 1.27640462, + "balance_loss_mlp": 1.01476526, + "epoch": 0.4723883962122351, + "flos": 27136278771840.0, + "grad_norm": 2.0442679916783657, + "language_loss": 0.75421679, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.77909136, + "num_input_tokens_seen": 168840355, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.21765137, + "step": 7857, + "time_per_iteration": 2.897757053375244 + }, + { + "auxiliary_loss_clip": 0.0143945, + "auxiliary_loss_mlp": 0.01036776, + "balance_loss_clip": 1.26613188, + "balance_loss_mlp": 1.0159502, + "epoch": 0.47244851946490307, + "flos": 31548954712320.0, + "grad_norm": 1.4972727236661267, + "language_loss": 0.65591621, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.68067849, + "num_input_tokens_seen": 168861765, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.20837402, + "step": 7858, + "time_per_iteration": 2.978564977645874 + }, + { + "auxiliary_loss_clip": 0.01429052, + "auxiliary_loss_mlp": 0.01038788, + "balance_loss_clip": 1.25767684, + "balance_loss_mlp": 1.01753247, + "epoch": 0.47250864271757104, + "flos": 20896295333760.0, + "grad_norm": 1.905955344899447, + "language_loss": 0.71314061, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.73781902, + "num_input_tokens_seen": 168881310, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.21252441, + "step": 7859, + "time_per_iteration": 2.855790138244629 + }, + { + "auxiliary_loss_clip": 0.01458988, + "auxiliary_loss_mlp": 0.01037465, + "balance_loss_clip": 1.27835131, + "balance_loss_mlp": 1.01610231, + "epoch": 0.472568765970239, + "flos": 20531847317760.0, + "grad_norm": 2.0228191281621912, + "language_loss": 0.62672973, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.65169418, + "num_input_tokens_seen": 168899470, + "router_z_loss_clip": 1.80566406, + "router_z_loss_mlp": 0.21350098, + "step": 7860, + "time_per_iteration": 2.855999231338501 + }, + { + "auxiliary_loss_clip": 0.01439564, + "auxiliary_loss_mlp": 0.01040181, + "balance_loss_clip": 1.26314974, + "balance_loss_mlp": 1.01881838, + "epoch": 0.47262888922290697, + "flos": 35817778874880.0, + "grad_norm": 5.595022856215944, + "language_loss": 0.72678924, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.75158674, + "num_input_tokens_seen": 168921495, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.21362305, + "step": 7861, + "time_per_iteration": 2.961784839630127 + }, + { + "auxiliary_loss_clip": 0.01445223, + "auxiliary_loss_mlp": 0.01036341, + "balance_loss_clip": 1.26948655, + "balance_loss_mlp": 1.01391768, + "epoch": 0.47268901247557493, + "flos": 20677463642880.0, + "grad_norm": 1.916917282784984, + "language_loss": 0.86495149, + "learning_rate": 2.273130107677896e-06, + "loss": 0.88976711, + "num_input_tokens_seen": 168940515, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.22424316, + "step": 7862, + "time_per_iteration": 2.8530490398406982 + }, + { + "auxiliary_loss_clip": 0.01449058, + "auxiliary_loss_mlp": 0.01040726, + "balance_loss_clip": 1.27061749, + "balance_loss_mlp": 1.01988828, + "epoch": 0.4727491357282429, + "flos": 19582762250880.0, + "grad_norm": 1.7889064881853776, + "language_loss": 0.85212266, + "learning_rate": 2.272744289645927e-06, + "loss": 0.87702048, + "num_input_tokens_seen": 168958340, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.20837402, + "step": 7863, + "time_per_iteration": 2.837104558944702 + }, + { + "auxiliary_loss_clip": 0.01441237, + "auxiliary_loss_mlp": 0.01038207, + "balance_loss_clip": 1.26618516, + "balance_loss_mlp": 1.01753569, + "epoch": 0.47280925898091086, + "flos": 18224771512320.0, + "grad_norm": 1.8548898131851954, + "language_loss": 0.67245579, + "learning_rate": 2.272358461271467e-06, + "loss": 0.69725025, + "num_input_tokens_seen": 168974850, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.20666504, + "step": 7864, + "time_per_iteration": 2.834808588027954 + }, + { + "auxiliary_loss_clip": 0.01439444, + "auxiliary_loss_mlp": 0.01037238, + "balance_loss_clip": 1.26503253, + "balance_loss_mlp": 1.01616144, + "epoch": 0.4728693822335788, + "flos": 17830796624640.0, + "grad_norm": 3.537489847010876, + "language_loss": 0.66144288, + "learning_rate": 2.271972622569147e-06, + "loss": 0.68620968, + "num_input_tokens_seen": 168992860, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.21069336, + "step": 7865, + "time_per_iteration": 2.8784751892089844 + }, + { + "auxiliary_loss_clip": 0.01437637, + "auxiliary_loss_mlp": 0.01041013, + "balance_loss_clip": 1.26583445, + "balance_loss_mlp": 1.01920998, + "epoch": 0.4729295054862468, + "flos": 20604836459520.0, + "grad_norm": 1.752535118665879, + "language_loss": 0.74988967, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.77467614, + "num_input_tokens_seen": 169010325, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.21801758, + "step": 7866, + "time_per_iteration": 2.846238374710083 + }, + { + "auxiliary_loss_clip": 0.01452682, + "auxiliary_loss_mlp": 0.01034737, + "balance_loss_clip": 1.27481747, + "balance_loss_mlp": 1.01346982, + "epoch": 0.47298962873891476, + "flos": 23378604825600.0, + "grad_norm": 2.7315376622189955, + "language_loss": 0.83674914, + "learning_rate": 2.271200914239451e-06, + "loss": 0.86162329, + "num_input_tokens_seen": 169029840, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.21264648, + "step": 7867, + "time_per_iteration": 2.874845266342163 + }, + { + "auxiliary_loss_clip": 0.01435391, + "auxiliary_loss_mlp": 0.01034066, + "balance_loss_clip": 1.26254082, + "balance_loss_mlp": 1.01248884, + "epoch": 0.4730497519915827, + "flos": 22061814117120.0, + "grad_norm": 1.643375970538988, + "language_loss": 0.80238312, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.82707769, + "num_input_tokens_seen": 169049975, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.21582031, + "step": 7868, + "time_per_iteration": 2.8520069122314453 + }, + { + "auxiliary_loss_clip": 0.01450689, + "auxiliary_loss_mlp": 0.01042197, + "balance_loss_clip": 1.27029562, + "balance_loss_mlp": 1.02016735, + "epoch": 0.4731098752442507, + "flos": 21079718328960.0, + "grad_norm": 1.795968225482409, + "language_loss": 0.75799346, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.78292227, + "num_input_tokens_seen": 169069540, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.22045898, + "step": 7869, + "time_per_iteration": 2.8515944480895996 + }, + { + "auxiliary_loss_clip": 0.01453628, + "auxiliary_loss_mlp": 0.01043977, + "balance_loss_clip": 1.27673209, + "balance_loss_mlp": 1.02133918, + "epoch": 0.4731699984969187, + "flos": 22539048716160.0, + "grad_norm": 1.7660068388069483, + "language_loss": 0.74567491, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.77065092, + "num_input_tokens_seen": 169089940, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.22619629, + "step": 7870, + "time_per_iteration": 2.8965253829956055 + }, + { + "auxiliary_loss_clip": 0.01486543, + "auxiliary_loss_mlp": 0.01047802, + "balance_loss_clip": 1.3041575, + "balance_loss_mlp": 1.02612948, + "epoch": 0.4732301217495867, + "flos": 24908435890560.0, + "grad_norm": 2.031840180327355, + "language_loss": 0.82517678, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.85052025, + "num_input_tokens_seen": 169109650, + "router_z_loss_clip": 1.82226562, + "router_z_loss_mlp": 0.21679688, + "step": 7871, + "time_per_iteration": 2.8965330123901367 + }, + { + "auxiliary_loss_clip": 0.01453391, + "auxiliary_loss_mlp": 0.01041624, + "balance_loss_clip": 1.27744746, + "balance_loss_mlp": 1.02011847, + "epoch": 0.47329024500225464, + "flos": 22794827425920.0, + "grad_norm": 1.5388626835197496, + "language_loss": 0.76893228, + "learning_rate": 2.269271463701879e-06, + "loss": 0.79388237, + "num_input_tokens_seen": 169128990, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.21520996, + "step": 7872, + "time_per_iteration": 2.876718521118164 + }, + { + "auxiliary_loss_clip": 0.01464899, + "auxiliary_loss_mlp": 0.0104313, + "balance_loss_clip": 1.28724647, + "balance_loss_mlp": 1.0210638, + "epoch": 0.4733503682549226, + "flos": 38711392012800.0, + "grad_norm": 1.9210858742729469, + "language_loss": 0.68885744, + "learning_rate": 2.268885542903428e-06, + "loss": 0.71393776, + "num_input_tokens_seen": 169154645, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.22070312, + "step": 7873, + "time_per_iteration": 3.011988401412964 + }, + { + "auxiliary_loss_clip": 0.01438428, + "auxiliary_loss_mlp": 0.01037997, + "balance_loss_clip": 1.26427913, + "balance_loss_mlp": 1.01669395, + "epoch": 0.47341049150759057, + "flos": 22977300280320.0, + "grad_norm": 2.008489158926519, + "language_loss": 0.73071843, + "learning_rate": 2.26849961190881e-06, + "loss": 0.75548267, + "num_input_tokens_seen": 169174995, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.2130127, + "step": 7874, + "time_per_iteration": 2.8827805519104004 + }, + { + "auxiliary_loss_clip": 0.01454186, + "auxiliary_loss_mlp": 0.01040357, + "balance_loss_clip": 1.27540898, + "balance_loss_mlp": 1.01837516, + "epoch": 0.47347061476025853, + "flos": 14546692448640.0, + "grad_norm": 2.187710296549013, + "language_loss": 0.65941012, + "learning_rate": 2.26811367073266e-06, + "loss": 0.68435556, + "num_input_tokens_seen": 169191815, + "router_z_loss_clip": 1.78710938, + "router_z_loss_mlp": 0.21972656, + "step": 7875, + "time_per_iteration": 2.8408305644989014 + }, + { + "auxiliary_loss_clip": 0.01461833, + "auxiliary_loss_mlp": 0.0104557, + "balance_loss_clip": 1.28527653, + "balance_loss_mlp": 1.02444649, + "epoch": 0.4735307380129265, + "flos": 30275400049920.0, + "grad_norm": 2.2797913752014085, + "language_loss": 0.81907171, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.84414572, + "num_input_tokens_seen": 169210430, + "router_z_loss_clip": 1.76757812, + "router_z_loss_mlp": 0.21118164, + "step": 7876, + "time_per_iteration": 2.9677083492279053 + }, + { + "auxiliary_loss_clip": 0.0144617, + "auxiliary_loss_mlp": 0.01042673, + "balance_loss_clip": 1.2691648, + "balance_loss_mlp": 1.02020216, + "epoch": 0.47359086126559446, + "flos": 19400515620480.0, + "grad_norm": 1.8039945578345826, + "language_loss": 0.7937628, + "learning_rate": 2.267341757894304e-06, + "loss": 0.81865126, + "num_input_tokens_seen": 169229295, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.22473145, + "step": 7877, + "time_per_iteration": 2.8970017433166504 + }, + { + "auxiliary_loss_clip": 0.01428498, + "auxiliary_loss_mlp": 0.01040852, + "balance_loss_clip": 1.25511026, + "balance_loss_mlp": 1.01936996, + "epoch": 0.47365098451826243, + "flos": 21948077393280.0, + "grad_norm": 1.8742675027664282, + "language_loss": 0.72131073, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.74600422, + "num_input_tokens_seen": 169247855, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.21459961, + "step": 7878, + "time_per_iteration": 4.311206102371216 + }, + { + "auxiliary_loss_clip": 0.01443194, + "auxiliary_loss_mlp": 0.010381, + "balance_loss_clip": 1.27007389, + "balance_loss_mlp": 1.01602221, + "epoch": 0.4737111077709304, + "flos": 25855168227840.0, + "grad_norm": 1.6640024102260982, + "language_loss": 0.75815666, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.78296959, + "num_input_tokens_seen": 169268860, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.22058105, + "step": 7879, + "time_per_iteration": 2.892313003540039 + }, + { + "auxiliary_loss_clip": 0.01240752, + "auxiliary_loss_mlp": 0.01020961, + "balance_loss_clip": 1.14393878, + "balance_loss_mlp": 1.00007522, + "epoch": 0.47377123102359836, + "flos": 67789710391680.0, + "grad_norm": 0.7446899127067215, + "language_loss": 0.61290729, + "learning_rate": 2.266183812641164e-06, + "loss": 0.63552445, + "num_input_tokens_seen": 169331855, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.20898438, + "step": 7880, + "time_per_iteration": 3.355663776397705 + }, + { + "auxiliary_loss_clip": 0.01431736, + "auxiliary_loss_mlp": 0.0103941, + "balance_loss_clip": 1.25914729, + "balance_loss_mlp": 1.01722515, + "epoch": 0.4738313542762663, + "flos": 24326558772480.0, + "grad_norm": 1.7297080364130981, + "language_loss": 0.69108129, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.71579278, + "num_input_tokens_seen": 169352175, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.22192383, + "step": 7881, + "time_per_iteration": 2.8771960735321045 + }, + { + "auxiliary_loss_clip": 0.01428056, + "auxiliary_loss_mlp": 0.01037055, + "balance_loss_clip": 1.25631428, + "balance_loss_mlp": 1.01589537, + "epoch": 0.4738914775289343, + "flos": 20715360802560.0, + "grad_norm": 2.262164047963832, + "language_loss": 0.78132993, + "learning_rate": 2.265411798646092e-06, + "loss": 0.80598098, + "num_input_tokens_seen": 169371215, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.21130371, + "step": 7882, + "time_per_iteration": 2.875046968460083 + }, + { + "auxiliary_loss_clip": 0.01445618, + "auxiliary_loss_mlp": 0.01037705, + "balance_loss_clip": 1.26925242, + "balance_loss_mlp": 1.01612782, + "epoch": 0.4739516007816023, + "flos": 25457030818560.0, + "grad_norm": 1.5454473865535945, + "language_loss": 0.77410281, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.79893601, + "num_input_tokens_seen": 169391745, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.21582031, + "step": 7883, + "time_per_iteration": 2.9306907653808594 + }, + { + "auxiliary_loss_clip": 0.01441907, + "auxiliary_loss_mlp": 0.01034406, + "balance_loss_clip": 1.26768684, + "balance_loss_mlp": 1.0135442, + "epoch": 0.4740117240342703, + "flos": 19983840572160.0, + "grad_norm": 1.7032897891962955, + "language_loss": 0.72521055, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.74997365, + "num_input_tokens_seen": 169409845, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.20837402, + "step": 7884, + "time_per_iteration": 2.8358917236328125 + }, + { + "auxiliary_loss_clip": 0.01463034, + "auxiliary_loss_mlp": 0.01037616, + "balance_loss_clip": 1.28226089, + "balance_loss_mlp": 1.01565742, + "epoch": 0.47407184728693824, + "flos": 15667482107520.0, + "grad_norm": 2.099785369363229, + "language_loss": 0.82768476, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.85269129, + "num_input_tokens_seen": 169426085, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.21960449, + "step": 7885, + "time_per_iteration": 2.8035881519317627 + }, + { + "auxiliary_loss_clip": 0.01463686, + "auxiliary_loss_mlp": 0.01039093, + "balance_loss_clip": 1.28825617, + "balance_loss_mlp": 1.01820803, + "epoch": 0.4741319705396062, + "flos": 18598177998720.0, + "grad_norm": 2.0142737391927517, + "language_loss": 0.74412048, + "learning_rate": 2.263867649999751e-06, + "loss": 0.76914823, + "num_input_tokens_seen": 169444705, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.20898438, + "step": 7886, + "time_per_iteration": 2.8094804286956787 + }, + { + "auxiliary_loss_clip": 0.01468862, + "auxiliary_loss_mlp": 0.01038526, + "balance_loss_clip": 1.2857281, + "balance_loss_mlp": 1.01524425, + "epoch": 0.47419209379227417, + "flos": 13268884775040.0, + "grad_norm": 1.796130149320026, + "language_loss": 0.74930668, + "learning_rate": 2.263481587786849e-06, + "loss": 0.77438056, + "num_input_tokens_seen": 169460850, + "router_z_loss_clip": 1.83300781, + "router_z_loss_mlp": 0.23266602, + "step": 7887, + "time_per_iteration": 5.634202003479004 + }, + { + "auxiliary_loss_clip": 0.01441568, + "auxiliary_loss_mlp": 0.01037475, + "balance_loss_clip": 1.2678895, + "balance_loss_mlp": 1.01627946, + "epoch": 0.47425221704494214, + "flos": 20052712437120.0, + "grad_norm": 1.7511428226217989, + "language_loss": 0.77547526, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.80026567, + "num_input_tokens_seen": 169478890, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.21203613, + "step": 7888, + "time_per_iteration": 4.364242792129517 + }, + { + "auxiliary_loss_clip": 0.01446215, + "auxiliary_loss_mlp": 0.01036469, + "balance_loss_clip": 1.27005291, + "balance_loss_mlp": 1.01514304, + "epoch": 0.4743123402976101, + "flos": 27283388175360.0, + "grad_norm": 1.6691236659237736, + "language_loss": 0.73672771, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.76155454, + "num_input_tokens_seen": 169499690, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.21325684, + "step": 7889, + "time_per_iteration": 2.898346424102783 + }, + { + "auxiliary_loss_clip": 0.01230387, + "auxiliary_loss_mlp": 0.01021554, + "balance_loss_clip": 1.1312865, + "balance_loss_mlp": 0.99961996, + "epoch": 0.47437246355027807, + "flos": 55420041634560.0, + "grad_norm": 0.7246164590249049, + "language_loss": 0.56165278, + "learning_rate": 2.262323341259214e-06, + "loss": 0.58417213, + "num_input_tokens_seen": 169560475, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.21972656, + "step": 7890, + "time_per_iteration": 3.407794713973999 + }, + { + "auxiliary_loss_clip": 0.01457518, + "auxiliary_loss_mlp": 0.01036735, + "balance_loss_clip": 1.27838063, + "balance_loss_mlp": 1.01503849, + "epoch": 0.47443258680294603, + "flos": 23889031125120.0, + "grad_norm": 1.9573746809417971, + "language_loss": 0.66129559, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.68623817, + "num_input_tokens_seen": 169580110, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.21691895, + "step": 7891, + "time_per_iteration": 2.88854718208313 + }, + { + "auxiliary_loss_clip": 0.01468378, + "auxiliary_loss_mlp": 0.0104058, + "balance_loss_clip": 1.2857573, + "balance_loss_mlp": 1.01909852, + "epoch": 0.474492710055614, + "flos": 21986924693760.0, + "grad_norm": 2.456255908336512, + "language_loss": 0.71390045, + "learning_rate": 2.26155112714642e-06, + "loss": 0.73899007, + "num_input_tokens_seen": 169597510, + "router_z_loss_clip": 1.82519531, + "router_z_loss_mlp": 0.21484375, + "step": 7892, + "time_per_iteration": 2.926445484161377 + }, + { + "auxiliary_loss_clip": 0.01231542, + "auxiliary_loss_mlp": 0.01019317, + "balance_loss_clip": 1.13426185, + "balance_loss_mlp": 0.99900389, + "epoch": 0.47455283330828196, + "flos": 62588225024640.0, + "grad_norm": 0.8102391583503529, + "language_loss": 0.58629745, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.60880601, + "num_input_tokens_seen": 169660010, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.203125, + "step": 7893, + "time_per_iteration": 3.3877816200256348 + }, + { + "auxiliary_loss_clip": 0.01452198, + "auxiliary_loss_mlp": 0.01034466, + "balance_loss_clip": 1.27670634, + "balance_loss_mlp": 1.01404572, + "epoch": 0.47461295656094993, + "flos": 12101148996480.0, + "grad_norm": 2.0851220742836096, + "language_loss": 0.78656888, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.81143558, + "num_input_tokens_seen": 169678485, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.20422363, + "step": 7894, + "time_per_iteration": 2.8438351154327393 + }, + { + "auxiliary_loss_clip": 0.01453912, + "auxiliary_loss_mlp": 0.01036181, + "balance_loss_clip": 1.27813804, + "balance_loss_mlp": 1.01529539, + "epoch": 0.4746730798136179, + "flos": 20893806869760.0, + "grad_norm": 2.224788415487396, + "language_loss": 0.75619018, + "learning_rate": 2.260392731628497e-06, + "loss": 0.78109109, + "num_input_tokens_seen": 169697335, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.20874023, + "step": 7895, + "time_per_iteration": 2.8740763664245605 + }, + { + "auxiliary_loss_clip": 0.01458239, + "auxiliary_loss_mlp": 0.01038305, + "balance_loss_clip": 1.28126752, + "balance_loss_mlp": 1.01762259, + "epoch": 0.4747332030662859, + "flos": 19984428754560.0, + "grad_norm": 1.9145926299658675, + "language_loss": 0.83680999, + "learning_rate": 2.260006580021429e-06, + "loss": 0.8617754, + "num_input_tokens_seen": 169715395, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.20690918, + "step": 7896, + "time_per_iteration": 2.8380517959594727 + }, + { + "auxiliary_loss_clip": 0.01466762, + "auxiliary_loss_mlp": 0.01037178, + "balance_loss_clip": 1.29035354, + "balance_loss_mlp": 1.01648378, + "epoch": 0.4747933263189539, + "flos": 16042743630720.0, + "grad_norm": 1.8297255476216812, + "language_loss": 0.76886177, + "learning_rate": 2.259620418554886e-06, + "loss": 0.79390121, + "num_input_tokens_seen": 169733755, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.20678711, + "step": 7897, + "time_per_iteration": 2.825946092605591 + }, + { + "auxiliary_loss_clip": 0.01472292, + "auxiliary_loss_mlp": 0.01038745, + "balance_loss_clip": 1.29081774, + "balance_loss_mlp": 1.01726341, + "epoch": 0.47485344957162184, + "flos": 13963277007360.0, + "grad_norm": 1.875892332269221, + "language_loss": 0.65271294, + "learning_rate": 2.25923424724351e-06, + "loss": 0.67782331, + "num_input_tokens_seen": 169751390, + "router_z_loss_clip": 1.81347656, + "router_z_loss_mlp": 0.21496582, + "step": 7898, + "time_per_iteration": 2.8085544109344482 + }, + { + "auxiliary_loss_clip": 0.01450629, + "auxiliary_loss_mlp": 0.01042907, + "balance_loss_clip": 1.27292216, + "balance_loss_mlp": 1.02156878, + "epoch": 0.4749135728242898, + "flos": 20458812931200.0, + "grad_norm": 2.1585640874822087, + "language_loss": 0.71862209, + "learning_rate": 2.258848066101946e-06, + "loss": 0.74355745, + "num_input_tokens_seen": 169769500, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.21337891, + "step": 7899, + "time_per_iteration": 2.9206736087799072 + }, + { + "auxiliary_loss_clip": 0.0145496, + "auxiliary_loss_mlp": 0.01042325, + "balance_loss_clip": 1.27894425, + "balance_loss_mlp": 1.02126098, + "epoch": 0.4749736960769578, + "flos": 28961821722240.0, + "grad_norm": 1.8069131329959525, + "language_loss": 0.69728923, + "learning_rate": 2.258461875144837e-06, + "loss": 0.72226208, + "num_input_tokens_seen": 169789215, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.21069336, + "step": 7900, + "time_per_iteration": 3.010093927383423 + }, + { + "auxiliary_loss_clip": 0.01456563, + "auxiliary_loss_mlp": 0.01039232, + "balance_loss_clip": 1.28090489, + "balance_loss_mlp": 1.01812029, + "epoch": 0.47503381932962574, + "flos": 31950621216000.0, + "grad_norm": 2.0233452839536494, + "language_loss": 0.71084154, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.73579955, + "num_input_tokens_seen": 169808825, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.21118164, + "step": 7901, + "time_per_iteration": 2.923919439315796 + }, + { + "auxiliary_loss_clip": 0.01443869, + "auxiliary_loss_mlp": 0.01044421, + "balance_loss_clip": 1.26960754, + "balance_loss_mlp": 1.02358305, + "epoch": 0.4750939425822937, + "flos": 22137382212480.0, + "grad_norm": 1.6989644654061156, + "language_loss": 0.74870539, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.77358824, + "num_input_tokens_seen": 169827590, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.20861816, + "step": 7902, + "time_per_iteration": 2.839940309524536 + }, + { + "auxiliary_loss_clip": 0.0144367, + "auxiliary_loss_mlp": 0.01040427, + "balance_loss_clip": 1.27042079, + "balance_loss_mlp": 1.02006602, + "epoch": 0.47515406583496167, + "flos": 20859846007680.0, + "grad_norm": 1.876129177612612, + "language_loss": 0.69224894, + "learning_rate": 2.257303243526688e-06, + "loss": 0.71708989, + "num_input_tokens_seen": 169844925, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.20349121, + "step": 7903, + "time_per_iteration": 2.845796585083008 + }, + { + "auxiliary_loss_clip": 0.01438191, + "auxiliary_loss_mlp": 0.01038333, + "balance_loss_clip": 1.26548529, + "balance_loss_mlp": 1.01904511, + "epoch": 0.47521418908762963, + "flos": 17532370051200.0, + "grad_norm": 1.970702509167785, + "language_loss": 0.72999454, + "learning_rate": 2.256917013453848e-06, + "loss": 0.75475979, + "num_input_tokens_seen": 169862705, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.19274902, + "step": 7904, + "time_per_iteration": 2.810915946960449 + }, + { + "auxiliary_loss_clip": 0.01449906, + "auxiliary_loss_mlp": 0.01038193, + "balance_loss_clip": 1.27752423, + "balance_loss_mlp": 1.01840472, + "epoch": 0.4752743123402976, + "flos": 20569427763840.0, + "grad_norm": 1.5635333258125323, + "language_loss": 0.86566007, + "learning_rate": 2.25653077363869e-06, + "loss": 0.89054108, + "num_input_tokens_seen": 169880155, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.19787598, + "step": 7905, + "time_per_iteration": 2.8209826946258545 + }, + { + "auxiliary_loss_clip": 0.01426798, + "auxiliary_loss_mlp": 0.01038568, + "balance_loss_clip": 1.25825298, + "balance_loss_mlp": 1.017802, + "epoch": 0.47533443559296557, + "flos": 26372109778560.0, + "grad_norm": 1.5686102149324286, + "language_loss": 0.83001417, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.85466784, + "num_input_tokens_seen": 169901525, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.20776367, + "step": 7906, + "time_per_iteration": 2.9096903800964355 + }, + { + "auxiliary_loss_clip": 0.01230183, + "auxiliary_loss_mlp": 0.01030937, + "balance_loss_clip": 1.13349199, + "balance_loss_mlp": 1.00776279, + "epoch": 0.47539455884563353, + "flos": 65981405710080.0, + "grad_norm": 0.6690045123722127, + "language_loss": 0.59081483, + "learning_rate": 2.255758264840002e-06, + "loss": 0.61342603, + "num_input_tokens_seen": 169970345, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.23144531, + "step": 7907, + "time_per_iteration": 3.474402666091919 + }, + { + "auxiliary_loss_clip": 0.01438576, + "auxiliary_loss_mlp": 0.01044248, + "balance_loss_clip": 1.26537943, + "balance_loss_mlp": 1.02310061, + "epoch": 0.4754546820983015, + "flos": 17246973980160.0, + "grad_norm": 1.9371858244488456, + "language_loss": 0.81604058, + "learning_rate": 2.255371995885765e-06, + "loss": 0.84086883, + "num_input_tokens_seen": 169986440, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.21130371, + "step": 7908, + "time_per_iteration": 2.837373733520508 + }, + { + "auxiliary_loss_clip": 0.01468486, + "auxiliary_loss_mlp": 0.01044411, + "balance_loss_clip": 1.29094028, + "balance_loss_mlp": 1.02331138, + "epoch": 0.47551480535096946, + "flos": 19834695152640.0, + "grad_norm": 1.751726343034224, + "language_loss": 0.75060117, + "learning_rate": 2.254985717247797e-06, + "loss": 0.77573013, + "num_input_tokens_seen": 170005705, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.21105957, + "step": 7909, + "time_per_iteration": 2.8907501697540283 + }, + { + "auxiliary_loss_clip": 0.01450541, + "auxiliary_loss_mlp": 0.01039799, + "balance_loss_clip": 1.27460253, + "balance_loss_mlp": 1.01952159, + "epoch": 0.4755749286036375, + "flos": 22174148252160.0, + "grad_norm": 1.9918412478014957, + "language_loss": 0.7582044, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.78310776, + "num_input_tokens_seen": 170023415, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.20275879, + "step": 7910, + "time_per_iteration": 2.9048752784729004 + }, + { + "auxiliary_loss_clip": 0.01443253, + "auxiliary_loss_mlp": 0.01038448, + "balance_loss_clip": 1.2695545, + "balance_loss_mlp": 1.01864696, + "epoch": 0.47563505185630545, + "flos": 21657070967040.0, + "grad_norm": 3.6949576837529277, + "language_loss": 0.79551566, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.82033265, + "num_input_tokens_seen": 170042395, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.19799805, + "step": 7911, + "time_per_iteration": 2.9060709476470947 + }, + { + "auxiliary_loss_clip": 0.01472042, + "auxiliary_loss_mlp": 0.01046612, + "balance_loss_clip": 1.2922883, + "balance_loss_mlp": 1.02579832, + "epoch": 0.4756951751089734, + "flos": 20637847180800.0, + "grad_norm": 1.5968336654195847, + "language_loss": 0.76442933, + "learning_rate": 2.253826823377983e-06, + "loss": 0.78961587, + "num_input_tokens_seen": 170061610, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.20812988, + "step": 7912, + "time_per_iteration": 2.8346428871154785 + }, + { + "auxiliary_loss_clip": 0.01455963, + "auxiliary_loss_mlp": 0.01044631, + "balance_loss_clip": 1.27979958, + "balance_loss_mlp": 1.02407932, + "epoch": 0.4757552983616414, + "flos": 25859873687040.0, + "grad_norm": 1.4753427927154625, + "language_loss": 0.75012153, + "learning_rate": 2.253440506151569e-06, + "loss": 0.77512747, + "num_input_tokens_seen": 170083505, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.20556641, + "step": 7913, + "time_per_iteration": 4.36723256111145 + }, + { + "auxiliary_loss_clip": 0.01452514, + "auxiliary_loss_mlp": 0.01040511, + "balance_loss_clip": 1.27838588, + "balance_loss_mlp": 1.02061498, + "epoch": 0.47581542161430934, + "flos": 18232055925120.0, + "grad_norm": 2.3119915412960705, + "language_loss": 0.73447669, + "learning_rate": 2.253054179314666e-06, + "loss": 0.75940692, + "num_input_tokens_seen": 170100690, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.19909668, + "step": 7914, + "time_per_iteration": 2.8622288703918457 + }, + { + "auxiliary_loss_clip": 0.01453504, + "auxiliary_loss_mlp": 0.01044605, + "balance_loss_clip": 1.27739096, + "balance_loss_mlp": 1.02340949, + "epoch": 0.4758755448669773, + "flos": 21589692180480.0, + "grad_norm": 2.169745988239635, + "language_loss": 0.65088075, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.67586184, + "num_input_tokens_seen": 170119240, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.21191406, + "step": 7915, + "time_per_iteration": 2.850184202194214 + }, + { + "auxiliary_loss_clip": 0.01430904, + "auxiliary_loss_mlp": 0.01038051, + "balance_loss_clip": 1.2609638, + "balance_loss_mlp": 1.01778531, + "epoch": 0.47593566811964527, + "flos": 15239410623360.0, + "grad_norm": 1.9366611695726872, + "language_loss": 0.77618688, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.80087644, + "num_input_tokens_seen": 170136450, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.20263672, + "step": 7916, + "time_per_iteration": 2.835258960723877 + }, + { + "auxiliary_loss_clip": 0.01433478, + "auxiliary_loss_mlp": 0.01038714, + "balance_loss_clip": 1.26300013, + "balance_loss_mlp": 1.01854348, + "epoch": 0.47599579137231324, + "flos": 21553152364800.0, + "grad_norm": 1.7387117118691906, + "language_loss": 0.64461434, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.6693362, + "num_input_tokens_seen": 170155295, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.20178223, + "step": 7917, + "time_per_iteration": 2.837278366088867 + }, + { + "auxiliary_loss_clip": 0.01226785, + "auxiliary_loss_mlp": 0.01038668, + "balance_loss_clip": 1.13236141, + "balance_loss_mlp": 1.01349092, + "epoch": 0.4760559146249812, + "flos": 64583753264640.0, + "grad_norm": 0.8420057263394259, + "language_loss": 0.65787745, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.68053198, + "num_input_tokens_seen": 170222325, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.25195312, + "step": 7918, + "time_per_iteration": 3.3946964740753174 + }, + { + "auxiliary_loss_clip": 0.014478, + "auxiliary_loss_mlp": 0.0103962, + "balance_loss_clip": 1.27267015, + "balance_loss_mlp": 1.01800728, + "epoch": 0.47611603787764917, + "flos": 22243291585920.0, + "grad_norm": 1.5341470990763908, + "language_loss": 0.69664377, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.72151798, + "num_input_tokens_seen": 170241625, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.21606445, + "step": 7919, + "time_per_iteration": 2.862159490585327 + }, + { + "auxiliary_loss_clip": 0.01457729, + "auxiliary_loss_mlp": 0.01037134, + "balance_loss_clip": 1.28084755, + "balance_loss_mlp": 1.01581955, + "epoch": 0.47617616113031713, + "flos": 22789262315520.0, + "grad_norm": 2.5850042652875778, + "language_loss": 0.75568521, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.78063381, + "num_input_tokens_seen": 170262470, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.21313477, + "step": 7920, + "time_per_iteration": 2.9414336681365967 + }, + { + "auxiliary_loss_clip": 0.01469862, + "auxiliary_loss_mlp": 0.01039482, + "balance_loss_clip": 1.28970337, + "balance_loss_mlp": 1.01684403, + "epoch": 0.4762362843829851, + "flos": 24144402631680.0, + "grad_norm": 1.5001670311001594, + "language_loss": 0.78286862, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.80796206, + "num_input_tokens_seen": 170283460, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.22668457, + "step": 7921, + "time_per_iteration": 2.8941493034362793 + }, + { + "auxiliary_loss_clip": 0.01447172, + "auxiliary_loss_mlp": 0.01037467, + "balance_loss_clip": 1.26932287, + "balance_loss_mlp": 1.0164032, + "epoch": 0.47629640763565306, + "flos": 22461354115200.0, + "grad_norm": 1.5246718678344429, + "language_loss": 0.78848642, + "learning_rate": 2.249963220399845e-06, + "loss": 0.8133328, + "num_input_tokens_seen": 170304225, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.21057129, + "step": 7922, + "time_per_iteration": 4.306033372879028 + }, + { + "auxiliary_loss_clip": 0.01459002, + "auxiliary_loss_mlp": 0.01034372, + "balance_loss_clip": 1.27929962, + "balance_loss_mlp": 1.01287889, + "epoch": 0.4763565308883211, + "flos": 11188648990080.0, + "grad_norm": 1.6914768447248065, + "language_loss": 0.73693967, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.76187336, + "num_input_tokens_seen": 170322110, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.21496582, + "step": 7923, + "time_per_iteration": 4.273958206176758 + }, + { + "auxiliary_loss_clip": 0.01455436, + "auxiliary_loss_mlp": 0.01036279, + "balance_loss_clip": 1.27922797, + "balance_loss_mlp": 1.0149883, + "epoch": 0.47641665414098905, + "flos": 22392210781440.0, + "grad_norm": 1.7814305703928432, + "language_loss": 0.8314504, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.85636753, + "num_input_tokens_seen": 170340700, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.21289062, + "step": 7924, + "time_per_iteration": 4.3541576862335205 + }, + { + "auxiliary_loss_clip": 0.01471961, + "auxiliary_loss_mlp": 0.01040658, + "balance_loss_clip": 1.29211628, + "balance_loss_mlp": 1.01868773, + "epoch": 0.476476777393657, + "flos": 25057038372480.0, + "grad_norm": 1.8756625482568685, + "language_loss": 0.81860507, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.84373128, + "num_input_tokens_seen": 170359780, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.21960449, + "step": 7925, + "time_per_iteration": 2.9104220867156982 + }, + { + "auxiliary_loss_clip": 0.01447694, + "auxiliary_loss_mlp": 0.01035644, + "balance_loss_clip": 1.27109289, + "balance_loss_mlp": 1.0151757, + "epoch": 0.476536900646325, + "flos": 27280899711360.0, + "grad_norm": 1.7793373890005506, + "language_loss": 0.73208791, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.75692129, + "num_input_tokens_seen": 170381260, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.20471191, + "step": 7926, + "time_per_iteration": 2.9189493656158447 + }, + { + "auxiliary_loss_clip": 0.0147246, + "auxiliary_loss_mlp": 0.01042099, + "balance_loss_clip": 1.29299951, + "balance_loss_mlp": 1.01985502, + "epoch": 0.47659702389899294, + "flos": 25312319389440.0, + "grad_norm": 2.257584878145908, + "language_loss": 0.69917536, + "learning_rate": 2.248031062546432e-06, + "loss": 0.72432101, + "num_input_tokens_seen": 170400595, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.22253418, + "step": 7927, + "time_per_iteration": 2.8907370567321777 + }, + { + "auxiliary_loss_clip": 0.01450685, + "auxiliary_loss_mlp": 0.0103753, + "balance_loss_clip": 1.27743602, + "balance_loss_mlp": 1.01728868, + "epoch": 0.4766571471516609, + "flos": 26003499240960.0, + "grad_norm": 1.8358093499029962, + "language_loss": 0.6835041, + "learning_rate": 2.247644602701045e-06, + "loss": 0.70838624, + "num_input_tokens_seen": 170421110, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.20239258, + "step": 7928, + "time_per_iteration": 2.9191734790802 + }, + { + "auxiliary_loss_clip": 0.01449735, + "auxiliary_loss_mlp": 0.01042065, + "balance_loss_clip": 1.27239752, + "balance_loss_mlp": 1.01872373, + "epoch": 0.4767172704043289, + "flos": 16040255166720.0, + "grad_norm": 2.3106231676446054, + "language_loss": 0.79905987, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.82397795, + "num_input_tokens_seen": 170436700, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.23327637, + "step": 7929, + "time_per_iteration": 2.8347578048706055 + }, + { + "auxiliary_loss_clip": 0.01444249, + "auxiliary_loss_mlp": 0.01041076, + "balance_loss_clip": 1.27113712, + "balance_loss_mlp": 1.01951075, + "epoch": 0.47677739365699684, + "flos": 39248947209600.0, + "grad_norm": 1.818756884187019, + "language_loss": 0.67673957, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.7015928, + "num_input_tokens_seen": 170459555, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.21569824, + "step": 7930, + "time_per_iteration": 3.027539014816284 + }, + { + "auxiliary_loss_clip": 0.01437974, + "auxiliary_loss_mlp": 0.01040454, + "balance_loss_clip": 1.2644248, + "balance_loss_mlp": 1.01952112, + "epoch": 0.4768375169096648, + "flos": 24728587234560.0, + "grad_norm": 1.9001083619453858, + "language_loss": 0.80587333, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.8306576, + "num_input_tokens_seen": 170479175, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.20935059, + "step": 7931, + "time_per_iteration": 2.970187187194824 + }, + { + "auxiliary_loss_clip": 0.01455683, + "auxiliary_loss_mlp": 0.01040805, + "balance_loss_clip": 1.27845311, + "balance_loss_mlp": 1.02038443, + "epoch": 0.47689764016233277, + "flos": 22538912981760.0, + "grad_norm": 1.9819938362330696, + "language_loss": 0.77207291, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.79703772, + "num_input_tokens_seen": 170498450, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.20422363, + "step": 7932, + "time_per_iteration": 2.842827558517456 + }, + { + "auxiliary_loss_clip": 0.01437083, + "auxiliary_loss_mlp": 0.01037431, + "balance_loss_clip": 1.26441312, + "balance_loss_mlp": 1.01616406, + "epoch": 0.47695776341500074, + "flos": 15128569566720.0, + "grad_norm": 1.783631545389019, + "language_loss": 0.81141335, + "learning_rate": 2.245712162906593e-06, + "loss": 0.83615851, + "num_input_tokens_seen": 170516255, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.21264648, + "step": 7933, + "time_per_iteration": 3.0048182010650635 + }, + { + "auxiliary_loss_clip": 0.0145493, + "auxiliary_loss_mlp": 0.01039662, + "balance_loss_clip": 1.27382267, + "balance_loss_mlp": 1.01640463, + "epoch": 0.4770178866676687, + "flos": 14685748277760.0, + "grad_norm": 1.7891748806098162, + "language_loss": 0.74494922, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.76989508, + "num_input_tokens_seen": 170532705, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.23254395, + "step": 7934, + "time_per_iteration": 2.881333112716675 + }, + { + "auxiliary_loss_clip": 0.01457388, + "auxiliary_loss_mlp": 0.01037705, + "balance_loss_clip": 1.27836823, + "balance_loss_mlp": 1.01628268, + "epoch": 0.47707800992033667, + "flos": 22575950490240.0, + "grad_norm": 2.0148548665650843, + "language_loss": 0.80967647, + "learning_rate": 2.244939121664211e-06, + "loss": 0.83462739, + "num_input_tokens_seen": 170551925, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.21411133, + "step": 7935, + "time_per_iteration": 2.934932231903076 + }, + { + "auxiliary_loss_clip": 0.01474209, + "auxiliary_loss_mlp": 0.01040096, + "balance_loss_clip": 1.29099, + "balance_loss_mlp": 1.0171361, + "epoch": 0.4771381331730047, + "flos": 30930085330560.0, + "grad_norm": 1.895183297126345, + "language_loss": 0.71792078, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.74306381, + "num_input_tokens_seen": 170572320, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.22937012, + "step": 7936, + "time_per_iteration": 3.001569986343384 + }, + { + "auxiliary_loss_clip": 0.01456553, + "auxiliary_loss_mlp": 0.01036495, + "balance_loss_clip": 1.27553093, + "balance_loss_mlp": 1.01547837, + "epoch": 0.47719825642567265, + "flos": 25749847036800.0, + "grad_norm": 2.396897277723704, + "language_loss": 0.68382281, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.70875323, + "num_input_tokens_seen": 170589470, + "router_z_loss_clip": 1.81054688, + "router_z_loss_mlp": 0.21008301, + "step": 7937, + "time_per_iteration": 2.8742074966430664 + }, + { + "auxiliary_loss_clip": 0.01234936, + "auxiliary_loss_mlp": 0.01071162, + "balance_loss_clip": 1.14122164, + "balance_loss_mlp": 1.05018103, + "epoch": 0.4772583796783406, + "flos": 66388049141760.0, + "grad_norm": 0.7194995213311575, + "language_loss": 0.5652529, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.58831382, + "num_input_tokens_seen": 170662265, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.20996094, + "step": 7938, + "time_per_iteration": 3.533442735671997 + }, + { + "auxiliary_loss_clip": 0.01441096, + "auxiliary_loss_mlp": 0.01037659, + "balance_loss_clip": 1.26679194, + "balance_loss_mlp": 1.01601017, + "epoch": 0.4773185029310086, + "flos": 22060366283520.0, + "grad_norm": 1.8075248938717292, + "language_loss": 0.89444637, + "learning_rate": 2.243392927839317e-06, + "loss": 0.91923392, + "num_input_tokens_seen": 170679680, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.21643066, + "step": 7939, + "time_per_iteration": 2.840416193008423 + }, + { + "auxiliary_loss_clip": 0.01450332, + "auxiliary_loss_mlp": 0.01035855, + "balance_loss_clip": 1.27415764, + "balance_loss_mlp": 1.01402736, + "epoch": 0.47737862618367655, + "flos": 16736638170240.0, + "grad_norm": 2.796284533793052, + "language_loss": 0.78930116, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.81416303, + "num_input_tokens_seen": 170697340, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.21826172, + "step": 7940, + "time_per_iteration": 2.9140732288360596 + }, + { + "auxiliary_loss_clip": 0.01436832, + "auxiliary_loss_mlp": 0.0103442, + "balance_loss_clip": 1.26485133, + "balance_loss_mlp": 1.01330817, + "epoch": 0.4774387494363445, + "flos": 19618849618560.0, + "grad_norm": 1.667016976341965, + "language_loss": 0.85843933, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.88315183, + "num_input_tokens_seen": 170714905, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.21105957, + "step": 7941, + "time_per_iteration": 2.8956892490386963 + }, + { + "auxiliary_loss_clip": 0.01460285, + "auxiliary_loss_mlp": 0.01043984, + "balance_loss_clip": 1.28053987, + "balance_loss_mlp": 1.02038026, + "epoch": 0.4774988726890125, + "flos": 16662563153280.0, + "grad_norm": 3.6040550853451783, + "language_loss": 0.76558268, + "learning_rate": 2.24223318550976e-06, + "loss": 0.79062539, + "num_input_tokens_seen": 170731810, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 0.23583984, + "step": 7942, + "time_per_iteration": 2.8286314010620117 + }, + { + "auxiliary_loss_clip": 0.01457229, + "auxiliary_loss_mlp": 0.01040316, + "balance_loss_clip": 1.28000283, + "balance_loss_mlp": 1.02007437, + "epoch": 0.47755899594168044, + "flos": 20495307502080.0, + "grad_norm": 1.651941950532469, + "language_loss": 0.64448071, + "learning_rate": 2.241846586342682e-06, + "loss": 0.66945618, + "num_input_tokens_seen": 170750270, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.20239258, + "step": 7943, + "time_per_iteration": 2.8506343364715576 + }, + { + "auxiliary_loss_clip": 0.01477128, + "auxiliary_loss_mlp": 0.01039503, + "balance_loss_clip": 1.2948966, + "balance_loss_mlp": 1.01763988, + "epoch": 0.4776191191943484, + "flos": 21662771811840.0, + "grad_norm": 1.8410164717280832, + "language_loss": 0.74352181, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.76868814, + "num_input_tokens_seen": 170769015, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.21862793, + "step": 7944, + "time_per_iteration": 2.862076997756958 + }, + { + "auxiliary_loss_clip": 0.01448337, + "auxiliary_loss_mlp": 0.01041772, + "balance_loss_clip": 1.27118826, + "balance_loss_mlp": 1.02006435, + "epoch": 0.4776792424470164, + "flos": 18779383998720.0, + "grad_norm": 2.010855858882736, + "language_loss": 0.69227648, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.71717763, + "num_input_tokens_seen": 170785725, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.21716309, + "step": 7945, + "time_per_iteration": 2.8539230823516846 + }, + { + "auxiliary_loss_clip": 0.01446507, + "auxiliary_loss_mlp": 0.01037057, + "balance_loss_clip": 1.26834524, + "balance_loss_mlp": 1.01606417, + "epoch": 0.47773936569968434, + "flos": 29727800507520.0, + "grad_norm": 1.8045056132554278, + "language_loss": 0.76118743, + "learning_rate": 2.240686733875009e-06, + "loss": 0.78602302, + "num_input_tokens_seen": 170804600, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.21008301, + "step": 7946, + "time_per_iteration": 2.9083118438720703 + }, + { + "auxiliary_loss_clip": 0.01449317, + "auxiliary_loss_mlp": 0.01042397, + "balance_loss_clip": 1.27118063, + "balance_loss_mlp": 1.02073646, + "epoch": 0.4777994889523523, + "flos": 24802571761920.0, + "grad_norm": 1.8017225958195104, + "language_loss": 0.80142123, + "learning_rate": 2.240300098112506e-06, + "loss": 0.82633829, + "num_input_tokens_seen": 170824230, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.21643066, + "step": 7947, + "time_per_iteration": 2.895531177520752 + }, + { + "auxiliary_loss_clip": 0.0144208, + "auxiliary_loss_mlp": 0.01036726, + "balance_loss_clip": 1.26919341, + "balance_loss_mlp": 1.01466036, + "epoch": 0.47785961220502027, + "flos": 17867110216320.0, + "grad_norm": 2.1117316091413256, + "language_loss": 0.74433041, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.76911843, + "num_input_tokens_seen": 170843365, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.2208252, + "step": 7948, + "time_per_iteration": 4.248170614242554 + }, + { + "auxiliary_loss_clip": 0.01453769, + "auxiliary_loss_mlp": 0.01037888, + "balance_loss_clip": 1.27648318, + "balance_loss_mlp": 1.01616859, + "epoch": 0.4779197354576883, + "flos": 20276204342400.0, + "grad_norm": 1.5064424941895973, + "language_loss": 0.79096341, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.81587994, + "num_input_tokens_seen": 170863515, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.21716309, + "step": 7949, + "time_per_iteration": 2.837682008743286 + }, + { + "auxiliary_loss_clip": 0.01440064, + "auxiliary_loss_mlp": 0.01041549, + "balance_loss_clip": 1.26607645, + "balance_loss_mlp": 1.0196619, + "epoch": 0.47797985871035625, + "flos": 17065632245760.0, + "grad_norm": 2.278572798472134, + "language_loss": 0.74984539, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.77466154, + "num_input_tokens_seen": 170881245, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.21887207, + "step": 7950, + "time_per_iteration": 2.870100498199463 + }, + { + "auxiliary_loss_clip": 0.01444432, + "auxiliary_loss_mlp": 0.01044445, + "balance_loss_clip": 1.26934147, + "balance_loss_mlp": 1.02218902, + "epoch": 0.4780399819630242, + "flos": 31371006337920.0, + "grad_norm": 1.6198737308840891, + "language_loss": 0.75231057, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.77719939, + "num_input_tokens_seen": 170901285, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.22229004, + "step": 7951, + "time_per_iteration": 2.9302005767822266 + }, + { + "auxiliary_loss_clip": 0.01453093, + "auxiliary_loss_mlp": 0.01037218, + "balance_loss_clip": 1.27359128, + "balance_loss_mlp": 1.01524758, + "epoch": 0.4781001052156922, + "flos": 24910290927360.0, + "grad_norm": 1.919109942092921, + "language_loss": 0.8090899, + "learning_rate": 2.238366782910174e-06, + "loss": 0.83399296, + "num_input_tokens_seen": 170919740, + "router_z_loss_clip": 1.79394531, + "router_z_loss_mlp": 0.21984863, + "step": 7952, + "time_per_iteration": 2.887119770050049 + }, + { + "auxiliary_loss_clip": 0.01467143, + "auxiliary_loss_mlp": 0.01040345, + "balance_loss_clip": 1.28840065, + "balance_loss_mlp": 1.01873231, + "epoch": 0.47816022846836015, + "flos": 18706983039360.0, + "grad_norm": 1.7156258351692608, + "language_loss": 0.79525322, + "learning_rate": 2.23798009269438e-06, + "loss": 0.82032812, + "num_input_tokens_seen": 170938510, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.21606445, + "step": 7953, + "time_per_iteration": 2.8589110374450684 + }, + { + "auxiliary_loss_clip": 0.01464405, + "auxiliary_loss_mlp": 0.01040282, + "balance_loss_clip": 1.28268421, + "balance_loss_mlp": 1.01912224, + "epoch": 0.4782203517210281, + "flos": 11983340240640.0, + "grad_norm": 2.767307700880212, + "language_loss": 0.84688824, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.87193513, + "num_input_tokens_seen": 170951170, + "router_z_loss_clip": 1.81542969, + "router_z_loss_mlp": 0.21154785, + "step": 7954, + "time_per_iteration": 2.786742925643921 + }, + { + "auxiliary_loss_clip": 0.01441048, + "auxiliary_loss_mlp": 0.01039343, + "balance_loss_clip": 1.2660948, + "balance_loss_mlp": 1.01708639, + "epoch": 0.4782804749736961, + "flos": 20823215702400.0, + "grad_norm": 1.6622117051563683, + "language_loss": 0.70623767, + "learning_rate": 2.237206685204768e-06, + "loss": 0.73104155, + "num_input_tokens_seen": 170970990, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.22265625, + "step": 7955, + "time_per_iteration": 2.880290985107422 + }, + { + "auxiliary_loss_clip": 0.0146181, + "auxiliary_loss_mlp": 0.01043032, + "balance_loss_clip": 1.28369248, + "balance_loss_mlp": 1.02183652, + "epoch": 0.47834059822636404, + "flos": 23850410048640.0, + "grad_norm": 1.6727667272234268, + "language_loss": 0.82222223, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.84727061, + "num_input_tokens_seen": 170991215, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.21203613, + "step": 7956, + "time_per_iteration": 2.927103281021118 + }, + { + "auxiliary_loss_clip": 0.0144786, + "auxiliary_loss_mlp": 0.0103951, + "balance_loss_clip": 1.27360499, + "balance_loss_mlp": 1.01751566, + "epoch": 0.478400721479032, + "flos": 22642922073600.0, + "grad_norm": 1.8429657620329316, + "language_loss": 0.85649341, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.88136709, + "num_input_tokens_seen": 171007325, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.21984863, + "step": 7957, + "time_per_iteration": 4.283450603485107 + }, + { + "auxiliary_loss_clip": 0.01443054, + "auxiliary_loss_mlp": 0.01039922, + "balance_loss_clip": 1.26830482, + "balance_loss_mlp": 1.01932263, + "epoch": 0.4784608447317, + "flos": 19364518742400.0, + "grad_norm": 1.7889230765108997, + "language_loss": 0.80462468, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.82945442, + "num_input_tokens_seen": 171025650, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.20593262, + "step": 7958, + "time_per_iteration": 4.287928581237793 + }, + { + "auxiliary_loss_clip": 0.0145318, + "auxiliary_loss_mlp": 0.01045223, + "balance_loss_clip": 1.2766856, + "balance_loss_mlp": 1.02347922, + "epoch": 0.47852096798436794, + "flos": 24030846887040.0, + "grad_norm": 11.208309163051378, + "language_loss": 0.84226263, + "learning_rate": 2.235659762404047e-06, + "loss": 0.86724669, + "num_input_tokens_seen": 171045045, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.2175293, + "step": 7959, + "time_per_iteration": 4.277459144592285 + }, + { + "auxiliary_loss_clip": 0.01439043, + "auxiliary_loss_mlp": 0.0103548, + "balance_loss_clip": 1.26930642, + "balance_loss_mlp": 1.01577437, + "epoch": 0.4785810912370359, + "flos": 25677491322240.0, + "grad_norm": 2.917568435913246, + "language_loss": 0.73711342, + "learning_rate": 2.235273009326599e-06, + "loss": 0.76185858, + "num_input_tokens_seen": 171062910, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.19714355, + "step": 7960, + "time_per_iteration": 2.893531322479248 + }, + { + "auxiliary_loss_clip": 0.01448327, + "auxiliary_loss_mlp": 0.01039817, + "balance_loss_clip": 1.27592111, + "balance_loss_mlp": 1.01901448, + "epoch": 0.47864121448970387, + "flos": 21441587391360.0, + "grad_norm": 1.6984373747146715, + "language_loss": 0.77541584, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.80029726, + "num_input_tokens_seen": 171080875, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.20788574, + "step": 7961, + "time_per_iteration": 2.814842462539673 + }, + { + "auxiliary_loss_clip": 0.01446014, + "auxiliary_loss_mlp": 0.01037752, + "balance_loss_clip": 1.2703042, + "balance_loss_mlp": 1.01718807, + "epoch": 0.47870133774237184, + "flos": 16152544056960.0, + "grad_norm": 1.8826411917429637, + "language_loss": 0.78348994, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.80832767, + "num_input_tokens_seen": 171099190, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.20556641, + "step": 7962, + "time_per_iteration": 2.8784334659576416 + }, + { + "auxiliary_loss_clip": 0.01454218, + "auxiliary_loss_mlp": 0.01038868, + "balance_loss_clip": 1.27687478, + "balance_loss_mlp": 1.01837635, + "epoch": 0.47876146099503986, + "flos": 26918125752960.0, + "grad_norm": 2.1610304750723026, + "language_loss": 0.65879983, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.68373072, + "num_input_tokens_seen": 171119060, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.20495605, + "step": 7963, + "time_per_iteration": 2.944908380508423 + }, + { + "auxiliary_loss_clip": 0.0145986, + "auxiliary_loss_mlp": 0.01041042, + "balance_loss_clip": 1.28285408, + "balance_loss_mlp": 1.01963246, + "epoch": 0.4788215842477078, + "flos": 45348200760960.0, + "grad_norm": 1.9829093081648996, + "language_loss": 0.78575063, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.81075966, + "num_input_tokens_seen": 171141900, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.21398926, + "step": 7964, + "time_per_iteration": 3.041947364807129 + }, + { + "auxiliary_loss_clip": 0.01465915, + "auxiliary_loss_mlp": 0.01041672, + "balance_loss_clip": 1.28495955, + "balance_loss_mlp": 1.01917756, + "epoch": 0.4788817075003758, + "flos": 22247454107520.0, + "grad_norm": 1.825409086212718, + "language_loss": 0.76801038, + "learning_rate": 2.233339110409044e-06, + "loss": 0.79308629, + "num_input_tokens_seen": 171161045, + "router_z_loss_clip": 1.80859375, + "router_z_loss_mlp": 0.22509766, + "step": 7965, + "time_per_iteration": 2.902519464492798 + }, + { + "auxiliary_loss_clip": 0.01454327, + "auxiliary_loss_mlp": 0.01039608, + "balance_loss_clip": 1.27725518, + "balance_loss_mlp": 1.01764953, + "epoch": 0.47894183075304375, + "flos": 16479637850880.0, + "grad_norm": 1.571762812802044, + "language_loss": 0.75843501, + "learning_rate": 2.232952304022137e-06, + "loss": 0.78337443, + "num_input_tokens_seen": 171179675, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.21972656, + "step": 7966, + "time_per_iteration": 2.827443838119507 + }, + { + "auxiliary_loss_clip": 0.01459112, + "auxiliary_loss_mlp": 0.01038399, + "balance_loss_clip": 1.28224015, + "balance_loss_mlp": 1.01657212, + "epoch": 0.4790019540057117, + "flos": 24294090988800.0, + "grad_norm": 1.707138716418712, + "language_loss": 0.73791134, + "learning_rate": 2.232565488801655e-06, + "loss": 0.7628864, + "num_input_tokens_seen": 171201175, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.21838379, + "step": 7967, + "time_per_iteration": 3.0499303340911865 + }, + { + "auxiliary_loss_clip": 0.0144345, + "auxiliary_loss_mlp": 0.01040986, + "balance_loss_clip": 1.27343225, + "balance_loss_mlp": 1.01915908, + "epoch": 0.4790620772583797, + "flos": 25677536567040.0, + "grad_norm": 1.827366769135906, + "language_loss": 0.803002, + "learning_rate": 2.232178664762267e-06, + "loss": 0.82784629, + "num_input_tokens_seen": 171221750, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.21826172, + "step": 7968, + "time_per_iteration": 2.9276950359344482 + }, + { + "auxiliary_loss_clip": 0.01233186, + "auxiliary_loss_mlp": 0.01049447, + "balance_loss_clip": 1.13905478, + "balance_loss_mlp": 1.0280844, + "epoch": 0.47912220051104765, + "flos": 69463908910080.0, + "grad_norm": 0.7638946134702306, + "language_loss": 0.62269735, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.64552367, + "num_input_tokens_seen": 171292235, + "router_z_loss_clip": 0.94140625, + "router_z_loss_mlp": 0.21386719, + "step": 7969, + "time_per_iteration": 3.5951247215270996 + }, + { + "auxiliary_loss_clip": 0.01444823, + "auxiliary_loss_mlp": 0.01043747, + "balance_loss_clip": 1.27207911, + "balance_loss_mlp": 1.0215739, + "epoch": 0.4791823237637156, + "flos": 24179177900160.0, + "grad_norm": 1.417795177806257, + "language_loss": 0.78075999, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.8056457, + "num_input_tokens_seen": 171312215, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.22167969, + "step": 7970, + "time_per_iteration": 2.953460693359375 + }, + { + "auxiliary_loss_clip": 0.01446467, + "auxiliary_loss_mlp": 0.01034936, + "balance_loss_clip": 1.27104092, + "balance_loss_mlp": 1.01418185, + "epoch": 0.4792424470163836, + "flos": 24761552711040.0, + "grad_norm": 1.8780800365876704, + "language_loss": 0.71320045, + "learning_rate": 2.231018139877349e-06, + "loss": 0.73801446, + "num_input_tokens_seen": 171332975, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.20751953, + "step": 7971, + "time_per_iteration": 2.8848326206207275 + }, + { + "auxiliary_loss_clip": 0.0144755, + "auxiliary_loss_mlp": 0.01037135, + "balance_loss_clip": 1.27413845, + "balance_loss_mlp": 1.01542699, + "epoch": 0.47930257026905154, + "flos": 23268216216960.0, + "grad_norm": 1.493220119932062, + "language_loss": 0.80697685, + "learning_rate": 2.230631280709021e-06, + "loss": 0.83182371, + "num_input_tokens_seen": 171353880, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.21716309, + "step": 7972, + "time_per_iteration": 2.8771238327026367 + }, + { + "auxiliary_loss_clip": 0.01462834, + "auxiliary_loss_mlp": 0.01031362, + "balance_loss_clip": 1.28429544, + "balance_loss_mlp": 1.01025009, + "epoch": 0.4793626935217195, + "flos": 14072172537600.0, + "grad_norm": 2.0718755723732185, + "language_loss": 0.70957828, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.73452026, + "num_input_tokens_seen": 171370930, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.21118164, + "step": 7973, + "time_per_iteration": 2.8069820404052734 + }, + { + "auxiliary_loss_clip": 0.01437785, + "auxiliary_loss_mlp": 0.01044017, + "balance_loss_clip": 1.26695621, + "balance_loss_mlp": 1.02155757, + "epoch": 0.4794228167743875, + "flos": 21808885829760.0, + "grad_norm": 2.114884285014367, + "language_loss": 0.7959547, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.82077277, + "num_input_tokens_seen": 171387575, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.22436523, + "step": 7974, + "time_per_iteration": 2.8390238285064697 + }, + { + "auxiliary_loss_clip": 0.01226348, + "auxiliary_loss_mlp": 0.01024539, + "balance_loss_clip": 1.13285351, + "balance_loss_mlp": 1.00756347, + "epoch": 0.47948294002705544, + "flos": 67000177048320.0, + "grad_norm": 0.75831902757407, + "language_loss": 0.5411014, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.56361026, + "num_input_tokens_seen": 171449980, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.16992188, + "step": 7975, + "time_per_iteration": 3.3998100757598877 + }, + { + "auxiliary_loss_clip": 0.01474574, + "auxiliary_loss_mlp": 0.01036599, + "balance_loss_clip": 1.29262376, + "balance_loss_mlp": 1.01553464, + "epoch": 0.47954306327972346, + "flos": 12430188316800.0, + "grad_norm": 2.1778087503371535, + "language_loss": 0.90651679, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.93162853, + "num_input_tokens_seen": 171465290, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.21069336, + "step": 7976, + "time_per_iteration": 2.8015477657318115 + }, + { + "auxiliary_loss_clip": 0.01469731, + "auxiliary_loss_mlp": 0.01039762, + "balance_loss_clip": 1.28772855, + "balance_loss_mlp": 1.01689792, + "epoch": 0.4796031865323914, + "flos": 18369211472640.0, + "grad_norm": 2.3958547476113803, + "language_loss": 0.7490648, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.77415973, + "num_input_tokens_seen": 171481130, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.22888184, + "step": 7977, + "time_per_iteration": 2.851747512817383 + }, + { + "auxiliary_loss_clip": 0.01445232, + "auxiliary_loss_mlp": 0.0103578, + "balance_loss_clip": 1.27186, + "balance_loss_mlp": 1.01534748, + "epoch": 0.4796633097850594, + "flos": 21845063687040.0, + "grad_norm": 1.713877891740713, + "language_loss": 0.78898185, + "learning_rate": 2.228309942555734e-06, + "loss": 0.81379199, + "num_input_tokens_seen": 171501140, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.2043457, + "step": 7978, + "time_per_iteration": 2.894744873046875 + }, + { + "auxiliary_loss_clip": 0.01463166, + "auxiliary_loss_mlp": 0.01038293, + "balance_loss_clip": 1.2849896, + "balance_loss_mlp": 1.01656127, + "epoch": 0.47972343303772735, + "flos": 23446933752960.0, + "grad_norm": 1.6682053843263283, + "language_loss": 0.89809173, + "learning_rate": 2.22792302247656e-06, + "loss": 0.92310631, + "num_input_tokens_seen": 171519835, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.21728516, + "step": 7979, + "time_per_iteration": 2.885093927383423 + }, + { + "auxiliary_loss_clip": 0.01465277, + "auxiliary_loss_mlp": 0.01041016, + "balance_loss_clip": 1.285537, + "balance_loss_mlp": 1.01904571, + "epoch": 0.4797835562903953, + "flos": 24910517151360.0, + "grad_norm": 1.5204224289895303, + "language_loss": 0.77336419, + "learning_rate": 2.227536093754523e-06, + "loss": 0.7984271, + "num_input_tokens_seen": 171540980, + "router_z_loss_clip": 1.79980469, + "router_z_loss_mlp": 0.21972656, + "step": 7980, + "time_per_iteration": 2.9058990478515625 + }, + { + "auxiliary_loss_clip": 0.01472954, + "auxiliary_loss_mlp": 0.01042002, + "balance_loss_clip": 1.29174662, + "balance_loss_mlp": 1.0196507, + "epoch": 0.4798436795430633, + "flos": 35056415059200.0, + "grad_norm": 1.759323580275435, + "language_loss": 0.72995806, + "learning_rate": 2.227149156404295e-06, + "loss": 0.75510764, + "num_input_tokens_seen": 171563600, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.22351074, + "step": 7981, + "time_per_iteration": 2.9926531314849854 + }, + { + "auxiliary_loss_clip": 0.0145242, + "auxiliary_loss_mlp": 0.01038568, + "balance_loss_clip": 1.27875364, + "balance_loss_mlp": 1.0179565, + "epoch": 0.47990380279573125, + "flos": 20598728411520.0, + "grad_norm": 1.923699508825224, + "language_loss": 0.70587063, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.73078048, + "num_input_tokens_seen": 171580700, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.20617676, + "step": 7982, + "time_per_iteration": 2.895615577697754 + }, + { + "auxiliary_loss_clip": 0.01435514, + "auxiliary_loss_mlp": 0.01035207, + "balance_loss_clip": 1.2658844, + "balance_loss_mlp": 1.01508439, + "epoch": 0.4799639260483992, + "flos": 26370571455360.0, + "grad_norm": 1.6099331029287876, + "language_loss": 0.71994317, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.74465036, + "num_input_tokens_seen": 171602035, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.20117188, + "step": 7983, + "time_per_iteration": 4.352653503417969 + }, + { + "auxiliary_loss_clip": 0.01230113, + "auxiliary_loss_mlp": 0.01051785, + "balance_loss_clip": 1.1367898, + "balance_loss_mlp": 1.03604925, + "epoch": 0.4800240493010672, + "flos": 71010598302720.0, + "grad_norm": 0.8321816553551329, + "language_loss": 0.59458911, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.6174081, + "num_input_tokens_seen": 171659215, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.15722656, + "step": 7984, + "time_per_iteration": 3.3027122020721436 + }, + { + "auxiliary_loss_clip": 0.01437008, + "auxiliary_loss_mlp": 0.01041783, + "balance_loss_clip": 1.26417375, + "balance_loss_mlp": 1.02014637, + "epoch": 0.48008417255373514, + "flos": 17094027997440.0, + "grad_norm": 1.5909223358865534, + "language_loss": 0.67351359, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.69830149, + "num_input_tokens_seen": 171675710, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.21630859, + "step": 7985, + "time_per_iteration": 2.854088306427002 + }, + { + "auxiliary_loss_clip": 0.01461293, + "auxiliary_loss_mlp": 0.01043303, + "balance_loss_clip": 1.28187227, + "balance_loss_mlp": 1.02091599, + "epoch": 0.4801442958064031, + "flos": 15421702498560.0, + "grad_norm": 1.813739014653087, + "language_loss": 0.71286267, + "learning_rate": 2.225214340743835e-06, + "loss": 0.7379086, + "num_input_tokens_seen": 171692510, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.22387695, + "step": 7986, + "time_per_iteration": 2.9303128719329834 + }, + { + "auxiliary_loss_clip": 0.01458847, + "auxiliary_loss_mlp": 0.01039261, + "balance_loss_clip": 1.27966392, + "balance_loss_mlp": 1.01774335, + "epoch": 0.4802044190590711, + "flos": 11481193739520.0, + "grad_norm": 1.9943469183220444, + "language_loss": 0.80038142, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.82536244, + "num_input_tokens_seen": 171710235, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.21508789, + "step": 7987, + "time_per_iteration": 2.8450844287872314 + }, + { + "auxiliary_loss_clip": 0.01447119, + "auxiliary_loss_mlp": 0.01040662, + "balance_loss_clip": 1.2713213, + "balance_loss_mlp": 1.02040792, + "epoch": 0.48026454231173904, + "flos": 20958561457920.0, + "grad_norm": 1.9799738798781703, + "language_loss": 0.76637465, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.79125243, + "num_input_tokens_seen": 171726715, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.20251465, + "step": 7988, + "time_per_iteration": 2.8418564796447754 + }, + { + "auxiliary_loss_clip": 0.01449707, + "auxiliary_loss_mlp": 0.01038677, + "balance_loss_clip": 1.27320051, + "balance_loss_mlp": 1.01756454, + "epoch": 0.48032466556440706, + "flos": 20457591321600.0, + "grad_norm": 2.1556683668002603, + "language_loss": 0.79941022, + "learning_rate": 2.224053348748365e-06, + "loss": 0.82429403, + "num_input_tokens_seen": 171743605, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.21118164, + "step": 7989, + "time_per_iteration": 2.850249767303467 + }, + { + "auxiliary_loss_clip": 0.01469762, + "auxiliary_loss_mlp": 0.01039781, + "balance_loss_clip": 1.28891683, + "balance_loss_mlp": 1.01853776, + "epoch": 0.480384788817075, + "flos": 37134298114560.0, + "grad_norm": 1.7690563828819914, + "language_loss": 0.74414825, + "learning_rate": 2.223666334404724e-06, + "loss": 0.76924366, + "num_input_tokens_seen": 171765445, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.21240234, + "step": 7990, + "time_per_iteration": 3.0854408740997314 + }, + { + "auxiliary_loss_clip": 0.01231484, + "auxiliary_loss_mlp": 0.01023448, + "balance_loss_clip": 1.1378274, + "balance_loss_mlp": 1.00132227, + "epoch": 0.480444912069743, + "flos": 69582876048000.0, + "grad_norm": 0.7696504984305313, + "language_loss": 0.59093881, + "learning_rate": 2.223279311579633e-06, + "loss": 0.6134882, + "num_input_tokens_seen": 171830115, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.22167969, + "step": 7991, + "time_per_iteration": 3.4493937492370605 + }, + { + "auxiliary_loss_clip": 0.01440855, + "auxiliary_loss_mlp": 0.01034579, + "balance_loss_clip": 1.26621222, + "balance_loss_mlp": 1.0137291, + "epoch": 0.48050503532241096, + "flos": 29833438412160.0, + "grad_norm": 2.4762385066957813, + "language_loss": 0.67651904, + "learning_rate": 2.222892280287768e-06, + "loss": 0.70127338, + "num_input_tokens_seen": 171849135, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.20849609, + "step": 7992, + "time_per_iteration": 4.352416038513184 + }, + { + "auxiliary_loss_clip": 0.01461452, + "auxiliary_loss_mlp": 0.01044538, + "balance_loss_clip": 1.28223157, + "balance_loss_mlp": 1.02368808, + "epoch": 0.4805651585750789, + "flos": 23958355438080.0, + "grad_norm": 1.8820432129740612, + "language_loss": 0.76919591, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.79425585, + "num_input_tokens_seen": 171868880, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.20861816, + "step": 7993, + "time_per_iteration": 4.377902030944824 + }, + { + "auxiliary_loss_clip": 0.01439263, + "auxiliary_loss_mlp": 0.0104072, + "balance_loss_clip": 1.26773238, + "balance_loss_mlp": 1.0201447, + "epoch": 0.4806252818277469, + "flos": 25676224467840.0, + "grad_norm": 1.9353126819328703, + "language_loss": 0.78956151, + "learning_rate": 2.222118192362422e-06, + "loss": 0.81436127, + "num_input_tokens_seen": 171889455, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.20556641, + "step": 7994, + "time_per_iteration": 2.9501264095306396 + }, + { + "auxiliary_loss_clip": 0.01440736, + "auxiliary_loss_mlp": 0.01037338, + "balance_loss_clip": 1.26568949, + "balance_loss_mlp": 1.01641679, + "epoch": 0.48068540508041485, + "flos": 13159853510400.0, + "grad_norm": 1.8356234087053929, + "language_loss": 0.80145812, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.82623887, + "num_input_tokens_seen": 171906070, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.20910645, + "step": 7995, + "time_per_iteration": 2.850233316421509 + }, + { + "auxiliary_loss_clip": 0.01441428, + "auxiliary_loss_mlp": 0.01044477, + "balance_loss_clip": 1.2687633, + "balance_loss_mlp": 1.02247095, + "epoch": 0.4807455283330828, + "flos": 21186125395200.0, + "grad_norm": 1.9971995138691423, + "language_loss": 0.83392429, + "learning_rate": 2.2213440707461e-06, + "loss": 0.85878336, + "num_input_tokens_seen": 171926515, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.22009277, + "step": 7996, + "time_per_iteration": 2.8977081775665283 + }, + { + "auxiliary_loss_clip": 0.01432743, + "auxiliary_loss_mlp": 0.01040707, + "balance_loss_clip": 1.26266611, + "balance_loss_mlp": 1.01860571, + "epoch": 0.4808056515857508, + "flos": 12283531361280.0, + "grad_norm": 16.175787720793505, + "language_loss": 0.81336981, + "learning_rate": 2.220956997340516e-06, + "loss": 0.83810425, + "num_input_tokens_seen": 171943845, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.22094727, + "step": 7997, + "time_per_iteration": 2.873546838760376 + }, + { + "auxiliary_loss_clip": 0.01452646, + "auxiliary_loss_mlp": 0.01042275, + "balance_loss_clip": 1.27685905, + "balance_loss_mlp": 1.01999462, + "epoch": 0.48086577483841875, + "flos": 24836577868800.0, + "grad_norm": 1.685274737077479, + "language_loss": 0.73144317, + "learning_rate": 2.220569915556221e-06, + "loss": 0.75639242, + "num_input_tokens_seen": 171964970, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.22265625, + "step": 7998, + "time_per_iteration": 2.944410562515259 + }, + { + "auxiliary_loss_clip": 0.01431101, + "auxiliary_loss_mlp": 0.01043078, + "balance_loss_clip": 1.25838649, + "balance_loss_mlp": 1.0202378, + "epoch": 0.4809258980910867, + "flos": 24476880556800.0, + "grad_norm": 1.7481417099559415, + "language_loss": 0.71442926, + "learning_rate": 2.220182825407892e-06, + "loss": 0.73917103, + "num_input_tokens_seen": 171986340, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.2286377, + "step": 7999, + "time_per_iteration": 3.049927234649658 + }, + { + "auxiliary_loss_clip": 0.01454771, + "auxiliary_loss_mlp": 0.01046498, + "balance_loss_clip": 1.27593374, + "balance_loss_mlp": 1.02358603, + "epoch": 0.4809860213437547, + "flos": 21226058570880.0, + "grad_norm": 1.5224006786718538, + "language_loss": 0.72325689, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.74826956, + "num_input_tokens_seen": 172007300, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.22912598, + "step": 8000, + "time_per_iteration": 2.9187307357788086 + }, + { + "auxiliary_loss_clip": 0.01452473, + "auxiliary_loss_mlp": 0.01040679, + "balance_loss_clip": 1.27656174, + "balance_loss_mlp": 1.01813626, + "epoch": 0.48104614459642264, + "flos": 37645810289280.0, + "grad_norm": 1.4690997171397149, + "language_loss": 0.75139773, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.77632916, + "num_input_tokens_seen": 172029585, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.22546387, + "step": 8001, + "time_per_iteration": 3.0310871601104736 + }, + { + "auxiliary_loss_clip": 0.01455787, + "auxiliary_loss_mlp": 0.01044168, + "balance_loss_clip": 1.27804601, + "balance_loss_mlp": 1.02248371, + "epoch": 0.48110626784909066, + "flos": 18415388430720.0, + "grad_norm": 1.6703034474858074, + "language_loss": 0.82015562, + "learning_rate": 2.219021504925493e-06, + "loss": 0.84515512, + "num_input_tokens_seen": 172047495, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.21691895, + "step": 8002, + "time_per_iteration": 2.88570237159729 + }, + { + "auxiliary_loss_clip": 0.0145258, + "auxiliary_loss_mlp": 0.01041282, + "balance_loss_clip": 1.27460814, + "balance_loss_mlp": 1.01823831, + "epoch": 0.48116639110175863, + "flos": 28451712136320.0, + "grad_norm": 1.7372421682889956, + "language_loss": 0.72100884, + "learning_rate": 2.218634381467819e-06, + "loss": 0.74594748, + "num_input_tokens_seen": 172067625, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.23046875, + "step": 8003, + "time_per_iteration": 2.928603172302246 + }, + { + "auxiliary_loss_clip": 0.0142792, + "auxiliary_loss_mlp": 0.01039762, + "balance_loss_clip": 1.25713599, + "balance_loss_mlp": 1.01792276, + "epoch": 0.4812265143544266, + "flos": 21735082281600.0, + "grad_norm": 1.6178341630289752, + "language_loss": 0.82884675, + "learning_rate": 2.218247249719507e-06, + "loss": 0.85352361, + "num_input_tokens_seen": 172087885, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.21826172, + "step": 8004, + "time_per_iteration": 2.9434335231781006 + }, + { + "auxiliary_loss_clip": 0.01476651, + "auxiliary_loss_mlp": 0.01042934, + "balance_loss_clip": 1.29080272, + "balance_loss_mlp": 1.02029634, + "epoch": 0.48128663760709456, + "flos": 13232480693760.0, + "grad_norm": 1.9785219840643586, + "language_loss": 0.78148961, + "learning_rate": 2.217860109695239e-06, + "loss": 0.80668539, + "num_input_tokens_seen": 172105815, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.2265625, + "step": 8005, + "time_per_iteration": 2.853102445602417 + }, + { + "auxiliary_loss_clip": 0.01456384, + "auxiliary_loss_mlp": 0.01039028, + "balance_loss_clip": 1.27821374, + "balance_loss_mlp": 1.01799941, + "epoch": 0.4813467608597625, + "flos": 24254112568320.0, + "grad_norm": 1.761310330873654, + "language_loss": 0.71587288, + "learning_rate": 2.217472961409692e-06, + "loss": 0.74082708, + "num_input_tokens_seen": 172126125, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.21044922, + "step": 8006, + "time_per_iteration": 2.9403984546661377 + }, + { + "auxiliary_loss_clip": 0.01452385, + "auxiliary_loss_mlp": 0.01038208, + "balance_loss_clip": 1.27412009, + "balance_loss_mlp": 1.01665509, + "epoch": 0.4814068841124305, + "flos": 27489279853440.0, + "grad_norm": 1.9187621192169098, + "language_loss": 0.71263802, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.73754394, + "num_input_tokens_seen": 172141945, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.2154541, + "step": 8007, + "time_per_iteration": 2.895547389984131 + }, + { + "auxiliary_loss_clip": 0.0146015, + "auxiliary_loss_mlp": 0.01037177, + "balance_loss_clip": 1.28154206, + "balance_loss_mlp": 1.01549256, + "epoch": 0.48146700736509845, + "flos": 19582264558080.0, + "grad_norm": 1.6640914707682575, + "language_loss": 0.72706449, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.7520377, + "num_input_tokens_seen": 172161095, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.21679688, + "step": 8008, + "time_per_iteration": 2.9506442546844482 + }, + { + "auxiliary_loss_clip": 0.01457586, + "auxiliary_loss_mlp": 0.01042161, + "balance_loss_clip": 1.2792033, + "balance_loss_mlp": 1.01932037, + "epoch": 0.4815271306177664, + "flos": 20636580326400.0, + "grad_norm": 1.8103298055598507, + "language_loss": 0.61591613, + "learning_rate": 2.216311467132199e-06, + "loss": 0.64091349, + "num_input_tokens_seen": 172178750, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.22851562, + "step": 8009, + "time_per_iteration": 2.937575578689575 + }, + { + "auxiliary_loss_clip": 0.01231813, + "auxiliary_loss_mlp": 0.01026591, + "balance_loss_clip": 1.13711929, + "balance_loss_mlp": 1.003703, + "epoch": 0.4815872538704344, + "flos": 67720702792320.0, + "grad_norm": 0.8805211965501725, + "language_loss": 0.61471403, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.63729799, + "num_input_tokens_seen": 172240235, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.22851562, + "step": 8010, + "time_per_iteration": 3.417440891265869 + }, + { + "auxiliary_loss_clip": 0.01446432, + "auxiliary_loss_mlp": 0.01039648, + "balance_loss_clip": 1.27107453, + "balance_loss_mlp": 1.01566339, + "epoch": 0.48164737712310235, + "flos": 22831141017600.0, + "grad_norm": 1.6237337707487096, + "language_loss": 0.74107701, + "learning_rate": 2.215537096576639e-06, + "loss": 0.76593781, + "num_input_tokens_seen": 172259875, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.23986816, + "step": 8011, + "time_per_iteration": 2.9076569080352783 + }, + { + "auxiliary_loss_clip": 0.01440414, + "auxiliary_loss_mlp": 0.01038125, + "balance_loss_clip": 1.26746809, + "balance_loss_mlp": 1.01658404, + "epoch": 0.4817075003757703, + "flos": 23744726899200.0, + "grad_norm": 1.773482835948781, + "language_loss": 0.80109239, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.82587779, + "num_input_tokens_seen": 172280150, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.21533203, + "step": 8012, + "time_per_iteration": 2.9070706367492676 + }, + { + "auxiliary_loss_clip": 0.01455092, + "auxiliary_loss_mlp": 0.01038801, + "balance_loss_clip": 1.27848315, + "balance_loss_mlp": 1.0162226, + "epoch": 0.4817676236284383, + "flos": 28194349858560.0, + "grad_norm": 1.718084716160457, + "language_loss": 0.74468762, + "learning_rate": 2.214762693328326e-06, + "loss": 0.7696265, + "num_input_tokens_seen": 172300810, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.22595215, + "step": 8013, + "time_per_iteration": 2.9592089653015137 + }, + { + "auxiliary_loss_clip": 0.01452941, + "auxiliary_loss_mlp": 0.01039726, + "balance_loss_clip": 1.27726054, + "balance_loss_mlp": 1.01774359, + "epoch": 0.48182774688110624, + "flos": 17100452759040.0, + "grad_norm": 1.8581532077794292, + "language_loss": 0.90977663, + "learning_rate": 2.214375479481094e-06, + "loss": 0.93470323, + "num_input_tokens_seen": 172317930, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.21984863, + "step": 8014, + "time_per_iteration": 2.866194248199463 + }, + { + "auxiliary_loss_clip": 0.01458032, + "auxiliary_loss_mlp": 0.01044505, + "balance_loss_clip": 1.27855456, + "balance_loss_mlp": 1.02150929, + "epoch": 0.4818878701337742, + "flos": 12575487928320.0, + "grad_norm": 2.0689594427714146, + "language_loss": 0.7545805, + "learning_rate": 2.213988257504722e-06, + "loss": 0.77960587, + "num_input_tokens_seen": 172336340, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.23010254, + "step": 8015, + "time_per_iteration": 2.970508098602295 + }, + { + "auxiliary_loss_clip": 0.0147471, + "auxiliary_loss_mlp": 0.01041855, + "balance_loss_clip": 1.2908293, + "balance_loss_mlp": 1.0193249, + "epoch": 0.48194799338644223, + "flos": 24619329745920.0, + "grad_norm": 2.5640191400150147, + "language_loss": 0.80913389, + "learning_rate": 2.213601027413894e-06, + "loss": 0.8342995, + "num_input_tokens_seen": 172354315, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.2253418, + "step": 8016, + "time_per_iteration": 2.927116870880127 + }, + { + "auxiliary_loss_clip": 0.01436485, + "auxiliary_loss_mlp": 0.01038222, + "balance_loss_clip": 1.26443505, + "balance_loss_mlp": 1.01681173, + "epoch": 0.4820081166391102, + "flos": 21114946045440.0, + "grad_norm": 1.8637497119065356, + "language_loss": 0.78400362, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.80875075, + "num_input_tokens_seen": 172372695, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.21398926, + "step": 8017, + "time_per_iteration": 4.364032983779907 + }, + { + "auxiliary_loss_clip": 0.01441151, + "auxiliary_loss_mlp": 0.01038007, + "balance_loss_clip": 1.26871276, + "balance_loss_mlp": 1.01575017, + "epoch": 0.48206823989177816, + "flos": 25275417615360.0, + "grad_norm": 1.8641081324352222, + "language_loss": 0.80884123, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.83363283, + "num_input_tokens_seen": 172390905, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.22253418, + "step": 8018, + "time_per_iteration": 2.912993907928467 + }, + { + "auxiliary_loss_clip": 0.01466144, + "auxiliary_loss_mlp": 0.01043754, + "balance_loss_clip": 1.28629899, + "balance_loss_mlp": 1.02351272, + "epoch": 0.4821283631444461, + "flos": 24655914806400.0, + "grad_norm": 1.8991504010230011, + "language_loss": 0.76844895, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.79354793, + "num_input_tokens_seen": 172412295, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.20227051, + "step": 8019, + "time_per_iteration": 2.9922773838043213 + }, + { + "auxiliary_loss_clip": 0.01451761, + "auxiliary_loss_mlp": 0.01042341, + "balance_loss_clip": 1.27387071, + "balance_loss_mlp": 1.02099097, + "epoch": 0.4821884863971141, + "flos": 23962834673280.0, + "grad_norm": 1.651683129202949, + "language_loss": 0.79810244, + "learning_rate": 2.212052026199701e-06, + "loss": 0.82304347, + "num_input_tokens_seen": 172432625, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.21337891, + "step": 8020, + "time_per_iteration": 3.019916296005249 + }, + { + "auxiliary_loss_clip": 0.0142945, + "auxiliary_loss_mlp": 0.01045968, + "balance_loss_clip": 1.25740147, + "balance_loss_mlp": 1.02478433, + "epoch": 0.48224860964978206, + "flos": 17168329238400.0, + "grad_norm": 10.718134904980115, + "language_loss": 0.70137709, + "learning_rate": 2.211664755756855e-06, + "loss": 0.72613126, + "num_input_tokens_seen": 172450010, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.21191406, + "step": 8021, + "time_per_iteration": 2.939615249633789 + }, + { + "auxiliary_loss_clip": 0.01474348, + "auxiliary_loss_mlp": 0.01045457, + "balance_loss_clip": 1.29206014, + "balance_loss_mlp": 1.02315307, + "epoch": 0.48230873290245, + "flos": 23086195810560.0, + "grad_norm": 1.864012865195003, + "language_loss": 0.64054352, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.66574156, + "num_input_tokens_seen": 172469080, + "router_z_loss_clip": 1.82421875, + "router_z_loss_mlp": 0.22290039, + "step": 8022, + "time_per_iteration": 2.8991527557373047 + }, + { + "auxiliary_loss_clip": 0.01436965, + "auxiliary_loss_mlp": 0.01037325, + "balance_loss_clip": 1.26431179, + "balance_loss_mlp": 1.01695263, + "epoch": 0.482368856155118, + "flos": 19362708950400.0, + "grad_norm": 2.8418351332735634, + "language_loss": 0.67140746, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.6961503, + "num_input_tokens_seen": 172484850, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.20373535, + "step": 8023, + "time_per_iteration": 2.8752830028533936 + }, + { + "auxiliary_loss_clip": 0.01453627, + "auxiliary_loss_mlp": 0.01040034, + "balance_loss_clip": 1.27809834, + "balance_loss_mlp": 1.01846886, + "epoch": 0.48242897940778595, + "flos": 20087261481600.0, + "grad_norm": 1.8058208886102474, + "language_loss": 0.77542478, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.80036139, + "num_input_tokens_seen": 172503525, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.21557617, + "step": 8024, + "time_per_iteration": 2.9372446537017822 + }, + { + "auxiliary_loss_clip": 0.01450113, + "auxiliary_loss_mlp": 0.01044104, + "balance_loss_clip": 1.27362776, + "balance_loss_mlp": 1.02051187, + "epoch": 0.4824891026604539, + "flos": 23414375479680.0, + "grad_norm": 1.5713528217868877, + "language_loss": 0.75878501, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.78372711, + "num_input_tokens_seen": 172524360, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.23596191, + "step": 8025, + "time_per_iteration": 2.895030975341797 + }, + { + "auxiliary_loss_clip": 0.01437201, + "auxiliary_loss_mlp": 0.01042406, + "balance_loss_clip": 1.26308405, + "balance_loss_mlp": 1.02103186, + "epoch": 0.4825492259131219, + "flos": 20376458115840.0, + "grad_norm": 1.9107915944120655, + "language_loss": 0.71817243, + "learning_rate": 2.209728283441112e-06, + "loss": 0.74296856, + "num_input_tokens_seen": 172541480, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.21374512, + "step": 8026, + "time_per_iteration": 2.8466641902923584 + }, + { + "auxiliary_loss_clip": 0.01453411, + "auxiliary_loss_mlp": 0.01047567, + "balance_loss_clip": 1.27448559, + "balance_loss_mlp": 1.02571642, + "epoch": 0.48260934916578985, + "flos": 14327046351360.0, + "grad_norm": 2.287695772785659, + "language_loss": 0.76144081, + "learning_rate": 2.209340965060465e-06, + "loss": 0.78645062, + "num_input_tokens_seen": 172559005, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.21838379, + "step": 8027, + "time_per_iteration": 5.129932165145874 + }, + { + "auxiliary_loss_clip": 0.01446108, + "auxiliary_loss_mlp": 0.01044907, + "balance_loss_clip": 1.26997089, + "balance_loss_mlp": 1.02305627, + "epoch": 0.4826694724184578, + "flos": 22129690596480.0, + "grad_norm": 1.6297460318866361, + "language_loss": 0.6856913, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.71060145, + "num_input_tokens_seen": 172578435, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.21862793, + "step": 8028, + "time_per_iteration": 4.345627069473267 + }, + { + "auxiliary_loss_clip": 0.01445677, + "auxiliary_loss_mlp": 0.01038641, + "balance_loss_clip": 1.27115953, + "balance_loss_mlp": 1.01670611, + "epoch": 0.48272959567112583, + "flos": 16190034013440.0, + "grad_norm": 1.694295402777243, + "language_loss": 0.74138522, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.76622844, + "num_input_tokens_seen": 172596095, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.21936035, + "step": 8029, + "time_per_iteration": 2.8974924087524414 + }, + { + "auxiliary_loss_clip": 0.01451874, + "auxiliary_loss_mlp": 0.01039176, + "balance_loss_clip": 1.2736274, + "balance_loss_mlp": 1.01528645, + "epoch": 0.4827897189237938, + "flos": 23189571475200.0, + "grad_norm": 1.9546767708510602, + "language_loss": 0.84901345, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.8739239, + "num_input_tokens_seen": 172615255, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.2388916, + "step": 8030, + "time_per_iteration": 2.9009616374969482 + }, + { + "auxiliary_loss_clip": 0.01437047, + "auxiliary_loss_mlp": 0.01039586, + "balance_loss_clip": 1.2634902, + "balance_loss_mlp": 1.01754391, + "epoch": 0.48284984217646176, + "flos": 21662545587840.0, + "grad_norm": 2.0742542092825196, + "language_loss": 0.74431676, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.76908308, + "num_input_tokens_seen": 172633185, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.22045898, + "step": 8031, + "time_per_iteration": 2.8582918643951416 + }, + { + "auxiliary_loss_clip": 0.01473567, + "auxiliary_loss_mlp": 0.01042698, + "balance_loss_clip": 1.29095435, + "balance_loss_mlp": 1.02019143, + "epoch": 0.48290996542912973, + "flos": 31479947112960.0, + "grad_norm": 1.5975229955690644, + "language_loss": 0.72770518, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.75286782, + "num_input_tokens_seen": 172654280, + "router_z_loss_clip": 1.82714844, + "router_z_loss_mlp": 0.22521973, + "step": 8032, + "time_per_iteration": 2.93294095993042 + }, + { + "auxiliary_loss_clip": 0.01440138, + "auxiliary_loss_mlp": 0.01032987, + "balance_loss_clip": 1.26621902, + "balance_loss_mlp": 1.01105213, + "epoch": 0.4829700886817977, + "flos": 24472582300800.0, + "grad_norm": 1.38540646951421, + "language_loss": 0.75057137, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.77530259, + "num_input_tokens_seen": 172675545, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.21936035, + "step": 8033, + "time_per_iteration": 2.960073471069336 + }, + { + "auxiliary_loss_clip": 0.01468895, + "auxiliary_loss_mlp": 0.01043461, + "balance_loss_clip": 1.28767824, + "balance_loss_mlp": 1.02180076, + "epoch": 0.48303021193446566, + "flos": 25713035752320.0, + "grad_norm": 3.0041930592867563, + "language_loss": 0.84033597, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.86545956, + "num_input_tokens_seen": 172696455, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.2166748, + "step": 8034, + "time_per_iteration": 2.8878417015075684 + }, + { + "auxiliary_loss_clip": 0.01429851, + "auxiliary_loss_mlp": 0.01039789, + "balance_loss_clip": 1.25885224, + "balance_loss_mlp": 1.01777136, + "epoch": 0.4830903351871336, + "flos": 20095134076800.0, + "grad_norm": 1.9106655716120213, + "language_loss": 0.80411536, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.82881176, + "num_input_tokens_seen": 172716720, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.22021484, + "step": 8035, + "time_per_iteration": 2.891605854034424 + }, + { + "auxiliary_loss_clip": 0.01437059, + "auxiliary_loss_mlp": 0.01043221, + "balance_loss_clip": 1.26182795, + "balance_loss_mlp": 1.01885474, + "epoch": 0.4831504584398016, + "flos": 39465335681280.0, + "grad_norm": 2.438277291074345, + "language_loss": 0.70340228, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.72820503, + "num_input_tokens_seen": 172737435, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.24365234, + "step": 8036, + "time_per_iteration": 2.9818954467773438 + }, + { + "auxiliary_loss_clip": 0.01447352, + "auxiliary_loss_mlp": 0.01043407, + "balance_loss_clip": 1.27193117, + "balance_loss_mlp": 1.02248573, + "epoch": 0.48321058169246955, + "flos": 20014996256640.0, + "grad_norm": 1.9817730381958845, + "language_loss": 0.73786223, + "learning_rate": 2.205467347074847e-06, + "loss": 0.76276982, + "num_input_tokens_seen": 172755700, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.20910645, + "step": 8037, + "time_per_iteration": 2.845090866088867 + }, + { + "auxiliary_loss_clip": 0.01467061, + "auxiliary_loss_mlp": 0.01039962, + "balance_loss_clip": 1.28468919, + "balance_loss_mlp": 1.01781321, + "epoch": 0.4832707049451375, + "flos": 20751402925440.0, + "grad_norm": 2.0588494341109813, + "language_loss": 0.70228326, + "learning_rate": 2.205079942181525e-06, + "loss": 0.72735345, + "num_input_tokens_seen": 172775185, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.22143555, + "step": 8038, + "time_per_iteration": 2.9087917804718018 + }, + { + "auxiliary_loss_clip": 0.01437536, + "auxiliary_loss_mlp": 0.01043383, + "balance_loss_clip": 1.26288116, + "balance_loss_mlp": 1.01967251, + "epoch": 0.4833308281978055, + "flos": 33158471149440.0, + "grad_norm": 1.5105880071092512, + "language_loss": 0.79417861, + "learning_rate": 2.20469252951155e-06, + "loss": 0.81898779, + "num_input_tokens_seen": 172796990, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.23706055, + "step": 8039, + "time_per_iteration": 2.995439052581787 + }, + { + "auxiliary_loss_clip": 0.01435852, + "auxiliary_loss_mlp": 0.01041466, + "balance_loss_clip": 1.26187956, + "balance_loss_mlp": 1.02005601, + "epoch": 0.48339095145047345, + "flos": 19108649543040.0, + "grad_norm": 1.639877376062746, + "language_loss": 0.78355777, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.80833101, + "num_input_tokens_seen": 172814915, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.21435547, + "step": 8040, + "time_per_iteration": 3.003995895385742 + }, + { + "auxiliary_loss_clip": 0.01450697, + "auxiliary_loss_mlp": 0.01040162, + "balance_loss_clip": 1.27450788, + "balance_loss_mlp": 1.01733363, + "epoch": 0.4834510747031414, + "flos": 34472773393920.0, + "grad_norm": 1.4730316763011786, + "language_loss": 0.76194346, + "learning_rate": 2.203917680900409e-06, + "loss": 0.786852, + "num_input_tokens_seen": 172837060, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.22839355, + "step": 8041, + "time_per_iteration": 2.9654502868652344 + }, + { + "auxiliary_loss_clip": 0.0142614, + "auxiliary_loss_mlp": 0.01039896, + "balance_loss_clip": 1.25677872, + "balance_loss_mlp": 1.01799703, + "epoch": 0.48351119795580944, + "flos": 27392238460800.0, + "grad_norm": 2.232451265957086, + "language_loss": 0.6772902, + "learning_rate": 2.203530244988624e-06, + "loss": 0.70195055, + "num_input_tokens_seen": 172856545, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.21911621, + "step": 8042, + "time_per_iteration": 2.9001424312591553 + }, + { + "auxiliary_loss_clip": 0.01259886, + "auxiliary_loss_mlp": 0.01047734, + "balance_loss_clip": 1.15897596, + "balance_loss_mlp": 1.01912332, + "epoch": 0.4835713212084774, + "flos": 67173465208320.0, + "grad_norm": 0.701833726374452, + "language_loss": 0.58668327, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.60975945, + "num_input_tokens_seen": 172923055, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.28515625, + "step": 8043, + "time_per_iteration": 3.4702389240264893 + }, + { + "auxiliary_loss_clip": 0.01448629, + "auxiliary_loss_mlp": 0.01044365, + "balance_loss_clip": 1.27165091, + "balance_loss_mlp": 1.02190566, + "epoch": 0.48363144446114537, + "flos": 17976412949760.0, + "grad_norm": 3.1943441738863907, + "language_loss": 0.73546666, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.7603966, + "num_input_tokens_seen": 172940700, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.2244873, + "step": 8044, + "time_per_iteration": 2.842984199523926 + }, + { + "auxiliary_loss_clip": 0.01439386, + "auxiliary_loss_mlp": 0.01037541, + "balance_loss_clip": 1.26619899, + "balance_loss_mlp": 1.01372313, + "epoch": 0.48369156771381333, + "flos": 20602890933120.0, + "grad_norm": 1.5918803391866734, + "language_loss": 0.76196301, + "learning_rate": 2.202367891004714e-06, + "loss": 0.78673232, + "num_input_tokens_seen": 172961125, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.23803711, + "step": 8045, + "time_per_iteration": 2.8632731437683105 + }, + { + "auxiliary_loss_clip": 0.01448876, + "auxiliary_loss_mlp": 0.01039652, + "balance_loss_clip": 1.27292132, + "balance_loss_mlp": 1.01753855, + "epoch": 0.4837516909664813, + "flos": 22685479447680.0, + "grad_norm": 1.5220760521722976, + "language_loss": 0.69723201, + "learning_rate": 2.201980424309533e-06, + "loss": 0.72211725, + "num_input_tokens_seen": 172980405, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.22106934, + "step": 8046, + "time_per_iteration": 2.8912031650543213 + }, + { + "auxiliary_loss_clip": 0.01437976, + "auxiliary_loss_mlp": 0.01039873, + "balance_loss_clip": 1.26261163, + "balance_loss_mlp": 1.01619804, + "epoch": 0.48381181421914926, + "flos": 25529160309120.0, + "grad_norm": 1.751074824553134, + "language_loss": 0.82639217, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.85117066, + "num_input_tokens_seen": 172999105, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.23706055, + "step": 8047, + "time_per_iteration": 2.875175952911377 + }, + { + "auxiliary_loss_clip": 0.01427702, + "auxiliary_loss_mlp": 0.01037186, + "balance_loss_clip": 1.25598717, + "balance_loss_mlp": 1.01528692, + "epoch": 0.4838719374718172, + "flos": 24218296669440.0, + "grad_norm": 1.8071422837358269, + "language_loss": 0.80980414, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.83445305, + "num_input_tokens_seen": 173019935, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.21899414, + "step": 8048, + "time_per_iteration": 2.8864898681640625 + }, + { + "auxiliary_loss_clip": 0.01459169, + "auxiliary_loss_mlp": 0.0104431, + "balance_loss_clip": 1.27935255, + "balance_loss_mlp": 1.02088594, + "epoch": 0.4839320607244852, + "flos": 26735924367360.0, + "grad_norm": 2.4998326903685397, + "language_loss": 0.8234061, + "learning_rate": 2.200817978328054e-06, + "loss": 0.84844089, + "num_input_tokens_seen": 173039700, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.23425293, + "step": 8049, + "time_per_iteration": 2.925243377685547 + }, + { + "auxiliary_loss_clip": 0.01438797, + "auxiliary_loss_mlp": 0.010353, + "balance_loss_clip": 1.26761174, + "balance_loss_mlp": 1.01443839, + "epoch": 0.48399218397715316, + "flos": 20458586707200.0, + "grad_norm": 1.8061718237111914, + "language_loss": 0.73694795, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.76168889, + "num_input_tokens_seen": 173059170, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.20861816, + "step": 8050, + "time_per_iteration": 2.9233829975128174 + }, + { + "auxiliary_loss_clip": 0.0124702, + "auxiliary_loss_mlp": 0.01047151, + "balance_loss_clip": 1.1496799, + "balance_loss_mlp": 1.01739621, + "epoch": 0.4840523072298211, + "flos": 67210140758400.0, + "grad_norm": 0.7091427012831094, + "language_loss": 0.56415063, + "learning_rate": 2.200042976240723e-06, + "loss": 0.58709234, + "num_input_tokens_seen": 173119000, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.296875, + "step": 8051, + "time_per_iteration": 3.397515058517456 + }, + { + "auxiliary_loss_clip": 0.01448871, + "auxiliary_loss_mlp": 0.01039412, + "balance_loss_clip": 1.272434, + "balance_loss_mlp": 1.01644003, + "epoch": 0.4841124304824891, + "flos": 22419701637120.0, + "grad_norm": 4.034535896796491, + "language_loss": 0.76096451, + "learning_rate": 2.199655463811236e-06, + "loss": 0.78584731, + "num_input_tokens_seen": 173137570, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.2298584, + "step": 8052, + "time_per_iteration": 4.300498008728027 + }, + { + "auxiliary_loss_clip": 0.01449626, + "auxiliary_loss_mlp": 0.01037336, + "balance_loss_clip": 1.27585959, + "balance_loss_mlp": 1.014853, + "epoch": 0.48417255373515705, + "flos": 13851666789120.0, + "grad_norm": 4.270077672584764, + "language_loss": 0.67061222, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.6954819, + "num_input_tokens_seen": 173154355, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.22497559, + "step": 8053, + "time_per_iteration": 2.833307981491089 + }, + { + "auxiliary_loss_clip": 0.01439411, + "auxiliary_loss_mlp": 0.01040292, + "balance_loss_clip": 1.26779151, + "balance_loss_mlp": 1.01767755, + "epoch": 0.484232676987825, + "flos": 31662193743360.0, + "grad_norm": 2.61514655344533, + "language_loss": 0.70877516, + "learning_rate": 2.198880416254091e-06, + "loss": 0.73357219, + "num_input_tokens_seen": 173174845, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.22607422, + "step": 8054, + "time_per_iteration": 2.925290822982788 + }, + { + "auxiliary_loss_clip": 0.01438566, + "auxiliary_loss_mlp": 0.01034911, + "balance_loss_clip": 1.26523471, + "balance_loss_mlp": 1.01347721, + "epoch": 0.48429280024049304, + "flos": 24105464841600.0, + "grad_norm": 2.3256861410830854, + "language_loss": 0.70055044, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.72528523, + "num_input_tokens_seen": 173195025, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.21435547, + "step": 8055, + "time_per_iteration": 2.8612911701202393 + }, + { + "auxiliary_loss_clip": 0.01460316, + "auxiliary_loss_mlp": 0.01042063, + "balance_loss_clip": 1.28302574, + "balance_loss_mlp": 1.01813757, + "epoch": 0.484352923493161, + "flos": 17539021036800.0, + "grad_norm": 2.6151571569005796, + "language_loss": 0.63861001, + "learning_rate": 2.198105338530685e-06, + "loss": 0.66363382, + "num_input_tokens_seen": 173213065, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.23913574, + "step": 8056, + "time_per_iteration": 2.8068222999572754 + }, + { + "auxiliary_loss_clip": 0.01441721, + "auxiliary_loss_mlp": 0.01040925, + "balance_loss_clip": 1.26638496, + "balance_loss_mlp": 1.01711941, + "epoch": 0.48441304674582897, + "flos": 29178255438720.0, + "grad_norm": 1.6464511326427886, + "language_loss": 0.6823284, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.70715487, + "num_input_tokens_seen": 173234545, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.23803711, + "step": 8057, + "time_per_iteration": 2.9068243503570557 + }, + { + "auxiliary_loss_clip": 0.01425192, + "auxiliary_loss_mlp": 0.01038694, + "balance_loss_clip": 1.25356436, + "balance_loss_mlp": 1.01505506, + "epoch": 0.48447316999849693, + "flos": 15894819820800.0, + "grad_norm": 1.7583110916200952, + "language_loss": 0.82154047, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.84617937, + "num_input_tokens_seen": 173252175, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.23632812, + "step": 8058, + "time_per_iteration": 2.8482937812805176 + }, + { + "auxiliary_loss_clip": 0.01456847, + "auxiliary_loss_mlp": 0.01045728, + "balance_loss_clip": 1.27584779, + "balance_loss_mlp": 1.02263761, + "epoch": 0.4845332932511649, + "flos": 24390046506240.0, + "grad_norm": 1.6365331806822792, + "language_loss": 0.80985039, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.83487618, + "num_input_tokens_seen": 173268790, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.23095703, + "step": 8059, + "time_per_iteration": 2.8700923919677734 + }, + { + "auxiliary_loss_clip": 0.01466689, + "auxiliary_loss_mlp": 0.01047048, + "balance_loss_clip": 1.2863214, + "balance_loss_mlp": 1.02352798, + "epoch": 0.48459341650383286, + "flos": 37128823493760.0, + "grad_norm": 2.2015483155723805, + "language_loss": 0.66673303, + "learning_rate": 2.196555093055352e-06, + "loss": 0.69187033, + "num_input_tokens_seen": 173288030, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.23522949, + "step": 8060, + "time_per_iteration": 3.029085874557495 + }, + { + "auxiliary_loss_clip": 0.01452095, + "auxiliary_loss_mlp": 0.01043536, + "balance_loss_clip": 1.276999, + "balance_loss_mlp": 1.02089787, + "epoch": 0.48465353975650083, + "flos": 22977209790720.0, + "grad_norm": 1.9607669267781869, + "language_loss": 0.68190807, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.70686436, + "num_input_tokens_seen": 173305965, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.22619629, + "step": 8061, + "time_per_iteration": 2.8707222938537598 + }, + { + "auxiliary_loss_clip": 0.01457512, + "auxiliary_loss_mlp": 0.01053537, + "balance_loss_clip": 1.28130221, + "balance_loss_mlp": 1.03027976, + "epoch": 0.4847136630091688, + "flos": 17715431088000.0, + "grad_norm": 1.7687183212398327, + "language_loss": 0.83119237, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.85630298, + "num_input_tokens_seen": 173321985, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.23242188, + "step": 8062, + "time_per_iteration": 4.433736801147461 + }, + { + "auxiliary_loss_clip": 0.01438039, + "auxiliary_loss_mlp": 0.01044369, + "balance_loss_clip": 1.26651371, + "balance_loss_mlp": 1.02229166, + "epoch": 0.48477378626183676, + "flos": 22028486682240.0, + "grad_norm": 1.5396134741962573, + "language_loss": 0.74999231, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.77481639, + "num_input_tokens_seen": 173341315, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.2208252, + "step": 8063, + "time_per_iteration": 5.73506760597229 + }, + { + "auxiliary_loss_clip": 0.01444597, + "auxiliary_loss_mlp": 0.01044016, + "balance_loss_clip": 1.26917815, + "balance_loss_mlp": 1.02211738, + "epoch": 0.4848339095145047, + "flos": 27974070334080.0, + "grad_norm": 1.868383136602539, + "language_loss": 0.79385269, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.81873882, + "num_input_tokens_seen": 173361055, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.21911621, + "step": 8064, + "time_per_iteration": 2.885828971862793 + }, + { + "auxiliary_loss_clip": 0.01436847, + "auxiliary_loss_mlp": 0.01042506, + "balance_loss_clip": 1.2658658, + "balance_loss_mlp": 1.02065492, + "epoch": 0.4848940327671727, + "flos": 21698587710720.0, + "grad_norm": 3.1285754680966034, + "language_loss": 0.79690337, + "learning_rate": 2.194617118620173e-06, + "loss": 0.82169694, + "num_input_tokens_seen": 173379255, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.21850586, + "step": 8065, + "time_per_iteration": 2.880333662033081 + }, + { + "auxiliary_loss_clip": 0.01419716, + "auxiliary_loss_mlp": 0.01044171, + "balance_loss_clip": 1.25403714, + "balance_loss_mlp": 1.02210593, + "epoch": 0.48495415601984065, + "flos": 20641376275200.0, + "grad_norm": 2.8071562682373865, + "language_loss": 0.76571661, + "learning_rate": 2.194229501534644e-06, + "loss": 0.7903555, + "num_input_tokens_seen": 173398370, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.22058105, + "step": 8066, + "time_per_iteration": 2.8433468341827393 + }, + { + "auxiliary_loss_clip": 0.01443366, + "auxiliary_loss_mlp": 0.01042575, + "balance_loss_clip": 1.27160716, + "balance_loss_mlp": 1.01988912, + "epoch": 0.4850142792725086, + "flos": 25638734511360.0, + "grad_norm": 1.3686760721919282, + "language_loss": 0.72368866, + "learning_rate": 2.193841877083912e-06, + "loss": 0.74854809, + "num_input_tokens_seen": 173419595, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.22692871, + "step": 8067, + "time_per_iteration": 2.891291618347168 + }, + { + "auxiliary_loss_clip": 0.01446076, + "auxiliary_loss_mlp": 0.01045048, + "balance_loss_clip": 1.27179384, + "balance_loss_mlp": 1.02349544, + "epoch": 0.4850744025251766, + "flos": 13779899256960.0, + "grad_norm": 3.8965206340136644, + "language_loss": 0.80188477, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.82679605, + "num_input_tokens_seen": 173435390, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.21557617, + "step": 8068, + "time_per_iteration": 2.813518524169922 + }, + { + "auxiliary_loss_clip": 0.01442591, + "auxiliary_loss_mlp": 0.01044814, + "balance_loss_clip": 1.27020168, + "balance_loss_mlp": 1.02285576, + "epoch": 0.4851345257778446, + "flos": 20269643846400.0, + "grad_norm": 1.4686560825483552, + "language_loss": 0.84847629, + "learning_rate": 2.193066606145638e-06, + "loss": 0.87335038, + "num_input_tokens_seen": 173454095, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.21960449, + "step": 8069, + "time_per_iteration": 2.8704187870025635 + }, + { + "auxiliary_loss_clip": 0.01438317, + "auxiliary_loss_mlp": 0.0104364, + "balance_loss_clip": 1.26679838, + "balance_loss_mlp": 1.02228904, + "epoch": 0.48519464903051257, + "flos": 27101186789760.0, + "grad_norm": 1.6504494422275644, + "language_loss": 0.78307521, + "learning_rate": 2.192678959687493e-06, + "loss": 0.80789477, + "num_input_tokens_seen": 173475300, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.21362305, + "step": 8070, + "time_per_iteration": 2.8899312019348145 + }, + { + "auxiliary_loss_clip": 0.01435661, + "auxiliary_loss_mlp": 0.01040032, + "balance_loss_clip": 1.26295114, + "balance_loss_mlp": 1.01773977, + "epoch": 0.48525477228318054, + "flos": 17135680475520.0, + "grad_norm": 2.252941075151758, + "language_loss": 0.79155713, + "learning_rate": 2.192291305922943e-06, + "loss": 0.8163141, + "num_input_tokens_seen": 173492005, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.22290039, + "step": 8071, + "time_per_iteration": 2.826347827911377 + }, + { + "auxiliary_loss_clip": 0.01445201, + "auxiliary_loss_mlp": 0.01044551, + "balance_loss_clip": 1.26935768, + "balance_loss_mlp": 1.02105451, + "epoch": 0.4853148955358485, + "flos": 28191092232960.0, + "grad_norm": 2.006240430079787, + "language_loss": 0.7280674, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.75296485, + "num_input_tokens_seen": 173511995, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.23510742, + "step": 8072, + "time_per_iteration": 2.919768810272217 + }, + { + "auxiliary_loss_clip": 0.01457559, + "auxiliary_loss_mlp": 0.01043555, + "balance_loss_clip": 1.28111303, + "balance_loss_mlp": 1.02227569, + "epoch": 0.48537501878851647, + "flos": 17502164507520.0, + "grad_norm": 1.9385614401495233, + "language_loss": 0.88705039, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.91206157, + "num_input_tokens_seen": 173530215, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.21276855, + "step": 8073, + "time_per_iteration": 2.8192501068115234 + }, + { + "auxiliary_loss_clip": 0.01429108, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.26142657, + "balance_loss_mlp": 1.01591778, + "epoch": 0.48543514204118443, + "flos": 28596559299840.0, + "grad_norm": 1.910572128136991, + "language_loss": 0.61617911, + "learning_rate": 2.19112830093786e-06, + "loss": 0.64084423, + "num_input_tokens_seen": 173550920, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.21459961, + "step": 8074, + "time_per_iteration": 2.9331893920898438 + }, + { + "auxiliary_loss_clip": 0.01437058, + "auxiliary_loss_mlp": 0.01044356, + "balance_loss_clip": 1.26278901, + "balance_loss_mlp": 1.02184951, + "epoch": 0.4854952652938524, + "flos": 20969872657920.0, + "grad_norm": 1.84367120633218, + "language_loss": 0.73501194, + "learning_rate": 2.19074061809469e-06, + "loss": 0.75982606, + "num_input_tokens_seen": 173569065, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.22521973, + "step": 8075, + "time_per_iteration": 2.8330326080322266 + }, + { + "auxiliary_loss_clip": 0.01424549, + "auxiliary_loss_mlp": 0.01039199, + "balance_loss_clip": 1.25682068, + "balance_loss_mlp": 1.01806331, + "epoch": 0.48555538854652036, + "flos": 66550958259840.0, + "grad_norm": 1.6331113398150123, + "language_loss": 0.82280684, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.8474443, + "num_input_tokens_seen": 173596085, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.21142578, + "step": 8076, + "time_per_iteration": 3.2422988414764404 + }, + { + "auxiliary_loss_clip": 0.01442424, + "auxiliary_loss_mlp": 0.01040012, + "balance_loss_clip": 1.26951718, + "balance_loss_mlp": 1.01769567, + "epoch": 0.4856155117991883, + "flos": 15933576631680.0, + "grad_norm": 2.092076568716854, + "language_loss": 0.8722049, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.89702922, + "num_input_tokens_seen": 173613900, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.22338867, + "step": 8077, + "time_per_iteration": 2.8467464447021484 + }, + { + "auxiliary_loss_clip": 0.0122409, + "auxiliary_loss_mlp": 0.01042238, + "balance_loss_clip": 1.1306777, + "balance_loss_mlp": 1.01858711, + "epoch": 0.4856756350518563, + "flos": 71077796110080.0, + "grad_norm": 0.9129896384808979, + "language_loss": 0.58601314, + "learning_rate": 2.189577526226564e-06, + "loss": 0.60867643, + "num_input_tokens_seen": 173671305, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.23632812, + "step": 8078, + "time_per_iteration": 3.3495171070098877 + }, + { + "auxiliary_loss_clip": 0.01452484, + "auxiliary_loss_mlp": 0.01040098, + "balance_loss_clip": 1.27526987, + "balance_loss_mlp": 1.0188663, + "epoch": 0.48573575830452426, + "flos": 29837781912960.0, + "grad_norm": 1.7341041488366238, + "language_loss": 0.72863466, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.75356042, + "num_input_tokens_seen": 173692070, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.21228027, + "step": 8079, + "time_per_iteration": 2.950404405593872 + }, + { + "auxiliary_loss_clip": 0.01449127, + "auxiliary_loss_mlp": 0.01039992, + "balance_loss_clip": 1.27311623, + "balance_loss_mlp": 1.0195353, + "epoch": 0.4857958815571922, + "flos": 17648459504640.0, + "grad_norm": 2.109474449002977, + "language_loss": 0.80138928, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.82628047, + "num_input_tokens_seen": 173709785, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.20458984, + "step": 8080, + "time_per_iteration": 2.833333969116211 + }, + { + "auxiliary_loss_clip": 0.01446088, + "auxiliary_loss_mlp": 0.01038657, + "balance_loss_clip": 1.27214074, + "balance_loss_mlp": 1.01736593, + "epoch": 0.4858560048098602, + "flos": 21115308003840.0, + "grad_norm": 2.14373920976841, + "language_loss": 0.85284358, + "learning_rate": 2.188414369659251e-06, + "loss": 0.87769103, + "num_input_tokens_seen": 173728770, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.2130127, + "step": 8081, + "time_per_iteration": 2.86248517036438 + }, + { + "auxiliary_loss_clip": 0.01426787, + "auxiliary_loss_mlp": 0.0103981, + "balance_loss_clip": 1.25496471, + "balance_loss_mlp": 1.01801848, + "epoch": 0.4859161280625282, + "flos": 22100978131200.0, + "grad_norm": 1.7302285115805585, + "language_loss": 0.84254527, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.86721122, + "num_input_tokens_seen": 173747355, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.21789551, + "step": 8082, + "time_per_iteration": 2.8618814945220947 + }, + { + "auxiliary_loss_clip": 0.0143099, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.26266766, + "balance_loss_mlp": 1.01608443, + "epoch": 0.4859762513151962, + "flos": 17502345486720.0, + "grad_norm": 2.1290044950941787, + "language_loss": 0.88055992, + "learning_rate": 2.187638896199746e-06, + "loss": 0.90524375, + "num_input_tokens_seen": 173764825, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.21313477, + "step": 8083, + "time_per_iteration": 2.835001230239868 + }, + { + "auxiliary_loss_clip": 0.01420226, + "auxiliary_loss_mlp": 0.01042531, + "balance_loss_clip": 1.25270808, + "balance_loss_mlp": 1.02088261, + "epoch": 0.48603637456786414, + "flos": 18013269479040.0, + "grad_norm": 2.348167939239756, + "language_loss": 0.82290447, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.84753203, + "num_input_tokens_seen": 173783215, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.21643066, + "step": 8084, + "time_per_iteration": 2.8705813884735107 + }, + { + "auxiliary_loss_clip": 0.01443965, + "auxiliary_loss_mlp": 0.01040847, + "balance_loss_clip": 1.2703619, + "balance_loss_mlp": 1.02043831, + "epoch": 0.4860964978205321, + "flos": 22502237431680.0, + "grad_norm": 4.114739424010158, + "language_loss": 0.68081319, + "learning_rate": 2.186863394279098e-06, + "loss": 0.7056613, + "num_input_tokens_seen": 173801905, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.20422363, + "step": 8085, + "time_per_iteration": 2.8588743209838867 + }, + { + "auxiliary_loss_clip": 0.01432467, + "auxiliary_loss_mlp": 0.01036447, + "balance_loss_clip": 1.26148438, + "balance_loss_mlp": 1.01460814, + "epoch": 0.48615662107320007, + "flos": 23384215180800.0, + "grad_norm": 1.503049634588074, + "language_loss": 0.77912819, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.80381733, + "num_input_tokens_seen": 173824690, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.21838379, + "step": 8086, + "time_per_iteration": 2.9791693687438965 + }, + { + "auxiliary_loss_clip": 0.01437015, + "auxiliary_loss_mlp": 0.01035235, + "balance_loss_clip": 1.26506555, + "balance_loss_mlp": 1.01401544, + "epoch": 0.48621674432586803, + "flos": 34431075671040.0, + "grad_norm": 2.0802476933038774, + "language_loss": 0.71267456, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.73739707, + "num_input_tokens_seen": 173844450, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.21228027, + "step": 8087, + "time_per_iteration": 4.35870099067688 + }, + { + "auxiliary_loss_clip": 0.01465144, + "auxiliary_loss_mlp": 0.01040952, + "balance_loss_clip": 1.28564024, + "balance_loss_mlp": 1.01894617, + "epoch": 0.486276867578536, + "flos": 33120031052160.0, + "grad_norm": 2.4138602395549613, + "language_loss": 0.73884028, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.76390129, + "num_input_tokens_seen": 173864975, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.22021484, + "step": 8088, + "time_per_iteration": 2.9477765560150146 + }, + { + "auxiliary_loss_clip": 0.01431126, + "auxiliary_loss_mlp": 0.01038769, + "balance_loss_clip": 1.26064634, + "balance_loss_mlp": 1.01751423, + "epoch": 0.48633699083120396, + "flos": 21480706160640.0, + "grad_norm": 1.6455066506297114, + "language_loss": 0.76423609, + "learning_rate": 2.185312305524892e-06, + "loss": 0.78893507, + "num_input_tokens_seen": 173883805, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.21264648, + "step": 8089, + "time_per_iteration": 2.850593328475952 + }, + { + "auxiliary_loss_clip": 0.01436726, + "auxiliary_loss_mlp": 0.0103878, + "balance_loss_clip": 1.26246011, + "balance_loss_mlp": 1.01601076, + "epoch": 0.48639711408387193, + "flos": 20094002956800.0, + "grad_norm": 1.6936691947522515, + "language_loss": 0.84970927, + "learning_rate": 2.184924515731926e-06, + "loss": 0.87446427, + "num_input_tokens_seen": 173903520, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.22753906, + "step": 8090, + "time_per_iteration": 2.8234190940856934 + }, + { + "auxiliary_loss_clip": 0.0141734, + "auxiliary_loss_mlp": 0.01034207, + "balance_loss_clip": 1.25151014, + "balance_loss_mlp": 1.01332104, + "epoch": 0.4864572373365399, + "flos": 20789073861120.0, + "grad_norm": 5.207403759525097, + "language_loss": 0.76844466, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.79296011, + "num_input_tokens_seen": 173924255, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.20898438, + "step": 8091, + "time_per_iteration": 2.850848913192749 + }, + { + "auxiliary_loss_clip": 0.01422565, + "auxiliary_loss_mlp": 0.01035524, + "balance_loss_clip": 1.25205886, + "balance_loss_mlp": 1.01429236, + "epoch": 0.48651736058920786, + "flos": 26034971639040.0, + "grad_norm": 1.5126120665784986, + "language_loss": 0.80743599, + "learning_rate": 2.184148915123631e-06, + "loss": 0.83201689, + "num_input_tokens_seen": 173943285, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.21228027, + "step": 8092, + "time_per_iteration": 2.899534225463867 + }, + { + "auxiliary_loss_clip": 0.01442825, + "auxiliary_loss_mlp": 0.01035762, + "balance_loss_clip": 1.26957977, + "balance_loss_mlp": 1.01407754, + "epoch": 0.4865774838418758, + "flos": 20495352746880.0, + "grad_norm": 1.6600810946338407, + "language_loss": 0.72852021, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.75330609, + "num_input_tokens_seen": 173962205, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.21704102, + "step": 8093, + "time_per_iteration": 2.8183488845825195 + }, + { + "auxiliary_loss_clip": 0.01428029, + "auxiliary_loss_mlp": 0.01035429, + "balance_loss_clip": 1.25924492, + "balance_loss_mlp": 1.01412654, + "epoch": 0.4866376070945438, + "flos": 23557548585600.0, + "grad_norm": 1.7828753424000217, + "language_loss": 0.69015515, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.71478975, + "num_input_tokens_seen": 173980945, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.2130127, + "step": 8094, + "time_per_iteration": 2.8831067085266113 + }, + { + "auxiliary_loss_clip": 0.01453888, + "auxiliary_loss_mlp": 0.01040509, + "balance_loss_clip": 1.27743804, + "balance_loss_mlp": 1.01976633, + "epoch": 0.4866977303472118, + "flos": 16699238703360.0, + "grad_norm": 2.2106419077448254, + "language_loss": 0.67152172, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.69646567, + "num_input_tokens_seen": 173998860, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.20727539, + "step": 8095, + "time_per_iteration": 2.96636700630188 + }, + { + "auxiliary_loss_clip": 0.01454434, + "auxiliary_loss_mlp": 0.01037894, + "balance_loss_clip": 1.28058422, + "balance_loss_mlp": 1.0159117, + "epoch": 0.4867578535998798, + "flos": 17905324089600.0, + "grad_norm": 1.8844350650391613, + "language_loss": 0.78903711, + "learning_rate": 2.182597630229345e-06, + "loss": 0.81396043, + "num_input_tokens_seen": 174016665, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.21972656, + "step": 8096, + "time_per_iteration": 2.8480963706970215 + }, + { + "auxiliary_loss_clip": 0.0143104, + "auxiliary_loss_mlp": 0.01037886, + "balance_loss_clip": 1.26022768, + "balance_loss_mlp": 1.01574922, + "epoch": 0.48681797685254774, + "flos": 22647989491200.0, + "grad_norm": 1.9197187802021536, + "language_loss": 0.68015182, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.70484108, + "num_input_tokens_seen": 174034800, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.22119141, + "step": 8097, + "time_per_iteration": 4.334709405899048 + }, + { + "auxiliary_loss_clip": 0.01435481, + "auxiliary_loss_mlp": 0.01037197, + "balance_loss_clip": 1.26474857, + "balance_loss_mlp": 1.01662147, + "epoch": 0.4868781001052157, + "flos": 20895707151360.0, + "grad_norm": 1.7183655076429076, + "language_loss": 0.72248709, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.74721384, + "num_input_tokens_seen": 174054445, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.20568848, + "step": 8098, + "time_per_iteration": 5.6493775844573975 + }, + { + "auxiliary_loss_clip": 0.01461568, + "auxiliary_loss_mlp": 0.01036209, + "balance_loss_clip": 1.28320515, + "balance_loss_mlp": 1.01329708, + "epoch": 0.48693822335788367, + "flos": 41990066812800.0, + "grad_norm": 2.0338062527418006, + "language_loss": 0.6685816, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.69355935, + "num_input_tokens_seen": 174077890, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.22900391, + "step": 8099, + "time_per_iteration": 3.1179182529449463 + }, + { + "auxiliary_loss_clip": 0.01433614, + "auxiliary_loss_mlp": 0.010415, + "balance_loss_clip": 1.26063704, + "balance_loss_mlp": 1.02001882, + "epoch": 0.48699834661055164, + "flos": 24253931589120.0, + "grad_norm": 1.9550738723194627, + "language_loss": 0.67769742, + "learning_rate": 2.181046234549138e-06, + "loss": 0.70244861, + "num_input_tokens_seen": 174097460, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.21484375, + "step": 8100, + "time_per_iteration": 2.862433671951294 + }, + { + "auxiliary_loss_clip": 0.01435036, + "auxiliary_loss_mlp": 0.0103768, + "balance_loss_clip": 1.26583946, + "balance_loss_mlp": 1.01692581, + "epoch": 0.4870584698632196, + "flos": 25935532272000.0, + "grad_norm": 1.4413725823341306, + "language_loss": 0.76973641, + "learning_rate": 2.180658368429088e-06, + "loss": 0.79446363, + "num_input_tokens_seen": 174120775, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.20739746, + "step": 8101, + "time_per_iteration": 2.93546462059021 + }, + { + "auxiliary_loss_clip": 0.01221442, + "auxiliary_loss_mlp": 0.0103349, + "balance_loss_clip": 1.12868667, + "balance_loss_mlp": 1.0131768, + "epoch": 0.48711859311588757, + "flos": 70243081194240.0, + "grad_norm": 0.6843800230410032, + "language_loss": 0.52421474, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.54676402, + "num_input_tokens_seen": 174189135, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.203125, + "step": 8102, + "time_per_iteration": 3.5113306045532227 + }, + { + "auxiliary_loss_clip": 0.01434911, + "auxiliary_loss_mlp": 0.01039328, + "balance_loss_clip": 1.262766, + "balance_loss_mlp": 1.01881194, + "epoch": 0.48717871636855553, + "flos": 12348964621440.0, + "grad_norm": 2.3693104330231414, + "language_loss": 0.74097693, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.76571929, + "num_input_tokens_seen": 174203250, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.20507812, + "step": 8103, + "time_per_iteration": 2.8283851146698 + }, + { + "auxiliary_loss_clip": 0.01442917, + "auxiliary_loss_mlp": 0.0103982, + "balance_loss_clip": 1.26881528, + "balance_loss_mlp": 1.01771879, + "epoch": 0.4872388396212235, + "flos": 23487590845440.0, + "grad_norm": 1.6214064846903644, + "language_loss": 0.63837671, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.66320407, + "num_input_tokens_seen": 174224145, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.22094727, + "step": 8104, + "time_per_iteration": 2.890270233154297 + }, + { + "auxiliary_loss_clip": 0.01432873, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.26150072, + "balance_loss_mlp": 1.01524627, + "epoch": 0.48729896287389146, + "flos": 31439199530880.0, + "grad_norm": 1.698156738404256, + "language_loss": 0.69928646, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.72398448, + "num_input_tokens_seen": 174244435, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.21691895, + "step": 8105, + "time_per_iteration": 2.9396603107452393 + }, + { + "auxiliary_loss_clip": 0.01430982, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.26085925, + "balance_loss_mlp": 1.01007748, + "epoch": 0.4873590861265594, + "flos": 19066092168960.0, + "grad_norm": 1.7735045158162335, + "language_loss": 0.74410719, + "learning_rate": 2.178718935364259e-06, + "loss": 0.76872742, + "num_input_tokens_seen": 174262710, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.20947266, + "step": 8106, + "time_per_iteration": 2.8169360160827637 + }, + { + "auxiliary_loss_clip": 0.01449256, + "auxiliary_loss_mlp": 0.01039099, + "balance_loss_clip": 1.27394724, + "balance_loss_mlp": 1.01666343, + "epoch": 0.4874192093792274, + "flos": 24357804946560.0, + "grad_norm": 1.7976885004382848, + "language_loss": 0.77436775, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.79925132, + "num_input_tokens_seen": 174281545, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.22436523, + "step": 8107, + "time_per_iteration": 2.8699493408203125 + }, + { + "auxiliary_loss_clip": 0.0142832, + "auxiliary_loss_mlp": 0.01033333, + "balance_loss_clip": 1.26163971, + "balance_loss_mlp": 1.01282859, + "epoch": 0.4874793326318954, + "flos": 23123052339840.0, + "grad_norm": 1.5656285412688782, + "language_loss": 0.75906229, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.78367889, + "num_input_tokens_seen": 174300290, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.20495605, + "step": 8108, + "time_per_iteration": 2.9119393825531006 + }, + { + "auxiliary_loss_clip": 0.0143535, + "auxiliary_loss_mlp": 0.01037277, + "balance_loss_clip": 1.26842618, + "balance_loss_mlp": 1.01777434, + "epoch": 0.4875394558845634, + "flos": 19035524666880.0, + "grad_norm": 1.937278042420797, + "language_loss": 0.74616641, + "learning_rate": 2.177555194083212e-06, + "loss": 0.77089274, + "num_input_tokens_seen": 174318490, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.19494629, + "step": 8109, + "time_per_iteration": 2.874976396560669 + }, + { + "auxiliary_loss_clip": 0.01432604, + "auxiliary_loss_mlp": 0.01036088, + "balance_loss_clip": 1.26474524, + "balance_loss_mlp": 1.01470184, + "epoch": 0.48759957913723134, + "flos": 21443442428160.0, + "grad_norm": 2.2552804293361945, + "language_loss": 0.785236, + "learning_rate": 2.177167266837428e-06, + "loss": 0.80992293, + "num_input_tokens_seen": 174335505, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.21398926, + "step": 8110, + "time_per_iteration": 2.8316781520843506 + }, + { + "auxiliary_loss_clip": 0.01438444, + "auxiliary_loss_mlp": 0.01041538, + "balance_loss_clip": 1.26768303, + "balance_loss_mlp": 1.02046227, + "epoch": 0.4876597023898993, + "flos": 17757581258880.0, + "grad_norm": 2.3937053373797017, + "language_loss": 0.73893827, + "learning_rate": 2.176779332873444e-06, + "loss": 0.76373804, + "num_input_tokens_seen": 174353990, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.21081543, + "step": 8111, + "time_per_iteration": 2.7993099689483643 + }, + { + "auxiliary_loss_clip": 0.01439993, + "auxiliary_loss_mlp": 0.01040548, + "balance_loss_clip": 1.27199674, + "balance_loss_mlp": 1.01934087, + "epoch": 0.4877198256425673, + "flos": 17028549492480.0, + "grad_norm": 2.7563350312291073, + "language_loss": 0.76499283, + "learning_rate": 2.17639139220597e-06, + "loss": 0.78979826, + "num_input_tokens_seen": 174373425, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.21203613, + "step": 8112, + "time_per_iteration": 2.845808982849121 + }, + { + "auxiliary_loss_clip": 0.01461038, + "auxiliary_loss_mlp": 0.01040964, + "balance_loss_clip": 1.28296244, + "balance_loss_mlp": 1.01889873, + "epoch": 0.48777994889523524, + "flos": 22394654000640.0, + "grad_norm": 2.24628151459118, + "language_loss": 0.7608242, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.78584421, + "num_input_tokens_seen": 174393070, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.22070312, + "step": 8113, + "time_per_iteration": 2.877472162246704 + }, + { + "auxiliary_loss_clip": 0.01231193, + "auxiliary_loss_mlp": 0.01069275, + "balance_loss_clip": 1.13772368, + "balance_loss_mlp": 1.04428899, + "epoch": 0.4878400721479032, + "flos": 61271977253760.0, + "grad_norm": 0.7965883454101493, + "language_loss": 0.48907876, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.51208341, + "num_input_tokens_seen": 174446880, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.25, + "step": 8114, + "time_per_iteration": 3.2386727333068848 + }, + { + "auxiliary_loss_clip": 0.01444637, + "auxiliary_loss_mlp": 0.01044874, + "balance_loss_clip": 1.27163064, + "balance_loss_mlp": 1.02270126, + "epoch": 0.48790019540057117, + "flos": 24547697948160.0, + "grad_norm": 1.3834542741610156, + "language_loss": 0.7778511, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.80274624, + "num_input_tokens_seen": 174468485, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.22167969, + "step": 8115, + "time_per_iteration": 2.9347805976867676 + }, + { + "auxiliary_loss_clip": 0.01456443, + "auxiliary_loss_mlp": 0.01044654, + "balance_loss_clip": 1.2812618, + "balance_loss_mlp": 1.02319646, + "epoch": 0.48796031865323913, + "flos": 21843615853440.0, + "grad_norm": 2.74312883504099, + "language_loss": 0.72508246, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.7500934, + "num_input_tokens_seen": 174486360, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.21447754, + "step": 8116, + "time_per_iteration": 2.860118865966797 + }, + { + "auxiliary_loss_clip": 0.01420356, + "auxiliary_loss_mlp": 0.01033645, + "balance_loss_clip": 1.25303376, + "balance_loss_mlp": 1.01261675, + "epoch": 0.4880204419059071, + "flos": 18598313733120.0, + "grad_norm": 2.4313775613708217, + "language_loss": 0.63809431, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.66263437, + "num_input_tokens_seen": 174505075, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.21032715, + "step": 8117, + "time_per_iteration": 2.9099063873291016 + }, + { + "auxiliary_loss_clip": 0.01436877, + "auxiliary_loss_mlp": 0.01034342, + "balance_loss_clip": 1.26570833, + "balance_loss_mlp": 1.01370645, + "epoch": 0.48808056515857506, + "flos": 19181593440000.0, + "grad_norm": 1.7193465027978267, + "language_loss": 0.80135679, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.826069, + "num_input_tokens_seen": 174523385, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.20629883, + "step": 8118, + "time_per_iteration": 2.860924243927002 + }, + { + "auxiliary_loss_clip": 0.01444277, + "auxiliary_loss_mlp": 0.0104047, + "balance_loss_clip": 1.27078712, + "balance_loss_mlp": 1.0197041, + "epoch": 0.48814068841124303, + "flos": 20130090324480.0, + "grad_norm": 2.1526762588888553, + "language_loss": 0.64086854, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.66571599, + "num_input_tokens_seen": 174542200, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.20776367, + "step": 8119, + "time_per_iteration": 2.827331304550171 + }, + { + "auxiliary_loss_clip": 0.01450205, + "auxiliary_loss_mlp": 0.01036763, + "balance_loss_clip": 1.27791524, + "balance_loss_mlp": 1.01580572, + "epoch": 0.488200811663911, + "flos": 22975716712320.0, + "grad_norm": 1.8976194225287937, + "language_loss": 0.72929507, + "learning_rate": 2.173287627305878e-06, + "loss": 0.7541647, + "num_input_tokens_seen": 174563620, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.20959473, + "step": 8120, + "time_per_iteration": 2.966766357421875 + }, + { + "auxiliary_loss_clip": 0.01461384, + "auxiliary_loss_mlp": 0.01039339, + "balance_loss_clip": 1.28731966, + "balance_loss_mlp": 1.01784539, + "epoch": 0.48826093491657896, + "flos": 33923499793920.0, + "grad_norm": 1.8560281707501027, + "language_loss": 0.64333105, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.6683383, + "num_input_tokens_seen": 174586465, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.21496582, + "step": 8121, + "time_per_iteration": 2.953099012374878 + }, + { + "auxiliary_loss_clip": 0.01460596, + "auxiliary_loss_mlp": 0.01041011, + "balance_loss_clip": 1.28407276, + "balance_loss_mlp": 1.01895738, + "epoch": 0.488321058169247, + "flos": 23079228111360.0, + "grad_norm": 1.953756880873776, + "language_loss": 0.84111011, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.86612618, + "num_input_tokens_seen": 174604035, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.22033691, + "step": 8122, + "time_per_iteration": 4.326294898986816 + }, + { + "auxiliary_loss_clip": 0.01469179, + "auxiliary_loss_mlp": 0.01045056, + "balance_loss_clip": 1.29410779, + "balance_loss_mlp": 1.02308607, + "epoch": 0.48838118142191494, + "flos": 19327074030720.0, + "grad_norm": 1.8255772912046255, + "language_loss": 0.86064613, + "learning_rate": 2.172123606640866e-06, + "loss": 0.88578844, + "num_input_tokens_seen": 174621715, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.21960449, + "step": 8123, + "time_per_iteration": 2.850022315979004 + }, + { + "auxiliary_loss_clip": 0.01464022, + "auxiliary_loss_mlp": 0.01045957, + "balance_loss_clip": 1.28754985, + "balance_loss_mlp": 1.02516723, + "epoch": 0.4884413046745829, + "flos": 25421576878080.0, + "grad_norm": 1.503435372386699, + "language_loss": 0.86007309, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.88517296, + "num_input_tokens_seen": 174643835, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.20800781, + "step": 8124, + "time_per_iteration": 2.896627187728882 + }, + { + "auxiliary_loss_clip": 0.01455055, + "auxiliary_loss_mlp": 0.01042745, + "balance_loss_clip": 1.28271019, + "balance_loss_mlp": 1.02137101, + "epoch": 0.4885014279272509, + "flos": 21000168691200.0, + "grad_norm": 2.0485485972877493, + "language_loss": 0.80284905, + "learning_rate": 2.171347560204948e-06, + "loss": 0.82782698, + "num_input_tokens_seen": 174660955, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.21362305, + "step": 8125, + "time_per_iteration": 2.864440441131592 + }, + { + "auxiliary_loss_clip": 0.01458041, + "auxiliary_loss_mlp": 0.01045961, + "balance_loss_clip": 1.28514218, + "balance_loss_mlp": 1.02472961, + "epoch": 0.48856155117991884, + "flos": 13779446808960.0, + "grad_norm": 2.276154741886507, + "language_loss": 0.7310468, + "learning_rate": 2.170959527233356e-06, + "loss": 0.75608683, + "num_input_tokens_seen": 174678270, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.21228027, + "step": 8126, + "time_per_iteration": 2.8201115131378174 + }, + { + "auxiliary_loss_clip": 0.01475823, + "auxiliary_loss_mlp": 0.01034141, + "balance_loss_clip": 1.29790938, + "balance_loss_mlp": 1.01481688, + "epoch": 0.4886216744325868, + "flos": 32100445307520.0, + "grad_norm": 1.683463162399402, + "language_loss": 0.6891672, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.7142669, + "num_input_tokens_seen": 174698360, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.1932373, + "step": 8127, + "time_per_iteration": 2.917506217956543 + }, + { + "auxiliary_loss_clip": 0.01466066, + "auxiliary_loss_mlp": 0.01044397, + "balance_loss_clip": 1.2884568, + "balance_loss_mlp": 1.02338052, + "epoch": 0.48868179768525477, + "flos": 19619618780160.0, + "grad_norm": 1.584792701769592, + "language_loss": 0.77209163, + "learning_rate": 2.170183441856481e-06, + "loss": 0.79719627, + "num_input_tokens_seen": 174716755, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.21020508, + "step": 8128, + "time_per_iteration": 2.851788282394409 + }, + { + "auxiliary_loss_clip": 0.01467662, + "auxiliary_loss_mlp": 0.01041751, + "balance_loss_clip": 1.29159367, + "balance_loss_mlp": 1.02119923, + "epoch": 0.48874192093792274, + "flos": 21296514003840.0, + "grad_norm": 1.5418513861336363, + "language_loss": 0.76918757, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.79428172, + "num_input_tokens_seen": 174735560, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.20556641, + "step": 8129, + "time_per_iteration": 2.8409652709960938 + }, + { + "auxiliary_loss_clip": 0.01477301, + "auxiliary_loss_mlp": 0.01038552, + "balance_loss_clip": 1.29986167, + "balance_loss_mlp": 1.01817966, + "epoch": 0.4888020441905907, + "flos": 14181520515840.0, + "grad_norm": 3.222392148337765, + "language_loss": 0.65420693, + "learning_rate": 2.169407330666114e-06, + "loss": 0.6793654, + "num_input_tokens_seen": 174752730, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.20373535, + "step": 8130, + "time_per_iteration": 2.8144986629486084 + }, + { + "auxiliary_loss_clip": 0.01455241, + "auxiliary_loss_mlp": 0.01032667, + "balance_loss_clip": 1.2827338, + "balance_loss_mlp": 1.01100659, + "epoch": 0.48886216744325867, + "flos": 24108631977600.0, + "grad_norm": 1.7308774045871818, + "language_loss": 0.72655904, + "learning_rate": 2.169019265427658e-06, + "loss": 0.75143808, + "num_input_tokens_seen": 174772520, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.21655273, + "step": 8131, + "time_per_iteration": 2.9621613025665283 + }, + { + "auxiliary_loss_clip": 0.01479564, + "auxiliary_loss_mlp": 0.01049076, + "balance_loss_clip": 1.3015635, + "balance_loss_mlp": 1.02585387, + "epoch": 0.48892229069592663, + "flos": 38444483082240.0, + "grad_norm": 1.4740493672193002, + "language_loss": 0.70379847, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.72908491, + "num_input_tokens_seen": 174796540, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.23205566, + "step": 8132, + "time_per_iteration": 4.395914077758789 + }, + { + "auxiliary_loss_clip": 0.01455157, + "auxiliary_loss_mlp": 0.01035624, + "balance_loss_clip": 1.28432226, + "balance_loss_mlp": 1.01404691, + "epoch": 0.4889824139485946, + "flos": 23854210611840.0, + "grad_norm": 1.3977969574562459, + "language_loss": 0.70919615, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.73410398, + "num_input_tokens_seen": 174817840, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.21582031, + "step": 8133, + "time_per_iteration": 5.6911046504974365 + }, + { + "auxiliary_loss_clip": 0.01449834, + "auxiliary_loss_mlp": 0.01038465, + "balance_loss_clip": 1.27768683, + "balance_loss_mlp": 1.01711464, + "epoch": 0.48904253720126256, + "flos": 24436540177920.0, + "grad_norm": 2.27269287371248, + "language_loss": 0.72038043, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.74526346, + "num_input_tokens_seen": 174837885, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.21350098, + "step": 8134, + "time_per_iteration": 2.951812744140625 + }, + { + "auxiliary_loss_clip": 0.01481212, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.30251372, + "balance_loss_mlp": 1.01541305, + "epoch": 0.4891026604539306, + "flos": 24181394895360.0, + "grad_norm": 2.8850364595340405, + "language_loss": 0.81488162, + "learning_rate": 2.167466940528718e-06, + "loss": 0.84005582, + "num_input_tokens_seen": 174855240, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.20800781, + "step": 8135, + "time_per_iteration": 2.8771300315856934 + }, + { + "auxiliary_loss_clip": 0.01459283, + "auxiliary_loss_mlp": 0.01037196, + "balance_loss_clip": 1.28797078, + "balance_loss_mlp": 1.01793146, + "epoch": 0.48916278370659855, + "flos": 21481022874240.0, + "grad_norm": 1.6370145905378466, + "language_loss": 0.75193465, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.77689946, + "num_input_tokens_seen": 174875145, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.19274902, + "step": 8136, + "time_per_iteration": 2.9013381004333496 + }, + { + "auxiliary_loss_clip": 0.01465509, + "auxiliary_loss_mlp": 0.01037904, + "balance_loss_clip": 1.29330587, + "balance_loss_mlp": 1.01612484, + "epoch": 0.4892229069592665, + "flos": 22319945556480.0, + "grad_norm": 1.4321808075690805, + "language_loss": 0.74365371, + "learning_rate": 2.166690739918204e-06, + "loss": 0.76868784, + "num_input_tokens_seen": 174894770, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.21777344, + "step": 8137, + "time_per_iteration": 2.9524285793304443 + }, + { + "auxiliary_loss_clip": 0.01464244, + "auxiliary_loss_mlp": 0.01039202, + "balance_loss_clip": 1.28843021, + "balance_loss_mlp": 1.01855516, + "epoch": 0.4892830302119345, + "flos": 12794545843200.0, + "grad_norm": 2.628099369172474, + "language_loss": 0.7667135, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.79174793, + "num_input_tokens_seen": 174912780, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.20666504, + "step": 8138, + "time_per_iteration": 2.8581910133361816 + }, + { + "auxiliary_loss_clip": 0.01460348, + "auxiliary_loss_mlp": 0.01040332, + "balance_loss_clip": 1.28933573, + "balance_loss_mlp": 1.01891029, + "epoch": 0.48934315346460244, + "flos": 20823713395200.0, + "grad_norm": 1.659086764356683, + "language_loss": 0.74862987, + "learning_rate": 2.165914514023972e-06, + "loss": 0.7736367, + "num_input_tokens_seen": 174931250, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.2142334, + "step": 8139, + "time_per_iteration": 2.993156909942627 + }, + { + "auxiliary_loss_clip": 0.01478292, + "auxiliary_loss_mlp": 0.01038599, + "balance_loss_clip": 1.30192947, + "balance_loss_mlp": 1.01716518, + "epoch": 0.4894032767172704, + "flos": 19765008881280.0, + "grad_norm": 5.4852455055334755, + "language_loss": 0.62823761, + "learning_rate": 2.165526391632255e-06, + "loss": 0.6534065, + "num_input_tokens_seen": 174951105, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.21435547, + "step": 8140, + "time_per_iteration": 2.9858992099761963 + }, + { + "auxiliary_loss_clip": 0.01474058, + "auxiliary_loss_mlp": 0.01038881, + "balance_loss_clip": 1.29551053, + "balance_loss_mlp": 1.0162555, + "epoch": 0.4894633999699384, + "flos": 17827222285440.0, + "grad_norm": 2.1255302553734805, + "language_loss": 0.82898653, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.85411596, + "num_input_tokens_seen": 174969120, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.22607422, + "step": 8141, + "time_per_iteration": 2.9424965381622314 + }, + { + "auxiliary_loss_clip": 0.01487861, + "auxiliary_loss_mlp": 0.01037939, + "balance_loss_clip": 1.3091433, + "balance_loss_mlp": 1.01553917, + "epoch": 0.48952352322260634, + "flos": 25534951643520.0, + "grad_norm": 1.5093269307163109, + "language_loss": 0.7287147, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.75397265, + "num_input_tokens_seen": 174991295, + "router_z_loss_clip": 1.78808594, + "router_z_loss_mlp": 0.22387695, + "step": 8142, + "time_per_iteration": 2.935468912124634 + }, + { + "auxiliary_loss_clip": 0.01461832, + "auxiliary_loss_mlp": 0.01040792, + "balance_loss_clip": 1.28672612, + "balance_loss_mlp": 1.01852345, + "epoch": 0.4895836464752743, + "flos": 29066373751680.0, + "grad_norm": 1.7155066064434383, + "language_loss": 0.68184024, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.7068665, + "num_input_tokens_seen": 175012830, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.22265625, + "step": 8143, + "time_per_iteration": 2.8996427059173584 + }, + { + "auxiliary_loss_clip": 0.01459036, + "auxiliary_loss_mlp": 0.01033806, + "balance_loss_clip": 1.28743935, + "balance_loss_mlp": 1.01301634, + "epoch": 0.48964376972794227, + "flos": 33559956673920.0, + "grad_norm": 2.189256093724405, + "language_loss": 0.75473082, + "learning_rate": 2.163973839444793e-06, + "loss": 0.77965927, + "num_input_tokens_seen": 175035695, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.20776367, + "step": 8144, + "time_per_iteration": 2.95009183883667 + }, + { + "auxiliary_loss_clip": 0.01455361, + "auxiliary_loss_mlp": 0.01035215, + "balance_loss_clip": 1.28204155, + "balance_loss_mlp": 1.0144366, + "epoch": 0.48970389298061023, + "flos": 22064076357120.0, + "grad_norm": 1.7575856458095978, + "language_loss": 0.7643199, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.7892257, + "num_input_tokens_seen": 175056425, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.2076416, + "step": 8145, + "time_per_iteration": 2.8702006340026855 + }, + { + "auxiliary_loss_clip": 0.01460258, + "auxiliary_loss_mlp": 0.01038954, + "balance_loss_clip": 1.284863, + "balance_loss_mlp": 1.01696014, + "epoch": 0.4897640162332782, + "flos": 20093957712000.0, + "grad_norm": 1.8360620187059318, + "language_loss": 0.81097001, + "learning_rate": 2.163197525984761e-06, + "loss": 0.83596212, + "num_input_tokens_seen": 175074800, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.2199707, + "step": 8146, + "time_per_iteration": 2.8272719383239746 + }, + { + "auxiliary_loss_clip": 0.01441596, + "auxiliary_loss_mlp": 0.01035922, + "balance_loss_clip": 1.27227354, + "balance_loss_mlp": 1.01403487, + "epoch": 0.48982413948594616, + "flos": 23816946879360.0, + "grad_norm": 1.542716675832851, + "language_loss": 0.75360078, + "learning_rate": 2.162809359964687e-06, + "loss": 0.77837598, + "num_input_tokens_seen": 175094500, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.21875, + "step": 8147, + "time_per_iteration": 2.919450044631958 + }, + { + "auxiliary_loss_clip": 0.01455195, + "auxiliary_loss_mlp": 0.01035177, + "balance_loss_clip": 1.2815001, + "balance_loss_mlp": 1.01363635, + "epoch": 0.4898842627386142, + "flos": 17648640483840.0, + "grad_norm": 2.8309275122872224, + "language_loss": 0.83926558, + "learning_rate": 2.162421187770864e-06, + "loss": 0.8641693, + "num_input_tokens_seen": 175112920, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.21557617, + "step": 8148, + "time_per_iteration": 2.8466272354125977 + }, + { + "auxiliary_loss_clip": 0.01445528, + "auxiliary_loss_mlp": 0.01035358, + "balance_loss_clip": 1.27528977, + "balance_loss_mlp": 1.01481867, + "epoch": 0.48994438599128215, + "flos": 16626611520000.0, + "grad_norm": 12.122978510065916, + "language_loss": 0.75237668, + "learning_rate": 2.162033009418015e-06, + "loss": 0.77718556, + "num_input_tokens_seen": 175129910, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.2052002, + "step": 8149, + "time_per_iteration": 2.83486008644104 + }, + { + "auxiliary_loss_clip": 0.0147117, + "auxiliary_loss_mlp": 0.01035842, + "balance_loss_clip": 1.29217565, + "balance_loss_mlp": 1.01384759, + "epoch": 0.4900045092439501, + "flos": 26626078696320.0, + "grad_norm": 2.552995464617393, + "language_loss": 0.76374245, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.78881252, + "num_input_tokens_seen": 175148705, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.21984863, + "step": 8150, + "time_per_iteration": 2.8890230655670166 + }, + { + "auxiliary_loss_clip": 0.01464281, + "auxiliary_loss_mlp": 0.01046877, + "balance_loss_clip": 1.28795791, + "balance_loss_mlp": 1.02364302, + "epoch": 0.4900646324966181, + "flos": 19911801571200.0, + "grad_norm": 2.033007191501246, + "language_loss": 0.73038125, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.75549287, + "num_input_tokens_seen": 175167425, + "router_z_loss_clip": 1.76074219, + "router_z_loss_mlp": 0.23254395, + "step": 8151, + "time_per_iteration": 2.851952314376831 + }, + { + "auxiliary_loss_clip": 0.01241399, + "auxiliary_loss_mlp": 0.0104854, + "balance_loss_clip": 1.13987446, + "balance_loss_mlp": 1.02126503, + "epoch": 0.49012475574928605, + "flos": 59216970084480.0, + "grad_norm": 0.9046548355209094, + "language_loss": 0.54407847, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.56697786, + "num_input_tokens_seen": 175227985, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.2734375, + "step": 8152, + "time_per_iteration": 3.439268112182617 + }, + { + "auxiliary_loss_clip": 0.01464514, + "auxiliary_loss_mlp": 0.01034843, + "balance_loss_clip": 1.28738987, + "balance_loss_mlp": 1.01392221, + "epoch": 0.490184879001954, + "flos": 45276071270400.0, + "grad_norm": 1.6442995348875897, + "language_loss": 0.61540163, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.64039516, + "num_input_tokens_seen": 175251895, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.20910645, + "step": 8153, + "time_per_iteration": 3.4006574153900146 + }, + { + "auxiliary_loss_clip": 0.0144816, + "auxiliary_loss_mlp": 0.01038275, + "balance_loss_clip": 1.27516651, + "balance_loss_mlp": 1.01557767, + "epoch": 0.490245002254622, + "flos": 28013958264960.0, + "grad_norm": 1.9849519706201668, + "language_loss": 0.77525604, + "learning_rate": 2.160092025783549e-06, + "loss": 0.80012035, + "num_input_tokens_seen": 175272770, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.22680664, + "step": 8154, + "time_per_iteration": 2.9402153491973877 + }, + { + "auxiliary_loss_clip": 0.01231899, + "auxiliary_loss_mlp": 0.01044798, + "balance_loss_clip": 1.13343596, + "balance_loss_mlp": 1.02419841, + "epoch": 0.49030512550728994, + "flos": 58983044613120.0, + "grad_norm": 0.9948281143524212, + "language_loss": 0.67037505, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.693142, + "num_input_tokens_seen": 175336320, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.20605469, + "step": 8155, + "time_per_iteration": 3.38663649559021 + }, + { + "auxiliary_loss_clip": 0.01449062, + "auxiliary_loss_mlp": 0.01032849, + "balance_loss_clip": 1.27595496, + "balance_loss_mlp": 1.0129056, + "epoch": 0.4903652487599579, + "flos": 19801277228160.0, + "grad_norm": 3.475005000818123, + "language_loss": 0.77528703, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.80010623, + "num_input_tokens_seen": 175353540, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.19934082, + "step": 8156, + "time_per_iteration": 2.8406879901885986 + }, + { + "auxiliary_loss_clip": 0.01456824, + "auxiliary_loss_mlp": 0.01039726, + "balance_loss_clip": 1.28246582, + "balance_loss_mlp": 1.0178746, + "epoch": 0.49042537201262587, + "flos": 21772029300480.0, + "grad_norm": 2.5103102431106157, + "language_loss": 0.8548401, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.87980562, + "num_input_tokens_seen": 175370445, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.21850586, + "step": 8157, + "time_per_iteration": 4.253950595855713 + }, + { + "auxiliary_loss_clip": 0.01442471, + "auxiliary_loss_mlp": 0.01036773, + "balance_loss_clip": 1.27009249, + "balance_loss_mlp": 1.0145998, + "epoch": 0.49048549526529384, + "flos": 18962354545920.0, + "grad_norm": 1.7609584925004516, + "language_loss": 0.80418414, + "learning_rate": 2.158539129514956e-06, + "loss": 0.82897651, + "num_input_tokens_seen": 175389020, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.22167969, + "step": 8158, + "time_per_iteration": 2.83685302734375 + }, + { + "auxiliary_loss_clip": 0.01466034, + "auxiliary_loss_mlp": 0.01037979, + "balance_loss_clip": 1.28857625, + "balance_loss_mlp": 1.01572216, + "epoch": 0.4905456185179618, + "flos": 26917628060160.0, + "grad_norm": 1.5874804959135074, + "language_loss": 0.70052195, + "learning_rate": 2.158150890381454e-06, + "loss": 0.7255621, + "num_input_tokens_seen": 175409545, + "router_z_loss_clip": 1.7734375, + "router_z_loss_mlp": 0.22265625, + "step": 8159, + "time_per_iteration": 2.905890464782715 + }, + { + "auxiliary_loss_clip": 0.01445737, + "auxiliary_loss_mlp": 0.01037514, + "balance_loss_clip": 1.27380991, + "balance_loss_mlp": 1.0152812, + "epoch": 0.49060574177062977, + "flos": 20422001646720.0, + "grad_norm": 2.1881449593909523, + "language_loss": 0.74049795, + "learning_rate": 2.157762645250854e-06, + "loss": 0.76533043, + "num_input_tokens_seen": 175429335, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.22241211, + "step": 8160, + "time_per_iteration": 2.862586498260498 + }, + { + "auxiliary_loss_clip": 0.01471136, + "auxiliary_loss_mlp": 0.01037481, + "balance_loss_clip": 1.29272616, + "balance_loss_mlp": 1.016047, + "epoch": 0.4906658650232978, + "flos": 17502662200320.0, + "grad_norm": 1.8794782787650801, + "language_loss": 0.72672105, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.75180721, + "num_input_tokens_seen": 175446955, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.2142334, + "step": 8161, + "time_per_iteration": 2.8249099254608154 + }, + { + "auxiliary_loss_clip": 0.01446996, + "auxiliary_loss_mlp": 0.01038897, + "balance_loss_clip": 1.27550733, + "balance_loss_mlp": 1.01671243, + "epoch": 0.49072598827596575, + "flos": 26625581003520.0, + "grad_norm": 2.585470909483985, + "language_loss": 0.69815242, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.72301137, + "num_input_tokens_seen": 175468195, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.22192383, + "step": 8162, + "time_per_iteration": 2.9160163402557373 + }, + { + "auxiliary_loss_clip": 0.01469684, + "auxiliary_loss_mlp": 0.01040246, + "balance_loss_clip": 1.29148722, + "balance_loss_mlp": 1.01709509, + "epoch": 0.4907861115286337, + "flos": 20422318360320.0, + "grad_norm": 3.6308608738458843, + "language_loss": 0.64277595, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.66787523, + "num_input_tokens_seen": 175487455, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.23144531, + "step": 8163, + "time_per_iteration": 2.8451972007751465 + }, + { + "auxiliary_loss_clip": 0.01431555, + "auxiliary_loss_mlp": 0.0104147, + "balance_loss_clip": 1.26256776, + "balance_loss_mlp": 1.01933336, + "epoch": 0.4908462347813017, + "flos": 14072398761600.0, + "grad_norm": 2.203978744145157, + "language_loss": 0.78309196, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.80782223, + "num_input_tokens_seen": 175504450, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.22131348, + "step": 8164, + "time_per_iteration": 2.8346669673919678 + }, + { + "auxiliary_loss_clip": 0.01457079, + "auxiliary_loss_mlp": 0.010402, + "balance_loss_clip": 1.28153038, + "balance_loss_mlp": 1.01881373, + "epoch": 0.49090635803396965, + "flos": 18744608730240.0, + "grad_norm": 1.679217193289505, + "language_loss": 0.77769208, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.80266482, + "num_input_tokens_seen": 175523600, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.21374512, + "step": 8165, + "time_per_iteration": 2.830990791320801 + }, + { + "auxiliary_loss_clip": 0.01447307, + "auxiliary_loss_mlp": 0.01038688, + "balance_loss_clip": 1.2758925, + "balance_loss_mlp": 1.01640797, + "epoch": 0.4909664812866376, + "flos": 20568251399040.0, + "grad_norm": 1.8180130909832208, + "language_loss": 0.78941929, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.81427926, + "num_input_tokens_seen": 175542720, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.22265625, + "step": 8166, + "time_per_iteration": 2.905067205429077 + }, + { + "auxiliary_loss_clip": 0.01228036, + "auxiliary_loss_mlp": 0.01042821, + "balance_loss_clip": 1.12819755, + "balance_loss_mlp": 1.01669014, + "epoch": 0.4910266045393056, + "flos": 54715034856960.0, + "grad_norm": 0.8167167566084262, + "language_loss": 0.54197699, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.56468558, + "num_input_tokens_seen": 175598640, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.26171875, + "step": 8167, + "time_per_iteration": 4.743985176086426 + }, + { + "auxiliary_loss_clip": 0.01454062, + "auxiliary_loss_mlp": 0.01043449, + "balance_loss_clip": 1.28172266, + "balance_loss_mlp": 1.02059627, + "epoch": 0.49108672779197354, + "flos": 16253340768000.0, + "grad_norm": 1.7772006224515646, + "language_loss": 0.86733973, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.89231479, + "num_input_tokens_seen": 175615675, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.22875977, + "step": 8168, + "time_per_iteration": 4.239855527877808 + }, + { + "auxiliary_loss_clip": 0.01444552, + "auxiliary_loss_mlp": 0.01042783, + "balance_loss_clip": 1.2745707, + "balance_loss_mlp": 1.02076554, + "epoch": 0.4911468510446415, + "flos": 19834106970240.0, + "grad_norm": 1.6227190617428697, + "language_loss": 0.74108881, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.76596212, + "num_input_tokens_seen": 175632255, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.2199707, + "step": 8169, + "time_per_iteration": 4.255047559738159 + }, + { + "auxiliary_loss_clip": 0.0143134, + "auxiliary_loss_mlp": 0.01029624, + "balance_loss_clip": 1.26108551, + "balance_loss_mlp": 1.0091083, + "epoch": 0.4912069742973095, + "flos": 21221941294080.0, + "grad_norm": 1.5511620500651622, + "language_loss": 0.7831502, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.80775988, + "num_input_tokens_seen": 175651625, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.20532227, + "step": 8170, + "time_per_iteration": 2.8297572135925293 + }, + { + "auxiliary_loss_clip": 0.01465566, + "auxiliary_loss_mlp": 0.01040535, + "balance_loss_clip": 1.28978884, + "balance_loss_mlp": 1.01953065, + "epoch": 0.49126709754997744, + "flos": 19546674883200.0, + "grad_norm": 4.172380421184691, + "language_loss": 0.7663871, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.79144812, + "num_input_tokens_seen": 175669265, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.21008301, + "step": 8171, + "time_per_iteration": 2.8364923000335693 + }, + { + "auxiliary_loss_clip": 0.01474133, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.29683673, + "balance_loss_mlp": 1.02032471, + "epoch": 0.4913272208026454, + "flos": 12247263014400.0, + "grad_norm": 2.1226654462227885, + "language_loss": 0.82425833, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.84940541, + "num_input_tokens_seen": 175686065, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.20251465, + "step": 8172, + "time_per_iteration": 2.8341197967529297 + }, + { + "auxiliary_loss_clip": 0.01222334, + "auxiliary_loss_mlp": 0.01028708, + "balance_loss_clip": 1.12450707, + "balance_loss_mlp": 1.00496137, + "epoch": 0.49138734405531337, + "flos": 65495031661440.0, + "grad_norm": 0.6919032315673139, + "language_loss": 0.53306067, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.55557108, + "num_input_tokens_seen": 175748595, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.23730469, + "step": 8173, + "time_per_iteration": 3.4278345108032227 + }, + { + "auxiliary_loss_clip": 0.01464264, + "auxiliary_loss_mlp": 0.01039586, + "balance_loss_clip": 1.28873575, + "balance_loss_mlp": 1.01792574, + "epoch": 0.4914474673079814, + "flos": 18447403766400.0, + "grad_norm": 2.0262613850166713, + "language_loss": 0.63419211, + "learning_rate": 2.152326591972107e-06, + "loss": 0.65923065, + "num_input_tokens_seen": 175766770, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.21643066, + "step": 8174, + "time_per_iteration": 2.8558428287506104 + }, + { + "auxiliary_loss_clip": 0.01450988, + "auxiliary_loss_mlp": 0.01040424, + "balance_loss_clip": 1.27851462, + "balance_loss_mlp": 1.01851356, + "epoch": 0.49150759056064935, + "flos": 21693927496320.0, + "grad_norm": 1.9548159979775948, + "language_loss": 0.70370489, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.72861904, + "num_input_tokens_seen": 175783605, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.21923828, + "step": 8175, + "time_per_iteration": 2.8537890911102295 + }, + { + "auxiliary_loss_clip": 0.01451768, + "auxiliary_loss_mlp": 0.01035392, + "balance_loss_clip": 1.27958012, + "balance_loss_mlp": 1.01520991, + "epoch": 0.4915677138133173, + "flos": 22392572739840.0, + "grad_norm": 1.7542418646735947, + "language_loss": 0.75831801, + "learning_rate": 2.151549919570068e-06, + "loss": 0.78318959, + "num_input_tokens_seen": 175801390, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.2019043, + "step": 8176, + "time_per_iteration": 2.894666910171509 + }, + { + "auxiliary_loss_clip": 0.0146646, + "auxiliary_loss_mlp": 0.01040553, + "balance_loss_clip": 1.29082322, + "balance_loss_mlp": 1.01966739, + "epoch": 0.4916278370659853, + "flos": 18411316398720.0, + "grad_norm": 2.1143225874530023, + "language_loss": 0.703098, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.72816813, + "num_input_tokens_seen": 175819830, + "router_z_loss_clip": 1.75976562, + "router_z_loss_mlp": 0.2088623, + "step": 8177, + "time_per_iteration": 2.853842258453369 + }, + { + "auxiliary_loss_clip": 0.01219642, + "auxiliary_loss_mlp": 0.01040518, + "balance_loss_clip": 1.12159252, + "balance_loss_mlp": 1.01629436, + "epoch": 0.49168796031865325, + "flos": 66641746590720.0, + "grad_norm": 0.6981189701045334, + "language_loss": 0.46187276, + "learning_rate": 2.150773224180877e-06, + "loss": 0.48447436, + "num_input_tokens_seen": 175881765, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.2421875, + "step": 8178, + "time_per_iteration": 3.402052164077759 + }, + { + "auxiliary_loss_clip": 0.01481933, + "auxiliary_loss_mlp": 0.01045911, + "balance_loss_clip": 1.30299723, + "balance_loss_mlp": 1.02495408, + "epoch": 0.4917480835713212, + "flos": 20969058251520.0, + "grad_norm": 2.1886455170928154, + "language_loss": 0.66053569, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.68581414, + "num_input_tokens_seen": 175901795, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.20947266, + "step": 8179, + "time_per_iteration": 3.0126535892486572 + }, + { + "auxiliary_loss_clip": 0.01462469, + "auxiliary_loss_mlp": 0.01039066, + "balance_loss_clip": 1.28367376, + "balance_loss_mlp": 1.01648724, + "epoch": 0.4918082068239892, + "flos": 15779861487360.0, + "grad_norm": 3.4135977935366606, + "language_loss": 0.70682353, + "learning_rate": 2.149996505922343e-06, + "loss": 0.73183882, + "num_input_tokens_seen": 175917770, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.22570801, + "step": 8180, + "time_per_iteration": 2.8449788093566895 + }, + { + "auxiliary_loss_clip": 0.01438055, + "auxiliary_loss_mlp": 0.01039809, + "balance_loss_clip": 1.26807857, + "balance_loss_mlp": 1.01757669, + "epoch": 0.49186833007665715, + "flos": 24614669531520.0, + "grad_norm": 1.6844214782023053, + "language_loss": 0.84909666, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.87387532, + "num_input_tokens_seen": 175937000, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.22241211, + "step": 8181, + "time_per_iteration": 2.9278435707092285 + }, + { + "auxiliary_loss_clip": 0.01441335, + "auxiliary_loss_mlp": 0.01035559, + "balance_loss_clip": 1.27279067, + "balance_loss_mlp": 1.01458967, + "epoch": 0.4919284533293251, + "flos": 22100616172800.0, + "grad_norm": 1.9867022878885205, + "language_loss": 0.73939145, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.76416039, + "num_input_tokens_seen": 175955170, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.2097168, + "step": 8182, + "time_per_iteration": 2.8432302474975586 + }, + { + "auxiliary_loss_clip": 0.01451735, + "auxiliary_loss_mlp": 0.01042073, + "balance_loss_clip": 1.27907848, + "balance_loss_mlp": 1.02057958, + "epoch": 0.4919885765819931, + "flos": 23378559580800.0, + "grad_norm": 2.3780365824590284, + "language_loss": 0.73283869, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.7577768, + "num_input_tokens_seen": 175973725, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.21496582, + "step": 8183, + "time_per_iteration": 2.879854917526245 + }, + { + "auxiliary_loss_clip": 0.01482479, + "auxiliary_loss_mlp": 0.01037611, + "balance_loss_clip": 1.3038528, + "balance_loss_mlp": 1.01707101, + "epoch": 0.49204869983466104, + "flos": 21370272307200.0, + "grad_norm": 1.8502872252415148, + "language_loss": 0.78039777, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.80559874, + "num_input_tokens_seen": 175993885, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.20556641, + "step": 8184, + "time_per_iteration": 2.8740339279174805 + }, + { + "auxiliary_loss_clip": 0.0144757, + "auxiliary_loss_mlp": 0.01037048, + "balance_loss_clip": 1.2765063, + "balance_loss_mlp": 1.01653171, + "epoch": 0.492108823087329, + "flos": 21152707470720.0, + "grad_norm": 2.6839361956008974, + "language_loss": 0.72237831, + "learning_rate": 2.148054610995789e-06, + "loss": 0.74722451, + "num_input_tokens_seen": 176014210, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.2052002, + "step": 8185, + "time_per_iteration": 2.8489127159118652 + }, + { + "auxiliary_loss_clip": 0.01462393, + "auxiliary_loss_mlp": 0.01043367, + "balance_loss_clip": 1.2872901, + "balance_loss_mlp": 1.02212358, + "epoch": 0.49216894633999697, + "flos": 25126860378240.0, + "grad_norm": 1.8710079259843284, + "language_loss": 0.75696129, + "learning_rate": 2.147666215108831e-06, + "loss": 0.7820189, + "num_input_tokens_seen": 176033890, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.21252441, + "step": 8186, + "time_per_iteration": 2.869615077972412 + }, + { + "auxiliary_loss_clip": 0.01459307, + "auxiliary_loss_mlp": 0.01037189, + "balance_loss_clip": 1.2859478, + "balance_loss_mlp": 1.01625562, + "epoch": 0.49222906959266494, + "flos": 22648170470400.0, + "grad_norm": 2.0941665194130503, + "language_loss": 0.68979287, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.7147578, + "num_input_tokens_seen": 176052720, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.20935059, + "step": 8187, + "time_per_iteration": 2.830219268798828 + }, + { + "auxiliary_loss_clip": 0.01451611, + "auxiliary_loss_mlp": 0.01041486, + "balance_loss_clip": 1.27915454, + "balance_loss_mlp": 1.02039814, + "epoch": 0.49228919284533296, + "flos": 20419739406720.0, + "grad_norm": 1.7217662692687186, + "language_loss": 0.67430449, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.69923544, + "num_input_tokens_seen": 176072545, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.21081543, + "step": 8188, + "time_per_iteration": 2.8512818813323975 + }, + { + "auxiliary_loss_clip": 0.01441359, + "auxiliary_loss_mlp": 0.01038184, + "balance_loss_clip": 1.26866388, + "balance_loss_mlp": 1.01642799, + "epoch": 0.4923493160980009, + "flos": 27132885411840.0, + "grad_norm": 1.9811784906404113, + "language_loss": 0.7592231, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.78401852, + "num_input_tokens_seen": 176091490, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.2175293, + "step": 8189, + "time_per_iteration": 2.908710479736328 + }, + { + "auxiliary_loss_clip": 0.014504, + "auxiliary_loss_mlp": 0.01035415, + "balance_loss_clip": 1.27940679, + "balance_loss_mlp": 1.01458931, + "epoch": 0.4924094393506689, + "flos": 35750400088320.0, + "grad_norm": 1.5636197797100626, + "language_loss": 0.64946198, + "learning_rate": 2.146112575713104e-06, + "loss": 0.6743201, + "num_input_tokens_seen": 176113200, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.20812988, + "step": 8190, + "time_per_iteration": 3.0177488327026367 + }, + { + "auxiliary_loss_clip": 0.01456166, + "auxiliary_loss_mlp": 0.01035976, + "balance_loss_clip": 1.28404009, + "balance_loss_mlp": 1.01512599, + "epoch": 0.49246956260333685, + "flos": 20422182625920.0, + "grad_norm": 1.9583403535608692, + "language_loss": 0.72594404, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.75086546, + "num_input_tokens_seen": 176132485, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.20849609, + "step": 8191, + "time_per_iteration": 2.850749969482422 + }, + { + "auxiliary_loss_clip": 0.01453985, + "auxiliary_loss_mlp": 0.01039128, + "balance_loss_clip": 1.28094435, + "balance_loss_mlp": 1.01808763, + "epoch": 0.4925296858560048, + "flos": 38989684650240.0, + "grad_norm": 1.6686216306710615, + "language_loss": 0.7268399, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.75177109, + "num_input_tokens_seen": 176155755, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.21044922, + "step": 8192, + "time_per_iteration": 4.420319080352783 + }, + { + "auxiliary_loss_clip": 0.01230015, + "auxiliary_loss_mlp": 0.01042714, + "balance_loss_clip": 1.13098443, + "balance_loss_mlp": 1.01658332, + "epoch": 0.4925898091086728, + "flos": 64312202102400.0, + "grad_norm": 0.7241045434073838, + "language_loss": 0.52232754, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.54505479, + "num_input_tokens_seen": 176216295, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.26171875, + "step": 8193, + "time_per_iteration": 3.437406539916992 + }, + { + "auxiliary_loss_clip": 0.01445633, + "auxiliary_loss_mlp": 0.01040815, + "balance_loss_clip": 1.27592158, + "balance_loss_mlp": 1.01902318, + "epoch": 0.49264993236134075, + "flos": 23045945921280.0, + "grad_norm": 1.7365441093438982, + "language_loss": 0.77583838, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.80070287, + "num_input_tokens_seen": 176235925, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.21789551, + "step": 8194, + "time_per_iteration": 2.8996403217315674 + }, + { + "auxiliary_loss_clip": 0.01447805, + "auxiliary_loss_mlp": 0.01033034, + "balance_loss_clip": 1.27611732, + "balance_loss_mlp": 1.01299524, + "epoch": 0.4927100556140087, + "flos": 24729130172160.0, + "grad_norm": 1.991708363561768, + "language_loss": 0.71441102, + "learning_rate": 2.144170401915341e-06, + "loss": 0.73921943, + "num_input_tokens_seen": 176253865, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.20056152, + "step": 8195, + "time_per_iteration": 2.855769157409668 + }, + { + "auxiliary_loss_clip": 0.01447798, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.2742517, + "balance_loss_mlp": 1.01286554, + "epoch": 0.4927701788666767, + "flos": 23513905336320.0, + "grad_norm": 2.1378896912552623, + "language_loss": 0.81784123, + "learning_rate": 2.143781950696001e-06, + "loss": 0.84265399, + "num_input_tokens_seen": 176271525, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.20617676, + "step": 8196, + "time_per_iteration": 2.915947914123535 + }, + { + "auxiliary_loss_clip": 0.01464516, + "auxiliary_loss_mlp": 0.01035186, + "balance_loss_clip": 1.28770912, + "balance_loss_mlp": 1.01368105, + "epoch": 0.49283030211934464, + "flos": 22938950672640.0, + "grad_norm": 1.92325499743484, + "language_loss": 0.71602762, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.74102467, + "num_input_tokens_seen": 176290810, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.21520996, + "step": 8197, + "time_per_iteration": 2.9748408794403076 + }, + { + "auxiliary_loss_clip": 0.01439091, + "auxiliary_loss_mlp": 0.01036037, + "balance_loss_clip": 1.26834989, + "balance_loss_mlp": 1.01488876, + "epoch": 0.4928904253720126, + "flos": 16881711557760.0, + "grad_norm": 2.0572289461413114, + "language_loss": 0.86017811, + "learning_rate": 2.143005031915374e-06, + "loss": 0.88492942, + "num_input_tokens_seen": 176309165, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.21154785, + "step": 8198, + "time_per_iteration": 2.8543999195098877 + }, + { + "auxiliary_loss_clip": 0.01467948, + "auxiliary_loss_mlp": 0.01039683, + "balance_loss_clip": 1.29027474, + "balance_loss_mlp": 1.0183084, + "epoch": 0.4929505486246806, + "flos": 14874781628160.0, + "grad_norm": 1.8676249686351547, + "language_loss": 0.76783425, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.79291052, + "num_input_tokens_seen": 176324960, + "router_z_loss_clip": 1.77441406, + "router_z_loss_mlp": 0.21386719, + "step": 8199, + "time_per_iteration": 2.878070116043091 + }, + { + "auxiliary_loss_clip": 0.01454815, + "auxiliary_loss_mlp": 0.01037102, + "balance_loss_clip": 1.27903676, + "balance_loss_mlp": 1.01534605, + "epoch": 0.49301067187734854, + "flos": 23852989002240.0, + "grad_norm": 1.872316379202134, + "language_loss": 0.60379112, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.62871027, + "num_input_tokens_seen": 176346195, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.21740723, + "step": 8200, + "time_per_iteration": 2.8904125690460205 + }, + { + "auxiliary_loss_clip": 0.01428716, + "auxiliary_loss_mlp": 0.01040207, + "balance_loss_clip": 1.26236987, + "balance_loss_mlp": 1.01929808, + "epoch": 0.49307079513001656, + "flos": 22501287290880.0, + "grad_norm": 1.5489372804232813, + "language_loss": 0.79561245, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.82030165, + "num_input_tokens_seen": 176366735, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.20922852, + "step": 8201, + "time_per_iteration": 2.920135021209717 + }, + { + "auxiliary_loss_clip": 0.01473095, + "auxiliary_loss_mlp": 0.0103859, + "balance_loss_clip": 1.2928232, + "balance_loss_mlp": 1.01708508, + "epoch": 0.4931309183826845, + "flos": 15933078938880.0, + "grad_norm": 1.9891450096530265, + "language_loss": 0.68311822, + "learning_rate": 2.141451129398785e-06, + "loss": 0.70823509, + "num_input_tokens_seen": 176384475, + "router_z_loss_clip": 1.80273438, + "router_z_loss_mlp": 0.21520996, + "step": 8202, + "time_per_iteration": 4.269949436187744 + }, + { + "auxiliary_loss_clip": 0.01441389, + "auxiliary_loss_mlp": 0.0103527, + "balance_loss_clip": 1.26808083, + "balance_loss_mlp": 1.01370549, + "epoch": 0.4931910416353525, + "flos": 27320742397440.0, + "grad_norm": 1.9923308273112112, + "language_loss": 0.76327473, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.78804135, + "num_input_tokens_seen": 176402645, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.21569824, + "step": 8203, + "time_per_iteration": 5.720124006271362 + }, + { + "auxiliary_loss_clip": 0.01456064, + "auxiliary_loss_mlp": 0.01039158, + "balance_loss_clip": 1.28020239, + "balance_loss_mlp": 1.01694953, + "epoch": 0.49325116488802045, + "flos": 20813940518400.0, + "grad_norm": 1.9742443046693592, + "language_loss": 0.81781983, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.84277201, + "num_input_tokens_seen": 176416715, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.2220459, + "step": 8204, + "time_per_iteration": 2.9217777252197266 + }, + { + "auxiliary_loss_clip": 0.01441062, + "auxiliary_loss_mlp": 0.01040396, + "balance_loss_clip": 1.2687397, + "balance_loss_mlp": 1.01902175, + "epoch": 0.4933112881406884, + "flos": 19875714203520.0, + "grad_norm": 4.330394288741532, + "language_loss": 0.6778729, + "learning_rate": 2.140285646139455e-06, + "loss": 0.7026875, + "num_input_tokens_seen": 176435755, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.21362305, + "step": 8205, + "time_per_iteration": 2.8415639400482178 + }, + { + "auxiliary_loss_clip": 0.01460482, + "auxiliary_loss_mlp": 0.01042365, + "balance_loss_clip": 1.28115368, + "balance_loss_mlp": 1.01934564, + "epoch": 0.4933714113933564, + "flos": 21836105216640.0, + "grad_norm": 1.968802761724132, + "language_loss": 0.67040551, + "learning_rate": 2.139897141060744e-06, + "loss": 0.69543397, + "num_input_tokens_seen": 176453915, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.23010254, + "step": 8206, + "time_per_iteration": 2.8661131858825684 + }, + { + "auxiliary_loss_clip": 0.01459445, + "auxiliary_loss_mlp": 0.01036052, + "balance_loss_clip": 1.2834723, + "balance_loss_mlp": 1.01532114, + "epoch": 0.49343153464602435, + "flos": 27901262171520.0, + "grad_norm": 2.1651463115234137, + "language_loss": 0.77686775, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.80182278, + "num_input_tokens_seen": 176475175, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.20715332, + "step": 8207, + "time_per_iteration": 2.8913066387176514 + }, + { + "auxiliary_loss_clip": 0.01442315, + "auxiliary_loss_mlp": 0.01037263, + "balance_loss_clip": 1.26901376, + "balance_loss_mlp": 1.01463687, + "epoch": 0.4934916578986923, + "flos": 24691821194880.0, + "grad_norm": 2.189598542845242, + "language_loss": 0.61461997, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.63941574, + "num_input_tokens_seen": 176494250, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.22607422, + "step": 8208, + "time_per_iteration": 2.8725881576538086 + }, + { + "auxiliary_loss_clip": 0.01446477, + "auxiliary_loss_mlp": 0.01038712, + "balance_loss_clip": 1.27137899, + "balance_loss_mlp": 1.01620591, + "epoch": 0.4935517811513603, + "flos": 23415823313280.0, + "grad_norm": 1.6282770113750196, + "language_loss": 0.79234123, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.81719315, + "num_input_tokens_seen": 176513325, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.22497559, + "step": 8209, + "time_per_iteration": 2.850533962249756 + }, + { + "auxiliary_loss_clip": 0.01436662, + "auxiliary_loss_mlp": 0.0103948, + "balance_loss_clip": 1.26424003, + "balance_loss_mlp": 1.01710474, + "epoch": 0.49361190440402825, + "flos": 21954502154880.0, + "grad_norm": 1.82626551194719, + "language_loss": 0.79803318, + "learning_rate": 2.138343067844089e-06, + "loss": 0.82279462, + "num_input_tokens_seen": 176532915, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.22375488, + "step": 8210, + "time_per_iteration": 2.8450286388397217 + }, + { + "auxiliary_loss_clip": 0.01456005, + "auxiliary_loss_mlp": 0.01038848, + "balance_loss_clip": 1.27743852, + "balance_loss_mlp": 1.01665151, + "epoch": 0.4936720276566962, + "flos": 25125684013440.0, + "grad_norm": 1.6947884002719207, + "language_loss": 0.81567526, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.84062374, + "num_input_tokens_seen": 176552775, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.22192383, + "step": 8211, + "time_per_iteration": 2.8857808113098145 + }, + { + "auxiliary_loss_clip": 0.01451204, + "auxiliary_loss_mlp": 0.01035016, + "balance_loss_clip": 1.27582943, + "balance_loss_mlp": 1.01271188, + "epoch": 0.4937321509093642, + "flos": 26369983272960.0, + "grad_norm": 2.7305029919752286, + "language_loss": 0.93177354, + "learning_rate": 2.137565999700933e-06, + "loss": 0.95663577, + "num_input_tokens_seen": 176572185, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.22302246, + "step": 8212, + "time_per_iteration": 2.910034418106079 + }, + { + "auxiliary_loss_clip": 0.01445364, + "auxiliary_loss_mlp": 0.01036514, + "balance_loss_clip": 1.27105355, + "balance_loss_mlp": 1.0159384, + "epoch": 0.49379227416203214, + "flos": 22970423070720.0, + "grad_norm": 1.749143286021176, + "language_loss": 0.6567601, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.68157887, + "num_input_tokens_seen": 176591490, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.20568848, + "step": 8213, + "time_per_iteration": 2.886085271835327 + }, + { + "auxiliary_loss_clip": 0.01440133, + "auxiliary_loss_mlp": 0.01036297, + "balance_loss_clip": 1.26711988, + "balance_loss_mlp": 1.01524508, + "epoch": 0.49385239741470016, + "flos": 32501885587200.0, + "grad_norm": 1.899885506491966, + "language_loss": 0.76730549, + "learning_rate": 2.136788910691711e-06, + "loss": 0.79206985, + "num_input_tokens_seen": 176612715, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.21057129, + "step": 8214, + "time_per_iteration": 2.935310125350952 + }, + { + "auxiliary_loss_clip": 0.01448748, + "auxiliary_loss_mlp": 0.0104081, + "balance_loss_clip": 1.27502108, + "balance_loss_mlp": 1.01875675, + "epoch": 0.4939125206673681, + "flos": 22503278062080.0, + "grad_norm": 2.002967409918675, + "language_loss": 0.84835982, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.87325537, + "num_input_tokens_seen": 176631950, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.22070312, + "step": 8215, + "time_per_iteration": 2.858553886413574 + }, + { + "auxiliary_loss_clip": 0.01419376, + "auxiliary_loss_mlp": 0.01036244, + "balance_loss_clip": 1.25308371, + "balance_loss_mlp": 1.01429796, + "epoch": 0.4939726439200361, + "flos": 31188985931520.0, + "grad_norm": 1.513193717763, + "language_loss": 0.83842307, + "learning_rate": 2.136011800934292e-06, + "loss": 0.86297929, + "num_input_tokens_seen": 176653060, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.21936035, + "step": 8216, + "time_per_iteration": 3.0113375186920166 + }, + { + "auxiliary_loss_clip": 0.01435548, + "auxiliary_loss_mlp": 0.01042512, + "balance_loss_clip": 1.26501751, + "balance_loss_mlp": 1.01925397, + "epoch": 0.49403276717270406, + "flos": 22684393572480.0, + "grad_norm": 1.4390454668871435, + "language_loss": 0.75405836, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.77883893, + "num_input_tokens_seen": 176673895, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.2322998, + "step": 8217, + "time_per_iteration": 2.9483742713928223 + }, + { + "auxiliary_loss_clip": 0.01445431, + "auxiliary_loss_mlp": 0.01038973, + "balance_loss_clip": 1.27547812, + "balance_loss_mlp": 1.01662123, + "epoch": 0.494092890425372, + "flos": 20750905232640.0, + "grad_norm": 1.6052079283455736, + "language_loss": 0.79387152, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.81871557, + "num_input_tokens_seen": 176692550, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.22363281, + "step": 8218, + "time_per_iteration": 2.91963529586792 + }, + { + "auxiliary_loss_clip": 0.01436509, + "auxiliary_loss_mlp": 0.01041804, + "balance_loss_clip": 1.26696587, + "balance_loss_mlp": 1.02052486, + "epoch": 0.49415301367804, + "flos": 18378124698240.0, + "grad_norm": 2.4575384050721443, + "language_loss": 0.7698158, + "learning_rate": 2.134846097653142e-06, + "loss": 0.79459882, + "num_input_tokens_seen": 176709335, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.21276855, + "step": 8219, + "time_per_iteration": 2.8443048000335693 + }, + { + "auxiliary_loss_clip": 0.01449304, + "auxiliary_loss_mlp": 0.0103909, + "balance_loss_clip": 1.27658546, + "balance_loss_mlp": 1.01671481, + "epoch": 0.49421313693070795, + "flos": 17539699708800.0, + "grad_norm": 1.6612431482513117, + "language_loss": 0.63228023, + "learning_rate": 2.134457519646357e-06, + "loss": 0.65716422, + "num_input_tokens_seen": 176727715, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.22375488, + "step": 8220, + "time_per_iteration": 2.8849024772644043 + }, + { + "auxiliary_loss_clip": 0.01446179, + "auxiliary_loss_mlp": 0.0103666, + "balance_loss_clip": 1.27391446, + "balance_loss_mlp": 1.01545227, + "epoch": 0.4942732601833759, + "flos": 20821541644800.0, + "grad_norm": 1.724176438696826, + "language_loss": 0.73322552, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.7580539, + "num_input_tokens_seen": 176747530, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.21203613, + "step": 8221, + "time_per_iteration": 2.879103422164917 + }, + { + "auxiliary_loss_clip": 0.01441856, + "auxiliary_loss_mlp": 0.01044353, + "balance_loss_clip": 1.27271998, + "balance_loss_mlp": 1.02347994, + "epoch": 0.4943333834360439, + "flos": 15057209237760.0, + "grad_norm": 2.2122399059870372, + "language_loss": 0.80377233, + "learning_rate": 2.133680348351595e-06, + "loss": 0.82863444, + "num_input_tokens_seen": 176765260, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.2088623, + "step": 8222, + "time_per_iteration": 2.9110918045043945 + }, + { + "auxiliary_loss_clip": 0.01459338, + "auxiliary_loss_mlp": 0.01043601, + "balance_loss_clip": 1.28657675, + "balance_loss_mlp": 1.02110648, + "epoch": 0.49439350668871185, + "flos": 16078921488000.0, + "grad_norm": 2.269909493117277, + "language_loss": 0.73425663, + "learning_rate": 2.133291755093088e-06, + "loss": 0.75928605, + "num_input_tokens_seen": 176781770, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.22497559, + "step": 8223, + "time_per_iteration": 2.773883104324341 + }, + { + "auxiliary_loss_clip": 0.01459208, + "auxiliary_loss_mlp": 0.01044864, + "balance_loss_clip": 1.28335881, + "balance_loss_mlp": 1.02331066, + "epoch": 0.4944536299413798, + "flos": 20888151269760.0, + "grad_norm": 1.7888037147271982, + "language_loss": 0.75995475, + "learning_rate": 2.132903156780144e-06, + "loss": 0.78499544, + "num_input_tokens_seen": 176800655, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.21520996, + "step": 8224, + "time_per_iteration": 2.856503963470459 + }, + { + "auxiliary_loss_clip": 0.01455379, + "auxiliary_loss_mlp": 0.01043209, + "balance_loss_clip": 1.28197038, + "balance_loss_mlp": 1.0214653, + "epoch": 0.4945137531940478, + "flos": 26618613304320.0, + "grad_norm": 6.956542818011109, + "language_loss": 0.64806193, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.67304778, + "num_input_tokens_seen": 176820610, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.21740723, + "step": 8225, + "time_per_iteration": 2.896453380584717 + }, + { + "auxiliary_loss_clip": 0.01459352, + "auxiliary_loss_mlp": 0.01038813, + "balance_loss_clip": 1.28432965, + "balance_loss_mlp": 1.0180583, + "epoch": 0.49457387644671574, + "flos": 23998786306560.0, + "grad_norm": 1.8964438747703498, + "language_loss": 0.77447039, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.79945201, + "num_input_tokens_seen": 176840520, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.20751953, + "step": 8226, + "time_per_iteration": 2.8432672023773193 + }, + { + "auxiliary_loss_clip": 0.01449, + "auxiliary_loss_mlp": 0.01040559, + "balance_loss_clip": 1.27474332, + "balance_loss_mlp": 1.01795745, + "epoch": 0.49463399969938376, + "flos": 26987133352320.0, + "grad_norm": 1.6948962462172361, + "language_loss": 0.71732152, + "learning_rate": 2.131737331662051e-06, + "loss": 0.74221718, + "num_input_tokens_seen": 176860265, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.22595215, + "step": 8227, + "time_per_iteration": 4.3562023639678955 + }, + { + "auxiliary_loss_clip": 0.01470781, + "auxiliary_loss_mlp": 0.01041908, + "balance_loss_clip": 1.29303312, + "balance_loss_mlp": 1.02123666, + "epoch": 0.49469412295205173, + "flos": 29694065869440.0, + "grad_norm": 1.7121214570588066, + "language_loss": 0.72287858, + "learning_rate": 2.131348713278718e-06, + "loss": 0.74800551, + "num_input_tokens_seen": 176882910, + "router_z_loss_clip": 1.77734375, + "router_z_loss_mlp": 0.20678711, + "step": 8228, + "time_per_iteration": 2.9402685165405273 + }, + { + "auxiliary_loss_clip": 0.01436251, + "auxiliary_loss_mlp": 0.010363, + "balance_loss_clip": 1.26724005, + "balance_loss_mlp": 1.01475883, + "epoch": 0.4947542462047197, + "flos": 24141868922880.0, + "grad_norm": 1.439214482718858, + "language_loss": 0.84433842, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.86906397, + "num_input_tokens_seen": 176903030, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.21533203, + "step": 8229, + "time_per_iteration": 2.858778238296509 + }, + { + "auxiliary_loss_clip": 0.01456885, + "auxiliary_loss_mlp": 0.01036464, + "balance_loss_clip": 1.28232551, + "balance_loss_mlp": 1.01530468, + "epoch": 0.49481436945738766, + "flos": 20053979291520.0, + "grad_norm": 1.7897235968712297, + "language_loss": 0.75671721, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.78165066, + "num_input_tokens_seen": 176919025, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.21166992, + "step": 8230, + "time_per_iteration": 2.8566603660583496 + }, + { + "auxiliary_loss_clip": 0.01456711, + "auxiliary_loss_mlp": 0.01032537, + "balance_loss_clip": 1.28332138, + "balance_loss_mlp": 1.01268864, + "epoch": 0.4948744927100556, + "flos": 15678069390720.0, + "grad_norm": 3.2115603950343643, + "language_loss": 0.80933547, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.83422792, + "num_input_tokens_seen": 176937945, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.19848633, + "step": 8231, + "time_per_iteration": 2.845608949661255 + }, + { + "auxiliary_loss_clip": 0.01232457, + "auxiliary_loss_mlp": 0.01046879, + "balance_loss_clip": 1.13180113, + "balance_loss_mlp": 1.02227461, + "epoch": 0.4949346159627236, + "flos": 68905495860480.0, + "grad_norm": 0.8370235975750995, + "language_loss": 0.60239446, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.62518781, + "num_input_tokens_seen": 177004575, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.24511719, + "step": 8232, + "time_per_iteration": 3.5098164081573486 + }, + { + "auxiliary_loss_clip": 0.01465735, + "auxiliary_loss_mlp": 0.0103662, + "balance_loss_clip": 1.28680158, + "balance_loss_mlp": 1.01397038, + "epoch": 0.49499473921539155, + "flos": 24800988193920.0, + "grad_norm": 1.8740165854746917, + "language_loss": 0.69920343, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.72422701, + "num_input_tokens_seen": 177024155, + "router_z_loss_clip": 1.7890625, + "router_z_loss_mlp": 0.2265625, + "step": 8233, + "time_per_iteration": 3.0036284923553467 + }, + { + "auxiliary_loss_clip": 0.01432608, + "auxiliary_loss_mlp": 0.01036252, + "balance_loss_clip": 1.26357293, + "balance_loss_mlp": 1.01430571, + "epoch": 0.4950548624680595, + "flos": 32720717278080.0, + "grad_norm": 2.412614600127574, + "language_loss": 0.68116248, + "learning_rate": 2.129016898898633e-06, + "loss": 0.70585108, + "num_input_tokens_seen": 177046185, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.21936035, + "step": 8234, + "time_per_iteration": 2.9526586532592773 + }, + { + "auxiliary_loss_clip": 0.01232414, + "auxiliary_loss_mlp": 0.01027663, + "balance_loss_clip": 1.13321161, + "balance_loss_mlp": 1.00248647, + "epoch": 0.4951149857207275, + "flos": 50110791857280.0, + "grad_norm": 0.7934942557027935, + "language_loss": 0.58026052, + "learning_rate": 2.128628245959482e-06, + "loss": 0.60286129, + "num_input_tokens_seen": 177099025, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.25195312, + "step": 8235, + "time_per_iteration": 3.2508633136749268 + }, + { + "auxiliary_loss_clip": 0.01456037, + "auxiliary_loss_mlp": 0.01039209, + "balance_loss_clip": 1.28156972, + "balance_loss_mlp": 1.01820433, + "epoch": 0.49517510897339545, + "flos": 22246956414720.0, + "grad_norm": 1.7113596090754408, + "language_loss": 0.78627646, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.81122887, + "num_input_tokens_seen": 177118365, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.21008301, + "step": 8236, + "time_per_iteration": 2.8552067279815674 + }, + { + "auxiliary_loss_clip": 0.01436008, + "auxiliary_loss_mlp": 0.01035884, + "balance_loss_clip": 1.26663017, + "balance_loss_mlp": 1.01470041, + "epoch": 0.4952352322260634, + "flos": 25384991817600.0, + "grad_norm": 2.298913212659366, + "language_loss": 0.73657691, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.76129591, + "num_input_tokens_seen": 177136415, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.21191406, + "step": 8237, + "time_per_iteration": 4.280331134796143 + }, + { + "auxiliary_loss_clip": 0.01457836, + "auxiliary_loss_mlp": 0.01039912, + "balance_loss_clip": 1.28733587, + "balance_loss_mlp": 1.01841879, + "epoch": 0.4952953554787314, + "flos": 24619601214720.0, + "grad_norm": 1.6505000943191281, + "language_loss": 0.76647663, + "learning_rate": 2.127462257935406e-06, + "loss": 0.7914542, + "num_input_tokens_seen": 177155690, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.21496582, + "step": 8238, + "time_per_iteration": 4.320410251617432 + }, + { + "auxiliary_loss_clip": 0.01453883, + "auxiliary_loss_mlp": 0.01040762, + "balance_loss_clip": 1.2792784, + "balance_loss_mlp": 1.01905406, + "epoch": 0.49535547873139935, + "flos": 17320189345920.0, + "grad_norm": 3.786254977699487, + "language_loss": 0.75076181, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.77570832, + "num_input_tokens_seen": 177173350, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.21716309, + "step": 8239, + "time_per_iteration": 4.280998706817627 + }, + { + "auxiliary_loss_clip": 0.014474, + "auxiliary_loss_mlp": 0.01037008, + "balance_loss_clip": 1.27315426, + "balance_loss_mlp": 1.01463294, + "epoch": 0.4954156019840673, + "flos": 20749955091840.0, + "grad_norm": 2.471581904142579, + "language_loss": 0.79762912, + "learning_rate": 2.126684908394552e-06, + "loss": 0.82247323, + "num_input_tokens_seen": 177191115, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.22363281, + "step": 8240, + "time_per_iteration": 2.8616950511932373 + }, + { + "auxiliary_loss_clip": 0.01440133, + "auxiliary_loss_mlp": 0.01037914, + "balance_loss_clip": 1.2710464, + "balance_loss_mlp": 1.01667082, + "epoch": 0.49547572523673533, + "flos": 12827692298880.0, + "grad_norm": 2.0043229767452444, + "language_loss": 0.86245334, + "learning_rate": 2.126296226410898e-06, + "loss": 0.88723385, + "num_input_tokens_seen": 177206155, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.21228027, + "step": 8241, + "time_per_iteration": 2.797053813934326 + }, + { + "auxiliary_loss_clip": 0.01435311, + "auxiliary_loss_mlp": 0.01037338, + "balance_loss_clip": 1.26796055, + "balance_loss_mlp": 1.01708472, + "epoch": 0.4955358484894033, + "flos": 15605668431360.0, + "grad_norm": 2.2401795036012992, + "language_loss": 0.78386188, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.80858833, + "num_input_tokens_seen": 177224815, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.20251465, + "step": 8242, + "time_per_iteration": 2.8272273540496826 + }, + { + "auxiliary_loss_clip": 0.0144507, + "auxiliary_loss_mlp": 0.01037264, + "balance_loss_clip": 1.27438462, + "balance_loss_mlp": 1.01553273, + "epoch": 0.49559597174207126, + "flos": 26474399568000.0, + "grad_norm": 2.6910323777114367, + "language_loss": 0.67757714, + "learning_rate": 2.125518848090833e-06, + "loss": 0.70240051, + "num_input_tokens_seen": 177244490, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.21740723, + "step": 8243, + "time_per_iteration": 2.8541581630706787 + }, + { + "auxiliary_loss_clip": 0.01446235, + "auxiliary_loss_mlp": 0.01038801, + "balance_loss_clip": 1.27527487, + "balance_loss_mlp": 1.01801074, + "epoch": 0.4956560949947392, + "flos": 23158461035520.0, + "grad_norm": 2.6783266863319204, + "language_loss": 0.69245607, + "learning_rate": 2.125130151783901e-06, + "loss": 0.71730644, + "num_input_tokens_seen": 177264340, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.20788574, + "step": 8244, + "time_per_iteration": 2.871535062789917 + }, + { + "auxiliary_loss_clip": 0.01444475, + "auxiliary_loss_mlp": 0.01039328, + "balance_loss_clip": 1.27150297, + "balance_loss_mlp": 1.01747715, + "epoch": 0.4957162182474072, + "flos": 20782241896320.0, + "grad_norm": 2.306718478188363, + "language_loss": 0.75846064, + "learning_rate": 2.12474145073202e-06, + "loss": 0.78329867, + "num_input_tokens_seen": 177283055, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.21862793, + "step": 8245, + "time_per_iteration": 2.980043888092041 + }, + { + "auxiliary_loss_clip": 0.01438106, + "auxiliary_loss_mlp": 0.01042818, + "balance_loss_clip": 1.27001309, + "balance_loss_mlp": 1.02058589, + "epoch": 0.49577634150007516, + "flos": 18743477610240.0, + "grad_norm": 2.0239142419883063, + "language_loss": 0.82741129, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.85222054, + "num_input_tokens_seen": 177301140, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.22241211, + "step": 8246, + "time_per_iteration": 2.9433393478393555 + }, + { + "auxiliary_loss_clip": 0.01456667, + "auxiliary_loss_mlp": 0.0103972, + "balance_loss_clip": 1.28215802, + "balance_loss_mlp": 1.01841772, + "epoch": 0.4958364647527431, + "flos": 25564478515200.0, + "grad_norm": 1.7436618700432225, + "language_loss": 0.85021126, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.87517512, + "num_input_tokens_seen": 177323095, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.2130127, + "step": 8247, + "time_per_iteration": 3.0296268463134766 + }, + { + "auxiliary_loss_clip": 0.01458263, + "auxiliary_loss_mlp": 0.01039445, + "balance_loss_clip": 1.28421545, + "balance_loss_mlp": 1.01915538, + "epoch": 0.4958965880054111, + "flos": 24435906750720.0, + "grad_norm": 2.1832910518651265, + "language_loss": 0.83970106, + "learning_rate": 2.123575319254087e-06, + "loss": 0.86467814, + "num_input_tokens_seen": 177339845, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.20300293, + "step": 8248, + "time_per_iteration": 2.8678877353668213 + }, + { + "auxiliary_loss_clip": 0.01450137, + "auxiliary_loss_mlp": 0.01038513, + "balance_loss_clip": 1.27632558, + "balance_loss_mlp": 1.01599491, + "epoch": 0.49595671125807905, + "flos": 25094573573760.0, + "grad_norm": 1.870974259656413, + "language_loss": 0.7437439, + "learning_rate": 2.123186599369812e-06, + "loss": 0.76863045, + "num_input_tokens_seen": 177359980, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.2253418, + "step": 8249, + "time_per_iteration": 2.88061261177063 + }, + { + "auxiliary_loss_clip": 0.01464396, + "auxiliary_loss_mlp": 0.0104353, + "balance_loss_clip": 1.28953266, + "balance_loss_mlp": 1.02164292, + "epoch": 0.496016834510747, + "flos": 16444636358400.0, + "grad_norm": 1.7453559720161278, + "language_loss": 0.76261747, + "learning_rate": 2.122797874814289e-06, + "loss": 0.78769672, + "num_input_tokens_seen": 177378580, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.21862793, + "step": 8250, + "time_per_iteration": 2.827531099319458 + }, + { + "auxiliary_loss_clip": 0.01474998, + "auxiliary_loss_mlp": 0.01040743, + "balance_loss_clip": 1.29934514, + "balance_loss_mlp": 1.01960695, + "epoch": 0.496076957763415, + "flos": 23447657669760.0, + "grad_norm": 2.041604718086462, + "language_loss": 0.70400172, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.72915912, + "num_input_tokens_seen": 177398790, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.21118164, + "step": 8251, + "time_per_iteration": 2.856873035430908 + }, + { + "auxiliary_loss_clip": 0.01449576, + "auxiliary_loss_mlp": 0.01031942, + "balance_loss_clip": 1.27898145, + "balance_loss_mlp": 1.01193869, + "epoch": 0.49613708101608295, + "flos": 16918206128640.0, + "grad_norm": 1.7902495305275334, + "language_loss": 0.80099756, + "learning_rate": 2.122020411748461e-06, + "loss": 0.8258127, + "num_input_tokens_seen": 177416515, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.19995117, + "step": 8252, + "time_per_iteration": 2.821040153503418 + }, + { + "auxiliary_loss_clip": 0.0144719, + "auxiliary_loss_mlp": 0.01033769, + "balance_loss_clip": 1.27888441, + "balance_loss_mlp": 1.01239514, + "epoch": 0.4961972042687509, + "flos": 16626837744000.0, + "grad_norm": 1.7331384377033097, + "language_loss": 0.82229441, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.84710401, + "num_input_tokens_seen": 177434425, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.21386719, + "step": 8253, + "time_per_iteration": 2.8722009658813477 + }, + { + "auxiliary_loss_clip": 0.01441682, + "auxiliary_loss_mlp": 0.01036278, + "balance_loss_clip": 1.27177858, + "balance_loss_mlp": 1.01583314, + "epoch": 0.49625732752141893, + "flos": 28970372989440.0, + "grad_norm": 1.6056117500591456, + "language_loss": 0.67625052, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.70103008, + "num_input_tokens_seen": 177459675, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.2043457, + "step": 8254, + "time_per_iteration": 2.951997995376587 + }, + { + "auxiliary_loss_clip": 0.01454627, + "auxiliary_loss_mlp": 0.01036619, + "balance_loss_clip": 1.28163397, + "balance_loss_mlp": 1.01631784, + "epoch": 0.4963174507740869, + "flos": 23122735626240.0, + "grad_norm": 2.47012597429492, + "language_loss": 0.74861062, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.77352309, + "num_input_tokens_seen": 177478895, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.20288086, + "step": 8255, + "time_per_iteration": 2.8855559825897217 + }, + { + "auxiliary_loss_clip": 0.01433998, + "auxiliary_loss_mlp": 0.01040401, + "balance_loss_clip": 1.2641151, + "balance_loss_mlp": 1.01895475, + "epoch": 0.49637757402675486, + "flos": 13925832295680.0, + "grad_norm": 1.7130947821587992, + "language_loss": 0.82077241, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.84551638, + "num_input_tokens_seen": 177494920, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.21435547, + "step": 8256, + "time_per_iteration": 2.819978952407837 + }, + { + "auxiliary_loss_clip": 0.01435752, + "auxiliary_loss_mlp": 0.01034292, + "balance_loss_clip": 1.2685504, + "balance_loss_mlp": 1.01465797, + "epoch": 0.49643769727942283, + "flos": 22318995415680.0, + "grad_norm": 1.8008453746631123, + "language_loss": 0.81647611, + "learning_rate": 2.120076673368901e-06, + "loss": 0.84117651, + "num_input_tokens_seen": 177515455, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.19628906, + "step": 8257, + "time_per_iteration": 2.8695476055145264 + }, + { + "auxiliary_loss_clip": 0.01476536, + "auxiliary_loss_mlp": 0.01037048, + "balance_loss_clip": 1.29646826, + "balance_loss_mlp": 1.01595938, + "epoch": 0.4964978205320908, + "flos": 19509682619520.0, + "grad_norm": 1.8755350827063946, + "language_loss": 0.66822338, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.69335926, + "num_input_tokens_seen": 177534040, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.2109375, + "step": 8258, + "time_per_iteration": 2.864656448364258 + }, + { + "auxiliary_loss_clip": 0.01427335, + "auxiliary_loss_mlp": 0.01034562, + "balance_loss_clip": 1.26166916, + "balance_loss_mlp": 1.01373577, + "epoch": 0.49655794378475876, + "flos": 23445893122560.0, + "grad_norm": 1.4900845845911228, + "language_loss": 0.78127182, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.8058908, + "num_input_tokens_seen": 177554510, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.20825195, + "step": 8259, + "time_per_iteration": 2.9319028854370117 + }, + { + "auxiliary_loss_clip": 0.01444747, + "auxiliary_loss_mlp": 0.01036353, + "balance_loss_clip": 1.27619994, + "balance_loss_mlp": 1.01663613, + "epoch": 0.4966180670374267, + "flos": 26841743251200.0, + "grad_norm": 1.3942112317785031, + "language_loss": 0.79225117, + "learning_rate": 2.1189103755834e-06, + "loss": 0.81706214, + "num_input_tokens_seen": 177575780, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.19714355, + "step": 8260, + "time_per_iteration": 2.873436450958252 + }, + { + "auxiliary_loss_clip": 0.01447524, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.27538347, + "balance_loss_mlp": 1.01239347, + "epoch": 0.4966781902900947, + "flos": 22017220727040.0, + "grad_norm": 4.192326549138425, + "language_loss": 0.77176088, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.79656726, + "num_input_tokens_seen": 177588965, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.20703125, + "step": 8261, + "time_per_iteration": 2.8344788551330566 + }, + { + "auxiliary_loss_clip": 0.01422316, + "auxiliary_loss_mlp": 0.01036194, + "balance_loss_clip": 1.25636733, + "balance_loss_mlp": 1.01667953, + "epoch": 0.49673831354276266, + "flos": 26224457437440.0, + "grad_norm": 1.9080778504008375, + "language_loss": 0.90132582, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.92591095, + "num_input_tokens_seen": 177608425, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.19506836, + "step": 8262, + "time_per_iteration": 4.348931312561035 + }, + { + "auxiliary_loss_clip": 0.01426966, + "auxiliary_loss_mlp": 0.01033044, + "balance_loss_clip": 1.2619108, + "balance_loss_mlp": 1.01273108, + "epoch": 0.4967984367954306, + "flos": 23191878960000.0, + "grad_norm": 1.5031936255966605, + "language_loss": 0.74680853, + "learning_rate": 2.11774403721606e-06, + "loss": 0.77140856, + "num_input_tokens_seen": 177628240, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.203125, + "step": 8263, + "time_per_iteration": 2.8238422870635986 + }, + { + "auxiliary_loss_clip": 0.01456706, + "auxiliary_loss_mlp": 0.01036012, + "balance_loss_clip": 1.28397548, + "balance_loss_mlp": 1.01456594, + "epoch": 0.4968585600480986, + "flos": 19290850928640.0, + "grad_norm": 2.099511989918052, + "language_loss": 0.70824254, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.73316967, + "num_input_tokens_seen": 177645920, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.21447754, + "step": 8264, + "time_per_iteration": 2.853356122970581 + }, + { + "auxiliary_loss_clip": 0.01451987, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.27845979, + "balance_loss_mlp": 1.01362562, + "epoch": 0.49691868330076655, + "flos": 22538958226560.0, + "grad_norm": 1.3277344961355184, + "language_loss": 0.6518262, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.67668021, + "num_input_tokens_seen": 177667185, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.19787598, + "step": 8265, + "time_per_iteration": 3.0164215564727783 + }, + { + "auxiliary_loss_clip": 0.01255557, + "auxiliary_loss_mlp": 0.01050967, + "balance_loss_clip": 1.15312171, + "balance_loss_mlp": 1.02903259, + "epoch": 0.4969788065534345, + "flos": 66610907619840.0, + "grad_norm": 0.8393662098591155, + "language_loss": 0.53520167, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.558267, + "num_input_tokens_seen": 177733020, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.21972656, + "step": 8266, + "time_per_iteration": 3.4696285724639893 + }, + { + "auxiliary_loss_clip": 0.01419455, + "auxiliary_loss_mlp": 0.01036884, + "balance_loss_clip": 1.25695491, + "balance_loss_mlp": 1.01554549, + "epoch": 0.49703892980610254, + "flos": 24069739432320.0, + "grad_norm": 1.6257641741883453, + "language_loss": 0.80075932, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.82532275, + "num_input_tokens_seen": 177753370, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.21337891, + "step": 8267, + "time_per_iteration": 2.9869699478149414 + }, + { + "auxiliary_loss_clip": 0.01441554, + "auxiliary_loss_mlp": 0.01037291, + "balance_loss_clip": 1.27074361, + "balance_loss_mlp": 1.01572561, + "epoch": 0.4970990530587705, + "flos": 29136467226240.0, + "grad_norm": 2.45937698245482, + "language_loss": 0.76137316, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.78616166, + "num_input_tokens_seen": 177771530, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.21569824, + "step": 8268, + "time_per_iteration": 2.99566912651062 + }, + { + "auxiliary_loss_clip": 0.01435114, + "auxiliary_loss_mlp": 0.01035612, + "balance_loss_clip": 1.26569617, + "balance_loss_mlp": 1.01467836, + "epoch": 0.49715917631143847, + "flos": 46042683482880.0, + "grad_norm": 1.8401971237031238, + "language_loss": 0.68623108, + "learning_rate": 2.115411240328073e-06, + "loss": 0.71093833, + "num_input_tokens_seen": 177796355, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.20935059, + "step": 8269, + "time_per_iteration": 3.1610946655273438 + }, + { + "auxiliary_loss_clip": 0.0141601, + "auxiliary_loss_mlp": 0.01031034, + "balance_loss_clip": 1.25186265, + "balance_loss_mlp": 1.01035118, + "epoch": 0.49721929956410643, + "flos": 20200319533440.0, + "grad_norm": 2.3343443519380607, + "language_loss": 0.86134446, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.88581491, + "num_input_tokens_seen": 177814300, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.20666504, + "step": 8270, + "time_per_iteration": 2.8996567726135254 + }, + { + "auxiliary_loss_clip": 0.01446137, + "auxiliary_loss_mlp": 0.01036859, + "balance_loss_clip": 1.27500355, + "balance_loss_mlp": 1.01708269, + "epoch": 0.4972794228167744, + "flos": 21663721952640.0, + "grad_norm": 1.675690900753053, + "language_loss": 0.7135663, + "learning_rate": 2.114633606196899e-06, + "loss": 0.73839629, + "num_input_tokens_seen": 177833615, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.19787598, + "step": 8271, + "time_per_iteration": 2.926745891571045 + }, + { + "auxiliary_loss_clip": 0.01440679, + "auxiliary_loss_mlp": 0.01036005, + "balance_loss_clip": 1.27150142, + "balance_loss_mlp": 1.01607323, + "epoch": 0.49733954606944236, + "flos": 24290380915200.0, + "grad_norm": 1.375068049414241, + "language_loss": 0.7872116, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.8119784, + "num_input_tokens_seen": 177855315, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.19934082, + "step": 8272, + "time_per_iteration": 4.388345241546631 + }, + { + "auxiliary_loss_clip": 0.01435361, + "auxiliary_loss_mlp": 0.01038919, + "balance_loss_clip": 1.26566136, + "balance_loss_mlp": 1.01859379, + "epoch": 0.4973996693221103, + "flos": 37866768485760.0, + "grad_norm": 1.9544743999724805, + "language_loss": 0.6745559, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.69929862, + "num_input_tokens_seen": 177875590, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.203125, + "step": 8273, + "time_per_iteration": 4.518434286117554 + }, + { + "auxiliary_loss_clip": 0.01444345, + "auxiliary_loss_mlp": 0.01034549, + "balance_loss_clip": 1.27587342, + "balance_loss_mlp": 1.01529646, + "epoch": 0.4974597925747783, + "flos": 21371448672000.0, + "grad_norm": 1.7316856689701627, + "language_loss": 0.79017603, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.81496489, + "num_input_tokens_seen": 177894175, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.19250488, + "step": 8274, + "time_per_iteration": 3.0537285804748535 + }, + { + "auxiliary_loss_clip": 0.0144434, + "auxiliary_loss_mlp": 0.01037205, + "balance_loss_clip": 1.27313995, + "balance_loss_mlp": 1.01686776, + "epoch": 0.49751991582744626, + "flos": 30750553388160.0, + "grad_norm": 1.783096844158929, + "language_loss": 0.76313674, + "learning_rate": 2.113078285889493e-06, + "loss": 0.78795213, + "num_input_tokens_seen": 177913920, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.20336914, + "step": 8275, + "time_per_iteration": 2.997716188430786 + }, + { + "auxiliary_loss_clip": 0.0143992, + "auxiliary_loss_mlp": 0.01038369, + "balance_loss_clip": 1.26839519, + "balance_loss_mlp": 1.01742351, + "epoch": 0.4975800390801142, + "flos": 14108621863680.0, + "grad_norm": 3.7241392144646808, + "language_loss": 0.84945071, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.8742336, + "num_input_tokens_seen": 177930425, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.20947266, + "step": 8276, + "time_per_iteration": 2.881439208984375 + }, + { + "auxiliary_loss_clip": 0.01411302, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.24922705, + "balance_loss_mlp": 1.01431727, + "epoch": 0.4976401623327822, + "flos": 24218025200640.0, + "grad_norm": 2.6650933845193134, + "language_loss": 0.70704055, + "learning_rate": 2.112300599949172e-06, + "loss": 0.73149103, + "num_input_tokens_seen": 177949885, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19421387, + "step": 8277, + "time_per_iteration": 2.848639726638794 + }, + { + "auxiliary_loss_clip": 0.01421986, + "auxiliary_loss_mlp": 0.0103462, + "balance_loss_clip": 1.25673127, + "balance_loss_mlp": 1.0144738, + "epoch": 0.49770028558545015, + "flos": 21145242078720.0, + "grad_norm": 1.8007828876973913, + "language_loss": 0.82945389, + "learning_rate": 2.111911750583964e-06, + "loss": 0.85401994, + "num_input_tokens_seen": 177965720, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.20141602, + "step": 8278, + "time_per_iteration": 2.899932861328125 + }, + { + "auxiliary_loss_clip": 0.01442089, + "auxiliary_loss_mlp": 0.01038862, + "balance_loss_clip": 1.27081096, + "balance_loss_mlp": 1.0185008, + "epoch": 0.4977604088381181, + "flos": 16772680293120.0, + "grad_norm": 2.6748067334441394, + "language_loss": 0.68616974, + "learning_rate": 2.111522896975052e-06, + "loss": 0.71097928, + "num_input_tokens_seen": 177983190, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.20349121, + "step": 8279, + "time_per_iteration": 2.8194363117218018 + }, + { + "auxiliary_loss_clip": 0.01425069, + "auxiliary_loss_mlp": 0.0103607, + "balance_loss_clip": 1.25653839, + "balance_loss_mlp": 1.01403987, + "epoch": 0.49782053209078614, + "flos": 15711396825600.0, + "grad_norm": 2.0199987144363147, + "language_loss": 0.71132362, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.73593497, + "num_input_tokens_seen": 178000155, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.22033691, + "step": 8280, + "time_per_iteration": 2.858469247817993 + }, + { + "auxiliary_loss_clip": 0.01429725, + "auxiliary_loss_mlp": 0.01034174, + "balance_loss_clip": 1.26182532, + "balance_loss_mlp": 1.01289487, + "epoch": 0.4978806553434541, + "flos": 24764357888640.0, + "grad_norm": 1.6703510521572618, + "language_loss": 0.65404719, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.67868614, + "num_input_tokens_seen": 178021060, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.21289062, + "step": 8281, + "time_per_iteration": 2.8670592308044434 + }, + { + "auxiliary_loss_clip": 0.01442373, + "auxiliary_loss_mlp": 0.01034275, + "balance_loss_clip": 1.2708981, + "balance_loss_mlp": 1.01267481, + "epoch": 0.49794077859612207, + "flos": 13123630408320.0, + "grad_norm": 1.9080773403449358, + "language_loss": 0.74080968, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.76557618, + "num_input_tokens_seen": 178038180, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.21618652, + "step": 8282, + "time_per_iteration": 2.8007819652557373 + }, + { + "auxiliary_loss_clip": 0.01428167, + "auxiliary_loss_mlp": 0.01035919, + "balance_loss_clip": 1.26124001, + "balance_loss_mlp": 1.01644051, + "epoch": 0.49800090184879003, + "flos": 27536135483520.0, + "grad_norm": 1.60909637287516, + "language_loss": 0.7381171, + "learning_rate": 2.109967440397263e-06, + "loss": 0.7627579, + "num_input_tokens_seen": 178057565, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.19470215, + "step": 8283, + "time_per_iteration": 2.9009766578674316 + }, + { + "auxiliary_loss_clip": 0.01416575, + "auxiliary_loss_mlp": 0.01035799, + "balance_loss_clip": 1.2510376, + "balance_loss_mlp": 1.01490188, + "epoch": 0.498061025101458, + "flos": 19802091634560.0, + "grad_norm": 1.5658581064192356, + "language_loss": 0.7998445, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.82436824, + "num_input_tokens_seen": 178076965, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20910645, + "step": 8284, + "time_per_iteration": 2.826444387435913 + }, + { + "auxiliary_loss_clip": 0.01445208, + "auxiliary_loss_mlp": 0.01039723, + "balance_loss_clip": 1.27169049, + "balance_loss_mlp": 1.01856315, + "epoch": 0.49812114835412596, + "flos": 29904798741120.0, + "grad_norm": 1.6118923491231278, + "language_loss": 0.74326092, + "learning_rate": 2.109189687029526e-06, + "loss": 0.76811022, + "num_input_tokens_seen": 178095105, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.21154785, + "step": 8285, + "time_per_iteration": 2.9164702892303467 + }, + { + "auxiliary_loss_clip": 0.01439567, + "auxiliary_loss_mlp": 0.01037054, + "balance_loss_clip": 1.27078593, + "balance_loss_mlp": 1.01503563, + "epoch": 0.49818127160679393, + "flos": 23156967957120.0, + "grad_norm": 3.144371952258391, + "language_loss": 0.74601632, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.77078259, + "num_input_tokens_seen": 178114505, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.22033691, + "step": 8286, + "time_per_iteration": 2.8582475185394287 + }, + { + "auxiliary_loss_clip": 0.0143773, + "auxiliary_loss_mlp": 0.0104109, + "balance_loss_clip": 1.26827812, + "balance_loss_mlp": 1.01973963, + "epoch": 0.4982413948594619, + "flos": 21662771811840.0, + "grad_norm": 1.6482529627279956, + "language_loss": 0.86190414, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.88669229, + "num_input_tokens_seen": 178131595, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.21337891, + "step": 8287, + "time_per_iteration": 2.893589496612549 + }, + { + "auxiliary_loss_clip": 0.01441928, + "auxiliary_loss_mlp": 0.01036889, + "balance_loss_clip": 1.27151847, + "balance_loss_mlp": 1.01568198, + "epoch": 0.49830151811212986, + "flos": 32499216144000.0, + "grad_norm": 1.9828190599963893, + "language_loss": 0.72941405, + "learning_rate": 2.108023025961159e-06, + "loss": 0.75420225, + "num_input_tokens_seen": 178152055, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.2121582, + "step": 8288, + "time_per_iteration": 3.0443592071533203 + }, + { + "auxiliary_loss_clip": 0.01443297, + "auxiliary_loss_mlp": 0.01043825, + "balance_loss_clip": 1.27109027, + "balance_loss_mlp": 1.02138948, + "epoch": 0.4983616413647978, + "flos": 18149384396160.0, + "grad_norm": 5.924420055047721, + "language_loss": 0.82203668, + "learning_rate": 2.10763413072622e-06, + "loss": 0.84690785, + "num_input_tokens_seen": 178168150, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.22424316, + "step": 8289, + "time_per_iteration": 2.8993639945983887 + }, + { + "auxiliary_loss_clip": 0.01427974, + "auxiliary_loss_mlp": 0.01038902, + "balance_loss_clip": 1.25956798, + "balance_loss_mlp": 1.01740849, + "epoch": 0.4984217646174658, + "flos": 19728242841600.0, + "grad_norm": 2.472707562387102, + "language_loss": 0.74925655, + "learning_rate": 2.107245231409784e-06, + "loss": 0.7739253, + "num_input_tokens_seen": 178186150, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.21496582, + "step": 8290, + "time_per_iteration": 2.829845428466797 + }, + { + "auxiliary_loss_clip": 0.01448934, + "auxiliary_loss_mlp": 0.01035063, + "balance_loss_clip": 1.27721047, + "balance_loss_mlp": 1.0117929, + "epoch": 0.49848188787013376, + "flos": 24947373680640.0, + "grad_norm": 1.5034216334358055, + "language_loss": 0.84753066, + "learning_rate": 2.106856328026598e-06, + "loss": 0.8723706, + "num_input_tokens_seen": 178207665, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.23291016, + "step": 8291, + "time_per_iteration": 2.8804898262023926 + }, + { + "auxiliary_loss_clip": 0.01450105, + "auxiliary_loss_mlp": 0.01041039, + "balance_loss_clip": 1.27495492, + "balance_loss_mlp": 1.01909232, + "epoch": 0.4985420111228017, + "flos": 22392572739840.0, + "grad_norm": 1.7168282525449434, + "language_loss": 0.67791671, + "learning_rate": 2.106467420591409e-06, + "loss": 0.70282817, + "num_input_tokens_seen": 178226325, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.21948242, + "step": 8292, + "time_per_iteration": 2.908088445663452 + }, + { + "auxiliary_loss_clip": 0.01426437, + "auxiliary_loss_mlp": 0.01040071, + "balance_loss_clip": 1.25881827, + "balance_loss_mlp": 1.01805258, + "epoch": 0.4986021343754697, + "flos": 16225035505920.0, + "grad_norm": 1.6382674749987316, + "language_loss": 0.67546499, + "learning_rate": 2.106078509118965e-06, + "loss": 0.70013011, + "num_input_tokens_seen": 178244960, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.22021484, + "step": 8293, + "time_per_iteration": 2.895575761795044 + }, + { + "auxiliary_loss_clip": 0.01449054, + "auxiliary_loss_mlp": 0.01034604, + "balance_loss_clip": 1.27714503, + "balance_loss_mlp": 1.01451683, + "epoch": 0.4986622576281377, + "flos": 23413877786880.0, + "grad_norm": 2.9146647437491295, + "language_loss": 0.83533001, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.86016655, + "num_input_tokens_seen": 178265400, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.20068359, + "step": 8294, + "time_per_iteration": 2.8390579223632812 + }, + { + "auxiliary_loss_clip": 0.01444253, + "auxiliary_loss_mlp": 0.01040601, + "balance_loss_clip": 1.27491736, + "balance_loss_mlp": 1.0187974, + "epoch": 0.49872238088080567, + "flos": 19984112040960.0, + "grad_norm": 1.849717484645562, + "language_loss": 0.74261612, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.76746464, + "num_input_tokens_seen": 178284535, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.21801758, + "step": 8295, + "time_per_iteration": 2.8709442615509033 + }, + { + "auxiliary_loss_clip": 0.01428613, + "auxiliary_loss_mlp": 0.01037078, + "balance_loss_clip": 1.26084995, + "balance_loss_mlp": 1.0164665, + "epoch": 0.49878250413347364, + "flos": 22903225263360.0, + "grad_norm": 1.835415707141508, + "language_loss": 0.68591869, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.71057564, + "num_input_tokens_seen": 178302425, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.20617676, + "step": 8296, + "time_per_iteration": 4.269610404968262 + }, + { + "auxiliary_loss_clip": 0.01437504, + "auxiliary_loss_mlp": 0.01037026, + "balance_loss_clip": 1.26491022, + "balance_loss_mlp": 1.01560402, + "epoch": 0.4988426273861416, + "flos": 32611233565440.0, + "grad_norm": 1.7879994703510005, + "language_loss": 0.65337014, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.67811543, + "num_input_tokens_seen": 178323065, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.2142334, + "step": 8297, + "time_per_iteration": 2.914956569671631 + }, + { + "auxiliary_loss_clip": 0.01422897, + "auxiliary_loss_mlp": 0.0103922, + "balance_loss_clip": 1.25621223, + "balance_loss_mlp": 1.01866806, + "epoch": 0.49890275063880957, + "flos": 20933513821440.0, + "grad_norm": 3.1681861252421295, + "language_loss": 0.70977032, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.73439151, + "num_input_tokens_seen": 178343985, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.20544434, + "step": 8298, + "time_per_iteration": 2.8613665103912354 + }, + { + "auxiliary_loss_clip": 0.01421609, + "auxiliary_loss_mlp": 0.01038332, + "balance_loss_clip": 1.25400269, + "balance_loss_mlp": 1.01756573, + "epoch": 0.49896287389147753, + "flos": 18633631939200.0, + "grad_norm": 2.106564995235704, + "language_loss": 0.85694665, + "learning_rate": 2.103744956327814e-06, + "loss": 0.88154602, + "num_input_tokens_seen": 178362345, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.20776367, + "step": 8299, + "time_per_iteration": 2.827418327331543 + }, + { + "auxiliary_loss_clip": 0.01439466, + "auxiliary_loss_mlp": 0.01040235, + "balance_loss_clip": 1.26702237, + "balance_loss_mlp": 1.01820505, + "epoch": 0.4990229971441455, + "flos": 24837211296000.0, + "grad_norm": 2.184379209444536, + "language_loss": 0.70049632, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.72529334, + "num_input_tokens_seen": 178383190, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.22045898, + "step": 8300, + "time_per_iteration": 2.910083055496216 + }, + { + "auxiliary_loss_clip": 0.01249317, + "auxiliary_loss_mlp": 0.01044677, + "balance_loss_clip": 1.14495099, + "balance_loss_mlp": 1.01434982, + "epoch": 0.49908312039681346, + "flos": 71417286979200.0, + "grad_norm": 0.760877337354368, + "language_loss": 0.51170915, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.53464913, + "num_input_tokens_seen": 178444250, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.30273438, + "step": 8301, + "time_per_iteration": 3.450291156768799 + }, + { + "auxiliary_loss_clip": 0.01418382, + "auxiliary_loss_mlp": 0.0104422, + "balance_loss_clip": 1.25189281, + "balance_loss_mlp": 1.02269137, + "epoch": 0.4991432436494814, + "flos": 19838179002240.0, + "grad_norm": 2.054574732192301, + "language_loss": 0.84988672, + "learning_rate": 2.102578126623879e-06, + "loss": 0.87451279, + "num_input_tokens_seen": 178463250, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.2154541, + "step": 8302, + "time_per_iteration": 2.8369388580322266 + }, + { + "auxiliary_loss_clip": 0.01418105, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.25159717, + "balance_loss_mlp": 1.01691246, + "epoch": 0.4992033669021494, + "flos": 15130198379520.0, + "grad_norm": 1.8683815165032045, + "language_loss": 0.6985985, + "learning_rate": 2.102189175590024e-06, + "loss": 0.7231679, + "num_input_tokens_seen": 178481340, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.21936035, + "step": 8303, + "time_per_iteration": 2.843846082687378 + }, + { + "auxiliary_loss_clip": 0.01445396, + "auxiliary_loss_mlp": 0.0104289, + "balance_loss_clip": 1.27267957, + "balance_loss_mlp": 1.02159953, + "epoch": 0.49926349015481736, + "flos": 31219327209600.0, + "grad_norm": 2.1933706513510582, + "language_loss": 0.73185921, + "learning_rate": 2.101800220681144e-06, + "loss": 0.756742, + "num_input_tokens_seen": 178501545, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.21313477, + "step": 8304, + "time_per_iteration": 2.958021640777588 + }, + { + "auxiliary_loss_clip": 0.01433485, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.26330113, + "balance_loss_mlp": 1.01714551, + "epoch": 0.4993236134074853, + "flos": 24911060088960.0, + "grad_norm": 2.1325587094168337, + "language_loss": 0.8188889, + "learning_rate": 2.10141126191199e-06, + "loss": 0.84359777, + "num_input_tokens_seen": 178519700, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.20263672, + "step": 8305, + "time_per_iteration": 2.8711745738983154 + }, + { + "auxiliary_loss_clip": 0.01241112, + "auxiliary_loss_mlp": 0.01028444, + "balance_loss_clip": 1.13984132, + "balance_loss_mlp": 0.99868959, + "epoch": 0.4993837366601533, + "flos": 70449108606720.0, + "grad_norm": 0.7122494379314129, + "language_loss": 0.56986904, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.59256458, + "num_input_tokens_seen": 178576740, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.296875, + "step": 8306, + "time_per_iteration": 4.8658154010772705 + }, + { + "auxiliary_loss_clip": 0.01431439, + "auxiliary_loss_mlp": 0.01037779, + "balance_loss_clip": 1.26179361, + "balance_loss_mlp": 1.0157249, + "epoch": 0.4994438599128213, + "flos": 15969754488960.0, + "grad_norm": 6.551209928916245, + "language_loss": 0.83687961, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.86157179, + "num_input_tokens_seen": 178594745, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.2208252, + "step": 8307, + "time_per_iteration": 2.926408052444458 + }, + { + "auxiliary_loss_clip": 0.01437395, + "auxiliary_loss_mlp": 0.01040855, + "balance_loss_clip": 1.2679379, + "balance_loss_mlp": 1.01809788, + "epoch": 0.4995039831654893, + "flos": 27939883248000.0, + "grad_norm": 2.327357043498748, + "language_loss": 0.622298, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.64708054, + "num_input_tokens_seen": 178614110, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.22753906, + "step": 8308, + "time_per_iteration": 4.344104528427124 + }, + { + "auxiliary_loss_clip": 0.01413202, + "auxiliary_loss_mlp": 0.01035104, + "balance_loss_clip": 1.2470181, + "balance_loss_mlp": 1.01262093, + "epoch": 0.49956410641815724, + "flos": 24214903309440.0, + "grad_norm": 1.6647158352699243, + "language_loss": 0.75140208, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.77588511, + "num_input_tokens_seen": 178634170, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.22473145, + "step": 8309, + "time_per_iteration": 4.280550479888916 + }, + { + "auxiliary_loss_clip": 0.01431563, + "auxiliary_loss_mlp": 0.01042151, + "balance_loss_clip": 1.26086056, + "balance_loss_mlp": 1.02082479, + "epoch": 0.4996242296708252, + "flos": 16188721914240.0, + "grad_norm": 2.091737009061471, + "language_loss": 0.80294436, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.82768148, + "num_input_tokens_seen": 178651775, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.21313477, + "step": 8310, + "time_per_iteration": 2.8051228523254395 + }, + { + "auxiliary_loss_clip": 0.01439917, + "auxiliary_loss_mlp": 0.01041415, + "balance_loss_clip": 1.26498699, + "balance_loss_mlp": 1.01942039, + "epoch": 0.49968435292349317, + "flos": 16882344984960.0, + "grad_norm": 1.5368714109456652, + "language_loss": 0.71747857, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.74229193, + "num_input_tokens_seen": 178669720, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.2199707, + "step": 8311, + "time_per_iteration": 2.846593141555786 + }, + { + "auxiliary_loss_clip": 0.01440471, + "auxiliary_loss_mlp": 0.01039297, + "balance_loss_clip": 1.27085924, + "balance_loss_mlp": 1.01788723, + "epoch": 0.49974447617616113, + "flos": 14947770769920.0, + "grad_norm": 1.707881682019575, + "language_loss": 0.77848321, + "learning_rate": 2.098688443679187e-06, + "loss": 0.80328089, + "num_input_tokens_seen": 178686765, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.2142334, + "step": 8312, + "time_per_iteration": 2.8453209400177 + }, + { + "auxiliary_loss_clip": 0.01444967, + "auxiliary_loss_mlp": 0.01040168, + "balance_loss_clip": 1.27275681, + "balance_loss_mlp": 1.01888871, + "epoch": 0.4998045994288291, + "flos": 26662844736000.0, + "grad_norm": 2.4392611082195286, + "language_loss": 0.85239315, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.87724447, + "num_input_tokens_seen": 178705845, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.21289062, + "step": 8313, + "time_per_iteration": 2.8862712383270264 + }, + { + "auxiliary_loss_clip": 0.01425537, + "auxiliary_loss_mlp": 0.01037826, + "balance_loss_clip": 1.2542429, + "balance_loss_mlp": 1.01536691, + "epoch": 0.49986472268149706, + "flos": 20962045307520.0, + "grad_norm": 1.8988266726809413, + "language_loss": 0.8160485, + "learning_rate": 2.097910461710939e-06, + "loss": 0.84068215, + "num_input_tokens_seen": 178723410, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.22460938, + "step": 8314, + "time_per_iteration": 2.8773045539855957 + }, + { + "auxiliary_loss_clip": 0.01434306, + "auxiliary_loss_mlp": 0.01042696, + "balance_loss_clip": 1.26247096, + "balance_loss_mlp": 1.01937854, + "epoch": 0.49992484593416503, + "flos": 22794148753920.0, + "grad_norm": 1.7812668206997266, + "language_loss": 0.80024195, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.82501197, + "num_input_tokens_seen": 178743560, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.23327637, + "step": 8315, + "time_per_iteration": 2.828500986099243 + }, + { + "auxiliary_loss_clip": 0.01433071, + "auxiliary_loss_mlp": 0.01040746, + "balance_loss_clip": 1.26345956, + "balance_loss_mlp": 1.01915729, + "epoch": 0.499984969186833, + "flos": 46802282751360.0, + "grad_norm": 1.674495860334052, + "language_loss": 0.75012171, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.77485991, + "num_input_tokens_seen": 178767225, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.21582031, + "step": 8316, + "time_per_iteration": 3.0383615493774414 + }, + { + "auxiliary_loss_clip": 0.01426462, + "auxiliary_loss_mlp": 0.01034375, + "balance_loss_clip": 1.25919676, + "balance_loss_mlp": 1.01234543, + "epoch": 0.500045092439501, + "flos": 25568007609600.0, + "grad_norm": 1.5517483564985073, + "language_loss": 0.81896949, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.84357786, + "num_input_tokens_seen": 178786810, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.22033691, + "step": 8317, + "time_per_iteration": 2.9274802207946777 + }, + { + "auxiliary_loss_clip": 0.0143092, + "auxiliary_loss_mlp": 0.01036087, + "balance_loss_clip": 1.26013112, + "balance_loss_mlp": 1.01424813, + "epoch": 0.5001052156921689, + "flos": 20714636885760.0, + "grad_norm": 1.766743987489358, + "language_loss": 0.83924854, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.86391866, + "num_input_tokens_seen": 178805660, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.21826172, + "step": 8318, + "time_per_iteration": 2.9248757362365723 + }, + { + "auxiliary_loss_clip": 0.01430774, + "auxiliary_loss_mlp": 0.010353, + "balance_loss_clip": 1.25996411, + "balance_loss_mlp": 1.01348436, + "epoch": 0.500165338944837, + "flos": 21260879084160.0, + "grad_norm": 1.7711534398998237, + "language_loss": 0.8239904, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.84865117, + "num_input_tokens_seen": 178824780, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.21813965, + "step": 8319, + "time_per_iteration": 2.865321159362793 + }, + { + "auxiliary_loss_clip": 0.0142736, + "auxiliary_loss_mlp": 0.01035886, + "balance_loss_clip": 1.25908077, + "balance_loss_mlp": 1.01459491, + "epoch": 0.5002254621975049, + "flos": 27865717741440.0, + "grad_norm": 1.6054604667511614, + "language_loss": 0.72079885, + "learning_rate": 2.095576427171635e-06, + "loss": 0.7454313, + "num_input_tokens_seen": 178845640, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.21289062, + "step": 8320, + "time_per_iteration": 2.8846378326416016 + }, + { + "auxiliary_loss_clip": 0.01456975, + "auxiliary_loss_mlp": 0.01043032, + "balance_loss_clip": 1.27836633, + "balance_loss_mlp": 1.02014375, + "epoch": 0.5002855854501729, + "flos": 15558315108480.0, + "grad_norm": 3.503262700148327, + "language_loss": 0.78053772, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.80553782, + "num_input_tokens_seen": 178862290, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.22888184, + "step": 8321, + "time_per_iteration": 2.80202054977417 + }, + { + "auxiliary_loss_clip": 0.01435109, + "auxiliary_loss_mlp": 0.01042903, + "balance_loss_clip": 1.26499844, + "balance_loss_mlp": 1.02047932, + "epoch": 0.5003457087028408, + "flos": 16116049486080.0, + "grad_norm": 1.8071262296644288, + "language_loss": 0.83707947, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.86185962, + "num_input_tokens_seen": 178879805, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.22436523, + "step": 8322, + "time_per_iteration": 2.8136980533599854 + }, + { + "auxiliary_loss_clip": 0.0143442, + "auxiliary_loss_mlp": 0.01037869, + "balance_loss_clip": 1.26309729, + "balance_loss_mlp": 1.01582706, + "epoch": 0.5004058319555088, + "flos": 22720661919360.0, + "grad_norm": 4.262263903814169, + "language_loss": 0.74692637, + "learning_rate": 2.094409360775228e-06, + "loss": 0.77164924, + "num_input_tokens_seen": 178896985, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.22045898, + "step": 8323, + "time_per_iteration": 2.8452413082122803 + }, + { + "auxiliary_loss_clip": 0.01432632, + "auxiliary_loss_mlp": 0.01036122, + "balance_loss_clip": 1.26146114, + "balance_loss_mlp": 1.01397276, + "epoch": 0.5004659552081767, + "flos": 30130236172800.0, + "grad_norm": 1.6950805850476818, + "language_loss": 0.70290524, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.72759277, + "num_input_tokens_seen": 178920605, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.22155762, + "step": 8324, + "time_per_iteration": 2.9169812202453613 + }, + { + "auxiliary_loss_clip": 0.0144191, + "auxiliary_loss_mlp": 0.01040764, + "balance_loss_clip": 1.27093947, + "balance_loss_mlp": 1.01782775, + "epoch": 0.5005260784608447, + "flos": 18633948652800.0, + "grad_norm": 2.041774308139965, + "language_loss": 0.72617072, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.75099748, + "num_input_tokens_seen": 178937760, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.22937012, + "step": 8325, + "time_per_iteration": 2.817307710647583 + }, + { + "auxiliary_loss_clip": 0.01434499, + "auxiliary_loss_mlp": 0.01043584, + "balance_loss_clip": 1.26249003, + "balance_loss_mlp": 1.02056515, + "epoch": 0.5005862017135126, + "flos": 24869860058880.0, + "grad_norm": 1.897849344792366, + "language_loss": 0.7361055, + "learning_rate": 2.093242262158709e-06, + "loss": 0.76088637, + "num_input_tokens_seen": 178957985, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.22998047, + "step": 8326, + "time_per_iteration": 2.9285545349121094 + }, + { + "auxiliary_loss_clip": 0.01420082, + "auxiliary_loss_mlp": 0.01038718, + "balance_loss_clip": 1.25105786, + "balance_loss_mlp": 1.01678324, + "epoch": 0.5006463249661807, + "flos": 18743703834240.0, + "grad_norm": 1.917761741261309, + "language_loss": 0.78234458, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.80693257, + "num_input_tokens_seen": 178977070, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.21923828, + "step": 8327, + "time_per_iteration": 2.8496129512786865 + }, + { + "auxiliary_loss_clip": 0.01439684, + "auxiliary_loss_mlp": 0.01044486, + "balance_loss_clip": 1.26718092, + "balance_loss_mlp": 1.02182436, + "epoch": 0.5007064482188487, + "flos": 13049872104960.0, + "grad_norm": 2.436914338529368, + "language_loss": 0.88671136, + "learning_rate": 2.092464178710997e-06, + "loss": 0.91155303, + "num_input_tokens_seen": 178994175, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.2265625, + "step": 8328, + "time_per_iteration": 2.8428597450256348 + }, + { + "auxiliary_loss_clip": 0.01438773, + "auxiliary_loss_mlp": 0.01036705, + "balance_loss_clip": 1.26363051, + "balance_loss_mlp": 1.01454365, + "epoch": 0.5007665714715166, + "flos": 21298730999040.0, + "grad_norm": 7.359012599218469, + "language_loss": 0.75390691, + "learning_rate": 2.092075131720388e-06, + "loss": 0.77866167, + "num_input_tokens_seen": 179013710, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.22155762, + "step": 8329, + "time_per_iteration": 2.881842851638794 + }, + { + "auxiliary_loss_clip": 0.01421733, + "auxiliary_loss_mlp": 0.01038836, + "balance_loss_clip": 1.25461721, + "balance_loss_mlp": 1.01756871, + "epoch": 0.5008266947241846, + "flos": 29767145500800.0, + "grad_norm": 1.8049434047077915, + "language_loss": 0.80163586, + "learning_rate": 2.091686081238281e-06, + "loss": 0.82624161, + "num_input_tokens_seen": 179035255, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.21264648, + "step": 8330, + "time_per_iteration": 2.9144208431243896 + }, + { + "auxiliary_loss_clip": 0.01232396, + "auxiliary_loss_mlp": 0.01040897, + "balance_loss_clip": 1.13772273, + "balance_loss_mlp": 1.01209593, + "epoch": 0.5008868179768525, + "flos": 63583695290880.0, + "grad_norm": 0.7328523067955023, + "language_loss": 0.56205106, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.58478397, + "num_input_tokens_seen": 179090915, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.28710938, + "step": 8331, + "time_per_iteration": 4.544417142868042 + }, + { + "auxiliary_loss_clip": 0.01419724, + "auxiliary_loss_mlp": 0.01035248, + "balance_loss_clip": 1.25297618, + "balance_loss_mlp": 1.01390958, + "epoch": 0.5009469412295205, + "flos": 27385677964800.0, + "grad_norm": 1.898428778583623, + "language_loss": 0.65698028, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.68153, + "num_input_tokens_seen": 179109160, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.21337891, + "step": 8332, + "time_per_iteration": 2.89968204498291 + }, + { + "auxiliary_loss_clip": 0.01427431, + "auxiliary_loss_mlp": 0.01035538, + "balance_loss_clip": 1.25928521, + "balance_loss_mlp": 1.01371098, + "epoch": 0.5010070644821885, + "flos": 27389704752000.0, + "grad_norm": 1.8582003345615024, + "language_loss": 0.75873542, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.78336513, + "num_input_tokens_seen": 179130610, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.21838379, + "step": 8333, + "time_per_iteration": 2.9078688621520996 + }, + { + "auxiliary_loss_clip": 0.01440911, + "auxiliary_loss_mlp": 0.01038295, + "balance_loss_clip": 1.2676717, + "balance_loss_mlp": 1.01746917, + "epoch": 0.5010671877348565, + "flos": 20671174615680.0, + "grad_norm": 2.239478943239947, + "language_loss": 0.81441379, + "learning_rate": 2.090129844689929e-06, + "loss": 0.8392058, + "num_input_tokens_seen": 179147860, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.20837402, + "step": 8334, + "time_per_iteration": 2.8509902954101562 + }, + { + "auxiliary_loss_clip": 0.01232857, + "auxiliary_loss_mlp": 0.01032621, + "balance_loss_clip": 1.1400075, + "balance_loss_mlp": 1.00572705, + "epoch": 0.5011273109875244, + "flos": 59159092705920.0, + "grad_norm": 0.9451909104062545, + "language_loss": 0.62703294, + "learning_rate": 2.089740776971626e-06, + "loss": 0.64968771, + "num_input_tokens_seen": 179210490, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.26953125, + "step": 8335, + "time_per_iteration": 3.262941360473633 + }, + { + "auxiliary_loss_clip": 0.01418709, + "auxiliary_loss_mlp": 0.01036411, + "balance_loss_clip": 1.25121832, + "balance_loss_mlp": 1.01554966, + "epoch": 0.5011874342401924, + "flos": 25346868433920.0, + "grad_norm": 1.5039593143502603, + "language_loss": 0.80683959, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.83139074, + "num_input_tokens_seen": 179231360, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.20861816, + "step": 8336, + "time_per_iteration": 2.8914999961853027 + }, + { + "auxiliary_loss_clip": 0.0142828, + "auxiliary_loss_mlp": 0.01037039, + "balance_loss_clip": 1.25797904, + "balance_loss_mlp": 1.01483059, + "epoch": 0.5012475574928603, + "flos": 20239574037120.0, + "grad_norm": 1.7934573804457412, + "language_loss": 0.80807811, + "learning_rate": 2.088962631340836e-06, + "loss": 0.83273131, + "num_input_tokens_seen": 179250625, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.2220459, + "step": 8337, + "time_per_iteration": 2.831033945083618 + }, + { + "auxiliary_loss_clip": 0.01453022, + "auxiliary_loss_mlp": 0.01036245, + "balance_loss_clip": 1.27635765, + "balance_loss_mlp": 1.01429844, + "epoch": 0.5013076807455283, + "flos": 22720164226560.0, + "grad_norm": 2.027587609567849, + "language_loss": 0.79567397, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.82056665, + "num_input_tokens_seen": 179267360, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.21948242, + "step": 8338, + "time_per_iteration": 2.8648276329040527 + }, + { + "auxiliary_loss_clip": 0.01439613, + "auxiliary_loss_mlp": 0.01037004, + "balance_loss_clip": 1.26722431, + "balance_loss_mlp": 1.01505804, + "epoch": 0.5013678039981962, + "flos": 24256284318720.0, + "grad_norm": 1.7437196791305, + "language_loss": 0.85392749, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.87869364, + "num_input_tokens_seen": 179289810, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.21948242, + "step": 8339, + "time_per_iteration": 2.9259607791900635 + }, + { + "auxiliary_loss_clip": 0.01432415, + "auxiliary_loss_mlp": 0.01039256, + "balance_loss_clip": 1.26185846, + "balance_loss_mlp": 1.01697576, + "epoch": 0.5014279272508643, + "flos": 26187736642560.0, + "grad_norm": 2.1778429943543873, + "language_loss": 0.71739352, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.74211025, + "num_input_tokens_seen": 179310620, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.22265625, + "step": 8340, + "time_per_iteration": 2.8991920948028564 + }, + { + "auxiliary_loss_clip": 0.01441785, + "auxiliary_loss_mlp": 0.01040238, + "balance_loss_clip": 1.26655209, + "balance_loss_mlp": 1.01758814, + "epoch": 0.5014880505035323, + "flos": 21439460885760.0, + "grad_norm": 3.9456704509452165, + "language_loss": 0.79154027, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.81636047, + "num_input_tokens_seen": 179329005, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.22644043, + "step": 8341, + "time_per_iteration": 4.2436957359313965 + }, + { + "auxiliary_loss_clip": 0.01441016, + "auxiliary_loss_mlp": 0.01033547, + "balance_loss_clip": 1.26589894, + "balance_loss_mlp": 1.01200628, + "epoch": 0.5015481737562002, + "flos": 15777146799360.0, + "grad_norm": 2.873417764662513, + "language_loss": 0.90044016, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.9251858, + "num_input_tokens_seen": 179343785, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.2154541, + "step": 8342, + "time_per_iteration": 2.833301305770874 + }, + { + "auxiliary_loss_clip": 0.01436643, + "auxiliary_loss_mlp": 0.0103788, + "balance_loss_clip": 1.26528335, + "balance_loss_mlp": 1.01554072, + "epoch": 0.5016082970088682, + "flos": 26841336048000.0, + "grad_norm": 1.8610268312154412, + "language_loss": 0.77346671, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.79821193, + "num_input_tokens_seen": 179364070, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.22338867, + "step": 8343, + "time_per_iteration": 4.2628302574157715 + }, + { + "auxiliary_loss_clip": 0.01420901, + "auxiliary_loss_mlp": 0.01034526, + "balance_loss_clip": 1.25267267, + "balance_loss_mlp": 1.01278234, + "epoch": 0.5016684202615361, + "flos": 21480570426240.0, + "grad_norm": 1.9044178418421396, + "language_loss": 0.68900883, + "learning_rate": 2.086239016143293e-06, + "loss": 0.71356308, + "num_input_tokens_seen": 179384225, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.21740723, + "step": 8344, + "time_per_iteration": 4.295549154281616 + }, + { + "auxiliary_loss_clip": 0.01436443, + "auxiliary_loss_mlp": 0.01041306, + "balance_loss_clip": 1.26479948, + "balance_loss_mlp": 1.02014661, + "epoch": 0.5017285435142042, + "flos": 26257151445120.0, + "grad_norm": 1.882164795888667, + "language_loss": 0.76184022, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.78661776, + "num_input_tokens_seen": 179402595, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.21154785, + "step": 8345, + "time_per_iteration": 2.8923771381378174 + }, + { + "auxiliary_loss_clip": 0.01428523, + "auxiliary_loss_mlp": 0.01039034, + "balance_loss_clip": 1.25782871, + "balance_loss_mlp": 1.01553822, + "epoch": 0.5017886667668721, + "flos": 20787399803520.0, + "grad_norm": 2.205529519438255, + "language_loss": 0.79466939, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.81934494, + "num_input_tokens_seen": 179419635, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.23510742, + "step": 8346, + "time_per_iteration": 2.849806547164917 + }, + { + "auxiliary_loss_clip": 0.01428157, + "auxiliary_loss_mlp": 0.01042725, + "balance_loss_clip": 1.25794113, + "balance_loss_mlp": 1.02036142, + "epoch": 0.5018487900195401, + "flos": 20165860978560.0, + "grad_norm": 1.5567607335147207, + "language_loss": 0.70103025, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.72573912, + "num_input_tokens_seen": 179438770, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.22351074, + "step": 8347, + "time_per_iteration": 2.8379967212677 + }, + { + "auxiliary_loss_clip": 0.01430398, + "auxiliary_loss_mlp": 0.01039218, + "balance_loss_clip": 1.25721264, + "balance_loss_mlp": 1.01733065, + "epoch": 0.501908913272208, + "flos": 18159881189760.0, + "grad_norm": 2.0254496702502913, + "language_loss": 0.72951692, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.75421309, + "num_input_tokens_seen": 179457475, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.21887207, + "step": 8348, + "time_per_iteration": 2.908296585083008 + }, + { + "auxiliary_loss_clip": 0.0142332, + "auxiliary_loss_mlp": 0.01041447, + "balance_loss_clip": 1.25820565, + "balance_loss_mlp": 1.01967943, + "epoch": 0.501969036524876, + "flos": 23122645136640.0, + "grad_norm": 1.5572992414891362, + "language_loss": 0.75204551, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.77669322, + "num_input_tokens_seen": 179478140, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.21765137, + "step": 8349, + "time_per_iteration": 2.933507204055786 + }, + { + "auxiliary_loss_clip": 0.01432036, + "auxiliary_loss_mlp": 0.01043921, + "balance_loss_clip": 1.25758529, + "balance_loss_mlp": 1.02087808, + "epoch": 0.5020291597775439, + "flos": 11370352682880.0, + "grad_norm": 2.4303316981335676, + "language_loss": 0.6523363, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.67709589, + "num_input_tokens_seen": 179494325, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.23059082, + "step": 8350, + "time_per_iteration": 2.9370200634002686 + }, + { + "auxiliary_loss_clip": 0.0124238, + "auxiliary_loss_mlp": 0.01039403, + "balance_loss_clip": 1.14899004, + "balance_loss_mlp": 1.01403487, + "epoch": 0.5020892830302119, + "flos": 64041085405440.0, + "grad_norm": 0.8333635180892747, + "language_loss": 0.59762996, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.62044781, + "num_input_tokens_seen": 179553545, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.25390625, + "step": 8351, + "time_per_iteration": 3.484189510345459 + }, + { + "auxiliary_loss_clip": 0.01435046, + "auxiliary_loss_mlp": 0.01041848, + "balance_loss_clip": 1.26181102, + "balance_loss_mlp": 1.02018726, + "epoch": 0.5021494062828799, + "flos": 23743460044800.0, + "grad_norm": 1.7810241418359718, + "language_loss": 0.75985932, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.78462833, + "num_input_tokens_seen": 179573645, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.21655273, + "step": 8352, + "time_per_iteration": 2.967494487762451 + }, + { + "auxiliary_loss_clip": 0.01442221, + "auxiliary_loss_mlp": 0.01044628, + "balance_loss_clip": 1.26980519, + "balance_loss_mlp": 1.02139437, + "epoch": 0.5022095295355479, + "flos": 21585891617280.0, + "grad_norm": 2.7827969205692487, + "language_loss": 0.72362196, + "learning_rate": 2.082736990429464e-06, + "loss": 0.74849045, + "num_input_tokens_seen": 179591435, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.23242188, + "step": 8353, + "time_per_iteration": 2.883181095123291 + }, + { + "auxiliary_loss_clip": 0.01444916, + "auxiliary_loss_mlp": 0.01045694, + "balance_loss_clip": 1.27311563, + "balance_loss_mlp": 1.02310359, + "epoch": 0.5022696527882159, + "flos": 21407083591680.0, + "grad_norm": 7.2227188232481385, + "language_loss": 0.75211072, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.77701682, + "num_input_tokens_seen": 179609955, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.22595215, + "step": 8354, + "time_per_iteration": 2.8617396354675293 + }, + { + "auxiliary_loss_clip": 0.01429783, + "auxiliary_loss_mlp": 0.01044013, + "balance_loss_clip": 1.25905633, + "balance_loss_mlp": 1.02172089, + "epoch": 0.5023297760408838, + "flos": 27171732712320.0, + "grad_norm": 1.5741106832654135, + "language_loss": 0.73152691, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.75626487, + "num_input_tokens_seen": 179630875, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.22302246, + "step": 8355, + "time_per_iteration": 2.8915348052978516 + }, + { + "auxiliary_loss_clip": 0.01443721, + "auxiliary_loss_mlp": 0.01046109, + "balance_loss_clip": 1.26813173, + "balance_loss_mlp": 1.0241034, + "epoch": 0.5023898992935518, + "flos": 26225045619840.0, + "grad_norm": 1.5418048718225648, + "language_loss": 0.81866723, + "learning_rate": 2.081569591520548e-06, + "loss": 0.84356552, + "num_input_tokens_seen": 179649835, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.22009277, + "step": 8356, + "time_per_iteration": 2.902352809906006 + }, + { + "auxiliary_loss_clip": 0.01458082, + "auxiliary_loss_mlp": 0.01053163, + "balance_loss_clip": 1.27779138, + "balance_loss_mlp": 1.03072739, + "epoch": 0.5024500225462197, + "flos": 13447330842240.0, + "grad_norm": 2.101459375771465, + "language_loss": 0.77415884, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.79927129, + "num_input_tokens_seen": 179667605, + "router_z_loss_clip": 1.80371094, + "router_z_loss_mlp": 0.22424316, + "step": 8357, + "time_per_iteration": 2.825868844985962 + }, + { + "auxiliary_loss_clip": 0.01433006, + "auxiliary_loss_mlp": 0.01050121, + "balance_loss_clip": 1.26100099, + "balance_loss_mlp": 1.02773321, + "epoch": 0.5025101457988878, + "flos": 21589465956480.0, + "grad_norm": 1.9134179683542836, + "language_loss": 0.77015704, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.79498827, + "num_input_tokens_seen": 179686910, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.22387695, + "step": 8358, + "time_per_iteration": 2.8802406787872314 + }, + { + "auxiliary_loss_clip": 0.01428985, + "auxiliary_loss_mlp": 0.01050253, + "balance_loss_clip": 1.25771677, + "balance_loss_mlp": 1.02813959, + "epoch": 0.5025702690515557, + "flos": 24656005296000.0, + "grad_norm": 4.333566527318841, + "language_loss": 0.73652041, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.76131284, + "num_input_tokens_seen": 179706395, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.22119141, + "step": 8359, + "time_per_iteration": 2.885776996612549 + }, + { + "auxiliary_loss_clip": 0.01421508, + "auxiliary_loss_mlp": 0.01048189, + "balance_loss_clip": 1.25361586, + "balance_loss_mlp": 1.02464533, + "epoch": 0.5026303923042237, + "flos": 22100118480000.0, + "grad_norm": 1.5868764172211098, + "language_loss": 0.77943122, + "learning_rate": 2.080013016407077e-06, + "loss": 0.80412817, + "num_input_tokens_seen": 179725735, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.23547363, + "step": 8360, + "time_per_iteration": 2.896709442138672 + }, + { + "auxiliary_loss_clip": 0.01422607, + "auxiliary_loss_mlp": 0.0104435, + "balance_loss_clip": 1.25362515, + "balance_loss_mlp": 1.02217746, + "epoch": 0.5026905155568916, + "flos": 23708006104320.0, + "grad_norm": 2.01962387853291, + "language_loss": 0.7792362, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.80390573, + "num_input_tokens_seen": 179746150, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.22180176, + "step": 8361, + "time_per_iteration": 2.910471200942993 + }, + { + "auxiliary_loss_clip": 0.01444824, + "auxiliary_loss_mlp": 0.01049626, + "balance_loss_clip": 1.27007318, + "balance_loss_mlp": 1.02686882, + "epoch": 0.5027506388095596, + "flos": 25823514850560.0, + "grad_norm": 2.734174263147762, + "language_loss": 0.85745144, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.88239592, + "num_input_tokens_seen": 179767550, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.22753906, + "step": 8362, + "time_per_iteration": 2.9095380306243896 + }, + { + "auxiliary_loss_clip": 0.01445046, + "auxiliary_loss_mlp": 0.01042284, + "balance_loss_clip": 1.27067614, + "balance_loss_mlp": 1.02040887, + "epoch": 0.5028107620622275, + "flos": 27537990520320.0, + "grad_norm": 1.6873138741466056, + "language_loss": 0.79124451, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.81611782, + "num_input_tokens_seen": 179790075, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.21875, + "step": 8363, + "time_per_iteration": 2.899487257003784 + }, + { + "auxiliary_loss_clip": 0.01416399, + "auxiliary_loss_mlp": 0.01043382, + "balance_loss_clip": 1.25100303, + "balance_loss_mlp": 1.02056527, + "epoch": 0.5028708853148955, + "flos": 24544757036160.0, + "grad_norm": 3.7990575610916943, + "language_loss": 0.7623589, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.78695667, + "num_input_tokens_seen": 179806515, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.22802734, + "step": 8364, + "time_per_iteration": 2.8654627799987793 + }, + { + "auxiliary_loss_clip": 0.01421345, + "auxiliary_loss_mlp": 0.01042622, + "balance_loss_clip": 1.25234365, + "balance_loss_mlp": 1.02053297, + "epoch": 0.5029310085675635, + "flos": 20823351436800.0, + "grad_norm": 1.686707570182686, + "language_loss": 0.70426142, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.72890115, + "num_input_tokens_seen": 179826450, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.22094727, + "step": 8365, + "time_per_iteration": 2.8564751148223877 + }, + { + "auxiliary_loss_clip": 0.014395, + "auxiliary_loss_mlp": 0.01044698, + "balance_loss_clip": 1.2637136, + "balance_loss_mlp": 1.02196527, + "epoch": 0.5029911318202315, + "flos": 22351689423360.0, + "grad_norm": 1.688128288618669, + "language_loss": 0.73786002, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.76270205, + "num_input_tokens_seen": 179846770, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.22729492, + "step": 8366, + "time_per_iteration": 4.360262155532837 + }, + { + "auxiliary_loss_clip": 0.01426791, + "auxiliary_loss_mlp": 0.01042081, + "balance_loss_clip": 1.25742722, + "balance_loss_mlp": 1.02061117, + "epoch": 0.5030512550728995, + "flos": 24363370056960.0, + "grad_norm": 1.4687611303897505, + "language_loss": 0.78595769, + "learning_rate": 2.077288893713735e-06, + "loss": 0.81064647, + "num_input_tokens_seen": 179866585, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.21459961, + "step": 8367, + "time_per_iteration": 2.8941681385040283 + }, + { + "auxiliary_loss_clip": 0.01420454, + "auxiliary_loss_mlp": 0.01042896, + "balance_loss_clip": 1.25170803, + "balance_loss_mlp": 1.02140272, + "epoch": 0.5031113783255674, + "flos": 18269093433600.0, + "grad_norm": 1.7787403888400009, + "language_loss": 0.71134865, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.73598206, + "num_input_tokens_seen": 179885575, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.21484375, + "step": 8368, + "time_per_iteration": 2.8883469104766846 + }, + { + "auxiliary_loss_clip": 0.01236679, + "auxiliary_loss_mlp": 0.01040436, + "balance_loss_clip": 1.14295673, + "balance_loss_mlp": 1.01182604, + "epoch": 0.5031715015782354, + "flos": 57279319223040.0, + "grad_norm": 0.9125837126252, + "language_loss": 0.63345504, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.65622616, + "num_input_tokens_seen": 179939650, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.28515625, + "step": 8369, + "time_per_iteration": 3.3405072689056396 + }, + { + "auxiliary_loss_clip": 0.01423889, + "auxiliary_loss_mlp": 0.01041004, + "balance_loss_clip": 1.25612283, + "balance_loss_mlp": 1.01937985, + "epoch": 0.5032316248309033, + "flos": 27538940661120.0, + "grad_norm": 2.1570513528981303, + "language_loss": 0.60673159, + "learning_rate": 2.076121368302263e-06, + "loss": 0.6313805, + "num_input_tokens_seen": 179961765, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.21630859, + "step": 8370, + "time_per_iteration": 2.966266393661499 + }, + { + "auxiliary_loss_clip": 0.01431099, + "auxiliary_loss_mlp": 0.01042771, + "balance_loss_clip": 1.2576834, + "balance_loss_mlp": 1.02038372, + "epoch": 0.5032917480835714, + "flos": 34509132230400.0, + "grad_norm": 1.6131214806494696, + "language_loss": 0.69009441, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.71483308, + "num_input_tokens_seen": 179983015, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.22375488, + "step": 8371, + "time_per_iteration": 2.98302960395813 + }, + { + "auxiliary_loss_clip": 0.01430982, + "auxiliary_loss_mlp": 0.01044839, + "balance_loss_clip": 1.26059437, + "balance_loss_mlp": 1.02108026, + "epoch": 0.5033518713362393, + "flos": 33669892834560.0, + "grad_norm": 1.6924884608175534, + "language_loss": 0.68739903, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.71215725, + "num_input_tokens_seen": 180003210, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.2376709, + "step": 8372, + "time_per_iteration": 2.9893484115600586 + }, + { + "auxiliary_loss_clip": 0.01439924, + "auxiliary_loss_mlp": 0.01038915, + "balance_loss_clip": 1.26675034, + "balance_loss_mlp": 1.01575279, + "epoch": 0.5034119945889073, + "flos": 28197607484160.0, + "grad_norm": 1.8090425379098323, + "language_loss": 0.67605072, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.70083916, + "num_input_tokens_seen": 180025530, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.23156738, + "step": 8373, + "time_per_iteration": 2.9488015174865723 + }, + { + "auxiliary_loss_clip": 0.01428046, + "auxiliary_loss_mlp": 0.01038565, + "balance_loss_clip": 1.25696802, + "balance_loss_mlp": 1.01639235, + "epoch": 0.5034721178415752, + "flos": 21368010067200.0, + "grad_norm": 1.7364474658196598, + "language_loss": 0.75180936, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.77647543, + "num_input_tokens_seen": 180043180, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.22155762, + "step": 8374, + "time_per_iteration": 2.8580431938171387 + }, + { + "auxiliary_loss_clip": 0.01441762, + "auxiliary_loss_mlp": 0.010413, + "balance_loss_clip": 1.26911676, + "balance_loss_mlp": 1.02021158, + "epoch": 0.5035322410942432, + "flos": 22685162734080.0, + "grad_norm": 2.9473415558959317, + "language_loss": 0.69214183, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.71697247, + "num_input_tokens_seen": 180062905, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.21081543, + "step": 8375, + "time_per_iteration": 2.862151622772217 + }, + { + "auxiliary_loss_clip": 0.01451138, + "auxiliary_loss_mlp": 0.01036032, + "balance_loss_clip": 1.27640116, + "balance_loss_mlp": 1.0144192, + "epoch": 0.5035923643469111, + "flos": 19838269491840.0, + "grad_norm": 1.7768975137579637, + "language_loss": 0.79972851, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.82460022, + "num_input_tokens_seen": 180082000, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.21606445, + "step": 8376, + "time_per_iteration": 2.8581931591033936 + }, + { + "auxiliary_loss_clip": 0.01447988, + "auxiliary_loss_mlp": 0.0103802, + "balance_loss_clip": 1.27215457, + "balance_loss_mlp": 1.01532292, + "epoch": 0.5036524875995791, + "flos": 30525794628480.0, + "grad_norm": 1.872353408818428, + "language_loss": 0.60490125, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.62976134, + "num_input_tokens_seen": 180101340, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.22692871, + "step": 8377, + "time_per_iteration": 4.293928146362305 + }, + { + "auxiliary_loss_clip": 0.01432364, + "auxiliary_loss_mlp": 0.01037567, + "balance_loss_clip": 1.26219702, + "balance_loss_mlp": 1.01482129, + "epoch": 0.5037126108522471, + "flos": 14728758099840.0, + "grad_norm": 1.873481486683625, + "language_loss": 0.77001822, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.79471749, + "num_input_tokens_seen": 180119160, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.22741699, + "step": 8378, + "time_per_iteration": 4.251403570175171 + }, + { + "auxiliary_loss_clip": 0.01449701, + "auxiliary_loss_mlp": 0.01036251, + "balance_loss_clip": 1.27809834, + "balance_loss_mlp": 1.01519895, + "epoch": 0.5037727341049151, + "flos": 25307659175040.0, + "grad_norm": 1.7644916259846293, + "language_loss": 0.75476587, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.77962542, + "num_input_tokens_seen": 180138730, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.21044922, + "step": 8379, + "time_per_iteration": 4.293873071670532 + }, + { + "auxiliary_loss_clip": 0.01434946, + "auxiliary_loss_mlp": 0.01043343, + "balance_loss_clip": 1.26542413, + "balance_loss_mlp": 1.02136087, + "epoch": 0.5038328573575831, + "flos": 28551739685760.0, + "grad_norm": 2.0331784077447237, + "language_loss": 0.67542553, + "learning_rate": 2.072229431544548e-06, + "loss": 0.70020843, + "num_input_tokens_seen": 180158810, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.21984863, + "step": 8380, + "time_per_iteration": 2.9331934452056885 + }, + { + "auxiliary_loss_clip": 0.01423501, + "auxiliary_loss_mlp": 0.01039671, + "balance_loss_clip": 1.25548077, + "balance_loss_mlp": 1.01760578, + "epoch": 0.503892980610251, + "flos": 31662419967360.0, + "grad_norm": 1.8404363113023556, + "language_loss": 0.64109039, + "learning_rate": 2.071840222561051e-06, + "loss": 0.66572213, + "num_input_tokens_seen": 180179700, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.22070312, + "step": 8381, + "time_per_iteration": 2.939826488494873 + }, + { + "auxiliary_loss_clip": 0.01429939, + "auxiliary_loss_mlp": 0.01043949, + "balance_loss_clip": 1.25893307, + "balance_loss_mlp": 1.02114439, + "epoch": 0.503953103862919, + "flos": 27100779586560.0, + "grad_norm": 1.6933261866109408, + "language_loss": 0.67945534, + "learning_rate": 2.071451010853365e-06, + "loss": 0.70419419, + "num_input_tokens_seen": 180199890, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.22827148, + "step": 8382, + "time_per_iteration": 2.893967628479004 + }, + { + "auxiliary_loss_clip": 0.01453832, + "auxiliary_loss_mlp": 0.01040615, + "balance_loss_clip": 1.27519011, + "balance_loss_mlp": 1.01816785, + "epoch": 0.5040132271155869, + "flos": 15641439085440.0, + "grad_norm": 1.8218705496728391, + "language_loss": 0.6279825, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.65292698, + "num_input_tokens_seen": 180217840, + "router_z_loss_clip": 1.78515625, + "router_z_loss_mlp": 0.22460938, + "step": 8383, + "time_per_iteration": 2.839463233947754 + }, + { + "auxiliary_loss_clip": 0.01422096, + "auxiliary_loss_mlp": 0.01039702, + "balance_loss_clip": 1.25420523, + "balance_loss_mlp": 1.01715982, + "epoch": 0.504073350368255, + "flos": 13598150319360.0, + "grad_norm": 3.9612910837046393, + "language_loss": 0.68009269, + "learning_rate": 2.070672579324465e-06, + "loss": 0.70471072, + "num_input_tokens_seen": 180236465, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.22546387, + "step": 8384, + "time_per_iteration": 2.8835644721984863 + }, + { + "auxiliary_loss_clip": 0.01432984, + "auxiliary_loss_mlp": 0.01037599, + "balance_loss_clip": 1.26229918, + "balance_loss_mlp": 1.01645112, + "epoch": 0.5041334736209229, + "flos": 29069721866880.0, + "grad_norm": 1.674905433489787, + "language_loss": 0.72212869, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.74683452, + "num_input_tokens_seen": 180258025, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.21154785, + "step": 8385, + "time_per_iteration": 2.9000062942504883 + }, + { + "auxiliary_loss_clip": 0.01428234, + "auxiliary_loss_mlp": 0.01032515, + "balance_loss_clip": 1.26082766, + "balance_loss_mlp": 1.0108664, + "epoch": 0.5041935968735909, + "flos": 24619148766720.0, + "grad_norm": 1.8089377201330277, + "language_loss": 0.83810043, + "learning_rate": 2.069894137075919e-06, + "loss": 0.86270797, + "num_input_tokens_seen": 180277825, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.21643066, + "step": 8386, + "time_per_iteration": 2.860795021057129 + }, + { + "auxiliary_loss_clip": 0.01431738, + "auxiliary_loss_mlp": 0.01037389, + "balance_loss_clip": 1.26082897, + "balance_loss_mlp": 1.01541853, + "epoch": 0.5042537201262588, + "flos": 26298215740800.0, + "grad_norm": 1.6444205664665725, + "language_loss": 0.67157227, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.69626355, + "num_input_tokens_seen": 180300465, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.21984863, + "step": 8387, + "time_per_iteration": 2.9011127948760986 + }, + { + "auxiliary_loss_clip": 0.0142495, + "auxiliary_loss_mlp": 0.0103753, + "balance_loss_clip": 1.2574259, + "balance_loss_mlp": 1.01694298, + "epoch": 0.5043138433789268, + "flos": 22027355562240.0, + "grad_norm": 1.4914332464492825, + "language_loss": 0.80788159, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.83250642, + "num_input_tokens_seen": 180321050, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.20605469, + "step": 8388, + "time_per_iteration": 2.917672872543335 + }, + { + "auxiliary_loss_clip": 0.01423214, + "auxiliary_loss_mlp": 0.01034922, + "balance_loss_clip": 1.25372434, + "balance_loss_mlp": 1.01403618, + "epoch": 0.5043739666315947, + "flos": 28778805930240.0, + "grad_norm": 2.375197001613976, + "language_loss": 0.71044517, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.73502654, + "num_input_tokens_seen": 180338870, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.20910645, + "step": 8389, + "time_per_iteration": 2.924994707107544 + }, + { + "auxiliary_loss_clip": 0.01452734, + "auxiliary_loss_mlp": 0.01043091, + "balance_loss_clip": 1.2789737, + "balance_loss_mlp": 1.02103686, + "epoch": 0.5044340898842627, + "flos": 27610572458880.0, + "grad_norm": 1.6621339312405221, + "language_loss": 0.70131707, + "learning_rate": 2.068337220892191e-06, + "loss": 0.72627532, + "num_input_tokens_seen": 180361285, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.22058105, + "step": 8390, + "time_per_iteration": 2.9571189880371094 + }, + { + "auxiliary_loss_clip": 0.01217353, + "auxiliary_loss_mlp": 0.01034245, + "balance_loss_clip": 1.12460828, + "balance_loss_mlp": 1.00468147, + "epoch": 0.5044942131369307, + "flos": 67483954160640.0, + "grad_norm": 0.8589424854474446, + "language_loss": 0.53022027, + "learning_rate": 2.067947985330974e-06, + "loss": 0.55273628, + "num_input_tokens_seen": 180415170, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.29492188, + "step": 8391, + "time_per_iteration": 3.1589062213897705 + }, + { + "auxiliary_loss_clip": 0.01213285, + "auxiliary_loss_mlp": 0.01038369, + "balance_loss_clip": 1.12073684, + "balance_loss_mlp": 1.00441837, + "epoch": 0.5045543363895987, + "flos": 58655073185280.0, + "grad_norm": 0.8996938084945462, + "language_loss": 0.60749388, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.63001043, + "num_input_tokens_seen": 180468060, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.33984375, + "step": 8392, + "time_per_iteration": 3.2038748264312744 + }, + { + "auxiliary_loss_clip": 0.01424458, + "auxiliary_loss_mlp": 0.01039468, + "balance_loss_clip": 1.25749922, + "balance_loss_mlp": 1.01827288, + "epoch": 0.5046144596422667, + "flos": 22536334028160.0, + "grad_norm": 1.4971834698392428, + "language_loss": 0.84951389, + "learning_rate": 2.067169506493517e-06, + "loss": 0.87415314, + "num_input_tokens_seen": 180486610, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.21191406, + "step": 8393, + "time_per_iteration": 2.880765676498413 + }, + { + "auxiliary_loss_clip": 0.01431506, + "auxiliary_loss_mlp": 0.01039509, + "balance_loss_clip": 1.26138091, + "balance_loss_mlp": 1.01803994, + "epoch": 0.5046745828949346, + "flos": 27465770540160.0, + "grad_norm": 1.864628479015273, + "language_loss": 0.51377147, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.53848159, + "num_input_tokens_seen": 180508135, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.21484375, + "step": 8394, + "time_per_iteration": 2.918851852416992 + }, + { + "auxiliary_loss_clip": 0.01445159, + "auxiliary_loss_mlp": 0.01040112, + "balance_loss_clip": 1.27287519, + "balance_loss_mlp": 1.01869035, + "epoch": 0.5047347061476026, + "flos": 17283378061440.0, + "grad_norm": 1.5682013731798858, + "language_loss": 0.76002765, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.78488034, + "num_input_tokens_seen": 180527000, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.21435547, + "step": 8395, + "time_per_iteration": 2.892678737640381 + }, + { + "auxiliary_loss_clip": 0.01446665, + "auxiliary_loss_mlp": 0.01039173, + "balance_loss_clip": 1.2756542, + "balance_loss_mlp": 1.01834667, + "epoch": 0.5047948294002705, + "flos": 16656952798080.0, + "grad_norm": 2.0710543108812027, + "language_loss": 0.69213825, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.71699667, + "num_input_tokens_seen": 180544715, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.20837402, + "step": 8396, + "time_per_iteration": 2.8517794609069824 + }, + { + "auxiliary_loss_clip": 0.0144869, + "auxiliary_loss_mlp": 0.01039393, + "balance_loss_clip": 1.27853036, + "balance_loss_mlp": 1.01834106, + "epoch": 0.5048549526529386, + "flos": 26875839847680.0, + "grad_norm": 2.4025512674138128, + "language_loss": 0.79031914, + "learning_rate": 2.065612518371792e-06, + "loss": 0.81519997, + "num_input_tokens_seen": 180565365, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.21057129, + "step": 8397, + "time_per_iteration": 2.906883955001831 + }, + { + "auxiliary_loss_clip": 0.01424657, + "auxiliary_loss_mlp": 0.01037695, + "balance_loss_clip": 1.25700569, + "balance_loss_mlp": 1.01691651, + "epoch": 0.5049150759056065, + "flos": 21843615853440.0, + "grad_norm": 1.5323956235716287, + "language_loss": 0.6670686, + "learning_rate": 2.065223265084376e-06, + "loss": 0.69169199, + "num_input_tokens_seen": 180586670, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.20776367, + "step": 8398, + "time_per_iteration": 2.876971483230591 + }, + { + "auxiliary_loss_clip": 0.01430321, + "auxiliary_loss_mlp": 0.01036852, + "balance_loss_clip": 1.25995624, + "balance_loss_mlp": 1.01364183, + "epoch": 0.5049751991582745, + "flos": 21694877637120.0, + "grad_norm": 1.640642452848556, + "language_loss": 0.72115183, + "learning_rate": 2.064834009323688e-06, + "loss": 0.7458235, + "num_input_tokens_seen": 180605085, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.23205566, + "step": 8399, + "time_per_iteration": 2.856163263320923 + }, + { + "auxiliary_loss_clip": 0.01445142, + "auxiliary_loss_mlp": 0.01044064, + "balance_loss_clip": 1.27089179, + "balance_loss_mlp": 1.0222491, + "epoch": 0.5050353224109424, + "flos": 21368869718400.0, + "grad_norm": 2.177903217927153, + "language_loss": 0.82704031, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.85193241, + "num_input_tokens_seen": 180624370, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.21789551, + "step": 8400, + "time_per_iteration": 2.836794376373291 + }, + { + "auxiliary_loss_clip": 0.01430908, + "auxiliary_loss_mlp": 0.01040565, + "balance_loss_clip": 1.26082551, + "balance_loss_mlp": 1.01842833, + "epoch": 0.5050954456636104, + "flos": 22830281366400.0, + "grad_norm": 2.0007364024903995, + "language_loss": 0.79622334, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.82093805, + "num_input_tokens_seen": 180642450, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.22119141, + "step": 8401, + "time_per_iteration": 4.242810487747192 + }, + { + "auxiliary_loss_clip": 0.01451447, + "auxiliary_loss_mlp": 0.01038507, + "balance_loss_clip": 1.27609134, + "balance_loss_mlp": 1.01676357, + "epoch": 0.5051555689162783, + "flos": 30461085285120.0, + "grad_norm": 1.9758844169241188, + "language_loss": 0.70358086, + "learning_rate": 2.063666227349593e-06, + "loss": 0.72848034, + "num_input_tokens_seen": 180665250, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.2175293, + "step": 8402, + "time_per_iteration": 2.9414234161376953 + }, + { + "auxiliary_loss_clip": 0.0142672, + "auxiliary_loss_mlp": 0.01034451, + "balance_loss_clip": 1.25642586, + "balance_loss_mlp": 1.01304078, + "epoch": 0.5052156921689464, + "flos": 21297826103040.0, + "grad_norm": 1.6316024423516056, + "language_loss": 0.7007392, + "learning_rate": 2.063276961843422e-06, + "loss": 0.72535092, + "num_input_tokens_seen": 180687425, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.2142334, + "step": 8403, + "time_per_iteration": 2.9457948207855225 + }, + { + "auxiliary_loss_clip": 0.01427959, + "auxiliary_loss_mlp": 0.0103989, + "balance_loss_clip": 1.25934362, + "balance_loss_mlp": 1.01837289, + "epoch": 0.5052758154216143, + "flos": 25091858885760.0, + "grad_norm": 1.360456053684624, + "language_loss": 0.86400396, + "learning_rate": 2.062887693937781e-06, + "loss": 0.88868248, + "num_input_tokens_seen": 180708725, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.21508789, + "step": 8404, + "time_per_iteration": 2.8990824222564697 + }, + { + "auxiliary_loss_clip": 0.01434073, + "auxiliary_loss_mlp": 0.01042366, + "balance_loss_clip": 1.26511669, + "balance_loss_mlp": 1.02132559, + "epoch": 0.5053359386742823, + "flos": 20894983234560.0, + "grad_norm": 1.5863972537086344, + "language_loss": 0.76160002, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.78636444, + "num_input_tokens_seen": 180727990, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.21032715, + "step": 8405, + "time_per_iteration": 2.8412139415740967 + }, + { + "auxiliary_loss_clip": 0.01439347, + "auxiliary_loss_mlp": 0.01034546, + "balance_loss_clip": 1.26580203, + "balance_loss_mlp": 1.01267087, + "epoch": 0.5053960619269503, + "flos": 37757556241920.0, + "grad_norm": 1.6429847174801282, + "language_loss": 0.73644161, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.76118064, + "num_input_tokens_seen": 180749765, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.21887207, + "step": 8406, + "time_per_iteration": 2.9892184734344482 + }, + { + "auxiliary_loss_clip": 0.01414694, + "auxiliary_loss_mlp": 0.01035697, + "balance_loss_clip": 1.24901152, + "balance_loss_mlp": 1.01471615, + "epoch": 0.5054561851796182, + "flos": 23524356885120.0, + "grad_norm": 1.7404957402067074, + "language_loss": 0.76938939, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.79389322, + "num_input_tokens_seen": 180769580, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.2097168, + "step": 8407, + "time_per_iteration": 2.9271011352539062 + }, + { + "auxiliary_loss_clip": 0.01434682, + "auxiliary_loss_mlp": 0.01033858, + "balance_loss_clip": 1.26178205, + "balance_loss_mlp": 1.0132463, + "epoch": 0.5055163084322862, + "flos": 30422690432640.0, + "grad_norm": 1.6296972485519716, + "language_loss": 0.64397192, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.66865736, + "num_input_tokens_seen": 180790295, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.20617676, + "step": 8408, + "time_per_iteration": 2.937140464782715 + }, + { + "auxiliary_loss_clip": 0.01434018, + "auxiliary_loss_mlp": 0.01039466, + "balance_loss_clip": 1.26410151, + "balance_loss_mlp": 1.01751912, + "epoch": 0.5055764316849541, + "flos": 20267517340800.0, + "grad_norm": 2.010447403194848, + "language_loss": 0.64273238, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.6674673, + "num_input_tokens_seen": 180807875, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.21948242, + "step": 8409, + "time_per_iteration": 2.874007225036621 + }, + { + "auxiliary_loss_clip": 0.01423959, + "auxiliary_loss_mlp": 0.01039276, + "balance_loss_clip": 1.25606585, + "balance_loss_mlp": 1.01878381, + "epoch": 0.5056365549376222, + "flos": 26082867899520.0, + "grad_norm": 1.3038123217300452, + "language_loss": 0.70965564, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.73428798, + "num_input_tokens_seen": 180831300, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.20507812, + "step": 8410, + "time_per_iteration": 2.8882715702056885 + }, + { + "auxiliary_loss_clip": 0.01436225, + "auxiliary_loss_mlp": 0.01044081, + "balance_loss_clip": 1.26555562, + "balance_loss_mlp": 1.02271914, + "epoch": 0.5056966781902901, + "flos": 19287864771840.0, + "grad_norm": 1.7536641631581105, + "language_loss": 0.79769498, + "learning_rate": 2.060162752653113e-06, + "loss": 0.82249802, + "num_input_tokens_seen": 180849055, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.21362305, + "step": 8411, + "time_per_iteration": 2.82243013381958 + }, + { + "auxiliary_loss_clip": 0.01440756, + "auxiliary_loss_mlp": 0.01044519, + "balance_loss_clip": 1.26840961, + "balance_loss_mlp": 1.02259612, + "epoch": 0.5057568014429581, + "flos": 21332918085120.0, + "grad_norm": 2.8024781377972143, + "language_loss": 0.82198322, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.84683597, + "num_input_tokens_seen": 180867395, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.21899414, + "step": 8412, + "time_per_iteration": 4.352912187576294 + }, + { + "auxiliary_loss_clip": 0.01430433, + "auxiliary_loss_mlp": 0.01042969, + "balance_loss_clip": 1.26105785, + "balance_loss_mlp": 1.0218091, + "epoch": 0.505816924695626, + "flos": 17502526465920.0, + "grad_norm": 1.8306227199991982, + "language_loss": 0.81850177, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.84323573, + "num_input_tokens_seen": 180886670, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.21154785, + "step": 8413, + "time_per_iteration": 4.262189865112305 + }, + { + "auxiliary_loss_clip": 0.01434139, + "auxiliary_loss_mlp": 0.01043944, + "balance_loss_clip": 1.26196742, + "balance_loss_mlp": 1.02116299, + "epoch": 0.505877047948294, + "flos": 21151938309120.0, + "grad_norm": 1.9612729048308963, + "language_loss": 0.81214094, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.83692169, + "num_input_tokens_seen": 180904645, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.22790527, + "step": 8414, + "time_per_iteration": 4.2377495765686035 + }, + { + "auxiliary_loss_clip": 0.01437115, + "auxiliary_loss_mlp": 0.01043772, + "balance_loss_clip": 1.26679611, + "balance_loss_mlp": 1.02182508, + "epoch": 0.5059371712009619, + "flos": 36362120791680.0, + "grad_norm": 2.0150183168087357, + "language_loss": 0.62961209, + "learning_rate": 2.058605592832528e-06, + "loss": 0.65442097, + "num_input_tokens_seen": 180922340, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.21948242, + "step": 8415, + "time_per_iteration": 2.972032070159912 + }, + { + "auxiliary_loss_clip": 0.01438642, + "auxiliary_loss_mlp": 0.0104251, + "balance_loss_clip": 1.26687741, + "balance_loss_mlp": 1.02079034, + "epoch": 0.50599729445363, + "flos": 22683352942080.0, + "grad_norm": 1.5166072449560248, + "language_loss": 0.82559305, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.85040462, + "num_input_tokens_seen": 180941350, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.21716309, + "step": 8416, + "time_per_iteration": 2.865138053894043 + }, + { + "auxiliary_loss_clip": 0.01430945, + "auxiliary_loss_mlp": 0.01042171, + "balance_loss_clip": 1.2641269, + "balance_loss_mlp": 1.02055824, + "epoch": 0.5060574177062979, + "flos": 22758242365440.0, + "grad_norm": 1.732460657027843, + "language_loss": 0.79905635, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.82378751, + "num_input_tokens_seen": 180960720, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.21606445, + "step": 8417, + "time_per_iteration": 2.905461072921753 + }, + { + "auxiliary_loss_clip": 0.01415068, + "auxiliary_loss_mlp": 0.01044333, + "balance_loss_clip": 1.2491802, + "balance_loss_mlp": 1.02263641, + "epoch": 0.5061175409589659, + "flos": 21663269504640.0, + "grad_norm": 1.8617179321424882, + "language_loss": 0.63071805, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.65531206, + "num_input_tokens_seen": 180979725, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.21691895, + "step": 8418, + "time_per_iteration": 2.9169647693634033 + }, + { + "auxiliary_loss_clip": 0.01445495, + "auxiliary_loss_mlp": 0.01043975, + "balance_loss_clip": 1.27124977, + "balance_loss_mlp": 1.02216029, + "epoch": 0.5061776642116339, + "flos": 21626186751360.0, + "grad_norm": 1.8345767274019003, + "language_loss": 0.78280735, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.80770206, + "num_input_tokens_seen": 180998980, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.21801758, + "step": 8419, + "time_per_iteration": 2.885929584503174 + }, + { + "auxiliary_loss_clip": 0.0145107, + "auxiliary_loss_mlp": 0.01044201, + "balance_loss_clip": 1.27652955, + "balance_loss_mlp": 1.0221715, + "epoch": 0.5062377874643018, + "flos": 24437128360320.0, + "grad_norm": 1.9432323093172976, + "language_loss": 0.78022265, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.80517542, + "num_input_tokens_seen": 181019165, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.22033691, + "step": 8420, + "time_per_iteration": 2.917098045349121 + }, + { + "auxiliary_loss_clip": 0.01449041, + "auxiliary_loss_mlp": 0.01039597, + "balance_loss_clip": 1.27623057, + "balance_loss_mlp": 1.01874733, + "epoch": 0.5062979107169698, + "flos": 22533574095360.0, + "grad_norm": 1.9445207407282616, + "language_loss": 0.78593767, + "learning_rate": 2.056269786726999e-06, + "loss": 0.81082404, + "num_input_tokens_seen": 181037110, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.20861816, + "step": 8421, + "time_per_iteration": 2.8774590492248535 + }, + { + "auxiliary_loss_clip": 0.01437301, + "auxiliary_loss_mlp": 0.0103496, + "balance_loss_clip": 1.266541, + "balance_loss_mlp": 1.01505232, + "epoch": 0.5063580339696377, + "flos": 24582789930240.0, + "grad_norm": 1.5494780858198773, + "language_loss": 0.67388105, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.69860363, + "num_input_tokens_seen": 181057775, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.19897461, + "step": 8422, + "time_per_iteration": 2.868140935897827 + }, + { + "auxiliary_loss_clip": 0.01437583, + "auxiliary_loss_mlp": 0.01034871, + "balance_loss_clip": 1.26879644, + "balance_loss_mlp": 1.01416397, + "epoch": 0.5064181572223058, + "flos": 22604979669120.0, + "grad_norm": 1.6061202140833888, + "language_loss": 0.8217721, + "learning_rate": 2.05549116746431e-06, + "loss": 0.84649658, + "num_input_tokens_seen": 181078260, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.20690918, + "step": 8423, + "time_per_iteration": 2.9397659301757812 + }, + { + "auxiliary_loss_clip": 0.01436, + "auxiliary_loss_mlp": 0.01039413, + "balance_loss_clip": 1.26518965, + "balance_loss_mlp": 1.01703691, + "epoch": 0.5064782804749737, + "flos": 26006123439360.0, + "grad_norm": 1.8868588364196885, + "language_loss": 0.75892246, + "learning_rate": 2.055101854669237e-06, + "loss": 0.78367656, + "num_input_tokens_seen": 181098755, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.22387695, + "step": 8424, + "time_per_iteration": 2.8965418338775635 + }, + { + "auxiliary_loss_clip": 0.01429659, + "auxiliary_loss_mlp": 0.0103458, + "balance_loss_clip": 1.26327729, + "balance_loss_mlp": 1.01455271, + "epoch": 0.5065384037276417, + "flos": 28565946552960.0, + "grad_norm": 2.063314636927337, + "language_loss": 0.71662986, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.74127233, + "num_input_tokens_seen": 181121570, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.20031738, + "step": 8425, + "time_per_iteration": 2.8993122577667236 + }, + { + "auxiliary_loss_clip": 0.01436385, + "auxiliary_loss_mlp": 0.01037269, + "balance_loss_clip": 1.26627862, + "balance_loss_mlp": 1.01718211, + "epoch": 0.5065985269803096, + "flos": 22976259649920.0, + "grad_norm": 1.8888708848470737, + "language_loss": 0.79546428, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.8202008, + "num_input_tokens_seen": 181140240, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.20092773, + "step": 8426, + "time_per_iteration": 2.885683298110962 + }, + { + "auxiliary_loss_clip": 0.01439051, + "auxiliary_loss_mlp": 0.01035264, + "balance_loss_clip": 1.26913214, + "balance_loss_mlp": 1.01448607, + "epoch": 0.5066586502329776, + "flos": 21616504364160.0, + "grad_norm": 2.1367352476721098, + "language_loss": 0.78653181, + "learning_rate": 2.053933903806265e-06, + "loss": 0.81127489, + "num_input_tokens_seen": 181158630, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.2076416, + "step": 8427, + "time_per_iteration": 2.8570566177368164 + }, + { + "auxiliary_loss_clip": 0.01427024, + "auxiliary_loss_mlp": 0.01033046, + "balance_loss_clip": 1.25858641, + "balance_loss_mlp": 1.0127331, + "epoch": 0.5067187734856455, + "flos": 20349691176960.0, + "grad_norm": 1.9628054624046263, + "language_loss": 0.72443831, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.74903893, + "num_input_tokens_seen": 181176405, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.20300293, + "step": 8428, + "time_per_iteration": 2.866793394088745 + }, + { + "auxiliary_loss_clip": 0.01429714, + "auxiliary_loss_mlp": 0.0103936, + "balance_loss_clip": 1.26129866, + "balance_loss_mlp": 1.01969039, + "epoch": 0.5067788967383136, + "flos": 28853061926400.0, + "grad_norm": 1.8357902240640165, + "language_loss": 0.83934456, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.86403525, + "num_input_tokens_seen": 181197595, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.19665527, + "step": 8429, + "time_per_iteration": 2.8790953159332275 + }, + { + "auxiliary_loss_clip": 0.01460446, + "auxiliary_loss_mlp": 0.01040278, + "balance_loss_clip": 1.28460729, + "balance_loss_mlp": 1.01913011, + "epoch": 0.5068390199909815, + "flos": 32462314369920.0, + "grad_norm": 1.7208891345236699, + "language_loss": 0.73860759, + "learning_rate": 2.052765934536682e-06, + "loss": 0.76361477, + "num_input_tokens_seen": 181218560, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.21154785, + "step": 8430, + "time_per_iteration": 2.929250955581665 + }, + { + "auxiliary_loss_clip": 0.01432024, + "auxiliary_loss_mlp": 0.01036914, + "balance_loss_clip": 1.26291287, + "balance_loss_mlp": 1.016482, + "epoch": 0.5068991432436495, + "flos": 23156379774720.0, + "grad_norm": 2.0686053569944005, + "language_loss": 0.77607858, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.80076796, + "num_input_tokens_seen": 181237095, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.20446777, + "step": 8431, + "time_per_iteration": 2.8330466747283936 + }, + { + "auxiliary_loss_clip": 0.01426981, + "auxiliary_loss_mlp": 0.01038479, + "balance_loss_clip": 1.2587738, + "balance_loss_mlp": 1.01780772, + "epoch": 0.5069592664963174, + "flos": 19945129006080.0, + "grad_norm": 1.4482861068722297, + "language_loss": 0.73027706, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.75493163, + "num_input_tokens_seen": 181255940, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.20666504, + "step": 8432, + "time_per_iteration": 2.852435827255249 + }, + { + "auxiliary_loss_clip": 0.01221189, + "auxiliary_loss_mlp": 0.01040828, + "balance_loss_clip": 1.1277914, + "balance_loss_mlp": 1.01393437, + "epoch": 0.5070193897489854, + "flos": 65822894616960.0, + "grad_norm": 0.7653311351175733, + "language_loss": 0.63771522, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.66033536, + "num_input_tokens_seen": 181316945, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.26953125, + "step": 8433, + "time_per_iteration": 3.3860888481140137 + }, + { + "auxiliary_loss_clip": 0.01438387, + "auxiliary_loss_mlp": 0.01038699, + "balance_loss_clip": 1.26827097, + "balance_loss_mlp": 1.01811135, + "epoch": 0.5070795130016534, + "flos": 17284373447040.0, + "grad_norm": 1.7378845091205517, + "language_loss": 0.77607685, + "learning_rate": 2.051208614233681e-06, + "loss": 0.80084777, + "num_input_tokens_seen": 181335555, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.20593262, + "step": 8434, + "time_per_iteration": 2.8372671604156494 + }, + { + "auxiliary_loss_clip": 0.01451932, + "auxiliary_loss_mlp": 0.01043663, + "balance_loss_clip": 1.27796197, + "balance_loss_mlp": 1.0227294, + "epoch": 0.5071396362543213, + "flos": 21079989797760.0, + "grad_norm": 1.9051270118005597, + "language_loss": 0.71920085, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.74415678, + "num_input_tokens_seen": 181354580, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.20935059, + "step": 8435, + "time_per_iteration": 2.8685712814331055 + }, + { + "auxiliary_loss_clip": 0.01439336, + "auxiliary_loss_mlp": 0.01038588, + "balance_loss_clip": 1.26728106, + "balance_loss_mlp": 1.01666522, + "epoch": 0.5071997595069894, + "flos": 23154162779520.0, + "grad_norm": 2.29015857664624, + "language_loss": 0.72786754, + "learning_rate": 2.050429942372112e-06, + "loss": 0.75264674, + "num_input_tokens_seen": 181374320, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.21923828, + "step": 8436, + "time_per_iteration": 4.290988445281982 + }, + { + "auxiliary_loss_clip": 0.01439128, + "auxiliary_loss_mlp": 0.01034381, + "balance_loss_clip": 1.26943064, + "balance_loss_mlp": 1.01244605, + "epoch": 0.5072598827596573, + "flos": 22757382714240.0, + "grad_norm": 1.583456893096289, + "language_loss": 0.84407461, + "learning_rate": 2.050040603565483e-06, + "loss": 0.86880958, + "num_input_tokens_seen": 181392190, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.21948242, + "step": 8437, + "time_per_iteration": 2.816401481628418 + }, + { + "auxiliary_loss_clip": 0.01427753, + "auxiliary_loss_mlp": 0.01037543, + "balance_loss_clip": 1.25975585, + "balance_loss_mlp": 1.01726496, + "epoch": 0.5073200060123253, + "flos": 22576855386240.0, + "grad_norm": 1.4303919173805857, + "language_loss": 0.8158921, + "learning_rate": 2.049651262861309e-06, + "loss": 0.840545, + "num_input_tokens_seen": 181413890, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.20263672, + "step": 8438, + "time_per_iteration": 2.8777084350585938 + }, + { + "auxiliary_loss_clip": 0.01436973, + "auxiliary_loss_mlp": 0.01035054, + "balance_loss_clip": 1.26480055, + "balance_loss_mlp": 1.0133934, + "epoch": 0.5073801292649932, + "flos": 25815868479360.0, + "grad_norm": 1.7607600728688417, + "language_loss": 0.80178136, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.82650161, + "num_input_tokens_seen": 181433240, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.2166748, + "step": 8439, + "time_per_iteration": 2.858907461166382 + }, + { + "auxiliary_loss_clip": 0.01418149, + "auxiliary_loss_mlp": 0.01033104, + "balance_loss_clip": 1.25240088, + "balance_loss_mlp": 1.01291025, + "epoch": 0.5074402525176612, + "flos": 25384765593600.0, + "grad_norm": 1.5998205616015522, + "language_loss": 0.72110605, + "learning_rate": 2.048872575819383e-06, + "loss": 0.74561858, + "num_input_tokens_seen": 181453535, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.20202637, + "step": 8440, + "time_per_iteration": 2.9026904106140137 + }, + { + "auxiliary_loss_clip": 0.01430866, + "auxiliary_loss_mlp": 0.01037097, + "balance_loss_clip": 1.26155519, + "balance_loss_mlp": 1.01594889, + "epoch": 0.5075003757703291, + "flos": 26074723835520.0, + "grad_norm": 1.7041340309360287, + "language_loss": 0.71429974, + "learning_rate": 2.048483229511158e-06, + "loss": 0.73897934, + "num_input_tokens_seen": 181474195, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.21154785, + "step": 8441, + "time_per_iteration": 2.887484550476074 + }, + { + "auxiliary_loss_clip": 0.01442662, + "auxiliary_loss_mlp": 0.01041246, + "balance_loss_clip": 1.26866114, + "balance_loss_mlp": 1.01977658, + "epoch": 0.5075604990229972, + "flos": 21845516135040.0, + "grad_norm": 1.7267076387361733, + "language_loss": 0.64661813, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.67145723, + "num_input_tokens_seen": 181494000, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.21484375, + "step": 8442, + "time_per_iteration": 2.8651015758514404 + }, + { + "auxiliary_loss_clip": 0.0141437, + "auxiliary_loss_mlp": 0.01030714, + "balance_loss_clip": 1.24971962, + "balance_loss_mlp": 1.0102694, + "epoch": 0.5076206222756651, + "flos": 31991459287680.0, + "grad_norm": 1.6497126657473655, + "language_loss": 0.71864057, + "learning_rate": 2.047704531394006e-06, + "loss": 0.7430914, + "num_input_tokens_seen": 181515955, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.2043457, + "step": 8443, + "time_per_iteration": 2.965240478515625 + }, + { + "auxiliary_loss_clip": 0.01448525, + "auxiliary_loss_mlp": 0.01039972, + "balance_loss_clip": 1.27626359, + "balance_loss_mlp": 1.01858568, + "epoch": 0.5076807455283331, + "flos": 36918000132480.0, + "grad_norm": 1.3541423082557058, + "language_loss": 0.62481171, + "learning_rate": 2.047315179614607e-06, + "loss": 0.64969659, + "num_input_tokens_seen": 181540225, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.21374512, + "step": 8444, + "time_per_iteration": 3.0283420085906982 + }, + { + "auxiliary_loss_clip": 0.01431626, + "auxiliary_loss_mlp": 0.01033994, + "balance_loss_clip": 1.26292276, + "balance_loss_mlp": 1.01278698, + "epoch": 0.507740868781001, + "flos": 29874593197440.0, + "grad_norm": 2.6411471704673612, + "language_loss": 0.64609236, + "learning_rate": 2.046925826041012e-06, + "loss": 0.67074859, + "num_input_tokens_seen": 181560125, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.2121582, + "step": 8445, + "time_per_iteration": 2.9207613468170166 + }, + { + "auxiliary_loss_clip": 0.01216104, + "auxiliary_loss_mlp": 0.01029862, + "balance_loss_clip": 1.12189209, + "balance_loss_mlp": 1.00354099, + "epoch": 0.507800992033669, + "flos": 61945194919680.0, + "grad_norm": 0.8654408244143234, + "language_loss": 0.62085617, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.64331579, + "num_input_tokens_seen": 181618830, + "router_z_loss_clip": 0.94140625, + "router_z_loss_mlp": 0.26367188, + "step": 8446, + "time_per_iteration": 3.4733080863952637 + }, + { + "auxiliary_loss_clip": 0.014274, + "auxiliary_loss_mlp": 0.01035857, + "balance_loss_clip": 1.25941229, + "balance_loss_mlp": 1.01484013, + "epoch": 0.507861115286337, + "flos": 20709252754560.0, + "grad_norm": 1.5720390053304352, + "language_loss": 0.81486833, + "learning_rate": 2.04614711357029e-06, + "loss": 0.83950096, + "num_input_tokens_seen": 181637120, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.21008301, + "step": 8447, + "time_per_iteration": 4.339062452316284 + }, + { + "auxiliary_loss_clip": 0.01418025, + "auxiliary_loss_mlp": 0.01033132, + "balance_loss_clip": 1.25277555, + "balance_loss_mlp": 1.01272357, + "epoch": 0.507921238539005, + "flos": 30859358428800.0, + "grad_norm": 1.522616806236274, + "language_loss": 0.71560478, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.74011636, + "num_input_tokens_seen": 181659965, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.20410156, + "step": 8448, + "time_per_iteration": 4.964825391769409 + }, + { + "auxiliary_loss_clip": 0.01425137, + "auxiliary_loss_mlp": 0.0103356, + "balance_loss_clip": 1.25868201, + "balance_loss_mlp": 1.0130204, + "epoch": 0.507981361791673, + "flos": 35713905517440.0, + "grad_norm": 1.4941117690384542, + "language_loss": 0.72284615, + "learning_rate": 2.045368394099955e-06, + "loss": 0.74743307, + "num_input_tokens_seen": 181685290, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.20532227, + "step": 8449, + "time_per_iteration": 4.393014669418335 + }, + { + "auxiliary_loss_clip": 0.01420406, + "auxiliary_loss_mlp": 0.01037747, + "balance_loss_clip": 1.25414467, + "balance_loss_mlp": 1.01688552, + "epoch": 0.5080414850443409, + "flos": 27172682853120.0, + "grad_norm": 1.7344679430012517, + "language_loss": 0.73895395, + "learning_rate": 2.044979031776844e-06, + "loss": 0.7635355, + "num_input_tokens_seen": 181706080, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.20874023, + "step": 8450, + "time_per_iteration": 2.9104833602905273 + }, + { + "auxiliary_loss_clip": 0.0142532, + "auxiliary_loss_mlp": 0.01035676, + "balance_loss_clip": 1.25542557, + "balance_loss_mlp": 1.0142777, + "epoch": 0.5081016082970089, + "flos": 27095531189760.0, + "grad_norm": 1.682311419242521, + "language_loss": 0.77404702, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.79865694, + "num_input_tokens_seen": 181724805, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.21386719, + "step": 8451, + "time_per_iteration": 2.882288932800293 + }, + { + "auxiliary_loss_clip": 0.01429199, + "auxiliary_loss_mlp": 0.01035735, + "balance_loss_clip": 1.25828826, + "balance_loss_mlp": 1.01462281, + "epoch": 0.5081617315496768, + "flos": 22866730692480.0, + "grad_norm": 1.9421302278091368, + "language_loss": 0.86212534, + "learning_rate": 2.044200302028559e-06, + "loss": 0.88677466, + "num_input_tokens_seen": 181743725, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.21118164, + "step": 8452, + "time_per_iteration": 2.833099126815796 + }, + { + "auxiliary_loss_clip": 0.01432897, + "auxiliary_loss_mlp": 0.01036214, + "balance_loss_clip": 1.2606442, + "balance_loss_mlp": 1.01395726, + "epoch": 0.5082218548023448, + "flos": 16288523239680.0, + "grad_norm": 2.7281836908078336, + "language_loss": 0.79008687, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.81477797, + "num_input_tokens_seen": 181757720, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.22253418, + "step": 8453, + "time_per_iteration": 2.839404821395874 + }, + { + "auxiliary_loss_clip": 0.01418628, + "auxiliary_loss_mlp": 0.01035125, + "balance_loss_clip": 1.25290036, + "balance_loss_mlp": 1.0133692, + "epoch": 0.5082819780550127, + "flos": 24471089222400.0, + "grad_norm": 2.17531218197675, + "language_loss": 0.77645528, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.80099279, + "num_input_tokens_seen": 181778545, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.2175293, + "step": 8454, + "time_per_iteration": 2.8677916526794434 + }, + { + "auxiliary_loss_clip": 0.01429336, + "auxiliary_loss_mlp": 0.01036183, + "balance_loss_clip": 1.25981522, + "balance_loss_mlp": 1.0144515, + "epoch": 0.5083421013076808, + "flos": 23413470583680.0, + "grad_norm": 2.4041914716097925, + "language_loss": 0.89991367, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.92456883, + "num_input_tokens_seen": 181799495, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.21740723, + "step": 8455, + "time_per_iteration": 2.873537302017212 + }, + { + "auxiliary_loss_clip": 0.01435059, + "auxiliary_loss_mlp": 0.0103809, + "balance_loss_clip": 1.26044369, + "balance_loss_mlp": 1.01539254, + "epoch": 0.5084022245603487, + "flos": 23881746712320.0, + "grad_norm": 1.7866825756749463, + "language_loss": 0.63025296, + "learning_rate": 2.042642822537149e-06, + "loss": 0.65498441, + "num_input_tokens_seen": 181818400, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.22692871, + "step": 8456, + "time_per_iteration": 2.8587663173675537 + }, + { + "auxiliary_loss_clip": 0.01217567, + "auxiliary_loss_mlp": 0.01028975, + "balance_loss_clip": 1.1222471, + "balance_loss_mlp": 1.00522864, + "epoch": 0.5084623478130167, + "flos": 62901292930560.0, + "grad_norm": 0.8306785563096524, + "language_loss": 0.62488312, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.64734852, + "num_input_tokens_seen": 181875975, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.23730469, + "step": 8457, + "time_per_iteration": 3.244344711303711 + }, + { + "auxiliary_loss_clip": 0.0144601, + "auxiliary_loss_mlp": 0.01035408, + "balance_loss_clip": 1.27390051, + "balance_loss_mlp": 1.01489186, + "epoch": 0.5085224710656846, + "flos": 22356304392960.0, + "grad_norm": 2.3328748839939553, + "language_loss": 0.68403846, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.70885265, + "num_input_tokens_seen": 181896450, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.2052002, + "step": 8458, + "time_per_iteration": 2.9490063190460205 + }, + { + "auxiliary_loss_clip": 0.01432609, + "auxiliary_loss_mlp": 0.01036698, + "balance_loss_clip": 1.26065946, + "balance_loss_mlp": 1.01539552, + "epoch": 0.5085825943183526, + "flos": 26077031320320.0, + "grad_norm": 1.8405414946572345, + "language_loss": 0.78292191, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.80761492, + "num_input_tokens_seen": 181916770, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.2130127, + "step": 8459, + "time_per_iteration": 2.911602735519409 + }, + { + "auxiliary_loss_clip": 0.01446789, + "auxiliary_loss_mlp": 0.01039546, + "balance_loss_clip": 1.27260363, + "balance_loss_mlp": 1.01748037, + "epoch": 0.5086427175710206, + "flos": 17429673058560.0, + "grad_norm": 2.7233004018424007, + "language_loss": 0.81393164, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.83879501, + "num_input_tokens_seen": 181932710, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.22058105, + "step": 8460, + "time_per_iteration": 2.85504150390625 + }, + { + "auxiliary_loss_clip": 0.01443177, + "auxiliary_loss_mlp": 0.01034761, + "balance_loss_clip": 1.27052593, + "balance_loss_mlp": 1.01363707, + "epoch": 0.5087028408236886, + "flos": 20641602499200.0, + "grad_norm": 1.8157133034936361, + "language_loss": 0.69996375, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.72474313, + "num_input_tokens_seen": 181950665, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.21118164, + "step": 8461, + "time_per_iteration": 2.847578287124634 + }, + { + "auxiliary_loss_clip": 0.01415334, + "auxiliary_loss_mlp": 0.01037439, + "balance_loss_clip": 1.24991202, + "balance_loss_mlp": 1.01488471, + "epoch": 0.5087629640763566, + "flos": 25604954628480.0, + "grad_norm": 1.7134584303536429, + "language_loss": 0.7685191, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.79304683, + "num_input_tokens_seen": 181971270, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.22558594, + "step": 8462, + "time_per_iteration": 2.894981861114502 + }, + { + "auxiliary_loss_clip": 0.01427752, + "auxiliary_loss_mlp": 0.01033565, + "balance_loss_clip": 1.25948405, + "balance_loss_mlp": 1.01225042, + "epoch": 0.5088230873290245, + "flos": 13269563447040.0, + "grad_norm": 2.338163313496933, + "language_loss": 0.83358115, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.85819429, + "num_input_tokens_seen": 181988410, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.21313477, + "step": 8463, + "time_per_iteration": 2.8044345378875732 + }, + { + "auxiliary_loss_clip": 0.01433299, + "auxiliary_loss_mlp": 0.01034203, + "balance_loss_clip": 1.26376116, + "balance_loss_mlp": 1.01255465, + "epoch": 0.5088832105816925, + "flos": 20051988520320.0, + "grad_norm": 1.7699699636978816, + "language_loss": 0.76720876, + "learning_rate": 2.039527786882341e-06, + "loss": 0.79188377, + "num_input_tokens_seen": 182006530, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.21655273, + "step": 8464, + "time_per_iteration": 2.8320083618164062 + }, + { + "auxiliary_loss_clip": 0.01211248, + "auxiliary_loss_mlp": 0.01019708, + "balance_loss_clip": 1.11585331, + "balance_loss_mlp": 0.99834615, + "epoch": 0.5089433338343604, + "flos": 67457702897280.0, + "grad_norm": 0.6823945633790366, + "language_loss": 0.59381807, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.61612767, + "num_input_tokens_seen": 182074240, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.21386719, + "step": 8465, + "time_per_iteration": 3.509812593460083 + }, + { + "auxiliary_loss_clip": 0.01427481, + "auxiliary_loss_mlp": 0.0103926, + "balance_loss_clip": 1.25800645, + "balance_loss_mlp": 1.01762342, + "epoch": 0.5090034570870284, + "flos": 22720435695360.0, + "grad_norm": 2.0060204939731205, + "language_loss": 0.80920583, + "learning_rate": 2.038749012684354e-06, + "loss": 0.83387327, + "num_input_tokens_seen": 182093360, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.21630859, + "step": 8466, + "time_per_iteration": 2.8685302734375 + }, + { + "auxiliary_loss_clip": 0.0141788, + "auxiliary_loss_mlp": 0.01034238, + "balance_loss_clip": 1.25118411, + "balance_loss_mlp": 1.01312602, + "epoch": 0.5090635803396963, + "flos": 20454695654400.0, + "grad_norm": 1.648144738558884, + "language_loss": 0.79286939, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.81739056, + "num_input_tokens_seen": 182110170, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.21105957, + "step": 8467, + "time_per_iteration": 2.8418898582458496 + }, + { + "auxiliary_loss_clip": 0.01412471, + "auxiliary_loss_mlp": 0.01037236, + "balance_loss_clip": 1.24932921, + "balance_loss_mlp": 1.01607597, + "epoch": 0.5091237035923644, + "flos": 23779366433280.0, + "grad_norm": 2.5223961911644124, + "language_loss": 0.75128925, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.77578634, + "num_input_tokens_seen": 182129570, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.21142578, + "step": 8468, + "time_per_iteration": 2.878491163253784 + }, + { + "auxiliary_loss_clip": 0.01426646, + "auxiliary_loss_mlp": 0.01036477, + "balance_loss_clip": 1.25845945, + "balance_loss_mlp": 1.01653337, + "epoch": 0.5091838268450323, + "flos": 18335793548160.0, + "grad_norm": 1.8354463492754354, + "language_loss": 0.79107428, + "learning_rate": 2.03758084040404e-06, + "loss": 0.81570554, + "num_input_tokens_seen": 182147565, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.19946289, + "step": 8469, + "time_per_iteration": 2.8228046894073486 + }, + { + "auxiliary_loss_clip": 0.01440354, + "auxiliary_loss_mlp": 0.01042864, + "balance_loss_clip": 1.27181542, + "balance_loss_mlp": 1.02102494, + "epoch": 0.5092439500977003, + "flos": 29069043194880.0, + "grad_norm": 1.4134218104524228, + "language_loss": 0.70336318, + "learning_rate": 2.037191446774109e-06, + "loss": 0.72819531, + "num_input_tokens_seen": 182169695, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.21838379, + "step": 8470, + "time_per_iteration": 2.8960089683532715 + }, + { + "auxiliary_loss_clip": 0.0143329, + "auxiliary_loss_mlp": 0.01043026, + "balance_loss_clip": 1.26209855, + "balance_loss_mlp": 1.02068615, + "epoch": 0.5093040733503682, + "flos": 13562017706880.0, + "grad_norm": 1.9841287556265208, + "language_loss": 0.7426551, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.76741827, + "num_input_tokens_seen": 182186385, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.2232666, + "step": 8471, + "time_per_iteration": 4.278416156768799 + }, + { + "auxiliary_loss_clip": 0.01215659, + "auxiliary_loss_mlp": 0.01021386, + "balance_loss_clip": 1.11803198, + "balance_loss_mlp": 0.99983293, + "epoch": 0.5093641966030362, + "flos": 68939547212160.0, + "grad_norm": 0.7532947150471525, + "language_loss": 0.58198535, + "learning_rate": 2.036412655298103e-06, + "loss": 0.60435581, + "num_input_tokens_seen": 182247095, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.21582031, + "step": 8472, + "time_per_iteration": 3.354746103286743 + }, + { + "auxiliary_loss_clip": 0.0143113, + "auxiliary_loss_mlp": 0.01040663, + "balance_loss_clip": 1.260566, + "balance_loss_mlp": 1.01993227, + "epoch": 0.5094243198557042, + "flos": 21591185258880.0, + "grad_norm": 1.739409784669117, + "language_loss": 0.69643807, + "learning_rate": 2.03602325748156e-06, + "loss": 0.721156, + "num_input_tokens_seen": 182266380, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.20727539, + "step": 8473, + "time_per_iteration": 2.8897671699523926 + }, + { + "auxiliary_loss_clip": 0.01425257, + "auxiliary_loss_mlp": 0.01036419, + "balance_loss_clip": 1.25702477, + "balance_loss_mlp": 1.01597404, + "epoch": 0.5094844431083722, + "flos": 28852609478400.0, + "grad_norm": 2.0330218032089653, + "language_loss": 0.8581996, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.88281631, + "num_input_tokens_seen": 182284685, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.20446777, + "step": 8474, + "time_per_iteration": 2.881608486175537 + }, + { + "auxiliary_loss_clip": 0.01441517, + "auxiliary_loss_mlp": 0.01037902, + "balance_loss_clip": 1.26939368, + "balance_loss_mlp": 1.01608646, + "epoch": 0.5095445663610402, + "flos": 14984401075200.0, + "grad_norm": 2.010098043535404, + "language_loss": 0.65807259, + "learning_rate": 2.035244457765222e-06, + "loss": 0.68286681, + "num_input_tokens_seen": 182301810, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.21813965, + "step": 8475, + "time_per_iteration": 2.827157735824585 + }, + { + "auxiliary_loss_clip": 0.01460767, + "auxiliary_loss_mlp": 0.01043692, + "balance_loss_clip": 1.28499329, + "balance_loss_mlp": 1.02044606, + "epoch": 0.5096046896137081, + "flos": 20786811621120.0, + "grad_norm": 6.817158506031396, + "language_loss": 0.82803774, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.8530823, + "num_input_tokens_seen": 182320285, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.23278809, + "step": 8476, + "time_per_iteration": 2.861250400543213 + }, + { + "auxiliary_loss_clip": 0.01447887, + "auxiliary_loss_mlp": 0.01042663, + "balance_loss_clip": 1.27405226, + "balance_loss_mlp": 1.01919031, + "epoch": 0.5096648128663761, + "flos": 23195408054400.0, + "grad_norm": 2.2836268480174553, + "language_loss": 0.81102234, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.83592784, + "num_input_tokens_seen": 182339465, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.23461914, + "step": 8477, + "time_per_iteration": 2.887803316116333 + }, + { + "auxiliary_loss_clip": 0.01442376, + "auxiliary_loss_mlp": 0.01036556, + "balance_loss_clip": 1.27089906, + "balance_loss_mlp": 1.01403737, + "epoch": 0.509724936119044, + "flos": 22319538353280.0, + "grad_norm": 1.8553245167643873, + "language_loss": 0.62620336, + "learning_rate": 2.034076248204082e-06, + "loss": 0.65099269, + "num_input_tokens_seen": 182358375, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.22521973, + "step": 8478, + "time_per_iteration": 3.0758187770843506 + }, + { + "auxiliary_loss_clip": 0.01423336, + "auxiliary_loss_mlp": 0.01038358, + "balance_loss_clip": 1.25495028, + "balance_loss_mlp": 1.01662636, + "epoch": 0.509785059371712, + "flos": 26298396720000.0, + "grad_norm": 2.979207101871582, + "language_loss": 0.67171574, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.69633263, + "num_input_tokens_seen": 182377935, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.21728516, + "step": 8479, + "time_per_iteration": 2.8875436782836914 + }, + { + "auxiliary_loss_clip": 0.01425821, + "auxiliary_loss_mlp": 0.01033283, + "balance_loss_clip": 1.25873423, + "balance_loss_mlp": 1.012815, + "epoch": 0.50984518262438, + "flos": 22974449857920.0, + "grad_norm": 1.820786164358667, + "language_loss": 0.70222384, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.72681487, + "num_input_tokens_seen": 182396440, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.20471191, + "step": 8480, + "time_per_iteration": 2.8651437759399414 + }, + { + "auxiliary_loss_clip": 0.01433517, + "auxiliary_loss_mlp": 0.01034009, + "balance_loss_clip": 1.26117277, + "balance_loss_mlp": 1.01207411, + "epoch": 0.509905305877048, + "flos": 26224502682240.0, + "grad_norm": 1.9760478129768126, + "language_loss": 0.79987741, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.82455271, + "num_input_tokens_seen": 182415890, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.21948242, + "step": 8481, + "time_per_iteration": 2.8679933547973633 + }, + { + "auxiliary_loss_clip": 0.01418363, + "auxiliary_loss_mlp": 0.01040737, + "balance_loss_clip": 1.25112557, + "balance_loss_mlp": 1.01907635, + "epoch": 0.5099654291297159, + "flos": 20349872156160.0, + "grad_norm": 1.5151467579629598, + "language_loss": 0.8407771, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.86536813, + "num_input_tokens_seen": 182434235, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.2166748, + "step": 8482, + "time_per_iteration": 4.229772567749023 + }, + { + "auxiliary_loss_clip": 0.01444211, + "auxiliary_loss_mlp": 0.0103713, + "balance_loss_clip": 1.2695421, + "balance_loss_mlp": 1.015517, + "epoch": 0.5100255523823839, + "flos": 29065514100480.0, + "grad_norm": 1.9258168624672556, + "language_loss": 0.86123598, + "learning_rate": 2.032129206622238e-06, + "loss": 0.88604939, + "num_input_tokens_seen": 182454360, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.21618652, + "step": 8483, + "time_per_iteration": 4.432432174682617 + }, + { + "auxiliary_loss_clip": 0.01428916, + "auxiliary_loss_mlp": 0.0103423, + "balance_loss_clip": 1.25836635, + "balance_loss_mlp": 1.01372647, + "epoch": 0.5100856756350518, + "flos": 22466195308800.0, + "grad_norm": 2.636592216805329, + "language_loss": 0.83782536, + "learning_rate": 2.031739794591775e-06, + "loss": 0.8624568, + "num_input_tokens_seen": 182471940, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.20507812, + "step": 8484, + "time_per_iteration": 4.2665276527404785 + }, + { + "auxiliary_loss_clip": 0.01423155, + "auxiliary_loss_mlp": 0.01032931, + "balance_loss_clip": 1.25322938, + "balance_loss_mlp": 1.01166415, + "epoch": 0.5101457988877198, + "flos": 19180190851200.0, + "grad_norm": 1.9135268971999295, + "language_loss": 0.82423472, + "learning_rate": 2.031350381357736e-06, + "loss": 0.84879559, + "num_input_tokens_seen": 182490685, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.21264648, + "step": 8485, + "time_per_iteration": 2.8266758918762207 + }, + { + "auxiliary_loss_clip": 0.01419282, + "auxiliary_loss_mlp": 0.01039341, + "balance_loss_clip": 1.25323403, + "balance_loss_mlp": 1.01675034, + "epoch": 0.5102059221403878, + "flos": 14874555404160.0, + "grad_norm": 1.8139421965510567, + "language_loss": 0.74664581, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.77123201, + "num_input_tokens_seen": 182508325, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.22583008, + "step": 8486, + "time_per_iteration": 2.828829765319824 + }, + { + "auxiliary_loss_clip": 0.0144207, + "auxiliary_loss_mlp": 0.01040929, + "balance_loss_clip": 1.26833689, + "balance_loss_mlp": 1.01868415, + "epoch": 0.5102660453930558, + "flos": 22970061112320.0, + "grad_norm": 1.5455227727815348, + "language_loss": 0.7068947, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.73172468, + "num_input_tokens_seen": 182527020, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.22253418, + "step": 8487, + "time_per_iteration": 3.0205676555633545 + }, + { + "auxiliary_loss_clip": 0.01421009, + "auxiliary_loss_mlp": 0.01036797, + "balance_loss_clip": 1.25363541, + "balance_loss_mlp": 1.01493394, + "epoch": 0.5103261686457238, + "flos": 23159818379520.0, + "grad_norm": 2.05230218229551, + "language_loss": 0.7390396, + "learning_rate": 2.030182134581827e-06, + "loss": 0.76361763, + "num_input_tokens_seen": 182543505, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.21862793, + "step": 8488, + "time_per_iteration": 2.8650991916656494 + }, + { + "auxiliary_loss_clip": 0.0145235, + "auxiliary_loss_mlp": 0.01039204, + "balance_loss_clip": 1.27871299, + "balance_loss_mlp": 1.01706648, + "epoch": 0.5103862918983917, + "flos": 14327317820160.0, + "grad_norm": 1.8176735050634334, + "language_loss": 0.70638013, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.7312957, + "num_input_tokens_seen": 182562250, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.22131348, + "step": 8489, + "time_per_iteration": 2.858232259750366 + }, + { + "auxiliary_loss_clip": 0.01427147, + "auxiliary_loss_mlp": 0.01039652, + "balance_loss_clip": 1.25692677, + "balance_loss_mlp": 1.0179913, + "epoch": 0.5104464151510597, + "flos": 25859556973440.0, + "grad_norm": 3.520186770964302, + "language_loss": 0.73314935, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.75781733, + "num_input_tokens_seen": 182581910, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.2166748, + "step": 8490, + "time_per_iteration": 2.8718101978302 + }, + { + "auxiliary_loss_clip": 0.0141633, + "auxiliary_loss_mlp": 0.01038373, + "balance_loss_clip": 1.25035453, + "balance_loss_mlp": 1.0168674, + "epoch": 0.5105065384037276, + "flos": 21662907546240.0, + "grad_norm": 1.5323945782095576, + "language_loss": 0.81554991, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.84009689, + "num_input_tokens_seen": 182601350, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.21508789, + "step": 8491, + "time_per_iteration": 2.8677096366882324 + }, + { + "auxiliary_loss_clip": 0.01406408, + "auxiliary_loss_mlp": 0.01037752, + "balance_loss_clip": 1.24294424, + "balance_loss_mlp": 1.01727128, + "epoch": 0.5105666616563956, + "flos": 22501649249280.0, + "grad_norm": 4.1781945342388305, + "language_loss": 0.80294174, + "learning_rate": 2.028624456259728e-06, + "loss": 0.82738328, + "num_input_tokens_seen": 182619660, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.20471191, + "step": 8492, + "time_per_iteration": 2.848491668701172 + }, + { + "auxiliary_loss_clip": 0.01439867, + "auxiliary_loss_mlp": 0.01042878, + "balance_loss_clip": 1.26657033, + "balance_loss_mlp": 1.02059722, + "epoch": 0.5106267849090635, + "flos": 22466014329600.0, + "grad_norm": 1.7787385288085535, + "language_loss": 0.78721428, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.8120417, + "num_input_tokens_seen": 182639815, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.22277832, + "step": 8493, + "time_per_iteration": 2.86641788482666 + }, + { + "auxiliary_loss_clip": 0.01431561, + "auxiliary_loss_mlp": 0.01040713, + "balance_loss_clip": 1.26132786, + "balance_loss_mlp": 1.01797986, + "epoch": 0.5106869081617316, + "flos": 23556779424000.0, + "grad_norm": 1.8551252394940954, + "language_loss": 0.84677356, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.87149632, + "num_input_tokens_seen": 182659655, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.22717285, + "step": 8494, + "time_per_iteration": 2.907911777496338 + }, + { + "auxiliary_loss_clip": 0.0143045, + "auxiliary_loss_mlp": 0.01035945, + "balance_loss_clip": 1.26022434, + "balance_loss_mlp": 1.01519036, + "epoch": 0.5107470314143995, + "flos": 26803167419520.0, + "grad_norm": 2.091037092639621, + "language_loss": 0.79817587, + "learning_rate": 2.027456186069326e-06, + "loss": 0.8228398, + "num_input_tokens_seen": 182677075, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.2076416, + "step": 8495, + "time_per_iteration": 2.881962299346924 + }, + { + "auxiliary_loss_clip": 0.01431582, + "auxiliary_loss_mlp": 0.01039164, + "balance_loss_clip": 1.26166749, + "balance_loss_mlp": 1.01814771, + "epoch": 0.5108071546670675, + "flos": 25750842422400.0, + "grad_norm": 2.0340424922799083, + "language_loss": 0.7879591, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.81266659, + "num_input_tokens_seen": 182699625, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.21008301, + "step": 8496, + "time_per_iteration": 2.966218948364258 + }, + { + "auxiliary_loss_clip": 0.01419311, + "auxiliary_loss_mlp": 0.01037372, + "balance_loss_clip": 1.25336528, + "balance_loss_mlp": 1.0156405, + "epoch": 0.5108672779197354, + "flos": 18706756815360.0, + "grad_norm": 2.0640895084225144, + "language_loss": 0.7925334, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.81710023, + "num_input_tokens_seen": 182717020, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.21740723, + "step": 8497, + "time_per_iteration": 2.835843324661255 + }, + { + "auxiliary_loss_clip": 0.01418484, + "auxiliary_loss_mlp": 0.01035875, + "balance_loss_clip": 1.25110984, + "balance_loss_mlp": 1.01478744, + "epoch": 0.5109274011724034, + "flos": 26699520286080.0, + "grad_norm": 2.00093596672389, + "language_loss": 0.82445586, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.8489995, + "num_input_tokens_seen": 182736955, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.2109375, + "step": 8498, + "time_per_iteration": 2.914102792739868 + }, + { + "auxiliary_loss_clip": 0.01414831, + "auxiliary_loss_mlp": 0.01042214, + "balance_loss_clip": 1.24974549, + "balance_loss_mlp": 1.01871812, + "epoch": 0.5109875244250714, + "flos": 22794329733120.0, + "grad_norm": 2.13561477992394, + "language_loss": 0.72403753, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.74860799, + "num_input_tokens_seen": 182757620, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.23498535, + "step": 8499, + "time_per_iteration": 2.887773036956787 + }, + { + "auxiliary_loss_clip": 0.01443465, + "auxiliary_loss_mlp": 0.01037079, + "balance_loss_clip": 1.27218497, + "balance_loss_mlp": 1.01409519, + "epoch": 0.5110476476777394, + "flos": 35601616627200.0, + "grad_norm": 1.5207032223026902, + "language_loss": 0.72883856, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.75364399, + "num_input_tokens_seen": 182780195, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.22973633, + "step": 8500, + "time_per_iteration": 3.006488084793091 + }, + { + "auxiliary_loss_clip": 0.0144705, + "auxiliary_loss_mlp": 0.01039376, + "balance_loss_clip": 1.27033281, + "balance_loss_mlp": 1.01592755, + "epoch": 0.5111077709304074, + "flos": 19290036522240.0, + "grad_norm": 3.931245653276883, + "language_loss": 0.64368749, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.66855174, + "num_input_tokens_seen": 182795765, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.23461914, + "step": 8501, + "time_per_iteration": 2.8542048931121826 + }, + { + "auxiliary_loss_clip": 0.01432117, + "auxiliary_loss_mlp": 0.01038246, + "balance_loss_clip": 1.25890028, + "balance_loss_mlp": 1.01578712, + "epoch": 0.5111678941830753, + "flos": 20678051825280.0, + "grad_norm": 1.6880578447338213, + "language_loss": 0.884152, + "learning_rate": 2.024730186540907e-06, + "loss": 0.90885562, + "num_input_tokens_seen": 182813120, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.2244873, + "step": 8502, + "time_per_iteration": 2.8260998725891113 + }, + { + "auxiliary_loss_clip": 0.01416189, + "auxiliary_loss_mlp": 0.01035699, + "balance_loss_clip": 1.24792182, + "balance_loss_mlp": 1.01300144, + "epoch": 0.5112280174357433, + "flos": 26299437350400.0, + "grad_norm": 1.4873440754669713, + "language_loss": 0.83154225, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.85606116, + "num_input_tokens_seen": 182835745, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.22717285, + "step": 8503, + "time_per_iteration": 2.927774429321289 + }, + { + "auxiliary_loss_clip": 0.01231329, + "auxiliary_loss_mlp": 0.01046073, + "balance_loss_clip": 1.13070178, + "balance_loss_mlp": 1.01498365, + "epoch": 0.5112881406884112, + "flos": 59499787201920.0, + "grad_norm": 0.8581867575325997, + "language_loss": 0.63910949, + "learning_rate": 2.023951320871339e-06, + "loss": 0.66188359, + "num_input_tokens_seen": 182892540, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.31054688, + "step": 8504, + "time_per_iteration": 3.3933780193328857 + }, + { + "auxiliary_loss_clip": 0.01419903, + "auxiliary_loss_mlp": 0.01032978, + "balance_loss_clip": 1.25241613, + "balance_loss_mlp": 1.01036453, + "epoch": 0.5113482639410792, + "flos": 26480055168000.0, + "grad_norm": 1.8758401628845973, + "language_loss": 0.8468399, + "learning_rate": 2.023561886666816e-06, + "loss": 0.87136877, + "num_input_tokens_seen": 182911515, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.22619629, + "step": 8505, + "time_per_iteration": 2.9149420261383057 + }, + { + "auxiliary_loss_clip": 0.01417343, + "auxiliary_loss_mlp": 0.01037404, + "balance_loss_clip": 1.25114751, + "balance_loss_mlp": 1.01468241, + "epoch": 0.5114083871937471, + "flos": 29907241960320.0, + "grad_norm": 2.297806455726351, + "language_loss": 0.76209646, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.78664398, + "num_input_tokens_seen": 182930860, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.22717285, + "step": 8506, + "time_per_iteration": 4.3589324951171875 + }, + { + "auxiliary_loss_clip": 0.01414429, + "auxiliary_loss_mlp": 0.01038965, + "balance_loss_clip": 1.24666297, + "balance_loss_mlp": 1.01489627, + "epoch": 0.5114685104464152, + "flos": 24324794225280.0, + "grad_norm": 1.697927304266879, + "language_loss": 0.58565152, + "learning_rate": 2.022783015592131e-06, + "loss": 0.61018538, + "num_input_tokens_seen": 182949960, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.24060059, + "step": 8507, + "time_per_iteration": 2.856609582901001 + }, + { + "auxiliary_loss_clip": 0.0142919, + "auxiliary_loss_mlp": 0.01038311, + "balance_loss_clip": 1.2608496, + "balance_loss_mlp": 1.01543486, + "epoch": 0.5115286336990831, + "flos": 17028097044480.0, + "grad_norm": 2.606928141595282, + "language_loss": 0.86179835, + "learning_rate": 2.022393578751503e-06, + "loss": 0.88647342, + "num_input_tokens_seen": 182968085, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.2286377, + "step": 8508, + "time_per_iteration": 2.848177671432495 + }, + { + "auxiliary_loss_clip": 0.01419511, + "auxiliary_loss_mlp": 0.01036679, + "balance_loss_clip": 1.25178432, + "balance_loss_mlp": 1.01400578, + "epoch": 0.5115887569517511, + "flos": 23669837475840.0, + "grad_norm": 2.4207397662807906, + "language_loss": 0.73308265, + "learning_rate": 2.022004141061709e-06, + "loss": 0.75764459, + "num_input_tokens_seen": 182987275, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.22680664, + "step": 8509, + "time_per_iteration": 2.861140489578247 + }, + { + "auxiliary_loss_clip": 0.01410964, + "auxiliary_loss_mlp": 0.01036402, + "balance_loss_clip": 1.24672747, + "balance_loss_mlp": 1.01463473, + "epoch": 0.511648880204419, + "flos": 16115778017280.0, + "grad_norm": 2.8864417737694428, + "language_loss": 0.76715553, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.7916292, + "num_input_tokens_seen": 183004700, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.2175293, + "step": 8510, + "time_per_iteration": 2.7844481468200684 + }, + { + "auxiliary_loss_clip": 0.01428313, + "auxiliary_loss_mlp": 0.01037708, + "balance_loss_clip": 1.26259041, + "balance_loss_mlp": 1.01517737, + "epoch": 0.511709003457087, + "flos": 32647773381120.0, + "grad_norm": 1.943786921543668, + "language_loss": 0.71331018, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.73797035, + "num_input_tokens_seen": 183025830, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.22558594, + "step": 8511, + "time_per_iteration": 2.962165355682373 + }, + { + "auxiliary_loss_clip": 0.01428651, + "auxiliary_loss_mlp": 0.01036546, + "balance_loss_clip": 1.26280046, + "balance_loss_mlp": 1.01423001, + "epoch": 0.511769126709755, + "flos": 21772029300480.0, + "grad_norm": 2.166333767808703, + "language_loss": 0.67615151, + "learning_rate": 2.020835823045001e-06, + "loss": 0.7008034, + "num_input_tokens_seen": 183045140, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.2232666, + "step": 8512, + "time_per_iteration": 2.8737900257110596 + }, + { + "auxiliary_loss_clip": 0.01424425, + "auxiliary_loss_mlp": 0.01039625, + "balance_loss_clip": 1.25503671, + "balance_loss_mlp": 1.01679635, + "epoch": 0.511829249962423, + "flos": 23926023388800.0, + "grad_norm": 1.891302863438915, + "language_loss": 0.68263471, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.70727527, + "num_input_tokens_seen": 183063935, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.22839355, + "step": 8513, + "time_per_iteration": 2.8625845909118652 + }, + { + "auxiliary_loss_clip": 0.01419399, + "auxiliary_loss_mlp": 0.01039426, + "balance_loss_clip": 1.25372791, + "balance_loss_mlp": 1.01656151, + "epoch": 0.511889373215091, + "flos": 23736085142400.0, + "grad_norm": 2.733267677875469, + "language_loss": 0.69553816, + "learning_rate": 2.0200569403921e-06, + "loss": 0.72012645, + "num_input_tokens_seen": 183084135, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.22839355, + "step": 8514, + "time_per_iteration": 2.926398754119873 + }, + { + "auxiliary_loss_clip": 0.01422022, + "auxiliary_loss_mlp": 0.01042864, + "balance_loss_clip": 1.25409102, + "balance_loss_mlp": 1.02101302, + "epoch": 0.5119494964677589, + "flos": 28123351488000.0, + "grad_norm": 2.146318832323308, + "language_loss": 0.67199099, + "learning_rate": 2.019667497917424e-06, + "loss": 0.6966399, + "num_input_tokens_seen": 183104570, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.21850586, + "step": 8515, + "time_per_iteration": 2.9258053302764893 + }, + { + "auxiliary_loss_clip": 0.01414138, + "auxiliary_loss_mlp": 0.01042001, + "balance_loss_clip": 1.24893093, + "balance_loss_mlp": 1.01948214, + "epoch": 0.5120096197204269, + "flos": 24984094475520.0, + "grad_norm": 1.884097709251792, + "language_loss": 0.76684868, + "learning_rate": 2.019278054696955e-06, + "loss": 0.79141009, + "num_input_tokens_seen": 183123850, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.22473145, + "step": 8516, + "time_per_iteration": 2.8912696838378906 + }, + { + "auxiliary_loss_clip": 0.01424731, + "auxiliary_loss_mlp": 0.01041492, + "balance_loss_clip": 1.25930572, + "balance_loss_mlp": 1.01893711, + "epoch": 0.5120697429730948, + "flos": 17977091621760.0, + "grad_norm": 1.9087127385973055, + "language_loss": 0.78812855, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.81279075, + "num_input_tokens_seen": 183141725, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.22546387, + "step": 8517, + "time_per_iteration": 4.209630250930786 + }, + { + "auxiliary_loss_clip": 0.01442469, + "auxiliary_loss_mlp": 0.01042325, + "balance_loss_clip": 1.26922452, + "balance_loss_mlp": 1.01935351, + "epoch": 0.5121298662257628, + "flos": 23302131834240.0, + "grad_norm": 1.7765579553688344, + "language_loss": 0.74590963, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.77075756, + "num_input_tokens_seen": 183161300, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.22973633, + "step": 8518, + "time_per_iteration": 4.281785726547241 + }, + { + "auxiliary_loss_clip": 0.01433619, + "auxiliary_loss_mlp": 0.01041134, + "balance_loss_clip": 1.26399124, + "balance_loss_mlp": 1.01831722, + "epoch": 0.5121899894784308, + "flos": 17319917877120.0, + "grad_norm": 2.1151943022259494, + "language_loss": 0.79394674, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.81869429, + "num_input_tokens_seen": 183180495, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.22790527, + "step": 8519, + "time_per_iteration": 4.263570785522461 + }, + { + "auxiliary_loss_clip": 0.01422138, + "auxiliary_loss_mlp": 0.01042164, + "balance_loss_clip": 1.25635147, + "balance_loss_mlp": 1.01904893, + "epoch": 0.5122501127310988, + "flos": 24939229616640.0, + "grad_norm": 2.5199871791494357, + "language_loss": 0.79431629, + "learning_rate": 2.017720274652497e-06, + "loss": 0.81895936, + "num_input_tokens_seen": 183200330, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.23144531, + "step": 8520, + "time_per_iteration": 2.945737361907959 + }, + { + "auxiliary_loss_clip": 0.01441449, + "auxiliary_loss_mlp": 0.01043558, + "balance_loss_clip": 1.26828754, + "balance_loss_mlp": 1.02046704, + "epoch": 0.5123102359837667, + "flos": 18452199715200.0, + "grad_norm": 2.5635350502908736, + "language_loss": 0.82248294, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.84733301, + "num_input_tokens_seen": 183218230, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.23095703, + "step": 8521, + "time_per_iteration": 2.8297808170318604 + }, + { + "auxiliary_loss_clip": 0.0143158, + "auxiliary_loss_mlp": 0.0104057, + "balance_loss_clip": 1.26162863, + "balance_loss_mlp": 1.01783645, + "epoch": 0.5123703592364347, + "flos": 26695357764480.0, + "grad_norm": 1.7901024270291064, + "language_loss": 0.69225901, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.71698046, + "num_input_tokens_seen": 183236735, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.22729492, + "step": 8522, + "time_per_iteration": 2.899533748626709 + }, + { + "auxiliary_loss_clip": 0.01436355, + "auxiliary_loss_mlp": 0.01043377, + "balance_loss_clip": 1.26286149, + "balance_loss_mlp": 1.02036965, + "epoch": 0.5124304824891026, + "flos": 28815752949120.0, + "grad_norm": 1.6960102091478508, + "language_loss": 0.62392962, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.64872694, + "num_input_tokens_seen": 183257550, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.23010254, + "step": 8523, + "time_per_iteration": 2.9079203605651855 + }, + { + "auxiliary_loss_clip": 0.01430859, + "auxiliary_loss_mlp": 0.01039461, + "balance_loss_clip": 1.26285815, + "balance_loss_mlp": 1.01813412, + "epoch": 0.5124906057417706, + "flos": 21771712586880.0, + "grad_norm": 1.9594480063448338, + "language_loss": 0.7864846, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.81118774, + "num_input_tokens_seen": 183275515, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.21313477, + "step": 8524, + "time_per_iteration": 2.825228214263916 + }, + { + "auxiliary_loss_clip": 0.01430325, + "auxiliary_loss_mlp": 0.01037112, + "balance_loss_clip": 1.26297855, + "balance_loss_mlp": 1.01538038, + "epoch": 0.5125507289944387, + "flos": 18889953586560.0, + "grad_norm": 2.0672172255614636, + "language_loss": 0.75938892, + "learning_rate": 2.015773034588706e-06, + "loss": 0.78406322, + "num_input_tokens_seen": 183293880, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.21728516, + "step": 8525, + "time_per_iteration": 2.830467700958252 + }, + { + "auxiliary_loss_clip": 0.01441143, + "auxiliary_loss_mlp": 0.01038693, + "balance_loss_clip": 1.2696327, + "balance_loss_mlp": 1.01507747, + "epoch": 0.5126108522471066, + "flos": 35641685537280.0, + "grad_norm": 1.631232313952308, + "language_loss": 0.75207621, + "learning_rate": 2.015383584722531e-06, + "loss": 0.77687454, + "num_input_tokens_seen": 183315860, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.23620605, + "step": 8526, + "time_per_iteration": 2.95717716217041 + }, + { + "auxiliary_loss_clip": 0.01442986, + "auxiliary_loss_mlp": 0.01038163, + "balance_loss_clip": 1.27256632, + "balance_loss_mlp": 1.01506054, + "epoch": 0.5126709754997746, + "flos": 20199867085440.0, + "grad_norm": 1.697826780247972, + "language_loss": 0.66669095, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.69150245, + "num_input_tokens_seen": 183335480, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.23083496, + "step": 8527, + "time_per_iteration": 2.8749608993530273 + }, + { + "auxiliary_loss_clip": 0.01410235, + "auxiliary_loss_mlp": 0.01036324, + "balance_loss_clip": 1.24942636, + "balance_loss_mlp": 1.01441288, + "epoch": 0.5127310987524425, + "flos": 18597861285120.0, + "grad_norm": 1.7530964747077653, + "language_loss": 0.75079405, + "learning_rate": 2.014604683254908e-06, + "loss": 0.77525961, + "num_input_tokens_seen": 183354395, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.21899414, + "step": 8528, + "time_per_iteration": 2.8794853687286377 + }, + { + "auxiliary_loss_clip": 0.0142414, + "auxiliary_loss_mlp": 0.01040145, + "balance_loss_clip": 1.25706732, + "balance_loss_mlp": 1.0164938, + "epoch": 0.5127912220051105, + "flos": 22464656985600.0, + "grad_norm": 2.527393381917928, + "language_loss": 0.83930188, + "learning_rate": 2.014215231682995e-06, + "loss": 0.86394471, + "num_input_tokens_seen": 183372980, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.2364502, + "step": 8529, + "time_per_iteration": 2.870337724685669 + }, + { + "auxiliary_loss_clip": 0.01422706, + "auxiliary_loss_mlp": 0.01038123, + "balance_loss_clip": 1.25569916, + "balance_loss_mlp": 1.01589036, + "epoch": 0.5128513452577784, + "flos": 19101681843840.0, + "grad_norm": 1.9715142998212867, + "language_loss": 0.74938774, + "learning_rate": 2.01382577957204e-06, + "loss": 0.77399611, + "num_input_tokens_seen": 183390160, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.22229004, + "step": 8530, + "time_per_iteration": 2.8797667026519775 + }, + { + "auxiliary_loss_clip": 0.01241285, + "auxiliary_loss_mlp": 0.01037056, + "balance_loss_clip": 1.13450408, + "balance_loss_mlp": 1.01168871, + "epoch": 0.5129114685104464, + "flos": 67926838677120.0, + "grad_norm": 0.7437028506774569, + "language_loss": 0.60715401, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.62993741, + "num_input_tokens_seen": 183455280, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.25390625, + "step": 8531, + "time_per_iteration": 3.459826946258545 + }, + { + "auxiliary_loss_clip": 0.01447501, + "auxiliary_loss_mlp": 0.01038927, + "balance_loss_clip": 1.27500677, + "balance_loss_mlp": 1.01584864, + "epoch": 0.5129715917631144, + "flos": 20458903420800.0, + "grad_norm": 2.105932784327798, + "language_loss": 0.77859092, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.80345523, + "num_input_tokens_seen": 183473955, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.2310791, + "step": 8532, + "time_per_iteration": 2.86252498626709 + }, + { + "auxiliary_loss_clip": 0.01437454, + "auxiliary_loss_mlp": 0.01034738, + "balance_loss_clip": 1.26752067, + "balance_loss_mlp": 1.01149213, + "epoch": 0.5130317150157824, + "flos": 35129630424960.0, + "grad_norm": 1.9813395929652537, + "language_loss": 0.67956024, + "learning_rate": 2.012657420152597e-06, + "loss": 0.70428216, + "num_input_tokens_seen": 183497195, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.23254395, + "step": 8533, + "time_per_iteration": 2.993361473083496 + }, + { + "auxiliary_loss_clip": 0.01432487, + "auxiliary_loss_mlp": 0.01036435, + "balance_loss_clip": 1.26181054, + "balance_loss_mlp": 1.01355898, + "epoch": 0.5130918382684503, + "flos": 19801005759360.0, + "grad_norm": 2.7855914618519395, + "language_loss": 0.82244754, + "learning_rate": 2.01226796603315e-06, + "loss": 0.84713674, + "num_input_tokens_seen": 183513675, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.22851562, + "step": 8534, + "time_per_iteration": 2.835568428039551 + }, + { + "auxiliary_loss_clip": 0.01427645, + "auxiliary_loss_mlp": 0.0103795, + "balance_loss_clip": 1.25854647, + "balance_loss_mlp": 1.01438189, + "epoch": 0.5131519615211183, + "flos": 26334303108480.0, + "grad_norm": 1.4340188796656528, + "language_loss": 0.64745963, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.67211556, + "num_input_tokens_seen": 183535165, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.23547363, + "step": 8535, + "time_per_iteration": 2.93871808052063 + }, + { + "auxiliary_loss_clip": 0.01430542, + "auxiliary_loss_mlp": 0.01036814, + "balance_loss_clip": 1.26302767, + "balance_loss_mlp": 1.01335359, + "epoch": 0.5132120847737862, + "flos": 19181276726400.0, + "grad_norm": 1.744597277076536, + "language_loss": 0.7013641, + "learning_rate": 2.011489056413418e-06, + "loss": 0.72603762, + "num_input_tokens_seen": 183553780, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.23461914, + "step": 8536, + "time_per_iteration": 2.8297345638275146 + }, + { + "auxiliary_loss_clip": 0.01432798, + "auxiliary_loss_mlp": 0.0103815, + "balance_loss_clip": 1.26017594, + "balance_loss_mlp": 1.01509452, + "epoch": 0.5132722080264542, + "flos": 20240162219520.0, + "grad_norm": 1.9744856201709355, + "language_loss": 0.71536362, + "learning_rate": 2.011099600942669e-06, + "loss": 0.74007308, + "num_input_tokens_seen": 183572285, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.23034668, + "step": 8537, + "time_per_iteration": 2.844421148300171 + }, + { + "auxiliary_loss_clip": 0.01430054, + "auxiliary_loss_mlp": 0.01035871, + "balance_loss_clip": 1.2583859, + "balance_loss_mlp": 1.01256561, + "epoch": 0.5133323312791223, + "flos": 16477466100480.0, + "grad_norm": 1.8927638436740795, + "language_loss": 0.80935514, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.83401436, + "num_input_tokens_seen": 183589330, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.23303223, + "step": 8538, + "time_per_iteration": 2.85497784614563 + }, + { + "auxiliary_loss_clip": 0.01425718, + "auxiliary_loss_mlp": 0.01035853, + "balance_loss_clip": 1.25717807, + "balance_loss_mlp": 1.01400161, + "epoch": 0.5133924545317902, + "flos": 26079338805120.0, + "grad_norm": 1.863756969893624, + "language_loss": 0.79242653, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.81704223, + "num_input_tokens_seen": 183609205, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.21850586, + "step": 8539, + "time_per_iteration": 2.9098622798919678 + }, + { + "auxiliary_loss_clip": 0.01430138, + "auxiliary_loss_mlp": 0.01039033, + "balance_loss_clip": 1.26125407, + "balance_loss_mlp": 1.01541781, + "epoch": 0.5134525777844582, + "flos": 29142575274240.0, + "grad_norm": 1.628102056320393, + "language_loss": 0.76677316, + "learning_rate": 2.009931232064105e-06, + "loss": 0.79146481, + "num_input_tokens_seen": 183629985, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.23608398, + "step": 8540, + "time_per_iteration": 2.9349610805511475 + }, + { + "auxiliary_loss_clip": 0.01450458, + "auxiliary_loss_mlp": 0.0104125, + "balance_loss_clip": 1.27618027, + "balance_loss_mlp": 1.01701427, + "epoch": 0.5135127010371261, + "flos": 17463090983040.0, + "grad_norm": 1.7502535060232345, + "language_loss": 0.75944614, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.78436321, + "num_input_tokens_seen": 183648220, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.24243164, + "step": 8541, + "time_per_iteration": 4.290792226791382 + }, + { + "auxiliary_loss_clip": 0.01435181, + "auxiliary_loss_mlp": 0.01039004, + "balance_loss_clip": 1.26491868, + "balance_loss_mlp": 1.01669991, + "epoch": 0.5135728242897941, + "flos": 21955090337280.0, + "grad_norm": 1.6437737208063912, + "language_loss": 0.71258759, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.73732948, + "num_input_tokens_seen": 183668230, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.22314453, + "step": 8542, + "time_per_iteration": 2.845881700515747 + }, + { + "auxiliary_loss_clip": 0.01423058, + "auxiliary_loss_mlp": 0.01037439, + "balance_loss_clip": 1.25334382, + "balance_loss_mlp": 1.01306033, + "epoch": 0.513632947542462, + "flos": 22685207978880.0, + "grad_norm": 2.1632320383704773, + "language_loss": 0.80495483, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.8295598, + "num_input_tokens_seen": 183687800, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.24365234, + "step": 8543, + "time_per_iteration": 2.8472506999969482 + }, + { + "auxiliary_loss_clip": 0.01424077, + "auxiliary_loss_mlp": 0.01039495, + "balance_loss_clip": 1.2574259, + "balance_loss_mlp": 1.01485455, + "epoch": 0.51369307079513, + "flos": 29468221234560.0, + "grad_norm": 3.314611651084389, + "language_loss": 0.68673265, + "learning_rate": 2.008373401689299e-06, + "loss": 0.71136832, + "num_input_tokens_seen": 183709025, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.24645996, + "step": 8544, + "time_per_iteration": 2.897733211517334 + }, + { + "auxiliary_loss_clip": 0.01438581, + "auxiliary_loss_mlp": 0.01037828, + "balance_loss_clip": 1.26556849, + "balance_loss_mlp": 1.0156436, + "epoch": 0.513753194047798, + "flos": 18998622892800.0, + "grad_norm": 4.447531597314397, + "language_loss": 0.73228896, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.75705302, + "num_input_tokens_seen": 183725740, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.22192383, + "step": 8545, + "time_per_iteration": 2.84002423286438 + }, + { + "auxiliary_loss_clip": 0.01431548, + "auxiliary_loss_mlp": 0.01046137, + "balance_loss_clip": 1.25982928, + "balance_loss_mlp": 1.02237821, + "epoch": 0.513813317300466, + "flos": 17830841869440.0, + "grad_norm": 3.3879449544081646, + "language_loss": 0.82699788, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.85177469, + "num_input_tokens_seen": 183743995, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.23779297, + "step": 8546, + "time_per_iteration": 2.8078904151916504 + }, + { + "auxiliary_loss_clip": 0.01437686, + "auxiliary_loss_mlp": 0.01038337, + "balance_loss_clip": 1.26638234, + "balance_loss_mlp": 1.01553237, + "epoch": 0.5138734405531339, + "flos": 24071865937920.0, + "grad_norm": 1.670799900315332, + "language_loss": 0.74233508, + "learning_rate": 2.007205025522544e-06, + "loss": 0.76709533, + "num_input_tokens_seen": 183764150, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.22790527, + "step": 8547, + "time_per_iteration": 2.8861279487609863 + }, + { + "auxiliary_loss_clip": 0.01423114, + "auxiliary_loss_mlp": 0.0103982, + "balance_loss_clip": 1.25443709, + "balance_loss_mlp": 1.01757586, + "epoch": 0.5139335638058019, + "flos": 26106603436800.0, + "grad_norm": 1.5665632817531792, + "language_loss": 0.73918307, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.76381242, + "num_input_tokens_seen": 183783280, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.22253418, + "step": 8548, + "time_per_iteration": 2.886186122894287 + }, + { + "auxiliary_loss_clip": 0.01433551, + "auxiliary_loss_mlp": 0.01040071, + "balance_loss_clip": 1.2636354, + "balance_loss_mlp": 1.01658678, + "epoch": 0.5139936870584698, + "flos": 18926538647040.0, + "grad_norm": 3.0572230282780057, + "language_loss": 0.83179772, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.85653394, + "num_input_tokens_seen": 183800725, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.23498535, + "step": 8549, + "time_per_iteration": 2.811655282974243 + }, + { + "auxiliary_loss_clip": 0.01417894, + "auxiliary_loss_mlp": 0.01037777, + "balance_loss_clip": 1.25119901, + "balance_loss_mlp": 1.01580656, + "epoch": 0.5140538103111378, + "flos": 16152679791360.0, + "grad_norm": 2.1407275121904794, + "language_loss": 0.72996813, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.75452483, + "num_input_tokens_seen": 183818735, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.21984863, + "step": 8550, + "time_per_iteration": 2.850909471511841 + }, + { + "auxiliary_loss_clip": 0.01439972, + "auxiliary_loss_mlp": 0.01035449, + "balance_loss_clip": 1.26656544, + "balance_loss_mlp": 1.01265669, + "epoch": 0.5141139335638057, + "flos": 22430605633920.0, + "grad_norm": 1.5284276993503516, + "language_loss": 0.75932497, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.78407919, + "num_input_tokens_seen": 183840015, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.22766113, + "step": 8551, + "time_per_iteration": 4.37551474571228 + }, + { + "auxiliary_loss_clip": 0.01421046, + "auxiliary_loss_mlp": 0.01036661, + "balance_loss_clip": 1.25617242, + "balance_loss_mlp": 1.01531076, + "epoch": 0.5141740568164738, + "flos": 27101141544960.0, + "grad_norm": 1.7124714405554866, + "language_loss": 0.69616604, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.72074318, + "num_input_tokens_seen": 183860145, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.21350098, + "step": 8552, + "time_per_iteration": 2.902970314025879 + }, + { + "auxiliary_loss_clip": 0.01439824, + "auxiliary_loss_mlp": 0.01036515, + "balance_loss_clip": 1.26765919, + "balance_loss_mlp": 1.01421094, + "epoch": 0.5142341800691418, + "flos": 24984139720320.0, + "grad_norm": 4.595494278346746, + "language_loss": 0.7552011, + "learning_rate": 2.004868266210965e-06, + "loss": 0.77996445, + "num_input_tokens_seen": 183880540, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.22314453, + "step": 8553, + "time_per_iteration": 4.314698696136475 + }, + { + "auxiliary_loss_clip": 0.01424721, + "auxiliary_loss_mlp": 0.01034653, + "balance_loss_clip": 1.25685859, + "balance_loss_mlp": 1.01350522, + "epoch": 0.5142943033218097, + "flos": 20714320172160.0, + "grad_norm": 1.5983430933554927, + "language_loss": 0.68087047, + "learning_rate": 2.004478805593435e-06, + "loss": 0.70546424, + "num_input_tokens_seen": 183900895, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.21142578, + "step": 8554, + "time_per_iteration": 4.346302032470703 + }, + { + "auxiliary_loss_clip": 0.01441748, + "auxiliary_loss_mlp": 0.01038366, + "balance_loss_clip": 1.26670253, + "balance_loss_mlp": 1.01500082, + "epoch": 0.5143544265744777, + "flos": 22934652416640.0, + "grad_norm": 1.7253439549209237, + "language_loss": 0.74359274, + "learning_rate": 2.004089344806068e-06, + "loss": 0.76839387, + "num_input_tokens_seen": 183920335, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.23364258, + "step": 8555, + "time_per_iteration": 2.8465735912323 + }, + { + "auxiliary_loss_clip": 0.01438673, + "auxiliary_loss_mlp": 0.0103785, + "balance_loss_clip": 1.26834881, + "balance_loss_mlp": 1.01516485, + "epoch": 0.5144145498271456, + "flos": 15929052151680.0, + "grad_norm": 2.345357059345884, + "language_loss": 0.76137418, + "learning_rate": 2.003699883863633e-06, + "loss": 0.78613937, + "num_input_tokens_seen": 183936220, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.22668457, + "step": 8556, + "time_per_iteration": 2.8128280639648438 + }, + { + "auxiliary_loss_clip": 0.01414731, + "auxiliary_loss_mlp": 0.01037117, + "balance_loss_clip": 1.24845719, + "balance_loss_mlp": 1.01507556, + "epoch": 0.5144746730798136, + "flos": 19690345681920.0, + "grad_norm": 2.2969614961983713, + "language_loss": 0.87041146, + "learning_rate": 2.003310422780898e-06, + "loss": 0.89492995, + "num_input_tokens_seen": 183953250, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.22058105, + "step": 8557, + "time_per_iteration": 2.8290698528289795 + }, + { + "auxiliary_loss_clip": 0.0141025, + "auxiliary_loss_mlp": 0.01034431, + "balance_loss_clip": 1.24595571, + "balance_loss_mlp": 1.01371241, + "epoch": 0.5145347963324816, + "flos": 23925208982400.0, + "grad_norm": 1.5433023268916481, + "language_loss": 0.89690745, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.92135423, + "num_input_tokens_seen": 183973865, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.20727539, + "step": 8558, + "time_per_iteration": 2.899775743484497 + }, + { + "auxiliary_loss_clip": 0.01411073, + "auxiliary_loss_mlp": 0.01037456, + "balance_loss_clip": 1.24776888, + "balance_loss_mlp": 1.01429367, + "epoch": 0.5145949195851496, + "flos": 18269183923200.0, + "grad_norm": 1.8422062224290896, + "language_loss": 0.66216087, + "learning_rate": 2.002531500253602e-06, + "loss": 0.68664616, + "num_input_tokens_seen": 183992555, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.23156738, + "step": 8559, + "time_per_iteration": 2.8489816188812256 + }, + { + "auxiliary_loss_clip": 0.01419818, + "auxiliary_loss_mlp": 0.01034688, + "balance_loss_clip": 1.253057, + "balance_loss_mlp": 1.01334965, + "epoch": 0.5146550428378175, + "flos": 26224547927040.0, + "grad_norm": 1.6802778885134806, + "language_loss": 0.63894749, + "learning_rate": 2.002142038838577e-06, + "loss": 0.6634925, + "num_input_tokens_seen": 184010825, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.21337891, + "step": 8560, + "time_per_iteration": 2.873159408569336 + }, + { + "auxiliary_loss_clip": 0.01427496, + "auxiliary_loss_mlp": 0.01033966, + "balance_loss_clip": 1.25955343, + "balance_loss_mlp": 1.0123055, + "epoch": 0.5147151660904855, + "flos": 22684348327680.0, + "grad_norm": 1.5622693371043959, + "language_loss": 0.70944893, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.73406351, + "num_input_tokens_seen": 184030155, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.2166748, + "step": 8561, + "time_per_iteration": 2.881477117538452 + }, + { + "auxiliary_loss_clip": 0.01426005, + "auxiliary_loss_mlp": 0.0103869, + "balance_loss_clip": 1.25761962, + "balance_loss_mlp": 1.01750636, + "epoch": 0.5147752893431534, + "flos": 24983053845120.0, + "grad_norm": 1.5017496340486944, + "language_loss": 0.67342985, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.69807684, + "num_input_tokens_seen": 184051440, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.21179199, + "step": 8562, + "time_per_iteration": 2.8703243732452393 + }, + { + "auxiliary_loss_clip": 0.01439901, + "auxiliary_loss_mlp": 0.01037687, + "balance_loss_clip": 1.26994205, + "balance_loss_mlp": 1.01621723, + "epoch": 0.5148354125958214, + "flos": 22754215578240.0, + "grad_norm": 3.7265979325736773, + "language_loss": 0.78623223, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.81100816, + "num_input_tokens_seen": 184070205, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.21472168, + "step": 8563, + "time_per_iteration": 2.912950277328491 + }, + { + "auxiliary_loss_clip": 0.01441795, + "auxiliary_loss_mlp": 0.01040337, + "balance_loss_clip": 1.26799798, + "balance_loss_mlp": 1.01659048, + "epoch": 0.5148955358484893, + "flos": 23077282584960.0, + "grad_norm": 1.9758184765681857, + "language_loss": 0.84045267, + "learning_rate": 2.0005841925139e-06, + "loss": 0.86527401, + "num_input_tokens_seen": 184087345, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.23754883, + "step": 8564, + "time_per_iteration": 2.8751637935638428 + }, + { + "auxiliary_loss_clip": 0.01458272, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.28665781, + "balance_loss_mlp": 1.01348543, + "epoch": 0.5149556591011574, + "flos": 20349962645760.0, + "grad_norm": 1.8730441111013136, + "language_loss": 0.74011374, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.76504707, + "num_input_tokens_seen": 184107110, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.21557617, + "step": 8565, + "time_per_iteration": 2.876016616821289 + }, + { + "auxiliary_loss_clip": 0.01452457, + "auxiliary_loss_mlp": 0.01034385, + "balance_loss_clip": 1.2778337, + "balance_loss_mlp": 1.01245058, + "epoch": 0.5150157823538254, + "flos": 22648532428800.0, + "grad_norm": 2.00728745813461, + "language_loss": 0.69277853, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.71764696, + "num_input_tokens_seen": 184127105, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.21923828, + "step": 8566, + "time_per_iteration": 2.884556293487549 + }, + { + "auxiliary_loss_clip": 0.0145258, + "auxiliary_loss_mlp": 0.01036372, + "balance_loss_clip": 1.27557302, + "balance_loss_mlp": 1.01485455, + "epoch": 0.5150759056064933, + "flos": 26078750622720.0, + "grad_norm": 1.8283237830297245, + "language_loss": 0.78864336, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.81353283, + "num_input_tokens_seen": 184148060, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.21520996, + "step": 8567, + "time_per_iteration": 2.874546527862549 + }, + { + "auxiliary_loss_clip": 0.01447375, + "auxiliary_loss_mlp": 0.01043144, + "balance_loss_clip": 1.27551389, + "balance_loss_mlp": 1.02114987, + "epoch": 0.5151360288591613, + "flos": 25962299210880.0, + "grad_norm": 2.7175653700919846, + "language_loss": 0.79828227, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.82318741, + "num_input_tokens_seen": 184166175, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.2199707, + "step": 8568, + "time_per_iteration": 2.9142093658447266 + }, + { + "auxiliary_loss_clip": 0.0141924, + "auxiliary_loss_mlp": 0.01034139, + "balance_loss_clip": 1.25256538, + "balance_loss_mlp": 1.0118711, + "epoch": 0.5151961521118292, + "flos": 18514375349760.0, + "grad_norm": 2.0849067280410742, + "language_loss": 0.91622198, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.94075572, + "num_input_tokens_seen": 184182600, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.22265625, + "step": 8569, + "time_per_iteration": 2.8379158973693848 + }, + { + "auxiliary_loss_clip": 0.01446977, + "auxiliary_loss_mlp": 0.01039393, + "balance_loss_clip": 1.2746067, + "balance_loss_mlp": 1.01818562, + "epoch": 0.5152562753644973, + "flos": 22243065361920.0, + "grad_norm": 1.6608159036229495, + "language_loss": 0.77047825, + "learning_rate": 1.998247422657674e-06, + "loss": 0.79534197, + "num_input_tokens_seen": 184202020, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.21203613, + "step": 8570, + "time_per_iteration": 2.9448909759521484 + }, + { + "auxiliary_loss_clip": 0.01438893, + "auxiliary_loss_mlp": 0.0104721, + "balance_loss_clip": 1.26856351, + "balance_loss_mlp": 1.02357078, + "epoch": 0.5153163986171652, + "flos": 38450953088640.0, + "grad_norm": 1.5395901724754373, + "language_loss": 0.74449921, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.76936024, + "num_input_tokens_seen": 184224850, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.23657227, + "step": 8571, + "time_per_iteration": 2.9909887313842773 + }, + { + "auxiliary_loss_clip": 0.0123378, + "auxiliary_loss_mlp": 0.01043643, + "balance_loss_clip": 1.12884378, + "balance_loss_mlp": 1.01503265, + "epoch": 0.5153765218698332, + "flos": 66415585242240.0, + "grad_norm": 0.7768200589214824, + "language_loss": 0.52912205, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.55189627, + "num_input_tokens_seen": 184288520, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.28515625, + "step": 8572, + "time_per_iteration": 3.441209316253662 + }, + { + "auxiliary_loss_clip": 0.01425195, + "auxiliary_loss_mlp": 0.01037776, + "balance_loss_clip": 1.26077175, + "balance_loss_mlp": 1.01587725, + "epoch": 0.5154366451225011, + "flos": 24035326122240.0, + "grad_norm": 1.9383061055277306, + "language_loss": 0.7759099, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.80053955, + "num_input_tokens_seen": 184308565, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.21899414, + "step": 8573, + "time_per_iteration": 2.9549922943115234 + }, + { + "auxiliary_loss_clip": 0.01427681, + "auxiliary_loss_mlp": 0.01037206, + "balance_loss_clip": 1.26064324, + "balance_loss_mlp": 1.01437783, + "epoch": 0.5154967683751691, + "flos": 23477139296640.0, + "grad_norm": 1.8522111306105504, + "language_loss": 0.78557229, + "learning_rate": 1.996689577219102e-06, + "loss": 0.8102212, + "num_input_tokens_seen": 184326795, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.22814941, + "step": 8574, + "time_per_iteration": 2.8661134243011475 + }, + { + "auxiliary_loss_clip": 0.01410022, + "auxiliary_loss_mlp": 0.01038461, + "balance_loss_clip": 1.24393344, + "balance_loss_mlp": 1.01738441, + "epoch": 0.515556891627837, + "flos": 23816222962560.0, + "grad_norm": 1.8156620061907192, + "language_loss": 0.85733879, + "learning_rate": 1.996300116136367e-06, + "loss": 0.88182366, + "num_input_tokens_seen": 184345990, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.21081543, + "step": 8575, + "time_per_iteration": 2.890826940536499 + }, + { + "auxiliary_loss_clip": 0.01432172, + "auxiliary_loss_mlp": 0.01031896, + "balance_loss_clip": 1.26232529, + "balance_loss_mlp": 1.01033092, + "epoch": 0.515617014880505, + "flos": 19838043267840.0, + "grad_norm": 1.6311877269754578, + "language_loss": 0.77425218, + "learning_rate": 1.995910655193932e-06, + "loss": 0.79889286, + "num_input_tokens_seen": 184366300, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.21569824, + "step": 8576, + "time_per_iteration": 4.354767799377441 + }, + { + "auxiliary_loss_clip": 0.01458829, + "auxiliary_loss_mlp": 0.01038065, + "balance_loss_clip": 1.28054905, + "balance_loss_mlp": 1.01516545, + "epoch": 0.515677138133173, + "flos": 14253876230400.0, + "grad_norm": 2.6929223152262747, + "language_loss": 0.77100194, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.79597086, + "num_input_tokens_seen": 184383030, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.22888184, + "step": 8577, + "time_per_iteration": 2.8082566261291504 + }, + { + "auxiliary_loss_clip": 0.01437301, + "auxiliary_loss_mlp": 0.01038121, + "balance_loss_clip": 1.26562965, + "balance_loss_mlp": 1.01387382, + "epoch": 0.515737261385841, + "flos": 28300485456000.0, + "grad_norm": 1.648461017084631, + "language_loss": 0.81525743, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.84001166, + "num_input_tokens_seen": 184403410, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.24230957, + "step": 8578, + "time_per_iteration": 2.939502477645874 + }, + { + "auxiliary_loss_clip": 0.0141783, + "auxiliary_loss_mlp": 0.01035249, + "balance_loss_clip": 1.24995995, + "balance_loss_mlp": 1.01261127, + "epoch": 0.515797384638509, + "flos": 27903524411520.0, + "grad_norm": 1.701713119170849, + "language_loss": 0.76608086, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.79061157, + "num_input_tokens_seen": 184423830, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.22644043, + "step": 8579, + "time_per_iteration": 2.930098056793213 + }, + { + "auxiliary_loss_clip": 0.01426267, + "auxiliary_loss_mlp": 0.01035774, + "balance_loss_clip": 1.25619125, + "balance_loss_mlp": 1.01394713, + "epoch": 0.5158575078911769, + "flos": 23050606135680.0, + "grad_norm": 1.6834395028611489, + "language_loss": 0.80090529, + "learning_rate": 1.994352813122559e-06, + "loss": 0.8255257, + "num_input_tokens_seen": 184445050, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.21850586, + "step": 8580, + "time_per_iteration": 2.8765671253204346 + }, + { + "auxiliary_loss_clip": 0.01440472, + "auxiliary_loss_mlp": 0.01037085, + "balance_loss_clip": 1.26780593, + "balance_loss_mlp": 1.01370835, + "epoch": 0.5159176311438449, + "flos": 12649110497280.0, + "grad_norm": 2.102008924324362, + "language_loss": 0.73541576, + "learning_rate": 1.99396335310315e-06, + "loss": 0.76019132, + "num_input_tokens_seen": 184460775, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.23376465, + "step": 8581, + "time_per_iteration": 2.8574702739715576 + }, + { + "auxiliary_loss_clip": 0.01421442, + "auxiliary_loss_mlp": 0.01037288, + "balance_loss_clip": 1.25513172, + "balance_loss_mlp": 1.01546085, + "epoch": 0.5159777543965128, + "flos": 15566142458880.0, + "grad_norm": 2.404181812462474, + "language_loss": 0.7515825, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.77616978, + "num_input_tokens_seen": 184477365, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.21813965, + "step": 8582, + "time_per_iteration": 2.814892292022705 + }, + { + "auxiliary_loss_clip": 0.01425901, + "auxiliary_loss_mlp": 0.0103409, + "balance_loss_clip": 1.2589438, + "balance_loss_mlp": 1.01245379, + "epoch": 0.5160378776491809, + "flos": 23232038359680.0, + "grad_norm": 1.9164216890935626, + "language_loss": 0.669595, + "learning_rate": 1.99318443376583e-06, + "loss": 0.69419497, + "num_input_tokens_seen": 184497045, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.21643066, + "step": 8583, + "time_per_iteration": 2.8361470699310303 + }, + { + "auxiliary_loss_clip": 0.01437522, + "auxiliary_loss_mlp": 0.01041994, + "balance_loss_clip": 1.26871347, + "balance_loss_mlp": 1.01928449, + "epoch": 0.5160980009018488, + "flos": 21954637889280.0, + "grad_norm": 1.4106971058169335, + "language_loss": 0.76505542, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.78985053, + "num_input_tokens_seen": 184517675, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.22717285, + "step": 8584, + "time_per_iteration": 2.8420839309692383 + }, + { + "auxiliary_loss_clip": 0.01438041, + "auxiliary_loss_mlp": 0.01042835, + "balance_loss_clip": 1.26649523, + "balance_loss_mlp": 1.01999414, + "epoch": 0.5161581241545168, + "flos": 22794374977920.0, + "grad_norm": 2.134842685029386, + "language_loss": 0.79876035, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.82356906, + "num_input_tokens_seen": 184537745, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.22851562, + "step": 8585, + "time_per_iteration": 2.868936061859131 + }, + { + "auxiliary_loss_clip": 0.01409995, + "auxiliary_loss_mlp": 0.01033495, + "balance_loss_clip": 1.24757004, + "balance_loss_mlp": 1.01240647, + "epoch": 0.5162182474071847, + "flos": 19682970779520.0, + "grad_norm": 3.7350805582910276, + "language_loss": 0.80722094, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.83165586, + "num_input_tokens_seen": 184553630, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.21081543, + "step": 8586, + "time_per_iteration": 4.289619445800781 + }, + { + "auxiliary_loss_clip": 0.01418492, + "auxiliary_loss_mlp": 0.01039658, + "balance_loss_clip": 1.25177002, + "balance_loss_mlp": 1.01722252, + "epoch": 0.5162783706598527, + "flos": 20055427125120.0, + "grad_norm": 2.174855892957231, + "language_loss": 0.73329449, + "learning_rate": 1.991626598310701e-06, + "loss": 0.75787604, + "num_input_tokens_seen": 184573530, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.2244873, + "step": 8587, + "time_per_iteration": 2.860093116760254 + }, + { + "auxiliary_loss_clip": 0.01247973, + "auxiliary_loss_mlp": 0.01020779, + "balance_loss_clip": 1.14338362, + "balance_loss_mlp": 0.996746, + "epoch": 0.5163384939125206, + "flos": 69991419761280.0, + "grad_norm": 0.7292709598892758, + "language_loss": 0.57904816, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.60173559, + "num_input_tokens_seen": 184637875, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.24023438, + "step": 8588, + "time_per_iteration": 6.288766384124756 + }, + { + "auxiliary_loss_clip": 0.01428929, + "auxiliary_loss_mlp": 0.01037983, + "balance_loss_clip": 1.25837266, + "balance_loss_mlp": 1.01479697, + "epoch": 0.5163986171651886, + "flos": 17425012844160.0, + "grad_norm": 2.09518075568316, + "language_loss": 0.76002419, + "learning_rate": 1.990847682429185e-06, + "loss": 0.78469324, + "num_input_tokens_seen": 184656125, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.23181152, + "step": 8589, + "time_per_iteration": 2.8317511081695557 + }, + { + "auxiliary_loss_clip": 0.01432846, + "auxiliary_loss_mlp": 0.0103696, + "balance_loss_clip": 1.26329958, + "balance_loss_mlp": 1.01584768, + "epoch": 0.5164587404178566, + "flos": 21332103678720.0, + "grad_norm": 1.6475224654886933, + "language_loss": 0.67909479, + "learning_rate": 1.990458225001627e-06, + "loss": 0.70379281, + "num_input_tokens_seen": 184675920, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.21105957, + "step": 8590, + "time_per_iteration": 2.8623545169830322 + }, + { + "auxiliary_loss_clip": 0.01244416, + "auxiliary_loss_mlp": 0.01029625, + "balance_loss_clip": 1.14062238, + "balance_loss_mlp": 1.00330377, + "epoch": 0.5165188636705246, + "flos": 68087186824320.0, + "grad_norm": 0.785103453897445, + "language_loss": 0.55863392, + "learning_rate": 1.990068767935895e-06, + "loss": 0.58137429, + "num_input_tokens_seen": 184730520, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.26367188, + "step": 8591, + "time_per_iteration": 3.2376537322998047 + }, + { + "auxiliary_loss_clip": 0.0139777, + "auxiliary_loss_mlp": 0.010379, + "balance_loss_clip": 1.23838603, + "balance_loss_mlp": 1.0156436, + "epoch": 0.5165789869231926, + "flos": 19393728900480.0, + "grad_norm": 1.5857166825992317, + "language_loss": 0.82411504, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.84847176, + "num_input_tokens_seen": 184748340, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.22265625, + "step": 8592, + "time_per_iteration": 2.900273323059082 + }, + { + "auxiliary_loss_clip": 0.01415103, + "auxiliary_loss_mlp": 0.01030414, + "balance_loss_clip": 1.25099492, + "balance_loss_mlp": 1.00844419, + "epoch": 0.5166391101758605, + "flos": 20970144126720.0, + "grad_norm": 2.9296953171523588, + "language_loss": 0.83971125, + "learning_rate": 1.989289854948979e-06, + "loss": 0.86416638, + "num_input_tokens_seen": 184766615, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.21984863, + "step": 8593, + "time_per_iteration": 2.9012959003448486 + }, + { + "auxiliary_loss_clip": 0.01414455, + "auxiliary_loss_mlp": 0.01038126, + "balance_loss_clip": 1.24807239, + "balance_loss_mlp": 1.0155592, + "epoch": 0.5166992334285285, + "flos": 29474826975360.0, + "grad_norm": 2.117285919252505, + "language_loss": 0.70229453, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.72682029, + "num_input_tokens_seen": 184788075, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.22570801, + "step": 8594, + "time_per_iteration": 2.924818277359009 + }, + { + "auxiliary_loss_clip": 0.01417343, + "auxiliary_loss_mlp": 0.01036891, + "balance_loss_clip": 1.25171018, + "balance_loss_mlp": 1.01462281, + "epoch": 0.5167593566811964, + "flos": 20314372970880.0, + "grad_norm": 1.440637379396733, + "language_loss": 0.78186822, + "learning_rate": 1.988510943586582e-06, + "loss": 0.80641067, + "num_input_tokens_seen": 184808710, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.22253418, + "step": 8595, + "time_per_iteration": 2.895172357559204 + }, + { + "auxiliary_loss_clip": 0.01416238, + "auxiliary_loss_mlp": 0.01035485, + "balance_loss_clip": 1.25012255, + "balance_loss_mlp": 1.01400399, + "epoch": 0.5168194799338645, + "flos": 14619591100800.0, + "grad_norm": 1.6963020197701275, + "language_loss": 0.65837669, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.68289393, + "num_input_tokens_seen": 184826475, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.21484375, + "step": 8596, + "time_per_iteration": 2.834728240966797 + }, + { + "auxiliary_loss_clip": 0.01427272, + "auxiliary_loss_mlp": 0.01039303, + "balance_loss_clip": 1.26088214, + "balance_loss_mlp": 1.01647401, + "epoch": 0.5168796031865324, + "flos": 25017512400000.0, + "grad_norm": 1.6060162711811252, + "language_loss": 0.76429975, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.78896546, + "num_input_tokens_seen": 184845245, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.22827148, + "step": 8597, + "time_per_iteration": 2.9289236068725586 + }, + { + "auxiliary_loss_clip": 0.014271, + "auxiliary_loss_mlp": 0.01037246, + "balance_loss_clip": 1.25964975, + "balance_loss_mlp": 1.01493001, + "epoch": 0.5169397264392004, + "flos": 26950457802240.0, + "grad_norm": 1.7877374412665, + "language_loss": 0.82113898, + "learning_rate": 1.987342579847403e-06, + "loss": 0.8457824, + "num_input_tokens_seen": 184866605, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.2232666, + "step": 8598, + "time_per_iteration": 2.9432528018951416 + }, + { + "auxiliary_loss_clip": 0.01412665, + "auxiliary_loss_mlp": 0.01035935, + "balance_loss_clip": 1.246418, + "balance_loss_mlp": 1.0125227, + "epoch": 0.5169998496918683, + "flos": 25417957294080.0, + "grad_norm": 1.664915774289835, + "language_loss": 0.76155764, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.78604364, + "num_input_tokens_seen": 184886945, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.23413086, + "step": 8599, + "time_per_iteration": 2.8695733547210693 + }, + { + "auxiliary_loss_clip": 0.01418366, + "auxiliary_loss_mlp": 0.01034535, + "balance_loss_clip": 1.25428271, + "balance_loss_mlp": 1.01289809, + "epoch": 0.5170599729445363, + "flos": 24691459236480.0, + "grad_norm": 2.15065901035362, + "language_loss": 0.73668087, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.76120991, + "num_input_tokens_seen": 184905590, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.21618652, + "step": 8600, + "time_per_iteration": 2.9076287746429443 + }, + { + "auxiliary_loss_clip": 0.01420353, + "auxiliary_loss_mlp": 0.01038318, + "balance_loss_clip": 1.25509191, + "balance_loss_mlp": 1.01604986, + "epoch": 0.5171200961972042, + "flos": 21003833520000.0, + "grad_norm": 1.578070606654716, + "language_loss": 0.7529161, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.77750278, + "num_input_tokens_seen": 184925555, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.22265625, + "step": 8601, + "time_per_iteration": 2.833125114440918 + }, + { + "auxiliary_loss_clip": 0.01419744, + "auxiliary_loss_mlp": 0.01040213, + "balance_loss_clip": 1.25246656, + "balance_loss_mlp": 1.01714647, + "epoch": 0.5171802194498722, + "flos": 22755437187840.0, + "grad_norm": 3.1491846710680726, + "language_loss": 0.85497224, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.87957191, + "num_input_tokens_seen": 184944490, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.23071289, + "step": 8602, + "time_per_iteration": 2.8346335887908936 + }, + { + "auxiliary_loss_clip": 0.01417785, + "auxiliary_loss_mlp": 0.01037159, + "balance_loss_clip": 1.25068688, + "balance_loss_mlp": 1.01458073, + "epoch": 0.5172403427025402, + "flos": 28186567752960.0, + "grad_norm": 1.738717168138201, + "language_loss": 0.75564915, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.78019857, + "num_input_tokens_seen": 184963190, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.22570801, + "step": 8603, + "time_per_iteration": 2.8779842853546143 + }, + { + "auxiliary_loss_clip": 0.01433109, + "auxiliary_loss_mlp": 0.01041485, + "balance_loss_clip": 1.26472139, + "balance_loss_mlp": 1.01994419, + "epoch": 0.5173004659552082, + "flos": 20347066978560.0, + "grad_norm": 2.5264268562428174, + "language_loss": 0.74053353, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.76527953, + "num_input_tokens_seen": 184981220, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.21533203, + "step": 8604, + "time_per_iteration": 2.8286824226379395 + }, + { + "auxiliary_loss_clip": 0.01444193, + "auxiliary_loss_mlp": 0.0103502, + "balance_loss_clip": 1.26743686, + "balance_loss_mlp": 1.01108241, + "epoch": 0.5173605892078762, + "flos": 19072878888960.0, + "grad_norm": 2.363241814024563, + "language_loss": 0.86125481, + "learning_rate": 1.984616415277469e-06, + "loss": 0.88604689, + "num_input_tokens_seen": 184998810, + "router_z_loss_clip": 1.76855469, + "router_z_loss_mlp": 0.23937988, + "step": 8605, + "time_per_iteration": 2.855822801589966 + }, + { + "auxiliary_loss_clip": 0.01416442, + "auxiliary_loss_mlp": 0.01035204, + "balance_loss_clip": 1.24975896, + "balance_loss_mlp": 1.01285267, + "epoch": 0.5174207124605441, + "flos": 28005180773760.0, + "grad_norm": 1.517408712855388, + "language_loss": 0.65413946, + "learning_rate": 1.984226965411294e-06, + "loss": 0.67865592, + "num_input_tokens_seen": 185021185, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.22351074, + "step": 8606, + "time_per_iteration": 3.0367088317871094 + }, + { + "auxiliary_loss_clip": 0.01426361, + "auxiliary_loss_mlp": 0.01037325, + "balance_loss_clip": 1.2607795, + "balance_loss_mlp": 1.01500952, + "epoch": 0.5174808357132121, + "flos": 19505158139520.0, + "grad_norm": 1.4474617254817248, + "language_loss": 0.78195989, + "learning_rate": 1.983837516143234e-06, + "loss": 0.80659676, + "num_input_tokens_seen": 185038465, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.2232666, + "step": 8607, + "time_per_iteration": 2.9227051734924316 + }, + { + "auxiliary_loss_clip": 0.01423007, + "auxiliary_loss_mlp": 0.01045163, + "balance_loss_clip": 1.25592279, + "balance_loss_mlp": 1.02165449, + "epoch": 0.51754095896588, + "flos": 22794510712320.0, + "grad_norm": 1.600782028660385, + "language_loss": 0.72311389, + "learning_rate": 1.983448067488057e-06, + "loss": 0.74779564, + "num_input_tokens_seen": 185057340, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.23522949, + "step": 8608, + "time_per_iteration": 3.0031864643096924 + }, + { + "auxiliary_loss_clip": 0.01434514, + "auxiliary_loss_mlp": 0.01038157, + "balance_loss_clip": 1.26273942, + "balance_loss_mlp": 1.01460147, + "epoch": 0.5176010822185481, + "flos": 22678149790080.0, + "grad_norm": 1.9216289840562966, + "language_loss": 0.87841308, + "learning_rate": 1.983058619460531e-06, + "loss": 0.90313977, + "num_input_tokens_seen": 185074935, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.23583984, + "step": 8609, + "time_per_iteration": 2.889958143234253 + }, + { + "auxiliary_loss_clip": 0.01416238, + "auxiliary_loss_mlp": 0.01037251, + "balance_loss_clip": 1.24808836, + "balance_loss_mlp": 1.01487529, + "epoch": 0.517661205471216, + "flos": 23961703553280.0, + "grad_norm": 1.6186988628422483, + "language_loss": 0.74295551, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.76749045, + "num_input_tokens_seen": 185095050, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.22375488, + "step": 8610, + "time_per_iteration": 2.87986421585083 + }, + { + "auxiliary_loss_clip": 0.01434876, + "auxiliary_loss_mlp": 0.01035129, + "balance_loss_clip": 1.26209044, + "balance_loss_mlp": 1.01294446, + "epoch": 0.517721328723884, + "flos": 15604446821760.0, + "grad_norm": 2.9072636268469267, + "language_loss": 0.68749034, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.71219039, + "num_input_tokens_seen": 185112275, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.22192383, + "step": 8611, + "time_per_iteration": 4.315730571746826 + }, + { + "auxiliary_loss_clip": 0.01431761, + "auxiliary_loss_mlp": 0.01038321, + "balance_loss_clip": 1.26297545, + "balance_loss_mlp": 1.01489592, + "epoch": 0.5177814519765519, + "flos": 20970144126720.0, + "grad_norm": 2.1567603449227857, + "language_loss": 0.78425252, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.80895329, + "num_input_tokens_seen": 185132165, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.23425293, + "step": 8612, + "time_per_iteration": 2.8778679370880127 + }, + { + "auxiliary_loss_clip": 0.01424279, + "auxiliary_loss_mlp": 0.01037587, + "balance_loss_clip": 1.25432992, + "balance_loss_mlp": 1.01483023, + "epoch": 0.5178415752292199, + "flos": 17976865397760.0, + "grad_norm": 1.935708719116835, + "language_loss": 0.82669294, + "learning_rate": 1.981500833922294e-06, + "loss": 0.85131156, + "num_input_tokens_seen": 185151025, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.22753906, + "step": 8613, + "time_per_iteration": 2.8252224922180176 + }, + { + "auxiliary_loss_clip": 0.01437811, + "auxiliary_loss_mlp": 0.01038958, + "balance_loss_clip": 1.26670516, + "balance_loss_mlp": 1.0154022, + "epoch": 0.5179016984818878, + "flos": 17830389421440.0, + "grad_norm": 2.302460056280268, + "language_loss": 0.67783988, + "learning_rate": 1.981111389254541e-06, + "loss": 0.70260751, + "num_input_tokens_seen": 185168455, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.2355957, + "step": 8614, + "time_per_iteration": 2.7872633934020996 + }, + { + "auxiliary_loss_clip": 0.0143034, + "auxiliary_loss_mlp": 0.01036102, + "balance_loss_clip": 1.26096296, + "balance_loss_mlp": 1.0132376, + "epoch": 0.5179618217345558, + "flos": 17829077322240.0, + "grad_norm": 1.9150861113586388, + "language_loss": 0.87210906, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.89677346, + "num_input_tokens_seen": 185184415, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.2286377, + "step": 8615, + "time_per_iteration": 2.8271243572235107 + }, + { + "auxiliary_loss_clip": 0.01423481, + "auxiliary_loss_mlp": 0.01043619, + "balance_loss_clip": 1.25796497, + "balance_loss_mlp": 1.02044463, + "epoch": 0.5180219449872238, + "flos": 22530678428160.0, + "grad_norm": 1.8367271041958324, + "language_loss": 0.81343091, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.83810198, + "num_input_tokens_seen": 185202910, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.23181152, + "step": 8616, + "time_per_iteration": 2.945751905441284 + }, + { + "auxiliary_loss_clip": 0.01445834, + "auxiliary_loss_mlp": 0.01043678, + "balance_loss_clip": 1.27351022, + "balance_loss_mlp": 1.01932359, + "epoch": 0.5180820682398918, + "flos": 23926159123200.0, + "grad_norm": 1.9464631489514959, + "language_loss": 0.76093554, + "learning_rate": 1.9799430596079e-06, + "loss": 0.78583062, + "num_input_tokens_seen": 185223085, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.24353027, + "step": 8617, + "time_per_iteration": 2.906691789627075 + }, + { + "auxiliary_loss_clip": 0.01439866, + "auxiliary_loss_mlp": 0.01046758, + "balance_loss_clip": 1.26937747, + "balance_loss_mlp": 1.02432275, + "epoch": 0.5181421914925598, + "flos": 16987982889600.0, + "grad_norm": 2.308076907681896, + "language_loss": 0.70751309, + "learning_rate": 1.979553617893785e-06, + "loss": 0.73237932, + "num_input_tokens_seen": 185241295, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.22436523, + "step": 8618, + "time_per_iteration": 2.8437342643737793 + }, + { + "auxiliary_loss_clip": 0.0124703, + "auxiliary_loss_mlp": 0.01047363, + "balance_loss_clip": 1.14251626, + "balance_loss_mlp": 1.02313924, + "epoch": 0.5182023147452277, + "flos": 66091839563520.0, + "grad_norm": 0.952369521090946, + "language_loss": 0.67315549, + "learning_rate": 1.979164176954999e-06, + "loss": 0.6960994, + "num_input_tokens_seen": 185298295, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.2421875, + "step": 8619, + "time_per_iteration": 3.311448097229004 + }, + { + "auxiliary_loss_clip": 0.01408471, + "auxiliary_loss_mlp": 0.01036041, + "balance_loss_clip": 1.24539638, + "balance_loss_mlp": 1.01409423, + "epoch": 0.5182624379978957, + "flos": 18197099677440.0, + "grad_norm": 2.0673089570892196, + "language_loss": 0.81179458, + "learning_rate": 1.97877473680631e-06, + "loss": 0.8362397, + "num_input_tokens_seen": 185317000, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.21923828, + "step": 8620, + "time_per_iteration": 2.8482272624969482 + }, + { + "auxiliary_loss_clip": 0.01415135, + "auxiliary_loss_mlp": 0.01040156, + "balance_loss_clip": 1.2509892, + "balance_loss_mlp": 1.01779246, + "epoch": 0.5183225612505636, + "flos": 14034546846720.0, + "grad_norm": 2.044219244362442, + "language_loss": 0.82607317, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.85062611, + "num_input_tokens_seen": 185331185, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.22351074, + "step": 8621, + "time_per_iteration": 4.255959510803223 + }, + { + "auxiliary_loss_clip": 0.01423246, + "auxiliary_loss_mlp": 0.01039586, + "balance_loss_clip": 1.25564349, + "balance_loss_mlp": 1.01688886, + "epoch": 0.5183826845032317, + "flos": 23670335168640.0, + "grad_norm": 2.3985188505698276, + "language_loss": 0.65926021, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.68388855, + "num_input_tokens_seen": 185348955, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.22705078, + "step": 8622, + "time_per_iteration": 2.8502089977264404 + }, + { + "auxiliary_loss_clip": 0.01445751, + "auxiliary_loss_mlp": 0.01041261, + "balance_loss_clip": 1.27125084, + "balance_loss_mlp": 1.01750267, + "epoch": 0.5184428077558996, + "flos": 15897217795200.0, + "grad_norm": 3.6521865258497037, + "language_loss": 0.62200838, + "learning_rate": 1.977606421248497e-06, + "loss": 0.64687854, + "num_input_tokens_seen": 185367330, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.23754883, + "step": 8623, + "time_per_iteration": 4.256067752838135 + }, + { + "auxiliary_loss_clip": 0.01412287, + "auxiliary_loss_mlp": 0.01035738, + "balance_loss_clip": 1.24631131, + "balance_loss_mlp": 1.0129807, + "epoch": 0.5185029310085676, + "flos": 21040192356480.0, + "grad_norm": 1.6288499219975408, + "language_loss": 0.76599979, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.79048008, + "num_input_tokens_seen": 185385060, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.22741699, + "step": 8624, + "time_per_iteration": 4.234911918640137 + }, + { + "auxiliary_loss_clip": 0.01430086, + "auxiliary_loss_mlp": 0.01036923, + "balance_loss_clip": 1.26001859, + "balance_loss_mlp": 1.01442814, + "epoch": 0.5185630542612355, + "flos": 26554130184960.0, + "grad_norm": 1.7715748114051129, + "language_loss": 0.72559273, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.75026286, + "num_input_tokens_seen": 185403745, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.22497559, + "step": 8625, + "time_per_iteration": 2.8537485599517822 + }, + { + "auxiliary_loss_clip": 0.0141565, + "auxiliary_loss_mlp": 0.01036178, + "balance_loss_clip": 1.24917889, + "balance_loss_mlp": 1.01430273, + "epoch": 0.5186231775139035, + "flos": 20678323294080.0, + "grad_norm": 2.070492977160189, + "language_loss": 0.68390667, + "learning_rate": 1.976438113333184e-06, + "loss": 0.70842499, + "num_input_tokens_seen": 185422620, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.21875, + "step": 8626, + "time_per_iteration": 2.8241803646087646 + }, + { + "auxiliary_loss_clip": 0.01416645, + "auxiliary_loss_mlp": 0.01034619, + "balance_loss_clip": 1.24934769, + "balance_loss_mlp": 1.01215959, + "epoch": 0.5186833007665714, + "flos": 20894983234560.0, + "grad_norm": 2.459023087576182, + "language_loss": 0.71284986, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.7373625, + "num_input_tokens_seen": 185439380, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.2244873, + "step": 8627, + "time_per_iteration": 2.8440866470336914 + }, + { + "auxiliary_loss_clip": 0.01435089, + "auxiliary_loss_mlp": 0.01038738, + "balance_loss_clip": 1.26317787, + "balance_loss_mlp": 1.01529002, + "epoch": 0.5187434240192395, + "flos": 20896747781760.0, + "grad_norm": 1.964780179890976, + "language_loss": 0.73704928, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.76178753, + "num_input_tokens_seen": 185458830, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.234375, + "step": 8628, + "time_per_iteration": 2.8834712505340576 + }, + { + "auxiliary_loss_clip": 0.01418171, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.25150752, + "balance_loss_mlp": 1.01451254, + "epoch": 0.5188035472719074, + "flos": 19868113077120.0, + "grad_norm": 1.6678636127626478, + "language_loss": 0.78088641, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.80543423, + "num_input_tokens_seen": 185477270, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.22094727, + "step": 8629, + "time_per_iteration": 2.859098196029663 + }, + { + "auxiliary_loss_clip": 0.01445015, + "auxiliary_loss_mlp": 0.01038508, + "balance_loss_clip": 1.27343953, + "balance_loss_mlp": 1.01459479, + "epoch": 0.5188636705245754, + "flos": 21147187605120.0, + "grad_norm": 2.578989712161607, + "language_loss": 0.75542837, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.7802636, + "num_input_tokens_seen": 185495795, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.23913574, + "step": 8630, + "time_per_iteration": 2.8534443378448486 + }, + { + "auxiliary_loss_clip": 0.01432731, + "auxiliary_loss_mlp": 0.01038675, + "balance_loss_clip": 1.26197779, + "balance_loss_mlp": 1.01590621, + "epoch": 0.5189237937772434, + "flos": 22430153185920.0, + "grad_norm": 1.8998323648804376, + "language_loss": 0.8172704, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.84198451, + "num_input_tokens_seen": 185514885, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.2277832, + "step": 8631, + "time_per_iteration": 2.8692946434020996 + }, + { + "auxiliary_loss_clip": 0.01436539, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_clip": 1.26600838, + "balance_loss_mlp": 1.01319778, + "epoch": 0.5189839170299113, + "flos": 25457483266560.0, + "grad_norm": 1.4494253262754708, + "language_loss": 0.7549181, + "learning_rate": 1.974101522024942e-06, + "loss": 0.77964175, + "num_input_tokens_seen": 185537155, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.22644043, + "step": 8632, + "time_per_iteration": 2.8876359462738037 + }, + { + "auxiliary_loss_clip": 0.01408685, + "auxiliary_loss_mlp": 0.01034741, + "balance_loss_clip": 1.24499559, + "balance_loss_mlp": 1.01228237, + "epoch": 0.5190440402825793, + "flos": 18596820654720.0, + "grad_norm": 1.860212879089833, + "language_loss": 0.79257828, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.81701255, + "num_input_tokens_seen": 185555520, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.22473145, + "step": 8633, + "time_per_iteration": 2.863926887512207 + }, + { + "auxiliary_loss_clip": 0.01423013, + "auxiliary_loss_mlp": 0.01040334, + "balance_loss_clip": 1.25457001, + "balance_loss_mlp": 1.01652837, + "epoch": 0.5191041635352472, + "flos": 21918776745600.0, + "grad_norm": 1.801417209724989, + "language_loss": 0.8129167, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.83755016, + "num_input_tokens_seen": 185573855, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.23828125, + "step": 8634, + "time_per_iteration": 2.8558287620544434 + }, + { + "auxiliary_loss_clip": 0.01421677, + "auxiliary_loss_mlp": 0.01037703, + "balance_loss_clip": 1.25525999, + "balance_loss_mlp": 1.01504183, + "epoch": 0.5191642867879153, + "flos": 27539709822720.0, + "grad_norm": 1.586951144675779, + "language_loss": 0.69693935, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.72153318, + "num_input_tokens_seen": 185595145, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.22668457, + "step": 8635, + "time_per_iteration": 2.917783737182617 + }, + { + "auxiliary_loss_clip": 0.0143877, + "auxiliary_loss_mlp": 0.01037891, + "balance_loss_clip": 1.26708198, + "balance_loss_mlp": 1.01545572, + "epoch": 0.5192244100405832, + "flos": 15714518716800.0, + "grad_norm": 1.6239807803248498, + "language_loss": 0.78356314, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.8083297, + "num_input_tokens_seen": 185613320, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.22436523, + "step": 8636, + "time_per_iteration": 2.8527562618255615 + }, + { + "auxiliary_loss_clip": 0.01441578, + "auxiliary_loss_mlp": 0.01035752, + "balance_loss_clip": 1.27007627, + "balance_loss_mlp": 1.0130899, + "epoch": 0.5192845332932512, + "flos": 12064563936000.0, + "grad_norm": 3.1978475476058317, + "language_loss": 0.72664297, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.75141627, + "num_input_tokens_seen": 185630730, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.22668457, + "step": 8637, + "time_per_iteration": 2.8194668292999268 + }, + { + "auxiliary_loss_clip": 0.01410884, + "auxiliary_loss_mlp": 0.0104082, + "balance_loss_clip": 1.2463001, + "balance_loss_mlp": 1.01817024, + "epoch": 0.5193446565459191, + "flos": 18962535525120.0, + "grad_norm": 3.575424544539419, + "language_loss": 0.77004516, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.79456216, + "num_input_tokens_seen": 185648515, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.22631836, + "step": 8638, + "time_per_iteration": 2.8709611892700195 + }, + { + "auxiliary_loss_clip": 0.01428671, + "auxiliary_loss_mlp": 0.0103355, + "balance_loss_clip": 1.26160908, + "balance_loss_mlp": 1.0115087, + "epoch": 0.5194047797985871, + "flos": 20384375955840.0, + "grad_norm": 1.8240228900880884, + "language_loss": 0.75420022, + "learning_rate": 1.971375543740272e-06, + "loss": 0.77882242, + "num_input_tokens_seen": 185665220, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.22033691, + "step": 8639, + "time_per_iteration": 2.837958574295044 + }, + { + "auxiliary_loss_clip": 0.01420217, + "auxiliary_loss_mlp": 0.01033998, + "balance_loss_clip": 1.25331783, + "balance_loss_mlp": 1.01157498, + "epoch": 0.519464903051255, + "flos": 24363732015360.0, + "grad_norm": 1.677727294163514, + "language_loss": 0.78503776, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.80957991, + "num_input_tokens_seen": 185683750, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.22399902, + "step": 8640, + "time_per_iteration": 2.8946542739868164 + }, + { + "auxiliary_loss_clip": 0.01417036, + "auxiliary_loss_mlp": 0.01038123, + "balance_loss_clip": 1.25294018, + "balance_loss_mlp": 1.01542592, + "epoch": 0.519525026303923, + "flos": 14069322115200.0, + "grad_norm": 2.0968399967940075, + "language_loss": 0.66649711, + "learning_rate": 1.97059670234927e-06, + "loss": 0.69104868, + "num_input_tokens_seen": 185700625, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.22705078, + "step": 8641, + "time_per_iteration": 2.8317363262176514 + }, + { + "auxiliary_loss_clip": 0.014226, + "auxiliary_loss_mlp": 0.01032508, + "balance_loss_clip": 1.25675392, + "balance_loss_mlp": 1.01019216, + "epoch": 0.519585149556591, + "flos": 28846501430400.0, + "grad_norm": 1.9547504086393888, + "language_loss": 0.7705906, + "learning_rate": 1.97020728331885e-06, + "loss": 0.7951417, + "num_input_tokens_seen": 185721155, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.22314453, + "step": 8642, + "time_per_iteration": 2.8858158588409424 + }, + { + "auxiliary_loss_clip": 0.01422671, + "auxiliary_loss_mlp": 0.01033622, + "balance_loss_clip": 1.25625217, + "balance_loss_mlp": 1.01124609, + "epoch": 0.519645272809259, + "flos": 25383589228800.0, + "grad_norm": 1.4763516223620523, + "language_loss": 0.83560228, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.86016518, + "num_input_tokens_seen": 185740990, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.22375488, + "step": 8643, + "time_per_iteration": 2.8954999446868896 + }, + { + "auxiliary_loss_clip": 0.01435004, + "auxiliary_loss_mlp": 0.01046838, + "balance_loss_clip": 1.26325178, + "balance_loss_mlp": 1.02411699, + "epoch": 0.519705396061927, + "flos": 25383408249600.0, + "grad_norm": 1.5476529847380043, + "language_loss": 0.70939302, + "learning_rate": 1.969428448662004e-06, + "loss": 0.73421139, + "num_input_tokens_seen": 185762235, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.22729492, + "step": 8644, + "time_per_iteration": 2.8646645545959473 + }, + { + "auxiliary_loss_clip": 0.01432045, + "auxiliary_loss_mlp": 0.01037812, + "balance_loss_clip": 1.26384974, + "balance_loss_mlp": 1.01538873, + "epoch": 0.5197655193145949, + "flos": 28488116217600.0, + "grad_norm": 1.6385286287669225, + "language_loss": 0.80688095, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.83157951, + "num_input_tokens_seen": 185783415, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.22412109, + "step": 8645, + "time_per_iteration": 2.9118285179138184 + }, + { + "auxiliary_loss_clip": 0.01421876, + "auxiliary_loss_mlp": 0.01037959, + "balance_loss_clip": 1.25324273, + "balance_loss_mlp": 1.01462984, + "epoch": 0.5198256425672629, + "flos": 20018208637440.0, + "grad_norm": 1.863719455787869, + "language_loss": 0.78057897, + "learning_rate": 1.968649618642264e-06, + "loss": 0.80517733, + "num_input_tokens_seen": 185801345, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.23327637, + "step": 8646, + "time_per_iteration": 4.3217527866363525 + }, + { + "auxiliary_loss_clip": 0.01429046, + "auxiliary_loss_mlp": 0.01039842, + "balance_loss_clip": 1.26178384, + "balance_loss_mlp": 1.01806235, + "epoch": 0.5198857658199308, + "flos": 19838450471040.0, + "grad_norm": 2.5204431944277177, + "language_loss": 0.66676521, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.69145405, + "num_input_tokens_seen": 185820815, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.21765137, + "step": 8647, + "time_per_iteration": 2.8281073570251465 + }, + { + "auxiliary_loss_clip": 0.01433981, + "auxiliary_loss_mlp": 0.01039718, + "balance_loss_clip": 1.26215827, + "balance_loss_mlp": 1.01674628, + "epoch": 0.5199458890725989, + "flos": 24472537056000.0, + "grad_norm": 9.904541994808268, + "language_loss": 0.72061908, + "learning_rate": 1.967870793377763e-06, + "loss": 0.74535608, + "num_input_tokens_seen": 185841450, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.22949219, + "step": 8648, + "time_per_iteration": 2.98569393157959 + }, + { + "auxiliary_loss_clip": 0.0143423, + "auxiliary_loss_mlp": 0.01039422, + "balance_loss_clip": 1.26611662, + "balance_loss_mlp": 1.01573563, + "epoch": 0.5200060123252668, + "flos": 23415370865280.0, + "grad_norm": 1.7001647291819395, + "language_loss": 0.65769076, + "learning_rate": 1.967481382565642e-06, + "loss": 0.68242729, + "num_input_tokens_seen": 185859935, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.23681641, + "step": 8649, + "time_per_iteration": 3.0430822372436523 + }, + { + "auxiliary_loss_clip": 0.01442535, + "auxiliary_loss_mlp": 0.01038857, + "balance_loss_clip": 1.26918328, + "balance_loss_mlp": 1.0167551, + "epoch": 0.5200661355779348, + "flos": 17210117450880.0, + "grad_norm": 1.7989133742411625, + "language_loss": 0.70948184, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.73429573, + "num_input_tokens_seen": 185876795, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.22094727, + "step": 8650, + "time_per_iteration": 2.862105369567871 + }, + { + "auxiliary_loss_clip": 0.01415776, + "auxiliary_loss_mlp": 0.0103978, + "balance_loss_clip": 1.24878037, + "balance_loss_mlp": 1.01784539, + "epoch": 0.5201262588306027, + "flos": 18524238716160.0, + "grad_norm": 1.7413234974732394, + "language_loss": 0.77832502, + "learning_rate": 1.966702564655496e-06, + "loss": 0.80288053, + "num_input_tokens_seen": 185895570, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.21936035, + "step": 8651, + "time_per_iteration": 2.8867571353912354 + }, + { + "auxiliary_loss_clip": 0.01437268, + "auxiliary_loss_mlp": 0.01043773, + "balance_loss_clip": 1.26740038, + "balance_loss_mlp": 1.02087331, + "epoch": 0.5201863820832707, + "flos": 18627252422400.0, + "grad_norm": 2.1986053933455736, + "language_loss": 0.79657221, + "learning_rate": 1.966313157587003e-06, + "loss": 0.82138264, + "num_input_tokens_seen": 185913700, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.22912598, + "step": 8652, + "time_per_iteration": 2.8454201221466064 + }, + { + "auxiliary_loss_clip": 0.01433579, + "auxiliary_loss_mlp": 0.01044564, + "balance_loss_clip": 1.26633573, + "balance_loss_mlp": 1.02017426, + "epoch": 0.5202465053359386, + "flos": 22867499854080.0, + "grad_norm": 2.0918641959083653, + "language_loss": 0.70918107, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.73396254, + "num_input_tokens_seen": 185932460, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.24389648, + "step": 8653, + "time_per_iteration": 2.8421266078948975 + }, + { + "auxiliary_loss_clip": 0.01447871, + "auxiliary_loss_mlp": 0.0105035, + "balance_loss_clip": 1.27563, + "balance_loss_mlp": 1.02680588, + "epoch": 0.5203066285886067, + "flos": 21991675397760.0, + "grad_norm": 1.759529301277713, + "language_loss": 0.79305232, + "learning_rate": 1.965534347297008e-06, + "loss": 0.81803453, + "num_input_tokens_seen": 185952030, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.23547363, + "step": 8654, + "time_per_iteration": 2.8366036415100098 + }, + { + "auxiliary_loss_clip": 0.01452576, + "auxiliary_loss_mlp": 0.01043365, + "balance_loss_clip": 1.27764833, + "balance_loss_mlp": 1.02147818, + "epoch": 0.5203667518412746, + "flos": 20243827048320.0, + "grad_norm": 1.8684350863058066, + "language_loss": 0.84529775, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.8702572, + "num_input_tokens_seen": 185973130, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.21875, + "step": 8655, + "time_per_iteration": 2.8998117446899414 + }, + { + "auxiliary_loss_clip": 0.01438089, + "auxiliary_loss_mlp": 0.01043024, + "balance_loss_clip": 1.27163196, + "balance_loss_mlp": 1.02038598, + "epoch": 0.5204268750939426, + "flos": 15713840044800.0, + "grad_norm": 2.12059196589611, + "language_loss": 0.67958069, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.70439178, + "num_input_tokens_seen": 185990200, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.22619629, + "step": 8656, + "time_per_iteration": 2.7986974716186523 + }, + { + "auxiliary_loss_clip": 0.01432692, + "auxiliary_loss_mlp": 0.01036618, + "balance_loss_clip": 1.26348639, + "balance_loss_mlp": 1.01529169, + "epoch": 0.5204869983466105, + "flos": 27459752981760.0, + "grad_norm": 2.039336498804898, + "language_loss": 0.74506706, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.76976019, + "num_input_tokens_seen": 186009880, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.21313477, + "step": 8657, + "time_per_iteration": 4.2787744998931885 + }, + { + "auxiliary_loss_clip": 0.01424716, + "auxiliary_loss_mlp": 0.01041747, + "balance_loss_clip": 1.25719333, + "balance_loss_mlp": 1.0192405, + "epoch": 0.5205471215992785, + "flos": 20605334152320.0, + "grad_norm": 1.792366109831257, + "language_loss": 0.72415549, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.74882013, + "num_input_tokens_seen": 186026680, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.22521973, + "step": 8658, + "time_per_iteration": 4.214099407196045 + }, + { + "auxiliary_loss_clip": 0.01430739, + "auxiliary_loss_mlp": 0.01042657, + "balance_loss_clip": 1.26281798, + "balance_loss_mlp": 1.02009034, + "epoch": 0.5206072448519465, + "flos": 22138196618880.0, + "grad_norm": 1.6613150579769567, + "language_loss": 0.8424266, + "learning_rate": 1.963587344701897e-06, + "loss": 0.86716056, + "num_input_tokens_seen": 186046920, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.22583008, + "step": 8659, + "time_per_iteration": 4.2712812423706055 + }, + { + "auxiliary_loss_clip": 0.0145311, + "auxiliary_loss_mlp": 0.01047788, + "balance_loss_clip": 1.27744961, + "balance_loss_mlp": 1.02402949, + "epoch": 0.5206673681046144, + "flos": 18338960684160.0, + "grad_norm": 2.080892480883796, + "language_loss": 0.76077008, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.785779, + "num_input_tokens_seen": 186062090, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.23730469, + "step": 8660, + "time_per_iteration": 2.8500616550445557 + }, + { + "auxiliary_loss_clip": 0.0142996, + "auxiliary_loss_mlp": 0.01041582, + "balance_loss_clip": 1.26054549, + "balance_loss_mlp": 1.01948023, + "epoch": 0.5207274913572825, + "flos": 20239935995520.0, + "grad_norm": 1.6867721932715662, + "language_loss": 0.78390443, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.8086198, + "num_input_tokens_seen": 186081135, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.22094727, + "step": 8661, + "time_per_iteration": 2.909979820251465 + }, + { + "auxiliary_loss_clip": 0.01440276, + "auxiliary_loss_mlp": 0.01037635, + "balance_loss_clip": 1.26966846, + "balance_loss_mlp": 1.01562905, + "epoch": 0.5207876146099504, + "flos": 22137110743680.0, + "grad_norm": 1.7071137755458996, + "language_loss": 0.71369624, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.73847538, + "num_input_tokens_seen": 186099700, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.22021484, + "step": 8662, + "time_per_iteration": 2.9941656589508057 + }, + { + "auxiliary_loss_clip": 0.01417676, + "auxiliary_loss_mlp": 0.01037693, + "balance_loss_clip": 1.25249195, + "balance_loss_mlp": 1.01522231, + "epoch": 0.5208477378626184, + "flos": 23889574062720.0, + "grad_norm": 3.403398030035798, + "language_loss": 0.70378315, + "learning_rate": 1.962029767391098e-06, + "loss": 0.72833681, + "num_input_tokens_seen": 186119740, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.22460938, + "step": 8663, + "time_per_iteration": 2.894280195236206 + }, + { + "auxiliary_loss_clip": 0.01434022, + "auxiliary_loss_mlp": 0.01041417, + "balance_loss_clip": 1.26446474, + "balance_loss_mlp": 1.01696706, + "epoch": 0.5209078611152863, + "flos": 20971818184320.0, + "grad_norm": 1.5472933057078044, + "language_loss": 0.77328295, + "learning_rate": 1.961640376626072e-06, + "loss": 0.79803735, + "num_input_tokens_seen": 186140645, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.24462891, + "step": 8664, + "time_per_iteration": 2.903635025024414 + }, + { + "auxiliary_loss_clip": 0.01426397, + "auxiliary_loss_mlp": 0.01038282, + "balance_loss_clip": 1.2572099, + "balance_loss_mlp": 1.01592994, + "epoch": 0.5209679843679543, + "flos": 20677463642880.0, + "grad_norm": 2.9267120390402632, + "language_loss": 0.76969332, + "learning_rate": 1.961250987315646e-06, + "loss": 0.79434013, + "num_input_tokens_seen": 186160130, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.22338867, + "step": 8665, + "time_per_iteration": 2.8916714191436768 + }, + { + "auxiliary_loss_clip": 0.01421547, + "auxiliary_loss_mlp": 0.01034938, + "balance_loss_clip": 1.25514448, + "balance_loss_mlp": 1.01307476, + "epoch": 0.5210281076206222, + "flos": 20236225921920.0, + "grad_norm": 2.7931245763094723, + "language_loss": 0.72632968, + "learning_rate": 1.960861599474586e-06, + "loss": 0.75089449, + "num_input_tokens_seen": 186179485, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.21875, + "step": 8666, + "time_per_iteration": 2.9067909717559814 + }, + { + "auxiliary_loss_clip": 0.01466503, + "auxiliary_loss_mlp": 0.01040436, + "balance_loss_clip": 1.28542233, + "balance_loss_mlp": 1.01659381, + "epoch": 0.5210882308732903, + "flos": 16078378550400.0, + "grad_norm": 2.3153056865306687, + "language_loss": 0.69982439, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.72489381, + "num_input_tokens_seen": 186197140, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.23828125, + "step": 8667, + "time_per_iteration": 2.884077548980713 + }, + { + "auxiliary_loss_clip": 0.01424611, + "auxiliary_loss_mlp": 0.01040456, + "balance_loss_clip": 1.25958991, + "balance_loss_mlp": 1.01700747, + "epoch": 0.5211483541259582, + "flos": 24835627728000.0, + "grad_norm": 1.3752063366566738, + "language_loss": 0.81719434, + "learning_rate": 1.960082828259629e-06, + "loss": 0.84184504, + "num_input_tokens_seen": 186216800, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.23449707, + "step": 8668, + "time_per_iteration": 2.9702322483062744 + }, + { + "auxiliary_loss_clip": 0.01442199, + "auxiliary_loss_mlp": 0.01036448, + "balance_loss_clip": 1.27307177, + "balance_loss_mlp": 1.01457322, + "epoch": 0.5212084773786262, + "flos": 20378946579840.0, + "grad_norm": 1.8142536826367668, + "language_loss": 0.64705729, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.67184377, + "num_input_tokens_seen": 186235320, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.21862793, + "step": 8669, + "time_per_iteration": 2.9243452548980713 + }, + { + "auxiliary_loss_clip": 0.01430553, + "auxiliary_loss_mlp": 0.01038132, + "balance_loss_clip": 1.26163936, + "balance_loss_mlp": 1.01542306, + "epoch": 0.5212686006312941, + "flos": 23154977185920.0, + "grad_norm": 1.6652448427165083, + "language_loss": 0.67478335, + "learning_rate": 1.959304063099325e-06, + "loss": 0.69947028, + "num_input_tokens_seen": 186254460, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.22705078, + "step": 8670, + "time_per_iteration": 2.929241180419922 + }, + { + "auxiliary_loss_clip": 0.01414909, + "auxiliary_loss_mlp": 0.01039645, + "balance_loss_clip": 1.24942601, + "balance_loss_mlp": 1.01735246, + "epoch": 0.5213287238839621, + "flos": 27783408170880.0, + "grad_norm": 2.345522623299387, + "language_loss": 0.77196723, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.79651278, + "num_input_tokens_seen": 186269465, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.22302246, + "step": 8671, + "time_per_iteration": 2.915121078491211 + }, + { + "auxiliary_loss_clip": 0.01444757, + "auxiliary_loss_mlp": 0.01043277, + "balance_loss_clip": 1.27282357, + "balance_loss_mlp": 1.02046049, + "epoch": 0.5213888471366301, + "flos": 19947119777280.0, + "grad_norm": 2.760209779772705, + "language_loss": 0.78877187, + "learning_rate": 1.958525304111796e-06, + "loss": 0.81365216, + "num_input_tokens_seen": 186288660, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.22814941, + "step": 8672, + "time_per_iteration": 2.89900279045105 + }, + { + "auxiliary_loss_clip": 0.01421881, + "auxiliary_loss_mlp": 0.01035618, + "balance_loss_clip": 1.25332808, + "balance_loss_mlp": 1.01412463, + "epoch": 0.521448970389298, + "flos": 16991511984000.0, + "grad_norm": 3.3563980653976655, + "language_loss": 0.72366297, + "learning_rate": 1.958135926969736e-06, + "loss": 0.74823797, + "num_input_tokens_seen": 186305760, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.21496582, + "step": 8673, + "time_per_iteration": 2.880209445953369 + }, + { + "auxiliary_loss_clip": 0.01417687, + "auxiliary_loss_mlp": 0.01037757, + "balance_loss_clip": 1.25010395, + "balance_loss_mlp": 1.01467824, + "epoch": 0.5215090936419661, + "flos": 18999075340800.0, + "grad_norm": 2.0217226975625873, + "language_loss": 0.75902009, + "learning_rate": 1.957746551415166e-06, + "loss": 0.78357452, + "num_input_tokens_seen": 186324135, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.23071289, + "step": 8674, + "time_per_iteration": 2.9045138359069824 + }, + { + "auxiliary_loss_clip": 0.01432173, + "auxiliary_loss_mlp": 0.01040681, + "balance_loss_clip": 1.25972342, + "balance_loss_mlp": 1.01741111, + "epoch": 0.521569216894634, + "flos": 16151955874560.0, + "grad_norm": 2.0760677618110353, + "language_loss": 0.86774719, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.89247572, + "num_input_tokens_seen": 186340205, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.23266602, + "step": 8675, + "time_per_iteration": 2.8877787590026855 + }, + { + "auxiliary_loss_clip": 0.01223728, + "auxiliary_loss_mlp": 0.01040556, + "balance_loss_clip": 1.12271976, + "balance_loss_mlp": 1.00813091, + "epoch": 0.521629340147302, + "flos": 57605191407360.0, + "grad_norm": 0.8740351772811429, + "language_loss": 0.63229823, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.65494102, + "num_input_tokens_seen": 186396940, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.32421875, + "step": 8676, + "time_per_iteration": 3.3835835456848145 + }, + { + "auxiliary_loss_clip": 0.01420001, + "auxiliary_loss_mlp": 0.01034998, + "balance_loss_clip": 1.2513268, + "balance_loss_mlp": 1.01326656, + "epoch": 0.5216894633999699, + "flos": 26808099102720.0, + "grad_norm": 5.725600368200996, + "language_loss": 0.68822861, + "learning_rate": 1.956578434424046e-06, + "loss": 0.71277857, + "num_input_tokens_seen": 186418680, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.21740723, + "step": 8677, + "time_per_iteration": 2.9613449573516846 + }, + { + "auxiliary_loss_clip": 0.01425452, + "auxiliary_loss_mlp": 0.01037706, + "balance_loss_clip": 1.25908566, + "balance_loss_mlp": 1.01493728, + "epoch": 0.5217495866526379, + "flos": 26369711804160.0, + "grad_norm": 1.7720293663194315, + "language_loss": 0.65988523, + "learning_rate": 1.956189065367086e-06, + "loss": 0.68451685, + "num_input_tokens_seen": 186438265, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.22753906, + "step": 8678, + "time_per_iteration": 2.9780349731445312 + }, + { + "auxiliary_loss_clip": 0.01444895, + "auxiliary_loss_mlp": 0.01035467, + "balance_loss_clip": 1.26994061, + "balance_loss_mlp": 1.01268649, + "epoch": 0.5218097099053058, + "flos": 23593816932480.0, + "grad_norm": 2.308526178679417, + "language_loss": 0.68997848, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.71478212, + "num_input_tokens_seen": 186456870, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.22790527, + "step": 8679, + "time_per_iteration": 2.940371513366699 + }, + { + "auxiliary_loss_clip": 0.01439722, + "auxiliary_loss_mlp": 0.01033092, + "balance_loss_clip": 1.26997495, + "balance_loss_mlp": 1.01192021, + "epoch": 0.5218698331579739, + "flos": 18086484844800.0, + "grad_norm": 1.7367010236964207, + "language_loss": 0.67812192, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.70284998, + "num_input_tokens_seen": 186476425, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.21179199, + "step": 8680, + "time_per_iteration": 2.8905317783355713 + }, + { + "auxiliary_loss_clip": 0.01447112, + "auxiliary_loss_mlp": 0.01037265, + "balance_loss_clip": 1.27513623, + "balance_loss_mlp": 1.01442468, + "epoch": 0.5219299564106418, + "flos": 19290443725440.0, + "grad_norm": 6.50166766622628, + "language_loss": 0.83848196, + "learning_rate": 1.955020968223156e-06, + "loss": 0.86332577, + "num_input_tokens_seen": 186492555, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.22851562, + "step": 8681, + "time_per_iteration": 4.266220808029175 + }, + { + "auxiliary_loss_clip": 0.0142651, + "auxiliary_loss_mlp": 0.01035088, + "balance_loss_clip": 1.2582171, + "balance_loss_mlp": 1.01229548, + "epoch": 0.5219900796633098, + "flos": 26662618512000.0, + "grad_norm": 1.8259938476678474, + "language_loss": 0.78665322, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.81126916, + "num_input_tokens_seen": 186513190, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.2277832, + "step": 8682, + "time_per_iteration": 2.900270700454712 + }, + { + "auxiliary_loss_clip": 0.01434119, + "auxiliary_loss_mlp": 0.01043814, + "balance_loss_clip": 1.2653203, + "balance_loss_mlp": 1.02180791, + "epoch": 0.5220502029159777, + "flos": 34325799724800.0, + "grad_norm": 1.5875297924098681, + "language_loss": 0.69477952, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.71955884, + "num_input_tokens_seen": 186534830, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.22009277, + "step": 8683, + "time_per_iteration": 2.9405548572540283 + }, + { + "auxiliary_loss_clip": 0.01432578, + "auxiliary_loss_mlp": 0.01042676, + "balance_loss_clip": 1.26116014, + "balance_loss_mlp": 1.01962066, + "epoch": 0.5221103261686457, + "flos": 22164737333760.0, + "grad_norm": 1.5950549985588451, + "language_loss": 0.77018666, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.79493916, + "num_input_tokens_seen": 186554390, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.23034668, + "step": 8684, + "time_per_iteration": 2.8775296211242676 + }, + { + "auxiliary_loss_clip": 0.01426038, + "auxiliary_loss_mlp": 0.01035391, + "balance_loss_clip": 1.26007867, + "balance_loss_mlp": 1.01225197, + "epoch": 0.5221704494213137, + "flos": 19217771297280.0, + "grad_norm": 2.112818763278496, + "language_loss": 0.76604962, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.79066384, + "num_input_tokens_seen": 186572360, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.23144531, + "step": 8685, + "time_per_iteration": 2.8297641277313232 + }, + { + "auxiliary_loss_clip": 0.01449657, + "auxiliary_loss_mlp": 0.01045555, + "balance_loss_clip": 1.27654588, + "balance_loss_mlp": 1.02239251, + "epoch": 0.5222305726739817, + "flos": 19363387622400.0, + "grad_norm": 3.3173716185161273, + "language_loss": 0.81562221, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.84057426, + "num_input_tokens_seen": 186590655, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.23156738, + "step": 8686, + "time_per_iteration": 2.8636879920959473 + }, + { + "auxiliary_loss_clip": 0.01407366, + "auxiliary_loss_mlp": 0.01036384, + "balance_loss_clip": 1.24435592, + "balance_loss_mlp": 1.01357925, + "epoch": 0.5222906959266497, + "flos": 27825286872960.0, + "grad_norm": 4.932936987156935, + "language_loss": 0.70630151, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.73073906, + "num_input_tokens_seen": 186610345, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.22814941, + "step": 8687, + "time_per_iteration": 2.9503347873687744 + }, + { + "auxiliary_loss_clip": 0.0141015, + "auxiliary_loss_mlp": 0.01033855, + "balance_loss_clip": 1.2457974, + "balance_loss_mlp": 1.01217043, + "epoch": 0.5223508191793176, + "flos": 12720425581440.0, + "grad_norm": 6.175388522840969, + "language_loss": 0.8317554, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.85619545, + "num_input_tokens_seen": 186624360, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.21691895, + "step": 8688, + "time_per_iteration": 2.868380546569824 + }, + { + "auxiliary_loss_clip": 0.01429925, + "auxiliary_loss_mlp": 0.01037935, + "balance_loss_clip": 1.2614677, + "balance_loss_mlp": 1.01471269, + "epoch": 0.5224109424319856, + "flos": 15640534189440.0, + "grad_norm": 2.3766813696010387, + "language_loss": 0.75216746, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.77684605, + "num_input_tokens_seen": 186638680, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.2322998, + "step": 8689, + "time_per_iteration": 2.984994649887085 + }, + { + "auxiliary_loss_clip": 0.01417621, + "auxiliary_loss_mlp": 0.01037382, + "balance_loss_clip": 1.25201488, + "balance_loss_mlp": 1.01451802, + "epoch": 0.5224710656846535, + "flos": 15750696574080.0, + "grad_norm": 1.9933809226295882, + "language_loss": 0.834378, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.85892802, + "num_input_tokens_seen": 186655840, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.2286377, + "step": 8690, + "time_per_iteration": 2.8842451572418213 + }, + { + "auxiliary_loss_clip": 0.01434467, + "auxiliary_loss_mlp": 0.01041848, + "balance_loss_clip": 1.265028, + "balance_loss_mlp": 1.01830435, + "epoch": 0.5225311889373215, + "flos": 26042075072640.0, + "grad_norm": 1.9108560111027004, + "language_loss": 0.79847389, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.82323706, + "num_input_tokens_seen": 186674150, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.23547363, + "step": 8691, + "time_per_iteration": 4.313936471939087 + }, + { + "auxiliary_loss_clip": 0.01432275, + "auxiliary_loss_mlp": 0.01041697, + "balance_loss_clip": 1.26037824, + "balance_loss_mlp": 1.01766479, + "epoch": 0.5225913121899894, + "flos": 18378441411840.0, + "grad_norm": 2.1609467624933654, + "language_loss": 0.77627993, + "learning_rate": 1.950738079725646e-06, + "loss": 0.80101967, + "num_input_tokens_seen": 186690675, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.24047852, + "step": 8692, + "time_per_iteration": 2.913616180419922 + }, + { + "auxiliary_loss_clip": 0.01423836, + "auxiliary_loss_mlp": 0.01045748, + "balance_loss_clip": 1.25937176, + "balance_loss_mlp": 1.02353907, + "epoch": 0.5226514354426575, + "flos": 29284210056960.0, + "grad_norm": 1.7983586855325433, + "language_loss": 0.72784805, + "learning_rate": 1.950348737138691e-06, + "loss": 0.75254387, + "num_input_tokens_seen": 186710380, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.22216797, + "step": 8693, + "time_per_iteration": 2.9010279178619385 + }, + { + "auxiliary_loss_clip": 0.01449637, + "auxiliary_loss_mlp": 0.01040552, + "balance_loss_clip": 1.27425587, + "balance_loss_mlp": 1.01790261, + "epoch": 0.5227115586953254, + "flos": 22862658660480.0, + "grad_norm": 6.2782600656544805, + "language_loss": 0.83053547, + "learning_rate": 1.949959396434517e-06, + "loss": 0.85543734, + "num_input_tokens_seen": 186729135, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.22644043, + "step": 8694, + "time_per_iteration": 4.290493726730347 + }, + { + "auxiliary_loss_clip": 0.01222424, + "auxiliary_loss_mlp": 0.01039937, + "balance_loss_clip": 1.12347078, + "balance_loss_mlp": 1.00713086, + "epoch": 0.5227716819479934, + "flos": 57501227560320.0, + "grad_norm": 0.7544515112290998, + "language_loss": 0.55717349, + "learning_rate": 1.949570057627888e-06, + "loss": 0.57979715, + "num_input_tokens_seen": 186791115, + "router_z_loss_clip": 0.98828125, + "router_z_loss_mlp": 0.328125, + "step": 8695, + "time_per_iteration": 3.400378704071045 + }, + { + "auxiliary_loss_clip": 0.0143567, + "auxiliary_loss_mlp": 0.01040241, + "balance_loss_clip": 1.26560605, + "balance_loss_mlp": 1.01710248, + "epoch": 0.5228318052006613, + "flos": 13816167603840.0, + "grad_norm": 1.7340808031825012, + "language_loss": 0.73508185, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.75984097, + "num_input_tokens_seen": 186808660, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.23132324, + "step": 8696, + "time_per_iteration": 2.8728489875793457 + }, + { + "auxiliary_loss_clip": 0.01437081, + "auxiliary_loss_mlp": 0.01036049, + "balance_loss_clip": 1.26580834, + "balance_loss_mlp": 1.01348257, + "epoch": 0.5228919284533293, + "flos": 15604582556160.0, + "grad_norm": 1.7726519697564127, + "language_loss": 0.72185171, + "learning_rate": 1.948791385766319e-06, + "loss": 0.74658298, + "num_input_tokens_seen": 186825900, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.22595215, + "step": 8697, + "time_per_iteration": 2.8408749103546143 + }, + { + "auxiliary_loss_clip": 0.01434089, + "auxiliary_loss_mlp": 0.01035531, + "balance_loss_clip": 1.26625419, + "balance_loss_mlp": 1.01300073, + "epoch": 0.5229520517059973, + "flos": 22501558759680.0, + "grad_norm": 2.2834703196666615, + "language_loss": 0.81493723, + "learning_rate": 1.948402052740906e-06, + "loss": 0.83963341, + "num_input_tokens_seen": 186843735, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.22521973, + "step": 8698, + "time_per_iteration": 2.873993158340454 + }, + { + "auxiliary_loss_clip": 0.01434345, + "auxiliary_loss_mlp": 0.01040326, + "balance_loss_clip": 1.26549673, + "balance_loss_mlp": 1.01841533, + "epoch": 0.5230121749586653, + "flos": 22101023376000.0, + "grad_norm": 1.8067306665909637, + "language_loss": 0.75037909, + "learning_rate": 1.948012721672093e-06, + "loss": 0.77512574, + "num_input_tokens_seen": 186862440, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.21887207, + "step": 8699, + "time_per_iteration": 2.8649888038635254 + }, + { + "auxiliary_loss_clip": 0.01453318, + "auxiliary_loss_mlp": 0.01037801, + "balance_loss_clip": 1.27685189, + "balance_loss_mlp": 1.01521087, + "epoch": 0.5230722982113333, + "flos": 22137246478080.0, + "grad_norm": 2.0809222700888097, + "language_loss": 0.73586994, + "learning_rate": 1.947623392574642e-06, + "loss": 0.76078111, + "num_input_tokens_seen": 186880940, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.22595215, + "step": 8700, + "time_per_iteration": 2.887451171875 + }, + { + "auxiliary_loss_clip": 0.0144177, + "auxiliary_loss_mlp": 0.01042734, + "balance_loss_clip": 1.27022421, + "balance_loss_mlp": 1.02022767, + "epoch": 0.5231324214640012, + "flos": 25020046108800.0, + "grad_norm": 1.7259179290422546, + "language_loss": 0.68856555, + "learning_rate": 1.947234065463318e-06, + "loss": 0.71341062, + "num_input_tokens_seen": 186900785, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.22521973, + "step": 8701, + "time_per_iteration": 2.933171272277832 + }, + { + "auxiliary_loss_clip": 0.01434882, + "auxiliary_loss_mlp": 0.01040738, + "balance_loss_clip": 1.26613402, + "balance_loss_mlp": 1.01861238, + "epoch": 0.5231925447166692, + "flos": 25751883052800.0, + "grad_norm": 1.7644656755120014, + "language_loss": 0.67170274, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.69645888, + "num_input_tokens_seen": 186920895, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.22119141, + "step": 8702, + "time_per_iteration": 2.9349236488342285 + }, + { + "auxiliary_loss_clip": 0.01430658, + "auxiliary_loss_mlp": 0.01040672, + "balance_loss_clip": 1.26138568, + "balance_loss_mlp": 1.01854634, + "epoch": 0.5232526679693371, + "flos": 21443849631360.0, + "grad_norm": 1.9392648119999083, + "language_loss": 0.76971948, + "learning_rate": 1.946455417258101e-06, + "loss": 0.79443282, + "num_input_tokens_seen": 186940605, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.22119141, + "step": 8703, + "time_per_iteration": 2.8604087829589844 + }, + { + "auxiliary_loss_clip": 0.01454605, + "auxiliary_loss_mlp": 0.01046378, + "balance_loss_clip": 1.27638268, + "balance_loss_mlp": 1.0228703, + "epoch": 0.5233127912220051, + "flos": 35311877055360.0, + "grad_norm": 1.9924875203816799, + "language_loss": 0.77497941, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.79998928, + "num_input_tokens_seen": 186960820, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.23486328, + "step": 8704, + "time_per_iteration": 2.9390411376953125 + }, + { + "auxiliary_loss_clip": 0.01435246, + "auxiliary_loss_mlp": 0.01042684, + "balance_loss_clip": 1.26677632, + "balance_loss_mlp": 1.02134562, + "epoch": 0.523372914474673, + "flos": 17058981260160.0, + "grad_norm": 1.8890442938585859, + "language_loss": 0.79056221, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.81534159, + "num_input_tokens_seen": 186976240, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.21325684, + "step": 8705, + "time_per_iteration": 2.806535005569458 + }, + { + "auxiliary_loss_clip": 0.01453124, + "auxiliary_loss_mlp": 0.01035879, + "balance_loss_clip": 1.27757168, + "balance_loss_mlp": 1.0138967, + "epoch": 0.5234330377273411, + "flos": 18415614654720.0, + "grad_norm": 2.5282118937483786, + "language_loss": 0.70484716, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.72973716, + "num_input_tokens_seen": 186992855, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.21972656, + "step": 8706, + "time_per_iteration": 2.806443452835083 + }, + { + "auxiliary_loss_clip": 0.01217362, + "auxiliary_loss_mlp": 0.01028975, + "balance_loss_clip": 1.11899328, + "balance_loss_mlp": 1.00303519, + "epoch": 0.523493160980009, + "flos": 65884635786240.0, + "grad_norm": 0.6779726978998978, + "language_loss": 0.5254972, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.54796052, + "num_input_tokens_seen": 187051205, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.25976562, + "step": 8707, + "time_per_iteration": 3.429171085357666 + }, + { + "auxiliary_loss_clip": 0.01428115, + "auxiliary_loss_mlp": 0.01043258, + "balance_loss_clip": 1.25944483, + "balance_loss_mlp": 1.02078664, + "epoch": 0.523553284232677, + "flos": 21882191685120.0, + "grad_norm": 1.7064573689241458, + "language_loss": 0.75664192, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.78135568, + "num_input_tokens_seen": 187070540, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.22460938, + "step": 8708, + "time_per_iteration": 2.8814473152160645 + }, + { + "auxiliary_loss_clip": 0.01428207, + "auxiliary_loss_mlp": 0.0103307, + "balance_loss_clip": 1.26110649, + "balance_loss_mlp": 1.01070631, + "epoch": 0.5236134074853449, + "flos": 20857448033280.0, + "grad_norm": 1.5748890033673513, + "language_loss": 0.78026497, + "learning_rate": 1.944119521844849e-06, + "loss": 0.80487776, + "num_input_tokens_seen": 187089975, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.22351074, + "step": 8709, + "time_per_iteration": 2.902432918548584 + }, + { + "auxiliary_loss_clip": 0.01459918, + "auxiliary_loss_mlp": 0.0104169, + "balance_loss_clip": 1.28259575, + "balance_loss_mlp": 1.01820636, + "epoch": 0.5236735307380129, + "flos": 25531422549120.0, + "grad_norm": 2.3042632962770564, + "language_loss": 0.8423053, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.86732137, + "num_input_tokens_seen": 187108775, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.23486328, + "step": 8710, + "time_per_iteration": 2.930004835128784 + }, + { + "auxiliary_loss_clip": 0.01435601, + "auxiliary_loss_mlp": 0.01036305, + "balance_loss_clip": 1.26771474, + "balance_loss_mlp": 1.01530004, + "epoch": 0.523733653990681, + "flos": 23592685812480.0, + "grad_norm": 1.9653450678461135, + "language_loss": 0.70342129, + "learning_rate": 1.943340906834908e-06, + "loss": 0.72814035, + "num_input_tokens_seen": 187128830, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.21008301, + "step": 8711, + "time_per_iteration": 2.896667718887329 + }, + { + "auxiliary_loss_clip": 0.01422858, + "auxiliary_loss_mlp": 0.01038385, + "balance_loss_clip": 1.25382257, + "balance_loss_mlp": 1.01555634, + "epoch": 0.5237937772433489, + "flos": 21116258144640.0, + "grad_norm": 2.035276995213851, + "language_loss": 0.83864206, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.86325443, + "num_input_tokens_seen": 187149570, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.22814941, + "step": 8712, + "time_per_iteration": 2.8728442192077637 + }, + { + "auxiliary_loss_clip": 0.01438232, + "auxiliary_loss_mlp": 0.01039007, + "balance_loss_clip": 1.26591825, + "balance_loss_mlp": 1.01470041, + "epoch": 0.5238539004960169, + "flos": 19182498336000.0, + "grad_norm": 1.7975799082431119, + "language_loss": 0.69896102, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.72373331, + "num_input_tokens_seen": 187170575, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.24328613, + "step": 8713, + "time_per_iteration": 2.881561040878296 + }, + { + "auxiliary_loss_clip": 0.01447462, + "auxiliary_loss_mlp": 0.01038826, + "balance_loss_clip": 1.27170944, + "balance_loss_mlp": 1.01534224, + "epoch": 0.5239140237486848, + "flos": 17895505968000.0, + "grad_norm": 2.6169529860918317, + "language_loss": 0.77833641, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.80319929, + "num_input_tokens_seen": 187187190, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.23474121, + "step": 8714, + "time_per_iteration": 2.847336530685425 + }, + { + "auxiliary_loss_clip": 0.0144675, + "auxiliary_loss_mlp": 0.01035168, + "balance_loss_clip": 1.27299476, + "balance_loss_mlp": 1.01242304, + "epoch": 0.5239741470013528, + "flos": 17939194462080.0, + "grad_norm": 2.1585717139085734, + "language_loss": 0.76831394, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.79313314, + "num_input_tokens_seen": 187204350, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.22741699, + "step": 8715, + "time_per_iteration": 2.8359851837158203 + }, + { + "auxiliary_loss_clip": 0.01428034, + "auxiliary_loss_mlp": 0.01041593, + "balance_loss_clip": 1.25964761, + "balance_loss_mlp": 1.01921773, + "epoch": 0.5240342702540207, + "flos": 31005834405120.0, + "grad_norm": 1.6712454989971324, + "language_loss": 0.72139132, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.74608755, + "num_input_tokens_seen": 187225605, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.22387695, + "step": 8716, + "time_per_iteration": 4.357141971588135 + }, + { + "auxiliary_loss_clip": 0.01430411, + "auxiliary_loss_mlp": 0.01036431, + "balance_loss_clip": 1.2607578, + "balance_loss_mlp": 1.01502144, + "epoch": 0.5240943935066887, + "flos": 25014752467200.0, + "grad_norm": 1.8441746235884853, + "language_loss": 0.87364447, + "learning_rate": 1.941005113841926e-06, + "loss": 0.89831293, + "num_input_tokens_seen": 187241335, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.2142334, + "step": 8717, + "time_per_iteration": 2.837306022644043 + }, + { + "auxiliary_loss_clip": 0.0142424, + "auxiliary_loss_mlp": 0.01037581, + "balance_loss_clip": 1.25421929, + "balance_loss_mlp": 1.01553953, + "epoch": 0.5241545167593566, + "flos": 23669475517440.0, + "grad_norm": 1.8849469284130125, + "language_loss": 0.62596458, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.65058279, + "num_input_tokens_seen": 187259925, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.22033691, + "step": 8718, + "time_per_iteration": 2.876553773880005 + }, + { + "auxiliary_loss_clip": 0.01438166, + "auxiliary_loss_mlp": 0.01036252, + "balance_loss_clip": 1.26457632, + "balance_loss_mlp": 1.01268435, + "epoch": 0.5242146400120247, + "flos": 23409850999680.0, + "grad_norm": 1.6520630066717683, + "language_loss": 0.72703975, + "learning_rate": 1.940226533916872e-06, + "loss": 0.75178397, + "num_input_tokens_seen": 187279035, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.2355957, + "step": 8719, + "time_per_iteration": 2.86631178855896 + }, + { + "auxiliary_loss_clip": 0.01417236, + "auxiliary_loss_mlp": 0.0103601, + "balance_loss_clip": 1.25006795, + "balance_loss_mlp": 1.01465917, + "epoch": 0.5242747632646926, + "flos": 17758078951680.0, + "grad_norm": 1.8652464661622177, + "language_loss": 0.74097371, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.76550621, + "num_input_tokens_seen": 187297555, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.21337891, + "step": 8720, + "time_per_iteration": 2.834582567214966 + }, + { + "auxiliary_loss_clip": 0.01423701, + "auxiliary_loss_mlp": 0.01036769, + "balance_loss_clip": 1.2541374, + "balance_loss_mlp": 1.01424992, + "epoch": 0.5243348865173606, + "flos": 32610328669440.0, + "grad_norm": 1.5769831748074792, + "language_loss": 0.70833611, + "learning_rate": 1.939447963058281e-06, + "loss": 0.73294079, + "num_input_tokens_seen": 187320265, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.2253418, + "step": 8721, + "time_per_iteration": 2.9149248600006104 + }, + { + "auxiliary_loss_clip": 0.01435317, + "auxiliary_loss_mlp": 0.01040393, + "balance_loss_clip": 1.26562846, + "balance_loss_mlp": 1.01757693, + "epoch": 0.5243950097700285, + "flos": 25495154202240.0, + "grad_norm": 1.6776504379994015, + "language_loss": 0.86821866, + "learning_rate": 1.939058681065813e-06, + "loss": 0.89297581, + "num_input_tokens_seen": 187338045, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.22814941, + "step": 8722, + "time_per_iteration": 2.8608219623565674 + }, + { + "auxiliary_loss_clip": 0.01427736, + "auxiliary_loss_mlp": 0.01035398, + "balance_loss_clip": 1.25919724, + "balance_loss_mlp": 1.01351118, + "epoch": 0.5244551330226965, + "flos": 15276855335040.0, + "grad_norm": 1.7466848447137104, + "language_loss": 0.80657172, + "learning_rate": 1.938669401384247e-06, + "loss": 0.83120304, + "num_input_tokens_seen": 187356040, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.21899414, + "step": 8723, + "time_per_iteration": 2.806652784347534 + }, + { + "auxiliary_loss_clip": 0.01446459, + "auxiliary_loss_mlp": 0.01040372, + "balance_loss_clip": 1.27372658, + "balance_loss_mlp": 1.01674426, + "epoch": 0.5245152562753645, + "flos": 22247363617920.0, + "grad_norm": 2.46185143559545, + "language_loss": 0.76091677, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.78578502, + "num_input_tokens_seen": 187374185, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.23608398, + "step": 8724, + "time_per_iteration": 2.872649908065796 + }, + { + "auxiliary_loss_clip": 0.01451956, + "auxiliary_loss_mlp": 0.01043497, + "balance_loss_clip": 1.27494025, + "balance_loss_mlp": 1.02053738, + "epoch": 0.5245753795280325, + "flos": 29439011076480.0, + "grad_norm": 1.6002125436526868, + "language_loss": 0.70973098, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.73468548, + "num_input_tokens_seen": 187396640, + "router_z_loss_clip": 1.77148438, + "router_z_loss_mlp": 0.22961426, + "step": 8725, + "time_per_iteration": 2.9218711853027344 + }, + { + "auxiliary_loss_clip": 0.01217781, + "auxiliary_loss_mlp": 0.01037005, + "balance_loss_clip": 1.11896992, + "balance_loss_mlp": 1.01678681, + "epoch": 0.5246355027807005, + "flos": 58864149982080.0, + "grad_norm": 0.7778012333367323, + "language_loss": 0.55684251, + "learning_rate": 1.937501576352568e-06, + "loss": 0.57939041, + "num_input_tokens_seen": 187455945, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.20214844, + "step": 8726, + "time_per_iteration": 4.762006998062134 + }, + { + "auxiliary_loss_clip": 0.01212489, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.11545527, + "balance_loss_mlp": 1.01224566, + "epoch": 0.5246956260333684, + "flos": 64557936466560.0, + "grad_norm": 0.7993136744378349, + "language_loss": 0.58404535, + "learning_rate": 1.937112306062219e-06, + "loss": 0.606493, + "num_input_tokens_seen": 187519975, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.20019531, + "step": 8727, + "time_per_iteration": 3.2745563983917236 + }, + { + "auxiliary_loss_clip": 0.01446457, + "auxiliary_loss_mlp": 0.01041122, + "balance_loss_clip": 1.27191257, + "balance_loss_mlp": 1.01882958, + "epoch": 0.5247557492860364, + "flos": 24543942629760.0, + "grad_norm": 1.2919400210879883, + "language_loss": 0.7135483, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.73842406, + "num_input_tokens_seen": 187541775, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.22290039, + "step": 8728, + "time_per_iteration": 2.909552574157715 + }, + { + "auxiliary_loss_clip": 0.01432062, + "auxiliary_loss_mlp": 0.01035491, + "balance_loss_clip": 1.26190412, + "balance_loss_mlp": 1.01471281, + "epoch": 0.5248158725387043, + "flos": 18815154652800.0, + "grad_norm": 1.5487725503254566, + "language_loss": 0.69698286, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.72165847, + "num_input_tokens_seen": 187560425, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.20751953, + "step": 8729, + "time_per_iteration": 5.7708892822265625 + }, + { + "auxiliary_loss_clip": 0.01446805, + "auxiliary_loss_mlp": 0.01040191, + "balance_loss_clip": 1.27386665, + "balance_loss_mlp": 1.01894784, + "epoch": 0.5248759957913723, + "flos": 20964669505920.0, + "grad_norm": 2.2254103429191034, + "language_loss": 0.84482479, + "learning_rate": 1.935944509558464e-06, + "loss": 0.86969477, + "num_input_tokens_seen": 187579930, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.21240234, + "step": 8730, + "time_per_iteration": 2.853999137878418 + }, + { + "auxiliary_loss_clip": 0.0144817, + "auxiliary_loss_mlp": 0.01048331, + "balance_loss_clip": 1.27709258, + "balance_loss_mlp": 1.02648008, + "epoch": 0.5249361190440403, + "flos": 18670171754880.0, + "grad_norm": 2.4184364167087757, + "language_loss": 0.7992419, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.82420695, + "num_input_tokens_seen": 187595365, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.21838379, + "step": 8731, + "time_per_iteration": 2.8147811889648438 + }, + { + "auxiliary_loss_clip": 0.01421719, + "auxiliary_loss_mlp": 0.01040384, + "balance_loss_clip": 1.25641322, + "balance_loss_mlp": 1.01791334, + "epoch": 0.5249962422967083, + "flos": 24874067825280.0, + "grad_norm": 1.6181430199173799, + "language_loss": 0.8396827, + "learning_rate": 1.935165990676312e-06, + "loss": 0.86430371, + "num_input_tokens_seen": 187614715, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.22497559, + "step": 8732, + "time_per_iteration": 2.922715187072754 + }, + { + "auxiliary_loss_clip": 0.01425023, + "auxiliary_loss_mlp": 0.01040856, + "balance_loss_clip": 1.25572157, + "balance_loss_mlp": 1.02023268, + "epoch": 0.5250563655493762, + "flos": 15269661411840.0, + "grad_norm": 1.5594107793685648, + "language_loss": 0.78315556, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.80781436, + "num_input_tokens_seen": 187630745, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.20629883, + "step": 8733, + "time_per_iteration": 2.8406968116760254 + }, + { + "auxiliary_loss_clip": 0.01450289, + "auxiliary_loss_mlp": 0.01043126, + "balance_loss_clip": 1.27645874, + "balance_loss_mlp": 1.02052355, + "epoch": 0.5251164888020442, + "flos": 18634536835200.0, + "grad_norm": 2.9286396254012064, + "language_loss": 0.82172465, + "learning_rate": 1.934387481628208e-06, + "loss": 0.84665883, + "num_input_tokens_seen": 187648200, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.22607422, + "step": 8734, + "time_per_iteration": 2.8771281242370605 + }, + { + "auxiliary_loss_clip": 0.01429193, + "auxiliary_loss_mlp": 0.01045502, + "balance_loss_clip": 1.26059663, + "balance_loss_mlp": 1.02354407, + "epoch": 0.5251766120547121, + "flos": 29721420990720.0, + "grad_norm": 1.409983861843141, + "language_loss": 0.77015269, + "learning_rate": 1.933998230828826e-06, + "loss": 0.79489958, + "num_input_tokens_seen": 187669205, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.21948242, + "step": 8735, + "time_per_iteration": 2.880671977996826 + }, + { + "auxiliary_loss_clip": 0.01423538, + "auxiliary_loss_mlp": 0.01042758, + "balance_loss_clip": 1.25480807, + "balance_loss_mlp": 1.02217078, + "epoch": 0.5252367353073801, + "flos": 23450643826560.0, + "grad_norm": 1.5086434854480595, + "language_loss": 0.81007409, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.83473706, + "num_input_tokens_seen": 187690890, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.20593262, + "step": 8736, + "time_per_iteration": 2.8694167137145996 + }, + { + "auxiliary_loss_clip": 0.01435369, + "auxiliary_loss_mlp": 0.01045502, + "balance_loss_clip": 1.26536691, + "balance_loss_mlp": 1.0228281, + "epoch": 0.5252968585600482, + "flos": 30823044837120.0, + "grad_norm": 2.133741611460623, + "language_loss": 0.70779854, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.73260725, + "num_input_tokens_seen": 187713045, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.22680664, + "step": 8737, + "time_per_iteration": 2.9072306156158447 + }, + { + "auxiliary_loss_clip": 0.0143236, + "auxiliary_loss_mlp": 0.01048382, + "balance_loss_clip": 1.26104331, + "balance_loss_mlp": 1.02634025, + "epoch": 0.5253569818127161, + "flos": 20637213753600.0, + "grad_norm": 1.435211591103227, + "language_loss": 0.7800473, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.80485475, + "num_input_tokens_seen": 187733640, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.22045898, + "step": 8738, + "time_per_iteration": 2.8810434341430664 + }, + { + "auxiliary_loss_clip": 0.01218097, + "auxiliary_loss_mlp": 0.01048648, + "balance_loss_clip": 1.12017787, + "balance_loss_mlp": 1.02747655, + "epoch": 0.5254171050653841, + "flos": 63458891573760.0, + "grad_norm": 0.7440063838295472, + "language_loss": 0.54520476, + "learning_rate": 1.932441252806837e-06, + "loss": 0.56787223, + "num_input_tokens_seen": 187792930, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.21191406, + "step": 8739, + "time_per_iteration": 3.3355932235717773 + }, + { + "auxiliary_loss_clip": 0.01430198, + "auxiliary_loss_mlp": 0.01044716, + "balance_loss_clip": 1.26075459, + "balance_loss_mlp": 1.02423549, + "epoch": 0.525477228318052, + "flos": 34682194166400.0, + "grad_norm": 1.803616612090727, + "language_loss": 0.85113323, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.87588239, + "num_input_tokens_seen": 187812495, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.20507812, + "step": 8740, + "time_per_iteration": 2.955010175704956 + }, + { + "auxiliary_loss_clip": 0.0143058, + "auxiliary_loss_mlp": 0.01047038, + "balance_loss_clip": 1.26212168, + "balance_loss_mlp": 1.0262835, + "epoch": 0.52553735157072, + "flos": 17939737399680.0, + "grad_norm": 2.4212645953831746, + "language_loss": 0.70107806, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.72585422, + "num_input_tokens_seen": 187829685, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.20739746, + "step": 8741, + "time_per_iteration": 2.8294286727905273 + }, + { + "auxiliary_loss_clip": 0.0144541, + "auxiliary_loss_mlp": 0.01043809, + "balance_loss_clip": 1.27176952, + "balance_loss_mlp": 1.02081335, + "epoch": 0.5255974748233879, + "flos": 9947788335360.0, + "grad_norm": 1.9695357766915123, + "language_loss": 0.67461324, + "learning_rate": 1.931273546137947e-06, + "loss": 0.69950545, + "num_input_tokens_seen": 187846495, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.23010254, + "step": 8742, + "time_per_iteration": 2.8184654712677 + }, + { + "auxiliary_loss_clip": 0.01442982, + "auxiliary_loss_mlp": 0.01043962, + "balance_loss_clip": 1.26770842, + "balance_loss_mlp": 1.02165806, + "epoch": 0.5256575980760559, + "flos": 16875920223360.0, + "grad_norm": 2.572580999628291, + "language_loss": 0.63674122, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.66161066, + "num_input_tokens_seen": 187862010, + "router_z_loss_clip": 1.75195312, + "router_z_loss_mlp": 0.22314453, + "step": 8743, + "time_per_iteration": 2.7902088165283203 + }, + { + "auxiliary_loss_clip": 0.01212374, + "auxiliary_loss_mlp": 0.01041134, + "balance_loss_clip": 1.11811805, + "balance_loss_mlp": 1.02396762, + "epoch": 0.5257177213287239, + "flos": 62420185261440.0, + "grad_norm": 0.7757922865130171, + "language_loss": 0.54161704, + "learning_rate": 1.930495088031323e-06, + "loss": 0.56415212, + "num_input_tokens_seen": 187922730, + "router_z_loss_clip": 0.94140625, + "router_z_loss_mlp": 0.171875, + "step": 8744, + "time_per_iteration": 3.4824509620666504 + }, + { + "auxiliary_loss_clip": 0.01454064, + "auxiliary_loss_mlp": 0.01044061, + "balance_loss_clip": 1.27740383, + "balance_loss_mlp": 1.02070761, + "epoch": 0.5257778445813919, + "flos": 20786585397120.0, + "grad_norm": 2.2136442035060653, + "language_loss": 0.77154922, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.79653049, + "num_input_tokens_seen": 187940160, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.23364258, + "step": 8745, + "time_per_iteration": 2.8471662998199463 + }, + { + "auxiliary_loss_clip": 0.01427904, + "auxiliary_loss_mlp": 0.01039374, + "balance_loss_clip": 1.25963616, + "balance_loss_mlp": 1.01652181, + "epoch": 0.5258379678340598, + "flos": 17027237393280.0, + "grad_norm": 2.2274427574889883, + "language_loss": 0.82166576, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.84633851, + "num_input_tokens_seen": 187958625, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.22851562, + "step": 8746, + "time_per_iteration": 2.855973720550537 + }, + { + "auxiliary_loss_clip": 0.01414344, + "auxiliary_loss_mlp": 0.01039622, + "balance_loss_clip": 1.24842334, + "balance_loss_mlp": 1.01551747, + "epoch": 0.5258980910867278, + "flos": 21078496719360.0, + "grad_norm": 1.7717707560122293, + "language_loss": 0.76101941, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.78555906, + "num_input_tokens_seen": 187977575, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.2409668, + "step": 8747, + "time_per_iteration": 2.8706488609313965 + }, + { + "auxiliary_loss_clip": 0.01401209, + "auxiliary_loss_mlp": 0.01034499, + "balance_loss_clip": 1.23642349, + "balance_loss_mlp": 1.01112247, + "epoch": 0.5259582143393957, + "flos": 18013540947840.0, + "grad_norm": 3.7811661382953514, + "language_loss": 0.83530742, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.8596645, + "num_input_tokens_seen": 187996650, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.23400879, + "step": 8748, + "time_per_iteration": 2.8194899559020996 + }, + { + "auxiliary_loss_clip": 0.01439, + "auxiliary_loss_mlp": 0.01038516, + "balance_loss_clip": 1.26621819, + "balance_loss_mlp": 1.01504374, + "epoch": 0.5260183375920637, + "flos": 22794103509120.0, + "grad_norm": 2.1245999698110194, + "language_loss": 0.8193332, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.8441084, + "num_input_tokens_seen": 188013510, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.23486328, + "step": 8749, + "time_per_iteration": 2.9239392280578613 + }, + { + "auxiliary_loss_clip": 0.0142506, + "auxiliary_loss_mlp": 0.01043747, + "balance_loss_clip": 1.25713837, + "balance_loss_mlp": 1.01964331, + "epoch": 0.5260784608447318, + "flos": 27063696833280.0, + "grad_norm": 1.9457856628049557, + "language_loss": 0.73520625, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.75989425, + "num_input_tokens_seen": 188032085, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.2409668, + "step": 8750, + "time_per_iteration": 2.9139065742492676 + }, + { + "auxiliary_loss_clip": 0.01432798, + "auxiliary_loss_mlp": 0.01038152, + "balance_loss_clip": 1.26173449, + "balance_loss_mlp": 1.0158838, + "epoch": 0.5261385840973997, + "flos": 20671265105280.0, + "grad_norm": 1.6334525633414536, + "language_loss": 0.76932061, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.79403007, + "num_input_tokens_seen": 188050590, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.22265625, + "step": 8751, + "time_per_iteration": 4.25470757484436 + }, + { + "auxiliary_loss_clip": 0.01422546, + "auxiliary_loss_mlp": 0.01038138, + "balance_loss_clip": 1.25422668, + "balance_loss_mlp": 1.01541638, + "epoch": 0.5261987073500677, + "flos": 23633026191360.0, + "grad_norm": 1.4350382239197972, + "language_loss": 0.76807415, + "learning_rate": 1.927381362210902e-06, + "loss": 0.79268092, + "num_input_tokens_seen": 188071620, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.22705078, + "step": 8752, + "time_per_iteration": 2.890977144241333 + }, + { + "auxiliary_loss_clip": 0.0144297, + "auxiliary_loss_mlp": 0.01038596, + "balance_loss_clip": 1.26909208, + "balance_loss_mlp": 1.01564837, + "epoch": 0.5262588306027356, + "flos": 27647700456960.0, + "grad_norm": 1.4526081537495705, + "language_loss": 0.68140531, + "learning_rate": 1.926992158720058e-06, + "loss": 0.70622104, + "num_input_tokens_seen": 188091740, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.22973633, + "step": 8753, + "time_per_iteration": 2.9059669971466064 + }, + { + "auxiliary_loss_clip": 0.01432158, + "auxiliary_loss_mlp": 0.01039995, + "balance_loss_clip": 1.26445985, + "balance_loss_mlp": 1.01736856, + "epoch": 0.5263189538554036, + "flos": 21769043143680.0, + "grad_norm": 1.4680646437046334, + "language_loss": 0.84383678, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.86855829, + "num_input_tokens_seen": 188111165, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.22619629, + "step": 8754, + "time_per_iteration": 2.8734164237976074 + }, + { + "auxiliary_loss_clip": 0.01437231, + "auxiliary_loss_mlp": 0.01037977, + "balance_loss_clip": 1.26441133, + "balance_loss_mlp": 1.01537478, + "epoch": 0.5263790771080715, + "flos": 14282769674880.0, + "grad_norm": 2.2041005046268083, + "language_loss": 0.88337982, + "learning_rate": 1.926213760058522e-06, + "loss": 0.90813202, + "num_input_tokens_seen": 188127825, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.22619629, + "step": 8755, + "time_per_iteration": 2.8080687522888184 + }, + { + "auxiliary_loss_clip": 0.01212621, + "auxiliary_loss_mlp": 0.01028879, + "balance_loss_clip": 1.11549759, + "balance_loss_mlp": 1.00608623, + "epoch": 0.5264392003607395, + "flos": 65838820786560.0, + "grad_norm": 0.722105104684321, + "language_loss": 0.58907813, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.61149311, + "num_input_tokens_seen": 188194050, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.22753906, + "step": 8756, + "time_per_iteration": 3.434338331222534 + }, + { + "auxiliary_loss_clip": 0.01452085, + "auxiliary_loss_mlp": 0.01040832, + "balance_loss_clip": 1.27560067, + "balance_loss_mlp": 1.01803958, + "epoch": 0.5264993236134075, + "flos": 21042183127680.0, + "grad_norm": 1.615195511140322, + "language_loss": 0.71164739, + "learning_rate": 1.925435372588913e-06, + "loss": 0.73657662, + "num_input_tokens_seen": 188212565, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.22790527, + "step": 8757, + "time_per_iteration": 2.837550401687622 + }, + { + "auxiliary_loss_clip": 0.01433274, + "auxiliary_loss_mlp": 0.01038605, + "balance_loss_clip": 1.26276076, + "balance_loss_mlp": 1.01593089, + "epoch": 0.5265594468660755, + "flos": 16626973478400.0, + "grad_norm": 1.6331562858148623, + "language_loss": 0.88430816, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.90902698, + "num_input_tokens_seen": 188229505, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.22668457, + "step": 8758, + "time_per_iteration": 2.8085427284240723 + }, + { + "auxiliary_loss_clip": 0.0143722, + "auxiliary_loss_mlp": 0.0103698, + "balance_loss_clip": 1.26342654, + "balance_loss_mlp": 1.01468825, + "epoch": 0.5266195701187434, + "flos": 24144809834880.0, + "grad_norm": 1.4375647532350224, + "language_loss": 0.7682575, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.79299951, + "num_input_tokens_seen": 188250395, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.22290039, + "step": 8759, + "time_per_iteration": 2.880958080291748 + }, + { + "auxiliary_loss_clip": 0.01414367, + "auxiliary_loss_mlp": 0.01036786, + "balance_loss_clip": 1.24756849, + "balance_loss_mlp": 1.01395798, + "epoch": 0.5266796933714114, + "flos": 15850633633920.0, + "grad_norm": 2.1361868588963246, + "language_loss": 0.72964072, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.7541523, + "num_input_tokens_seen": 188266785, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.22851562, + "step": 8760, + "time_per_iteration": 2.828507423400879 + }, + { + "auxiliary_loss_clip": 0.0144135, + "auxiliary_loss_mlp": 0.01041912, + "balance_loss_clip": 1.26634061, + "balance_loss_mlp": 1.01932228, + "epoch": 0.5267398166240793, + "flos": 20959104395520.0, + "grad_norm": 2.3683782942269658, + "language_loss": 0.76521891, + "learning_rate": 1.923878631697736e-06, + "loss": 0.79005152, + "num_input_tokens_seen": 188282525, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.22595215, + "step": 8761, + "time_per_iteration": 2.8233888149261475 + }, + { + "auxiliary_loss_clip": 0.01436695, + "auxiliary_loss_mlp": 0.01040164, + "balance_loss_clip": 1.26292765, + "balance_loss_mlp": 1.01776433, + "epoch": 0.5267999398767473, + "flos": 21006276739200.0, + "grad_norm": 1.88874835516363, + "language_loss": 0.71764654, + "learning_rate": 1.923489453654373e-06, + "loss": 0.74241507, + "num_input_tokens_seen": 188301395, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.22375488, + "step": 8762, + "time_per_iteration": 4.186644554138184 + }, + { + "auxiliary_loss_clip": 0.01209704, + "auxiliary_loss_mlp": 0.01023211, + "balance_loss_clip": 1.11212337, + "balance_loss_mlp": 1.00060928, + "epoch": 0.5268600631294152, + "flos": 66878160526080.0, + "grad_norm": 0.9398142954319888, + "language_loss": 0.6556437, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.67797279, + "num_input_tokens_seen": 188357665, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.22558594, + "step": 8763, + "time_per_iteration": 3.234480619430542 + }, + { + "auxiliary_loss_clip": 0.01440548, + "auxiliary_loss_mlp": 0.01036921, + "balance_loss_clip": 1.26825118, + "balance_loss_mlp": 1.01492715, + "epoch": 0.5269201863820833, + "flos": 17174482531200.0, + "grad_norm": 1.8350991444776976, + "language_loss": 0.71577322, + "learning_rate": 1.922711106286265e-06, + "loss": 0.7405479, + "num_input_tokens_seen": 188376935, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.21972656, + "step": 8764, + "time_per_iteration": 5.644627809524536 + }, + { + "auxiliary_loss_clip": 0.0143643, + "auxiliary_loss_mlp": 0.01039476, + "balance_loss_clip": 1.26360941, + "balance_loss_mlp": 1.01659966, + "epoch": 0.5269803096347513, + "flos": 20532480744960.0, + "grad_norm": 1.6227103207247704, + "language_loss": 0.75380886, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.77856791, + "num_input_tokens_seen": 188394995, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.22875977, + "step": 8765, + "time_per_iteration": 2.8590245246887207 + }, + { + "auxiliary_loss_clip": 0.01430878, + "auxiliary_loss_mlp": 0.0103382, + "balance_loss_clip": 1.25701737, + "balance_loss_mlp": 1.01097989, + "epoch": 0.5270404328874192, + "flos": 27241328494080.0, + "grad_norm": 1.8664767557464925, + "language_loss": 0.86115289, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.88579994, + "num_input_tokens_seen": 188415475, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.2286377, + "step": 8766, + "time_per_iteration": 2.935728073120117 + }, + { + "auxiliary_loss_clip": 0.01447417, + "auxiliary_loss_mlp": 0.01041251, + "balance_loss_clip": 1.27331936, + "balance_loss_mlp": 1.01864862, + "epoch": 0.5271005561400872, + "flos": 23120382896640.0, + "grad_norm": 2.271140646213792, + "language_loss": 0.79440689, + "learning_rate": 1.921543607252017e-06, + "loss": 0.81929362, + "num_input_tokens_seen": 188435665, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.22607422, + "step": 8767, + "time_per_iteration": 2.894381284713745 + }, + { + "auxiliary_loss_clip": 0.01448878, + "auxiliary_loss_mlp": 0.01038486, + "balance_loss_clip": 1.27418518, + "balance_loss_mlp": 1.01576436, + "epoch": 0.5271606793927551, + "flos": 22574955104640.0, + "grad_norm": 1.7247085110664362, + "language_loss": 0.74474561, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.76961923, + "num_input_tokens_seen": 188455405, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.22705078, + "step": 8768, + "time_per_iteration": 2.87921142578125 + }, + { + "auxiliary_loss_clip": 0.01429389, + "auxiliary_loss_mlp": 0.01039914, + "balance_loss_clip": 1.25984907, + "balance_loss_mlp": 1.01875472, + "epoch": 0.5272208026454231, + "flos": 18771873361920.0, + "grad_norm": 1.815893473334963, + "language_loss": 0.74980247, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.77449548, + "num_input_tokens_seen": 188472940, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.21154785, + "step": 8769, + "time_per_iteration": 2.8532793521881104 + }, + { + "auxiliary_loss_clip": 0.01440246, + "auxiliary_loss_mlp": 0.0104074, + "balance_loss_clip": 1.2685138, + "balance_loss_mlp": 1.01906753, + "epoch": 0.5272809258980911, + "flos": 20421775422720.0, + "grad_norm": 1.7174099069385786, + "language_loss": 0.74666184, + "learning_rate": 1.920376134993436e-06, + "loss": 0.77147162, + "num_input_tokens_seen": 188493035, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.21655273, + "step": 8770, + "time_per_iteration": 2.867903232574463 + }, + { + "auxiliary_loss_clip": 0.01435546, + "auxiliary_loss_mlp": 0.01038181, + "balance_loss_clip": 1.26545715, + "balance_loss_mlp": 1.01516175, + "epoch": 0.5273410491507591, + "flos": 28268424875520.0, + "grad_norm": 1.6866715715570193, + "language_loss": 0.68991208, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.71464932, + "num_input_tokens_seen": 188513860, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.23034668, + "step": 8771, + "time_per_iteration": 2.9144198894500732 + }, + { + "auxiliary_loss_clip": 0.01431217, + "auxiliary_loss_mlp": 0.01039502, + "balance_loss_clip": 1.26163673, + "balance_loss_mlp": 1.01724529, + "epoch": 0.527401172403427, + "flos": 22465245168000.0, + "grad_norm": 1.8694496159156737, + "language_loss": 0.77158803, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.79629529, + "num_input_tokens_seen": 188533345, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.22265625, + "step": 8772, + "time_per_iteration": 2.8367183208465576 + }, + { + "auxiliary_loss_clip": 0.01437275, + "auxiliary_loss_mlp": 0.01040457, + "balance_loss_clip": 1.2645061, + "balance_loss_mlp": 1.01699638, + "epoch": 0.527461295656095, + "flos": 21040509070080.0, + "grad_norm": 36.85186210536831, + "language_loss": 0.66610205, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.69087934, + "num_input_tokens_seen": 188551550, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.23449707, + "step": 8773, + "time_per_iteration": 2.896742343902588 + }, + { + "auxiliary_loss_clip": 0.01428821, + "auxiliary_loss_mlp": 0.01037466, + "balance_loss_clip": 1.25743854, + "balance_loss_mlp": 1.0164969, + "epoch": 0.5275214189087629, + "flos": 26332945764480.0, + "grad_norm": 1.4870884558201882, + "language_loss": 0.86000347, + "learning_rate": 1.91881954765502e-06, + "loss": 0.88466632, + "num_input_tokens_seen": 188571615, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.20959473, + "step": 8774, + "time_per_iteration": 2.856257915496826 + }, + { + "auxiliary_loss_clip": 0.01421994, + "auxiliary_loss_mlp": 0.01036257, + "balance_loss_clip": 1.25186896, + "balance_loss_mlp": 1.01433444, + "epoch": 0.5275815421614309, + "flos": 20056467755520.0, + "grad_norm": 2.130582152378828, + "language_loss": 0.80706322, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.83164573, + "num_input_tokens_seen": 188591965, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.21923828, + "step": 8775, + "time_per_iteration": 2.8674657344818115 + }, + { + "auxiliary_loss_clip": 0.01425198, + "auxiliary_loss_mlp": 0.01041004, + "balance_loss_clip": 1.25802803, + "balance_loss_mlp": 1.01848507, + "epoch": 0.5276416654140988, + "flos": 21441677880960.0, + "grad_norm": 1.7505815149034518, + "language_loss": 0.84283531, + "learning_rate": 1.918041272397012e-06, + "loss": 0.86749727, + "num_input_tokens_seen": 188610675, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.22509766, + "step": 8776, + "time_per_iteration": 2.8411974906921387 + }, + { + "auxiliary_loss_clip": 0.01439011, + "auxiliary_loss_mlp": 0.01033801, + "balance_loss_clip": 1.26621938, + "balance_loss_mlp": 1.01203382, + "epoch": 0.5277017886667669, + "flos": 17173894348800.0, + "grad_norm": 1.5801377867231254, + "language_loss": 0.68097913, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.70570725, + "num_input_tokens_seen": 188628235, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.21765137, + "step": 8777, + "time_per_iteration": 2.9549124240875244 + }, + { + "auxiliary_loss_clip": 0.01421248, + "auxiliary_loss_mlp": 0.01037985, + "balance_loss_clip": 1.25229561, + "balance_loss_mlp": 1.01626456, + "epoch": 0.5277619119194349, + "flos": 20457636566400.0, + "grad_norm": 1.4146098540259497, + "language_loss": 0.82898527, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.85357761, + "num_input_tokens_seen": 188648925, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.21728516, + "step": 8778, + "time_per_iteration": 2.911442518234253 + }, + { + "auxiliary_loss_clip": 0.01439432, + "auxiliary_loss_mlp": 0.01038439, + "balance_loss_clip": 1.26654327, + "balance_loss_mlp": 1.0162183, + "epoch": 0.5278220351721028, + "flos": 24071730203520.0, + "grad_norm": 2.443825284861556, + "language_loss": 0.80469275, + "learning_rate": 1.916873882856013e-06, + "loss": 0.82947147, + "num_input_tokens_seen": 188668125, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.22229004, + "step": 8779, + "time_per_iteration": 2.8582377433776855 + }, + { + "auxiliary_loss_clip": 0.01427617, + "auxiliary_loss_mlp": 0.01034974, + "balance_loss_clip": 1.25877118, + "balance_loss_mlp": 1.01420784, + "epoch": 0.5278821584247708, + "flos": 24653200118400.0, + "grad_norm": 2.5326297173412886, + "language_loss": 0.77327615, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.79790211, + "num_input_tokens_seen": 188684410, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.20776367, + "step": 8780, + "time_per_iteration": 2.8794655799865723 + }, + { + "auxiliary_loss_clip": 0.01445579, + "auxiliary_loss_mlp": 0.01038051, + "balance_loss_clip": 1.26999724, + "balance_loss_mlp": 1.01587784, + "epoch": 0.5279422816774387, + "flos": 35421858460800.0, + "grad_norm": 1.6491312557827194, + "language_loss": 0.70479405, + "learning_rate": 1.916095638898174e-06, + "loss": 0.72963035, + "num_input_tokens_seen": 188706130, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.22180176, + "step": 8781, + "time_per_iteration": 2.953895330429077 + }, + { + "auxiliary_loss_clip": 0.01414393, + "auxiliary_loss_mlp": 0.01036351, + "balance_loss_clip": 1.24719918, + "balance_loss_mlp": 1.01593077, + "epoch": 0.5280024049301068, + "flos": 22977526504320.0, + "grad_norm": 1.5998695060798207, + "language_loss": 0.72995842, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.75446594, + "num_input_tokens_seen": 188725030, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.20422363, + "step": 8782, + "time_per_iteration": 2.8601887226104736 + }, + { + "auxiliary_loss_clip": 0.01414124, + "auxiliary_loss_mlp": 0.01033806, + "balance_loss_clip": 1.24758112, + "balance_loss_mlp": 1.01265836, + "epoch": 0.5280625281827747, + "flos": 21517743669120.0, + "grad_norm": 2.4411170503542787, + "language_loss": 0.69207215, + "learning_rate": 1.915317407666982e-06, + "loss": 0.71655142, + "num_input_tokens_seen": 188744325, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.21154785, + "step": 8783, + "time_per_iteration": 2.849311113357544 + }, + { + "auxiliary_loss_clip": 0.01468234, + "auxiliary_loss_mlp": 0.01043568, + "balance_loss_clip": 1.28824484, + "balance_loss_mlp": 1.02050138, + "epoch": 0.5281226514354427, + "flos": 31219598678400.0, + "grad_norm": 1.6110857811698185, + "language_loss": 0.70075589, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.72587383, + "num_input_tokens_seen": 188765100, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.23059082, + "step": 8784, + "time_per_iteration": 2.928645133972168 + }, + { + "auxiliary_loss_clip": 0.01456178, + "auxiliary_loss_mlp": 0.01034682, + "balance_loss_clip": 1.27779603, + "balance_loss_mlp": 1.01339149, + "epoch": 0.5281827746881106, + "flos": 25087470140160.0, + "grad_norm": 2.106533408456001, + "language_loss": 0.76252258, + "learning_rate": 1.91453918928048e-06, + "loss": 0.78743118, + "num_input_tokens_seen": 188783995, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.21289062, + "step": 8785, + "time_per_iteration": 2.913539409637451 + }, + { + "auxiliary_loss_clip": 0.01431852, + "auxiliary_loss_mlp": 0.01037426, + "balance_loss_clip": 1.26232719, + "balance_loss_mlp": 1.01420379, + "epoch": 0.5282428979407786, + "flos": 20640969072000.0, + "grad_norm": 1.6262516711294497, + "language_loss": 0.83996463, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.8646574, + "num_input_tokens_seen": 188803120, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.2322998, + "step": 8786, + "time_per_iteration": 4.225078821182251 + }, + { + "auxiliary_loss_clip": 0.01413344, + "auxiliary_loss_mlp": 0.01039985, + "balance_loss_clip": 1.24718666, + "balance_loss_mlp": 1.01966023, + "epoch": 0.5283030211934465, + "flos": 22429655493120.0, + "grad_norm": 2.508715494208792, + "language_loss": 0.83901787, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.86355114, + "num_input_tokens_seen": 188820960, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.20324707, + "step": 8787, + "time_per_iteration": 2.909749746322632 + }, + { + "auxiliary_loss_clip": 0.01424772, + "auxiliary_loss_mlp": 0.01030906, + "balance_loss_clip": 1.25661111, + "balance_loss_mlp": 1.01114082, + "epoch": 0.5283631444461145, + "flos": 23623977231360.0, + "grad_norm": 2.0730698438651776, + "language_loss": 0.84151733, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.86607414, + "num_input_tokens_seen": 188837165, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.19750977, + "step": 8788, + "time_per_iteration": 2.842941999435425 + }, + { + "auxiliary_loss_clip": 0.01424965, + "auxiliary_loss_mlp": 0.01042655, + "balance_loss_clip": 1.25789547, + "balance_loss_mlp": 1.02131701, + "epoch": 0.5284232676987825, + "flos": 32684946624000.0, + "grad_norm": 3.48355726910227, + "language_loss": 0.75692397, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.78160024, + "num_input_tokens_seen": 188858555, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.21337891, + "step": 8789, + "time_per_iteration": 2.9299349784851074 + }, + { + "auxiliary_loss_clip": 0.01435875, + "auxiliary_loss_mlp": 0.01036682, + "balance_loss_clip": 1.26517451, + "balance_loss_mlp": 1.01599884, + "epoch": 0.5284833909514505, + "flos": 26772237959040.0, + "grad_norm": 1.6230245830002972, + "language_loss": 0.71058965, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.7353152, + "num_input_tokens_seen": 188879050, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.20678711, + "step": 8790, + "time_per_iteration": 2.949162483215332 + }, + { + "auxiliary_loss_clip": 0.01429687, + "auxiliary_loss_mlp": 0.01030592, + "balance_loss_clip": 1.26212442, + "balance_loss_mlp": 1.01143479, + "epoch": 0.5285435142041185, + "flos": 22100797152000.0, + "grad_norm": 1.641425080174639, + "language_loss": 0.79404342, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.81864619, + "num_input_tokens_seen": 188898885, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.19165039, + "step": 8791, + "time_per_iteration": 2.832998275756836 + }, + { + "auxiliary_loss_clip": 0.01427417, + "auxiliary_loss_mlp": 0.01030698, + "balance_loss_clip": 1.26080084, + "balance_loss_mlp": 1.009884, + "epoch": 0.5286036374567864, + "flos": 20385009383040.0, + "grad_norm": 2.176251538812177, + "language_loss": 0.67526037, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.6998415, + "num_input_tokens_seen": 188917225, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.20812988, + "step": 8792, + "time_per_iteration": 2.835094451904297 + }, + { + "auxiliary_loss_clip": 0.01423296, + "auxiliary_loss_mlp": 0.0103667, + "balance_loss_clip": 1.25520039, + "balance_loss_mlp": 1.01627338, + "epoch": 0.5286637607094544, + "flos": 24362691384960.0, + "grad_norm": 1.8915353409360078, + "language_loss": 0.80345666, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.82805634, + "num_input_tokens_seen": 188936120, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.20397949, + "step": 8793, + "time_per_iteration": 2.8584837913513184 + }, + { + "auxiliary_loss_clip": 0.01429303, + "auxiliary_loss_mlp": 0.01041643, + "balance_loss_clip": 1.26048946, + "balance_loss_mlp": 1.0203526, + "epoch": 0.5287238839621223, + "flos": 17279125050240.0, + "grad_norm": 2.0118201908760502, + "language_loss": 0.85937744, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.88408691, + "num_input_tokens_seen": 188953405, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.21289062, + "step": 8794, + "time_per_iteration": 2.802241802215576 + }, + { + "auxiliary_loss_clip": 0.01448583, + "auxiliary_loss_mlp": 0.01038218, + "balance_loss_clip": 1.27307916, + "balance_loss_mlp": 1.0177381, + "epoch": 0.5287840072147904, + "flos": 17575922810880.0, + "grad_norm": 2.5349009397731286, + "language_loss": 0.69198364, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.71685159, + "num_input_tokens_seen": 188971150, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.20495605, + "step": 8795, + "time_per_iteration": 2.8242945671081543 + }, + { + "auxiliary_loss_clip": 0.01434934, + "auxiliary_loss_mlp": 0.01036918, + "balance_loss_clip": 1.26356149, + "balance_loss_mlp": 1.01594949, + "epoch": 0.5288441304674583, + "flos": 18561411959040.0, + "grad_norm": 1.8190963174994497, + "language_loss": 0.81624967, + "learning_rate": 1.910259223028374e-06, + "loss": 0.84096819, + "num_input_tokens_seen": 188989550, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.20983887, + "step": 8796, + "time_per_iteration": 2.8238120079040527 + }, + { + "auxiliary_loss_clip": 0.014329, + "auxiliary_loss_mlp": 0.01035888, + "balance_loss_clip": 1.26315522, + "balance_loss_mlp": 1.01460898, + "epoch": 0.5289042537201263, + "flos": 20824482556800.0, + "grad_norm": 2.1511222148454823, + "language_loss": 0.70159656, + "learning_rate": 1.909870155310071e-06, + "loss": 0.7262845, + "num_input_tokens_seen": 189008795, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.21289062, + "step": 8797, + "time_per_iteration": 4.285072565078735 + }, + { + "auxiliary_loss_clip": 0.01419048, + "auxiliary_loss_mlp": 0.01037037, + "balance_loss_clip": 1.25426602, + "balance_loss_mlp": 1.01644981, + "epoch": 0.5289643769727942, + "flos": 15741919082880.0, + "grad_norm": 1.5283411544731922, + "language_loss": 0.82553327, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.85009408, + "num_input_tokens_seen": 189025540, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.20581055, + "step": 8798, + "time_per_iteration": 4.2175164222717285 + }, + { + "auxiliary_loss_clip": 0.01449727, + "auxiliary_loss_mlp": 0.01038619, + "balance_loss_clip": 1.27510905, + "balance_loss_mlp": 1.01637471, + "epoch": 0.5290245002254622, + "flos": 19546901107200.0, + "grad_norm": 2.3822751606281987, + "language_loss": 0.71225846, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.73714197, + "num_input_tokens_seen": 189044885, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.22229004, + "step": 8799, + "time_per_iteration": 4.270482540130615 + }, + { + "auxiliary_loss_clip": 0.01413544, + "auxiliary_loss_mlp": 0.01041223, + "balance_loss_clip": 1.24977696, + "balance_loss_mlp": 1.0207665, + "epoch": 0.5290846234781301, + "flos": 15823595226240.0, + "grad_norm": 2.260616545833147, + "language_loss": 0.70517373, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.72972137, + "num_input_tokens_seen": 189061280, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.20458984, + "step": 8800, + "time_per_iteration": 2.812929630279541 + }, + { + "auxiliary_loss_clip": 0.01206812, + "auxiliary_loss_mlp": 0.01027851, + "balance_loss_clip": 1.11499953, + "balance_loss_mlp": 1.01201975, + "epoch": 0.5291447467307981, + "flos": 70086560872320.0, + "grad_norm": 0.9692953755789581, + "language_loss": 0.57007611, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.59242272, + "num_input_tokens_seen": 189114775, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.15820312, + "step": 8801, + "time_per_iteration": 3.243163824081421 + }, + { + "auxiliary_loss_clip": 0.01431042, + "auxiliary_loss_mlp": 0.01038042, + "balance_loss_clip": 1.26005459, + "balance_loss_mlp": 1.01737082, + "epoch": 0.529204869983466, + "flos": 28375374879360.0, + "grad_norm": 1.5427075698018011, + "language_loss": 0.64754599, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.6722368, + "num_input_tokens_seen": 189134700, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.20666504, + "step": 8802, + "time_per_iteration": 2.920804500579834 + }, + { + "auxiliary_loss_clip": 0.01430239, + "auxiliary_loss_mlp": 0.01041722, + "balance_loss_clip": 1.26325619, + "balance_loss_mlp": 1.02102709, + "epoch": 0.5292649932361341, + "flos": 33770056118400.0, + "grad_norm": 1.704249799922217, + "language_loss": 0.6958949, + "learning_rate": 1.907535821289003e-06, + "loss": 0.72061449, + "num_input_tokens_seen": 189155365, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.20690918, + "step": 8803, + "time_per_iteration": 2.9439170360565186 + }, + { + "auxiliary_loss_clip": 0.0142439, + "auxiliary_loss_mlp": 0.01038585, + "balance_loss_clip": 1.25820971, + "balance_loss_mlp": 1.01847434, + "epoch": 0.5293251164888021, + "flos": 20457048384000.0, + "grad_norm": 1.9177814334077956, + "language_loss": 0.766922, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.79155171, + "num_input_tokens_seen": 189173885, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.20117188, + "step": 8804, + "time_per_iteration": 2.9329490661621094 + }, + { + "auxiliary_loss_clip": 0.01207574, + "auxiliary_loss_mlp": 0.01023298, + "balance_loss_clip": 1.1150403, + "balance_loss_mlp": 1.0055598, + "epoch": 0.52938523974147, + "flos": 66580294872960.0, + "grad_norm": 0.7588912456657476, + "language_loss": 0.53032529, + "learning_rate": 1.906757737841291e-06, + "loss": 0.552634, + "num_input_tokens_seen": 189236515, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.17773438, + "step": 8805, + "time_per_iteration": 3.391428232192993 + }, + { + "auxiliary_loss_clip": 0.01208627, + "auxiliary_loss_mlp": 0.01018533, + "balance_loss_clip": 1.11675096, + "balance_loss_mlp": 1.00270164, + "epoch": 0.529445362994138, + "flos": 67183283329920.0, + "grad_norm": 0.742572667260767, + "language_loss": 0.63882744, + "learning_rate": 1.906368701413693e-06, + "loss": 0.66109896, + "num_input_tokens_seen": 189300500, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.15820312, + "step": 8806, + "time_per_iteration": 3.2838244438171387 + }, + { + "auxiliary_loss_clip": 0.01443915, + "auxiliary_loss_mlp": 0.01040062, + "balance_loss_clip": 1.26749098, + "balance_loss_mlp": 1.01921272, + "epoch": 0.5295054862468059, + "flos": 17758395665280.0, + "grad_norm": 1.665566081116359, + "language_loss": 0.72987866, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.75471848, + "num_input_tokens_seen": 189319745, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.20861816, + "step": 8807, + "time_per_iteration": 2.891240358352661 + }, + { + "auxiliary_loss_clip": 0.01426941, + "auxiliary_loss_mlp": 0.01041158, + "balance_loss_clip": 1.2594502, + "balance_loss_mlp": 1.02157211, + "epoch": 0.529565609499474, + "flos": 11403951586560.0, + "grad_norm": 2.176363969644659, + "language_loss": 0.71134853, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.73602957, + "num_input_tokens_seen": 189334550, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.19592285, + "step": 8808, + "time_per_iteration": 2.8540472984313965 + }, + { + "auxiliary_loss_clip": 0.01431724, + "auxiliary_loss_mlp": 0.01038893, + "balance_loss_clip": 1.26265073, + "balance_loss_mlp": 1.01947427, + "epoch": 0.5296257327521419, + "flos": 17203828423680.0, + "grad_norm": 1.7727718572722577, + "language_loss": 0.87823373, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.90293992, + "num_input_tokens_seen": 189351735, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.19396973, + "step": 8809, + "time_per_iteration": 2.844578266143799 + }, + { + "auxiliary_loss_clip": 0.0146029, + "auxiliary_loss_mlp": 0.01037733, + "balance_loss_clip": 1.28154755, + "balance_loss_mlp": 1.01554775, + "epoch": 0.5296858560048099, + "flos": 39977617017600.0, + "grad_norm": 1.6554312032956429, + "language_loss": 0.64825422, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.67323446, + "num_input_tokens_seen": 189373105, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.22167969, + "step": 8810, + "time_per_iteration": 2.96635365486145 + }, + { + "auxiliary_loss_clip": 0.01420454, + "auxiliary_loss_mlp": 0.01036432, + "balance_loss_clip": 1.25365555, + "balance_loss_mlp": 1.01677465, + "epoch": 0.5297459792574778, + "flos": 20971682449920.0, + "grad_norm": 2.73870256573814, + "language_loss": 0.68571365, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.71028244, + "num_input_tokens_seen": 189394615, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.1965332, + "step": 8811, + "time_per_iteration": 2.8747780323028564 + }, + { + "auxiliary_loss_clip": 0.01201562, + "auxiliary_loss_mlp": 0.01017434, + "balance_loss_clip": 1.1095103, + "balance_loss_mlp": 1.00141239, + "epoch": 0.5298061025101458, + "flos": 66552967013760.0, + "grad_norm": 0.6626694853394004, + "language_loss": 0.53414667, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.55633664, + "num_input_tokens_seen": 189459750, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.16015625, + "step": 8812, + "time_per_iteration": 3.4563448429107666 + }, + { + "auxiliary_loss_clip": 0.01201976, + "auxiliary_loss_mlp": 0.01012123, + "balance_loss_clip": 1.10981917, + "balance_loss_mlp": 0.99762672, + "epoch": 0.5298662257628137, + "flos": 67694795504640.0, + "grad_norm": 0.7274847592762559, + "language_loss": 0.56322312, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.5853641, + "num_input_tokens_seen": 189527540, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.14453125, + "step": 8813, + "time_per_iteration": 3.3455958366394043 + }, + { + "auxiliary_loss_clip": 0.01423777, + "auxiliary_loss_mlp": 0.01036351, + "balance_loss_clip": 1.258394, + "balance_loss_mlp": 1.01588273, + "epoch": 0.5299263490154817, + "flos": 19655932371840.0, + "grad_norm": 1.7962616070249795, + "language_loss": 0.82430863, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.84890985, + "num_input_tokens_seen": 189546900, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20483398, + "step": 8814, + "time_per_iteration": 2.830716609954834 + }, + { + "auxiliary_loss_clip": 0.01458145, + "auxiliary_loss_mlp": 0.0103935, + "balance_loss_clip": 1.28235793, + "balance_loss_mlp": 1.01853633, + "epoch": 0.5299864722681497, + "flos": 22065297966720.0, + "grad_norm": 1.5774213463902094, + "language_loss": 0.85730016, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.8822751, + "num_input_tokens_seen": 189566490, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.20812988, + "step": 8815, + "time_per_iteration": 2.849721908569336 + }, + { + "auxiliary_loss_clip": 0.01423864, + "auxiliary_loss_mlp": 0.01033723, + "balance_loss_clip": 1.2584933, + "balance_loss_mlp": 1.01414895, + "epoch": 0.5300465955208177, + "flos": 21774065316480.0, + "grad_norm": 4.137982845746858, + "language_loss": 0.67364442, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.69822031, + "num_input_tokens_seen": 189585580, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.19555664, + "step": 8816, + "time_per_iteration": 2.843815565109253 + }, + { + "auxiliary_loss_clip": 0.0143019, + "auxiliary_loss_mlp": 0.01040558, + "balance_loss_clip": 1.26112032, + "balance_loss_mlp": 1.02061462, + "epoch": 0.5301067187734857, + "flos": 43011869552640.0, + "grad_norm": 1.8719714488835064, + "language_loss": 0.73104727, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.75575477, + "num_input_tokens_seen": 189608485, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.19946289, + "step": 8817, + "time_per_iteration": 3.0196549892425537 + }, + { + "auxiliary_loss_clip": 0.01430428, + "auxiliary_loss_mlp": 0.01036141, + "balance_loss_clip": 1.25958014, + "balance_loss_mlp": 1.01481485, + "epoch": 0.5301668420261536, + "flos": 20562957757440.0, + "grad_norm": 1.6892600829755202, + "language_loss": 0.6592443, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.68390989, + "num_input_tokens_seen": 189627815, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.21350098, + "step": 8818, + "time_per_iteration": 2.873847723007202 + }, + { + "auxiliary_loss_clip": 0.01434893, + "auxiliary_loss_mlp": 0.01037344, + "balance_loss_clip": 1.26367927, + "balance_loss_mlp": 1.01630354, + "epoch": 0.5302269652788216, + "flos": 17493794219520.0, + "grad_norm": 3.8001029537097337, + "language_loss": 0.76059425, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.78531659, + "num_input_tokens_seen": 189644850, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.21032715, + "step": 8819, + "time_per_iteration": 2.8446319103240967 + }, + { + "auxiliary_loss_clip": 0.01453134, + "auxiliary_loss_mlp": 0.01043094, + "balance_loss_clip": 1.27845478, + "balance_loss_mlp": 1.02266157, + "epoch": 0.5302870885314895, + "flos": 14582236878720.0, + "grad_norm": 1.9730044568934135, + "language_loss": 0.82571322, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.85067546, + "num_input_tokens_seen": 189660945, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.2043457, + "step": 8820, + "time_per_iteration": 2.9303741455078125 + }, + { + "auxiliary_loss_clip": 0.01428979, + "auxiliary_loss_mlp": 0.01036017, + "balance_loss_clip": 1.25876093, + "balance_loss_mlp": 1.0163238, + "epoch": 0.5303472117841576, + "flos": 23447793404160.0, + "grad_norm": 1.7059309158588933, + "language_loss": 0.72701776, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.75166774, + "num_input_tokens_seen": 189680425, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.19689941, + "step": 8821, + "time_per_iteration": 4.348541498184204 + }, + { + "auxiliary_loss_clip": 0.01425363, + "auxiliary_loss_mlp": 0.01040205, + "balance_loss_clip": 1.25757766, + "balance_loss_mlp": 1.02154827, + "epoch": 0.5304073350368255, + "flos": 22718761637760.0, + "grad_norm": 1.5097278569951182, + "language_loss": 0.74559736, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.77025306, + "num_input_tokens_seen": 189700375, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.18676758, + "step": 8822, + "time_per_iteration": 2.8640997409820557 + }, + { + "auxiliary_loss_clip": 0.01429934, + "auxiliary_loss_mlp": 0.01037749, + "balance_loss_clip": 1.25927961, + "balance_loss_mlp": 1.01645803, + "epoch": 0.5304674582894935, + "flos": 27940199961600.0, + "grad_norm": 1.8496115534577406, + "language_loss": 0.68094337, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.70562017, + "num_input_tokens_seen": 189721225, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.21289062, + "step": 8823, + "time_per_iteration": 2.8662829399108887 + }, + { + "auxiliary_loss_clip": 0.01443168, + "auxiliary_loss_mlp": 0.010348, + "balance_loss_clip": 1.27090812, + "balance_loss_mlp": 1.01392686, + "epoch": 0.5305275815421614, + "flos": 21260290901760.0, + "grad_norm": 1.6996458295104548, + "language_loss": 0.69972324, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.72450298, + "num_input_tokens_seen": 189740170, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.20861816, + "step": 8824, + "time_per_iteration": 2.831275224685669 + }, + { + "auxiliary_loss_clip": 0.01418817, + "auxiliary_loss_mlp": 0.01038077, + "balance_loss_clip": 1.25349808, + "balance_loss_mlp": 1.01843095, + "epoch": 0.5305877047948294, + "flos": 17612055423360.0, + "grad_norm": 2.8203133687300044, + "language_loss": 0.77010268, + "learning_rate": 1.898977700702689e-06, + "loss": 0.79467165, + "num_input_tokens_seen": 189757890, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.1965332, + "step": 8825, + "time_per_iteration": 2.8152706623077393 + }, + { + "auxiliary_loss_clip": 0.01413988, + "auxiliary_loss_mlp": 0.01040204, + "balance_loss_clip": 1.24860632, + "balance_loss_mlp": 1.01919973, + "epoch": 0.5306478280474973, + "flos": 15203956682880.0, + "grad_norm": 1.8188845261006807, + "language_loss": 0.86510086, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.88964272, + "num_input_tokens_seen": 189775390, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.20996094, + "step": 8826, + "time_per_iteration": 2.8048508167266846 + }, + { + "auxiliary_loss_clip": 0.01423737, + "auxiliary_loss_mlp": 0.01034248, + "balance_loss_clip": 1.25773478, + "balance_loss_mlp": 1.01405358, + "epoch": 0.5307079513001653, + "flos": 15349663497600.0, + "grad_norm": 1.487386710268766, + "language_loss": 0.64777899, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.67235887, + "num_input_tokens_seen": 189793975, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.2019043, + "step": 8827, + "time_per_iteration": 2.8064804077148438 + }, + { + "auxiliary_loss_clip": 0.01431408, + "auxiliary_loss_mlp": 0.01036902, + "balance_loss_clip": 1.2617321, + "balance_loss_mlp": 1.01652908, + "epoch": 0.5307680745528333, + "flos": 43560147767040.0, + "grad_norm": 1.5615440184545537, + "language_loss": 0.60281485, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.62749797, + "num_input_tokens_seen": 189817870, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.20361328, + "step": 8828, + "time_per_iteration": 3.031874179840088 + }, + { + "auxiliary_loss_clip": 0.01451571, + "auxiliary_loss_mlp": 0.01040044, + "balance_loss_clip": 1.27876413, + "balance_loss_mlp": 1.01914668, + "epoch": 0.5308281978055013, + "flos": 20058865729920.0, + "grad_norm": 2.1331856062856334, + "language_loss": 0.82011205, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.84502816, + "num_input_tokens_seen": 189837905, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.20910645, + "step": 8829, + "time_per_iteration": 2.8473265171051025 + }, + { + "auxiliary_loss_clip": 0.0141749, + "auxiliary_loss_mlp": 0.01031561, + "balance_loss_clip": 1.25106442, + "balance_loss_mlp": 1.01136696, + "epoch": 0.5308883210581693, + "flos": 20713596255360.0, + "grad_norm": 1.4228037860535858, + "language_loss": 0.78610235, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.81059289, + "num_input_tokens_seen": 189856970, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.2019043, + "step": 8830, + "time_per_iteration": 2.798396348953247 + }, + { + "auxiliary_loss_clip": 0.01427324, + "auxiliary_loss_mlp": 0.01032818, + "balance_loss_clip": 1.25803602, + "balance_loss_mlp": 1.01364875, + "epoch": 0.5309484443108372, + "flos": 14363540922240.0, + "grad_norm": 2.077584187931709, + "language_loss": 0.81346893, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.83807033, + "num_input_tokens_seen": 189872830, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.19177246, + "step": 8831, + "time_per_iteration": 2.792092800140381 + }, + { + "auxiliary_loss_clip": 0.01420556, + "auxiliary_loss_mlp": 0.01034514, + "balance_loss_clip": 1.25305462, + "balance_loss_mlp": 1.01466584, + "epoch": 0.5310085675635052, + "flos": 20019927939840.0, + "grad_norm": 1.839931995524094, + "language_loss": 0.74610317, + "learning_rate": 1.896255043672186e-06, + "loss": 0.77065384, + "num_input_tokens_seen": 189891635, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.19848633, + "step": 8832, + "time_per_iteration": 4.2181665897369385 + }, + { + "auxiliary_loss_clip": 0.01443681, + "auxiliary_loss_mlp": 0.01039071, + "balance_loss_clip": 1.27119756, + "balance_loss_mlp": 1.01904392, + "epoch": 0.5310686908161731, + "flos": 22137427457280.0, + "grad_norm": 1.901835911119162, + "language_loss": 0.76472789, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.78955543, + "num_input_tokens_seen": 189909050, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.20031738, + "step": 8833, + "time_per_iteration": 4.290328025817871 + }, + { + "auxiliary_loss_clip": 0.01439676, + "auxiliary_loss_mlp": 0.01037325, + "balance_loss_clip": 1.26763177, + "balance_loss_mlp": 1.01788187, + "epoch": 0.5311288140688412, + "flos": 24728722968960.0, + "grad_norm": 1.738875731957632, + "language_loss": 0.73832065, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.76309067, + "num_input_tokens_seen": 189927405, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.19421387, + "step": 8834, + "time_per_iteration": 4.296040058135986 + }, + { + "auxiliary_loss_clip": 0.01455111, + "auxiliary_loss_mlp": 0.01040422, + "balance_loss_clip": 1.27741671, + "balance_loss_mlp": 1.02034688, + "epoch": 0.5311889373215091, + "flos": 24108405753600.0, + "grad_norm": 2.1236175092062726, + "language_loss": 0.78863287, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.81358826, + "num_input_tokens_seen": 189947740, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.20068359, + "step": 8835, + "time_per_iteration": 2.8598556518554688 + }, + { + "auxiliary_loss_clip": 0.01422377, + "auxiliary_loss_mlp": 0.01033049, + "balance_loss_clip": 1.25341153, + "balance_loss_mlp": 1.01101923, + "epoch": 0.5312490605741771, + "flos": 22026903114240.0, + "grad_norm": 1.7726651596003606, + "language_loss": 0.72693956, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.75149381, + "num_input_tokens_seen": 189966495, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.22058105, + "step": 8836, + "time_per_iteration": 2.855607032775879 + }, + { + "auxiliary_loss_clip": 0.01426411, + "auxiliary_loss_mlp": 0.0103723, + "balance_loss_clip": 1.25476658, + "balance_loss_mlp": 1.01539063, + "epoch": 0.531309183826845, + "flos": 19399882193280.0, + "grad_norm": 1.6702781930271118, + "language_loss": 0.81195903, + "learning_rate": 1.894310406375987e-06, + "loss": 0.83659542, + "num_input_tokens_seen": 189985325, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.21838379, + "step": 8837, + "time_per_iteration": 2.811922311782837 + }, + { + "auxiliary_loss_clip": 0.01416523, + "auxiliary_loss_mlp": 0.01035276, + "balance_loss_clip": 1.25048339, + "balance_loss_mlp": 1.01445031, + "epoch": 0.531369307079513, + "flos": 20198781210240.0, + "grad_norm": 3.5069943109706596, + "language_loss": 0.86963886, + "learning_rate": 1.893921490881035e-06, + "loss": 0.89415681, + "num_input_tokens_seen": 190003290, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.20825195, + "step": 8838, + "time_per_iteration": 2.8138225078582764 + }, + { + "auxiliary_loss_clip": 0.01415656, + "auxiliary_loss_mlp": 0.0103314, + "balance_loss_clip": 1.24957848, + "balance_loss_mlp": 1.01251662, + "epoch": 0.5314294303321809, + "flos": 18889455893760.0, + "grad_norm": 1.699227251714988, + "language_loss": 0.73668069, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.7611686, + "num_input_tokens_seen": 190023260, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.20617676, + "step": 8839, + "time_per_iteration": 2.959918737411499 + }, + { + "auxiliary_loss_clip": 0.01425869, + "auxiliary_loss_mlp": 0.01037483, + "balance_loss_clip": 1.2552731, + "balance_loss_mlp": 1.01712167, + "epoch": 0.531489553584849, + "flos": 23050560890880.0, + "grad_norm": 5.841162525210617, + "language_loss": 0.77264905, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.79728258, + "num_input_tokens_seen": 190042035, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.20373535, + "step": 8840, + "time_per_iteration": 2.9403014183044434 + }, + { + "auxiliary_loss_clip": 0.01429243, + "auxiliary_loss_mlp": 0.01038658, + "balance_loss_clip": 1.25798023, + "balance_loss_mlp": 1.01591277, + "epoch": 0.5315496768375169, + "flos": 19799557925760.0, + "grad_norm": 3.597842799302848, + "language_loss": 0.77347648, + "learning_rate": 1.892754768590216e-06, + "loss": 0.79815555, + "num_input_tokens_seen": 190057545, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.22729492, + "step": 8841, + "time_per_iteration": 2.863783121109009 + }, + { + "auxiliary_loss_clip": 0.0121127, + "auxiliary_loss_mlp": 0.01023734, + "balance_loss_clip": 1.11670947, + "balance_loss_mlp": 1.00933349, + "epoch": 0.5316098000901849, + "flos": 71056820505600.0, + "grad_norm": 0.6960591058827164, + "language_loss": 0.56823635, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.59058642, + "num_input_tokens_seen": 190123800, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.14355469, + "step": 8842, + "time_per_iteration": 3.492741346359253 + }, + { + "auxiliary_loss_clip": 0.01440013, + "auxiliary_loss_mlp": 0.01034819, + "balance_loss_clip": 1.26711655, + "balance_loss_mlp": 1.01455331, + "epoch": 0.5316699233428529, + "flos": 16444229155200.0, + "grad_norm": 1.6812562413717513, + "language_loss": 0.74289036, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.76763868, + "num_input_tokens_seen": 190141625, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.20263672, + "step": 8843, + "time_per_iteration": 2.8766441345214844 + }, + { + "auxiliary_loss_clip": 0.01210103, + "auxiliary_loss_mlp": 0.01017871, + "balance_loss_clip": 1.11604214, + "balance_loss_mlp": 1.00213552, + "epoch": 0.5317300465955208, + "flos": 67455983594880.0, + "grad_norm": 0.8830084509329251, + "language_loss": 0.61160439, + "learning_rate": 1.891588082900145e-06, + "loss": 0.63388413, + "num_input_tokens_seen": 190198110, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.15722656, + "step": 8844, + "time_per_iteration": 3.3341541290283203 + }, + { + "auxiliary_loss_clip": 0.01213624, + "auxiliary_loss_mlp": 0.01017718, + "balance_loss_clip": 1.11753523, + "balance_loss_mlp": 1.00245881, + "epoch": 0.5317901698481888, + "flos": 59532933657600.0, + "grad_norm": 0.8439931980558784, + "language_loss": 0.62209809, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.6444115, + "num_input_tokens_seen": 190259950, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.15234375, + "step": 8845, + "time_per_iteration": 3.3040082454681396 + }, + { + "auxiliary_loss_clip": 0.01427752, + "auxiliary_loss_mlp": 0.01034924, + "balance_loss_clip": 1.25956023, + "balance_loss_mlp": 1.01327562, + "epoch": 0.5318502931008567, + "flos": 19136954805120.0, + "grad_norm": 2.226302747796165, + "language_loss": 0.76895672, + "learning_rate": 1.890810312970474e-06, + "loss": 0.79358351, + "num_input_tokens_seen": 190278265, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.21643066, + "step": 8846, + "time_per_iteration": 2.8570191860198975 + }, + { + "auxiliary_loss_clip": 0.01431652, + "auxiliary_loss_mlp": 0.01036844, + "balance_loss_clip": 1.26033282, + "balance_loss_mlp": 1.01730597, + "epoch": 0.5319104163535248, + "flos": 24691775950080.0, + "grad_norm": 1.6989911170044525, + "language_loss": 0.76413083, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.78881574, + "num_input_tokens_seen": 190298400, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.19543457, + "step": 8847, + "time_per_iteration": 2.879509687423706 + }, + { + "auxiliary_loss_clip": 0.01415444, + "auxiliary_loss_mlp": 0.01038691, + "balance_loss_clip": 1.24900901, + "balance_loss_mlp": 1.0189147, + "epoch": 0.5319705396061927, + "flos": 19393547921280.0, + "grad_norm": 1.5646435578738025, + "language_loss": 0.88190579, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.90644711, + "num_input_tokens_seen": 190316235, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.19775391, + "step": 8848, + "time_per_iteration": 2.8821699619293213 + }, + { + "auxiliary_loss_clip": 0.01426462, + "auxiliary_loss_mlp": 0.0103971, + "balance_loss_clip": 1.25627804, + "balance_loss_mlp": 1.01851416, + "epoch": 0.5320306628588607, + "flos": 18268324272000.0, + "grad_norm": 2.598598248842916, + "language_loss": 0.75547963, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.78014135, + "num_input_tokens_seen": 190335060, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.21203613, + "step": 8849, + "time_per_iteration": 2.8607053756713867 + }, + { + "auxiliary_loss_clip": 0.01439759, + "auxiliary_loss_mlp": 0.01038551, + "balance_loss_clip": 1.26564872, + "balance_loss_mlp": 1.01723695, + "epoch": 0.5320907861115286, + "flos": 23742781372800.0, + "grad_norm": 2.863602753926276, + "language_loss": 0.8051492, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.82993233, + "num_input_tokens_seen": 190353265, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.21276855, + "step": 8850, + "time_per_iteration": 2.8167285919189453 + }, + { + "auxiliary_loss_clip": 0.01417076, + "auxiliary_loss_mlp": 0.01036744, + "balance_loss_clip": 1.24883342, + "balance_loss_mlp": 1.01587033, + "epoch": 0.5321509093641966, + "flos": 34508408313600.0, + "grad_norm": 1.3634999708125297, + "language_loss": 0.55440974, + "learning_rate": 1.888865960862821e-06, + "loss": 0.57894796, + "num_input_tokens_seen": 190376575, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.20874023, + "step": 8851, + "time_per_iteration": 2.9646615982055664 + }, + { + "auxiliary_loss_clip": 0.01437525, + "auxiliary_loss_mlp": 0.01040496, + "balance_loss_clip": 1.26626492, + "balance_loss_mlp": 1.02094579, + "epoch": 0.5322110326168645, + "flos": 20020470877440.0, + "grad_norm": 2.0753918674280922, + "language_loss": 0.69481426, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.71959448, + "num_input_tokens_seen": 190395185, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.19555664, + "step": 8852, + "time_per_iteration": 2.798671007156372 + }, + { + "auxiliary_loss_clip": 0.01208746, + "auxiliary_loss_mlp": 0.01026063, + "balance_loss_clip": 1.1162287, + "balance_loss_mlp": 1.00956464, + "epoch": 0.5322711558695326, + "flos": 64661266886400.0, + "grad_norm": 0.8044294190472469, + "language_loss": 0.62994313, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.65229124, + "num_input_tokens_seen": 190452595, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.16503906, + "step": 8853, + "time_per_iteration": 3.3102242946624756 + }, + { + "auxiliary_loss_clip": 0.01443286, + "auxiliary_loss_mlp": 0.01044268, + "balance_loss_clip": 1.26803732, + "balance_loss_mlp": 1.02444315, + "epoch": 0.5323312791222005, + "flos": 14947544545920.0, + "grad_norm": 2.4999240924995507, + "language_loss": 0.80914903, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.83402455, + "num_input_tokens_seen": 190469140, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.19824219, + "step": 8854, + "time_per_iteration": 2.7819952964782715 + }, + { + "auxiliary_loss_clip": 0.01413758, + "auxiliary_loss_mlp": 0.01034407, + "balance_loss_clip": 1.24979234, + "balance_loss_mlp": 1.01551235, + "epoch": 0.5323914023748685, + "flos": 23451322498560.0, + "grad_norm": 1.8176025394958988, + "language_loss": 0.74519753, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.76967919, + "num_input_tokens_seen": 190489015, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.18884277, + "step": 8855, + "time_per_iteration": 2.835422992706299 + }, + { + "auxiliary_loss_clip": 0.01422524, + "auxiliary_loss_mlp": 0.01034691, + "balance_loss_clip": 1.25471759, + "balance_loss_mlp": 1.01562965, + "epoch": 0.5324515256275365, + "flos": 26297265600000.0, + "grad_norm": 2.1642255286287897, + "language_loss": 0.66275465, + "learning_rate": 1.886921714110507e-06, + "loss": 0.68732679, + "num_input_tokens_seen": 190508065, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.19055176, + "step": 8856, + "time_per_iteration": 4.3179240226745605 + }, + { + "auxiliary_loss_clip": 0.01433811, + "auxiliary_loss_mlp": 0.01036214, + "balance_loss_clip": 1.26088095, + "balance_loss_mlp": 1.01590073, + "epoch": 0.5325116488802044, + "flos": 26882400343680.0, + "grad_norm": 2.381173755041629, + "language_loss": 0.78076309, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.80546331, + "num_input_tokens_seen": 190527045, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.203125, + "step": 8857, + "time_per_iteration": 2.854440689086914 + }, + { + "auxiliary_loss_clip": 0.01430457, + "auxiliary_loss_mlp": 0.01040316, + "balance_loss_clip": 1.26102209, + "balance_loss_mlp": 1.01971674, + "epoch": 0.5325717721328724, + "flos": 25895689585920.0, + "grad_norm": 17.638073917407514, + "language_loss": 0.71705532, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.741763, + "num_input_tokens_seen": 190544075, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.20605469, + "step": 8858, + "time_per_iteration": 2.845877170562744 + }, + { + "auxiliary_loss_clip": 0.01429206, + "auxiliary_loss_mlp": 0.01041994, + "balance_loss_clip": 1.25950623, + "balance_loss_mlp": 1.02069104, + "epoch": 0.5326318953855403, + "flos": 21809474012160.0, + "grad_norm": 1.8458563322804848, + "language_loss": 0.70355803, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.72827005, + "num_input_tokens_seen": 190566030, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.21313477, + "step": 8859, + "time_per_iteration": 2.9480140209198 + }, + { + "auxiliary_loss_clip": 0.01416178, + "auxiliary_loss_mlp": 0.01036291, + "balance_loss_clip": 1.25220227, + "balance_loss_mlp": 1.01671731, + "epoch": 0.5326920186382084, + "flos": 20932654170240.0, + "grad_norm": 1.5625653568535898, + "language_loss": 0.70179343, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.72631812, + "num_input_tokens_seen": 190585605, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19567871, + "step": 8860, + "time_per_iteration": 2.8792800903320312 + }, + { + "auxiliary_loss_clip": 0.01419947, + "auxiliary_loss_mlp": 0.01034149, + "balance_loss_clip": 1.2535336, + "balance_loss_mlp": 1.014539, + "epoch": 0.5327521418908763, + "flos": 21443216204160.0, + "grad_norm": 2.045119622674687, + "language_loss": 0.7853806, + "learning_rate": 1.884977574556683e-06, + "loss": 0.80992156, + "num_input_tokens_seen": 190604625, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.19604492, + "step": 8861, + "time_per_iteration": 2.85494327545166 + }, + { + "auxiliary_loss_clip": 0.01427386, + "auxiliary_loss_mlp": 0.01041047, + "balance_loss_clip": 1.25754976, + "balance_loss_mlp": 1.01945782, + "epoch": 0.5328122651435443, + "flos": 21769721815680.0, + "grad_norm": 2.1045457623875357, + "language_loss": 0.86404967, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.88873404, + "num_input_tokens_seen": 190625060, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.21594238, + "step": 8862, + "time_per_iteration": 3.0078816413879395 + }, + { + "auxiliary_loss_clip": 0.01431251, + "auxiliary_loss_mlp": 0.01034622, + "balance_loss_clip": 1.25851583, + "balance_loss_mlp": 1.01185298, + "epoch": 0.5328723883962122, + "flos": 18305678494080.0, + "grad_norm": 1.9433975953215303, + "language_loss": 0.62562531, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.65028405, + "num_input_tokens_seen": 190643150, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.2277832, + "step": 8863, + "time_per_iteration": 2.8746144771575928 + }, + { + "auxiliary_loss_clip": 0.01422817, + "auxiliary_loss_mlp": 0.01037337, + "balance_loss_clip": 1.25823522, + "balance_loss_mlp": 1.01678491, + "epoch": 0.5329325116488802, + "flos": 25385987203200.0, + "grad_norm": 2.859434726164187, + "language_loss": 0.74938512, + "learning_rate": 1.883811143046377e-06, + "loss": 0.7739867, + "num_input_tokens_seen": 190662725, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.20556641, + "step": 8864, + "time_per_iteration": 2.8922362327575684 + }, + { + "auxiliary_loss_clip": 0.01414509, + "auxiliary_loss_mlp": 0.01035994, + "balance_loss_clip": 1.24856043, + "balance_loss_mlp": 1.01541877, + "epoch": 0.5329926349015481, + "flos": 25602782878080.0, + "grad_norm": 1.989030663091117, + "language_loss": 0.65468109, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.67918611, + "num_input_tokens_seen": 190683680, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.20581055, + "step": 8865, + "time_per_iteration": 2.9081478118896484 + }, + { + "auxiliary_loss_clip": 0.01433147, + "auxiliary_loss_mlp": 0.01032799, + "balance_loss_clip": 1.26530325, + "balance_loss_mlp": 1.01253295, + "epoch": 0.5330527581542162, + "flos": 22898927007360.0, + "grad_norm": 2.0235807801460894, + "language_loss": 0.79424822, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.81890774, + "num_input_tokens_seen": 190703350, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.20251465, + "step": 8866, + "time_per_iteration": 2.818025827407837 + }, + { + "auxiliary_loss_clip": 0.01430357, + "auxiliary_loss_mlp": 0.01041711, + "balance_loss_clip": 1.26228976, + "balance_loss_mlp": 1.02143335, + "epoch": 0.5331128814068841, + "flos": 16033377957120.0, + "grad_norm": 2.0347730340952555, + "language_loss": 0.74608225, + "learning_rate": 1.882644751189108e-06, + "loss": 0.77080286, + "num_input_tokens_seen": 190721170, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.20275879, + "step": 8867, + "time_per_iteration": 4.230633497238159 + }, + { + "auxiliary_loss_clip": 0.01425008, + "auxiliary_loss_mlp": 0.01038589, + "balance_loss_clip": 1.25628424, + "balance_loss_mlp": 1.01747656, + "epoch": 0.5331730046595521, + "flos": 39358295187840.0, + "grad_norm": 1.5583660696227977, + "language_loss": 0.72863257, + "learning_rate": 1.88225596278394e-06, + "loss": 0.75326854, + "num_input_tokens_seen": 190743795, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.21130371, + "step": 8868, + "time_per_iteration": 4.420438051223755 + }, + { + "auxiliary_loss_clip": 0.01428111, + "auxiliary_loss_mlp": 0.01039691, + "balance_loss_clip": 1.26008701, + "balance_loss_mlp": 1.01929402, + "epoch": 0.5332331279122201, + "flos": 24034964163840.0, + "grad_norm": 1.987840621428596, + "language_loss": 0.7903254, + "learning_rate": 1.881867178843637e-06, + "loss": 0.8150034, + "num_input_tokens_seen": 190761560, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.20397949, + "step": 8869, + "time_per_iteration": 4.410567998886108 + }, + { + "auxiliary_loss_clip": 0.01447211, + "auxiliary_loss_mlp": 0.01039861, + "balance_loss_clip": 1.27390599, + "balance_loss_mlp": 1.01927328, + "epoch": 0.533293251164888, + "flos": 17138304673920.0, + "grad_norm": 2.2665875144807694, + "language_loss": 0.76660019, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.79147089, + "num_input_tokens_seen": 190778875, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.20568848, + "step": 8870, + "time_per_iteration": 2.8205859661102295 + }, + { + "auxiliary_loss_clip": 0.01448273, + "auxiliary_loss_mlp": 0.01044275, + "balance_loss_clip": 1.27521312, + "balance_loss_mlp": 1.02185202, + "epoch": 0.533353374417556, + "flos": 22136251092480.0, + "grad_norm": 2.1505184398936334, + "language_loss": 0.75913543, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.7840609, + "num_input_tokens_seen": 190799830, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.22412109, + "step": 8871, + "time_per_iteration": 2.8773412704467773 + }, + { + "auxiliary_loss_clip": 0.01432255, + "auxiliary_loss_mlp": 0.01040992, + "balance_loss_clip": 1.26278901, + "balance_loss_mlp": 1.02095342, + "epoch": 0.533413497670224, + "flos": 15018995364480.0, + "grad_norm": 1.972671176364538, + "language_loss": 0.72483617, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.7495687, + "num_input_tokens_seen": 190817155, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.20031738, + "step": 8872, + "time_per_iteration": 2.8744421005249023 + }, + { + "auxiliary_loss_clip": 0.01421765, + "auxiliary_loss_mlp": 0.01037698, + "balance_loss_clip": 1.25539088, + "balance_loss_mlp": 1.01755166, + "epoch": 0.533473620922892, + "flos": 19619483045760.0, + "grad_norm": 2.4611469090000857, + "language_loss": 0.6572938, + "learning_rate": 1.880312088025936e-06, + "loss": 0.68188846, + "num_input_tokens_seen": 190835240, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.20153809, + "step": 8873, + "time_per_iteration": 2.8315553665161133 + }, + { + "auxiliary_loss_clip": 0.01428491, + "auxiliary_loss_mlp": 0.01040209, + "balance_loss_clip": 1.26123977, + "balance_loss_mlp": 1.01958621, + "epoch": 0.5335337441755599, + "flos": 14290687514880.0, + "grad_norm": 3.3821941473010892, + "language_loss": 0.80577004, + "learning_rate": 1.879923326631099e-06, + "loss": 0.83045709, + "num_input_tokens_seen": 190851620, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.20617676, + "step": 8874, + "time_per_iteration": 2.814331531524658 + }, + { + "auxiliary_loss_clip": 0.01418703, + "auxiliary_loss_mlp": 0.01031764, + "balance_loss_clip": 1.25098395, + "balance_loss_mlp": 1.01162922, + "epoch": 0.5335938674282279, + "flos": 20824889760000.0, + "grad_norm": 2.224870553059437, + "language_loss": 0.70400536, + "learning_rate": 1.879534569789582e-06, + "loss": 0.72851008, + "num_input_tokens_seen": 190870545, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.20129395, + "step": 8875, + "time_per_iteration": 2.853872060775757 + }, + { + "auxiliary_loss_clip": 0.01216674, + "auxiliary_loss_mlp": 0.01021291, + "balance_loss_clip": 1.12600315, + "balance_loss_mlp": 1.00584173, + "epoch": 0.5336539906808958, + "flos": 71432308252800.0, + "grad_norm": 0.7296616415787806, + "language_loss": 0.59737259, + "learning_rate": 1.879145817516126e-06, + "loss": 0.61975223, + "num_input_tokens_seen": 190931995, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.15429688, + "step": 8876, + "time_per_iteration": 3.4561338424682617 + }, + { + "auxiliary_loss_clip": 0.01425471, + "auxiliary_loss_mlp": 0.01036315, + "balance_loss_clip": 1.25599205, + "balance_loss_mlp": 1.01663399, + "epoch": 0.5337141139335638, + "flos": 20161562722560.0, + "grad_norm": 1.8792087772960298, + "language_loss": 0.75667512, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.78129292, + "num_input_tokens_seen": 190949890, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.19677734, + "step": 8877, + "time_per_iteration": 2.8564629554748535 + }, + { + "auxiliary_loss_clip": 0.01218936, + "auxiliary_loss_mlp": 0.01020744, + "balance_loss_clip": 1.12477255, + "balance_loss_mlp": 1.00224292, + "epoch": 0.5337742371862317, + "flos": 67758237993600.0, + "grad_norm": 0.76282272658024, + "language_loss": 0.57271379, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.5951106, + "num_input_tokens_seen": 191008480, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.18457031, + "step": 8878, + "time_per_iteration": 3.249645471572876 + }, + { + "auxiliary_loss_clip": 0.0144643, + "auxiliary_loss_mlp": 0.01041913, + "balance_loss_clip": 1.27197826, + "balance_loss_mlp": 1.02071738, + "epoch": 0.5338343604388998, + "flos": 25019593660800.0, + "grad_norm": 1.4895106366175577, + "language_loss": 0.73510408, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.75998747, + "num_input_tokens_seen": 191028995, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.21179199, + "step": 8879, + "time_per_iteration": 2.903784990310669 + }, + { + "auxiliary_loss_clip": 0.01433637, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.26190472, + "balance_loss_mlp": 1.01231027, + "epoch": 0.5338944836915677, + "flos": 17609747938560.0, + "grad_norm": 2.172844974558253, + "language_loss": 0.84581095, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.87047255, + "num_input_tokens_seen": 191045285, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.20227051, + "step": 8880, + "time_per_iteration": 2.948892593383789 + }, + { + "auxiliary_loss_clip": 0.01423185, + "auxiliary_loss_mlp": 0.01038183, + "balance_loss_clip": 1.25766861, + "balance_loss_mlp": 1.01837051, + "epoch": 0.5339546069442357, + "flos": 21733679692800.0, + "grad_norm": 1.4286569998474872, + "language_loss": 0.80452615, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.82913983, + "num_input_tokens_seen": 191066105, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.19812012, + "step": 8881, + "time_per_iteration": 2.8531334400177 + }, + { + "auxiliary_loss_clip": 0.01214078, + "auxiliary_loss_mlp": 0.01025706, + "balance_loss_clip": 1.12211788, + "balance_loss_mlp": 1.00787187, + "epoch": 0.5340147301969036, + "flos": 69750961056000.0, + "grad_norm": 0.8005766389682626, + "language_loss": 0.59291178, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.61530966, + "num_input_tokens_seen": 191126315, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.17871094, + "step": 8882, + "time_per_iteration": 3.250101327896118 + }, + { + "auxiliary_loss_clip": 0.01211607, + "auxiliary_loss_mlp": 0.01034921, + "balance_loss_clip": 1.12010467, + "balance_loss_mlp": 1.01594281, + "epoch": 0.5340748534495716, + "flos": 63905223974400.0, + "grad_norm": 0.8638521507057143, + "language_loss": 0.63813633, + "learning_rate": 1.876424680745913e-06, + "loss": 0.66060162, + "num_input_tokens_seen": 191174240, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.18945312, + "step": 8883, + "time_per_iteration": 3.1181132793426514 + }, + { + "auxiliary_loss_clip": 0.01435152, + "auxiliary_loss_mlp": 0.01035606, + "balance_loss_clip": 1.26364994, + "balance_loss_mlp": 1.01487541, + "epoch": 0.5341349767022396, + "flos": 28706043012480.0, + "grad_norm": 2.150999768283258, + "language_loss": 0.83269572, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.85740334, + "num_input_tokens_seen": 191193335, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.20739746, + "step": 8884, + "time_per_iteration": 2.91094708442688 + }, + { + "auxiliary_loss_clip": 0.01413099, + "auxiliary_loss_mlp": 0.01038583, + "balance_loss_clip": 1.24977911, + "balance_loss_mlp": 1.01809096, + "epoch": 0.5341950999549075, + "flos": 16298341361280.0, + "grad_norm": 1.498129726939141, + "language_loss": 0.72541142, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.74992824, + "num_input_tokens_seen": 191210900, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.20483398, + "step": 8885, + "time_per_iteration": 2.819249391555786 + }, + { + "auxiliary_loss_clip": 0.01440691, + "auxiliary_loss_mlp": 0.01033238, + "balance_loss_clip": 1.26716471, + "balance_loss_mlp": 1.01266229, + "epoch": 0.5342552232075756, + "flos": 14363721901440.0, + "grad_norm": 2.369993488732426, + "language_loss": 0.80112314, + "learning_rate": 1.87525854926798e-06, + "loss": 0.82586247, + "num_input_tokens_seen": 191226730, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.20581055, + "step": 8886, + "time_per_iteration": 2.796842098236084 + }, + { + "auxiliary_loss_clip": 0.01432241, + "auxiliary_loss_mlp": 0.01036991, + "balance_loss_clip": 1.26209617, + "balance_loss_mlp": 1.01459169, + "epoch": 0.5343153464602435, + "flos": 30309677625600.0, + "grad_norm": 1.872123671431763, + "language_loss": 0.75612843, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.78082073, + "num_input_tokens_seen": 191250435, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.22399902, + "step": 8887, + "time_per_iteration": 2.9201488494873047 + }, + { + "auxiliary_loss_clip": 0.01435002, + "auxiliary_loss_mlp": 0.01031779, + "balance_loss_clip": 1.26601338, + "balance_loss_mlp": 1.01171565, + "epoch": 0.5343754697129115, + "flos": 15604718290560.0, + "grad_norm": 3.181768667518862, + "language_loss": 0.70266426, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.72733206, + "num_input_tokens_seen": 191268315, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.20056152, + "step": 8888, + "time_per_iteration": 2.8565337657928467 + }, + { + "auxiliary_loss_clip": 0.01450655, + "auxiliary_loss_mlp": 0.0103519, + "balance_loss_clip": 1.2735393, + "balance_loss_mlp": 1.01425719, + "epoch": 0.5344355929655794, + "flos": 16918341863040.0, + "grad_norm": 2.1693031550369715, + "language_loss": 0.78246927, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.80732775, + "num_input_tokens_seen": 191287000, + "router_z_loss_clip": 1.77246094, + "router_z_loss_mlp": 0.20947266, + "step": 8889, + "time_per_iteration": 2.819161891937256 + }, + { + "auxiliary_loss_clip": 0.01425152, + "auxiliary_loss_mlp": 0.01039454, + "balance_loss_clip": 1.25769567, + "balance_loss_mlp": 1.01731682, + "epoch": 0.5344957162182474, + "flos": 16806369686400.0, + "grad_norm": 2.1061710133666436, + "language_loss": 0.70230043, + "learning_rate": 1.873703773589102e-06, + "loss": 0.72694647, + "num_input_tokens_seen": 191304565, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.22119141, + "step": 8890, + "time_per_iteration": 2.811339855194092 + }, + { + "auxiliary_loss_clip": 0.0143266, + "auxiliary_loss_mlp": 0.01041484, + "balance_loss_clip": 1.26003778, + "balance_loss_mlp": 1.01909649, + "epoch": 0.5345558394709153, + "flos": 12711105152640.0, + "grad_norm": 2.6252363475755027, + "language_loss": 0.77643985, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.80118132, + "num_input_tokens_seen": 191318300, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.22387695, + "step": 8891, + "time_per_iteration": 4.270822048187256 + }, + { + "auxiliary_loss_clip": 0.01413488, + "auxiliary_loss_mlp": 0.01033928, + "balance_loss_clip": 1.24818623, + "balance_loss_mlp": 1.0131377, + "epoch": 0.5346159627235834, + "flos": 22465109433600.0, + "grad_norm": 1.4909887070357695, + "language_loss": 0.75160748, + "learning_rate": 1.872926414425699e-06, + "loss": 0.77608168, + "num_input_tokens_seen": 191337925, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.20788574, + "step": 8892, + "time_per_iteration": 2.8724372386932373 + }, + { + "auxiliary_loss_clip": 0.01427236, + "auxiliary_loss_mlp": 0.01031843, + "balance_loss_clip": 1.25812674, + "balance_loss_mlp": 1.01186299, + "epoch": 0.5346760859762513, + "flos": 22425085768320.0, + "grad_norm": 1.5883401918041742, + "language_loss": 0.88294351, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.90753436, + "num_input_tokens_seen": 191357120, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.19958496, + "step": 8893, + "time_per_iteration": 2.856987237930298 + }, + { + "auxiliary_loss_clip": 0.01417241, + "auxiliary_loss_mlp": 0.01033299, + "balance_loss_clip": 1.2507956, + "balance_loss_mlp": 1.01225805, + "epoch": 0.5347362092289193, + "flos": 22825304438400.0, + "grad_norm": 2.32890627896578, + "language_loss": 0.74270296, + "learning_rate": 1.872149074536869e-06, + "loss": 0.76720834, + "num_input_tokens_seen": 191375395, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.21032715, + "step": 8894, + "time_per_iteration": 2.8510994911193848 + }, + { + "auxiliary_loss_clip": 0.01401141, + "auxiliary_loss_mlp": 0.01033549, + "balance_loss_clip": 1.23750377, + "balance_loss_mlp": 1.01173329, + "epoch": 0.5347963324815872, + "flos": 23229278426880.0, + "grad_norm": 1.7408587739287371, + "language_loss": 0.75281864, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.77716553, + "num_input_tokens_seen": 191395595, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.21813965, + "step": 8895, + "time_per_iteration": 2.851470470428467 + }, + { + "auxiliary_loss_clip": 0.01422652, + "auxiliary_loss_mlp": 0.01037874, + "balance_loss_clip": 1.25380456, + "balance_loss_mlp": 1.01547456, + "epoch": 0.5348564557342552, + "flos": 22611540165120.0, + "grad_norm": 1.7074404696461465, + "language_loss": 0.77273154, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.79733682, + "num_input_tokens_seen": 191413730, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.22399902, + "step": 8896, + "time_per_iteration": 2.8344829082489014 + }, + { + "auxiliary_loss_clip": 0.01405159, + "auxiliary_loss_mlp": 0.01038912, + "balance_loss_clip": 1.24056518, + "balance_loss_mlp": 1.01662016, + "epoch": 0.5349165789869232, + "flos": 18010916749440.0, + "grad_norm": 1.777899018249959, + "language_loss": 0.79224575, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.81668651, + "num_input_tokens_seen": 191432400, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.22290039, + "step": 8897, + "time_per_iteration": 2.8258559703826904 + }, + { + "auxiliary_loss_clip": 0.01424236, + "auxiliary_loss_mlp": 0.0103605, + "balance_loss_clip": 1.25462902, + "balance_loss_mlp": 1.01382971, + "epoch": 0.5349767022395912, + "flos": 17167198118400.0, + "grad_norm": 2.843313671400572, + "language_loss": 0.76714981, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.79175258, + "num_input_tokens_seen": 191448855, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.22229004, + "step": 8898, + "time_per_iteration": 2.815018653869629 + }, + { + "auxiliary_loss_clip": 0.01214043, + "auxiliary_loss_mlp": 0.01043426, + "balance_loss_clip": 1.12038422, + "balance_loss_mlp": 1.02425671, + "epoch": 0.5350368254922592, + "flos": 71027022165120.0, + "grad_norm": 0.8569755672076108, + "language_loss": 0.5804143, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.60298902, + "num_input_tokens_seen": 191519690, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.19140625, + "step": 8899, + "time_per_iteration": 3.5368707180023193 + }, + { + "auxiliary_loss_clip": 0.01408653, + "auxiliary_loss_mlp": 0.01035022, + "balance_loss_clip": 1.24284542, + "balance_loss_mlp": 1.01503062, + "epoch": 0.5350969487449271, + "flos": 27429366458880.0, + "grad_norm": 1.7773252354404405, + "language_loss": 0.70429337, + "learning_rate": 1.869817171696868e-06, + "loss": 0.72873008, + "num_input_tokens_seen": 191539380, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.1998291, + "step": 8900, + "time_per_iteration": 2.955561637878418 + }, + { + "auxiliary_loss_clip": 0.01424052, + "auxiliary_loss_mlp": 0.01034981, + "balance_loss_clip": 1.25405955, + "balance_loss_mlp": 1.01358247, + "epoch": 0.5351570719975951, + "flos": 19325083259520.0, + "grad_norm": 2.895172247888875, + "language_loss": 0.71986711, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.74445748, + "num_input_tokens_seen": 191557400, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.21398926, + "step": 8901, + "time_per_iteration": 2.830043077468872 + }, + { + "auxiliary_loss_clip": 0.01425633, + "auxiliary_loss_mlp": 0.0103519, + "balance_loss_clip": 1.25625873, + "balance_loss_mlp": 1.01345825, + "epoch": 0.535217195250263, + "flos": 19838088512640.0, + "grad_norm": 2.457606105317198, + "language_loss": 0.78345859, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.80806679, + "num_input_tokens_seen": 191575860, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.21728516, + "step": 8902, + "time_per_iteration": 4.377692461013794 + }, + { + "auxiliary_loss_clip": 0.01403984, + "auxiliary_loss_mlp": 0.01032164, + "balance_loss_clip": 1.24120939, + "balance_loss_mlp": 1.01093316, + "epoch": 0.535277318502931, + "flos": 22138151374080.0, + "grad_norm": 1.5191504119178216, + "language_loss": 0.70813847, + "learning_rate": 1.868651286721281e-06, + "loss": 0.7324999, + "num_input_tokens_seen": 191595775, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.21228027, + "step": 8903, + "time_per_iteration": 4.269007205963135 + }, + { + "auxiliary_loss_clip": 0.01424259, + "auxiliary_loss_mlp": 0.01036782, + "balance_loss_clip": 1.25321674, + "balance_loss_mlp": 1.01609945, + "epoch": 0.5353374417555989, + "flos": 25056721658880.0, + "grad_norm": 1.647286037561347, + "language_loss": 0.73141348, + "learning_rate": 1.86826266833795e-06, + "loss": 0.75602388, + "num_input_tokens_seen": 191617785, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.20678711, + "step": 8904, + "time_per_iteration": 2.9894731044769287 + }, + { + "auxiliary_loss_clip": 0.01424979, + "auxiliary_loss_mlp": 0.01037299, + "balance_loss_clip": 1.256109, + "balance_loss_mlp": 1.0155077, + "epoch": 0.535397565008267, + "flos": 19397574708480.0, + "grad_norm": 2.338463669712278, + "language_loss": 0.74180728, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.76643002, + "num_input_tokens_seen": 191636900, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.21777344, + "step": 8905, + "time_per_iteration": 4.360944986343384 + }, + { + "auxiliary_loss_clip": 0.0140337, + "auxiliary_loss_mlp": 0.01038814, + "balance_loss_clip": 1.2405597, + "balance_loss_mlp": 1.01767778, + "epoch": 0.5354576882609349, + "flos": 21481339587840.0, + "grad_norm": 1.5089512103997396, + "language_loss": 0.84615541, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.87057728, + "num_input_tokens_seen": 191656720, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.21130371, + "step": 8906, + "time_per_iteration": 2.882399797439575 + }, + { + "auxiliary_loss_clip": 0.01425731, + "auxiliary_loss_mlp": 0.01036248, + "balance_loss_clip": 1.25497806, + "balance_loss_mlp": 1.01490927, + "epoch": 0.5355178115136029, + "flos": 20787354558720.0, + "grad_norm": 1.792465442918349, + "language_loss": 0.74657035, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.77119011, + "num_input_tokens_seen": 191674445, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.21362305, + "step": 8907, + "time_per_iteration": 2.856821060180664 + }, + { + "auxiliary_loss_clip": 0.01411553, + "auxiliary_loss_mlp": 0.01040252, + "balance_loss_clip": 1.24542594, + "balance_loss_mlp": 1.01850796, + "epoch": 0.5355779347662708, + "flos": 23524583109120.0, + "grad_norm": 2.1198747698168754, + "language_loss": 0.76898015, + "learning_rate": 1.866708244906912e-06, + "loss": 0.79349816, + "num_input_tokens_seen": 191695000, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.21740723, + "step": 8908, + "time_per_iteration": 2.84199595451355 + }, + { + "auxiliary_loss_clip": 0.01425684, + "auxiliary_loss_mlp": 0.01039724, + "balance_loss_clip": 1.25550222, + "balance_loss_mlp": 1.01870739, + "epoch": 0.5356380580189388, + "flos": 20312925137280.0, + "grad_norm": 2.060152889227769, + "language_loss": 0.74808395, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.7727381, + "num_input_tokens_seen": 191713295, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.21020508, + "step": 8909, + "time_per_iteration": 2.8210549354553223 + }, + { + "auxiliary_loss_clip": 0.01412617, + "auxiliary_loss_mlp": 0.01041284, + "balance_loss_clip": 1.24724424, + "balance_loss_mlp": 1.02019572, + "epoch": 0.5356981812716068, + "flos": 21371720140800.0, + "grad_norm": 2.057554132336899, + "language_loss": 0.84555817, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.87009716, + "num_input_tokens_seen": 191732725, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.2109375, + "step": 8910, + "time_per_iteration": 2.8127779960632324 + }, + { + "auxiliary_loss_clip": 0.01413378, + "auxiliary_loss_mlp": 0.01037871, + "balance_loss_clip": 1.24592292, + "balance_loss_mlp": 1.01634145, + "epoch": 0.5357583045242748, + "flos": 23121333037440.0, + "grad_norm": 1.691613980109304, + "language_loss": 0.82861853, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.85313106, + "num_input_tokens_seen": 191753765, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.21533203, + "step": 8911, + "time_per_iteration": 2.8943979740142822 + }, + { + "auxiliary_loss_clip": 0.01415985, + "auxiliary_loss_mlp": 0.01035144, + "balance_loss_clip": 1.25052273, + "balance_loss_mlp": 1.01514053, + "epoch": 0.5358184277769428, + "flos": 21151531105920.0, + "grad_norm": 2.0116558569311462, + "language_loss": 0.69844258, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.72295386, + "num_input_tokens_seen": 191773560, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.20007324, + "step": 8912, + "time_per_iteration": 2.816542148590088 + }, + { + "auxiliary_loss_clip": 0.01413638, + "auxiliary_loss_mlp": 0.01041946, + "balance_loss_clip": 1.24820507, + "balance_loss_mlp": 1.02145362, + "epoch": 0.5358785510296107, + "flos": 16289156666880.0, + "grad_norm": 2.2878478264434143, + "language_loss": 0.72789335, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.75244915, + "num_input_tokens_seen": 191791255, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.20507812, + "step": 8913, + "time_per_iteration": 2.8243134021759033 + }, + { + "auxiliary_loss_clip": 0.0142617, + "auxiliary_loss_mlp": 0.01040472, + "balance_loss_clip": 1.25298274, + "balance_loss_mlp": 1.01951456, + "epoch": 0.5359386742822787, + "flos": 16984137081600.0, + "grad_norm": 1.7288964849335533, + "language_loss": 0.72728276, + "learning_rate": 1.864376761688156e-06, + "loss": 0.75194919, + "num_input_tokens_seen": 191809325, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.20959473, + "step": 8914, + "time_per_iteration": 2.8418385982513428 + }, + { + "auxiliary_loss_clip": 0.01437058, + "auxiliary_loss_mlp": 0.01041481, + "balance_loss_clip": 1.26330781, + "balance_loss_mlp": 1.01926076, + "epoch": 0.5359987975349466, + "flos": 20822491785600.0, + "grad_norm": 2.0284683327384423, + "language_loss": 0.7085917, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.73337704, + "num_input_tokens_seen": 191829795, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.22241211, + "step": 8915, + "time_per_iteration": 2.9107766151428223 + }, + { + "auxiliary_loss_clip": 0.01406072, + "auxiliary_loss_mlp": 0.01042036, + "balance_loss_clip": 1.23985565, + "balance_loss_mlp": 1.02099526, + "epoch": 0.5360589207876146, + "flos": 22210009395840.0, + "grad_norm": 1.5821632383443327, + "language_loss": 0.75997221, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.78445327, + "num_input_tokens_seen": 191850840, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.21032715, + "step": 8916, + "time_per_iteration": 2.869527578353882 + }, + { + "auxiliary_loss_clip": 0.01422654, + "auxiliary_loss_mlp": 0.01039465, + "balance_loss_clip": 1.25346208, + "balance_loss_mlp": 1.01852, + "epoch": 0.5361190440402825, + "flos": 31406822236800.0, + "grad_norm": 3.0032879639462844, + "language_loss": 0.72666186, + "learning_rate": 1.863211089308289e-06, + "loss": 0.75128305, + "num_input_tokens_seen": 191869520, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.20922852, + "step": 8917, + "time_per_iteration": 2.8968751430511475 + }, + { + "auxiliary_loss_clip": 0.01426998, + "auxiliary_loss_mlp": 0.0104555, + "balance_loss_clip": 1.25818098, + "balance_loss_mlp": 1.02462864, + "epoch": 0.5361791672929506, + "flos": 16078242816000.0, + "grad_norm": 2.1520764269799346, + "language_loss": 0.72786689, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.75259233, + "num_input_tokens_seen": 191887240, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.20910645, + "step": 8918, + "time_per_iteration": 2.847893238067627 + }, + { + "auxiliary_loss_clip": 0.01411477, + "auxiliary_loss_mlp": 0.01043463, + "balance_loss_clip": 1.2440505, + "balance_loss_mlp": 1.02335215, + "epoch": 0.5362392905456185, + "flos": 20750724253440.0, + "grad_norm": 1.585030033720012, + "language_loss": 0.7550866, + "learning_rate": 1.862434000299067e-06, + "loss": 0.77963603, + "num_input_tokens_seen": 191905690, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.2010498, + "step": 8919, + "time_per_iteration": 2.85774302482605 + }, + { + "auxiliary_loss_clip": 0.01422848, + "auxiliary_loss_mlp": 0.01040129, + "balance_loss_clip": 1.25204635, + "balance_loss_mlp": 1.02029216, + "epoch": 0.5362994137982865, + "flos": 17347001529600.0, + "grad_norm": 3.1736842466194273, + "language_loss": 0.72428405, + "learning_rate": 1.862045463611864e-06, + "loss": 0.74891376, + "num_input_tokens_seen": 191920725, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.19836426, + "step": 8920, + "time_per_iteration": 2.7780590057373047 + }, + { + "auxiliary_loss_clip": 0.014188, + "auxiliary_loss_mlp": 0.01037023, + "balance_loss_clip": 1.25118518, + "balance_loss_mlp": 1.01665044, + "epoch": 0.5363595370509544, + "flos": 42829260963840.0, + "grad_norm": 5.795808833494504, + "language_loss": 0.69395804, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.71851629, + "num_input_tokens_seen": 191944645, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.20361328, + "step": 8921, + "time_per_iteration": 3.032231330871582 + }, + { + "auxiliary_loss_clip": 0.01429536, + "auxiliary_loss_mlp": 0.01037656, + "balance_loss_clip": 1.26095545, + "balance_loss_mlp": 1.01832032, + "epoch": 0.5364196603036224, + "flos": 19181321971200.0, + "grad_norm": 2.286362525925914, + "language_loss": 0.81849205, + "learning_rate": 1.86126840594594e-06, + "loss": 0.84316397, + "num_input_tokens_seen": 191962265, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.19335938, + "step": 8922, + "time_per_iteration": 2.8201193809509277 + }, + { + "auxiliary_loss_clip": 0.01427598, + "auxiliary_loss_mlp": 0.01034485, + "balance_loss_clip": 1.25805807, + "balance_loss_mlp": 1.0148747, + "epoch": 0.5364797835562904, + "flos": 17940008868480.0, + "grad_norm": 1.999784984703286, + "language_loss": 0.77737546, + "learning_rate": 1.860879884996686e-06, + "loss": 0.80199629, + "num_input_tokens_seen": 191978850, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.19616699, + "step": 8923, + "time_per_iteration": 2.8236656188964844 + }, + { + "auxiliary_loss_clip": 0.01432919, + "auxiliary_loss_mlp": 0.0103748, + "balance_loss_clip": 1.26205516, + "balance_loss_mlp": 1.01730967, + "epoch": 0.5365399068089584, + "flos": 30240534291840.0, + "grad_norm": 1.453904520558473, + "language_loss": 0.71153915, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.73624313, + "num_input_tokens_seen": 192002000, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.20166016, + "step": 8924, + "time_per_iteration": 2.884310007095337 + }, + { + "auxiliary_loss_clip": 0.01438358, + "auxiliary_loss_mlp": 0.01040494, + "balance_loss_clip": 1.26547384, + "balance_loss_mlp": 1.02032387, + "epoch": 0.5366000300616264, + "flos": 24900337071360.0, + "grad_norm": 11.781586507407393, + "language_loss": 0.87968248, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.90447098, + "num_input_tokens_seen": 192019100, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.20166016, + "step": 8925, + "time_per_iteration": 2.8831605911254883 + }, + { + "auxiliary_loss_clip": 0.01433546, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.26087844, + "balance_loss_mlp": 1.01431119, + "epoch": 0.5366601533142943, + "flos": 29839003522560.0, + "grad_norm": 1.8893982466035257, + "language_loss": 0.78774905, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.81242841, + "num_input_tokens_seen": 192041660, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.20068359, + "step": 8926, + "time_per_iteration": 2.941638231277466 + }, + { + "auxiliary_loss_clip": 0.01410222, + "auxiliary_loss_mlp": 0.01037558, + "balance_loss_clip": 1.24570656, + "balance_loss_mlp": 1.01663685, + "epoch": 0.5367202765669623, + "flos": 27210941971200.0, + "grad_norm": 1.4958331309021753, + "language_loss": 0.67480373, + "learning_rate": 1.85932585410148e-06, + "loss": 0.69928151, + "num_input_tokens_seen": 192063540, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.20910645, + "step": 8927, + "time_per_iteration": 4.358684062957764 + }, + { + "auxiliary_loss_clip": 0.01423281, + "auxiliary_loss_mlp": 0.01033341, + "balance_loss_clip": 1.2519815, + "balance_loss_mlp": 1.01384997, + "epoch": 0.5367803998196302, + "flos": 20239574037120.0, + "grad_norm": 2.310734486582534, + "language_loss": 0.74209642, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.76666266, + "num_input_tokens_seen": 192081760, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.19494629, + "step": 8928, + "time_per_iteration": 2.867027997970581 + }, + { + "auxiliary_loss_clip": 0.01417262, + "auxiliary_loss_mlp": 0.01033391, + "balance_loss_clip": 1.24895072, + "balance_loss_mlp": 1.01375747, + "epoch": 0.5368405230722982, + "flos": 32165742833280.0, + "grad_norm": 1.861919820381051, + "language_loss": 0.63687021, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.66137671, + "num_input_tokens_seen": 192101620, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.19616699, + "step": 8929, + "time_per_iteration": 2.9286742210388184 + }, + { + "auxiliary_loss_clip": 0.01430094, + "auxiliary_loss_mlp": 0.01039698, + "balance_loss_clip": 1.2599982, + "balance_loss_mlp": 1.01877642, + "epoch": 0.5369006463249661, + "flos": 26258554033920.0, + "grad_norm": 2.0123728486869172, + "language_loss": 0.66967595, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.69437385, + "num_input_tokens_seen": 192121805, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.20935059, + "step": 8930, + "time_per_iteration": 2.9005870819091797 + }, + { + "auxiliary_loss_clip": 0.01398483, + "auxiliary_loss_mlp": 0.01031556, + "balance_loss_clip": 1.23422778, + "balance_loss_mlp": 1.0122683, + "epoch": 0.5369607695776342, + "flos": 26221878483840.0, + "grad_norm": 1.5428557619981387, + "language_loss": 0.67624468, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.70054507, + "num_input_tokens_seen": 192141765, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.19287109, + "step": 8931, + "time_per_iteration": 2.8487045764923096 + }, + { + "auxiliary_loss_clip": 0.01425965, + "auxiliary_loss_mlp": 0.0103737, + "balance_loss_clip": 1.25845075, + "balance_loss_mlp": 1.0157094, + "epoch": 0.5370208928303021, + "flos": 25019729395200.0, + "grad_norm": 1.949682724500395, + "language_loss": 0.76562423, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.79025757, + "num_input_tokens_seen": 192161560, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.2166748, + "step": 8932, + "time_per_iteration": 2.8382511138916016 + }, + { + "auxiliary_loss_clip": 0.01414882, + "auxiliary_loss_mlp": 0.01035198, + "balance_loss_clip": 1.24960828, + "balance_loss_mlp": 1.01518297, + "epoch": 0.5370810160829701, + "flos": 31803783281280.0, + "grad_norm": 1.8737736474575197, + "language_loss": 0.66652882, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.69102961, + "num_input_tokens_seen": 192180190, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.20019531, + "step": 8933, + "time_per_iteration": 2.9049322605133057 + }, + { + "auxiliary_loss_clip": 0.01413058, + "auxiliary_loss_mlp": 0.01040145, + "balance_loss_clip": 1.24813747, + "balance_loss_mlp": 1.01881802, + "epoch": 0.537141139335638, + "flos": 23853305715840.0, + "grad_norm": 1.618397850895475, + "language_loss": 0.83735406, + "learning_rate": 1.856606505975565e-06, + "loss": 0.86188602, + "num_input_tokens_seen": 192198855, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.21325684, + "step": 8934, + "time_per_iteration": 2.9780325889587402 + }, + { + "auxiliary_loss_clip": 0.01410029, + "auxiliary_loss_mlp": 0.01030571, + "balance_loss_clip": 1.2437849, + "balance_loss_mlp": 1.00974488, + "epoch": 0.537201262588306, + "flos": 18516366120960.0, + "grad_norm": 1.795865389196859, + "language_loss": 0.80171472, + "learning_rate": 1.856218049303999e-06, + "loss": 0.82612073, + "num_input_tokens_seen": 192216555, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.20837402, + "step": 8935, + "time_per_iteration": 2.8895907402038574 + }, + { + "auxiliary_loss_clip": 0.01415633, + "auxiliary_loss_mlp": 0.01035921, + "balance_loss_clip": 1.24640203, + "balance_loss_mlp": 1.01511872, + "epoch": 0.537261385840974, + "flos": 25673102576640.0, + "grad_norm": 1.705570607818306, + "language_loss": 0.846807, + "learning_rate": 1.855829598084659e-06, + "loss": 0.87132251, + "num_input_tokens_seen": 192236910, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.20812988, + "step": 8936, + "time_per_iteration": 2.8970675468444824 + }, + { + "auxiliary_loss_clip": 0.01423974, + "auxiliary_loss_mlp": 0.01038759, + "balance_loss_clip": 1.25647259, + "balance_loss_mlp": 1.01742029, + "epoch": 0.537321509093642, + "flos": 40749975319680.0, + "grad_norm": 1.293126378967101, + "language_loss": 0.73221517, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.75684249, + "num_input_tokens_seen": 192260790, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.21325684, + "step": 8937, + "time_per_iteration": 2.9818408489227295 + }, + { + "auxiliary_loss_clip": 0.01426059, + "auxiliary_loss_mlp": 0.0102987, + "balance_loss_clip": 1.25478601, + "balance_loss_mlp": 1.00875759, + "epoch": 0.53738163234631, + "flos": 17247426428160.0, + "grad_norm": 3.129650767987743, + "language_loss": 0.83071494, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.8552742, + "num_input_tokens_seen": 192277230, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.21118164, + "step": 8938, + "time_per_iteration": 5.655934810638428 + }, + { + "auxiliary_loss_clip": 0.01445762, + "auxiliary_loss_mlp": 0.01035072, + "balance_loss_clip": 1.26995993, + "balance_loss_mlp": 1.01440144, + "epoch": 0.5374417555989779, + "flos": 12829321111680.0, + "grad_norm": 2.6890863820618676, + "language_loss": 0.81703407, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.84184235, + "num_input_tokens_seen": 192292840, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.20678711, + "step": 8939, + "time_per_iteration": 2.802462577819824 + }, + { + "auxiliary_loss_clip": 0.01208704, + "auxiliary_loss_mlp": 0.01027587, + "balance_loss_clip": 1.11386037, + "balance_loss_mlp": 1.00631988, + "epoch": 0.5375018788516459, + "flos": 67286251791360.0, + "grad_norm": 0.7062389003169989, + "language_loss": 0.52476394, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.54712683, + "num_input_tokens_seen": 192358240, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.21289062, + "step": 8940, + "time_per_iteration": 4.801281690597534 + }, + { + "auxiliary_loss_clip": 0.01414582, + "auxiliary_loss_mlp": 0.01031674, + "balance_loss_clip": 1.2497499, + "balance_loss_mlp": 1.0113008, + "epoch": 0.5375620021043138, + "flos": 18123341374080.0, + "grad_norm": 1.9000843037793789, + "language_loss": 0.72590655, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.75036913, + "num_input_tokens_seen": 192377370, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.20373535, + "step": 8941, + "time_per_iteration": 2.855745792388916 + }, + { + "auxiliary_loss_clip": 0.01413088, + "auxiliary_loss_mlp": 0.01036465, + "balance_loss_clip": 1.24863303, + "balance_loss_mlp": 1.01615119, + "epoch": 0.5376221253569818, + "flos": 23159908869120.0, + "grad_norm": 1.6490166189503295, + "language_loss": 0.79996347, + "learning_rate": 1.853499006090237e-06, + "loss": 0.82445896, + "num_input_tokens_seen": 192396450, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.20324707, + "step": 8942, + "time_per_iteration": 2.8407585620880127 + }, + { + "auxiliary_loss_clip": 0.01449928, + "auxiliary_loss_mlp": 0.01036991, + "balance_loss_clip": 1.2771982, + "balance_loss_mlp": 1.01651049, + "epoch": 0.5376822486096497, + "flos": 29984619847680.0, + "grad_norm": 1.6611610232794216, + "language_loss": 0.71256363, + "learning_rate": 1.853110593448911e-06, + "loss": 0.73743284, + "num_input_tokens_seen": 192417390, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.20483398, + "step": 8943, + "time_per_iteration": 2.949086904525757 + }, + { + "auxiliary_loss_clip": 0.01206402, + "auxiliary_loss_mlp": 0.01039201, + "balance_loss_clip": 1.11367047, + "balance_loss_mlp": 1.0199368, + "epoch": 0.5377423718623178, + "flos": 54198726733440.0, + "grad_norm": 0.813589041006005, + "language_loss": 0.59702611, + "learning_rate": 1.852722186377645e-06, + "loss": 0.61948216, + "num_input_tokens_seen": 192478060, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.19238281, + "step": 8944, + "time_per_iteration": 3.2829582691192627 + }, + { + "auxiliary_loss_clip": 0.01458174, + "auxiliary_loss_mlp": 0.01035669, + "balance_loss_clip": 1.28006017, + "balance_loss_mlp": 1.01425898, + "epoch": 0.5378024951149857, + "flos": 23267175586560.0, + "grad_norm": 2.04047266928568, + "language_loss": 0.7845487, + "learning_rate": 1.852333784891169e-06, + "loss": 0.8094871, + "num_input_tokens_seen": 192495985, + "router_z_loss_clip": 1.78125, + "router_z_loss_mlp": 0.21411133, + "step": 8945, + "time_per_iteration": 2.841061592102051 + }, + { + "auxiliary_loss_clip": 0.01428168, + "auxiliary_loss_mlp": 0.01033226, + "balance_loss_clip": 1.2582407, + "balance_loss_mlp": 1.01306748, + "epoch": 0.5378626183676537, + "flos": 24034602205440.0, + "grad_norm": 1.8248802552683994, + "language_loss": 0.69540739, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.72002137, + "num_input_tokens_seen": 192515445, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.20166016, + "step": 8946, + "time_per_iteration": 2.8461451530456543 + }, + { + "auxiliary_loss_clip": 0.01411906, + "auxiliary_loss_mlp": 0.01033131, + "balance_loss_clip": 1.24858689, + "balance_loss_mlp": 1.01274657, + "epoch": 0.5379227416203216, + "flos": 27173090056320.0, + "grad_norm": 1.9986825820327534, + "language_loss": 0.78047812, + "learning_rate": 1.851556998731498e-06, + "loss": 0.80492848, + "num_input_tokens_seen": 192536530, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.20385742, + "step": 8947, + "time_per_iteration": 2.9473037719726562 + }, + { + "auxiliary_loss_clip": 0.01419878, + "auxiliary_loss_mlp": 0.01038116, + "balance_loss_clip": 1.25226617, + "balance_loss_mlp": 1.01791048, + "epoch": 0.5379828648729896, + "flos": 24692499866880.0, + "grad_norm": 3.877371744059826, + "language_loss": 0.60491252, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.62949252, + "num_input_tokens_seen": 192556075, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.20202637, + "step": 8948, + "time_per_iteration": 2.901362895965576 + }, + { + "auxiliary_loss_clip": 0.01443299, + "auxiliary_loss_mlp": 0.01032542, + "balance_loss_clip": 1.27236784, + "balance_loss_mlp": 1.01309884, + "epoch": 0.5380429881256577, + "flos": 22532126261760.0, + "grad_norm": 1.7163230636559528, + "language_loss": 0.79929006, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.82404852, + "num_input_tokens_seen": 192575535, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.19433594, + "step": 8949, + "time_per_iteration": 2.8166987895965576 + }, + { + "auxiliary_loss_clip": 0.01417082, + "auxiliary_loss_mlp": 0.01035645, + "balance_loss_clip": 1.2518183, + "balance_loss_mlp": 1.01400888, + "epoch": 0.5381031113783256, + "flos": 26990843425920.0, + "grad_norm": 1.6434753119784284, + "language_loss": 0.78666568, + "learning_rate": 1.850391861746111e-06, + "loss": 0.81119293, + "num_input_tokens_seen": 192594490, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.21618652, + "step": 8950, + "time_per_iteration": 2.880636215209961 + }, + { + "auxiliary_loss_clip": 0.0141686, + "auxiliary_loss_mlp": 0.01035231, + "balance_loss_clip": 1.25192738, + "balance_loss_mlp": 1.01431024, + "epoch": 0.5381632346309936, + "flos": 24764855581440.0, + "grad_norm": 1.4953268545907838, + "language_loss": 0.73843145, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.76295233, + "num_input_tokens_seen": 192615650, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.20910645, + "step": 8951, + "time_per_iteration": 2.878356456756592 + }, + { + "auxiliary_loss_clip": 0.01444955, + "auxiliary_loss_mlp": 0.01033354, + "balance_loss_clip": 1.27137482, + "balance_loss_mlp": 1.01268339, + "epoch": 0.5382233578836615, + "flos": 15568540433280.0, + "grad_norm": 1.6480328691742299, + "language_loss": 0.77006304, + "learning_rate": 1.849615132097085e-06, + "loss": 0.79484612, + "num_input_tokens_seen": 192633840, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.20678711, + "step": 8952, + "time_per_iteration": 2.804182529449463 + }, + { + "auxiliary_loss_clip": 0.01415179, + "auxiliary_loss_mlp": 0.01033591, + "balance_loss_clip": 1.24869061, + "balance_loss_mlp": 1.01270556, + "epoch": 0.5382834811363295, + "flos": 25095885672960.0, + "grad_norm": 1.4198517555916905, + "language_loss": 0.80000925, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.82449698, + "num_input_tokens_seen": 192655890, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.20874023, + "step": 8953, + "time_per_iteration": 2.85758113861084 + }, + { + "auxiliary_loss_clip": 0.01420201, + "auxiliary_loss_mlp": 0.0103765, + "balance_loss_clip": 1.25490987, + "balance_loss_mlp": 1.01550102, + "epoch": 0.5383436043889974, + "flos": 13305786549120.0, + "grad_norm": 1.9091832665307673, + "language_loss": 0.81318176, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.83776027, + "num_input_tokens_seen": 192673025, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.22143555, + "step": 8954, + "time_per_iteration": 2.813615083694458 + }, + { + "auxiliary_loss_clip": 0.01427551, + "auxiliary_loss_mlp": 0.01032985, + "balance_loss_clip": 1.2594018, + "balance_loss_mlp": 1.01221848, + "epoch": 0.5384037276416654, + "flos": 23049746484480.0, + "grad_norm": 2.6216317461378007, + "language_loss": 0.77287388, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.79747921, + "num_input_tokens_seen": 192692190, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.20776367, + "step": 8955, + "time_per_iteration": 2.872096538543701 + }, + { + "auxiliary_loss_clip": 0.01429392, + "auxiliary_loss_mlp": 0.01033523, + "balance_loss_clip": 1.26105499, + "balance_loss_mlp": 1.01283967, + "epoch": 0.5384638508943334, + "flos": 20640833337600.0, + "grad_norm": 1.5407344513667265, + "language_loss": 0.78748143, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.8121106, + "num_input_tokens_seen": 192710380, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.20690918, + "step": 8956, + "time_per_iteration": 2.8361237049102783 + }, + { + "auxiliary_loss_clip": 0.01208313, + "auxiliary_loss_mlp": 0.01043622, + "balance_loss_clip": 1.11444044, + "balance_loss_mlp": 1.02655113, + "epoch": 0.5385239741470014, + "flos": 66765826391040.0, + "grad_norm": 0.8642590157164473, + "language_loss": 0.63478488, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.65730423, + "num_input_tokens_seen": 192768995, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.17089844, + "step": 8957, + "time_per_iteration": 3.274256467819214 + }, + { + "auxiliary_loss_clip": 0.01207879, + "auxiliary_loss_mlp": 0.01032313, + "balance_loss_clip": 1.11468422, + "balance_loss_mlp": 1.0134306, + "epoch": 0.5385840973996693, + "flos": 64749277301760.0, + "grad_norm": 0.7370190205337258, + "language_loss": 0.51688123, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.53928316, + "num_input_tokens_seen": 192825585, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.18847656, + "step": 8958, + "time_per_iteration": 3.33748459815979 + }, + { + "auxiliary_loss_clip": 0.01448755, + "auxiliary_loss_mlp": 0.01039203, + "balance_loss_clip": 1.27758956, + "balance_loss_mlp": 1.01807928, + "epoch": 0.5386442206523373, + "flos": 26153232842880.0, + "grad_norm": 1.6910147276020522, + "language_loss": 0.77894878, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.80382836, + "num_input_tokens_seen": 192847335, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.21118164, + "step": 8959, + "time_per_iteration": 2.889307737350464 + }, + { + "auxiliary_loss_clip": 0.01438348, + "auxiliary_loss_mlp": 0.01038122, + "balance_loss_clip": 1.26826692, + "balance_loss_mlp": 1.01724863, + "epoch": 0.5387043439050052, + "flos": 18258913353600.0, + "grad_norm": 2.325495932986291, + "language_loss": 0.8492732, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.87403792, + "num_input_tokens_seen": 192862205, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.20874023, + "step": 8960, + "time_per_iteration": 2.8324408531188965 + }, + { + "auxiliary_loss_clip": 0.01423746, + "auxiliary_loss_mlp": 0.01032598, + "balance_loss_clip": 1.25631177, + "balance_loss_mlp": 1.01214194, + "epoch": 0.5387644671576732, + "flos": 29800156222080.0, + "grad_norm": 1.4609898878660013, + "language_loss": 0.78895223, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.81351572, + "num_input_tokens_seen": 192883695, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.20458984, + "step": 8961, + "time_per_iteration": 2.9115750789642334 + }, + { + "auxiliary_loss_clip": 0.01427444, + "auxiliary_loss_mlp": 0.01043902, + "balance_loss_clip": 1.25821018, + "balance_loss_mlp": 1.02276587, + "epoch": 0.5388245904103413, + "flos": 22382528394240.0, + "grad_norm": 2.3369410929456165, + "language_loss": 0.84721607, + "learning_rate": 1.845731828364681e-06, + "loss": 0.87192953, + "num_input_tokens_seen": 192900190, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.21118164, + "step": 8962, + "time_per_iteration": 4.32567286491394 + }, + { + "auxiliary_loss_clip": 0.01204558, + "auxiliary_loss_mlp": 0.01033671, + "balance_loss_clip": 1.11267567, + "balance_loss_mlp": 1.0141207, + "epoch": 0.5388847136630092, + "flos": 69838111820160.0, + "grad_norm": 0.7326750547948051, + "language_loss": 0.54170394, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.5640862, + "num_input_tokens_seen": 192958675, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.1953125, + "step": 8963, + "time_per_iteration": 3.3045709133148193 + }, + { + "auxiliary_loss_clip": 0.01206057, + "auxiliary_loss_mlp": 0.01024358, + "balance_loss_clip": 1.11266994, + "balance_loss_mlp": 1.00623786, + "epoch": 0.5389448369156772, + "flos": 69856390719360.0, + "grad_norm": 0.8081750044033027, + "language_loss": 0.63489872, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.6572029, + "num_input_tokens_seen": 193033135, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.18164062, + "step": 8964, + "time_per_iteration": 3.3794405460357666 + }, + { + "auxiliary_loss_clip": 0.01443927, + "auxiliary_loss_mlp": 0.01036325, + "balance_loss_clip": 1.2700367, + "balance_loss_mlp": 1.01518869, + "epoch": 0.5390049601683451, + "flos": 31734911416320.0, + "grad_norm": 1.6803053996999882, + "language_loss": 0.70667315, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.73147565, + "num_input_tokens_seen": 193055570, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.21154785, + "step": 8965, + "time_per_iteration": 2.9054064750671387 + }, + { + "auxiliary_loss_clip": 0.0144338, + "auxiliary_loss_mlp": 0.01039576, + "balance_loss_clip": 1.27119017, + "balance_loss_mlp": 1.01777291, + "epoch": 0.5390650834210131, + "flos": 18122481722880.0, + "grad_norm": 1.9533646333418373, + "language_loss": 0.82758373, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.8524133, + "num_input_tokens_seen": 193073120, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.21801758, + "step": 8966, + "time_per_iteration": 2.78814959526062 + }, + { + "auxiliary_loss_clip": 0.01428164, + "auxiliary_loss_mlp": 0.01033218, + "balance_loss_clip": 1.26118183, + "balance_loss_mlp": 1.0128212, + "epoch": 0.539125206673681, + "flos": 17424605640960.0, + "grad_norm": 2.1499490910245713, + "language_loss": 0.73314631, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.75776005, + "num_input_tokens_seen": 193090105, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.20385742, + "step": 8967, + "time_per_iteration": 2.830320358276367 + }, + { + "auxiliary_loss_clip": 0.01421334, + "auxiliary_loss_mlp": 0.01031817, + "balance_loss_clip": 1.25373745, + "balance_loss_mlp": 1.0112052, + "epoch": 0.539185329926349, + "flos": 22208742541440.0, + "grad_norm": 1.7773521615617205, + "language_loss": 0.82324696, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.8477785, + "num_input_tokens_seen": 193109325, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.20617676, + "step": 8968, + "time_per_iteration": 2.8684980869293213 + }, + { + "auxiliary_loss_clip": 0.01426907, + "auxiliary_loss_mlp": 0.01031918, + "balance_loss_clip": 1.25685215, + "balance_loss_mlp": 1.01123476, + "epoch": 0.539245453179017, + "flos": 21444437813760.0, + "grad_norm": 1.949239452855266, + "language_loss": 0.74837804, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.77296633, + "num_input_tokens_seen": 193130595, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.20690918, + "step": 8969, + "time_per_iteration": 2.853201389312744 + }, + { + "auxiliary_loss_clip": 0.01437077, + "auxiliary_loss_mlp": 0.01033737, + "balance_loss_clip": 1.26357996, + "balance_loss_mlp": 1.01280379, + "epoch": 0.539305576431685, + "flos": 20743258861440.0, + "grad_norm": 2.8336195709217082, + "language_loss": 0.83057809, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.85528624, + "num_input_tokens_seen": 193148930, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.20947266, + "step": 8970, + "time_per_iteration": 2.82312273979187 + }, + { + "auxiliary_loss_clip": 0.01423289, + "auxiliary_loss_mlp": 0.010348, + "balance_loss_clip": 1.25801837, + "balance_loss_mlp": 1.01464176, + "epoch": 0.5393656996843529, + "flos": 30932890508160.0, + "grad_norm": 1.402408770740932, + "language_loss": 0.75941885, + "learning_rate": 1.842237354749146e-06, + "loss": 0.78399974, + "num_input_tokens_seen": 193170140, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.20166016, + "step": 8971, + "time_per_iteration": 2.885859727859497 + }, + { + "auxiliary_loss_clip": 0.01197938, + "auxiliary_loss_mlp": 0.01011492, + "balance_loss_clip": 1.10799575, + "balance_loss_mlp": 0.99594766, + "epoch": 0.5394258229370209, + "flos": 50341323968640.0, + "grad_norm": 0.8834873485070505, + "language_loss": 0.60382694, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.62592131, + "num_input_tokens_seen": 193227235, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.15527344, + "step": 8972, + "time_per_iteration": 3.390813112258911 + }, + { + "auxiliary_loss_clip": 0.01423888, + "auxiliary_loss_mlp": 0.0103399, + "balance_loss_clip": 1.25396931, + "balance_loss_mlp": 1.01379597, + "epoch": 0.5394859461896888, + "flos": 25423160446080.0, + "grad_norm": 1.586718592731762, + "language_loss": 0.79044104, + "learning_rate": 1.841460870485045e-06, + "loss": 0.81501985, + "num_input_tokens_seen": 193248435, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.2019043, + "step": 8973, + "time_per_iteration": 5.8584747314453125 + }, + { + "auxiliary_loss_clip": 0.01450065, + "auxiliary_loss_mlp": 0.01037732, + "balance_loss_clip": 1.2721504, + "balance_loss_mlp": 1.01609588, + "epoch": 0.5395460694423568, + "flos": 25488050768640.0, + "grad_norm": 1.83022667258761, + "language_loss": 0.74905324, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.77393126, + "num_input_tokens_seen": 193267490, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.21630859, + "step": 8974, + "time_per_iteration": 2.9077672958374023 + }, + { + "auxiliary_loss_clip": 0.0119964, + "auxiliary_loss_mlp": 0.01018476, + "balance_loss_clip": 1.10754347, + "balance_loss_mlp": 0.99816316, + "epoch": 0.5396061926950249, + "flos": 53277132522240.0, + "grad_norm": 0.7410595503179584, + "language_loss": 0.51195085, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.534132, + "num_input_tokens_seen": 193326050, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.203125, + "step": 8975, + "time_per_iteration": 4.790737628936768 + }, + { + "auxiliary_loss_clip": 0.01424118, + "auxiliary_loss_mlp": 0.0103763, + "balance_loss_clip": 1.255795, + "balance_loss_mlp": 1.01739967, + "epoch": 0.5396663159476928, + "flos": 26736557794560.0, + "grad_norm": 1.877260784147271, + "language_loss": 0.72972363, + "learning_rate": 1.840296189214344e-06, + "loss": 0.75434113, + "num_input_tokens_seen": 193348785, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.20227051, + "step": 8976, + "time_per_iteration": 3.0006206035614014 + }, + { + "auxiliary_loss_clip": 0.014348, + "auxiliary_loss_mlp": 0.01041555, + "balance_loss_clip": 1.2667383, + "balance_loss_mlp": 1.02099133, + "epoch": 0.5397264392003608, + "flos": 23262515372160.0, + "grad_norm": 2.555205938058129, + "language_loss": 0.71158767, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.73635125, + "num_input_tokens_seen": 193367080, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.20556641, + "step": 8977, + "time_per_iteration": 2.883582353591919 + }, + { + "auxiliary_loss_clip": 0.01437383, + "auxiliary_loss_mlp": 0.01041695, + "balance_loss_clip": 1.26557648, + "balance_loss_mlp": 1.01855695, + "epoch": 0.5397865624530287, + "flos": 18302782826880.0, + "grad_norm": 1.7828817385215254, + "language_loss": 0.73909175, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.76388258, + "num_input_tokens_seen": 193383715, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.23156738, + "step": 8978, + "time_per_iteration": 2.8186049461364746 + }, + { + "auxiliary_loss_clip": 0.01439972, + "auxiliary_loss_mlp": 0.01040002, + "balance_loss_clip": 1.26562917, + "balance_loss_mlp": 1.01823449, + "epoch": 0.5398466857056967, + "flos": 15304074721920.0, + "grad_norm": 2.8524206509386967, + "language_loss": 0.74713862, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.77193838, + "num_input_tokens_seen": 193400560, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.21777344, + "step": 8979, + "time_per_iteration": 2.847637176513672 + }, + { + "auxiliary_loss_clip": 0.01442909, + "auxiliary_loss_mlp": 0.01045469, + "balance_loss_clip": 1.26891875, + "balance_loss_mlp": 1.0235827, + "epoch": 0.5399068089583646, + "flos": 17830706135040.0, + "grad_norm": 1.953831979204437, + "language_loss": 0.77401197, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.79889572, + "num_input_tokens_seen": 193418680, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.21875, + "step": 8980, + "time_per_iteration": 2.814945697784424 + }, + { + "auxiliary_loss_clip": 0.01423362, + "auxiliary_loss_mlp": 0.01039551, + "balance_loss_clip": 1.25379133, + "balance_loss_mlp": 1.01828384, + "epoch": 0.5399669322110326, + "flos": 27393233846400.0, + "grad_norm": 1.8524360861150808, + "language_loss": 0.82941604, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.85404515, + "num_input_tokens_seen": 193439310, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.21276855, + "step": 8981, + "time_per_iteration": 2.8697774410247803 + }, + { + "auxiliary_loss_clip": 0.01426541, + "auxiliary_loss_mlp": 0.0104374, + "balance_loss_clip": 1.2543447, + "balance_loss_mlp": 1.02242494, + "epoch": 0.5400270554637006, + "flos": 20458858176000.0, + "grad_norm": 1.831212148143244, + "language_loss": 0.67514265, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.69984555, + "num_input_tokens_seen": 193458115, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.21313477, + "step": 8982, + "time_per_iteration": 2.825181484222412 + }, + { + "auxiliary_loss_clip": 0.01428235, + "auxiliary_loss_mlp": 0.0104487, + "balance_loss_clip": 1.26042676, + "balance_loss_mlp": 1.02553427, + "epoch": 0.5400871787163686, + "flos": 21699628341120.0, + "grad_norm": 1.669578840398646, + "language_loss": 0.83441806, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.8591491, + "num_input_tokens_seen": 193477365, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.19335938, + "step": 8983, + "time_per_iteration": 2.8300886154174805 + }, + { + "auxiliary_loss_clip": 0.01418362, + "auxiliary_loss_mlp": 0.01040033, + "balance_loss_clip": 1.25051641, + "balance_loss_mlp": 1.01806259, + "epoch": 0.5401473019690365, + "flos": 19213156327680.0, + "grad_norm": 1.7999974974715864, + "language_loss": 0.72459704, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.74918097, + "num_input_tokens_seen": 193495595, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.21960449, + "step": 8984, + "time_per_iteration": 2.788011312484741 + }, + { + "auxiliary_loss_clip": 0.01456014, + "auxiliary_loss_mlp": 0.01038325, + "balance_loss_clip": 1.28040481, + "balance_loss_mlp": 1.01730871, + "epoch": 0.5402074252217045, + "flos": 20636082633600.0, + "grad_norm": 1.9355153499945539, + "language_loss": 0.81204146, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.83698487, + "num_input_tokens_seen": 193514035, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.21008301, + "step": 8985, + "time_per_iteration": 2.8138253688812256 + }, + { + "auxiliary_loss_clip": 0.01398136, + "auxiliary_loss_mlp": 0.01030877, + "balance_loss_clip": 1.23723912, + "balance_loss_mlp": 1.01047993, + "epoch": 0.5402675484743724, + "flos": 24984411189120.0, + "grad_norm": 1.5089977535448962, + "language_loss": 0.79450333, + "learning_rate": 1.83641431418363e-06, + "loss": 0.81879342, + "num_input_tokens_seen": 193535445, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.20410156, + "step": 8986, + "time_per_iteration": 2.8650524616241455 + }, + { + "auxiliary_loss_clip": 0.01421681, + "auxiliary_loss_mlp": 0.01034027, + "balance_loss_clip": 1.25376916, + "balance_loss_mlp": 1.01314163, + "epoch": 0.5403276717270404, + "flos": 19467215735040.0, + "grad_norm": 1.6536001458292293, + "language_loss": 0.77683431, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.80139136, + "num_input_tokens_seen": 193554780, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.20874023, + "step": 8987, + "time_per_iteration": 2.857736349105835 + }, + { + "auxiliary_loss_clip": 0.01430842, + "auxiliary_loss_mlp": 0.01035059, + "balance_loss_clip": 1.26120782, + "balance_loss_mlp": 1.01454306, + "epoch": 0.5403877949797083, + "flos": 18451294819200.0, + "grad_norm": 1.8466210808749368, + "language_loss": 0.72386605, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.74852508, + "num_input_tokens_seen": 193573580, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.20532227, + "step": 8988, + "time_per_iteration": 2.8420827388763428 + }, + { + "auxiliary_loss_clip": 0.01433871, + "auxiliary_loss_mlp": 0.01038285, + "balance_loss_clip": 1.26267183, + "balance_loss_mlp": 1.01371634, + "epoch": 0.5404479182323764, + "flos": 28304466998400.0, + "grad_norm": 4.045882411279951, + "language_loss": 0.69125223, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.71597379, + "num_input_tokens_seen": 193590490, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.24572754, + "step": 8989, + "time_per_iteration": 2.9180662631988525 + }, + { + "auxiliary_loss_clip": 0.01426703, + "auxiliary_loss_mlp": 0.01039369, + "balance_loss_clip": 1.25628281, + "balance_loss_mlp": 1.01818538, + "epoch": 0.5405080414850444, + "flos": 23377654684800.0, + "grad_norm": 1.4565178976040045, + "language_loss": 0.78692031, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.81158102, + "num_input_tokens_seen": 193609900, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.21179199, + "step": 8990, + "time_per_iteration": 2.93687105178833 + }, + { + "auxiliary_loss_clip": 0.01421698, + "auxiliary_loss_mlp": 0.01036105, + "balance_loss_clip": 1.25359011, + "balance_loss_mlp": 1.01579165, + "epoch": 0.5405681647377123, + "flos": 21116484368640.0, + "grad_norm": 1.616427644708359, + "language_loss": 0.69514138, + "learning_rate": 1.834473608367745e-06, + "loss": 0.71971941, + "num_input_tokens_seen": 193629775, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.203125, + "step": 8991, + "time_per_iteration": 2.887840509414673 + }, + { + "auxiliary_loss_clip": 0.01428886, + "auxiliary_loss_mlp": 0.01032866, + "balance_loss_clip": 1.25942206, + "balance_loss_mlp": 1.01238632, + "epoch": 0.5406282879903803, + "flos": 20458948665600.0, + "grad_norm": 1.732940196385924, + "language_loss": 0.76927686, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.79389441, + "num_input_tokens_seen": 193648070, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.20471191, + "step": 8992, + "time_per_iteration": 2.828887939453125 + }, + { + "auxiliary_loss_clip": 0.01432085, + "auxiliary_loss_mlp": 0.01035207, + "balance_loss_clip": 1.26124644, + "balance_loss_mlp": 1.01545382, + "epoch": 0.5406884112430482, + "flos": 14217472149120.0, + "grad_norm": 2.7720614914376, + "language_loss": 0.77117658, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.7958495, + "num_input_tokens_seen": 193665060, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.19750977, + "step": 8993, + "time_per_iteration": 2.8124849796295166 + }, + { + "auxiliary_loss_clip": 0.01423831, + "auxiliary_loss_mlp": 0.01033162, + "balance_loss_clip": 1.25739157, + "balance_loss_mlp": 1.01232445, + "epoch": 0.5407485344957162, + "flos": 23885366296320.0, + "grad_norm": 1.751813055868749, + "language_loss": 0.71057951, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.73514938, + "num_input_tokens_seen": 193683620, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.20837402, + "step": 8994, + "time_per_iteration": 2.8828647136688232 + }, + { + "auxiliary_loss_clip": 0.01436553, + "auxiliary_loss_mlp": 0.01035802, + "balance_loss_clip": 1.2648443, + "balance_loss_mlp": 1.01426136, + "epoch": 0.5408086577483842, + "flos": 23158777749120.0, + "grad_norm": 1.76609591056567, + "language_loss": 0.75739348, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.78211707, + "num_input_tokens_seen": 193702990, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.2154541, + "step": 8995, + "time_per_iteration": 2.842376947402954 + }, + { + "auxiliary_loss_clip": 0.01407771, + "auxiliary_loss_mlp": 0.01033297, + "balance_loss_clip": 1.24318898, + "balance_loss_mlp": 1.01362777, + "epoch": 0.5408687810010522, + "flos": 18780424629120.0, + "grad_norm": 2.3533047585742985, + "language_loss": 0.73768067, + "learning_rate": 1.832533059471282e-06, + "loss": 0.76209134, + "num_input_tokens_seen": 193721785, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19665527, + "step": 8996, + "time_per_iteration": 2.85122013092041 + }, + { + "auxiliary_loss_clip": 0.01410859, + "auxiliary_loss_mlp": 0.01040644, + "balance_loss_clip": 1.24557853, + "balance_loss_mlp": 1.02006817, + "epoch": 0.5409289042537201, + "flos": 13889563948800.0, + "grad_norm": 1.7570971174379282, + "language_loss": 0.74015641, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.76467144, + "num_input_tokens_seen": 193740315, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.20568848, + "step": 8997, + "time_per_iteration": 4.35371208190918 + }, + { + "auxiliary_loss_clip": 0.01436945, + "auxiliary_loss_mlp": 0.01035751, + "balance_loss_clip": 1.26725101, + "balance_loss_mlp": 1.01487756, + "epoch": 0.5409890275063881, + "flos": 14473522327680.0, + "grad_norm": 2.4362652965827882, + "language_loss": 0.72732371, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.75205064, + "num_input_tokens_seen": 193757580, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.20874023, + "step": 8998, + "time_per_iteration": 2.919760227203369 + }, + { + "auxiliary_loss_clip": 0.01432738, + "auxiliary_loss_mlp": 0.01034248, + "balance_loss_clip": 1.26422215, + "balance_loss_mlp": 1.01375604, + "epoch": 0.541049150759056, + "flos": 48993676306560.0, + "grad_norm": 1.6167789804925377, + "language_loss": 0.71070206, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.73537195, + "num_input_tokens_seen": 193780965, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.20483398, + "step": 8999, + "time_per_iteration": 3.110267162322998 + }, + { + "auxiliary_loss_clip": 0.01429224, + "auxiliary_loss_mlp": 0.01033268, + "balance_loss_clip": 1.26166725, + "balance_loss_mlp": 1.013098, + "epoch": 0.541109274011724, + "flos": 18155990136960.0, + "grad_norm": 4.268038388493625, + "language_loss": 0.81602931, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.84065425, + "num_input_tokens_seen": 193797855, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.20178223, + "step": 9000, + "time_per_iteration": 2.7846786975860596 + }, + { + "auxiliary_loss_clip": 0.01419266, + "auxiliary_loss_mlp": 0.01034585, + "balance_loss_clip": 1.25172198, + "balance_loss_mlp": 1.0130918, + "epoch": 0.541169397264392, + "flos": 20532435500160.0, + "grad_norm": 1.9084544103321934, + "language_loss": 0.73852444, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.76306295, + "num_input_tokens_seen": 193817375, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.21484375, + "step": 9001, + "time_per_iteration": 2.8318424224853516 + }, + { + "auxiliary_loss_clip": 0.01437943, + "auxiliary_loss_mlp": 0.01034081, + "balance_loss_clip": 1.26570177, + "balance_loss_mlp": 1.0121944, + "epoch": 0.54122952051706, + "flos": 20052712437120.0, + "grad_norm": 2.1629965730297145, + "language_loss": 0.86094469, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.885665, + "num_input_tokens_seen": 193832205, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.21887207, + "step": 9002, + "time_per_iteration": 2.814337730407715 + }, + { + "auxiliary_loss_clip": 0.01430596, + "auxiliary_loss_mlp": 0.010349, + "balance_loss_clip": 1.26375592, + "balance_loss_mlp": 1.01492071, + "epoch": 0.541289643769728, + "flos": 19071521544960.0, + "grad_norm": 2.0588551274814075, + "language_loss": 0.78756249, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.81221741, + "num_input_tokens_seen": 193849830, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.19970703, + "step": 9003, + "time_per_iteration": 2.792806386947632 + }, + { + "auxiliary_loss_clip": 0.01432448, + "auxiliary_loss_mlp": 0.01034956, + "balance_loss_clip": 1.26378584, + "balance_loss_mlp": 1.01343918, + "epoch": 0.5413497670223959, + "flos": 22392391760640.0, + "grad_norm": 1.8002831305398725, + "language_loss": 0.70410299, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.72877705, + "num_input_tokens_seen": 193869945, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.21520996, + "step": 9004, + "time_per_iteration": 2.8294837474823 + }, + { + "auxiliary_loss_clip": 0.01209547, + "auxiliary_loss_mlp": 0.01061536, + "balance_loss_clip": 1.1134572, + "balance_loss_mlp": 1.03502393, + "epoch": 0.5414098902750639, + "flos": 70063413517440.0, + "grad_norm": 1.3888896692995731, + "language_loss": 0.59272075, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.61543155, + "num_input_tokens_seen": 193930860, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.265625, + "step": 9005, + "time_per_iteration": 3.472856044769287 + }, + { + "auxiliary_loss_clip": 0.01431084, + "auxiliary_loss_mlp": 0.01038095, + "balance_loss_clip": 1.26063335, + "balance_loss_mlp": 1.01927209, + "epoch": 0.5414700135277318, + "flos": 21809021564160.0, + "grad_norm": 1.8128510358825087, + "language_loss": 0.79211777, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.81680954, + "num_input_tokens_seen": 193949075, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.18835449, + "step": 9006, + "time_per_iteration": 2.8310296535491943 + }, + { + "auxiliary_loss_clip": 0.01419508, + "auxiliary_loss_mlp": 0.01036432, + "balance_loss_clip": 1.25318682, + "balance_loss_mlp": 1.01765656, + "epoch": 0.5415301367803999, + "flos": 16916215357440.0, + "grad_norm": 1.7781748116201377, + "language_loss": 0.83971131, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.86427069, + "num_input_tokens_seen": 193967630, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.1875, + "step": 9007, + "time_per_iteration": 2.841433525085449 + }, + { + "auxiliary_loss_clip": 0.01423298, + "auxiliary_loss_mlp": 0.01036926, + "balance_loss_clip": 1.25536466, + "balance_loss_mlp": 1.01570702, + "epoch": 0.5415902600330678, + "flos": 25715569461120.0, + "grad_norm": 1.8859863271913577, + "language_loss": 0.67859268, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.70319492, + "num_input_tokens_seen": 193988730, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.2121582, + "step": 9008, + "time_per_iteration": 4.418330907821655 + }, + { + "auxiliary_loss_clip": 0.0144922, + "auxiliary_loss_mlp": 0.0103635, + "balance_loss_clip": 1.27506316, + "balance_loss_mlp": 1.01508272, + "epoch": 0.5416503832857358, + "flos": 19217454583680.0, + "grad_norm": 2.5197503073992724, + "language_loss": 0.75140703, + "learning_rate": 1.827488379924234e-06, + "loss": 0.77626276, + "num_input_tokens_seen": 194005160, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.21276855, + "step": 9009, + "time_per_iteration": 2.8470699787139893 + }, + { + "auxiliary_loss_clip": 0.01435415, + "auxiliary_loss_mlp": 0.01037966, + "balance_loss_clip": 1.26409507, + "balance_loss_mlp": 1.01823699, + "epoch": 0.5417105065384037, + "flos": 12721330477440.0, + "grad_norm": 4.406774708242277, + "language_loss": 0.88438904, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.90912282, + "num_input_tokens_seen": 194021700, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.19726562, + "step": 9010, + "time_per_iteration": 4.251540184020996 + }, + { + "auxiliary_loss_clip": 0.01433421, + "auxiliary_loss_mlp": 0.01038981, + "balance_loss_clip": 1.2644906, + "balance_loss_mlp": 1.0188942, + "epoch": 0.5417706297910717, + "flos": 30348027233280.0, + "grad_norm": 8.245887491059138, + "language_loss": 0.66197205, + "learning_rate": 1.826712372694122e-06, + "loss": 0.68669599, + "num_input_tokens_seen": 194042620, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.20092773, + "step": 9011, + "time_per_iteration": 2.906947612762451 + }, + { + "auxiliary_loss_clip": 0.01426366, + "auxiliary_loss_mlp": 0.01035803, + "balance_loss_clip": 1.25915635, + "balance_loss_mlp": 1.01671791, + "epoch": 0.5418307530437396, + "flos": 29032367644800.0, + "grad_norm": 2.6719770667710914, + "language_loss": 0.79918057, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.82380223, + "num_input_tokens_seen": 194061800, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.19091797, + "step": 9012, + "time_per_iteration": 2.8591902256011963 + }, + { + "auxiliary_loss_clip": 0.01432107, + "auxiliary_loss_mlp": 0.01035332, + "balance_loss_clip": 1.26306772, + "balance_loss_mlp": 1.01566195, + "epoch": 0.5418908762964076, + "flos": 16882254495360.0, + "grad_norm": 1.8957806315297538, + "language_loss": 0.75777745, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.78245187, + "num_input_tokens_seen": 194079890, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.19665527, + "step": 9013, + "time_per_iteration": 2.822934627532959 + }, + { + "auxiliary_loss_clip": 0.01450231, + "auxiliary_loss_mlp": 0.01039492, + "balance_loss_clip": 1.27640676, + "balance_loss_mlp": 1.01864243, + "epoch": 0.5419509995490756, + "flos": 18958146779520.0, + "grad_norm": 5.507497368292329, + "language_loss": 0.73131073, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.756208, + "num_input_tokens_seen": 194097625, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.20849609, + "step": 9014, + "time_per_iteration": 2.8279306888580322 + }, + { + "auxiliary_loss_clip": 0.01428734, + "auxiliary_loss_mlp": 0.01038564, + "balance_loss_clip": 1.26131296, + "balance_loss_mlp": 1.01777422, + "epoch": 0.5420111228017436, + "flos": 18086892048000.0, + "grad_norm": 1.5963577225886043, + "language_loss": 0.8141551, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.83882809, + "num_input_tokens_seen": 194116055, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.20776367, + "step": 9015, + "time_per_iteration": 2.8383967876434326 + }, + { + "auxiliary_loss_clip": 0.01438045, + "auxiliary_loss_mlp": 0.01039854, + "balance_loss_clip": 1.26566541, + "balance_loss_mlp": 1.01846838, + "epoch": 0.5420712460544116, + "flos": 19070933362560.0, + "grad_norm": 2.366538450955225, + "language_loss": 0.82370704, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.84848601, + "num_input_tokens_seen": 194130365, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.21386719, + "step": 9016, + "time_per_iteration": 2.8019955158233643 + }, + { + "auxiliary_loss_clip": 0.01425881, + "auxiliary_loss_mlp": 0.01035815, + "balance_loss_clip": 1.25867844, + "balance_loss_mlp": 1.01534641, + "epoch": 0.5421313693070795, + "flos": 18196285271040.0, + "grad_norm": 1.6387790798944495, + "language_loss": 0.8200525, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.84466946, + "num_input_tokens_seen": 194148975, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.20471191, + "step": 9017, + "time_per_iteration": 2.8908965587615967 + }, + { + "auxiliary_loss_clip": 0.01421537, + "auxiliary_loss_mlp": 0.01041489, + "balance_loss_clip": 1.25875854, + "balance_loss_mlp": 1.01991248, + "epoch": 0.5421914925597475, + "flos": 13013739492480.0, + "grad_norm": 1.5897500874077168, + "language_loss": 0.78323913, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.80786943, + "num_input_tokens_seen": 194167185, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.21569824, + "step": 9018, + "time_per_iteration": 2.8278541564941406 + }, + { + "auxiliary_loss_clip": 0.01438623, + "auxiliary_loss_mlp": 0.01038998, + "balance_loss_clip": 1.26482177, + "balance_loss_mlp": 1.01805294, + "epoch": 0.5422516158124154, + "flos": 46775832526080.0, + "grad_norm": 1.4565404085814075, + "language_loss": 0.66593969, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.69071591, + "num_input_tokens_seen": 194192840, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.20959473, + "step": 9019, + "time_per_iteration": 3.06899094581604 + }, + { + "auxiliary_loss_clip": 0.01409133, + "auxiliary_loss_mlp": 0.01033935, + "balance_loss_clip": 1.2464391, + "balance_loss_mlp": 1.0140028, + "epoch": 0.5423117390650835, + "flos": 31771044028800.0, + "grad_norm": 2.5363525183448616, + "language_loss": 0.70842516, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.73285586, + "num_input_tokens_seen": 194213150, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19934082, + "step": 9020, + "time_per_iteration": 2.9403109550476074 + }, + { + "auxiliary_loss_clip": 0.01407744, + "auxiliary_loss_mlp": 0.01036702, + "balance_loss_clip": 1.24431634, + "balance_loss_mlp": 1.01587653, + "epoch": 0.5423718623177514, + "flos": 27214244841600.0, + "grad_norm": 1.4534641032114617, + "language_loss": 0.80309725, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.82754171, + "num_input_tokens_seen": 194234665, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.20825195, + "step": 9021, + "time_per_iteration": 2.9365930557250977 + }, + { + "auxiliary_loss_clip": 0.01413097, + "auxiliary_loss_mlp": 0.01038277, + "balance_loss_clip": 1.24717414, + "balance_loss_mlp": 1.01699829, + "epoch": 0.5424319855704194, + "flos": 23555965017600.0, + "grad_norm": 1.7257260235664884, + "language_loss": 0.79799879, + "learning_rate": 1.822444805916788e-06, + "loss": 0.82251257, + "num_input_tokens_seen": 194253790, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.21276855, + "step": 9022, + "time_per_iteration": 2.8879082202911377 + }, + { + "auxiliary_loss_clip": 0.01411936, + "auxiliary_loss_mlp": 0.01037153, + "balance_loss_clip": 1.24600077, + "balance_loss_mlp": 1.01629162, + "epoch": 0.5424921088230873, + "flos": 26627074081920.0, + "grad_norm": 4.198052277733731, + "language_loss": 0.83055651, + "learning_rate": 1.822056885403915e-06, + "loss": 0.85504746, + "num_input_tokens_seen": 194274950, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.20874023, + "step": 9023, + "time_per_iteration": 2.87813401222229 + }, + { + "auxiliary_loss_clip": 0.01417061, + "auxiliary_loss_mlp": 0.01037585, + "balance_loss_clip": 1.24888682, + "balance_loss_mlp": 1.01733136, + "epoch": 0.5425522320757553, + "flos": 23597346026880.0, + "grad_norm": 1.6137985307750142, + "language_loss": 0.72168696, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.7462334, + "num_input_tokens_seen": 194296155, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.20263672, + "step": 9024, + "time_per_iteration": 2.8306806087493896 + }, + { + "auxiliary_loss_clip": 0.01419108, + "auxiliary_loss_mlp": 0.01037722, + "balance_loss_clip": 1.24974191, + "balance_loss_mlp": 1.0170629, + "epoch": 0.5426123553284232, + "flos": 30604665594240.0, + "grad_norm": 2.3272578719603487, + "language_loss": 0.65865695, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.68322521, + "num_input_tokens_seen": 194318025, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.20666504, + "step": 9025, + "time_per_iteration": 2.8984808921813965 + }, + { + "auxiliary_loss_clip": 0.0143682, + "auxiliary_loss_mlp": 0.01031719, + "balance_loss_clip": 1.26440167, + "balance_loss_mlp": 1.01128614, + "epoch": 0.5426724785810912, + "flos": 12502770255360.0, + "grad_norm": 6.175099596573325, + "language_loss": 0.74893928, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.77362466, + "num_input_tokens_seen": 194336150, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.2043457, + "step": 9026, + "time_per_iteration": 2.808222770690918 + }, + { + "auxiliary_loss_clip": 0.01416621, + "auxiliary_loss_mlp": 0.01041276, + "balance_loss_clip": 1.24784195, + "balance_loss_mlp": 1.01809001, + "epoch": 0.5427326018337592, + "flos": 26074950059520.0, + "grad_norm": 2.055632565277127, + "language_loss": 0.79767925, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.82225823, + "num_input_tokens_seen": 194355980, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.23205566, + "step": 9027, + "time_per_iteration": 2.8749659061431885 + }, + { + "auxiliary_loss_clip": 0.01211062, + "auxiliary_loss_mlp": 0.01061495, + "balance_loss_clip": 1.11849284, + "balance_loss_mlp": 1.0319314, + "epoch": 0.5427927250864272, + "flos": 66016135733760.0, + "grad_norm": 0.7525474905907152, + "language_loss": 0.56618929, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.58891487, + "num_input_tokens_seen": 194422660, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.29492188, + "step": 9028, + "time_per_iteration": 3.394814968109131 + }, + { + "auxiliary_loss_clip": 0.01412644, + "auxiliary_loss_mlp": 0.0103631, + "balance_loss_clip": 1.24426734, + "balance_loss_mlp": 1.01391065, + "epoch": 0.5428528483390952, + "flos": 19985152671360.0, + "grad_norm": 2.0523283784278177, + "language_loss": 0.78797269, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.81246221, + "num_input_tokens_seen": 194438545, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.22375488, + "step": 9029, + "time_per_iteration": 2.8460285663604736 + }, + { + "auxiliary_loss_clip": 0.01404711, + "auxiliary_loss_mlp": 0.01044773, + "balance_loss_clip": 1.23879433, + "balance_loss_mlp": 1.02233744, + "epoch": 0.5429129715917631, + "flos": 21841579837440.0, + "grad_norm": 1.51280404985086, + "language_loss": 0.83646327, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.8609581, + "num_input_tokens_seen": 194458060, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.2244873, + "step": 9030, + "time_per_iteration": 2.8728456497192383 + }, + { + "auxiliary_loss_clip": 0.01405991, + "auxiliary_loss_mlp": 0.01038092, + "balance_loss_clip": 1.2396394, + "balance_loss_mlp": 1.01609778, + "epoch": 0.5429730948444311, + "flos": 27794312167680.0, + "grad_norm": 1.8285167872649473, + "language_loss": 0.75795197, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.78239286, + "num_input_tokens_seen": 194477405, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.2199707, + "step": 9031, + "time_per_iteration": 4.294595241546631 + }, + { + "auxiliary_loss_clip": 0.0140066, + "auxiliary_loss_mlp": 0.01037063, + "balance_loss_clip": 1.23880172, + "balance_loss_mlp": 1.01604629, + "epoch": 0.543033218097099, + "flos": 26771649776640.0, + "grad_norm": 1.7441965143108005, + "language_loss": 0.85997349, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.88435078, + "num_input_tokens_seen": 194497085, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.21032715, + "step": 9032, + "time_per_iteration": 2.914761543273926 + }, + { + "auxiliary_loss_clip": 0.01429691, + "auxiliary_loss_mlp": 0.01039304, + "balance_loss_clip": 1.25801158, + "balance_loss_mlp": 1.01678538, + "epoch": 0.5430933413497671, + "flos": 22685705671680.0, + "grad_norm": 1.5768800694538117, + "language_loss": 0.74842083, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.77311075, + "num_input_tokens_seen": 194516785, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.22521973, + "step": 9033, + "time_per_iteration": 3.0117685794830322 + }, + { + "auxiliary_loss_clip": 0.01403666, + "auxiliary_loss_mlp": 0.0103802, + "balance_loss_clip": 1.23680711, + "balance_loss_mlp": 1.01589453, + "epoch": 0.543153464602435, + "flos": 24618062891520.0, + "grad_norm": 2.122467325821096, + "language_loss": 0.76687783, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.79129469, + "num_input_tokens_seen": 194536475, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.22106934, + "step": 9034, + "time_per_iteration": 2.890709161758423 + }, + { + "auxiliary_loss_clip": 0.01401356, + "auxiliary_loss_mlp": 0.01036824, + "balance_loss_clip": 1.23519075, + "balance_loss_mlp": 1.01570058, + "epoch": 0.543213587855103, + "flos": 19034981729280.0, + "grad_norm": 2.4874599758451885, + "language_loss": 0.8496722, + "learning_rate": 1.817402369770655e-06, + "loss": 0.87405401, + "num_input_tokens_seen": 194554495, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.21118164, + "step": 9035, + "time_per_iteration": 2.8266024589538574 + }, + { + "auxiliary_loss_clip": 0.01209474, + "auxiliary_loss_mlp": 0.01045244, + "balance_loss_clip": 1.11577749, + "balance_loss_mlp": 1.01777864, + "epoch": 0.5432737111077709, + "flos": 65716080347520.0, + "grad_norm": 0.7282683398210658, + "language_loss": 0.55919766, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.58174479, + "num_input_tokens_seen": 194617620, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.27539062, + "step": 9036, + "time_per_iteration": 3.3816280364990234 + }, + { + "auxiliary_loss_clip": 0.01422908, + "auxiliary_loss_mlp": 0.01035533, + "balance_loss_clip": 1.25287366, + "balance_loss_mlp": 1.01347959, + "epoch": 0.5433338343604389, + "flos": 22102018761600.0, + "grad_norm": 1.6830019413057093, + "language_loss": 0.75868732, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.78327173, + "num_input_tokens_seen": 194637690, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.22045898, + "step": 9037, + "time_per_iteration": 2.881230115890503 + }, + { + "auxiliary_loss_clip": 0.01403423, + "auxiliary_loss_mlp": 0.01039127, + "balance_loss_clip": 1.23785639, + "balance_loss_mlp": 1.01731205, + "epoch": 0.5433939576131068, + "flos": 34684546896000.0, + "grad_norm": 1.5409802959120982, + "language_loss": 0.67750365, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.70192921, + "num_input_tokens_seen": 194659520, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.21838379, + "step": 9038, + "time_per_iteration": 2.971674919128418 + }, + { + "auxiliary_loss_clip": 0.01414227, + "auxiliary_loss_mlp": 0.01035326, + "balance_loss_clip": 1.24565077, + "balance_loss_mlp": 1.01414275, + "epoch": 0.5434540808657748, + "flos": 20313106116480.0, + "grad_norm": 1.7672357724434005, + "language_loss": 0.78821027, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.81270581, + "num_input_tokens_seen": 194677645, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.21166992, + "step": 9039, + "time_per_iteration": 2.8569648265838623 + }, + { + "auxiliary_loss_clip": 0.01414498, + "auxiliary_loss_mlp": 0.01039335, + "balance_loss_clip": 1.24673426, + "balance_loss_mlp": 1.01720965, + "epoch": 0.5435142041184428, + "flos": 23123278563840.0, + "grad_norm": 2.0231369873598672, + "language_loss": 0.7731545, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.79769284, + "num_input_tokens_seen": 194697400, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.22131348, + "step": 9040, + "time_per_iteration": 2.9937820434570312 + }, + { + "auxiliary_loss_clip": 0.01209577, + "auxiliary_loss_mlp": 0.01036018, + "balance_loss_clip": 1.11732531, + "balance_loss_mlp": 1.01274812, + "epoch": 0.5435743273711108, + "flos": 64043166666240.0, + "grad_norm": 0.6701205428892485, + "language_loss": 0.52555817, + "learning_rate": 1.815075484268074e-06, + "loss": 0.5480141, + "num_input_tokens_seen": 194761205, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.23242188, + "step": 9041, + "time_per_iteration": 3.4729487895965576 + }, + { + "auxiliary_loss_clip": 0.01414007, + "auxiliary_loss_mlp": 0.01039793, + "balance_loss_clip": 1.24550104, + "balance_loss_mlp": 1.018538, + "epoch": 0.5436344506237788, + "flos": 25129756045440.0, + "grad_norm": 1.6063892960454471, + "language_loss": 0.76365924, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.78819722, + "num_input_tokens_seen": 194782445, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.21264648, + "step": 9042, + "time_per_iteration": 3.0355684757232666 + }, + { + "auxiliary_loss_clip": 0.01407105, + "auxiliary_loss_mlp": 0.01040486, + "balance_loss_clip": 1.24202943, + "balance_loss_mlp": 1.01905227, + "epoch": 0.5436945738764467, + "flos": 19582355047680.0, + "grad_norm": 2.1101972892227043, + "language_loss": 0.68079531, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.70527124, + "num_input_tokens_seen": 194800325, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.21435547, + "step": 9043, + "time_per_iteration": 5.672695159912109 + }, + { + "auxiliary_loss_clip": 0.01404139, + "auxiliary_loss_mlp": 0.01037612, + "balance_loss_clip": 1.24001193, + "balance_loss_mlp": 1.01712012, + "epoch": 0.5437546971291147, + "flos": 21152481246720.0, + "grad_norm": 1.7199596387440441, + "language_loss": 0.85026282, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.8746804, + "num_input_tokens_seen": 194818675, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.20483398, + "step": 9044, + "time_per_iteration": 2.846230983734131 + }, + { + "auxiliary_loss_clip": 0.01431916, + "auxiliary_loss_mlp": 0.01034792, + "balance_loss_clip": 1.25846481, + "balance_loss_mlp": 1.01384735, + "epoch": 0.5438148203817826, + "flos": 25129484576640.0, + "grad_norm": 1.5958204975438293, + "language_loss": 0.62387735, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.64854443, + "num_input_tokens_seen": 194836595, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.20947266, + "step": 9045, + "time_per_iteration": 4.251859426498413 + }, + { + "auxiliary_loss_clip": 0.01414492, + "auxiliary_loss_mlp": 0.01037479, + "balance_loss_clip": 1.2475419, + "balance_loss_mlp": 1.01659346, + "epoch": 0.5438749436344507, + "flos": 23013297158400.0, + "grad_norm": 1.9126515091823184, + "language_loss": 0.71258557, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.73710531, + "num_input_tokens_seen": 194857520, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.20898438, + "step": 9046, + "time_per_iteration": 2.8860623836517334 + }, + { + "auxiliary_loss_clip": 0.0140048, + "auxiliary_loss_mlp": 0.01034639, + "balance_loss_clip": 1.23645735, + "balance_loss_mlp": 1.01420701, + "epoch": 0.5439350668871186, + "flos": 15495460801920.0, + "grad_norm": 1.487736516374117, + "language_loss": 0.78131527, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.80566645, + "num_input_tokens_seen": 194876020, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.2043457, + "step": 9047, + "time_per_iteration": 2.8618526458740234 + }, + { + "auxiliary_loss_clip": 0.01417569, + "auxiliary_loss_mlp": 0.01040007, + "balance_loss_clip": 1.24991131, + "balance_loss_mlp": 1.01758361, + "epoch": 0.5439951901397866, + "flos": 17247335938560.0, + "grad_norm": 1.708040071749754, + "language_loss": 0.73748779, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.76206356, + "num_input_tokens_seen": 194894650, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.22436523, + "step": 9048, + "time_per_iteration": 2.814488410949707 + }, + { + "auxiliary_loss_clip": 0.01407521, + "auxiliary_loss_mlp": 0.01042717, + "balance_loss_clip": 1.2419703, + "balance_loss_mlp": 1.02019811, + "epoch": 0.5440553133924545, + "flos": 18670216999680.0, + "grad_norm": 2.5192054874523233, + "language_loss": 0.94065231, + "learning_rate": 1.8119733635055076e-06, + "loss": 0.96515465, + "num_input_tokens_seen": 194911935, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.2253418, + "step": 9049, + "time_per_iteration": 2.8397445678710938 + }, + { + "auxiliary_loss_clip": 0.01405238, + "auxiliary_loss_mlp": 0.0103866, + "balance_loss_clip": 1.24053621, + "balance_loss_mlp": 1.01852608, + "epoch": 0.5441154366451225, + "flos": 27133609328640.0, + "grad_norm": 2.093694159076858, + "language_loss": 0.7476896, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.77212858, + "num_input_tokens_seen": 194931620, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.20141602, + "step": 9050, + "time_per_iteration": 2.8776893615722656 + }, + { + "auxiliary_loss_clip": 0.0142162, + "auxiliary_loss_mlp": 0.0103536, + "balance_loss_clip": 1.25305295, + "balance_loss_mlp": 1.0146656, + "epoch": 0.5441755598977904, + "flos": 26004766095360.0, + "grad_norm": 1.9916390275369067, + "language_loss": 0.68226546, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.70683527, + "num_input_tokens_seen": 194952560, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.20690918, + "step": 9051, + "time_per_iteration": 2.8801233768463135 + }, + { + "auxiliary_loss_clip": 0.01408842, + "auxiliary_loss_mlp": 0.01039932, + "balance_loss_clip": 1.24265301, + "balance_loss_mlp": 1.0185101, + "epoch": 0.5442356831504584, + "flos": 32392899567360.0, + "grad_norm": 1.6767264047926993, + "language_loss": 0.68214607, + "learning_rate": 1.810810185460011e-06, + "loss": 0.70663381, + "num_input_tokens_seen": 194973915, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.21435547, + "step": 9052, + "time_per_iteration": 2.936046600341797 + }, + { + "auxiliary_loss_clip": 0.01413848, + "auxiliary_loss_mlp": 0.01044674, + "balance_loss_clip": 1.24552798, + "balance_loss_mlp": 1.02269197, + "epoch": 0.5442958064031264, + "flos": 24173341320960.0, + "grad_norm": 1.8095970766775444, + "language_loss": 0.9393208, + "learning_rate": 1.810422473773436e-06, + "loss": 0.96390605, + "num_input_tokens_seen": 194990170, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.21972656, + "step": 9053, + "time_per_iteration": 2.8618719577789307 + }, + { + "auxiliary_loss_clip": 0.01421561, + "auxiliary_loss_mlp": 0.01044905, + "balance_loss_clip": 1.2518661, + "balance_loss_mlp": 1.02310205, + "epoch": 0.5443559296557944, + "flos": 18772416299520.0, + "grad_norm": 2.1869103240094145, + "language_loss": 0.84693682, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.87160152, + "num_input_tokens_seen": 195006395, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.21789551, + "step": 9054, + "time_per_iteration": 2.808042049407959 + }, + { + "auxiliary_loss_clip": 0.01418084, + "auxiliary_loss_mlp": 0.01044048, + "balance_loss_clip": 1.25048006, + "balance_loss_mlp": 1.02273321, + "epoch": 0.5444160529084624, + "flos": 22641383750400.0, + "grad_norm": 2.5098837502472695, + "language_loss": 0.69735873, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.72198009, + "num_input_tokens_seen": 195025080, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.21337891, + "step": 9055, + "time_per_iteration": 2.8379151821136475 + }, + { + "auxiliary_loss_clip": 0.01218372, + "auxiliary_loss_mlp": 0.01046246, + "balance_loss_clip": 1.12301707, + "balance_loss_mlp": 1.01668262, + "epoch": 0.5444761761611303, + "flos": 69704802080640.0, + "grad_norm": 0.7929515856145496, + "language_loss": 0.57766998, + "learning_rate": 1.80925938190531e-06, + "loss": 0.60031617, + "num_input_tokens_seen": 195085725, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.29492188, + "step": 9056, + "time_per_iteration": 3.351738214492798 + }, + { + "auxiliary_loss_clip": 0.01408997, + "auxiliary_loss_mlp": 0.01041998, + "balance_loss_clip": 1.23948872, + "balance_loss_mlp": 1.01970553, + "epoch": 0.5445362994137983, + "flos": 14285393873280.0, + "grad_norm": 2.210096255297203, + "language_loss": 0.70902705, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.73353696, + "num_input_tokens_seen": 195102585, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.22277832, + "step": 9057, + "time_per_iteration": 2.8208749294281006 + }, + { + "auxiliary_loss_clip": 0.01398128, + "auxiliary_loss_mlp": 0.01038547, + "balance_loss_clip": 1.23238349, + "balance_loss_mlp": 1.01689887, + "epoch": 0.5445964226664662, + "flos": 28997999579520.0, + "grad_norm": 1.7962518849649793, + "language_loss": 0.75496304, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.77932978, + "num_input_tokens_seen": 195120055, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.21643066, + "step": 9058, + "time_per_iteration": 2.9139692783355713 + }, + { + "auxiliary_loss_clip": 0.01215417, + "auxiliary_loss_mlp": 0.01046622, + "balance_loss_clip": 1.12200165, + "balance_loss_mlp": 1.01991892, + "epoch": 0.5446565459191343, + "flos": 68654513099520.0, + "grad_norm": 0.815636861953567, + "language_loss": 0.62775707, + "learning_rate": 1.808096355133312e-06, + "loss": 0.65037751, + "num_input_tokens_seen": 195181045, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.26757812, + "step": 9059, + "time_per_iteration": 3.379582166671753 + }, + { + "auxiliary_loss_clip": 0.01393995, + "auxiliary_loss_mlp": 0.01043516, + "balance_loss_clip": 1.23090005, + "balance_loss_mlp": 1.02143884, + "epoch": 0.5447166691718022, + "flos": 16225035505920.0, + "grad_norm": 1.7890475447033054, + "language_loss": 0.80328143, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.82765657, + "num_input_tokens_seen": 195198840, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.22094727, + "step": 9060, + "time_per_iteration": 2.8314931392669678 + }, + { + "auxiliary_loss_clip": 0.01404928, + "auxiliary_loss_mlp": 0.01038965, + "balance_loss_clip": 1.23935616, + "balance_loss_mlp": 1.01745987, + "epoch": 0.5447767924244702, + "flos": 25860190400640.0, + "grad_norm": 1.598708239119304, + "language_loss": 0.80958223, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.83402121, + "num_input_tokens_seen": 195218720, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.21520996, + "step": 9061, + "time_per_iteration": 2.8762452602386475 + }, + { + "auxiliary_loss_clip": 0.01402366, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.23723841, + "balance_loss_mlp": 1.01356292, + "epoch": 0.5448369156771381, + "flos": 19685911691520.0, + "grad_norm": 1.7327631731673026, + "language_loss": 0.875193, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.89956671, + "num_input_tokens_seen": 195235770, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.21447754, + "step": 9062, + "time_per_iteration": 2.8251326084136963 + }, + { + "auxiliary_loss_clip": 0.01421981, + "auxiliary_loss_mlp": 0.01032635, + "balance_loss_clip": 1.24990225, + "balance_loss_mlp": 1.01154697, + "epoch": 0.5448970389298061, + "flos": 19291303376640.0, + "grad_norm": 2.060732136789699, + "language_loss": 0.8362726, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.86081874, + "num_input_tokens_seen": 195254870, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.2109375, + "step": 9063, + "time_per_iteration": 2.812537908554077 + }, + { + "auxiliary_loss_clip": 0.01413823, + "auxiliary_loss_mlp": 0.01034818, + "balance_loss_clip": 1.2453723, + "balance_loss_mlp": 1.0136106, + "epoch": 0.544957162182474, + "flos": 21000349670400.0, + "grad_norm": 1.5095463910732438, + "language_loss": 0.63933098, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.66381735, + "num_input_tokens_seen": 195273390, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.2121582, + "step": 9064, + "time_per_iteration": 2.8358242511749268 + }, + { + "auxiliary_loss_clip": 0.01416059, + "auxiliary_loss_mlp": 0.01036953, + "balance_loss_clip": 1.24573803, + "balance_loss_mlp": 1.01541162, + "epoch": 0.545017285435142, + "flos": 25385580000000.0, + "grad_norm": 1.6284050072801928, + "language_loss": 0.80852187, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.83305192, + "num_input_tokens_seen": 195295635, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.21557617, + "step": 9065, + "time_per_iteration": 2.9099161624908447 + }, + { + "auxiliary_loss_clip": 0.01402708, + "auxiliary_loss_mlp": 0.01036575, + "balance_loss_clip": 1.2377243, + "balance_loss_mlp": 1.01696527, + "epoch": 0.54507740868781, + "flos": 19143922504320.0, + "grad_norm": 1.9522535492158117, + "language_loss": 0.79123825, + "learning_rate": 1.805382881379827e-06, + "loss": 0.81563103, + "num_input_tokens_seen": 195312545, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19616699, + "step": 9066, + "time_per_iteration": 4.251797914505005 + }, + { + "auxiliary_loss_clip": 0.01420348, + "auxiliary_loss_mlp": 0.01035349, + "balance_loss_clip": 1.24934518, + "balance_loss_mlp": 1.01408184, + "epoch": 0.545137531940478, + "flos": 26260906763520.0, + "grad_norm": 1.8309276368620941, + "language_loss": 0.76377088, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.78832781, + "num_input_tokens_seen": 195332955, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.21264648, + "step": 9067, + "time_per_iteration": 2.8982961177825928 + }, + { + "auxiliary_loss_clip": 0.01427648, + "auxiliary_loss_mlp": 0.0103629, + "balance_loss_clip": 1.25444913, + "balance_loss_mlp": 1.01397431, + "epoch": 0.545197655193146, + "flos": 37568432401920.0, + "grad_norm": 1.8702867626071644, + "language_loss": 0.63930261, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.66394198, + "num_input_tokens_seen": 195355930, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.2232666, + "step": 9068, + "time_per_iteration": 2.991487979888916 + }, + { + "auxiliary_loss_clip": 0.01401014, + "auxiliary_loss_mlp": 0.01034475, + "balance_loss_clip": 1.23740208, + "balance_loss_mlp": 1.01337528, + "epoch": 0.5452577784458139, + "flos": 26042391786240.0, + "grad_norm": 2.0900795366328855, + "language_loss": 0.72821689, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.75257176, + "num_input_tokens_seen": 195376445, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.2109375, + "step": 9069, + "time_per_iteration": 2.900423526763916 + }, + { + "auxiliary_loss_clip": 0.01408573, + "auxiliary_loss_mlp": 0.01033911, + "balance_loss_clip": 1.24385619, + "balance_loss_mlp": 1.01436067, + "epoch": 0.5453179016984819, + "flos": 17647871322240.0, + "grad_norm": 2.3405339600953323, + "language_loss": 0.74767733, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.77210212, + "num_input_tokens_seen": 195393725, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19543457, + "step": 9070, + "time_per_iteration": 2.8094053268432617 + }, + { + "auxiliary_loss_clip": 0.01414261, + "auxiliary_loss_mlp": 0.01030778, + "balance_loss_clip": 1.24819767, + "balance_loss_mlp": 1.01040506, + "epoch": 0.5453780249511498, + "flos": 23225749332480.0, + "grad_norm": 1.7593845303393616, + "language_loss": 0.61334419, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.63779461, + "num_input_tokens_seen": 195411380, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.20361328, + "step": 9071, + "time_per_iteration": 2.8430984020233154 + }, + { + "auxiliary_loss_clip": 0.0121071, + "auxiliary_loss_mlp": 0.01027952, + "balance_loss_clip": 1.11938488, + "balance_loss_mlp": 1.00983202, + "epoch": 0.5454381482038179, + "flos": 68731664762880.0, + "grad_norm": 0.7022042239494368, + "language_loss": 0.57212991, + "learning_rate": 1.80305733435899e-06, + "loss": 0.59451652, + "num_input_tokens_seen": 195482015, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.18164062, + "step": 9072, + "time_per_iteration": 3.483304977416992 + }, + { + "auxiliary_loss_clip": 0.01397164, + "auxiliary_loss_mlp": 0.01038445, + "balance_loss_clip": 1.2338469, + "balance_loss_mlp": 1.01720178, + "epoch": 0.5454982714564858, + "flos": 13268296592640.0, + "grad_norm": 2.081102575821753, + "language_loss": 0.71088165, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.73523772, + "num_input_tokens_seen": 195500440, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.21252441, + "step": 9073, + "time_per_iteration": 2.8298094272613525 + }, + { + "auxiliary_loss_clip": 0.01401523, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.23925257, + "balance_loss_mlp": 1.01154768, + "epoch": 0.5455583947091538, + "flos": 21846059072640.0, + "grad_norm": 1.6937981585483044, + "language_loss": 0.72343028, + "learning_rate": 1.802282211606627e-06, + "loss": 0.74776357, + "num_input_tokens_seen": 195520860, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.20275879, + "step": 9074, + "time_per_iteration": 2.8695781230926514 + }, + { + "auxiliary_loss_clip": 0.01413754, + "auxiliary_loss_mlp": 0.01034461, + "balance_loss_clip": 1.24655557, + "balance_loss_mlp": 1.01460028, + "epoch": 0.5456185179618217, + "flos": 17825276759040.0, + "grad_norm": 1.9294554315583292, + "language_loss": 0.69292313, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.7174052, + "num_input_tokens_seen": 195538615, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.1986084, + "step": 9075, + "time_per_iteration": 2.906996488571167 + }, + { + "auxiliary_loss_clip": 0.01412335, + "auxiliary_loss_mlp": 0.01034597, + "balance_loss_clip": 1.24792719, + "balance_loss_mlp": 1.01470113, + "epoch": 0.5456786412144897, + "flos": 21079311125760.0, + "grad_norm": 1.7518203280597282, + "language_loss": 0.8161, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.84056938, + "num_input_tokens_seen": 195557460, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.19909668, + "step": 9076, + "time_per_iteration": 2.8891210556030273 + }, + { + "auxiliary_loss_clip": 0.0142443, + "auxiliary_loss_mlp": 0.01033061, + "balance_loss_clip": 1.2571981, + "balance_loss_mlp": 1.01422584, + "epoch": 0.5457387644671576, + "flos": 23305434704640.0, + "grad_norm": 2.121942328085589, + "language_loss": 0.81235451, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.83692944, + "num_input_tokens_seen": 195577985, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.18835449, + "step": 9077, + "time_per_iteration": 2.827404260635376 + }, + { + "auxiliary_loss_clip": 0.0141918, + "auxiliary_loss_mlp": 0.01034209, + "balance_loss_clip": 1.25148618, + "balance_loss_mlp": 1.01443219, + "epoch": 0.5457988877198257, + "flos": 21627182136960.0, + "grad_norm": 1.9758339813509054, + "language_loss": 0.68987453, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.71440846, + "num_input_tokens_seen": 195597620, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.19775391, + "step": 9078, + "time_per_iteration": 5.748022556304932 + }, + { + "auxiliary_loss_clip": 0.01427681, + "auxiliary_loss_mlp": 0.01032979, + "balance_loss_clip": 1.25812495, + "balance_loss_mlp": 1.01267743, + "epoch": 0.5458590109724936, + "flos": 23771312858880.0, + "grad_norm": 1.7364010211421763, + "language_loss": 0.81566846, + "learning_rate": 1.800344536188764e-06, + "loss": 0.84027505, + "num_input_tokens_seen": 195615910, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.20300293, + "step": 9079, + "time_per_iteration": 2.8544726371765137 + }, + { + "auxiliary_loss_clip": 0.01437032, + "auxiliary_loss_mlp": 0.01036834, + "balance_loss_clip": 1.26376939, + "balance_loss_mlp": 1.01611495, + "epoch": 0.5459191342251616, + "flos": 24434594651520.0, + "grad_norm": 1.627042112276473, + "language_loss": 0.7660802, + "learning_rate": 1.799957023759277e-06, + "loss": 0.79081893, + "num_input_tokens_seen": 195635620, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.20703125, + "step": 9080, + "time_per_iteration": 4.230427980422974 + }, + { + "auxiliary_loss_clip": 0.01421601, + "auxiliary_loss_mlp": 0.01037595, + "balance_loss_clip": 1.25229955, + "balance_loss_mlp": 1.01698351, + "epoch": 0.5459792574778296, + "flos": 23633478639360.0, + "grad_norm": 2.5115755437337826, + "language_loss": 0.84320474, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.86779666, + "num_input_tokens_seen": 195652495, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.20617676, + "step": 9081, + "time_per_iteration": 2.84775972366333 + }, + { + "auxiliary_loss_clip": 0.01432165, + "auxiliary_loss_mlp": 0.01035495, + "balance_loss_clip": 1.2618475, + "balance_loss_mlp": 1.01448989, + "epoch": 0.5460393807304975, + "flos": 19144917889920.0, + "grad_norm": 1.817337853522643, + "language_loss": 0.703619, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.72829562, + "num_input_tokens_seen": 195671965, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.21032715, + "step": 9082, + "time_per_iteration": 2.8508224487304688 + }, + { + "auxiliary_loss_clip": 0.01413143, + "auxiliary_loss_mlp": 0.01033524, + "balance_loss_clip": 1.24883616, + "balance_loss_mlp": 1.01304412, + "epoch": 0.5460995039831655, + "flos": 35932149025920.0, + "grad_norm": 3.945495440577287, + "language_loss": 0.67164081, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.69610751, + "num_input_tokens_seen": 195694725, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.20495605, + "step": 9083, + "time_per_iteration": 2.961620569229126 + }, + { + "auxiliary_loss_clip": 0.01409304, + "auxiliary_loss_mlp": 0.01032927, + "balance_loss_clip": 1.24584067, + "balance_loss_mlp": 1.01319766, + "epoch": 0.5461596272358334, + "flos": 26770111453440.0, + "grad_norm": 1.758398148162635, + "language_loss": 0.80121392, + "learning_rate": 1.798407050044766e-06, + "loss": 0.82563621, + "num_input_tokens_seen": 195714090, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19714355, + "step": 9084, + "time_per_iteration": 2.8601982593536377 + }, + { + "auxiliary_loss_clip": 0.01432713, + "auxiliary_loss_mlp": 0.01039342, + "balance_loss_clip": 1.26348233, + "balance_loss_mlp": 1.01948214, + "epoch": 0.5462197504885015, + "flos": 20896340578560.0, + "grad_norm": 1.744454373484502, + "language_loss": 0.76453453, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.78925508, + "num_input_tokens_seen": 195733585, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.19873047, + "step": 9085, + "time_per_iteration": 2.8377902507781982 + }, + { + "auxiliary_loss_clip": 0.01422779, + "auxiliary_loss_mlp": 0.01037196, + "balance_loss_clip": 1.25283909, + "balance_loss_mlp": 1.01788425, + "epoch": 0.5462798737411694, + "flos": 25814149176960.0, + "grad_norm": 1.7539367059886104, + "language_loss": 0.75391912, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.77851892, + "num_input_tokens_seen": 195752820, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.19299316, + "step": 9086, + "time_per_iteration": 2.8716304302215576 + }, + { + "auxiliary_loss_clip": 0.01425355, + "auxiliary_loss_mlp": 0.01034201, + "balance_loss_clip": 1.25871301, + "balance_loss_mlp": 1.01382792, + "epoch": 0.5463399969938374, + "flos": 25785889159680.0, + "grad_norm": 1.5799684673524834, + "language_loss": 0.77690279, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.80149835, + "num_input_tokens_seen": 195773740, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.20373535, + "step": 9087, + "time_per_iteration": 2.9198362827301025 + }, + { + "auxiliary_loss_clip": 0.01425479, + "auxiliary_loss_mlp": 0.0103961, + "balance_loss_clip": 1.25619125, + "balance_loss_mlp": 1.0185219, + "epoch": 0.5464001202465053, + "flos": 18852508874880.0, + "grad_norm": 1.7114051372167765, + "language_loss": 0.77953506, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.80418599, + "num_input_tokens_seen": 195792125, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.21105957, + "step": 9088, + "time_per_iteration": 2.902883529663086 + }, + { + "auxiliary_loss_clip": 0.01228308, + "auxiliary_loss_mlp": 0.01046234, + "balance_loss_clip": 1.12959051, + "balance_loss_mlp": 1.02449036, + "epoch": 0.5464602434991733, + "flos": 69081724932480.0, + "grad_norm": 0.7572139893915811, + "language_loss": 0.57790196, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.60064739, + "num_input_tokens_seen": 195854935, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.21777344, + "step": 9089, + "time_per_iteration": 3.3946759700775146 + }, + { + "auxiliary_loss_clip": 0.01437776, + "auxiliary_loss_mlp": 0.01036769, + "balance_loss_clip": 1.26642001, + "balance_loss_mlp": 1.0173614, + "epoch": 0.5465203667518412, + "flos": 27570639283200.0, + "grad_norm": 1.6720913180745938, + "language_loss": 0.77665651, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.80140185, + "num_input_tokens_seen": 195874715, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.19421387, + "step": 9090, + "time_per_iteration": 2.909102201461792 + }, + { + "auxiliary_loss_clip": 0.01437836, + "auxiliary_loss_mlp": 0.010432, + "balance_loss_clip": 1.26648235, + "balance_loss_mlp": 1.02219546, + "epoch": 0.5465804900045093, + "flos": 21218819402880.0, + "grad_norm": 1.9608294179543067, + "language_loss": 0.74279493, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.7676053, + "num_input_tokens_seen": 195892610, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.21020508, + "step": 9091, + "time_per_iteration": 2.885824680328369 + }, + { + "auxiliary_loss_clip": 0.01424677, + "auxiliary_loss_mlp": 0.010446, + "balance_loss_clip": 1.25683045, + "balance_loss_mlp": 1.02445364, + "epoch": 0.5466406132571772, + "flos": 22498346378880.0, + "grad_norm": 1.7729110363318752, + "language_loss": 0.78730679, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.8119995, + "num_input_tokens_seen": 195911085, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.20153809, + "step": 9092, + "time_per_iteration": 2.843961238861084 + }, + { + "auxiliary_loss_clip": 0.01433869, + "auxiliary_loss_mlp": 0.01043794, + "balance_loss_clip": 1.26273465, + "balance_loss_mlp": 1.0235641, + "epoch": 0.5467007365098452, + "flos": 17684773096320.0, + "grad_norm": 3.648892480836694, + "language_loss": 0.76190746, + "learning_rate": 1.794920057818476e-06, + "loss": 0.78668404, + "num_input_tokens_seen": 195929845, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.20227051, + "step": 9093, + "time_per_iteration": 2.8119425773620605 + }, + { + "auxiliary_loss_clip": 0.01427372, + "auxiliary_loss_mlp": 0.01043056, + "balance_loss_clip": 1.25674045, + "balance_loss_mlp": 1.02035809, + "epoch": 0.5467608597625132, + "flos": 15705515001600.0, + "grad_norm": 2.064180699333355, + "language_loss": 0.69942135, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.72412562, + "num_input_tokens_seen": 195946350, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.22705078, + "step": 9094, + "time_per_iteration": 2.8016257286071777 + }, + { + "auxiliary_loss_clip": 0.01434192, + "auxiliary_loss_mlp": 0.01040749, + "balance_loss_clip": 1.26584005, + "balance_loss_mlp": 1.02062654, + "epoch": 0.5468209830151811, + "flos": 24322577230080.0, + "grad_norm": 3.5633570993508648, + "language_loss": 0.68961048, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.71435988, + "num_input_tokens_seen": 195959840, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.20117188, + "step": 9095, + "time_per_iteration": 2.8455612659454346 + }, + { + "auxiliary_loss_clip": 0.01441145, + "auxiliary_loss_mlp": 0.01044109, + "balance_loss_clip": 1.27251935, + "balance_loss_mlp": 1.02439237, + "epoch": 0.5468811062678491, + "flos": 29177576766720.0, + "grad_norm": 1.6151145072257393, + "language_loss": 0.67951763, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.70437014, + "num_input_tokens_seen": 195981125, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.19702148, + "step": 9096, + "time_per_iteration": 2.893735647201538 + }, + { + "auxiliary_loss_clip": 0.01231394, + "auxiliary_loss_mlp": 0.01042539, + "balance_loss_clip": 1.12970591, + "balance_loss_mlp": 1.02213001, + "epoch": 0.546941229520517, + "flos": 67895031582720.0, + "grad_norm": 0.7502373525587244, + "language_loss": 0.57557315, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.5983125, + "num_input_tokens_seen": 196038880, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.20410156, + "step": 9097, + "time_per_iteration": 3.475468397140503 + }, + { + "auxiliary_loss_clip": 0.01225701, + "auxiliary_loss_mlp": 0.01033027, + "balance_loss_clip": 1.12753582, + "balance_loss_mlp": 1.01118755, + "epoch": 0.5470013527731851, + "flos": 58295556852480.0, + "grad_norm": 0.9287890373437211, + "language_loss": 0.64855701, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.67114425, + "num_input_tokens_seen": 196099215, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.21875, + "step": 9098, + "time_per_iteration": 3.2841601371765137 + }, + { + "auxiliary_loss_clip": 0.01434948, + "auxiliary_loss_mlp": 0.01044402, + "balance_loss_clip": 1.2634964, + "balance_loss_mlp": 1.02255058, + "epoch": 0.547061476025853, + "flos": 22976123915520.0, + "grad_norm": 1.814869540371251, + "language_loss": 0.74431217, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.76910567, + "num_input_tokens_seen": 196120370, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.21838379, + "step": 9099, + "time_per_iteration": 2.8698720932006836 + }, + { + "auxiliary_loss_clip": 0.01420325, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.25331104, + "balance_loss_mlp": 1.0164541, + "epoch": 0.547121599278521, + "flos": 29979507185280.0, + "grad_norm": 1.879240180359159, + "language_loss": 0.74631906, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.77087498, + "num_input_tokens_seen": 196139075, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.18811035, + "step": 9100, + "time_per_iteration": 2.9458322525024414 + }, + { + "auxiliary_loss_clip": 0.01418656, + "auxiliary_loss_mlp": 0.01036439, + "balance_loss_clip": 1.25176024, + "balance_loss_mlp": 1.0145874, + "epoch": 0.5471817225311889, + "flos": 36548348964480.0, + "grad_norm": 2.2143472904602706, + "language_loss": 0.69005001, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.71460104, + "num_input_tokens_seen": 196159990, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.21862793, + "step": 9101, + "time_per_iteration": 4.3752782344818115 + }, + { + "auxiliary_loss_clip": 0.01417056, + "auxiliary_loss_mlp": 0.01031856, + "balance_loss_clip": 1.24988127, + "balance_loss_mlp": 1.01309276, + "epoch": 0.5472418457838569, + "flos": 25786160628480.0, + "grad_norm": 1.6930406978861114, + "language_loss": 0.78603065, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.81051975, + "num_input_tokens_seen": 196180570, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.18762207, + "step": 9102, + "time_per_iteration": 2.9035396575927734 + }, + { + "auxiliary_loss_clip": 0.01408768, + "auxiliary_loss_mlp": 0.01035562, + "balance_loss_clip": 1.24585247, + "balance_loss_mlp": 1.0148437, + "epoch": 0.5473019690365248, + "flos": 27898185525120.0, + "grad_norm": 1.415233596086656, + "language_loss": 0.72948146, + "learning_rate": 1.791046361258413e-06, + "loss": 0.75392479, + "num_input_tokens_seen": 196200300, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.20727539, + "step": 9103, + "time_per_iteration": 2.898728847503662 + }, + { + "auxiliary_loss_clip": 0.0142368, + "auxiliary_loss_mlp": 0.01034375, + "balance_loss_clip": 1.25664306, + "balance_loss_mlp": 1.01296484, + "epoch": 0.5473620922891929, + "flos": 57651757585920.0, + "grad_norm": 1.2672445067178304, + "language_loss": 0.65769935, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.68227983, + "num_input_tokens_seen": 196228525, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.21411133, + "step": 9104, + "time_per_iteration": 3.1887693405151367 + }, + { + "auxiliary_loss_clip": 0.01414155, + "auxiliary_loss_mlp": 0.01036373, + "balance_loss_clip": 1.24547553, + "balance_loss_mlp": 1.01515365, + "epoch": 0.5474222155418608, + "flos": 19363387622400.0, + "grad_norm": 1.757954234944769, + "language_loss": 0.82896346, + "learning_rate": 1.790271716558888e-06, + "loss": 0.85346872, + "num_input_tokens_seen": 196247690, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.2121582, + "step": 9105, + "time_per_iteration": 2.82035493850708 + }, + { + "auxiliary_loss_clip": 0.01412953, + "auxiliary_loss_mlp": 0.01032733, + "balance_loss_clip": 1.24735045, + "balance_loss_mlp": 1.01351643, + "epoch": 0.5474823387945288, + "flos": 25131701571840.0, + "grad_norm": 1.5114408110847612, + "language_loss": 0.81053388, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.83499074, + "num_input_tokens_seen": 196268555, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.19213867, + "step": 9106, + "time_per_iteration": 2.890048027038574 + }, + { + "auxiliary_loss_clip": 0.01429468, + "auxiliary_loss_mlp": 0.01034123, + "balance_loss_clip": 1.26041079, + "balance_loss_mlp": 1.01400006, + "epoch": 0.5475424620471967, + "flos": 18013178989440.0, + "grad_norm": 2.350112029112099, + "language_loss": 0.7035799, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.72821575, + "num_input_tokens_seen": 196285585, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.20117188, + "step": 9107, + "time_per_iteration": 2.792612075805664 + }, + { + "auxiliary_loss_clip": 0.01425682, + "auxiliary_loss_mlp": 0.01031058, + "balance_loss_clip": 1.25583148, + "balance_loss_mlp": 1.01036358, + "epoch": 0.5476025852998647, + "flos": 22319402618880.0, + "grad_norm": 1.7959530258903016, + "language_loss": 0.6453619, + "learning_rate": 1.789109809193197e-06, + "loss": 0.66992933, + "num_input_tokens_seen": 196305085, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.20690918, + "step": 9108, + "time_per_iteration": 2.8813259601593018 + }, + { + "auxiliary_loss_clip": 0.01414844, + "auxiliary_loss_mlp": 0.01031749, + "balance_loss_clip": 1.24770164, + "balance_loss_mlp": 1.01202011, + "epoch": 0.5476627085525327, + "flos": 20130090324480.0, + "grad_norm": 1.783881515514454, + "language_loss": 0.76026726, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.78473318, + "num_input_tokens_seen": 196323945, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.1973877, + "step": 9109, + "time_per_iteration": 2.8597776889801025 + }, + { + "auxiliary_loss_clip": 0.01416666, + "auxiliary_loss_mlp": 0.01038193, + "balance_loss_clip": 1.25138867, + "balance_loss_mlp": 1.01686621, + "epoch": 0.5477228318052006, + "flos": 17721584380800.0, + "grad_norm": 10.429240478908586, + "language_loss": 0.78085279, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.80540133, + "num_input_tokens_seen": 196342200, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.21325684, + "step": 9110, + "time_per_iteration": 2.8172390460968018 + }, + { + "auxiliary_loss_clip": 0.01403497, + "auxiliary_loss_mlp": 0.0103124, + "balance_loss_clip": 1.2403003, + "balance_loss_mlp": 1.01247668, + "epoch": 0.5477829550578687, + "flos": 25860009421440.0, + "grad_norm": 1.5026533163110216, + "language_loss": 0.72233093, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.74667835, + "num_input_tokens_seen": 196362940, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.1875, + "step": 9111, + "time_per_iteration": 2.8923873901367188 + }, + { + "auxiliary_loss_clip": 0.01410487, + "auxiliary_loss_mlp": 0.01034512, + "balance_loss_clip": 1.24341941, + "balance_loss_mlp": 1.01373363, + "epoch": 0.5478430783105366, + "flos": 23050108442880.0, + "grad_norm": 1.680443442609996, + "language_loss": 0.71696734, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.74141729, + "num_input_tokens_seen": 196383070, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.2076416, + "step": 9112, + "time_per_iteration": 2.838939905166626 + }, + { + "auxiliary_loss_clip": 0.0142364, + "auxiliary_loss_mlp": 0.01037264, + "balance_loss_clip": 1.25425351, + "balance_loss_mlp": 1.01693916, + "epoch": 0.5479032015632046, + "flos": 16079916873600.0, + "grad_norm": 2.0893631248831617, + "language_loss": 0.89657182, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.92118084, + "num_input_tokens_seen": 196398485, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.203125, + "step": 9113, + "time_per_iteration": 4.217290878295898 + }, + { + "auxiliary_loss_clip": 0.01416381, + "auxiliary_loss_mlp": 0.01033923, + "balance_loss_clip": 1.24831152, + "balance_loss_mlp": 1.01331198, + "epoch": 0.5479633248158725, + "flos": 24289068816000.0, + "grad_norm": 1.4790353052585457, + "language_loss": 0.73941541, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.7639184, + "num_input_tokens_seen": 196417725, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.20593262, + "step": 9114, + "time_per_iteration": 5.6567277908325195 + }, + { + "auxiliary_loss_clip": 0.01414523, + "auxiliary_loss_mlp": 0.01033894, + "balance_loss_clip": 1.24769163, + "balance_loss_mlp": 1.01514208, + "epoch": 0.5480234480685405, + "flos": 26369078376960.0, + "grad_norm": 1.6329720493439075, + "language_loss": 0.72853744, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.7530216, + "num_input_tokens_seen": 196437840, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.18737793, + "step": 9115, + "time_per_iteration": 2.9427483081817627 + }, + { + "auxiliary_loss_clip": 0.01429097, + "auxiliary_loss_mlp": 0.01031601, + "balance_loss_clip": 1.25761294, + "balance_loss_mlp": 1.01131153, + "epoch": 0.5480835713212084, + "flos": 22065388456320.0, + "grad_norm": 1.9015215179642648, + "language_loss": 0.73013687, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.75474387, + "num_input_tokens_seen": 196457300, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.20288086, + "step": 9116, + "time_per_iteration": 2.8350021839141846 + }, + { + "auxiliary_loss_clip": 0.01416386, + "auxiliary_loss_mlp": 0.01039361, + "balance_loss_clip": 1.24936485, + "balance_loss_mlp": 1.01812971, + "epoch": 0.5481436945738765, + "flos": 25311866941440.0, + "grad_norm": 1.9106291823324282, + "language_loss": 0.77558577, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.80014318, + "num_input_tokens_seen": 196476720, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.2121582, + "step": 9117, + "time_per_iteration": 2.995396852493286 + }, + { + "auxiliary_loss_clip": 0.01398214, + "auxiliary_loss_mlp": 0.01034531, + "balance_loss_clip": 1.23513973, + "balance_loss_mlp": 1.01300216, + "epoch": 0.5482038178265444, + "flos": 33593193619200.0, + "grad_norm": 1.633547070010084, + "language_loss": 0.63301086, + "learning_rate": 1.785237306671674e-06, + "loss": 0.65733826, + "num_input_tokens_seen": 196496765, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.21533203, + "step": 9118, + "time_per_iteration": 2.971527576446533 + }, + { + "auxiliary_loss_clip": 0.01439007, + "auxiliary_loss_mlp": 0.01035951, + "balance_loss_clip": 1.26688576, + "balance_loss_mlp": 1.01499391, + "epoch": 0.5482639410792124, + "flos": 19035569911680.0, + "grad_norm": 1.7211827462865907, + "language_loss": 0.7939564, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.81870598, + "num_input_tokens_seen": 196516220, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.20947266, + "step": 9119, + "time_per_iteration": 2.8280746936798096 + }, + { + "auxiliary_loss_clip": 0.01414366, + "auxiliary_loss_mlp": 0.01034254, + "balance_loss_clip": 1.24995995, + "balance_loss_mlp": 1.01494169, + "epoch": 0.5483240643318803, + "flos": 25421124430080.0, + "grad_norm": 1.7098565852536771, + "language_loss": 0.83084321, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.85532945, + "num_input_tokens_seen": 196533860, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.1932373, + "step": 9120, + "time_per_iteration": 2.885385513305664 + }, + { + "auxiliary_loss_clip": 0.01414931, + "auxiliary_loss_mlp": 0.01033089, + "balance_loss_clip": 1.24749422, + "balance_loss_mlp": 1.01277542, + "epoch": 0.5483841875845483, + "flos": 21475910211840.0, + "grad_norm": 2.6630875586881873, + "language_loss": 0.80689973, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.83137995, + "num_input_tokens_seen": 196551305, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.20324707, + "step": 9121, + "time_per_iteration": 2.863783836364746 + }, + { + "auxiliary_loss_clip": 0.01422288, + "auxiliary_loss_mlp": 0.01038447, + "balance_loss_clip": 1.25311947, + "balance_loss_mlp": 1.01824164, + "epoch": 0.5484443108372163, + "flos": 24756259069440.0, + "grad_norm": 1.686944193010063, + "language_loss": 0.62179297, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.64640033, + "num_input_tokens_seen": 196569420, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.20202637, + "step": 9122, + "time_per_iteration": 2.8834924697875977 + }, + { + "auxiliary_loss_clip": 0.01422471, + "auxiliary_loss_mlp": 0.01040155, + "balance_loss_clip": 1.25755072, + "balance_loss_mlp": 1.02033067, + "epoch": 0.5485044340898843, + "flos": 25386575385600.0, + "grad_norm": 1.7167641995892524, + "language_loss": 0.72045803, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.74508435, + "num_input_tokens_seen": 196590610, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19836426, + "step": 9123, + "time_per_iteration": 2.896503448486328 + }, + { + "auxiliary_loss_clip": 0.01425183, + "auxiliary_loss_mlp": 0.0103688, + "balance_loss_clip": 1.25906563, + "balance_loss_mlp": 1.01754475, + "epoch": 0.5485645573425523, + "flos": 12648884273280.0, + "grad_norm": 1.9086068921915762, + "language_loss": 0.83992553, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.86454606, + "num_input_tokens_seen": 196606495, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.1932373, + "step": 9124, + "time_per_iteration": 2.889718770980835 + }, + { + "auxiliary_loss_clip": 0.01407737, + "auxiliary_loss_mlp": 0.01033807, + "balance_loss_clip": 1.24439597, + "balance_loss_mlp": 1.0132314, + "epoch": 0.5486246805952202, + "flos": 28341052058880.0, + "grad_norm": 2.035248399097569, + "language_loss": 0.80967081, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.83408618, + "num_input_tokens_seen": 196626365, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.20581055, + "step": 9125, + "time_per_iteration": 2.9590001106262207 + }, + { + "auxiliary_loss_clip": 0.01418822, + "auxiliary_loss_mlp": 0.01036112, + "balance_loss_clip": 1.24951625, + "balance_loss_mlp": 1.01607335, + "epoch": 0.5486848038478882, + "flos": 16808677171200.0, + "grad_norm": 1.8648248448535556, + "language_loss": 0.74926645, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.77381581, + "num_input_tokens_seen": 196644465, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.20019531, + "step": 9126, + "time_per_iteration": 2.8152525424957275 + }, + { + "auxiliary_loss_clip": 0.01420097, + "auxiliary_loss_mlp": 0.01036455, + "balance_loss_clip": 1.24980366, + "balance_loss_mlp": 1.01529527, + "epoch": 0.5487449271005561, + "flos": 17244576005760.0, + "grad_norm": 2.367891701545487, + "language_loss": 0.68752348, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.71208894, + "num_input_tokens_seen": 196659160, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.21166992, + "step": 9127, + "time_per_iteration": 2.8367934226989746 + }, + { + "auxiliary_loss_clip": 0.01413214, + "auxiliary_loss_mlp": 0.01036266, + "balance_loss_clip": 1.24756217, + "balance_loss_mlp": 1.01538038, + "epoch": 0.5488050503532241, + "flos": 17347996915200.0, + "grad_norm": 1.8270200155276102, + "language_loss": 0.83728671, + "learning_rate": 1.781365618532181e-06, + "loss": 0.86178154, + "num_input_tokens_seen": 196677410, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.20874023, + "step": 9128, + "time_per_iteration": 2.8737471103668213 + }, + { + "auxiliary_loss_clip": 0.01412442, + "auxiliary_loss_mlp": 0.01034353, + "balance_loss_clip": 1.24706411, + "balance_loss_mlp": 1.01405191, + "epoch": 0.548865173605892, + "flos": 17248286079360.0, + "grad_norm": 1.8777837852238617, + "language_loss": 0.75239825, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.7768662, + "num_input_tokens_seen": 196696765, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20288086, + "step": 9129, + "time_per_iteration": 2.8646817207336426 + }, + { + "auxiliary_loss_clip": 0.01439414, + "auxiliary_loss_mlp": 0.01033876, + "balance_loss_clip": 1.26910925, + "balance_loss_mlp": 1.01350307, + "epoch": 0.5489252968585601, + "flos": 17465534202240.0, + "grad_norm": 2.3424393028962958, + "language_loss": 0.64374471, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.66847759, + "num_input_tokens_seen": 196714895, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.20373535, + "step": 9130, + "time_per_iteration": 2.8226993083953857 + }, + { + "auxiliary_loss_clip": 0.01430506, + "auxiliary_loss_mlp": 0.01039106, + "balance_loss_clip": 1.26136446, + "balance_loss_mlp": 1.01856661, + "epoch": 0.548985420111228, + "flos": 26334665066880.0, + "grad_norm": 1.9106604047108067, + "language_loss": 0.63760149, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.66229761, + "num_input_tokens_seen": 196735510, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.20544434, + "step": 9131, + "time_per_iteration": 2.8641152381896973 + }, + { + "auxiliary_loss_clip": 0.01430934, + "auxiliary_loss_mlp": 0.01036066, + "balance_loss_clip": 1.26319265, + "balance_loss_mlp": 1.01565766, + "epoch": 0.549045543363896, + "flos": 18701960866560.0, + "grad_norm": 2.1191353162236535, + "language_loss": 0.76219618, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.78686619, + "num_input_tokens_seen": 196752855, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.20410156, + "step": 9132, + "time_per_iteration": 2.8237524032592773 + }, + { + "auxiliary_loss_clip": 0.01419595, + "auxiliary_loss_mlp": 0.01034119, + "balance_loss_clip": 1.25114346, + "balance_loss_mlp": 1.0139842, + "epoch": 0.5491056666165639, + "flos": 24728089541760.0, + "grad_norm": 2.166564023913624, + "language_loss": 0.82134151, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.84587872, + "num_input_tokens_seen": 196772230, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.20129395, + "step": 9133, + "time_per_iteration": 2.8673338890075684 + }, + { + "auxiliary_loss_clip": 0.01414931, + "auxiliary_loss_mlp": 0.01040141, + "balance_loss_clip": 1.24899983, + "balance_loss_mlp": 1.01943469, + "epoch": 0.5491657898692319, + "flos": 21586117841280.0, + "grad_norm": 1.6498032937855793, + "language_loss": 0.70865196, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.7332027, + "num_input_tokens_seen": 196790405, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.20703125, + "step": 9134, + "time_per_iteration": 2.8482511043548584 + }, + { + "auxiliary_loss_clip": 0.01434241, + "auxiliary_loss_mlp": 0.01034192, + "balance_loss_clip": 1.26419389, + "balance_loss_mlp": 1.01498771, + "epoch": 0.5492259131219, + "flos": 50493844765440.0, + "grad_norm": 2.1405621517916695, + "language_loss": 0.61300993, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.63769424, + "num_input_tokens_seen": 196813785, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.19189453, + "step": 9135, + "time_per_iteration": 3.0730113983154297 + }, + { + "auxiliary_loss_clip": 0.01421072, + "auxiliary_loss_mlp": 0.01033767, + "balance_loss_clip": 1.25238109, + "balance_loss_mlp": 1.01272702, + "epoch": 0.5492860363745679, + "flos": 25130208493440.0, + "grad_norm": 1.8718978544378502, + "language_loss": 0.73753494, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.76208329, + "num_input_tokens_seen": 196834390, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.21032715, + "step": 9136, + "time_per_iteration": 4.296404123306274 + }, + { + "auxiliary_loss_clip": 0.0143881, + "auxiliary_loss_mlp": 0.01038799, + "balance_loss_clip": 1.26569057, + "balance_loss_mlp": 1.01810431, + "epoch": 0.5493461596272359, + "flos": 22642922073600.0, + "grad_norm": 2.0516947540541826, + "language_loss": 0.69200778, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.71678388, + "num_input_tokens_seen": 196853290, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.20703125, + "step": 9137, + "time_per_iteration": 2.8520023822784424 + }, + { + "auxiliary_loss_clip": 0.01236231, + "auxiliary_loss_mlp": 0.01036186, + "balance_loss_clip": 1.13758874, + "balance_loss_mlp": 1.01777995, + "epoch": 0.5494062828799038, + "flos": 66179243813760.0, + "grad_norm": 0.7565129713164069, + "language_loss": 0.65355521, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.67627937, + "num_input_tokens_seen": 196913120, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.18359375, + "step": 9138, + "time_per_iteration": 3.4025259017944336 + }, + { + "auxiliary_loss_clip": 0.01429049, + "auxiliary_loss_mlp": 0.01035902, + "balance_loss_clip": 1.25995636, + "balance_loss_mlp": 1.01461124, + "epoch": 0.5494664061325718, + "flos": 21115896186240.0, + "grad_norm": 1.7503214316382862, + "language_loss": 0.75811285, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.78276235, + "num_input_tokens_seen": 196931530, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.21289062, + "step": 9139, + "time_per_iteration": 2.8350234031677246 + }, + { + "auxiliary_loss_clip": 0.0140333, + "auxiliary_loss_mlp": 0.01033871, + "balance_loss_clip": 1.23902702, + "balance_loss_mlp": 1.01374853, + "epoch": 0.5495265293852397, + "flos": 14400759409920.0, + "grad_norm": 1.9398652840921373, + "language_loss": 0.72375572, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.7481277, + "num_input_tokens_seen": 196949430, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.20129395, + "step": 9140, + "time_per_iteration": 2.839172840118408 + }, + { + "auxiliary_loss_clip": 0.01406413, + "auxiliary_loss_mlp": 0.01029008, + "balance_loss_clip": 1.2410804, + "balance_loss_mlp": 1.0087539, + "epoch": 0.5495866526379077, + "flos": 25558144243200.0, + "grad_norm": 1.8307502302713003, + "language_loss": 0.77158189, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.79593611, + "num_input_tokens_seen": 196968265, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.20239258, + "step": 9141, + "time_per_iteration": 2.8996851444244385 + }, + { + "auxiliary_loss_clip": 0.01405942, + "auxiliary_loss_mlp": 0.01035662, + "balance_loss_clip": 1.2451148, + "balance_loss_mlp": 1.01581347, + "epoch": 0.5496467758905756, + "flos": 21325271713920.0, + "grad_norm": 2.2492073298881605, + "language_loss": 0.75823903, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.782655, + "num_input_tokens_seen": 196984930, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19848633, + "step": 9142, + "time_per_iteration": 2.897500991821289 + }, + { + "auxiliary_loss_clip": 0.01429356, + "auxiliary_loss_mlp": 0.01035906, + "balance_loss_clip": 1.26061821, + "balance_loss_mlp": 1.0151875, + "epoch": 0.5497068991432437, + "flos": 22242296200320.0, + "grad_norm": 2.2799499677690456, + "language_loss": 0.77517581, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.79982847, + "num_input_tokens_seen": 197002320, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.20703125, + "step": 9143, + "time_per_iteration": 2.837388753890991 + }, + { + "auxiliary_loss_clip": 0.01423348, + "auxiliary_loss_mlp": 0.01038459, + "balance_loss_clip": 1.25788271, + "balance_loss_mlp": 1.01751399, + "epoch": 0.5497670223959116, + "flos": 18488377572480.0, + "grad_norm": 2.4489796133825164, + "language_loss": 0.81579709, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.84041506, + "num_input_tokens_seen": 197020825, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.20935059, + "step": 9144, + "time_per_iteration": 2.8427326679229736 + }, + { + "auxiliary_loss_clip": 0.01430124, + "auxiliary_loss_mlp": 0.01031306, + "balance_loss_clip": 1.26306677, + "balance_loss_mlp": 1.01108837, + "epoch": 0.5498271456485796, + "flos": 29216967004800.0, + "grad_norm": 1.7541524300642592, + "language_loss": 0.71561837, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.74023271, + "num_input_tokens_seen": 197040450, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.20202637, + "step": 9145, + "time_per_iteration": 2.88031005859375 + }, + { + "auxiliary_loss_clip": 0.014272, + "auxiliary_loss_mlp": 0.01037211, + "balance_loss_clip": 1.26067781, + "balance_loss_mlp": 1.01764894, + "epoch": 0.5498872689012475, + "flos": 34837809592320.0, + "grad_norm": 1.4768600797483837, + "language_loss": 0.70829028, + "learning_rate": 1.774398678985076e-06, + "loss": 0.73293436, + "num_input_tokens_seen": 197063930, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.19555664, + "step": 9146, + "time_per_iteration": 2.9555160999298096 + }, + { + "auxiliary_loss_clip": 0.01402181, + "auxiliary_loss_mlp": 0.01035987, + "balance_loss_clip": 1.24050188, + "balance_loss_mlp": 1.0164609, + "epoch": 0.5499473921539155, + "flos": 25933089052800.0, + "grad_norm": 1.7018950182593435, + "language_loss": 0.65263987, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.6770215, + "num_input_tokens_seen": 197082660, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19543457, + "step": 9147, + "time_per_iteration": 2.8988664150238037 + }, + { + "auxiliary_loss_clip": 0.01414235, + "auxiliary_loss_mlp": 0.0103824, + "balance_loss_clip": 1.25014699, + "balance_loss_mlp": 1.01894021, + "epoch": 0.5500075154065835, + "flos": 22283993923200.0, + "grad_norm": 1.940340215326659, + "language_loss": 0.81452781, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.83905256, + "num_input_tokens_seen": 197100675, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19299316, + "step": 9148, + "time_per_iteration": 4.334068775177002 + }, + { + "auxiliary_loss_clip": 0.01429982, + "auxiliary_loss_mlp": 0.01039674, + "balance_loss_clip": 1.263448, + "balance_loss_mlp": 1.01927733, + "epoch": 0.5500676386592515, + "flos": 28049050247040.0, + "grad_norm": 1.7599777543695485, + "language_loss": 0.80094439, + "learning_rate": 1.773237789559453e-06, + "loss": 0.82564098, + "num_input_tokens_seen": 197121320, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.20385742, + "step": 9149, + "time_per_iteration": 4.414827108383179 + }, + { + "auxiliary_loss_clip": 0.01415041, + "auxiliary_loss_mlp": 0.01035742, + "balance_loss_clip": 1.24881113, + "balance_loss_mlp": 1.01627505, + "epoch": 0.5501277619119195, + "flos": 23925344716800.0, + "grad_norm": 2.149773307209368, + "language_loss": 0.73098147, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.75548935, + "num_input_tokens_seen": 197138965, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.19445801, + "step": 9150, + "time_per_iteration": 2.8473238945007324 + }, + { + "auxiliary_loss_clip": 0.01429757, + "auxiliary_loss_mlp": 0.01038615, + "balance_loss_clip": 1.25908589, + "balance_loss_mlp": 1.01758683, + "epoch": 0.5501878851645874, + "flos": 20933740045440.0, + "grad_norm": 1.9085180619360453, + "language_loss": 0.755422, + "learning_rate": 1.772463906245477e-06, + "loss": 0.78010571, + "num_input_tokens_seen": 197156460, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.21020508, + "step": 9151, + "time_per_iteration": 2.802121877670288 + }, + { + "auxiliary_loss_clip": 0.01420162, + "auxiliary_loss_mlp": 0.010356, + "balance_loss_clip": 1.2547307, + "balance_loss_mlp": 1.01637161, + "epoch": 0.5502480084172554, + "flos": 20674386996480.0, + "grad_norm": 1.82344951389052, + "language_loss": 0.76832521, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.7928828, + "num_input_tokens_seen": 197175140, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.19238281, + "step": 9152, + "time_per_iteration": 2.8398048877716064 + }, + { + "auxiliary_loss_clip": 0.01406736, + "auxiliary_loss_mlp": 0.01036036, + "balance_loss_clip": 1.24425817, + "balance_loss_mlp": 1.01656878, + "epoch": 0.5503081316699233, + "flos": 26443786821120.0, + "grad_norm": 2.0160805450962243, + "language_loss": 0.83587551, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.86030328, + "num_input_tokens_seen": 197194345, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19458008, + "step": 9153, + "time_per_iteration": 2.8943891525268555 + }, + { + "auxiliary_loss_clip": 0.01409242, + "auxiliary_loss_mlp": 0.01038569, + "balance_loss_clip": 1.24608862, + "balance_loss_mlp": 1.01853037, + "epoch": 0.5503682549225913, + "flos": 30641657857920.0, + "grad_norm": 2.280596672040626, + "language_loss": 0.75065356, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.7751317, + "num_input_tokens_seen": 197215535, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.20043945, + "step": 9154, + "time_per_iteration": 2.923618793487549 + }, + { + "auxiliary_loss_clip": 0.01431765, + "auxiliary_loss_mlp": 0.01038787, + "balance_loss_clip": 1.26033449, + "balance_loss_mlp": 1.01829457, + "epoch": 0.5504283781752592, + "flos": 22575769511040.0, + "grad_norm": 1.6391757145653894, + "language_loss": 0.73739552, + "learning_rate": 1.770916243273199e-06, + "loss": 0.76210105, + "num_input_tokens_seen": 197234945, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.20495605, + "step": 9155, + "time_per_iteration": 2.8508124351501465 + }, + { + "auxiliary_loss_clip": 0.01233949, + "auxiliary_loss_mlp": 0.01031341, + "balance_loss_clip": 1.13531542, + "balance_loss_mlp": 1.01283979, + "epoch": 0.5504885014279273, + "flos": 67928359017600.0, + "grad_norm": 0.7614755764141152, + "language_loss": 0.55338782, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.57604074, + "num_input_tokens_seen": 197302285, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.18457031, + "step": 9156, + "time_per_iteration": 3.4969427585601807 + }, + { + "auxiliary_loss_clip": 0.0142621, + "auxiliary_loss_mlp": 0.0103734, + "balance_loss_clip": 1.25915956, + "balance_loss_mlp": 1.01705027, + "epoch": 0.5505486246805952, + "flos": 22458820406400.0, + "grad_norm": 1.627020947520391, + "language_loss": 0.83168387, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.85631943, + "num_input_tokens_seen": 197321575, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.20275879, + "step": 9157, + "time_per_iteration": 2.848139762878418 + }, + { + "auxiliary_loss_clip": 0.01432007, + "auxiliary_loss_mlp": 0.01036812, + "balance_loss_clip": 1.26043046, + "balance_loss_mlp": 1.01559293, + "epoch": 0.5506087479332632, + "flos": 26918261487360.0, + "grad_norm": 2.1040743097786896, + "language_loss": 0.76336122, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.78804946, + "num_input_tokens_seen": 197340255, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.2121582, + "step": 9158, + "time_per_iteration": 2.8947582244873047 + }, + { + "auxiliary_loss_clip": 0.01410817, + "auxiliary_loss_mlp": 0.01037218, + "balance_loss_clip": 1.24903095, + "balance_loss_mlp": 1.01742947, + "epoch": 0.5506688711859311, + "flos": 22940669975040.0, + "grad_norm": 1.7124340911438078, + "language_loss": 0.70670128, + "learning_rate": 1.769368719290979e-06, + "loss": 0.73118162, + "num_input_tokens_seen": 197360360, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19799805, + "step": 9159, + "time_per_iteration": 2.9524168968200684 + }, + { + "auxiliary_loss_clip": 0.01413875, + "auxiliary_loss_mlp": 0.01041162, + "balance_loss_clip": 1.24713969, + "balance_loss_mlp": 1.02150428, + "epoch": 0.5507289944385991, + "flos": 29618362039680.0, + "grad_norm": 1.9258338984711891, + "language_loss": 0.69732106, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.72187144, + "num_input_tokens_seen": 197381905, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.19665527, + "step": 9160, + "time_per_iteration": 3.120114803314209 + }, + { + "auxiliary_loss_clip": 0.01414006, + "auxiliary_loss_mlp": 0.01033137, + "balance_loss_clip": 1.24879217, + "balance_loss_mlp": 1.01357424, + "epoch": 0.5507891176912671, + "flos": 15341112230400.0, + "grad_norm": 1.8790133588437596, + "language_loss": 0.72555482, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.75002617, + "num_input_tokens_seen": 197398555, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19567871, + "step": 9161, + "time_per_iteration": 2.8633625507354736 + }, + { + "auxiliary_loss_clip": 0.01426746, + "auxiliary_loss_mlp": 0.01047972, + "balance_loss_clip": 1.26083541, + "balance_loss_mlp": 1.02713454, + "epoch": 0.5508492409439351, + "flos": 26589538880640.0, + "grad_norm": 1.6565803047354553, + "language_loss": 0.70162684, + "learning_rate": 1.768208168081359e-06, + "loss": 0.72637403, + "num_input_tokens_seen": 197419630, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20837402, + "step": 9162, + "time_per_iteration": 2.8852133750915527 + }, + { + "auxiliary_loss_clip": 0.0141087, + "auxiliary_loss_mlp": 0.01039848, + "balance_loss_clip": 1.24679542, + "balance_loss_mlp": 1.01847434, + "epoch": 0.5509093641966031, + "flos": 25453365989760.0, + "grad_norm": 1.70336890007208, + "language_loss": 0.86793834, + "learning_rate": 1.767821335237733e-06, + "loss": 0.8924455, + "num_input_tokens_seen": 197438480, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.21374512, + "step": 9163, + "time_per_iteration": 2.8761379718780518 + }, + { + "auxiliary_loss_clip": 0.01413455, + "auxiliary_loss_mlp": 0.01034031, + "balance_loss_clip": 1.25058794, + "balance_loss_mlp": 1.01474321, + "epoch": 0.550969487449271, + "flos": 18708023669760.0, + "grad_norm": 1.66229913442318, + "language_loss": 0.81547582, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.83995068, + "num_input_tokens_seen": 197456755, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.19299316, + "step": 9164, + "time_per_iteration": 2.8426811695098877 + }, + { + "auxiliary_loss_clip": 0.01427339, + "auxiliary_loss_mlp": 0.01039884, + "balance_loss_clip": 1.25873387, + "balance_loss_mlp": 1.01966584, + "epoch": 0.551029610701939, + "flos": 22718399679360.0, + "grad_norm": 2.033234516712243, + "language_loss": 0.74490082, + "learning_rate": 1.767047695977863e-06, + "loss": 0.76957309, + "num_input_tokens_seen": 197475530, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.20214844, + "step": 9165, + "time_per_iteration": 2.8632712364196777 + }, + { + "auxiliary_loss_clip": 0.01403905, + "auxiliary_loss_mlp": 0.01034404, + "balance_loss_clip": 1.24078774, + "balance_loss_mlp": 1.01517546, + "epoch": 0.5510897339546069, + "flos": 12427699852800.0, + "grad_norm": 1.9588861113560423, + "language_loss": 0.80525649, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.82963955, + "num_input_tokens_seen": 197490835, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19226074, + "step": 9166, + "time_per_iteration": 2.806156635284424 + }, + { + "auxiliary_loss_clip": 0.01423109, + "auxiliary_loss_mlp": 0.01030747, + "balance_loss_clip": 1.25453532, + "balance_loss_mlp": 1.01091015, + "epoch": 0.5511498572072749, + "flos": 18779836446720.0, + "grad_norm": 2.306423290274891, + "language_loss": 0.77205682, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.79659545, + "num_input_tokens_seen": 197508770, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.19836426, + "step": 9167, + "time_per_iteration": 2.9085423946380615 + }, + { + "auxiliary_loss_clip": 0.01408328, + "auxiliary_loss_mlp": 0.0103381, + "balance_loss_clip": 1.24267852, + "balance_loss_mlp": 1.01315069, + "epoch": 0.5512099804599428, + "flos": 19582807495680.0, + "grad_norm": 1.9337746296782128, + "language_loss": 0.81831932, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.84274071, + "num_input_tokens_seen": 197527340, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.20654297, + "step": 9168, + "time_per_iteration": 2.9388225078582764 + }, + { + "auxiliary_loss_clip": 0.01436254, + "auxiliary_loss_mlp": 0.01040509, + "balance_loss_clip": 1.26672506, + "balance_loss_mlp": 1.01930153, + "epoch": 0.5512701037126109, + "flos": 26255613121920.0, + "grad_norm": 1.629054889370561, + "language_loss": 0.69553053, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.72029817, + "num_input_tokens_seen": 197547280, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.2121582, + "step": 9169, + "time_per_iteration": 2.9172489643096924 + }, + { + "auxiliary_loss_clip": 0.01402142, + "auxiliary_loss_mlp": 0.010331, + "balance_loss_clip": 1.2414552, + "balance_loss_mlp": 1.0139904, + "epoch": 0.5513302269652788, + "flos": 21955633274880.0, + "grad_norm": 2.722099259582532, + "language_loss": 0.85652047, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.88087296, + "num_input_tokens_seen": 197565045, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19104004, + "step": 9170, + "time_per_iteration": 2.880378246307373 + }, + { + "auxiliary_loss_clip": 0.01226713, + "auxiliary_loss_mlp": 0.01034482, + "balance_loss_clip": 1.12763071, + "balance_loss_mlp": 1.0156945, + "epoch": 0.5513903502179468, + "flos": 68265089953920.0, + "grad_norm": 0.7909924409821402, + "language_loss": 0.60044181, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.62305367, + "num_input_tokens_seen": 197625005, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.1875, + "step": 9171, + "time_per_iteration": 4.806863784790039 + }, + { + "auxiliary_loss_clip": 0.01418167, + "auxiliary_loss_mlp": 0.01039062, + "balance_loss_clip": 1.25302601, + "balance_loss_mlp": 1.01944017, + "epoch": 0.5514504734706147, + "flos": 18743522855040.0, + "grad_norm": 1.6256597504623487, + "language_loss": 0.7148512, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.73942351, + "num_input_tokens_seen": 197645050, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19628906, + "step": 9172, + "time_per_iteration": 2.872443675994873 + }, + { + "auxiliary_loss_clip": 0.01412787, + "auxiliary_loss_mlp": 0.01037618, + "balance_loss_clip": 1.24724579, + "balance_loss_mlp": 1.01768589, + "epoch": 0.5515105967232827, + "flos": 22280555318400.0, + "grad_norm": 1.7573338556975262, + "language_loss": 0.77055025, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.79505432, + "num_input_tokens_seen": 197663910, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19921875, + "step": 9173, + "time_per_iteration": 2.874814748764038 + }, + { + "auxiliary_loss_clip": 0.01408263, + "auxiliary_loss_mlp": 0.0103403, + "balance_loss_clip": 1.24424613, + "balance_loss_mlp": 1.0150162, + "epoch": 0.5515707199759508, + "flos": 22565996634240.0, + "grad_norm": 2.588638353416107, + "language_loss": 0.76004046, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.78446341, + "num_input_tokens_seen": 197681580, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19018555, + "step": 9174, + "time_per_iteration": 2.855783700942993 + }, + { + "auxiliary_loss_clip": 0.01419899, + "auxiliary_loss_mlp": 0.01035483, + "balance_loss_clip": 1.25249386, + "balance_loss_mlp": 1.01592088, + "epoch": 0.5516308432286187, + "flos": 28302204758400.0, + "grad_norm": 2.0022832172105107, + "language_loss": 0.73603451, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.76058829, + "num_input_tokens_seen": 197702095, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.19567871, + "step": 9175, + "time_per_iteration": 2.9203622341156006 + }, + { + "auxiliary_loss_clip": 0.01408131, + "auxiliary_loss_mlp": 0.01034136, + "balance_loss_clip": 1.2433548, + "balance_loss_mlp": 1.01490748, + "epoch": 0.5516909664812867, + "flos": 18772144830720.0, + "grad_norm": 1.8413960100493747, + "language_loss": 0.6973623, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.72178495, + "num_input_tokens_seen": 197720720, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19238281, + "step": 9176, + "time_per_iteration": 2.8106584548950195 + }, + { + "auxiliary_loss_clip": 0.01408194, + "auxiliary_loss_mlp": 0.01032961, + "balance_loss_clip": 1.24547851, + "balance_loss_mlp": 1.01283789, + "epoch": 0.5517510897339546, + "flos": 27750442694400.0, + "grad_norm": 1.6354286188960747, + "language_loss": 0.71338803, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.73779958, + "num_input_tokens_seen": 197741820, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.20117188, + "step": 9177, + "time_per_iteration": 2.897047758102417 + }, + { + "auxiliary_loss_clip": 0.01416365, + "auxiliary_loss_mlp": 0.01035412, + "balance_loss_clip": 1.25022936, + "balance_loss_mlp": 1.01518226, + "epoch": 0.5518112129866226, + "flos": 18413307169920.0, + "grad_norm": 1.6774299880871542, + "language_loss": 0.80288601, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.82740378, + "num_input_tokens_seen": 197759160, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.20227051, + "step": 9178, + "time_per_iteration": 2.8258657455444336 + }, + { + "auxiliary_loss_clip": 0.01432263, + "auxiliary_loss_mlp": 0.01039888, + "balance_loss_clip": 1.26222777, + "balance_loss_mlp": 1.01937175, + "epoch": 0.5518713362392905, + "flos": 25093578188160.0, + "grad_norm": 1.5941628618138666, + "language_loss": 0.76223242, + "learning_rate": 1.761633217089826e-06, + "loss": 0.78695393, + "num_input_tokens_seen": 197779760, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.20507812, + "step": 9179, + "time_per_iteration": 2.8562726974487305 + }, + { + "auxiliary_loss_clip": 0.0141435, + "auxiliary_loss_mlp": 0.01035735, + "balance_loss_clip": 1.2493217, + "balance_loss_mlp": 1.01604152, + "epoch": 0.5519314594919585, + "flos": 36552828199680.0, + "grad_norm": 1.6896127172281417, + "language_loss": 0.70872533, + "learning_rate": 1.761246535912924e-06, + "loss": 0.73322618, + "num_input_tokens_seen": 197801545, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19702148, + "step": 9180, + "time_per_iteration": 2.967099189758301 + }, + { + "auxiliary_loss_clip": 0.01416677, + "auxiliary_loss_mlp": 0.01037261, + "balance_loss_clip": 1.25074077, + "balance_loss_mlp": 1.01628029, + "epoch": 0.5519915827446265, + "flos": 20458224748800.0, + "grad_norm": 1.764710868450375, + "language_loss": 0.68027431, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.70481372, + "num_input_tokens_seen": 197820760, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.2097168, + "step": 9181, + "time_per_iteration": 2.846630096435547 + }, + { + "auxiliary_loss_clip": 0.01431307, + "auxiliary_loss_mlp": 0.01036471, + "balance_loss_clip": 1.25971186, + "balance_loss_mlp": 1.01554966, + "epoch": 0.5520517059972945, + "flos": 23778325802880.0, + "grad_norm": 2.35549704852929, + "language_loss": 0.79626477, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.82094252, + "num_input_tokens_seen": 197840195, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.20922852, + "step": 9182, + "time_per_iteration": 2.892591714859009 + }, + { + "auxiliary_loss_clip": 0.01417203, + "auxiliary_loss_mlp": 0.01033096, + "balance_loss_clip": 1.24898088, + "balance_loss_mlp": 1.01315212, + "epoch": 0.5521118292499624, + "flos": 22205620650240.0, + "grad_norm": 3.16564790796765, + "language_loss": 0.83848405, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.86298704, + "num_input_tokens_seen": 197859475, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.19934082, + "step": 9183, + "time_per_iteration": 4.289189100265503 + }, + { + "auxiliary_loss_clip": 0.01405937, + "auxiliary_loss_mlp": 0.01031258, + "balance_loss_clip": 1.24113655, + "balance_loss_mlp": 1.01176751, + "epoch": 0.5521719525026304, + "flos": 23592912036480.0, + "grad_norm": 1.288063980745063, + "language_loss": 0.6804738, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.70484579, + "num_input_tokens_seen": 197879395, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19482422, + "step": 9184, + "time_per_iteration": 4.278069257736206 + }, + { + "auxiliary_loss_clip": 0.0141068, + "auxiliary_loss_mlp": 0.01032395, + "balance_loss_clip": 1.24536335, + "balance_loss_mlp": 1.0117358, + "epoch": 0.5522320757552983, + "flos": 26147984446080.0, + "grad_norm": 1.5729762248420704, + "language_loss": 0.77150178, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.79593253, + "num_input_tokens_seen": 197900815, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.20654297, + "step": 9185, + "time_per_iteration": 4.260830402374268 + }, + { + "auxiliary_loss_clip": 0.01419899, + "auxiliary_loss_mlp": 0.01043444, + "balance_loss_clip": 1.25253344, + "balance_loss_mlp": 1.02308273, + "epoch": 0.5522921990079663, + "flos": 24685622657280.0, + "grad_norm": 1.9935410701657963, + "language_loss": 0.74460506, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.76923847, + "num_input_tokens_seen": 197918985, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.20361328, + "step": 9186, + "time_per_iteration": 2.8752036094665527 + }, + { + "auxiliary_loss_clip": 0.01429227, + "auxiliary_loss_mlp": 0.01037071, + "balance_loss_clip": 1.26080596, + "balance_loss_mlp": 1.01731753, + "epoch": 0.5523523222606344, + "flos": 22758604323840.0, + "grad_norm": 2.1455094395112217, + "language_loss": 0.66742527, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.69208819, + "num_input_tokens_seen": 197937725, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.19763184, + "step": 9187, + "time_per_iteration": 2.8720691204071045 + }, + { + "auxiliary_loss_clip": 0.01424505, + "auxiliary_loss_mlp": 0.01034883, + "balance_loss_clip": 1.25719237, + "balance_loss_mlp": 1.01523757, + "epoch": 0.5524124455133023, + "flos": 19765370839680.0, + "grad_norm": 4.998499500343574, + "language_loss": 0.78588784, + "learning_rate": 1.758153413657318e-06, + "loss": 0.81048173, + "num_input_tokens_seen": 197955635, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.19665527, + "step": 9188, + "time_per_iteration": 2.8594326972961426 + }, + { + "auxiliary_loss_clip": 0.01416663, + "auxiliary_loss_mlp": 0.01034802, + "balance_loss_clip": 1.25061989, + "balance_loss_mlp": 1.01518047, + "epoch": 0.5524725687659703, + "flos": 23305253725440.0, + "grad_norm": 1.8102656894487326, + "language_loss": 0.82758927, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.85210389, + "num_input_tokens_seen": 197974490, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19628906, + "step": 9189, + "time_per_iteration": 2.8807060718536377 + }, + { + "auxiliary_loss_clip": 0.01416304, + "auxiliary_loss_mlp": 0.01032744, + "balance_loss_clip": 1.25225127, + "balance_loss_mlp": 1.01218045, + "epoch": 0.5525326920186382, + "flos": 24872619991680.0, + "grad_norm": 1.3154899229975119, + "language_loss": 0.77203107, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.79652154, + "num_input_tokens_seen": 197995735, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.20544434, + "step": 9190, + "time_per_iteration": 2.869014263153076 + }, + { + "auxiliary_loss_clip": 0.0143361, + "auxiliary_loss_mlp": 0.01034756, + "balance_loss_clip": 1.26100039, + "balance_loss_mlp": 1.01449072, + "epoch": 0.5525928152713062, + "flos": 13743540420480.0, + "grad_norm": 2.7238300374719056, + "language_loss": 0.80434698, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.82903063, + "num_input_tokens_seen": 198009685, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.20263672, + "step": 9191, + "time_per_iteration": 2.8163223266601562 + }, + { + "auxiliary_loss_clip": 0.01420573, + "auxiliary_loss_mlp": 0.0103817, + "balance_loss_clip": 1.25472331, + "balance_loss_mlp": 1.0184412, + "epoch": 0.5526529385239741, + "flos": 13077498695040.0, + "grad_norm": 2.069404222315648, + "language_loss": 0.69438744, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.71897489, + "num_input_tokens_seen": 198026845, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.19714355, + "step": 9192, + "time_per_iteration": 2.8063876628875732 + }, + { + "auxiliary_loss_clip": 0.01414292, + "auxiliary_loss_mlp": 0.01035913, + "balance_loss_clip": 1.25039577, + "balance_loss_mlp": 1.01669693, + "epoch": 0.5527130617766421, + "flos": 23158642014720.0, + "grad_norm": 2.013441039073215, + "language_loss": 0.7815702, + "learning_rate": 1.756220509823588e-06, + "loss": 0.80607224, + "num_input_tokens_seen": 198045275, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19226074, + "step": 9193, + "time_per_iteration": 2.836169958114624 + }, + { + "auxiliary_loss_clip": 0.01410259, + "auxiliary_loss_mlp": 0.01036318, + "balance_loss_clip": 1.24575758, + "balance_loss_mlp": 1.01665998, + "epoch": 0.55277318502931, + "flos": 21294794701440.0, + "grad_norm": 1.9676124675269684, + "language_loss": 0.79464185, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.81910759, + "num_input_tokens_seen": 198065760, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.19641113, + "step": 9194, + "time_per_iteration": 2.8515470027923584 + }, + { + "auxiliary_loss_clip": 0.0143172, + "auxiliary_loss_mlp": 0.01037527, + "balance_loss_clip": 1.2583909, + "balance_loss_mlp": 1.01715374, + "epoch": 0.5528333082819781, + "flos": 38338256995200.0, + "grad_norm": 2.0012023834639265, + "language_loss": 0.70325541, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.72794795, + "num_input_tokens_seen": 198087595, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.20361328, + "step": 9195, + "time_per_iteration": 2.995591402053833 + }, + { + "auxiliary_loss_clip": 0.01432579, + "auxiliary_loss_mlp": 0.01034829, + "balance_loss_clip": 1.25924468, + "balance_loss_mlp": 1.01481342, + "epoch": 0.552893431534646, + "flos": 13561655748480.0, + "grad_norm": 4.437874115784483, + "language_loss": 0.7518003, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.77647436, + "num_input_tokens_seen": 198104620, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.20019531, + "step": 9196, + "time_per_iteration": 2.8335769176483154 + }, + { + "auxiliary_loss_clip": 0.01417032, + "auxiliary_loss_mlp": 0.01037922, + "balance_loss_clip": 1.25215244, + "balance_loss_mlp": 1.0178237, + "epoch": 0.552953554787314, + "flos": 21948077393280.0, + "grad_norm": 1.5009993245466737, + "language_loss": 0.77504855, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.7995981, + "num_input_tokens_seen": 198123565, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.20092773, + "step": 9197, + "time_per_iteration": 2.836165428161621 + }, + { + "auxiliary_loss_clip": 0.01409879, + "auxiliary_loss_mlp": 0.01032227, + "balance_loss_clip": 1.24623275, + "balance_loss_mlp": 1.0129745, + "epoch": 0.5530136780399819, + "flos": 43674789386880.0, + "grad_norm": 1.5022509385703051, + "language_loss": 0.76980966, + "learning_rate": 1.754287837093407e-06, + "loss": 0.7942307, + "num_input_tokens_seen": 198148270, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.19238281, + "step": 9198, + "time_per_iteration": 3.1392149925231934 + }, + { + "auxiliary_loss_clip": 0.01414737, + "auxiliary_loss_mlp": 0.01032642, + "balance_loss_clip": 1.2492981, + "balance_loss_mlp": 1.01415205, + "epoch": 0.5530738012926499, + "flos": 25056223966080.0, + "grad_norm": 8.912757707948662, + "language_loss": 0.79782701, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.82230079, + "num_input_tokens_seen": 198168810, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.18481445, + "step": 9199, + "time_per_iteration": 2.854294538497925 + }, + { + "auxiliary_loss_clip": 0.01406636, + "auxiliary_loss_mlp": 0.01035949, + "balance_loss_clip": 1.24257565, + "balance_loss_mlp": 1.01703012, + "epoch": 0.553133924545318, + "flos": 16480768970880.0, + "grad_norm": 1.801779231346644, + "language_loss": 0.6401788, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.66460466, + "num_input_tokens_seen": 198186200, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.18908691, + "step": 9200, + "time_per_iteration": 2.8390984535217285 + }, + { + "auxiliary_loss_clip": 0.01429496, + "auxiliary_loss_mlp": 0.01037068, + "balance_loss_clip": 1.25890565, + "balance_loss_mlp": 1.01605117, + "epoch": 0.5531940477979859, + "flos": 24616434078720.0, + "grad_norm": 1.454438209389934, + "language_loss": 0.66725814, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.6919238, + "num_input_tokens_seen": 198207050, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.21020508, + "step": 9201, + "time_per_iteration": 2.880108594894409 + }, + { + "auxiliary_loss_clip": 0.01429662, + "auxiliary_loss_mlp": 0.01038527, + "balance_loss_clip": 1.26409912, + "balance_loss_mlp": 1.01801109, + "epoch": 0.5532541710506539, + "flos": 22168628386560.0, + "grad_norm": 2.2356712301169224, + "language_loss": 0.61858606, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.64326793, + "num_input_tokens_seen": 198224565, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20532227, + "step": 9202, + "time_per_iteration": 2.876905918121338 + }, + { + "auxiliary_loss_clip": 0.01399678, + "auxiliary_loss_mlp": 0.01035042, + "balance_loss_clip": 1.2379663, + "balance_loss_mlp": 1.01515841, + "epoch": 0.5533142943033218, + "flos": 21407128836480.0, + "grad_norm": 1.4704429421879965, + "language_loss": 0.65283406, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.67718124, + "num_input_tokens_seen": 198244790, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19873047, + "step": 9203, + "time_per_iteration": 2.8461949825286865 + }, + { + "auxiliary_loss_clip": 0.01416522, + "auxiliary_loss_mlp": 0.01033856, + "balance_loss_clip": 1.25109816, + "balance_loss_mlp": 1.01380539, + "epoch": 0.5533744175559898, + "flos": 23561122924800.0, + "grad_norm": 1.8391345588480252, + "language_loss": 0.64408815, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.66859198, + "num_input_tokens_seen": 198264375, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.20068359, + "step": 9204, + "time_per_iteration": 3.003610372543335 + }, + { + "auxiliary_loss_clip": 0.01398033, + "auxiliary_loss_mlp": 0.01032363, + "balance_loss_clip": 1.23652864, + "balance_loss_mlp": 1.01308644, + "epoch": 0.5534345408086577, + "flos": 24072273141120.0, + "grad_norm": 2.00345854403708, + "language_loss": 0.78001666, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.80432057, + "num_input_tokens_seen": 198283895, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19274902, + "step": 9205, + "time_per_iteration": 2.8933606147766113 + }, + { + "auxiliary_loss_clip": 0.01398523, + "auxiliary_loss_mlp": 0.01035955, + "balance_loss_clip": 1.23779058, + "balance_loss_mlp": 1.01549828, + "epoch": 0.5534946640613257, + "flos": 33786434736000.0, + "grad_norm": 1.8840794173613205, + "language_loss": 0.72919661, + "learning_rate": 1.751196045993537e-06, + "loss": 0.75354135, + "num_input_tokens_seen": 198310035, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.20446777, + "step": 9206, + "time_per_iteration": 4.523120880126953 + }, + { + "auxiliary_loss_clip": 0.01418609, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.25232375, + "balance_loss_mlp": 1.01723647, + "epoch": 0.5535547873139937, + "flos": 15167733580800.0, + "grad_norm": 2.0369270287556245, + "language_loss": 0.76077515, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.78532898, + "num_input_tokens_seen": 198327810, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.19543457, + "step": 9207, + "time_per_iteration": 2.850191116333008 + }, + { + "auxiliary_loss_clip": 0.0142886, + "auxiliary_loss_mlp": 0.01033908, + "balance_loss_clip": 1.25689507, + "balance_loss_mlp": 1.01246262, + "epoch": 0.5536149105666617, + "flos": 16989159254400.0, + "grad_norm": 2.3519025494515793, + "language_loss": 0.63673484, + "learning_rate": 1.750423192272189e-06, + "loss": 0.66136253, + "num_input_tokens_seen": 198343150, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.21435547, + "step": 9208, + "time_per_iteration": 2.939732074737549 + }, + { + "auxiliary_loss_clip": 0.01421846, + "auxiliary_loss_mlp": 0.01034299, + "balance_loss_clip": 1.25171947, + "balance_loss_mlp": 1.01470089, + "epoch": 0.5536750338193296, + "flos": 18158931048960.0, + "grad_norm": 2.0255170405615397, + "language_loss": 0.65221727, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.67677873, + "num_input_tokens_seen": 198360925, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.19604492, + "step": 9209, + "time_per_iteration": 2.8680806159973145 + }, + { + "auxiliary_loss_clip": 0.0141018, + "auxiliary_loss_mlp": 0.01036845, + "balance_loss_clip": 1.2452929, + "balance_loss_mlp": 1.01600742, + "epoch": 0.5537351570719976, + "flos": 22758151875840.0, + "grad_norm": 2.2166993799110406, + "language_loss": 0.83873928, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.86320949, + "num_input_tokens_seen": 198379265, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.20849609, + "step": 9210, + "time_per_iteration": 2.8926303386688232 + }, + { + "auxiliary_loss_clip": 0.01416244, + "auxiliary_loss_mlp": 0.01034615, + "balance_loss_clip": 1.25239062, + "balance_loss_mlp": 1.0143609, + "epoch": 0.5537952803246655, + "flos": 26366318444160.0, + "grad_norm": 1.7220019171202197, + "language_loss": 0.73350996, + "learning_rate": 1.74926398270663e-06, + "loss": 0.75801861, + "num_input_tokens_seen": 198399490, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.20251465, + "step": 9211, + "time_per_iteration": 2.927570104598999 + }, + { + "auxiliary_loss_clip": 0.01424956, + "auxiliary_loss_mlp": 0.01036576, + "balance_loss_clip": 1.25442648, + "balance_loss_mlp": 1.01600051, + "epoch": 0.5538554035773335, + "flos": 18045601528320.0, + "grad_norm": 2.1088253189996737, + "language_loss": 0.6796459, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.70426118, + "num_input_tokens_seen": 198419110, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.20568848, + "step": 9212, + "time_per_iteration": 2.866010904312134 + }, + { + "auxiliary_loss_clip": 0.0141657, + "auxiliary_loss_mlp": 0.01033657, + "balance_loss_clip": 1.24584687, + "balance_loss_mlp": 1.01279509, + "epoch": 0.5539155268300014, + "flos": 31698778803840.0, + "grad_norm": 1.4160599044896138, + "language_loss": 0.52450621, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.54900849, + "num_input_tokens_seen": 198441360, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.20849609, + "step": 9213, + "time_per_iteration": 2.9189345836639404 + }, + { + "auxiliary_loss_clip": 0.01421778, + "auxiliary_loss_mlp": 0.01037358, + "balance_loss_clip": 1.25391328, + "balance_loss_mlp": 1.01708055, + "epoch": 0.5539756500826695, + "flos": 15201784932480.0, + "grad_norm": 1.8412303452432042, + "language_loss": 0.86802304, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.89261436, + "num_input_tokens_seen": 198459835, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.20288086, + "step": 9214, + "time_per_iteration": 2.8368709087371826 + }, + { + "auxiliary_loss_clip": 0.01406519, + "auxiliary_loss_mlp": 0.01034478, + "balance_loss_clip": 1.24217498, + "balance_loss_mlp": 1.01462984, + "epoch": 0.5540357733353375, + "flos": 26362653615360.0, + "grad_norm": 1.623618854730382, + "language_loss": 0.70911413, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.73352408, + "num_input_tokens_seen": 198478955, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.1986084, + "step": 9215, + "time_per_iteration": 2.8909409046173096 + }, + { + "auxiliary_loss_clip": 0.01421575, + "auxiliary_loss_mlp": 0.01035962, + "balance_loss_clip": 1.25242436, + "balance_loss_mlp": 1.0156244, + "epoch": 0.5540958965880054, + "flos": 21333370533120.0, + "grad_norm": 1.594206212948758, + "language_loss": 0.73872721, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.76330256, + "num_input_tokens_seen": 198499030, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.20349121, + "step": 9216, + "time_per_iteration": 2.903143882751465 + }, + { + "auxiliary_loss_clip": 0.0140819, + "auxiliary_loss_mlp": 0.01041681, + "balance_loss_clip": 1.24578953, + "balance_loss_mlp": 1.02172542, + "epoch": 0.5541560198406734, + "flos": 25677989015040.0, + "grad_norm": 2.4403332655548304, + "language_loss": 0.72662246, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.75112116, + "num_input_tokens_seen": 198520265, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19946289, + "step": 9217, + "time_per_iteration": 2.861161708831787 + }, + { + "auxiliary_loss_clip": 0.01415335, + "auxiliary_loss_mlp": 0.01032628, + "balance_loss_clip": 1.25100279, + "balance_loss_mlp": 1.01333976, + "epoch": 0.5542161430933413, + "flos": 21948348862080.0, + "grad_norm": 1.6659257658059903, + "language_loss": 0.78924412, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.81372368, + "num_input_tokens_seen": 198539645, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19287109, + "step": 9218, + "time_per_iteration": 4.258141279220581 + }, + { + "auxiliary_loss_clip": 0.01421345, + "auxiliary_loss_mlp": 0.01036726, + "balance_loss_clip": 1.25280154, + "balance_loss_mlp": 1.01549494, + "epoch": 0.5542762663460093, + "flos": 19580454766080.0, + "grad_norm": 2.56049766497957, + "language_loss": 0.73069715, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.75527787, + "num_input_tokens_seen": 198558710, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.2121582, + "step": 9219, + "time_per_iteration": 5.690054178237915 + }, + { + "auxiliary_loss_clip": 0.01428171, + "auxiliary_loss_mlp": 0.01036113, + "balance_loss_clip": 1.25945377, + "balance_loss_mlp": 1.01529872, + "epoch": 0.5543363895986773, + "flos": 19508189541120.0, + "grad_norm": 1.4930965608391995, + "language_loss": 0.7224474, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.74709022, + "num_input_tokens_seen": 198577050, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.20812988, + "step": 9220, + "time_per_iteration": 2.9086062908172607 + }, + { + "auxiliary_loss_clip": 0.01409959, + "auxiliary_loss_mlp": 0.01033058, + "balance_loss_clip": 1.24628723, + "balance_loss_mlp": 1.01288795, + "epoch": 0.5543965128513453, + "flos": 22645229558400.0, + "grad_norm": 1.610922436656548, + "language_loss": 0.80456412, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.82899433, + "num_input_tokens_seen": 198595290, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.20166016, + "step": 9221, + "time_per_iteration": 2.842595338821411 + }, + { + "auxiliary_loss_clip": 0.01412032, + "auxiliary_loss_mlp": 0.01040164, + "balance_loss_clip": 1.24885941, + "balance_loss_mlp": 1.01937342, + "epoch": 0.5544566361040132, + "flos": 25999834412160.0, + "grad_norm": 1.7039732439949604, + "language_loss": 0.84409475, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.8686167, + "num_input_tokens_seen": 198614110, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.20800781, + "step": 9222, + "time_per_iteration": 2.8829965591430664 + }, + { + "auxiliary_loss_clip": 0.01419257, + "auxiliary_loss_mlp": 0.01041256, + "balance_loss_clip": 1.24984837, + "balance_loss_mlp": 1.02021563, + "epoch": 0.5545167593566812, + "flos": 28269465505920.0, + "grad_norm": 1.7254794986261688, + "language_loss": 0.76626468, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.79086977, + "num_input_tokens_seen": 198633880, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.21032715, + "step": 9223, + "time_per_iteration": 2.8942532539367676 + }, + { + "auxiliary_loss_clip": 0.01417053, + "auxiliary_loss_mlp": 0.01037413, + "balance_loss_clip": 1.24959016, + "balance_loss_mlp": 1.01695633, + "epoch": 0.5545768826093491, + "flos": 28488523420800.0, + "grad_norm": 1.507401974586855, + "language_loss": 0.8256005, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.85014516, + "num_input_tokens_seen": 198653505, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.20458984, + "step": 9224, + "time_per_iteration": 2.9010984897613525 + }, + { + "auxiliary_loss_clip": 0.01422363, + "auxiliary_loss_mlp": 0.01040041, + "balance_loss_clip": 1.25489759, + "balance_loss_mlp": 1.01984715, + "epoch": 0.5546370058620171, + "flos": 18487065473280.0, + "grad_norm": 1.975020045648244, + "language_loss": 0.58196604, + "learning_rate": 1.743855475904141e-06, + "loss": 0.60659015, + "num_input_tokens_seen": 198671890, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.20202637, + "step": 9225, + "time_per_iteration": 2.8291940689086914 + }, + { + "auxiliary_loss_clip": 0.01418124, + "auxiliary_loss_mlp": 0.01035641, + "balance_loss_clip": 1.24927175, + "balance_loss_mlp": 1.01576853, + "epoch": 0.554697129114685, + "flos": 22940986688640.0, + "grad_norm": 1.5678424600177723, + "language_loss": 0.68233633, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.70687395, + "num_input_tokens_seen": 198691995, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.1986084, + "step": 9226, + "time_per_iteration": 2.8624730110168457 + }, + { + "auxiliary_loss_clip": 0.01419819, + "auxiliary_loss_mlp": 0.01037079, + "balance_loss_clip": 1.25186694, + "balance_loss_mlp": 1.01707602, + "epoch": 0.5547572523673531, + "flos": 21806759324160.0, + "grad_norm": 4.618002408653813, + "language_loss": 0.7545352, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.77910411, + "num_input_tokens_seen": 198712440, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.1998291, + "step": 9227, + "time_per_iteration": 2.8682520389556885 + }, + { + "auxiliary_loss_clip": 0.01435506, + "auxiliary_loss_mlp": 0.01033776, + "balance_loss_clip": 1.26647329, + "balance_loss_mlp": 1.01417756, + "epoch": 0.5548173756200211, + "flos": 22352141871360.0, + "grad_norm": 1.5576251326732928, + "language_loss": 0.74364966, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.76834249, + "num_input_tokens_seen": 198731515, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.19604492, + "step": 9228, + "time_per_iteration": 2.980915069580078 + }, + { + "auxiliary_loss_clip": 0.01432521, + "auxiliary_loss_mlp": 0.01033507, + "balance_loss_clip": 1.26523566, + "balance_loss_mlp": 1.01358664, + "epoch": 0.554877498872689, + "flos": 17867834133120.0, + "grad_norm": 1.7808331930101902, + "language_loss": 0.76950371, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.79416406, + "num_input_tokens_seen": 198749750, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.19909668, + "step": 9229, + "time_per_iteration": 2.850552558898926 + }, + { + "auxiliary_loss_clip": 0.01425061, + "auxiliary_loss_mlp": 0.01045709, + "balance_loss_clip": 1.25621772, + "balance_loss_mlp": 1.02481198, + "epoch": 0.554937622125357, + "flos": 17247245448960.0, + "grad_norm": 1.4252960391919423, + "language_loss": 0.69392419, + "learning_rate": 1.741924325613172e-06, + "loss": 0.71863192, + "num_input_tokens_seen": 198768320, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.20910645, + "step": 9230, + "time_per_iteration": 2.8476011753082275 + }, + { + "auxiliary_loss_clip": 0.01432851, + "auxiliary_loss_mlp": 0.01038543, + "balance_loss_clip": 1.26247191, + "balance_loss_mlp": 1.01821828, + "epoch": 0.5549977453780249, + "flos": 25377390691200.0, + "grad_norm": 2.4032648541210895, + "language_loss": 0.69675225, + "learning_rate": 1.741538124855163e-06, + "loss": 0.72146624, + "num_input_tokens_seen": 198787230, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.20324707, + "step": 9231, + "time_per_iteration": 2.8714869022369385 + }, + { + "auxiliary_loss_clip": 0.01444207, + "auxiliary_loss_mlp": 0.01038759, + "balance_loss_clip": 1.27080202, + "balance_loss_mlp": 1.01804042, + "epoch": 0.555057868630693, + "flos": 25089098952960.0, + "grad_norm": 1.7495695639287634, + "language_loss": 0.7849642, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.80979389, + "num_input_tokens_seen": 198806720, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.20727539, + "step": 9232, + "time_per_iteration": 2.882852077484131 + }, + { + "auxiliary_loss_clip": 0.01414326, + "auxiliary_loss_mlp": 0.01032466, + "balance_loss_clip": 1.2501204, + "balance_loss_mlp": 1.01311815, + "epoch": 0.5551179918833609, + "flos": 26115380928000.0, + "grad_norm": 1.7272922386620237, + "language_loss": 0.83804309, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.86251104, + "num_input_tokens_seen": 198826235, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19335938, + "step": 9233, + "time_per_iteration": 2.8633804321289062 + }, + { + "auxiliary_loss_clip": 0.01437391, + "auxiliary_loss_mlp": 0.01038595, + "balance_loss_clip": 1.26534951, + "balance_loss_mlp": 1.01875877, + "epoch": 0.5551781151360289, + "flos": 19392597780480.0, + "grad_norm": 2.024008225896809, + "language_loss": 0.76064992, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.78540981, + "num_input_tokens_seen": 198842655, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.19848633, + "step": 9234, + "time_per_iteration": 2.841935634613037 + }, + { + "auxiliary_loss_clip": 0.01419435, + "auxiliary_loss_mlp": 0.01034162, + "balance_loss_clip": 1.2537241, + "balance_loss_mlp": 1.01511276, + "epoch": 0.5552382383886968, + "flos": 21735987177600.0, + "grad_norm": 1.9233048336552236, + "language_loss": 0.66656262, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.69109857, + "num_input_tokens_seen": 198861210, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.19055176, + "step": 9235, + "time_per_iteration": 2.842550754547119 + }, + { + "auxiliary_loss_clip": 0.01429017, + "auxiliary_loss_mlp": 0.01035003, + "balance_loss_clip": 1.26183391, + "balance_loss_mlp": 1.01493979, + "epoch": 0.5552983616413648, + "flos": 14364400573440.0, + "grad_norm": 1.8626865383138118, + "language_loss": 0.69477957, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.71941978, + "num_input_tokens_seen": 198880045, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.20068359, + "step": 9236, + "time_per_iteration": 2.8254916667938232 + }, + { + "auxiliary_loss_clip": 0.01418376, + "auxiliary_loss_mlp": 0.01036075, + "balance_loss_clip": 1.25495124, + "balance_loss_mlp": 1.0165484, + "epoch": 0.5553584848940327, + "flos": 25488141258240.0, + "grad_norm": 1.572757364299027, + "language_loss": 0.86476958, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.88931406, + "num_input_tokens_seen": 198900210, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19519043, + "step": 9237, + "time_per_iteration": 2.870772361755371 + }, + { + "auxiliary_loss_clip": 0.01417655, + "auxiliary_loss_mlp": 0.01037892, + "balance_loss_clip": 1.25427485, + "balance_loss_mlp": 1.01806784, + "epoch": 0.5554186081467007, + "flos": 22174012517760.0, + "grad_norm": 3.2005170364381152, + "language_loss": 0.74582595, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.77038139, + "num_input_tokens_seen": 198919055, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19824219, + "step": 9238, + "time_per_iteration": 2.91479229927063 + }, + { + "auxiliary_loss_clip": 0.01434514, + "auxiliary_loss_mlp": 0.01035977, + "balance_loss_clip": 1.26324713, + "balance_loss_mlp": 1.0163548, + "epoch": 0.5554787313993687, + "flos": 49763862858240.0, + "grad_norm": 1.8519414027964267, + "language_loss": 0.7908051, + "learning_rate": 1.73844887285358e-06, + "loss": 0.81551003, + "num_input_tokens_seen": 198943505, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.19641113, + "step": 9239, + "time_per_iteration": 3.106022596359253 + }, + { + "auxiliary_loss_clip": 0.0143394, + "auxiliary_loss_mlp": 0.01035803, + "balance_loss_clip": 1.26566958, + "balance_loss_mlp": 1.01657414, + "epoch": 0.5555388546520367, + "flos": 22137517946880.0, + "grad_norm": 2.3376443846190447, + "language_loss": 0.80747378, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.83217126, + "num_input_tokens_seen": 198963590, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.19238281, + "step": 9240, + "time_per_iteration": 4.314273118972778 + }, + { + "auxiliary_loss_clip": 0.01417105, + "auxiliary_loss_mlp": 0.01044246, + "balance_loss_clip": 1.25164926, + "balance_loss_mlp": 1.02409923, + "epoch": 0.5555989779047047, + "flos": 24693133294080.0, + "grad_norm": 2.178501707522861, + "language_loss": 0.66640037, + "learning_rate": 1.737676658740786e-06, + "loss": 0.69101393, + "num_input_tokens_seen": 198982680, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20141602, + "step": 9241, + "time_per_iteration": 2.9098565578460693 + }, + { + "auxiliary_loss_clip": 0.01428309, + "auxiliary_loss_mlp": 0.01041228, + "balance_loss_clip": 1.26077306, + "balance_loss_mlp": 1.0215342, + "epoch": 0.5556591011573726, + "flos": 16114918366080.0, + "grad_norm": 2.050148132496567, + "language_loss": 0.7352308, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.75992614, + "num_input_tokens_seen": 199000185, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.19689941, + "step": 9242, + "time_per_iteration": 2.8350276947021484 + }, + { + "auxiliary_loss_clip": 0.01432142, + "auxiliary_loss_mlp": 0.01043459, + "balance_loss_clip": 1.26338899, + "balance_loss_mlp": 1.02308667, + "epoch": 0.5557192244100406, + "flos": 12941655246720.0, + "grad_norm": 1.8766084468539421, + "language_loss": 0.65139353, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.67614961, + "num_input_tokens_seen": 199018380, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.20373535, + "step": 9243, + "time_per_iteration": 2.852041482925415 + }, + { + "auxiliary_loss_clip": 0.01423938, + "auxiliary_loss_mlp": 0.01037992, + "balance_loss_clip": 1.25721049, + "balance_loss_mlp": 1.01884675, + "epoch": 0.5557793476627085, + "flos": 23121921219840.0, + "grad_norm": 6.706419277091673, + "language_loss": 0.75821471, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.78283405, + "num_input_tokens_seen": 199037115, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.19140625, + "step": 9244, + "time_per_iteration": 2.8342297077178955 + }, + { + "auxiliary_loss_clip": 0.01411375, + "auxiliary_loss_mlp": 0.01040215, + "balance_loss_clip": 1.24973071, + "balance_loss_mlp": 1.0205934, + "epoch": 0.5558394709153766, + "flos": 21436429484160.0, + "grad_norm": 2.6354787384983007, + "language_loss": 0.75685728, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.78137326, + "num_input_tokens_seen": 199053375, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19616699, + "step": 9245, + "time_per_iteration": 2.836925983428955 + }, + { + "auxiliary_loss_clip": 0.01441661, + "auxiliary_loss_mlp": 0.01040462, + "balance_loss_clip": 1.27015018, + "balance_loss_mlp": 1.01945734, + "epoch": 0.5558995941680445, + "flos": 25088103567360.0, + "grad_norm": 1.9305676850147917, + "language_loss": 0.80472702, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.82954824, + "num_input_tokens_seen": 199070930, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.21020508, + "step": 9246, + "time_per_iteration": 2.8689868450164795 + }, + { + "auxiliary_loss_clip": 0.01437042, + "auxiliary_loss_mlp": 0.01037168, + "balance_loss_clip": 1.26835179, + "balance_loss_mlp": 1.01736712, + "epoch": 0.5559597174207125, + "flos": 20020742346240.0, + "grad_norm": 2.399745345100065, + "language_loss": 0.7439183, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.76866043, + "num_input_tokens_seen": 199088675, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.19812012, + "step": 9247, + "time_per_iteration": 2.981285333633423 + }, + { + "auxiliary_loss_clip": 0.01420917, + "auxiliary_loss_mlp": 0.01037554, + "balance_loss_clip": 1.2533536, + "balance_loss_mlp": 1.01673985, + "epoch": 0.5560198406733804, + "flos": 16844131111680.0, + "grad_norm": 2.840685804059471, + "language_loss": 0.77285171, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.79743648, + "num_input_tokens_seen": 199103075, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.20800781, + "step": 9248, + "time_per_iteration": 2.8557651042938232 + }, + { + "auxiliary_loss_clip": 0.01211684, + "auxiliary_loss_mlp": 0.01022165, + "balance_loss_clip": 1.11866426, + "balance_loss_mlp": 1.00480831, + "epoch": 0.5560799639260484, + "flos": 70731129300480.0, + "grad_norm": 0.847614633551443, + "language_loss": 0.59502852, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.61736703, + "num_input_tokens_seen": 199160325, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.17382812, + "step": 9249, + "time_per_iteration": 3.4729621410369873 + }, + { + "auxiliary_loss_clip": 0.01431435, + "auxiliary_loss_mlp": 0.01033559, + "balance_loss_clip": 1.2606771, + "balance_loss_mlp": 1.01443839, + "epoch": 0.5561400871787163, + "flos": 23158913483520.0, + "grad_norm": 1.913626393946407, + "language_loss": 0.80674154, + "learning_rate": 1.734202189316832e-06, + "loss": 0.83139145, + "num_input_tokens_seen": 199179760, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.19116211, + "step": 9250, + "time_per_iteration": 2.842432737350464 + }, + { + "auxiliary_loss_clip": 0.01434941, + "auxiliary_loss_mlp": 0.01033275, + "balance_loss_clip": 1.26349056, + "balance_loss_mlp": 1.01341462, + "epoch": 0.5562002104313843, + "flos": 17575334628480.0, + "grad_norm": 4.140769766009932, + "language_loss": 0.70562959, + "learning_rate": 1.733816187358836e-06, + "loss": 0.73031169, + "num_input_tokens_seen": 199196695, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.1986084, + "step": 9251, + "time_per_iteration": 2.8029706478118896 + }, + { + "auxiliary_loss_clip": 0.01427532, + "auxiliary_loss_mlp": 0.01034359, + "balance_loss_clip": 1.25880814, + "balance_loss_mlp": 1.01480913, + "epoch": 0.5562603336840523, + "flos": 25056133476480.0, + "grad_norm": 1.6235100342155677, + "language_loss": 0.76054144, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.7851603, + "num_input_tokens_seen": 199217845, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.19543457, + "step": 9252, + "time_per_iteration": 2.8871986865997314 + }, + { + "auxiliary_loss_clip": 0.01439233, + "auxiliary_loss_mlp": 0.01036303, + "balance_loss_clip": 1.26668549, + "balance_loss_mlp": 1.01641941, + "epoch": 0.5563204569367203, + "flos": 29070807742080.0, + "grad_norm": 2.3376794423607814, + "language_loss": 0.73558795, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.76034331, + "num_input_tokens_seen": 199239250, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.19885254, + "step": 9253, + "time_per_iteration": 4.358929395675659 + }, + { + "auxiliary_loss_clip": 0.01429785, + "auxiliary_loss_mlp": 0.01030084, + "balance_loss_clip": 1.26221323, + "balance_loss_mlp": 1.01034296, + "epoch": 0.5563805801893883, + "flos": 22100480438400.0, + "grad_norm": 1.969050528774385, + "language_loss": 0.83608669, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.86068535, + "num_input_tokens_seen": 199258320, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.1973877, + "step": 9254, + "time_per_iteration": 4.280235528945923 + }, + { + "auxiliary_loss_clip": 0.01220104, + "auxiliary_loss_mlp": 0.0101876, + "balance_loss_clip": 1.1225481, + "balance_loss_mlp": 0.99940091, + "epoch": 0.5564407034420562, + "flos": 58661724170880.0, + "grad_norm": 0.8717556817510829, + "language_loss": 0.64900929, + "learning_rate": 1.732272280610387e-06, + "loss": 0.67139798, + "num_input_tokens_seen": 199314840, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.19335938, + "step": 9255, + "time_per_iteration": 4.573627233505249 + }, + { + "auxiliary_loss_clip": 0.01428673, + "auxiliary_loss_mlp": 0.01036494, + "balance_loss_clip": 1.26257312, + "balance_loss_mlp": 1.01786184, + "epoch": 0.5565008266947242, + "flos": 23122690381440.0, + "grad_norm": 2.3795448880263432, + "language_loss": 0.70203328, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.72668505, + "num_input_tokens_seen": 199335405, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.1862793, + "step": 9256, + "time_per_iteration": 2.866562604904175 + }, + { + "auxiliary_loss_clip": 0.01410969, + "auxiliary_loss_mlp": 0.01030729, + "balance_loss_clip": 1.24761224, + "balance_loss_mlp": 1.01182282, + "epoch": 0.5565609499473921, + "flos": 21587972878080.0, + "grad_norm": 1.5915660823393525, + "language_loss": 0.76202178, + "learning_rate": 1.73150038809119e-06, + "loss": 0.78643876, + "num_input_tokens_seen": 199354345, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.18908691, + "step": 9257, + "time_per_iteration": 2.8297667503356934 + }, + { + "auxiliary_loss_clip": 0.01431724, + "auxiliary_loss_mlp": 0.01032435, + "balance_loss_clip": 1.26294756, + "balance_loss_mlp": 1.01214552, + "epoch": 0.5566210732000602, + "flos": 18378893859840.0, + "grad_norm": 2.1865536709232276, + "language_loss": 0.61827201, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.64291358, + "num_input_tokens_seen": 199372250, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.20300293, + "step": 9258, + "time_per_iteration": 2.84818959236145 + }, + { + "auxiliary_loss_clip": 0.01433467, + "auxiliary_loss_mlp": 0.01038148, + "balance_loss_clip": 1.26387882, + "balance_loss_mlp": 1.01742935, + "epoch": 0.5566811964527281, + "flos": 25714528830720.0, + "grad_norm": 1.6310406172683192, + "language_loss": 0.80072582, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.82544202, + "num_input_tokens_seen": 199392815, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.20715332, + "step": 9259, + "time_per_iteration": 2.8979134559631348 + }, + { + "auxiliary_loss_clip": 0.01415865, + "auxiliary_loss_mlp": 0.01033118, + "balance_loss_clip": 1.24972653, + "balance_loss_mlp": 1.01355553, + "epoch": 0.5567413197053961, + "flos": 26955525219840.0, + "grad_norm": 1.896720767264879, + "language_loss": 0.8204664, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.84495622, + "num_input_tokens_seen": 199412375, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19567871, + "step": 9260, + "time_per_iteration": 2.885531425476074 + }, + { + "auxiliary_loss_clip": 0.01432133, + "auxiliary_loss_mlp": 0.01037254, + "balance_loss_clip": 1.26328135, + "balance_loss_mlp": 1.01691675, + "epoch": 0.556801442958064, + "flos": 20860479434880.0, + "grad_norm": 1.422876516680739, + "language_loss": 0.69380534, + "learning_rate": 1.729956725348256e-06, + "loss": 0.71849918, + "num_input_tokens_seen": 199431490, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.20349121, + "step": 9261, + "time_per_iteration": 2.8547275066375732 + }, + { + "auxiliary_loss_clip": 0.01216215, + "auxiliary_loss_mlp": 0.01027154, + "balance_loss_clip": 1.1207794, + "balance_loss_mlp": 1.00626886, + "epoch": 0.556861566210732, + "flos": 70527979572480.0, + "grad_norm": 0.7432492172157791, + "language_loss": 0.61172593, + "learning_rate": 1.729570835226108e-06, + "loss": 0.63415956, + "num_input_tokens_seen": 199495855, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.20898438, + "step": 9262, + "time_per_iteration": 3.3514251708984375 + }, + { + "auxiliary_loss_clip": 0.01432686, + "auxiliary_loss_mlp": 0.0103466, + "balance_loss_clip": 1.26388597, + "balance_loss_mlp": 1.01422799, + "epoch": 0.5569216894633999, + "flos": 25348044798720.0, + "grad_norm": 2.0333074872899446, + "language_loss": 0.65219796, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.67687142, + "num_input_tokens_seen": 199515870, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.2043457, + "step": 9263, + "time_per_iteration": 2.8993165493011475 + }, + { + "auxiliary_loss_clip": 0.01429936, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.26358628, + "balance_loss_mlp": 1.01434755, + "epoch": 0.556981812716068, + "flos": 22649165856000.0, + "grad_norm": 4.357551981511657, + "language_loss": 0.73649466, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.76112884, + "num_input_tokens_seen": 199535745, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.19128418, + "step": 9264, + "time_per_iteration": 2.876309394836426 + }, + { + "auxiliary_loss_clip": 0.01428633, + "auxiliary_loss_mlp": 0.01034567, + "balance_loss_clip": 1.26083446, + "balance_loss_mlp": 1.01496863, + "epoch": 0.5570419359687359, + "flos": 11043168399360.0, + "grad_norm": 2.217781855285657, + "language_loss": 0.77826941, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.80290139, + "num_input_tokens_seen": 199554035, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.19592285, + "step": 9265, + "time_per_iteration": 2.8559341430664062 + }, + { + "auxiliary_loss_clip": 0.01414961, + "auxiliary_loss_mlp": 0.01037138, + "balance_loss_clip": 1.25272107, + "balance_loss_mlp": 1.01786196, + "epoch": 0.5571020592214039, + "flos": 22834127174400.0, + "grad_norm": 1.5818533634717795, + "language_loss": 0.71660846, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.74112946, + "num_input_tokens_seen": 199576120, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19274902, + "step": 9266, + "time_per_iteration": 2.954575777053833 + }, + { + "auxiliary_loss_clip": 0.01414241, + "auxiliary_loss_mlp": 0.01036404, + "balance_loss_clip": 1.24916267, + "balance_loss_mlp": 1.01685381, + "epoch": 0.5571621824740719, + "flos": 22937548083840.0, + "grad_norm": 1.8483822437606747, + "language_loss": 0.69238961, + "learning_rate": 1.727641538728533e-06, + "loss": 0.71689606, + "num_input_tokens_seen": 199593780, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19543457, + "step": 9267, + "time_per_iteration": 2.848322629928589 + }, + { + "auxiliary_loss_clip": 0.01416821, + "auxiliary_loss_mlp": 0.01034803, + "balance_loss_clip": 1.25416505, + "balance_loss_mlp": 1.01565826, + "epoch": 0.5572223057267398, + "flos": 22977119301120.0, + "grad_norm": 2.0282062803527965, + "language_loss": 0.75365686, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.77817309, + "num_input_tokens_seen": 199613220, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19140625, + "step": 9268, + "time_per_iteration": 2.8956868648529053 + }, + { + "auxiliary_loss_clip": 0.01420458, + "auxiliary_loss_mlp": 0.01035608, + "balance_loss_clip": 1.25604057, + "balance_loss_mlp": 1.01697588, + "epoch": 0.5572824289794078, + "flos": 20969963147520.0, + "grad_norm": 2.0142127660046323, + "language_loss": 0.75951433, + "learning_rate": 1.726869892322104e-06, + "loss": 0.78407502, + "num_input_tokens_seen": 199632085, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.18640137, + "step": 9269, + "time_per_iteration": 2.8706235885620117 + }, + { + "auxiliary_loss_clip": 0.01417318, + "auxiliary_loss_mlp": 0.01036005, + "balance_loss_clip": 1.25098372, + "balance_loss_mlp": 1.0155127, + "epoch": 0.5573425522320757, + "flos": 25052332913280.0, + "grad_norm": 1.8008261050843994, + "language_loss": 0.837165, + "learning_rate": 1.726484084647256e-06, + "loss": 0.86169821, + "num_input_tokens_seen": 199649295, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.20495605, + "step": 9270, + "time_per_iteration": 2.882671594619751 + }, + { + "auxiliary_loss_clip": 0.01433964, + "auxiliary_loss_mlp": 0.01039542, + "balance_loss_clip": 1.26463532, + "balance_loss_mlp": 1.02026582, + "epoch": 0.5574026754847438, + "flos": 23670154189440.0, + "grad_norm": 1.8727066345973866, + "language_loss": 0.80649883, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.83123386, + "num_input_tokens_seen": 199668870, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.19287109, + "step": 9271, + "time_per_iteration": 2.854628086090088 + }, + { + "auxiliary_loss_clip": 0.01426142, + "auxiliary_loss_mlp": 0.01037506, + "balance_loss_clip": 1.26070237, + "balance_loss_mlp": 1.01819396, + "epoch": 0.5574627987374117, + "flos": 24790853358720.0, + "grad_norm": 1.825897085425875, + "language_loss": 0.90769219, + "learning_rate": 1.725712500427442e-06, + "loss": 0.93232864, + "num_input_tokens_seen": 199684870, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19311523, + "step": 9272, + "time_per_iteration": 2.8624749183654785 + }, + { + "auxiliary_loss_clip": 0.01409868, + "auxiliary_loss_mlp": 0.01037753, + "balance_loss_clip": 1.24770832, + "balance_loss_mlp": 1.01951432, + "epoch": 0.5575229219900797, + "flos": 21844882707840.0, + "grad_norm": 1.8957955201404233, + "language_loss": 0.84707189, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.87154806, + "num_input_tokens_seen": 199701975, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18225098, + "step": 9273, + "time_per_iteration": 2.8766167163848877 + }, + { + "auxiliary_loss_clip": 0.01413011, + "auxiliary_loss_mlp": 0.01039062, + "balance_loss_clip": 1.24861872, + "balance_loss_mlp": 1.01924944, + "epoch": 0.5575830452427476, + "flos": 27825739320960.0, + "grad_norm": 2.9992908999921197, + "language_loss": 0.75172174, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.77624249, + "num_input_tokens_seen": 199721865, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19799805, + "step": 9274, + "time_per_iteration": 2.897397994995117 + }, + { + "auxiliary_loss_clip": 0.01437291, + "auxiliary_loss_mlp": 0.01040662, + "balance_loss_clip": 1.26378989, + "balance_loss_mlp": 1.01983619, + "epoch": 0.5576431684954156, + "flos": 17819440179840.0, + "grad_norm": 3.0783446678549473, + "language_loss": 0.7973572, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.8221367, + "num_input_tokens_seen": 199736455, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.20825195, + "step": 9275, + "time_per_iteration": 4.236073732376099 + }, + { + "auxiliary_loss_clip": 0.01430793, + "auxiliary_loss_mlp": 0.01034369, + "balance_loss_clip": 1.26365089, + "balance_loss_mlp": 1.01432955, + "epoch": 0.5577032917480835, + "flos": 15495279822720.0, + "grad_norm": 1.7890290772592448, + "language_loss": 0.76463413, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.78928578, + "num_input_tokens_seen": 199753125, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.20031738, + "step": 9276, + "time_per_iteration": 2.8965048789978027 + }, + { + "auxiliary_loss_clip": 0.01416858, + "auxiliary_loss_mlp": 0.01035638, + "balance_loss_clip": 1.25064206, + "balance_loss_mlp": 1.01674294, + "epoch": 0.5577634150007516, + "flos": 21589692180480.0, + "grad_norm": 2.0599009634448646, + "language_loss": 0.76853293, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.79305786, + "num_input_tokens_seen": 199771365, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.18884277, + "step": 9277, + "time_per_iteration": 2.8652896881103516 + }, + { + "auxiliary_loss_clip": 0.01407785, + "auxiliary_loss_mlp": 0.01038339, + "balance_loss_clip": 1.24409163, + "balance_loss_mlp": 1.01946831, + "epoch": 0.5578235382534195, + "flos": 21148997397120.0, + "grad_norm": 1.4742552708690304, + "language_loss": 0.72323096, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.74769217, + "num_input_tokens_seen": 199790035, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.18859863, + "step": 9278, + "time_per_iteration": 2.8807456493377686 + }, + { + "auxiliary_loss_clip": 0.01419731, + "auxiliary_loss_mlp": 0.01033141, + "balance_loss_clip": 1.25138164, + "balance_loss_mlp": 1.01260078, + "epoch": 0.5578836615060875, + "flos": 26516368759680.0, + "grad_norm": 1.697758220821337, + "language_loss": 0.76115042, + "learning_rate": 1.723012284057868e-06, + "loss": 0.7856791, + "num_input_tokens_seen": 199811125, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.20544434, + "step": 9279, + "time_per_iteration": 2.9149298667907715 + }, + { + "auxiliary_loss_clip": 0.01424347, + "auxiliary_loss_mlp": 0.01038112, + "balance_loss_clip": 1.2568419, + "balance_loss_mlp": 1.01790607, + "epoch": 0.5579437847587555, + "flos": 20162558108160.0, + "grad_norm": 1.9944024837492509, + "language_loss": 0.68205965, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.70668423, + "num_input_tokens_seen": 199829915, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.20202637, + "step": 9280, + "time_per_iteration": 2.8445005416870117 + }, + { + "auxiliary_loss_clip": 0.01434038, + "auxiliary_loss_mlp": 0.01039879, + "balance_loss_clip": 1.26390243, + "balance_loss_mlp": 1.02019715, + "epoch": 0.5580039080114234, + "flos": 26112937708800.0, + "grad_norm": 1.8286381368691422, + "language_loss": 0.73993313, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.76467228, + "num_input_tokens_seen": 199850670, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.19677734, + "step": 9281, + "time_per_iteration": 2.9101452827453613 + }, + { + "auxiliary_loss_clip": 0.0141895, + "auxiliary_loss_mlp": 0.01040492, + "balance_loss_clip": 1.25440645, + "balance_loss_mlp": 1.02053607, + "epoch": 0.5580640312640914, + "flos": 13779537298560.0, + "grad_norm": 3.4304962154970475, + "language_loss": 0.75196344, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.77655786, + "num_input_tokens_seen": 199867645, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19958496, + "step": 9282, + "time_per_iteration": 2.829041004180908 + }, + { + "auxiliary_loss_clip": 0.01414664, + "auxiliary_loss_mlp": 0.01036441, + "balance_loss_clip": 1.250126, + "balance_loss_mlp": 1.01671159, + "epoch": 0.5581241545167593, + "flos": 17684954075520.0, + "grad_norm": 1.7217431304903226, + "language_loss": 0.66650379, + "learning_rate": 1.721469534028297e-06, + "loss": 0.69101483, + "num_input_tokens_seen": 199886320, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19726562, + "step": 9283, + "time_per_iteration": 2.8560469150543213 + }, + { + "auxiliary_loss_clip": 0.01419436, + "auxiliary_loss_mlp": 0.01033512, + "balance_loss_clip": 1.25245571, + "balance_loss_mlp": 1.01476002, + "epoch": 0.5581842777694274, + "flos": 19577559098880.0, + "grad_norm": 2.162969930711974, + "language_loss": 0.83748436, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.86201382, + "num_input_tokens_seen": 199904895, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.18737793, + "step": 9284, + "time_per_iteration": 2.8415160179138184 + }, + { + "auxiliary_loss_clip": 0.01422443, + "auxiliary_loss_mlp": 0.01036008, + "balance_loss_clip": 1.25695956, + "balance_loss_mlp": 1.01693463, + "epoch": 0.5582444010220953, + "flos": 20604972193920.0, + "grad_norm": 3.165396790514728, + "language_loss": 0.85994667, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.88453114, + "num_input_tokens_seen": 199921090, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.1907959, + "step": 9285, + "time_per_iteration": 2.865011692047119 + }, + { + "auxiliary_loss_clip": 0.01414836, + "auxiliary_loss_mlp": 0.01041115, + "balance_loss_clip": 1.25034177, + "balance_loss_mlp": 1.0210638, + "epoch": 0.5583045242747633, + "flos": 19144872645120.0, + "grad_norm": 2.1443568091571255, + "language_loss": 0.75049198, + "learning_rate": 1.720312582354912e-06, + "loss": 0.77505147, + "num_input_tokens_seen": 199939925, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20056152, + "step": 9286, + "time_per_iteration": 2.83902907371521 + }, + { + "auxiliary_loss_clip": 0.01415815, + "auxiliary_loss_mlp": 0.01034996, + "balance_loss_clip": 1.25092292, + "balance_loss_mlp": 1.01477766, + "epoch": 0.5583646475274312, + "flos": 27466177743360.0, + "grad_norm": 1.6057836200072089, + "language_loss": 0.74798, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.77248812, + "num_input_tokens_seen": 199960015, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.20214844, + "step": 9287, + "time_per_iteration": 2.9636423587799072 + }, + { + "auxiliary_loss_clip": 0.01435097, + "auxiliary_loss_mlp": 0.01037229, + "balance_loss_clip": 1.26438236, + "balance_loss_mlp": 1.01770294, + "epoch": 0.5584247707800992, + "flos": 23662779287040.0, + "grad_norm": 1.5682248037557764, + "language_loss": 0.75770253, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.78242576, + "num_input_tokens_seen": 199980505, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.1953125, + "step": 9288, + "time_per_iteration": 4.363052606582642 + }, + { + "auxiliary_loss_clip": 0.01424051, + "auxiliary_loss_mlp": 0.01037151, + "balance_loss_clip": 1.25793839, + "balance_loss_mlp": 1.01746917, + "epoch": 0.5584848940327671, + "flos": 13706774380800.0, + "grad_norm": 6.124889762055855, + "language_loss": 0.7923367, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.81694871, + "num_input_tokens_seen": 199999020, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19677734, + "step": 9289, + "time_per_iteration": 4.371615886688232 + }, + { + "auxiliary_loss_clip": 0.01446536, + "auxiliary_loss_mlp": 0.01040888, + "balance_loss_clip": 1.27421403, + "balance_loss_mlp": 1.01983571, + "epoch": 0.5585450172854352, + "flos": 27027564220800.0, + "grad_norm": 1.7483631623971752, + "language_loss": 0.62133467, + "learning_rate": 1.718770128672817e-06, + "loss": 0.64620894, + "num_input_tokens_seen": 200019020, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.21057129, + "step": 9290, + "time_per_iteration": 4.2945849895477295 + }, + { + "auxiliary_loss_clip": 0.01424829, + "auxiliary_loss_mlp": 0.01030711, + "balance_loss_clip": 1.2553668, + "balance_loss_mlp": 1.01091075, + "epoch": 0.5586051405381031, + "flos": 23196131971200.0, + "grad_norm": 1.8062052815106115, + "language_loss": 0.68405575, + "learning_rate": 1.7183845418764e-06, + "loss": 0.70861113, + "num_input_tokens_seen": 200038110, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.19812012, + "step": 9291, + "time_per_iteration": 2.8358399868011475 + }, + { + "auxiliary_loss_clip": 0.0141675, + "auxiliary_loss_mlp": 0.01036416, + "balance_loss_clip": 1.24974561, + "balance_loss_mlp": 1.01656759, + "epoch": 0.5586652637907711, + "flos": 20784866094720.0, + "grad_norm": 1.8476504235218487, + "language_loss": 0.85069847, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.87523013, + "num_input_tokens_seen": 200056210, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.19836426, + "step": 9292, + "time_per_iteration": 2.8832287788391113 + }, + { + "auxiliary_loss_clip": 0.01415172, + "auxiliary_loss_mlp": 0.01036228, + "balance_loss_clip": 1.25100327, + "balance_loss_mlp": 1.01695204, + "epoch": 0.5587253870434391, + "flos": 28231477856640.0, + "grad_norm": 4.029888371957987, + "language_loss": 0.7477597, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.77227372, + "num_input_tokens_seen": 200075620, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19274902, + "step": 9293, + "time_per_iteration": 2.879192590713501 + }, + { + "auxiliary_loss_clip": 0.0141109, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.2460475, + "balance_loss_mlp": 1.01832652, + "epoch": 0.558785510296107, + "flos": 26627209816320.0, + "grad_norm": 1.6827269062399426, + "language_loss": 0.7279743, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.75247073, + "num_input_tokens_seen": 200095945, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.20227051, + "step": 9294, + "time_per_iteration": 2.8775336742401123 + }, + { + "auxiliary_loss_clip": 0.0141989, + "auxiliary_loss_mlp": 0.01040077, + "balance_loss_clip": 1.25350988, + "balance_loss_mlp": 1.0206697, + "epoch": 0.558845633548775, + "flos": 20166313426560.0, + "grad_norm": 3.003460357299817, + "language_loss": 0.68947786, + "learning_rate": 1.716842301625806e-06, + "loss": 0.71407753, + "num_input_tokens_seen": 200114185, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.19396973, + "step": 9295, + "time_per_iteration": 2.8540284633636475 + }, + { + "auxiliary_loss_clip": 0.01413907, + "auxiliary_loss_mlp": 0.01037629, + "balance_loss_clip": 1.24801052, + "balance_loss_mlp": 1.01707697, + "epoch": 0.5589057568014429, + "flos": 24360926837760.0, + "grad_norm": 1.5515043894462173, + "language_loss": 0.81789756, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.84241295, + "num_input_tokens_seen": 200135030, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.20544434, + "step": 9296, + "time_per_iteration": 2.9652867317199707 + }, + { + "auxiliary_loss_clip": 0.01409637, + "auxiliary_loss_mlp": 0.0104146, + "balance_loss_clip": 1.24465823, + "balance_loss_mlp": 1.0203954, + "epoch": 0.558965880054111, + "flos": 21114900800640.0, + "grad_norm": 1.701714122889957, + "language_loss": 0.65689182, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.6814028, + "num_input_tokens_seen": 200154290, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.21057129, + "step": 9297, + "time_per_iteration": 2.8729748725891113 + }, + { + "auxiliary_loss_clip": 0.01433684, + "auxiliary_loss_mlp": 0.01040131, + "balance_loss_clip": 1.26402259, + "balance_loss_mlp": 1.01977015, + "epoch": 0.5590260033067789, + "flos": 18443648448000.0, + "grad_norm": 1.566592526248745, + "language_loss": 0.76452398, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.78926212, + "num_input_tokens_seen": 200171555, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.20361328, + "step": 9298, + "time_per_iteration": 2.850032329559326 + }, + { + "auxiliary_loss_clip": 0.01224613, + "auxiliary_loss_mlp": 0.01064895, + "balance_loss_clip": 1.12164044, + "balance_loss_mlp": 1.03628433, + "epoch": 0.5590861265594469, + "flos": 70608841309440.0, + "grad_norm": 0.7209466849920116, + "language_loss": 0.52462107, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.54751611, + "num_input_tokens_seen": 200237010, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.28515625, + "step": 9299, + "time_per_iteration": 3.482156991958618 + }, + { + "auxiliary_loss_clip": 0.01401917, + "auxiliary_loss_mlp": 0.01037942, + "balance_loss_clip": 1.23959577, + "balance_loss_mlp": 1.01681828, + "epoch": 0.5591462498121148, + "flos": 30676614105600.0, + "grad_norm": 1.818869366568032, + "language_loss": 0.70368147, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.72808009, + "num_input_tokens_seen": 200260820, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.21130371, + "step": 9300, + "time_per_iteration": 2.998533010482788 + }, + { + "auxiliary_loss_clip": 0.01426893, + "auxiliary_loss_mlp": 0.0104564, + "balance_loss_clip": 1.25639915, + "balance_loss_mlp": 1.02424169, + "epoch": 0.5592063730647828, + "flos": 18159564476160.0, + "grad_norm": 1.705624328466361, + "language_loss": 0.82484972, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.8495751, + "num_input_tokens_seen": 200278035, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.21398926, + "step": 9301, + "time_per_iteration": 2.8691420555114746 + }, + { + "auxiliary_loss_clip": 0.0142428, + "auxiliary_loss_mlp": 0.01037789, + "balance_loss_clip": 1.25579929, + "balance_loss_mlp": 1.01664114, + "epoch": 0.5592664963174507, + "flos": 24070870552320.0, + "grad_norm": 1.924345217538142, + "language_loss": 0.68566865, + "learning_rate": 1.714143795138756e-06, + "loss": 0.7102893, + "num_input_tokens_seen": 200297255, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.21142578, + "step": 9302, + "time_per_iteration": 2.896989107131958 + }, + { + "auxiliary_loss_clip": 0.01426183, + "auxiliary_loss_mlp": 0.01038328, + "balance_loss_clip": 1.2552917, + "balance_loss_mlp": 1.01704884, + "epoch": 0.5593266195701188, + "flos": 19837228861440.0, + "grad_norm": 1.6307211442437868, + "language_loss": 0.70959425, + "learning_rate": 1.713758337453878e-06, + "loss": 0.73423934, + "num_input_tokens_seen": 200317505, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.21276855, + "step": 9303, + "time_per_iteration": 2.861273765563965 + }, + { + "auxiliary_loss_clip": 0.01406283, + "auxiliary_loss_mlp": 0.01049962, + "balance_loss_clip": 1.24444389, + "balance_loss_mlp": 1.03006601, + "epoch": 0.5593867428227867, + "flos": 25311504983040.0, + "grad_norm": 1.6023447496843732, + "language_loss": 0.73489279, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.75945526, + "num_input_tokens_seen": 200338350, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19885254, + "step": 9304, + "time_per_iteration": 2.9268555641174316 + }, + { + "auxiliary_loss_clip": 0.01402024, + "auxiliary_loss_mlp": 0.01051206, + "balance_loss_clip": 1.23684478, + "balance_loss_mlp": 1.02980781, + "epoch": 0.5594468660754547, + "flos": 12940795595520.0, + "grad_norm": 2.7310462241503353, + "language_loss": 0.78705919, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.81159151, + "num_input_tokens_seen": 200353965, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.21411133, + "step": 9305, + "time_per_iteration": 2.8147459030151367 + }, + { + "auxiliary_loss_clip": 0.01393784, + "auxiliary_loss_mlp": 0.01049584, + "balance_loss_clip": 1.2325691, + "balance_loss_mlp": 1.02826929, + "epoch": 0.5595069893281227, + "flos": 19072109727360.0, + "grad_norm": 2.5432460525039757, + "language_loss": 0.70187515, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.72630882, + "num_input_tokens_seen": 200373595, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.21313477, + "step": 9306, + "time_per_iteration": 2.844775438308716 + }, + { + "auxiliary_loss_clip": 0.0123112, + "auxiliary_loss_mlp": 0.0104833, + "balance_loss_clip": 1.12491226, + "balance_loss_mlp": 1.01838517, + "epoch": 0.5595671125807906, + "flos": 70301546755200.0, + "grad_norm": 0.9144979160315362, + "language_loss": 0.60335124, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.62614584, + "num_input_tokens_seen": 200429155, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.29882812, + "step": 9307, + "time_per_iteration": 3.4641947746276855 + }, + { + "auxiliary_loss_clip": 0.0139548, + "auxiliary_loss_mlp": 0.01044742, + "balance_loss_clip": 1.23317146, + "balance_loss_mlp": 1.0235343, + "epoch": 0.5596272358334586, + "flos": 20674658465280.0, + "grad_norm": 1.8246570396371218, + "language_loss": 0.74453717, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.76893938, + "num_input_tokens_seen": 200448290, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.21203613, + "step": 9308, + "time_per_iteration": 2.8633666038513184 + }, + { + "auxiliary_loss_clip": 0.01401732, + "auxiliary_loss_mlp": 0.01054485, + "balance_loss_clip": 1.23444819, + "balance_loss_mlp": 1.03048825, + "epoch": 0.5596873590861265, + "flos": 25050568366080.0, + "grad_norm": 1.7826291089004214, + "language_loss": 0.70911634, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.73367852, + "num_input_tokens_seen": 200466555, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.24023438, + "step": 9309, + "time_per_iteration": 2.9289212226867676 + }, + { + "auxiliary_loss_clip": 0.01413355, + "auxiliary_loss_mlp": 0.0104225, + "balance_loss_clip": 1.24766254, + "balance_loss_mlp": 1.01973128, + "epoch": 0.5597474823387946, + "flos": 25969538378880.0, + "grad_norm": 2.899117471551678, + "language_loss": 0.76131952, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.78587556, + "num_input_tokens_seen": 200485980, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.22521973, + "step": 9310, + "time_per_iteration": 4.369547128677368 + }, + { + "auxiliary_loss_clip": 0.01424579, + "auxiliary_loss_mlp": 0.01042764, + "balance_loss_clip": 1.25530648, + "balance_loss_mlp": 1.02094924, + "epoch": 0.5598076055914625, + "flos": 26188686783360.0, + "grad_norm": 1.9349595045523655, + "language_loss": 0.703529, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.72820234, + "num_input_tokens_seen": 200504555, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.21813965, + "step": 9311, + "time_per_iteration": 2.9144012928009033 + }, + { + "auxiliary_loss_clip": 0.01404755, + "auxiliary_loss_mlp": 0.01041158, + "balance_loss_clip": 1.24051404, + "balance_loss_mlp": 1.01965272, + "epoch": 0.5598677288441305, + "flos": 11663168901120.0, + "grad_norm": 4.080888023727629, + "language_loss": 0.73353374, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.75799286, + "num_input_tokens_seen": 200522700, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.21496582, + "step": 9312, + "time_per_iteration": 2.848167657852173 + }, + { + "auxiliary_loss_clip": 0.01408004, + "auxiliary_loss_mlp": 0.01037981, + "balance_loss_clip": 1.24433696, + "balance_loss_mlp": 1.01667845, + "epoch": 0.5599278520967984, + "flos": 22976983566720.0, + "grad_norm": 1.9420917527393908, + "language_loss": 0.90240407, + "learning_rate": 1.709904360003822e-06, + "loss": 0.92686391, + "num_input_tokens_seen": 200541910, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.2130127, + "step": 9313, + "time_per_iteration": 2.840688467025757 + }, + { + "auxiliary_loss_clip": 0.01405407, + "auxiliary_loss_mlp": 0.01037713, + "balance_loss_clip": 1.24227023, + "balance_loss_mlp": 1.01648188, + "epoch": 0.5599879753494664, + "flos": 21225560878080.0, + "grad_norm": 1.6355564578980195, + "language_loss": 0.78113729, + "learning_rate": 1.709519022520204e-06, + "loss": 0.80556852, + "num_input_tokens_seen": 200562600, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.21228027, + "step": 9314, + "time_per_iteration": 2.8981707096099854 + }, + { + "auxiliary_loss_clip": 0.01404077, + "auxiliary_loss_mlp": 0.01035195, + "balance_loss_clip": 1.2403512, + "balance_loss_mlp": 1.01465499, + "epoch": 0.5600480986021343, + "flos": 31915303009920.0, + "grad_norm": 2.0085464865945655, + "language_loss": 0.7098068, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.73419952, + "num_input_tokens_seen": 200584795, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.20532227, + "step": 9315, + "time_per_iteration": 2.9505648612976074 + }, + { + "auxiliary_loss_clip": 0.01424053, + "auxiliary_loss_mlp": 0.010385, + "balance_loss_clip": 1.25474882, + "balance_loss_mlp": 1.01798379, + "epoch": 0.5601082218548024, + "flos": 28487889993600.0, + "grad_norm": 1.7916021451333635, + "language_loss": 0.67531383, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.69993937, + "num_input_tokens_seen": 200606945, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.20507812, + "step": 9316, + "time_per_iteration": 2.908461093902588 + }, + { + "auxiliary_loss_clip": 0.01395683, + "auxiliary_loss_mlp": 0.01034305, + "balance_loss_clip": 1.23247313, + "balance_loss_mlp": 1.01362264, + "epoch": 0.5601683451074703, + "flos": 24107727081600.0, + "grad_norm": 1.8772187334713977, + "language_loss": 0.87311065, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.89741051, + "num_input_tokens_seen": 200626340, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.20678711, + "step": 9317, + "time_per_iteration": 2.8824527263641357 + }, + { + "auxiliary_loss_clip": 0.0141445, + "auxiliary_loss_mlp": 0.01040446, + "balance_loss_clip": 1.24603486, + "balance_loss_mlp": 1.01865423, + "epoch": 0.5602284683601383, + "flos": 26366544668160.0, + "grad_norm": 1.6855291213773649, + "language_loss": 0.78416234, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.80871129, + "num_input_tokens_seen": 200644520, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.21777344, + "step": 9318, + "time_per_iteration": 2.887216806411743 + }, + { + "auxiliary_loss_clip": 0.01406962, + "auxiliary_loss_mlp": 0.01044556, + "balance_loss_clip": 1.24191833, + "balance_loss_mlp": 1.02455282, + "epoch": 0.5602885916128063, + "flos": 24506497918080.0, + "grad_norm": 1.6035752381846726, + "language_loss": 0.76457369, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.78908885, + "num_input_tokens_seen": 200664845, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.20007324, + "step": 9319, + "time_per_iteration": 2.9100589752197266 + }, + { + "auxiliary_loss_clip": 0.01407235, + "auxiliary_loss_mlp": 0.01040574, + "balance_loss_clip": 1.24284172, + "balance_loss_mlp": 1.02010536, + "epoch": 0.5603487148654742, + "flos": 27356558296320.0, + "grad_norm": 16.766503018733943, + "language_loss": 0.86186934, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.88634747, + "num_input_tokens_seen": 200686535, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20458984, + "step": 9320, + "time_per_iteration": 2.929385185241699 + }, + { + "auxiliary_loss_clip": 0.01244405, + "auxiliary_loss_mlp": 0.01063641, + "balance_loss_clip": 1.14190555, + "balance_loss_mlp": 1.03865516, + "epoch": 0.5604088381181422, + "flos": 54115376532480.0, + "grad_norm": 0.7644657537327092, + "language_loss": 0.52555829, + "learning_rate": 1.706821969374996e-06, + "loss": 0.54863876, + "num_input_tokens_seen": 200736965, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.24902344, + "step": 9321, + "time_per_iteration": 3.173736333847046 + }, + { + "auxiliary_loss_clip": 0.01402749, + "auxiliary_loss_mlp": 0.01040432, + "balance_loss_clip": 1.24119782, + "balance_loss_mlp": 1.0199399, + "epoch": 0.5604689613708101, + "flos": 22246458721920.0, + "grad_norm": 1.3664775023855107, + "language_loss": 0.75285757, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.77728927, + "num_input_tokens_seen": 200757420, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.20471191, + "step": 9322, + "time_per_iteration": 2.9062600135803223 + }, + { + "auxiliary_loss_clip": 0.01404462, + "auxiliary_loss_mlp": 0.01037679, + "balance_loss_clip": 1.23807192, + "balance_loss_mlp": 1.01576853, + "epoch": 0.5605290846234782, + "flos": 35311107893760.0, + "grad_norm": 1.847533923289402, + "language_loss": 0.74081314, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.76523453, + "num_input_tokens_seen": 200779520, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.21911621, + "step": 9323, + "time_per_iteration": 4.436866760253906 + }, + { + "auxiliary_loss_clip": 0.01418372, + "auxiliary_loss_mlp": 0.01037929, + "balance_loss_clip": 1.24987829, + "balance_loss_mlp": 1.01657867, + "epoch": 0.5605892078761461, + "flos": 20272132310400.0, + "grad_norm": 1.6100945449504744, + "language_loss": 0.62854695, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.65310991, + "num_input_tokens_seen": 200799485, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.21362305, + "step": 9324, + "time_per_iteration": 2.8644447326660156 + }, + { + "auxiliary_loss_clip": 0.01401056, + "auxiliary_loss_mlp": 0.01037794, + "balance_loss_clip": 1.23616254, + "balance_loss_mlp": 1.01568079, + "epoch": 0.5606493311288141, + "flos": 17316479272320.0, + "grad_norm": 1.8741131451055681, + "language_loss": 0.88587064, + "learning_rate": 1.705281040409226e-06, + "loss": 0.91025913, + "num_input_tokens_seen": 200817540, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.22131348, + "step": 9325, + "time_per_iteration": 5.599449872970581 + }, + { + "auxiliary_loss_clip": 0.01410726, + "auxiliary_loss_mlp": 0.01037533, + "balance_loss_clip": 1.24296522, + "balance_loss_mlp": 1.01606297, + "epoch": 0.560709454381482, + "flos": 21662998035840.0, + "grad_norm": 1.6330646843795364, + "language_loss": 0.74878407, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.77326661, + "num_input_tokens_seen": 200838380, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.21472168, + "step": 9326, + "time_per_iteration": 2.852649450302124 + }, + { + "auxiliary_loss_clip": 0.0142531, + "auxiliary_loss_mlp": 0.01033137, + "balance_loss_clip": 1.25514007, + "balance_loss_mlp": 1.01160789, + "epoch": 0.56076957763415, + "flos": 20313106116480.0, + "grad_norm": 2.6814447180793337, + "language_loss": 0.79156345, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.81614792, + "num_input_tokens_seen": 200855640, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.21520996, + "step": 9327, + "time_per_iteration": 2.8337314128875732 + }, + { + "auxiliary_loss_clip": 0.01407243, + "auxiliary_loss_mlp": 0.0103546, + "balance_loss_clip": 1.24119925, + "balance_loss_mlp": 1.01407385, + "epoch": 0.5608297008868179, + "flos": 25056721658880.0, + "grad_norm": 1.3883721348505498, + "language_loss": 0.79077792, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.81520498, + "num_input_tokens_seen": 200876585, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.21386719, + "step": 9328, + "time_per_iteration": 2.9429502487182617 + }, + { + "auxiliary_loss_clip": 0.01395138, + "auxiliary_loss_mlp": 0.01034888, + "balance_loss_clip": 1.23244667, + "balance_loss_mlp": 1.01267934, + "epoch": 0.560889824139486, + "flos": 19876800078720.0, + "grad_norm": 1.4897740457775261, + "language_loss": 0.74595141, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.77025175, + "num_input_tokens_seen": 200898175, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.22192383, + "step": 9329, + "time_per_iteration": 2.895230770111084 + }, + { + "auxiliary_loss_clip": 0.01430742, + "auxiliary_loss_mlp": 0.01037408, + "balance_loss_clip": 1.25829935, + "balance_loss_mlp": 1.01523519, + "epoch": 0.5609499473921539, + "flos": 22939584099840.0, + "grad_norm": 1.8491541623266425, + "language_loss": 0.84281659, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.86749816, + "num_input_tokens_seen": 200917515, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.22167969, + "step": 9330, + "time_per_iteration": 2.8667471408843994 + }, + { + "auxiliary_loss_clip": 0.01235737, + "auxiliary_loss_mlp": 0.01027788, + "balance_loss_clip": 1.13549376, + "balance_loss_mlp": 1.00432777, + "epoch": 0.5610100706448219, + "flos": 53064770837760.0, + "grad_norm": 0.7169877640485568, + "language_loss": 0.57893485, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.60157007, + "num_input_tokens_seen": 200978615, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.234375, + "step": 9331, + "time_per_iteration": 3.4176065921783447 + }, + { + "auxiliary_loss_clip": 0.01410596, + "auxiliary_loss_mlp": 0.010367, + "balance_loss_clip": 1.24234819, + "balance_loss_mlp": 1.01461017, + "epoch": 0.5610701938974898, + "flos": 21844882707840.0, + "grad_norm": 1.8924933167931834, + "language_loss": 0.82623172, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.85070467, + "num_input_tokens_seen": 200997745, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.22106934, + "step": 9332, + "time_per_iteration": 2.849066734313965 + }, + { + "auxiliary_loss_clip": 0.01428777, + "auxiliary_loss_mlp": 0.010432, + "balance_loss_clip": 1.25762749, + "balance_loss_mlp": 1.01989448, + "epoch": 0.5611303171501578, + "flos": 17466122384640.0, + "grad_norm": 1.9329738822333147, + "language_loss": 0.8256464, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.85036618, + "num_input_tokens_seen": 201016370, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.23278809, + "step": 9333, + "time_per_iteration": 2.858982563018799 + }, + { + "auxiliary_loss_clip": 0.01415006, + "auxiliary_loss_mlp": 0.01036196, + "balance_loss_clip": 1.246786, + "balance_loss_mlp": 1.01457143, + "epoch": 0.5611904404028258, + "flos": 22647989491200.0, + "grad_norm": 1.841649338890242, + "language_loss": 0.73362577, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.75813776, + "num_input_tokens_seen": 201034310, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.21618652, + "step": 9334, + "time_per_iteration": 2.889134168624878 + }, + { + "auxiliary_loss_clip": 0.01411417, + "auxiliary_loss_mlp": 0.01040017, + "balance_loss_clip": 1.2457211, + "balance_loss_mlp": 1.01906037, + "epoch": 0.5612505636554938, + "flos": 14323652991360.0, + "grad_norm": 1.7754154452143311, + "language_loss": 0.72288465, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.74739897, + "num_input_tokens_seen": 201052030, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.2097168, + "step": 9335, + "time_per_iteration": 2.831958770751953 + }, + { + "auxiliary_loss_clip": 0.0141692, + "auxiliary_loss_mlp": 0.0103656, + "balance_loss_clip": 1.24908781, + "balance_loss_mlp": 1.01531732, + "epoch": 0.5613106869081618, + "flos": 16516901583360.0, + "grad_norm": 1.7175507800729208, + "language_loss": 0.77346587, + "learning_rate": 1.701044410566205e-06, + "loss": 0.79800069, + "num_input_tokens_seen": 201068445, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.21240234, + "step": 9336, + "time_per_iteration": 2.8406970500946045 + }, + { + "auxiliary_loss_clip": 0.01404825, + "auxiliary_loss_mlp": 0.01039696, + "balance_loss_clip": 1.24020505, + "balance_loss_mlp": 1.01982379, + "epoch": 0.5613708101608297, + "flos": 24068563067520.0, + "grad_norm": 2.628127408423442, + "language_loss": 0.65858203, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.68302727, + "num_input_tokens_seen": 201082140, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.19873047, + "step": 9337, + "time_per_iteration": 2.843745708465576 + }, + { + "auxiliary_loss_clip": 0.01229456, + "auxiliary_loss_mlp": 0.01026081, + "balance_loss_clip": 1.1280508, + "balance_loss_mlp": 0.99747032, + "epoch": 0.5614309334134977, + "flos": 64931415459840.0, + "grad_norm": 0.8806738508813645, + "language_loss": 0.62647903, + "learning_rate": 1.700274261035102e-06, + "loss": 0.64903438, + "num_input_tokens_seen": 201137245, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.28515625, + "step": 9338, + "time_per_iteration": 3.328169107437134 + }, + { + "auxiliary_loss_clip": 0.01411027, + "auxiliary_loss_mlp": 0.01036904, + "balance_loss_clip": 1.24548769, + "balance_loss_mlp": 1.01606607, + "epoch": 0.5614910566661656, + "flos": 32930907212160.0, + "grad_norm": 2.047629599986269, + "language_loss": 0.66775191, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.69223118, + "num_input_tokens_seen": 201157270, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.20849609, + "step": 9339, + "time_per_iteration": 2.949659585952759 + }, + { + "auxiliary_loss_clip": 0.01400207, + "auxiliary_loss_mlp": 0.01032675, + "balance_loss_clip": 1.23661411, + "balance_loss_mlp": 1.01169384, + "epoch": 0.5615511799188336, + "flos": 18598268488320.0, + "grad_norm": 1.7716903093737688, + "language_loss": 0.70817304, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.73250186, + "num_input_tokens_seen": 201174530, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.20983887, + "step": 9340, + "time_per_iteration": 2.870175361633301 + }, + { + "auxiliary_loss_clip": 0.01389237, + "auxiliary_loss_mlp": 0.01033935, + "balance_loss_clip": 1.22969866, + "balance_loss_mlp": 1.01385999, + "epoch": 0.5616113031715015, + "flos": 22830190876800.0, + "grad_norm": 1.6141062334516045, + "language_loss": 0.78106487, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.80529654, + "num_input_tokens_seen": 201194905, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.20092773, + "step": 9341, + "time_per_iteration": 2.8883588314056396 + }, + { + "auxiliary_loss_clip": 0.01413864, + "auxiliary_loss_mlp": 0.0103292, + "balance_loss_clip": 1.24569845, + "balance_loss_mlp": 1.01267874, + "epoch": 0.5616714264241696, + "flos": 22355761455360.0, + "grad_norm": 1.5484100035509594, + "language_loss": 0.80557287, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.83004069, + "num_input_tokens_seen": 201213715, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.20227051, + "step": 9342, + "time_per_iteration": 2.8724279403686523 + }, + { + "auxiliary_loss_clip": 0.01420599, + "auxiliary_loss_mlp": 0.01036073, + "balance_loss_clip": 1.25048566, + "balance_loss_mlp": 1.01484156, + "epoch": 0.5617315496768375, + "flos": 18817235913600.0, + "grad_norm": 1.7554286137194293, + "language_loss": 0.77332783, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.79789448, + "num_input_tokens_seen": 201231415, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.2121582, + "step": 9343, + "time_per_iteration": 2.8901476860046387 + }, + { + "auxiliary_loss_clip": 0.01409825, + "auxiliary_loss_mlp": 0.01040092, + "balance_loss_clip": 1.24594426, + "balance_loss_mlp": 1.01727533, + "epoch": 0.5617916729295055, + "flos": 18378486656640.0, + "grad_norm": 1.726057266246324, + "language_loss": 0.70127022, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.7257694, + "num_input_tokens_seen": 201249625, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.22827148, + "step": 9344, + "time_per_iteration": 2.848414659500122 + }, + { + "auxiliary_loss_clip": 0.01403613, + "auxiliary_loss_mlp": 0.01036444, + "balance_loss_clip": 1.23727751, + "balance_loss_mlp": 1.01596367, + "epoch": 0.5618517961821734, + "flos": 28190639784960.0, + "grad_norm": 1.8077263449864864, + "language_loss": 0.67399824, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.69839871, + "num_input_tokens_seen": 201271205, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.20471191, + "step": 9345, + "time_per_iteration": 2.8788208961486816 + }, + { + "auxiliary_loss_clip": 0.01413976, + "auxiliary_loss_mlp": 0.01034037, + "balance_loss_clip": 1.24861753, + "balance_loss_mlp": 1.01501155, + "epoch": 0.5619119194348414, + "flos": 15495279822720.0, + "grad_norm": 2.2772681082243946, + "language_loss": 0.87691343, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.90139359, + "num_input_tokens_seen": 201287700, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19030762, + "step": 9346, + "time_per_iteration": 4.216963529586792 + }, + { + "auxiliary_loss_clip": 0.01398171, + "auxiliary_loss_mlp": 0.01036375, + "balance_loss_clip": 1.23459649, + "balance_loss_mlp": 1.01562071, + "epoch": 0.5619720426875094, + "flos": 29139589117440.0, + "grad_norm": 2.809654557608602, + "language_loss": 0.59511828, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.61946368, + "num_input_tokens_seen": 201307530, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.20739746, + "step": 9347, + "time_per_iteration": 2.896257162094116 + }, + { + "auxiliary_loss_clip": 0.01410704, + "auxiliary_loss_mlp": 0.01034217, + "balance_loss_clip": 1.24358332, + "balance_loss_mlp": 1.01295018, + "epoch": 0.5620321659401774, + "flos": 18012636051840.0, + "grad_norm": 2.1874516860300184, + "language_loss": 0.7088939, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.73334312, + "num_input_tokens_seen": 201326210, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.21276855, + "step": 9348, + "time_per_iteration": 2.857466220855713 + }, + { + "auxiliary_loss_clip": 0.01424189, + "auxiliary_loss_mlp": 0.01035682, + "balance_loss_clip": 1.25444722, + "balance_loss_mlp": 1.01411676, + "epoch": 0.5620922891928454, + "flos": 20604157787520.0, + "grad_norm": 2.122950236091542, + "language_loss": 0.79793125, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.82253003, + "num_input_tokens_seen": 201346120, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.2154541, + "step": 9349, + "time_per_iteration": 2.8628158569335938 + }, + { + "auxiliary_loss_clip": 0.01409928, + "auxiliary_loss_mlp": 0.01037037, + "balance_loss_clip": 1.24472141, + "balance_loss_mlp": 1.01644957, + "epoch": 0.5621524124455133, + "flos": 26298306230400.0, + "grad_norm": 3.9983993034414484, + "language_loss": 0.68516552, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.70963514, + "num_input_tokens_seen": 201365700, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.20593262, + "step": 9350, + "time_per_iteration": 2.8944284915924072 + }, + { + "auxiliary_loss_clip": 0.01420165, + "auxiliary_loss_mlp": 0.01038662, + "balance_loss_clip": 1.25065672, + "balance_loss_mlp": 1.01763368, + "epoch": 0.5622125356981813, + "flos": 12757960782720.0, + "grad_norm": 5.494201160770851, + "language_loss": 0.79982817, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.82441646, + "num_input_tokens_seen": 201382795, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.21020508, + "step": 9351, + "time_per_iteration": 2.846269130706787 + }, + { + "auxiliary_loss_clip": 0.01437847, + "auxiliary_loss_mlp": 0.0104052, + "balance_loss_clip": 1.26592577, + "balance_loss_mlp": 1.01913333, + "epoch": 0.5622726589508492, + "flos": 23815815759360.0, + "grad_norm": 1.4588508595375442, + "language_loss": 0.5978151, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.62259877, + "num_input_tokens_seen": 201402780, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.21386719, + "step": 9352, + "time_per_iteration": 2.863811492919922 + }, + { + "auxiliary_loss_clip": 0.01394603, + "auxiliary_loss_mlp": 0.01031746, + "balance_loss_clip": 1.23470068, + "balance_loss_mlp": 1.0115515, + "epoch": 0.5623327822035172, + "flos": 24729130172160.0, + "grad_norm": 1.7845986716832944, + "language_loss": 0.72836679, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.75263023, + "num_input_tokens_seen": 201424140, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.20178223, + "step": 9353, + "time_per_iteration": 2.898775577545166 + }, + { + "auxiliary_loss_clip": 0.01419994, + "auxiliary_loss_mlp": 0.01040109, + "balance_loss_clip": 1.25182569, + "balance_loss_mlp": 1.01937795, + "epoch": 0.5623929054561851, + "flos": 14025181173120.0, + "grad_norm": 3.1412465205016797, + "language_loss": 0.77755934, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.80216038, + "num_input_tokens_seen": 201439645, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.20739746, + "step": 9354, + "time_per_iteration": 2.7942707538604736 + }, + { + "auxiliary_loss_clip": 0.01431758, + "auxiliary_loss_mlp": 0.01037168, + "balance_loss_clip": 1.2605269, + "balance_loss_mlp": 1.01520979, + "epoch": 0.5624530287088532, + "flos": 20714184437760.0, + "grad_norm": 2.0608824246219863, + "language_loss": 0.7387377, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.76342702, + "num_input_tokens_seen": 201459970, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.21948242, + "step": 9355, + "time_per_iteration": 2.9006314277648926 + }, + { + "auxiliary_loss_clip": 0.01410145, + "auxiliary_loss_mlp": 0.01034318, + "balance_loss_clip": 1.2442559, + "balance_loss_mlp": 1.01443362, + "epoch": 0.5625131519615211, + "flos": 21480977629440.0, + "grad_norm": 1.6989513527288505, + "language_loss": 0.7432791, + "learning_rate": 1.693344975084274e-06, + "loss": 0.76772374, + "num_input_tokens_seen": 201480055, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.19885254, + "step": 9356, + "time_per_iteration": 2.8853390216827393 + }, + { + "auxiliary_loss_clip": 0.01407872, + "auxiliary_loss_mlp": 0.01039565, + "balance_loss_clip": 1.24416196, + "balance_loss_mlp": 1.01803565, + "epoch": 0.5625732752141891, + "flos": 18707344997760.0, + "grad_norm": 1.9723809837186166, + "language_loss": 0.84349364, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.86796802, + "num_input_tokens_seen": 201497645, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.21520996, + "step": 9357, + "time_per_iteration": 2.8349740505218506 + }, + { + "auxiliary_loss_clip": 0.01411244, + "auxiliary_loss_mlp": 0.01034477, + "balance_loss_clip": 1.24705672, + "balance_loss_mlp": 1.01392484, + "epoch": 0.562633398466857, + "flos": 16225578443520.0, + "grad_norm": 2.0023594186351588, + "language_loss": 0.73083436, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.75529158, + "num_input_tokens_seen": 201515455, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.20568848, + "step": 9358, + "time_per_iteration": 4.248685598373413 + }, + { + "auxiliary_loss_clip": 0.01404114, + "auxiliary_loss_mlp": 0.01037557, + "balance_loss_clip": 1.23988008, + "balance_loss_mlp": 1.01701689, + "epoch": 0.562693521719525, + "flos": 22502101697280.0, + "grad_norm": 1.7105102547204254, + "language_loss": 0.78646964, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.81088638, + "num_input_tokens_seen": 201534500, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.20544434, + "step": 9359, + "time_per_iteration": 4.311997175216675 + }, + { + "auxiliary_loss_clip": 0.01416093, + "auxiliary_loss_mlp": 0.01036705, + "balance_loss_clip": 1.24921036, + "balance_loss_mlp": 1.01722574, + "epoch": 0.562753644972193, + "flos": 25340353182720.0, + "grad_norm": 1.8964015898555722, + "language_loss": 0.71313059, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.73765856, + "num_input_tokens_seen": 201553280, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.19482422, + "step": 9360, + "time_per_iteration": 4.2870965003967285 + }, + { + "auxiliary_loss_clip": 0.01223791, + "auxiliary_loss_mlp": 0.01051149, + "balance_loss_clip": 1.12468517, + "balance_loss_mlp": 1.02578092, + "epoch": 0.562813768224861, + "flos": 67420466426880.0, + "grad_norm": 0.7777117777699426, + "language_loss": 0.55594409, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.57869351, + "num_input_tokens_seen": 201610030, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.25390625, + "step": 9361, + "time_per_iteration": 3.3018205165863037 + }, + { + "auxiliary_loss_clip": 0.01404653, + "auxiliary_loss_mlp": 0.01037352, + "balance_loss_clip": 1.24203253, + "balance_loss_mlp": 1.01737213, + "epoch": 0.562873891477529, + "flos": 23341567317120.0, + "grad_norm": 1.4739897129490012, + "language_loss": 0.82213289, + "learning_rate": 1.691036046141018e-06, + "loss": 0.84655291, + "num_input_tokens_seen": 201628370, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.1998291, + "step": 9362, + "time_per_iteration": 2.8663251399993896 + }, + { + "auxiliary_loss_clip": 0.01408437, + "auxiliary_loss_mlp": 0.01039622, + "balance_loss_clip": 1.24287617, + "balance_loss_mlp": 1.0189271, + "epoch": 0.5629340147301969, + "flos": 38487176190720.0, + "grad_norm": 2.0603222722126153, + "language_loss": 0.75718373, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.78166431, + "num_input_tokens_seen": 201649790, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20703125, + "step": 9363, + "time_per_iteration": 2.9943294525146484 + }, + { + "auxiliary_loss_clip": 0.01419933, + "auxiliary_loss_mlp": 0.0103487, + "balance_loss_clip": 1.25236869, + "balance_loss_mlp": 1.01462865, + "epoch": 0.5629941379828649, + "flos": 29254004513280.0, + "grad_norm": 1.6907564539087825, + "language_loss": 0.83718908, + "learning_rate": 1.690266496731839e-06, + "loss": 0.86173713, + "num_input_tokens_seen": 201669175, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.20227051, + "step": 9364, + "time_per_iteration": 2.9175331592559814 + }, + { + "auxiliary_loss_clip": 0.01399491, + "auxiliary_loss_mlp": 0.01038837, + "balance_loss_clip": 1.2382338, + "balance_loss_mlp": 1.02019286, + "epoch": 0.5630542612355328, + "flos": 19428639903360.0, + "grad_norm": 2.112363532664369, + "language_loss": 0.66463292, + "learning_rate": 1.689881739637642e-06, + "loss": 0.68901616, + "num_input_tokens_seen": 201687000, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.18640137, + "step": 9365, + "time_per_iteration": 2.8665270805358887 + }, + { + "auxiliary_loss_clip": 0.01438667, + "auxiliary_loss_mlp": 0.01035496, + "balance_loss_clip": 1.26545334, + "balance_loss_mlp": 1.01480079, + "epoch": 0.5631143844882008, + "flos": 22274673494400.0, + "grad_norm": 3.0530219738535824, + "language_loss": 0.82323456, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.84797609, + "num_input_tokens_seen": 201703335, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.20690918, + "step": 9366, + "time_per_iteration": 2.821389675140381 + }, + { + "auxiliary_loss_clip": 0.01398512, + "auxiliary_loss_mlp": 0.01040759, + "balance_loss_clip": 1.23671603, + "balance_loss_mlp": 1.02064848, + "epoch": 0.5631745077408687, + "flos": 22975354753920.0, + "grad_norm": 1.4316957510975195, + "language_loss": 0.74765313, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.77204579, + "num_input_tokens_seen": 201723495, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.20117188, + "step": 9367, + "time_per_iteration": 2.8589718341827393 + }, + { + "auxiliary_loss_clip": 0.01222432, + "auxiliary_loss_mlp": 0.01033455, + "balance_loss_clip": 1.1255362, + "balance_loss_mlp": 1.00885057, + "epoch": 0.5632346309935368, + "flos": 65113725317760.0, + "grad_norm": 0.6355456449918558, + "language_loss": 0.53555715, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.55811608, + "num_input_tokens_seen": 201792615, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.24609375, + "step": 9368, + "time_per_iteration": 3.5262792110443115 + }, + { + "auxiliary_loss_clip": 0.01418707, + "auxiliary_loss_mlp": 0.01042219, + "balance_loss_clip": 1.25434899, + "balance_loss_mlp": 1.02263308, + "epoch": 0.5632947542462047, + "flos": 23013251913600.0, + "grad_norm": 1.6119780567689044, + "language_loss": 0.69318819, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.71779752, + "num_input_tokens_seen": 201812520, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19580078, + "step": 9369, + "time_per_iteration": 2.8820393085479736 + }, + { + "auxiliary_loss_clip": 0.01406259, + "auxiliary_loss_mlp": 0.01041228, + "balance_loss_clip": 1.24030542, + "balance_loss_mlp": 1.0212847, + "epoch": 0.5633548774988727, + "flos": 30494910412800.0, + "grad_norm": 1.714091383526198, + "language_loss": 0.76295698, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.78743184, + "num_input_tokens_seen": 201834185, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.19934082, + "step": 9370, + "time_per_iteration": 2.9321036338806152 + }, + { + "auxiliary_loss_clip": 0.01429197, + "auxiliary_loss_mlp": 0.0105185, + "balance_loss_clip": 1.25883889, + "balance_loss_mlp": 1.03023767, + "epoch": 0.5634150007515406, + "flos": 18524193471360.0, + "grad_norm": 1.9598298666554268, + "language_loss": 0.7643494, + "learning_rate": 1.687573444537108e-06, + "loss": 0.78915989, + "num_input_tokens_seen": 201851305, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.21606445, + "step": 9371, + "time_per_iteration": 2.8234100341796875 + }, + { + "auxiliary_loss_clip": 0.01411089, + "auxiliary_loss_mlp": 0.0104755, + "balance_loss_clip": 1.24779141, + "balance_loss_mlp": 1.02791619, + "epoch": 0.5634751240042086, + "flos": 19253994399360.0, + "grad_norm": 1.8302339502994007, + "language_loss": 0.76747006, + "learning_rate": 1.687188770067285e-06, + "loss": 0.79205644, + "num_input_tokens_seen": 201870350, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.19628906, + "step": 9372, + "time_per_iteration": 2.8431994915008545 + }, + { + "auxiliary_loss_clip": 0.01407976, + "auxiliary_loss_mlp": 0.01044474, + "balance_loss_clip": 1.2453239, + "balance_loss_mlp": 1.0238272, + "epoch": 0.5635352472568766, + "flos": 12028114609920.0, + "grad_norm": 3.064069483457172, + "language_loss": 0.72621918, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.75074369, + "num_input_tokens_seen": 201886800, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.2064209, + "step": 9373, + "time_per_iteration": 2.8267507553100586 + }, + { + "auxiliary_loss_clip": 0.01417551, + "auxiliary_loss_mlp": 0.01045344, + "balance_loss_clip": 1.2511313, + "balance_loss_mlp": 1.02357697, + "epoch": 0.5635953705095446, + "flos": 21881648747520.0, + "grad_norm": 1.9323783826923924, + "language_loss": 0.84033948, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.86496842, + "num_input_tokens_seen": 201904730, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.21765137, + "step": 9374, + "time_per_iteration": 2.96347975730896 + }, + { + "auxiliary_loss_clip": 0.01406106, + "auxiliary_loss_mlp": 0.01040547, + "balance_loss_clip": 1.24189246, + "balance_loss_mlp": 1.0211401, + "epoch": 0.5636554937622126, + "flos": 27137681360640.0, + "grad_norm": 1.5082911657906233, + "language_loss": 0.67234302, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.69680953, + "num_input_tokens_seen": 201924850, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.1940918, + "step": 9375, + "time_per_iteration": 2.9098799228668213 + }, + { + "auxiliary_loss_clip": 0.01418046, + "auxiliary_loss_mlp": 0.01043079, + "balance_loss_clip": 1.24935555, + "balance_loss_mlp": 1.0225271, + "epoch": 0.5637156170148805, + "flos": 12932470552320.0, + "grad_norm": 2.074510503932805, + "language_loss": 0.81165183, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.83626306, + "num_input_tokens_seen": 201939500, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.20544434, + "step": 9376, + "time_per_iteration": 2.7961621284484863 + }, + { + "auxiliary_loss_clip": 0.01423293, + "auxiliary_loss_mlp": 0.01039398, + "balance_loss_clip": 1.25147486, + "balance_loss_mlp": 1.01908457, + "epoch": 0.5637757402675485, + "flos": 45567213431040.0, + "grad_norm": 1.36438214249065, + "language_loss": 0.70256335, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.72719026, + "num_input_tokens_seen": 201963000, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.203125, + "step": 9377, + "time_per_iteration": 3.052825450897217 + }, + { + "auxiliary_loss_clip": 0.01406413, + "auxiliary_loss_mlp": 0.0103999, + "balance_loss_clip": 1.24455607, + "balance_loss_mlp": 1.02035618, + "epoch": 0.5638358635202164, + "flos": 20895661906560.0, + "grad_norm": 1.3331957922916715, + "language_loss": 0.75091738, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.77538139, + "num_input_tokens_seen": 201983145, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19628906, + "step": 9378, + "time_per_iteration": 2.8639626502990723 + }, + { + "auxiliary_loss_clip": 0.01436299, + "auxiliary_loss_mlp": 0.01034837, + "balance_loss_clip": 1.26227987, + "balance_loss_mlp": 1.01489305, + "epoch": 0.5638959867728844, + "flos": 18815697590400.0, + "grad_norm": 2.268732276837131, + "language_loss": 0.83194983, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.8566612, + "num_input_tokens_seen": 202000335, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.19934082, + "step": 9379, + "time_per_iteration": 2.8374645709991455 + }, + { + "auxiliary_loss_clip": 0.01415682, + "auxiliary_loss_mlp": 0.01038291, + "balance_loss_clip": 1.24827003, + "balance_loss_mlp": 1.01864481, + "epoch": 0.5639561100255523, + "flos": 27501767418240.0, + "grad_norm": 2.223143489132079, + "language_loss": 0.73289323, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.75743294, + "num_input_tokens_seen": 202018275, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.19641113, + "step": 9380, + "time_per_iteration": 2.8951053619384766 + }, + { + "auxiliary_loss_clip": 0.01421396, + "auxiliary_loss_mlp": 0.01042668, + "balance_loss_clip": 1.25323105, + "balance_loss_mlp": 1.02119863, + "epoch": 0.5640162332782204, + "flos": 18085444214400.0, + "grad_norm": 3.2341472460551066, + "language_loss": 0.75731438, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.781955, + "num_input_tokens_seen": 202034330, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.21472168, + "step": 9381, + "time_per_iteration": 4.275495529174805 + }, + { + "auxiliary_loss_clip": 0.01422864, + "auxiliary_loss_mlp": 0.01037272, + "balance_loss_clip": 1.2526722, + "balance_loss_mlp": 1.01759052, + "epoch": 0.5640763565308883, + "flos": 20894168828160.0, + "grad_norm": 2.322667603813712, + "language_loss": 0.73521984, + "learning_rate": 1.683342680176499e-06, + "loss": 0.75982118, + "num_input_tokens_seen": 202053100, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.19677734, + "step": 9382, + "time_per_iteration": 2.852339029312134 + }, + { + "auxiliary_loss_clip": 0.01227916, + "auxiliary_loss_mlp": 0.01024033, + "balance_loss_clip": 1.12773645, + "balance_loss_mlp": 1.00228965, + "epoch": 0.5641364797835563, + "flos": 64477753401600.0, + "grad_norm": 0.7581882613420264, + "language_loss": 0.54447323, + "learning_rate": 1.682958136989022e-06, + "loss": 0.5669927, + "num_input_tokens_seen": 202120125, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.21777344, + "step": 9383, + "time_per_iteration": 3.487288475036621 + }, + { + "auxiliary_loss_clip": 0.01442321, + "auxiliary_loss_mlp": 0.01035001, + "balance_loss_clip": 1.26962399, + "balance_loss_mlp": 1.01478302, + "epoch": 0.5641966030362242, + "flos": 18670126510080.0, + "grad_norm": 1.8181744635838362, + "language_loss": 0.71667194, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.74144518, + "num_input_tokens_seen": 202138030, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.20227051, + "step": 9384, + "time_per_iteration": 2.847229242324829 + }, + { + "auxiliary_loss_clip": 0.01416249, + "auxiliary_loss_mlp": 0.01033702, + "balance_loss_clip": 1.24791849, + "balance_loss_mlp": 1.01316214, + "epoch": 0.5642567262888922, + "flos": 22502599390080.0, + "grad_norm": 1.9385148323712222, + "language_loss": 0.76499945, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.78949904, + "num_input_tokens_seen": 202155580, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.20532227, + "step": 9385, + "time_per_iteration": 2.877686023712158 + }, + { + "auxiliary_loss_clip": 0.01410278, + "auxiliary_loss_mlp": 0.01038545, + "balance_loss_clip": 1.24477279, + "balance_loss_mlp": 1.01912558, + "epoch": 0.5643168495415603, + "flos": 13011251028480.0, + "grad_norm": 2.1146457356709565, + "language_loss": 0.82908708, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.85357535, + "num_input_tokens_seen": 202170365, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.19421387, + "step": 9386, + "time_per_iteration": 2.8118197917938232 + }, + { + "auxiliary_loss_clip": 0.01437712, + "auxiliary_loss_mlp": 0.01035984, + "balance_loss_clip": 1.26500678, + "balance_loss_mlp": 1.01553965, + "epoch": 0.5643769727942282, + "flos": 18597544571520.0, + "grad_norm": 1.8255916140831692, + "language_loss": 0.71131074, + "learning_rate": 1.681420084607516e-06, + "loss": 0.73604774, + "num_input_tokens_seen": 202189095, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.20458984, + "step": 9387, + "time_per_iteration": 2.8321290016174316 + }, + { + "auxiliary_loss_clip": 0.01437643, + "auxiliary_loss_mlp": 0.01036619, + "balance_loss_clip": 1.26595187, + "balance_loss_mlp": 1.01619887, + "epoch": 0.5644370960468962, + "flos": 33820350353280.0, + "grad_norm": 1.5963940916579629, + "language_loss": 0.75297099, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.77771366, + "num_input_tokens_seen": 202213500, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.20422363, + "step": 9388, + "time_per_iteration": 2.968747615814209 + }, + { + "auxiliary_loss_clip": 0.01405198, + "auxiliary_loss_mlp": 0.0103162, + "balance_loss_clip": 1.24193406, + "balance_loss_mlp": 1.01298797, + "epoch": 0.5644972192995641, + "flos": 21224746471680.0, + "grad_norm": 1.5038401033374895, + "language_loss": 0.82912266, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.85349089, + "num_input_tokens_seen": 202231920, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.18640137, + "step": 9389, + "time_per_iteration": 2.884032964706421 + }, + { + "auxiliary_loss_clip": 0.01427182, + "auxiliary_loss_mlp": 0.0103569, + "balance_loss_clip": 1.25746167, + "balance_loss_mlp": 1.0150435, + "epoch": 0.5645573425522321, + "flos": 18596820654720.0, + "grad_norm": 1.859611929736712, + "language_loss": 0.64393973, + "learning_rate": 1.680266672116467e-06, + "loss": 0.66856843, + "num_input_tokens_seen": 202247600, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.2064209, + "step": 9390, + "time_per_iteration": 2.8411383628845215 + }, + { + "auxiliary_loss_clip": 0.01411961, + "auxiliary_loss_mlp": 0.01031815, + "balance_loss_clip": 1.24688363, + "balance_loss_mlp": 1.01220465, + "epoch": 0.5646174658049, + "flos": 18122888926080.0, + "grad_norm": 1.7063661023909833, + "language_loss": 0.93209946, + "learning_rate": 1.6798822255153192e-06, + "loss": 0.95653713, + "num_input_tokens_seen": 202265350, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19616699, + "step": 9391, + "time_per_iteration": 2.836562395095825 + }, + { + "auxiliary_loss_clip": 0.01438898, + "auxiliary_loss_mlp": 0.01036651, + "balance_loss_clip": 1.26500416, + "balance_loss_mlp": 1.01496673, + "epoch": 0.564677589057568, + "flos": 28341730730880.0, + "grad_norm": 2.466757371544214, + "language_loss": 0.62037551, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.64513099, + "num_input_tokens_seen": 202284285, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.2166748, + "step": 9392, + "time_per_iteration": 4.329622268676758 + }, + { + "auxiliary_loss_clip": 0.01421063, + "auxiliary_loss_mlp": 0.01033127, + "balance_loss_clip": 1.2527976, + "balance_loss_mlp": 1.01288581, + "epoch": 0.564737712310236, + "flos": 22173876783360.0, + "grad_norm": 2.3602645247450003, + "language_loss": 0.82508719, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.8496291, + "num_input_tokens_seen": 202303450, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.20239258, + "step": 9393, + "time_per_iteration": 2.88104248046875 + }, + { + "auxiliary_loss_clip": 0.01423514, + "auxiliary_loss_mlp": 0.0103259, + "balance_loss_clip": 1.25686729, + "balance_loss_mlp": 1.01245511, + "epoch": 0.564797835562904, + "flos": 20968651048320.0, + "grad_norm": 1.972706552632908, + "language_loss": 0.87693, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.90149105, + "num_input_tokens_seen": 202322315, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.20141602, + "step": 9394, + "time_per_iteration": 4.339195966720581 + }, + { + "auxiliary_loss_clip": 0.01420822, + "auxiliary_loss_mlp": 0.01040691, + "balance_loss_clip": 1.2549994, + "balance_loss_mlp": 1.02092648, + "epoch": 0.5648579588155719, + "flos": 17429130120960.0, + "grad_norm": 1.7747426413288732, + "language_loss": 0.86112571, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.88574082, + "num_input_tokens_seen": 202339905, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.19763184, + "step": 9395, + "time_per_iteration": 4.272717475891113 + }, + { + "auxiliary_loss_clip": 0.01226234, + "auxiliary_loss_mlp": 0.01024949, + "balance_loss_clip": 1.13356709, + "balance_loss_mlp": 1.00263298, + "epoch": 0.5649180820682399, + "flos": 69963114499200.0, + "grad_norm": 0.795948793216419, + "language_loss": 0.58376384, + "learning_rate": 1.677960174884597e-06, + "loss": 0.60627568, + "num_input_tokens_seen": 202397320, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.22363281, + "step": 9396, + "time_per_iteration": 3.3553919792175293 + }, + { + "auxiliary_loss_clip": 0.0144666, + "auxiliary_loss_mlp": 0.01033283, + "balance_loss_clip": 1.27568412, + "balance_loss_mlp": 1.01355386, + "epoch": 0.5649782053209078, + "flos": 24983913496320.0, + "grad_norm": 1.7967393232975348, + "language_loss": 0.71360409, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.7384035, + "num_input_tokens_seen": 202416865, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.1973877, + "step": 9397, + "time_per_iteration": 2.900918483734131 + }, + { + "auxiliary_loss_clip": 0.01438305, + "auxiliary_loss_mlp": 0.01036075, + "balance_loss_clip": 1.26708269, + "balance_loss_mlp": 1.01602387, + "epoch": 0.5650383285735758, + "flos": 21736801584000.0, + "grad_norm": 2.3240852399449223, + "language_loss": 0.67901731, + "learning_rate": 1.67719144001275e-06, + "loss": 0.70376104, + "num_input_tokens_seen": 202436210, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.20043945, + "step": 9398, + "time_per_iteration": 2.8741259574890137 + }, + { + "auxiliary_loss_clip": 0.01238678, + "auxiliary_loss_mlp": 0.01024849, + "balance_loss_clip": 1.14254212, + "balance_loss_mlp": 0.99986249, + "epoch": 0.5650984518262439, + "flos": 65933961897600.0, + "grad_norm": 0.7804560745935144, + "language_loss": 0.58168489, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.60432017, + "num_input_tokens_seen": 202492925, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.24902344, + "step": 9399, + "time_per_iteration": 3.268998146057129 + }, + { + "auxiliary_loss_clip": 0.01438163, + "auxiliary_loss_mlp": 0.01038433, + "balance_loss_clip": 1.26536059, + "balance_loss_mlp": 1.01773834, + "epoch": 0.5651585750789118, + "flos": 21042409351680.0, + "grad_norm": 1.8202441845040518, + "language_loss": 0.73517835, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.75994432, + "num_input_tokens_seen": 202511905, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.20690918, + "step": 9400, + "time_per_iteration": 2.8744728565216064 + }, + { + "auxiliary_loss_clip": 0.01451148, + "auxiliary_loss_mlp": 0.01041391, + "balance_loss_clip": 1.27694178, + "balance_loss_mlp": 1.0203867, + "epoch": 0.5652186983315798, + "flos": 18561185735040.0, + "grad_norm": 2.1055788907332804, + "language_loss": 0.61871833, + "learning_rate": 1.676038429548412e-06, + "loss": 0.64364374, + "num_input_tokens_seen": 202529815, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.21008301, + "step": 9401, + "time_per_iteration": 2.9127774238586426 + }, + { + "auxiliary_loss_clip": 0.01436551, + "auxiliary_loss_mlp": 0.01034951, + "balance_loss_clip": 1.26808512, + "balance_loss_mlp": 1.01540065, + "epoch": 0.5652788215842477, + "flos": 18487834634880.0, + "grad_norm": 1.8836278288537285, + "language_loss": 0.81901073, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.8437258, + "num_input_tokens_seen": 202547710, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.19543457, + "step": 9402, + "time_per_iteration": 2.8449625968933105 + }, + { + "auxiliary_loss_clip": 0.01425072, + "auxiliary_loss_mlp": 0.01041522, + "balance_loss_clip": 1.25865054, + "balance_loss_mlp": 1.0227468, + "epoch": 0.5653389448369157, + "flos": 30056070666240.0, + "grad_norm": 1.3837350678578444, + "language_loss": 0.78417832, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.80884427, + "num_input_tokens_seen": 202568835, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.18762207, + "step": 9403, + "time_per_iteration": 2.9033427238464355 + }, + { + "auxiliary_loss_clip": 0.01438379, + "auxiliary_loss_mlp": 0.01044512, + "balance_loss_clip": 1.27000928, + "balance_loss_mlp": 1.02436519, + "epoch": 0.5653990680895836, + "flos": 16736321456640.0, + "grad_norm": 1.688410070589302, + "language_loss": 0.69972134, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.72455025, + "num_input_tokens_seen": 202587385, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.20141602, + "step": 9404, + "time_per_iteration": 2.8530681133270264 + }, + { + "auxiliary_loss_clip": 0.01423742, + "auxiliary_loss_mlp": 0.01036455, + "balance_loss_clip": 1.2598207, + "balance_loss_mlp": 1.01717854, + "epoch": 0.5654591913422516, + "flos": 14546420979840.0, + "grad_norm": 1.8894020963364817, + "language_loss": 0.67997372, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.70457578, + "num_input_tokens_seen": 202604815, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19274902, + "step": 9405, + "time_per_iteration": 2.815533399581909 + }, + { + "auxiliary_loss_clip": 0.01420662, + "auxiliary_loss_mlp": 0.01036539, + "balance_loss_clip": 1.26054168, + "balance_loss_mlp": 1.01742983, + "epoch": 0.5655193145949196, + "flos": 26220113936640.0, + "grad_norm": 1.7506548530627937, + "language_loss": 0.75133622, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.77590823, + "num_input_tokens_seen": 202623775, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.19104004, + "step": 9406, + "time_per_iteration": 2.8814337253570557 + }, + { + "auxiliary_loss_clip": 0.01451575, + "auxiliary_loss_mlp": 0.01040103, + "balance_loss_clip": 1.28044248, + "balance_loss_mlp": 1.0195632, + "epoch": 0.5655794378475876, + "flos": 25057581310080.0, + "grad_norm": 4.235114589794394, + "language_loss": 0.80161226, + "learning_rate": 1.673732740698882e-06, + "loss": 0.82652903, + "num_input_tokens_seen": 202643375, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.20544434, + "step": 9407, + "time_per_iteration": 2.8819329738616943 + }, + { + "auxiliary_loss_clip": 0.01431928, + "auxiliary_loss_mlp": 0.01039565, + "balance_loss_clip": 1.26857901, + "balance_loss_mlp": 1.02017009, + "epoch": 0.5656395611002555, + "flos": 31046084294400.0, + "grad_norm": 1.4342920188084056, + "language_loss": 0.72090453, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.74561942, + "num_input_tokens_seen": 202668400, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19396973, + "step": 9408, + "time_per_iteration": 2.9531404972076416 + }, + { + "auxiliary_loss_clip": 0.01427705, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.26391757, + "balance_loss_mlp": 1.0143851, + "epoch": 0.5656996843529235, + "flos": 20239031099520.0, + "grad_norm": 2.2292977384186052, + "language_loss": 0.81928909, + "learning_rate": 1.672964276570308e-06, + "loss": 0.8439039, + "num_input_tokens_seen": 202685125, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19396973, + "step": 9409, + "time_per_iteration": 2.8472046852111816 + }, + { + "auxiliary_loss_clip": 0.01439677, + "auxiliary_loss_mlp": 0.0103405, + "balance_loss_clip": 1.27089834, + "balance_loss_mlp": 1.01451159, + "epoch": 0.5657598076055914, + "flos": 21006095760000.0, + "grad_norm": 1.6612052019295793, + "language_loss": 0.78912699, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.81386429, + "num_input_tokens_seen": 202703830, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.19543457, + "step": 9410, + "time_per_iteration": 2.8426504135131836 + }, + { + "auxiliary_loss_clip": 0.01437124, + "auxiliary_loss_mlp": 0.01036932, + "balance_loss_clip": 1.26987588, + "balance_loss_mlp": 1.01736927, + "epoch": 0.5658199308582594, + "flos": 11553277985280.0, + "grad_norm": 2.383238828255604, + "language_loss": 0.8424896, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.86723024, + "num_input_tokens_seen": 202719835, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.19555664, + "step": 9411, + "time_per_iteration": 2.834829568862915 + }, + { + "auxiliary_loss_clip": 0.01452636, + "auxiliary_loss_mlp": 0.01036761, + "balance_loss_clip": 1.28040481, + "balance_loss_mlp": 1.01644742, + "epoch": 0.5658800541109275, + "flos": 14179891703040.0, + "grad_norm": 7.261702149814612, + "language_loss": 0.68351585, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.70840979, + "num_input_tokens_seen": 202736795, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.20300293, + "step": 9412, + "time_per_iteration": 2.8223440647125244 + }, + { + "auxiliary_loss_clip": 0.01421868, + "auxiliary_loss_mlp": 0.01032718, + "balance_loss_clip": 1.26023149, + "balance_loss_mlp": 1.01352525, + "epoch": 0.5659401773635954, + "flos": 27315720224640.0, + "grad_norm": 1.40825707180426, + "language_loss": 0.58803034, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.61257613, + "num_input_tokens_seen": 202756900, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19189453, + "step": 9413, + "time_per_iteration": 2.8971059322357178 + }, + { + "auxiliary_loss_clip": 0.01423166, + "auxiliary_loss_mlp": 0.01035133, + "balance_loss_clip": 1.25879955, + "balance_loss_mlp": 1.01635754, + "epoch": 0.5660003006162634, + "flos": 16737135863040.0, + "grad_norm": 1.689984000509327, + "language_loss": 0.70497054, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.72955352, + "num_input_tokens_seen": 202775145, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.18798828, + "step": 9414, + "time_per_iteration": 2.815650224685669 + }, + { + "auxiliary_loss_clip": 0.01430463, + "auxiliary_loss_mlp": 0.01034533, + "balance_loss_clip": 1.26534247, + "balance_loss_mlp": 1.01580513, + "epoch": 0.5660604238689313, + "flos": 21663586218240.0, + "grad_norm": 1.5307436858923236, + "language_loss": 0.7880283, + "learning_rate": 1.670659182280247e-06, + "loss": 0.81267828, + "num_input_tokens_seen": 202794505, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.18713379, + "step": 9415, + "time_per_iteration": 4.318682670593262 + }, + { + "auxiliary_loss_clip": 0.01239351, + "auxiliary_loss_mlp": 0.01019786, + "balance_loss_clip": 1.14269853, + "balance_loss_mlp": 1.0009985, + "epoch": 0.5661205471215993, + "flos": 68854432464000.0, + "grad_norm": 0.6834210269193465, + "language_loss": 0.49173048, + "learning_rate": 1.670275043523822e-06, + "loss": 0.51432186, + "num_input_tokens_seen": 202858580, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 0.1875, + "step": 9416, + "time_per_iteration": 3.5066428184509277 + }, + { + "auxiliary_loss_clip": 0.01443836, + "auxiliary_loss_mlp": 0.01036994, + "balance_loss_clip": 1.27535439, + "balance_loss_mlp": 1.01690745, + "epoch": 0.5661806703742672, + "flos": 28633596808320.0, + "grad_norm": 2.3201586426157017, + "language_loss": 0.64116001, + "learning_rate": 1.6698909172706e-06, + "loss": 0.6659683, + "num_input_tokens_seen": 202878565, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.20080566, + "step": 9417, + "time_per_iteration": 2.957254409790039 + }, + { + "auxiliary_loss_clip": 0.01436984, + "auxiliary_loss_mlp": 0.01032898, + "balance_loss_clip": 1.26676369, + "balance_loss_mlp": 1.01299, + "epoch": 0.5662407936269352, + "flos": 21407988487680.0, + "grad_norm": 1.658988816320876, + "language_loss": 0.69352567, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.71822447, + "num_input_tokens_seen": 202897350, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.19885254, + "step": 9418, + "time_per_iteration": 2.9104931354522705 + }, + { + "auxiliary_loss_clip": 0.01431905, + "auxiliary_loss_mlp": 0.01034766, + "balance_loss_clip": 1.26424956, + "balance_loss_mlp": 1.01454866, + "epoch": 0.5663009168796032, + "flos": 25669392503040.0, + "grad_norm": 1.7886751878880403, + "language_loss": 0.65221971, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.67688638, + "num_input_tokens_seen": 202916745, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.20227051, + "step": 9419, + "time_per_iteration": 2.9190573692321777 + }, + { + "auxiliary_loss_clip": 0.01234458, + "auxiliary_loss_mlp": 0.01025605, + "balance_loss_clip": 1.13892555, + "balance_loss_mlp": 1.00367022, + "epoch": 0.5663610401322712, + "flos": 67965459753600.0, + "grad_norm": 0.7534772980186489, + "language_loss": 0.59746611, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.62006676, + "num_input_tokens_seen": 202982375, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.21972656, + "step": 9420, + "time_per_iteration": 3.353546619415283 + }, + { + "auxiliary_loss_clip": 0.01430826, + "auxiliary_loss_mlp": 0.01037782, + "balance_loss_clip": 1.26598978, + "balance_loss_mlp": 1.01897073, + "epoch": 0.5664211633849391, + "flos": 24620415621120.0, + "grad_norm": 1.7420575134703122, + "language_loss": 0.75812018, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.78280628, + "num_input_tokens_seen": 203002430, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.18798828, + "step": 9421, + "time_per_iteration": 2.8828439712524414 + }, + { + "auxiliary_loss_clip": 0.01435542, + "auxiliary_loss_mlp": 0.01035761, + "balance_loss_clip": 1.26540816, + "balance_loss_mlp": 1.01668715, + "epoch": 0.5664812866376071, + "flos": 11654391409920.0, + "grad_norm": 2.1195294468722357, + "language_loss": 0.74239552, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.76710856, + "num_input_tokens_seen": 203019425, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.1907959, + "step": 9422, + "time_per_iteration": 2.841552972793579 + }, + { + "auxiliary_loss_clip": 0.0142642, + "auxiliary_loss_mlp": 0.01034903, + "balance_loss_clip": 1.26231456, + "balance_loss_mlp": 1.01683092, + "epoch": 0.566541409890275, + "flos": 24654557462400.0, + "grad_norm": 1.8289011604541323, + "language_loss": 0.82164085, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.84625411, + "num_input_tokens_seen": 203039035, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.18066406, + "step": 9423, + "time_per_iteration": 2.8847339153289795 + }, + { + "auxiliary_loss_clip": 0.01434456, + "auxiliary_loss_mlp": 0.01036898, + "balance_loss_clip": 1.26784575, + "balance_loss_mlp": 1.01741958, + "epoch": 0.566601533142943, + "flos": 22280057625600.0, + "grad_norm": 1.7173130125778444, + "language_loss": 0.81583792, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.84055144, + "num_input_tokens_seen": 203059320, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.19482422, + "step": 9424, + "time_per_iteration": 2.8601438999176025 + }, + { + "auxiliary_loss_clip": 0.01438195, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.26757312, + "balance_loss_mlp": 1.01350617, + "epoch": 0.5666616563956111, + "flos": 29983805441280.0, + "grad_norm": 1.9357972786342548, + "language_loss": 0.79486442, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.81957698, + "num_input_tokens_seen": 203078490, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.19543457, + "step": 9425, + "time_per_iteration": 2.897944927215576 + }, + { + "auxiliary_loss_clip": 0.01421783, + "auxiliary_loss_mlp": 0.01033734, + "balance_loss_clip": 1.25585485, + "balance_loss_mlp": 1.01396883, + "epoch": 0.566721779648279, + "flos": 17789415615360.0, + "grad_norm": 2.0877704395054213, + "language_loss": 0.59876502, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.62332016, + "num_input_tokens_seen": 203096065, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.19763184, + "step": 9426, + "time_per_iteration": 2.831064462661743 + }, + { + "auxiliary_loss_clip": 0.01441147, + "auxiliary_loss_mlp": 0.01033858, + "balance_loss_clip": 1.27021086, + "balance_loss_mlp": 1.01458192, + "epoch": 0.566781902900947, + "flos": 21043540471680.0, + "grad_norm": 1.5933838572721544, + "language_loss": 0.82286489, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.847615, + "num_input_tokens_seen": 203115270, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.19274902, + "step": 9427, + "time_per_iteration": 4.355990886688232 + }, + { + "auxiliary_loss_clip": 0.01423334, + "auxiliary_loss_mlp": 0.01034038, + "balance_loss_clip": 1.26056004, + "balance_loss_mlp": 1.01465487, + "epoch": 0.5668420261536149, + "flos": 23158913483520.0, + "grad_norm": 1.7717698194449667, + "language_loss": 0.86842209, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.89299583, + "num_input_tokens_seen": 203134290, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19384766, + "step": 9428, + "time_per_iteration": 2.897840976715088 + }, + { + "auxiliary_loss_clip": 0.01438728, + "auxiliary_loss_mlp": 0.01040245, + "balance_loss_clip": 1.2664783, + "balance_loss_mlp": 1.02009869, + "epoch": 0.5669021494062829, + "flos": 22611856878720.0, + "grad_norm": 2.7325118607728913, + "language_loss": 0.73919731, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.76398706, + "num_input_tokens_seen": 203152935, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.20129395, + "step": 9429, + "time_per_iteration": 4.237279891967773 + }, + { + "auxiliary_loss_clip": 0.01437018, + "auxiliary_loss_mlp": 0.01035799, + "balance_loss_clip": 1.26581514, + "balance_loss_mlp": 1.01581979, + "epoch": 0.5669622726589508, + "flos": 17389377924480.0, + "grad_norm": 1.7631268607340798, + "language_loss": 0.76056099, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.78528923, + "num_input_tokens_seen": 203170110, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.1998291, + "step": 9430, + "time_per_iteration": 4.221285104751587 + }, + { + "auxiliary_loss_clip": 0.01434374, + "auxiliary_loss_mlp": 0.01034895, + "balance_loss_clip": 1.26724231, + "balance_loss_mlp": 1.01503515, + "epoch": 0.5670223959116188, + "flos": 18770606507520.0, + "grad_norm": 2.037654132148481, + "language_loss": 0.7358259, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.76051855, + "num_input_tokens_seen": 203188825, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.1986084, + "step": 9431, + "time_per_iteration": 2.852400541305542 + }, + { + "auxiliary_loss_clip": 0.01393186, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.23572087, + "balance_loss_mlp": 1.01281655, + "epoch": 0.5670825191642868, + "flos": 13561293790080.0, + "grad_norm": 1.7425923165099155, + "language_loss": 0.74225903, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.7665084, + "num_input_tokens_seen": 203206860, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.18933105, + "step": 9432, + "time_per_iteration": 2.8428280353546143 + }, + { + "auxiliary_loss_clip": 0.01424315, + "auxiliary_loss_mlp": 0.0103437, + "balance_loss_clip": 1.25721073, + "balance_loss_mlp": 1.01466429, + "epoch": 0.5671426424169548, + "flos": 22064076357120.0, + "grad_norm": 1.4640797497492521, + "language_loss": 0.78866804, + "learning_rate": 1.663746609539197e-06, + "loss": 0.81325489, + "num_input_tokens_seen": 203225625, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.19714355, + "step": 9433, + "time_per_iteration": 2.871584415435791 + }, + { + "auxiliary_loss_clip": 0.01428109, + "auxiliary_loss_mlp": 0.01035996, + "balance_loss_clip": 1.25704789, + "balance_loss_mlp": 1.01402521, + "epoch": 0.5672027656696227, + "flos": 21333732491520.0, + "grad_norm": 3.061552338031281, + "language_loss": 0.64415681, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.66879785, + "num_input_tokens_seen": 203242920, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.21972656, + "step": 9434, + "time_per_iteration": 2.847593069076538 + }, + { + "auxiliary_loss_clip": 0.01408463, + "auxiliary_loss_mlp": 0.01033928, + "balance_loss_clip": 1.24457812, + "balance_loss_mlp": 1.01438928, + "epoch": 0.5672628889222907, + "flos": 23524628353920.0, + "grad_norm": 1.8061187349951455, + "language_loss": 0.66941649, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.69384038, + "num_input_tokens_seen": 203261995, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19543457, + "step": 9435, + "time_per_iteration": 2.9053030014038086 + }, + { + "auxiliary_loss_clip": 0.01407147, + "auxiliary_loss_mlp": 0.0102968, + "balance_loss_clip": 1.24265754, + "balance_loss_mlp": 1.01047516, + "epoch": 0.5673230121749586, + "flos": 27132342474240.0, + "grad_norm": 34.46757317767153, + "language_loss": 0.72155058, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.74591887, + "num_input_tokens_seen": 203280670, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19189453, + "step": 9436, + "time_per_iteration": 2.8863418102264404 + }, + { + "auxiliary_loss_clip": 0.01429011, + "auxiliary_loss_mlp": 0.0103552, + "balance_loss_clip": 1.25790048, + "balance_loss_mlp": 1.01515937, + "epoch": 0.5673831354276266, + "flos": 31155160803840.0, + "grad_norm": 1.6100964268055542, + "language_loss": 0.74743372, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.77207905, + "num_input_tokens_seen": 203304800, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.20361328, + "step": 9437, + "time_per_iteration": 2.947248697280884 + }, + { + "auxiliary_loss_clip": 0.01427966, + "auxiliary_loss_mlp": 0.01034853, + "balance_loss_clip": 1.25898719, + "balance_loss_mlp": 1.01506472, + "epoch": 0.5674432586802945, + "flos": 27684783210240.0, + "grad_norm": 1.9088402119251382, + "language_loss": 0.61716866, + "learning_rate": 1.661827179985277e-06, + "loss": 0.64179695, + "num_input_tokens_seen": 203324060, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.19775391, + "step": 9438, + "time_per_iteration": 2.9599976539611816 + }, + { + "auxiliary_loss_clip": 0.01419722, + "auxiliary_loss_mlp": 0.01034814, + "balance_loss_clip": 1.25071859, + "balance_loss_mlp": 1.0153712, + "epoch": 0.5675033819329626, + "flos": 26626485899520.0, + "grad_norm": 1.4536720182862295, + "language_loss": 0.75945854, + "learning_rate": 1.661443332486909e-06, + "loss": 0.78400385, + "num_input_tokens_seen": 203344360, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.19445801, + "step": 9439, + "time_per_iteration": 2.8907687664031982 + }, + { + "auxiliary_loss_clip": 0.01413379, + "auxiliary_loss_mlp": 0.01033738, + "balance_loss_clip": 1.2486304, + "balance_loss_mlp": 1.01270962, + "epoch": 0.5675635051856306, + "flos": 19107563667840.0, + "grad_norm": 1.9506268761534913, + "language_loss": 0.84168601, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.86615717, + "num_input_tokens_seen": 203362115, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.21032715, + "step": 9440, + "time_per_iteration": 2.8396666049957275 + }, + { + "auxiliary_loss_clip": 0.01433925, + "auxiliary_loss_mlp": 0.01038772, + "balance_loss_clip": 1.26058745, + "balance_loss_mlp": 1.01792192, + "epoch": 0.5676236284382985, + "flos": 17575198894080.0, + "grad_norm": 2.0667624243172926, + "language_loss": 0.76768792, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.79241484, + "num_input_tokens_seen": 203380550, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.20849609, + "step": 9441, + "time_per_iteration": 2.8016974925994873 + }, + { + "auxiliary_loss_clip": 0.01420666, + "auxiliary_loss_mlp": 0.01035337, + "balance_loss_clip": 1.25374007, + "balance_loss_mlp": 1.01561987, + "epoch": 0.5676837516909665, + "flos": 15960750773760.0, + "grad_norm": 1.8099527254814918, + "language_loss": 0.8363477, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.86090779, + "num_input_tokens_seen": 203396590, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.19726562, + "step": 9442, + "time_per_iteration": 2.8361682891845703 + }, + { + "auxiliary_loss_clip": 0.01398651, + "auxiliary_loss_mlp": 0.01036521, + "balance_loss_clip": 1.23849964, + "balance_loss_mlp": 1.0153017, + "epoch": 0.5677438749436344, + "flos": 18304954577280.0, + "grad_norm": 2.0092365533127996, + "language_loss": 0.75162971, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.77598137, + "num_input_tokens_seen": 203414280, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.21228027, + "step": 9443, + "time_per_iteration": 2.8197946548461914 + }, + { + "auxiliary_loss_clip": 0.01422529, + "auxiliary_loss_mlp": 0.01032155, + "balance_loss_clip": 1.25566483, + "balance_loss_mlp": 1.01216388, + "epoch": 0.5678039981963025, + "flos": 17940325582080.0, + "grad_norm": 1.8382722535720921, + "language_loss": 0.78273636, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.80728316, + "num_input_tokens_seen": 203433280, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.1998291, + "step": 9444, + "time_per_iteration": 2.895387649536133 + }, + { + "auxiliary_loss_clip": 0.01431482, + "auxiliary_loss_mlp": 0.01041739, + "balance_loss_clip": 1.25951064, + "balance_loss_mlp": 1.02061558, + "epoch": 0.5678641214489704, + "flos": 19325354728320.0, + "grad_norm": 2.0585487914692755, + "language_loss": 0.81391823, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.83865047, + "num_input_tokens_seen": 203449935, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.21130371, + "step": 9445, + "time_per_iteration": 2.8279154300689697 + }, + { + "auxiliary_loss_clip": 0.0141918, + "auxiliary_loss_mlp": 0.01033739, + "balance_loss_clip": 1.25174046, + "balance_loss_mlp": 1.01336598, + "epoch": 0.5679242447016384, + "flos": 27763970889600.0, + "grad_norm": 1.4110182194907486, + "language_loss": 0.71606338, + "learning_rate": 1.658756760280259e-06, + "loss": 0.7405926, + "num_input_tokens_seen": 203473025, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.20361328, + "step": 9446, + "time_per_iteration": 2.917187213897705 + }, + { + "auxiliary_loss_clip": 0.01430781, + "auxiliary_loss_mlp": 0.0103433, + "balance_loss_clip": 1.25838065, + "balance_loss_mlp": 1.01409972, + "epoch": 0.5679843679543063, + "flos": 23779637902080.0, + "grad_norm": 1.7287073250120715, + "language_loss": 0.74734282, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.77199399, + "num_input_tokens_seen": 203492895, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.20227051, + "step": 9447, + "time_per_iteration": 2.8806142807006836 + }, + { + "auxiliary_loss_clip": 0.01431203, + "auxiliary_loss_mlp": 0.01035207, + "balance_loss_clip": 1.25962043, + "balance_loss_mlp": 1.01435745, + "epoch": 0.5680444912069743, + "flos": 25602511409280.0, + "grad_norm": 2.099535486194818, + "language_loss": 0.75725633, + "learning_rate": 1.657989284462725e-06, + "loss": 0.78192043, + "num_input_tokens_seen": 203513710, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.20849609, + "step": 9448, + "time_per_iteration": 2.8827052116394043 + }, + { + "auxiliary_loss_clip": 0.01442634, + "auxiliary_loss_mlp": 0.01031439, + "balance_loss_clip": 1.26969826, + "balance_loss_mlp": 1.01236558, + "epoch": 0.5681046144596422, + "flos": 23706241557120.0, + "grad_norm": 2.2618494194320937, + "language_loss": 0.77563429, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.80037498, + "num_input_tokens_seen": 203531630, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.19067383, + "step": 9449, + "time_per_iteration": 2.916572093963623 + }, + { + "auxiliary_loss_clip": 0.01423397, + "auxiliary_loss_mlp": 0.01035977, + "balance_loss_clip": 1.25406039, + "balance_loss_mlp": 1.0156157, + "epoch": 0.5681647377123102, + "flos": 28012238962560.0, + "grad_norm": 1.563858319919621, + "language_loss": 0.75545263, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.7800464, + "num_input_tokens_seen": 203551885, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.20361328, + "step": 9450, + "time_per_iteration": 4.3708531856536865 + }, + { + "auxiliary_loss_clip": 0.01421643, + "auxiliary_loss_mlp": 0.01036051, + "balance_loss_clip": 1.25242448, + "balance_loss_mlp": 1.01535606, + "epoch": 0.5682248609649782, + "flos": 22758106631040.0, + "grad_norm": 2.131644361973487, + "language_loss": 0.68131918, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.70589614, + "num_input_tokens_seen": 203572250, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.20690918, + "step": 9451, + "time_per_iteration": 2.8501923084259033 + }, + { + "auxiliary_loss_clip": 0.01440515, + "auxiliary_loss_mlp": 0.01034814, + "balance_loss_clip": 1.26388502, + "balance_loss_mlp": 1.01334488, + "epoch": 0.5682849842176462, + "flos": 21298640509440.0, + "grad_norm": 1.9320860383102612, + "language_loss": 0.73342329, + "learning_rate": 1.656454488573026e-06, + "loss": 0.75817662, + "num_input_tokens_seen": 203590605, + "router_z_loss_clip": 1.76660156, + "router_z_loss_mlp": 0.21472168, + "step": 9452, + "time_per_iteration": 2.8510501384735107 + }, + { + "auxiliary_loss_clip": 0.01408227, + "auxiliary_loss_mlp": 0.01032806, + "balance_loss_clip": 1.24287081, + "balance_loss_mlp": 1.01276731, + "epoch": 0.5683451074703142, + "flos": 21151440616320.0, + "grad_norm": 1.5957458862831349, + "language_loss": 0.71441233, + "learning_rate": 1.656070822132428e-06, + "loss": 0.7388227, + "num_input_tokens_seen": 203610080, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.20031738, + "step": 9453, + "time_per_iteration": 2.8485419750213623 + }, + { + "auxiliary_loss_clip": 0.01418598, + "auxiliary_loss_mlp": 0.01035702, + "balance_loss_clip": 1.25103998, + "balance_loss_mlp": 1.01559126, + "epoch": 0.5684052307229821, + "flos": 22354223132160.0, + "grad_norm": 1.8277144652504675, + "language_loss": 0.70571893, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.73026192, + "num_input_tokens_seen": 203630060, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.20117188, + "step": 9454, + "time_per_iteration": 2.8821299076080322 + }, + { + "auxiliary_loss_clip": 0.01410977, + "auxiliary_loss_mlp": 0.01032931, + "balance_loss_clip": 1.24604082, + "balance_loss_mlp": 1.01446557, + "epoch": 0.5684653539756501, + "flos": 21808614360960.0, + "grad_norm": 2.4453218506474834, + "language_loss": 0.61521524, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.63965428, + "num_input_tokens_seen": 203649065, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.18469238, + "step": 9455, + "time_per_iteration": 2.849801778793335 + }, + { + "auxiliary_loss_clip": 0.01438011, + "auxiliary_loss_mlp": 0.01033351, + "balance_loss_clip": 1.26573896, + "balance_loss_mlp": 1.01300144, + "epoch": 0.568525477228318, + "flos": 23009134636800.0, + "grad_norm": 2.071200064679158, + "language_loss": 0.7463913, + "learning_rate": 1.6549199011198e-06, + "loss": 0.77110493, + "num_input_tokens_seen": 203667545, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.20336914, + "step": 9456, + "time_per_iteration": 2.8493847846984863 + }, + { + "auxiliary_loss_clip": 0.01426719, + "auxiliary_loss_mlp": 0.01034412, + "balance_loss_clip": 1.2593894, + "balance_loss_mlp": 1.01473069, + "epoch": 0.568585600480986, + "flos": 21401111278080.0, + "grad_norm": 3.0590334984810243, + "language_loss": 0.77667254, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.80128384, + "num_input_tokens_seen": 203686025, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.19689941, + "step": 9457, + "time_per_iteration": 2.8741726875305176 + }, + { + "auxiliary_loss_clip": 0.01427272, + "auxiliary_loss_mlp": 0.01038695, + "balance_loss_clip": 1.25744987, + "balance_loss_mlp": 1.0172255, + "epoch": 0.568645723733654, + "flos": 30019304626560.0, + "grad_norm": 1.9225646972249055, + "language_loss": 0.6721437, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.69680333, + "num_input_tokens_seen": 203705540, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.21459961, + "step": 9458, + "time_per_iteration": 2.912330150604248 + }, + { + "auxiliary_loss_clip": 0.01436731, + "auxiliary_loss_mlp": 0.01032991, + "balance_loss_clip": 1.2642827, + "balance_loss_mlp": 1.01258218, + "epoch": 0.568705846986322, + "flos": 20422408849920.0, + "grad_norm": 2.0025162586942704, + "language_loss": 0.6886763, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.71337354, + "num_input_tokens_seen": 203723670, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.20397949, + "step": 9459, + "time_per_iteration": 2.8779401779174805 + }, + { + "auxiliary_loss_clip": 0.0143537, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.26312733, + "balance_loss_mlp": 1.02014112, + "epoch": 0.5687659702389899, + "flos": 17465579447040.0, + "grad_norm": 2.6105928192726537, + "language_loss": 0.77320999, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.79796302, + "num_input_tokens_seen": 203739705, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.19787598, + "step": 9460, + "time_per_iteration": 2.920724630355835 + }, + { + "auxiliary_loss_clip": 0.01426022, + "auxiliary_loss_mlp": 0.01038777, + "balance_loss_clip": 1.25464058, + "balance_loss_mlp": 1.01839221, + "epoch": 0.5688260934916579, + "flos": 25415785543680.0, + "grad_norm": 1.6081728217505364, + "language_loss": 0.72451061, + "learning_rate": 1.65300196133547e-06, + "loss": 0.74915862, + "num_input_tokens_seen": 203759000, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.20361328, + "step": 9461, + "time_per_iteration": 2.906696081161499 + }, + { + "auxiliary_loss_clip": 0.01416942, + "auxiliary_loss_mlp": 0.01034077, + "balance_loss_clip": 1.24965274, + "balance_loss_mlp": 1.01372766, + "epoch": 0.5688862167443258, + "flos": 21615825692160.0, + "grad_norm": 1.915012585529728, + "language_loss": 0.7332058, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.757716, + "num_input_tokens_seen": 203774295, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.20349121, + "step": 9462, + "time_per_iteration": 4.274590730667114 + }, + { + "auxiliary_loss_clip": 0.01413806, + "auxiliary_loss_mlp": 0.01036731, + "balance_loss_clip": 1.25033355, + "balance_loss_mlp": 1.01812232, + "epoch": 0.5689463399969938, + "flos": 22429157800320.0, + "grad_norm": 2.0733788858133657, + "language_loss": 0.73934746, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.76385283, + "num_input_tokens_seen": 203792710, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.18615723, + "step": 9463, + "time_per_iteration": 2.864074468612671 + }, + { + "auxiliary_loss_clip": 0.01421525, + "auxiliary_loss_mlp": 0.01032877, + "balance_loss_clip": 1.25273442, + "balance_loss_mlp": 1.01310003, + "epoch": 0.5690064632496618, + "flos": 18306085697280.0, + "grad_norm": 1.8412639573453236, + "language_loss": 0.75299937, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.77754343, + "num_input_tokens_seen": 203811645, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.19799805, + "step": 9464, + "time_per_iteration": 4.274746417999268 + }, + { + "auxiliary_loss_clip": 0.01417068, + "auxiliary_loss_mlp": 0.01036333, + "balance_loss_clip": 1.2481761, + "balance_loss_mlp": 1.01550663, + "epoch": 0.5690665865023298, + "flos": 21589058753280.0, + "grad_norm": 1.6836548974331096, + "language_loss": 0.8511641, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.87569809, + "num_input_tokens_seen": 203830040, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.20825195, + "step": 9465, + "time_per_iteration": 4.217571973800659 + }, + { + "auxiliary_loss_clip": 0.01409459, + "auxiliary_loss_mlp": 0.0103272, + "balance_loss_clip": 1.2449441, + "balance_loss_mlp": 1.01375377, + "epoch": 0.5691267097549978, + "flos": 24431020312320.0, + "grad_norm": 1.7250420363162762, + "language_loss": 0.72971869, + "learning_rate": 1.651084350506125e-06, + "loss": 0.75414044, + "num_input_tokens_seen": 203851245, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.18945312, + "step": 9466, + "time_per_iteration": 2.8731305599212646 + }, + { + "auxiliary_loss_clip": 0.01220271, + "auxiliary_loss_mlp": 0.01019828, + "balance_loss_clip": 1.12631011, + "balance_loss_mlp": 1.00237584, + "epoch": 0.5691868330076657, + "flos": 61692058391040.0, + "grad_norm": 0.7152723500551653, + "language_loss": 0.55504262, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.5774436, + "num_input_tokens_seen": 203916400, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.17480469, + "step": 9467, + "time_per_iteration": 3.3957624435424805 + }, + { + "auxiliary_loss_clip": 0.01420951, + "auxiliary_loss_mlp": 0.01032558, + "balance_loss_clip": 1.25322247, + "balance_loss_mlp": 1.01183915, + "epoch": 0.5692469562603337, + "flos": 21335270814720.0, + "grad_norm": 5.882338844244037, + "language_loss": 0.6402272, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.66476226, + "num_input_tokens_seen": 203935870, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.20727539, + "step": 9468, + "time_per_iteration": 2.91056489944458 + }, + { + "auxiliary_loss_clip": 0.0141677, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.25133514, + "balance_loss_mlp": 1.01509714, + "epoch": 0.5693070795130016, + "flos": 23378378601600.0, + "grad_norm": 1.8572052199355693, + "language_loss": 0.79472923, + "learning_rate": 1.64993394266317e-06, + "loss": 0.81925076, + "num_input_tokens_seen": 203954950, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.20275879, + "step": 9469, + "time_per_iteration": 2.900360107421875 + }, + { + "auxiliary_loss_clip": 0.01427132, + "auxiliary_loss_mlp": 0.01041771, + "balance_loss_clip": 1.25538158, + "balance_loss_mlp": 1.02090907, + "epoch": 0.5693672027656697, + "flos": 18706394856960.0, + "grad_norm": 4.075816497729076, + "language_loss": 0.70578182, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.73047084, + "num_input_tokens_seen": 203972715, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.20861816, + "step": 9470, + "time_per_iteration": 2.795785427093506 + }, + { + "auxiliary_loss_clip": 0.01416448, + "auxiliary_loss_mlp": 0.01036903, + "balance_loss_clip": 1.24881709, + "balance_loss_mlp": 1.01682854, + "epoch": 0.5694273260183376, + "flos": 20458948665600.0, + "grad_norm": 1.6512052386773992, + "language_loss": 0.75051957, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.77505308, + "num_input_tokens_seen": 203990775, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.20068359, + "step": 9471, + "time_per_iteration": 2.8562755584716797 + }, + { + "auxiliary_loss_clip": 0.01411371, + "auxiliary_loss_mlp": 0.01031537, + "balance_loss_clip": 1.24724829, + "balance_loss_mlp": 1.01234412, + "epoch": 0.5694874492710056, + "flos": 17612191157760.0, + "grad_norm": 1.6915788343008675, + "language_loss": 0.59218585, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.61661494, + "num_input_tokens_seen": 204008845, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19189453, + "step": 9472, + "time_per_iteration": 2.8294436931610107 + }, + { + "auxiliary_loss_clip": 0.01403346, + "auxiliary_loss_mlp": 0.01032439, + "balance_loss_clip": 1.24108601, + "balance_loss_mlp": 1.01279342, + "epoch": 0.5695475725236735, + "flos": 13378730446080.0, + "grad_norm": 2.1116510186060475, + "language_loss": 0.74635136, + "learning_rate": 1.648400251450638e-06, + "loss": 0.77070928, + "num_input_tokens_seen": 204023755, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19641113, + "step": 9473, + "time_per_iteration": 2.8105525970458984 + }, + { + "auxiliary_loss_clip": 0.01220162, + "auxiliary_loss_mlp": 0.010207, + "balance_loss_clip": 1.12506247, + "balance_loss_mlp": 1.00591838, + "epoch": 0.5696076957763415, + "flos": 68206488658560.0, + "grad_norm": 0.6531212367226398, + "language_loss": 0.57648313, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.59889174, + "num_input_tokens_seen": 204091255, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.14746094, + "step": 9474, + "time_per_iteration": 3.410151720046997 + }, + { + "auxiliary_loss_clip": 0.01411816, + "auxiliary_loss_mlp": 0.01036728, + "balance_loss_clip": 1.24826908, + "balance_loss_mlp": 1.01615286, + "epoch": 0.5696678190290094, + "flos": 33851234568960.0, + "grad_norm": 6.065464204212192, + "language_loss": 0.54140127, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.56588674, + "num_input_tokens_seen": 204113285, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.20581055, + "step": 9475, + "time_per_iteration": 2.9617111682891846 + }, + { + "auxiliary_loss_clip": 0.0142484, + "auxiliary_loss_mlp": 0.0103414, + "balance_loss_clip": 1.25766659, + "balance_loss_mlp": 1.01382637, + "epoch": 0.5697279422816774, + "flos": 26367042360960.0, + "grad_norm": 2.0108881186602585, + "language_loss": 0.80096334, + "learning_rate": 1.647250122983675e-06, + "loss": 0.82555318, + "num_input_tokens_seen": 204133045, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.20300293, + "step": 9476, + "time_per_iteration": 2.943243980407715 + }, + { + "auxiliary_loss_clip": 0.01439932, + "auxiliary_loss_mlp": 0.01038789, + "balance_loss_clip": 1.26993012, + "balance_loss_mlp": 1.01804614, + "epoch": 0.5697880655343454, + "flos": 22940986688640.0, + "grad_norm": 2.079497958060551, + "language_loss": 0.6745497, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.69933695, + "num_input_tokens_seen": 204152590, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.20751953, + "step": 9477, + "time_per_iteration": 2.8671066761016846 + }, + { + "auxiliary_loss_clip": 0.01428981, + "auxiliary_loss_mlp": 0.01036232, + "balance_loss_clip": 1.26000118, + "balance_loss_mlp": 1.01484609, + "epoch": 0.5698481887870134, + "flos": 26772735651840.0, + "grad_norm": 2.214859598873471, + "language_loss": 0.71487045, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.73952258, + "num_input_tokens_seen": 204171815, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.21374512, + "step": 9478, + "time_per_iteration": 2.8818464279174805 + }, + { + "auxiliary_loss_clip": 0.01401503, + "auxiliary_loss_mlp": 0.01035146, + "balance_loss_clip": 1.24136662, + "balance_loss_mlp": 1.0143559, + "epoch": 0.5699083120396814, + "flos": 15750741818880.0, + "grad_norm": 1.5787095941094638, + "language_loss": 0.69525969, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.71962619, + "num_input_tokens_seen": 204188535, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.2076416, + "step": 9479, + "time_per_iteration": 2.8226168155670166 + }, + { + "auxiliary_loss_clip": 0.01401501, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.24023104, + "balance_loss_mlp": 1.01740623, + "epoch": 0.5699684352923493, + "flos": 19546855862400.0, + "grad_norm": 1.4311482583878463, + "language_loss": 0.72036511, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.74474943, + "num_input_tokens_seen": 204208365, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.1953125, + "step": 9480, + "time_per_iteration": 2.8688602447509766 + }, + { + "auxiliary_loss_clip": 0.0141878, + "auxiliary_loss_mlp": 0.01035853, + "balance_loss_clip": 1.25404835, + "balance_loss_mlp": 1.01509929, + "epoch": 0.5700285585450173, + "flos": 16262027769600.0, + "grad_norm": 2.1217300978077724, + "language_loss": 0.72948015, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.75402641, + "num_input_tokens_seen": 204226560, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.2076416, + "step": 9481, + "time_per_iteration": 2.8214454650878906 + }, + { + "auxiliary_loss_clip": 0.01417065, + "auxiliary_loss_mlp": 0.01034565, + "balance_loss_clip": 1.25188279, + "balance_loss_mlp": 1.01507413, + "epoch": 0.5700886817976852, + "flos": 19874492593920.0, + "grad_norm": 1.795180129698411, + "language_loss": 0.79204834, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.81656468, + "num_input_tokens_seen": 204245410, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19494629, + "step": 9482, + "time_per_iteration": 2.8182265758514404 + }, + { + "auxiliary_loss_clip": 0.01406186, + "auxiliary_loss_mlp": 0.01032874, + "balance_loss_clip": 1.241925, + "balance_loss_mlp": 1.01335919, + "epoch": 0.5701488050503533, + "flos": 23852174595840.0, + "grad_norm": 1.51885201109169, + "language_loss": 0.78351671, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.80790734, + "num_input_tokens_seen": 204264840, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19519043, + "step": 9483, + "time_per_iteration": 2.8460726737976074 + }, + { + "auxiliary_loss_clip": 0.01417536, + "auxiliary_loss_mlp": 0.01036817, + "balance_loss_clip": 1.25093007, + "balance_loss_mlp": 1.01683784, + "epoch": 0.5702089283030212, + "flos": 23670199434240.0, + "grad_norm": 1.578253470239627, + "language_loss": 0.81667703, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.8412205, + "num_input_tokens_seen": 204284335, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.19970703, + "step": 9484, + "time_per_iteration": 2.851006031036377 + }, + { + "auxiliary_loss_clip": 0.01413152, + "auxiliary_loss_mlp": 0.01033461, + "balance_loss_clip": 1.24600029, + "balance_loss_mlp": 1.01239681, + "epoch": 0.5702690515556892, + "flos": 27902574270720.0, + "grad_norm": 2.9261440213714613, + "language_loss": 0.60751534, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.63198149, + "num_input_tokens_seen": 204302590, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.21069336, + "step": 9485, + "time_per_iteration": 4.371880054473877 + }, + { + "auxiliary_loss_clip": 0.0141635, + "auxiliary_loss_mlp": 0.01035977, + "balance_loss_clip": 1.24900556, + "balance_loss_mlp": 1.01622391, + "epoch": 0.5703291748083571, + "flos": 24034421226240.0, + "grad_norm": 2.556852608642169, + "language_loss": 0.66351944, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.6880427, + "num_input_tokens_seen": 204323055, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.19750977, + "step": 9486, + "time_per_iteration": 2.863821268081665 + }, + { + "auxiliary_loss_clip": 0.01215331, + "auxiliary_loss_mlp": 0.01031371, + "balance_loss_clip": 1.12159324, + "balance_loss_mlp": 1.01077127, + "epoch": 0.5703892980610251, + "flos": 57056116769280.0, + "grad_norm": 0.6701509433190653, + "language_loss": 0.48041552, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.50288254, + "num_input_tokens_seen": 204386160, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.20605469, + "step": 9487, + "time_per_iteration": 3.4431607723236084 + }, + { + "auxiliary_loss_clip": 0.01419753, + "auxiliary_loss_mlp": 0.0103755, + "balance_loss_clip": 1.2529192, + "balance_loss_mlp": 1.01707006, + "epoch": 0.570449421313693, + "flos": 24361243551360.0, + "grad_norm": 1.4947224234991785, + "language_loss": 0.87648553, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.90105855, + "num_input_tokens_seen": 204406315, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.20483398, + "step": 9488, + "time_per_iteration": 2.931861162185669 + }, + { + "auxiliary_loss_clip": 0.01424109, + "auxiliary_loss_mlp": 0.0103379, + "balance_loss_clip": 1.25440454, + "balance_loss_mlp": 1.01446581, + "epoch": 0.570509544566361, + "flos": 24839745004800.0, + "grad_norm": 1.4250455256532184, + "language_loss": 0.79058492, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.81516391, + "num_input_tokens_seen": 204427645, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.1932373, + "step": 9489, + "time_per_iteration": 2.9101195335388184 + }, + { + "auxiliary_loss_clip": 0.01412705, + "auxiliary_loss_mlp": 0.01039527, + "balance_loss_clip": 1.24739802, + "balance_loss_mlp": 1.02034616, + "epoch": 0.570569667819029, + "flos": 21407174081280.0, + "grad_norm": 1.8420314337451194, + "language_loss": 0.70890808, + "learning_rate": 1.641884454927604e-06, + "loss": 0.73343039, + "num_input_tokens_seen": 204445910, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19165039, + "step": 9490, + "time_per_iteration": 2.856544256210327 + }, + { + "auxiliary_loss_clip": 0.01423365, + "auxiliary_loss_mlp": 0.01038786, + "balance_loss_clip": 1.25746441, + "balance_loss_mlp": 1.0190804, + "epoch": 0.570629791071697, + "flos": 23225839822080.0, + "grad_norm": 2.12319997976811, + "language_loss": 0.76598084, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.79060239, + "num_input_tokens_seen": 204464680, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.19714355, + "step": 9491, + "time_per_iteration": 2.8958823680877686 + }, + { + "auxiliary_loss_clip": 0.0122125, + "auxiliary_loss_mlp": 0.01039299, + "balance_loss_clip": 1.12666225, + "balance_loss_mlp": 1.02079809, + "epoch": 0.570689914324365, + "flos": 65314097130240.0, + "grad_norm": 0.801386007240681, + "language_loss": 0.5746851, + "learning_rate": 1.641118147266011e-06, + "loss": 0.59729058, + "num_input_tokens_seen": 204525580, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.18457031, + "step": 9492, + "time_per_iteration": 3.340087890625 + }, + { + "auxiliary_loss_clip": 0.01414901, + "auxiliary_loss_mlp": 0.01041368, + "balance_loss_clip": 1.25011301, + "balance_loss_mlp": 1.02149594, + "epoch": 0.5707500375770329, + "flos": 21151712085120.0, + "grad_norm": 4.093888198481086, + "language_loss": 0.72963572, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.75419837, + "num_input_tokens_seen": 204541320, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.19885254, + "step": 9493, + "time_per_iteration": 2.8598716259002686 + }, + { + "auxiliary_loss_clip": 0.01439427, + "auxiliary_loss_mlp": 0.01037577, + "balance_loss_clip": 1.26835144, + "balance_loss_mlp": 1.01750207, + "epoch": 0.5708101608297009, + "flos": 20822265561600.0, + "grad_norm": 1.8106270963586066, + "language_loss": 0.7845037, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.80927372, + "num_input_tokens_seen": 204560275, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.20068359, + "step": 9494, + "time_per_iteration": 2.8673572540283203 + }, + { + "auxiliary_loss_clip": 0.01444272, + "auxiliary_loss_mlp": 0.01037398, + "balance_loss_clip": 1.27114928, + "balance_loss_mlp": 1.01656008, + "epoch": 0.5708702840823688, + "flos": 25823424360960.0, + "grad_norm": 2.2901655610866207, + "language_loss": 0.8079139, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.83273059, + "num_input_tokens_seen": 204579430, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.20837402, + "step": 9495, + "time_per_iteration": 2.903404474258423 + }, + { + "auxiliary_loss_clip": 0.01453598, + "auxiliary_loss_mlp": 0.01043953, + "balance_loss_clip": 1.27886939, + "balance_loss_mlp": 1.02263844, + "epoch": 0.5709304073350369, + "flos": 23660426557440.0, + "grad_norm": 5.645404682163034, + "language_loss": 0.67103297, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.6960085, + "num_input_tokens_seen": 204597710, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.21313477, + "step": 9496, + "time_per_iteration": 2.8979718685150146 + }, + { + "auxiliary_loss_clip": 0.01432376, + "auxiliary_loss_mlp": 0.01036128, + "balance_loss_clip": 1.26089859, + "balance_loss_mlp": 1.0161128, + "epoch": 0.5709905305877048, + "flos": 16116230465280.0, + "grad_norm": 7.305600688108984, + "language_loss": 0.70400262, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.72868764, + "num_input_tokens_seen": 204616140, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.20007324, + "step": 9497, + "time_per_iteration": 4.27473783493042 + }, + { + "auxiliary_loss_clip": 0.01430534, + "auxiliary_loss_mlp": 0.0103458, + "balance_loss_clip": 1.26066828, + "balance_loss_mlp": 1.013659, + "epoch": 0.5710506538403728, + "flos": 24760738304640.0, + "grad_norm": 2.9983893503404135, + "language_loss": 0.82050073, + "learning_rate": 1.638819551358182e-06, + "loss": 0.8451519, + "num_input_tokens_seen": 204636470, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.20922852, + "step": 9498, + "time_per_iteration": 2.8640806674957275 + }, + { + "auxiliary_loss_clip": 0.0143141, + "auxiliary_loss_mlp": 0.01040532, + "balance_loss_clip": 1.26072454, + "balance_loss_mlp": 1.0194087, + "epoch": 0.5711107770930407, + "flos": 21992580293760.0, + "grad_norm": 1.974863980733816, + "language_loss": 0.67181462, + "learning_rate": 1.638436499891469e-06, + "loss": 0.69653404, + "num_input_tokens_seen": 204656640, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.21118164, + "step": 9499, + "time_per_iteration": 4.237907648086548 + }, + { + "auxiliary_loss_clip": 0.01429597, + "auxiliary_loss_mlp": 0.01035068, + "balance_loss_clip": 1.2609061, + "balance_loss_mlp": 1.01511264, + "epoch": 0.5711709003457087, + "flos": 19583667146880.0, + "grad_norm": 1.5018218256361155, + "language_loss": 0.72986007, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.75450671, + "num_input_tokens_seen": 204675475, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.19958496, + "step": 9500, + "time_per_iteration": 4.216063022613525 + }, + { + "auxiliary_loss_clip": 0.01437239, + "auxiliary_loss_mlp": 0.01031338, + "balance_loss_clip": 1.26543617, + "balance_loss_mlp": 1.01084566, + "epoch": 0.5712310235983766, + "flos": 24253162427520.0, + "grad_norm": 2.082957913938169, + "language_loss": 0.77475381, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.79943955, + "num_input_tokens_seen": 204695385, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.20483398, + "step": 9501, + "time_per_iteration": 3.0606400966644287 + }, + { + "auxiliary_loss_clip": 0.01434046, + "auxiliary_loss_mlp": 0.01035333, + "balance_loss_clip": 1.26444685, + "balance_loss_mlp": 1.01549685, + "epoch": 0.5712911468510447, + "flos": 21005960025600.0, + "grad_norm": 1.688759608228194, + "language_loss": 0.7561754, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.78086919, + "num_input_tokens_seen": 204714730, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.19824219, + "step": 9502, + "time_per_iteration": 2.8382177352905273 + }, + { + "auxiliary_loss_clip": 0.01425399, + "auxiliary_loss_mlp": 0.01036363, + "balance_loss_clip": 1.25880051, + "balance_loss_mlp": 1.01644349, + "epoch": 0.5713512701037126, + "flos": 18926402912640.0, + "grad_norm": 2.514422205451015, + "language_loss": 0.83032072, + "learning_rate": 1.636904431275105e-06, + "loss": 0.85493833, + "num_input_tokens_seen": 204735025, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.19934082, + "step": 9503, + "time_per_iteration": 2.8908681869506836 + }, + { + "auxiliary_loss_clip": 0.01426842, + "auxiliary_loss_mlp": 0.01035791, + "balance_loss_clip": 1.2604661, + "balance_loss_mlp": 1.0155611, + "epoch": 0.5714113933563806, + "flos": 17420488364160.0, + "grad_norm": 2.0589880389036623, + "language_loss": 0.8651346, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.88976091, + "num_input_tokens_seen": 204751365, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.20227051, + "step": 9504, + "time_per_iteration": 2.79015851020813 + }, + { + "auxiliary_loss_clip": 0.01418991, + "auxiliary_loss_mlp": 0.0103259, + "balance_loss_clip": 1.25415123, + "balance_loss_mlp": 1.01292038, + "epoch": 0.5714715166090486, + "flos": 20202943731840.0, + "grad_norm": 2.0082949324275376, + "language_loss": 0.76005912, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.78457493, + "num_input_tokens_seen": 204768980, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19665527, + "step": 9505, + "time_per_iteration": 2.811351776123047 + }, + { + "auxiliary_loss_clip": 0.01420096, + "auxiliary_loss_mlp": 0.01034878, + "balance_loss_clip": 1.2541182, + "balance_loss_mlp": 1.01526809, + "epoch": 0.5715316398617165, + "flos": 18560869021440.0, + "grad_norm": 1.4514664605425311, + "language_loss": 0.8220377, + "learning_rate": 1.635755524332509e-06, + "loss": 0.84658742, + "num_input_tokens_seen": 204788110, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19604492, + "step": 9506, + "time_per_iteration": 2.851440906524658 + }, + { + "auxiliary_loss_clip": 0.01421123, + "auxiliary_loss_mlp": 0.01032427, + "balance_loss_clip": 1.25488639, + "balance_loss_mlp": 1.0124948, + "epoch": 0.5715917631143845, + "flos": 18487246452480.0, + "grad_norm": 1.9662333328430843, + "language_loss": 0.78312016, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.80765563, + "num_input_tokens_seen": 204807240, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.19934082, + "step": 9507, + "time_per_iteration": 2.853846311569214 + }, + { + "auxiliary_loss_clip": 0.0142781, + "auxiliary_loss_mlp": 0.01034064, + "balance_loss_clip": 1.25789738, + "balance_loss_mlp": 1.01308298, + "epoch": 0.5716518863670524, + "flos": 24029489543040.0, + "grad_norm": 1.550007792930395, + "language_loss": 0.69687253, + "learning_rate": 1.63498965540751e-06, + "loss": 0.72149128, + "num_input_tokens_seen": 204826415, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.2097168, + "step": 9508, + "time_per_iteration": 2.8868308067321777 + }, + { + "auxiliary_loss_clip": 0.01432215, + "auxiliary_loss_mlp": 0.01036022, + "balance_loss_clip": 1.26163042, + "balance_loss_mlp": 1.01569724, + "epoch": 0.5717120096197205, + "flos": 17827855712640.0, + "grad_norm": 2.0947844690105333, + "language_loss": 0.8101151, + "learning_rate": 1.634606741699593e-06, + "loss": 0.83479744, + "num_input_tokens_seen": 204844305, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.20324707, + "step": 9509, + "time_per_iteration": 2.829927682876587 + }, + { + "auxiliary_loss_clip": 0.01415826, + "auxiliary_loss_mlp": 0.01031332, + "balance_loss_clip": 1.25167871, + "balance_loss_mlp": 1.01122093, + "epoch": 0.5717721328723884, + "flos": 21874590558720.0, + "grad_norm": 1.9158664632967035, + "language_loss": 0.73731232, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.76178396, + "num_input_tokens_seen": 204861765, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.2010498, + "step": 9510, + "time_per_iteration": 2.8271336555480957 + }, + { + "auxiliary_loss_clip": 0.01427037, + "auxiliary_loss_mlp": 0.0103492, + "balance_loss_clip": 1.2600832, + "balance_loss_mlp": 1.01485658, + "epoch": 0.5718322561250564, + "flos": 28448952203520.0, + "grad_norm": 1.53935491248813, + "language_loss": 0.70458221, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.72920179, + "num_input_tokens_seen": 204882505, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.20056152, + "step": 9511, + "time_per_iteration": 2.879603147506714 + }, + { + "auxiliary_loss_clip": 0.01425286, + "auxiliary_loss_mlp": 0.01038873, + "balance_loss_clip": 1.25647831, + "balance_loss_mlp": 1.0179162, + "epoch": 0.5718923793777243, + "flos": 13559393508480.0, + "grad_norm": 2.1578421711967697, + "language_loss": 0.62093818, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.64557981, + "num_input_tokens_seen": 204899830, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.20959473, + "step": 9512, + "time_per_iteration": 2.830775499343872 + }, + { + "auxiliary_loss_clip": 0.01415423, + "auxiliary_loss_mlp": 0.01034806, + "balance_loss_clip": 1.24961925, + "balance_loss_mlp": 1.01605439, + "epoch": 0.5719525026303923, + "flos": 17831068093440.0, + "grad_norm": 2.557844938257377, + "language_loss": 0.76497895, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.78948122, + "num_input_tokens_seen": 204918100, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.1875, + "step": 9513, + "time_per_iteration": 2.8179476261138916 + }, + { + "auxiliary_loss_clip": 0.01221817, + "auxiliary_loss_mlp": 0.0103404, + "balance_loss_clip": 1.12570024, + "balance_loss_mlp": 1.01458478, + "epoch": 0.5720126258830602, + "flos": 61323927563520.0, + "grad_norm": 0.8874807727193097, + "language_loss": 0.66853786, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.69109643, + "num_input_tokens_seen": 204972925, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.19433594, + "step": 9514, + "time_per_iteration": 3.3427693843841553 + }, + { + "auxiliary_loss_clip": 0.0144128, + "auxiliary_loss_mlp": 0.0103633, + "balance_loss_clip": 1.27004552, + "balance_loss_mlp": 1.01552749, + "epoch": 0.5720727491357283, + "flos": 23998650572160.0, + "grad_norm": 2.0054396893750224, + "language_loss": 0.82261151, + "learning_rate": 1.63230955093099e-06, + "loss": 0.84738755, + "num_input_tokens_seen": 204990910, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.20825195, + "step": 9515, + "time_per_iteration": 2.8941762447357178 + }, + { + "auxiliary_loss_clip": 0.01409042, + "auxiliary_loss_mlp": 0.01034352, + "balance_loss_clip": 1.24661458, + "balance_loss_mlp": 1.01444435, + "epoch": 0.5721328723883962, + "flos": 23416185271680.0, + "grad_norm": 1.5700018916518101, + "language_loss": 0.860587, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.88502097, + "num_input_tokens_seen": 205010500, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19909668, + "step": 9516, + "time_per_iteration": 2.8979694843292236 + }, + { + "auxiliary_loss_clip": 0.01429154, + "auxiliary_loss_mlp": 0.01036746, + "balance_loss_clip": 1.26209617, + "balance_loss_mlp": 1.01584864, + "epoch": 0.5721929956410642, + "flos": 18813616329600.0, + "grad_norm": 1.7572092527946557, + "language_loss": 0.88324738, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.90790641, + "num_input_tokens_seen": 205028560, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.2088623, + "step": 9517, + "time_per_iteration": 2.843526840209961 + }, + { + "auxiliary_loss_clip": 0.01420149, + "auxiliary_loss_mlp": 0.01035334, + "balance_loss_clip": 1.25444901, + "balance_loss_mlp": 1.01514006, + "epoch": 0.5722531188937322, + "flos": 27207548611200.0, + "grad_norm": 1.6674365453802922, + "language_loss": 0.86046076, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.88501561, + "num_input_tokens_seen": 205048650, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.2019043, + "step": 9518, + "time_per_iteration": 2.873119831085205 + }, + { + "auxiliary_loss_clip": 0.01413574, + "auxiliary_loss_mlp": 0.01032579, + "balance_loss_clip": 1.25145113, + "balance_loss_mlp": 1.01271892, + "epoch": 0.5723132421464001, + "flos": 15204182906880.0, + "grad_norm": 1.9316760452950092, + "language_loss": 0.79981303, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.82427454, + "num_input_tokens_seen": 205066480, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19873047, + "step": 9519, + "time_per_iteration": 2.872514486312866 + }, + { + "auxiliary_loss_clip": 0.01420218, + "auxiliary_loss_mlp": 0.01036268, + "balance_loss_clip": 1.25503504, + "balance_loss_mlp": 1.01671731, + "epoch": 0.5723733653990681, + "flos": 27610798682880.0, + "grad_norm": 3.6522574749398293, + "language_loss": 0.83572119, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.86028606, + "num_input_tokens_seen": 205087475, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.1953125, + "step": 9520, + "time_per_iteration": 4.39677095413208 + }, + { + "auxiliary_loss_clip": 0.01440141, + "auxiliary_loss_mlp": 0.0103699, + "balance_loss_clip": 1.26951396, + "balance_loss_mlp": 1.01635456, + "epoch": 0.572433488651736, + "flos": 18231874945920.0, + "grad_norm": 2.1263273079446052, + "language_loss": 0.73889863, + "learning_rate": 1.630012862105243e-06, + "loss": 0.76367003, + "num_input_tokens_seen": 205106495, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.2064209, + "step": 9521, + "time_per_iteration": 2.8062448501586914 + }, + { + "auxiliary_loss_clip": 0.01425293, + "auxiliary_loss_mlp": 0.01036382, + "balance_loss_clip": 1.25852346, + "balance_loss_mlp": 1.01629555, + "epoch": 0.5724936119044041, + "flos": 31261703604480.0, + "grad_norm": 1.5178492224270694, + "language_loss": 0.78953665, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.81415337, + "num_input_tokens_seen": 205128285, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.2010498, + "step": 9522, + "time_per_iteration": 2.9147608280181885 + }, + { + "auxiliary_loss_clip": 0.01416347, + "auxiliary_loss_mlp": 0.01033863, + "balance_loss_clip": 1.25425398, + "balance_loss_mlp": 1.01474166, + "epoch": 0.572553735157072, + "flos": 19209943946880.0, + "grad_norm": 1.7068541284664465, + "language_loss": 0.724886, + "learning_rate": 1.629247411248102e-06, + "loss": 0.7493881, + "num_input_tokens_seen": 205146595, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19116211, + "step": 9523, + "time_per_iteration": 2.8089489936828613 + }, + { + "auxiliary_loss_clip": 0.01407206, + "auxiliary_loss_mlp": 0.01029521, + "balance_loss_clip": 1.24471247, + "balance_loss_mlp": 1.01085293, + "epoch": 0.57261385840974, + "flos": 21224746471680.0, + "grad_norm": 1.6391920514711746, + "language_loss": 0.70805359, + "learning_rate": 1.628864706900738e-06, + "loss": 0.73242092, + "num_input_tokens_seen": 205164295, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.18676758, + "step": 9524, + "time_per_iteration": 2.8604204654693604 + }, + { + "auxiliary_loss_clip": 0.01413978, + "auxiliary_loss_mlp": 0.01032897, + "balance_loss_clip": 1.24995553, + "balance_loss_mlp": 1.01296473, + "epoch": 0.5726739816624079, + "flos": 33997574810880.0, + "grad_norm": 1.294532231739259, + "language_loss": 0.66388881, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.68835753, + "num_input_tokens_seen": 205185380, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19921875, + "step": 9525, + "time_per_iteration": 2.9671006202697754 + }, + { + "auxiliary_loss_clip": 0.01413538, + "auxiliary_loss_mlp": 0.0103315, + "balance_loss_clip": 1.24994135, + "balance_loss_mlp": 1.01373053, + "epoch": 0.5727341049150759, + "flos": 24285630211200.0, + "grad_norm": 1.6065963345647825, + "language_loss": 0.73884594, + "learning_rate": 1.628099340440984e-06, + "loss": 0.76331282, + "num_input_tokens_seen": 205204895, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.1940918, + "step": 9526, + "time_per_iteration": 2.8433845043182373 + }, + { + "auxiliary_loss_clip": 0.01407944, + "auxiliary_loss_mlp": 0.01031418, + "balance_loss_clip": 1.2460041, + "balance_loss_mlp": 1.01184368, + "epoch": 0.5727942281677438, + "flos": 28411733715840.0, + "grad_norm": 1.5754941119492267, + "language_loss": 0.81187916, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.83627284, + "num_input_tokens_seen": 205223440, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19555664, + "step": 9527, + "time_per_iteration": 2.942884683609009 + }, + { + "auxiliary_loss_clip": 0.01409509, + "auxiliary_loss_mlp": 0.01039259, + "balance_loss_clip": 1.24800563, + "balance_loss_mlp": 1.01914871, + "epoch": 0.5728543514204119, + "flos": 19546222435200.0, + "grad_norm": 1.5267328106237117, + "language_loss": 0.72874755, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.75323522, + "num_input_tokens_seen": 205242800, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.2010498, + "step": 9528, + "time_per_iteration": 2.8228275775909424 + }, + { + "auxiliary_loss_clip": 0.01415962, + "auxiliary_loss_mlp": 0.01031639, + "balance_loss_clip": 1.25261092, + "balance_loss_mlp": 1.01212454, + "epoch": 0.5729144746730798, + "flos": 21516657793920.0, + "grad_norm": 1.9587400244919435, + "language_loss": 0.86786616, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.89234215, + "num_input_tokens_seen": 205259465, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.19506836, + "step": 9529, + "time_per_iteration": 2.8684070110321045 + }, + { + "auxiliary_loss_clip": 0.01221773, + "auxiliary_loss_mlp": 0.01030353, + "balance_loss_clip": 1.12775195, + "balance_loss_mlp": 1.01318705, + "epoch": 0.5729745979257478, + "flos": 58709575186560.0, + "grad_norm": 0.7602970773718527, + "language_loss": 0.56152934, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.5840506, + "num_input_tokens_seen": 205314100, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.171875, + "step": 9530, + "time_per_iteration": 3.305128574371338 + }, + { + "auxiliary_loss_clip": 0.01444046, + "auxiliary_loss_mlp": 0.01037947, + "balance_loss_clip": 1.2749933, + "balance_loss_mlp": 1.01930249, + "epoch": 0.5730347211784158, + "flos": 18561411959040.0, + "grad_norm": 1.704356429975184, + "language_loss": 0.6754207, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.70024061, + "num_input_tokens_seen": 205333420, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.18652344, + "step": 9531, + "time_per_iteration": 2.9308788776397705 + }, + { + "auxiliary_loss_clip": 0.01421951, + "auxiliary_loss_mlp": 0.01033306, + "balance_loss_clip": 1.25667703, + "balance_loss_mlp": 1.01395869, + "epoch": 0.5730948444310837, + "flos": 38044807349760.0, + "grad_norm": 1.9407483552989404, + "language_loss": 0.76065183, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.78520441, + "num_input_tokens_seen": 205350995, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19348145, + "step": 9532, + "time_per_iteration": 2.9904747009277344 + }, + { + "auxiliary_loss_clip": 0.01413161, + "auxiliary_loss_mlp": 0.01035122, + "balance_loss_clip": 1.24812078, + "balance_loss_mlp": 1.01602447, + "epoch": 0.5731549676837517, + "flos": 25237610945280.0, + "grad_norm": 1.3334348394814057, + "language_loss": 0.79526341, + "learning_rate": 1.625421002822686e-06, + "loss": 0.81974614, + "num_input_tokens_seen": 205372675, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19091797, + "step": 9533, + "time_per_iteration": 4.348853826522827 + }, + { + "auxiliary_loss_clip": 0.01413162, + "auxiliary_loss_mlp": 0.01031371, + "balance_loss_clip": 1.25103736, + "balance_loss_mlp": 1.01196384, + "epoch": 0.5732150909364196, + "flos": 23378785804800.0, + "grad_norm": 1.6736005906560716, + "language_loss": 0.85862541, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.88307071, + "num_input_tokens_seen": 205392590, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19396973, + "step": 9534, + "time_per_iteration": 4.376283645629883 + }, + { + "auxiliary_loss_clip": 0.01415651, + "auxiliary_loss_mlp": 0.01037219, + "balance_loss_clip": 1.25158405, + "balance_loss_mlp": 1.01669121, + "epoch": 0.5732752141890877, + "flos": 23090629800960.0, + "grad_norm": 1.8273271761365368, + "language_loss": 0.76057446, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.7851032, + "num_input_tokens_seen": 205414885, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.20507812, + "step": 9535, + "time_per_iteration": 4.2519567012786865 + }, + { + "auxiliary_loss_clip": 0.01434717, + "auxiliary_loss_mlp": 0.01035785, + "balance_loss_clip": 1.26540184, + "balance_loss_mlp": 1.01488733, + "epoch": 0.5733353374417556, + "flos": 24362827119360.0, + "grad_norm": 2.1228061448288007, + "language_loss": 0.71772772, + "learning_rate": 1.624273356614346e-06, + "loss": 0.74243271, + "num_input_tokens_seen": 205434440, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.20898438, + "step": 9536, + "time_per_iteration": 2.857120990753174 + }, + { + "auxiliary_loss_clip": 0.01417228, + "auxiliary_loss_mlp": 0.01037558, + "balance_loss_clip": 1.25436497, + "balance_loss_mlp": 1.01788807, + "epoch": 0.5733954606944236, + "flos": 27210308544000.0, + "grad_norm": 1.8834504599634612, + "language_loss": 0.70550221, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.73005009, + "num_input_tokens_seen": 205454225, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19677734, + "step": 9537, + "time_per_iteration": 2.890341281890869 + }, + { + "auxiliary_loss_clip": 0.01417808, + "auxiliary_loss_mlp": 0.01038322, + "balance_loss_clip": 1.25283015, + "balance_loss_mlp": 1.01891494, + "epoch": 0.5734555839470915, + "flos": 28776317466240.0, + "grad_norm": 1.8907529337853262, + "language_loss": 0.63458312, + "learning_rate": 1.623508330355902e-06, + "loss": 0.6591444, + "num_input_tokens_seen": 205474750, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.1940918, + "step": 9538, + "time_per_iteration": 2.9428534507751465 + }, + { + "auxiliary_loss_clip": 0.01419994, + "auxiliary_loss_mlp": 0.01035429, + "balance_loss_clip": 1.25418139, + "balance_loss_mlp": 1.01430488, + "epoch": 0.5735157071997595, + "flos": 22977255035520.0, + "grad_norm": 1.9298084288917203, + "language_loss": 0.83407229, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.85862654, + "num_input_tokens_seen": 205495495, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.21130371, + "step": 9539, + "time_per_iteration": 2.8669750690460205 + }, + { + "auxiliary_loss_clip": 0.01427652, + "auxiliary_loss_mlp": 0.01036095, + "balance_loss_clip": 1.25866675, + "balance_loss_mlp": 1.015818, + "epoch": 0.5735758304524274, + "flos": 18998758627200.0, + "grad_norm": 2.0609671103577902, + "language_loss": 0.73481762, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.75945508, + "num_input_tokens_seen": 205510070, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.20275879, + "step": 9540, + "time_per_iteration": 2.814688205718994 + }, + { + "auxiliary_loss_clip": 0.01412521, + "auxiliary_loss_mlp": 0.01036061, + "balance_loss_clip": 1.2477355, + "balance_loss_mlp": 1.01677299, + "epoch": 0.5736359537050955, + "flos": 28408430845440.0, + "grad_norm": 2.8203460593542573, + "language_loss": 0.81252217, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.83700794, + "num_input_tokens_seen": 205530190, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.19287109, + "step": 9541, + "time_per_iteration": 2.9061129093170166 + }, + { + "auxiliary_loss_clip": 0.01434804, + "auxiliary_loss_mlp": 0.01039017, + "balance_loss_clip": 1.26551282, + "balance_loss_mlp": 1.01932335, + "epoch": 0.5736960769577634, + "flos": 15634199917440.0, + "grad_norm": 4.225179032884918, + "language_loss": 0.65386033, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.67859852, + "num_input_tokens_seen": 205547380, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.19689941, + "step": 9542, + "time_per_iteration": 2.8120384216308594 + }, + { + "auxiliary_loss_clip": 0.01420831, + "auxiliary_loss_mlp": 0.01040502, + "balance_loss_clip": 1.25589061, + "balance_loss_mlp": 1.02180946, + "epoch": 0.5737562002104314, + "flos": 18012952765440.0, + "grad_norm": 2.2170858832765257, + "language_loss": 0.83612823, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.8607415, + "num_input_tokens_seen": 205566540, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.18688965, + "step": 9543, + "time_per_iteration": 2.8360350131988525 + }, + { + "auxiliary_loss_clip": 0.01423512, + "auxiliary_loss_mlp": 0.01042786, + "balance_loss_clip": 1.2552731, + "balance_loss_mlp": 1.02257991, + "epoch": 0.5738163234630994, + "flos": 20706357087360.0, + "grad_norm": 1.751282260302756, + "language_loss": 0.74906731, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.77373028, + "num_input_tokens_seen": 205584200, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.20202637, + "step": 9544, + "time_per_iteration": 2.9210257530212402 + }, + { + "auxiliary_loss_clip": 0.01425973, + "auxiliary_loss_mlp": 0.01037364, + "balance_loss_clip": 1.25774169, + "balance_loss_mlp": 1.01758695, + "epoch": 0.5738764467157673, + "flos": 23159818379520.0, + "grad_norm": 1.7378467268109927, + "language_loss": 0.76671624, + "learning_rate": 1.620831188925733e-06, + "loss": 0.79134965, + "num_input_tokens_seen": 205604675, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.19763184, + "step": 9545, + "time_per_iteration": 2.9093708992004395 + }, + { + "auxiliary_loss_clip": 0.01420014, + "auxiliary_loss_mlp": 0.01037153, + "balance_loss_clip": 1.25349736, + "balance_loss_mlp": 1.01674461, + "epoch": 0.5739365699684353, + "flos": 29503810909440.0, + "grad_norm": 4.103101787022907, + "language_loss": 0.57093716, + "learning_rate": 1.620448797546459e-06, + "loss": 0.59550881, + "num_input_tokens_seen": 205624680, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.20422363, + "step": 9546, + "time_per_iteration": 2.9276201725006104 + }, + { + "auxiliary_loss_clip": 0.01425277, + "auxiliary_loss_mlp": 0.01036239, + "balance_loss_clip": 1.25730324, + "balance_loss_mlp": 1.0169034, + "epoch": 0.5739966932211032, + "flos": 14035225518720.0, + "grad_norm": 2.4857200736378813, + "language_loss": 0.7805407, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.80515587, + "num_input_tokens_seen": 205641950, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.19311523, + "step": 9547, + "time_per_iteration": 2.8136696815490723 + }, + { + "auxiliary_loss_clip": 0.01425324, + "auxiliary_loss_mlp": 0.01039681, + "balance_loss_clip": 1.25819063, + "balance_loss_mlp": 1.01899791, + "epoch": 0.5740568164737713, + "flos": 19071250076160.0, + "grad_norm": 2.975573972296334, + "language_loss": 0.75599027, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.78064024, + "num_input_tokens_seen": 205660130, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.20678711, + "step": 9548, + "time_per_iteration": 2.826310396194458 + }, + { + "auxiliary_loss_clip": 0.01417129, + "auxiliary_loss_mlp": 0.01042432, + "balance_loss_clip": 1.25215089, + "balance_loss_mlp": 1.02210724, + "epoch": 0.5741169397264392, + "flos": 22138106129280.0, + "grad_norm": 2.9577904376407, + "language_loss": 0.70302999, + "learning_rate": 1.619301709822355e-06, + "loss": 0.72762549, + "num_input_tokens_seen": 205678895, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.20336914, + "step": 9549, + "time_per_iteration": 2.8783011436462402 + }, + { + "auxiliary_loss_clip": 0.01426039, + "auxiliary_loss_mlp": 0.01038723, + "balance_loss_clip": 1.26112318, + "balance_loss_mlp": 1.01919675, + "epoch": 0.5741770629791072, + "flos": 24947735639040.0, + "grad_norm": 1.5352606983842598, + "language_loss": 0.79949892, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.82414651, + "num_input_tokens_seen": 205698450, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.19506836, + "step": 9550, + "time_per_iteration": 2.8792431354522705 + }, + { + "auxiliary_loss_clip": 0.01429951, + "auxiliary_loss_mlp": 0.01037678, + "balance_loss_clip": 1.26307523, + "balance_loss_mlp": 1.01697123, + "epoch": 0.5742371862317751, + "flos": 18809272828800.0, + "grad_norm": 1.9316810859985574, + "language_loss": 0.68306887, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.70774519, + "num_input_tokens_seen": 205714870, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.20703125, + "step": 9551, + "time_per_iteration": 2.847304582595825 + }, + { + "auxiliary_loss_clip": 0.01440759, + "auxiliary_loss_mlp": 0.01043553, + "balance_loss_clip": 1.26909328, + "balance_loss_mlp": 1.02242959, + "epoch": 0.5742973094844431, + "flos": 24470953488000.0, + "grad_norm": 2.0949910831289467, + "language_loss": 0.72689509, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.75173819, + "num_input_tokens_seen": 205736045, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.21118164, + "step": 9552, + "time_per_iteration": 2.8693933486938477 + }, + { + "auxiliary_loss_clip": 0.01425042, + "auxiliary_loss_mlp": 0.0103205, + "balance_loss_clip": 1.26100612, + "balance_loss_mlp": 1.01276207, + "epoch": 0.574357432737111, + "flos": 21662726567040.0, + "grad_norm": 2.651156061226727, + "language_loss": 0.80676121, + "learning_rate": 1.617772461696843e-06, + "loss": 0.83133221, + "num_input_tokens_seen": 205754445, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19287109, + "step": 9553, + "time_per_iteration": 2.8567821979522705 + }, + { + "auxiliary_loss_clip": 0.01437375, + "auxiliary_loss_mlp": 0.01034417, + "balance_loss_clip": 1.26685858, + "balance_loss_mlp": 1.01481938, + "epoch": 0.5744175559897791, + "flos": 16553350909440.0, + "grad_norm": 2.258008785980967, + "language_loss": 0.84068257, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.86540049, + "num_input_tokens_seen": 205770595, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.19592285, + "step": 9554, + "time_per_iteration": 2.866321325302124 + }, + { + "auxiliary_loss_clip": 0.01437337, + "auxiliary_loss_mlp": 0.01041549, + "balance_loss_clip": 1.26746595, + "balance_loss_mlp": 1.02083039, + "epoch": 0.574477679242447, + "flos": 24218432403840.0, + "grad_norm": 1.4331481925124931, + "language_loss": 0.71815902, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.74294782, + "num_input_tokens_seen": 205791935, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.20715332, + "step": 9555, + "time_per_iteration": 4.306546688079834 + }, + { + "auxiliary_loss_clip": 0.01432725, + "auxiliary_loss_mlp": 0.01036167, + "balance_loss_clip": 1.26550865, + "balance_loss_mlp": 1.01587749, + "epoch": 0.574537802495115, + "flos": 14911230954240.0, + "grad_norm": 2.166929401136669, + "language_loss": 0.73883015, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.76351905, + "num_input_tokens_seen": 205807260, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.20300293, + "step": 9556, + "time_per_iteration": 2.8328654766082764 + }, + { + "auxiliary_loss_clip": 0.01412477, + "auxiliary_loss_mlp": 0.01035528, + "balance_loss_clip": 1.24806213, + "balance_loss_mlp": 1.01446342, + "epoch": 0.5745979257477829, + "flos": 24945382909440.0, + "grad_norm": 2.0390443354187187, + "language_loss": 0.74979794, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.77427799, + "num_input_tokens_seen": 205826885, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.21069336, + "step": 9557, + "time_per_iteration": 2.8800582885742188 + }, + { + "auxiliary_loss_clip": 0.0142505, + "auxiliary_loss_mlp": 0.01037145, + "balance_loss_clip": 1.25853133, + "balance_loss_mlp": 1.0172255, + "epoch": 0.5746580490004509, + "flos": 17243987823360.0, + "grad_norm": 2.02212664695032, + "language_loss": 0.68894887, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.71357083, + "num_input_tokens_seen": 205844630, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.19934082, + "step": 9558, + "time_per_iteration": 2.827829360961914 + }, + { + "auxiliary_loss_clip": 0.01439619, + "auxiliary_loss_mlp": 0.01039591, + "balance_loss_clip": 1.26677299, + "balance_loss_mlp": 1.01733422, + "epoch": 0.5747181722531189, + "flos": 13195578919680.0, + "grad_norm": 2.496132899961028, + "language_loss": 0.71952289, + "learning_rate": 1.615479024621659e-06, + "loss": 0.74431497, + "num_input_tokens_seen": 205860960, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.22241211, + "step": 9559, + "time_per_iteration": 2.8413195610046387 + }, + { + "auxiliary_loss_clip": 0.01420839, + "auxiliary_loss_mlp": 0.01034477, + "balance_loss_clip": 1.25608587, + "balance_loss_mlp": 1.01510525, + "epoch": 0.5747782955057869, + "flos": 22972459086720.0, + "grad_norm": 1.7016791454251394, + "language_loss": 0.79914033, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.82369339, + "num_input_tokens_seen": 205880675, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19372559, + "step": 9560, + "time_per_iteration": 2.8604722023010254 + }, + { + "auxiliary_loss_clip": 0.01430299, + "auxiliary_loss_mlp": 0.01037352, + "balance_loss_clip": 1.26101768, + "balance_loss_mlp": 1.01694322, + "epoch": 0.5748384187584549, + "flos": 23413425338880.0, + "grad_norm": 1.9916162822667793, + "language_loss": 0.65101969, + "learning_rate": 1.614714662090588e-06, + "loss": 0.67569625, + "num_input_tokens_seen": 205900050, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.20410156, + "step": 9561, + "time_per_iteration": 2.875861883163452 + }, + { + "auxiliary_loss_clip": 0.01452665, + "auxiliary_loss_mlp": 0.01039145, + "balance_loss_clip": 1.27849436, + "balance_loss_mlp": 1.01749706, + "epoch": 0.5748985420111228, + "flos": 17794528277760.0, + "grad_norm": 10.516035198460475, + "language_loss": 0.72344041, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.74835849, + "num_input_tokens_seen": 205918855, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.21643066, + "step": 9562, + "time_per_iteration": 2.8049819469451904 + }, + { + "auxiliary_loss_clip": 0.01432933, + "auxiliary_loss_mlp": 0.01036952, + "balance_loss_clip": 1.26508522, + "balance_loss_mlp": 1.01792645, + "epoch": 0.5749586652637908, + "flos": 19876076161920.0, + "grad_norm": 1.4945400050753446, + "language_loss": 0.84474027, + "learning_rate": 1.613950357999751e-06, + "loss": 0.86943913, + "num_input_tokens_seen": 205936970, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.19030762, + "step": 9563, + "time_per_iteration": 2.8334243297576904 + }, + { + "auxiliary_loss_clip": 0.01439105, + "auxiliary_loss_mlp": 0.0103856, + "balance_loss_clip": 1.26676774, + "balance_loss_mlp": 1.0175072, + "epoch": 0.5750187885164587, + "flos": 21296921207040.0, + "grad_norm": 1.9393347653174426, + "language_loss": 0.58768857, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.61246514, + "num_input_tokens_seen": 205954630, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.21044922, + "step": 9564, + "time_per_iteration": 2.842576026916504 + }, + { + "auxiliary_loss_clip": 0.01397458, + "auxiliary_loss_mlp": 0.01036153, + "balance_loss_clip": 1.23724508, + "balance_loss_mlp": 1.01498127, + "epoch": 0.5750789117691267, + "flos": 18813299616000.0, + "grad_norm": 2.271382244061916, + "language_loss": 0.76563203, + "learning_rate": 1.613186112465078e-06, + "loss": 0.78996813, + "num_input_tokens_seen": 205971510, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.21179199, + "step": 9565, + "time_per_iteration": 2.822629690170288 + }, + { + "auxiliary_loss_clip": 0.01222744, + "auxiliary_loss_mlp": 0.01059471, + "balance_loss_clip": 1.12820506, + "balance_loss_mlp": 1.03486598, + "epoch": 0.5751390350217946, + "flos": 70697982844800.0, + "grad_norm": 0.7429110730220063, + "language_loss": 0.60757947, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.63040161, + "num_input_tokens_seen": 206035125, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.24511719, + "step": 9566, + "time_per_iteration": 3.4485089778900146 + }, + { + "auxiliary_loss_clip": 0.01427413, + "auxiliary_loss_mlp": 0.01035134, + "balance_loss_clip": 1.26046824, + "balance_loss_mlp": 1.01490426, + "epoch": 0.5751991582744627, + "flos": 14254419168000.0, + "grad_norm": 1.97416771907183, + "language_loss": 0.76655155, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.79117703, + "num_input_tokens_seen": 206052075, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.20214844, + "step": 9567, + "time_per_iteration": 2.8615429401397705 + }, + { + "auxiliary_loss_clip": 0.01417491, + "auxiliary_loss_mlp": 0.01035784, + "balance_loss_clip": 1.25172365, + "balance_loss_mlp": 1.01512504, + "epoch": 0.5752592815271306, + "flos": 18336110261760.0, + "grad_norm": 1.3954716258947677, + "language_loss": 0.75165492, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.77618766, + "num_input_tokens_seen": 206069970, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.20666504, + "step": 9568, + "time_per_iteration": 4.267677545547485 + }, + { + "auxiliary_loss_clip": 0.01422093, + "auxiliary_loss_mlp": 0.01032582, + "balance_loss_clip": 1.25459337, + "balance_loss_mlp": 1.01222086, + "epoch": 0.5753194047797986, + "flos": 20932427946240.0, + "grad_norm": 1.582773656486258, + "language_loss": 0.72274375, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.74729049, + "num_input_tokens_seen": 206088950, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.20349121, + "step": 9569, + "time_per_iteration": 4.320723533630371 + }, + { + "auxiliary_loss_clip": 0.01420295, + "auxiliary_loss_mlp": 0.01041439, + "balance_loss_clip": 1.25295377, + "balance_loss_mlp": 1.01982594, + "epoch": 0.5753795280324665, + "flos": 19291303376640.0, + "grad_norm": 2.889669900598554, + "language_loss": 0.5639959, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.58861327, + "num_input_tokens_seen": 206107780, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.21618652, + "step": 9570, + "time_per_iteration": 4.243241310119629 + }, + { + "auxiliary_loss_clip": 0.01408948, + "auxiliary_loss_mlp": 0.01040398, + "balance_loss_clip": 1.24538743, + "balance_loss_mlp": 1.01973927, + "epoch": 0.5754396512851345, + "flos": 21662862301440.0, + "grad_norm": 1.4931114289916747, + "language_loss": 0.64696705, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.67146045, + "num_input_tokens_seen": 206127445, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.20654297, + "step": 9571, + "time_per_iteration": 2.905670642852783 + }, + { + "auxiliary_loss_clip": 0.01411058, + "auxiliary_loss_mlp": 0.01035944, + "balance_loss_clip": 1.24552405, + "balance_loss_mlp": 1.01586938, + "epoch": 0.5754997745378025, + "flos": 51038729619840.0, + "grad_norm": 1.6079505050111795, + "language_loss": 0.67880315, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.70327318, + "num_input_tokens_seen": 206152005, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.20092773, + "step": 9572, + "time_per_iteration": 3.1498560905456543 + }, + { + "auxiliary_loss_clip": 0.01410305, + "auxiliary_loss_mlp": 0.01039973, + "balance_loss_clip": 1.24618959, + "balance_loss_mlp": 1.01825356, + "epoch": 0.5755598977904705, + "flos": 22867183140480.0, + "grad_norm": 1.6564921837005748, + "language_loss": 0.73163533, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.75613815, + "num_input_tokens_seen": 206169875, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.21704102, + "step": 9573, + "time_per_iteration": 2.842282772064209 + }, + { + "auxiliary_loss_clip": 0.01409034, + "auxiliary_loss_mlp": 0.01033359, + "balance_loss_clip": 1.24837971, + "balance_loss_mlp": 1.01457131, + "epoch": 0.5756200210431385, + "flos": 38487176190720.0, + "grad_norm": 1.8001822030984374, + "language_loss": 0.76878279, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.79320669, + "num_input_tokens_seen": 206192635, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18811035, + "step": 9574, + "time_per_iteration": 3.0207772254943848 + }, + { + "auxiliary_loss_clip": 0.0144358, + "auxiliary_loss_mlp": 0.01038977, + "balance_loss_clip": 1.27037239, + "balance_loss_mlp": 1.01738787, + "epoch": 0.5756801442958064, + "flos": 23919281913600.0, + "grad_norm": 3.890396386877448, + "language_loss": 0.67286688, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.69769251, + "num_input_tokens_seen": 206211485, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.21569824, + "step": 9575, + "time_per_iteration": 2.8627254962921143 + }, + { + "auxiliary_loss_clip": 0.01403803, + "auxiliary_loss_mlp": 0.01035322, + "balance_loss_clip": 1.2424705, + "balance_loss_mlp": 1.01420975, + "epoch": 0.5757402675484744, + "flos": 21115036535040.0, + "grad_norm": 1.4358875820676233, + "language_loss": 0.80280131, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.82719254, + "num_input_tokens_seen": 206231740, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.21105957, + "step": 9576, + "time_per_iteration": 2.8811585903167725 + }, + { + "auxiliary_loss_clip": 0.01420466, + "auxiliary_loss_mlp": 0.01035183, + "balance_loss_clip": 1.25617206, + "balance_loss_mlp": 1.01546621, + "epoch": 0.5758003908011423, + "flos": 20569020560640.0, + "grad_norm": 2.637115430720199, + "language_loss": 0.71000242, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.73455894, + "num_input_tokens_seen": 206250975, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19714355, + "step": 9577, + "time_per_iteration": 2.825751304626465 + }, + { + "auxiliary_loss_clip": 0.01420236, + "auxiliary_loss_mlp": 0.01036685, + "balance_loss_clip": 1.25228119, + "balance_loss_mlp": 1.01690841, + "epoch": 0.5758605140538103, + "flos": 16481945335680.0, + "grad_norm": 2.0072066267817124, + "language_loss": 0.66838574, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.69295502, + "num_input_tokens_seen": 206268800, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.19787598, + "step": 9578, + "time_per_iteration": 2.8282041549682617 + }, + { + "auxiliary_loss_clip": 0.01417122, + "auxiliary_loss_mlp": 0.01032252, + "balance_loss_clip": 1.25257552, + "balance_loss_mlp": 1.01254666, + "epoch": 0.5759206373064782, + "flos": 21297328410240.0, + "grad_norm": 1.599003805009062, + "language_loss": 0.73434865, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.75884241, + "num_input_tokens_seen": 206287190, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19702148, + "step": 9579, + "time_per_iteration": 2.8292505741119385 + }, + { + "auxiliary_loss_clip": 0.01435285, + "auxiliary_loss_mlp": 0.0103414, + "balance_loss_clip": 1.26400018, + "balance_loss_mlp": 1.0133971, + "epoch": 0.5759807605591463, + "flos": 26079067336320.0, + "grad_norm": 2.4174328209053852, + "language_loss": 0.66628897, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.69098324, + "num_input_tokens_seen": 206307020, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.20751953, + "step": 9580, + "time_per_iteration": 2.9403624534606934 + }, + { + "auxiliary_loss_clip": 0.01412302, + "auxiliary_loss_mlp": 0.01035945, + "balance_loss_clip": 1.24585176, + "balance_loss_mlp": 1.01499987, + "epoch": 0.5760408838118142, + "flos": 18880542668160.0, + "grad_norm": 2.1142892414711105, + "language_loss": 0.86011469, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.88459712, + "num_input_tokens_seen": 206324095, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.20935059, + "step": 9581, + "time_per_iteration": 2.844252347946167 + }, + { + "auxiliary_loss_clip": 0.01458896, + "auxiliary_loss_mlp": 0.01040192, + "balance_loss_clip": 1.28678691, + "balance_loss_mlp": 1.01896119, + "epoch": 0.5761010070644822, + "flos": 15386701006080.0, + "grad_norm": 2.0718823693040256, + "language_loss": 0.68749315, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.712484, + "num_input_tokens_seen": 206343210, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.21228027, + "step": 9582, + "time_per_iteration": 2.833075523376465 + }, + { + "auxiliary_loss_clip": 0.01215791, + "auxiliary_loss_mlp": 0.01049125, + "balance_loss_clip": 1.12376189, + "balance_loss_mlp": 1.02700007, + "epoch": 0.5761611303171501, + "flos": 71508464530560.0, + "grad_norm": 0.6635219380741795, + "language_loss": 0.57224357, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.59489274, + "num_input_tokens_seen": 206415935, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.22167969, + "step": 9583, + "time_per_iteration": 3.5074870586395264 + }, + { + "auxiliary_loss_clip": 0.01417746, + "auxiliary_loss_mlp": 0.01032706, + "balance_loss_clip": 1.25224674, + "balance_loss_mlp": 1.01276231, + "epoch": 0.5762212535698181, + "flos": 16252978809600.0, + "grad_norm": 1.7843484743298361, + "language_loss": 0.83281118, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.85731578, + "num_input_tokens_seen": 206431900, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19946289, + "step": 9584, + "time_per_iteration": 2.828568696975708 + }, + { + "auxiliary_loss_clip": 0.01225705, + "auxiliary_loss_mlp": 0.01032882, + "balance_loss_clip": 1.12884545, + "balance_loss_mlp": 1.01113844, + "epoch": 0.5762813768224861, + "flos": 70219255167360.0, + "grad_norm": 0.6287756609342996, + "language_loss": 0.49619007, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.51877594, + "num_input_tokens_seen": 206501200, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.21777344, + "step": 9585, + "time_per_iteration": 3.389831304550171 + }, + { + "auxiliary_loss_clip": 0.0141373, + "auxiliary_loss_mlp": 0.01033827, + "balance_loss_clip": 1.24769115, + "balance_loss_mlp": 1.01335871, + "epoch": 0.5763415000751541, + "flos": 20526734655360.0, + "grad_norm": 1.5513011508405017, + "language_loss": 0.8579042, + "learning_rate": 1.605165098835465e-06, + "loss": 0.88237983, + "num_input_tokens_seen": 206520575, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.20458984, + "step": 9586, + "time_per_iteration": 2.8331639766693115 + }, + { + "auxiliary_loss_clip": 0.01414065, + "auxiliary_loss_mlp": 0.01035331, + "balance_loss_clip": 1.2479744, + "balance_loss_mlp": 1.01438653, + "epoch": 0.5764016233278221, + "flos": 15824092919040.0, + "grad_norm": 1.6537595481364107, + "language_loss": 0.80499583, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.82948977, + "num_input_tokens_seen": 206538060, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.20935059, + "step": 9587, + "time_per_iteration": 2.879678726196289 + }, + { + "auxiliary_loss_clip": 0.01423418, + "auxiliary_loss_mlp": 0.01037727, + "balance_loss_clip": 1.25662589, + "balance_loss_mlp": 1.01644838, + "epoch": 0.57646174658049, + "flos": 20780794062720.0, + "grad_norm": 1.8720312761335045, + "language_loss": 0.66755772, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.69216919, + "num_input_tokens_seen": 206557320, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.21289062, + "step": 9588, + "time_per_iteration": 2.9449524879455566 + }, + { + "auxiliary_loss_clip": 0.01419182, + "auxiliary_loss_mlp": 0.01035956, + "balance_loss_clip": 1.25110805, + "balance_loss_mlp": 1.01480854, + "epoch": 0.576521869833158, + "flos": 23560353763200.0, + "grad_norm": 1.8885624535290177, + "language_loss": 0.79448062, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.81903201, + "num_input_tokens_seen": 206575780, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.21154785, + "step": 9589, + "time_per_iteration": 4.370093822479248 + }, + { + "auxiliary_loss_clip": 0.01396854, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.23489976, + "balance_loss_mlp": 1.01348245, + "epoch": 0.5765819930858259, + "flos": 20276249587200.0, + "grad_norm": 2.116985814696345, + "language_loss": 0.8057307, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.83003151, + "num_input_tokens_seen": 206594100, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19750977, + "step": 9590, + "time_per_iteration": 2.838031053543091 + }, + { + "auxiliary_loss_clip": 0.01427822, + "auxiliary_loss_mlp": 0.01032152, + "balance_loss_clip": 1.26109099, + "balance_loss_mlp": 1.01208913, + "epoch": 0.5766421163384939, + "flos": 23159139707520.0, + "grad_norm": 1.8856156118005807, + "language_loss": 0.63818425, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.66278398, + "num_input_tokens_seen": 206613325, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.20080566, + "step": 9591, + "time_per_iteration": 2.837923288345337 + }, + { + "auxiliary_loss_clip": 0.01417692, + "auxiliary_loss_mlp": 0.0103656, + "balance_loss_clip": 1.25080609, + "balance_loss_mlp": 1.01640213, + "epoch": 0.5767022395911618, + "flos": 25860009421440.0, + "grad_norm": 1.5167557531385518, + "language_loss": 0.78387856, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.80842113, + "num_input_tokens_seen": 206634265, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.20141602, + "step": 9592, + "time_per_iteration": 2.882469892501831 + }, + { + "auxiliary_loss_clip": 0.01217321, + "auxiliary_loss_mlp": 0.01031775, + "balance_loss_clip": 1.1212635, + "balance_loss_mlp": 1.01184297, + "epoch": 0.5767623628438299, + "flos": 68327871753600.0, + "grad_norm": 0.7350499726264537, + "language_loss": 0.59753281, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.62002379, + "num_input_tokens_seen": 206696990, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.19921875, + "step": 9593, + "time_per_iteration": 3.4963326454162598 + }, + { + "auxiliary_loss_clip": 0.01432185, + "auxiliary_loss_mlp": 0.01035762, + "balance_loss_clip": 1.26171219, + "balance_loss_mlp": 1.01388693, + "epoch": 0.5768224860964978, + "flos": 30200103423360.0, + "grad_norm": 2.0924087788161074, + "language_loss": 0.71531767, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.73999715, + "num_input_tokens_seen": 206717815, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.21862793, + "step": 9594, + "time_per_iteration": 2.9057695865631104 + }, + { + "auxiliary_loss_clip": 0.01418276, + "auxiliary_loss_mlp": 0.01039628, + "balance_loss_clip": 1.25193191, + "balance_loss_mlp": 1.02005434, + "epoch": 0.5768826093491658, + "flos": 17904102480000.0, + "grad_norm": 2.1106477122275304, + "language_loss": 0.71573049, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.74030954, + "num_input_tokens_seen": 206735985, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.19567871, + "step": 9595, + "time_per_iteration": 2.843494415283203 + }, + { + "auxiliary_loss_clip": 0.01413084, + "auxiliary_loss_mlp": 0.01034173, + "balance_loss_clip": 1.24761248, + "balance_loss_mlp": 1.01411057, + "epoch": 0.5769427326018337, + "flos": 17466122384640.0, + "grad_norm": 1.9824875285738517, + "language_loss": 0.69971752, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.72419012, + "num_input_tokens_seen": 206753370, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.20068359, + "step": 9596, + "time_per_iteration": 2.837700128555298 + }, + { + "auxiliary_loss_clip": 0.01440765, + "auxiliary_loss_mlp": 0.01040604, + "balance_loss_clip": 1.26713872, + "balance_loss_mlp": 1.01917017, + "epoch": 0.5770028558545017, + "flos": 39436396992000.0, + "grad_norm": 2.147558485661204, + "language_loss": 0.68229485, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.7071085, + "num_input_tokens_seen": 206777645, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.21435547, + "step": 9597, + "time_per_iteration": 3.0175962448120117 + }, + { + "auxiliary_loss_clip": 0.01415753, + "auxiliary_loss_mlp": 0.01036339, + "balance_loss_clip": 1.24956489, + "balance_loss_mlp": 1.01644325, + "epoch": 0.5770629791071697, + "flos": 21544510608000.0, + "grad_norm": 1.8924729523168022, + "language_loss": 0.82062888, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.84514982, + "num_input_tokens_seen": 206794865, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.19873047, + "step": 9598, + "time_per_iteration": 2.825298547744751 + }, + { + "auxiliary_loss_clip": 0.01425756, + "auxiliary_loss_mlp": 0.01034153, + "balance_loss_clip": 1.25743723, + "balance_loss_mlp": 1.0140065, + "epoch": 0.5771231023598377, + "flos": 20896612047360.0, + "grad_norm": 1.7958867892416215, + "language_loss": 0.7361927, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.76079184, + "num_input_tokens_seen": 206814095, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.20153809, + "step": 9599, + "time_per_iteration": 2.83890438079834 + }, + { + "auxiliary_loss_clip": 0.0140623, + "auxiliary_loss_mlp": 0.0103254, + "balance_loss_clip": 1.24416876, + "balance_loss_mlp": 1.0132283, + "epoch": 0.5771832256125057, + "flos": 18086258620800.0, + "grad_norm": 2.292385817507339, + "language_loss": 0.78933167, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.81371939, + "num_input_tokens_seen": 206832245, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19299316, + "step": 9600, + "time_per_iteration": 2.8331172466278076 + }, + { + "auxiliary_loss_clip": 0.01427326, + "auxiliary_loss_mlp": 0.01039675, + "balance_loss_clip": 1.25778604, + "balance_loss_mlp": 1.01981449, + "epoch": 0.5772433488651736, + "flos": 26370164252160.0, + "grad_norm": 1.7113576102411356, + "language_loss": 0.73006511, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.75473517, + "num_input_tokens_seen": 206851535, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.19873047, + "step": 9601, + "time_per_iteration": 2.875852584838867 + }, + { + "auxiliary_loss_clip": 0.01414668, + "auxiliary_loss_mlp": 0.01035951, + "balance_loss_clip": 1.2507534, + "balance_loss_mlp": 1.01482737, + "epoch": 0.5773034721178416, + "flos": 19690028968320.0, + "grad_norm": 1.7058665889078708, + "language_loss": 0.69148374, + "learning_rate": 1.599058274973348e-06, + "loss": 0.71598995, + "num_input_tokens_seen": 206870595, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.21142578, + "step": 9602, + "time_per_iteration": 2.838705539703369 + }, + { + "auxiliary_loss_clip": 0.01396994, + "auxiliary_loss_mlp": 0.01040007, + "balance_loss_clip": 1.23765135, + "balance_loss_mlp": 1.02036166, + "epoch": 0.5773635953705095, + "flos": 25093849656960.0, + "grad_norm": 1.527887390502405, + "language_loss": 0.73743999, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.76181, + "num_input_tokens_seen": 206892320, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19641113, + "step": 9603, + "time_per_iteration": 4.263092756271362 + }, + { + "auxiliary_loss_clip": 0.0141325, + "auxiliary_loss_mlp": 0.0103553, + "balance_loss_clip": 1.24895012, + "balance_loss_mlp": 1.01613426, + "epoch": 0.5774237186231775, + "flos": 21042907044480.0, + "grad_norm": 1.6294794917873814, + "language_loss": 0.77594185, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.80042964, + "num_input_tokens_seen": 206912485, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19396973, + "step": 9604, + "time_per_iteration": 4.4003331661224365 + }, + { + "auxiliary_loss_clip": 0.01422813, + "auxiliary_loss_mlp": 0.01036478, + "balance_loss_clip": 1.25548756, + "balance_loss_mlp": 1.01567626, + "epoch": 0.5774838418758454, + "flos": 15240134540160.0, + "grad_norm": 1.6634882962032873, + "language_loss": 0.837524, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.86211693, + "num_input_tokens_seen": 206929100, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.20812988, + "step": 9605, + "time_per_iteration": 4.197317361831665 + }, + { + "auxiliary_loss_clip": 0.01452494, + "auxiliary_loss_mlp": 0.01037689, + "balance_loss_clip": 1.27742732, + "balance_loss_mlp": 1.0170064, + "epoch": 0.5775439651285135, + "flos": 23592052385280.0, + "grad_norm": 1.5894050009088756, + "language_loss": 0.78632814, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.81122994, + "num_input_tokens_seen": 206947020, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.20678711, + "step": 9606, + "time_per_iteration": 2.85898756980896 + }, + { + "auxiliary_loss_clip": 0.01415554, + "auxiliary_loss_mlp": 0.01035633, + "balance_loss_clip": 1.24989283, + "balance_loss_mlp": 1.0159632, + "epoch": 0.5776040883811814, + "flos": 18049492581120.0, + "grad_norm": 1.761725318161477, + "language_loss": 0.74180353, + "learning_rate": 1.597150687927619e-06, + "loss": 0.76631546, + "num_input_tokens_seen": 206964065, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.1965332, + "step": 9607, + "time_per_iteration": 2.8024539947509766 + }, + { + "auxiliary_loss_clip": 0.01426683, + "auxiliary_loss_mlp": 0.01038683, + "balance_loss_clip": 1.25929523, + "balance_loss_mlp": 1.01889408, + "epoch": 0.5776642116338494, + "flos": 18633858163200.0, + "grad_norm": 2.0253264297958857, + "language_loss": 0.69928145, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.72393513, + "num_input_tokens_seen": 206981940, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.19787598, + "step": 9608, + "time_per_iteration": 2.848527431488037 + }, + { + "auxiliary_loss_clip": 0.01416867, + "auxiliary_loss_mlp": 0.01039145, + "balance_loss_clip": 1.2496897, + "balance_loss_mlp": 1.01835454, + "epoch": 0.5777243348865173, + "flos": 28414222179840.0, + "grad_norm": 1.7471931396889104, + "language_loss": 0.77358437, + "learning_rate": 1.596387759940665e-06, + "loss": 0.79814446, + "num_input_tokens_seen": 207002365, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.20800781, + "step": 9609, + "time_per_iteration": 2.9093523025512695 + }, + { + "auxiliary_loss_clip": 0.01406307, + "auxiliary_loss_mlp": 0.01032662, + "balance_loss_clip": 1.24074125, + "balance_loss_mlp": 1.01253915, + "epoch": 0.5777844581391853, + "flos": 24035371367040.0, + "grad_norm": 1.8320377933634127, + "language_loss": 0.78219056, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.80658031, + "num_input_tokens_seen": 207021195, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.2010498, + "step": 9610, + "time_per_iteration": 2.842288017272949 + }, + { + "auxiliary_loss_clip": 0.01422465, + "auxiliary_loss_mlp": 0.01037084, + "balance_loss_clip": 1.25669622, + "balance_loss_mlp": 1.01661539, + "epoch": 0.5778445813918534, + "flos": 17784257708160.0, + "grad_norm": 2.179219040735565, + "language_loss": 0.6989606, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.72355616, + "num_input_tokens_seen": 207037465, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.20471191, + "step": 9611, + "time_per_iteration": 2.84462833404541 + }, + { + "auxiliary_loss_clip": 0.01413098, + "auxiliary_loss_mlp": 0.01034817, + "balance_loss_clip": 1.24954891, + "balance_loss_mlp": 1.01386046, + "epoch": 0.5779047046445213, + "flos": 22242431934720.0, + "grad_norm": 1.834851675157946, + "language_loss": 0.83500773, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.85948682, + "num_input_tokens_seen": 207054230, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.20947266, + "step": 9612, + "time_per_iteration": 2.8602166175842285 + }, + { + "auxiliary_loss_clip": 0.01412651, + "auxiliary_loss_mlp": 0.0103381, + "balance_loss_clip": 1.24850702, + "balance_loss_mlp": 1.01404536, + "epoch": 0.5779648278971893, + "flos": 21444392568960.0, + "grad_norm": 1.6650491066534918, + "language_loss": 0.80110729, + "learning_rate": 1.594862087742667e-06, + "loss": 0.82557189, + "num_input_tokens_seen": 207073150, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19750977, + "step": 9613, + "time_per_iteration": 2.8393123149871826 + }, + { + "auxiliary_loss_clip": 0.01405607, + "auxiliary_loss_mlp": 0.01035999, + "balance_loss_clip": 1.24204922, + "balance_loss_mlp": 1.01541138, + "epoch": 0.5780249511498572, + "flos": 19035479422080.0, + "grad_norm": 1.9041929212471937, + "language_loss": 0.78186792, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.80628407, + "num_input_tokens_seen": 207090375, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.20593262, + "step": 9614, + "time_per_iteration": 2.833939552307129 + }, + { + "auxiliary_loss_clip": 0.01419383, + "auxiliary_loss_mlp": 0.01037036, + "balance_loss_clip": 1.25114131, + "balance_loss_mlp": 1.01644897, + "epoch": 0.5780850744025252, + "flos": 12130766357760.0, + "grad_norm": 3.6386443646621887, + "language_loss": 0.82369852, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.84826273, + "num_input_tokens_seen": 207106030, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.20593262, + "step": 9615, + "time_per_iteration": 2.8209636211395264 + }, + { + "auxiliary_loss_clip": 0.0142063, + "auxiliary_loss_mlp": 0.0103991, + "balance_loss_clip": 1.25190294, + "balance_loss_mlp": 1.02081251, + "epoch": 0.5781451976551931, + "flos": 25054866622080.0, + "grad_norm": 1.5577939982528795, + "language_loss": 0.6750465, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.6996519, + "num_input_tokens_seen": 207125435, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.19091797, + "step": 9616, + "time_per_iteration": 2.8806276321411133 + }, + { + "auxiliary_loss_clip": 0.01404374, + "auxiliary_loss_mlp": 0.01039952, + "balance_loss_clip": 1.24265385, + "balance_loss_mlp": 1.0196743, + "epoch": 0.5782053209078611, + "flos": 19255170764160.0, + "grad_norm": 1.7448875836798061, + "language_loss": 0.78801596, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.81245923, + "num_input_tokens_seen": 207145095, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.20275879, + "step": 9617, + "time_per_iteration": 2.8146486282348633 + }, + { + "auxiliary_loss_clip": 0.01407482, + "auxiliary_loss_mlp": 0.01039944, + "balance_loss_clip": 1.24389744, + "balance_loss_mlp": 1.01949918, + "epoch": 0.578265444160529, + "flos": 26004720850560.0, + "grad_norm": 1.50095810234385, + "language_loss": 0.76080847, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.78528273, + "num_input_tokens_seen": 207166045, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.2043457, + "step": 9618, + "time_per_iteration": 2.8852405548095703 + }, + { + "auxiliary_loss_clip": 0.01411582, + "auxiliary_loss_mlp": 0.0103618, + "balance_loss_clip": 1.24803841, + "balance_loss_mlp": 1.01661789, + "epoch": 0.5783255674131971, + "flos": 21808569116160.0, + "grad_norm": 1.7132596656387997, + "language_loss": 0.82130975, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.84578735, + "num_input_tokens_seen": 207185290, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19567871, + "step": 9619, + "time_per_iteration": 2.8386099338531494 + }, + { + "auxiliary_loss_clip": 0.01411469, + "auxiliary_loss_mlp": 0.01042609, + "balance_loss_clip": 1.24641538, + "balance_loss_mlp": 1.02304673, + "epoch": 0.578385690665865, + "flos": 24800309521920.0, + "grad_norm": 1.834069995924176, + "language_loss": 0.73240411, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.75694489, + "num_input_tokens_seen": 207205505, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19555664, + "step": 9620, + "time_per_iteration": 2.865807056427002 + }, + { + "auxiliary_loss_clip": 0.01429839, + "auxiliary_loss_mlp": 0.01040361, + "balance_loss_clip": 1.26335335, + "balance_loss_mlp": 1.02054858, + "epoch": 0.578445813918533, + "flos": 21222167518080.0, + "grad_norm": 1.5909783185512139, + "language_loss": 0.78326213, + "learning_rate": 1.591811481689916e-06, + "loss": 0.80796415, + "num_input_tokens_seen": 207225315, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.19787598, + "step": 9621, + "time_per_iteration": 2.8440253734588623 + }, + { + "auxiliary_loss_clip": 0.01421698, + "auxiliary_loss_mlp": 0.01038268, + "balance_loss_clip": 1.25558519, + "balance_loss_mlp": 1.01852667, + "epoch": 0.5785059371712009, + "flos": 25057852778880.0, + "grad_norm": 1.4019016669559574, + "language_loss": 0.71212596, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.73672563, + "num_input_tokens_seen": 207247690, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.19750977, + "step": 9622, + "time_per_iteration": 2.888073444366455 + }, + { + "auxiliary_loss_clip": 0.01217027, + "auxiliary_loss_mlp": 0.01033537, + "balance_loss_clip": 1.12587953, + "balance_loss_mlp": 1.01131654, + "epoch": 0.5785660604238689, + "flos": 70877333808000.0, + "grad_norm": 0.7777259577716826, + "language_loss": 0.56019592, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.58270156, + "num_input_tokens_seen": 207301735, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.22265625, + "step": 9623, + "time_per_iteration": 3.388606309890747 + }, + { + "auxiliary_loss_clip": 0.01417477, + "auxiliary_loss_mlp": 0.01043099, + "balance_loss_clip": 1.24971581, + "balance_loss_mlp": 1.02257133, + "epoch": 0.578626183676537, + "flos": 31662555701760.0, + "grad_norm": 1.9168647914571713, + "language_loss": 0.72048259, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.7450884, + "num_input_tokens_seen": 207321240, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.20532227, + "step": 9624, + "time_per_iteration": 2.933692693710327 + }, + { + "auxiliary_loss_clip": 0.01413407, + "auxiliary_loss_mlp": 0.01045239, + "balance_loss_clip": 1.24838221, + "balance_loss_mlp": 1.02490222, + "epoch": 0.5786863069292049, + "flos": 21873504683520.0, + "grad_norm": 2.437695218053764, + "language_loss": 0.83242714, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.85701364, + "num_input_tokens_seen": 207339540, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.20349121, + "step": 9625, + "time_per_iteration": 4.262711048126221 + }, + { + "auxiliary_loss_clip": 0.01420048, + "auxiliary_loss_mlp": 0.01038599, + "balance_loss_clip": 1.25616479, + "balance_loss_mlp": 1.0178802, + "epoch": 0.5787464301818729, + "flos": 23374668528000.0, + "grad_norm": 1.5527125035245037, + "language_loss": 0.70528024, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.72986668, + "num_input_tokens_seen": 207360470, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.20727539, + "step": 9626, + "time_per_iteration": 2.9079980850219727 + }, + { + "auxiliary_loss_clip": 0.01410475, + "auxiliary_loss_mlp": 0.01038249, + "balance_loss_clip": 1.24646783, + "balance_loss_mlp": 1.01803136, + "epoch": 0.5788065534345408, + "flos": 30015096860160.0, + "grad_norm": 4.270699692625408, + "language_loss": 0.72456765, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.74905491, + "num_input_tokens_seen": 207383080, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.20227051, + "step": 9627, + "time_per_iteration": 2.9150376319885254 + }, + { + "auxiliary_loss_clip": 0.01418061, + "auxiliary_loss_mlp": 0.0103752, + "balance_loss_clip": 1.25417519, + "balance_loss_mlp": 1.01764798, + "epoch": 0.5788666766872088, + "flos": 24536658216960.0, + "grad_norm": 1.7334315434430667, + "language_loss": 0.8428638, + "learning_rate": 1.589143013764458e-06, + "loss": 0.8674196, + "num_input_tokens_seen": 207401000, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19885254, + "step": 9628, + "time_per_iteration": 2.8801112174987793 + }, + { + "auxiliary_loss_clip": 0.01427272, + "auxiliary_loss_mlp": 0.01039518, + "balance_loss_clip": 1.26155567, + "balance_loss_mlp": 1.01906204, + "epoch": 0.5789267999398767, + "flos": 23743414800000.0, + "grad_norm": 1.5867481812755941, + "language_loss": 0.72934943, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.75401735, + "num_input_tokens_seen": 207419230, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.20458984, + "step": 9629, + "time_per_iteration": 2.892033338546753 + }, + { + "auxiliary_loss_clip": 0.01418725, + "auxiliary_loss_mlp": 0.01039893, + "balance_loss_clip": 1.25384867, + "balance_loss_mlp": 1.01916194, + "epoch": 0.5789869231925447, + "flos": 21143070328320.0, + "grad_norm": 2.065372319135669, + "language_loss": 0.750741, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.77532715, + "num_input_tokens_seen": 207437615, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.20739746, + "step": 9630, + "time_per_iteration": 2.9415769577026367 + }, + { + "auxiliary_loss_clip": 0.01399114, + "auxiliary_loss_mlp": 0.01036868, + "balance_loss_clip": 1.23911309, + "balance_loss_mlp": 1.01738894, + "epoch": 0.5790470464452127, + "flos": 21218095486080.0, + "grad_norm": 1.537372840187083, + "language_loss": 0.79492116, + "learning_rate": 1.587999618060523e-06, + "loss": 0.81928098, + "num_input_tokens_seen": 207457270, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19482422, + "step": 9631, + "time_per_iteration": 2.8702595233917236 + }, + { + "auxiliary_loss_clip": 0.01423249, + "auxiliary_loss_mlp": 0.01038703, + "balance_loss_clip": 1.25823879, + "balance_loss_mlp": 1.01875949, + "epoch": 0.5791071696978807, + "flos": 23414873172480.0, + "grad_norm": 9.662733625350944, + "language_loss": 0.75739115, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.78201067, + "num_input_tokens_seen": 207477890, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19934082, + "step": 9632, + "time_per_iteration": 2.8572511672973633 + }, + { + "auxiliary_loss_clip": 0.01410774, + "auxiliary_loss_mlp": 0.01032991, + "balance_loss_clip": 1.2462523, + "balance_loss_mlp": 1.0127852, + "epoch": 0.5791672929505486, + "flos": 24217075059840.0, + "grad_norm": 2.124403997800271, + "language_loss": 0.79579717, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.82023478, + "num_input_tokens_seen": 207497670, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.2019043, + "step": 9633, + "time_per_iteration": 2.8659183979034424 + }, + { + "auxiliary_loss_clip": 0.01452831, + "auxiliary_loss_mlp": 0.01040633, + "balance_loss_clip": 1.27888763, + "balance_loss_mlp": 1.01975942, + "epoch": 0.5792274162032166, + "flos": 24358528863360.0, + "grad_norm": 1.753991951687585, + "language_loss": 0.78801966, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.81295431, + "num_input_tokens_seen": 207516105, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.20861816, + "step": 9634, + "time_per_iteration": 2.8666832447052 + }, + { + "auxiliary_loss_clip": 0.01431893, + "auxiliary_loss_mlp": 0.01037507, + "balance_loss_clip": 1.26334524, + "balance_loss_mlp": 1.01684773, + "epoch": 0.5792875394558845, + "flos": 20459220134400.0, + "grad_norm": 2.5498845316499823, + "language_loss": 0.64774567, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.67243969, + "num_input_tokens_seen": 207533685, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.20666504, + "step": 9635, + "time_per_iteration": 2.8647119998931885 + }, + { + "auxiliary_loss_clip": 0.01411968, + "auxiliary_loss_mlp": 0.01033342, + "balance_loss_clip": 1.25012696, + "balance_loss_mlp": 1.01405442, + "epoch": 0.5793476627085525, + "flos": 24070780062720.0, + "grad_norm": 1.4630507628619012, + "language_loss": 0.78032589, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.80477893, + "num_input_tokens_seen": 207552840, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19287109, + "step": 9636, + "time_per_iteration": 2.874861478805542 + }, + { + "auxiliary_loss_clip": 0.01410257, + "auxiliary_loss_mlp": 0.01036135, + "balance_loss_clip": 1.24932826, + "balance_loss_mlp": 1.01762199, + "epoch": 0.5794077859612206, + "flos": 22064393070720.0, + "grad_norm": 1.576172456139333, + "language_loss": 0.69591963, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.72038364, + "num_input_tokens_seen": 207572095, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18505859, + "step": 9637, + "time_per_iteration": 2.8470871448516846 + }, + { + "auxiliary_loss_clip": 0.01435388, + "auxiliary_loss_mlp": 0.01038974, + "balance_loss_clip": 1.26678252, + "balance_loss_mlp": 1.01832688, + "epoch": 0.5794679092138885, + "flos": 11441667767040.0, + "grad_norm": 13.814267988394317, + "language_loss": 0.73239106, + "learning_rate": 1.585332242234043e-06, + "loss": 0.75713468, + "num_input_tokens_seen": 207587495, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.2064209, + "step": 9638, + "time_per_iteration": 4.2034752368927 + }, + { + "auxiliary_loss_clip": 0.01421954, + "auxiliary_loss_mlp": 0.01040582, + "balance_loss_clip": 1.25773251, + "balance_loss_mlp": 1.02113891, + "epoch": 0.5795280324665565, + "flos": 18889546383360.0, + "grad_norm": 1.5656758674596514, + "language_loss": 0.73185062, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.75647593, + "num_input_tokens_seen": 207606795, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19445801, + "step": 9639, + "time_per_iteration": 4.262020587921143 + }, + { + "auxiliary_loss_clip": 0.01417103, + "auxiliary_loss_mlp": 0.01038806, + "balance_loss_clip": 1.25280678, + "balance_loss_mlp": 1.01920772, + "epoch": 0.5795881557192244, + "flos": 13013739492480.0, + "grad_norm": 4.007521070679836, + "language_loss": 0.69990987, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.72446895, + "num_input_tokens_seen": 207623620, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19604492, + "step": 9640, + "time_per_iteration": 4.309274196624756 + }, + { + "auxiliary_loss_clip": 0.01453974, + "auxiliary_loss_mlp": 0.01043175, + "balance_loss_clip": 1.28020322, + "balance_loss_mlp": 1.02168131, + "epoch": 0.5796482789718924, + "flos": 19940468791680.0, + "grad_norm": 2.769298514180623, + "language_loss": 0.78119987, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.80617142, + "num_input_tokens_seen": 207639380, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.21496582, + "step": 9641, + "time_per_iteration": 2.7930023670196533 + }, + { + "auxiliary_loss_clip": 0.01419821, + "auxiliary_loss_mlp": 0.01038372, + "balance_loss_clip": 1.25481629, + "balance_loss_mlp": 1.01809442, + "epoch": 0.5797084022245603, + "flos": 21660238103040.0, + "grad_norm": 2.103226355990795, + "language_loss": 0.74886531, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.77344722, + "num_input_tokens_seen": 207657915, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.20288086, + "step": 9642, + "time_per_iteration": 2.841566562652588 + }, + { + "auxiliary_loss_clip": 0.01415094, + "auxiliary_loss_mlp": 0.01041646, + "balance_loss_clip": 1.2508713, + "balance_loss_mlp": 1.02082014, + "epoch": 0.5797685254772283, + "flos": 26042798989440.0, + "grad_norm": 1.4937907130960841, + "language_loss": 0.74237978, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.76694715, + "num_input_tokens_seen": 207678620, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.20812988, + "step": 9643, + "time_per_iteration": 2.8780648708343506 + }, + { + "auxiliary_loss_clip": 0.01427635, + "auxiliary_loss_mlp": 0.01037011, + "balance_loss_clip": 1.26114643, + "balance_loss_mlp": 1.01649523, + "epoch": 0.5798286487298963, + "flos": 22713920444160.0, + "grad_norm": 3.8865801566266223, + "language_loss": 0.68518865, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.70983517, + "num_input_tokens_seen": 207696980, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.20507812, + "step": 9644, + "time_per_iteration": 2.8448522090911865 + }, + { + "auxiliary_loss_clip": 0.01435206, + "auxiliary_loss_mlp": 0.01038965, + "balance_loss_clip": 1.26650763, + "balance_loss_mlp": 1.01765049, + "epoch": 0.5798887719825643, + "flos": 23159411176320.0, + "grad_norm": 2.0564735405782266, + "language_loss": 0.86839861, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.89314032, + "num_input_tokens_seen": 207714065, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.21313477, + "step": 9645, + "time_per_iteration": 2.8298254013061523 + }, + { + "auxiliary_loss_clip": 0.01425137, + "auxiliary_loss_mlp": 0.01035517, + "balance_loss_clip": 1.25997949, + "balance_loss_mlp": 1.01581144, + "epoch": 0.5799488952352322, + "flos": 24436811646720.0, + "grad_norm": 2.2907869005974737, + "language_loss": 0.76128471, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.78589129, + "num_input_tokens_seen": 207734720, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19714355, + "step": 9646, + "time_per_iteration": 2.899134635925293 + }, + { + "auxiliary_loss_clip": 0.01436424, + "auxiliary_loss_mlp": 0.01039859, + "balance_loss_clip": 1.26713765, + "balance_loss_mlp": 1.01775765, + "epoch": 0.5800090184879002, + "flos": 38409210120960.0, + "grad_norm": 2.03549269623621, + "language_loss": 0.59862489, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.62338769, + "num_input_tokens_seen": 207755435, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.2208252, + "step": 9647, + "time_per_iteration": 3.032667398452759 + }, + { + "auxiliary_loss_clip": 0.01433655, + "auxiliary_loss_mlp": 0.01045947, + "balance_loss_clip": 1.26467729, + "balance_loss_mlp": 1.02543068, + "epoch": 0.5800691417405681, + "flos": 19792454492160.0, + "grad_norm": 1.589211301148527, + "language_loss": 0.84715652, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.87195253, + "num_input_tokens_seen": 207773570, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.20507812, + "step": 9648, + "time_per_iteration": 2.9254708290100098 + }, + { + "auxiliary_loss_clip": 0.01214758, + "auxiliary_loss_mlp": 0.01024178, + "balance_loss_clip": 1.12095559, + "balance_loss_mlp": 1.00376916, + "epoch": 0.5801292649932361, + "flos": 70343968394880.0, + "grad_norm": 0.8604722644748034, + "language_loss": 0.63157737, + "learning_rate": 1.581142210256242e-06, + "loss": 0.65396678, + "num_input_tokens_seen": 207830095, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.20410156, + "step": 9649, + "time_per_iteration": 3.407658338546753 + }, + { + "auxiliary_loss_clip": 0.01405054, + "auxiliary_loss_mlp": 0.01034423, + "balance_loss_clip": 1.24384511, + "balance_loss_mlp": 1.01509881, + "epoch": 0.5801893882459042, + "flos": 18744246771840.0, + "grad_norm": 1.8771501263449424, + "language_loss": 0.82366675, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.84806156, + "num_input_tokens_seen": 207848555, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.1932373, + "step": 9650, + "time_per_iteration": 2.8750314712524414 + }, + { + "auxiliary_loss_clip": 0.01425657, + "auxiliary_loss_mlp": 0.01034957, + "balance_loss_clip": 1.25614154, + "balance_loss_mlp": 1.01493013, + "epoch": 0.5802495114985721, + "flos": 15604989759360.0, + "grad_norm": 2.14037973103482, + "language_loss": 0.77429354, + "learning_rate": 1.580380592177698e-06, + "loss": 0.79889977, + "num_input_tokens_seen": 207867060, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.20019531, + "step": 9651, + "time_per_iteration": 2.8271162509918213 + }, + { + "auxiliary_loss_clip": 0.01429727, + "auxiliary_loss_mlp": 0.01036715, + "balance_loss_clip": 1.26090932, + "balance_loss_mlp": 1.017308, + "epoch": 0.5803096347512401, + "flos": 18263392588800.0, + "grad_norm": 2.2856666126433285, + "language_loss": 0.74991155, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.77457595, + "num_input_tokens_seen": 207884520, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.1940918, + "step": 9652, + "time_per_iteration": 2.9189822673797607 + }, + { + "auxiliary_loss_clip": 0.01417885, + "auxiliary_loss_mlp": 0.01038213, + "balance_loss_clip": 1.24969983, + "balance_loss_mlp": 1.01708949, + "epoch": 0.580369758003908, + "flos": 22903406242560.0, + "grad_norm": 4.400847956487363, + "language_loss": 0.77783048, + "learning_rate": 1.579619037747193e-06, + "loss": 0.80239141, + "num_input_tokens_seen": 207905370, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.21118164, + "step": 9653, + "time_per_iteration": 2.8709278106689453 + }, + { + "auxiliary_loss_clip": 0.01419052, + "auxiliary_loss_mlp": 0.01035946, + "balance_loss_clip": 1.25171161, + "balance_loss_mlp": 1.01508391, + "epoch": 0.580429881256576, + "flos": 18706937794560.0, + "grad_norm": 2.2308169713829287, + "language_loss": 0.75700247, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.78155243, + "num_input_tokens_seen": 207923790, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.20849609, + "step": 9654, + "time_per_iteration": 2.825124740600586 + }, + { + "auxiliary_loss_clip": 0.01400842, + "auxiliary_loss_mlp": 0.01035299, + "balance_loss_clip": 1.23925114, + "balance_loss_mlp": 1.01522446, + "epoch": 0.5804900045092439, + "flos": 24692952314880.0, + "grad_norm": 1.8439425104151423, + "language_loss": 0.71393049, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.73829186, + "num_input_tokens_seen": 207942335, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.20056152, + "step": 9655, + "time_per_iteration": 2.8343758583068848 + }, + { + "auxiliary_loss_clip": 0.01423433, + "auxiliary_loss_mlp": 0.01034612, + "balance_loss_clip": 1.25214159, + "balance_loss_mlp": 1.01297569, + "epoch": 0.580550127761912, + "flos": 23123278563840.0, + "grad_norm": 2.1005716896837603, + "language_loss": 0.70085263, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.72543311, + "num_input_tokens_seen": 207961975, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.21630859, + "step": 9656, + "time_per_iteration": 2.8374574184417725 + }, + { + "auxiliary_loss_clip": 0.01392963, + "auxiliary_loss_mlp": 0.01029501, + "balance_loss_clip": 1.23334038, + "balance_loss_mlp": 1.01043904, + "epoch": 0.5806102510145799, + "flos": 18484079316480.0, + "grad_norm": 1.670552258171982, + "language_loss": 0.72103113, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.74525577, + "num_input_tokens_seen": 207979520, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19055176, + "step": 9657, + "time_per_iteration": 2.8063547611236572 + }, + { + "auxiliary_loss_clip": 0.01437333, + "auxiliary_loss_mlp": 0.01038432, + "balance_loss_clip": 1.26663828, + "balance_loss_mlp": 1.01776123, + "epoch": 0.5806703742672479, + "flos": 23926385347200.0, + "grad_norm": 2.4987999101303564, + "language_loss": 0.7201649, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.74492252, + "num_input_tokens_seen": 207998375, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.20678711, + "step": 9658, + "time_per_iteration": 2.9283196926116943 + }, + { + "auxiliary_loss_clip": 0.0120973, + "auxiliary_loss_mlp": 0.01028751, + "balance_loss_clip": 1.1197058, + "balance_loss_mlp": 1.00576723, + "epoch": 0.5807304975199158, + "flos": 66343184282880.0, + "grad_norm": 0.6479624456958403, + "language_loss": 0.5359453, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.55833012, + "num_input_tokens_seen": 208060605, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.22949219, + "step": 9659, + "time_per_iteration": 4.812680959701538 + }, + { + "auxiliary_loss_clip": 0.01424726, + "auxiliary_loss_mlp": 0.01039849, + "balance_loss_clip": 1.25564694, + "balance_loss_mlp": 1.01872468, + "epoch": 0.5807906207725838, + "flos": 31734911416320.0, + "grad_norm": 3.6420768443223683, + "language_loss": 0.63312209, + "learning_rate": 1.576954100136366e-06, + "loss": 0.65776789, + "num_input_tokens_seen": 208080320, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.21105957, + "step": 9660, + "time_per_iteration": 2.938620090484619 + }, + { + "auxiliary_loss_clip": 0.01411904, + "auxiliary_loss_mlp": 0.01034269, + "balance_loss_clip": 1.24419188, + "balance_loss_mlp": 1.01403928, + "epoch": 0.5808507440252517, + "flos": 23810567362560.0, + "grad_norm": 1.593740415267695, + "language_loss": 0.66413468, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.68859637, + "num_input_tokens_seen": 208099305, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.20227051, + "step": 9661, + "time_per_iteration": 2.8557522296905518 + }, + { + "auxiliary_loss_clip": 0.01392883, + "auxiliary_loss_mlp": 0.01031899, + "balance_loss_clip": 1.23334396, + "balance_loss_mlp": 1.01097786, + "epoch": 0.5809108672779197, + "flos": 13706231443200.0, + "grad_norm": 2.6670945630162213, + "language_loss": 0.75088805, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.77513587, + "num_input_tokens_seen": 208116960, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.20935059, + "step": 9662, + "time_per_iteration": 2.8302438259124756 + }, + { + "auxiliary_loss_clip": 0.01214195, + "auxiliary_loss_mlp": 0.01052736, + "balance_loss_clip": 1.12023628, + "balance_loss_mlp": 1.02946651, + "epoch": 0.5809709905305876, + "flos": 69170278285440.0, + "grad_norm": 0.8847474026625254, + "language_loss": 0.5849157, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.60758501, + "num_input_tokens_seen": 208182190, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.23242188, + "step": 9663, + "time_per_iteration": 3.3706986904144287 + }, + { + "auxiliary_loss_clip": 0.01404862, + "auxiliary_loss_mlp": 0.01030255, + "balance_loss_clip": 1.24237251, + "balance_loss_mlp": 1.01076448, + "epoch": 0.5810311137832557, + "flos": 19836776413440.0, + "grad_norm": 2.351922942396373, + "language_loss": 0.82888025, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.85323137, + "num_input_tokens_seen": 208197015, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19470215, + "step": 9664, + "time_per_iteration": 2.8335611820220947 + }, + { + "auxiliary_loss_clip": 0.01408164, + "auxiliary_loss_mlp": 0.01033621, + "balance_loss_clip": 1.23928022, + "balance_loss_mlp": 1.01241338, + "epoch": 0.5810912370359237, + "flos": 29249570522880.0, + "grad_norm": 1.6119931103610399, + "language_loss": 0.82341415, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.84783196, + "num_input_tokens_seen": 208215795, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.21203613, + "step": 9665, + "time_per_iteration": 2.9228031635284424 + }, + { + "auxiliary_loss_clip": 0.01435346, + "auxiliary_loss_mlp": 0.01037352, + "balance_loss_clip": 1.26398349, + "balance_loss_mlp": 1.01672828, + "epoch": 0.5811513602885916, + "flos": 22796275259520.0, + "grad_norm": 1.6137206480338748, + "language_loss": 0.81821132, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.84293824, + "num_input_tokens_seen": 208234655, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.20629883, + "step": 9666, + "time_per_iteration": 2.8480751514434814 + }, + { + "auxiliary_loss_clip": 0.01413193, + "auxiliary_loss_mlp": 0.01035163, + "balance_loss_clip": 1.24985802, + "balance_loss_mlp": 1.01574397, + "epoch": 0.5812114835412596, + "flos": 18743658589440.0, + "grad_norm": 1.927148951646313, + "language_loss": 0.81023264, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.8347162, + "num_input_tokens_seen": 208251300, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19421387, + "step": 9667, + "time_per_iteration": 2.8306288719177246 + }, + { + "auxiliary_loss_clip": 0.01441449, + "auxiliary_loss_mlp": 0.01038031, + "balance_loss_clip": 1.26641202, + "balance_loss_mlp": 1.01651382, + "epoch": 0.5812716067939275, + "flos": 26442203253120.0, + "grad_norm": 3.022829467062817, + "language_loss": 0.79354572, + "learning_rate": 1.573909419957653e-06, + "loss": 0.81834054, + "num_input_tokens_seen": 208272685, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.21520996, + "step": 9668, + "time_per_iteration": 2.8757975101470947 + }, + { + "auxiliary_loss_clip": 0.01410786, + "auxiliary_loss_mlp": 0.01040352, + "balance_loss_clip": 1.24442458, + "balance_loss_mlp": 1.02007437, + "epoch": 0.5813317300465956, + "flos": 43413355077120.0, + "grad_norm": 1.7779705162503576, + "language_loss": 0.65692675, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.68143821, + "num_input_tokens_seen": 208294315, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.20275879, + "step": 9669, + "time_per_iteration": 3.0489182472229004 + }, + { + "auxiliary_loss_clip": 0.01405791, + "auxiliary_loss_mlp": 0.01042599, + "balance_loss_clip": 1.24131405, + "balance_loss_mlp": 1.02105784, + "epoch": 0.5813918532992635, + "flos": 24794925390720.0, + "grad_norm": 1.4745824876898979, + "language_loss": 0.7407136, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.76519758, + "num_input_tokens_seen": 208315610, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.2154541, + "step": 9670, + "time_per_iteration": 2.9266796112060547 + }, + { + "auxiliary_loss_clip": 0.01416773, + "auxiliary_loss_mlp": 0.01038793, + "balance_loss_clip": 1.24924731, + "balance_loss_mlp": 1.0186584, + "epoch": 0.5814519765519315, + "flos": 22867771322880.0, + "grad_norm": 2.0199678771878298, + "language_loss": 0.79506385, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.81961954, + "num_input_tokens_seen": 208334725, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.20141602, + "step": 9671, + "time_per_iteration": 2.8541440963745117 + }, + { + "auxiliary_loss_clip": 0.01445218, + "auxiliary_loss_mlp": 0.01042285, + "balance_loss_clip": 1.27219164, + "balance_loss_mlp": 1.02123249, + "epoch": 0.5815120998045994, + "flos": 24071413489920.0, + "grad_norm": 2.033699395922381, + "language_loss": 0.62801576, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.6528908, + "num_input_tokens_seen": 208353825, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.21032715, + "step": 9672, + "time_per_iteration": 4.229781627655029 + }, + { + "auxiliary_loss_clip": 0.01410273, + "auxiliary_loss_mlp": 0.01034375, + "balance_loss_clip": 1.24586141, + "balance_loss_mlp": 1.01457453, + "epoch": 0.5815722230572674, + "flos": 24290064201600.0, + "grad_norm": 1.6086933957418081, + "language_loss": 0.82325524, + "learning_rate": 1.572007019492342e-06, + "loss": 0.84770179, + "num_input_tokens_seen": 208374160, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19799805, + "step": 9673, + "time_per_iteration": 2.961327075958252 + }, + { + "auxiliary_loss_clip": 0.01421899, + "auxiliary_loss_mlp": 0.0103739, + "balance_loss_clip": 1.25221586, + "balance_loss_mlp": 1.01652801, + "epoch": 0.5816323463099353, + "flos": 22210552333440.0, + "grad_norm": 2.049307579270645, + "language_loss": 0.89059794, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.91519082, + "num_input_tokens_seen": 208392105, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.20861816, + "step": 9674, + "time_per_iteration": 2.8462719917297363 + }, + { + "auxiliary_loss_clip": 0.01415659, + "auxiliary_loss_mlp": 0.0103676, + "balance_loss_clip": 1.25016499, + "balance_loss_mlp": 1.01709032, + "epoch": 0.5816924695626033, + "flos": 24145217038080.0, + "grad_norm": 1.646159890888627, + "language_loss": 0.79586762, + "learning_rate": 1.571246172811984e-06, + "loss": 0.82039183, + "num_input_tokens_seen": 208411755, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19665527, + "step": 9675, + "time_per_iteration": 4.34076452255249 + }, + { + "auxiliary_loss_clip": 0.01424015, + "auxiliary_loss_mlp": 0.01036742, + "balance_loss_clip": 1.257146, + "balance_loss_mlp": 1.01609516, + "epoch": 0.5817525928152713, + "flos": 21334049205120.0, + "grad_norm": 2.8290256239270213, + "language_loss": 0.70876747, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.73337501, + "num_input_tokens_seen": 208429995, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.20654297, + "step": 9676, + "time_per_iteration": 2.8427958488464355 + }, + { + "auxiliary_loss_clip": 0.01424719, + "auxiliary_loss_mlp": 0.01040486, + "balance_loss_clip": 1.25625145, + "balance_loss_mlp": 1.02075696, + "epoch": 0.5818127160679393, + "flos": 26943580592640.0, + "grad_norm": 2.560486599265088, + "language_loss": 0.6486333, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.67328537, + "num_input_tokens_seen": 208443655, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.19726562, + "step": 9677, + "time_per_iteration": 2.8650524616241455 + }, + { + "auxiliary_loss_clip": 0.01218561, + "auxiliary_loss_mlp": 0.01033328, + "balance_loss_clip": 1.12005293, + "balance_loss_mlp": 1.00509918, + "epoch": 0.5818728393206073, + "flos": 63953002483200.0, + "grad_norm": 0.8048764321380758, + "language_loss": 0.54221833, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.56473732, + "num_input_tokens_seen": 208498405, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.28320312, + "step": 9678, + "time_per_iteration": 3.4022960662841797 + }, + { + "auxiliary_loss_clip": 0.0121324, + "auxiliary_loss_mlp": 0.01037177, + "balance_loss_clip": 1.11514449, + "balance_loss_mlp": 1.01104641, + "epoch": 0.5819329625732752, + "flos": 64982886059520.0, + "grad_norm": 0.7487936075394347, + "language_loss": 0.56282324, + "learning_rate": 1.569724674667319e-06, + "loss": 0.58532739, + "num_input_tokens_seen": 208559075, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.26171875, + "step": 9679, + "time_per_iteration": 3.2118566036224365 + }, + { + "auxiliary_loss_clip": 0.01423609, + "auxiliary_loss_mlp": 0.01042017, + "balance_loss_clip": 1.25541556, + "balance_loss_mlp": 1.02142978, + "epoch": 0.5819930858259432, + "flos": 21225198919680.0, + "grad_norm": 1.5938441270552801, + "language_loss": 0.66086036, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.6855166, + "num_input_tokens_seen": 208577770, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.20581055, + "step": 9680, + "time_per_iteration": 2.852072238922119 + }, + { + "auxiliary_loss_clip": 0.01422856, + "auxiliary_loss_mlp": 0.01034671, + "balance_loss_clip": 1.25579154, + "balance_loss_mlp": 1.01415455, + "epoch": 0.5820532090786111, + "flos": 19466582307840.0, + "grad_norm": 2.2313199376645008, + "language_loss": 0.8416512, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.86622649, + "num_input_tokens_seen": 208595110, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.20507812, + "step": 9681, + "time_per_iteration": 2.8397393226623535 + }, + { + "auxiliary_loss_clip": 0.0141893, + "auxiliary_loss_mlp": 0.01036431, + "balance_loss_clip": 1.25355279, + "balance_loss_mlp": 1.01616573, + "epoch": 0.5821133323312792, + "flos": 17721629625600.0, + "grad_norm": 1.791458377621193, + "language_loss": 0.76984024, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.79439384, + "num_input_tokens_seen": 208612080, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20275879, + "step": 9682, + "time_per_iteration": 2.833104372024536 + }, + { + "auxiliary_loss_clip": 0.01427628, + "auxiliary_loss_mlp": 0.01035688, + "balance_loss_clip": 1.2572161, + "balance_loss_mlp": 1.01495767, + "epoch": 0.5821734555839471, + "flos": 24582563706240.0, + "grad_norm": 2.1529484069121465, + "language_loss": 0.75832772, + "learning_rate": 1.568203437579977e-06, + "loss": 0.78296089, + "num_input_tokens_seen": 208630235, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.20727539, + "step": 9683, + "time_per_iteration": 2.8651559352874756 + }, + { + "auxiliary_loss_clip": 0.01432917, + "auxiliary_loss_mlp": 0.01033727, + "balance_loss_clip": 1.26151764, + "balance_loss_mlp": 1.01363981, + "epoch": 0.5822335788366151, + "flos": 22392346515840.0, + "grad_norm": 2.2989960855594154, + "language_loss": 0.74830556, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.77297199, + "num_input_tokens_seen": 208647925, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.20092773, + "step": 9684, + "time_per_iteration": 2.8428680896759033 + }, + { + "auxiliary_loss_clip": 0.01429947, + "auxiliary_loss_mlp": 0.01043939, + "balance_loss_clip": 1.26024687, + "balance_loss_mlp": 1.02317262, + "epoch": 0.582293702089283, + "flos": 26733300168960.0, + "grad_norm": 2.161050554772547, + "language_loss": 0.79112697, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.81586587, + "num_input_tokens_seen": 208666180, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.2076416, + "step": 9685, + "time_per_iteration": 2.8790948390960693 + }, + { + "auxiliary_loss_clip": 0.01425972, + "auxiliary_loss_mlp": 0.0104226, + "balance_loss_clip": 1.25707865, + "balance_loss_mlp": 1.02180338, + "epoch": 0.582353825341951, + "flos": 17357815036800.0, + "grad_norm": 1.693622388726344, + "language_loss": 0.76161206, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.7862944, + "num_input_tokens_seen": 208684240, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.20458984, + "step": 9686, + "time_per_iteration": 2.8611233234405518 + }, + { + "auxiliary_loss_clip": 0.01212111, + "auxiliary_loss_mlp": 0.01024441, + "balance_loss_clip": 1.117311, + "balance_loss_mlp": 0.9971658, + "epoch": 0.5824139485946189, + "flos": 55500993861120.0, + "grad_norm": 0.8142254599066522, + "language_loss": 0.57390594, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.59627151, + "num_input_tokens_seen": 208736090, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.2734375, + "step": 9687, + "time_per_iteration": 3.1717722415924072 + }, + { + "auxiliary_loss_clip": 0.01426124, + "auxiliary_loss_mlp": 0.01032931, + "balance_loss_clip": 1.25716054, + "balance_loss_mlp": 1.01165211, + "epoch": 0.582474071847287, + "flos": 20312698913280.0, + "grad_norm": 3.0611420396577205, + "language_loss": 0.70981491, + "learning_rate": 1.566302259738727e-06, + "loss": 0.73440552, + "num_input_tokens_seen": 208754600, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.21264648, + "step": 9688, + "time_per_iteration": 2.8849117755889893 + }, + { + "auxiliary_loss_clip": 0.01418339, + "auxiliary_loss_mlp": 0.0103461, + "balance_loss_clip": 1.25107467, + "balance_loss_mlp": 1.01461875, + "epoch": 0.5825341950999549, + "flos": 23888442942720.0, + "grad_norm": 2.397390264003874, + "language_loss": 0.66238022, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.68690968, + "num_input_tokens_seen": 208773140, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.1998291, + "step": 9689, + "time_per_iteration": 2.896416425704956 + }, + { + "auxiliary_loss_clip": 0.01421131, + "auxiliary_loss_mlp": 0.01039059, + "balance_loss_clip": 1.25619447, + "balance_loss_mlp": 1.0177089, + "epoch": 0.5825943183526229, + "flos": 23123595277440.0, + "grad_norm": 1.8927864218734844, + "language_loss": 0.7424801, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.76708198, + "num_input_tokens_seen": 208793410, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.21350098, + "step": 9690, + "time_per_iteration": 2.8680717945098877 + }, + { + "auxiliary_loss_clip": 0.01424799, + "auxiliary_loss_mlp": 0.01040344, + "balance_loss_clip": 1.2553612, + "balance_loss_mlp": 1.01815951, + "epoch": 0.5826544416052909, + "flos": 22867861812480.0, + "grad_norm": 1.8665242399882351, + "language_loss": 0.76331794, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.78796935, + "num_input_tokens_seen": 208811920, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.22180176, + "step": 9691, + "time_per_iteration": 2.839571237564087 + }, + { + "auxiliary_loss_clip": 0.01412853, + "auxiliary_loss_mlp": 0.01035155, + "balance_loss_clip": 1.24444783, + "balance_loss_mlp": 1.01475811, + "epoch": 0.5827145648579588, + "flos": 31513817485440.0, + "grad_norm": 1.6841847879511873, + "language_loss": 0.81338853, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.83786863, + "num_input_tokens_seen": 208834720, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.20410156, + "step": 9692, + "time_per_iteration": 2.8897199630737305 + }, + { + "auxiliary_loss_clip": 0.01208009, + "auxiliary_loss_mlp": 0.01023425, + "balance_loss_clip": 1.11375308, + "balance_loss_mlp": 0.9967224, + "epoch": 0.5827746881106268, + "flos": 69843179237760.0, + "grad_norm": 0.7596682392227048, + "language_loss": 0.56978083, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.59209514, + "num_input_tokens_seen": 208898415, + "router_z_loss_clip": 0.94140625, + "router_z_loss_mlp": 0.26757812, + "step": 9693, + "time_per_iteration": 3.330054998397827 + }, + { + "auxiliary_loss_clip": 0.01419218, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.25144732, + "balance_loss_mlp": 1.0171206, + "epoch": 0.5828348113632947, + "flos": 23122780871040.0, + "grad_norm": 1.764733363679559, + "language_loss": 0.79567814, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.82025248, + "num_input_tokens_seen": 208919045, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.2109375, + "step": 9694, + "time_per_iteration": 4.3018248081207275 + }, + { + "auxiliary_loss_clip": 0.01396337, + "auxiliary_loss_mlp": 0.01032469, + "balance_loss_clip": 1.23617756, + "balance_loss_mlp": 1.01338339, + "epoch": 0.5828949346159628, + "flos": 21883458539520.0, + "grad_norm": 1.3184454041143983, + "language_loss": 0.77102661, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.79531467, + "num_input_tokens_seen": 208939375, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19091797, + "step": 9695, + "time_per_iteration": 2.8470022678375244 + }, + { + "auxiliary_loss_clip": 0.01212185, + "auxiliary_loss_mlp": 0.01023038, + "balance_loss_clip": 1.11669779, + "balance_loss_mlp": 0.99614471, + "epoch": 0.5829550578686307, + "flos": 65997648593280.0, + "grad_norm": 0.7769868158984526, + "language_loss": 0.55081832, + "learning_rate": 1.563261231127095e-06, + "loss": 0.57317054, + "num_input_tokens_seen": 209004760, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.26953125, + "step": 9696, + "time_per_iteration": 3.3739840984344482 + }, + { + "auxiliary_loss_clip": 0.01425434, + "auxiliary_loss_mlp": 0.01033495, + "balance_loss_clip": 1.25724792, + "balance_loss_mlp": 1.01299107, + "epoch": 0.5830151811212987, + "flos": 16298477095680.0, + "grad_norm": 2.1411064077493345, + "language_loss": 0.77801967, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.80260897, + "num_input_tokens_seen": 209022930, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.20495605, + "step": 9697, + "time_per_iteration": 2.817671060562134 + }, + { + "auxiliary_loss_clip": 0.0142503, + "auxiliary_loss_mlp": 0.01039466, + "balance_loss_clip": 1.25431085, + "balance_loss_mlp": 1.01768613, + "epoch": 0.5830753043739666, + "flos": 24179087410560.0, + "grad_norm": 1.715919874126014, + "language_loss": 0.7821691, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.80681401, + "num_input_tokens_seen": 209043740, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.21777344, + "step": 9698, + "time_per_iteration": 2.8826565742492676 + }, + { + "auxiliary_loss_clip": 0.01411112, + "auxiliary_loss_mlp": 0.01039547, + "balance_loss_clip": 1.24587297, + "balance_loss_mlp": 1.01881647, + "epoch": 0.5831354276266346, + "flos": 27072067121280.0, + "grad_norm": 1.8108074775284826, + "language_loss": 0.84124774, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.86575437, + "num_input_tokens_seen": 209068885, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.20739746, + "step": 9699, + "time_per_iteration": 2.9753732681274414 + }, + { + "auxiliary_loss_clip": 0.01421283, + "auxiliary_loss_mlp": 0.01036062, + "balance_loss_clip": 1.25099277, + "balance_loss_mlp": 1.01486611, + "epoch": 0.5831955508793025, + "flos": 23634066821760.0, + "grad_norm": 2.2487772090029763, + "language_loss": 0.66762424, + "learning_rate": 1.561741113828305e-06, + "loss": 0.69219768, + "num_input_tokens_seen": 209087340, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.21191406, + "step": 9700, + "time_per_iteration": 2.955373764038086 + }, + { + "auxiliary_loss_clip": 0.01423996, + "auxiliary_loss_mlp": 0.01039121, + "balance_loss_clip": 1.25561738, + "balance_loss_mlp": 1.01743698, + "epoch": 0.5832556741319705, + "flos": 24984139720320.0, + "grad_norm": 1.5813117954852438, + "language_loss": 0.71552503, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.74015617, + "num_input_tokens_seen": 209108840, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.21691895, + "step": 9701, + "time_per_iteration": 2.923311710357666 + }, + { + "auxiliary_loss_clip": 0.01408708, + "auxiliary_loss_mlp": 0.01036997, + "balance_loss_clip": 1.2418226, + "balance_loss_mlp": 1.01636207, + "epoch": 0.5833157973846385, + "flos": 23231902625280.0, + "grad_norm": 13.706303535976698, + "language_loss": 0.86112154, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.88557857, + "num_input_tokens_seen": 209127985, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.2064209, + "step": 9702, + "time_per_iteration": 2.8524200916290283 + }, + { + "auxiliary_loss_clip": 0.01405209, + "auxiliary_loss_mlp": 0.01036295, + "balance_loss_clip": 1.24109197, + "balance_loss_mlp": 1.01612508, + "epoch": 0.5833759206373065, + "flos": 21987150917760.0, + "grad_norm": 1.5600128491478518, + "language_loss": 0.78371543, + "learning_rate": 1.560601200301392e-06, + "loss": 0.8081305, + "num_input_tokens_seen": 209146885, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.20178223, + "step": 9703, + "time_per_iteration": 2.8563156127929688 + }, + { + "auxiliary_loss_clip": 0.01422738, + "auxiliary_loss_mlp": 0.01036705, + "balance_loss_clip": 1.25290561, + "balance_loss_mlp": 1.01642728, + "epoch": 0.5834360438899745, + "flos": 21772436503680.0, + "grad_norm": 1.7458935014644952, + "language_loss": 0.72066379, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.74525821, + "num_input_tokens_seen": 209166130, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.20275879, + "step": 9704, + "time_per_iteration": 2.8372626304626465 + }, + { + "auxiliary_loss_clip": 0.01423305, + "auxiliary_loss_mlp": 0.01037785, + "balance_loss_clip": 1.25456917, + "balance_loss_mlp": 1.01778185, + "epoch": 0.5834961671426424, + "flos": 15999779053440.0, + "grad_norm": 1.7348376578404974, + "language_loss": 0.82279408, + "learning_rate": 1.559841341236335e-06, + "loss": 0.84740496, + "num_input_tokens_seen": 209183350, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.1998291, + "step": 9705, + "time_per_iteration": 2.842571496963501 + }, + { + "auxiliary_loss_clip": 0.01421575, + "auxiliary_loss_mlp": 0.01032375, + "balance_loss_clip": 1.25360191, + "balance_loss_mlp": 1.01265836, + "epoch": 0.5835562903953104, + "flos": 22828064371200.0, + "grad_norm": 1.6209014300463165, + "language_loss": 0.80654061, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.83108008, + "num_input_tokens_seen": 209203945, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.1973877, + "step": 9706, + "time_per_iteration": 2.8453478813171387 + }, + { + "auxiliary_loss_clip": 0.01415916, + "auxiliary_loss_mlp": 0.0103734, + "balance_loss_clip": 1.25005603, + "balance_loss_mlp": 1.01638305, + "epoch": 0.5836164136479783, + "flos": 48484516861440.0, + "grad_norm": 1.747309918742351, + "language_loss": 0.75973254, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.7842651, + "num_input_tokens_seen": 209227080, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.20947266, + "step": 9707, + "time_per_iteration": 4.4875876903533936 + }, + { + "auxiliary_loss_clip": 0.0140191, + "auxiliary_loss_mlp": 0.01033908, + "balance_loss_clip": 1.23857784, + "balance_loss_mlp": 1.01390481, + "epoch": 0.5836765369006464, + "flos": 26917130367360.0, + "grad_norm": 2.062395980263255, + "language_loss": 0.82534826, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.84970641, + "num_input_tokens_seen": 209248170, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.20007324, + "step": 9708, + "time_per_iteration": 2.88557505607605 + }, + { + "auxiliary_loss_clip": 0.01414949, + "auxiliary_loss_mlp": 0.01036852, + "balance_loss_clip": 1.25026846, + "balance_loss_mlp": 1.01664567, + "epoch": 0.5837366601533143, + "flos": 20093640998400.0, + "grad_norm": 1.5586350508639089, + "language_loss": 0.79298002, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.81749803, + "num_input_tokens_seen": 209267730, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.2019043, + "step": 9709, + "time_per_iteration": 2.864179849624634 + }, + { + "auxiliary_loss_clip": 0.01210554, + "auxiliary_loss_mlp": 0.01046193, + "balance_loss_clip": 1.11683726, + "balance_loss_mlp": 1.02473533, + "epoch": 0.5837967834059823, + "flos": 65394958867200.0, + "grad_norm": 0.7706546265753152, + "language_loss": 0.56527823, + "learning_rate": 1.557941985915844e-06, + "loss": 0.58784568, + "num_input_tokens_seen": 209332510, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.21484375, + "step": 9710, + "time_per_iteration": 6.105860233306885 + }, + { + "auxiliary_loss_clip": 0.0141018, + "auxiliary_loss_mlp": 0.0103495, + "balance_loss_clip": 1.24632573, + "balance_loss_mlp": 1.0146482, + "epoch": 0.5838569066586502, + "flos": 25349764101120.0, + "grad_norm": 1.45963145397657, + "language_loss": 0.66170299, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.68615431, + "num_input_tokens_seen": 209353355, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.203125, + "step": 9711, + "time_per_iteration": 2.8632452487945557 + }, + { + "auxiliary_loss_clip": 0.0143401, + "auxiliary_loss_mlp": 0.01039902, + "balance_loss_clip": 1.26131749, + "balance_loss_mlp": 1.01838458, + "epoch": 0.5839170299113182, + "flos": 22237455006720.0, + "grad_norm": 1.918735942555622, + "language_loss": 0.7976433, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.82238245, + "num_input_tokens_seen": 209370960, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.21520996, + "step": 9712, + "time_per_iteration": 2.8469126224517822 + }, + { + "auxiliary_loss_clip": 0.01409694, + "auxiliary_loss_mlp": 0.01032393, + "balance_loss_clip": 1.24368906, + "balance_loss_mlp": 1.01210403, + "epoch": 0.5839771531639861, + "flos": 22210099885440.0, + "grad_norm": 8.447012770526932, + "language_loss": 0.7393434, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.76376426, + "num_input_tokens_seen": 209390955, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.20300293, + "step": 9713, + "time_per_iteration": 2.8470218181610107 + }, + { + "auxiliary_loss_clip": 0.01426276, + "auxiliary_loss_mlp": 0.01038829, + "balance_loss_clip": 1.25399649, + "balance_loss_mlp": 1.0172286, + "epoch": 0.5840372764166541, + "flos": 22429610248320.0, + "grad_norm": 5.5438793445392385, + "language_loss": 0.70712888, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.73177993, + "num_input_tokens_seen": 209410260, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.21606445, + "step": 9714, + "time_per_iteration": 2.842230796813965 + }, + { + "auxiliary_loss_clip": 0.01421276, + "auxiliary_loss_mlp": 0.01033731, + "balance_loss_clip": 1.251351, + "balance_loss_mlp": 1.01242852, + "epoch": 0.5840973996693221, + "flos": 19837681309440.0, + "grad_norm": 1.8134360816003714, + "language_loss": 0.80482936, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.82937944, + "num_input_tokens_seen": 209429920, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.21313477, + "step": 9715, + "time_per_iteration": 3.0888350009918213 + }, + { + "auxiliary_loss_clip": 0.01406116, + "auxiliary_loss_mlp": 0.01031155, + "balance_loss_clip": 1.24066699, + "balance_loss_mlp": 1.0110209, + "epoch": 0.5841575229219901, + "flos": 21153069429120.0, + "grad_norm": 1.989117865866564, + "language_loss": 0.74667025, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.77104294, + "num_input_tokens_seen": 209449470, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.20129395, + "step": 9716, + "time_per_iteration": 2.8566577434539795 + }, + { + "auxiliary_loss_clip": 0.01400602, + "auxiliary_loss_mlp": 0.01035034, + "balance_loss_clip": 1.23708248, + "balance_loss_mlp": 1.01506674, + "epoch": 0.5842176461746581, + "flos": 24649942492800.0, + "grad_norm": 1.8996633850271123, + "language_loss": 0.75274879, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.77710509, + "num_input_tokens_seen": 209467695, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19970703, + "step": 9717, + "time_per_iteration": 2.8826334476470947 + }, + { + "auxiliary_loss_clip": 0.01411661, + "auxiliary_loss_mlp": 0.01038806, + "balance_loss_clip": 1.24576128, + "balance_loss_mlp": 1.01852894, + "epoch": 0.584277769427326, + "flos": 19139398024320.0, + "grad_norm": 2.2245987653615007, + "language_loss": 0.81622529, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.84072989, + "num_input_tokens_seen": 209484250, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.20275879, + "step": 9718, + "time_per_iteration": 2.849442481994629 + }, + { + "auxiliary_loss_clip": 0.01406182, + "auxiliary_loss_mlp": 0.01036291, + "balance_loss_clip": 1.24128866, + "balance_loss_mlp": 1.01508343, + "epoch": 0.584337892679994, + "flos": 22685524692480.0, + "grad_norm": 3.3902088437828617, + "language_loss": 0.68191838, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.70634311, + "num_input_tokens_seen": 209502830, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.2121582, + "step": 9719, + "time_per_iteration": 2.8596949577331543 + }, + { + "auxiliary_loss_clip": 0.01408322, + "auxiliary_loss_mlp": 0.01035513, + "balance_loss_clip": 1.24099588, + "balance_loss_mlp": 1.01447237, + "epoch": 0.5843980159326619, + "flos": 31296297893760.0, + "grad_norm": 3.0566714578921683, + "language_loss": 0.76728868, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.79172707, + "num_input_tokens_seen": 209525995, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.21044922, + "step": 9720, + "time_per_iteration": 2.942124128341675 + }, + { + "auxiliary_loss_clip": 0.01419948, + "auxiliary_loss_mlp": 0.01039574, + "balance_loss_clip": 1.25076485, + "balance_loss_mlp": 1.01897478, + "epoch": 0.58445813918533, + "flos": 22758559079040.0, + "grad_norm": 1.6918560297080556, + "language_loss": 0.83246863, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.85706377, + "num_input_tokens_seen": 209545895, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.20605469, + "step": 9721, + "time_per_iteration": 2.8595967292785645 + }, + { + "auxiliary_loss_clip": 0.01203196, + "auxiliary_loss_mlp": 0.010337, + "balance_loss_clip": 1.11259675, + "balance_loss_mlp": 1.00985813, + "epoch": 0.5845182624379979, + "flos": 60714559589760.0, + "grad_norm": 0.9330605472687717, + "language_loss": 0.71373463, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.73610353, + "num_input_tokens_seen": 209602315, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.23828125, + "step": 9722, + "time_per_iteration": 3.398693561553955 + }, + { + "auxiliary_loss_clip": 0.01409672, + "auxiliary_loss_mlp": 0.01035609, + "balance_loss_clip": 1.24361157, + "balance_loss_mlp": 1.01577246, + "epoch": 0.5845783856906659, + "flos": 16371059034240.0, + "grad_norm": 2.3128844263992177, + "language_loss": 0.90350217, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.92795497, + "num_input_tokens_seen": 209617615, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.19836426, + "step": 9723, + "time_per_iteration": 2.807891368865967 + }, + { + "auxiliary_loss_clip": 0.01403476, + "auxiliary_loss_mlp": 0.01036325, + "balance_loss_clip": 1.23950005, + "balance_loss_mlp": 1.01647615, + "epoch": 0.5846385089433338, + "flos": 20092826592000.0, + "grad_norm": 1.6512253282716858, + "language_loss": 0.69302917, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.71742713, + "num_input_tokens_seen": 209637005, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19836426, + "step": 9724, + "time_per_iteration": 2.871147632598877 + }, + { + "auxiliary_loss_clip": 0.01413641, + "auxiliary_loss_mlp": 0.01035827, + "balance_loss_clip": 1.24495327, + "balance_loss_mlp": 1.01527524, + "epoch": 0.5846986321960018, + "flos": 17319917877120.0, + "grad_norm": 2.25761214802707, + "language_loss": 0.87255704, + "learning_rate": 1.552246441587197e-06, + "loss": 0.89705175, + "num_input_tokens_seen": 209653170, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.20544434, + "step": 9725, + "time_per_iteration": 2.8130106925964355 + }, + { + "auxiliary_loss_clip": 0.01429999, + "auxiliary_loss_mlp": 0.01041097, + "balance_loss_clip": 1.2596103, + "balance_loss_mlp": 1.02009273, + "epoch": 0.5847587554486697, + "flos": 17205773950080.0, + "grad_norm": 1.5457931101314528, + "language_loss": 0.83382905, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.85853994, + "num_input_tokens_seen": 209671275, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.20996094, + "step": 9726, + "time_per_iteration": 2.8753154277801514 + }, + { + "auxiliary_loss_clip": 0.0143113, + "auxiliary_loss_mlp": 0.01034952, + "balance_loss_clip": 1.26199031, + "balance_loss_mlp": 1.01448333, + "epoch": 0.5848188787013378, + "flos": 24538241784960.0, + "grad_norm": 1.9164949846457244, + "language_loss": 0.67620242, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.70086324, + "num_input_tokens_seen": 209690380, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.20458984, + "step": 9727, + "time_per_iteration": 2.870342493057251 + }, + { + "auxiliary_loss_clip": 0.01421995, + "auxiliary_loss_mlp": 0.01037348, + "balance_loss_clip": 1.25446153, + "balance_loss_mlp": 1.01770234, + "epoch": 0.5848790019540057, + "flos": 20637349488000.0, + "grad_norm": 1.7431372379091086, + "language_loss": 0.82298315, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.84757662, + "num_input_tokens_seen": 209708845, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.19641113, + "step": 9728, + "time_per_iteration": 2.8334927558898926 + }, + { + "auxiliary_loss_clip": 0.01395764, + "auxiliary_loss_mlp": 0.01035487, + "balance_loss_clip": 1.23472357, + "balance_loss_mlp": 1.01588917, + "epoch": 0.5849391252066737, + "flos": 22428976821120.0, + "grad_norm": 2.399218908270919, + "language_loss": 0.78991067, + "learning_rate": 1.550728272957027e-06, + "loss": 0.81422317, + "num_input_tokens_seen": 209729000, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.19592285, + "step": 9729, + "time_per_iteration": 2.8994829654693604 + }, + { + "auxiliary_loss_clip": 0.01415175, + "auxiliary_loss_mlp": 0.01041196, + "balance_loss_clip": 1.24756444, + "balance_loss_mlp": 1.02062023, + "epoch": 0.5849992484593417, + "flos": 25421893591680.0, + "grad_norm": 1.902152354615498, + "language_loss": 0.71950924, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.74407297, + "num_input_tokens_seen": 209747435, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.20581055, + "step": 9730, + "time_per_iteration": 4.361703634262085 + }, + { + "auxiliary_loss_clip": 0.01413202, + "auxiliary_loss_mlp": 0.01035922, + "balance_loss_clip": 1.24465179, + "balance_loss_mlp": 1.01511991, + "epoch": 0.5850593717120096, + "flos": 21073972239360.0, + "grad_norm": 1.5852362300709648, + "language_loss": 0.79171592, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.81620717, + "num_input_tokens_seen": 209764910, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.20812988, + "step": 9731, + "time_per_iteration": 2.822007179260254 + }, + { + "auxiliary_loss_clip": 0.01405986, + "auxiliary_loss_mlp": 0.0103639, + "balance_loss_clip": 1.23977923, + "balance_loss_mlp": 1.01688719, + "epoch": 0.5851194949646776, + "flos": 25312500368640.0, + "grad_norm": 5.280574158717184, + "language_loss": 0.7115221, + "learning_rate": 1.549589825316528e-06, + "loss": 0.73594582, + "num_input_tokens_seen": 209786115, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19482422, + "step": 9732, + "time_per_iteration": 2.886577606201172 + }, + { + "auxiliary_loss_clip": 0.01430701, + "auxiliary_loss_mlp": 0.0104181, + "balance_loss_clip": 1.25970888, + "balance_loss_mlp": 1.0203042, + "epoch": 0.5851796182173455, + "flos": 23597888964480.0, + "grad_norm": 1.7887547448390424, + "language_loss": 0.5388974, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.56362259, + "num_input_tokens_seen": 209806095, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.21508789, + "step": 9733, + "time_per_iteration": 2.872915744781494 + }, + { + "auxiliary_loss_clip": 0.01417814, + "auxiliary_loss_mlp": 0.01043549, + "balance_loss_clip": 1.25116599, + "balance_loss_mlp": 1.02167439, + "epoch": 0.5852397414700136, + "flos": 24832370102400.0, + "grad_norm": 2.0317040395203176, + "language_loss": 0.88521552, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.90982914, + "num_input_tokens_seen": 209823650, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.21875, + "step": 9734, + "time_per_iteration": 2.8412466049194336 + }, + { + "auxiliary_loss_clip": 0.01395394, + "auxiliary_loss_mlp": 0.01039559, + "balance_loss_clip": 1.23458493, + "balance_loss_mlp": 1.01912642, + "epoch": 0.5852998647226815, + "flos": 19947255511680.0, + "grad_norm": 1.7300119149582647, + "language_loss": 0.72994834, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.75429785, + "num_input_tokens_seen": 209843220, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.2043457, + "step": 9735, + "time_per_iteration": 2.8420207500457764 + }, + { + "auxiliary_loss_clip": 0.01420021, + "auxiliary_loss_mlp": 0.0104309, + "balance_loss_clip": 1.25127113, + "balance_loss_mlp": 1.02268147, + "epoch": 0.5853599879753495, + "flos": 16727272496640.0, + "grad_norm": 2.8425307563094337, + "language_loss": 0.75422007, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.77885121, + "num_input_tokens_seen": 209854880, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.20410156, + "step": 9736, + "time_per_iteration": 2.806025981903076 + }, + { + "auxiliary_loss_clip": 0.01409224, + "auxiliary_loss_mlp": 0.01033808, + "balance_loss_clip": 1.24268341, + "balance_loss_mlp": 1.01418591, + "epoch": 0.5854201112280174, + "flos": 44472512039040.0, + "grad_norm": 1.4823143901071398, + "language_loss": 0.70851648, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.73294681, + "num_input_tokens_seen": 209877870, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.19628906, + "step": 9737, + "time_per_iteration": 3.0624618530273438 + }, + { + "auxiliary_loss_clip": 0.01409191, + "auxiliary_loss_mlp": 0.01036589, + "balance_loss_clip": 1.24556518, + "balance_loss_mlp": 1.01672864, + "epoch": 0.5854802344806854, + "flos": 20348786280960.0, + "grad_norm": 1.8397011856927914, + "language_loss": 0.83081049, + "learning_rate": 1.547313391573169e-06, + "loss": 0.8552683, + "num_input_tokens_seen": 209896690, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.1986084, + "step": 9738, + "time_per_iteration": 2.8310129642486572 + }, + { + "auxiliary_loss_clip": 0.01421359, + "auxiliary_loss_mlp": 0.01037443, + "balance_loss_clip": 1.25210381, + "balance_loss_mlp": 1.01680779, + "epoch": 0.5855403577333533, + "flos": 20930301440640.0, + "grad_norm": 1.6962774774899998, + "language_loss": 0.69210565, + "learning_rate": 1.546934045946082e-06, + "loss": 0.71669364, + "num_input_tokens_seen": 209914640, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.20629883, + "step": 9739, + "time_per_iteration": 2.84096097946167 + }, + { + "auxiliary_loss_clip": 0.01424089, + "auxiliary_loss_mlp": 0.01035323, + "balance_loss_clip": 1.25490987, + "balance_loss_mlp": 1.01585603, + "epoch": 0.5856004809860214, + "flos": 20458269993600.0, + "grad_norm": 2.2719069233604765, + "language_loss": 0.59862345, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.62321758, + "num_input_tokens_seen": 209933375, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.19445801, + "step": 9740, + "time_per_iteration": 2.8300857543945312 + }, + { + "auxiliary_loss_clip": 0.01416964, + "auxiliary_loss_mlp": 0.01037105, + "balance_loss_clip": 1.2482996, + "balance_loss_mlp": 1.0160284, + "epoch": 0.5856606042386893, + "flos": 19648828938240.0, + "grad_norm": 1.7155908534521276, + "language_loss": 0.75711799, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.78165865, + "num_input_tokens_seen": 209952055, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.21081543, + "step": 9741, + "time_per_iteration": 2.8338370323181152 + }, + { + "auxiliary_loss_clip": 0.0141827, + "auxiliary_loss_mlp": 0.01038246, + "balance_loss_clip": 1.25143301, + "balance_loss_mlp": 1.01782513, + "epoch": 0.5857207274913573, + "flos": 21695330085120.0, + "grad_norm": 1.6953138960915284, + "language_loss": 0.76763189, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.79219699, + "num_input_tokens_seen": 209971190, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.2043457, + "step": 9742, + "time_per_iteration": 4.276289224624634 + }, + { + "auxiliary_loss_clip": 0.01404172, + "auxiliary_loss_mlp": 0.01038729, + "balance_loss_clip": 1.23949766, + "balance_loss_mlp": 1.01894069, + "epoch": 0.5857808507440253, + "flos": 23192421897600.0, + "grad_norm": 2.454588667504928, + "language_loss": 0.76001614, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.78444517, + "num_input_tokens_seen": 209990695, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.19787598, + "step": 9743, + "time_per_iteration": 2.9690585136413574 + }, + { + "auxiliary_loss_clip": 0.01408785, + "auxiliary_loss_mlp": 0.01033618, + "balance_loss_clip": 1.24623775, + "balance_loss_mlp": 1.01304245, + "epoch": 0.5858409739966932, + "flos": 27246757870080.0, + "grad_norm": 1.7325491897041374, + "language_loss": 0.81814384, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.8425678, + "num_input_tokens_seen": 210010210, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.20581055, + "step": 9744, + "time_per_iteration": 4.278918981552124 + }, + { + "auxiliary_loss_clip": 0.01434043, + "auxiliary_loss_mlp": 0.01033649, + "balance_loss_clip": 1.26322079, + "balance_loss_mlp": 1.01383615, + "epoch": 0.5859010972493612, + "flos": 27867120330240.0, + "grad_norm": 1.9582588728380177, + "language_loss": 0.72696918, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.75164616, + "num_input_tokens_seen": 210030030, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.19787598, + "step": 9745, + "time_per_iteration": 4.308863639831543 + }, + { + "auxiliary_loss_clip": 0.01205847, + "auxiliary_loss_mlp": 0.01033582, + "balance_loss_clip": 1.1119597, + "balance_loss_mlp": 1.01269686, + "epoch": 0.5859612205020291, + "flos": 70041804485760.0, + "grad_norm": 0.7310070387014701, + "language_loss": 0.53341258, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.55580688, + "num_input_tokens_seen": 210094840, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.20898438, + "step": 9746, + "time_per_iteration": 3.462275266647339 + }, + { + "auxiliary_loss_clip": 0.01422557, + "auxiliary_loss_mlp": 0.01035695, + "balance_loss_clip": 1.25373399, + "balance_loss_mlp": 1.01380777, + "epoch": 0.5860213437546972, + "flos": 24065350686720.0, + "grad_norm": 1.9716939642908846, + "language_loss": 0.74194396, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.76652646, + "num_input_tokens_seen": 210114660, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.21899414, + "step": 9747, + "time_per_iteration": 2.859715461730957 + }, + { + "auxiliary_loss_clip": 0.01421859, + "auxiliary_loss_mlp": 0.01037077, + "balance_loss_clip": 1.25294447, + "balance_loss_mlp": 1.0159651, + "epoch": 0.5860814670073651, + "flos": 18955477336320.0, + "grad_norm": 1.9266191051005657, + "language_loss": 0.81247878, + "learning_rate": 1.543520710142051e-06, + "loss": 0.8370682, + "num_input_tokens_seen": 210132770, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.21118164, + "step": 9748, + "time_per_iteration": 2.849375009536743 + }, + { + "auxiliary_loss_clip": 0.01416903, + "auxiliary_loss_mlp": 0.01035463, + "balance_loss_clip": 1.2477957, + "balance_loss_mlp": 1.01371908, + "epoch": 0.5861415902600331, + "flos": 22571245031040.0, + "grad_norm": 1.6776698696459076, + "language_loss": 0.72543001, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.74995375, + "num_input_tokens_seen": 210151895, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.21740723, + "step": 9749, + "time_per_iteration": 2.828810691833496 + }, + { + "auxiliary_loss_clip": 0.01401564, + "auxiliary_loss_mlp": 0.01033462, + "balance_loss_clip": 1.23793912, + "balance_loss_mlp": 1.01234972, + "epoch": 0.586201713512701, + "flos": 14400080737920.0, + "grad_norm": 2.4014663441266713, + "language_loss": 0.75809509, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.78244537, + "num_input_tokens_seen": 210168040, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.21130371, + "step": 9750, + "time_per_iteration": 2.830185890197754 + }, + { + "auxiliary_loss_clip": 0.01404267, + "auxiliary_loss_mlp": 0.0103814, + "balance_loss_clip": 1.23937488, + "balance_loss_mlp": 1.01514459, + "epoch": 0.586261836765369, + "flos": 19507556113920.0, + "grad_norm": 1.6437950299689497, + "language_loss": 0.72261631, + "learning_rate": 1.542383242598344e-06, + "loss": 0.74704033, + "num_input_tokens_seen": 210187720, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.22998047, + "step": 9751, + "time_per_iteration": 2.8693981170654297 + }, + { + "auxiliary_loss_clip": 0.01435894, + "auxiliary_loss_mlp": 0.01031629, + "balance_loss_clip": 1.26125252, + "balance_loss_mlp": 1.01011157, + "epoch": 0.5863219600180369, + "flos": 20710926812160.0, + "grad_norm": 1.8699852607352607, + "language_loss": 0.75624955, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.7809248, + "num_input_tokens_seen": 210206080, + "router_z_loss_clip": 1.74609375, + "router_z_loss_mlp": 0.21520996, + "step": 9752, + "time_per_iteration": 2.8478024005889893 + }, + { + "auxiliary_loss_clip": 0.01419155, + "auxiliary_loss_mlp": 0.01036177, + "balance_loss_clip": 1.25143194, + "balance_loss_mlp": 1.01461196, + "epoch": 0.586382083270705, + "flos": 19801639186560.0, + "grad_norm": 1.8434144446791156, + "language_loss": 0.78529, + "learning_rate": 1.541625017642943e-06, + "loss": 0.8098433, + "num_input_tokens_seen": 210225660, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.21569824, + "step": 9753, + "time_per_iteration": 2.8551583290100098 + }, + { + "auxiliary_loss_clip": 0.01403275, + "auxiliary_loss_mlp": 0.01032861, + "balance_loss_clip": 1.24091196, + "balance_loss_mlp": 1.01147485, + "epoch": 0.5864422065233729, + "flos": 16507173951360.0, + "grad_norm": 1.7236126437697146, + "language_loss": 0.71891791, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.74327928, + "num_input_tokens_seen": 210242725, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.21386719, + "step": 9754, + "time_per_iteration": 2.977651357650757 + }, + { + "auxiliary_loss_clip": 0.0140937, + "auxiliary_loss_mlp": 0.01036289, + "balance_loss_clip": 1.24121499, + "balance_loss_mlp": 1.01499867, + "epoch": 0.5865023297760409, + "flos": 20423178011520.0, + "grad_norm": 1.8643068303245285, + "language_loss": 0.73119134, + "learning_rate": 1.540866862214043e-06, + "loss": 0.75564796, + "num_input_tokens_seen": 210263225, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.21313477, + "step": 9755, + "time_per_iteration": 2.94720721244812 + }, + { + "auxiliary_loss_clip": 0.01219343, + "auxiliary_loss_mlp": 0.01038992, + "balance_loss_clip": 1.12231946, + "balance_loss_mlp": 1.00446892, + "epoch": 0.5865624530287089, + "flos": 63379658649600.0, + "grad_norm": 0.7426920270506249, + "language_loss": 0.56943321, + "learning_rate": 1.540487810607967e-06, + "loss": 0.59201658, + "num_input_tokens_seen": 210322310, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.34570312, + "step": 9756, + "time_per_iteration": 3.354534387588501 + }, + { + "auxiliary_loss_clip": 0.01417797, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.2518692, + "balance_loss_mlp": 1.01282632, + "epoch": 0.5866225762813768, + "flos": 27027654710400.0, + "grad_norm": 2.115363992682989, + "language_loss": 0.77410996, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.7986182, + "num_input_tokens_seen": 210340845, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.20202637, + "step": 9757, + "time_per_iteration": 2.9005088806152344 + }, + { + "auxiliary_loss_clip": 0.01212804, + "auxiliary_loss_mlp": 0.0104296, + "balance_loss_clip": 1.11777997, + "balance_loss_mlp": 1.01959538, + "epoch": 0.5866826995340448, + "flos": 73019473758720.0, + "grad_norm": 0.85788848032698, + "language_loss": 0.60561132, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.62816906, + "num_input_tokens_seen": 210397815, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.23339844, + "step": 9758, + "time_per_iteration": 3.3590505123138428 + }, + { + "auxiliary_loss_clip": 0.0142059, + "auxiliary_loss_mlp": 0.01034691, + "balance_loss_clip": 1.24923062, + "balance_loss_mlp": 1.01250577, + "epoch": 0.5867428227867127, + "flos": 21295020925440.0, + "grad_norm": 3.6657645710813136, + "language_loss": 0.73211336, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.75666618, + "num_input_tokens_seen": 210413900, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.22180176, + "step": 9759, + "time_per_iteration": 2.8505609035491943 + }, + { + "auxiliary_loss_clip": 0.01407993, + "auxiliary_loss_mlp": 0.01041475, + "balance_loss_clip": 1.24138796, + "balance_loss_mlp": 1.02004147, + "epoch": 0.5868029460393808, + "flos": 33480497525760.0, + "grad_norm": 1.8508978607633233, + "language_loss": 0.73830545, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.7628001, + "num_input_tokens_seen": 210434110, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.21435547, + "step": 9760, + "time_per_iteration": 2.968231678009033 + }, + { + "auxiliary_loss_clip": 0.01410351, + "auxiliary_loss_mlp": 0.01034611, + "balance_loss_clip": 1.24344063, + "balance_loss_mlp": 1.01364183, + "epoch": 0.5868630692920487, + "flos": 17897089536000.0, + "grad_norm": 1.8065495457034064, + "language_loss": 0.73122311, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.75567269, + "num_input_tokens_seen": 210451685, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.20983887, + "step": 9761, + "time_per_iteration": 2.865251064300537 + }, + { + "auxiliary_loss_clip": 0.01413776, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.24372983, + "balance_loss_mlp": 1.0107106, + "epoch": 0.5869231925447167, + "flos": 21045485998080.0, + "grad_norm": 2.072671030302661, + "language_loss": 0.7596218, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.78409696, + "num_input_tokens_seen": 210470825, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.23046875, + "step": 9762, + "time_per_iteration": 2.865057945251465 + }, + { + "auxiliary_loss_clip": 0.01406283, + "auxiliary_loss_mlp": 0.01037817, + "balance_loss_clip": 1.2428292, + "balance_loss_mlp": 1.01577544, + "epoch": 0.5869833157973846, + "flos": 74761603280640.0, + "grad_norm": 1.4177386353826051, + "language_loss": 0.72976625, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.75420725, + "num_input_tokens_seen": 210500075, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.22021484, + "step": 9763, + "time_per_iteration": 3.285033702850342 + }, + { + "auxiliary_loss_clip": 0.01406929, + "auxiliary_loss_mlp": 0.0103598, + "balance_loss_clip": 1.2407577, + "balance_loss_mlp": 1.01474929, + "epoch": 0.5870434390500526, + "flos": 17647826077440.0, + "grad_norm": 1.6042708575208982, + "language_loss": 0.80775142, + "learning_rate": 1.53745602625755e-06, + "loss": 0.83218044, + "num_input_tokens_seen": 210518150, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.21228027, + "step": 9764, + "time_per_iteration": 2.8754215240478516 + }, + { + "auxiliary_loss_clip": 0.01409266, + "auxiliary_loss_mlp": 0.01036748, + "balance_loss_clip": 1.24235678, + "balance_loss_mlp": 1.01431251, + "epoch": 0.5871035623027205, + "flos": 21515888632320.0, + "grad_norm": 1.6824022303170003, + "language_loss": 0.79853934, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.82299948, + "num_input_tokens_seen": 210537760, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.2244873, + "step": 9765, + "time_per_iteration": 4.228884220123291 + }, + { + "auxiliary_loss_clip": 0.01395292, + "auxiliary_loss_mlp": 0.01034881, + "balance_loss_clip": 1.23209095, + "balance_loss_mlp": 1.01369715, + "epoch": 0.5871636855553886, + "flos": 13559438753280.0, + "grad_norm": 1.6642002589617249, + "language_loss": 0.8401767, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.86447847, + "num_input_tokens_seen": 210555515, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.21179199, + "step": 9766, + "time_per_iteration": 2.8089590072631836 + }, + { + "auxiliary_loss_clip": 0.01420132, + "auxiliary_loss_mlp": 0.0103856, + "balance_loss_clip": 1.24911237, + "balance_loss_mlp": 1.01749527, + "epoch": 0.5872238088080565, + "flos": 26223914499840.0, + "grad_norm": 1.5625989381754892, + "language_loss": 0.6981132, + "learning_rate": 1.536319396136257e-06, + "loss": 0.72270012, + "num_input_tokens_seen": 210575000, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.21057129, + "step": 9767, + "time_per_iteration": 2.8680531978607178 + }, + { + "auxiliary_loss_clip": 0.01415054, + "auxiliary_loss_mlp": 0.01035784, + "balance_loss_clip": 1.24596286, + "balance_loss_mlp": 1.01387298, + "epoch": 0.5872839320607245, + "flos": 30677609491200.0, + "grad_norm": 2.341163651940572, + "language_loss": 0.64439225, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.66890061, + "num_input_tokens_seen": 210595185, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.21911621, + "step": 9768, + "time_per_iteration": 2.957693576812744 + }, + { + "auxiliary_loss_clip": 0.01207296, + "auxiliary_loss_mlp": 0.01035236, + "balance_loss_clip": 1.11502314, + "balance_loss_mlp": 1.0049088, + "epoch": 0.5873440553133924, + "flos": 60336239402880.0, + "grad_norm": 0.713354606398054, + "language_loss": 0.54042667, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.56285203, + "num_input_tokens_seen": 210653210, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.30273438, + "step": 9769, + "time_per_iteration": 3.3547139167785645 + }, + { + "auxiliary_loss_clip": 0.01406119, + "auxiliary_loss_mlp": 0.01038855, + "balance_loss_clip": 1.23936796, + "balance_loss_mlp": 1.01714647, + "epoch": 0.5874041785660604, + "flos": 21548175436800.0, + "grad_norm": 4.8035300741887275, + "language_loss": 0.71010739, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.73455715, + "num_input_tokens_seen": 210673750, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.21704102, + "step": 9770, + "time_per_iteration": 2.862474203109741 + }, + { + "auxiliary_loss_clip": 0.01402673, + "auxiliary_loss_mlp": 0.01032853, + "balance_loss_clip": 1.23687553, + "balance_loss_mlp": 1.0112884, + "epoch": 0.5874643018187284, + "flos": 24399547914240.0, + "grad_norm": 3.157778078545333, + "language_loss": 0.6853174, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.70967269, + "num_input_tokens_seen": 210692960, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.21557617, + "step": 9771, + "time_per_iteration": 2.8613970279693604 + }, + { + "auxiliary_loss_clip": 0.01418015, + "auxiliary_loss_mlp": 0.01035638, + "balance_loss_clip": 1.24702597, + "balance_loss_mlp": 1.01435947, + "epoch": 0.5875244250713964, + "flos": 28159484100480.0, + "grad_norm": 1.527585659132837, + "language_loss": 0.67021358, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.69475007, + "num_input_tokens_seen": 210714040, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.21276855, + "step": 9772, + "time_per_iteration": 2.9079885482788086 + }, + { + "auxiliary_loss_clip": 0.01429966, + "auxiliary_loss_mlp": 0.01046363, + "balance_loss_clip": 1.25811386, + "balance_loss_mlp": 1.02011299, + "epoch": 0.5875845483240644, + "flos": 25823514850560.0, + "grad_norm": 1.5402411330176966, + "language_loss": 0.75222141, + "learning_rate": 1.534046611017519e-06, + "loss": 0.77698469, + "num_input_tokens_seen": 210733710, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.26257324, + "step": 9773, + "time_per_iteration": 2.8697237968444824 + }, + { + "auxiliary_loss_clip": 0.01428768, + "auxiliary_loss_mlp": 0.01038747, + "balance_loss_clip": 1.25894475, + "balance_loss_mlp": 1.01669335, + "epoch": 0.5876446715767323, + "flos": 26917854284160.0, + "grad_norm": 3.892099943724368, + "language_loss": 0.54574144, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.57041657, + "num_input_tokens_seen": 210753580, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.22045898, + "step": 9774, + "time_per_iteration": 2.907550573348999 + }, + { + "auxiliary_loss_clip": 0.01415341, + "auxiliary_loss_mlp": 0.01039431, + "balance_loss_clip": 1.24889088, + "balance_loss_mlp": 1.01634049, + "epoch": 0.5877047948294003, + "flos": 36698670748800.0, + "grad_norm": 2.158202766055543, + "language_loss": 0.65643072, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.68097842, + "num_input_tokens_seen": 210773495, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.23095703, + "step": 9775, + "time_per_iteration": 2.953894853591919 + }, + { + "auxiliary_loss_clip": 0.0141156, + "auxiliary_loss_mlp": 0.01035292, + "balance_loss_clip": 1.24375391, + "balance_loss_mlp": 1.01350045, + "epoch": 0.5877649180820682, + "flos": 26736105346560.0, + "grad_norm": 2.290949921398518, + "language_loss": 0.74032336, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.76479185, + "num_input_tokens_seen": 210793645, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.21801758, + "step": 9776, + "time_per_iteration": 2.8983731269836426 + }, + { + "auxiliary_loss_clip": 0.01428107, + "auxiliary_loss_mlp": 0.01036874, + "balance_loss_clip": 1.25787973, + "balance_loss_mlp": 1.01548827, + "epoch": 0.5878250413347362, + "flos": 21041911658880.0, + "grad_norm": 2.1124310729131643, + "language_loss": 0.75294745, + "learning_rate": 1.532531774126821e-06, + "loss": 0.77759719, + "num_input_tokens_seen": 210813415, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.21386719, + "step": 9777, + "time_per_iteration": 4.353303670883179 + }, + { + "auxiliary_loss_clip": 0.01395105, + "auxiliary_loss_mlp": 0.01039358, + "balance_loss_clip": 1.23311949, + "balance_loss_mlp": 1.01871109, + "epoch": 0.5878851645874041, + "flos": 25495335181440.0, + "grad_norm": 1.508216958948938, + "language_loss": 0.74983591, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.77418053, + "num_input_tokens_seen": 210833850, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.2064209, + "step": 9778, + "time_per_iteration": 2.9330766201019287 + }, + { + "auxiliary_loss_clip": 0.01421128, + "auxiliary_loss_mlp": 0.01041999, + "balance_loss_clip": 1.2532115, + "balance_loss_mlp": 1.01937318, + "epoch": 0.5879452878400722, + "flos": 23779683146880.0, + "grad_norm": 1.9796569777331752, + "language_loss": 0.70636737, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.73099864, + "num_input_tokens_seen": 210853115, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.22631836, + "step": 9779, + "time_per_iteration": 5.665513038635254 + }, + { + "auxiliary_loss_clip": 0.01421975, + "auxiliary_loss_mlp": 0.01037173, + "balance_loss_clip": 1.25111818, + "balance_loss_mlp": 1.01508367, + "epoch": 0.5880054110927401, + "flos": 17834189984640.0, + "grad_norm": 2.6973363494466165, + "language_loss": 0.67763543, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.70222694, + "num_input_tokens_seen": 210872090, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.22094727, + "step": 9780, + "time_per_iteration": 2.8982720375061035 + }, + { + "auxiliary_loss_clip": 0.01426545, + "auxiliary_loss_mlp": 0.01039938, + "balance_loss_clip": 1.25741792, + "balance_loss_mlp": 1.01818252, + "epoch": 0.5880655343454081, + "flos": 19472735600640.0, + "grad_norm": 2.3411123256132482, + "language_loss": 0.72802824, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.75269306, + "num_input_tokens_seen": 210888490, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.21765137, + "step": 9781, + "time_per_iteration": 2.883565902709961 + }, + { + "auxiliary_loss_clip": 0.01416467, + "auxiliary_loss_mlp": 0.01036568, + "balance_loss_clip": 1.25019097, + "balance_loss_mlp": 1.01521742, + "epoch": 0.588125657598076, + "flos": 21407536039680.0, + "grad_norm": 1.461959583639037, + "language_loss": 0.71420193, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.73873234, + "num_input_tokens_seen": 210908220, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.21362305, + "step": 9782, + "time_per_iteration": 2.894538402557373 + }, + { + "auxiliary_loss_clip": 0.01423959, + "auxiliary_loss_mlp": 0.01041084, + "balance_loss_clip": 1.25302982, + "balance_loss_mlp": 1.01868439, + "epoch": 0.588185780850744, + "flos": 16043331813120.0, + "grad_norm": 2.4817468152936417, + "language_loss": 0.70771277, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.73236322, + "num_input_tokens_seen": 210923945, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.22399902, + "step": 9783, + "time_per_iteration": 2.8804900646209717 + }, + { + "auxiliary_loss_clip": 0.01427601, + "auxiliary_loss_mlp": 0.01041019, + "balance_loss_clip": 1.25758314, + "balance_loss_mlp": 1.01897693, + "epoch": 0.588245904103412, + "flos": 23737623465600.0, + "grad_norm": 1.7645899897311215, + "language_loss": 0.70276618, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.7274524, + "num_input_tokens_seen": 210941955, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.22045898, + "step": 9784, + "time_per_iteration": 2.841827869415283 + }, + { + "auxiliary_loss_clip": 0.01433426, + "auxiliary_loss_mlp": 0.01036261, + "balance_loss_clip": 1.26061177, + "balance_loss_mlp": 1.01454067, + "epoch": 0.58830602735608, + "flos": 33815644894080.0, + "grad_norm": 1.8946453844456843, + "language_loss": 0.69687265, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.72156954, + "num_input_tokens_seen": 210963105, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.21728516, + "step": 9785, + "time_per_iteration": 3.025907516479492 + }, + { + "auxiliary_loss_clip": 0.01403351, + "auxiliary_loss_mlp": 0.01033039, + "balance_loss_clip": 1.23694491, + "balance_loss_mlp": 1.01228452, + "epoch": 0.588366150608748, + "flos": 17099140659840.0, + "grad_norm": 1.9117576067123427, + "language_loss": 0.77874821, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.80311215, + "num_input_tokens_seen": 210978720, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.2076416, + "step": 9786, + "time_per_iteration": 2.822808027267456 + }, + { + "auxiliary_loss_clip": 0.014168, + "auxiliary_loss_mlp": 0.01037879, + "balance_loss_clip": 1.24734592, + "balance_loss_mlp": 1.01564658, + "epoch": 0.5884262738614159, + "flos": 22137291722880.0, + "grad_norm": 1.5745817365537436, + "language_loss": 0.80008978, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.82463658, + "num_input_tokens_seen": 210998750, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.22229004, + "step": 9787, + "time_per_iteration": 2.8554141521453857 + }, + { + "auxiliary_loss_clip": 0.01417322, + "auxiliary_loss_mlp": 0.01038365, + "balance_loss_clip": 1.24770713, + "balance_loss_mlp": 1.01695514, + "epoch": 0.5884863971140839, + "flos": 21041459210880.0, + "grad_norm": 1.782752941092352, + "language_loss": 0.67306292, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.6976198, + "num_input_tokens_seen": 211017550, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.21411133, + "step": 9788, + "time_per_iteration": 2.8718106746673584 + }, + { + "auxiliary_loss_clip": 0.01408543, + "auxiliary_loss_mlp": 0.01038516, + "balance_loss_clip": 1.2432518, + "balance_loss_mlp": 1.01763058, + "epoch": 0.5885465203667518, + "flos": 23815544290560.0, + "grad_norm": 2.4386922212561086, + "language_loss": 0.8101629, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.83463353, + "num_input_tokens_seen": 211034135, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.20874023, + "step": 9789, + "time_per_iteration": 2.863879680633545 + }, + { + "auxiliary_loss_clip": 0.0140865, + "auxiliary_loss_mlp": 0.01039799, + "balance_loss_clip": 1.24321222, + "balance_loss_mlp": 1.01687503, + "epoch": 0.5886066436194198, + "flos": 18889998831360.0, + "grad_norm": 1.5431900061695556, + "language_loss": 0.70795816, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.73244262, + "num_input_tokens_seen": 211053850, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.22900391, + "step": 9790, + "time_per_iteration": 2.8621883392333984 + }, + { + "auxiliary_loss_clip": 0.01405254, + "auxiliary_loss_mlp": 0.01039822, + "balance_loss_clip": 1.23927236, + "balance_loss_mlp": 1.01820946, + "epoch": 0.5886667668720877, + "flos": 24800807214720.0, + "grad_norm": 1.741889108708411, + "language_loss": 0.84905821, + "learning_rate": 1.527232084570895e-06, + "loss": 0.87350899, + "num_input_tokens_seen": 211072165, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.21618652, + "step": 9791, + "time_per_iteration": 2.8779773712158203 + }, + { + "auxiliary_loss_clip": 0.01426631, + "auxiliary_loss_mlp": 0.01045088, + "balance_loss_clip": 1.25780535, + "balance_loss_mlp": 1.02324915, + "epoch": 0.5887268901247558, + "flos": 21623834021760.0, + "grad_norm": 1.63804883010965, + "language_loss": 0.77347553, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.79819274, + "num_input_tokens_seen": 211089630, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.21826172, + "step": 9792, + "time_per_iteration": 2.850757360458374 + }, + { + "auxiliary_loss_clip": 0.01424348, + "auxiliary_loss_mlp": 0.01037008, + "balance_loss_clip": 1.25166464, + "balance_loss_mlp": 1.0152638, + "epoch": 0.5887870133774237, + "flos": 20490466308480.0, + "grad_norm": 8.233835187147315, + "language_loss": 0.69769752, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.72231108, + "num_input_tokens_seen": 211106120, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.2175293, + "step": 9793, + "time_per_iteration": 2.8211119174957275 + }, + { + "auxiliary_loss_clip": 0.01408167, + "auxiliary_loss_mlp": 0.0103816, + "balance_loss_clip": 1.24195313, + "balance_loss_mlp": 1.01597476, + "epoch": 0.5888471366300917, + "flos": 19215101854080.0, + "grad_norm": 1.8123655091741104, + "language_loss": 0.60476613, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.62922943, + "num_input_tokens_seen": 211122450, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.22180176, + "step": 9794, + "time_per_iteration": 2.8413476943969727 + }, + { + "auxiliary_loss_clip": 0.01425097, + "auxiliary_loss_mlp": 0.01043766, + "balance_loss_clip": 1.25526524, + "balance_loss_mlp": 1.02164114, + "epoch": 0.5889072598827596, + "flos": 19982030780160.0, + "grad_norm": 1.5973947531394181, + "language_loss": 0.66176975, + "learning_rate": 1.525718531219257e-06, + "loss": 0.68645835, + "num_input_tokens_seen": 211141765, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.22131348, + "step": 9795, + "time_per_iteration": 2.8608076572418213 + }, + { + "auxiliary_loss_clip": 0.01409209, + "auxiliary_loss_mlp": 0.01043311, + "balance_loss_clip": 1.24433267, + "balance_loss_mlp": 1.0207566, + "epoch": 0.5889673831354276, + "flos": 20751131456640.0, + "grad_norm": 1.7100737307571143, + "language_loss": 0.75115013, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.77567536, + "num_input_tokens_seen": 211160475, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.22558594, + "step": 9796, + "time_per_iteration": 2.8349525928497314 + }, + { + "auxiliary_loss_clip": 0.01420096, + "auxiliary_loss_mlp": 0.01034342, + "balance_loss_clip": 1.252267, + "balance_loss_mlp": 1.0128243, + "epoch": 0.5890275063880956, + "flos": 25311731207040.0, + "grad_norm": 1.5355591333692875, + "language_loss": 0.83964109, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.86418551, + "num_input_tokens_seen": 211180480, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.21508789, + "step": 9797, + "time_per_iteration": 2.92546010017395 + }, + { + "auxiliary_loss_clip": 0.0141409, + "auxiliary_loss_mlp": 0.01037802, + "balance_loss_clip": 1.24711323, + "balance_loss_mlp": 1.01711917, + "epoch": 0.5890876296407636, + "flos": 11772833592960.0, + "grad_norm": 2.464395152957588, + "language_loss": 0.80028337, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.82480228, + "num_input_tokens_seen": 211198000, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.20678711, + "step": 9798, + "time_per_iteration": 2.8469839096069336 + }, + { + "auxiliary_loss_clip": 0.01407411, + "auxiliary_loss_mlp": 0.01040205, + "balance_loss_clip": 1.24295354, + "balance_loss_mlp": 1.01968908, + "epoch": 0.5891477528934316, + "flos": 13597335912960.0, + "grad_norm": 3.5299519711984937, + "language_loss": 0.74721336, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.77168947, + "num_input_tokens_seen": 211214765, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20507812, + "step": 9799, + "time_per_iteration": 2.8385612964630127 + }, + { + "auxiliary_loss_clip": 0.01428796, + "auxiliary_loss_mlp": 0.0104015, + "balance_loss_clip": 1.25709832, + "balance_loss_mlp": 1.01633215, + "epoch": 0.5892078761460995, + "flos": 15057525951360.0, + "grad_norm": 2.491596002619927, + "language_loss": 0.77625465, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.80094409, + "num_input_tokens_seen": 211232335, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.23815918, + "step": 9800, + "time_per_iteration": 4.290425062179565 + }, + { + "auxiliary_loss_clip": 0.01429815, + "auxiliary_loss_mlp": 0.01041667, + "balance_loss_clip": 1.25987935, + "balance_loss_mlp": 1.01900554, + "epoch": 0.5892679993987675, + "flos": 15786557717760.0, + "grad_norm": 1.8725256492668771, + "language_loss": 0.7988987, + "learning_rate": 1.523448741022722e-06, + "loss": 0.82361352, + "num_input_tokens_seen": 211249985, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.22680664, + "step": 9801, + "time_per_iteration": 2.9512622356414795 + }, + { + "auxiliary_loss_clip": 0.01428657, + "auxiliary_loss_mlp": 0.01039974, + "balance_loss_clip": 1.25820458, + "balance_loss_mlp": 1.01912439, + "epoch": 0.5893281226514354, + "flos": 25276186776960.0, + "grad_norm": 1.7107336230308925, + "language_loss": 0.66743052, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.69211686, + "num_input_tokens_seen": 211268425, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.20849609, + "step": 9802, + "time_per_iteration": 2.9146339893341064 + }, + { + "auxiliary_loss_clip": 0.01409924, + "auxiliary_loss_mlp": 0.01040865, + "balance_loss_clip": 1.24332333, + "balance_loss_mlp": 1.01828694, + "epoch": 0.5893882459041034, + "flos": 19466220349440.0, + "grad_norm": 1.5254193940694714, + "language_loss": 0.78666866, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.8111766, + "num_input_tokens_seen": 211286680, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.22595215, + "step": 9803, + "time_per_iteration": 2.8247745037078857 + }, + { + "auxiliary_loss_clip": 0.01431093, + "auxiliary_loss_mlp": 0.01043146, + "balance_loss_clip": 1.26012897, + "balance_loss_mlp": 1.02075887, + "epoch": 0.5894483691567713, + "flos": 20644498166400.0, + "grad_norm": 1.532774695877252, + "language_loss": 0.73625517, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.76099759, + "num_input_tokens_seen": 211307700, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.22375488, + "step": 9804, + "time_per_iteration": 2.875318765640259 + }, + { + "auxiliary_loss_clip": 0.01413199, + "auxiliary_loss_mlp": 0.0103982, + "balance_loss_clip": 1.24652433, + "balance_loss_mlp": 1.01868439, + "epoch": 0.5895084924094394, + "flos": 17785343583360.0, + "grad_norm": 1.5133686442275884, + "language_loss": 0.75602233, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.78055251, + "num_input_tokens_seen": 211324835, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.21142578, + "step": 9805, + "time_per_iteration": 2.8312366008758545 + }, + { + "auxiliary_loss_clip": 0.01451029, + "auxiliary_loss_mlp": 0.01041498, + "balance_loss_clip": 1.27554917, + "balance_loss_mlp": 1.02105355, + "epoch": 0.5895686156621073, + "flos": 20130814241280.0, + "grad_norm": 1.6678052450980936, + "language_loss": 0.79024547, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.81517076, + "num_input_tokens_seen": 211344130, + "router_z_loss_clip": 1.75390625, + "router_z_loss_mlp": 0.2043457, + "step": 9806, + "time_per_iteration": 2.8377721309661865 + }, + { + "auxiliary_loss_clip": 0.01416201, + "auxiliary_loss_mlp": 0.01038787, + "balance_loss_clip": 1.24745727, + "balance_loss_mlp": 1.01721048, + "epoch": 0.5896287389147753, + "flos": 20859800762880.0, + "grad_norm": 1.9986619461404835, + "language_loss": 0.77634662, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.80089653, + "num_input_tokens_seen": 211362915, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.21569824, + "step": 9807, + "time_per_iteration": 2.82161545753479 + }, + { + "auxiliary_loss_clip": 0.01439912, + "auxiliary_loss_mlp": 0.01035968, + "balance_loss_clip": 1.26606822, + "balance_loss_mlp": 1.01422453, + "epoch": 0.5896888621674432, + "flos": 14545968531840.0, + "grad_norm": 1.9070503563223435, + "language_loss": 0.75025046, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.77500927, + "num_input_tokens_seen": 211380700, + "router_z_loss_clip": 1.73828125, + "router_z_loss_mlp": 0.21728516, + "step": 9808, + "time_per_iteration": 2.8112308979034424 + }, + { + "auxiliary_loss_clip": 0.01433607, + "auxiliary_loss_mlp": 0.01043768, + "balance_loss_clip": 1.26245964, + "balance_loss_mlp": 1.02073693, + "epoch": 0.5897489854201112, + "flos": 20896566802560.0, + "grad_norm": 2.840396125797323, + "language_loss": 0.72971278, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.75448656, + "num_input_tokens_seen": 211400095, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.23010254, + "step": 9809, + "time_per_iteration": 2.8702399730682373 + }, + { + "auxiliary_loss_clip": 0.01433572, + "auxiliary_loss_mlp": 0.01041505, + "balance_loss_clip": 1.25963759, + "balance_loss_mlp": 1.01966596, + "epoch": 0.5898091086727792, + "flos": 20020742346240.0, + "grad_norm": 9.092244163769664, + "language_loss": 0.84294456, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.86769533, + "num_input_tokens_seen": 211417810, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.21850586, + "step": 9810, + "time_per_iteration": 2.842733144760132 + }, + { + "auxiliary_loss_clip": 0.01413662, + "auxiliary_loss_mlp": 0.01039948, + "balance_loss_clip": 1.24925053, + "balance_loss_mlp": 1.01828766, + "epoch": 0.5898692319254472, + "flos": 16261892035200.0, + "grad_norm": 1.569066174995993, + "language_loss": 0.82339287, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.84792894, + "num_input_tokens_seen": 211436020, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.21643066, + "step": 9811, + "time_per_iteration": 2.8572592735290527 + }, + { + "auxiliary_loss_clip": 0.01429093, + "auxiliary_loss_mlp": 0.01035339, + "balance_loss_clip": 1.25552237, + "balance_loss_mlp": 1.01407194, + "epoch": 0.5899293551781152, + "flos": 20458134259200.0, + "grad_norm": 2.3300491936524668, + "language_loss": 0.78061247, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.80525672, + "num_input_tokens_seen": 211454335, + "router_z_loss_clip": 1.73730469, + "router_z_loss_mlp": 0.21252441, + "step": 9812, + "time_per_iteration": 4.254642009735107 + }, + { + "auxiliary_loss_clip": 0.01440012, + "auxiliary_loss_mlp": 0.01039946, + "balance_loss_clip": 1.27018762, + "balance_loss_mlp": 1.02051544, + "epoch": 0.5899894784307831, + "flos": 13889744928000.0, + "grad_norm": 2.0547205341995443, + "language_loss": 0.71239495, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.73719454, + "num_input_tokens_seen": 211472775, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.19421387, + "step": 9813, + "time_per_iteration": 2.8059709072113037 + }, + { + "auxiliary_loss_clip": 0.01428811, + "auxiliary_loss_mlp": 0.01042551, + "balance_loss_clip": 1.25828612, + "balance_loss_mlp": 1.02115273, + "epoch": 0.5900496016834511, + "flos": 20093324284800.0, + "grad_norm": 1.7191213356721373, + "language_loss": 0.72640765, + "learning_rate": 1.518533098148494e-06, + "loss": 0.75112128, + "num_input_tokens_seen": 211492195, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.21398926, + "step": 9814, + "time_per_iteration": 5.659720182418823 + }, + { + "auxiliary_loss_clip": 0.01422501, + "auxiliary_loss_mlp": 0.0103612, + "balance_loss_clip": 1.2545023, + "balance_loss_mlp": 1.01499605, + "epoch": 0.590109724936119, + "flos": 20266883913600.0, + "grad_norm": 1.707298452055488, + "language_loss": 0.79332733, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.81791353, + "num_input_tokens_seen": 211510220, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.21130371, + "step": 9815, + "time_per_iteration": 2.821929693222046 + }, + { + "auxiliary_loss_clip": 0.01447879, + "auxiliary_loss_mlp": 0.01037209, + "balance_loss_clip": 1.27183163, + "balance_loss_mlp": 1.01571608, + "epoch": 0.590169848188787, + "flos": 24243570529920.0, + "grad_norm": 2.874969415202789, + "language_loss": 0.77173007, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.79658091, + "num_input_tokens_seen": 211526260, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.21484375, + "step": 9816, + "time_per_iteration": 2.872300863265991 + }, + { + "auxiliary_loss_clip": 0.01426874, + "auxiliary_loss_mlp": 0.01034168, + "balance_loss_clip": 1.25790787, + "balance_loss_mlp": 1.01373529, + "epoch": 0.590229971441455, + "flos": 17793532892160.0, + "grad_norm": 2.0108342607588288, + "language_loss": 0.82088459, + "learning_rate": 1.517399156051309e-06, + "loss": 0.84549505, + "num_input_tokens_seen": 211542890, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.2043457, + "step": 9817, + "time_per_iteration": 2.7997539043426514 + }, + { + "auxiliary_loss_clip": 0.01435377, + "auxiliary_loss_mlp": 0.01038219, + "balance_loss_clip": 1.26394582, + "balance_loss_mlp": 1.01698744, + "epoch": 0.590290094694123, + "flos": 22247092149120.0, + "grad_norm": 1.9881139051613224, + "language_loss": 0.77521455, + "learning_rate": 1.517021211933682e-06, + "loss": 0.79995048, + "num_input_tokens_seen": 211562685, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.21252441, + "step": 9818, + "time_per_iteration": 2.859215497970581 + }, + { + "auxiliary_loss_clip": 0.01428797, + "auxiliary_loss_mlp": 0.01037158, + "balance_loss_clip": 1.26003504, + "balance_loss_mlp": 1.01653504, + "epoch": 0.5903502179467909, + "flos": 19107880381440.0, + "grad_norm": 1.7839089271966944, + "language_loss": 0.67288888, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.69754839, + "num_input_tokens_seen": 211579960, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.20617676, + "step": 9819, + "time_per_iteration": 2.8198301792144775 + }, + { + "auxiliary_loss_clip": 0.01434914, + "auxiliary_loss_mlp": 0.0103594, + "balance_loss_clip": 1.26578212, + "balance_loss_mlp": 1.01482844, + "epoch": 0.5904103411994589, + "flos": 24244113467520.0, + "grad_norm": 1.8906756088886163, + "language_loss": 0.79193616, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.81664473, + "num_input_tokens_seen": 211599310, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.21105957, + "step": 9820, + "time_per_iteration": 2.8816215991973877 + }, + { + "auxiliary_loss_clip": 0.01195161, + "auxiliary_loss_mlp": 0.01028424, + "balance_loss_clip": 1.10387301, + "balance_loss_mlp": 1.00639367, + "epoch": 0.5904704644521268, + "flos": 64904196072960.0, + "grad_norm": 0.9184882652932346, + "language_loss": 0.65103376, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.67326963, + "num_input_tokens_seen": 211658790, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.22070312, + "step": 9821, + "time_per_iteration": 3.379164695739746 + }, + { + "auxiliary_loss_clip": 0.01411877, + "auxiliary_loss_mlp": 0.01038161, + "balance_loss_clip": 1.24549174, + "balance_loss_mlp": 1.0169661, + "epoch": 0.5905305877047948, + "flos": 19619528290560.0, + "grad_norm": 2.0820411119405495, + "language_loss": 0.62166971, + "learning_rate": 1.515509618752521e-06, + "loss": 0.64617008, + "num_input_tokens_seen": 211677240, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.21203613, + "step": 9822, + "time_per_iteration": 2.8700110912323 + }, + { + "auxiliary_loss_clip": 0.01433779, + "auxiliary_loss_mlp": 0.01039648, + "balance_loss_clip": 1.26404595, + "balance_loss_mlp": 1.01934624, + "epoch": 0.5905907109574628, + "flos": 18998984851200.0, + "grad_norm": 1.8574427812301137, + "language_loss": 0.83319175, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.85792607, + "num_input_tokens_seen": 211695485, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.20300293, + "step": 9823, + "time_per_iteration": 2.895549774169922 + }, + { + "auxiliary_loss_clip": 0.01422322, + "auxiliary_loss_mlp": 0.0103543, + "balance_loss_clip": 1.25537133, + "balance_loss_mlp": 1.01508129, + "epoch": 0.5906508342101308, + "flos": 22210597578240.0, + "grad_norm": 2.3349097802913366, + "language_loss": 0.74213779, + "learning_rate": 1.514753932336165e-06, + "loss": 0.76671529, + "num_input_tokens_seen": 211713090, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.20361328, + "step": 9824, + "time_per_iteration": 2.848313570022583 + }, + { + "auxiliary_loss_clip": 0.01460946, + "auxiliary_loss_mlp": 0.01038239, + "balance_loss_clip": 1.28164411, + "balance_loss_mlp": 1.01678109, + "epoch": 0.5907109574627988, + "flos": 20896476312960.0, + "grad_norm": 2.048505234010724, + "language_loss": 0.84476238, + "learning_rate": 1.514376116721693e-06, + "loss": 0.86975431, + "num_input_tokens_seen": 211732510, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.21472168, + "step": 9825, + "time_per_iteration": 2.8185672760009766 + }, + { + "auxiliary_loss_clip": 0.01415531, + "auxiliary_loss_mlp": 0.01036504, + "balance_loss_clip": 1.25041485, + "balance_loss_mlp": 1.01753759, + "epoch": 0.5907710807154667, + "flos": 21516522059520.0, + "grad_norm": 1.6892036925207452, + "language_loss": 0.77247977, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.79700017, + "num_input_tokens_seen": 211748695, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.1895752, + "step": 9826, + "time_per_iteration": 2.8062500953674316 + }, + { + "auxiliary_loss_clip": 0.0142199, + "auxiliary_loss_mlp": 0.01040169, + "balance_loss_clip": 1.25596547, + "balance_loss_mlp": 1.02051139, + "epoch": 0.5908312039681347, + "flos": 22028622416640.0, + "grad_norm": 1.6389661203483195, + "language_loss": 0.73298317, + "learning_rate": 1.513620540751793e-06, + "loss": 0.75760472, + "num_input_tokens_seen": 211768545, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.1965332, + "step": 9827, + "time_per_iteration": 2.8308017253875732 + }, + { + "auxiliary_loss_clip": 0.01433485, + "auxiliary_loss_mlp": 0.01038445, + "balance_loss_clip": 1.26346004, + "balance_loss_mlp": 1.01915646, + "epoch": 0.5908913272208026, + "flos": 18488920510080.0, + "grad_norm": 1.8883134152676848, + "language_loss": 0.80421823, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.82893741, + "num_input_tokens_seen": 211786665, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.19262695, + "step": 9828, + "time_per_iteration": 2.886791944503784 + }, + { + "auxiliary_loss_clip": 0.01442161, + "auxiliary_loss_mlp": 0.01042912, + "balance_loss_clip": 1.2714746, + "balance_loss_mlp": 1.02222931, + "epoch": 0.5909514504734706, + "flos": 12319663973760.0, + "grad_norm": 2.355532176065077, + "language_loss": 0.88997984, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.91483057, + "num_input_tokens_seen": 211801215, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.20678711, + "step": 9829, + "time_per_iteration": 2.837496042251587 + }, + { + "auxiliary_loss_clip": 0.0120109, + "auxiliary_loss_mlp": 0.01034972, + "balance_loss_clip": 1.10858297, + "balance_loss_mlp": 1.01141667, + "epoch": 0.5910115737261386, + "flos": 70244755251840.0, + "grad_norm": 0.7567898033891322, + "language_loss": 0.57944632, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.601807, + "num_input_tokens_seen": 211857005, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.23535156, + "step": 9830, + "time_per_iteration": 3.272918939590454 + }, + { + "auxiliary_loss_clip": 0.01448852, + "auxiliary_loss_mlp": 0.01041263, + "balance_loss_clip": 1.27323079, + "balance_loss_mlp": 1.01929247, + "epoch": 0.5910716969788066, + "flos": 22027853255040.0, + "grad_norm": 3.0435483691038985, + "language_loss": 0.76972967, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.79463083, + "num_input_tokens_seen": 211876675, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.21984863, + "step": 9831, + "time_per_iteration": 2.8328471183776855 + }, + { + "auxiliary_loss_clip": 0.01408039, + "auxiliary_loss_mlp": 0.01037525, + "balance_loss_clip": 1.24583125, + "balance_loss_mlp": 1.01805782, + "epoch": 0.5911318202314745, + "flos": 21261557756160.0, + "grad_norm": 2.201991716113376, + "language_loss": 0.78284991, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.80730557, + "num_input_tokens_seen": 211895725, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19470215, + "step": 9832, + "time_per_iteration": 2.85086989402771 + }, + { + "auxiliary_loss_clip": 0.01413259, + "auxiliary_loss_mlp": 0.01037314, + "balance_loss_clip": 1.24769163, + "balance_loss_mlp": 1.01652408, + "epoch": 0.5911919434841425, + "flos": 17830525155840.0, + "grad_norm": 2.0139928913487344, + "language_loss": 0.84351945, + "learning_rate": 1.511354255945847e-06, + "loss": 0.86802518, + "num_input_tokens_seen": 211913860, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.20800781, + "step": 9833, + "time_per_iteration": 2.844409465789795 + }, + { + "auxiliary_loss_clip": 0.01438533, + "auxiliary_loss_mlp": 0.01040922, + "balance_loss_clip": 1.26866651, + "balance_loss_mlp": 1.02058518, + "epoch": 0.5912520667368104, + "flos": 20384149731840.0, + "grad_norm": 1.837537096996121, + "language_loss": 0.75134313, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.77613771, + "num_input_tokens_seen": 211932880, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.20336914, + "step": 9834, + "time_per_iteration": 2.844228744506836 + }, + { + "auxiliary_loss_clip": 0.01435217, + "auxiliary_loss_mlp": 0.01041646, + "balance_loss_clip": 1.26636147, + "balance_loss_mlp": 1.02124953, + "epoch": 0.5913121899894784, + "flos": 17939375441280.0, + "grad_norm": 2.548979008978037, + "language_loss": 0.78923368, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.81400228, + "num_input_tokens_seen": 211948625, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.20410156, + "step": 9835, + "time_per_iteration": 4.2609264850616455 + }, + { + "auxiliary_loss_clip": 0.01439196, + "auxiliary_loss_mlp": 0.01038151, + "balance_loss_clip": 1.26811576, + "balance_loss_mlp": 1.01756382, + "epoch": 0.5913723132421465, + "flos": 22136975009280.0, + "grad_norm": 2.6202291647373475, + "language_loss": 0.74790889, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.77268237, + "num_input_tokens_seen": 211965355, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.20593262, + "step": 9836, + "time_per_iteration": 2.858978509902954 + }, + { + "auxiliary_loss_clip": 0.01445785, + "auxiliary_loss_mlp": 0.01037696, + "balance_loss_clip": 1.27433634, + "balance_loss_mlp": 1.01802671, + "epoch": 0.5914324364948144, + "flos": 15704338636800.0, + "grad_norm": 6.244583898013082, + "language_loss": 0.83616656, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.86100137, + "num_input_tokens_seen": 211982245, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.19665527, + "step": 9837, + "time_per_iteration": 2.815629005432129 + }, + { + "auxiliary_loss_clip": 0.01433485, + "auxiliary_loss_mlp": 0.01041695, + "balance_loss_clip": 1.26306248, + "balance_loss_mlp": 1.02036798, + "epoch": 0.5914925597474824, + "flos": 22757563693440.0, + "grad_norm": 1.6396847184436647, + "language_loss": 0.80621326, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.83096504, + "num_input_tokens_seen": 212000250, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.21325684, + "step": 9838, + "time_per_iteration": 2.861156940460205 + }, + { + "auxiliary_loss_clip": 0.01430996, + "auxiliary_loss_mlp": 0.01039093, + "balance_loss_clip": 1.26068902, + "balance_loss_mlp": 1.01887476, + "epoch": 0.5915526830001503, + "flos": 18301018279680.0, + "grad_norm": 1.7796335544552642, + "language_loss": 0.70946956, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.73417044, + "num_input_tokens_seen": 212017505, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.20202637, + "step": 9839, + "time_per_iteration": 2.830792188644409 + }, + { + "auxiliary_loss_clip": 0.01439969, + "auxiliary_loss_mlp": 0.01039488, + "balance_loss_clip": 1.26910424, + "balance_loss_mlp": 1.01966369, + "epoch": 0.5916128062528183, + "flos": 17027825575680.0, + "grad_norm": 2.0276362048274867, + "language_loss": 0.65676427, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.68155885, + "num_input_tokens_seen": 212034595, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.19824219, + "step": 9840, + "time_per_iteration": 2.8305978775024414 + }, + { + "auxiliary_loss_clip": 0.01433527, + "auxiliary_loss_mlp": 0.01038055, + "balance_loss_clip": 1.26425648, + "balance_loss_mlp": 1.01737201, + "epoch": 0.5916729295054862, + "flos": 24765036560640.0, + "grad_norm": 1.9815151343688935, + "language_loss": 0.82455796, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.8492738, + "num_input_tokens_seen": 212055775, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.20690918, + "step": 9841, + "time_per_iteration": 2.8630168437957764 + }, + { + "auxiliary_loss_clip": 0.01420941, + "auxiliary_loss_mlp": 0.01036455, + "balance_loss_clip": 1.25508428, + "balance_loss_mlp": 1.01757252, + "epoch": 0.5917330527581542, + "flos": 15965953925760.0, + "grad_norm": 2.3036597982101843, + "language_loss": 0.69720495, + "learning_rate": 1.507956080444291e-06, + "loss": 0.72177893, + "num_input_tokens_seen": 212074000, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.18884277, + "step": 9842, + "time_per_iteration": 2.846956968307495 + }, + { + "auxiliary_loss_clip": 0.01427488, + "auxiliary_loss_mlp": 0.01038532, + "balance_loss_clip": 1.25701082, + "balance_loss_mlp": 1.0184449, + "epoch": 0.5917931760108222, + "flos": 23810024424960.0, + "grad_norm": 2.109083812385693, + "language_loss": 0.8324421, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.85710239, + "num_input_tokens_seen": 212091415, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.20080566, + "step": 9843, + "time_per_iteration": 2.9326016902923584 + }, + { + "auxiliary_loss_clip": 0.01435101, + "auxiliary_loss_mlp": 0.01036913, + "balance_loss_clip": 1.26489687, + "balance_loss_mlp": 1.01627791, + "epoch": 0.5918532992634902, + "flos": 23257764668160.0, + "grad_norm": 2.516648784544326, + "language_loss": 0.83166873, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.85638881, + "num_input_tokens_seen": 212105255, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.2064209, + "step": 9844, + "time_per_iteration": 2.9020986557006836 + }, + { + "auxiliary_loss_clip": 0.01446096, + "auxiliary_loss_mlp": 0.01038596, + "balance_loss_clip": 1.27581847, + "balance_loss_mlp": 1.0180918, + "epoch": 0.5919134225161581, + "flos": 19509184926720.0, + "grad_norm": 1.7743798037059866, + "language_loss": 0.74821007, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.77305698, + "num_input_tokens_seen": 212122765, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.20495605, + "step": 9845, + "time_per_iteration": 2.876032590866089 + }, + { + "auxiliary_loss_clip": 0.0143191, + "auxiliary_loss_mlp": 0.01039423, + "balance_loss_clip": 1.26034451, + "balance_loss_mlp": 1.01840615, + "epoch": 0.5919735457688261, + "flos": 38815310615040.0, + "grad_norm": 1.921905535415222, + "language_loss": 0.6461674, + "learning_rate": 1.506446264718213e-06, + "loss": 0.67088073, + "num_input_tokens_seen": 212143960, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.20996094, + "step": 9846, + "time_per_iteration": 2.984344720840454 + }, + { + "auxiliary_loss_clip": 0.01409417, + "auxiliary_loss_mlp": 0.01031019, + "balance_loss_clip": 1.24826229, + "balance_loss_mlp": 1.01249397, + "epoch": 0.592033669021494, + "flos": 22174193496960.0, + "grad_norm": 2.5911058121346198, + "language_loss": 0.76839292, + "learning_rate": 1.506068857539931e-06, + "loss": 0.79279733, + "num_input_tokens_seen": 212162005, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.18530273, + "step": 9847, + "time_per_iteration": 4.232517719268799 + }, + { + "auxiliary_loss_clip": 0.01430461, + "auxiliary_loss_mlp": 0.01038069, + "balance_loss_clip": 1.26053083, + "balance_loss_mlp": 1.01706433, + "epoch": 0.592093792274162, + "flos": 22721250101760.0, + "grad_norm": 1.8351872403908078, + "language_loss": 0.6336273, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.65831256, + "num_input_tokens_seen": 212181635, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.21020508, + "step": 9848, + "time_per_iteration": 2.834757089614868 + }, + { + "auxiliary_loss_clip": 0.01437622, + "auxiliary_loss_mlp": 0.01040304, + "balance_loss_clip": 1.26877284, + "balance_loss_mlp": 1.02170753, + "epoch": 0.59215391552683, + "flos": 22539274940160.0, + "grad_norm": 2.0157867504798777, + "language_loss": 0.76931739, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.79409665, + "num_input_tokens_seen": 212201615, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.18591309, + "step": 9849, + "time_per_iteration": 5.7194085121154785 + }, + { + "auxiliary_loss_clip": 0.01424068, + "auxiliary_loss_mlp": 0.01040349, + "balance_loss_clip": 1.25528049, + "balance_loss_mlp": 1.01911736, + "epoch": 0.592214038779498, + "flos": 24509800788480.0, + "grad_norm": 2.068446430839621, + "language_loss": 0.75840145, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.78304565, + "num_input_tokens_seen": 212219355, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.21240234, + "step": 9850, + "time_per_iteration": 2.8519582748413086 + }, + { + "auxiliary_loss_clip": 0.01422352, + "auxiliary_loss_mlp": 0.01035697, + "balance_loss_clip": 1.25538874, + "balance_loss_mlp": 1.0165875, + "epoch": 0.592274162032166, + "flos": 21840358227840.0, + "grad_norm": 1.7698384225323622, + "language_loss": 0.76201868, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.78659916, + "num_input_tokens_seen": 212236710, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.19116211, + "step": 9851, + "time_per_iteration": 2.821364164352417 + }, + { + "auxiliary_loss_clip": 0.01438588, + "auxiliary_loss_mlp": 0.01037657, + "balance_loss_clip": 1.26857615, + "balance_loss_mlp": 1.01819026, + "epoch": 0.5923342852848339, + "flos": 24618877297920.0, + "grad_norm": 2.0784979707902753, + "language_loss": 0.7193979, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.74416041, + "num_input_tokens_seen": 212256195, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.19458008, + "step": 9852, + "time_per_iteration": 2.881312131881714 + }, + { + "auxiliary_loss_clip": 0.01440464, + "auxiliary_loss_mlp": 0.01042546, + "balance_loss_clip": 1.26894295, + "balance_loss_mlp": 1.02257824, + "epoch": 0.5923944085375019, + "flos": 19947707959680.0, + "grad_norm": 1.67736125376662, + "language_loss": 0.80529499, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.83012515, + "num_input_tokens_seen": 212274085, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1998291, + "step": 9853, + "time_per_iteration": 2.82205867767334 + }, + { + "auxiliary_loss_clip": 0.01429613, + "auxiliary_loss_mlp": 0.01037153, + "balance_loss_clip": 1.263219, + "balance_loss_mlp": 1.01848507, + "epoch": 0.5924545317901698, + "flos": 28670408092800.0, + "grad_norm": 1.704437822220621, + "language_loss": 0.68161786, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.70628554, + "num_input_tokens_seen": 212295530, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.18688965, + "step": 9854, + "time_per_iteration": 2.9279284477233887 + }, + { + "auxiliary_loss_clip": 0.01427568, + "auxiliary_loss_mlp": 0.0103221, + "balance_loss_clip": 1.26116681, + "balance_loss_mlp": 1.01337481, + "epoch": 0.5925146550428378, + "flos": 19874356859520.0, + "grad_norm": 1.8434622346720118, + "language_loss": 0.89849806, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.92309582, + "num_input_tokens_seen": 212313770, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.18859863, + "step": 9855, + "time_per_iteration": 2.807422637939453 + }, + { + "auxiliary_loss_clip": 0.0141957, + "auxiliary_loss_mlp": 0.01036023, + "balance_loss_clip": 1.25477743, + "balance_loss_mlp": 1.01580501, + "epoch": 0.5925747782955058, + "flos": 15131555723520.0, + "grad_norm": 1.7735537821551608, + "language_loss": 0.87482584, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.8993817, + "num_input_tokens_seen": 212331525, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.20214844, + "step": 9856, + "time_per_iteration": 2.8977530002593994 + }, + { + "auxiliary_loss_clip": 0.01441789, + "auxiliary_loss_mlp": 0.010344, + "balance_loss_clip": 1.27184534, + "balance_loss_mlp": 1.01498103, + "epoch": 0.5926349015481738, + "flos": 18414528779520.0, + "grad_norm": 1.9464620819118943, + "language_loss": 0.78045321, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.80521506, + "num_input_tokens_seen": 212347295, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.19421387, + "step": 9857, + "time_per_iteration": 2.9011240005493164 + }, + { + "auxiliary_loss_clip": 0.01437387, + "auxiliary_loss_mlp": 0.01035932, + "balance_loss_clip": 1.27088714, + "balance_loss_mlp": 1.01678658, + "epoch": 0.5926950248008417, + "flos": 23121197303040.0, + "grad_norm": 2.087253280532357, + "language_loss": 0.65928543, + "learning_rate": 1.501918617901419e-06, + "loss": 0.68401861, + "num_input_tokens_seen": 212365750, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19128418, + "step": 9858, + "time_per_iteration": 2.857067346572876 + }, + { + "auxiliary_loss_clip": 0.01416318, + "auxiliary_loss_mlp": 0.01031895, + "balance_loss_clip": 1.25249922, + "balance_loss_mlp": 1.01375127, + "epoch": 0.5927551480535097, + "flos": 28044525767040.0, + "grad_norm": 1.7950700186965656, + "language_loss": 0.77197385, + "learning_rate": 1.501541436426501e-06, + "loss": 0.79645598, + "num_input_tokens_seen": 212385300, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.18164062, + "step": 9859, + "time_per_iteration": 2.929734945297241 + }, + { + "auxiliary_loss_clip": 0.01426683, + "auxiliary_loss_mlp": 0.0103768, + "balance_loss_clip": 1.25782275, + "balance_loss_mlp": 1.01793921, + "epoch": 0.5928152713061776, + "flos": 21808659605760.0, + "grad_norm": 2.7861747931085814, + "language_loss": 0.76318878, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.78783238, + "num_input_tokens_seen": 212402140, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.1973877, + "step": 9860, + "time_per_iteration": 2.8330183029174805 + }, + { + "auxiliary_loss_clip": 0.01423596, + "auxiliary_loss_mlp": 0.01035215, + "balance_loss_clip": 1.25728858, + "balance_loss_mlp": 1.01558137, + "epoch": 0.5928753945588456, + "flos": 24327644647680.0, + "grad_norm": 1.6791021315138142, + "language_loss": 0.76766443, + "learning_rate": 1.500787130195763e-06, + "loss": 0.79225254, + "num_input_tokens_seen": 212421790, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19641113, + "step": 9861, + "time_per_iteration": 2.901172637939453 + }, + { + "auxiliary_loss_clip": 0.0140828, + "auxiliary_loss_mlp": 0.01033842, + "balance_loss_clip": 1.24409616, + "balance_loss_mlp": 1.01449442, + "epoch": 0.5929355178115137, + "flos": 26474716281600.0, + "grad_norm": 1.6263844772816096, + "language_loss": 0.71433747, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.73875868, + "num_input_tokens_seen": 212442115, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19360352, + "step": 9862, + "time_per_iteration": 2.918715476989746 + }, + { + "auxiliary_loss_clip": 0.01428467, + "auxiliary_loss_mlp": 0.01035317, + "balance_loss_clip": 1.26077044, + "balance_loss_mlp": 1.01643443, + "epoch": 0.5929956410641816, + "flos": 24975000270720.0, + "grad_norm": 1.9019725923045085, + "language_loss": 0.78924012, + "learning_rate": 1.500032899685832e-06, + "loss": 0.81387794, + "num_input_tokens_seen": 212459535, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.1887207, + "step": 9863, + "time_per_iteration": 2.881592273712158 + }, + { + "auxiliary_loss_clip": 0.01429065, + "auxiliary_loss_mlp": 0.01038552, + "balance_loss_clip": 1.26189542, + "balance_loss_mlp": 1.01919198, + "epoch": 0.5930557643168496, + "flos": 26218123165440.0, + "grad_norm": 1.9155807614867755, + "language_loss": 0.71169627, + "learning_rate": 1.499655812861921e-06, + "loss": 0.73637235, + "num_input_tokens_seen": 212479385, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.19348145, + "step": 9864, + "time_per_iteration": 2.9026668071746826 + }, + { + "auxiliary_loss_clip": 0.01424616, + "auxiliary_loss_mlp": 0.01036346, + "balance_loss_clip": 1.25764048, + "balance_loss_mlp": 1.01618779, + "epoch": 0.5931158875695175, + "flos": 27866396413440.0, + "grad_norm": 7.0926175482617, + "language_loss": 0.67992318, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.7045328, + "num_input_tokens_seen": 212500060, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.20153809, + "step": 9865, + "time_per_iteration": 2.925739288330078 + }, + { + "auxiliary_loss_clip": 0.0143625, + "auxiliary_loss_mlp": 0.01038091, + "balance_loss_clip": 1.26700819, + "balance_loss_mlp": 1.018839, + "epoch": 0.5931760108221855, + "flos": 15421973967360.0, + "grad_norm": 1.9585127214013762, + "language_loss": 0.7873624, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.81210583, + "num_input_tokens_seen": 212518590, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.19238281, + "step": 9866, + "time_per_iteration": 2.817072868347168 + }, + { + "auxiliary_loss_clip": 0.0141794, + "auxiliary_loss_mlp": 0.01035778, + "balance_loss_clip": 1.2555728, + "balance_loss_mlp": 1.01626313, + "epoch": 0.5932361340748534, + "flos": 30200193912960.0, + "grad_norm": 2.005992784753267, + "language_loss": 0.73281038, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.75734752, + "num_input_tokens_seen": 212538190, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19519043, + "step": 9867, + "time_per_iteration": 2.8962535858154297 + }, + { + "auxiliary_loss_clip": 0.01424962, + "auxiliary_loss_mlp": 0.01039898, + "balance_loss_clip": 1.25975263, + "balance_loss_mlp": 1.01889312, + "epoch": 0.5932962573275214, + "flos": 20167308812160.0, + "grad_norm": 1.5915688865612314, + "language_loss": 0.67446417, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.69911271, + "num_input_tokens_seen": 212557820, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.21008301, + "step": 9868, + "time_per_iteration": 2.8464162349700928 + }, + { + "auxiliary_loss_clip": 0.01427422, + "auxiliary_loss_mlp": 0.01041207, + "balance_loss_clip": 1.25979197, + "balance_loss_mlp": 1.02075028, + "epoch": 0.5933563805801894, + "flos": 25456487880960.0, + "grad_norm": 2.116641274180415, + "language_loss": 0.76156712, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.78625345, + "num_input_tokens_seen": 212577645, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.20458984, + "step": 9869, + "time_per_iteration": 2.9266693592071533 + }, + { + "auxiliary_loss_clip": 0.01432097, + "auxiliary_loss_mlp": 0.01036699, + "balance_loss_clip": 1.26327455, + "balance_loss_mlp": 1.01588559, + "epoch": 0.5934165038328574, + "flos": 60014719998720.0, + "grad_norm": 1.830872330237942, + "language_loss": 0.74939156, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.77407956, + "num_input_tokens_seen": 212603430, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.20825195, + "step": 9870, + "time_per_iteration": 4.532441854476929 + }, + { + "auxiliary_loss_clip": 0.01433471, + "auxiliary_loss_mlp": 0.01038773, + "balance_loss_clip": 1.2641654, + "balance_loss_mlp": 1.01909113, + "epoch": 0.5934766270855253, + "flos": 24429889192320.0, + "grad_norm": 2.259079945015092, + "language_loss": 0.72064435, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.74536681, + "num_input_tokens_seen": 212620730, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.19677734, + "step": 9871, + "time_per_iteration": 2.8672282695770264 + }, + { + "auxiliary_loss_clip": 0.01423807, + "auxiliary_loss_mlp": 0.01034993, + "balance_loss_clip": 1.25643623, + "balance_loss_mlp": 1.01572835, + "epoch": 0.5935367503381933, + "flos": 23523497233920.0, + "grad_norm": 1.8926684452916953, + "language_loss": 0.75291479, + "learning_rate": 1.496639802503271e-06, + "loss": 0.77750278, + "num_input_tokens_seen": 212639745, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.19274902, + "step": 9872, + "time_per_iteration": 2.856052875518799 + }, + { + "auxiliary_loss_clip": 0.01437306, + "auxiliary_loss_mlp": 0.01044591, + "balance_loss_clip": 1.26667345, + "balance_loss_mlp": 1.02417088, + "epoch": 0.5935968735908612, + "flos": 18956427477120.0, + "grad_norm": 2.2581569956118646, + "language_loss": 0.79976487, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.82458389, + "num_input_tokens_seen": 212655915, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.2043457, + "step": 9873, + "time_per_iteration": 2.8186724185943604 + }, + { + "auxiliary_loss_clip": 0.01425554, + "auxiliary_loss_mlp": 0.0103741, + "balance_loss_clip": 1.25893259, + "balance_loss_mlp": 1.0167985, + "epoch": 0.5936569968435292, + "flos": 25494294551040.0, + "grad_norm": 1.7499656760481874, + "language_loss": 0.85596675, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.8805964, + "num_input_tokens_seen": 212676115, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.20617676, + "step": 9874, + "time_per_iteration": 2.8692786693573 + }, + { + "auxiliary_loss_clip": 0.01197799, + "auxiliary_loss_mlp": 0.01025669, + "balance_loss_clip": 1.10660827, + "balance_loss_mlp": 1.00611877, + "epoch": 0.5937171200961973, + "flos": 66407848381440.0, + "grad_norm": 0.7072058944302858, + "language_loss": 0.60157132, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.623806, + "num_input_tokens_seen": 212737560, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.1953125, + "step": 9875, + "time_per_iteration": 3.425703287124634 + }, + { + "auxiliary_loss_clip": 0.01439628, + "auxiliary_loss_mlp": 0.01034314, + "balance_loss_clip": 1.26704669, + "balance_loss_mlp": 1.01348829, + "epoch": 0.5937772433488652, + "flos": 14911819136640.0, + "grad_norm": 3.511494805887591, + "language_loss": 0.78591108, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.81065047, + "num_input_tokens_seen": 212755365, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.20825195, + "step": 9876, + "time_per_iteration": 2.832049608230591 + }, + { + "auxiliary_loss_clip": 0.01414893, + "auxiliary_loss_mlp": 0.01033939, + "balance_loss_clip": 1.25182843, + "balance_loss_mlp": 1.01351857, + "epoch": 0.5938373666015332, + "flos": 22570792583040.0, + "grad_norm": 1.6889936249811792, + "language_loss": 0.76082176, + "learning_rate": 1.494755415907243e-06, + "loss": 0.78531003, + "num_input_tokens_seen": 212773875, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.20422363, + "step": 9877, + "time_per_iteration": 2.8527491092681885 + }, + { + "auxiliary_loss_clip": 0.01436834, + "auxiliary_loss_mlp": 0.01034583, + "balance_loss_clip": 1.26812887, + "balance_loss_mlp": 1.01276755, + "epoch": 0.5938974898542011, + "flos": 18449711251200.0, + "grad_norm": 2.777969795878775, + "language_loss": 0.82647258, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.85118675, + "num_input_tokens_seen": 212790590, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.21801758, + "step": 9878, + "time_per_iteration": 2.8352272510528564 + }, + { + "auxiliary_loss_clip": 0.01437443, + "auxiliary_loss_mlp": 0.01037754, + "balance_loss_clip": 1.26776171, + "balance_loss_mlp": 1.01713049, + "epoch": 0.5939576131068691, + "flos": 45602712616320.0, + "grad_norm": 2.2502488109752643, + "language_loss": 0.71276057, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.73751259, + "num_input_tokens_seen": 212812265, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.20617676, + "step": 9879, + "time_per_iteration": 3.1024248600006104 + }, + { + "auxiliary_loss_clip": 0.01418874, + "auxiliary_loss_mlp": 0.01034082, + "balance_loss_clip": 1.25489569, + "balance_loss_mlp": 1.01389956, + "epoch": 0.594017736359537, + "flos": 23598477146880.0, + "grad_norm": 1.5451617236148434, + "language_loss": 0.57869309, + "learning_rate": 1.493625013742401e-06, + "loss": 0.60322273, + "num_input_tokens_seen": 212831915, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.20178223, + "step": 9880, + "time_per_iteration": 2.8984971046447754 + }, + { + "auxiliary_loss_clip": 0.01422992, + "auxiliary_loss_mlp": 0.01036849, + "balance_loss_clip": 1.25686717, + "balance_loss_mlp": 1.01683426, + "epoch": 0.594077859612205, + "flos": 29468311724160.0, + "grad_norm": 1.916694986476144, + "language_loss": 0.78418171, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.80878013, + "num_input_tokens_seen": 212851350, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.20007324, + "step": 9881, + "time_per_iteration": 2.9815361499786377 + }, + { + "auxiliary_loss_clip": 0.01424544, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.25634241, + "balance_loss_mlp": 1.0128901, + "epoch": 0.594137982864873, + "flos": 16808767660800.0, + "grad_norm": 2.098255446756221, + "language_loss": 0.83528721, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.85985637, + "num_input_tokens_seen": 212867995, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.19482422, + "step": 9882, + "time_per_iteration": 4.307379722595215 + }, + { + "auxiliary_loss_clip": 0.0142636, + "auxiliary_loss_mlp": 0.01032786, + "balance_loss_clip": 1.2604444, + "balance_loss_mlp": 1.01418912, + "epoch": 0.594198106117541, + "flos": 12757915537920.0, + "grad_norm": 2.279454979894283, + "language_loss": 0.80336481, + "learning_rate": 1.492494784393667e-06, + "loss": 0.82795626, + "num_input_tokens_seen": 212885220, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.18603516, + "step": 9883, + "time_per_iteration": 2.7944443225860596 + }, + { + "auxiliary_loss_clip": 0.01438105, + "auxiliary_loss_mlp": 0.01040163, + "balance_loss_clip": 1.26765847, + "balance_loss_mlp": 1.0189085, + "epoch": 0.5942582293702089, + "flos": 21006457718400.0, + "grad_norm": 1.8124491237032592, + "language_loss": 0.75521201, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.77999473, + "num_input_tokens_seen": 212903195, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.21264648, + "step": 9884, + "time_per_iteration": 5.68001127243042 + }, + { + "auxiliary_loss_clip": 0.01432393, + "auxiliary_loss_mlp": 0.01035575, + "balance_loss_clip": 1.26407063, + "balance_loss_mlp": 1.01491642, + "epoch": 0.5943183526228769, + "flos": 28302657206400.0, + "grad_norm": 1.9355153962290743, + "language_loss": 0.6736542, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.69833386, + "num_input_tokens_seen": 212923340, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.20666504, + "step": 9885, + "time_per_iteration": 2.887298345565796 + }, + { + "auxiliary_loss_clip": 0.01425831, + "auxiliary_loss_mlp": 0.0103717, + "balance_loss_clip": 1.26006126, + "balance_loss_mlp": 1.01792932, + "epoch": 0.5943784758755448, + "flos": 26625807227520.0, + "grad_norm": 2.5238925370733374, + "language_loss": 0.77978694, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.80441689, + "num_input_tokens_seen": 212942755, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.19250488, + "step": 9886, + "time_per_iteration": 2.978957414627075 + }, + { + "auxiliary_loss_clip": 0.01197817, + "auxiliary_loss_mlp": 0.01030017, + "balance_loss_clip": 1.10818744, + "balance_loss_mlp": 1.00932205, + "epoch": 0.5944385991282128, + "flos": 64220300634240.0, + "grad_norm": 0.848722819766542, + "language_loss": 0.64704263, + "learning_rate": 1.490988081420423e-06, + "loss": 0.66932094, + "num_input_tokens_seen": 212999355, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.20703125, + "step": 9887, + "time_per_iteration": 3.250133752822876 + }, + { + "auxiliary_loss_clip": 0.01428926, + "auxiliary_loss_mlp": 0.01034632, + "balance_loss_clip": 1.26177406, + "balance_loss_mlp": 1.01498652, + "epoch": 0.5944987223808808, + "flos": 19581133438080.0, + "grad_norm": 1.95296062315955, + "language_loss": 0.70246494, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.72710049, + "num_input_tokens_seen": 213018570, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.19641113, + "step": 9888, + "time_per_iteration": 2.8967599868774414 + }, + { + "auxiliary_loss_clip": 0.01430053, + "auxiliary_loss_mlp": 0.01040332, + "balance_loss_clip": 1.26283073, + "balance_loss_mlp": 1.02003098, + "epoch": 0.5945588456335488, + "flos": 26188867762560.0, + "grad_norm": 1.9681810352617994, + "language_loss": 0.79974824, + "learning_rate": 1.490234845687366e-06, + "loss": 0.8244521, + "num_input_tokens_seen": 213037735, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.203125, + "step": 9889, + "time_per_iteration": 2.8805134296417236 + }, + { + "auxiliary_loss_clip": 0.01420687, + "auxiliary_loss_mlp": 0.0103493, + "balance_loss_clip": 1.25463462, + "balance_loss_mlp": 1.01518857, + "epoch": 0.5946189688862168, + "flos": 20455555305600.0, + "grad_norm": 1.707502151579249, + "language_loss": 0.72079355, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.74534971, + "num_input_tokens_seen": 213057160, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.1973877, + "step": 9890, + "time_per_iteration": 2.8844571113586426 + }, + { + "auxiliary_loss_clip": 0.01437811, + "auxiliary_loss_mlp": 0.01038808, + "balance_loss_clip": 1.26870775, + "balance_loss_mlp": 1.01893544, + "epoch": 0.5946790921388847, + "flos": 13443801747840.0, + "grad_norm": 2.425890565367597, + "language_loss": 0.70286226, + "learning_rate": 1.489481687275691e-06, + "loss": 0.72762847, + "num_input_tokens_seen": 213073630, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.1986084, + "step": 9891, + "time_per_iteration": 2.8462204933166504 + }, + { + "auxiliary_loss_clip": 0.01420339, + "auxiliary_loss_mlp": 0.01035969, + "balance_loss_clip": 1.25465345, + "balance_loss_mlp": 1.01666903, + "epoch": 0.5947392153915527, + "flos": 20421911157120.0, + "grad_norm": 1.9390211287665915, + "language_loss": 0.54621994, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.57078302, + "num_input_tokens_seen": 213092450, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.19311523, + "step": 9892, + "time_per_iteration": 2.8993146419525146 + }, + { + "auxiliary_loss_clip": 0.01200465, + "auxiliary_loss_mlp": 0.01029461, + "balance_loss_clip": 1.11110806, + "balance_loss_mlp": 1.00886142, + "epoch": 0.5947993386442206, + "flos": 65650239884160.0, + "grad_norm": 0.6537102418789985, + "language_loss": 0.54607761, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.5683769, + "num_input_tokens_seen": 213155465, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.20605469, + "step": 9893, + "time_per_iteration": 3.413184881210327 + }, + { + "auxiliary_loss_clip": 0.01409398, + "auxiliary_loss_mlp": 0.01035822, + "balance_loss_clip": 1.24674439, + "balance_loss_mlp": 1.01596212, + "epoch": 0.5948594618968887, + "flos": 23192919590400.0, + "grad_norm": 2.1032485779837335, + "language_loss": 0.75681806, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.78127027, + "num_input_tokens_seen": 213174875, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.1986084, + "step": 9894, + "time_per_iteration": 2.866065740585327 + }, + { + "auxiliary_loss_clip": 0.0141962, + "auxiliary_loss_mlp": 0.01038083, + "balance_loss_clip": 1.25231504, + "balance_loss_mlp": 1.01816285, + "epoch": 0.5949195851495566, + "flos": 13634871114240.0, + "grad_norm": 2.431023739867387, + "language_loss": 0.78687662, + "learning_rate": 1.487975602873434e-06, + "loss": 0.81145364, + "num_input_tokens_seen": 213192695, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.19934082, + "step": 9895, + "time_per_iteration": 2.8491034507751465 + }, + { + "auxiliary_loss_clip": 0.01449385, + "auxiliary_loss_mlp": 0.01042459, + "balance_loss_clip": 1.27897871, + "balance_loss_mlp": 1.02184796, + "epoch": 0.5949797084022246, + "flos": 19759308036480.0, + "grad_norm": 1.691662524800222, + "language_loss": 0.79738003, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.82229853, + "num_input_tokens_seen": 213211195, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.20605469, + "step": 9896, + "time_per_iteration": 2.8587896823883057 + }, + { + "auxiliary_loss_clip": 0.01432278, + "auxiliary_loss_mlp": 0.01037275, + "balance_loss_clip": 1.26424706, + "balance_loss_mlp": 1.01691389, + "epoch": 0.5950398316548925, + "flos": 25784441326080.0, + "grad_norm": 1.4963758756030519, + "language_loss": 0.8423475, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.86704296, + "num_input_tokens_seen": 213231975, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.20349121, + "step": 9897, + "time_per_iteration": 2.886740207672119 + }, + { + "auxiliary_loss_clip": 0.01431066, + "auxiliary_loss_mlp": 0.01037482, + "balance_loss_clip": 1.26255155, + "balance_loss_mlp": 1.01793206, + "epoch": 0.5950999549075605, + "flos": 23049294036480.0, + "grad_norm": 1.8083325483797432, + "language_loss": 0.72244495, + "learning_rate": 1.486846243389939e-06, + "loss": 0.74713045, + "num_input_tokens_seen": 213249760, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.19555664, + "step": 9898, + "time_per_iteration": 2.846064567565918 + }, + { + "auxiliary_loss_clip": 0.01434998, + "auxiliary_loss_mlp": 0.01039811, + "balance_loss_clip": 1.26301265, + "balance_loss_mlp": 1.01913989, + "epoch": 0.5951600781602284, + "flos": 32457518421120.0, + "grad_norm": 13.003859577839917, + "language_loss": 0.64774364, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.67249167, + "num_input_tokens_seen": 213269890, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.20666504, + "step": 9899, + "time_per_iteration": 2.939169406890869 + }, + { + "auxiliary_loss_clip": 0.01422437, + "auxiliary_loss_mlp": 0.01036285, + "balance_loss_clip": 1.2571646, + "balance_loss_mlp": 1.01791441, + "epoch": 0.5952202014128964, + "flos": 23810205404160.0, + "grad_norm": 1.7518648961733951, + "language_loss": 0.72912264, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.75370991, + "num_input_tokens_seen": 213289400, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.18383789, + "step": 9900, + "time_per_iteration": 2.8975331783294678 + }, + { + "auxiliary_loss_clip": 0.01416117, + "auxiliary_loss_mlp": 0.01039492, + "balance_loss_clip": 1.25222659, + "balance_loss_mlp": 1.01822495, + "epoch": 0.5952803246655644, + "flos": 22502418410880.0, + "grad_norm": 2.2736544324101717, + "language_loss": 0.85591525, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.88047135, + "num_input_tokens_seen": 213308040, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.21289062, + "step": 9901, + "time_per_iteration": 2.8299620151519775 + }, + { + "auxiliary_loss_clip": 0.012034, + "auxiliary_loss_mlp": 0.01034731, + "balance_loss_clip": 1.11221242, + "balance_loss_mlp": 1.011271, + "epoch": 0.5953404479182324, + "flos": 51259705799040.0, + "grad_norm": 0.781208399653823, + "language_loss": 0.58212829, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.60450959, + "num_input_tokens_seen": 213358585, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.234375, + "step": 9902, + "time_per_iteration": 3.2500903606414795 + }, + { + "auxiliary_loss_clip": 0.01421558, + "auxiliary_loss_mlp": 0.0103176, + "balance_loss_clip": 1.25407743, + "balance_loss_mlp": 1.011935, + "epoch": 0.5954005711709004, + "flos": 23123142829440.0, + "grad_norm": 1.6981087205192982, + "language_loss": 0.78165293, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.80618608, + "num_input_tokens_seen": 213379585, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.19812012, + "step": 9903, + "time_per_iteration": 2.9618070125579834 + }, + { + "auxiliary_loss_clip": 0.01419932, + "auxiliary_loss_mlp": 0.01032873, + "balance_loss_clip": 1.25392604, + "balance_loss_mlp": 1.01285791, + "epoch": 0.5954606944235683, + "flos": 35968281638400.0, + "grad_norm": 1.7425380883459551, + "language_loss": 0.79084349, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.81537151, + "num_input_tokens_seen": 213401465, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.20031738, + "step": 9904, + "time_per_iteration": 2.9704926013946533 + }, + { + "auxiliary_loss_clip": 0.0144294, + "auxiliary_loss_mlp": 0.01039776, + "balance_loss_clip": 1.27035642, + "balance_loss_mlp": 1.01866376, + "epoch": 0.5955208176762363, + "flos": 30455339195520.0, + "grad_norm": 1.3844497954274668, + "language_loss": 0.73121512, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.75604236, + "num_input_tokens_seen": 213422720, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.21105957, + "step": 9905, + "time_per_iteration": 4.322917461395264 + }, + { + "auxiliary_loss_clip": 0.01407781, + "auxiliary_loss_mlp": 0.01038515, + "balance_loss_clip": 1.24166715, + "balance_loss_mlp": 1.01752234, + "epoch": 0.5955809409289042, + "flos": 17649228666240.0, + "grad_norm": 1.7816492806695519, + "language_loss": 0.70746499, + "learning_rate": 1.483835475336295e-06, + "loss": 0.73192799, + "num_input_tokens_seen": 213439480, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.20996094, + "step": 9906, + "time_per_iteration": 2.817312717437744 + }, + { + "auxiliary_loss_clip": 0.01417283, + "auxiliary_loss_mlp": 0.01037472, + "balance_loss_clip": 1.25024581, + "balance_loss_mlp": 1.01677716, + "epoch": 0.5956410641815723, + "flos": 24290561894400.0, + "grad_norm": 1.7871048799217477, + "language_loss": 0.75403965, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.77858716, + "num_input_tokens_seen": 213458895, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.20690918, + "step": 9907, + "time_per_iteration": 2.849928617477417 + }, + { + "auxiliary_loss_clip": 0.0142571, + "auxiliary_loss_mlp": 0.01037575, + "balance_loss_clip": 1.25829673, + "balance_loss_mlp": 1.01689219, + "epoch": 0.5957011874342402, + "flos": 35747368686720.0, + "grad_norm": 1.8316266773205743, + "language_loss": 0.68391484, + "learning_rate": 1.483082978767595e-06, + "loss": 0.70854771, + "num_input_tokens_seen": 213481730, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.20678711, + "step": 9908, + "time_per_iteration": 2.9540486335754395 + }, + { + "auxiliary_loss_clip": 0.0140903, + "auxiliary_loss_mlp": 0.01035466, + "balance_loss_clip": 1.24433982, + "balance_loss_mlp": 1.01447296, + "epoch": 0.5957613106869082, + "flos": 21253187468160.0, + "grad_norm": 3.3209808383837913, + "language_loss": 0.77524483, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.79968977, + "num_input_tokens_seen": 213497225, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.20983887, + "step": 9909, + "time_per_iteration": 2.8728034496307373 + }, + { + "auxiliary_loss_clip": 0.01203374, + "auxiliary_loss_mlp": 0.01021593, + "balance_loss_clip": 1.11200833, + "balance_loss_mlp": 1.00251961, + "epoch": 0.5958214339395761, + "flos": 65970456468480.0, + "grad_norm": 0.9250427387541176, + "language_loss": 0.73536485, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.75761455, + "num_input_tokens_seen": 213556890, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.19042969, + "step": 9910, + "time_per_iteration": 3.402803897857666 + }, + { + "auxiliary_loss_clip": 0.01421357, + "auxiliary_loss_mlp": 0.01032816, + "balance_loss_clip": 1.25299978, + "balance_loss_mlp": 1.01139402, + "epoch": 0.5958815571922441, + "flos": 23228283041280.0, + "grad_norm": 1.6506235477489333, + "language_loss": 0.70679414, + "learning_rate": 1.481954380961799e-06, + "loss": 0.73133588, + "num_input_tokens_seen": 213575800, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.21411133, + "step": 9911, + "time_per_iteration": 2.8778703212738037 + }, + { + "auxiliary_loss_clip": 0.01443352, + "auxiliary_loss_mlp": 0.01037448, + "balance_loss_clip": 1.26987743, + "balance_loss_mlp": 1.01682496, + "epoch": 0.595941680444912, + "flos": 16545659293440.0, + "grad_norm": 2.0194738611878815, + "language_loss": 0.66628361, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.6910916, + "num_input_tokens_seen": 213592740, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.20605469, + "step": 9912, + "time_per_iteration": 2.811274528503418 + }, + { + "auxiliary_loss_clip": 0.0142411, + "auxiliary_loss_mlp": 0.01035016, + "balance_loss_clip": 1.25693965, + "balance_loss_mlp": 1.01484537, + "epoch": 0.59600180369758, + "flos": 27830444780160.0, + "grad_norm": 1.8950507029364791, + "language_loss": 0.73449641, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.75908768, + "num_input_tokens_seen": 213611970, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.20166016, + "step": 9913, + "time_per_iteration": 2.8912205696105957 + }, + { + "auxiliary_loss_clip": 0.01433945, + "auxiliary_loss_mlp": 0.01036564, + "balance_loss_clip": 1.26186395, + "balance_loss_mlp": 1.01565504, + "epoch": 0.596061926950248, + "flos": 29502091607040.0, + "grad_norm": 4.477185169591049, + "language_loss": 0.80823386, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.83293891, + "num_input_tokens_seen": 213632230, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.20898438, + "step": 9914, + "time_per_iteration": 2.982706069946289 + }, + { + "auxiliary_loss_clip": 0.0141616, + "auxiliary_loss_mlp": 0.01033859, + "balance_loss_clip": 1.25132012, + "balance_loss_mlp": 1.01403463, + "epoch": 0.596122050202916, + "flos": 16845533700480.0, + "grad_norm": 1.73319835733686, + "language_loss": 0.68069762, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.70519781, + "num_input_tokens_seen": 213649645, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19824219, + "step": 9915, + "time_per_iteration": 2.8324806690216064 + }, + { + "auxiliary_loss_clip": 0.01427663, + "auxiliary_loss_mlp": 0.01036141, + "balance_loss_clip": 1.2582494, + "balance_loss_mlp": 1.01579165, + "epoch": 0.596182173455584, + "flos": 21006593452800.0, + "grad_norm": 1.658135840475047, + "language_loss": 0.79826069, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.82289875, + "num_input_tokens_seen": 213668850, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.20349121, + "step": 9916, + "time_per_iteration": 2.8590149879455566 + }, + { + "auxiliary_loss_clip": 0.01436066, + "auxiliary_loss_mlp": 0.01033184, + "balance_loss_clip": 1.26481545, + "balance_loss_mlp": 1.01314521, + "epoch": 0.5962422967082519, + "flos": 16072949174400.0, + "grad_norm": 1.7627919144793953, + "language_loss": 0.83714473, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.86183721, + "num_input_tokens_seen": 213685695, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.20031738, + "step": 9917, + "time_per_iteration": 4.251293182373047 + }, + { + "auxiliary_loss_clip": 0.01415814, + "auxiliary_loss_mlp": 0.01034547, + "balance_loss_clip": 1.25075209, + "balance_loss_mlp": 1.01415038, + "epoch": 0.5963024199609199, + "flos": 12174590586240.0, + "grad_norm": 1.7920281195697247, + "language_loss": 0.78356713, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.80807072, + "num_input_tokens_seen": 213703515, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.20385742, + "step": 9918, + "time_per_iteration": 2.8501601219177246 + }, + { + "auxiliary_loss_clip": 0.01418744, + "auxiliary_loss_mlp": 0.01041915, + "balance_loss_clip": 1.2532872, + "balance_loss_mlp": 1.02168512, + "epoch": 0.5963625432135878, + "flos": 28085906776320.0, + "grad_norm": 1.6479972622885097, + "language_loss": 0.79149818, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.81610483, + "num_input_tokens_seen": 213724170, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.20239258, + "step": 9919, + "time_per_iteration": 5.7042951583862305 + }, + { + "auxiliary_loss_clip": 0.01421961, + "auxiliary_loss_mlp": 0.0103499, + "balance_loss_clip": 1.25474036, + "balance_loss_mlp": 1.01467633, + "epoch": 0.5964226664662559, + "flos": 19868610769920.0, + "grad_norm": 2.277950339013918, + "language_loss": 0.78388411, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.80845368, + "num_input_tokens_seen": 213740620, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.203125, + "step": 9920, + "time_per_iteration": 2.8314244747161865 + }, + { + "auxiliary_loss_clip": 0.01437907, + "auxiliary_loss_mlp": 0.01039461, + "balance_loss_clip": 1.26951945, + "balance_loss_mlp": 1.01931441, + "epoch": 0.5964827897189238, + "flos": 12940297902720.0, + "grad_norm": 3.049525442228704, + "language_loss": 0.83497584, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.85974944, + "num_input_tokens_seen": 213755390, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.20153809, + "step": 9921, + "time_per_iteration": 2.829024076461792 + }, + { + "auxiliary_loss_clip": 0.01411421, + "auxiliary_loss_mlp": 0.01036813, + "balance_loss_clip": 1.24654472, + "balance_loss_mlp": 1.01652372, + "epoch": 0.5965429129715918, + "flos": 18160016924160.0, + "grad_norm": 2.7814967646847846, + "language_loss": 0.81089163, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.835374, + "num_input_tokens_seen": 213773225, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.20288086, + "step": 9922, + "time_per_iteration": 2.831235408782959 + }, + { + "auxiliary_loss_clip": 0.01408688, + "auxiliary_loss_mlp": 0.01033206, + "balance_loss_clip": 1.24373579, + "balance_loss_mlp": 1.01328611, + "epoch": 0.5966030362242597, + "flos": 21773477134080.0, + "grad_norm": 1.8332506029164501, + "language_loss": 0.77436846, + "learning_rate": 1.477441761580111e-06, + "loss": 0.79878747, + "num_input_tokens_seen": 213791860, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.19909668, + "step": 9923, + "time_per_iteration": 2.866919755935669 + }, + { + "auxiliary_loss_clip": 0.01437854, + "auxiliary_loss_mlp": 0.01037295, + "balance_loss_clip": 1.26688063, + "balance_loss_mlp": 1.01583719, + "epoch": 0.5966631594769277, + "flos": 18816511996800.0, + "grad_norm": 2.03478562973171, + "language_loss": 0.76276445, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.787516, + "num_input_tokens_seen": 213809455, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.21472168, + "step": 9924, + "time_per_iteration": 2.834747314453125 + }, + { + "auxiliary_loss_clip": 0.01422868, + "auxiliary_loss_mlp": 0.0103934, + "balance_loss_clip": 1.25885582, + "balance_loss_mlp": 1.01875234, + "epoch": 0.5967232827295956, + "flos": 14072263027200.0, + "grad_norm": 2.2346607450815767, + "language_loss": 0.67303663, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.69765866, + "num_input_tokens_seen": 213826615, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.20581055, + "step": 9925, + "time_per_iteration": 2.8318073749542236 + }, + { + "auxiliary_loss_clip": 0.01420897, + "auxiliary_loss_mlp": 0.01040888, + "balance_loss_clip": 1.25637448, + "balance_loss_mlp": 1.02043211, + "epoch": 0.5967834059822636, + "flos": 17247109714560.0, + "grad_norm": 1.890542901047272, + "language_loss": 0.72253376, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.74715155, + "num_input_tokens_seen": 213844495, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.20446777, + "step": 9926, + "time_per_iteration": 2.858304023742676 + }, + { + "auxiliary_loss_clip": 0.01429374, + "auxiliary_loss_mlp": 0.01035588, + "balance_loss_clip": 1.25981712, + "balance_loss_mlp": 1.01533461, + "epoch": 0.5968435292349316, + "flos": 42534996912000.0, + "grad_norm": 2.010218982363157, + "language_loss": 0.71287513, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.73752475, + "num_input_tokens_seen": 213869125, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.20251465, + "step": 9927, + "time_per_iteration": 3.037299871444702 + }, + { + "auxiliary_loss_clip": 0.0143152, + "auxiliary_loss_mlp": 0.01039685, + "balance_loss_clip": 1.25957465, + "balance_loss_mlp": 1.01914489, + "epoch": 0.5969036524875996, + "flos": 37644317210880.0, + "grad_norm": 2.0340757040301023, + "language_loss": 0.64379483, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.66850686, + "num_input_tokens_seen": 213891115, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.20544434, + "step": 9928, + "time_per_iteration": 3.0375545024871826 + }, + { + "auxiliary_loss_clip": 0.01411467, + "auxiliary_loss_mlp": 0.01032365, + "balance_loss_clip": 1.24626517, + "balance_loss_mlp": 1.01208699, + "epoch": 0.5969637757402676, + "flos": 23158642014720.0, + "grad_norm": 1.7265241808903646, + "language_loss": 0.70278549, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.72722375, + "num_input_tokens_seen": 213911925, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.20263672, + "step": 9929, + "time_per_iteration": 2.9572389125823975 + }, + { + "auxiliary_loss_clip": 0.01400744, + "auxiliary_loss_mlp": 0.010382, + "balance_loss_clip": 1.23949862, + "balance_loss_mlp": 1.01808953, + "epoch": 0.5970238989929355, + "flos": 24030620663040.0, + "grad_norm": 1.9837695836223377, + "language_loss": 0.78363276, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.80802226, + "num_input_tokens_seen": 213930715, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.2010498, + "step": 9930, + "time_per_iteration": 2.8604960441589355 + }, + { + "auxiliary_loss_clip": 0.01444928, + "auxiliary_loss_mlp": 0.01036373, + "balance_loss_clip": 1.27213454, + "balance_loss_mlp": 1.01492691, + "epoch": 0.5970840222456035, + "flos": 19436195784960.0, + "grad_norm": 1.7039550583737546, + "language_loss": 0.70354915, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.72836214, + "num_input_tokens_seen": 213950015, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.21459961, + "step": 9931, + "time_per_iteration": 2.842463731765747 + }, + { + "auxiliary_loss_clip": 0.01207422, + "auxiliary_loss_mlp": 0.01038926, + "balance_loss_clip": 1.11656106, + "balance_loss_mlp": 1.02242756, + "epoch": 0.5971441454982714, + "flos": 63002994537600.0, + "grad_norm": 0.8597695292956713, + "language_loss": 0.64259553, + "learning_rate": 1.474059168257065e-06, + "loss": 0.66505903, + "num_input_tokens_seen": 214003330, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.16503906, + "step": 9932, + "time_per_iteration": 3.299004554748535 + }, + { + "auxiliary_loss_clip": 0.01413518, + "auxiliary_loss_mlp": 0.01033082, + "balance_loss_clip": 1.24742794, + "balance_loss_mlp": 1.01259017, + "epoch": 0.5972042687509395, + "flos": 20275797139200.0, + "grad_norm": 1.9209217520821718, + "language_loss": 0.75267494, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.77714092, + "num_input_tokens_seen": 214021680, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.2052002, + "step": 9933, + "time_per_iteration": 2.8653359413146973 + }, + { + "auxiliary_loss_clip": 0.01206105, + "auxiliary_loss_mlp": 0.01027405, + "balance_loss_clip": 1.11561394, + "balance_loss_mlp": 1.00947571, + "epoch": 0.5972643920036074, + "flos": 71688448920960.0, + "grad_norm": 0.6602046078011982, + "language_loss": 0.52044594, + "learning_rate": 1.473307699867203e-06, + "loss": 0.54278111, + "num_input_tokens_seen": 214090265, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.1796875, + "step": 9934, + "time_per_iteration": 3.4048213958740234 + }, + { + "auxiliary_loss_clip": 0.01208986, + "auxiliary_loss_mlp": 0.01037755, + "balance_loss_clip": 1.11799705, + "balance_loss_mlp": 1.01982641, + "epoch": 0.5973245152562754, + "flos": 56919893379840.0, + "grad_norm": 0.8283827550397656, + "language_loss": 0.54230714, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.56477451, + "num_input_tokens_seen": 214146375, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.1796875, + "step": 9935, + "time_per_iteration": 3.2338685989379883 + }, + { + "auxiliary_loss_clip": 0.01416362, + "auxiliary_loss_mlp": 0.01040978, + "balance_loss_clip": 1.24845731, + "balance_loss_mlp": 1.01832819, + "epoch": 0.5973846385089433, + "flos": 24173431810560.0, + "grad_norm": 1.765983113685319, + "language_loss": 0.6674161, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.69198954, + "num_input_tokens_seen": 214165340, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.22644043, + "step": 9936, + "time_per_iteration": 2.9413180351257324 + }, + { + "auxiliary_loss_clip": 0.01431306, + "auxiliary_loss_mlp": 0.01035783, + "balance_loss_clip": 1.26169562, + "balance_loss_mlp": 1.01568413, + "epoch": 0.5974447617616113, + "flos": 17678167355520.0, + "grad_norm": 1.9460496678895458, + "language_loss": 0.68030095, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.70497185, + "num_input_tokens_seen": 214181360, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.2010498, + "step": 9937, + "time_per_iteration": 2.9272842407226562 + }, + { + "auxiliary_loss_clip": 0.01428644, + "auxiliary_loss_mlp": 0.01034239, + "balance_loss_clip": 1.25907993, + "balance_loss_mlp": 1.01271033, + "epoch": 0.5975048850142792, + "flos": 22902591836160.0, + "grad_norm": 2.1922047281702812, + "language_loss": 0.78561693, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.81024575, + "num_input_tokens_seen": 214198525, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.21520996, + "step": 9938, + "time_per_iteration": 2.9505600929260254 + }, + { + "auxiliary_loss_clip": 0.01430705, + "auxiliary_loss_mlp": 0.0103834, + "balance_loss_clip": 1.2621336, + "balance_loss_mlp": 1.01596391, + "epoch": 0.5975650082669473, + "flos": 24363867749760.0, + "grad_norm": 1.8231987535158651, + "language_loss": 0.76648831, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.79117882, + "num_input_tokens_seen": 214218710, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.22375488, + "step": 9939, + "time_per_iteration": 2.931148052215576 + }, + { + "auxiliary_loss_clip": 0.0143547, + "auxiliary_loss_mlp": 0.01038585, + "balance_loss_clip": 1.26213551, + "balance_loss_mlp": 1.01687694, + "epoch": 0.5976251315196152, + "flos": 20933197107840.0, + "grad_norm": 2.5158932394194786, + "language_loss": 0.69492805, + "learning_rate": 1.471053774486878e-06, + "loss": 0.71966863, + "num_input_tokens_seen": 214237800, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.21728516, + "step": 9940, + "time_per_iteration": 4.341262578964233 + }, + { + "auxiliary_loss_clip": 0.01413654, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.24969852, + "balance_loss_mlp": 1.01397657, + "epoch": 0.5976852547722832, + "flos": 35857938274560.0, + "grad_norm": 1.360144234209964, + "language_loss": 0.70686722, + "learning_rate": 1.470678190375664e-06, + "loss": 0.73134041, + "num_input_tokens_seen": 214260355, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19677734, + "step": 9941, + "time_per_iteration": 2.9766056537628174 + }, + { + "auxiliary_loss_clip": 0.01415743, + "auxiliary_loss_mlp": 0.01036184, + "balance_loss_clip": 1.24955821, + "balance_loss_mlp": 1.01507163, + "epoch": 0.5977453780249512, + "flos": 12862648546560.0, + "grad_norm": 2.739663306110588, + "language_loss": 0.7825948, + "learning_rate": 1.470302626336386e-06, + "loss": 0.80711406, + "num_input_tokens_seen": 214277120, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.21105957, + "step": 9942, + "time_per_iteration": 2.831942558288574 + }, + { + "auxiliary_loss_clip": 0.0142312, + "auxiliary_loss_mlp": 0.01036273, + "balance_loss_clip": 1.25253654, + "balance_loss_mlp": 1.01615047, + "epoch": 0.5978055012776191, + "flos": 20968832027520.0, + "grad_norm": 1.7013274192150403, + "language_loss": 0.75902462, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.78361857, + "num_input_tokens_seen": 214295300, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.20117188, + "step": 9943, + "time_per_iteration": 2.8651227951049805 + }, + { + "auxiliary_loss_clip": 0.01427486, + "auxiliary_loss_mlp": 0.01036073, + "balance_loss_clip": 1.26006842, + "balance_loss_mlp": 1.01609349, + "epoch": 0.5978656245302871, + "flos": 34071197379840.0, + "grad_norm": 2.114572442349852, + "language_loss": 0.63020086, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.65483642, + "num_input_tokens_seen": 214317050, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.19970703, + "step": 9944, + "time_per_iteration": 2.9593162536621094 + }, + { + "auxiliary_loss_clip": 0.01430103, + "auxiliary_loss_mlp": 0.01035205, + "balance_loss_clip": 1.26333141, + "balance_loss_mlp": 1.01465321, + "epoch": 0.597925747782955, + "flos": 37386050037120.0, + "grad_norm": 1.6194787097154162, + "language_loss": 0.73112476, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.75577784, + "num_input_tokens_seen": 214337470, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.20568848, + "step": 9945, + "time_per_iteration": 2.9799981117248535 + }, + { + "auxiliary_loss_clip": 0.01419584, + "auxiliary_loss_mlp": 0.01033206, + "balance_loss_clip": 1.25154352, + "balance_loss_mlp": 1.01300049, + "epoch": 0.5979858710356231, + "flos": 25385851468800.0, + "grad_norm": 2.5675449181521, + "language_loss": 0.68121505, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.70574296, + "num_input_tokens_seen": 214357975, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.20214844, + "step": 9946, + "time_per_iteration": 2.860201597213745 + }, + { + "auxiliary_loss_clip": 0.01443571, + "auxiliary_loss_mlp": 0.01040642, + "balance_loss_clip": 1.26988876, + "balance_loss_mlp": 1.01924419, + "epoch": 0.598045994288291, + "flos": 13706186198400.0, + "grad_norm": 6.324691890564547, + "language_loss": 0.89784765, + "learning_rate": 1.468425107717461e-06, + "loss": 0.9226898, + "num_input_tokens_seen": 214374125, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.21398926, + "step": 9947, + "time_per_iteration": 2.8091392517089844 + }, + { + "auxiliary_loss_clip": 0.01408077, + "auxiliary_loss_mlp": 0.01040453, + "balance_loss_clip": 1.24460244, + "balance_loss_mlp": 1.02053297, + "epoch": 0.598106117540959, + "flos": 21991449173760.0, + "grad_norm": 2.1116338137624977, + "language_loss": 0.72730184, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.75178719, + "num_input_tokens_seen": 214393395, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19921875, + "step": 9948, + "time_per_iteration": 2.8411688804626465 + }, + { + "auxiliary_loss_clip": 0.01421996, + "auxiliary_loss_mlp": 0.01038772, + "balance_loss_clip": 1.25304389, + "balance_loss_mlp": 1.01695657, + "epoch": 0.5981662407936269, + "flos": 20569337274240.0, + "grad_norm": 2.1443071760378207, + "language_loss": 0.90131724, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.92592496, + "num_input_tokens_seen": 214411550, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.21801758, + "step": 9949, + "time_per_iteration": 2.895925998687744 + }, + { + "auxiliary_loss_clip": 0.0143696, + "auxiliary_loss_mlp": 0.01036231, + "balance_loss_clip": 1.26867831, + "balance_loss_mlp": 1.01590574, + "epoch": 0.5982263640462949, + "flos": 14071855824000.0, + "grad_norm": 1.9863850405199435, + "language_loss": 0.71550465, + "learning_rate": 1.467298838320673e-06, + "loss": 0.74023652, + "num_input_tokens_seen": 214429780, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.203125, + "step": 9950, + "time_per_iteration": 2.838127613067627 + }, + { + "auxiliary_loss_clip": 0.01436497, + "auxiliary_loss_mlp": 0.01037676, + "balance_loss_clip": 1.26759696, + "balance_loss_mlp": 1.01677847, + "epoch": 0.5982864872989628, + "flos": 17714842905600.0, + "grad_norm": 1.7879886012218047, + "language_loss": 0.79022026, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.81496197, + "num_input_tokens_seen": 214447775, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.2088623, + "step": 9951, + "time_per_iteration": 2.872558355331421 + }, + { + "auxiliary_loss_clip": 0.01433274, + "auxiliary_loss_mlp": 0.01044542, + "balance_loss_clip": 1.26377749, + "balance_loss_mlp": 1.02352571, + "epoch": 0.5983466105516309, + "flos": 16773766168320.0, + "grad_norm": 1.5240895302143922, + "language_loss": 0.74575186, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.77052999, + "num_input_tokens_seen": 214467245, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.21032715, + "step": 9952, + "time_per_iteration": 4.402590036392212 + }, + { + "auxiliary_loss_clip": 0.0143116, + "auxiliary_loss_mlp": 0.01037888, + "balance_loss_clip": 1.26067472, + "balance_loss_mlp": 1.01579821, + "epoch": 0.5984067338042988, + "flos": 20051038379520.0, + "grad_norm": 7.414314923600157, + "language_loss": 0.79719228, + "learning_rate": 1.466172750724613e-06, + "loss": 0.82188278, + "num_input_tokens_seen": 214484385, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.22106934, + "step": 9953, + "time_per_iteration": 2.857473134994507 + }, + { + "auxiliary_loss_clip": 0.01418897, + "auxiliary_loss_mlp": 0.01038278, + "balance_loss_clip": 1.25199032, + "balance_loss_mlp": 1.01771414, + "epoch": 0.5984668570569668, + "flos": 26330276321280.0, + "grad_norm": 1.4421645822891698, + "language_loss": 0.70096767, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.72553939, + "num_input_tokens_seen": 214503465, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.20568848, + "step": 9954, + "time_per_iteration": 5.742401361465454 + }, + { + "auxiliary_loss_clip": 0.01438523, + "auxiliary_loss_mlp": 0.01036714, + "balance_loss_clip": 1.26806629, + "balance_loss_mlp": 1.0164485, + "epoch": 0.5985269803096348, + "flos": 20603162401920.0, + "grad_norm": 2.6079539361033586, + "language_loss": 0.74026698, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.76501942, + "num_input_tokens_seen": 214520725, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.20263672, + "step": 9955, + "time_per_iteration": 2.9300239086151123 + }, + { + "auxiliary_loss_clip": 0.01420806, + "auxiliary_loss_mlp": 0.01036497, + "balance_loss_clip": 1.25339913, + "balance_loss_mlp": 1.01626706, + "epoch": 0.5985871035623027, + "flos": 26875432644480.0, + "grad_norm": 5.347663577640157, + "language_loss": 0.6895467, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.71411973, + "num_input_tokens_seen": 214540675, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.20239258, + "step": 9956, + "time_per_iteration": 3.0039994716644287 + }, + { + "auxiliary_loss_clip": 0.01435932, + "auxiliary_loss_mlp": 0.01035748, + "balance_loss_clip": 1.2655735, + "balance_loss_mlp": 1.01418328, + "epoch": 0.5986472268149707, + "flos": 19619166332160.0, + "grad_norm": 2.6101486320467226, + "language_loss": 0.74506783, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.76978457, + "num_input_tokens_seen": 214559910, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.21569824, + "step": 9957, + "time_per_iteration": 2.82476806640625 + }, + { + "auxiliary_loss_clip": 0.01406004, + "auxiliary_loss_mlp": 0.01030084, + "balance_loss_clip": 1.2443974, + "balance_loss_mlp": 1.00993752, + "epoch": 0.5987073500676386, + "flos": 21803546943360.0, + "grad_norm": 2.1378030572601623, + "language_loss": 0.85128629, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.87564719, + "num_input_tokens_seen": 214575960, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.20153809, + "step": 9958, + "time_per_iteration": 2.8701822757720947 + }, + { + "auxiliary_loss_clip": 0.01438973, + "auxiliary_loss_mlp": 0.01034225, + "balance_loss_clip": 1.2689867, + "balance_loss_mlp": 1.01395905, + "epoch": 0.5987674733203067, + "flos": 24324522756480.0, + "grad_norm": 1.9552927654593348, + "language_loss": 0.67177123, + "learning_rate": 1.463921122471864e-06, + "loss": 0.69650316, + "num_input_tokens_seen": 214594230, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.20263672, + "step": 9959, + "time_per_iteration": 2.9065139293670654 + }, + { + "auxiliary_loss_clip": 0.01430947, + "auxiliary_loss_mlp": 0.01037428, + "balance_loss_clip": 1.26348424, + "balance_loss_mlp": 1.01750863, + "epoch": 0.5988275965729746, + "flos": 21328981787520.0, + "grad_norm": 1.6555438057872447, + "language_loss": 0.83910429, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.86378807, + "num_input_tokens_seen": 214613130, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.19934082, + "step": 9960, + "time_per_iteration": 2.866654872894287 + }, + { + "auxiliary_loss_clip": 0.01422909, + "auxiliary_loss_mlp": 0.01035385, + "balance_loss_clip": 1.25636995, + "balance_loss_mlp": 1.01478601, + "epoch": 0.5988877198256426, + "flos": 25128670170240.0, + "grad_norm": 1.5083637312816112, + "language_loss": 0.80116212, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.82574505, + "num_input_tokens_seen": 214634470, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.20581055, + "step": 9961, + "time_per_iteration": 2.8827714920043945 + }, + { + "auxiliary_loss_clip": 0.01421114, + "auxiliary_loss_mlp": 0.01034446, + "balance_loss_clip": 1.25417161, + "balance_loss_mlp": 1.01406133, + "epoch": 0.5989478430783105, + "flos": 26439443320320.0, + "grad_norm": 2.135565947247819, + "language_loss": 0.68027341, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.70482898, + "num_input_tokens_seen": 214654030, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.20385742, + "step": 9962, + "time_per_iteration": 2.868591547012329 + }, + { + "auxiliary_loss_clip": 0.01421818, + "auxiliary_loss_mlp": 0.01035007, + "balance_loss_clip": 1.25529456, + "balance_loss_mlp": 1.01415682, + "epoch": 0.5990079663309785, + "flos": 25790187415680.0, + "grad_norm": 1.5091448543558166, + "language_loss": 0.74762809, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.77219629, + "num_input_tokens_seen": 214676985, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.20849609, + "step": 9963, + "time_per_iteration": 2.9152467250823975 + }, + { + "auxiliary_loss_clip": 0.01414435, + "auxiliary_loss_mlp": 0.01030682, + "balance_loss_clip": 1.25038993, + "balance_loss_mlp": 1.01044011, + "epoch": 0.5990680895836464, + "flos": 36845915886720.0, + "grad_norm": 2.673158830895831, + "language_loss": 0.68808794, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.71253908, + "num_input_tokens_seen": 214700105, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.20214844, + "step": 9964, + "time_per_iteration": 2.968250274658203 + }, + { + "auxiliary_loss_clip": 0.01409271, + "auxiliary_loss_mlp": 0.01037309, + "balance_loss_clip": 1.24787641, + "balance_loss_mlp": 1.01622045, + "epoch": 0.5991282128363145, + "flos": 24144085918080.0, + "grad_norm": 3.188712596812824, + "language_loss": 0.77418762, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.79865348, + "num_input_tokens_seen": 214717885, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.2109375, + "step": 9965, + "time_per_iteration": 2.8654401302337646 + }, + { + "auxiliary_loss_clip": 0.01414926, + "auxiliary_loss_mlp": 0.01032443, + "balance_loss_clip": 1.24754572, + "balance_loss_mlp": 1.01174831, + "epoch": 0.5991883360889824, + "flos": 10310652783360.0, + "grad_norm": 2.1386746866083346, + "language_loss": 0.7826888, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.80716252, + "num_input_tokens_seen": 214733680, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.20690918, + "step": 9966, + "time_per_iteration": 2.816499710083008 + }, + { + "auxiliary_loss_clip": 0.01417983, + "auxiliary_loss_mlp": 0.01029793, + "balance_loss_clip": 1.25211596, + "balance_loss_mlp": 1.00910997, + "epoch": 0.5992484593416504, + "flos": 23961703553280.0, + "grad_norm": 1.6112019788425704, + "language_loss": 0.74538392, + "learning_rate": 1.460920090376422e-06, + "loss": 0.7698617, + "num_input_tokens_seen": 214753285, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.20678711, + "step": 9967, + "time_per_iteration": 2.882910966873169 + }, + { + "auxiliary_loss_clip": 0.01435483, + "auxiliary_loss_mlp": 0.01038827, + "balance_loss_clip": 1.26146936, + "balance_loss_mlp": 1.01727402, + "epoch": 0.5993085825943184, + "flos": 11950646232960.0, + "grad_norm": 2.5436232139242154, + "language_loss": 0.69142509, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.71616822, + "num_input_tokens_seen": 214767810, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.21557617, + "step": 9968, + "time_per_iteration": 2.8864922523498535 + }, + { + "auxiliary_loss_clip": 0.01432358, + "auxiliary_loss_mlp": 0.01035054, + "balance_loss_clip": 1.26281357, + "balance_loss_mlp": 1.01390648, + "epoch": 0.5993687058469863, + "flos": 19036384318080.0, + "grad_norm": 1.7291576602180945, + "language_loss": 0.79960942, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.8242836, + "num_input_tokens_seen": 214786040, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.21142578, + "step": 9969, + "time_per_iteration": 2.842644453048706 + }, + { + "auxiliary_loss_clip": 0.01415484, + "auxiliary_loss_mlp": 0.01034502, + "balance_loss_clip": 1.24802732, + "balance_loss_mlp": 1.01259089, + "epoch": 0.5994288290996543, + "flos": 14291094718080.0, + "grad_norm": 2.250379239589451, + "language_loss": 0.81925118, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.84375107, + "num_input_tokens_seen": 214803110, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.21911621, + "step": 9970, + "time_per_iteration": 2.839820623397827 + }, + { + "auxiliary_loss_clip": 0.01431772, + "auxiliary_loss_mlp": 0.0103605, + "balance_loss_clip": 1.2613771, + "balance_loss_mlp": 1.01360321, + "epoch": 0.5994889523523222, + "flos": 19215554302080.0, + "grad_norm": 2.520664474800559, + "language_loss": 0.62202847, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.64670676, + "num_input_tokens_seen": 214819945, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.2244873, + "step": 9971, + "time_per_iteration": 2.843364953994751 + }, + { + "auxiliary_loss_clip": 0.01415664, + "auxiliary_loss_mlp": 0.01030809, + "balance_loss_clip": 1.25241125, + "balance_loss_mlp": 1.0109961, + "epoch": 0.5995490756049903, + "flos": 28048462064640.0, + "grad_norm": 1.726288315920987, + "language_loss": 0.79427546, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.81874025, + "num_input_tokens_seen": 214838810, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19824219, + "step": 9972, + "time_per_iteration": 3.009695053100586 + }, + { + "auxiliary_loss_clip": 0.01429089, + "auxiliary_loss_mlp": 0.01036838, + "balance_loss_clip": 1.25641131, + "balance_loss_mlp": 1.01539254, + "epoch": 0.5996091988576582, + "flos": 29063161370880.0, + "grad_norm": 2.046955661869127, + "language_loss": 0.77238023, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.79703951, + "num_input_tokens_seen": 214857040, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.21447754, + "step": 9973, + "time_per_iteration": 2.904505968093872 + }, + { + "auxiliary_loss_clip": 0.01413162, + "auxiliary_loss_mlp": 0.01035549, + "balance_loss_clip": 1.24664855, + "balance_loss_mlp": 1.0142101, + "epoch": 0.5996693221103262, + "flos": 20823803884800.0, + "grad_norm": 1.9167236990750331, + "language_loss": 0.66056538, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.68505251, + "num_input_tokens_seen": 214873375, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.21337891, + "step": 9974, + "time_per_iteration": 2.835796594619751 + }, + { + "auxiliary_loss_clip": 0.01411229, + "auxiliary_loss_mlp": 0.01035731, + "balance_loss_clip": 1.24378633, + "balance_loss_mlp": 1.01525092, + "epoch": 0.5997294453629941, + "flos": 23779411678080.0, + "grad_norm": 1.4311967805244572, + "language_loss": 0.75582767, + "learning_rate": 1.457920366566428e-06, + "loss": 0.78029728, + "num_input_tokens_seen": 214893900, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.20483398, + "step": 9975, + "time_per_iteration": 4.282090187072754 + }, + { + "auxiliary_loss_clip": 0.01419222, + "auxiliary_loss_mlp": 0.01034966, + "balance_loss_clip": 1.25218105, + "balance_loss_mlp": 1.01447368, + "epoch": 0.5997895686156621, + "flos": 20969917902720.0, + "grad_norm": 2.0614981982518104, + "language_loss": 0.78167152, + "learning_rate": 1.457545493441611e-06, + "loss": 0.80621344, + "num_input_tokens_seen": 214912110, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.20495605, + "step": 9976, + "time_per_iteration": 2.8894498348236084 + }, + { + "auxiliary_loss_clip": 0.01410009, + "auxiliary_loss_mlp": 0.01035473, + "balance_loss_clip": 1.24467254, + "balance_loss_mlp": 1.01393175, + "epoch": 0.59984969186833, + "flos": 28376958447360.0, + "grad_norm": 4.083608941861804, + "language_loss": 0.76136589, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.78582072, + "num_input_tokens_seen": 214930140, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.2154541, + "step": 9977, + "time_per_iteration": 2.8656837940216064 + }, + { + "auxiliary_loss_clip": 0.01422697, + "auxiliary_loss_mlp": 0.01034738, + "balance_loss_clip": 1.25367892, + "balance_loss_mlp": 1.0136981, + "epoch": 0.5999098151209981, + "flos": 22576357693440.0, + "grad_norm": 2.1834637264469006, + "language_loss": 0.69687814, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.72145253, + "num_input_tokens_seen": 214949200, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.21044922, + "step": 9978, + "time_per_iteration": 2.8588674068450928 + }, + { + "auxiliary_loss_clip": 0.0142624, + "auxiliary_loss_mlp": 0.0103371, + "balance_loss_clip": 1.25671244, + "balance_loss_mlp": 1.0120976, + "epoch": 0.599969938373666, + "flos": 18777483717120.0, + "grad_norm": 2.1496258522459377, + "language_loss": 0.82634258, + "learning_rate": 1.456420997543594e-06, + "loss": 0.85094208, + "num_input_tokens_seen": 214965775, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.21606445, + "step": 9979, + "time_per_iteration": 2.8395378589630127 + }, + { + "auxiliary_loss_clip": 0.01392899, + "auxiliary_loss_mlp": 0.01037783, + "balance_loss_clip": 1.23281693, + "balance_loss_mlp": 1.01657581, + "epoch": 0.600030061626334, + "flos": 11334491539200.0, + "grad_norm": 2.0540262500565545, + "language_loss": 0.70419484, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.72850162, + "num_input_tokens_seen": 214982480, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.21191406, + "step": 9980, + "time_per_iteration": 2.8454771041870117 + }, + { + "auxiliary_loss_clip": 0.01431451, + "auxiliary_loss_mlp": 0.01034795, + "balance_loss_clip": 1.25912619, + "balance_loss_mlp": 1.01297975, + "epoch": 0.600090184879002, + "flos": 16586633099520.0, + "grad_norm": 3.051188424247574, + "language_loss": 0.69942749, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.72408992, + "num_input_tokens_seen": 214998110, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.21826172, + "step": 9981, + "time_per_iteration": 2.8002071380615234 + }, + { + "auxiliary_loss_clip": 0.0140705, + "auxiliary_loss_mlp": 0.01036791, + "balance_loss_clip": 1.24273562, + "balance_loss_mlp": 1.01669216, + "epoch": 0.6001503081316699, + "flos": 23627958773760.0, + "grad_norm": 2.1439053628956994, + "language_loss": 0.79819328, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.82263166, + "num_input_tokens_seen": 215017995, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20092773, + "step": 9982, + "time_per_iteration": 2.8563287258148193 + }, + { + "auxiliary_loss_clip": 0.01411861, + "auxiliary_loss_mlp": 0.01034836, + "balance_loss_clip": 1.24719119, + "balance_loss_mlp": 1.01273429, + "epoch": 0.6002104313843379, + "flos": 20677373153280.0, + "grad_norm": 1.4511020653176327, + "language_loss": 0.73582089, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.76028788, + "num_input_tokens_seen": 215038285, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.22106934, + "step": 9983, + "time_per_iteration": 2.886061668395996 + }, + { + "auxiliary_loss_clip": 0.01414344, + "auxiliary_loss_mlp": 0.0103468, + "balance_loss_clip": 1.24672461, + "balance_loss_mlp": 1.01356781, + "epoch": 0.6002705546370058, + "flos": 22465335657600.0, + "grad_norm": 2.2811898992076127, + "language_loss": 0.79665464, + "learning_rate": 1.454547250154447e-06, + "loss": 0.82114488, + "num_input_tokens_seen": 215057825, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.21105957, + "step": 9984, + "time_per_iteration": 2.8749990463256836 + }, + { + "auxiliary_loss_clip": 0.01417758, + "auxiliary_loss_mlp": 0.01037074, + "balance_loss_clip": 1.25025535, + "balance_loss_mlp": 1.0163914, + "epoch": 0.6003306778896739, + "flos": 25202880921600.0, + "grad_norm": 1.6828953163525888, + "language_loss": 0.83855426, + "learning_rate": 1.4541725626706485e-06, + "loss": 0.86310256, + "num_input_tokens_seen": 215077790, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.20678711, + "step": 9985, + "time_per_iteration": 2.99177622795105 + }, + { + "auxiliary_loss_clip": 0.01412172, + "auxiliary_loss_mlp": 0.01038051, + "balance_loss_clip": 1.2453202, + "balance_loss_mlp": 1.01747561, + "epoch": 0.6003908011423418, + "flos": 26698977348480.0, + "grad_norm": 1.7524742693159485, + "language_loss": 0.72147119, + "learning_rate": 1.4537978958846809e-06, + "loss": 0.74597347, + "num_input_tokens_seen": 215097650, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.20568848, + "step": 9986, + "time_per_iteration": 4.32145619392395 + }, + { + "auxiliary_loss_clip": 0.01411602, + "auxiliary_loss_mlp": 0.01040424, + "balance_loss_clip": 1.24508238, + "balance_loss_mlp": 1.01901436, + "epoch": 0.6004509243950098, + "flos": 22575317063040.0, + "grad_norm": 1.4520838306209765, + "language_loss": 0.72469234, + "learning_rate": 1.4534232498107514e-06, + "loss": 0.74921256, + "num_input_tokens_seen": 215118235, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.2142334, + "step": 9987, + "time_per_iteration": 2.913203239440918 + }, + { + "auxiliary_loss_clip": 0.01409104, + "auxiliary_loss_mlp": 0.01035562, + "balance_loss_clip": 1.24496531, + "balance_loss_mlp": 1.01567745, + "epoch": 0.6005110476476777, + "flos": 19728831024000.0, + "grad_norm": 1.831814193323785, + "language_loss": 0.85854495, + "learning_rate": 1.4530486244630673e-06, + "loss": 0.88299161, + "num_input_tokens_seen": 215136755, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19885254, + "step": 9988, + "time_per_iteration": 5.693461656570435 + }, + { + "auxiliary_loss_clip": 0.01414166, + "auxiliary_loss_mlp": 0.01038937, + "balance_loss_clip": 1.24737799, + "balance_loss_mlp": 1.01826632, + "epoch": 0.6005711709003457, + "flos": 17721539136000.0, + "grad_norm": 1.9725637441131074, + "language_loss": 0.66598058, + "learning_rate": 1.4526740198558346e-06, + "loss": 0.6905117, + "num_input_tokens_seen": 215155225, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.20654297, + "step": 9989, + "time_per_iteration": 2.8730051517486572 + }, + { + "auxiliary_loss_clip": 0.0141585, + "auxiliary_loss_mlp": 0.01039789, + "balance_loss_clip": 1.24980855, + "balance_loss_mlp": 1.01996446, + "epoch": 0.6006312941530136, + "flos": 18523469554560.0, + "grad_norm": 1.5012237621616775, + "language_loss": 0.81024188, + "learning_rate": 1.452299436003257e-06, + "loss": 0.83479834, + "num_input_tokens_seen": 215174815, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.19824219, + "step": 9990, + "time_per_iteration": 2.941312551498413 + }, + { + "auxiliary_loss_clip": 0.01434814, + "auxiliary_loss_mlp": 0.01038907, + "balance_loss_clip": 1.26492417, + "balance_loss_mlp": 1.01910686, + "epoch": 0.6006914174056817, + "flos": 21399030017280.0, + "grad_norm": 2.1063666614280194, + "language_loss": 0.83459985, + "learning_rate": 1.4519248729195403e-06, + "loss": 0.85933709, + "num_input_tokens_seen": 215192045, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.19799805, + "step": 9991, + "time_per_iteration": 2.8937768936157227 + }, + { + "auxiliary_loss_clip": 0.01409981, + "auxiliary_loss_mlp": 0.01038665, + "balance_loss_clip": 1.24629867, + "balance_loss_mlp": 1.01857841, + "epoch": 0.6007515406583496, + "flos": 12758232251520.0, + "grad_norm": 1.7228719878783214, + "language_loss": 0.8342126, + "learning_rate": 1.4515503306188878e-06, + "loss": 0.85869908, + "num_input_tokens_seen": 215209885, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.20092773, + "step": 9992, + "time_per_iteration": 2.8034322261810303 + }, + { + "auxiliary_loss_clip": 0.01411041, + "auxiliary_loss_mlp": 0.01038204, + "balance_loss_clip": 1.24570584, + "balance_loss_mlp": 1.01591206, + "epoch": 0.6008116639110176, + "flos": 19215871015680.0, + "grad_norm": 1.9319271320037874, + "language_loss": 0.67164063, + "learning_rate": 1.4511758091155008e-06, + "loss": 0.69613308, + "num_input_tokens_seen": 215228150, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.22290039, + "step": 9993, + "time_per_iteration": 2.9650959968566895 + }, + { + "auxiliary_loss_clip": 0.01417312, + "auxiliary_loss_mlp": 0.01037712, + "balance_loss_clip": 1.24987221, + "balance_loss_mlp": 1.01694608, + "epoch": 0.6008717871636855, + "flos": 17064048677760.0, + "grad_norm": 2.565875090816295, + "language_loss": 0.82301581, + "learning_rate": 1.4508013084235826e-06, + "loss": 0.84756601, + "num_input_tokens_seen": 215243755, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.20751953, + "step": 9994, + "time_per_iteration": 2.884915828704834 + }, + { + "auxiliary_loss_clip": 0.01397074, + "auxiliary_loss_mlp": 0.01032759, + "balance_loss_clip": 1.23611879, + "balance_loss_mlp": 1.01331639, + "epoch": 0.6009319104163535, + "flos": 20307224292480.0, + "grad_norm": 1.9232116831753792, + "language_loss": 0.73107553, + "learning_rate": 1.4504268285573337e-06, + "loss": 0.75537384, + "num_input_tokens_seen": 215262130, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19433594, + "step": 9995, + "time_per_iteration": 2.8680977821350098 + }, + { + "auxiliary_loss_clip": 0.01426248, + "auxiliary_loss_mlp": 0.01033084, + "balance_loss_clip": 1.25735557, + "balance_loss_mlp": 1.01243711, + "epoch": 0.6009920336690215, + "flos": 21847190192640.0, + "grad_norm": 1.6495150594564798, + "language_loss": 0.81568038, + "learning_rate": 1.4500523695309546e-06, + "loss": 0.84027374, + "num_input_tokens_seen": 215281785, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.20629883, + "step": 9996, + "time_per_iteration": 2.9244449138641357 + }, + { + "auxiliary_loss_clip": 0.01410907, + "auxiliary_loss_mlp": 0.01037685, + "balance_loss_clip": 1.24557757, + "balance_loss_mlp": 1.01679981, + "epoch": 0.6010521569216895, + "flos": 22604934424320.0, + "grad_norm": 1.6624707059383168, + "language_loss": 0.78758281, + "learning_rate": 1.4496779313586447e-06, + "loss": 0.81206876, + "num_input_tokens_seen": 215297550, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.20898438, + "step": 9997, + "time_per_iteration": 2.840813159942627 + }, + { + "auxiliary_loss_clip": 0.01435709, + "auxiliary_loss_mlp": 0.01037847, + "balance_loss_clip": 1.26511931, + "balance_loss_mlp": 1.01734269, + "epoch": 0.6011122801743575, + "flos": 19181005257600.0, + "grad_norm": 1.6069534951859916, + "language_loss": 0.73528969, + "learning_rate": 1.4493035140546028e-06, + "loss": 0.7600252, + "num_input_tokens_seen": 215316360, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.20507812, + "step": 9998, + "time_per_iteration": 2.903951644897461 + }, + { + "auxiliary_loss_clip": 0.0140993, + "auxiliary_loss_mlp": 0.01036446, + "balance_loss_clip": 1.24456048, + "balance_loss_mlp": 1.01552463, + "epoch": 0.6011724034270254, + "flos": 25020996249600.0, + "grad_norm": 1.5355229879150307, + "language_loss": 0.72967315, + "learning_rate": 1.448929117633027e-06, + "loss": 0.75413692, + "num_input_tokens_seen": 215336405, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.20910645, + "step": 9999, + "time_per_iteration": 2.8664000034332275 + }, + { + "auxiliary_loss_clip": 0.01427171, + "auxiliary_loss_mlp": 0.01038914, + "balance_loss_clip": 1.25596333, + "balance_loss_mlp": 1.018255, + "epoch": 0.6012325266796934, + "flos": 21807392751360.0, + "grad_norm": 2.4302224172613904, + "language_loss": 0.78661084, + "learning_rate": 1.4485547421081142e-06, + "loss": 0.81127167, + "num_input_tokens_seen": 215356590, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.20666504, + "step": 10000, + "time_per_iteration": 2.8526506423950195 + }, + { + "auxiliary_loss_clip": 0.01441936, + "auxiliary_loss_mlp": 0.01038218, + "balance_loss_clip": 1.26872432, + "balance_loss_mlp": 1.01648593, + "epoch": 0.6012926499323613, + "flos": 19582400292480.0, + "grad_norm": 2.3240031762050606, + "language_loss": 0.78006399, + "learning_rate": 1.4481803874940608e-06, + "loss": 0.80486548, + "num_input_tokens_seen": 215374295, + "router_z_loss_clip": 1.72851562, + "router_z_loss_mlp": 0.21716309, + "step": 10001, + "time_per_iteration": 2.8284196853637695 + }, + { + "auxiliary_loss_clip": 0.01428564, + "auxiliary_loss_mlp": 0.01034147, + "balance_loss_clip": 1.25706601, + "balance_loss_mlp": 1.01208115, + "epoch": 0.6013527731850293, + "flos": 34874213673600.0, + "grad_norm": 1.655743372298529, + "language_loss": 0.59012389, + "learning_rate": 1.4478060538050624e-06, + "loss": 0.61475098, + "num_input_tokens_seen": 215394535, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.22045898, + "step": 10002, + "time_per_iteration": 2.9833273887634277 + }, + { + "auxiliary_loss_clip": 0.01432154, + "auxiliary_loss_mlp": 0.01035526, + "balance_loss_clip": 1.262537, + "balance_loss_mlp": 1.01501, + "epoch": 0.6014128964376972, + "flos": 23301634141440.0, + "grad_norm": 1.6487940447877627, + "language_loss": 0.78614652, + "learning_rate": 1.447431741055314e-06, + "loss": 0.81082332, + "num_input_tokens_seen": 215414355, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.20532227, + "step": 10003, + "time_per_iteration": 2.8668084144592285 + }, + { + "auxiliary_loss_clip": 0.01424772, + "auxiliary_loss_mlp": 0.01037597, + "balance_loss_clip": 1.25529838, + "balance_loss_mlp": 1.01598454, + "epoch": 0.6014730196903653, + "flos": 24830107862400.0, + "grad_norm": 2.030993092591522, + "language_loss": 0.77980292, + "learning_rate": 1.4470574492590091e-06, + "loss": 0.80442667, + "num_input_tokens_seen": 215428280, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.21606445, + "step": 10004, + "time_per_iteration": 2.8453733921051025 + }, + { + "auxiliary_loss_clip": 0.01427728, + "auxiliary_loss_mlp": 0.01035221, + "balance_loss_clip": 1.25920188, + "balance_loss_mlp": 1.01454961, + "epoch": 0.6015331429430332, + "flos": 23122735626240.0, + "grad_norm": 1.4849398613325957, + "language_loss": 0.72943699, + "learning_rate": 1.4466831784303408e-06, + "loss": 0.75406647, + "num_input_tokens_seen": 215448970, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.20678711, + "step": 10005, + "time_per_iteration": 2.8858256340026855 + }, + { + "auxiliary_loss_clip": 0.01413422, + "auxiliary_loss_mlp": 0.01034927, + "balance_loss_clip": 1.25086534, + "balance_loss_mlp": 1.01467323, + "epoch": 0.6015932661957012, + "flos": 19208903316480.0, + "grad_norm": 1.9763752601409366, + "language_loss": 0.76228809, + "learning_rate": 1.4463089285835026e-06, + "loss": 0.7867716, + "num_input_tokens_seen": 215465260, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.20239258, + "step": 10006, + "time_per_iteration": 2.8283817768096924 + }, + { + "auxiliary_loss_clip": 0.01424452, + "auxiliary_loss_mlp": 0.01037093, + "balance_loss_clip": 1.25552964, + "balance_loss_mlp": 1.01582623, + "epoch": 0.6016533894483691, + "flos": 18122662702080.0, + "grad_norm": 1.7974043117910554, + "language_loss": 0.74869061, + "learning_rate": 1.445934699732685e-06, + "loss": 0.77330601, + "num_input_tokens_seen": 215482725, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.21264648, + "step": 10007, + "time_per_iteration": 2.8655953407287598 + }, + { + "auxiliary_loss_clip": 0.01411024, + "auxiliary_loss_mlp": 0.01036601, + "balance_loss_clip": 1.24469554, + "balance_loss_mlp": 1.01590657, + "epoch": 0.6017135127010371, + "flos": 16225578443520.0, + "grad_norm": 1.7033930072149734, + "language_loss": 0.70636612, + "learning_rate": 1.4455604918920785e-06, + "loss": 0.73084235, + "num_input_tokens_seen": 215500420, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.20690918, + "step": 10008, + "time_per_iteration": 2.954418182373047 + }, + { + "auxiliary_loss_clip": 0.01422087, + "auxiliary_loss_mlp": 0.01036626, + "balance_loss_clip": 1.2548542, + "balance_loss_mlp": 1.01626539, + "epoch": 0.6017736359537051, + "flos": 23455620754560.0, + "grad_norm": 1.8151348355207302, + "language_loss": 0.77385181, + "learning_rate": 1.4451863050758748e-06, + "loss": 0.79843903, + "num_input_tokens_seen": 215522260, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.20349121, + "step": 10009, + "time_per_iteration": 2.9332690238952637 + }, + { + "auxiliary_loss_clip": 0.01409721, + "auxiliary_loss_mlp": 0.01034949, + "balance_loss_clip": 1.24357104, + "balance_loss_mlp": 1.01409936, + "epoch": 0.601833759206373, + "flos": 23524628353920.0, + "grad_norm": 2.1600378512799905, + "language_loss": 0.75031084, + "learning_rate": 1.4448121392982608e-06, + "loss": 0.7747575, + "num_input_tokens_seen": 215541715, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.20837402, + "step": 10010, + "time_per_iteration": 4.3161962032318115 + }, + { + "auxiliary_loss_clip": 0.0120699, + "auxiliary_loss_mlp": 0.01042138, + "balance_loss_clip": 1.113801, + "balance_loss_mlp": 1.01638854, + "epoch": 0.6018938824590411, + "flos": 64026082114560.0, + "grad_norm": 0.8370993538455997, + "language_loss": 0.5512141, + "learning_rate": 1.4444379945734268e-06, + "loss": 0.57370543, + "num_input_tokens_seen": 215603020, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.2578125, + "step": 10011, + "time_per_iteration": 3.4164257049560547 + }, + { + "auxiliary_loss_clip": 0.01421216, + "auxiliary_loss_mlp": 0.01040938, + "balance_loss_clip": 1.25369883, + "balance_loss_mlp": 1.02213919, + "epoch": 0.601954005711709, + "flos": 34652893518720.0, + "grad_norm": 1.3725490756580894, + "language_loss": 0.62279308, + "learning_rate": 1.44406387091556e-06, + "loss": 0.64741462, + "num_input_tokens_seen": 215625115, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.18786621, + "step": 10012, + "time_per_iteration": 2.9786477088928223 + }, + { + "auxiliary_loss_clip": 0.01422201, + "auxiliary_loss_mlp": 0.01034969, + "balance_loss_clip": 1.25699711, + "balance_loss_mlp": 1.01539505, + "epoch": 0.602014128964377, + "flos": 19436422008960.0, + "grad_norm": 2.2785517775468636, + "language_loss": 0.75623471, + "learning_rate": 1.4436897683388462e-06, + "loss": 0.78080642, + "num_input_tokens_seen": 215643730, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19567871, + "step": 10013, + "time_per_iteration": 2.853588581085205 + }, + { + "auxiliary_loss_clip": 0.01404946, + "auxiliary_loss_mlp": 0.01034134, + "balance_loss_clip": 1.24274027, + "balance_loss_mlp": 1.0151794, + "epoch": 0.6020742522170449, + "flos": 28341006814080.0, + "grad_norm": 1.5798438943810529, + "language_loss": 0.81602323, + "learning_rate": 1.4433156868574732e-06, + "loss": 0.84041405, + "num_input_tokens_seen": 215664425, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.1895752, + "step": 10014, + "time_per_iteration": 2.911649703979492 + }, + { + "auxiliary_loss_clip": 0.01405314, + "auxiliary_loss_mlp": 0.01040445, + "balance_loss_clip": 1.24314332, + "balance_loss_mlp": 1.02057266, + "epoch": 0.6021343754697129, + "flos": 22757201735040.0, + "grad_norm": 1.4345887813063787, + "language_loss": 0.73009348, + "learning_rate": 1.442941626485624e-06, + "loss": 0.75455105, + "num_input_tokens_seen": 215684280, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19873047, + "step": 10015, + "time_per_iteration": 2.8793442249298096 + }, + { + "auxiliary_loss_clip": 0.01201253, + "auxiliary_loss_mlp": 0.01037013, + "balance_loss_clip": 1.10958898, + "balance_loss_mlp": 1.01183569, + "epoch": 0.6021944987223808, + "flos": 65779749060480.0, + "grad_norm": 0.830439642638239, + "language_loss": 0.54879618, + "learning_rate": 1.4425675872374848e-06, + "loss": 0.57117879, + "num_input_tokens_seen": 215739780, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.25195312, + "step": 10016, + "time_per_iteration": 3.2668616771698 + }, + { + "auxiliary_loss_clip": 0.01411631, + "auxiliary_loss_mlp": 0.01040038, + "balance_loss_clip": 1.24611521, + "balance_loss_mlp": 1.01883066, + "epoch": 0.6022546219750489, + "flos": 16113244308480.0, + "grad_norm": 1.5000256431311787, + "language_loss": 0.83156085, + "learning_rate": 1.4421935691272381e-06, + "loss": 0.85607761, + "num_input_tokens_seen": 215757885, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.21203613, + "step": 10017, + "time_per_iteration": 2.826338291168213 + }, + { + "auxiliary_loss_clip": 0.01415046, + "auxiliary_loss_mlp": 0.01040712, + "balance_loss_clip": 1.25180674, + "balance_loss_mlp": 1.02017188, + "epoch": 0.6023147452277168, + "flos": 25521513937920.0, + "grad_norm": 1.7717220575553094, + "language_loss": 0.84399569, + "learning_rate": 1.4418195721690677e-06, + "loss": 0.86855328, + "num_input_tokens_seen": 215776415, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.20544434, + "step": 10018, + "time_per_iteration": 2.902188301086426 + }, + { + "auxiliary_loss_clip": 0.01434097, + "auxiliary_loss_mlp": 0.01042594, + "balance_loss_clip": 1.26217449, + "balance_loss_mlp": 1.02206612, + "epoch": 0.6023748684803848, + "flos": 22645817740800.0, + "grad_norm": 1.6672475963439857, + "language_loss": 0.78820264, + "learning_rate": 1.4414455963771549e-06, + "loss": 0.81296957, + "num_input_tokens_seen": 215794865, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.2052002, + "step": 10019, + "time_per_iteration": 2.8547585010528564 + }, + { + "auxiliary_loss_clip": 0.01420536, + "auxiliary_loss_mlp": 0.01038527, + "balance_loss_clip": 1.25369751, + "balance_loss_mlp": 1.01748621, + "epoch": 0.6024349917330527, + "flos": 26220792608640.0, + "grad_norm": 1.7076671137202029, + "language_loss": 0.74540508, + "learning_rate": 1.441071641765681e-06, + "loss": 0.76999569, + "num_input_tokens_seen": 215816840, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.21020508, + "step": 10020, + "time_per_iteration": 2.902641534805298 + }, + { + "auxiliary_loss_clip": 0.01418487, + "auxiliary_loss_mlp": 0.01044504, + "balance_loss_clip": 1.24993134, + "balance_loss_mlp": 1.02309418, + "epoch": 0.6024951149857207, + "flos": 21261693490560.0, + "grad_norm": 1.6868826381474988, + "language_loss": 0.64826679, + "learning_rate": 1.4406977083488264e-06, + "loss": 0.67289668, + "num_input_tokens_seen": 215836100, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.21398926, + "step": 10021, + "time_per_iteration": 4.280750751495361 + }, + { + "auxiliary_loss_clip": 0.01427319, + "auxiliary_loss_mlp": 0.01034267, + "balance_loss_clip": 1.25921917, + "balance_loss_mlp": 1.01350021, + "epoch": 0.6025552382383887, + "flos": 26954846547840.0, + "grad_norm": 1.4406051086451284, + "language_loss": 0.80988163, + "learning_rate": 1.4403237961407704e-06, + "loss": 0.83449751, + "num_input_tokens_seen": 215858480, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.20751953, + "step": 10022, + "time_per_iteration": 2.902776002883911 + }, + { + "auxiliary_loss_clip": 0.01456946, + "auxiliary_loss_mlp": 0.01036216, + "balance_loss_clip": 1.28404737, + "balance_loss_mlp": 1.01523542, + "epoch": 0.6026153614910567, + "flos": 31696426074240.0, + "grad_norm": 1.4342665301888817, + "language_loss": 0.67063439, + "learning_rate": 1.439949905155693e-06, + "loss": 0.69556606, + "num_input_tokens_seen": 215879950, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.20983887, + "step": 10023, + "time_per_iteration": 4.515506029129028 + }, + { + "auxiliary_loss_clip": 0.01430828, + "auxiliary_loss_mlp": 0.01041407, + "balance_loss_clip": 1.26156878, + "balance_loss_mlp": 1.02047348, + "epoch": 0.6026754847437247, + "flos": 29324143232640.0, + "grad_norm": 1.8015004174132123, + "language_loss": 0.75147098, + "learning_rate": 1.4395760354077707e-06, + "loss": 0.77619338, + "num_input_tokens_seen": 215899830, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.20947266, + "step": 10024, + "time_per_iteration": 2.918546676635742 + }, + { + "auxiliary_loss_clip": 0.01423903, + "auxiliary_loss_mlp": 0.01038692, + "balance_loss_clip": 1.25720906, + "balance_loss_mlp": 1.01847386, + "epoch": 0.6027356079963926, + "flos": 23597300782080.0, + "grad_norm": 1.6272524232943106, + "language_loss": 0.72961003, + "learning_rate": 1.4392021869111815e-06, + "loss": 0.75423598, + "num_input_tokens_seen": 215920440, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.20214844, + "step": 10025, + "time_per_iteration": 2.8653087615966797 + }, + { + "auxiliary_loss_clip": 0.01437668, + "auxiliary_loss_mlp": 0.01037707, + "balance_loss_clip": 1.26581359, + "balance_loss_mlp": 1.01657116, + "epoch": 0.6027957312490606, + "flos": 20823532416000.0, + "grad_norm": 2.167089353766132, + "language_loss": 0.69048917, + "learning_rate": 1.4388283596801016e-06, + "loss": 0.71524286, + "num_input_tokens_seen": 215940535, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.21130371, + "step": 10026, + "time_per_iteration": 2.8980712890625 + }, + { + "auxiliary_loss_clip": 0.0140845, + "auxiliary_loss_mlp": 0.01039672, + "balance_loss_clip": 1.24547172, + "balance_loss_mlp": 1.01935875, + "epoch": 0.6028558545017285, + "flos": 19944721802880.0, + "grad_norm": 1.937786198465296, + "language_loss": 0.80917436, + "learning_rate": 1.4384545537287061e-06, + "loss": 0.8336556, + "num_input_tokens_seen": 215958045, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.203125, + "step": 10027, + "time_per_iteration": 2.8540515899658203 + }, + { + "auxiliary_loss_clip": 0.01439578, + "auxiliary_loss_mlp": 0.01038442, + "balance_loss_clip": 1.26726723, + "balance_loss_mlp": 1.01805687, + "epoch": 0.6029159777543965, + "flos": 22831231507200.0, + "grad_norm": 4.27059145727633, + "language_loss": 0.71485114, + "learning_rate": 1.438080769071171e-06, + "loss": 0.7396313, + "num_input_tokens_seen": 215977330, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.20397949, + "step": 10028, + "time_per_iteration": 2.853317975997925 + }, + { + "auxiliary_loss_clip": 0.01426167, + "auxiliary_loss_mlp": 0.0103936, + "balance_loss_clip": 1.25806558, + "balance_loss_mlp": 1.01750851, + "epoch": 0.6029761010070644, + "flos": 23597888964480.0, + "grad_norm": 1.8323808397172243, + "language_loss": 0.85025918, + "learning_rate": 1.437707005721669e-06, + "loss": 0.87491453, + "num_input_tokens_seen": 215997865, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.21850586, + "step": 10029, + "time_per_iteration": 2.889514446258545 + }, + { + "auxiliary_loss_clip": 0.01407668, + "auxiliary_loss_mlp": 0.01037722, + "balance_loss_clip": 1.24416089, + "balance_loss_mlp": 1.01711094, + "epoch": 0.6030362242597325, + "flos": 13670325054720.0, + "grad_norm": 1.8719044439040922, + "language_loss": 0.8099972, + "learning_rate": 1.437333263694373e-06, + "loss": 0.83445108, + "num_input_tokens_seen": 216016230, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.20617676, + "step": 10030, + "time_per_iteration": 2.825075149536133 + }, + { + "auxiliary_loss_clip": 0.01431516, + "auxiliary_loss_mlp": 0.01039788, + "balance_loss_clip": 1.26199794, + "balance_loss_mlp": 1.01927245, + "epoch": 0.6030963475124004, + "flos": 24432830104320.0, + "grad_norm": 1.72533821907604, + "language_loss": 0.7169866, + "learning_rate": 1.4369595430034572e-06, + "loss": 0.74169964, + "num_input_tokens_seen": 216035785, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.20507812, + "step": 10031, + "time_per_iteration": 2.8874855041503906 + }, + { + "auxiliary_loss_clip": 0.01438356, + "auxiliary_loss_mlp": 0.01038688, + "balance_loss_clip": 1.26607192, + "balance_loss_mlp": 1.01744461, + "epoch": 0.6031564707650684, + "flos": 29656304444160.0, + "grad_norm": 1.807129412015419, + "language_loss": 0.73945045, + "learning_rate": 1.4365858436630912e-06, + "loss": 0.76422083, + "num_input_tokens_seen": 216059555, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.21252441, + "step": 10032, + "time_per_iteration": 2.9415769577026367 + }, + { + "auxiliary_loss_clip": 0.01432795, + "auxiliary_loss_mlp": 0.01034744, + "balance_loss_clip": 1.26209974, + "balance_loss_mlp": 1.01311946, + "epoch": 0.6032165940177363, + "flos": 16627697395200.0, + "grad_norm": 1.80950428041311, + "language_loss": 0.68766516, + "learning_rate": 1.4362121656874465e-06, + "loss": 0.71234053, + "num_input_tokens_seen": 216077235, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.21618652, + "step": 10033, + "time_per_iteration": 2.9123125076293945 + }, + { + "auxiliary_loss_clip": 0.01416061, + "auxiliary_loss_mlp": 0.01038528, + "balance_loss_clip": 1.2507627, + "balance_loss_mlp": 1.01684391, + "epoch": 0.6032767172704043, + "flos": 17495694501120.0, + "grad_norm": 2.339841352605086, + "language_loss": 0.76537716, + "learning_rate": 1.4358385090906934e-06, + "loss": 0.78992307, + "num_input_tokens_seen": 216094985, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.21691895, + "step": 10034, + "time_per_iteration": 2.835196018218994 + }, + { + "auxiliary_loss_clip": 0.01438459, + "auxiliary_loss_mlp": 0.0104348, + "balance_loss_clip": 1.26775694, + "balance_loss_mlp": 1.02211773, + "epoch": 0.6033368405230723, + "flos": 26844050736000.0, + "grad_norm": 1.8636821281588323, + "language_loss": 0.7474668, + "learning_rate": 1.4354648738870004e-06, + "loss": 0.77228618, + "num_input_tokens_seen": 216115905, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.21374512, + "step": 10035, + "time_per_iteration": 2.931239128112793 + }, + { + "auxiliary_loss_clip": 0.01414545, + "auxiliary_loss_mlp": 0.01033976, + "balance_loss_clip": 1.25044179, + "balance_loss_mlp": 1.01255465, + "epoch": 0.6033969637757403, + "flos": 16918703821440.0, + "grad_norm": 1.5784937972275932, + "language_loss": 0.87241161, + "learning_rate": 1.435091260090536e-06, + "loss": 0.89689684, + "num_input_tokens_seen": 216132420, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.2142334, + "step": 10036, + "time_per_iteration": 2.8254477977752686 + }, + { + "auxiliary_loss_clip": 0.01432565, + "auxiliary_loss_mlp": 0.0103768, + "balance_loss_clip": 1.26205945, + "balance_loss_mlp": 1.0160439, + "epoch": 0.6034570870284083, + "flos": 22940443751040.0, + "grad_norm": 1.94296253725499, + "language_loss": 0.70504129, + "learning_rate": 1.4347176677154676e-06, + "loss": 0.72974378, + "num_input_tokens_seen": 216149800, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.21618652, + "step": 10037, + "time_per_iteration": 2.88252592086792 + }, + { + "auxiliary_loss_clip": 0.01429115, + "auxiliary_loss_mlp": 0.01034154, + "balance_loss_clip": 1.263008, + "balance_loss_mlp": 1.01268423, + "epoch": 0.6035172102810762, + "flos": 23376387830400.0, + "grad_norm": 2.173334094124257, + "language_loss": 0.85695803, + "learning_rate": 1.4343440967759616e-06, + "loss": 0.88159072, + "num_input_tokens_seen": 216168200, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.21459961, + "step": 10038, + "time_per_iteration": 2.884089231491089 + }, + { + "auxiliary_loss_clip": 0.01422272, + "auxiliary_loss_mlp": 0.01036168, + "balance_loss_clip": 1.25369859, + "balance_loss_mlp": 1.01565194, + "epoch": 0.6035773335337442, + "flos": 20896974005760.0, + "grad_norm": 1.9747241953360997, + "language_loss": 0.77625299, + "learning_rate": 1.4339705472861846e-06, + "loss": 0.8008374, + "num_input_tokens_seen": 216187105, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.20507812, + "step": 10039, + "time_per_iteration": 2.834707498550415 + }, + { + "auxiliary_loss_clip": 0.01411841, + "auxiliary_loss_mlp": 0.01032251, + "balance_loss_clip": 1.24633312, + "balance_loss_mlp": 1.01160407, + "epoch": 0.6036374567864121, + "flos": 24947056967040.0, + "grad_norm": 1.5703846065339016, + "language_loss": 0.72170699, + "learning_rate": 1.433597019260301e-06, + "loss": 0.74614787, + "num_input_tokens_seen": 216205440, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20654297, + "step": 10040, + "time_per_iteration": 2.8654778003692627 + }, + { + "auxiliary_loss_clip": 0.0143412, + "auxiliary_loss_mlp": 0.01039937, + "balance_loss_clip": 1.26354539, + "balance_loss_mlp": 1.01800275, + "epoch": 0.6036975800390801, + "flos": 23158506280320.0, + "grad_norm": 1.9482389368463588, + "language_loss": 0.79184294, + "learning_rate": 1.433223512712475e-06, + "loss": 0.81658351, + "num_input_tokens_seen": 216223130, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.21948242, + "step": 10041, + "time_per_iteration": 2.8346941471099854 + }, + { + "auxiliary_loss_clip": 0.01423414, + "auxiliary_loss_mlp": 0.01035547, + "balance_loss_clip": 1.25762081, + "balance_loss_mlp": 1.01430404, + "epoch": 0.603757703291748, + "flos": 18669855041280.0, + "grad_norm": 1.7299218145969342, + "language_loss": 0.76306069, + "learning_rate": 1.4328500276568704e-06, + "loss": 0.78765035, + "num_input_tokens_seen": 216240260, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.21252441, + "step": 10042, + "time_per_iteration": 2.80134654045105 + }, + { + "auxiliary_loss_clip": 0.0141882, + "auxiliary_loss_mlp": 0.01036935, + "balance_loss_clip": 1.25350654, + "balance_loss_mlp": 1.01612163, + "epoch": 0.6038178265444161, + "flos": 19692064984320.0, + "grad_norm": 1.7169004632223308, + "language_loss": 0.85350287, + "learning_rate": 1.4324765641076498e-06, + "loss": 0.8780604, + "num_input_tokens_seen": 216258510, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.20812988, + "step": 10043, + "time_per_iteration": 2.836409330368042 + }, + { + "auxiliary_loss_clip": 0.01434092, + "auxiliary_loss_mlp": 0.01041229, + "balance_loss_clip": 1.26326048, + "balance_loss_mlp": 1.02017665, + "epoch": 0.603877949797084, + "flos": 22648532428800.0, + "grad_norm": 2.70247007563294, + "language_loss": 0.6977663, + "learning_rate": 1.432103122078974e-06, + "loss": 0.72251958, + "num_input_tokens_seen": 216277550, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.21044922, + "step": 10044, + "time_per_iteration": 2.8495242595672607 + }, + { + "auxiliary_loss_clip": 0.01432094, + "auxiliary_loss_mlp": 0.01033992, + "balance_loss_clip": 1.26361656, + "balance_loss_mlp": 1.01292801, + "epoch": 0.603938073049752, + "flos": 25458976344960.0, + "grad_norm": 1.536693063233035, + "language_loss": 0.78590083, + "learning_rate": 1.4317297015850057e-06, + "loss": 0.81056172, + "num_input_tokens_seen": 216296690, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.21057129, + "step": 10045, + "time_per_iteration": 4.283295154571533 + }, + { + "auxiliary_loss_clip": 0.01413267, + "auxiliary_loss_mlp": 0.01037143, + "balance_loss_clip": 1.24851012, + "balance_loss_mlp": 1.01640093, + "epoch": 0.6039981963024199, + "flos": 22348748511360.0, + "grad_norm": 2.3457492413190963, + "language_loss": 0.78150034, + "learning_rate": 1.4313563026399036e-06, + "loss": 0.80600446, + "num_input_tokens_seen": 216316110, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.20739746, + "step": 10046, + "time_per_iteration": 2.838019609451294 + }, + { + "auxiliary_loss_clip": 0.0142888, + "auxiliary_loss_mlp": 0.01035169, + "balance_loss_clip": 1.26023483, + "balance_loss_mlp": 1.01533306, + "epoch": 0.6040583195550879, + "flos": 20712600869760.0, + "grad_norm": 1.6496707273121405, + "language_loss": 0.87825215, + "learning_rate": 1.430982925257827e-06, + "loss": 0.90289265, + "num_input_tokens_seen": 216333855, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.19836426, + "step": 10047, + "time_per_iteration": 2.8313379287719727 + }, + { + "auxiliary_loss_clip": 0.01420387, + "auxiliary_loss_mlp": 0.01033767, + "balance_loss_clip": 1.25710416, + "balance_loss_mlp": 1.01431262, + "epoch": 0.604118442807756, + "flos": 27174764113920.0, + "grad_norm": 2.445771342892238, + "language_loss": 0.76390481, + "learning_rate": 1.4306095694529358e-06, + "loss": 0.78844631, + "num_input_tokens_seen": 216354890, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19470215, + "step": 10048, + "time_per_iteration": 2.8957207202911377 + }, + { + "auxiliary_loss_clip": 0.01446282, + "auxiliary_loss_mlp": 0.0104116, + "balance_loss_clip": 1.27181101, + "balance_loss_mlp": 1.018677, + "epoch": 0.6041785660604239, + "flos": 30893319290880.0, + "grad_norm": 1.9212023163638519, + "language_loss": 0.66783249, + "learning_rate": 1.430236235239386e-06, + "loss": 0.69270688, + "num_input_tokens_seen": 216376055, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.22509766, + "step": 10049, + "time_per_iteration": 2.918947219848633 + }, + { + "auxiliary_loss_clip": 0.0143711, + "auxiliary_loss_mlp": 0.01040323, + "balance_loss_clip": 1.26982307, + "balance_loss_mlp": 1.01991415, + "epoch": 0.6042386893130919, + "flos": 19947798449280.0, + "grad_norm": 1.70742961389772, + "language_loss": 0.67220199, + "learning_rate": 1.429862922631336e-06, + "loss": 0.6969763, + "num_input_tokens_seen": 216396295, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.20397949, + "step": 10050, + "time_per_iteration": 2.8502004146575928 + }, + { + "auxiliary_loss_clip": 0.0143161, + "auxiliary_loss_mlp": 0.01038585, + "balance_loss_clip": 1.26491499, + "balance_loss_mlp": 1.01783037, + "epoch": 0.6042988125657598, + "flos": 32428624976640.0, + "grad_norm": 1.7445249806418974, + "language_loss": 0.70434201, + "learning_rate": 1.4294896316429408e-06, + "loss": 0.72904396, + "num_input_tokens_seen": 216416605, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.20751953, + "step": 10051, + "time_per_iteration": 3.060915470123291 + }, + { + "auxiliary_loss_clip": 0.01413615, + "auxiliary_loss_mlp": 0.010374, + "balance_loss_clip": 1.24801457, + "balance_loss_mlp": 1.01614523, + "epoch": 0.6043589358184278, + "flos": 17429763548160.0, + "grad_norm": 2.316501530028133, + "language_loss": 0.65810746, + "learning_rate": 1.4291163622883553e-06, + "loss": 0.68261755, + "num_input_tokens_seen": 216435130, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.21228027, + "step": 10052, + "time_per_iteration": 2.8138670921325684 + }, + { + "auxiliary_loss_clip": 0.01426016, + "auxiliary_loss_mlp": 0.01035629, + "balance_loss_clip": 1.25756741, + "balance_loss_mlp": 1.0151968, + "epoch": 0.6044190590710957, + "flos": 27684330762240.0, + "grad_norm": 2.1083682395432723, + "language_loss": 0.69293624, + "learning_rate": 1.4287431145817358e-06, + "loss": 0.71755272, + "num_input_tokens_seen": 216455640, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.2043457, + "step": 10053, + "time_per_iteration": 2.8695178031921387 + }, + { + "auxiliary_loss_clip": 0.0121152, + "auxiliary_loss_mlp": 0.01042066, + "balance_loss_clip": 1.11764359, + "balance_loss_mlp": 1.02041721, + "epoch": 0.6044791823237637, + "flos": 65344438408320.0, + "grad_norm": 0.7289237466168756, + "language_loss": 0.60479277, + "learning_rate": 1.4283698885372336e-06, + "loss": 0.62732863, + "num_input_tokens_seen": 216518130, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.21679688, + "step": 10054, + "time_per_iteration": 3.5018398761749268 + }, + { + "auxiliary_loss_clip": 0.01402784, + "auxiliary_loss_mlp": 0.01035492, + "balance_loss_clip": 1.24033308, + "balance_loss_mlp": 1.01323581, + "epoch": 0.6045393055764317, + "flos": 24501430500480.0, + "grad_norm": 1.9812865588149222, + "language_loss": 0.86460638, + "learning_rate": 1.4279966841690027e-06, + "loss": 0.88898921, + "num_input_tokens_seen": 216536845, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.22253418, + "step": 10055, + "time_per_iteration": 2.8799567222595215 + }, + { + "auxiliary_loss_clip": 0.01425778, + "auxiliary_loss_mlp": 0.01039187, + "balance_loss_clip": 1.25918603, + "balance_loss_mlp": 1.01799154, + "epoch": 0.6045994288290997, + "flos": 19061567688960.0, + "grad_norm": 3.7606989623449585, + "language_loss": 0.74579144, + "learning_rate": 1.4276235014911952e-06, + "loss": 0.77044111, + "num_input_tokens_seen": 216551860, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.21191406, + "step": 10056, + "time_per_iteration": 4.288909435272217 + }, + { + "auxiliary_loss_clip": 0.01418155, + "auxiliary_loss_mlp": 0.01037565, + "balance_loss_clip": 1.25586152, + "balance_loss_mlp": 1.01721585, + "epoch": 0.6046595520817676, + "flos": 26587502864640.0, + "grad_norm": 1.7104904147974485, + "language_loss": 0.80948287, + "learning_rate": 1.4272503405179616e-06, + "loss": 0.83404005, + "num_input_tokens_seen": 216574775, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.20324707, + "step": 10057, + "time_per_iteration": 2.992302656173706 + }, + { + "auxiliary_loss_clip": 0.01419503, + "auxiliary_loss_mlp": 0.01042197, + "balance_loss_clip": 1.25539827, + "balance_loss_mlp": 1.01977324, + "epoch": 0.6047196753344356, + "flos": 13588286952960.0, + "grad_norm": 2.1697783373570605, + "language_loss": 0.75311887, + "learning_rate": 1.4268772012634527e-06, + "loss": 0.77773589, + "num_input_tokens_seen": 216590100, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.2244873, + "step": 10058, + "time_per_iteration": 4.287184238433838 + }, + { + "auxiliary_loss_clip": 0.0141695, + "auxiliary_loss_mlp": 0.01038849, + "balance_loss_clip": 1.25332999, + "balance_loss_mlp": 1.01829743, + "epoch": 0.6047797985871035, + "flos": 25531467793920.0, + "grad_norm": 1.8619015183490688, + "language_loss": 0.72467053, + "learning_rate": 1.4265040837418176e-06, + "loss": 0.74922848, + "num_input_tokens_seen": 216610145, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.20556641, + "step": 10059, + "time_per_iteration": 2.87035870552063 + }, + { + "auxiliary_loss_clip": 0.01424132, + "auxiliary_loss_mlp": 0.01035595, + "balance_loss_clip": 1.25684309, + "balance_loss_mlp": 1.01445901, + "epoch": 0.6048399218397715, + "flos": 20529404098560.0, + "grad_norm": 1.46962679298768, + "language_loss": 0.76864612, + "learning_rate": 1.4261309879672054e-06, + "loss": 0.79324335, + "num_input_tokens_seen": 216630625, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.21130371, + "step": 10060, + "time_per_iteration": 2.8537654876708984 + }, + { + "auxiliary_loss_clip": 0.01424269, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.25785184, + "balance_loss_mlp": 1.01483774, + "epoch": 0.6049000450924396, + "flos": 20417522411520.0, + "grad_norm": 1.9808512718605469, + "language_loss": 0.74630964, + "learning_rate": 1.4257579139537628e-06, + "loss": 0.77090657, + "num_input_tokens_seen": 216649255, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.20581055, + "step": 10061, + "time_per_iteration": 2.824930191040039 + }, + { + "auxiliary_loss_clip": 0.0142856, + "auxiliary_loss_mlp": 0.01036062, + "balance_loss_clip": 1.26022577, + "balance_loss_mlp": 1.01585555, + "epoch": 0.6049601683451075, + "flos": 20751267191040.0, + "grad_norm": 1.7152994558422956, + "language_loss": 0.68238109, + "learning_rate": 1.425384861715639e-06, + "loss": 0.70702732, + "num_input_tokens_seen": 216668100, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.20227051, + "step": 10062, + "time_per_iteration": 2.8743603229522705 + }, + { + "auxiliary_loss_clip": 0.01405901, + "auxiliary_loss_mlp": 0.01037076, + "balance_loss_clip": 1.24253285, + "balance_loss_mlp": 1.0157969, + "epoch": 0.6050202915977755, + "flos": 20092464633600.0, + "grad_norm": 1.9097120412926312, + "language_loss": 0.72187734, + "learning_rate": 1.425011831266978e-06, + "loss": 0.74630713, + "num_input_tokens_seen": 216686125, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.21276855, + "step": 10063, + "time_per_iteration": 2.841268539428711 + }, + { + "auxiliary_loss_clip": 0.01420707, + "auxiliary_loss_mlp": 0.01041058, + "balance_loss_clip": 1.25583065, + "balance_loss_mlp": 1.02029192, + "epoch": 0.6050804148504434, + "flos": 15969256796160.0, + "grad_norm": 1.8103612584916575, + "language_loss": 0.85041749, + "learning_rate": 1.424638822621926e-06, + "loss": 0.87503517, + "num_input_tokens_seen": 216704265, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.2076416, + "step": 10064, + "time_per_iteration": 2.8366026878356934 + }, + { + "auxiliary_loss_clip": 0.01418349, + "auxiliary_loss_mlp": 0.01040687, + "balance_loss_clip": 1.25274086, + "balance_loss_mlp": 1.01924086, + "epoch": 0.6051405381031114, + "flos": 17465443712640.0, + "grad_norm": 2.495623206344067, + "language_loss": 0.80898535, + "learning_rate": 1.4242658357946278e-06, + "loss": 0.83357573, + "num_input_tokens_seen": 216721765, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.21447754, + "step": 10065, + "time_per_iteration": 2.8143727779388428 + }, + { + "auxiliary_loss_clip": 0.01445837, + "auxiliary_loss_mlp": 0.01034288, + "balance_loss_clip": 1.27409029, + "balance_loss_mlp": 1.01327133, + "epoch": 0.6052006613557793, + "flos": 11407390191360.0, + "grad_norm": 1.9503287349897236, + "language_loss": 0.794447, + "learning_rate": 1.423892870799226e-06, + "loss": 0.81924832, + "num_input_tokens_seen": 216738295, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.21020508, + "step": 10066, + "time_per_iteration": 2.8488786220550537 + }, + { + "auxiliary_loss_clip": 0.01424126, + "auxiliary_loss_mlp": 0.01035229, + "balance_loss_clip": 1.25834405, + "balance_loss_mlp": 1.01492715, + "epoch": 0.6052607846084473, + "flos": 24760964528640.0, + "grad_norm": 1.7149704471830003, + "language_loss": 0.74137741, + "learning_rate": 1.4235199276498655e-06, + "loss": 0.76597095, + "num_input_tokens_seen": 216759875, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.203125, + "step": 10067, + "time_per_iteration": 2.9268276691436768 + }, + { + "auxiliary_loss_clip": 0.01430357, + "auxiliary_loss_mlp": 0.0103551, + "balance_loss_clip": 1.26474214, + "balance_loss_mlp": 1.01492214, + "epoch": 0.6053209078611153, + "flos": 20750995722240.0, + "grad_norm": 1.3650825447048458, + "language_loss": 0.69389862, + "learning_rate": 1.4231470063606863e-06, + "loss": 0.7185573, + "num_input_tokens_seen": 216780705, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20581055, + "step": 10068, + "time_per_iteration": 2.955702781677246 + }, + { + "auxiliary_loss_clip": 0.01420067, + "auxiliary_loss_mlp": 0.01035596, + "balance_loss_clip": 1.25137091, + "balance_loss_mlp": 1.01589084, + "epoch": 0.6053810311137833, + "flos": 18962490280320.0, + "grad_norm": 2.244835546165123, + "language_loss": 0.87588, + "learning_rate": 1.4227741069458303e-06, + "loss": 0.90043664, + "num_input_tokens_seen": 216797625, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.19714355, + "step": 10069, + "time_per_iteration": 2.826185703277588 + }, + { + "auxiliary_loss_clip": 0.01422577, + "auxiliary_loss_mlp": 0.01034601, + "balance_loss_clip": 1.25720406, + "balance_loss_mlp": 1.01484823, + "epoch": 0.6054411543664512, + "flos": 23961613063680.0, + "grad_norm": 1.480612070665327, + "language_loss": 0.83948439, + "learning_rate": 1.4224012294194387e-06, + "loss": 0.86405623, + "num_input_tokens_seen": 216817610, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.19750977, + "step": 10070, + "time_per_iteration": 2.8723835945129395 + }, + { + "auxiliary_loss_clip": 0.01430925, + "auxiliary_loss_mlp": 0.01038475, + "balance_loss_clip": 1.26206672, + "balance_loss_mlp": 1.01764894, + "epoch": 0.6055012776191192, + "flos": 20603479115520.0, + "grad_norm": 1.5255306272332645, + "language_loss": 0.8683579, + "learning_rate": 1.4220283737956496e-06, + "loss": 0.89305186, + "num_input_tokens_seen": 216836835, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.20800781, + "step": 10071, + "time_per_iteration": 2.856146812438965 + }, + { + "auxiliary_loss_clip": 0.01436008, + "auxiliary_loss_mlp": 0.01034825, + "balance_loss_clip": 1.26582587, + "balance_loss_mlp": 1.01502442, + "epoch": 0.6055614008717871, + "flos": 30309949094400.0, + "grad_norm": 3.0656283922975387, + "language_loss": 0.77812189, + "learning_rate": 1.421655540088603e-06, + "loss": 0.80283022, + "num_input_tokens_seen": 216856760, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.19812012, + "step": 10072, + "time_per_iteration": 2.9437766075134277 + }, + { + "auxiliary_loss_clip": 0.01431841, + "auxiliary_loss_mlp": 0.01034684, + "balance_loss_clip": 1.26262164, + "balance_loss_mlp": 1.01330948, + "epoch": 0.6056215241244551, + "flos": 27136233527040.0, + "grad_norm": 1.6561982527122652, + "language_loss": 0.74793661, + "learning_rate": 1.4212827283124367e-06, + "loss": 0.77260184, + "num_input_tokens_seen": 216878795, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.21362305, + "step": 10073, + "time_per_iteration": 2.893334150314331 + }, + { + "auxiliary_loss_clip": 0.0121335, + "auxiliary_loss_mlp": 0.01054855, + "balance_loss_clip": 1.11893952, + "balance_loss_mlp": 1.03358769, + "epoch": 0.6056816473771232, + "flos": 56031762337920.0, + "grad_norm": 0.7692476441483127, + "language_loss": 0.5518555, + "learning_rate": 1.4209099384812863e-06, + "loss": 0.57453752, + "num_input_tokens_seen": 216937800, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.21289062, + "step": 10074, + "time_per_iteration": 3.396829128265381 + }, + { + "auxiliary_loss_clip": 0.01427518, + "auxiliary_loss_mlp": 0.01033299, + "balance_loss_clip": 1.26275468, + "balance_loss_mlp": 1.0139395, + "epoch": 0.6057417706297911, + "flos": 23559810825600.0, + "grad_norm": 1.6271795752035823, + "language_loss": 0.82043785, + "learning_rate": 1.4205371706092894e-06, + "loss": 0.84504604, + "num_input_tokens_seen": 216955280, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19348145, + "step": 10075, + "time_per_iteration": 2.867440938949585 + }, + { + "auxiliary_loss_clip": 0.01427238, + "auxiliary_loss_mlp": 0.01031621, + "balance_loss_clip": 1.25907898, + "balance_loss_mlp": 1.01170087, + "epoch": 0.6058018938824591, + "flos": 27755419622400.0, + "grad_norm": 3.074878958282947, + "language_loss": 0.78654784, + "learning_rate": 1.4201644247105813e-06, + "loss": 0.81113642, + "num_input_tokens_seen": 216976950, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.19921875, + "step": 10076, + "time_per_iteration": 2.9752860069274902 + }, + { + "auxiliary_loss_clip": 0.01425817, + "auxiliary_loss_mlp": 0.01038547, + "balance_loss_clip": 1.2562921, + "balance_loss_mlp": 1.01914024, + "epoch": 0.605862017135127, + "flos": 22793515326720.0, + "grad_norm": 1.828903986489873, + "language_loss": 0.73537314, + "learning_rate": 1.4197917007992964e-06, + "loss": 0.7600168, + "num_input_tokens_seen": 216996945, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.1940918, + "step": 10077, + "time_per_iteration": 2.9972007274627686 + }, + { + "auxiliary_loss_clip": 0.01433721, + "auxiliary_loss_mlp": 0.01034596, + "balance_loss_clip": 1.26625001, + "balance_loss_mlp": 1.01487875, + "epoch": 0.605922140387795, + "flos": 21224882206080.0, + "grad_norm": 1.6302412032632612, + "language_loss": 0.56173289, + "learning_rate": 1.4194189988895682e-06, + "loss": 0.58641607, + "num_input_tokens_seen": 217016580, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.19726562, + "step": 10078, + "time_per_iteration": 2.89125394821167 + }, + { + "auxiliary_loss_clip": 0.01433159, + "auxiliary_loss_mlp": 0.01034696, + "balance_loss_clip": 1.26288629, + "balance_loss_mlp": 1.0148592, + "epoch": 0.6059822636404629, + "flos": 27278954184960.0, + "grad_norm": 1.5224686474927496, + "language_loss": 0.70931816, + "learning_rate": 1.4190463189955297e-06, + "loss": 0.73399675, + "num_input_tokens_seen": 217037300, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.19848633, + "step": 10079, + "time_per_iteration": 4.323460578918457 + }, + { + "auxiliary_loss_clip": 0.01423629, + "auxiliary_loss_mlp": 0.01036421, + "balance_loss_clip": 1.2572844, + "balance_loss_mlp": 1.01615584, + "epoch": 0.606042386893131, + "flos": 20641059561600.0, + "grad_norm": 2.250304284406604, + "language_loss": 0.633219, + "learning_rate": 1.4186736611313131e-06, + "loss": 0.65781951, + "num_input_tokens_seen": 217055805, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.20251465, + "step": 10080, + "time_per_iteration": 2.842203378677368 + }, + { + "auxiliary_loss_clip": 0.0142592, + "auxiliary_loss_mlp": 0.01036209, + "balance_loss_clip": 1.25837922, + "balance_loss_mlp": 1.01591969, + "epoch": 0.6061025101457989, + "flos": 23012482752000.0, + "grad_norm": 1.658797494600291, + "language_loss": 0.71906775, + "learning_rate": 1.4183010253110492e-06, + "loss": 0.743689, + "num_input_tokens_seen": 217074175, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.20288086, + "step": 10081, + "time_per_iteration": 2.854065418243408 + }, + { + "auxiliary_loss_clip": 0.01431662, + "auxiliary_loss_mlp": 0.01037223, + "balance_loss_clip": 1.26396298, + "balance_loss_mlp": 1.01717186, + "epoch": 0.6061626333984669, + "flos": 29911133013120.0, + "grad_norm": 1.6593466785502682, + "language_loss": 0.69647634, + "learning_rate": 1.4179284115488691e-06, + "loss": 0.72116518, + "num_input_tokens_seen": 217095695, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.20056152, + "step": 10082, + "time_per_iteration": 2.954836368560791 + }, + { + "auxiliary_loss_clip": 0.01434146, + "auxiliary_loss_mlp": 0.01033681, + "balance_loss_clip": 1.26620007, + "balance_loss_mlp": 1.01440525, + "epoch": 0.6062227566511348, + "flos": 25019955619200.0, + "grad_norm": 2.907212178470548, + "language_loss": 0.66490144, + "learning_rate": 1.4175558198589015e-06, + "loss": 0.68957967, + "num_input_tokens_seen": 217116260, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.19287109, + "step": 10083, + "time_per_iteration": 2.874065399169922 + }, + { + "auxiliary_loss_clip": 0.01443921, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.27439213, + "balance_loss_mlp": 1.01748657, + "epoch": 0.6062828799038028, + "flos": 19473414272640.0, + "grad_norm": 1.8323303789221732, + "language_loss": 0.74760342, + "learning_rate": 1.4171832502552764e-06, + "loss": 0.77241039, + "num_input_tokens_seen": 217134465, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.19274902, + "step": 10084, + "time_per_iteration": 2.840383768081665 + }, + { + "auxiliary_loss_clip": 0.01417908, + "auxiliary_loss_mlp": 0.01035054, + "balance_loss_clip": 1.25159919, + "balance_loss_mlp": 1.01443076, + "epoch": 0.6063430031564707, + "flos": 13597471647360.0, + "grad_norm": 2.461807128961348, + "language_loss": 0.7396906, + "learning_rate": 1.4168107027521204e-06, + "loss": 0.76422024, + "num_input_tokens_seen": 217149920, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.20629883, + "step": 10085, + "time_per_iteration": 2.812438488006592 + }, + { + "auxiliary_loss_clip": 0.01423043, + "auxiliary_loss_mlp": 0.01033675, + "balance_loss_clip": 1.25747979, + "balance_loss_mlp": 1.0147686, + "epoch": 0.6064031264091387, + "flos": 23265184815360.0, + "grad_norm": 2.2003983314769933, + "language_loss": 0.76999092, + "learning_rate": 1.4164381773635605e-06, + "loss": 0.79455817, + "num_input_tokens_seen": 217168165, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.18896484, + "step": 10086, + "time_per_iteration": 2.8749682903289795 + }, + { + "auxiliary_loss_clip": 0.01418508, + "auxiliary_loss_mlp": 0.01036234, + "balance_loss_clip": 1.25351739, + "balance_loss_mlp": 1.01681519, + "epoch": 0.6064632496618068, + "flos": 22469407689600.0, + "grad_norm": 1.3531479301470306, + "language_loss": 0.73350137, + "learning_rate": 1.4160656741037246e-06, + "loss": 0.75804877, + "num_input_tokens_seen": 217190070, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19421387, + "step": 10087, + "time_per_iteration": 2.877917528152466 + }, + { + "auxiliary_loss_clip": 0.01422341, + "auxiliary_loss_mlp": 0.01033445, + "balance_loss_clip": 1.25856662, + "balance_loss_mlp": 1.01502752, + "epoch": 0.6065233729144747, + "flos": 25129439331840.0, + "grad_norm": 1.6036406518268422, + "language_loss": 0.84146088, + "learning_rate": 1.4156931929867355e-06, + "loss": 0.86601877, + "num_input_tokens_seen": 217209370, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.1842041, + "step": 10088, + "time_per_iteration": 2.8852734565734863 + }, + { + "auxiliary_loss_clip": 0.0142134, + "auxiliary_loss_mlp": 0.01035128, + "balance_loss_clip": 1.25667977, + "balance_loss_mlp": 1.01482701, + "epoch": 0.6065834961671427, + "flos": 23487862314240.0, + "grad_norm": 2.1816153465833095, + "language_loss": 0.72101092, + "learning_rate": 1.4153207340267201e-06, + "loss": 0.74557561, + "num_input_tokens_seen": 217226990, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.20300293, + "step": 10089, + "time_per_iteration": 2.8861777782440186 + }, + { + "auxiliary_loss_clip": 0.01426726, + "auxiliary_loss_mlp": 0.01034734, + "balance_loss_clip": 1.2606318, + "balance_loss_mlp": 1.01500452, + "epoch": 0.6066436194198106, + "flos": 17028232778880.0, + "grad_norm": 2.9726808424664255, + "language_loss": 0.83606994, + "learning_rate": 1.4149482972378009e-06, + "loss": 0.86068451, + "num_input_tokens_seen": 217244585, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.19726562, + "step": 10090, + "time_per_iteration": 2.8743653297424316 + }, + { + "auxiliary_loss_clip": 0.01453826, + "auxiliary_loss_mlp": 0.01038506, + "balance_loss_clip": 1.27800465, + "balance_loss_mlp": 1.01790667, + "epoch": 0.6067037426724786, + "flos": 18523831512960.0, + "grad_norm": 2.2801283107408, + "language_loss": 0.76909328, + "learning_rate": 1.4145758826341e-06, + "loss": 0.7940166, + "num_input_tokens_seen": 217263435, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.20593262, + "step": 10091, + "time_per_iteration": 4.320406198501587 + }, + { + "auxiliary_loss_clip": 0.01425417, + "auxiliary_loss_mlp": 0.01035828, + "balance_loss_clip": 1.26072383, + "balance_loss_mlp": 1.01574111, + "epoch": 0.6067638659251465, + "flos": 22356123413760.0, + "grad_norm": 1.6586412268259023, + "language_loss": 0.80166662, + "learning_rate": 1.4142034902297415e-06, + "loss": 0.8262791, + "num_input_tokens_seen": 217283725, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.20092773, + "step": 10092, + "time_per_iteration": 4.348551273345947 + }, + { + "auxiliary_loss_clip": 0.01418786, + "auxiliary_loss_mlp": 0.01039463, + "balance_loss_clip": 1.25190687, + "balance_loss_mlp": 1.0201993, + "epoch": 0.6068239891778145, + "flos": 12456683786880.0, + "grad_norm": 6.1082751585282296, + "language_loss": 0.76473641, + "learning_rate": 1.4138311200388444e-06, + "loss": 0.78931892, + "num_input_tokens_seen": 217301120, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.19287109, + "step": 10093, + "time_per_iteration": 2.9546127319335938 + }, + { + "auxiliary_loss_clip": 0.01414271, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.25149226, + "balance_loss_mlp": 1.01315022, + "epoch": 0.6068841124304825, + "flos": 23196403440000.0, + "grad_norm": 2.003270783522232, + "language_loss": 0.8786301, + "learning_rate": 1.4134587720755304e-06, + "loss": 0.90310413, + "num_input_tokens_seen": 217319585, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.1998291, + "step": 10094, + "time_per_iteration": 2.8740999698638916 + }, + { + "auxiliary_loss_clip": 0.01423226, + "auxiliary_loss_mlp": 0.01032551, + "balance_loss_clip": 1.25848532, + "balance_loss_mlp": 1.01197541, + "epoch": 0.6069442356831505, + "flos": 18597046878720.0, + "grad_norm": 1.568352499144357, + "language_loss": 0.73019326, + "learning_rate": 1.413086446353919e-06, + "loss": 0.75475109, + "num_input_tokens_seen": 217338880, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.20593262, + "step": 10095, + "time_per_iteration": 2.8532907962799072 + }, + { + "auxiliary_loss_clip": 0.01421314, + "auxiliary_loss_mlp": 0.01034234, + "balance_loss_clip": 1.25517046, + "balance_loss_mlp": 1.01491046, + "epoch": 0.6070043589358184, + "flos": 20970325105920.0, + "grad_norm": 2.1264768339000786, + "language_loss": 0.7763626, + "learning_rate": 1.4127141428881273e-06, + "loss": 0.8009181, + "num_input_tokens_seen": 217357480, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.19311523, + "step": 10096, + "time_per_iteration": 2.912102460861206 + }, + { + "auxiliary_loss_clip": 0.01431138, + "auxiliary_loss_mlp": 0.01038944, + "balance_loss_clip": 1.26334691, + "balance_loss_mlp": 1.01920271, + "epoch": 0.6070644821884864, + "flos": 11699889696000.0, + "grad_norm": 1.8822534594187885, + "language_loss": 0.80851829, + "learning_rate": 1.4123418616922749e-06, + "loss": 0.83321917, + "num_input_tokens_seen": 217374575, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.19726562, + "step": 10097, + "time_per_iteration": 2.836203098297119 + }, + { + "auxiliary_loss_clip": 0.01410907, + "auxiliary_loss_mlp": 0.01031278, + "balance_loss_clip": 1.24871349, + "balance_loss_mlp": 1.01252639, + "epoch": 0.6071246054411543, + "flos": 19318070315520.0, + "grad_norm": 1.5300611884914288, + "language_loss": 0.68417788, + "learning_rate": 1.411969602780478e-06, + "loss": 0.70859975, + "num_input_tokens_seen": 217392950, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.1875, + "step": 10098, + "time_per_iteration": 2.8667104244232178 + }, + { + "auxiliary_loss_clip": 0.01416392, + "auxiliary_loss_mlp": 0.0103427, + "balance_loss_clip": 1.25212479, + "balance_loss_mlp": 1.01499414, + "epoch": 0.6071847286938223, + "flos": 17758033706880.0, + "grad_norm": 2.005935836725218, + "language_loss": 0.81232518, + "learning_rate": 1.4115973661668523e-06, + "loss": 0.83683175, + "num_input_tokens_seen": 217412145, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.19274902, + "step": 10099, + "time_per_iteration": 2.8730127811431885 + }, + { + "auxiliary_loss_clip": 0.01432071, + "auxiliary_loss_mlp": 0.01038812, + "balance_loss_clip": 1.26165271, + "balance_loss_mlp": 1.0184629, + "epoch": 0.6072448519464904, + "flos": 22647627532800.0, + "grad_norm": 1.7395607239957527, + "language_loss": 0.71509218, + "learning_rate": 1.4112251518655133e-06, + "loss": 0.73980093, + "num_input_tokens_seen": 217432080, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.20349121, + "step": 10100, + "time_per_iteration": 2.8692426681518555 + }, + { + "auxiliary_loss_clip": 0.01435334, + "auxiliary_loss_mlp": 0.01038776, + "balance_loss_clip": 1.26857233, + "balance_loss_mlp": 1.01793778, + "epoch": 0.6073049751991583, + "flos": 19546901107200.0, + "grad_norm": 2.133749033704126, + "language_loss": 0.71812844, + "learning_rate": 1.4108529598905764e-06, + "loss": 0.7428695, + "num_input_tokens_seen": 217450945, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.20849609, + "step": 10101, + "time_per_iteration": 2.8186120986938477 + }, + { + "auxiliary_loss_clip": 0.01422228, + "auxiliary_loss_mlp": 0.01033908, + "balance_loss_clip": 1.25703263, + "balance_loss_mlp": 1.01404774, + "epoch": 0.6073650984518263, + "flos": 28306186300800.0, + "grad_norm": 1.8713904567450006, + "language_loss": 0.69684887, + "learning_rate": 1.410480790256154e-06, + "loss": 0.72141027, + "num_input_tokens_seen": 217473105, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19873047, + "step": 10102, + "time_per_iteration": 2.9369587898254395 + }, + { + "auxiliary_loss_clip": 0.01434461, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.26597905, + "balance_loss_mlp": 1.01256311, + "epoch": 0.6074252217044942, + "flos": 25674957613440.0, + "grad_norm": 2.066322947459648, + "language_loss": 0.7453438, + "learning_rate": 1.4101086429763589e-06, + "loss": 0.77001339, + "num_input_tokens_seen": 217491780, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.19934082, + "step": 10103, + "time_per_iteration": 2.8683276176452637 + }, + { + "auxiliary_loss_clip": 0.01454796, + "auxiliary_loss_mlp": 0.01039198, + "balance_loss_clip": 1.28114152, + "balance_loss_mlp": 1.01950431, + "epoch": 0.6074853449571622, + "flos": 22867454609280.0, + "grad_norm": 1.5630486315383372, + "language_loss": 0.77085108, + "learning_rate": 1.4097365180653032e-06, + "loss": 0.79579103, + "num_input_tokens_seen": 217510605, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.19702148, + "step": 10104, + "time_per_iteration": 2.8466057777404785 + }, + { + "auxiliary_loss_clip": 0.0121091, + "auxiliary_loss_mlp": 0.01055487, + "balance_loss_clip": 1.11896706, + "balance_loss_mlp": 1.03746212, + "epoch": 0.6075454682098301, + "flos": 67141947565440.0, + "grad_norm": 0.7236744207171859, + "language_loss": 0.56102598, + "learning_rate": 1.4093644155370977e-06, + "loss": 0.58368993, + "num_input_tokens_seen": 217574815, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.18066406, + "step": 10105, + "time_per_iteration": 3.4074668884277344 + }, + { + "auxiliary_loss_clip": 0.0121107, + "auxiliary_loss_mlp": 0.01064982, + "balance_loss_clip": 1.12093854, + "balance_loss_mlp": 1.04600394, + "epoch": 0.6076055914624982, + "flos": 70740522236160.0, + "grad_norm": 0.7831622439921496, + "language_loss": 0.56858915, + "learning_rate": 1.4089923354058533e-06, + "loss": 0.59134966, + "num_input_tokens_seen": 217632375, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.18945312, + "step": 10106, + "time_per_iteration": 3.3174498081207275 + }, + { + "auxiliary_loss_clip": 0.01418586, + "auxiliary_loss_mlp": 0.01033736, + "balance_loss_clip": 1.25406075, + "balance_loss_mlp": 1.01484179, + "epoch": 0.6076657147151661, + "flos": 28375827327360.0, + "grad_norm": 1.756896408627934, + "language_loss": 0.69618869, + "learning_rate": 1.4086202776856784e-06, + "loss": 0.72071189, + "num_input_tokens_seen": 217653055, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.18896484, + "step": 10107, + "time_per_iteration": 2.912600040435791 + }, + { + "auxiliary_loss_clip": 0.01441758, + "auxiliary_loss_mlp": 0.01030004, + "balance_loss_clip": 1.27101731, + "balance_loss_mlp": 1.01113355, + "epoch": 0.6077258379678341, + "flos": 15058249868160.0, + "grad_norm": 1.6613806020027926, + "language_loss": 0.81354719, + "learning_rate": 1.4082482423906815e-06, + "loss": 0.83826482, + "num_input_tokens_seen": 217671520, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.18884277, + "step": 10108, + "time_per_iteration": 2.87605881690979 + }, + { + "auxiliary_loss_clip": 0.0142944, + "auxiliary_loss_mlp": 0.01036638, + "balance_loss_clip": 1.25945449, + "balance_loss_mlp": 1.01630068, + "epoch": 0.607785961220502, + "flos": 36179195489280.0, + "grad_norm": 1.8958697554232684, + "language_loss": 0.71926618, + "learning_rate": 1.4078762295349714e-06, + "loss": 0.743927, + "num_input_tokens_seen": 217691880, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.20336914, + "step": 10109, + "time_per_iteration": 2.9811882972717285 + }, + { + "auxiliary_loss_clip": 0.01404446, + "auxiliary_loss_mlp": 0.01032948, + "balance_loss_clip": 1.2435993, + "balance_loss_mlp": 1.01354122, + "epoch": 0.60784608447317, + "flos": 22533347871360.0, + "grad_norm": 1.6361417775256606, + "language_loss": 0.80901331, + "learning_rate": 1.407504239132653e-06, + "loss": 0.83338726, + "num_input_tokens_seen": 217710530, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19396973, + "step": 10110, + "time_per_iteration": 2.875927209854126 + }, + { + "auxiliary_loss_clip": 0.01423465, + "auxiliary_loss_mlp": 0.01034316, + "balance_loss_clip": 1.25600863, + "balance_loss_mlp": 1.01494479, + "epoch": 0.6079062077258379, + "flos": 23851495923840.0, + "grad_norm": 2.678791910701082, + "language_loss": 0.72114229, + "learning_rate": 1.4071322711978338e-06, + "loss": 0.74572015, + "num_input_tokens_seen": 217728650, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.19360352, + "step": 10111, + "time_per_iteration": 2.8420586585998535 + }, + { + "auxiliary_loss_clip": 0.01431525, + "auxiliary_loss_mlp": 0.01038538, + "balance_loss_clip": 1.26159692, + "balance_loss_mlp": 1.01787901, + "epoch": 0.6079663309785059, + "flos": 23377292726400.0, + "grad_norm": 2.7334102668644933, + "language_loss": 0.65331966, + "learning_rate": 1.4067603257446186e-06, + "loss": 0.6780203, + "num_input_tokens_seen": 217747135, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.20654297, + "step": 10112, + "time_per_iteration": 2.8827414512634277 + }, + { + "auxiliary_loss_clip": 0.01203155, + "auxiliary_loss_mlp": 0.01023114, + "balance_loss_clip": 1.11422968, + "balance_loss_mlp": 1.00671101, + "epoch": 0.6080264542311739, + "flos": 71415205718400.0, + "grad_norm": 0.6509523165913869, + "language_loss": 0.49565384, + "learning_rate": 1.4063884027871105e-06, + "loss": 0.51791656, + "num_input_tokens_seen": 217811860, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.1640625, + "step": 10113, + "time_per_iteration": 3.4065239429473877 + }, + { + "auxiliary_loss_clip": 0.01205416, + "auxiliary_loss_mlp": 0.01022868, + "balance_loss_clip": 1.11458111, + "balance_loss_mlp": 1.0025543, + "epoch": 0.6080865774838419, + "flos": 66560794364160.0, + "grad_norm": 0.8449235158214998, + "language_loss": 0.57009709, + "learning_rate": 1.4060165023394147e-06, + "loss": 0.59237993, + "num_input_tokens_seen": 217866510, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.203125, + "step": 10114, + "time_per_iteration": 4.616581678390503 + }, + { + "auxiliary_loss_clip": 0.01430315, + "auxiliary_loss_mlp": 0.0103172, + "balance_loss_clip": 1.26233912, + "balance_loss_mlp": 1.01226521, + "epoch": 0.6081467007365099, + "flos": 19217183114880.0, + "grad_norm": 2.0633522053060984, + "language_loss": 0.71682966, + "learning_rate": 1.4056446244156317e-06, + "loss": 0.74145007, + "num_input_tokens_seen": 217885650, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.19470215, + "step": 10115, + "time_per_iteration": 2.87162709236145 + }, + { + "auxiliary_loss_clip": 0.01425477, + "auxiliary_loss_mlp": 0.0103748, + "balance_loss_clip": 1.25943995, + "balance_loss_mlp": 1.01759553, + "epoch": 0.6082068239891778, + "flos": 24177684821760.0, + "grad_norm": 1.616913455947331, + "language_loss": 0.73286021, + "learning_rate": 1.4052727690298642e-06, + "loss": 0.7574898, + "num_input_tokens_seen": 217905300, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.19873047, + "step": 10116, + "time_per_iteration": 2.937653064727783 + }, + { + "auxiliary_loss_clip": 0.01438173, + "auxiliary_loss_mlp": 0.01041775, + "balance_loss_clip": 1.26838684, + "balance_loss_mlp": 1.02091324, + "epoch": 0.6082669472418458, + "flos": 37427023843200.0, + "grad_norm": 2.24701180660584, + "language_loss": 0.54612648, + "learning_rate": 1.4049009361962138e-06, + "loss": 0.57092595, + "num_input_tokens_seen": 217927845, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.20849609, + "step": 10117, + "time_per_iteration": 3.005751848220825 + }, + { + "auxiliary_loss_clip": 0.01424504, + "auxiliary_loss_mlp": 0.01039631, + "balance_loss_clip": 1.25680304, + "balance_loss_mlp": 1.02064097, + "epoch": 0.6083270704945137, + "flos": 15093839543040.0, + "grad_norm": 1.721983565983604, + "language_loss": 0.70732605, + "learning_rate": 1.4045291259287786e-06, + "loss": 0.73196745, + "num_input_tokens_seen": 217946145, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.18969727, + "step": 10118, + "time_per_iteration": 2.8599250316619873 + }, + { + "auxiliary_loss_clip": 0.01421611, + "auxiliary_loss_mlp": 0.01040284, + "balance_loss_clip": 1.25496936, + "balance_loss_mlp": 1.02056718, + "epoch": 0.6083871937471818, + "flos": 20678368538880.0, + "grad_norm": 3.5683144194271375, + "language_loss": 0.75048733, + "learning_rate": 1.4041573382416588e-06, + "loss": 0.77510625, + "num_input_tokens_seen": 217965190, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.19726562, + "step": 10119, + "time_per_iteration": 2.8657584190368652 + }, + { + "auxiliary_loss_clip": 0.01423256, + "auxiliary_loss_mlp": 0.01044306, + "balance_loss_clip": 1.25787544, + "balance_loss_mlp": 1.02399302, + "epoch": 0.6084473169998497, + "flos": 21516838773120.0, + "grad_norm": 1.732719927502735, + "language_loss": 0.68313682, + "learning_rate": 1.4037855731489525e-06, + "loss": 0.70781243, + "num_input_tokens_seen": 217983625, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.20324707, + "step": 10120, + "time_per_iteration": 2.8730695247650146 + }, + { + "auxiliary_loss_clip": 0.01445988, + "auxiliary_loss_mlp": 0.0104696, + "balance_loss_clip": 1.27523732, + "balance_loss_mlp": 1.02609801, + "epoch": 0.6085074402525177, + "flos": 26881314468480.0, + "grad_norm": 1.677126820575194, + "language_loss": 0.75314432, + "learning_rate": 1.4034138306647571e-06, + "loss": 0.77807379, + "num_input_tokens_seen": 218006005, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.20861816, + "step": 10121, + "time_per_iteration": 2.8982303142547607 + }, + { + "auxiliary_loss_clip": 0.01421195, + "auxiliary_loss_mlp": 0.01040305, + "balance_loss_clip": 1.25611925, + "balance_loss_mlp": 1.02101707, + "epoch": 0.6085675635051856, + "flos": 10897099626240.0, + "grad_norm": 2.1440082105779013, + "language_loss": 0.81230414, + "learning_rate": 1.4030421108031685e-06, + "loss": 0.83691913, + "num_input_tokens_seen": 218024195, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19287109, + "step": 10122, + "time_per_iteration": 2.814009666442871 + }, + { + "auxiliary_loss_clip": 0.01429409, + "auxiliary_loss_mlp": 0.01041489, + "balance_loss_clip": 1.26423025, + "balance_loss_mlp": 1.02177179, + "epoch": 0.6086276867578536, + "flos": 34875978220800.0, + "grad_norm": 1.5738349025424445, + "language_loss": 0.56411862, + "learning_rate": 1.402670413578284e-06, + "loss": 0.58882767, + "num_input_tokens_seen": 218047190, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19714355, + "step": 10123, + "time_per_iteration": 2.9800362586975098 + }, + { + "auxiliary_loss_clip": 0.01412724, + "auxiliary_loss_mlp": 0.01048674, + "balance_loss_clip": 1.25042987, + "balance_loss_mlp": 1.02853966, + "epoch": 0.6086878100105215, + "flos": 20057327406720.0, + "grad_norm": 1.8054022382702537, + "language_loss": 0.75065553, + "learning_rate": 1.4022987390041965e-06, + "loss": 0.77526951, + "num_input_tokens_seen": 218065945, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.20129395, + "step": 10124, + "time_per_iteration": 2.8338046073913574 + }, + { + "auxiliary_loss_clip": 0.01426835, + "auxiliary_loss_mlp": 0.01047744, + "balance_loss_clip": 1.25889575, + "balance_loss_mlp": 1.02817023, + "epoch": 0.6087479332631895, + "flos": 18341539637760.0, + "grad_norm": 3.345788673276808, + "language_loss": 0.65454865, + "learning_rate": 1.4019270870950006e-06, + "loss": 0.67929441, + "num_input_tokens_seen": 218085285, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.19567871, + "step": 10125, + "time_per_iteration": 2.845608711242676 + }, + { + "auxiliary_loss_clip": 0.01415106, + "auxiliary_loss_mlp": 0.01041972, + "balance_loss_clip": 1.25138593, + "balance_loss_mlp": 1.02181399, + "epoch": 0.6088080565158575, + "flos": 24502742599680.0, + "grad_norm": 8.172051263824828, + "language_loss": 0.77723563, + "learning_rate": 1.40155545786479e-06, + "loss": 0.80180633, + "num_input_tokens_seen": 218104735, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.20141602, + "step": 10126, + "time_per_iteration": 4.3261542320251465 + }, + { + "auxiliary_loss_clip": 0.01433118, + "auxiliary_loss_mlp": 0.01041657, + "balance_loss_clip": 1.26334524, + "balance_loss_mlp": 1.0220952, + "epoch": 0.6088681797685255, + "flos": 10275968004480.0, + "grad_norm": 2.5036058602757687, + "language_loss": 0.7268362, + "learning_rate": 1.4011838513276558e-06, + "loss": 0.75158405, + "num_input_tokens_seen": 218121855, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.19567871, + "step": 10127, + "time_per_iteration": 5.63316798210144 + }, + { + "auxiliary_loss_clip": 0.01441485, + "auxiliary_loss_mlp": 0.01045686, + "balance_loss_clip": 1.27197373, + "balance_loss_mlp": 1.02592111, + "epoch": 0.6089283030211935, + "flos": 21981812031360.0, + "grad_norm": 2.125993011777379, + "language_loss": 0.73605949, + "learning_rate": 1.400812267497691e-06, + "loss": 0.76093119, + "num_input_tokens_seen": 218137325, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.19763184, + "step": 10128, + "time_per_iteration": 2.8361382484436035 + }, + { + "auxiliary_loss_clip": 0.01417847, + "auxiliary_loss_mlp": 0.01038481, + "balance_loss_clip": 1.25312114, + "balance_loss_mlp": 1.01962245, + "epoch": 0.6089884262738614, + "flos": 17794121074560.0, + "grad_norm": 2.055199812729614, + "language_loss": 0.73817444, + "learning_rate": 1.4004407063889842e-06, + "loss": 0.76273763, + "num_input_tokens_seen": 218155530, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.18847656, + "step": 10129, + "time_per_iteration": 2.8527722358703613 + }, + { + "auxiliary_loss_clip": 0.01421589, + "auxiliary_loss_mlp": 0.01040654, + "balance_loss_clip": 1.25591242, + "balance_loss_mlp": 1.02153325, + "epoch": 0.6090485495265294, + "flos": 36926053706880.0, + "grad_norm": 1.5301158956678766, + "language_loss": 0.66960865, + "learning_rate": 1.400069168015626e-06, + "loss": 0.69423115, + "num_input_tokens_seen": 218182535, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19140625, + "step": 10130, + "time_per_iteration": 3.040285348892212 + }, + { + "auxiliary_loss_clip": 0.01410001, + "auxiliary_loss_mlp": 0.01035888, + "balance_loss_clip": 1.24768972, + "balance_loss_mlp": 1.0167073, + "epoch": 0.6091086727791973, + "flos": 19907910518400.0, + "grad_norm": 2.5659780133274905, + "language_loss": 0.77616453, + "learning_rate": 1.3996976523917054e-06, + "loss": 0.80062342, + "num_input_tokens_seen": 218201740, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19177246, + "step": 10131, + "time_per_iteration": 2.8524441719055176 + }, + { + "auxiliary_loss_clip": 0.01412519, + "auxiliary_loss_mlp": 0.01038573, + "balance_loss_clip": 1.24859715, + "balance_loss_mlp": 1.01948738, + "epoch": 0.6091687960318654, + "flos": 22173741048960.0, + "grad_norm": 3.585045848449767, + "language_loss": 0.77629042, + "learning_rate": 1.3993261595313093e-06, + "loss": 0.8008014, + "num_input_tokens_seen": 218219800, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.1907959, + "step": 10132, + "time_per_iteration": 2.8822109699249268 + }, + { + "auxiliary_loss_clip": 0.01411805, + "auxiliary_loss_mlp": 0.01040258, + "balance_loss_clip": 1.2515527, + "balance_loss_mlp": 1.02079141, + "epoch": 0.6092289192845333, + "flos": 21473874195840.0, + "grad_norm": 1.7272302200968659, + "language_loss": 0.76079941, + "learning_rate": 1.3989546894485261e-06, + "loss": 0.78531992, + "num_input_tokens_seen": 218237585, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19470215, + "step": 10133, + "time_per_iteration": 2.8532354831695557 + }, + { + "auxiliary_loss_clip": 0.01428089, + "auxiliary_loss_mlp": 0.01039978, + "balance_loss_clip": 1.26170444, + "balance_loss_mlp": 1.01973677, + "epoch": 0.6092890425372013, + "flos": 28706812174080.0, + "grad_norm": 1.8387564902042173, + "language_loss": 0.64777553, + "learning_rate": 1.3985832421574414e-06, + "loss": 0.6724562, + "num_input_tokens_seen": 218258700, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.20239258, + "step": 10134, + "time_per_iteration": 2.9171957969665527 + }, + { + "auxiliary_loss_clip": 0.01415342, + "auxiliary_loss_mlp": 0.01035335, + "balance_loss_clip": 1.25058126, + "balance_loss_mlp": 1.01566589, + "epoch": 0.6093491657898692, + "flos": 20822808499200.0, + "grad_norm": 1.8641164722244383, + "language_loss": 0.79434991, + "learning_rate": 1.3982118176721397e-06, + "loss": 0.81885672, + "num_input_tokens_seen": 218275655, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19665527, + "step": 10135, + "time_per_iteration": 2.850215435028076 + }, + { + "auxiliary_loss_clip": 0.01420049, + "auxiliary_loss_mlp": 0.01040339, + "balance_loss_clip": 1.25369298, + "balance_loss_mlp": 1.02156317, + "epoch": 0.6094092890425372, + "flos": 25457845224960.0, + "grad_norm": 1.8253713796403923, + "language_loss": 0.72625399, + "learning_rate": 1.3978404160067069e-06, + "loss": 0.75085783, + "num_input_tokens_seen": 218295720, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.18774414, + "step": 10136, + "time_per_iteration": 2.96061110496521 + }, + { + "auxiliary_loss_clip": 0.01418011, + "auxiliary_loss_mlp": 0.01034507, + "balance_loss_clip": 1.25296926, + "balance_loss_mlp": 1.01463437, + "epoch": 0.6094694122952051, + "flos": 35633405738880.0, + "grad_norm": 1.767733147974447, + "language_loss": 0.7501539, + "learning_rate": 1.3974690371752253e-06, + "loss": 0.77467906, + "num_input_tokens_seen": 218316745, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.1986084, + "step": 10137, + "time_per_iteration": 2.9900147914886475 + }, + { + "auxiliary_loss_clip": 0.01411725, + "auxiliary_loss_mlp": 0.01038325, + "balance_loss_clip": 1.24558604, + "balance_loss_mlp": 1.01816654, + "epoch": 0.6095295355478731, + "flos": 24466293273600.0, + "grad_norm": 1.652066552987795, + "language_loss": 0.80747843, + "learning_rate": 1.3970976811917785e-06, + "loss": 0.83197892, + "num_input_tokens_seen": 218335385, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.20153809, + "step": 10138, + "time_per_iteration": 2.86911940574646 + }, + { + "auxiliary_loss_clip": 0.01405683, + "auxiliary_loss_mlp": 0.0103621, + "balance_loss_clip": 1.24445069, + "balance_loss_mlp": 1.0169692, + "epoch": 0.6095896588005411, + "flos": 15641755799040.0, + "grad_norm": 1.7289689983680092, + "language_loss": 0.82447374, + "learning_rate": 1.3967263480704481e-06, + "loss": 0.84889269, + "num_input_tokens_seen": 218353320, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19238281, + "step": 10139, + "time_per_iteration": 2.820976495742798 + }, + { + "auxiliary_loss_clip": 0.01434889, + "auxiliary_loss_mlp": 0.01035416, + "balance_loss_clip": 1.26623225, + "balance_loss_mlp": 1.01561546, + "epoch": 0.6096497820532091, + "flos": 15556595806080.0, + "grad_norm": 1.9092426701999503, + "language_loss": 0.83850759, + "learning_rate": 1.396355037825315e-06, + "loss": 0.86321062, + "num_input_tokens_seen": 218365620, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.19812012, + "step": 10140, + "time_per_iteration": 2.7832326889038086 + }, + { + "auxiliary_loss_clip": 0.01409586, + "auxiliary_loss_mlp": 0.01037377, + "balance_loss_clip": 1.24342132, + "balance_loss_mlp": 1.01740944, + "epoch": 0.6097099053058771, + "flos": 24214496106240.0, + "grad_norm": 1.7460544796977036, + "language_loss": 0.76149315, + "learning_rate": 1.3959837504704592e-06, + "loss": 0.78596276, + "num_input_tokens_seen": 218383785, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.19958496, + "step": 10141, + "time_per_iteration": 2.875797986984253 + }, + { + "auxiliary_loss_clip": 0.01407302, + "auxiliary_loss_mlp": 0.0103281, + "balance_loss_clip": 1.24345088, + "balance_loss_mlp": 1.0133791, + "epoch": 0.609770028558545, + "flos": 19578916442880.0, + "grad_norm": 2.785268794650899, + "language_loss": 0.77421296, + "learning_rate": 1.3956124860199603e-06, + "loss": 0.79861403, + "num_input_tokens_seen": 218399055, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19421387, + "step": 10142, + "time_per_iteration": 2.801565408706665 + }, + { + "auxiliary_loss_clip": 0.01412743, + "auxiliary_loss_mlp": 0.01033956, + "balance_loss_clip": 1.24793839, + "balance_loss_mlp": 1.01335633, + "epoch": 0.609830151811213, + "flos": 23959169844480.0, + "grad_norm": 1.7019803690971946, + "language_loss": 0.7680195, + "learning_rate": 1.3952412444878964e-06, + "loss": 0.79248643, + "num_input_tokens_seen": 218419120, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.20605469, + "step": 10143, + "time_per_iteration": 2.876600742340088 + }, + { + "auxiliary_loss_clip": 0.01414713, + "auxiliary_loss_mlp": 0.01034766, + "balance_loss_clip": 1.24967933, + "balance_loss_mlp": 1.01547813, + "epoch": 0.6098902750638809, + "flos": 16188224221440.0, + "grad_norm": 1.9260475597782607, + "language_loss": 0.76245922, + "learning_rate": 1.3948700258883448e-06, + "loss": 0.78695405, + "num_input_tokens_seen": 218435290, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19299316, + "step": 10144, + "time_per_iteration": 2.821422576904297 + }, + { + "auxiliary_loss_clip": 0.01427437, + "auxiliary_loss_mlp": 0.01033805, + "balance_loss_clip": 1.26025653, + "balance_loss_mlp": 1.01400399, + "epoch": 0.609950398316549, + "flos": 44541519638400.0, + "grad_norm": 1.863055862870904, + "language_loss": 0.74214035, + "learning_rate": 1.394498830235383e-06, + "loss": 0.76675278, + "num_input_tokens_seen": 218457880, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.19812012, + "step": 10145, + "time_per_iteration": 3.101120710372925 + }, + { + "auxiliary_loss_clip": 0.01419118, + "auxiliary_loss_mlp": 0.01035618, + "balance_loss_clip": 1.25398481, + "balance_loss_mlp": 1.01584077, + "epoch": 0.6100105215692169, + "flos": 23232128849280.0, + "grad_norm": 5.675786613139131, + "language_loss": 0.69655931, + "learning_rate": 1.3941276575430862e-06, + "loss": 0.72110671, + "num_input_tokens_seen": 218475930, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19775391, + "step": 10146, + "time_per_iteration": 2.899437189102173 + }, + { + "auxiliary_loss_clip": 0.01405707, + "auxiliary_loss_mlp": 0.01036076, + "balance_loss_clip": 1.24504137, + "balance_loss_mlp": 1.01730061, + "epoch": 0.6100706448218849, + "flos": 15020850401280.0, + "grad_norm": 1.5496689644941308, + "language_loss": 0.77028894, + "learning_rate": 1.3937565078255289e-06, + "loss": 0.79470676, + "num_input_tokens_seen": 218493675, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18774414, + "step": 10147, + "time_per_iteration": 2.8510098457336426 + }, + { + "auxiliary_loss_clip": 0.01413608, + "auxiliary_loss_mlp": 0.0103304, + "balance_loss_clip": 1.24933052, + "balance_loss_mlp": 1.01271486, + "epoch": 0.6101307680745528, + "flos": 19647833552640.0, + "grad_norm": 1.8808954214348996, + "language_loss": 0.79128224, + "learning_rate": 1.393385381096786e-06, + "loss": 0.81574869, + "num_input_tokens_seen": 218511780, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.20324707, + "step": 10148, + "time_per_iteration": 2.8704307079315186 + }, + { + "auxiliary_loss_clip": 0.01420645, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.25238156, + "balance_loss_mlp": 1.01268089, + "epoch": 0.6101908913272208, + "flos": 29947808563200.0, + "grad_norm": 1.9734273260463708, + "language_loss": 0.54824126, + "learning_rate": 1.39301427737093e-06, + "loss": 0.57278126, + "num_input_tokens_seen": 218531850, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.20678711, + "step": 10149, + "time_per_iteration": 4.360315561294556 + }, + { + "auxiliary_loss_clip": 0.01395042, + "auxiliary_loss_mlp": 0.01035204, + "balance_loss_clip": 1.23680055, + "balance_loss_mlp": 1.01499832, + "epoch": 0.6102510145798887, + "flos": 21808614360960.0, + "grad_norm": 1.7830318508808038, + "language_loss": 0.80741203, + "learning_rate": 1.3926431966620333e-06, + "loss": 0.83171451, + "num_input_tokens_seen": 218551245, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.20202637, + "step": 10150, + "time_per_iteration": 2.849329710006714 + }, + { + "auxiliary_loss_clip": 0.01426833, + "auxiliary_loss_mlp": 0.01039407, + "balance_loss_clip": 1.25904262, + "balance_loss_mlp": 1.01809192, + "epoch": 0.6103111378325567, + "flos": 20716356188160.0, + "grad_norm": 1.529408908439079, + "language_loss": 0.70307696, + "learning_rate": 1.3922721389841684e-06, + "loss": 0.72773939, + "num_input_tokens_seen": 218571365, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.21325684, + "step": 10151, + "time_per_iteration": 2.8507206439971924 + }, + { + "auxiliary_loss_clip": 0.01401903, + "auxiliary_loss_mlp": 0.01035555, + "balance_loss_clip": 1.23938632, + "balance_loss_mlp": 1.0160284, + "epoch": 0.6103712610852247, + "flos": 29392019712000.0, + "grad_norm": 1.7954208345729656, + "language_loss": 0.71681023, + "learning_rate": 1.3919011043514036e-06, + "loss": 0.74118471, + "num_input_tokens_seen": 218588315, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19519043, + "step": 10152, + "time_per_iteration": 2.8810818195343018 + }, + { + "auxiliary_loss_clip": 0.01419368, + "auxiliary_loss_mlp": 0.01034448, + "balance_loss_clip": 1.25361967, + "balance_loss_mlp": 1.01415825, + "epoch": 0.6104313843378927, + "flos": 20822627520000.0, + "grad_norm": 1.6754878284532166, + "language_loss": 0.78963733, + "learning_rate": 1.391530092777811e-06, + "loss": 0.81417549, + "num_input_tokens_seen": 218605940, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20275879, + "step": 10153, + "time_per_iteration": 2.838214159011841 + }, + { + "auxiliary_loss_clip": 0.01410198, + "auxiliary_loss_mlp": 0.01033952, + "balance_loss_clip": 1.24700904, + "balance_loss_mlp": 1.01429462, + "epoch": 0.6104915075905607, + "flos": 26589719859840.0, + "grad_norm": 1.6826889547493957, + "language_loss": 0.80087912, + "learning_rate": 1.3911591042774573e-06, + "loss": 0.8253206, + "num_input_tokens_seen": 218626100, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.1965332, + "step": 10154, + "time_per_iteration": 2.9012928009033203 + }, + { + "auxiliary_loss_clip": 0.01408753, + "auxiliary_loss_mlp": 0.0103464, + "balance_loss_clip": 1.24512482, + "balance_loss_mlp": 1.01538754, + "epoch": 0.6105516308432286, + "flos": 23926566326400.0, + "grad_norm": 1.8416946187938175, + "language_loss": 0.70985579, + "learning_rate": 1.3907881388644116e-06, + "loss": 0.73428977, + "num_input_tokens_seen": 218645060, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.19262695, + "step": 10155, + "time_per_iteration": 2.845160961151123 + }, + { + "auxiliary_loss_clip": 0.01410523, + "auxiliary_loss_mlp": 0.01039876, + "balance_loss_clip": 1.2474407, + "balance_loss_mlp": 1.01966977, + "epoch": 0.6106117540958966, + "flos": 31590019008000.0, + "grad_norm": 1.6664425029489653, + "language_loss": 0.72126138, + "learning_rate": 1.3904171965527413e-06, + "loss": 0.74576539, + "num_input_tokens_seen": 218667690, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.20202637, + "step": 10156, + "time_per_iteration": 2.9315431118011475 + }, + { + "auxiliary_loss_clip": 0.01400315, + "auxiliary_loss_mlp": 0.01034984, + "balance_loss_clip": 1.24060059, + "balance_loss_mlp": 1.01401544, + "epoch": 0.6106718773485645, + "flos": 19617220805760.0, + "grad_norm": 1.6268485834439577, + "language_loss": 0.67718661, + "learning_rate": 1.3900462773565114e-06, + "loss": 0.70153964, + "num_input_tokens_seen": 218687505, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.2097168, + "step": 10157, + "time_per_iteration": 2.834444522857666 + }, + { + "auxiliary_loss_clip": 0.01415012, + "auxiliary_loss_mlp": 0.01036738, + "balance_loss_clip": 1.24919367, + "balance_loss_mlp": 1.01713955, + "epoch": 0.6107320006012326, + "flos": 17131472709120.0, + "grad_norm": 1.9793190377249117, + "language_loss": 0.73009676, + "learning_rate": 1.3896753812897877e-06, + "loss": 0.75461423, + "num_input_tokens_seen": 218705315, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.19592285, + "step": 10158, + "time_per_iteration": 2.8232662677764893 + }, + { + "auxiliary_loss_clip": 0.01419812, + "auxiliary_loss_mlp": 0.01040675, + "balance_loss_clip": 1.25426626, + "balance_loss_mlp": 1.02042174, + "epoch": 0.6107921238539005, + "flos": 30158903393280.0, + "grad_norm": 2.356034019525822, + "language_loss": 0.70000005, + "learning_rate": 1.389304508366635e-06, + "loss": 0.7246049, + "num_input_tokens_seen": 218725735, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.20263672, + "step": 10159, + "time_per_iteration": 2.9268887042999268 + }, + { + "auxiliary_loss_clip": 0.01411691, + "auxiliary_loss_mlp": 0.01034415, + "balance_loss_clip": 1.24679077, + "balance_loss_mlp": 1.01438773, + "epoch": 0.6108522471065685, + "flos": 18448942089600.0, + "grad_norm": 1.974539954078257, + "language_loss": 0.8023802, + "learning_rate": 1.3889336586011167e-06, + "loss": 0.82684129, + "num_input_tokens_seen": 218743215, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.20031738, + "step": 10160, + "time_per_iteration": 4.218799352645874 + }, + { + "auxiliary_loss_clip": 0.01200229, + "auxiliary_loss_mlp": 0.01019607, + "balance_loss_clip": 1.11058807, + "balance_loss_mlp": 1.0035857, + "epoch": 0.6109123703592364, + "flos": 64169345710080.0, + "grad_norm": 1.251773694211875, + "language_loss": 0.61539865, + "learning_rate": 1.388562832007295e-06, + "loss": 0.63759702, + "num_input_tokens_seen": 218806440, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.16015625, + "step": 10161, + "time_per_iteration": 3.482314348220825 + }, + { + "auxiliary_loss_clip": 0.01423253, + "auxiliary_loss_mlp": 0.01034666, + "balance_loss_clip": 1.25666952, + "balance_loss_mlp": 1.01474619, + "epoch": 0.6109724936119044, + "flos": 20677599377280.0, + "grad_norm": 1.7064409312212958, + "language_loss": 0.76756406, + "learning_rate": 1.3881920285992324e-06, + "loss": 0.79214329, + "num_input_tokens_seen": 218825720, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.19934082, + "step": 10162, + "time_per_iteration": 5.660090446472168 + }, + { + "auxiliary_loss_clip": 0.01417848, + "auxiliary_loss_mlp": 0.0103534, + "balance_loss_clip": 1.2541852, + "balance_loss_mlp": 1.01588535, + "epoch": 0.6110326168645723, + "flos": 31362771784320.0, + "grad_norm": 1.7448285933147976, + "language_loss": 0.72648466, + "learning_rate": 1.3878212483909888e-06, + "loss": 0.7510165, + "num_input_tokens_seen": 218847735, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19445801, + "step": 10163, + "time_per_iteration": 2.9347667694091797 + }, + { + "auxiliary_loss_clip": 0.01400815, + "auxiliary_loss_mlp": 0.010287, + "balance_loss_clip": 1.24190247, + "balance_loss_mlp": 1.00954294, + "epoch": 0.6110927401172404, + "flos": 25012625961600.0, + "grad_norm": 1.7170669732185335, + "language_loss": 0.60421747, + "learning_rate": 1.387450491396625e-06, + "loss": 0.62851262, + "num_input_tokens_seen": 218866585, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.19152832, + "step": 10164, + "time_per_iteration": 2.8834242820739746 + }, + { + "auxiliary_loss_clip": 0.0141251, + "auxiliary_loss_mlp": 0.01034668, + "balance_loss_clip": 1.24893308, + "balance_loss_mlp": 1.01387811, + "epoch": 0.6111528633699083, + "flos": 26258735013120.0, + "grad_norm": 1.9612315121955342, + "language_loss": 0.7623958, + "learning_rate": 1.3870797576302003e-06, + "loss": 0.78686756, + "num_input_tokens_seen": 218885560, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.2076416, + "step": 10165, + "time_per_iteration": 2.9181790351867676 + }, + { + "auxiliary_loss_clip": 0.01415522, + "auxiliary_loss_mlp": 0.01032613, + "balance_loss_clip": 1.25566053, + "balance_loss_mlp": 1.01342034, + "epoch": 0.6112129866225763, + "flos": 22392617984640.0, + "grad_norm": 1.5439430607899407, + "language_loss": 0.80059433, + "learning_rate": 1.3867090471057719e-06, + "loss": 0.82507563, + "num_input_tokens_seen": 218905055, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.19189453, + "step": 10166, + "time_per_iteration": 2.844480037689209 + }, + { + "auxiliary_loss_clip": 0.01431097, + "auxiliary_loss_mlp": 0.01034615, + "balance_loss_clip": 1.26611423, + "balance_loss_mlp": 1.01383734, + "epoch": 0.6112731098752443, + "flos": 25238651575680.0, + "grad_norm": 1.8069376905114054, + "language_loss": 0.68403023, + "learning_rate": 1.3863383598373987e-06, + "loss": 0.70868731, + "num_input_tokens_seen": 218924030, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.20776367, + "step": 10167, + "time_per_iteration": 2.8908276557922363 + }, + { + "auxiliary_loss_clip": 0.01417636, + "auxiliary_loss_mlp": 0.01035184, + "balance_loss_clip": 1.25677621, + "balance_loss_mlp": 1.01630104, + "epoch": 0.6113332331279122, + "flos": 22903406242560.0, + "grad_norm": 2.5329796624015097, + "language_loss": 0.79356778, + "learning_rate": 1.3859676958391364e-06, + "loss": 0.81809598, + "num_input_tokens_seen": 218943750, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.18884277, + "step": 10168, + "time_per_iteration": 2.878872871398926 + }, + { + "auxiliary_loss_clip": 0.01446019, + "auxiliary_loss_mlp": 0.01040312, + "balance_loss_clip": 1.27335167, + "balance_loss_mlp": 1.02054715, + "epoch": 0.6113933563805802, + "flos": 18628428787200.0, + "grad_norm": 2.424484379369739, + "language_loss": 0.86114353, + "learning_rate": 1.3855970551250398e-06, + "loss": 0.88600683, + "num_input_tokens_seen": 218957585, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.19763184, + "step": 10169, + "time_per_iteration": 2.795133590698242 + }, + { + "auxiliary_loss_clip": 0.01420459, + "auxiliary_loss_mlp": 0.01033256, + "balance_loss_clip": 1.25785708, + "balance_loss_mlp": 1.01506495, + "epoch": 0.6114534796332481, + "flos": 41881578485760.0, + "grad_norm": 1.6303515651117158, + "language_loss": 0.79564762, + "learning_rate": 1.3852264377091652e-06, + "loss": 0.82018471, + "num_input_tokens_seen": 218980025, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.18188477, + "step": 10170, + "time_per_iteration": 3.0212934017181396 + }, + { + "auxiliary_loss_clip": 0.01433249, + "auxiliary_loss_mlp": 0.01033873, + "balance_loss_clip": 1.26398945, + "balance_loss_mlp": 1.01315463, + "epoch": 0.6115136028859162, + "flos": 21918550521600.0, + "grad_norm": 1.8959888169273387, + "language_loss": 0.69747484, + "learning_rate": 1.3848558436055651e-06, + "loss": 0.72214615, + "num_input_tokens_seen": 218998200, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.20715332, + "step": 10171, + "time_per_iteration": 2.8802108764648438 + }, + { + "auxiliary_loss_clip": 0.01425326, + "auxiliary_loss_mlp": 0.01035681, + "balance_loss_clip": 1.25874448, + "balance_loss_mlp": 1.01534367, + "epoch": 0.6115737261385841, + "flos": 28816884069120.0, + "grad_norm": 1.6200770939681983, + "language_loss": 0.79607368, + "learning_rate": 1.3844852728282934e-06, + "loss": 0.82068372, + "num_input_tokens_seen": 219017910, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.20349121, + "step": 10172, + "time_per_iteration": 2.9061429500579834 + }, + { + "auxiliary_loss_clip": 0.01428908, + "auxiliary_loss_mlp": 0.01034214, + "balance_loss_clip": 1.25886571, + "balance_loss_mlp": 1.01410365, + "epoch": 0.6116338493912521, + "flos": 21261331532160.0, + "grad_norm": 1.7750722209576755, + "language_loss": 0.67668027, + "learning_rate": 1.3841147253914022e-06, + "loss": 0.70131147, + "num_input_tokens_seen": 219037730, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.20129395, + "step": 10173, + "time_per_iteration": 2.843770742416382 + }, + { + "auxiliary_loss_clip": 0.0142552, + "auxiliary_loss_mlp": 0.01032748, + "balance_loss_clip": 1.26028657, + "balance_loss_mlp": 1.01323342, + "epoch": 0.61169397264392, + "flos": 17539156771200.0, + "grad_norm": 1.6967483378128967, + "language_loss": 0.56804621, + "learning_rate": 1.3837442013089416e-06, + "loss": 0.5926289, + "num_input_tokens_seen": 219056755, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19519043, + "step": 10174, + "time_per_iteration": 2.799285650253296 + }, + { + "auxiliary_loss_clip": 0.01422128, + "auxiliary_loss_mlp": 0.01036444, + "balance_loss_clip": 1.25664532, + "balance_loss_mlp": 1.01625013, + "epoch": 0.611754095896588, + "flos": 23962156001280.0, + "grad_norm": 1.8885369446742384, + "language_loss": 0.67117727, + "learning_rate": 1.3833737005949628e-06, + "loss": 0.69576299, + "num_input_tokens_seen": 219076985, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.2019043, + "step": 10175, + "time_per_iteration": 2.8684136867523193 + }, + { + "auxiliary_loss_clip": 0.01419284, + "auxiliary_loss_mlp": 0.01034011, + "balance_loss_clip": 1.25486922, + "balance_loss_mlp": 1.0148778, + "epoch": 0.6118142191492559, + "flos": 26006032949760.0, + "grad_norm": 2.304172152728961, + "language_loss": 0.83415604, + "learning_rate": 1.3830032232635154e-06, + "loss": 0.85868895, + "num_input_tokens_seen": 219096050, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19140625, + "step": 10176, + "time_per_iteration": 2.939687490463257 + }, + { + "auxiliary_loss_clip": 0.01416767, + "auxiliary_loss_mlp": 0.01037145, + "balance_loss_clip": 1.25289607, + "balance_loss_mlp": 1.01639044, + "epoch": 0.611874342401924, + "flos": 24612633515520.0, + "grad_norm": 1.9103172603248497, + "language_loss": 0.7792573, + "learning_rate": 1.3826327693286474e-06, + "loss": 0.80379641, + "num_input_tokens_seen": 219112665, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.20739746, + "step": 10177, + "time_per_iteration": 2.8368217945098877 + }, + { + "auxiliary_loss_clip": 0.01422809, + "auxiliary_loss_mlp": 0.01036076, + "balance_loss_clip": 1.25669527, + "balance_loss_mlp": 1.01634645, + "epoch": 0.6119344656545919, + "flos": 15895362758400.0, + "grad_norm": 1.8795491522965506, + "language_loss": 0.76570374, + "learning_rate": 1.3822623388044065e-06, + "loss": 0.79029262, + "num_input_tokens_seen": 219129120, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.19714355, + "step": 10178, + "time_per_iteration": 2.817561388015747 + }, + { + "auxiliary_loss_clip": 0.01430979, + "auxiliary_loss_mlp": 0.01039225, + "balance_loss_clip": 1.26517701, + "balance_loss_mlp": 1.01929295, + "epoch": 0.6119945889072599, + "flos": 21662862301440.0, + "grad_norm": 2.225987786006421, + "language_loss": 0.6843164, + "learning_rate": 1.3818919317048402e-06, + "loss": 0.70901847, + "num_input_tokens_seen": 219148950, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.19921875, + "step": 10179, + "time_per_iteration": 2.864372730255127 + }, + { + "auxiliary_loss_clip": 0.01425559, + "auxiliary_loss_mlp": 0.01032201, + "balance_loss_clip": 1.26020265, + "balance_loss_mlp": 1.01327109, + "epoch": 0.6120547121599279, + "flos": 13780623173760.0, + "grad_norm": 2.246184398298458, + "language_loss": 0.84704566, + "learning_rate": 1.3815215480439933e-06, + "loss": 0.87162334, + "num_input_tokens_seen": 219165585, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.1895752, + "step": 10180, + "time_per_iteration": 2.9358816146850586 + }, + { + "auxiliary_loss_clip": 0.01424162, + "auxiliary_loss_mlp": 0.01031469, + "balance_loss_clip": 1.26065195, + "balance_loss_mlp": 1.01183558, + "epoch": 0.6121148354125958, + "flos": 20087487705600.0, + "grad_norm": 1.8608864946783017, + "language_loss": 0.77999592, + "learning_rate": 1.3811511878359113e-06, + "loss": 0.8045522, + "num_input_tokens_seen": 219183280, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19616699, + "step": 10181, + "time_per_iteration": 2.8458054065704346 + }, + { + "auxiliary_loss_clip": 0.01426965, + "auxiliary_loss_mlp": 0.01030703, + "balance_loss_clip": 1.26059651, + "balance_loss_mlp": 1.011343, + "epoch": 0.6121749586652638, + "flos": 13476902958720.0, + "grad_norm": 2.1265578754583454, + "language_loss": 0.81209403, + "learning_rate": 1.3807808510946384e-06, + "loss": 0.8366707, + "num_input_tokens_seen": 219197200, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.19348145, + "step": 10182, + "time_per_iteration": 2.8119683265686035 + }, + { + "auxiliary_loss_clip": 0.01404118, + "auxiliary_loss_mlp": 0.01031138, + "balance_loss_clip": 1.24438882, + "balance_loss_mlp": 1.01319742, + "epoch": 0.6122350819179317, + "flos": 20130135569280.0, + "grad_norm": 1.6411968106374635, + "language_loss": 0.83534169, + "learning_rate": 1.3804105378342177e-06, + "loss": 0.8596943, + "num_input_tokens_seen": 219216825, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.17944336, + "step": 10183, + "time_per_iteration": 2.8451850414276123 + }, + { + "auxiliary_loss_clip": 0.01199097, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.1105876, + "balance_loss_mlp": 1.01661742, + "epoch": 0.6122952051705998, + "flos": 65458356111360.0, + "grad_norm": 0.7103728408370809, + "language_loss": 0.62875849, + "learning_rate": 1.3800402480686914e-06, + "loss": 0.65107203, + "num_input_tokens_seen": 219283795, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.15625, + "step": 10184, + "time_per_iteration": 4.958518743515015 + }, + { + "auxiliary_loss_clip": 0.0141813, + "auxiliary_loss_mlp": 0.01032894, + "balance_loss_clip": 1.25578117, + "balance_loss_mlp": 1.0144763, + "epoch": 0.6123553284232677, + "flos": 20386004768640.0, + "grad_norm": 7.356756652970936, + "language_loss": 0.8238613, + "learning_rate": 1.379669981812101e-06, + "loss": 0.84837151, + "num_input_tokens_seen": 219302385, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.18408203, + "step": 10185, + "time_per_iteration": 2.878382444381714 + }, + { + "auxiliary_loss_clip": 0.01443696, + "auxiliary_loss_mlp": 0.01039214, + "balance_loss_clip": 1.27537942, + "balance_loss_mlp": 1.01946104, + "epoch": 0.6124154516759357, + "flos": 23998152879360.0, + "grad_norm": 1.88889619252092, + "language_loss": 0.75395072, + "learning_rate": 1.3792997390784868e-06, + "loss": 0.77877986, + "num_input_tokens_seen": 219319765, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.1973877, + "step": 10186, + "time_per_iteration": 2.9370062351226807 + }, + { + "auxiliary_loss_clip": 0.01410887, + "auxiliary_loss_mlp": 0.01034161, + "balance_loss_clip": 1.24883962, + "balance_loss_mlp": 1.01523101, + "epoch": 0.6124755749286036, + "flos": 21478308186240.0, + "grad_norm": 3.120120945302171, + "language_loss": 0.79433084, + "learning_rate": 1.3789295198818895e-06, + "loss": 0.81878126, + "num_input_tokens_seen": 219337440, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18920898, + "step": 10187, + "time_per_iteration": 2.8535842895507812 + }, + { + "auxiliary_loss_clip": 0.01421394, + "auxiliary_loss_mlp": 0.01031428, + "balance_loss_clip": 1.25682914, + "balance_loss_mlp": 1.01352251, + "epoch": 0.6125356981812716, + "flos": 23889936021120.0, + "grad_norm": 1.5979888033207086, + "language_loss": 0.83585393, + "learning_rate": 1.3785593242363462e-06, + "loss": 0.86038208, + "num_input_tokens_seen": 219357525, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.17919922, + "step": 10188, + "time_per_iteration": 2.906890392303467 + }, + { + "auxiliary_loss_clip": 0.0142915, + "auxiliary_loss_mlp": 0.01031312, + "balance_loss_clip": 1.26330912, + "balance_loss_mlp": 1.01302516, + "epoch": 0.6125958214339395, + "flos": 14432910480000.0, + "grad_norm": 1.9241581783281914, + "language_loss": 0.75868297, + "learning_rate": 1.378189152155896e-06, + "loss": 0.78328764, + "num_input_tokens_seen": 219374855, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.1829834, + "step": 10189, + "time_per_iteration": 2.932894706726074 + }, + { + "auxiliary_loss_clip": 0.01420511, + "auxiliary_loss_mlp": 0.01034193, + "balance_loss_clip": 1.25748158, + "balance_loss_mlp": 1.01563227, + "epoch": 0.6126559446866076, + "flos": 23269709295360.0, + "grad_norm": 1.537318271573001, + "language_loss": 0.74450326, + "learning_rate": 1.3778190036545758e-06, + "loss": 0.76905036, + "num_input_tokens_seen": 219394740, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.18566895, + "step": 10190, + "time_per_iteration": 2.8955636024475098 + }, + { + "auxiliary_loss_clip": 0.01435979, + "auxiliary_loss_mlp": 0.01036526, + "balance_loss_clip": 1.26904178, + "balance_loss_mlp": 1.01616526, + "epoch": 0.6127160679392755, + "flos": 26874708727680.0, + "grad_norm": 1.690581845625544, + "language_loss": 0.68800259, + "learning_rate": 1.3774488787464207e-06, + "loss": 0.71272767, + "num_input_tokens_seen": 219413755, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.20373535, + "step": 10191, + "time_per_iteration": 2.9143002033233643 + }, + { + "auxiliary_loss_clip": 0.01432134, + "auxiliary_loss_mlp": 0.01034811, + "balance_loss_clip": 1.26508129, + "balance_loss_mlp": 1.01533222, + "epoch": 0.6127761911919435, + "flos": 26407744698240.0, + "grad_norm": 1.842549565139184, + "language_loss": 0.7483114, + "learning_rate": 1.377078777445467e-06, + "loss": 0.77298087, + "num_input_tokens_seen": 219433560, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.19470215, + "step": 10192, + "time_per_iteration": 2.891695499420166 + }, + { + "auxiliary_loss_clip": 0.01409804, + "auxiliary_loss_mlp": 0.01031726, + "balance_loss_clip": 1.24821842, + "balance_loss_mlp": 1.01349866, + "epoch": 0.6128363144446115, + "flos": 22644143683200.0, + "grad_norm": 2.124915383386676, + "language_loss": 0.85322058, + "learning_rate": 1.3767086997657478e-06, + "loss": 0.87763584, + "num_input_tokens_seen": 219452640, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18225098, + "step": 10193, + "time_per_iteration": 2.8524515628814697 + }, + { + "auxiliary_loss_clip": 0.01427075, + "auxiliary_loss_mlp": 0.01034007, + "balance_loss_clip": 1.26221228, + "balance_loss_mlp": 1.01451588, + "epoch": 0.6128964376972794, + "flos": 26769885229440.0, + "grad_norm": 1.9924664626712698, + "language_loss": 0.71344149, + "learning_rate": 1.3763386457212979e-06, + "loss": 0.73805237, + "num_input_tokens_seen": 219468585, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19494629, + "step": 10194, + "time_per_iteration": 2.8710057735443115 + }, + { + "auxiliary_loss_clip": 0.01195189, + "auxiliary_loss_mlp": 0.01024657, + "balance_loss_clip": 1.10691893, + "balance_loss_mlp": 1.005584, + "epoch": 0.6129565609499474, + "flos": 65595194945280.0, + "grad_norm": 0.810953228681581, + "language_loss": 0.58765924, + "learning_rate": 1.375968615326149e-06, + "loss": 0.60985774, + "num_input_tokens_seen": 219523015, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.19042969, + "step": 10195, + "time_per_iteration": 4.677098274230957 + }, + { + "auxiliary_loss_clip": 0.0142648, + "auxiliary_loss_mlp": 0.01037465, + "balance_loss_clip": 1.26317024, + "balance_loss_mlp": 1.01680601, + "epoch": 0.6130166842026153, + "flos": 16370606586240.0, + "grad_norm": 1.9417235996359883, + "language_loss": 0.69553626, + "learning_rate": 1.3755986085943324e-06, + "loss": 0.72017568, + "num_input_tokens_seen": 219539980, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.2064209, + "step": 10196, + "time_per_iteration": 2.8481924533843994 + }, + { + "auxiliary_loss_clip": 0.01416873, + "auxiliary_loss_mlp": 0.01035699, + "balance_loss_clip": 1.25429595, + "balance_loss_mlp": 1.01600599, + "epoch": 0.6130768074552834, + "flos": 23661105229440.0, + "grad_norm": 1.9524408275806746, + "language_loss": 0.71706152, + "learning_rate": 1.3752286255398788e-06, + "loss": 0.74158728, + "num_input_tokens_seen": 219556980, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19677734, + "step": 10197, + "time_per_iteration": 5.701512813568115 + }, + { + "auxiliary_loss_clip": 0.01426985, + "auxiliary_loss_mlp": 0.01038789, + "balance_loss_clip": 1.26051712, + "balance_loss_mlp": 1.01864219, + "epoch": 0.6131369307079513, + "flos": 20057010693120.0, + "grad_norm": 1.948014887575, + "language_loss": 0.79701245, + "learning_rate": 1.3748586661768191e-06, + "loss": 0.82167017, + "num_input_tokens_seen": 219576410, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.20153809, + "step": 10198, + "time_per_iteration": 2.870842933654785 + }, + { + "auxiliary_loss_clip": 0.01434656, + "auxiliary_loss_mlp": 0.01039519, + "balance_loss_clip": 1.2670064, + "balance_loss_mlp": 1.01973057, + "epoch": 0.6131970539606193, + "flos": 22681814618880.0, + "grad_norm": 1.6847992774216725, + "language_loss": 0.74947363, + "learning_rate": 1.374488730519181e-06, + "loss": 0.77421534, + "num_input_tokens_seen": 219597180, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.19787598, + "step": 10199, + "time_per_iteration": 2.9064888954162598 + }, + { + "auxiliary_loss_clip": 0.01425917, + "auxiliary_loss_mlp": 0.01038596, + "balance_loss_clip": 1.2586031, + "balance_loss_mlp": 1.01819944, + "epoch": 0.6132571772132872, + "flos": 26882400343680.0, + "grad_norm": 2.0011425688796045, + "language_loss": 0.62146425, + "learning_rate": 1.374118818580993e-06, + "loss": 0.64610934, + "num_input_tokens_seen": 219617630, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.20397949, + "step": 10200, + "time_per_iteration": 2.8709676265716553 + }, + { + "auxiliary_loss_clip": 0.014263, + "auxiliary_loss_mlp": 0.01033551, + "balance_loss_clip": 1.26209009, + "balance_loss_mlp": 1.01471543, + "epoch": 0.6133173004659552, + "flos": 22902772815360.0, + "grad_norm": 1.8966591671073665, + "language_loss": 0.69443429, + "learning_rate": 1.3737489303762822e-06, + "loss": 0.71903282, + "num_input_tokens_seen": 219637025, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.18835449, + "step": 10201, + "time_per_iteration": 2.8562865257263184 + }, + { + "auxiliary_loss_clip": 0.01409979, + "auxiliary_loss_mlp": 0.01033929, + "balance_loss_clip": 1.24821639, + "balance_loss_mlp": 1.0149982, + "epoch": 0.6133774237186231, + "flos": 20494719319680.0, + "grad_norm": 1.766956359530808, + "language_loss": 0.84568369, + "learning_rate": 1.3733790659190746e-06, + "loss": 0.87012279, + "num_input_tokens_seen": 219656625, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18933105, + "step": 10202, + "time_per_iteration": 2.823660135269165 + }, + { + "auxiliary_loss_clip": 0.01197081, + "auxiliary_loss_mlp": 0.01020055, + "balance_loss_clip": 1.10839832, + "balance_loss_mlp": 1.00050473, + "epoch": 0.6134375469712912, + "flos": 69444996583680.0, + "grad_norm": 0.8872594338418944, + "language_loss": 0.67065185, + "learning_rate": 1.3730092252233953e-06, + "loss": 0.69282323, + "num_input_tokens_seen": 219718090, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 0.1953125, + "step": 10203, + "time_per_iteration": 3.4145946502685547 + }, + { + "auxiliary_loss_clip": 0.01423641, + "auxiliary_loss_mlp": 0.01034763, + "balance_loss_clip": 1.25940394, + "balance_loss_mlp": 1.0165484, + "epoch": 0.6134976702239591, + "flos": 41296443742080.0, + "grad_norm": 1.9966094979241957, + "language_loss": 0.62024415, + "learning_rate": 1.37263940830327e-06, + "loss": 0.6448282, + "num_input_tokens_seen": 219740100, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.18225098, + "step": 10204, + "time_per_iteration": 3.0338988304138184 + }, + { + "auxiliary_loss_clip": 0.01414112, + "auxiliary_loss_mlp": 0.01036158, + "balance_loss_clip": 1.25127196, + "balance_loss_mlp": 1.01666737, + "epoch": 0.6135577934766271, + "flos": 22356892575360.0, + "grad_norm": 1.8106066658797446, + "language_loss": 0.73423666, + "learning_rate": 1.3722696151727204e-06, + "loss": 0.75873935, + "num_input_tokens_seen": 219761225, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19482422, + "step": 10205, + "time_per_iteration": 2.9577600955963135 + }, + { + "auxiliary_loss_clip": 0.01406245, + "auxiliary_loss_mlp": 0.01034331, + "balance_loss_clip": 1.24529195, + "balance_loss_mlp": 1.01363611, + "epoch": 0.6136179167292951, + "flos": 23737532976000.0, + "grad_norm": 1.6879689493482142, + "language_loss": 0.76467311, + "learning_rate": 1.3718998458457701e-06, + "loss": 0.78907883, + "num_input_tokens_seen": 219780085, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.20690918, + "step": 10206, + "time_per_iteration": 2.9040634632110596 + }, + { + "auxiliary_loss_clip": 0.0142004, + "auxiliary_loss_mlp": 0.01037667, + "balance_loss_clip": 1.2555989, + "balance_loss_mlp": 1.01773477, + "epoch": 0.613678039981963, + "flos": 26034111987840.0, + "grad_norm": 2.0569387369294896, + "language_loss": 0.75685668, + "learning_rate": 1.3715301003364407e-06, + "loss": 0.78143382, + "num_input_tokens_seen": 219797895, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19946289, + "step": 10207, + "time_per_iteration": 2.9152138233184814 + }, + { + "auxiliary_loss_clip": 0.01421743, + "auxiliary_loss_mlp": 0.01037668, + "balance_loss_clip": 1.25826967, + "balance_loss_mlp": 1.01781964, + "epoch": 0.613738163234631, + "flos": 9864800092800.0, + "grad_norm": 2.2089277306883512, + "language_loss": 0.82810307, + "learning_rate": 1.3711603786587525e-06, + "loss": 0.85269719, + "num_input_tokens_seen": 219811295, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.19848633, + "step": 10208, + "time_per_iteration": 2.829864978790283 + }, + { + "auxiliary_loss_clip": 0.01436094, + "auxiliary_loss_mlp": 0.01037509, + "balance_loss_clip": 1.26826322, + "balance_loss_mlp": 1.01687419, + "epoch": 0.613798286487299, + "flos": 33194965720320.0, + "grad_norm": 1.8292039344398896, + "language_loss": 0.73669219, + "learning_rate": 1.3707906808267265e-06, + "loss": 0.76142824, + "num_input_tokens_seen": 219832735, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.20629883, + "step": 10209, + "time_per_iteration": 2.9635069370269775 + }, + { + "auxiliary_loss_clip": 0.01422136, + "auxiliary_loss_mlp": 0.01031335, + "balance_loss_clip": 1.25922894, + "balance_loss_mlp": 1.01173735, + "epoch": 0.613858409739967, + "flos": 25638417797760.0, + "grad_norm": 1.630934169217141, + "language_loss": 0.74993819, + "learning_rate": 1.37042100685438e-06, + "loss": 0.77447295, + "num_input_tokens_seen": 219852755, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19592285, + "step": 10210, + "time_per_iteration": 2.8861145973205566 + }, + { + "auxiliary_loss_clip": 0.01195046, + "auxiliary_loss_mlp": 0.01023046, + "balance_loss_clip": 1.10622776, + "balance_loss_mlp": 1.00597501, + "epoch": 0.6139185329926349, + "flos": 67224935808000.0, + "grad_norm": 0.8852565804816103, + "language_loss": 0.65115845, + "learning_rate": 1.3700513567557325e-06, + "loss": 0.67333937, + "num_input_tokens_seen": 219922785, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.17089844, + "step": 10211, + "time_per_iteration": 3.5290822982788086 + }, + { + "auxiliary_loss_clip": 0.01416368, + "auxiliary_loss_mlp": 0.0104061, + "balance_loss_clip": 1.25375366, + "balance_loss_mlp": 1.02057099, + "epoch": 0.6139786562453029, + "flos": 21553650057600.0, + "grad_norm": 1.6962870466139335, + "language_loss": 0.75982106, + "learning_rate": 1.369681730544801e-06, + "loss": 0.78439087, + "num_input_tokens_seen": 219942215, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.20031738, + "step": 10212, + "time_per_iteration": 2.8657941818237305 + }, + { + "auxiliary_loss_clip": 0.01424573, + "auxiliary_loss_mlp": 0.01035739, + "balance_loss_clip": 1.26094043, + "balance_loss_mlp": 1.01580691, + "epoch": 0.6140387794979708, + "flos": 26079836497920.0, + "grad_norm": 1.545782885503587, + "language_loss": 0.74356866, + "learning_rate": 1.3693121282356009e-06, + "loss": 0.76817179, + "num_input_tokens_seen": 219963830, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.19909668, + "step": 10213, + "time_per_iteration": 2.880075693130493 + }, + { + "auxiliary_loss_clip": 0.01441955, + "auxiliary_loss_mlp": 0.01036658, + "balance_loss_clip": 1.27157176, + "balance_loss_mlp": 1.01698816, + "epoch": 0.6140989027506388, + "flos": 23704748478720.0, + "grad_norm": 1.4335376885962137, + "language_loss": 0.73765451, + "learning_rate": 1.3689425498421483e-06, + "loss": 0.76244062, + "num_input_tokens_seen": 219983815, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.1965332, + "step": 10214, + "time_per_iteration": 2.879070281982422 + }, + { + "auxiliary_loss_clip": 0.01420506, + "auxiliary_loss_mlp": 0.01033168, + "balance_loss_clip": 1.25515103, + "balance_loss_mlp": 1.01329601, + "epoch": 0.6141590260033067, + "flos": 22240576897920.0, + "grad_norm": 1.5886497003850317, + "language_loss": 0.7514987, + "learning_rate": 1.3685729953784572e-06, + "loss": 0.77603543, + "num_input_tokens_seen": 220003165, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19873047, + "step": 10215, + "time_per_iteration": 2.833376169204712 + }, + { + "auxiliary_loss_clip": 0.01422237, + "auxiliary_loss_mlp": 0.01036601, + "balance_loss_clip": 1.25799179, + "balance_loss_mlp": 1.01575089, + "epoch": 0.6142191492559748, + "flos": 23880434613120.0, + "grad_norm": 2.195861548756872, + "language_loss": 0.79324865, + "learning_rate": 1.368203464858542e-06, + "loss": 0.817837, + "num_input_tokens_seen": 220021015, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.20849609, + "step": 10216, + "time_per_iteration": 2.893620252609253 + }, + { + "auxiliary_loss_clip": 0.01422797, + "auxiliary_loss_mlp": 0.01032448, + "balance_loss_clip": 1.25859308, + "balance_loss_mlp": 1.01280236, + "epoch": 0.6142792725086427, + "flos": 15049110418560.0, + "grad_norm": 2.1226798136557736, + "language_loss": 0.80572128, + "learning_rate": 1.3678339582964147e-06, + "loss": 0.83027381, + "num_input_tokens_seen": 220035780, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19628906, + "step": 10217, + "time_per_iteration": 2.7954397201538086 + }, + { + "auxiliary_loss_clip": 0.0142549, + "auxiliary_loss_mlp": 0.01031312, + "balance_loss_clip": 1.25947809, + "balance_loss_mlp": 1.01203609, + "epoch": 0.6143393957613107, + "flos": 23341295848320.0, + "grad_norm": 2.6706357984023033, + "language_loss": 0.79819393, + "learning_rate": 1.3674644757060865e-06, + "loss": 0.82276201, + "num_input_tokens_seen": 220054280, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.19262695, + "step": 10218, + "time_per_iteration": 2.8784401416778564 + }, + { + "auxiliary_loss_clip": 0.0141518, + "auxiliary_loss_mlp": 0.01039497, + "balance_loss_clip": 1.25201464, + "balance_loss_mlp": 1.0185039, + "epoch": 0.6143995190139786, + "flos": 20125792068480.0, + "grad_norm": 1.5145614644057983, + "language_loss": 0.82499397, + "learning_rate": 1.367095017101569e-06, + "loss": 0.84954077, + "num_input_tokens_seen": 220074120, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.20996094, + "step": 10219, + "time_per_iteration": 4.261526823043823 + }, + { + "auxiliary_loss_clip": 0.01429864, + "auxiliary_loss_mlp": 0.01034626, + "balance_loss_clip": 1.26420069, + "balance_loss_mlp": 1.01507545, + "epoch": 0.6144596422666466, + "flos": 42318834664320.0, + "grad_norm": 1.8110049908311496, + "language_loss": 0.67884845, + "learning_rate": 1.3667255824968717e-06, + "loss": 0.70349336, + "num_input_tokens_seen": 220096320, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.19555664, + "step": 10220, + "time_per_iteration": 3.0127596855163574 + }, + { + "auxiliary_loss_clip": 0.01416561, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.25266242, + "balance_loss_mlp": 1.01221883, + "epoch": 0.6145197655193146, + "flos": 21581955319680.0, + "grad_norm": 3.051033930891564, + "language_loss": 0.72290063, + "learning_rate": 1.3663561719060041e-06, + "loss": 0.74737966, + "num_input_tokens_seen": 220114850, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19140625, + "step": 10221, + "time_per_iteration": 2.8733503818511963 + }, + { + "auxiliary_loss_clip": 0.01419588, + "auxiliary_loss_mlp": 0.01032886, + "balance_loss_clip": 1.25699675, + "balance_loss_mlp": 1.01413441, + "epoch": 0.6145798887719826, + "flos": 21481203853440.0, + "grad_norm": 1.818154941224552, + "language_loss": 0.80170768, + "learning_rate": 1.3659867853429735e-06, + "loss": 0.82623243, + "num_input_tokens_seen": 220133395, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.18762207, + "step": 10222, + "time_per_iteration": 2.9629812240600586 + }, + { + "auxiliary_loss_clip": 0.01430375, + "auxiliary_loss_mlp": 0.01031295, + "balance_loss_clip": 1.26395547, + "balance_loss_mlp": 1.01200688, + "epoch": 0.6146400120246506, + "flos": 20786721131520.0, + "grad_norm": 3.0968230156184826, + "language_loss": 0.7722767, + "learning_rate": 1.365617422821788e-06, + "loss": 0.79689348, + "num_input_tokens_seen": 220152790, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.19287109, + "step": 10223, + "time_per_iteration": 2.836669445037842 + }, + { + "auxiliary_loss_clip": 0.01414612, + "auxiliary_loss_mlp": 0.01034162, + "balance_loss_clip": 1.25376391, + "balance_loss_mlp": 1.01411128, + "epoch": 0.6147001352773185, + "flos": 13889247235200.0, + "grad_norm": 2.259572760451063, + "language_loss": 0.79216588, + "learning_rate": 1.3652480843564535e-06, + "loss": 0.81665361, + "num_input_tokens_seen": 220169535, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.20043945, + "step": 10224, + "time_per_iteration": 2.8400509357452393 + }, + { + "auxiliary_loss_clip": 0.01411521, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.25108099, + "balance_loss_mlp": 1.01270807, + "epoch": 0.6147602585299865, + "flos": 56660612837760.0, + "grad_norm": 1.3749490226287833, + "language_loss": 0.66790825, + "learning_rate": 1.3648787699609746e-06, + "loss": 0.69233745, + "num_input_tokens_seen": 220195305, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18676758, + "step": 10225, + "time_per_iteration": 3.20723295211792 + }, + { + "auxiliary_loss_clip": 0.01427211, + "auxiliary_loss_mlp": 0.01033052, + "balance_loss_clip": 1.26085591, + "balance_loss_mlp": 1.01387107, + "epoch": 0.6148203817826544, + "flos": 32830020011520.0, + "grad_norm": 2.220273190296515, + "language_loss": 0.64476711, + "learning_rate": 1.364509479649357e-06, + "loss": 0.66936976, + "num_input_tokens_seen": 220215040, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.19189453, + "step": 10226, + "time_per_iteration": 2.9998879432678223 + }, + { + "auxiliary_loss_clip": 0.01436659, + "auxiliary_loss_mlp": 0.01038307, + "balance_loss_clip": 1.26996672, + "balance_loss_mlp": 1.01746964, + "epoch": 0.6148805050353224, + "flos": 18340996700160.0, + "grad_norm": 1.9491300340163806, + "language_loss": 0.76194775, + "learning_rate": 1.3641402134356037e-06, + "loss": 0.78669739, + "num_input_tokens_seen": 220234205, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.20837402, + "step": 10227, + "time_per_iteration": 2.87154483795166 + }, + { + "auxiliary_loss_clip": 0.01432541, + "auxiliary_loss_mlp": 0.01039245, + "balance_loss_clip": 1.26406634, + "balance_loss_mlp": 1.01809764, + "epoch": 0.6149406282879903, + "flos": 14072036803200.0, + "grad_norm": 2.3676973098765606, + "language_loss": 0.63299072, + "learning_rate": 1.3637709713337164e-06, + "loss": 0.65770864, + "num_input_tokens_seen": 220252730, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.21142578, + "step": 10228, + "time_per_iteration": 2.822516918182373 + }, + { + "auxiliary_loss_clip": 0.01418198, + "auxiliary_loss_mlp": 0.01038251, + "balance_loss_clip": 1.25578713, + "balance_loss_mlp": 1.01926088, + "epoch": 0.6150007515406584, + "flos": 25200618681600.0, + "grad_norm": 1.4944923755955306, + "language_loss": 0.75276786, + "learning_rate": 1.3634017533576985e-06, + "loss": 0.77733231, + "num_input_tokens_seen": 220273345, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.18994141, + "step": 10229, + "time_per_iteration": 2.8910796642303467 + }, + { + "auxiliary_loss_clip": 0.01431672, + "auxiliary_loss_mlp": 0.01033538, + "balance_loss_clip": 1.26809025, + "balance_loss_mlp": 1.01371324, + "epoch": 0.6150608747933263, + "flos": 21955452295680.0, + "grad_norm": 1.7360388537797682, + "language_loss": 0.78945124, + "learning_rate": 1.3630325595215493e-06, + "loss": 0.81410336, + "num_input_tokens_seen": 220293845, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19824219, + "step": 10230, + "time_per_iteration": 4.334232807159424 + }, + { + "auxiliary_loss_clip": 0.0143586, + "auxiliary_loss_mlp": 0.01038053, + "balance_loss_clip": 1.26960921, + "balance_loss_mlp": 1.0189085, + "epoch": 0.6151209980459943, + "flos": 30129602745600.0, + "grad_norm": 5.059867026927376, + "language_loss": 0.74017549, + "learning_rate": 1.36266338983927e-06, + "loss": 0.76491463, + "num_input_tokens_seen": 220316070, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.19152832, + "step": 10231, + "time_per_iteration": 2.908796787261963 + }, + { + "auxiliary_loss_clip": 0.01419195, + "auxiliary_loss_mlp": 0.01034746, + "balance_loss_clip": 1.25551653, + "balance_loss_mlp": 1.01573181, + "epoch": 0.6151811212986622, + "flos": 30020571480960.0, + "grad_norm": 1.785882833069432, + "language_loss": 0.70938563, + "learning_rate": 1.362294244324858e-06, + "loss": 0.73392504, + "num_input_tokens_seen": 220335695, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19030762, + "step": 10232, + "time_per_iteration": 4.258838653564453 + }, + { + "auxiliary_loss_clip": 0.01404041, + "auxiliary_loss_mlp": 0.01033359, + "balance_loss_clip": 1.24535334, + "balance_loss_mlp": 1.01496518, + "epoch": 0.6152412445513302, + "flos": 18880316444160.0, + "grad_norm": 3.5403645539011146, + "language_loss": 0.92196214, + "learning_rate": 1.3619251229923126e-06, + "loss": 0.94633615, + "num_input_tokens_seen": 220353720, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18408203, + "step": 10233, + "time_per_iteration": 4.265609979629517 + }, + { + "auxiliary_loss_clip": 0.01422036, + "auxiliary_loss_mlp": 0.01034695, + "balance_loss_clip": 1.26006222, + "balance_loss_mlp": 1.01658654, + "epoch": 0.6153013678039982, + "flos": 25714845544320.0, + "grad_norm": 1.7447863186205352, + "language_loss": 0.7229321, + "learning_rate": 1.3615560258556306e-06, + "loss": 0.74749935, + "num_input_tokens_seen": 220372515, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18078613, + "step": 10234, + "time_per_iteration": 2.8695971965789795 + }, + { + "auxiliary_loss_clip": 0.01434954, + "auxiliary_loss_mlp": 0.01035622, + "balance_loss_clip": 1.26799321, + "balance_loss_mlp": 1.01595247, + "epoch": 0.6153614910566662, + "flos": 28521534142080.0, + "grad_norm": 1.9949210220309233, + "language_loss": 0.67677164, + "learning_rate": 1.3611869529288077e-06, + "loss": 0.70147741, + "num_input_tokens_seen": 220393490, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.19677734, + "step": 10235, + "time_per_iteration": 2.948618173599243 + }, + { + "auxiliary_loss_clip": 0.01441183, + "auxiliary_loss_mlp": 0.0103691, + "balance_loss_clip": 1.27225649, + "balance_loss_mlp": 1.01811075, + "epoch": 0.6154216143093342, + "flos": 23560172784000.0, + "grad_norm": 1.7820527269893371, + "language_loss": 0.816953, + "learning_rate": 1.3608179042258398e-06, + "loss": 0.84173393, + "num_input_tokens_seen": 220412855, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.18811035, + "step": 10236, + "time_per_iteration": 2.866508722305298 + }, + { + "auxiliary_loss_clip": 0.0143313, + "auxiliary_loss_mlp": 0.01032257, + "balance_loss_clip": 1.26608694, + "balance_loss_mlp": 1.0134095, + "epoch": 0.6154817375620021, + "flos": 22758513834240.0, + "grad_norm": 1.722917619542013, + "language_loss": 0.80850661, + "learning_rate": 1.360448879760721e-06, + "loss": 0.83316052, + "num_input_tokens_seen": 220433440, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.18859863, + "step": 10237, + "time_per_iteration": 2.8357560634613037 + }, + { + "auxiliary_loss_clip": 0.01418186, + "auxiliary_loss_mlp": 0.01036673, + "balance_loss_clip": 1.25488818, + "balance_loss_mlp": 1.01653886, + "epoch": 0.6155418608146701, + "flos": 27174583134720.0, + "grad_norm": 1.6859222454060037, + "language_loss": 0.76546758, + "learning_rate": 1.3600798795474449e-06, + "loss": 0.79001617, + "num_input_tokens_seen": 220453445, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.20117188, + "step": 10238, + "time_per_iteration": 2.9183290004730225 + }, + { + "auxiliary_loss_clip": 0.01199217, + "auxiliary_loss_mlp": 0.01015661, + "balance_loss_clip": 1.10653615, + "balance_loss_mlp": 1.00002098, + "epoch": 0.615601984067338, + "flos": 68838234825600.0, + "grad_norm": 0.7570692365341908, + "language_loss": 0.5766443, + "learning_rate": 1.3597109036000036e-06, + "loss": 0.59879303, + "num_input_tokens_seen": 220509730, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.15625, + "step": 10239, + "time_per_iteration": 3.3379664421081543 + }, + { + "auxiliary_loss_clip": 0.01425214, + "auxiliary_loss_mlp": 0.01031958, + "balance_loss_clip": 1.25915241, + "balance_loss_mlp": 1.01190674, + "epoch": 0.615662107320006, + "flos": 15523947043200.0, + "grad_norm": 2.302545996864993, + "language_loss": 0.78364581, + "learning_rate": 1.3593419519323892e-06, + "loss": 0.80821753, + "num_input_tokens_seen": 220527295, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.20043945, + "step": 10240, + "time_per_iteration": 2.8647491931915283 + }, + { + "auxiliary_loss_clip": 0.01429583, + "auxiliary_loss_mlp": 0.01037465, + "balance_loss_clip": 1.26142573, + "balance_loss_mlp": 1.01746142, + "epoch": 0.615722230572674, + "flos": 21072117202560.0, + "grad_norm": 2.0331446547231673, + "language_loss": 0.73339796, + "learning_rate": 1.3589730245585922e-06, + "loss": 0.75806844, + "num_input_tokens_seen": 220542730, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.19995117, + "step": 10241, + "time_per_iteration": 2.876960039138794 + }, + { + "auxiliary_loss_clip": 0.0140958, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.24927223, + "balance_loss_mlp": 1.01388454, + "epoch": 0.615782353825342, + "flos": 23266813628160.0, + "grad_norm": 3.7667199391176642, + "language_loss": 0.73159635, + "learning_rate": 1.3586041214926018e-06, + "loss": 0.75602084, + "num_input_tokens_seen": 220562995, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18994141, + "step": 10242, + "time_per_iteration": 2.9131641387939453 + }, + { + "auxiliary_loss_clip": 0.01416124, + "auxiliary_loss_mlp": 0.01029891, + "balance_loss_clip": 1.2528882, + "balance_loss_mlp": 1.01209331, + "epoch": 0.6158424770780099, + "flos": 21113317232640.0, + "grad_norm": 1.797828511400935, + "language_loss": 0.73101151, + "learning_rate": 1.3582352427484086e-06, + "loss": 0.75547171, + "num_input_tokens_seen": 220581775, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.17810059, + "step": 10243, + "time_per_iteration": 2.8623111248016357 + }, + { + "auxiliary_loss_clip": 0.01194206, + "auxiliary_loss_mlp": 0.01020599, + "balance_loss_clip": 1.10413396, + "balance_loss_mlp": 1.00371861, + "epoch": 0.6159026003306779, + "flos": 70367540935680.0, + "grad_norm": 0.758882536191643, + "language_loss": 0.56849653, + "learning_rate": 1.3578663883399984e-06, + "loss": 0.5906446, + "num_input_tokens_seen": 220646395, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.16894531, + "step": 10244, + "time_per_iteration": 3.4159066677093506 + }, + { + "auxiliary_loss_clip": 0.01420088, + "auxiliary_loss_mlp": 0.01032329, + "balance_loss_clip": 1.25663817, + "balance_loss_mlp": 1.01285005, + "epoch": 0.6159627235833458, + "flos": 33887186202240.0, + "grad_norm": 1.6910410982463737, + "language_loss": 0.6403811, + "learning_rate": 1.3574975582813593e-06, + "loss": 0.66490531, + "num_input_tokens_seen": 220668335, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.19494629, + "step": 10245, + "time_per_iteration": 2.98595929145813 + }, + { + "auxiliary_loss_clip": 0.014097, + "auxiliary_loss_mlp": 0.01031517, + "balance_loss_clip": 1.24752736, + "balance_loss_mlp": 1.01252687, + "epoch": 0.6160228468360138, + "flos": 26585919296640.0, + "grad_norm": 1.8105541034223454, + "language_loss": 0.7978763, + "learning_rate": 1.3571287525864771e-06, + "loss": 0.82228845, + "num_input_tokens_seen": 220688915, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19006348, + "step": 10246, + "time_per_iteration": 2.908010244369507 + }, + { + "auxiliary_loss_clip": 0.01440763, + "auxiliary_loss_mlp": 0.01047872, + "balance_loss_clip": 1.27261508, + "balance_loss_mlp": 1.02674794, + "epoch": 0.6160829700886818, + "flos": 17199937370880.0, + "grad_norm": 2.519936790371734, + "language_loss": 0.87984157, + "learning_rate": 1.3567599712693368e-06, + "loss": 0.904728, + "num_input_tokens_seen": 220703465, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.21118164, + "step": 10247, + "time_per_iteration": 2.889995574951172 + }, + { + "auxiliary_loss_clip": 0.01434536, + "auxiliary_loss_mlp": 0.01034011, + "balance_loss_clip": 1.26852894, + "balance_loss_mlp": 1.0149132, + "epoch": 0.6161430933413498, + "flos": 23634157311360.0, + "grad_norm": 1.5839160320004646, + "language_loss": 0.80779576, + "learning_rate": 1.3563912143439235e-06, + "loss": 0.83248115, + "num_input_tokens_seen": 220722090, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.19104004, + "step": 10248, + "time_per_iteration": 2.937851667404175 + }, + { + "auxiliary_loss_clip": 0.01414711, + "auxiliary_loss_mlp": 0.01030726, + "balance_loss_clip": 1.2533164, + "balance_loss_mlp": 1.01254654, + "epoch": 0.6162032165940178, + "flos": 23012663731200.0, + "grad_norm": 2.619655182289342, + "language_loss": 0.87065256, + "learning_rate": 1.3560224818242191e-06, + "loss": 0.89510697, + "num_input_tokens_seen": 220741075, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18188477, + "step": 10249, + "time_per_iteration": 2.844805955886841 + }, + { + "auxiliary_loss_clip": 0.01424761, + "auxiliary_loss_mlp": 0.0103454, + "balance_loss_clip": 1.25959635, + "balance_loss_mlp": 1.01413155, + "epoch": 0.6162633398466857, + "flos": 39436713705600.0, + "grad_norm": 2.464460805578947, + "language_loss": 0.70338607, + "learning_rate": 1.3556537737242072e-06, + "loss": 0.72797906, + "num_input_tokens_seen": 220763395, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.20397949, + "step": 10250, + "time_per_iteration": 2.9893922805786133 + }, + { + "auxiliary_loss_clip": 0.01408977, + "auxiliary_loss_mlp": 0.01029878, + "balance_loss_clip": 1.25082493, + "balance_loss_mlp": 1.01166272, + "epoch": 0.6163234630993537, + "flos": 19253994399360.0, + "grad_norm": 4.6844361695196275, + "language_loss": 0.7460832, + "learning_rate": 1.3552850900578692e-06, + "loss": 0.77047169, + "num_input_tokens_seen": 220780640, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18225098, + "step": 10251, + "time_per_iteration": 2.8247179985046387 + }, + { + "auxiliary_loss_clip": 0.01415372, + "auxiliary_loss_mlp": 0.01032851, + "balance_loss_clip": 1.25021005, + "balance_loss_mlp": 1.01297879, + "epoch": 0.6163835863520216, + "flos": 15970478405760.0, + "grad_norm": 2.095999831089002, + "language_loss": 0.68740284, + "learning_rate": 1.3549164308391844e-06, + "loss": 0.71188509, + "num_input_tokens_seen": 220797960, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.1986084, + "step": 10252, + "time_per_iteration": 2.858543634414673 + }, + { + "auxiliary_loss_clip": 0.01200864, + "auxiliary_loss_mlp": 0.01026852, + "balance_loss_clip": 1.10855484, + "balance_loss_mlp": 1.00558555, + "epoch": 0.6164437096046896, + "flos": 68136530918400.0, + "grad_norm": 0.889290849872288, + "language_loss": 0.58037126, + "learning_rate": 1.3545477960821333e-06, + "loss": 0.6026485, + "num_input_tokens_seen": 220856930, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.21289062, + "step": 10253, + "time_per_iteration": 3.4137163162231445 + }, + { + "auxiliary_loss_clip": 0.01425037, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.25788629, + "balance_loss_mlp": 1.01409125, + "epoch": 0.6165038328573575, + "flos": 21371177203200.0, + "grad_norm": 1.4944481457770558, + "language_loss": 0.80233085, + "learning_rate": 1.3541791858006946e-06, + "loss": 0.82691717, + "num_input_tokens_seen": 220877595, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.19506836, + "step": 10254, + "time_per_iteration": 4.273070573806763 + }, + { + "auxiliary_loss_clip": 0.0143114, + "auxiliary_loss_mlp": 0.01035213, + "balance_loss_clip": 1.26244497, + "balance_loss_mlp": 1.01565087, + "epoch": 0.6165639561100256, + "flos": 21110874013440.0, + "grad_norm": 1.8148366830355995, + "language_loss": 0.81137228, + "learning_rate": 1.353810600008846e-06, + "loss": 0.83603579, + "num_input_tokens_seen": 220896880, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.19567871, + "step": 10255, + "time_per_iteration": 2.881281614303589 + }, + { + "auxiliary_loss_clip": 0.01429361, + "auxiliary_loss_mlp": 0.0103584, + "balance_loss_clip": 1.26237559, + "balance_loss_mlp": 1.01597953, + "epoch": 0.6166240793626935, + "flos": 25349628366720.0, + "grad_norm": 5.662700670000636, + "language_loss": 0.66740024, + "learning_rate": 1.3534420387205646e-06, + "loss": 0.69205225, + "num_input_tokens_seen": 220916425, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.19848633, + "step": 10256, + "time_per_iteration": 2.8848044872283936 + }, + { + "auxiliary_loss_clip": 0.01420663, + "auxiliary_loss_mlp": 0.01033909, + "balance_loss_clip": 1.25931692, + "balance_loss_mlp": 1.01437068, + "epoch": 0.6166842026153615, + "flos": 19692110229120.0, + "grad_norm": 1.7314966486777392, + "language_loss": 0.73135328, + "learning_rate": 1.353073501949825e-06, + "loss": 0.75589907, + "num_input_tokens_seen": 220935050, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19543457, + "step": 10257, + "time_per_iteration": 2.842000722885132 + }, + { + "auxiliary_loss_clip": 0.01421958, + "auxiliary_loss_mlp": 0.01030759, + "balance_loss_clip": 1.25610554, + "balance_loss_mlp": 1.01098251, + "epoch": 0.6167443258680294, + "flos": 19327888437120.0, + "grad_norm": 3.151265864242765, + "language_loss": 0.73352861, + "learning_rate": 1.3527049897106034e-06, + "loss": 0.75805581, + "num_input_tokens_seen": 220953085, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.19775391, + "step": 10258, + "time_per_iteration": 2.845064640045166 + }, + { + "auxiliary_loss_clip": 0.01418319, + "auxiliary_loss_mlp": 0.01030749, + "balance_loss_clip": 1.25142169, + "balance_loss_mlp": 1.01177025, + "epoch": 0.6168044491206974, + "flos": 25276232021760.0, + "grad_norm": 3.2788530593017486, + "language_loss": 0.65258539, + "learning_rate": 1.3523365020168735e-06, + "loss": 0.6770761, + "num_input_tokens_seen": 220969050, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.18981934, + "step": 10259, + "time_per_iteration": 2.88090181350708 + }, + { + "auxiliary_loss_clip": 0.014114, + "auxiliary_loss_mlp": 0.01035474, + "balance_loss_clip": 1.24899554, + "balance_loss_mlp": 1.01564932, + "epoch": 0.6168645723733654, + "flos": 13226463135360.0, + "grad_norm": 1.8838617716780197, + "language_loss": 0.71507418, + "learning_rate": 1.3519680388826084e-06, + "loss": 0.73954284, + "num_input_tokens_seen": 220985825, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19812012, + "step": 10260, + "time_per_iteration": 2.8262319564819336 + }, + { + "auxiliary_loss_clip": 0.01444176, + "auxiliary_loss_mlp": 0.01035363, + "balance_loss_clip": 1.27455151, + "balance_loss_mlp": 1.0148474, + "epoch": 0.6169246956260334, + "flos": 26663478163200.0, + "grad_norm": 2.007345619396919, + "language_loss": 0.69026893, + "learning_rate": 1.3515996003217803e-06, + "loss": 0.71506429, + "num_input_tokens_seen": 221004465, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.20507812, + "step": 10261, + "time_per_iteration": 2.8963143825531006 + }, + { + "auxiliary_loss_clip": 0.01411142, + "auxiliary_loss_mlp": 0.01032092, + "balance_loss_clip": 1.24746597, + "balance_loss_mlp": 1.01297116, + "epoch": 0.6169848188787014, + "flos": 23158687259520.0, + "grad_norm": 1.6717329204529354, + "language_loss": 0.71892631, + "learning_rate": 1.3512311863483602e-06, + "loss": 0.74335861, + "num_input_tokens_seen": 221023260, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19116211, + "step": 10262, + "time_per_iteration": 2.857736349105835 + }, + { + "auxiliary_loss_clip": 0.01428133, + "auxiliary_loss_mlp": 0.01031293, + "balance_loss_clip": 1.26395488, + "balance_loss_mlp": 1.01164758, + "epoch": 0.6170449421313693, + "flos": 23342517457920.0, + "grad_norm": 2.3869894631201634, + "language_loss": 0.70912099, + "learning_rate": 1.3508627969763188e-06, + "loss": 0.73371518, + "num_input_tokens_seen": 221043090, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.1965332, + "step": 10263, + "time_per_iteration": 2.8540382385253906 + }, + { + "auxiliary_loss_clip": 0.01428754, + "auxiliary_loss_mlp": 0.01036034, + "balance_loss_clip": 1.26100862, + "balance_loss_mlp": 1.01706743, + "epoch": 0.6171050653840373, + "flos": 15860089797120.0, + "grad_norm": 2.0749195425037774, + "language_loss": 0.76801836, + "learning_rate": 1.3504944322196244e-06, + "loss": 0.7926662, + "num_input_tokens_seen": 221061435, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.1895752, + "step": 10264, + "time_per_iteration": 2.886823892593384 + }, + { + "auxiliary_loss_clip": 0.0141311, + "auxiliary_loss_mlp": 0.01036605, + "balance_loss_clip": 1.25065923, + "balance_loss_mlp": 1.01613665, + "epoch": 0.6171651886367052, + "flos": 20054567473920.0, + "grad_norm": 2.121941146698505, + "language_loss": 0.85351449, + "learning_rate": 1.350126092092247e-06, + "loss": 0.87801164, + "num_input_tokens_seen": 221078705, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.20471191, + "step": 10265, + "time_per_iteration": 4.34124493598938 + }, + { + "auxiliary_loss_clip": 0.01403644, + "auxiliary_loss_mlp": 0.01030937, + "balance_loss_clip": 1.24294293, + "balance_loss_mlp": 1.01261497, + "epoch": 0.6172253118893732, + "flos": 26443967800320.0, + "grad_norm": 2.030236450959666, + "language_loss": 0.65273404, + "learning_rate": 1.349757776608153e-06, + "loss": 0.67707992, + "num_input_tokens_seen": 221099245, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18310547, + "step": 10266, + "time_per_iteration": 2.933673858642578 + }, + { + "auxiliary_loss_clip": 0.01412417, + "auxiliary_loss_mlp": 0.01034302, + "balance_loss_clip": 1.24779391, + "balance_loss_mlp": 1.01514506, + "epoch": 0.6172854351420412, + "flos": 22641790953600.0, + "grad_norm": 1.6687025500633408, + "language_loss": 0.76188719, + "learning_rate": 1.3493894857813094e-06, + "loss": 0.78635442, + "num_input_tokens_seen": 221116930, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19152832, + "step": 10267, + "time_per_iteration": 5.676170110702515 + }, + { + "auxiliary_loss_clip": 0.01429928, + "auxiliary_loss_mlp": 0.01038355, + "balance_loss_clip": 1.26154685, + "balance_loss_mlp": 1.01835179, + "epoch": 0.6173455583947092, + "flos": 21222258007680.0, + "grad_norm": 1.6975070343269385, + "language_loss": 0.75651294, + "learning_rate": 1.3490212196256818e-06, + "loss": 0.78119576, + "num_input_tokens_seen": 221137660, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.20007324, + "step": 10268, + "time_per_iteration": 2.8558239936828613 + }, + { + "auxiliary_loss_clip": 0.01438536, + "auxiliary_loss_mlp": 0.01035021, + "balance_loss_clip": 1.26878905, + "balance_loss_mlp": 1.01518416, + "epoch": 0.6174056816473771, + "flos": 19509727864320.0, + "grad_norm": 1.6862505944540929, + "language_loss": 0.76412112, + "learning_rate": 1.3486529781552342e-06, + "loss": 0.78885674, + "num_input_tokens_seen": 221156225, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.19836426, + "step": 10269, + "time_per_iteration": 2.826125383377075 + }, + { + "auxiliary_loss_clip": 0.01409225, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.24484217, + "balance_loss_mlp": 1.01476371, + "epoch": 0.6174658049000451, + "flos": 16005796611840.0, + "grad_norm": 2.359998860613369, + "language_loss": 0.77168977, + "learning_rate": 1.3482847613839318e-06, + "loss": 0.79611319, + "num_input_tokens_seen": 221173820, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.18359375, + "step": 10270, + "time_per_iteration": 2.809882402420044 + }, + { + "auxiliary_loss_clip": 0.0142233, + "auxiliary_loss_mlp": 0.01035386, + "balance_loss_clip": 1.25792766, + "balance_loss_mlp": 1.01591909, + "epoch": 0.617525928152713, + "flos": 21912804432000.0, + "grad_norm": 2.102801982145458, + "language_loss": 0.83412564, + "learning_rate": 1.347916569325736e-06, + "loss": 0.85870284, + "num_input_tokens_seen": 221191815, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19470215, + "step": 10271, + "time_per_iteration": 2.8300681114196777 + }, + { + "auxiliary_loss_clip": 0.01424427, + "auxiliary_loss_mlp": 0.01036204, + "balance_loss_clip": 1.25887609, + "balance_loss_mlp": 1.01752377, + "epoch": 0.617586051405381, + "flos": 21115896186240.0, + "grad_norm": 1.7600845171183308, + "language_loss": 0.77995968, + "learning_rate": 1.3475484019946093e-06, + "loss": 0.80456597, + "num_input_tokens_seen": 221211205, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.18688965, + "step": 10272, + "time_per_iteration": 2.8330299854278564 + }, + { + "auxiliary_loss_clip": 0.01196724, + "auxiliary_loss_mlp": 0.01020147, + "balance_loss_clip": 1.10555184, + "balance_loss_mlp": 1.0024085, + "epoch": 0.617646174658049, + "flos": 58639255488000.0, + "grad_norm": 0.8094277516357877, + "language_loss": 0.59165466, + "learning_rate": 1.347180259404513e-06, + "loss": 0.61382341, + "num_input_tokens_seen": 221268430, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.17773438, + "step": 10273, + "time_per_iteration": 3.201281785964966 + }, + { + "auxiliary_loss_clip": 0.01411921, + "auxiliary_loss_mlp": 0.01033834, + "balance_loss_clip": 1.25022292, + "balance_loss_mlp": 1.01485562, + "epoch": 0.617706297910717, + "flos": 13885944364800.0, + "grad_norm": 3.059517164332595, + "language_loss": 0.73914707, + "learning_rate": 1.3468121415694059e-06, + "loss": 0.76360464, + "num_input_tokens_seen": 221281930, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18981934, + "step": 10274, + "time_per_iteration": 2.8165013790130615 + }, + { + "auxiliary_loss_clip": 0.01422242, + "auxiliary_loss_mlp": 0.0103541, + "balance_loss_clip": 1.25833559, + "balance_loss_mlp": 1.01572835, + "epoch": 0.617766421163385, + "flos": 19217816542080.0, + "grad_norm": 1.7715768831864545, + "language_loss": 0.78892362, + "learning_rate": 1.3464440485032484e-06, + "loss": 0.81350017, + "num_input_tokens_seen": 221301605, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19677734, + "step": 10275, + "time_per_iteration": 2.892542600631714 + }, + { + "auxiliary_loss_clip": 0.01410392, + "auxiliary_loss_mlp": 0.01036348, + "balance_loss_clip": 1.24683046, + "balance_loss_mlp": 1.01608229, + "epoch": 0.6178265444160529, + "flos": 22576448183040.0, + "grad_norm": 1.665912470558785, + "language_loss": 0.8003307, + "learning_rate": 1.346075980219998e-06, + "loss": 0.82479811, + "num_input_tokens_seen": 221320105, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.20251465, + "step": 10276, + "time_per_iteration": 2.8324923515319824 + }, + { + "auxiliary_loss_clip": 0.01426099, + "auxiliary_loss_mlp": 0.01039309, + "balance_loss_clip": 1.26068211, + "balance_loss_mlp": 1.01912677, + "epoch": 0.6178866676687209, + "flos": 11991801018240.0, + "grad_norm": 1.831212542225, + "language_loss": 0.81652141, + "learning_rate": 1.345707936733612e-06, + "loss": 0.8411755, + "num_input_tokens_seen": 221335915, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.20178223, + "step": 10277, + "time_per_iteration": 2.8627374172210693 + }, + { + "auxiliary_loss_clip": 0.01434244, + "auxiliary_loss_mlp": 0.01038767, + "balance_loss_clip": 1.26608551, + "balance_loss_mlp": 1.01822686, + "epoch": 0.6179467909213888, + "flos": 21000123446400.0, + "grad_norm": 1.518157870500574, + "language_loss": 0.82515377, + "learning_rate": 1.3453399180580466e-06, + "loss": 0.84988391, + "num_input_tokens_seen": 221353965, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.20532227, + "step": 10278, + "time_per_iteration": 2.872072696685791 + }, + { + "auxiliary_loss_clip": 0.014099, + "auxiliary_loss_mlp": 0.01032561, + "balance_loss_clip": 1.24681854, + "balance_loss_mlp": 1.01305795, + "epoch": 0.6180069141740568, + "flos": 25349085429120.0, + "grad_norm": 1.7391308841373785, + "language_loss": 0.74408567, + "learning_rate": 1.3449719242072567e-06, + "loss": 0.76851028, + "num_input_tokens_seen": 221374080, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19494629, + "step": 10279, + "time_per_iteration": 2.8699381351470947 + }, + { + "auxiliary_loss_clip": 0.01422431, + "auxiliary_loss_mlp": 0.01038233, + "balance_loss_clip": 1.25917721, + "balance_loss_mlp": 1.01905191, + "epoch": 0.6180670374267248, + "flos": 19655072720640.0, + "grad_norm": 1.4400008371261395, + "language_loss": 0.71122867, + "learning_rate": 1.3446039551951975e-06, + "loss": 0.73583531, + "num_input_tokens_seen": 221392910, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19189453, + "step": 10280, + "time_per_iteration": 2.838732957839966 + }, + { + "auxiliary_loss_clip": 0.01426328, + "auxiliary_loss_mlp": 0.01036519, + "balance_loss_clip": 1.26059484, + "balance_loss_mlp": 1.01723123, + "epoch": 0.6181271606793928, + "flos": 19474590637440.0, + "grad_norm": 1.6355469496507058, + "language_loss": 0.72903699, + "learning_rate": 1.3442360110358215e-06, + "loss": 0.75366545, + "num_input_tokens_seen": 221410990, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.19287109, + "step": 10281, + "time_per_iteration": 2.818643569946289 + }, + { + "auxiliary_loss_clip": 0.01403805, + "auxiliary_loss_mlp": 0.01036047, + "balance_loss_clip": 1.24438965, + "balance_loss_mlp": 1.01716447, + "epoch": 0.6181872839320607, + "flos": 25605542810880.0, + "grad_norm": 1.4932057766613216, + "language_loss": 0.77847052, + "learning_rate": 1.3438680917430827e-06, + "loss": 0.80286908, + "num_input_tokens_seen": 221431020, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18908691, + "step": 10282, + "time_per_iteration": 2.8969223499298096 + }, + { + "auxiliary_loss_clip": 0.01430391, + "auxiliary_loss_mlp": 0.01037761, + "balance_loss_clip": 1.26219738, + "balance_loss_mlp": 1.01710224, + "epoch": 0.6182474071847287, + "flos": 25561492358400.0, + "grad_norm": 1.6497136675937623, + "language_loss": 0.69582456, + "learning_rate": 1.343500197330931e-06, + "loss": 0.72050607, + "num_input_tokens_seen": 221453235, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.20654297, + "step": 10283, + "time_per_iteration": 2.8804261684417725 + }, + { + "auxiliary_loss_clip": 0.01446141, + "auxiliary_loss_mlp": 0.01033647, + "balance_loss_clip": 1.27335131, + "balance_loss_mlp": 1.01394176, + "epoch": 0.6183075304373966, + "flos": 22132948222080.0, + "grad_norm": 1.8687245848321017, + "language_loss": 0.75346673, + "learning_rate": 1.3431323278133176e-06, + "loss": 0.77826464, + "num_input_tokens_seen": 221472560, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.19702148, + "step": 10284, + "time_per_iteration": 2.856048822402954 + }, + { + "auxiliary_loss_clip": 0.01401102, + "auxiliary_loss_mlp": 0.01035538, + "balance_loss_clip": 1.24386525, + "balance_loss_mlp": 1.01630938, + "epoch": 0.6183676536900646, + "flos": 22465788105600.0, + "grad_norm": 1.4990264835298597, + "language_loss": 0.76262605, + "learning_rate": 1.3427644832041922e-06, + "loss": 0.78699243, + "num_input_tokens_seen": 221492835, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.19226074, + "step": 10285, + "time_per_iteration": 2.8706209659576416 + }, + { + "auxiliary_loss_clip": 0.01414309, + "auxiliary_loss_mlp": 0.01034111, + "balance_loss_clip": 1.24979138, + "balance_loss_mlp": 1.01445377, + "epoch": 0.6184277769427327, + "flos": 23373627897600.0, + "grad_norm": 1.8240474647686413, + "language_loss": 0.73462409, + "learning_rate": 1.342396663517503e-06, + "loss": 0.7591083, + "num_input_tokens_seen": 221511870, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19677734, + "step": 10286, + "time_per_iteration": 2.8997135162353516 + }, + { + "auxiliary_loss_clip": 0.01413563, + "auxiliary_loss_mlp": 0.01031415, + "balance_loss_clip": 1.25048482, + "balance_loss_mlp": 1.01263976, + "epoch": 0.6184879001954006, + "flos": 22721340591360.0, + "grad_norm": 1.7025529512601867, + "language_loss": 0.76804364, + "learning_rate": 1.342028868767199e-06, + "loss": 0.79249346, + "num_input_tokens_seen": 221529915, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.1875, + "step": 10287, + "time_per_iteration": 2.867908000946045 + }, + { + "auxiliary_loss_clip": 0.01421738, + "auxiliary_loss_mlp": 0.01032699, + "balance_loss_clip": 1.25786567, + "balance_loss_mlp": 1.0129106, + "epoch": 0.6185480234480686, + "flos": 23852446064640.0, + "grad_norm": 1.6456854050267995, + "language_loss": 0.73329556, + "learning_rate": 1.3416610989672262e-06, + "loss": 0.75783992, + "num_input_tokens_seen": 221549745, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19775391, + "step": 10288, + "time_per_iteration": 2.882108211517334 + }, + { + "auxiliary_loss_clip": 0.01394826, + "auxiliary_loss_mlp": 0.0103057, + "balance_loss_clip": 1.23617196, + "balance_loss_mlp": 1.01210403, + "epoch": 0.6186081467007365, + "flos": 45493636106880.0, + "grad_norm": 1.7042012845829833, + "language_loss": 0.73613572, + "learning_rate": 1.3412933541315296e-06, + "loss": 0.76038963, + "num_input_tokens_seen": 221572455, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18457031, + "step": 10289, + "time_per_iteration": 4.469456195831299 + }, + { + "auxiliary_loss_clip": 0.01423481, + "auxiliary_loss_mlp": 0.01036291, + "balance_loss_clip": 1.256387, + "balance_loss_mlp": 1.01585829, + "epoch": 0.6186682699534045, + "flos": 23561530128000.0, + "grad_norm": 2.138977945006333, + "language_loss": 0.79767597, + "learning_rate": 1.340925634274056e-06, + "loss": 0.82227361, + "num_input_tokens_seen": 221591325, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.2043457, + "step": 10290, + "time_per_iteration": 2.9141712188720703 + }, + { + "auxiliary_loss_clip": 0.01429991, + "auxiliary_loss_mlp": 0.01032976, + "balance_loss_clip": 1.26242292, + "balance_loss_mlp": 1.01341367, + "epoch": 0.6187283932060724, + "flos": 25784848529280.0, + "grad_norm": 1.7541525066370758, + "language_loss": 0.82051194, + "learning_rate": 1.3405579394087475e-06, + "loss": 0.84514165, + "num_input_tokens_seen": 221611640, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.19567871, + "step": 10291, + "time_per_iteration": 2.899714946746826 + }, + { + "auxiliary_loss_clip": 0.01422285, + "auxiliary_loss_mlp": 0.01033503, + "balance_loss_clip": 1.25659966, + "balance_loss_mlp": 1.01390457, + "epoch": 0.6187885164587404, + "flos": 25276186776960.0, + "grad_norm": 1.731830837887726, + "language_loss": 0.78309983, + "learning_rate": 1.3401902695495487e-06, + "loss": 0.80765772, + "num_input_tokens_seen": 221631225, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.19592285, + "step": 10292, + "time_per_iteration": 2.8982324600219727 + }, + { + "auxiliary_loss_clip": 0.0143938, + "auxiliary_loss_mlp": 0.01037056, + "balance_loss_clip": 1.26773167, + "balance_loss_mlp": 1.01602721, + "epoch": 0.6188486397114084, + "flos": 26262128373120.0, + "grad_norm": 2.114492695611436, + "language_loss": 0.73705339, + "learning_rate": 1.339822624710401e-06, + "loss": 0.76181769, + "num_input_tokens_seen": 221651035, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.21032715, + "step": 10293, + "time_per_iteration": 2.9023170471191406 + }, + { + "auxiliary_loss_clip": 0.01421446, + "auxiliary_loss_mlp": 0.01034884, + "balance_loss_clip": 1.25696266, + "balance_loss_mlp": 1.01522684, + "epoch": 0.6189087629640764, + "flos": 20933242352640.0, + "grad_norm": 1.5845831189416137, + "language_loss": 0.83554184, + "learning_rate": 1.3394550049052454e-06, + "loss": 0.8601051, + "num_input_tokens_seen": 221671300, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19665527, + "step": 10294, + "time_per_iteration": 2.8660778999328613 + }, + { + "auxiliary_loss_clip": 0.01410774, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.24622667, + "balance_loss_mlp": 1.01451278, + "epoch": 0.6189688862167443, + "flos": 14837970343680.0, + "grad_norm": 6.496050594112748, + "language_loss": 0.71759033, + "learning_rate": 1.3390874101480225e-06, + "loss": 0.74203801, + "num_input_tokens_seen": 221687320, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19482422, + "step": 10295, + "time_per_iteration": 2.794046401977539 + }, + { + "auxiliary_loss_clip": 0.01404057, + "auxiliary_loss_mlp": 0.01036054, + "balance_loss_clip": 1.24190307, + "balance_loss_mlp": 1.01618159, + "epoch": 0.6190290094694123, + "flos": 24297077145600.0, + "grad_norm": 1.4536539537209683, + "language_loss": 0.70760489, + "learning_rate": 1.3387198404526705e-06, + "loss": 0.73200595, + "num_input_tokens_seen": 221710175, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19873047, + "step": 10296, + "time_per_iteration": 2.9184675216674805 + }, + { + "auxiliary_loss_clip": 0.01420812, + "auxiliary_loss_mlp": 0.01037625, + "balance_loss_clip": 1.25458503, + "balance_loss_mlp": 1.0171212, + "epoch": 0.6190891327220802, + "flos": 22539908367360.0, + "grad_norm": 2.0796694953303754, + "language_loss": 0.72767663, + "learning_rate": 1.3383522958331287e-06, + "loss": 0.75226104, + "num_input_tokens_seen": 221728145, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.20507812, + "step": 10297, + "time_per_iteration": 2.8540337085723877 + }, + { + "auxiliary_loss_clip": 0.01194863, + "auxiliary_loss_mlp": 0.010199, + "balance_loss_clip": 1.10632896, + "balance_loss_mlp": 1.00168478, + "epoch": 0.6191492559747482, + "flos": 67759459603200.0, + "grad_norm": 0.8960907583886707, + "language_loss": 0.6424948, + "learning_rate": 1.3379847763033345e-06, + "loss": 0.66464245, + "num_input_tokens_seen": 221786100, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.18261719, + "step": 10298, + "time_per_iteration": 3.2800772190093994 + }, + { + "auxiliary_loss_clip": 0.01413192, + "auxiliary_loss_mlp": 0.01042568, + "balance_loss_clip": 1.24804771, + "balance_loss_mlp": 1.02327967, + "epoch": 0.6192093792274163, + "flos": 22356983064960.0, + "grad_norm": 1.6841677404813264, + "language_loss": 0.74922061, + "learning_rate": 1.3376172818772236e-06, + "loss": 0.7737782, + "num_input_tokens_seen": 221806450, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19287109, + "step": 10299, + "time_per_iteration": 2.895599126815796 + }, + { + "auxiliary_loss_clip": 0.01432412, + "auxiliary_loss_mlp": 0.01036538, + "balance_loss_clip": 1.26172328, + "balance_loss_mlp": 1.01698768, + "epoch": 0.6192695024800842, + "flos": 13562062951680.0, + "grad_norm": 1.901617851849146, + "language_loss": 0.68843836, + "learning_rate": 1.337249812568732e-06, + "loss": 0.71312785, + "num_input_tokens_seen": 221823330, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.19555664, + "step": 10300, + "time_per_iteration": 4.311887979507446 + }, + { + "auxiliary_loss_clip": 0.01413047, + "auxiliary_loss_mlp": 0.01037887, + "balance_loss_clip": 1.24827075, + "balance_loss_mlp": 1.01815772, + "epoch": 0.6193296257327522, + "flos": 17422841093760.0, + "grad_norm": 1.7554620577067552, + "language_loss": 0.67722636, + "learning_rate": 1.3368823683917939e-06, + "loss": 0.70173568, + "num_input_tokens_seen": 221839360, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19726562, + "step": 10301, + "time_per_iteration": 2.823730707168579 + }, + { + "auxiliary_loss_clip": 0.01411852, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.2462883, + "balance_loss_mlp": 1.01432765, + "epoch": 0.6193897489854201, + "flos": 31113146367360.0, + "grad_norm": 1.690167095036896, + "language_loss": 0.73944747, + "learning_rate": 1.3365149493603424e-06, + "loss": 0.76389635, + "num_input_tokens_seen": 221859465, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.18701172, + "step": 10302, + "time_per_iteration": 4.320590257644653 + }, + { + "auxiliary_loss_clip": 0.01407964, + "auxiliary_loss_mlp": 0.0103303, + "balance_loss_clip": 1.24444818, + "balance_loss_mlp": 1.0115962, + "epoch": 0.6194498722380881, + "flos": 19143017608320.0, + "grad_norm": 2.084872103215293, + "language_loss": 0.81492001, + "learning_rate": 1.3361475554883107e-06, + "loss": 0.83932996, + "num_input_tokens_seen": 221878555, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.21435547, + "step": 10303, + "time_per_iteration": 4.276179790496826 + }, + { + "auxiliary_loss_clip": 0.01429038, + "auxiliary_loss_mlp": 0.0103665, + "balance_loss_clip": 1.25860095, + "balance_loss_mlp": 1.01577616, + "epoch": 0.619509995490756, + "flos": 21845063687040.0, + "grad_norm": 1.5780255990836647, + "language_loss": 0.77450073, + "learning_rate": 1.3357801867896307e-06, + "loss": 0.79915762, + "num_input_tokens_seen": 221898790, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.20874023, + "step": 10304, + "time_per_iteration": 2.873863697052002 + }, + { + "auxiliary_loss_clip": 0.0142614, + "auxiliary_loss_mlp": 0.01036701, + "balance_loss_clip": 1.25649083, + "balance_loss_mlp": 1.01581502, + "epoch": 0.619570118743424, + "flos": 23817354082560.0, + "grad_norm": 1.9886206936970547, + "language_loss": 0.78648555, + "learning_rate": 1.3354128432782324e-06, + "loss": 0.81111395, + "num_input_tokens_seen": 221918875, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.20874023, + "step": 10305, + "time_per_iteration": 2.978706121444702 + }, + { + "auxiliary_loss_clip": 0.01426504, + "auxiliary_loss_mlp": 0.01038458, + "balance_loss_clip": 1.2573626, + "balance_loss_mlp": 1.01725054, + "epoch": 0.619630241996092, + "flos": 21110421565440.0, + "grad_norm": 1.6412530275777248, + "language_loss": 0.7950455, + "learning_rate": 1.335045524968045e-06, + "loss": 0.81969517, + "num_input_tokens_seen": 221937895, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.21191406, + "step": 10306, + "time_per_iteration": 2.923522472381592 + }, + { + "auxiliary_loss_clip": 0.01410687, + "auxiliary_loss_mlp": 0.0103077, + "balance_loss_clip": 1.24869049, + "balance_loss_mlp": 1.01177955, + "epoch": 0.61969036524876, + "flos": 27319747011840.0, + "grad_norm": 1.7482221807783596, + "language_loss": 0.81039643, + "learning_rate": 1.3346782318729988e-06, + "loss": 0.83481103, + "num_input_tokens_seen": 221955920, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18994141, + "step": 10307, + "time_per_iteration": 2.904693365097046 + }, + { + "auxiliary_loss_clip": 0.01194278, + "auxiliary_loss_mlp": 0.01017373, + "balance_loss_clip": 1.10685229, + "balance_loss_mlp": 0.9978224, + "epoch": 0.6197504885014279, + "flos": 51677434206720.0, + "grad_norm": 0.8076171877744418, + "language_loss": 0.59422278, + "learning_rate": 1.3343109640070203e-06, + "loss": 0.61633933, + "num_input_tokens_seen": 222011405, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.1953125, + "step": 10308, + "time_per_iteration": 3.3857014179229736 + }, + { + "auxiliary_loss_clip": 0.01411334, + "auxiliary_loss_mlp": 0.01030257, + "balance_loss_clip": 1.24903154, + "balance_loss_mlp": 1.0111239, + "epoch": 0.6198106117540959, + "flos": 30569845080960.0, + "grad_norm": 1.862479757372258, + "language_loss": 0.68363994, + "learning_rate": 1.333943721384037e-06, + "loss": 0.70805585, + "num_input_tokens_seen": 222034545, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19128418, + "step": 10309, + "time_per_iteration": 2.9414303302764893 + }, + { + "auxiliary_loss_clip": 0.01404566, + "auxiliary_loss_mlp": 0.01036741, + "balance_loss_clip": 1.24287271, + "balance_loss_mlp": 1.01602244, + "epoch": 0.6198707350067638, + "flos": 18917263463040.0, + "grad_norm": 1.4480451517674497, + "language_loss": 0.73052859, + "learning_rate": 1.3335765040179746e-06, + "loss": 0.7549417, + "num_input_tokens_seen": 222052690, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.20727539, + "step": 10310, + "time_per_iteration": 2.8381073474884033 + }, + { + "auxiliary_loss_clip": 0.0142838, + "auxiliary_loss_mlp": 0.0103679, + "balance_loss_clip": 1.26142681, + "balance_loss_mlp": 1.0165242, + "epoch": 0.6199308582594318, + "flos": 21443759141760.0, + "grad_norm": 1.9809452698090997, + "language_loss": 0.79574466, + "learning_rate": 1.3332093119227573e-06, + "loss": 0.8203963, + "num_input_tokens_seen": 222069095, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.20275879, + "step": 10311, + "time_per_iteration": 2.877274513244629 + }, + { + "auxiliary_loss_clip": 0.01410272, + "auxiliary_loss_mlp": 0.01032712, + "balance_loss_clip": 1.2439574, + "balance_loss_mlp": 1.01267266, + "epoch": 0.6199909815120999, + "flos": 18416971998720.0, + "grad_norm": 2.885738311527711, + "language_loss": 0.73164809, + "learning_rate": 1.3328421451123105e-06, + "loss": 0.75607789, + "num_input_tokens_seen": 222087360, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.20043945, + "step": 10312, + "time_per_iteration": 2.8203017711639404 + }, + { + "auxiliary_loss_clip": 0.0143989, + "auxiliary_loss_mlp": 0.0103575, + "balance_loss_clip": 1.26911807, + "balance_loss_mlp": 1.01547253, + "epoch": 0.6200511047647678, + "flos": 21475864967040.0, + "grad_norm": 1.8287160448351079, + "language_loss": 0.72885442, + "learning_rate": 1.3324750036005557e-06, + "loss": 0.75361085, + "num_input_tokens_seen": 222106130, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.20288086, + "step": 10313, + "time_per_iteration": 2.8425076007843018 + }, + { + "auxiliary_loss_clip": 0.01430172, + "auxiliary_loss_mlp": 0.0103375, + "balance_loss_clip": 1.26173019, + "balance_loss_mlp": 1.01351988, + "epoch": 0.6201112280174358, + "flos": 18223097454720.0, + "grad_norm": 3.1652499886155163, + "language_loss": 0.78978574, + "learning_rate": 1.332107887401416e-06, + "loss": 0.81442499, + "num_input_tokens_seen": 222123125, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.20239258, + "step": 10314, + "time_per_iteration": 2.8005306720733643 + }, + { + "auxiliary_loss_clip": 0.01408085, + "auxiliary_loss_mlp": 0.01035821, + "balance_loss_clip": 1.24305928, + "balance_loss_mlp": 1.01630616, + "epoch": 0.6201713512701037, + "flos": 20020787591040.0, + "grad_norm": 1.677479255773925, + "language_loss": 0.78536248, + "learning_rate": 1.331740796528812e-06, + "loss": 0.80980152, + "num_input_tokens_seen": 222140655, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19506836, + "step": 10315, + "time_per_iteration": 2.838513135910034 + }, + { + "auxiliary_loss_clip": 0.01425303, + "auxiliary_loss_mlp": 0.01038768, + "balance_loss_clip": 1.25717139, + "balance_loss_mlp": 1.01964712, + "epoch": 0.6202314745227717, + "flos": 22496762810880.0, + "grad_norm": 1.9273849343531655, + "language_loss": 0.76949614, + "learning_rate": 1.3313737309966641e-06, + "loss": 0.79413688, + "num_input_tokens_seen": 222160450, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.19140625, + "step": 10316, + "time_per_iteration": 2.8956499099731445 + }, + { + "auxiliary_loss_clip": 0.01407509, + "auxiliary_loss_mlp": 0.01033805, + "balance_loss_clip": 1.24130583, + "balance_loss_mlp": 1.01296747, + "epoch": 0.6202915977754396, + "flos": 26838666604800.0, + "grad_norm": 2.089232130543429, + "language_loss": 0.78501731, + "learning_rate": 1.3310066908188915e-06, + "loss": 0.80943048, + "num_input_tokens_seen": 222179170, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.20837402, + "step": 10317, + "time_per_iteration": 2.8941309452056885 + }, + { + "auxiliary_loss_clip": 0.01203346, + "auxiliary_loss_mlp": 0.01048229, + "balance_loss_clip": 1.11139488, + "balance_loss_mlp": 1.02553129, + "epoch": 0.6203517210281076, + "flos": 62774588931840.0, + "grad_norm": 0.6934760283803361, + "language_loss": 0.59129912, + "learning_rate": 1.3306396760094122e-06, + "loss": 0.61381489, + "num_input_tokens_seen": 222242660, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.2265625, + "step": 10318, + "time_per_iteration": 3.4170868396759033 + }, + { + "auxiliary_loss_clip": 0.01418565, + "auxiliary_loss_mlp": 0.01041488, + "balance_loss_clip": 1.25313163, + "balance_loss_mlp": 1.0197444, + "epoch": 0.6204118442807756, + "flos": 23414737438080.0, + "grad_norm": 1.6245183008069577, + "language_loss": 0.78862488, + "learning_rate": 1.330272686582143e-06, + "loss": 0.81322545, + "num_input_tokens_seen": 222262170, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.21728516, + "step": 10319, + "time_per_iteration": 2.905768394470215 + }, + { + "auxiliary_loss_clip": 0.01402642, + "auxiliary_loss_mlp": 0.01034689, + "balance_loss_clip": 1.24111223, + "balance_loss_mlp": 1.01569903, + "epoch": 0.6204719675334436, + "flos": 20203305690240.0, + "grad_norm": 1.7785194501910797, + "language_loss": 0.67394471, + "learning_rate": 1.3299057225510013e-06, + "loss": 0.698318, + "num_input_tokens_seen": 222280375, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18994141, + "step": 10320, + "time_per_iteration": 2.878734588623047 + }, + { + "auxiliary_loss_clip": 0.01397975, + "auxiliary_loss_mlp": 0.01034726, + "balance_loss_clip": 1.2367475, + "balance_loss_mlp": 1.01543808, + "epoch": 0.6205320907861115, + "flos": 13195805143680.0, + "grad_norm": 1.6692119387511808, + "language_loss": 0.77148533, + "learning_rate": 1.3295387839299013e-06, + "loss": 0.79581237, + "num_input_tokens_seen": 222297325, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.19287109, + "step": 10321, + "time_per_iteration": 2.8377456665039062 + }, + { + "auxiliary_loss_clip": 0.01392868, + "auxiliary_loss_mlp": 0.01032483, + "balance_loss_clip": 1.23192048, + "balance_loss_mlp": 1.01339793, + "epoch": 0.6205922140387795, + "flos": 20678278049280.0, + "grad_norm": 1.7270199572680083, + "language_loss": 0.74660134, + "learning_rate": 1.329171870732758e-06, + "loss": 0.77085483, + "num_input_tokens_seen": 222317095, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19091797, + "step": 10322, + "time_per_iteration": 2.8560922145843506 + }, + { + "auxiliary_loss_clip": 0.01403298, + "auxiliary_loss_mlp": 0.01034495, + "balance_loss_clip": 1.24058557, + "balance_loss_mlp": 1.01506424, + "epoch": 0.6206523372914474, + "flos": 23888216718720.0, + "grad_norm": 1.7568640670967484, + "language_loss": 0.73649967, + "learning_rate": 1.3288049829734845e-06, + "loss": 0.76087761, + "num_input_tokens_seen": 222337055, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19421387, + "step": 10323, + "time_per_iteration": 2.88151216506958 + }, + { + "auxiliary_loss_clip": 0.01430213, + "auxiliary_loss_mlp": 0.01035114, + "balance_loss_clip": 1.25839686, + "balance_loss_mlp": 1.0142045, + "epoch": 0.6207124605441154, + "flos": 13414320120960.0, + "grad_norm": 2.735372635604947, + "language_loss": 0.60448581, + "learning_rate": 1.3284381206659933e-06, + "loss": 0.62913907, + "num_input_tokens_seen": 222354515, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.20922852, + "step": 10324, + "time_per_iteration": 4.265254735946655 + }, + { + "auxiliary_loss_clip": 0.01423321, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.25556183, + "balance_loss_mlp": 1.01520121, + "epoch": 0.6207725837967835, + "flos": 18925588506240.0, + "grad_norm": 1.9176967149696995, + "language_loss": 0.77575254, + "learning_rate": 1.3280712838241956e-06, + "loss": 0.80034608, + "num_input_tokens_seen": 222372755, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.20849609, + "step": 10325, + "time_per_iteration": 2.8687658309936523 + }, + { + "auxiliary_loss_clip": 0.0142728, + "auxiliary_loss_mlp": 0.01031408, + "balance_loss_clip": 1.25831282, + "balance_loss_mlp": 1.0112021, + "epoch": 0.6208327070494514, + "flos": 23988787205760.0, + "grad_norm": 2.167028048460525, + "language_loss": 0.73253649, + "learning_rate": 1.327704472462003e-06, + "loss": 0.75712335, + "num_input_tokens_seen": 222391380, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.2019043, + "step": 10326, + "time_per_iteration": 2.8777382373809814 + }, + { + "auxiliary_loss_clip": 0.01421074, + "auxiliary_loss_mlp": 0.01039053, + "balance_loss_clip": 1.25272584, + "balance_loss_mlp": 1.01767874, + "epoch": 0.6208928303021194, + "flos": 22830688569600.0, + "grad_norm": 2.673547344161478, + "language_loss": 0.7445538, + "learning_rate": 1.3273376865933234e-06, + "loss": 0.76915514, + "num_input_tokens_seen": 222411165, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.21362305, + "step": 10327, + "time_per_iteration": 2.8402843475341797 + }, + { + "auxiliary_loss_clip": 0.01430951, + "auxiliary_loss_mlp": 0.01033527, + "balance_loss_clip": 1.26282644, + "balance_loss_mlp": 1.01180744, + "epoch": 0.6209529535547873, + "flos": 17572936654080.0, + "grad_norm": 2.4898195633204736, + "language_loss": 0.81428623, + "learning_rate": 1.326970926232066e-06, + "loss": 0.83893102, + "num_input_tokens_seen": 222428110, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.21704102, + "step": 10328, + "time_per_iteration": 2.847388505935669 + }, + { + "auxiliary_loss_clip": 0.01415822, + "auxiliary_loss_mlp": 0.01037885, + "balance_loss_clip": 1.25012958, + "balance_loss_mlp": 1.01603413, + "epoch": 0.6210130768074553, + "flos": 22020478352640.0, + "grad_norm": 2.0424529646378504, + "language_loss": 0.78647304, + "learning_rate": 1.3266041913921396e-06, + "loss": 0.81101012, + "num_input_tokens_seen": 222446385, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.21862793, + "step": 10329, + "time_per_iteration": 2.8287460803985596 + }, + { + "auxiliary_loss_clip": 0.01192513, + "auxiliary_loss_mlp": 0.01036415, + "balance_loss_clip": 1.10776162, + "balance_loss_mlp": 1.01676977, + "epoch": 0.6210732000601232, + "flos": 63705259365120.0, + "grad_norm": 0.8406787062293202, + "language_loss": 0.6221323, + "learning_rate": 1.3262374820874484e-06, + "loss": 0.64442158, + "num_input_tokens_seen": 222502150, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.19628906, + "step": 10330, + "time_per_iteration": 3.292299270629883 + }, + { + "auxiliary_loss_clip": 0.01426917, + "auxiliary_loss_mlp": 0.01036303, + "balance_loss_clip": 1.25833189, + "balance_loss_mlp": 1.0157994, + "epoch": 0.6211333233127913, + "flos": 24254022078720.0, + "grad_norm": 1.9376177623153126, + "language_loss": 0.78996146, + "learning_rate": 1.3258707983319002e-06, + "loss": 0.81459367, + "num_input_tokens_seen": 222519880, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.20507812, + "step": 10331, + "time_per_iteration": 2.885284662246704 + }, + { + "auxiliary_loss_clip": 0.01425752, + "auxiliary_loss_mlp": 0.01038847, + "balance_loss_clip": 1.25665379, + "balance_loss_mlp": 1.01797318, + "epoch": 0.6211934465654592, + "flos": 16951940766720.0, + "grad_norm": 1.933695747176211, + "language_loss": 0.68320429, + "learning_rate": 1.3255041401393992e-06, + "loss": 0.70785034, + "num_input_tokens_seen": 222538545, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.2088623, + "step": 10332, + "time_per_iteration": 2.8588366508483887 + }, + { + "auxiliary_loss_clip": 0.01417174, + "auxiliary_loss_mlp": 0.01032826, + "balance_loss_clip": 1.2513082, + "balance_loss_mlp": 1.01279938, + "epoch": 0.6212535698181272, + "flos": 15275588480640.0, + "grad_norm": 1.922229973877783, + "language_loss": 0.7678563, + "learning_rate": 1.3251375075238476e-06, + "loss": 0.79235625, + "num_input_tokens_seen": 222556935, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.20031738, + "step": 10333, + "time_per_iteration": 2.9523508548736572 + }, + { + "auxiliary_loss_clip": 0.01406662, + "auxiliary_loss_mlp": 0.01035566, + "balance_loss_clip": 1.24463582, + "balance_loss_mlp": 1.01527607, + "epoch": 0.6213136930707951, + "flos": 13451900567040.0, + "grad_norm": 3.078980702150917, + "language_loss": 0.70944291, + "learning_rate": 1.3247709004991507e-06, + "loss": 0.73386514, + "num_input_tokens_seen": 222574035, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.20275879, + "step": 10334, + "time_per_iteration": 2.8661539554595947 + }, + { + "auxiliary_loss_clip": 0.01406772, + "auxiliary_loss_mlp": 0.01029836, + "balance_loss_clip": 1.24438024, + "balance_loss_mlp": 1.01076221, + "epoch": 0.6213738163234631, + "flos": 18119631300480.0, + "grad_norm": 2.0590818865755445, + "language_loss": 0.70418203, + "learning_rate": 1.3244043190792078e-06, + "loss": 0.72854805, + "num_input_tokens_seen": 222592290, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19067383, + "step": 10335, + "time_per_iteration": 4.283087253570557 + }, + { + "auxiliary_loss_clip": 0.01417427, + "auxiliary_loss_mlp": 0.01030159, + "balance_loss_clip": 1.25474048, + "balance_loss_mlp": 1.01050103, + "epoch": 0.621433939576131, + "flos": 25348135288320.0, + "grad_norm": 1.4260894177203194, + "language_loss": 0.80615866, + "learning_rate": 1.3240377632779213e-06, + "loss": 0.83063453, + "num_input_tokens_seen": 222612805, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.19628906, + "step": 10336, + "time_per_iteration": 2.9185190200805664 + }, + { + "auxiliary_loss_clip": 0.01398137, + "auxiliary_loss_mlp": 0.01033079, + "balance_loss_clip": 1.23747969, + "balance_loss_mlp": 1.01259851, + "epoch": 0.621494062828799, + "flos": 22575995735040.0, + "grad_norm": 1.6705929916528577, + "language_loss": 0.7364471, + "learning_rate": 1.3236712331091907e-06, + "loss": 0.76075923, + "num_input_tokens_seen": 222632260, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.20483398, + "step": 10337, + "time_per_iteration": 4.31508207321167 + }, + { + "auxiliary_loss_clip": 0.01425391, + "auxiliary_loss_mlp": 0.01035442, + "balance_loss_clip": 1.25589538, + "balance_loss_mlp": 1.01492643, + "epoch": 0.621554186081467, + "flos": 27429547438080.0, + "grad_norm": 3.0960265174885317, + "language_loss": 0.63986719, + "learning_rate": 1.3233047285869145e-06, + "loss": 0.6644755, + "num_input_tokens_seen": 222653570, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.20507812, + "step": 10338, + "time_per_iteration": 2.8883275985717773 + }, + { + "auxiliary_loss_clip": 0.01420261, + "auxiliary_loss_mlp": 0.01030518, + "balance_loss_clip": 1.25575709, + "balance_loss_mlp": 1.01117063, + "epoch": 0.621614309334135, + "flos": 22357164044160.0, + "grad_norm": 1.4786053083553699, + "language_loss": 0.72254199, + "learning_rate": 1.322938249724991e-06, + "loss": 0.74704981, + "num_input_tokens_seen": 222672480, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19348145, + "step": 10339, + "time_per_iteration": 2.8704187870025635 + }, + { + "auxiliary_loss_clip": 0.01399475, + "auxiliary_loss_mlp": 0.01036822, + "balance_loss_clip": 1.2384963, + "balance_loss_mlp": 1.01575768, + "epoch": 0.621674432586803, + "flos": 19290579459840.0, + "grad_norm": 1.8275957207017957, + "language_loss": 0.70367408, + "learning_rate": 1.3225717965373166e-06, + "loss": 0.728037, + "num_input_tokens_seen": 222691200, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.21069336, + "step": 10340, + "time_per_iteration": 2.846717119216919 + }, + { + "auxiliary_loss_clip": 0.01400771, + "auxiliary_loss_mlp": 0.01034507, + "balance_loss_clip": 1.23865294, + "balance_loss_mlp": 1.01393151, + "epoch": 0.6217345558394709, + "flos": 21617725973760.0, + "grad_norm": 1.9106037807322824, + "language_loss": 0.69989884, + "learning_rate": 1.322205369037788e-06, + "loss": 0.72425163, + "num_input_tokens_seen": 222709975, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.20568848, + "step": 10341, + "time_per_iteration": 2.8518295288085938 + }, + { + "auxiliary_loss_clip": 0.01417738, + "auxiliary_loss_mlp": 0.01034225, + "balance_loss_clip": 1.25186563, + "balance_loss_mlp": 1.01325655, + "epoch": 0.6217946790921389, + "flos": 18013088499840.0, + "grad_norm": 1.7141534917200223, + "language_loss": 0.80992138, + "learning_rate": 1.321838967240299e-06, + "loss": 0.83444095, + "num_input_tokens_seen": 222729005, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.2097168, + "step": 10342, + "time_per_iteration": 2.830789566040039 + }, + { + "auxiliary_loss_clip": 0.01191718, + "auxiliary_loss_mlp": 0.01028723, + "balance_loss_clip": 1.10582972, + "balance_loss_mlp": 1.00926769, + "epoch": 0.6218548023448068, + "flos": 62004583359360.0, + "grad_norm": 0.7889177331274366, + "language_loss": 0.57358801, + "learning_rate": 1.3214725911587452e-06, + "loss": 0.59579241, + "num_input_tokens_seen": 222786090, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.19433594, + "step": 10343, + "time_per_iteration": 3.2910213470458984 + }, + { + "auxiliary_loss_clip": 0.01395578, + "auxiliary_loss_mlp": 0.01030721, + "balance_loss_clip": 1.23515248, + "balance_loss_mlp": 1.01183867, + "epoch": 0.6219149255974749, + "flos": 25750163750400.0, + "grad_norm": 2.6103681137018704, + "language_loss": 0.73526251, + "learning_rate": 1.3211062408070184e-06, + "loss": 0.75952554, + "num_input_tokens_seen": 222806100, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18896484, + "step": 10344, + "time_per_iteration": 2.9269955158233643 + }, + { + "auxiliary_loss_clip": 0.01414027, + "auxiliary_loss_mlp": 0.0103714, + "balance_loss_clip": 1.24950695, + "balance_loss_mlp": 1.01671982, + "epoch": 0.6219750488501428, + "flos": 25422074570880.0, + "grad_norm": 1.689220590238894, + "language_loss": 0.61072224, + "learning_rate": 1.3207399161990105e-06, + "loss": 0.63523388, + "num_input_tokens_seen": 222826575, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.20422363, + "step": 10345, + "time_per_iteration": 2.901153564453125 + }, + { + "auxiliary_loss_clip": 0.01411991, + "auxiliary_loss_mlp": 0.0103531, + "balance_loss_clip": 1.2474544, + "balance_loss_mlp": 1.01446056, + "epoch": 0.6220351721028108, + "flos": 20056920203520.0, + "grad_norm": 2.0506466777648122, + "language_loss": 0.78758377, + "learning_rate": 1.320373617348614e-06, + "loss": 0.81205678, + "num_input_tokens_seen": 222845285, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.20837402, + "step": 10346, + "time_per_iteration": 2.8809657096862793 + }, + { + "auxiliary_loss_clip": 0.01415704, + "auxiliary_loss_mlp": 0.01034713, + "balance_loss_clip": 1.24787951, + "balance_loss_mlp": 1.01435173, + "epoch": 0.6220952953554787, + "flos": 27499324199040.0, + "grad_norm": 1.8567347889708723, + "language_loss": 0.72395295, + "learning_rate": 1.3200073442697171e-06, + "loss": 0.74845707, + "num_input_tokens_seen": 222864575, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.20361328, + "step": 10347, + "time_per_iteration": 2.896475315093994 + }, + { + "auxiliary_loss_clip": 0.01414334, + "auxiliary_loss_mlp": 0.01034706, + "balance_loss_clip": 1.25070477, + "balance_loss_mlp": 1.01410639, + "epoch": 0.6221554186081467, + "flos": 19216775911680.0, + "grad_norm": 3.6075826049757804, + "language_loss": 0.72865021, + "learning_rate": 1.3196410969762108e-06, + "loss": 0.75314063, + "num_input_tokens_seen": 222884420, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.20617676, + "step": 10348, + "time_per_iteration": 2.879554510116577 + }, + { + "auxiliary_loss_clip": 0.01187336, + "auxiliary_loss_mlp": 0.01027688, + "balance_loss_clip": 1.10160041, + "balance_loss_mlp": 1.0098542, + "epoch": 0.6222155418608146, + "flos": 62980118651520.0, + "grad_norm": 0.8218903598601407, + "language_loss": 0.54250956, + "learning_rate": 1.3192748754819815e-06, + "loss": 0.56465977, + "num_input_tokens_seen": 222944690, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.17871094, + "step": 10349, + "time_per_iteration": 3.3399224281311035 + }, + { + "auxiliary_loss_clip": 0.01423068, + "auxiliary_loss_mlp": 0.01031727, + "balance_loss_clip": 1.25710249, + "balance_loss_mlp": 1.0122, + "epoch": 0.6222756651134826, + "flos": 22611223451520.0, + "grad_norm": 1.951743560498765, + "language_loss": 0.69923675, + "learning_rate": 1.3189086798009173e-06, + "loss": 0.72378463, + "num_input_tokens_seen": 222962990, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.19506836, + "step": 10350, + "time_per_iteration": 2.9471845626831055 + }, + { + "auxiliary_loss_clip": 0.01408682, + "auxiliary_loss_mlp": 0.01037052, + "balance_loss_clip": 1.24501204, + "balance_loss_mlp": 1.01639247, + "epoch": 0.6223357883661506, + "flos": 21152119288320.0, + "grad_norm": 1.9361258847616158, + "language_loss": 0.58206069, + "learning_rate": 1.3185425099469046e-06, + "loss": 0.60651803, + "num_input_tokens_seen": 222980715, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.20666504, + "step": 10351, + "time_per_iteration": 2.8502047061920166 + }, + { + "auxiliary_loss_clip": 0.01190214, + "auxiliary_loss_mlp": 0.01025069, + "balance_loss_clip": 1.10192227, + "balance_loss_mlp": 1.00570917, + "epoch": 0.6223959116188186, + "flos": 63797432302080.0, + "grad_norm": 0.7939591424533053, + "language_loss": 0.61206114, + "learning_rate": 1.3181763659338276e-06, + "loss": 0.63421398, + "num_input_tokens_seen": 223040685, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.19335938, + "step": 10352, + "time_per_iteration": 3.242985486984253 + }, + { + "auxiliary_loss_clip": 0.01407763, + "auxiliary_loss_mlp": 0.01035263, + "balance_loss_clip": 1.2460264, + "balance_loss_mlp": 1.01535487, + "epoch": 0.6224560348714866, + "flos": 22576131469440.0, + "grad_norm": 2.5465376547902427, + "language_loss": 0.83192146, + "learning_rate": 1.3178102477755714e-06, + "loss": 0.85635173, + "num_input_tokens_seen": 223059000, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19909668, + "step": 10353, + "time_per_iteration": 2.857053756713867 + }, + { + "auxiliary_loss_clip": 0.01405112, + "auxiliary_loss_mlp": 0.01032058, + "balance_loss_clip": 1.24399471, + "balance_loss_mlp": 1.01254368, + "epoch": 0.6225161581241545, + "flos": 24108450998400.0, + "grad_norm": 1.6699342103467656, + "language_loss": 0.76698947, + "learning_rate": 1.3174441554860195e-06, + "loss": 0.79136121, + "num_input_tokens_seen": 223079345, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.19506836, + "step": 10354, + "time_per_iteration": 2.898144006729126 + }, + { + "auxiliary_loss_clip": 0.01402526, + "auxiliary_loss_mlp": 0.01036968, + "balance_loss_clip": 1.2407434, + "balance_loss_mlp": 1.01723886, + "epoch": 0.6225762813768225, + "flos": 20451799987200.0, + "grad_norm": 1.50157762552056, + "language_loss": 0.7903322, + "learning_rate": 1.3170780890790528e-06, + "loss": 0.81472707, + "num_input_tokens_seen": 223097880, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19726562, + "step": 10355, + "time_per_iteration": 2.8429126739501953 + }, + { + "auxiliary_loss_clip": 0.01417786, + "auxiliary_loss_mlp": 0.01033058, + "balance_loss_clip": 1.2535547, + "balance_loss_mlp": 1.01380563, + "epoch": 0.6226364046294904, + "flos": 27209403648000.0, + "grad_norm": 3.5451447466606707, + "language_loss": 0.78817922, + "learning_rate": 1.3167120485685538e-06, + "loss": 0.8126877, + "num_input_tokens_seen": 223118185, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19238281, + "step": 10356, + "time_per_iteration": 2.973090410232544 + }, + { + "auxiliary_loss_clip": 0.01435579, + "auxiliary_loss_mlp": 0.0104125, + "balance_loss_clip": 1.26431704, + "balance_loss_mlp": 1.02001834, + "epoch": 0.6226965278821585, + "flos": 20454650409600.0, + "grad_norm": 2.30340056727797, + "language_loss": 0.68824887, + "learning_rate": 1.3163460339684024e-06, + "loss": 0.71301717, + "num_input_tokens_seen": 223137600, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.21228027, + "step": 10357, + "time_per_iteration": 2.872229814529419 + }, + { + "auxiliary_loss_clip": 0.01428332, + "auxiliary_loss_mlp": 0.01036642, + "balance_loss_clip": 1.25817883, + "balance_loss_mlp": 1.01491046, + "epoch": 0.6227566511348264, + "flos": 22172609928960.0, + "grad_norm": 2.9492632425352867, + "language_loss": 0.7697376, + "learning_rate": 1.3159800452924778e-06, + "loss": 0.7943874, + "num_input_tokens_seen": 223154360, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.21716309, + "step": 10358, + "time_per_iteration": 2.842421531677246 + }, + { + "auxiliary_loss_clip": 0.01421337, + "auxiliary_loss_mlp": 0.0103829, + "balance_loss_clip": 1.25345016, + "balance_loss_mlp": 1.01864409, + "epoch": 0.6228167743874944, + "flos": 18049945029120.0, + "grad_norm": 2.108111295112494, + "language_loss": 0.83286315, + "learning_rate": 1.3156140825546588e-06, + "loss": 0.85745943, + "num_input_tokens_seen": 223172255, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.19665527, + "step": 10359, + "time_per_iteration": 4.29512619972229 + }, + { + "auxiliary_loss_clip": 0.01398992, + "auxiliary_loss_mlp": 0.01044454, + "balance_loss_clip": 1.23980331, + "balance_loss_mlp": 1.0234611, + "epoch": 0.6228768976401623, + "flos": 17750251601280.0, + "grad_norm": 3.7510155864298373, + "language_loss": 0.74177945, + "learning_rate": 1.315248145768822e-06, + "loss": 0.76621389, + "num_input_tokens_seen": 223186965, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.20996094, + "step": 10360, + "time_per_iteration": 2.805169105529785 + }, + { + "auxiliary_loss_clip": 0.01416524, + "auxiliary_loss_mlp": 0.01038679, + "balance_loss_clip": 1.24986362, + "balance_loss_mlp": 1.01877117, + "epoch": 0.6229370208928303, + "flos": 17903966745600.0, + "grad_norm": 1.9791982364275265, + "language_loss": 0.781353, + "learning_rate": 1.3148822349488442e-06, + "loss": 0.80590498, + "num_input_tokens_seen": 223206045, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.19934082, + "step": 10361, + "time_per_iteration": 2.8429064750671387 + }, + { + "auxiliary_loss_clip": 0.01413532, + "auxiliary_loss_mlp": 0.01037452, + "balance_loss_clip": 1.25112104, + "balance_loss_mlp": 1.01750803, + "epoch": 0.6229971441454982, + "flos": 17356683916800.0, + "grad_norm": 1.6660070088685646, + "language_loss": 0.68417227, + "learning_rate": 1.3145163501086005e-06, + "loss": 0.70868212, + "num_input_tokens_seen": 223224820, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19946289, + "step": 10362, + "time_per_iteration": 2.83235502243042 + }, + { + "auxiliary_loss_clip": 0.01416237, + "auxiliary_loss_mlp": 0.0104197, + "balance_loss_clip": 1.25164735, + "balance_loss_mlp": 1.02160954, + "epoch": 0.6230572673981662, + "flos": 29253190106880.0, + "grad_norm": 2.083099213207099, + "language_loss": 0.68625456, + "learning_rate": 1.3141504912619658e-06, + "loss": 0.71083665, + "num_input_tokens_seen": 223243205, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.20361328, + "step": 10363, + "time_per_iteration": 2.9104669094085693 + }, + { + "auxiliary_loss_clip": 0.01422123, + "auxiliary_loss_mlp": 0.01038878, + "balance_loss_clip": 1.25240576, + "balance_loss_mlp": 1.01818371, + "epoch": 0.6231173906508342, + "flos": 16334564463360.0, + "grad_norm": 1.788790618088364, + "language_loss": 0.87587351, + "learning_rate": 1.3137846584228127e-06, + "loss": 0.90048349, + "num_input_tokens_seen": 223261370, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.20703125, + "step": 10364, + "time_per_iteration": 2.8866076469421387 + }, + { + "auxiliary_loss_clip": 0.01190433, + "auxiliary_loss_mlp": 0.01041992, + "balance_loss_clip": 1.1010561, + "balance_loss_mlp": 1.02244127, + "epoch": 0.6231775139035022, + "flos": 68729158316160.0, + "grad_norm": 0.8859861133245455, + "language_loss": 0.60857117, + "learning_rate": 1.313418851605015e-06, + "loss": 0.63089538, + "num_input_tokens_seen": 223315050, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.1953125, + "step": 10365, + "time_per_iteration": 3.333400011062622 + }, + { + "auxiliary_loss_clip": 0.01449441, + "auxiliary_loss_mlp": 0.01045571, + "balance_loss_clip": 1.27624142, + "balance_loss_mlp": 1.02368414, + "epoch": 0.6232376371561702, + "flos": 19828587104640.0, + "grad_norm": 4.29669287413631, + "language_loss": 0.76090056, + "learning_rate": 1.3130530708224427e-06, + "loss": 0.78585064, + "num_input_tokens_seen": 223332130, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.21899414, + "step": 10366, + "time_per_iteration": 2.8515429496765137 + }, + { + "auxiliary_loss_clip": 0.01432849, + "auxiliary_loss_mlp": 0.01042731, + "balance_loss_clip": 1.26387298, + "balance_loss_mlp": 1.02209604, + "epoch": 0.6232977604088381, + "flos": 23268668664960.0, + "grad_norm": 2.0160815612541776, + "language_loss": 0.77437437, + "learning_rate": 1.3126873160889665e-06, + "loss": 0.7991302, + "num_input_tokens_seen": 223351605, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.20629883, + "step": 10367, + "time_per_iteration": 2.8505983352661133 + }, + { + "auxiliary_loss_clip": 0.01411808, + "auxiliary_loss_mlp": 0.01044456, + "balance_loss_clip": 1.2504741, + "balance_loss_mlp": 1.02333212, + "epoch": 0.6233578836615061, + "flos": 21116484368640.0, + "grad_norm": 1.5366579323636682, + "language_loss": 0.7902025, + "learning_rate": 1.312321587418457e-06, + "loss": 0.8147651, + "num_input_tokens_seen": 223372090, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.21130371, + "step": 10368, + "time_per_iteration": 2.907984495162964 + }, + { + "auxiliary_loss_clip": 0.01416078, + "auxiliary_loss_mlp": 0.01038597, + "balance_loss_clip": 1.25023818, + "balance_loss_mlp": 1.01806939, + "epoch": 0.623418006914174, + "flos": 23780045105280.0, + "grad_norm": 1.770711521142182, + "language_loss": 0.69778758, + "learning_rate": 1.3119558848247811e-06, + "loss": 0.72233427, + "num_input_tokens_seen": 223390110, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.20532227, + "step": 10369, + "time_per_iteration": 2.8732213973999023 + }, + { + "auxiliary_loss_clip": 0.01423436, + "auxiliary_loss_mlp": 0.01038222, + "balance_loss_clip": 1.25832176, + "balance_loss_mlp": 1.0175277, + "epoch": 0.6234781301668421, + "flos": 17898356390400.0, + "grad_norm": 2.110412146605329, + "language_loss": 0.88336813, + "learning_rate": 1.3115902083218072e-06, + "loss": 0.90798473, + "num_input_tokens_seen": 223404205, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.20715332, + "step": 10370, + "time_per_iteration": 4.228360176086426 + }, + { + "auxiliary_loss_clip": 0.0141715, + "auxiliary_loss_mlp": 0.01034341, + "balance_loss_clip": 1.25322342, + "balance_loss_mlp": 1.01470709, + "epoch": 0.62353825341951, + "flos": 26186243564160.0, + "grad_norm": 1.4556642707085412, + "language_loss": 0.66794479, + "learning_rate": 1.311224557923402e-06, + "loss": 0.6924597, + "num_input_tokens_seen": 223424855, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19628906, + "step": 10371, + "time_per_iteration": 2.896641254425049 + }, + { + "auxiliary_loss_clip": 0.0139086, + "auxiliary_loss_mlp": 0.01037834, + "balance_loss_clip": 1.23452032, + "balance_loss_mlp": 1.01873648, + "epoch": 0.623598376672178, + "flos": 31152536605440.0, + "grad_norm": 1.3028894607517774, + "language_loss": 0.78157222, + "learning_rate": 1.3108589336434298e-06, + "loss": 0.80585921, + "num_input_tokens_seen": 223447225, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.19091797, + "step": 10372, + "time_per_iteration": 5.6874754428863525 + }, + { + "auxiliary_loss_clip": 0.01418618, + "auxiliary_loss_mlp": 0.01036906, + "balance_loss_clip": 1.25188518, + "balance_loss_mlp": 1.01674771, + "epoch": 0.6236584999248459, + "flos": 23740111929600.0, + "grad_norm": 1.8759815699399536, + "language_loss": 0.77739549, + "learning_rate": 1.3104933354957568e-06, + "loss": 0.80195069, + "num_input_tokens_seen": 223467520, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.20153809, + "step": 10373, + "time_per_iteration": 2.8695085048675537 + }, + { + "auxiliary_loss_clip": 0.01407205, + "auxiliary_loss_mlp": 0.01031458, + "balance_loss_clip": 1.24628425, + "balance_loss_mlp": 1.01267099, + "epoch": 0.6237186231775139, + "flos": 21772888951680.0, + "grad_norm": 1.6211541034227688, + "language_loss": 0.70557612, + "learning_rate": 1.3101277634942448e-06, + "loss": 0.72996271, + "num_input_tokens_seen": 223488130, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18786621, + "step": 10374, + "time_per_iteration": 2.8578813076019287 + }, + { + "auxiliary_loss_clip": 0.01427085, + "auxiliary_loss_mlp": 0.01036571, + "balance_loss_clip": 1.25938463, + "balance_loss_mlp": 1.01579285, + "epoch": 0.6237787464301818, + "flos": 14947725525120.0, + "grad_norm": 1.8207986027795982, + "language_loss": 0.77630186, + "learning_rate": 1.3097622176527577e-06, + "loss": 0.80093843, + "num_input_tokens_seen": 223505105, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.20788574, + "step": 10375, + "time_per_iteration": 2.9992563724517822 + }, + { + "auxiliary_loss_clip": 0.01412227, + "auxiliary_loss_mlp": 0.01035305, + "balance_loss_clip": 1.25078416, + "balance_loss_mlp": 1.01641095, + "epoch": 0.6238388696828499, + "flos": 35603878867200.0, + "grad_norm": 1.5997237691571562, + "language_loss": 0.71379656, + "learning_rate": 1.3093966979851566e-06, + "loss": 0.73827189, + "num_input_tokens_seen": 223528065, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18908691, + "step": 10376, + "time_per_iteration": 2.9938740730285645 + }, + { + "auxiliary_loss_clip": 0.01424868, + "auxiliary_loss_mlp": 0.01041116, + "balance_loss_clip": 1.25787568, + "balance_loss_mlp": 1.02087414, + "epoch": 0.6238989929355178, + "flos": 23634112066560.0, + "grad_norm": 2.4870905527129383, + "language_loss": 0.77906328, + "learning_rate": 1.309031204505301e-06, + "loss": 0.8037231, + "num_input_tokens_seen": 223547305, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.20239258, + "step": 10377, + "time_per_iteration": 2.8679239749908447 + }, + { + "auxiliary_loss_clip": 0.01415293, + "auxiliary_loss_mlp": 0.01034464, + "balance_loss_clip": 1.25118256, + "balance_loss_mlp": 1.01596308, + "epoch": 0.6239591161881858, + "flos": 22096725120000.0, + "grad_norm": 2.607547396239463, + "language_loss": 0.69179416, + "learning_rate": 1.308665737227052e-06, + "loss": 0.71629167, + "num_input_tokens_seen": 223567205, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.18493652, + "step": 10378, + "time_per_iteration": 2.8788669109344482 + }, + { + "auxiliary_loss_clip": 0.01409015, + "auxiliary_loss_mlp": 0.01031868, + "balance_loss_clip": 1.24576831, + "balance_loss_mlp": 1.01269913, + "epoch": 0.6240192394408538, + "flos": 24546838296960.0, + "grad_norm": 2.326988346432853, + "language_loss": 0.76826048, + "learning_rate": 1.3083002961642675e-06, + "loss": 0.7926693, + "num_input_tokens_seen": 223586560, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19177246, + "step": 10379, + "time_per_iteration": 2.931349277496338 + }, + { + "auxiliary_loss_clip": 0.01420067, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.25356698, + "balance_loss_mlp": 1.01130903, + "epoch": 0.6240793626935217, + "flos": 27944452972800.0, + "grad_norm": 1.653768105452062, + "language_loss": 0.79383671, + "learning_rate": 1.3079348813308051e-06, + "loss": 0.8183459, + "num_input_tokens_seen": 223610595, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.1953125, + "step": 10380, + "time_per_iteration": 2.9162395000457764 + }, + { + "auxiliary_loss_clip": 0.01404293, + "auxiliary_loss_mlp": 0.01035882, + "balance_loss_clip": 1.24324822, + "balance_loss_mlp": 1.01696348, + "epoch": 0.6241394859461897, + "flos": 22902591836160.0, + "grad_norm": 1.5448015725664552, + "language_loss": 0.80673206, + "learning_rate": 1.3075694927405207e-06, + "loss": 0.83113378, + "num_input_tokens_seen": 223630230, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18920898, + "step": 10381, + "time_per_iteration": 2.8782458305358887 + }, + { + "auxiliary_loss_clip": 0.01418554, + "auxiliary_loss_mlp": 0.01031413, + "balance_loss_clip": 1.25359929, + "balance_loss_mlp": 1.01198208, + "epoch": 0.6241996091988576, + "flos": 12758548965120.0, + "grad_norm": 1.9178629666890425, + "language_loss": 0.75016069, + "learning_rate": 1.3072041304072718e-06, + "loss": 0.77466047, + "num_input_tokens_seen": 223648360, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19433594, + "step": 10382, + "time_per_iteration": 2.8514316082000732 + }, + { + "auxiliary_loss_clip": 0.01417026, + "auxiliary_loss_mlp": 0.01035218, + "balance_loss_clip": 1.25435972, + "balance_loss_mlp": 1.01486897, + "epoch": 0.6242597324515257, + "flos": 25863040823040.0, + "grad_norm": 1.5111190274983402, + "language_loss": 0.78958422, + "learning_rate": 1.306838794344911e-06, + "loss": 0.8141067, + "num_input_tokens_seen": 223671255, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.20349121, + "step": 10383, + "time_per_iteration": 2.948786735534668 + }, + { + "auxiliary_loss_clip": 0.01410078, + "auxiliary_loss_mlp": 0.01030115, + "balance_loss_clip": 1.24644458, + "balance_loss_mlp": 1.01079094, + "epoch": 0.6243198557041936, + "flos": 19947165022080.0, + "grad_norm": 1.7632518023667838, + "language_loss": 0.75564879, + "learning_rate": 1.3064734845672925e-06, + "loss": 0.78005075, + "num_input_tokens_seen": 223689860, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19335938, + "step": 10384, + "time_per_iteration": 2.826127529144287 + }, + { + "auxiliary_loss_clip": 0.01424282, + "auxiliary_loss_mlp": 0.01035223, + "balance_loss_clip": 1.25708115, + "balance_loss_mlp": 1.01407552, + "epoch": 0.6243799789568616, + "flos": 18415478920320.0, + "grad_norm": 1.8679889539439598, + "language_loss": 0.66835171, + "learning_rate": 1.3061082010882694e-06, + "loss": 0.69294679, + "num_input_tokens_seen": 223707835, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.21166992, + "step": 10385, + "time_per_iteration": 2.8361148834228516 + }, + { + "auxiliary_loss_clip": 0.01183359, + "auxiliary_loss_mlp": 0.01016519, + "balance_loss_clip": 1.0966922, + "balance_loss_mlp": 0.9971593, + "epoch": 0.6244401022095295, + "flos": 66060638634240.0, + "grad_norm": 0.7550878379077774, + "language_loss": 0.62077135, + "learning_rate": 1.305742943921692e-06, + "loss": 0.64277011, + "num_input_tokens_seen": 223771875, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.19335938, + "step": 10386, + "time_per_iteration": 3.3833136558532715 + }, + { + "auxiliary_loss_clip": 0.0141568, + "auxiliary_loss_mlp": 0.01034189, + "balance_loss_clip": 1.2502749, + "balance_loss_mlp": 1.01410198, + "epoch": 0.6245002254621975, + "flos": 24581885034240.0, + "grad_norm": 2.3852820475306324, + "language_loss": 0.72914314, + "learning_rate": 1.3053777130814128e-06, + "loss": 0.75364184, + "num_input_tokens_seen": 223788895, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.20080566, + "step": 10387, + "time_per_iteration": 2.892831563949585 + }, + { + "auxiliary_loss_clip": 0.01434031, + "auxiliary_loss_mlp": 0.01037322, + "balance_loss_clip": 1.26344776, + "balance_loss_mlp": 1.01653218, + "epoch": 0.6245603487148654, + "flos": 29180743902720.0, + "grad_norm": 2.1307382585699997, + "language_loss": 0.65897548, + "learning_rate": 1.3050125085812798e-06, + "loss": 0.68368906, + "num_input_tokens_seen": 223810385, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.20800781, + "step": 10388, + "time_per_iteration": 2.903571128845215 + }, + { + "auxiliary_loss_clip": 0.01416052, + "auxiliary_loss_mlp": 0.01034079, + "balance_loss_clip": 1.25119734, + "balance_loss_mlp": 1.01482654, + "epoch": 0.6246204719675335, + "flos": 14797403740800.0, + "grad_norm": 1.7686758299364909, + "language_loss": 0.79898572, + "learning_rate": 1.3046473304351417e-06, + "loss": 0.82348692, + "num_input_tokens_seen": 223826040, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19262695, + "step": 10389, + "time_per_iteration": 2.808130979537964 + }, + { + "auxiliary_loss_clip": 0.01401946, + "auxiliary_loss_mlp": 0.0103086, + "balance_loss_clip": 1.23999476, + "balance_loss_mlp": 1.01103568, + "epoch": 0.6246805952202014, + "flos": 12500779484160.0, + "grad_norm": 2.0965635063971826, + "language_loss": 0.6147033, + "learning_rate": 1.3042821786568475e-06, + "loss": 0.63903129, + "num_input_tokens_seen": 223842300, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19812012, + "step": 10390, + "time_per_iteration": 2.822838306427002 + }, + { + "auxiliary_loss_clip": 0.01435465, + "auxiliary_loss_mlp": 0.01038445, + "balance_loss_clip": 1.2664988, + "balance_loss_mlp": 1.01881087, + "epoch": 0.6247407184728694, + "flos": 12794319619200.0, + "grad_norm": 2.0245379916007504, + "language_loss": 0.78091675, + "learning_rate": 1.3039170532602416e-06, + "loss": 0.80565584, + "num_input_tokens_seen": 223858320, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.19641113, + "step": 10391, + "time_per_iteration": 2.8201215267181396 + }, + { + "auxiliary_loss_clip": 0.01427239, + "auxiliary_loss_mlp": 0.01037623, + "balance_loss_clip": 1.26095295, + "balance_loss_mlp": 1.01742935, + "epoch": 0.6248008417255374, + "flos": 40645061331840.0, + "grad_norm": 1.6106837147551534, + "language_loss": 0.65479583, + "learning_rate": 1.3035519542591718e-06, + "loss": 0.67944443, + "num_input_tokens_seen": 223883545, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.20202637, + "step": 10392, + "time_per_iteration": 3.085664749145508 + }, + { + "auxiliary_loss_clip": 0.01429348, + "auxiliary_loss_mlp": 0.01034165, + "balance_loss_clip": 1.26143634, + "balance_loss_mlp": 1.01324344, + "epoch": 0.6248609649782053, + "flos": 19911665836800.0, + "grad_norm": 1.8525788806332424, + "language_loss": 0.76956904, + "learning_rate": 1.3031868816674819e-06, + "loss": 0.79420418, + "num_input_tokens_seen": 223901445, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.20910645, + "step": 10393, + "time_per_iteration": 2.907437324523926 + }, + { + "auxiliary_loss_clip": 0.01418702, + "auxiliary_loss_mlp": 0.01038371, + "balance_loss_clip": 1.25107813, + "balance_loss_mlp": 1.01693702, + "epoch": 0.6249210882308733, + "flos": 19692245963520.0, + "grad_norm": 1.8205900095628822, + "language_loss": 0.83476657, + "learning_rate": 1.3028218354990142e-06, + "loss": 0.85933733, + "num_input_tokens_seen": 223920170, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.21447754, + "step": 10394, + "time_per_iteration": 4.234117269515991 + }, + { + "auxiliary_loss_clip": 0.01416713, + "auxiliary_loss_mlp": 0.01035836, + "balance_loss_clip": 1.24888635, + "balance_loss_mlp": 1.01583266, + "epoch": 0.6249812114835412, + "flos": 13998685703040.0, + "grad_norm": 1.8721606309424472, + "language_loss": 0.76098835, + "learning_rate": 1.3024568157676128e-06, + "loss": 0.78551376, + "num_input_tokens_seen": 223936495, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.1998291, + "step": 10395, + "time_per_iteration": 2.841761827468872 + }, + { + "auxiliary_loss_clip": 0.01437377, + "auxiliary_loss_mlp": 0.01037848, + "balance_loss_clip": 1.26803517, + "balance_loss_mlp": 1.01805902, + "epoch": 0.6250413347362093, + "flos": 14536059920640.0, + "grad_norm": 2.3202493468349665, + "language_loss": 0.73249477, + "learning_rate": 1.302091822487119e-06, + "loss": 0.75724709, + "num_input_tokens_seen": 223950070, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.19787598, + "step": 10396, + "time_per_iteration": 2.8274879455566406 + }, + { + "auxiliary_loss_clip": 0.01423147, + "auxiliary_loss_mlp": 0.0103098, + "balance_loss_clip": 1.2572577, + "balance_loss_mlp": 1.01157308, + "epoch": 0.6251014579888772, + "flos": 22972459086720.0, + "grad_norm": 2.21579124143767, + "language_loss": 0.76970053, + "learning_rate": 1.3017268556713732e-06, + "loss": 0.79424179, + "num_input_tokens_seen": 223970065, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.19396973, + "step": 10397, + "time_per_iteration": 2.8524744510650635 + }, + { + "auxiliary_loss_clip": 0.01424856, + "auxiliary_loss_mlp": 0.01037701, + "balance_loss_clip": 1.25833964, + "balance_loss_mlp": 1.01716089, + "epoch": 0.6251615812415452, + "flos": 28122989529600.0, + "grad_norm": 2.2397817770120247, + "language_loss": 0.76013952, + "learning_rate": 1.3013619153342154e-06, + "loss": 0.78476512, + "num_input_tokens_seen": 223990315, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.20556641, + "step": 10398, + "time_per_iteration": 2.915275812149048 + }, + { + "auxiliary_loss_clip": 0.01427262, + "auxiliary_loss_mlp": 0.01043756, + "balance_loss_clip": 1.2576896, + "balance_loss_mlp": 1.02259636, + "epoch": 0.6252217044942131, + "flos": 26735788632960.0, + "grad_norm": 2.133714436018665, + "language_loss": 0.75310338, + "learning_rate": 1.300997001489483e-06, + "loss": 0.77781355, + "num_input_tokens_seen": 224009960, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.21142578, + "step": 10399, + "time_per_iteration": 2.9015908241271973 + }, + { + "auxiliary_loss_clip": 0.01423226, + "auxiliary_loss_mlp": 0.01038595, + "balance_loss_clip": 1.25709534, + "balance_loss_mlp": 1.0172447, + "epoch": 0.6252818277468811, + "flos": 20015222480640.0, + "grad_norm": 1.4795254414251267, + "language_loss": 0.74802005, + "learning_rate": 1.3006321141510147e-06, + "loss": 0.77263826, + "num_input_tokens_seen": 224028870, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.21350098, + "step": 10400, + "time_per_iteration": 2.859086513519287 + }, + { + "auxiliary_loss_clip": 0.01187848, + "auxiliary_loss_mlp": 0.01020367, + "balance_loss_clip": 1.10052657, + "balance_loss_mlp": 0.99690628, + "epoch": 0.625341950999549, + "flos": 59309731203840.0, + "grad_norm": 0.8865687610414611, + "language_loss": 0.56557477, + "learning_rate": 1.3002672533326465e-06, + "loss": 0.58765692, + "num_input_tokens_seen": 224094140, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.234375, + "step": 10401, + "time_per_iteration": 3.421266794204712 + }, + { + "auxiliary_loss_clip": 0.01420706, + "auxiliary_loss_mlp": 0.01035356, + "balance_loss_clip": 1.2536478, + "balance_loss_mlp": 1.01417232, + "epoch": 0.625402074252217, + "flos": 20166720629760.0, + "grad_norm": 2.916652967263405, + "language_loss": 0.83692622, + "learning_rate": 1.2999024190482146e-06, + "loss": 0.86148679, + "num_input_tokens_seen": 224113235, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.21203613, + "step": 10402, + "time_per_iteration": 2.8425896167755127 + }, + { + "auxiliary_loss_clip": 0.01415409, + "auxiliary_loss_mlp": 0.01033456, + "balance_loss_clip": 1.24945307, + "balance_loss_mlp": 1.01395321, + "epoch": 0.625462197504885, + "flos": 29144475555840.0, + "grad_norm": 1.7705567282979633, + "language_loss": 0.69635504, + "learning_rate": 1.2995376113115527e-06, + "loss": 0.72084367, + "num_input_tokens_seen": 224134530, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.19494629, + "step": 10403, + "time_per_iteration": 2.9768335819244385 + }, + { + "auxiliary_loss_clip": 0.01419001, + "auxiliary_loss_mlp": 0.01032736, + "balance_loss_clip": 1.2516675, + "balance_loss_mlp": 1.01183844, + "epoch": 0.625522320757553, + "flos": 26115652396800.0, + "grad_norm": 1.9344112868785492, + "language_loss": 0.72565758, + "learning_rate": 1.2991728301364954e-06, + "loss": 0.75017494, + "num_input_tokens_seen": 224154170, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.20898438, + "step": 10404, + "time_per_iteration": 2.889516830444336 + }, + { + "auxiliary_loss_clip": 0.01425593, + "auxiliary_loss_mlp": 0.01038733, + "balance_loss_clip": 1.25898933, + "balance_loss_mlp": 1.01797819, + "epoch": 0.625582444010221, + "flos": 20640607113600.0, + "grad_norm": 2.3843143671013154, + "language_loss": 0.70290601, + "learning_rate": 1.2988080755368742e-06, + "loss": 0.7275492, + "num_input_tokens_seen": 224172730, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.20751953, + "step": 10405, + "time_per_iteration": 4.283205509185791 + }, + { + "auxiliary_loss_clip": 0.0142177, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.25762439, + "balance_loss_mlp": 1.01519346, + "epoch": 0.6256425672628889, + "flos": 20531123400960.0, + "grad_norm": 1.602562230395877, + "language_loss": 0.79821181, + "learning_rate": 1.2984433475265207e-06, + "loss": 0.82278323, + "num_input_tokens_seen": 224192620, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.20178223, + "step": 10406, + "time_per_iteration": 2.8438658714294434 + }, + { + "auxiliary_loss_clip": 0.01412757, + "auxiliary_loss_mlp": 0.01032799, + "balance_loss_clip": 1.24661744, + "balance_loss_mlp": 1.01261687, + "epoch": 0.6257026905155569, + "flos": 29540034011520.0, + "grad_norm": 1.869965856920582, + "language_loss": 0.69202602, + "learning_rate": 1.2980786461192666e-06, + "loss": 0.71648169, + "num_input_tokens_seen": 224214660, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.20178223, + "step": 10407, + "time_per_iteration": 4.311597585678101 + }, + { + "auxiliary_loss_clip": 0.01400602, + "auxiliary_loss_mlp": 0.01033269, + "balance_loss_clip": 1.24031734, + "balance_loss_mlp": 1.01334906, + "epoch": 0.6257628137682248, + "flos": 24035597591040.0, + "grad_norm": 1.7009670345094714, + "language_loss": 0.85828596, + "learning_rate": 1.2977139713289398e-06, + "loss": 0.88262469, + "num_input_tokens_seen": 224234170, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19909668, + "step": 10408, + "time_per_iteration": 2.877589225769043 + }, + { + "auxiliary_loss_clip": 0.01416874, + "auxiliary_loss_mlp": 0.01035013, + "balance_loss_clip": 1.25185609, + "balance_loss_mlp": 1.01525974, + "epoch": 0.6258229370208929, + "flos": 20860931882880.0, + "grad_norm": 2.00322735300357, + "language_loss": 0.80362034, + "learning_rate": 1.2973493231693699e-06, + "loss": 0.82813919, + "num_input_tokens_seen": 224253115, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19750977, + "step": 10409, + "time_per_iteration": 2.839599609375 + }, + { + "auxiliary_loss_clip": 0.01418145, + "auxiliary_loss_mlp": 0.01034705, + "balance_loss_clip": 1.25373626, + "balance_loss_mlp": 1.01414108, + "epoch": 0.6258830602735608, + "flos": 22240350673920.0, + "grad_norm": 2.1497976340671165, + "language_loss": 0.70431453, + "learning_rate": 1.2969847016543845e-06, + "loss": 0.72884303, + "num_input_tokens_seen": 224271375, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20556641, + "step": 10410, + "time_per_iteration": 2.847909927368164 + }, + { + "auxiliary_loss_clip": 0.01418616, + "auxiliary_loss_mlp": 0.01031002, + "balance_loss_clip": 1.25728035, + "balance_loss_mlp": 1.01167846, + "epoch": 0.6259431835262288, + "flos": 25086293775360.0, + "grad_norm": 1.754232610953202, + "language_loss": 0.68876195, + "learning_rate": 1.2966201067978086e-06, + "loss": 0.71325815, + "num_input_tokens_seen": 224290315, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.1932373, + "step": 10411, + "time_per_iteration": 2.876253128051758 + }, + { + "auxiliary_loss_clip": 0.0141729, + "auxiliary_loss_mlp": 0.01037091, + "balance_loss_clip": 1.24983692, + "balance_loss_mlp": 1.01607406, + "epoch": 0.6260033067788967, + "flos": 28262814520320.0, + "grad_norm": 1.7538588642341113, + "language_loss": 0.70150065, + "learning_rate": 1.2962555386134702e-06, + "loss": 0.72604442, + "num_input_tokens_seen": 224310545, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.21020508, + "step": 10412, + "time_per_iteration": 2.915454149246216 + }, + { + "auxiliary_loss_clip": 0.01421412, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.25770521, + "balance_loss_mlp": 1.01729894, + "epoch": 0.6260634300315647, + "flos": 23377790419200.0, + "grad_norm": 1.5220157677659918, + "language_loss": 0.70202541, + "learning_rate": 1.2958909971151908e-06, + "loss": 0.72660363, + "num_input_tokens_seen": 224331115, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19116211, + "step": 10413, + "time_per_iteration": 2.868215322494507 + }, + { + "auxiliary_loss_clip": 0.01443894, + "auxiliary_loss_mlp": 0.01035435, + "balance_loss_clip": 1.27008557, + "balance_loss_mlp": 1.01475239, + "epoch": 0.6261235532842326, + "flos": 18042932085120.0, + "grad_norm": 2.5989944491132677, + "language_loss": 0.81925511, + "learning_rate": 1.295526482316796e-06, + "loss": 0.84404838, + "num_input_tokens_seen": 224347525, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.20666504, + "step": 10414, + "time_per_iteration": 2.840371608734131 + }, + { + "auxiliary_loss_clip": 0.01430423, + "auxiliary_loss_mlp": 0.01033899, + "balance_loss_clip": 1.26408887, + "balance_loss_mlp": 1.0137887, + "epoch": 0.6261836765369007, + "flos": 22019663946240.0, + "grad_norm": 1.6519593561306756, + "language_loss": 0.75717497, + "learning_rate": 1.2951619942321083e-06, + "loss": 0.78181815, + "num_input_tokens_seen": 224367045, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.20092773, + "step": 10415, + "time_per_iteration": 2.947255849838257 + }, + { + "auxiliary_loss_clip": 0.0141989, + "auxiliary_loss_mlp": 0.01033768, + "balance_loss_clip": 1.25593758, + "balance_loss_mlp": 1.01406324, + "epoch": 0.6262437997895686, + "flos": 24946695008640.0, + "grad_norm": 1.6370382027458616, + "language_loss": 0.7491765, + "learning_rate": 1.2947975328749472e-06, + "loss": 0.77371311, + "num_input_tokens_seen": 224388860, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19714355, + "step": 10416, + "time_per_iteration": 2.9239585399627686 + }, + { + "auxiliary_loss_clip": 0.01402104, + "auxiliary_loss_mlp": 0.01032414, + "balance_loss_clip": 1.24166703, + "balance_loss_mlp": 1.01280379, + "epoch": 0.6263039230422366, + "flos": 31619681614080.0, + "grad_norm": 1.693384712019319, + "language_loss": 0.85119843, + "learning_rate": 1.2944330982591352e-06, + "loss": 0.87554365, + "num_input_tokens_seen": 224409645, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19592285, + "step": 10417, + "time_per_iteration": 2.9268813133239746 + }, + { + "auxiliary_loss_clip": 0.01427444, + "auxiliary_loss_mlp": 0.01035034, + "balance_loss_clip": 1.26084602, + "balance_loss_mlp": 1.01563835, + "epoch": 0.6263640462949046, + "flos": 17648414259840.0, + "grad_norm": 2.2548652809489833, + "language_loss": 0.58255064, + "learning_rate": 1.2940686903984904e-06, + "loss": 0.60717547, + "num_input_tokens_seen": 224428530, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.19384766, + "step": 10418, + "time_per_iteration": 2.862926483154297 + }, + { + "auxiliary_loss_clip": 0.01436398, + "auxiliary_loss_mlp": 0.01036532, + "balance_loss_clip": 1.26519823, + "balance_loss_mlp": 1.01601624, + "epoch": 0.6264241695475725, + "flos": 19984609733760.0, + "grad_norm": 1.8259155934652063, + "language_loss": 0.84837234, + "learning_rate": 1.2937043093068316e-06, + "loss": 0.87310159, + "num_input_tokens_seen": 224447175, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.20507812, + "step": 10419, + "time_per_iteration": 2.9714460372924805 + }, + { + "auxiliary_loss_clip": 0.01434078, + "auxiliary_loss_mlp": 0.01036935, + "balance_loss_clip": 1.26758599, + "balance_loss_mlp": 1.017313, + "epoch": 0.6264842928002405, + "flos": 27355698645120.0, + "grad_norm": 1.3801625844806593, + "language_loss": 0.65018839, + "learning_rate": 1.2933399549979762e-06, + "loss": 0.67489851, + "num_input_tokens_seen": 224469445, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.19616699, + "step": 10420, + "time_per_iteration": 2.916839361190796 + }, + { + "auxiliary_loss_clip": 0.01432881, + "auxiliary_loss_mlp": 0.01037325, + "balance_loss_clip": 1.26426685, + "balance_loss_mlp": 1.01642776, + "epoch": 0.6265444160529084, + "flos": 23006148480000.0, + "grad_norm": 2.0234949453804263, + "language_loss": 0.87424982, + "learning_rate": 1.292975627485741e-06, + "loss": 0.89895177, + "num_input_tokens_seen": 224486590, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.20910645, + "step": 10421, + "time_per_iteration": 2.863182544708252 + }, + { + "auxiliary_loss_clip": 0.01413863, + "auxiliary_loss_mlp": 0.01035194, + "balance_loss_clip": 1.24967504, + "balance_loss_mlp": 1.01557231, + "epoch": 0.6266045393055765, + "flos": 19947934183680.0, + "grad_norm": 2.2871123804957443, + "language_loss": 0.80505288, + "learning_rate": 1.2926113267839403e-06, + "loss": 0.82954347, + "num_input_tokens_seen": 224502795, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19628906, + "step": 10422, + "time_per_iteration": 2.846493721008301 + }, + { + "auxiliary_loss_clip": 0.01415397, + "auxiliary_loss_mlp": 0.01032826, + "balance_loss_clip": 1.25026536, + "balance_loss_mlp": 1.01233363, + "epoch": 0.6266646625582444, + "flos": 24399728893440.0, + "grad_norm": 4.744316780846274, + "language_loss": 0.7553885, + "learning_rate": 1.292247052906389e-06, + "loss": 0.77987075, + "num_input_tokens_seen": 224522300, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.20495605, + "step": 10423, + "time_per_iteration": 2.8802969455718994 + }, + { + "auxiliary_loss_clip": 0.01419368, + "auxiliary_loss_mlp": 0.01031429, + "balance_loss_clip": 1.25394917, + "balance_loss_mlp": 1.01159263, + "epoch": 0.6267247858109124, + "flos": 14691901570560.0, + "grad_norm": 1.7781896315195567, + "language_loss": 0.78785783, + "learning_rate": 1.2918828058669004e-06, + "loss": 0.81236577, + "num_input_tokens_seen": 224538260, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19836426, + "step": 10424, + "time_per_iteration": 2.808213472366333 + }, + { + "auxiliary_loss_clip": 0.01413471, + "auxiliary_loss_mlp": 0.01034047, + "balance_loss_clip": 1.25086117, + "balance_loss_mlp": 1.01349521, + "epoch": 0.6267849090635803, + "flos": 24939093882240.0, + "grad_norm": 1.7722926122712463, + "language_loss": 0.69958448, + "learning_rate": 1.2915185856792868e-06, + "loss": 0.7240597, + "num_input_tokens_seen": 224559155, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.20556641, + "step": 10425, + "time_per_iteration": 2.8585946559906006 + }, + { + "auxiliary_loss_clip": 0.01405229, + "auxiliary_loss_mlp": 0.01029292, + "balance_loss_clip": 1.24590623, + "balance_loss_mlp": 1.00882375, + "epoch": 0.6268450323162483, + "flos": 25348768715520.0, + "grad_norm": 1.545984967630754, + "language_loss": 0.75394249, + "learning_rate": 1.2911543923573598e-06, + "loss": 0.77828777, + "num_input_tokens_seen": 224578660, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.20458984, + "step": 10426, + "time_per_iteration": 2.8993122577667236 + }, + { + "auxiliary_loss_clip": 0.01426204, + "auxiliary_loss_mlp": 0.01033247, + "balance_loss_clip": 1.2591567, + "balance_loss_mlp": 1.01341021, + "epoch": 0.6269051555689162, + "flos": 26188505804160.0, + "grad_norm": 1.8632742292843785, + "language_loss": 0.81088692, + "learning_rate": 1.290790225914929e-06, + "loss": 0.83548141, + "num_input_tokens_seen": 224599080, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.19836426, + "step": 10427, + "time_per_iteration": 2.9259533882141113 + }, + { + "auxiliary_loss_clip": 0.01435354, + "auxiliary_loss_mlp": 0.01037324, + "balance_loss_clip": 1.26787543, + "balance_loss_mlp": 1.01727331, + "epoch": 0.6269652788215843, + "flos": 18265247625600.0, + "grad_norm": 1.8085536287142228, + "language_loss": 0.69349372, + "learning_rate": 1.2904260863658034e-06, + "loss": 0.71822047, + "num_input_tokens_seen": 224614225, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.20056152, + "step": 10428, + "time_per_iteration": 2.8195786476135254 + }, + { + "auxiliary_loss_clip": 0.01418388, + "auxiliary_loss_mlp": 0.01037402, + "balance_loss_clip": 1.2526406, + "balance_loss_mlp": 1.01704144, + "epoch": 0.6270254020742522, + "flos": 11772878837760.0, + "grad_norm": 2.4372848034378247, + "language_loss": 0.7227093, + "learning_rate": 1.2900619737237928e-06, + "loss": 0.74726719, + "num_input_tokens_seen": 224632365, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.20349121, + "step": 10429, + "time_per_iteration": 4.300502061843872 + }, + { + "auxiliary_loss_clip": 0.01436386, + "auxiliary_loss_mlp": 0.01035177, + "balance_loss_clip": 1.26776695, + "balance_loss_mlp": 1.01473248, + "epoch": 0.6270855253269202, + "flos": 23485690563840.0, + "grad_norm": 1.6319749436746076, + "language_loss": 0.80678141, + "learning_rate": 1.2896978880027023e-06, + "loss": 0.83149701, + "num_input_tokens_seen": 224651125, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.20458984, + "step": 10430, + "time_per_iteration": 2.846107244491577 + }, + { + "auxiliary_loss_clip": 0.01183178, + "auxiliary_loss_mlp": 0.01020692, + "balance_loss_clip": 1.09476423, + "balance_loss_mlp": 0.99570608, + "epoch": 0.6271456485795882, + "flos": 70095247873920.0, + "grad_norm": 0.7575120073679272, + "language_loss": 0.59195322, + "learning_rate": 1.2893338292163393e-06, + "loss": 0.61399192, + "num_input_tokens_seen": 224716115, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.25, + "step": 10431, + "time_per_iteration": 3.453068733215332 + }, + { + "auxiliary_loss_clip": 0.01189376, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.09926414, + "balance_loss_mlp": 1.01149869, + "epoch": 0.6272057718322561, + "flos": 65190786491520.0, + "grad_norm": 0.8679022115937886, + "language_loss": 0.63928515, + "learning_rate": 1.2889697973785095e-06, + "loss": 0.66150558, + "num_input_tokens_seen": 224782930, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.21191406, + "step": 10432, + "time_per_iteration": 3.3376471996307373 + }, + { + "auxiliary_loss_clip": 0.01416912, + "auxiliary_loss_mlp": 0.0103588, + "balance_loss_clip": 1.25268126, + "balance_loss_mlp": 1.01621079, + "epoch": 0.6272658950849241, + "flos": 24400045607040.0, + "grad_norm": 2.9193723217887055, + "language_loss": 0.65643358, + "learning_rate": 1.2886057925030153e-06, + "loss": 0.68096149, + "num_input_tokens_seen": 224802010, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.1965332, + "step": 10433, + "time_per_iteration": 2.8565850257873535 + }, + { + "auxiliary_loss_clip": 0.01426992, + "auxiliary_loss_mlp": 0.01033753, + "balance_loss_clip": 1.25802994, + "balance_loss_mlp": 1.01302278, + "epoch": 0.627326018337592, + "flos": 17974557912960.0, + "grad_norm": 3.1844907135922353, + "language_loss": 0.62890136, + "learning_rate": 1.2882418146036612e-06, + "loss": 0.65350878, + "num_input_tokens_seen": 224818875, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.20751953, + "step": 10434, + "time_per_iteration": 2.7930490970611572 + }, + { + "auxiliary_loss_clip": 0.01424969, + "auxiliary_loss_mlp": 0.01034559, + "balance_loss_clip": 1.25740826, + "balance_loss_mlp": 1.0136615, + "epoch": 0.6273861415902601, + "flos": 20239755016320.0, + "grad_norm": 1.837173412465903, + "language_loss": 0.85056549, + "learning_rate": 1.2878778636942484e-06, + "loss": 0.87516081, + "num_input_tokens_seen": 224837790, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.2088623, + "step": 10435, + "time_per_iteration": 2.8522324562072754 + }, + { + "auxiliary_loss_clip": 0.01188772, + "auxiliary_loss_mlp": 0.01033546, + "balance_loss_clip": 1.09710443, + "balance_loss_mlp": 1.01170659, + "epoch": 0.627446264842928, + "flos": 64981754939520.0, + "grad_norm": 0.7424138362144119, + "language_loss": 0.61569983, + "learning_rate": 1.2875139397885786e-06, + "loss": 0.637923, + "num_input_tokens_seen": 224899685, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.21875, + "step": 10436, + "time_per_iteration": 3.3448572158813477 + }, + { + "auxiliary_loss_clip": 0.01430742, + "auxiliary_loss_mlp": 0.01035889, + "balance_loss_clip": 1.26383519, + "balance_loss_mlp": 1.01574302, + "epoch": 0.627506388095596, + "flos": 23594133646080.0, + "grad_norm": 1.4976023785931207, + "language_loss": 0.77999711, + "learning_rate": 1.2871500429004523e-06, + "loss": 0.80466342, + "num_input_tokens_seen": 224918650, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.20141602, + "step": 10437, + "time_per_iteration": 2.956705093383789 + }, + { + "auxiliary_loss_clip": 0.01189547, + "auxiliary_loss_mlp": 0.01031363, + "balance_loss_clip": 1.09809041, + "balance_loss_mlp": 1.00866508, + "epoch": 0.6275665113482639, + "flos": 67613752788480.0, + "grad_norm": 0.7188831473243978, + "language_loss": 0.54354471, + "learning_rate": 1.2867861730436667e-06, + "loss": 0.56575382, + "num_input_tokens_seen": 224981575, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.2265625, + "step": 10438, + "time_per_iteration": 3.2608180046081543 + }, + { + "auxiliary_loss_clip": 0.0141337, + "auxiliary_loss_mlp": 0.0103831, + "balance_loss_clip": 1.24831903, + "balance_loss_mlp": 1.01848614, + "epoch": 0.6276266346009319, + "flos": 27648560108160.0, + "grad_norm": 2.0474457819878644, + "language_loss": 0.84867156, + "learning_rate": 1.2864223302320214e-06, + "loss": 0.87318838, + "num_input_tokens_seen": 225000820, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19824219, + "step": 10439, + "time_per_iteration": 2.9408130645751953 + }, + { + "auxiliary_loss_clip": 0.01428036, + "auxiliary_loss_mlp": 0.01041865, + "balance_loss_clip": 1.25967407, + "balance_loss_mlp": 1.02097929, + "epoch": 0.6276867578535998, + "flos": 22756070615040.0, + "grad_norm": 2.173721302674834, + "language_loss": 0.81012428, + "learning_rate": 1.2860585144793128e-06, + "loss": 0.83482331, + "num_input_tokens_seen": 225017585, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.20898438, + "step": 10440, + "time_per_iteration": 4.3223717212677 + }, + { + "auxiliary_loss_clip": 0.01406597, + "auxiliary_loss_mlp": 0.01033592, + "balance_loss_clip": 1.24620163, + "balance_loss_mlp": 1.01350534, + "epoch": 0.6277468811062679, + "flos": 24654828931200.0, + "grad_norm": 1.646674132256955, + "language_loss": 0.75177884, + "learning_rate": 1.285694725799337e-06, + "loss": 0.77618074, + "num_input_tokens_seen": 225039085, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.20080566, + "step": 10441, + "time_per_iteration": 2.922445774078369 + }, + { + "auxiliary_loss_clip": 0.01404357, + "auxiliary_loss_mlp": 0.01036359, + "balance_loss_clip": 1.241418, + "balance_loss_mlp": 1.01572347, + "epoch": 0.6278070043589358, + "flos": 19687495259520.0, + "grad_norm": 1.8311944498635246, + "language_loss": 0.72535467, + "learning_rate": 1.2853309642058884e-06, + "loss": 0.74976182, + "num_input_tokens_seen": 225058105, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.20654297, + "step": 10442, + "time_per_iteration": 5.707226276397705 + }, + { + "auxiliary_loss_clip": 0.0141925, + "auxiliary_loss_mlp": 0.01038345, + "balance_loss_clip": 1.25221586, + "balance_loss_mlp": 1.01774573, + "epoch": 0.6278671276116038, + "flos": 22130550247680.0, + "grad_norm": 1.484251619129408, + "language_loss": 0.72369796, + "learning_rate": 1.284967229712762e-06, + "loss": 0.74827391, + "num_input_tokens_seen": 225077605, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.20605469, + "step": 10443, + "time_per_iteration": 2.887761116027832 + }, + { + "auxiliary_loss_clip": 0.01421843, + "auxiliary_loss_mlp": 0.01042881, + "balance_loss_clip": 1.25644374, + "balance_loss_mlp": 1.02188814, + "epoch": 0.6279272508642717, + "flos": 23048705854080.0, + "grad_norm": 2.2840567756162944, + "language_loss": 0.74100339, + "learning_rate": 1.2846035223337492e-06, + "loss": 0.76565069, + "num_input_tokens_seen": 225097775, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.20983887, + "step": 10444, + "time_per_iteration": 2.870457172393799 + }, + { + "auxiliary_loss_clip": 0.01414085, + "auxiliary_loss_mlp": 0.01039571, + "balance_loss_clip": 1.2497108, + "balance_loss_mlp": 1.01949644, + "epoch": 0.6279873741169397, + "flos": 19832432912640.0, + "grad_norm": 1.8029380679246159, + "language_loss": 0.72778225, + "learning_rate": 1.2842398420826423e-06, + "loss": 0.7523188, + "num_input_tokens_seen": 225115585, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20056152, + "step": 10445, + "time_per_iteration": 2.8629682064056396 + }, + { + "auxiliary_loss_clip": 0.01406891, + "auxiliary_loss_mlp": 0.01038634, + "balance_loss_clip": 1.24245322, + "balance_loss_mlp": 1.01845217, + "epoch": 0.6280474973696077, + "flos": 23926068633600.0, + "grad_norm": 1.5005687791185502, + "language_loss": 0.70088196, + "learning_rate": 1.2838761889732331e-06, + "loss": 0.72533727, + "num_input_tokens_seen": 225135575, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20166016, + "step": 10446, + "time_per_iteration": 2.8603789806365967 + }, + { + "auxiliary_loss_clip": 0.01442754, + "auxiliary_loss_mlp": 0.01043369, + "balance_loss_clip": 1.27118587, + "balance_loss_mlp": 1.0221498, + "epoch": 0.6281076206222757, + "flos": 17977046376960.0, + "grad_norm": 1.8662739545991374, + "language_loss": 0.74600577, + "learning_rate": 1.2835125630193102e-06, + "loss": 0.77086699, + "num_input_tokens_seen": 225154230, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.2121582, + "step": 10447, + "time_per_iteration": 2.833699941635132 + }, + { + "auxiliary_loss_clip": 0.01185691, + "auxiliary_loss_mlp": 0.0103018, + "balance_loss_clip": 1.09399891, + "balance_loss_mlp": 1.00729227, + "epoch": 0.6281677438749437, + "flos": 66807614603520.0, + "grad_norm": 0.6769109072416627, + "language_loss": 0.52418864, + "learning_rate": 1.2831489642346626e-06, + "loss": 0.54634738, + "num_input_tokens_seen": 225213650, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.22851562, + "step": 10448, + "time_per_iteration": 3.216970682144165 + }, + { + "auxiliary_loss_clip": 0.01439461, + "auxiliary_loss_mlp": 0.01042961, + "balance_loss_clip": 1.27147269, + "balance_loss_mlp": 1.02230227, + "epoch": 0.6282278671276116, + "flos": 11663666593920.0, + "grad_norm": 4.1955310728234485, + "language_loss": 0.92866206, + "learning_rate": 1.282785392633079e-06, + "loss": 0.95348632, + "num_input_tokens_seen": 225230135, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.20654297, + "step": 10449, + "time_per_iteration": 2.829805850982666 + }, + { + "auxiliary_loss_clip": 0.01424748, + "auxiliary_loss_mlp": 0.01037025, + "balance_loss_clip": 1.25923657, + "balance_loss_mlp": 1.01740336, + "epoch": 0.6282879903802796, + "flos": 42757040983680.0, + "grad_norm": 1.617536897322907, + "language_loss": 0.61034399, + "learning_rate": 1.2824218482283438e-06, + "loss": 0.63496172, + "num_input_tokens_seen": 225253520, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.19628906, + "step": 10450, + "time_per_iteration": 3.065383195877075 + }, + { + "auxiliary_loss_clip": 0.0139897, + "auxiliary_loss_mlp": 0.01035223, + "balance_loss_clip": 1.23766279, + "balance_loss_mlp": 1.01467144, + "epoch": 0.6283481136329475, + "flos": 20018299127040.0, + "grad_norm": 1.8631354663017858, + "language_loss": 0.77363312, + "learning_rate": 1.2820583310342452e-06, + "loss": 0.79797512, + "num_input_tokens_seen": 225272460, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.20556641, + "step": 10451, + "time_per_iteration": 2.856536626815796 + }, + { + "auxiliary_loss_clip": 0.01428915, + "auxiliary_loss_mlp": 0.01033293, + "balance_loss_clip": 1.26074088, + "balance_loss_mlp": 1.01232421, + "epoch": 0.6284082368856155, + "flos": 21913483104000.0, + "grad_norm": 1.5498908654818992, + "language_loss": 0.77989811, + "learning_rate": 1.281694841064566e-06, + "loss": 0.80452019, + "num_input_tokens_seen": 225291700, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.2097168, + "step": 10452, + "time_per_iteration": 2.8545711040496826 + }, + { + "auxiliary_loss_clip": 0.01413551, + "auxiliary_loss_mlp": 0.01033249, + "balance_loss_clip": 1.24947035, + "balance_loss_mlp": 1.0124228, + "epoch": 0.6284683601382834, + "flos": 25495244691840.0, + "grad_norm": 1.62771451780529, + "language_loss": 0.7370562, + "learning_rate": 1.2813313783330904e-06, + "loss": 0.7615242, + "num_input_tokens_seen": 225311470, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.20812988, + "step": 10453, + "time_per_iteration": 2.9143664836883545 + }, + { + "auxiliary_loss_clip": 0.01412908, + "auxiliary_loss_mlp": 0.0103329, + "balance_loss_clip": 1.24592125, + "balance_loss_mlp": 1.0124042, + "epoch": 0.6285284833909515, + "flos": 16545749783040.0, + "grad_norm": 1.8184142354329116, + "language_loss": 0.81595832, + "learning_rate": 1.2809679428536013e-06, + "loss": 0.84042031, + "num_input_tokens_seen": 225328385, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.20898438, + "step": 10454, + "time_per_iteration": 2.826732635498047 + }, + { + "auxiliary_loss_clip": 0.01409986, + "auxiliary_loss_mlp": 0.0103294, + "balance_loss_clip": 1.24711871, + "balance_loss_mlp": 1.01322234, + "epoch": 0.6285886066436194, + "flos": 22831050528000.0, + "grad_norm": 7.4908219299114025, + "language_loss": 0.82963854, + "learning_rate": 1.2806045346398792e-06, + "loss": 0.8540678, + "num_input_tokens_seen": 225348415, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19714355, + "step": 10455, + "time_per_iteration": 2.878007411956787 + }, + { + "auxiliary_loss_clip": 0.01409465, + "auxiliary_loss_mlp": 0.01032148, + "balance_loss_clip": 1.24515915, + "balance_loss_mlp": 1.0116086, + "epoch": 0.6286487298962874, + "flos": 24726008280960.0, + "grad_norm": 1.537737450154947, + "language_loss": 0.82819438, + "learning_rate": 1.280241153705706e-06, + "loss": 0.85261053, + "num_input_tokens_seen": 225367740, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.20544434, + "step": 10456, + "time_per_iteration": 2.923813581466675 + }, + { + "auxiliary_loss_clip": 0.0143816, + "auxiliary_loss_mlp": 0.0104134, + "balance_loss_clip": 1.2690537, + "balance_loss_mlp": 1.02114642, + "epoch": 0.6287088531489553, + "flos": 20750588519040.0, + "grad_norm": 1.4747876105320308, + "language_loss": 0.73381603, + "learning_rate": 1.27987780006486e-06, + "loss": 0.75861102, + "num_input_tokens_seen": 225388405, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.2019043, + "step": 10457, + "time_per_iteration": 2.8872382640838623 + }, + { + "auxiliary_loss_clip": 0.01444913, + "auxiliary_loss_mlp": 0.01040071, + "balance_loss_clip": 1.27180469, + "balance_loss_mlp": 1.01941252, + "epoch": 0.6287689764016233, + "flos": 23079635314560.0, + "grad_norm": 1.7549328558020942, + "language_loss": 0.80856633, + "learning_rate": 1.2795144737311202e-06, + "loss": 0.83341622, + "num_input_tokens_seen": 225408360, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.20654297, + "step": 10458, + "time_per_iteration": 2.8678884506225586 + }, + { + "auxiliary_loss_clip": 0.01435502, + "auxiliary_loss_mlp": 0.01039374, + "balance_loss_clip": 1.26599145, + "balance_loss_mlp": 1.01921535, + "epoch": 0.6288290996542913, + "flos": 32246378346240.0, + "grad_norm": 2.2930436893474333, + "language_loss": 0.61960661, + "learning_rate": 1.2791511747182635e-06, + "loss": 0.6443553, + "num_input_tokens_seen": 225431310, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.20166016, + "step": 10459, + "time_per_iteration": 2.9307773113250732 + }, + { + "auxiliary_loss_clip": 0.01434809, + "auxiliary_loss_mlp": 0.01036343, + "balance_loss_clip": 1.2673583, + "balance_loss_mlp": 1.01729321, + "epoch": 0.6288892229069593, + "flos": 24651209347200.0, + "grad_norm": 1.5725083044858146, + "language_loss": 0.79770714, + "learning_rate": 1.2787879030400666e-06, + "loss": 0.82241863, + "num_input_tokens_seen": 225450385, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.19055176, + "step": 10460, + "time_per_iteration": 2.8985321521759033 + }, + { + "auxiliary_loss_clip": 0.01414485, + "auxiliary_loss_mlp": 0.01030251, + "balance_loss_clip": 1.25143659, + "balance_loss_mlp": 1.01021194, + "epoch": 0.6289493461596273, + "flos": 17867336440320.0, + "grad_norm": 1.9146011461421923, + "language_loss": 0.74779314, + "learning_rate": 1.2784246587103047e-06, + "loss": 0.77224052, + "num_input_tokens_seen": 225467325, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.20043945, + "step": 10461, + "time_per_iteration": 2.917844533920288 + }, + { + "auxiliary_loss_clip": 0.01411279, + "auxiliary_loss_mlp": 0.01035106, + "balance_loss_clip": 1.24798441, + "balance_loss_mlp": 1.01534128, + "epoch": 0.6290094694122952, + "flos": 22355354252160.0, + "grad_norm": 1.5490756868932751, + "language_loss": 0.70948958, + "learning_rate": 1.2780614417427523e-06, + "loss": 0.73395342, + "num_input_tokens_seen": 225487370, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19763184, + "step": 10462, + "time_per_iteration": 2.8750245571136475 + }, + { + "auxiliary_loss_clip": 0.01395842, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.237131, + "balance_loss_mlp": 1.01438951, + "epoch": 0.6290695926649632, + "flos": 28414086445440.0, + "grad_norm": 2.0848331536643374, + "language_loss": 0.72727871, + "learning_rate": 1.2776982521511821e-06, + "loss": 0.75156486, + "num_input_tokens_seen": 225506915, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18383789, + "step": 10463, + "time_per_iteration": 2.876756191253662 + }, + { + "auxiliary_loss_clip": 0.01414521, + "auxiliary_loss_mlp": 0.01034614, + "balance_loss_clip": 1.25402486, + "balance_loss_mlp": 1.01558769, + "epoch": 0.6291297159176311, + "flos": 21515028981120.0, + "grad_norm": 1.722253207422101, + "language_loss": 0.72888863, + "learning_rate": 1.2773350899493665e-06, + "loss": 0.75338006, + "num_input_tokens_seen": 225525670, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19030762, + "step": 10464, + "time_per_iteration": 4.22221565246582 + }, + { + "auxiliary_loss_clip": 0.01423719, + "auxiliary_loss_mlp": 0.01031588, + "balance_loss_clip": 1.2598803, + "balance_loss_mlp": 1.01202595, + "epoch": 0.6291898391702991, + "flos": 12210904177920.0, + "grad_norm": 1.7296014890120686, + "language_loss": 0.69878364, + "learning_rate": 1.2769719551510768e-06, + "loss": 0.7233367, + "num_input_tokens_seen": 225542235, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19580078, + "step": 10465, + "time_per_iteration": 2.809210777282715 + }, + { + "auxiliary_loss_clip": 0.01188546, + "auxiliary_loss_mlp": 0.01025108, + "balance_loss_clip": 1.09764934, + "balance_loss_mlp": 1.00737, + "epoch": 0.629249962422967, + "flos": 69330309719040.0, + "grad_norm": 0.6823481288745018, + "language_loss": 0.59858978, + "learning_rate": 1.2766088477700832e-06, + "loss": 0.62072635, + "num_input_tokens_seen": 225607185, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.17773438, + "step": 10466, + "time_per_iteration": 3.495004892349243 + }, + { + "auxiliary_loss_clip": 0.0141456, + "auxiliary_loss_mlp": 0.01028966, + "balance_loss_clip": 1.25077701, + "balance_loss_mlp": 1.01059604, + "epoch": 0.6293100856756351, + "flos": 40092575351040.0, + "grad_norm": 2.0062083739554293, + "language_loss": 0.66055286, + "learning_rate": 1.276245767820154e-06, + "loss": 0.68498808, + "num_input_tokens_seen": 225628785, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.18347168, + "step": 10467, + "time_per_iteration": 3.0293545722961426 + }, + { + "auxiliary_loss_clip": 0.01183649, + "auxiliary_loss_mlp": 0.01018127, + "balance_loss_clip": 1.09412014, + "balance_loss_mlp": 1.00057936, + "epoch": 0.629370208928303, + "flos": 67528728529920.0, + "grad_norm": 0.7949460276035607, + "language_loss": 0.56927502, + "learning_rate": 1.2758827153150586e-06, + "loss": 0.59129274, + "num_input_tokens_seen": 225678980, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.17578125, + "step": 10468, + "time_per_iteration": 3.076260566711426 + }, + { + "auxiliary_loss_clip": 0.0118411, + "auxiliary_loss_mlp": 0.01022218, + "balance_loss_clip": 1.09503436, + "balance_loss_mlp": 1.00152326, + "epoch": 0.629430332180971, + "flos": 60691140766080.0, + "grad_norm": 0.749324227185622, + "language_loss": 0.58106005, + "learning_rate": 1.2755196902685626e-06, + "loss": 0.60312343, + "num_input_tokens_seen": 225740295, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.20703125, + "step": 10469, + "time_per_iteration": 3.1962838172912598 + }, + { + "auxiliary_loss_clip": 0.01186606, + "auxiliary_loss_mlp": 0.01025987, + "balance_loss_clip": 1.0953064, + "balance_loss_mlp": 1.00481534, + "epoch": 0.6294904554336389, + "flos": 66903814327680.0, + "grad_norm": 0.6784350698288011, + "language_loss": 0.52156997, + "learning_rate": 1.2751566926944329e-06, + "loss": 0.54369587, + "num_input_tokens_seen": 225805615, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.21191406, + "step": 10470, + "time_per_iteration": 3.31520676612854 + }, + { + "auxiliary_loss_clip": 0.01401563, + "auxiliary_loss_mlp": 0.01041045, + "balance_loss_clip": 1.24087751, + "balance_loss_mlp": 1.01720285, + "epoch": 0.6295505786863069, + "flos": 42538209292800.0, + "grad_norm": 2.0532743423588227, + "language_loss": 0.75341415, + "learning_rate": 1.2747937226064342e-06, + "loss": 0.7778402, + "num_input_tokens_seen": 225826585, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.23864746, + "step": 10471, + "time_per_iteration": 3.0668468475341797 + }, + { + "auxiliary_loss_clip": 0.01425602, + "auxiliary_loss_mlp": 0.01030867, + "balance_loss_clip": 1.25845683, + "balance_loss_mlp": 1.01138783, + "epoch": 0.629610701938975, + "flos": 17393042753280.0, + "grad_norm": 3.8947935143821173, + "language_loss": 0.64441806, + "learning_rate": 1.2744307800183297e-06, + "loss": 0.66898274, + "num_input_tokens_seen": 225844095, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.19506836, + "step": 10472, + "time_per_iteration": 2.8521456718444824 + }, + { + "auxiliary_loss_clip": 0.01429945, + "auxiliary_loss_mlp": 0.01038975, + "balance_loss_clip": 1.26271629, + "balance_loss_mlp": 1.01862597, + "epoch": 0.6296708251916429, + "flos": 24253479141120.0, + "grad_norm": 1.7460331712075927, + "language_loss": 0.69901478, + "learning_rate": 1.2740678649438828e-06, + "loss": 0.72370398, + "num_input_tokens_seen": 225864310, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.20336914, + "step": 10473, + "time_per_iteration": 2.9144556522369385 + }, + { + "auxiliary_loss_clip": 0.01415461, + "auxiliary_loss_mlp": 0.01033825, + "balance_loss_clip": 1.25048494, + "balance_loss_mlp": 1.01467991, + "epoch": 0.6297309484443109, + "flos": 19287186099840.0, + "grad_norm": 1.56849971526424, + "language_loss": 0.75192773, + "learning_rate": 1.2737049773968554e-06, + "loss": 0.77642065, + "num_input_tokens_seen": 225883830, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19140625, + "step": 10474, + "time_per_iteration": 2.861717939376831 + }, + { + "auxiliary_loss_clip": 0.01418699, + "auxiliary_loss_mlp": 0.01031539, + "balance_loss_clip": 1.25377297, + "balance_loss_mlp": 1.01229882, + "epoch": 0.6297910716969788, + "flos": 30674442355200.0, + "grad_norm": 1.5506835346899481, + "language_loss": 0.67497927, + "learning_rate": 1.2733421173910081e-06, + "loss": 0.69948173, + "num_input_tokens_seen": 225905755, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.19238281, + "step": 10475, + "time_per_iteration": 4.3348987102508545 + }, + { + "auxiliary_loss_clip": 0.0140768, + "auxiliary_loss_mlp": 0.01032327, + "balance_loss_clip": 1.24690163, + "balance_loss_mlp": 1.0135988, + "epoch": 0.6298511949496468, + "flos": 14429652854400.0, + "grad_norm": 1.8734908205639436, + "language_loss": 0.90925145, + "learning_rate": 1.272979284940101e-06, + "loss": 0.93365151, + "num_input_tokens_seen": 225922155, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.18725586, + "step": 10476, + "time_per_iteration": 2.8133223056793213 + }, + { + "auxiliary_loss_clip": 0.01410116, + "auxiliary_loss_mlp": 0.01031947, + "balance_loss_clip": 1.24756241, + "balance_loss_mlp": 1.01389885, + "epoch": 0.6299113182023147, + "flos": 23524764088320.0, + "grad_norm": 1.6795969339461663, + "language_loss": 0.76636708, + "learning_rate": 1.2726164800578913e-06, + "loss": 0.79078764, + "num_input_tokens_seen": 225941060, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.18054199, + "step": 10477, + "time_per_iteration": 5.7560248374938965 + }, + { + "auxiliary_loss_clip": 0.01414244, + "auxiliary_loss_mlp": 0.01034453, + "balance_loss_clip": 1.24927104, + "balance_loss_mlp": 1.01357925, + "epoch": 0.6299714414549827, + "flos": 22684755530880.0, + "grad_norm": 1.7799748566108777, + "language_loss": 0.70957315, + "learning_rate": 1.272253702758138e-06, + "loss": 0.73406011, + "num_input_tokens_seen": 225960870, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.20874023, + "step": 10478, + "time_per_iteration": 2.8443644046783447 + }, + { + "auxiliary_loss_clip": 0.01439945, + "auxiliary_loss_mlp": 0.01030122, + "balance_loss_clip": 1.26820803, + "balance_loss_mlp": 1.01058364, + "epoch": 0.6300315647076506, + "flos": 14509881164160.0, + "grad_norm": 2.3122887164335943, + "language_loss": 0.68252206, + "learning_rate": 1.2718909530545974e-06, + "loss": 0.7072227, + "num_input_tokens_seen": 225977895, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.19519043, + "step": 10479, + "time_per_iteration": 2.8902485370635986 + }, + { + "auxiliary_loss_clip": 0.01425815, + "auxiliary_loss_mlp": 0.0103268, + "balance_loss_clip": 1.26214135, + "balance_loss_mlp": 1.01298642, + "epoch": 0.6300916879603187, + "flos": 21881693992320.0, + "grad_norm": 1.7532213100340635, + "language_loss": 0.7449671, + "learning_rate": 1.2715282309610245e-06, + "loss": 0.76955211, + "num_input_tokens_seen": 225997835, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19689941, + "step": 10480, + "time_per_iteration": 2.893348217010498 + }, + { + "auxiliary_loss_clip": 0.01427821, + "auxiliary_loss_mlp": 0.01036754, + "balance_loss_clip": 1.26003742, + "balance_loss_mlp": 1.01567805, + "epoch": 0.6301518112129866, + "flos": 21843751587840.0, + "grad_norm": 2.0302104967358914, + "language_loss": 0.79028219, + "learning_rate": 1.2711655364911744e-06, + "loss": 0.81492794, + "num_input_tokens_seen": 226017620, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.2109375, + "step": 10481, + "time_per_iteration": 2.8486900329589844 + }, + { + "auxiliary_loss_clip": 0.01184584, + "auxiliary_loss_mlp": 0.01020171, + "balance_loss_clip": 1.0948385, + "balance_loss_mlp": 1.00128829, + "epoch": 0.6302119344656546, + "flos": 44356259589120.0, + "grad_norm": 0.8805446139748399, + "language_loss": 0.61850524, + "learning_rate": 1.2708028696588e-06, + "loss": 0.64055276, + "num_input_tokens_seen": 226068755, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.18847656, + "step": 10482, + "time_per_iteration": 3.0975542068481445 + }, + { + "auxiliary_loss_clip": 0.0144939, + "auxiliary_loss_mlp": 0.01034289, + "balance_loss_clip": 1.27662659, + "balance_loss_mlp": 1.01367807, + "epoch": 0.6302720577183225, + "flos": 11225188805760.0, + "grad_norm": 1.8398530057066655, + "language_loss": 0.83628422, + "learning_rate": 1.2704402304776541e-06, + "loss": 0.86112106, + "num_input_tokens_seen": 226084395, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.20605469, + "step": 10483, + "time_per_iteration": 2.902813196182251 + }, + { + "auxiliary_loss_clip": 0.01409101, + "auxiliary_loss_mlp": 0.0103108, + "balance_loss_clip": 1.24932432, + "balance_loss_mlp": 1.01245904, + "epoch": 0.6303321809709905, + "flos": 27976558798080.0, + "grad_norm": 1.6308405764019855, + "language_loss": 0.73385346, + "learning_rate": 1.270077618961487e-06, + "loss": 0.75825524, + "num_input_tokens_seen": 226105890, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18615723, + "step": 10484, + "time_per_iteration": 2.894984006881714 + }, + { + "auxiliary_loss_clip": 0.01421179, + "auxiliary_loss_mlp": 0.01034146, + "balance_loss_clip": 1.25494039, + "balance_loss_mlp": 1.01481032, + "epoch": 0.6303923042236586, + "flos": 28232563731840.0, + "grad_norm": 1.6846028250305394, + "language_loss": 0.75164866, + "learning_rate": 1.2697150351240506e-06, + "loss": 0.77620196, + "num_input_tokens_seen": 226126760, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19335938, + "step": 10485, + "time_per_iteration": 2.9643726348876953 + }, + { + "auxiliary_loss_clip": 0.01437498, + "auxiliary_loss_mlp": 0.01030919, + "balance_loss_clip": 1.26544642, + "balance_loss_mlp": 1.01221538, + "epoch": 0.6304524274763265, + "flos": 27641139960960.0, + "grad_norm": 2.1196606568993044, + "language_loss": 0.82301438, + "learning_rate": 1.269352478979093e-06, + "loss": 0.84769857, + "num_input_tokens_seen": 226147315, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.18713379, + "step": 10486, + "time_per_iteration": 2.8804924488067627 + }, + { + "auxiliary_loss_clip": 0.01415774, + "auxiliary_loss_mlp": 0.01035592, + "balance_loss_clip": 1.25143385, + "balance_loss_mlp": 1.01688766, + "epoch": 0.6305125507289945, + "flos": 17320234590720.0, + "grad_norm": 1.677190623256512, + "language_loss": 0.64554262, + "learning_rate": 1.2689899505403628e-06, + "loss": 0.67005634, + "num_input_tokens_seen": 226165935, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.18713379, + "step": 10487, + "time_per_iteration": 2.8390207290649414 + }, + { + "auxiliary_loss_clip": 0.01414049, + "auxiliary_loss_mlp": 0.01036045, + "balance_loss_clip": 1.25101101, + "balance_loss_mlp": 1.01703095, + "epoch": 0.6305726739816624, + "flos": 25818311698560.0, + "grad_norm": 1.6060041433631136, + "language_loss": 0.67954862, + "learning_rate": 1.2686274498216065e-06, + "loss": 0.70404959, + "num_input_tokens_seen": 226186890, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.18994141, + "step": 10488, + "time_per_iteration": 2.864760637283325 + }, + { + "auxiliary_loss_clip": 0.014256, + "auxiliary_loss_mlp": 0.0103484, + "balance_loss_clip": 1.25828815, + "balance_loss_mlp": 1.01573133, + "epoch": 0.6306327972343304, + "flos": 21807121282560.0, + "grad_norm": 1.8358348520941656, + "language_loss": 0.68310589, + "learning_rate": 1.2682649768365706e-06, + "loss": 0.70771027, + "num_input_tokens_seen": 226206710, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.19104004, + "step": 10489, + "time_per_iteration": 2.8544108867645264 + }, + { + "auxiliary_loss_clip": 0.01445921, + "auxiliary_loss_mlp": 0.01034336, + "balance_loss_clip": 1.27011156, + "balance_loss_mlp": 1.01465416, + "epoch": 0.6306929204869983, + "flos": 20787173579520.0, + "grad_norm": 2.4165003375801954, + "language_loss": 0.71109509, + "learning_rate": 1.2679025315990007e-06, + "loss": 0.73589766, + "num_input_tokens_seen": 226225565, + "router_z_loss_clip": 1.75683594, + "router_z_loss_mlp": 0.19689941, + "step": 10490, + "time_per_iteration": 2.8539507389068604 + }, + { + "auxiliary_loss_clip": 0.01423985, + "auxiliary_loss_mlp": 0.01039479, + "balance_loss_clip": 1.25748289, + "balance_loss_mlp": 1.01936817, + "epoch": 0.6307530437396663, + "flos": 23663322224640.0, + "grad_norm": 1.8540239787011796, + "language_loss": 0.79216623, + "learning_rate": 1.2675401141226393e-06, + "loss": 0.81680089, + "num_input_tokens_seen": 226243680, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.2010498, + "step": 10491, + "time_per_iteration": 2.8690590858459473 + }, + { + "auxiliary_loss_clip": 0.01418932, + "auxiliary_loss_mlp": 0.01037707, + "balance_loss_clip": 1.25510073, + "balance_loss_mlp": 1.01813293, + "epoch": 0.6308131669923343, + "flos": 24730125557760.0, + "grad_norm": 1.931086256739605, + "language_loss": 0.56476295, + "learning_rate": 1.2671777244212308e-06, + "loss": 0.58932936, + "num_input_tokens_seen": 226264345, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19580078, + "step": 10492, + "time_per_iteration": 2.8923726081848145 + }, + { + "auxiliary_loss_clip": 0.01424407, + "auxiliary_loss_mlp": 0.01037373, + "balance_loss_clip": 1.25702465, + "balance_loss_mlp": 1.01688111, + "epoch": 0.6308732902450023, + "flos": 22575724266240.0, + "grad_norm": 1.8839054977085326, + "language_loss": 0.65507388, + "learning_rate": 1.2668153625085168e-06, + "loss": 0.67969167, + "num_input_tokens_seen": 226283165, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.20483398, + "step": 10493, + "time_per_iteration": 2.853484630584717 + }, + { + "auxiliary_loss_clip": 0.01424396, + "auxiliary_loss_mlp": 0.01033705, + "balance_loss_clip": 1.25886631, + "balance_loss_mlp": 1.014274, + "epoch": 0.6309334134976702, + "flos": 24654828931200.0, + "grad_norm": 1.413643785023352, + "language_loss": 0.83106321, + "learning_rate": 1.2664530283982367e-06, + "loss": 0.85564423, + "num_input_tokens_seen": 226304080, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.1940918, + "step": 10494, + "time_per_iteration": 2.891767978668213 + }, + { + "auxiliary_loss_clip": 0.01430827, + "auxiliary_loss_mlp": 0.01036482, + "balance_loss_clip": 1.26401114, + "balance_loss_mlp": 1.01687169, + "epoch": 0.6309935367503382, + "flos": 41443010208000.0, + "grad_norm": 1.70609007257018, + "language_loss": 0.80165601, + "learning_rate": 1.2660907221041317e-06, + "loss": 0.82632899, + "num_input_tokens_seen": 226325925, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.19628906, + "step": 10495, + "time_per_iteration": 3.04705810546875 + }, + { + "auxiliary_loss_clip": 0.0143459, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.26600599, + "balance_loss_mlp": 1.01597714, + "epoch": 0.6310536600030061, + "flos": 15126443061120.0, + "grad_norm": 2.295818361052776, + "language_loss": 0.70975423, + "learning_rate": 1.2657284436399403e-06, + "loss": 0.73445851, + "num_input_tokens_seen": 226344190, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.19848633, + "step": 10496, + "time_per_iteration": 2.8809518814086914 + }, + { + "auxiliary_loss_clip": 0.01426487, + "auxiliary_loss_mlp": 0.01038654, + "balance_loss_clip": 1.25865579, + "balance_loss_mlp": 1.01851916, + "epoch": 0.6311137832556741, + "flos": 15239998805760.0, + "grad_norm": 2.1092866126516805, + "language_loss": 0.81362325, + "learning_rate": 1.2653661930193997e-06, + "loss": 0.8382746, + "num_input_tokens_seen": 226361520, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.20153809, + "step": 10497, + "time_per_iteration": 2.8308322429656982 + }, + { + "auxiliary_loss_clip": 0.01423023, + "auxiliary_loss_mlp": 0.01037315, + "balance_loss_clip": 1.2579906, + "balance_loss_mlp": 1.0187186, + "epoch": 0.6311739065083422, + "flos": 22028984375040.0, + "grad_norm": 1.8842482119237882, + "language_loss": 0.75372469, + "learning_rate": 1.265003970256247e-06, + "loss": 0.77832812, + "num_input_tokens_seen": 226381920, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.18591309, + "step": 10498, + "time_per_iteration": 2.8629560470581055 + }, + { + "auxiliary_loss_clip": 0.01417835, + "auxiliary_loss_mlp": 0.01032428, + "balance_loss_clip": 1.25262737, + "balance_loss_mlp": 1.01346207, + "epoch": 0.6312340297610101, + "flos": 22720978632960.0, + "grad_norm": 2.2903007369616057, + "language_loss": 0.70583129, + "learning_rate": 1.264641775364217e-06, + "loss": 0.73033392, + "num_input_tokens_seen": 226400035, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.18981934, + "step": 10499, + "time_per_iteration": 4.32206654548645 + }, + { + "auxiliary_loss_clip": 0.01419516, + "auxiliary_loss_mlp": 0.0103948, + "balance_loss_clip": 1.2576797, + "balance_loss_mlp": 1.02059722, + "epoch": 0.6312941530136781, + "flos": 24290833363200.0, + "grad_norm": 1.8430327668933115, + "language_loss": 0.70604908, + "learning_rate": 1.2642796083570448e-06, + "loss": 0.7306391, + "num_input_tokens_seen": 226418280, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18896484, + "step": 10500, + "time_per_iteration": 2.911339282989502 + }, + { + "auxiliary_loss_clip": 0.01422316, + "auxiliary_loss_mlp": 0.01035473, + "balance_loss_clip": 1.25806713, + "balance_loss_mlp": 1.0164113, + "epoch": 0.631354276266346, + "flos": 21735987177600.0, + "grad_norm": 1.7882081907757024, + "language_loss": 0.74526304, + "learning_rate": 1.2639174692484634e-06, + "loss": 0.76984096, + "num_input_tokens_seen": 226436650, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.19042969, + "step": 10501, + "time_per_iteration": 2.8315303325653076 + }, + { + "auxiliary_loss_clip": 0.01416959, + "auxiliary_loss_mlp": 0.01042807, + "balance_loss_clip": 1.25270033, + "balance_loss_mlp": 1.02213585, + "epoch": 0.631414399519014, + "flos": 24035869059840.0, + "grad_norm": 1.9630180680924847, + "language_loss": 0.76118946, + "learning_rate": 1.2635553580522053e-06, + "loss": 0.78578711, + "num_input_tokens_seen": 226456275, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.20654297, + "step": 10502, + "time_per_iteration": 2.8702878952026367 + }, + { + "auxiliary_loss_clip": 0.01439619, + "auxiliary_loss_mlp": 0.01039339, + "balance_loss_clip": 1.26853657, + "balance_loss_mlp": 1.01930022, + "epoch": 0.6314745227716819, + "flos": 24326332548480.0, + "grad_norm": 3.0653528436270863, + "language_loss": 0.8611083, + "learning_rate": 1.2631932747820022e-06, + "loss": 0.88589787, + "num_input_tokens_seen": 226473610, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.20031738, + "step": 10503, + "time_per_iteration": 2.8569366931915283 + }, + { + "auxiliary_loss_clip": 0.0142467, + "auxiliary_loss_mlp": 0.01036931, + "balance_loss_clip": 1.25800788, + "balance_loss_mlp": 1.01736832, + "epoch": 0.6315346460243499, + "flos": 23376478320000.0, + "grad_norm": 2.079009800493109, + "language_loss": 0.87146699, + "learning_rate": 1.2628312194515838e-06, + "loss": 0.896083, + "num_input_tokens_seen": 226493665, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.19555664, + "step": 10504, + "time_per_iteration": 2.9101741313934326 + }, + { + "auxiliary_loss_clip": 0.01446957, + "auxiliary_loss_mlp": 0.01040364, + "balance_loss_clip": 1.27428555, + "balance_loss_mlp": 1.0208497, + "epoch": 0.6315947692770179, + "flos": 20268376992000.0, + "grad_norm": 1.6415632667106703, + "language_loss": 0.77539301, + "learning_rate": 1.2624691920746793e-06, + "loss": 0.80026615, + "num_input_tokens_seen": 226511625, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.19506836, + "step": 10505, + "time_per_iteration": 2.8791158199310303 + }, + { + "auxiliary_loss_clip": 0.01424374, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.25766659, + "balance_loss_mlp": 1.01769352, + "epoch": 0.6316548925296859, + "flos": 25277001183360.0, + "grad_norm": 1.8986532173010484, + "language_loss": 0.82276058, + "learning_rate": 1.2621071926650166e-06, + "loss": 0.84737825, + "num_input_tokens_seen": 226530085, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.19714355, + "step": 10506, + "time_per_iteration": 2.8691341876983643 + }, + { + "auxiliary_loss_clip": 0.01430548, + "auxiliary_loss_mlp": 0.01038963, + "balance_loss_clip": 1.26421356, + "balance_loss_mlp": 1.01930547, + "epoch": 0.6317150157823538, + "flos": 22941077178240.0, + "grad_norm": 1.8893211574641704, + "language_loss": 0.75416791, + "learning_rate": 1.2617452212363238e-06, + "loss": 0.77886301, + "num_input_tokens_seen": 226548115, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.1965332, + "step": 10507, + "time_per_iteration": 2.861525058746338 + }, + { + "auxiliary_loss_clip": 0.01441766, + "auxiliary_loss_mlp": 0.01042201, + "balance_loss_clip": 1.27263606, + "balance_loss_mlp": 1.02243638, + "epoch": 0.6317751390350218, + "flos": 22536876965760.0, + "grad_norm": 1.792535568939442, + "language_loss": 0.68628514, + "learning_rate": 1.2613832778023258e-06, + "loss": 0.7111249, + "num_input_tokens_seen": 226567955, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.19775391, + "step": 10508, + "time_per_iteration": 2.8924825191497803 + }, + { + "auxiliary_loss_clip": 0.01421017, + "auxiliary_loss_mlp": 0.01039372, + "balance_loss_clip": 1.25573885, + "balance_loss_mlp": 1.01965451, + "epoch": 0.6318352622876897, + "flos": 23305163235840.0, + "grad_norm": 1.9281275185590812, + "language_loss": 0.713359, + "learning_rate": 1.2610213623767478e-06, + "loss": 0.73796296, + "num_input_tokens_seen": 226588205, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19714355, + "step": 10509, + "time_per_iteration": 2.8950679302215576 + }, + { + "auxiliary_loss_clip": 0.01422941, + "auxiliary_loss_mlp": 0.0103887, + "balance_loss_clip": 1.25794923, + "balance_loss_mlp": 1.01930773, + "epoch": 0.6318953855403577, + "flos": 20713777234560.0, + "grad_norm": 1.5260279726563561, + "language_loss": 0.80030364, + "learning_rate": 1.2606594749733143e-06, + "loss": 0.82492173, + "num_input_tokens_seen": 226606965, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19567871, + "step": 10510, + "time_per_iteration": 4.252953052520752 + }, + { + "auxiliary_loss_clip": 0.01422586, + "auxiliary_loss_mlp": 0.01035466, + "balance_loss_clip": 1.2551527, + "balance_loss_mlp": 1.01646447, + "epoch": 0.6319555087930258, + "flos": 22830236121600.0, + "grad_norm": 1.5834329066402621, + "language_loss": 0.71119618, + "learning_rate": 1.2602976156057469e-06, + "loss": 0.73577666, + "num_input_tokens_seen": 226627845, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.18994141, + "step": 10511, + "time_per_iteration": 2.898562431335449 + }, + { + "auxiliary_loss_clip": 0.0142574, + "auxiliary_loss_mlp": 0.01038501, + "balance_loss_clip": 1.26306009, + "balance_loss_mlp": 1.01901007, + "epoch": 0.6320156320456937, + "flos": 19979542316160.0, + "grad_norm": 1.8134956262785868, + "language_loss": 0.80726171, + "learning_rate": 1.2599357842877684e-06, + "loss": 0.83190411, + "num_input_tokens_seen": 226645855, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19494629, + "step": 10512, + "time_per_iteration": 5.691515684127808 + }, + { + "auxiliary_loss_clip": 0.01436341, + "auxiliary_loss_mlp": 0.01039214, + "balance_loss_clip": 1.26985204, + "balance_loss_mlp": 1.01941371, + "epoch": 0.6320757552983617, + "flos": 27024035126400.0, + "grad_norm": 1.7187220387010815, + "language_loss": 0.71289921, + "learning_rate": 1.2595739810330994e-06, + "loss": 0.73765481, + "num_input_tokens_seen": 226665375, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19824219, + "step": 10513, + "time_per_iteration": 2.905351400375366 + }, + { + "auxiliary_loss_clip": 0.0143809, + "auxiliary_loss_mlp": 0.01033306, + "balance_loss_clip": 1.26722598, + "balance_loss_mlp": 1.01380348, + "epoch": 0.6321358785510296, + "flos": 23706422536320.0, + "grad_norm": 1.6906892345051308, + "language_loss": 0.67816818, + "learning_rate": 1.259212205855459e-06, + "loss": 0.70288217, + "num_input_tokens_seen": 226685270, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.19470215, + "step": 10514, + "time_per_iteration": 2.8747355937957764 + }, + { + "auxiliary_loss_clip": 0.01409146, + "auxiliary_loss_mlp": 0.01036545, + "balance_loss_clip": 1.24623489, + "balance_loss_mlp": 1.01776993, + "epoch": 0.6321960018036976, + "flos": 26006485397760.0, + "grad_norm": 2.8304227953892336, + "language_loss": 0.75332016, + "learning_rate": 1.2588504587685663e-06, + "loss": 0.77777708, + "num_input_tokens_seen": 226705325, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.18798828, + "step": 10515, + "time_per_iteration": 2.939897298812866 + }, + { + "auxiliary_loss_clip": 0.01409215, + "auxiliary_loss_mlp": 0.01034818, + "balance_loss_clip": 1.24830627, + "balance_loss_mlp": 1.01541042, + "epoch": 0.6322561250563655, + "flos": 22831457731200.0, + "grad_norm": 1.9131957798509478, + "language_loss": 0.90651405, + "learning_rate": 1.2584887397861379e-06, + "loss": 0.93095446, + "num_input_tokens_seen": 226723815, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19396973, + "step": 10516, + "time_per_iteration": 2.869004726409912 + }, + { + "auxiliary_loss_clip": 0.01447067, + "auxiliary_loss_mlp": 0.01033036, + "balance_loss_clip": 1.27319884, + "balance_loss_mlp": 1.01353383, + "epoch": 0.6323162483090335, + "flos": 18997446528000.0, + "grad_norm": 1.6796464906271946, + "language_loss": 0.82083821, + "learning_rate": 1.2581270489218911e-06, + "loss": 0.84563923, + "num_input_tokens_seen": 226741550, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.19519043, + "step": 10517, + "time_per_iteration": 2.840836524963379 + }, + { + "auxiliary_loss_clip": 0.01422464, + "auxiliary_loss_mlp": 0.01033131, + "balance_loss_clip": 1.2579248, + "balance_loss_mlp": 1.01435542, + "epoch": 0.6323763715617015, + "flos": 19874718817920.0, + "grad_norm": 2.7719133546030723, + "language_loss": 0.78543961, + "learning_rate": 1.257765386189541e-06, + "loss": 0.80999553, + "num_input_tokens_seen": 226761115, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.18774414, + "step": 10518, + "time_per_iteration": 2.898045063018799 + }, + { + "auxiliary_loss_clip": 0.01420302, + "auxiliary_loss_mlp": 0.0103285, + "balance_loss_clip": 1.2573514, + "balance_loss_mlp": 1.01470613, + "epoch": 0.6324364948143695, + "flos": 22792655675520.0, + "grad_norm": 1.5124320164087828, + "language_loss": 0.85788226, + "learning_rate": 1.2574037516028018e-06, + "loss": 0.88241374, + "num_input_tokens_seen": 226782225, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.18139648, + "step": 10519, + "time_per_iteration": 2.9048378467559814 + }, + { + "auxiliary_loss_clip": 0.01404907, + "auxiliary_loss_mlp": 0.01030995, + "balance_loss_clip": 1.24406362, + "balance_loss_mlp": 1.01195729, + "epoch": 0.6324966180670374, + "flos": 22245870539520.0, + "grad_norm": 1.543518392375552, + "language_loss": 0.7330997, + "learning_rate": 1.2570421451753867e-06, + "loss": 0.75745869, + "num_input_tokens_seen": 226802375, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19018555, + "step": 10520, + "time_per_iteration": 2.9655165672302246 + }, + { + "auxiliary_loss_clip": 0.01419057, + "auxiliary_loss_mlp": 0.0103215, + "balance_loss_clip": 1.25381947, + "balance_loss_mlp": 1.01236176, + "epoch": 0.6325567413197054, + "flos": 21699130648320.0, + "grad_norm": 1.9346482062674295, + "language_loss": 0.72702128, + "learning_rate": 1.2566805669210081e-06, + "loss": 0.75153339, + "num_input_tokens_seen": 226822165, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19787598, + "step": 10521, + "time_per_iteration": 2.8854384422302246 + }, + { + "auxiliary_loss_clip": 0.01427194, + "auxiliary_loss_mlp": 0.01035222, + "balance_loss_clip": 1.26037276, + "balance_loss_mlp": 1.01498032, + "epoch": 0.6326168645723733, + "flos": 19946441105280.0, + "grad_norm": 2.037451467991628, + "language_loss": 0.72805476, + "learning_rate": 1.256319016853377e-06, + "loss": 0.75267899, + "num_input_tokens_seen": 226841645, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.20251465, + "step": 10522, + "time_per_iteration": 2.8542914390563965 + }, + { + "auxiliary_loss_clip": 0.01416596, + "auxiliary_loss_mlp": 0.01032307, + "balance_loss_clip": 1.25352395, + "balance_loss_mlp": 1.01192236, + "epoch": 0.6326769878250413, + "flos": 20240071729920.0, + "grad_norm": 1.701708250380814, + "language_loss": 0.81812918, + "learning_rate": 1.2559574949862023e-06, + "loss": 0.84261817, + "num_input_tokens_seen": 226860355, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.20385742, + "step": 10523, + "time_per_iteration": 2.87210750579834 + }, + { + "auxiliary_loss_clip": 0.01420129, + "auxiliary_loss_mlp": 0.01030285, + "balance_loss_clip": 1.25536811, + "balance_loss_mlp": 1.01118779, + "epoch": 0.6327371110777094, + "flos": 20785137563520.0, + "grad_norm": 1.9058313190830092, + "language_loss": 0.74522936, + "learning_rate": 1.255596001333195e-06, + "loss": 0.76973349, + "num_input_tokens_seen": 226878390, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19091797, + "step": 10524, + "time_per_iteration": 2.8323171138763428 + }, + { + "auxiliary_loss_clip": 0.01441937, + "auxiliary_loss_mlp": 0.01037792, + "balance_loss_clip": 1.26971507, + "balance_loss_mlp": 1.01763332, + "epoch": 0.6327972343303773, + "flos": 30348796394880.0, + "grad_norm": 2.031117756121758, + "language_loss": 0.84974396, + "learning_rate": 1.2552345359080615e-06, + "loss": 0.87454116, + "num_input_tokens_seen": 226898420, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.20141602, + "step": 10525, + "time_per_iteration": 2.9047927856445312 + }, + { + "auxiliary_loss_clip": 0.0141916, + "auxiliary_loss_mlp": 0.01031341, + "balance_loss_clip": 1.25464296, + "balance_loss_mlp": 1.01289928, + "epoch": 0.6328573575830453, + "flos": 17101086186240.0, + "grad_norm": 1.9708637319865374, + "language_loss": 0.66927826, + "learning_rate": 1.2548730987245093e-06, + "loss": 0.69378328, + "num_input_tokens_seen": 226916305, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.18432617, + "step": 10526, + "time_per_iteration": 2.8409132957458496 + }, + { + "auxiliary_loss_clip": 0.01436708, + "auxiliary_loss_mlp": 0.01037805, + "balance_loss_clip": 1.2675277, + "balance_loss_mlp": 1.01794505, + "epoch": 0.6329174808357132, + "flos": 25058305226880.0, + "grad_norm": 1.6176299707856039, + "language_loss": 0.74040598, + "learning_rate": 1.254511689796244e-06, + "loss": 0.76515114, + "num_input_tokens_seen": 226937705, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.19836426, + "step": 10527, + "time_per_iteration": 2.8826372623443604 + }, + { + "auxiliary_loss_clip": 0.01413817, + "auxiliary_loss_mlp": 0.01034133, + "balance_loss_clip": 1.25242472, + "balance_loss_mlp": 1.01573896, + "epoch": 0.6329776040883812, + "flos": 16845578945280.0, + "grad_norm": 1.9729898440053824, + "language_loss": 0.71718144, + "learning_rate": 1.2541503091369693e-06, + "loss": 0.74166095, + "num_input_tokens_seen": 226954880, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18395996, + "step": 10528, + "time_per_iteration": 2.8388054370880127 + }, + { + "auxiliary_loss_clip": 0.01426374, + "auxiliary_loss_mlp": 0.01035212, + "balance_loss_clip": 1.26032686, + "balance_loss_mlp": 1.0149107, + "epoch": 0.6330377273410491, + "flos": 13524618240000.0, + "grad_norm": 2.7303451262002643, + "language_loss": 0.67797667, + "learning_rate": 1.2537889567603905e-06, + "loss": 0.70259255, + "num_input_tokens_seen": 226972595, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.20300293, + "step": 10529, + "time_per_iteration": 2.8005096912384033 + }, + { + "auxiliary_loss_clip": 0.01442041, + "auxiliary_loss_mlp": 0.01032994, + "balance_loss_clip": 1.27126873, + "balance_loss_mlp": 1.01295519, + "epoch": 0.6330978505937171, + "flos": 21547587254400.0, + "grad_norm": 1.8203635008792771, + "language_loss": 0.76705837, + "learning_rate": 1.2534276326802092e-06, + "loss": 0.79180872, + "num_input_tokens_seen": 226991910, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.20031738, + "step": 10530, + "time_per_iteration": 2.861442804336548 + }, + { + "auxiliary_loss_clip": 0.01458999, + "auxiliary_loss_mlp": 0.01037478, + "balance_loss_clip": 1.2883389, + "balance_loss_mlp": 1.01814246, + "epoch": 0.6331579738463851, + "flos": 25020317577600.0, + "grad_norm": 1.673562030902328, + "language_loss": 0.73910457, + "learning_rate": 1.2530663369101259e-06, + "loss": 0.76406938, + "num_input_tokens_seen": 227010175, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.19348145, + "step": 10531, + "time_per_iteration": 2.8916373252868652 + }, + { + "auxiliary_loss_clip": 0.01423073, + "auxiliary_loss_mlp": 0.01039711, + "balance_loss_clip": 1.25963473, + "balance_loss_mlp": 1.01956427, + "epoch": 0.6332180970990531, + "flos": 14984265340800.0, + "grad_norm": 2.4017957497904185, + "language_loss": 0.80553567, + "learning_rate": 1.2527050694638432e-06, + "loss": 0.83016348, + "num_input_tokens_seen": 227025540, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.20129395, + "step": 10532, + "time_per_iteration": 2.860258102416992 + }, + { + "auxiliary_loss_clip": 0.01410544, + "auxiliary_loss_mlp": 0.01032302, + "balance_loss_clip": 1.24768066, + "balance_loss_mlp": 1.01427722, + "epoch": 0.633278220351721, + "flos": 22716182684160.0, + "grad_norm": 1.5451254221100283, + "language_loss": 0.75251019, + "learning_rate": 1.2523438303550582e-06, + "loss": 0.77693868, + "num_input_tokens_seen": 227045520, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.18029785, + "step": 10533, + "time_per_iteration": 2.8877639770507812 + }, + { + "auxiliary_loss_clip": 0.01438459, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.26699817, + "balance_loss_mlp": 1.01731253, + "epoch": 0.633338343604389, + "flos": 12610172707200.0, + "grad_norm": 2.4406799942180206, + "language_loss": 0.77860999, + "learning_rate": 1.2519826195974706e-06, + "loss": 0.80336857, + "num_input_tokens_seen": 227059420, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.20092773, + "step": 10534, + "time_per_iteration": 4.242862701416016 + }, + { + "auxiliary_loss_clip": 0.01430046, + "auxiliary_loss_mlp": 0.01036701, + "balance_loss_clip": 1.26432216, + "balance_loss_mlp": 1.01650703, + "epoch": 0.6333984668570569, + "flos": 25971710129280.0, + "grad_norm": 1.7330164188574384, + "language_loss": 0.86815417, + "learning_rate": 1.251621437204777e-06, + "loss": 0.89282161, + "num_input_tokens_seen": 227081310, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.20214844, + "step": 10535, + "time_per_iteration": 2.921600818634033 + }, + { + "auxiliary_loss_clip": 0.01424366, + "auxiliary_loss_mlp": 0.01035079, + "balance_loss_clip": 1.2585237, + "balance_loss_mlp": 1.01508784, + "epoch": 0.6334585901097249, + "flos": 23669656496640.0, + "grad_norm": 2.763463269017597, + "language_loss": 0.77311385, + "learning_rate": 1.2512602831906733e-06, + "loss": 0.79770827, + "num_input_tokens_seen": 227100365, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.1998291, + "step": 10536, + "time_per_iteration": 2.9528253078460693 + }, + { + "auxiliary_loss_clip": 0.01425968, + "auxiliary_loss_mlp": 0.01031428, + "balance_loss_clip": 1.26197505, + "balance_loss_mlp": 1.01180601, + "epoch": 0.633518713362393, + "flos": 28770480887040.0, + "grad_norm": 1.8092952742929462, + "language_loss": 0.61058491, + "learning_rate": 1.250899157568855e-06, + "loss": 0.6351589, + "num_input_tokens_seen": 227119680, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19616699, + "step": 10537, + "time_per_iteration": 2.929633855819702 + }, + { + "auxiliary_loss_clip": 0.01193576, + "auxiliary_loss_mlp": 0.01038261, + "balance_loss_clip": 1.10155034, + "balance_loss_mlp": 1.01623094, + "epoch": 0.6335788366150609, + "flos": 70448791893120.0, + "grad_norm": 0.7813614224732381, + "language_loss": 0.5248059, + "learning_rate": 1.2505380603530155e-06, + "loss": 0.54712427, + "num_input_tokens_seen": 227184465, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.22070312, + "step": 10538, + "time_per_iteration": 3.4122090339660645 + }, + { + "auxiliary_loss_clip": 0.01440156, + "auxiliary_loss_mlp": 0.01037019, + "balance_loss_clip": 1.26846313, + "balance_loss_mlp": 1.01626432, + "epoch": 0.6336389598677289, + "flos": 23742464659200.0, + "grad_norm": 1.9199142985724913, + "language_loss": 0.83990854, + "learning_rate": 1.250176991556848e-06, + "loss": 0.86468029, + "num_input_tokens_seen": 227202185, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.20751953, + "step": 10539, + "time_per_iteration": 2.868924140930176 + }, + { + "auxiliary_loss_clip": 0.0143099, + "auxiliary_loss_mlp": 0.01034607, + "balance_loss_clip": 1.26110244, + "balance_loss_mlp": 1.01356649, + "epoch": 0.6336990831203968, + "flos": 29288191599360.0, + "grad_norm": 1.709345226528231, + "language_loss": 0.87604523, + "learning_rate": 1.2498159511940438e-06, + "loss": 0.90070117, + "num_input_tokens_seen": 227222020, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.21020508, + "step": 10540, + "time_per_iteration": 2.9496352672576904 + }, + { + "auxiliary_loss_clip": 0.01420355, + "auxiliary_loss_mlp": 0.01030051, + "balance_loss_clip": 1.25611067, + "balance_loss_mlp": 1.01123989, + "epoch": 0.6337592063730648, + "flos": 29108976370560.0, + "grad_norm": 3.7001424451825806, + "language_loss": 0.72816366, + "learning_rate": 1.2494549392782943e-06, + "loss": 0.75266773, + "num_input_tokens_seen": 227240885, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.18811035, + "step": 10541, + "time_per_iteration": 2.93251371383667 + }, + { + "auxiliary_loss_clip": 0.01457119, + "auxiliary_loss_mlp": 0.01034488, + "balance_loss_clip": 1.28488922, + "balance_loss_mlp": 1.01436543, + "epoch": 0.6338193296257327, + "flos": 34717693351680.0, + "grad_norm": 2.0763623427892273, + "language_loss": 0.8587181, + "learning_rate": 1.2490939558232887e-06, + "loss": 0.88363409, + "num_input_tokens_seen": 227257880, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.20129395, + "step": 10542, + "time_per_iteration": 3.008507251739502 + }, + { + "auxiliary_loss_clip": 0.01415425, + "auxiliary_loss_mlp": 0.01031271, + "balance_loss_clip": 1.25064671, + "balance_loss_mlp": 1.01167297, + "epoch": 0.6338794528784008, + "flos": 16695438140160.0, + "grad_norm": 1.6106011262312403, + "language_loss": 0.78413427, + "learning_rate": 1.2487330008427153e-06, + "loss": 0.80860126, + "num_input_tokens_seen": 227274840, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19616699, + "step": 10543, + "time_per_iteration": 2.8744728565216064 + }, + { + "auxiliary_loss_clip": 0.01405568, + "auxiliary_loss_mlp": 0.01034781, + "balance_loss_clip": 1.24530458, + "balance_loss_mlp": 1.0151825, + "epoch": 0.6339395761310687, + "flos": 22356892575360.0, + "grad_norm": 2.2845949708900903, + "language_loss": 0.74164867, + "learning_rate": 1.2483720743502618e-06, + "loss": 0.76605213, + "num_input_tokens_seen": 227294835, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19604492, + "step": 10544, + "time_per_iteration": 4.366663455963135 + }, + { + "auxiliary_loss_clip": 0.01446581, + "auxiliary_loss_mlp": 0.0103331, + "balance_loss_clip": 1.27563477, + "balance_loss_mlp": 1.01263916, + "epoch": 0.6339996993837367, + "flos": 18561050000640.0, + "grad_norm": 2.8147173447636686, + "language_loss": 0.6920526, + "learning_rate": 1.2480111763596144e-06, + "loss": 0.71685147, + "num_input_tokens_seen": 227314935, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.20654297, + "step": 10545, + "time_per_iteration": 2.8865580558776855 + }, + { + "auxiliary_loss_clip": 0.01405339, + "auxiliary_loss_mlp": 0.0103391, + "balance_loss_clip": 1.24261796, + "balance_loss_mlp": 1.01446688, + "epoch": 0.6340598226364046, + "flos": 12977290166400.0, + "grad_norm": 2.0043148903065866, + "language_loss": 0.72218841, + "learning_rate": 1.2476503068844592e-06, + "loss": 0.74658084, + "num_input_tokens_seen": 227332905, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19458008, + "step": 10546, + "time_per_iteration": 4.264515399932861 + }, + { + "auxiliary_loss_clip": 0.01405071, + "auxiliary_loss_mlp": 0.01035518, + "balance_loss_clip": 1.24500895, + "balance_loss_mlp": 1.01555037, + "epoch": 0.6341199458890726, + "flos": 26699294062080.0, + "grad_norm": 1.3576352523981463, + "language_loss": 0.78601122, + "learning_rate": 1.2472894659384792e-06, + "loss": 0.81041718, + "num_input_tokens_seen": 227354915, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19958496, + "step": 10547, + "time_per_iteration": 4.348164319992065 + }, + { + "auxiliary_loss_clip": 0.0143357, + "auxiliary_loss_mlp": 0.01035067, + "balance_loss_clip": 1.26420236, + "balance_loss_mlp": 1.01523089, + "epoch": 0.6341800691417405, + "flos": 18743568099840.0, + "grad_norm": 1.878477914759935, + "language_loss": 0.64017516, + "learning_rate": 1.2469286535353578e-06, + "loss": 0.66486156, + "num_input_tokens_seen": 227372990, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.19836426, + "step": 10548, + "time_per_iteration": 2.835376024246216 + }, + { + "auxiliary_loss_clip": 0.0142116, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.25475979, + "balance_loss_mlp": 1.01407897, + "epoch": 0.6342401923944085, + "flos": 26260454315520.0, + "grad_norm": 1.6897725618060273, + "language_loss": 0.62642962, + "learning_rate": 1.2465678696887785e-06, + "loss": 0.65097713, + "num_input_tokens_seen": 227393270, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.19519043, + "step": 10549, + "time_per_iteration": 2.916325330734253 + }, + { + "auxiliary_loss_clip": 0.01415569, + "auxiliary_loss_mlp": 0.01033787, + "balance_loss_clip": 1.25073195, + "balance_loss_mlp": 1.01406944, + "epoch": 0.6343003156470765, + "flos": 24691640215680.0, + "grad_norm": 1.7800709781878616, + "language_loss": 0.74584419, + "learning_rate": 1.2462071144124197e-06, + "loss": 0.7703377, + "num_input_tokens_seen": 227413630, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.19714355, + "step": 10550, + "time_per_iteration": 2.8998491764068604 + }, + { + "auxiliary_loss_clip": 0.01192626, + "auxiliary_loss_mlp": 0.01018224, + "balance_loss_clip": 1.10013199, + "balance_loss_mlp": 0.99628985, + "epoch": 0.6343604388997445, + "flos": 69835035173760.0, + "grad_norm": 0.7241466515962187, + "language_loss": 0.57686305, + "learning_rate": 1.2458463877199638e-06, + "loss": 0.59897149, + "num_input_tokens_seen": 227476630, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.21972656, + "step": 10551, + "time_per_iteration": 3.3953652381896973 + }, + { + "auxiliary_loss_clip": 0.01415556, + "auxiliary_loss_mlp": 0.01033663, + "balance_loss_clip": 1.24998713, + "balance_loss_mlp": 1.0132426, + "epoch": 0.6344205621524125, + "flos": 21992806517760.0, + "grad_norm": 2.119479702480203, + "language_loss": 0.67833209, + "learning_rate": 1.2454856896250881e-06, + "loss": 0.70282435, + "num_input_tokens_seen": 227496060, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.20410156, + "step": 10552, + "time_per_iteration": 2.8635590076446533 + }, + { + "auxiliary_loss_clip": 0.01428623, + "auxiliary_loss_mlp": 0.01030289, + "balance_loss_clip": 1.25920224, + "balance_loss_mlp": 1.01029742, + "epoch": 0.6344806854050804, + "flos": 20458541462400.0, + "grad_norm": 1.6433394350698887, + "language_loss": 0.83575642, + "learning_rate": 1.24512502014147e-06, + "loss": 0.8603456, + "num_input_tokens_seen": 227513440, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.19970703, + "step": 10553, + "time_per_iteration": 2.8822944164276123 + }, + { + "auxiliary_loss_clip": 0.01434637, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.26499844, + "balance_loss_mlp": 1.01831174, + "epoch": 0.6345408086577484, + "flos": 40524085440000.0, + "grad_norm": 3.1703811654028393, + "language_loss": 0.55994231, + "learning_rate": 1.2447643792827879e-06, + "loss": 0.58467948, + "num_input_tokens_seen": 227535395, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.20776367, + "step": 10554, + "time_per_iteration": 2.997152090072632 + }, + { + "auxiliary_loss_clip": 0.01426271, + "auxiliary_loss_mlp": 0.01035437, + "balance_loss_clip": 1.25935173, + "balance_loss_mlp": 1.01558876, + "epoch": 0.6346009319104163, + "flos": 21371312937600.0, + "grad_norm": 1.7265979553720885, + "language_loss": 0.71625817, + "learning_rate": 1.2444037670627153e-06, + "loss": 0.74087524, + "num_input_tokens_seen": 227554545, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.19848633, + "step": 10555, + "time_per_iteration": 2.873615026473999 + }, + { + "auxiliary_loss_clip": 0.01190227, + "auxiliary_loss_mlp": 0.01032749, + "balance_loss_clip": 1.09946394, + "balance_loss_mlp": 1.01644158, + "epoch": 0.6346610551630844, + "flos": 71393216745600.0, + "grad_norm": 0.7866523840235039, + "language_loss": 0.55497593, + "learning_rate": 1.2440431834949276e-06, + "loss": 0.57720566, + "num_input_tokens_seen": 227608575, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.16308594, + "step": 10556, + "time_per_iteration": 3.3035776615142822 + }, + { + "auxiliary_loss_clip": 0.01429278, + "auxiliary_loss_mlp": 0.0103595, + "balance_loss_clip": 1.2592206, + "balance_loss_mlp": 1.01530325, + "epoch": 0.6347211784157523, + "flos": 25422617508480.0, + "grad_norm": 2.0608695851191583, + "language_loss": 0.68971574, + "learning_rate": 1.2436826285930985e-06, + "loss": 0.71436799, + "num_input_tokens_seen": 227628175, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.2064209, + "step": 10557, + "time_per_iteration": 2.8911702632904053 + }, + { + "auxiliary_loss_clip": 0.01417474, + "auxiliary_loss_mlp": 0.01030474, + "balance_loss_clip": 1.25124598, + "balance_loss_mlp": 1.01022053, + "epoch": 0.6347813016684203, + "flos": 15751330001280.0, + "grad_norm": 1.5402664149742429, + "language_loss": 0.70862573, + "learning_rate": 1.2433221023709002e-06, + "loss": 0.73310518, + "num_input_tokens_seen": 227645330, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.20263672, + "step": 10558, + "time_per_iteration": 2.8304927349090576 + }, + { + "auxiliary_loss_clip": 0.01413045, + "auxiliary_loss_mlp": 0.0103432, + "balance_loss_clip": 1.24721932, + "balance_loss_mlp": 1.0136373, + "epoch": 0.6348414249210882, + "flos": 21473240768640.0, + "grad_norm": 1.5535912548941604, + "language_loss": 0.78892314, + "learning_rate": 1.2429616048420031e-06, + "loss": 0.81339675, + "num_input_tokens_seen": 227665250, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.20678711, + "step": 10559, + "time_per_iteration": 2.8969550132751465 + }, + { + "auxiliary_loss_clip": 0.01426814, + "auxiliary_loss_mlp": 0.01038156, + "balance_loss_clip": 1.25940561, + "balance_loss_mlp": 1.01743698, + "epoch": 0.6349015481737562, + "flos": 21663088525440.0, + "grad_norm": 1.6554719594091163, + "language_loss": 0.69258499, + "learning_rate": 1.242601136020078e-06, + "loss": 0.71723461, + "num_input_tokens_seen": 227685070, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.20727539, + "step": 10560, + "time_per_iteration": 2.8830788135528564 + }, + { + "auxiliary_loss_clip": 0.01424044, + "auxiliary_loss_mlp": 0.01035397, + "balance_loss_clip": 1.25664949, + "balance_loss_mlp": 1.01489294, + "epoch": 0.6349616714264241, + "flos": 22203765613440.0, + "grad_norm": 1.6864240167462674, + "language_loss": 0.77578455, + "learning_rate": 1.2422406959187939e-06, + "loss": 0.80037892, + "num_input_tokens_seen": 227704430, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.2052002, + "step": 10561, + "time_per_iteration": 2.908672332763672 + }, + { + "auxiliary_loss_clip": 0.01430751, + "auxiliary_loss_mlp": 0.01036716, + "balance_loss_clip": 1.26137996, + "balance_loss_mlp": 1.01630735, + "epoch": 0.6350217946790921, + "flos": 25421169674880.0, + "grad_norm": 1.8462626674297673, + "language_loss": 0.73129296, + "learning_rate": 1.2418802845518178e-06, + "loss": 0.75596762, + "num_input_tokens_seen": 227724920, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.20410156, + "step": 10562, + "time_per_iteration": 2.9103801250457764 + }, + { + "auxiliary_loss_clip": 0.01433511, + "auxiliary_loss_mlp": 0.01035885, + "balance_loss_clip": 1.2648387, + "balance_loss_mlp": 1.0153091, + "epoch": 0.63508191793176, + "flos": 19728378576000.0, + "grad_norm": 2.1972553024895807, + "language_loss": 0.81327558, + "learning_rate": 1.2415199019328185e-06, + "loss": 0.83796954, + "num_input_tokens_seen": 227743400, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.20568848, + "step": 10563, + "time_per_iteration": 2.960369348526001 + }, + { + "auxiliary_loss_clip": 0.01429755, + "auxiliary_loss_mlp": 0.01037955, + "balance_loss_clip": 1.26174676, + "balance_loss_mlp": 1.01761806, + "epoch": 0.6351420411844281, + "flos": 18196059047040.0, + "grad_norm": 4.580599084713165, + "language_loss": 0.81768715, + "learning_rate": 1.2411595480754597e-06, + "loss": 0.84236425, + "num_input_tokens_seen": 227759990, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.20336914, + "step": 10564, + "time_per_iteration": 2.855085611343384 + }, + { + "auxiliary_loss_clip": 0.01418043, + "auxiliary_loss_mlp": 0.01035088, + "balance_loss_clip": 1.25170648, + "balance_loss_mlp": 1.0163604, + "epoch": 0.6352021644370961, + "flos": 33738809944320.0, + "grad_norm": 1.741208974667203, + "language_loss": 0.73491901, + "learning_rate": 1.240799222993407e-06, + "loss": 0.75945032, + "num_input_tokens_seen": 227780835, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.18737793, + "step": 10565, + "time_per_iteration": 2.994623899459839 + }, + { + "auxiliary_loss_clip": 0.01434738, + "auxiliary_loss_mlp": 0.01035656, + "balance_loss_clip": 1.26567805, + "balance_loss_mlp": 1.01475894, + "epoch": 0.635262287689764, + "flos": 20384149731840.0, + "grad_norm": 3.135448863318975, + "language_loss": 0.70233369, + "learning_rate": 1.240438926700324e-06, + "loss": 0.72703767, + "num_input_tokens_seen": 227798580, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.20898438, + "step": 10566, + "time_per_iteration": 2.860374689102173 + }, + { + "auxiliary_loss_clip": 0.01413688, + "auxiliary_loss_mlp": 0.01035001, + "balance_loss_clip": 1.25135374, + "balance_loss_mlp": 1.01645255, + "epoch": 0.635322410942432, + "flos": 27536497441920.0, + "grad_norm": 12.913596653768852, + "language_loss": 0.70407057, + "learning_rate": 1.2400786592098725e-06, + "loss": 0.72855753, + "num_input_tokens_seen": 227819210, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.1854248, + "step": 10567, + "time_per_iteration": 2.938150644302368 + }, + { + "auxiliary_loss_clip": 0.01415558, + "auxiliary_loss_mlp": 0.01032865, + "balance_loss_clip": 1.25348902, + "balance_loss_mlp": 1.01361287, + "epoch": 0.6353825341950999, + "flos": 21553604812800.0, + "grad_norm": 2.2653138809590065, + "language_loss": 0.85105354, + "learning_rate": 1.2397184205357154e-06, + "loss": 0.87553775, + "num_input_tokens_seen": 227838340, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19238281, + "step": 10568, + "time_per_iteration": 2.889739751815796 + }, + { + "auxiliary_loss_clip": 0.01419647, + "auxiliary_loss_mlp": 0.01034681, + "balance_loss_clip": 1.25323343, + "balance_loss_mlp": 1.01526165, + "epoch": 0.635442657447768, + "flos": 31772446617600.0, + "grad_norm": 2.077714547770781, + "language_loss": 0.85312241, + "learning_rate": 1.2393582106915113e-06, + "loss": 0.87766558, + "num_input_tokens_seen": 227859170, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.19421387, + "step": 10569, + "time_per_iteration": 4.331791162490845 + }, + { + "auxiliary_loss_clip": 0.01401536, + "auxiliary_loss_mlp": 0.01036767, + "balance_loss_clip": 1.23854494, + "balance_loss_mlp": 1.01706183, + "epoch": 0.6355027807004359, + "flos": 19838405226240.0, + "grad_norm": 1.577611732564103, + "language_loss": 0.70315707, + "learning_rate": 1.2389980296909198e-06, + "loss": 0.72754008, + "num_input_tokens_seen": 227878545, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19702148, + "step": 10570, + "time_per_iteration": 2.8639445304870605 + }, + { + "auxiliary_loss_clip": 0.01423435, + "auxiliary_loss_mlp": 0.01033131, + "balance_loss_clip": 1.25497532, + "balance_loss_mlp": 1.01319909, + "epoch": 0.6355629039531039, + "flos": 30384476559360.0, + "grad_norm": 1.707781046795074, + "language_loss": 0.67111802, + "learning_rate": 1.2386378775476e-06, + "loss": 0.69568372, + "num_input_tokens_seen": 227898875, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.19934082, + "step": 10571, + "time_per_iteration": 2.903986930847168 + }, + { + "auxiliary_loss_clip": 0.01430272, + "auxiliary_loss_mlp": 0.01036885, + "balance_loss_clip": 1.26195788, + "balance_loss_mlp": 1.01528406, + "epoch": 0.6356230272057718, + "flos": 17941275722880.0, + "grad_norm": 1.6740597913142996, + "language_loss": 0.71968341, + "learning_rate": 1.2382777542752074e-06, + "loss": 0.74435496, + "num_input_tokens_seen": 227917130, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.21618652, + "step": 10572, + "time_per_iteration": 2.8243138790130615 + }, + { + "auxiliary_loss_clip": 0.01417017, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.25312257, + "balance_loss_mlp": 1.0141319, + "epoch": 0.6356831504584398, + "flos": 25387163568000.0, + "grad_norm": 1.4754539912150084, + "language_loss": 0.81496662, + "learning_rate": 1.2379176598873992e-06, + "loss": 0.8394742, + "num_input_tokens_seen": 227939550, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19616699, + "step": 10573, + "time_per_iteration": 2.927483558654785 + }, + { + "auxiliary_loss_clip": 0.01426024, + "auxiliary_loss_mlp": 0.01041413, + "balance_loss_clip": 1.25889492, + "balance_loss_mlp": 1.02115965, + "epoch": 0.6357432737111077, + "flos": 46516931925120.0, + "grad_norm": 1.995876584604385, + "language_loss": 0.69803137, + "learning_rate": 1.2375575943978303e-06, + "loss": 0.72270572, + "num_input_tokens_seen": 227962200, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.20251465, + "step": 10574, + "time_per_iteration": 3.102177381515503 + }, + { + "auxiliary_loss_clip": 0.01422901, + "auxiliary_loss_mlp": 0.01037575, + "balance_loss_clip": 1.25821173, + "balance_loss_mlp": 1.01605785, + "epoch": 0.6358033969637757, + "flos": 17283151837440.0, + "grad_norm": 2.2642023523274086, + "language_loss": 0.87420493, + "learning_rate": 1.2371975578201525e-06, + "loss": 0.89880967, + "num_input_tokens_seen": 227979270, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.21533203, + "step": 10575, + "time_per_iteration": 2.8774237632751465 + }, + { + "auxiliary_loss_clip": 0.01432007, + "auxiliary_loss_mlp": 0.01040029, + "balance_loss_clip": 1.26723969, + "balance_loss_mlp": 1.02062201, + "epoch": 0.6358635202164437, + "flos": 27136866954240.0, + "grad_norm": 1.5471851917253399, + "language_loss": 0.72336543, + "learning_rate": 1.2368375501680204e-06, + "loss": 0.7480858, + "num_input_tokens_seen": 228000550, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19396973, + "step": 10576, + "time_per_iteration": 2.8966774940490723 + }, + { + "auxiliary_loss_clip": 0.01431117, + "auxiliary_loss_mlp": 0.01039761, + "balance_loss_clip": 1.26402283, + "balance_loss_mlp": 1.01930499, + "epoch": 0.6359236434691117, + "flos": 27536225973120.0, + "grad_norm": 2.681853939996624, + "language_loss": 0.69916642, + "learning_rate": 1.236477571455085e-06, + "loss": 0.72387522, + "num_input_tokens_seen": 228022005, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.20458984, + "step": 10577, + "time_per_iteration": 2.956602096557617 + }, + { + "auxiliary_loss_clip": 0.01416463, + "auxiliary_loss_mlp": 0.01036823, + "balance_loss_clip": 1.25184834, + "balance_loss_mlp": 1.01761866, + "epoch": 0.6359837667217797, + "flos": 39362819667840.0, + "grad_norm": 1.7625520190110893, + "language_loss": 0.73049545, + "learning_rate": 1.2361176216949964e-06, + "loss": 0.75502837, + "num_input_tokens_seen": 228043770, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19213867, + "step": 10578, + "time_per_iteration": 3.0018527507781982 + }, + { + "auxiliary_loss_clip": 0.01193301, + "auxiliary_loss_mlp": 0.01026446, + "balance_loss_clip": 1.10213971, + "balance_loss_mlp": 1.00737286, + "epoch": 0.6360438899744476, + "flos": 56439672624000.0, + "grad_norm": 0.7097616740963882, + "language_loss": 0.5457049, + "learning_rate": 1.2357577009014044e-06, + "loss": 0.56790239, + "num_input_tokens_seen": 228104985, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.19042969, + "step": 10579, + "time_per_iteration": 4.81935715675354 + }, + { + "auxiliary_loss_clip": 0.01409296, + "auxiliary_loss_mlp": 0.01036608, + "balance_loss_clip": 1.24564791, + "balance_loss_mlp": 1.01706934, + "epoch": 0.6361040132271156, + "flos": 24983958741120.0, + "grad_norm": 2.754332172615228, + "language_loss": 0.7817418, + "learning_rate": 1.2353978090879568e-06, + "loss": 0.80620086, + "num_input_tokens_seen": 228125620, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19555664, + "step": 10580, + "time_per_iteration": 2.8833963871002197 + }, + { + "auxiliary_loss_clip": 0.0142142, + "auxiliary_loss_mlp": 0.01031724, + "balance_loss_clip": 1.2572304, + "balance_loss_mlp": 1.01288867, + "epoch": 0.6361641364797835, + "flos": 23269845029760.0, + "grad_norm": 1.8998657723042809, + "language_loss": 0.67131698, + "learning_rate": 1.235037946268301e-06, + "loss": 0.69584846, + "num_input_tokens_seen": 228143495, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.18859863, + "step": 10581, + "time_per_iteration": 4.316204071044922 + }, + { + "auxiliary_loss_clip": 0.01408869, + "auxiliary_loss_mlp": 0.01036109, + "balance_loss_clip": 1.24583614, + "balance_loss_mlp": 1.01711893, + "epoch": 0.6362242597324516, + "flos": 26005580501760.0, + "grad_norm": 1.3440964918187783, + "language_loss": 0.69150364, + "learning_rate": 1.2346781124560828e-06, + "loss": 0.71595347, + "num_input_tokens_seen": 228166500, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.18981934, + "step": 10582, + "time_per_iteration": 4.347703456878662 + }, + { + "auxiliary_loss_clip": 0.01427786, + "auxiliary_loss_mlp": 0.01039761, + "balance_loss_clip": 1.26064873, + "balance_loss_mlp": 1.02040148, + "epoch": 0.6362843829851195, + "flos": 25714393096320.0, + "grad_norm": 1.7224745402780957, + "language_loss": 0.85513926, + "learning_rate": 1.2343183076649473e-06, + "loss": 0.87981468, + "num_input_tokens_seen": 228185325, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.19335938, + "step": 10583, + "time_per_iteration": 2.8928844928741455 + }, + { + "auxiliary_loss_clip": 0.01412062, + "auxiliary_loss_mlp": 0.01039988, + "balance_loss_clip": 1.25054669, + "balance_loss_mlp": 1.01947212, + "epoch": 0.6363445062377875, + "flos": 20532887948160.0, + "grad_norm": 1.5227455626226185, + "language_loss": 0.76192576, + "learning_rate": 1.233958531908538e-06, + "loss": 0.78644627, + "num_input_tokens_seen": 228204050, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.20507812, + "step": 10584, + "time_per_iteration": 2.8696115016937256 + }, + { + "auxiliary_loss_clip": 0.01417218, + "auxiliary_loss_mlp": 0.01038229, + "balance_loss_clip": 1.25055683, + "balance_loss_mlp": 1.01737976, + "epoch": 0.6364046294904554, + "flos": 19473233293440.0, + "grad_norm": 2.0630833970609532, + "language_loss": 0.73611951, + "learning_rate": 1.2335987852004985e-06, + "loss": 0.76067394, + "num_input_tokens_seen": 228222430, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.20825195, + "step": 10585, + "time_per_iteration": 2.868460178375244 + }, + { + "auxiliary_loss_clip": 0.01425073, + "auxiliary_loss_mlp": 0.01033077, + "balance_loss_clip": 1.26008201, + "balance_loss_mlp": 1.01374125, + "epoch": 0.6364647527431234, + "flos": 21005643312000.0, + "grad_norm": 1.7944357164275786, + "language_loss": 0.83554518, + "learning_rate": 1.2332390675544697e-06, + "loss": 0.86012673, + "num_input_tokens_seen": 228241925, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.19335938, + "step": 10586, + "time_per_iteration": 2.8866946697235107 + }, + { + "auxiliary_loss_clip": 0.01409184, + "auxiliary_loss_mlp": 0.01030315, + "balance_loss_clip": 1.24659169, + "balance_loss_mlp": 1.0110743, + "epoch": 0.6365248759957913, + "flos": 25780550273280.0, + "grad_norm": 1.5685212476703394, + "language_loss": 0.73136413, + "learning_rate": 1.2328793789840918e-06, + "loss": 0.75575912, + "num_input_tokens_seen": 228262535, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19238281, + "step": 10587, + "time_per_iteration": 2.958512306213379 + }, + { + "auxiliary_loss_clip": 0.01428326, + "auxiliary_loss_mlp": 0.0103329, + "balance_loss_clip": 1.26136446, + "balance_loss_mlp": 1.0147289, + "epoch": 0.6365849992484593, + "flos": 22465788105600.0, + "grad_norm": 1.936855784689104, + "language_loss": 0.77571714, + "learning_rate": 1.2325197195030058e-06, + "loss": 0.80033326, + "num_input_tokens_seen": 228281340, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.18566895, + "step": 10588, + "time_per_iteration": 2.9962804317474365 + }, + { + "auxiliary_loss_clip": 0.01400661, + "auxiliary_loss_mlp": 0.01031264, + "balance_loss_clip": 1.24122858, + "balance_loss_mlp": 1.01160669, + "epoch": 0.6366451225011273, + "flos": 19034936484480.0, + "grad_norm": 1.3668919056259032, + "language_loss": 0.80311882, + "learning_rate": 1.2321600891248478e-06, + "loss": 0.82743812, + "num_input_tokens_seen": 228300865, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.1965332, + "step": 10589, + "time_per_iteration": 2.9305076599121094 + }, + { + "auxiliary_loss_clip": 0.01415845, + "auxiliary_loss_mlp": 0.01031378, + "balance_loss_clip": 1.25303173, + "balance_loss_mlp": 1.01192307, + "epoch": 0.6367052457537953, + "flos": 25239285002880.0, + "grad_norm": 2.108781839617669, + "language_loss": 0.68065667, + "learning_rate": 1.231800487863257e-06, + "loss": 0.70512891, + "num_input_tokens_seen": 228320815, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19470215, + "step": 10590, + "time_per_iteration": 2.8894801139831543 + }, + { + "auxiliary_loss_clip": 0.01426443, + "auxiliary_loss_mlp": 0.01030727, + "balance_loss_clip": 1.25595176, + "balance_loss_mlp": 1.01109374, + "epoch": 0.6367653690064633, + "flos": 19217816542080.0, + "grad_norm": 1.7479076855526154, + "language_loss": 0.79758489, + "learning_rate": 1.2314409157318685e-06, + "loss": 0.82215655, + "num_input_tokens_seen": 228339065, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.19616699, + "step": 10591, + "time_per_iteration": 2.8550961017608643 + }, + { + "auxiliary_loss_clip": 0.01407532, + "auxiliary_loss_mlp": 0.01033057, + "balance_loss_clip": 1.24612641, + "balance_loss_mlp": 1.01417434, + "epoch": 0.6368254922591312, + "flos": 23556417465600.0, + "grad_norm": 1.7488806719252232, + "language_loss": 0.89462864, + "learning_rate": 1.231081372744317e-06, + "loss": 0.91903448, + "num_input_tokens_seen": 228359210, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18884277, + "step": 10592, + "time_per_iteration": 2.907463312149048 + }, + { + "auxiliary_loss_clip": 0.01402388, + "auxiliary_loss_mlp": 0.01030753, + "balance_loss_clip": 1.24125302, + "balance_loss_mlp": 1.01173925, + "epoch": 0.6368856155117992, + "flos": 26478109641600.0, + "grad_norm": 1.3397571402311352, + "language_loss": 0.69192624, + "learning_rate": 1.2307218589142376e-06, + "loss": 0.71625769, + "num_input_tokens_seen": 228379630, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.19006348, + "step": 10593, + "time_per_iteration": 2.932339668273926 + }, + { + "auxiliary_loss_clip": 0.01406732, + "auxiliary_loss_mlp": 0.01033421, + "balance_loss_clip": 1.24401164, + "balance_loss_mlp": 1.01391792, + "epoch": 0.6369457387644671, + "flos": 33705346775040.0, + "grad_norm": 1.7149351663367853, + "language_loss": 0.64017713, + "learning_rate": 1.2303623742552618e-06, + "loss": 0.66457868, + "num_input_tokens_seen": 228401410, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19494629, + "step": 10594, + "time_per_iteration": 3.0017402172088623 + }, + { + "auxiliary_loss_clip": 0.01190368, + "auxiliary_loss_mlp": 0.0103265, + "balance_loss_clip": 1.09842455, + "balance_loss_mlp": 1.01252711, + "epoch": 0.6370058620171352, + "flos": 70940821541760.0, + "grad_norm": 0.7870682531777012, + "language_loss": 0.54687274, + "learning_rate": 1.230002918781022e-06, + "loss": 0.56910294, + "num_input_tokens_seen": 228470335, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.20117188, + "step": 10595, + "time_per_iteration": 3.5273349285125732 + }, + { + "auxiliary_loss_clip": 0.01430058, + "auxiliary_loss_mlp": 0.01034587, + "balance_loss_clip": 1.26218736, + "balance_loss_mlp": 1.01487017, + "epoch": 0.6370659852698031, + "flos": 21151757329920.0, + "grad_norm": 1.7176365065226327, + "language_loss": 0.67989755, + "learning_rate": 1.2296434925051493e-06, + "loss": 0.70454401, + "num_input_tokens_seen": 228490765, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.19714355, + "step": 10596, + "time_per_iteration": 2.892890214920044 + }, + { + "auxiliary_loss_clip": 0.01409918, + "auxiliary_loss_mlp": 0.01031553, + "balance_loss_clip": 1.24642992, + "balance_loss_mlp": 1.01145482, + "epoch": 0.6371261085224711, + "flos": 20202853242240.0, + "grad_norm": 2.1852321942164568, + "language_loss": 0.80372751, + "learning_rate": 1.2292840954412718e-06, + "loss": 0.82814223, + "num_input_tokens_seen": 228509700, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.20092773, + "step": 10597, + "time_per_iteration": 2.9108800888061523 + }, + { + "auxiliary_loss_clip": 0.01426753, + "auxiliary_loss_mlp": 0.01035943, + "balance_loss_clip": 1.26053691, + "balance_loss_mlp": 1.01611876, + "epoch": 0.637186231775139, + "flos": 19693241349120.0, + "grad_norm": 1.6315838106475793, + "language_loss": 0.74797535, + "learning_rate": 1.2289247276030189e-06, + "loss": 0.77260226, + "num_input_tokens_seen": 228529050, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.19824219, + "step": 10598, + "time_per_iteration": 2.8771023750305176 + }, + { + "auxiliary_loss_clip": 0.01418334, + "auxiliary_loss_mlp": 0.01034929, + "balance_loss_clip": 1.25267899, + "balance_loss_mlp": 1.01477051, + "epoch": 0.637246355027807, + "flos": 13077001002240.0, + "grad_norm": 3.293320990180704, + "language_loss": 0.690162, + "learning_rate": 1.2285653890040176e-06, + "loss": 0.71469462, + "num_input_tokens_seen": 228544665, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20178223, + "step": 10599, + "time_per_iteration": 2.875793695449829 + }, + { + "auxiliary_loss_clip": 0.01428189, + "auxiliary_loss_mlp": 0.01039393, + "balance_loss_clip": 1.25862646, + "balance_loss_mlp": 1.01917505, + "epoch": 0.6373064782804749, + "flos": 18231603477120.0, + "grad_norm": 2.275020919271935, + "language_loss": 0.81714725, + "learning_rate": 1.2282060796578942e-06, + "loss": 0.8418231, + "num_input_tokens_seen": 228562060, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.20214844, + "step": 10600, + "time_per_iteration": 2.861290693283081 + }, + { + "auxiliary_loss_clip": 0.01409905, + "auxiliary_loss_mlp": 0.01034449, + "balance_loss_clip": 1.24651229, + "balance_loss_mlp": 1.01412392, + "epoch": 0.637366601533143, + "flos": 24509122116480.0, + "grad_norm": 1.4510349697095504, + "language_loss": 0.80258715, + "learning_rate": 1.2278467995782732e-06, + "loss": 0.82703066, + "num_input_tokens_seen": 228582550, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.20336914, + "step": 10601, + "time_per_iteration": 2.929255247116089 + }, + { + "auxiliary_loss_clip": 0.01416891, + "auxiliary_loss_mlp": 0.01034052, + "balance_loss_clip": 1.25001729, + "balance_loss_mlp": 1.01383436, + "epoch": 0.6374267247858109, + "flos": 26370119007360.0, + "grad_norm": 2.105873162889238, + "language_loss": 0.67679965, + "learning_rate": 1.2274875487787797e-06, + "loss": 0.70130908, + "num_input_tokens_seen": 228604960, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.20214844, + "step": 10602, + "time_per_iteration": 2.916118621826172 + }, + { + "auxiliary_loss_clip": 0.0141652, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.25027072, + "balance_loss_mlp": 1.01559663, + "epoch": 0.6374868480384789, + "flos": 20380303923840.0, + "grad_norm": 1.650262179358348, + "language_loss": 0.80250895, + "learning_rate": 1.2271283272730354e-06, + "loss": 0.82702506, + "num_input_tokens_seen": 228622195, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.19494629, + "step": 10603, + "time_per_iteration": 2.870232105255127 + }, + { + "auxiliary_loss_clip": 0.01416429, + "auxiliary_loss_mlp": 0.01031619, + "balance_loss_clip": 1.25123394, + "balance_loss_mlp": 1.01099634, + "epoch": 0.6375469712911469, + "flos": 21006095760000.0, + "grad_norm": 1.9192523967253292, + "language_loss": 0.7809096, + "learning_rate": 1.2267691350746621e-06, + "loss": 0.80539006, + "num_input_tokens_seen": 228639735, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.20629883, + "step": 10604, + "time_per_iteration": 4.327500581741333 + }, + { + "auxiliary_loss_clip": 0.01430147, + "auxiliary_loss_mlp": 0.01032703, + "balance_loss_clip": 1.26006079, + "balance_loss_mlp": 1.01334321, + "epoch": 0.6376070945438148, + "flos": 19724351788800.0, + "grad_norm": 1.6029820786924245, + "language_loss": 0.77452469, + "learning_rate": 1.226409972197281e-06, + "loss": 0.79915321, + "num_input_tokens_seen": 228658195, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.19360352, + "step": 10605, + "time_per_iteration": 3.0158066749572754 + }, + { + "auxiliary_loss_clip": 0.01425567, + "auxiliary_loss_mlp": 0.01033239, + "balance_loss_clip": 1.25685799, + "balance_loss_mlp": 1.01198435, + "epoch": 0.6376672177964828, + "flos": 21516657793920.0, + "grad_norm": 2.324611814441127, + "language_loss": 0.66280955, + "learning_rate": 1.2260508386545106e-06, + "loss": 0.6873976, + "num_input_tokens_seen": 228677415, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.21264648, + "step": 10606, + "time_per_iteration": 2.919983386993408 + }, + { + "auxiliary_loss_clip": 0.0139559, + "auxiliary_loss_mlp": 0.01036854, + "balance_loss_clip": 1.23555243, + "balance_loss_mlp": 1.01648116, + "epoch": 0.6377273410491507, + "flos": 18853232791680.0, + "grad_norm": 1.5589402164559836, + "language_loss": 0.75836182, + "learning_rate": 1.225691734459971e-06, + "loss": 0.78268623, + "num_input_tokens_seen": 228696450, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.20361328, + "step": 10607, + "time_per_iteration": 2.8723268508911133 + }, + { + "auxiliary_loss_clip": 0.01427652, + "auxiliary_loss_mlp": 0.01040659, + "balance_loss_clip": 1.26156688, + "balance_loss_mlp": 1.02097714, + "epoch": 0.6377874643018188, + "flos": 53080027614720.0, + "grad_norm": 5.980226397178839, + "language_loss": 0.66144705, + "learning_rate": 1.225332659627278e-06, + "loss": 0.68613017, + "num_input_tokens_seen": 228721600, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.19689941, + "step": 10608, + "time_per_iteration": 3.1558682918548584 + }, + { + "auxiliary_loss_clip": 0.01190254, + "auxiliary_loss_mlp": 0.01021066, + "balance_loss_clip": 1.09873652, + "balance_loss_mlp": 1.00046635, + "epoch": 0.6378475875544867, + "flos": 65163458632320.0, + "grad_norm": 0.7109534433152319, + "language_loss": 0.51855457, + "learning_rate": 1.2249736141700475e-06, + "loss": 0.54066765, + "num_input_tokens_seen": 228784535, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.20605469, + "step": 10609, + "time_per_iteration": 3.384315013885498 + }, + { + "auxiliary_loss_clip": 0.01401672, + "auxiliary_loss_mlp": 0.01033614, + "balance_loss_clip": 1.2394278, + "balance_loss_mlp": 1.01489842, + "epoch": 0.6379077108071547, + "flos": 23013070934400.0, + "grad_norm": 1.5666766323443047, + "language_loss": 0.75608134, + "learning_rate": 1.2246145981018965e-06, + "loss": 0.78043419, + "num_input_tokens_seen": 228804110, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.18737793, + "step": 10610, + "time_per_iteration": 2.921985149383545 + }, + { + "auxiliary_loss_clip": 0.01191757, + "auxiliary_loss_mlp": 0.01021953, + "balance_loss_clip": 1.10152173, + "balance_loss_mlp": 1.00097179, + "epoch": 0.6379678340598226, + "flos": 67636827636480.0, + "grad_norm": 0.8503768416156406, + "language_loss": 0.63187242, + "learning_rate": 1.2242556114364364e-06, + "loss": 0.65400958, + "num_input_tokens_seen": 228867705, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.20996094, + "step": 10611, + "time_per_iteration": 3.3566787242889404 + }, + { + "auxiliary_loss_clip": 0.01417855, + "auxiliary_loss_mlp": 0.01034476, + "balance_loss_clip": 1.25246584, + "balance_loss_mlp": 1.01522398, + "epoch": 0.6380279573124906, + "flos": 29692210832640.0, + "grad_norm": 2.198029529750739, + "language_loss": 0.73653013, + "learning_rate": 1.223896654187282e-06, + "loss": 0.76105344, + "num_input_tokens_seen": 228889215, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19250488, + "step": 10612, + "time_per_iteration": 2.974773406982422 + }, + { + "auxiliary_loss_clip": 0.01190947, + "auxiliary_loss_mlp": 0.01014853, + "balance_loss_clip": 1.10014606, + "balance_loss_mlp": 0.99816418, + "epoch": 0.6380880805651585, + "flos": 66512581390080.0, + "grad_norm": 0.7174302918635455, + "language_loss": 0.57945591, + "learning_rate": 1.2235377263680446e-06, + "loss": 0.60151386, + "num_input_tokens_seen": 228948465, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.16699219, + "step": 10613, + "time_per_iteration": 3.201345205307007 + }, + { + "auxiliary_loss_clip": 0.01420807, + "auxiliary_loss_mlp": 0.01035884, + "balance_loss_clip": 1.25543833, + "balance_loss_mlp": 1.0158093, + "epoch": 0.6381482038178266, + "flos": 23925570940800.0, + "grad_norm": 1.9732364729060767, + "language_loss": 0.76043212, + "learning_rate": 1.2231788279923334e-06, + "loss": 0.78499901, + "num_input_tokens_seen": 228967955, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.20080566, + "step": 10614, + "time_per_iteration": 4.344970703125 + }, + { + "auxiliary_loss_clip": 0.01410409, + "auxiliary_loss_mlp": 0.01036769, + "balance_loss_clip": 1.24769926, + "balance_loss_mlp": 1.01673031, + "epoch": 0.6382083270704945, + "flos": 24253614875520.0, + "grad_norm": 2.262292130660365, + "language_loss": 0.80941999, + "learning_rate": 1.2228199590737599e-06, + "loss": 0.83389175, + "num_input_tokens_seen": 228985495, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.20031738, + "step": 10615, + "time_per_iteration": 2.909511089324951 + }, + { + "auxiliary_loss_clip": 0.01192863, + "auxiliary_loss_mlp": 0.01024557, + "balance_loss_clip": 1.10013855, + "balance_loss_mlp": 1.0005244, + "epoch": 0.6382684503231625, + "flos": 70811086141440.0, + "grad_norm": 0.6582320655047929, + "language_loss": 0.55674541, + "learning_rate": 1.2224611196259305e-06, + "loss": 0.57891959, + "num_input_tokens_seen": 229052995, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.24023438, + "step": 10616, + "time_per_iteration": 3.352346658706665 + }, + { + "auxiliary_loss_clip": 0.01429428, + "auxiliary_loss_mlp": 0.01035201, + "balance_loss_clip": 1.2627126, + "balance_loss_mlp": 1.01542401, + "epoch": 0.6383285735758305, + "flos": 16553486643840.0, + "grad_norm": 4.783911857562708, + "language_loss": 0.85377115, + "learning_rate": 1.2221023096624538e-06, + "loss": 0.87841743, + "num_input_tokens_seen": 229071030, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.19775391, + "step": 10617, + "time_per_iteration": 5.810904026031494 + }, + { + "auxiliary_loss_clip": 0.01421031, + "auxiliary_loss_mlp": 0.01038587, + "balance_loss_clip": 1.2545526, + "balance_loss_mlp": 1.01854837, + "epoch": 0.6383886968284984, + "flos": 14435489433600.0, + "grad_norm": 1.772428926348425, + "language_loss": 0.8751663, + "learning_rate": 1.221743529196936e-06, + "loss": 0.89976251, + "num_input_tokens_seen": 229088275, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.20043945, + "step": 10618, + "time_per_iteration": 2.852480173110962 + }, + { + "auxiliary_loss_clip": 0.0142712, + "auxiliary_loss_mlp": 0.01036261, + "balance_loss_clip": 1.2609098, + "balance_loss_mlp": 1.01736641, + "epoch": 0.6384488200811664, + "flos": 17938244321280.0, + "grad_norm": 2.4304630797388436, + "language_loss": 0.74075896, + "learning_rate": 1.2213847782429806e-06, + "loss": 0.76539278, + "num_input_tokens_seen": 229105190, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.18884277, + "step": 10619, + "time_per_iteration": 2.8696787357330322 + }, + { + "auxiliary_loss_clip": 0.0144455, + "auxiliary_loss_mlp": 0.01043364, + "balance_loss_clip": 1.2713306, + "balance_loss_mlp": 1.02162051, + "epoch": 0.6385089433338343, + "flos": 18524872143360.0, + "grad_norm": 1.9117435515245458, + "language_loss": 0.76809472, + "learning_rate": 1.221026056814193e-06, + "loss": 0.79297388, + "num_input_tokens_seen": 229122290, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.21728516, + "step": 10620, + "time_per_iteration": 2.834789752960205 + }, + { + "auxiliary_loss_clip": 0.01424594, + "auxiliary_loss_mlp": 0.01034016, + "balance_loss_clip": 1.25993049, + "balance_loss_mlp": 1.01450121, + "epoch": 0.6385690665865024, + "flos": 24764267399040.0, + "grad_norm": 2.354045156599609, + "language_loss": 0.72596782, + "learning_rate": 1.2206673649241752e-06, + "loss": 0.75055391, + "num_input_tokens_seen": 229141620, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19506836, + "step": 10621, + "time_per_iteration": 2.8915207386016846 + }, + { + "auxiliary_loss_clip": 0.01395112, + "auxiliary_loss_mlp": 0.01029838, + "balance_loss_clip": 1.23581576, + "balance_loss_mlp": 1.01076484, + "epoch": 0.6386291898391703, + "flos": 20130180814080.0, + "grad_norm": 1.7326989499827687, + "language_loss": 0.788149, + "learning_rate": 1.220308702586529e-06, + "loss": 0.81239855, + "num_input_tokens_seen": 229161570, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19055176, + "step": 10622, + "time_per_iteration": 2.897085189819336 + }, + { + "auxiliary_loss_clip": 0.01400834, + "auxiliary_loss_mlp": 0.01031602, + "balance_loss_clip": 1.23960865, + "balance_loss_mlp": 1.01289821, + "epoch": 0.6386893130918383, + "flos": 16874472389760.0, + "grad_norm": 1.8656485860660403, + "language_loss": 0.75478286, + "learning_rate": 1.2199500698148546e-06, + "loss": 0.77910721, + "num_input_tokens_seen": 229178465, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.18676758, + "step": 10623, + "time_per_iteration": 2.8348002433776855 + }, + { + "auxiliary_loss_clip": 0.01404102, + "auxiliary_loss_mlp": 0.01034302, + "balance_loss_clip": 1.2434181, + "balance_loss_mlp": 1.01526415, + "epoch": 0.6387494363445062, + "flos": 22976576363520.0, + "grad_norm": 2.105012327800066, + "language_loss": 0.76819652, + "learning_rate": 1.2195914666227527e-06, + "loss": 0.79258054, + "num_input_tokens_seen": 229198975, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19018555, + "step": 10624, + "time_per_iteration": 2.9461028575897217 + }, + { + "auxiliary_loss_clip": 0.01421476, + "auxiliary_loss_mlp": 0.01036234, + "balance_loss_clip": 1.25560713, + "balance_loss_mlp": 1.01604056, + "epoch": 0.6388095595971742, + "flos": 22868223770880.0, + "grad_norm": 1.8815024198633832, + "language_loss": 0.81112021, + "learning_rate": 1.21923289302382e-06, + "loss": 0.83569729, + "num_input_tokens_seen": 229218825, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.20202637, + "step": 10625, + "time_per_iteration": 2.892503261566162 + }, + { + "auxiliary_loss_clip": 0.01435147, + "auxiliary_loss_mlp": 0.01041965, + "balance_loss_clip": 1.2671243, + "balance_loss_mlp": 1.02131808, + "epoch": 0.6388696828498421, + "flos": 17320506059520.0, + "grad_norm": 4.8590372101689425, + "language_loss": 0.73667407, + "learning_rate": 1.218874349031654e-06, + "loss": 0.76144516, + "num_input_tokens_seen": 229236060, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.2064209, + "step": 10626, + "time_per_iteration": 2.8904950618743896 + }, + { + "auxiliary_loss_clip": 0.01431019, + "auxiliary_loss_mlp": 0.01035109, + "balance_loss_clip": 1.26452756, + "balance_loss_mlp": 1.01484311, + "epoch": 0.6389298061025102, + "flos": 17137445022720.0, + "grad_norm": 1.7914528375793113, + "language_loss": 0.73458624, + "learning_rate": 1.2185158346598517e-06, + "loss": 0.75924754, + "num_input_tokens_seen": 229255160, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.20263672, + "step": 10627, + "time_per_iteration": 2.8895771503448486 + }, + { + "auxiliary_loss_clip": 0.01441741, + "auxiliary_loss_mlp": 0.01039246, + "balance_loss_clip": 1.27066541, + "balance_loss_mlp": 1.01762176, + "epoch": 0.6389899293551781, + "flos": 27722906593920.0, + "grad_norm": 1.6079667470696684, + "language_loss": 0.67835248, + "learning_rate": 1.2181573499220064e-06, + "loss": 0.70316231, + "num_input_tokens_seen": 229278705, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.21630859, + "step": 10628, + "time_per_iteration": 2.922659158706665 + }, + { + "auxiliary_loss_clip": 0.0140187, + "auxiliary_loss_mlp": 0.01034189, + "balance_loss_clip": 1.24035263, + "balance_loss_mlp": 1.01469791, + "epoch": 0.6390500526078461, + "flos": 21225741857280.0, + "grad_norm": 1.763853472525078, + "language_loss": 0.68809628, + "learning_rate": 1.2177988948317135e-06, + "loss": 0.71245688, + "num_input_tokens_seen": 229299990, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19470215, + "step": 10629, + "time_per_iteration": 2.912322998046875 + }, + { + "auxiliary_loss_clip": 0.01437087, + "auxiliary_loss_mlp": 0.01043236, + "balance_loss_clip": 1.26388943, + "balance_loss_mlp": 1.02114654, + "epoch": 0.6391101758605141, + "flos": 21591320993280.0, + "grad_norm": 1.5218047276149684, + "language_loss": 0.75778008, + "learning_rate": 1.2174404694025646e-06, + "loss": 0.78258324, + "num_input_tokens_seen": 229319230, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.22094727, + "step": 10630, + "time_per_iteration": 2.944474935531616 + }, + { + "auxiliary_loss_clip": 0.01415585, + "auxiliary_loss_mlp": 0.010386, + "balance_loss_clip": 1.25176167, + "balance_loss_mlp": 1.01865625, + "epoch": 0.639170299113182, + "flos": 19909720310400.0, + "grad_norm": 1.6302352487402056, + "language_loss": 0.70919836, + "learning_rate": 1.2170820736481511e-06, + "loss": 0.73374021, + "num_input_tokens_seen": 229338600, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19934082, + "step": 10631, + "time_per_iteration": 2.9247424602508545 + }, + { + "auxiliary_loss_clip": 0.01186176, + "auxiliary_loss_mlp": 0.01027647, + "balance_loss_clip": 1.09630311, + "balance_loss_mlp": 1.00533152, + "epoch": 0.63923042236585, + "flos": 69907119419520.0, + "grad_norm": 0.7736767569588264, + "language_loss": 0.63027972, + "learning_rate": 1.2167237075820646e-06, + "loss": 0.65241796, + "num_input_tokens_seen": 229402420, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.22363281, + "step": 10632, + "time_per_iteration": 3.394996404647827 + }, + { + "auxiliary_loss_clip": 0.01411491, + "auxiliary_loss_mlp": 0.01039406, + "balance_loss_clip": 1.24773991, + "balance_loss_mlp": 1.0188663, + "epoch": 0.639290545618518, + "flos": 22685162734080.0, + "grad_norm": 1.7549325404893397, + "language_loss": 0.67521405, + "learning_rate": 1.216365371217893e-06, + "loss": 0.69972301, + "num_input_tokens_seen": 229419185, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.20544434, + "step": 10633, + "time_per_iteration": 2.8727967739105225 + }, + { + "auxiliary_loss_clip": 0.01418026, + "auxiliary_loss_mlp": 0.01034473, + "balance_loss_clip": 1.25359845, + "balance_loss_mlp": 1.01448166, + "epoch": 0.639350668871186, + "flos": 19838857674240.0, + "grad_norm": 1.8832953461727837, + "language_loss": 0.82526946, + "learning_rate": 1.216007064569225e-06, + "loss": 0.84979451, + "num_input_tokens_seen": 229436735, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19995117, + "step": 10634, + "time_per_iteration": 2.8772552013397217 + }, + { + "auxiliary_loss_clip": 0.01423851, + "auxiliary_loss_mlp": 0.01035853, + "balance_loss_clip": 1.25953674, + "balance_loss_mlp": 1.01608789, + "epoch": 0.6394107921238539, + "flos": 20561736147840.0, + "grad_norm": 1.6351288090620961, + "language_loss": 0.75842631, + "learning_rate": 1.2156487876496483e-06, + "loss": 0.7830233, + "num_input_tokens_seen": 229455595, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19763184, + "step": 10635, + "time_per_iteration": 2.8638575077056885 + }, + { + "auxiliary_loss_clip": 0.01419533, + "auxiliary_loss_mlp": 0.01037367, + "balance_loss_clip": 1.25356746, + "balance_loss_mlp": 1.01630259, + "epoch": 0.6394709153765219, + "flos": 25785843914880.0, + "grad_norm": 1.9103929695053528, + "language_loss": 0.72311699, + "learning_rate": 1.2152905404727475e-06, + "loss": 0.74768591, + "num_input_tokens_seen": 229476230, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.21057129, + "step": 10636, + "time_per_iteration": 2.9158835411071777 + }, + { + "auxiliary_loss_clip": 0.01436455, + "auxiliary_loss_mlp": 0.01038202, + "balance_loss_clip": 1.26676583, + "balance_loss_mlp": 1.0181272, + "epoch": 0.6395310386291898, + "flos": 17539111526400.0, + "grad_norm": 1.7827667256817197, + "language_loss": 0.74897647, + "learning_rate": 1.2149323230521085e-06, + "loss": 0.77372301, + "num_input_tokens_seen": 229494300, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.20068359, + "step": 10637, + "time_per_iteration": 2.8634557723999023 + }, + { + "auxiliary_loss_clip": 0.01429432, + "auxiliary_loss_mlp": 0.01036246, + "balance_loss_clip": 1.26219583, + "balance_loss_mlp": 1.01617146, + "epoch": 0.6395911618818578, + "flos": 18597454081920.0, + "grad_norm": 1.7638654148147994, + "language_loss": 0.787718, + "learning_rate": 1.2145741354013143e-06, + "loss": 0.81237477, + "num_input_tokens_seen": 229512985, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.20080566, + "step": 10638, + "time_per_iteration": 4.283633708953857 + }, + { + "auxiliary_loss_clip": 0.01419982, + "auxiliary_loss_mlp": 0.01038017, + "balance_loss_clip": 1.25548196, + "balance_loss_mlp": 1.01789415, + "epoch": 0.6396512851345257, + "flos": 28378361036160.0, + "grad_norm": 1.5567233425345377, + "language_loss": 0.82814515, + "learning_rate": 1.2142159775339478e-06, + "loss": 0.85272509, + "num_input_tokens_seen": 229534270, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.20117188, + "step": 10639, + "time_per_iteration": 2.9391729831695557 + }, + { + "auxiliary_loss_clip": 0.01188109, + "auxiliary_loss_mlp": 0.0101556, + "balance_loss_clip": 1.09906125, + "balance_loss_mlp": 0.99543744, + "epoch": 0.6397114083871938, + "flos": 70755995957760.0, + "grad_norm": 0.8322793018720489, + "language_loss": 0.59084594, + "learning_rate": 1.21385784946359e-06, + "loss": 0.61288267, + "num_input_tokens_seen": 229596455, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.20117188, + "step": 10640, + "time_per_iteration": 3.330916166305542 + }, + { + "auxiliary_loss_clip": 0.01411993, + "auxiliary_loss_mlp": 0.01030131, + "balance_loss_clip": 1.25044847, + "balance_loss_mlp": 1.01173711, + "epoch": 0.6397715316398617, + "flos": 18149610620160.0, + "grad_norm": 1.8221305477584664, + "language_loss": 0.79428267, + "learning_rate": 1.2134997512038215e-06, + "loss": 0.81870389, + "num_input_tokens_seen": 229612860, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18395996, + "step": 10641, + "time_per_iteration": 2.8732833862304688 + }, + { + "auxiliary_loss_clip": 0.0143476, + "auxiliary_loss_mlp": 0.01032623, + "balance_loss_clip": 1.26246238, + "balance_loss_mlp": 1.01316845, + "epoch": 0.6398316548925297, + "flos": 25750616198400.0, + "grad_norm": 1.6235313928436281, + "language_loss": 0.64310747, + "learning_rate": 1.2131416827682209e-06, + "loss": 0.66778135, + "num_input_tokens_seen": 229633960, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.19458008, + "step": 10642, + "time_per_iteration": 2.961947202682495 + }, + { + "auxiliary_loss_clip": 0.01185897, + "auxiliary_loss_mlp": 0.01032676, + "balance_loss_clip": 1.09587097, + "balance_loss_mlp": 1.0136981, + "epoch": 0.6398917781451977, + "flos": 71240632721280.0, + "grad_norm": 1.1528374812429372, + "language_loss": 0.56034714, + "learning_rate": 1.2127836441703667e-06, + "loss": 0.58253288, + "num_input_tokens_seen": 229686730, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.18945312, + "step": 10643, + "time_per_iteration": 3.2471749782562256 + }, + { + "auxiliary_loss_clip": 0.01445083, + "auxiliary_loss_mlp": 0.0103181, + "balance_loss_clip": 1.2747376, + "balance_loss_mlp": 1.01310587, + "epoch": 0.6399519013978656, + "flos": 20531530604160.0, + "grad_norm": 1.7599455108568705, + "language_loss": 0.77586806, + "learning_rate": 1.2124256354238358e-06, + "loss": 0.80063701, + "num_input_tokens_seen": 229704800, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.18713379, + "step": 10644, + "time_per_iteration": 2.9127166271209717 + }, + { + "auxiliary_loss_clip": 0.01424452, + "auxiliary_loss_mlp": 0.01038549, + "balance_loss_clip": 1.26067805, + "balance_loss_mlp": 1.01792586, + "epoch": 0.6400120246505336, + "flos": 24471179712000.0, + "grad_norm": 1.4673891422580556, + "language_loss": 0.82872653, + "learning_rate": 1.212067656542203e-06, + "loss": 0.85335654, + "num_input_tokens_seen": 229725265, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.2064209, + "step": 10645, + "time_per_iteration": 2.9292383193969727 + }, + { + "auxiliary_loss_clip": 0.0142456, + "auxiliary_loss_mlp": 0.01038865, + "balance_loss_clip": 1.25484347, + "balance_loss_mlp": 1.01821744, + "epoch": 0.6400721479032015, + "flos": 28377772853760.0, + "grad_norm": 2.5186558208453578, + "language_loss": 0.74340606, + "learning_rate": 1.2117097075390447e-06, + "loss": 0.7680403, + "num_input_tokens_seen": 229744840, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.2064209, + "step": 10646, + "time_per_iteration": 2.918893337249756 + }, + { + "auxiliary_loss_clip": 0.0142159, + "auxiliary_loss_mlp": 0.01037106, + "balance_loss_clip": 1.256845, + "balance_loss_mlp": 1.01753163, + "epoch": 0.6401322711558696, + "flos": 17824145639040.0, + "grad_norm": 2.2304574692842, + "language_loss": 0.80047709, + "learning_rate": 1.2113517884279327e-06, + "loss": 0.82506406, + "num_input_tokens_seen": 229759095, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19567871, + "step": 10647, + "time_per_iteration": 3.002793550491333 + }, + { + "auxiliary_loss_clip": 0.01425092, + "auxiliary_loss_mlp": 0.01034311, + "balance_loss_clip": 1.26329124, + "balance_loss_mlp": 1.01545215, + "epoch": 0.6401923944085375, + "flos": 26041894093440.0, + "grad_norm": 1.6022407143921893, + "language_loss": 0.76441002, + "learning_rate": 1.2109938992224399e-06, + "loss": 0.78900409, + "num_input_tokens_seen": 229777750, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.18859863, + "step": 10648, + "time_per_iteration": 2.9292759895324707 + }, + { + "auxiliary_loss_clip": 0.01416814, + "auxiliary_loss_mlp": 0.01033693, + "balance_loss_clip": 1.25257564, + "balance_loss_mlp": 1.01485825, + "epoch": 0.6402525176612055, + "flos": 23596893578880.0, + "grad_norm": 3.0806887890717416, + "language_loss": 0.79655683, + "learning_rate": 1.210636039936138e-06, + "loss": 0.82106185, + "num_input_tokens_seen": 229796785, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.18835449, + "step": 10649, + "time_per_iteration": 4.324416637420654 + }, + { + "auxiliary_loss_clip": 0.0142662, + "auxiliary_loss_mlp": 0.01042193, + "balance_loss_clip": 1.2605722, + "balance_loss_mlp": 1.02213025, + "epoch": 0.6403126409138734, + "flos": 18050623701120.0, + "grad_norm": 1.7752478916176784, + "language_loss": 0.75973666, + "learning_rate": 1.2102782105825956e-06, + "loss": 0.78442478, + "num_input_tokens_seen": 229815425, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.20043945, + "step": 10650, + "time_per_iteration": 2.828914165496826 + }, + { + "auxiliary_loss_clip": 0.01412915, + "auxiliary_loss_mlp": 0.01032626, + "balance_loss_clip": 1.25132704, + "balance_loss_mlp": 1.01294398, + "epoch": 0.6403727641665414, + "flos": 21989096444160.0, + "grad_norm": 1.421939643236227, + "language_loss": 0.71454763, + "learning_rate": 1.2099204111753833e-06, + "loss": 0.739003, + "num_input_tokens_seen": 229834545, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19677734, + "step": 10651, + "time_per_iteration": 2.853471279144287 + }, + { + "auxiliary_loss_clip": 0.0142193, + "auxiliary_loss_mlp": 0.01038979, + "balance_loss_clip": 1.25642169, + "balance_loss_mlp": 1.01954782, + "epoch": 0.6404328874192093, + "flos": 24905178264960.0, + "grad_norm": 2.3841513333747595, + "language_loss": 0.64629447, + "learning_rate": 1.2095626417280684e-06, + "loss": 0.67090356, + "num_input_tokens_seen": 229849175, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.19421387, + "step": 10652, + "time_per_iteration": 5.7885777950286865 + }, + { + "auxiliary_loss_clip": 0.01428772, + "auxiliary_loss_mlp": 0.01034069, + "balance_loss_clip": 1.26294708, + "balance_loss_mlp": 1.01513839, + "epoch": 0.6404930106718774, + "flos": 17604635276160.0, + "grad_norm": 1.9232193597009615, + "language_loss": 0.80037957, + "learning_rate": 1.2092049022542168e-06, + "loss": 0.82500798, + "num_input_tokens_seen": 229865400, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.18945312, + "step": 10653, + "time_per_iteration": 2.847811460494995 + }, + { + "auxiliary_loss_clip": 0.01457545, + "auxiliary_loss_mlp": 0.01040116, + "balance_loss_clip": 1.28304648, + "balance_loss_mlp": 1.01985049, + "epoch": 0.6405531339245453, + "flos": 20167716015360.0, + "grad_norm": 2.2960601970604086, + "language_loss": 0.72050655, + "learning_rate": 1.2088471927673952e-06, + "loss": 0.74548328, + "num_input_tokens_seen": 229882945, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.20263672, + "step": 10654, + "time_per_iteration": 2.8639028072357178 + }, + { + "auxiliary_loss_clip": 0.01440767, + "auxiliary_loss_mlp": 0.01037564, + "balance_loss_clip": 1.26968098, + "balance_loss_mlp": 1.01698852, + "epoch": 0.6406132571772133, + "flos": 21951696977280.0, + "grad_norm": 1.7498641606610295, + "language_loss": 0.73086828, + "learning_rate": 1.2084895132811666e-06, + "loss": 0.75565159, + "num_input_tokens_seen": 229901590, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.20556641, + "step": 10655, + "time_per_iteration": 2.8841168880462646 + }, + { + "auxiliary_loss_clip": 0.01422337, + "auxiliary_loss_mlp": 0.01037241, + "balance_loss_clip": 1.25652099, + "balance_loss_mlp": 1.01732111, + "epoch": 0.6406733804298813, + "flos": 28779937050240.0, + "grad_norm": 1.500264076200686, + "language_loss": 0.83653414, + "learning_rate": 1.2081318638090952e-06, + "loss": 0.86112994, + "num_input_tokens_seen": 229922535, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.19909668, + "step": 10656, + "time_per_iteration": 2.934028148651123 + }, + { + "auxiliary_loss_clip": 0.01428696, + "auxiliary_loss_mlp": 0.01039691, + "balance_loss_clip": 1.26124632, + "balance_loss_mlp": 1.0196991, + "epoch": 0.6407335036825492, + "flos": 17466122384640.0, + "grad_norm": 3.7321961948797844, + "language_loss": 0.73102641, + "learning_rate": 1.2077742443647433e-06, + "loss": 0.75571024, + "num_input_tokens_seen": 229939575, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.19995117, + "step": 10657, + "time_per_iteration": 2.8729894161224365 + }, + { + "auxiliary_loss_clip": 0.01433272, + "auxiliary_loss_mlp": 0.01039336, + "balance_loss_clip": 1.26715958, + "balance_loss_mlp": 1.01986885, + "epoch": 0.6407936269352172, + "flos": 22134893748480.0, + "grad_norm": 1.6013287787312804, + "language_loss": 0.77738333, + "learning_rate": 1.2074166549616707e-06, + "loss": 0.80210936, + "num_input_tokens_seen": 229958840, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.19470215, + "step": 10658, + "time_per_iteration": 2.8620684146881104 + }, + { + "auxiliary_loss_clip": 0.01428599, + "auxiliary_loss_mlp": 0.01035545, + "balance_loss_clip": 1.26034474, + "balance_loss_mlp": 1.01528001, + "epoch": 0.6408537501878852, + "flos": 23120563875840.0, + "grad_norm": 1.6232958390410503, + "language_loss": 0.76657844, + "learning_rate": 1.2070590956134386e-06, + "loss": 0.79121989, + "num_input_tokens_seen": 229979680, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.20288086, + "step": 10659, + "time_per_iteration": 2.890638828277588 + }, + { + "auxiliary_loss_clip": 0.01428027, + "auxiliary_loss_mlp": 0.01035847, + "balance_loss_clip": 1.26003706, + "balance_loss_mlp": 1.01661825, + "epoch": 0.6409138734405532, + "flos": 16481221418880.0, + "grad_norm": 2.1044077916865067, + "language_loss": 0.78435314, + "learning_rate": 1.2067015663336046e-06, + "loss": 0.80899191, + "num_input_tokens_seen": 229996830, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.19226074, + "step": 10660, + "time_per_iteration": 2.8354198932647705 + }, + { + "auxiliary_loss_clip": 0.01447006, + "auxiliary_loss_mlp": 0.01036619, + "balance_loss_clip": 1.27504802, + "balance_loss_mlp": 1.01657939, + "epoch": 0.6409739966932211, + "flos": 22786638117120.0, + "grad_norm": 1.8987962621829833, + "language_loss": 0.69675279, + "learning_rate": 1.206344067135727e-06, + "loss": 0.72158909, + "num_input_tokens_seen": 230015115, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.20043945, + "step": 10661, + "time_per_iteration": 2.8755786418914795 + }, + { + "auxiliary_loss_clip": 0.01417303, + "auxiliary_loss_mlp": 0.01029438, + "balance_loss_clip": 1.25559759, + "balance_loss_mlp": 1.01140189, + "epoch": 0.6410341199458891, + "flos": 25162042849920.0, + "grad_norm": 1.5763908432023974, + "language_loss": 0.76929688, + "learning_rate": 1.205986598033362e-06, + "loss": 0.79376435, + "num_input_tokens_seen": 230035515, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18041992, + "step": 10662, + "time_per_iteration": 2.913311243057251 + }, + { + "auxiliary_loss_clip": 0.01428486, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.26235533, + "balance_loss_mlp": 1.01279962, + "epoch": 0.641094243198557, + "flos": 27055869482880.0, + "grad_norm": 2.5191459750923406, + "language_loss": 0.69972992, + "learning_rate": 1.2056291590400644e-06, + "loss": 0.72433501, + "num_input_tokens_seen": 230054355, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.19238281, + "step": 10663, + "time_per_iteration": 2.9283523559570312 + }, + { + "auxiliary_loss_clip": 0.0142447, + "auxiliary_loss_mlp": 0.0103591, + "balance_loss_clip": 1.25887358, + "balance_loss_mlp": 1.01631224, + "epoch": 0.641154366451225, + "flos": 25385534755200.0, + "grad_norm": 2.2262658125064956, + "language_loss": 0.68544781, + "learning_rate": 1.205271750169389e-06, + "loss": 0.71005166, + "num_input_tokens_seen": 230074605, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.19604492, + "step": 10664, + "time_per_iteration": 2.9669623374938965 + }, + { + "auxiliary_loss_clip": 0.01427165, + "auxiliary_loss_mlp": 0.01034329, + "balance_loss_clip": 1.26291978, + "balance_loss_mlp": 1.01568484, + "epoch": 0.6412144897038929, + "flos": 25163671662720.0, + "grad_norm": 2.0391274395868626, + "language_loss": 0.66914469, + "learning_rate": 1.2049143714348881e-06, + "loss": 0.69375968, + "num_input_tokens_seen": 230093820, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.18652344, + "step": 10665, + "time_per_iteration": 2.906723976135254 + }, + { + "auxiliary_loss_clip": 0.01419699, + "auxiliary_loss_mlp": 0.0103278, + "balance_loss_clip": 1.25699663, + "balance_loss_mlp": 1.01418304, + "epoch": 0.641274612956561, + "flos": 23451232008960.0, + "grad_norm": 1.6841320535206803, + "language_loss": 0.64887869, + "learning_rate": 1.2045570228501145e-06, + "loss": 0.6734035, + "num_input_tokens_seen": 230114285, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.18615723, + "step": 10666, + "time_per_iteration": 2.9523122310638428 + }, + { + "auxiliary_loss_clip": 0.01428809, + "auxiliary_loss_mlp": 0.01033068, + "balance_loss_clip": 1.26217437, + "balance_loss_mlp": 1.01387572, + "epoch": 0.6413347362092289, + "flos": 19436964946560.0, + "grad_norm": 1.9525967246029787, + "language_loss": 0.71790564, + "learning_rate": 1.2041997044286176e-06, + "loss": 0.74252433, + "num_input_tokens_seen": 230132760, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.19189453, + "step": 10667, + "time_per_iteration": 2.8985817432403564 + }, + { + "auxiliary_loss_clip": 0.01466095, + "auxiliary_loss_mlp": 0.01037597, + "balance_loss_clip": 1.2880882, + "balance_loss_mlp": 1.01692605, + "epoch": 0.6413948594618969, + "flos": 17204099892480.0, + "grad_norm": 7.531950515906734, + "language_loss": 0.78758931, + "learning_rate": 1.2038424161839484e-06, + "loss": 0.81262624, + "num_input_tokens_seen": 230149690, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.20666504, + "step": 10668, + "time_per_iteration": 2.9091477394104004 + }, + { + "auxiliary_loss_clip": 0.01431334, + "auxiliary_loss_mlp": 0.0103458, + "balance_loss_clip": 1.26624274, + "balance_loss_mlp": 1.0141952, + "epoch": 0.6414549827145648, + "flos": 22279062240000.0, + "grad_norm": 2.152229645046003, + "language_loss": 0.68659908, + "learning_rate": 1.2034851581296544e-06, + "loss": 0.71125817, + "num_input_tokens_seen": 230166950, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.20397949, + "step": 10669, + "time_per_iteration": 2.916707754135132 + }, + { + "auxiliary_loss_clip": 0.01452395, + "auxiliary_loss_mlp": 0.01036864, + "balance_loss_clip": 1.27983439, + "balance_loss_mlp": 1.01690793, + "epoch": 0.6415151059672328, + "flos": 19648105021440.0, + "grad_norm": 1.8464743520543008, + "language_loss": 0.79502165, + "learning_rate": 1.2031279302792825e-06, + "loss": 0.81991428, + "num_input_tokens_seen": 230184785, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.19970703, + "step": 10670, + "time_per_iteration": 2.852468729019165 + }, + { + "auxiliary_loss_clip": 0.01437834, + "auxiliary_loss_mlp": 0.01038766, + "balance_loss_clip": 1.26782632, + "balance_loss_mlp": 1.0186193, + "epoch": 0.6415752292199008, + "flos": 14873876732160.0, + "grad_norm": 2.2207479308350093, + "language_loss": 0.89163488, + "learning_rate": 1.20277073264638e-06, + "loss": 0.91640091, + "num_input_tokens_seen": 230201385, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.20153809, + "step": 10671, + "time_per_iteration": 2.8192851543426514 + }, + { + "auxiliary_loss_clip": 0.01420009, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.25844097, + "balance_loss_mlp": 1.01381326, + "epoch": 0.6416353524725688, + "flos": 13743540420480.0, + "grad_norm": 1.9193543933643726, + "language_loss": 0.70426619, + "learning_rate": 1.2024135652444907e-06, + "loss": 0.72879136, + "num_input_tokens_seen": 230220380, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18701172, + "step": 10672, + "time_per_iteration": 2.849375009536743 + }, + { + "auxiliary_loss_clip": 0.0144819, + "auxiliary_loss_mlp": 0.01033614, + "balance_loss_clip": 1.27579701, + "balance_loss_mlp": 1.01340795, + "epoch": 0.6416954757252368, + "flos": 24545571442560.0, + "grad_norm": 2.0584038140715255, + "language_loss": 0.75240296, + "learning_rate": 1.2020564280871593e-06, + "loss": 0.77722096, + "num_input_tokens_seen": 230239845, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.20214844, + "step": 10673, + "time_per_iteration": 4.343919038772583 + }, + { + "auxiliary_loss_clip": 0.01437586, + "auxiliary_loss_mlp": 0.01036963, + "balance_loss_clip": 1.27070141, + "balance_loss_mlp": 1.01630449, + "epoch": 0.6417555989779047, + "flos": 27721684984320.0, + "grad_norm": 9.377517444487138, + "language_loss": 0.70192856, + "learning_rate": 1.2016993211879283e-06, + "loss": 0.72667408, + "num_input_tokens_seen": 230262420, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.20654297, + "step": 10674, + "time_per_iteration": 2.953443765640259 + }, + { + "auxiliary_loss_clip": 0.01452658, + "auxiliary_loss_mlp": 0.0103316, + "balance_loss_clip": 1.2810154, + "balance_loss_mlp": 1.01340759, + "epoch": 0.6418157222305727, + "flos": 20565898669440.0, + "grad_norm": 1.8069554665396148, + "language_loss": 0.67641377, + "learning_rate": 1.201342244560338e-06, + "loss": 0.70127201, + "num_input_tokens_seen": 230279950, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.1973877, + "step": 10675, + "time_per_iteration": 2.8756954669952393 + }, + { + "auxiliary_loss_clip": 0.01434542, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.26970649, + "balance_loss_mlp": 1.01412261, + "epoch": 0.6418758454832406, + "flos": 22611766389120.0, + "grad_norm": 1.8935539110079342, + "language_loss": 0.67515373, + "learning_rate": 1.2009851982179307e-06, + "loss": 0.69984519, + "num_input_tokens_seen": 230299705, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.20483398, + "step": 10676, + "time_per_iteration": 2.8884360790252686 + }, + { + "auxiliary_loss_clip": 0.01423655, + "auxiliary_loss_mlp": 0.01033356, + "balance_loss_clip": 1.25718617, + "balance_loss_mlp": 1.0121007, + "epoch": 0.6419359687359086, + "flos": 27384999292800.0, + "grad_norm": 1.7809758788584291, + "language_loss": 0.76946425, + "learning_rate": 1.2006281821742446e-06, + "loss": 0.79403442, + "num_input_tokens_seen": 230320030, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.21264648, + "step": 10677, + "time_per_iteration": 2.935551404953003 + }, + { + "auxiliary_loss_clip": 0.01190475, + "auxiliary_loss_mlp": 0.01023203, + "balance_loss_clip": 1.10204387, + "balance_loss_mlp": 1.00708568, + "epoch": 0.6419960919885765, + "flos": 67281093884160.0, + "grad_norm": 0.7665629148441414, + "language_loss": 0.60796797, + "learning_rate": 1.200271196442818e-06, + "loss": 0.63010478, + "num_input_tokens_seen": 230381495, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.16113281, + "step": 10678, + "time_per_iteration": 3.4241528511047363 + }, + { + "auxiliary_loss_clip": 0.01415066, + "auxiliary_loss_mlp": 0.01033231, + "balance_loss_clip": 1.25327182, + "balance_loss_mlp": 1.01425266, + "epoch": 0.6420562152412446, + "flos": 19911484857600.0, + "grad_norm": 1.6971540601049537, + "language_loss": 0.6813162, + "learning_rate": 1.1999142410371875e-06, + "loss": 0.70579916, + "num_input_tokens_seen": 230401385, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18969727, + "step": 10679, + "time_per_iteration": 2.897221326828003 + }, + { + "auxiliary_loss_clip": 0.0144488, + "auxiliary_loss_mlp": 0.01039037, + "balance_loss_clip": 1.27552247, + "balance_loss_mlp": 1.01837778, + "epoch": 0.6421163384939125, + "flos": 24800852459520.0, + "grad_norm": 2.376874515234927, + "language_loss": 0.7344048, + "learning_rate": 1.1995573159708897e-06, + "loss": 0.75924397, + "num_input_tokens_seen": 230421340, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.20666504, + "step": 10680, + "time_per_iteration": 2.9345006942749023 + }, + { + "auxiliary_loss_clip": 0.01429974, + "auxiliary_loss_mlp": 0.01033244, + "balance_loss_clip": 1.26430452, + "balance_loss_mlp": 1.01504111, + "epoch": 0.6421764617465805, + "flos": 25603687774080.0, + "grad_norm": 1.8341236258258593, + "language_loss": 0.6937654, + "learning_rate": 1.1992004212574582e-06, + "loss": 0.71839756, + "num_input_tokens_seen": 230441270, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.18212891, + "step": 10681, + "time_per_iteration": 2.9172184467315674 + }, + { + "auxiliary_loss_clip": 0.01421641, + "auxiliary_loss_mlp": 0.01032813, + "balance_loss_clip": 1.25648963, + "balance_loss_mlp": 1.01335835, + "epoch": 0.6422365849992484, + "flos": 14142220767360.0, + "grad_norm": 1.872058628280906, + "language_loss": 0.75711232, + "learning_rate": 1.198843556910427e-06, + "loss": 0.78165686, + "num_input_tokens_seen": 230457455, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19458008, + "step": 10682, + "time_per_iteration": 2.837601900100708 + }, + { + "auxiliary_loss_clip": 0.01406371, + "auxiliary_loss_mlp": 0.01031302, + "balance_loss_clip": 1.24697948, + "balance_loss_mlp": 1.01278877, + "epoch": 0.6422967082519164, + "flos": 22394427776640.0, + "grad_norm": 1.5111909971224204, + "language_loss": 0.79768056, + "learning_rate": 1.1984867229433287e-06, + "loss": 0.82205731, + "num_input_tokens_seen": 230478955, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18530273, + "step": 10683, + "time_per_iteration": 2.9045791625976562 + }, + { + "auxiliary_loss_clip": 0.01430182, + "auxiliary_loss_mlp": 0.01033663, + "balance_loss_clip": 1.26372552, + "balance_loss_mlp": 1.01296806, + "epoch": 0.6423568315045844, + "flos": 14656176161280.0, + "grad_norm": 1.8447997025607277, + "language_loss": 0.68560451, + "learning_rate": 1.1981299193696941e-06, + "loss": 0.71024299, + "num_input_tokens_seen": 230496425, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.20678711, + "step": 10684, + "time_per_iteration": 4.350924015045166 + }, + { + "auxiliary_loss_clip": 0.01427995, + "auxiliary_loss_mlp": 0.01031389, + "balance_loss_clip": 1.26060963, + "balance_loss_mlp": 1.01253033, + "epoch": 0.6424169547572524, + "flos": 26845272345600.0, + "grad_norm": 1.8508719947285983, + "language_loss": 0.72685158, + "learning_rate": 1.1977731462030533e-06, + "loss": 0.75144541, + "num_input_tokens_seen": 230516245, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.18859863, + "step": 10685, + "time_per_iteration": 2.9081976413726807 + }, + { + "auxiliary_loss_clip": 0.0142107, + "auxiliary_loss_mlp": 0.01035459, + "balance_loss_clip": 1.25874758, + "balance_loss_mlp": 1.01665998, + "epoch": 0.6424770780099204, + "flos": 22716770866560.0, + "grad_norm": 1.4569821208569986, + "language_loss": 0.75700456, + "learning_rate": 1.197416403456935e-06, + "loss": 0.78156984, + "num_input_tokens_seen": 230534745, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18786621, + "step": 10686, + "time_per_iteration": 2.8941800594329834 + }, + { + "auxiliary_loss_clip": 0.01432115, + "auxiliary_loss_mlp": 0.01036283, + "balance_loss_clip": 1.26200795, + "balance_loss_mlp": 1.01632726, + "epoch": 0.6425372012625883, + "flos": 28479655440000.0, + "grad_norm": 2.6517935294205337, + "language_loss": 0.69253433, + "learning_rate": 1.197059691144867e-06, + "loss": 0.7172184, + "num_input_tokens_seen": 230555895, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.19958496, + "step": 10687, + "time_per_iteration": 4.522422790527344 + }, + { + "auxiliary_loss_clip": 0.01441026, + "auxiliary_loss_mlp": 0.01037113, + "balance_loss_clip": 1.27206433, + "balance_loss_mlp": 1.01741958, + "epoch": 0.6425973245152563, + "flos": 29363759694720.0, + "grad_norm": 1.7661022084918028, + "language_loss": 0.6712476, + "learning_rate": 1.1967030092803767e-06, + "loss": 0.69602895, + "num_input_tokens_seen": 230577460, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.19689941, + "step": 10688, + "time_per_iteration": 2.932551145553589 + }, + { + "auxiliary_loss_clip": 0.0142935, + "auxiliary_loss_mlp": 0.01035752, + "balance_loss_clip": 1.26289177, + "balance_loss_mlp": 1.0167501, + "epoch": 0.6426574477679242, + "flos": 16437713904000.0, + "grad_norm": 1.7150861968859532, + "language_loss": 0.73864454, + "learning_rate": 1.1963463578769876e-06, + "loss": 0.76329553, + "num_input_tokens_seen": 230595030, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.18994141, + "step": 10689, + "time_per_iteration": 3.020660400390625 + }, + { + "auxiliary_loss_clip": 0.01410759, + "auxiliary_loss_mlp": 0.01030139, + "balance_loss_clip": 1.24812031, + "balance_loss_mlp": 1.01163769, + "epoch": 0.6427175710205922, + "flos": 21846104317440.0, + "grad_norm": 2.1334692388235137, + "language_loss": 0.73208773, + "learning_rate": 1.195989736948226e-06, + "loss": 0.75649667, + "num_input_tokens_seen": 230615135, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.18505859, + "step": 10690, + "time_per_iteration": 2.899162530899048 + }, + { + "auxiliary_loss_clip": 0.01423285, + "auxiliary_loss_mlp": 0.01031986, + "balance_loss_clip": 1.25997949, + "balance_loss_mlp": 1.01206636, + "epoch": 0.6427776942732601, + "flos": 17795930866560.0, + "grad_norm": 1.8397615616458913, + "language_loss": 0.78285342, + "learning_rate": 1.1956331465076143e-06, + "loss": 0.80740619, + "num_input_tokens_seen": 230631965, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19946289, + "step": 10691, + "time_per_iteration": 2.887563943862915 + }, + { + "auxiliary_loss_clip": 0.01433303, + "auxiliary_loss_mlp": 0.01034665, + "balance_loss_clip": 1.26460516, + "balance_loss_mlp": 1.01540029, + "epoch": 0.6428378175259282, + "flos": 15094291991040.0, + "grad_norm": 1.645920146953257, + "language_loss": 0.75615132, + "learning_rate": 1.1952765865686738e-06, + "loss": 0.78083098, + "num_input_tokens_seen": 230649565, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.19262695, + "step": 10692, + "time_per_iteration": 2.852415084838867 + }, + { + "auxiliary_loss_clip": 0.01429112, + "auxiliary_loss_mlp": 0.01038675, + "balance_loss_clip": 1.26243806, + "balance_loss_mlp": 1.01873112, + "epoch": 0.6428979407785961, + "flos": 23852265085440.0, + "grad_norm": 1.808450138857936, + "language_loss": 0.62432265, + "learning_rate": 1.1949200571449263e-06, + "loss": 0.64900053, + "num_input_tokens_seen": 230669265, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.19946289, + "step": 10693, + "time_per_iteration": 2.888190746307373 + }, + { + "auxiliary_loss_clip": 0.01430688, + "auxiliary_loss_mlp": 0.01035236, + "balance_loss_clip": 1.26105404, + "balance_loss_mlp": 1.01487494, + "epoch": 0.6429580640312641, + "flos": 32939729948160.0, + "grad_norm": 1.9219967714160997, + "language_loss": 0.61382437, + "learning_rate": 1.1945635582498903e-06, + "loss": 0.63848358, + "num_input_tokens_seen": 230690575, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.20361328, + "step": 10694, + "time_per_iteration": 3.0111265182495117 + }, + { + "auxiliary_loss_clip": 0.01426918, + "auxiliary_loss_mlp": 0.01038163, + "balance_loss_clip": 1.26052904, + "balance_loss_mlp": 1.01907802, + "epoch": 0.643018187283932, + "flos": 21077999026560.0, + "grad_norm": 1.3842545962455544, + "language_loss": 0.80352914, + "learning_rate": 1.1942070898970853e-06, + "loss": 0.82817996, + "num_input_tokens_seen": 230709420, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.1907959, + "step": 10695, + "time_per_iteration": 2.8511664867401123 + }, + { + "auxiliary_loss_clip": 0.01417617, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.25081968, + "balance_loss_mlp": 1.01497746, + "epoch": 0.6430783105366, + "flos": 26736014856960.0, + "grad_norm": 1.6600577459499624, + "language_loss": 0.74419242, + "learning_rate": 1.1938506521000285e-06, + "loss": 0.768713, + "num_input_tokens_seen": 230729350, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.19445801, + "step": 10696, + "time_per_iteration": 2.917100191116333 + }, + { + "auxiliary_loss_clip": 0.01422182, + "auxiliary_loss_mlp": 0.01029768, + "balance_loss_clip": 1.25923085, + "balance_loss_mlp": 1.01006281, + "epoch": 0.643138433789268, + "flos": 23707553656320.0, + "grad_norm": 1.68381558531295, + "language_loss": 0.75997633, + "learning_rate": 1.1934942448722347e-06, + "loss": 0.78449583, + "num_input_tokens_seen": 230749220, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19714355, + "step": 10697, + "time_per_iteration": 2.886347770690918 + }, + { + "auxiliary_loss_clip": 0.0142138, + "auxiliary_loss_mlp": 0.01035929, + "balance_loss_clip": 1.25947118, + "balance_loss_mlp": 1.01698661, + "epoch": 0.643198557041936, + "flos": 34214958668160.0, + "grad_norm": 1.9410327597725991, + "language_loss": 0.66734207, + "learning_rate": 1.1931378682272208e-06, + "loss": 0.69191515, + "num_input_tokens_seen": 230770245, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18945312, + "step": 10698, + "time_per_iteration": 2.9949021339416504 + }, + { + "auxiliary_loss_clip": 0.01195049, + "auxiliary_loss_mlp": 0.01014534, + "balance_loss_clip": 1.10547543, + "balance_loss_mlp": 1.00080156, + "epoch": 0.643258680294604, + "flos": 67658255688960.0, + "grad_norm": 0.8591675424222531, + "language_loss": 0.63507801, + "learning_rate": 1.1927815221784996e-06, + "loss": 0.65717387, + "num_input_tokens_seen": 230837030, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.13769531, + "step": 10699, + "time_per_iteration": 3.35263729095459 + }, + { + "auxiliary_loss_clip": 0.01411584, + "auxiliary_loss_mlp": 0.01030382, + "balance_loss_clip": 1.25230908, + "balance_loss_mlp": 1.01202381, + "epoch": 0.6433188035472719, + "flos": 25195325040000.0, + "grad_norm": 1.6588616924309962, + "language_loss": 0.69976437, + "learning_rate": 1.1924252067395838e-06, + "loss": 0.72418404, + "num_input_tokens_seen": 230856845, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18359375, + "step": 10700, + "time_per_iteration": 2.899168014526367 + }, + { + "auxiliary_loss_clip": 0.01422034, + "auxiliary_loss_mlp": 0.01031141, + "balance_loss_clip": 1.2558794, + "balance_loss_mlp": 1.01117396, + "epoch": 0.6433789267999399, + "flos": 24984546923520.0, + "grad_norm": 1.8658578572892917, + "language_loss": 0.73898029, + "learning_rate": 1.1920689219239855e-06, + "loss": 0.76351202, + "num_input_tokens_seen": 230878785, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.19958496, + "step": 10701, + "time_per_iteration": 2.941054105758667 + }, + { + "auxiliary_loss_clip": 0.01444837, + "auxiliary_loss_mlp": 0.01035008, + "balance_loss_clip": 1.27330661, + "balance_loss_mlp": 1.01436067, + "epoch": 0.6434390500526078, + "flos": 17574836935680.0, + "grad_norm": 1.8696570200189369, + "language_loss": 0.82702291, + "learning_rate": 1.1917126677452144e-06, + "loss": 0.85182142, + "num_input_tokens_seen": 230895445, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.20629883, + "step": 10702, + "time_per_iteration": 2.8928802013397217 + }, + { + "auxiliary_loss_clip": 0.01419668, + "auxiliary_loss_mlp": 0.01039089, + "balance_loss_clip": 1.25590801, + "balance_loss_mlp": 1.02037311, + "epoch": 0.6434991733052758, + "flos": 20851656698880.0, + "grad_norm": 1.805814768839921, + "language_loss": 0.75468886, + "learning_rate": 1.1913564442167798e-06, + "loss": 0.77927643, + "num_input_tokens_seen": 230911375, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.18701172, + "step": 10703, + "time_per_iteration": 2.8814473152160645 + }, + { + "auxiliary_loss_clip": 0.01196709, + "auxiliary_loss_mlp": 0.01022506, + "balance_loss_clip": 1.10511088, + "balance_loss_mlp": 1.00448132, + "epoch": 0.6435592965579437, + "flos": 66126117139200.0, + "grad_norm": 0.6623677001978532, + "language_loss": 0.54623955, + "learning_rate": 1.1910002513521898e-06, + "loss": 0.56843168, + "num_input_tokens_seen": 230975990, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.18066406, + "step": 10704, + "time_per_iteration": 3.375680685043335 + }, + { + "auxiliary_loss_clip": 0.01428477, + "auxiliary_loss_mlp": 0.0103273, + "balance_loss_clip": 1.26223564, + "balance_loss_mlp": 1.01463437, + "epoch": 0.6436194198106118, + "flos": 23779321188480.0, + "grad_norm": 1.593680684976102, + "language_loss": 0.77865839, + "learning_rate": 1.1906440891649519e-06, + "loss": 0.80327046, + "num_input_tokens_seen": 230997110, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.18103027, + "step": 10705, + "time_per_iteration": 2.9029922485351562 + }, + { + "auxiliary_loss_clip": 0.01427463, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.26184249, + "balance_loss_mlp": 1.01907718, + "epoch": 0.6436795430632797, + "flos": 20240026485120.0, + "grad_norm": 1.6576529138690521, + "language_loss": 0.7986517, + "learning_rate": 1.1902879576685708e-06, + "loss": 0.82330275, + "num_input_tokens_seen": 231015590, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.18566895, + "step": 10706, + "time_per_iteration": 2.93033766746521 + }, + { + "auxiliary_loss_clip": 0.01419598, + "auxiliary_loss_mlp": 0.01032824, + "balance_loss_clip": 1.25433564, + "balance_loss_mlp": 1.01283264, + "epoch": 0.6437396663159477, + "flos": 20311205834880.0, + "grad_norm": 2.5579114984113116, + "language_loss": 0.80862391, + "learning_rate": 1.1899318568765518e-06, + "loss": 0.83314812, + "num_input_tokens_seen": 231033800, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19995117, + "step": 10707, + "time_per_iteration": 2.9159953594207764 + }, + { + "auxiliary_loss_clip": 0.01423327, + "auxiliary_loss_mlp": 0.01033587, + "balance_loss_clip": 1.25796103, + "balance_loss_mlp": 1.01460862, + "epoch": 0.6437997895686156, + "flos": 23889166859520.0, + "grad_norm": 1.7545105355431025, + "language_loss": 0.86014867, + "learning_rate": 1.1895757868023978e-06, + "loss": 0.88471782, + "num_input_tokens_seen": 231053160, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.18969727, + "step": 10708, + "time_per_iteration": 4.296573877334595 + }, + { + "auxiliary_loss_clip": 0.01458293, + "auxiliary_loss_mlp": 0.01038439, + "balance_loss_clip": 1.28401041, + "balance_loss_mlp": 1.01814938, + "epoch": 0.6438599128212836, + "flos": 18998532403200.0, + "grad_norm": 2.1707738992867793, + "language_loss": 0.66716301, + "learning_rate": 1.1892197474596106e-06, + "loss": 0.69213033, + "num_input_tokens_seen": 231069470, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.20288086, + "step": 10709, + "time_per_iteration": 2.877901077270508 + }, + { + "auxiliary_loss_clip": 0.01425767, + "auxiliary_loss_mlp": 0.01034968, + "balance_loss_clip": 1.26174247, + "balance_loss_mlp": 1.01622844, + "epoch": 0.6439200360739517, + "flos": 24106686451200.0, + "grad_norm": 1.8261284951769534, + "language_loss": 0.81272042, + "learning_rate": 1.1888637388616929e-06, + "loss": 0.83732778, + "num_input_tokens_seen": 231088205, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.18737793, + "step": 10710, + "time_per_iteration": 2.8937854766845703 + }, + { + "auxiliary_loss_clip": 0.01422719, + "auxiliary_loss_mlp": 0.01031818, + "balance_loss_clip": 1.25771606, + "balance_loss_mlp": 1.01266146, + "epoch": 0.6439801593266196, + "flos": 31913945665920.0, + "grad_norm": 1.7614913686423783, + "language_loss": 0.67037034, + "learning_rate": 1.1885077610221425e-06, + "loss": 0.69491571, + "num_input_tokens_seen": 231107850, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19165039, + "step": 10711, + "time_per_iteration": 2.946032762527466 + }, + { + "auxiliary_loss_clip": 0.01433548, + "auxiliary_loss_mlp": 0.01035531, + "balance_loss_clip": 1.26506233, + "balance_loss_mlp": 1.01605248, + "epoch": 0.6440402825792876, + "flos": 27137319402240.0, + "grad_norm": 1.5955118602219833, + "language_loss": 0.79086792, + "learning_rate": 1.1881518139544597e-06, + "loss": 0.81555873, + "num_input_tokens_seen": 231127200, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.19482422, + "step": 10712, + "time_per_iteration": 2.906083345413208 + }, + { + "auxiliary_loss_clip": 0.01429817, + "auxiliary_loss_mlp": 0.01037509, + "balance_loss_clip": 1.26107526, + "balance_loss_mlp": 1.01736307, + "epoch": 0.6441004058319555, + "flos": 20677418398080.0, + "grad_norm": 1.5893081109515568, + "language_loss": 0.8354193, + "learning_rate": 1.1877958976721417e-06, + "loss": 0.86009252, + "num_input_tokens_seen": 231146360, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.20141602, + "step": 10713, + "time_per_iteration": 2.853761672973633 + }, + { + "auxiliary_loss_clip": 0.01405005, + "auxiliary_loss_mlp": 0.01033005, + "balance_loss_clip": 1.24518359, + "balance_loss_mlp": 1.01385951, + "epoch": 0.6441605290846235, + "flos": 26035786045440.0, + "grad_norm": 1.395930042385437, + "language_loss": 0.79337871, + "learning_rate": 1.187440012188684e-06, + "loss": 0.81775886, + "num_input_tokens_seen": 231168350, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19152832, + "step": 10714, + "time_per_iteration": 2.9263272285461426 + }, + { + "auxiliary_loss_clip": 0.01431247, + "auxiliary_loss_mlp": 0.01035302, + "balance_loss_clip": 1.26587379, + "balance_loss_mlp": 1.01593053, + "epoch": 0.6442206523372914, + "flos": 24910155192960.0, + "grad_norm": 1.4347535334971606, + "language_loss": 0.82028353, + "learning_rate": 1.187084157517583e-06, + "loss": 0.84494901, + "num_input_tokens_seen": 231188385, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19372559, + "step": 10715, + "time_per_iteration": 2.950824499130249 + }, + { + "auxiliary_loss_clip": 0.01431827, + "auxiliary_loss_mlp": 0.01038797, + "balance_loss_clip": 1.26330137, + "balance_loss_mlp": 1.01797116, + "epoch": 0.6442807755899594, + "flos": 25167426981120.0, + "grad_norm": 1.8577050634505778, + "language_loss": 0.81449395, + "learning_rate": 1.186728333672332e-06, + "loss": 0.83920026, + "num_input_tokens_seen": 231209880, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.20825195, + "step": 10716, + "time_per_iteration": 2.938124656677246 + }, + { + "auxiliary_loss_clip": 0.01441731, + "auxiliary_loss_mlp": 0.01031967, + "balance_loss_clip": 1.2708056, + "balance_loss_mlp": 1.01309657, + "epoch": 0.6443408988426274, + "flos": 27355924869120.0, + "grad_norm": 1.6895320121029476, + "language_loss": 0.7835021, + "learning_rate": 1.186372540666424e-06, + "loss": 0.8082391, + "num_input_tokens_seen": 231230765, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.18859863, + "step": 10717, + "time_per_iteration": 2.9358303546905518 + }, + { + "auxiliary_loss_clip": 0.01402607, + "auxiliary_loss_mlp": 0.01033678, + "balance_loss_clip": 1.24292934, + "balance_loss_mlp": 1.01447296, + "epoch": 0.6444010220952954, + "flos": 27939928492800.0, + "grad_norm": 1.7225602624529082, + "language_loss": 0.69111943, + "learning_rate": 1.1860167785133513e-06, + "loss": 0.71548223, + "num_input_tokens_seen": 231252350, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19189453, + "step": 10718, + "time_per_iteration": 4.376666784286499 + }, + { + "auxiliary_loss_clip": 0.01195843, + "auxiliary_loss_mlp": 0.01012771, + "balance_loss_clip": 1.10432959, + "balance_loss_mlp": 0.99560511, + "epoch": 0.6444611453479633, + "flos": 71241628106880.0, + "grad_norm": 0.7731634279231949, + "language_loss": 0.49737352, + "learning_rate": 1.185661047226603e-06, + "loss": 0.51945966, + "num_input_tokens_seen": 231313865, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.171875, + "step": 10719, + "time_per_iteration": 3.5296919345855713 + }, + { + "auxiliary_loss_clip": 0.01434392, + "auxiliary_loss_mlp": 0.01040643, + "balance_loss_clip": 1.26548624, + "balance_loss_mlp": 1.02007937, + "epoch": 0.6445212686006313, + "flos": 22713875199360.0, + "grad_norm": 1.8037411369799676, + "language_loss": 0.79063082, + "learning_rate": 1.18530534681967e-06, + "loss": 0.81538117, + "num_input_tokens_seen": 231331710, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.20581055, + "step": 10720, + "time_per_iteration": 2.926892042160034 + }, + { + "auxiliary_loss_clip": 0.01429791, + "auxiliary_loss_mlp": 0.01036863, + "balance_loss_clip": 1.26546133, + "balance_loss_mlp": 1.01744413, + "epoch": 0.6445813918532992, + "flos": 21188749593600.0, + "grad_norm": 4.4375291925736144, + "language_loss": 0.7745651, + "learning_rate": 1.18494967730604e-06, + "loss": 0.79923159, + "num_input_tokens_seen": 231350705, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.1940918, + "step": 10721, + "time_per_iteration": 2.908325672149658 + }, + { + "auxiliary_loss_clip": 0.0142525, + "auxiliary_loss_mlp": 0.01034299, + "balance_loss_clip": 1.25805497, + "balance_loss_mlp": 1.01443911, + "epoch": 0.6446415151059672, + "flos": 25202790432000.0, + "grad_norm": 2.3882175405704347, + "language_loss": 0.73944461, + "learning_rate": 1.1845940386991995e-06, + "loss": 0.76404011, + "num_input_tokens_seen": 231369550, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.19836426, + "step": 10722, + "time_per_iteration": 5.807856321334839 + }, + { + "auxiliary_loss_clip": 0.01437298, + "auxiliary_loss_mlp": 0.01038538, + "balance_loss_clip": 1.27203536, + "balance_loss_mlp": 1.01938152, + "epoch": 0.6447016383586353, + "flos": 25313360019840.0, + "grad_norm": 1.6328168014541293, + "language_loss": 0.78843397, + "learning_rate": 1.184238431012635e-06, + "loss": 0.81319237, + "num_input_tokens_seen": 231389285, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19152832, + "step": 10723, + "time_per_iteration": 2.93548583984375 + }, + { + "auxiliary_loss_clip": 0.01441927, + "auxiliary_loss_mlp": 0.01039654, + "balance_loss_clip": 1.271227, + "balance_loss_mlp": 1.01922166, + "epoch": 0.6447617616113032, + "flos": 27713043227520.0, + "grad_norm": 1.5689093655521364, + "language_loss": 0.59007174, + "learning_rate": 1.1838828542598312e-06, + "loss": 0.61488754, + "num_input_tokens_seen": 231408820, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.2043457, + "step": 10724, + "time_per_iteration": 2.9897994995117188 + }, + { + "auxiliary_loss_clip": 0.01418338, + "auxiliary_loss_mlp": 0.01038781, + "balance_loss_clip": 1.25674415, + "balance_loss_mlp": 1.0196476, + "epoch": 0.6448218848639712, + "flos": 23049384526080.0, + "grad_norm": 1.8718510971645606, + "language_loss": 0.84589779, + "learning_rate": 1.183527308454271e-06, + "loss": 0.87046903, + "num_input_tokens_seen": 231428100, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19128418, + "step": 10725, + "time_per_iteration": 2.9101297855377197 + }, + { + "auxiliary_loss_clip": 0.0141823, + "auxiliary_loss_mlp": 0.01040073, + "balance_loss_clip": 1.25401974, + "balance_loss_mlp": 1.0193429, + "epoch": 0.6448820081166391, + "flos": 24506135959680.0, + "grad_norm": 4.063805667903456, + "language_loss": 0.82714474, + "learning_rate": 1.1831717936094368e-06, + "loss": 0.85172772, + "num_input_tokens_seen": 231445810, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20751953, + "step": 10726, + "time_per_iteration": 2.8951659202575684 + }, + { + "auxiliary_loss_clip": 0.01444781, + "auxiliary_loss_mlp": 0.01038041, + "balance_loss_clip": 1.27376211, + "balance_loss_mlp": 1.01783526, + "epoch": 0.6449421313693071, + "flos": 22429519758720.0, + "grad_norm": 1.7920537206404201, + "language_loss": 0.82170647, + "learning_rate": 1.1828163097388108e-06, + "loss": 0.84653473, + "num_input_tokens_seen": 231463570, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.20214844, + "step": 10727, + "time_per_iteration": 2.8644561767578125 + }, + { + "auxiliary_loss_clip": 0.01454479, + "auxiliary_loss_mlp": 0.01041862, + "balance_loss_clip": 1.2801398, + "balance_loss_mlp": 1.02097654, + "epoch": 0.645002254621975, + "flos": 20234235150720.0, + "grad_norm": 1.8068615813701856, + "language_loss": 0.79684073, + "learning_rate": 1.1824608568558717e-06, + "loss": 0.82180417, + "num_input_tokens_seen": 231482155, + "router_z_loss_clip": 1.74316406, + "router_z_loss_mlp": 0.20861816, + "step": 10728, + "time_per_iteration": 2.8891477584838867 + }, + { + "auxiliary_loss_clip": 0.01423743, + "auxiliary_loss_mlp": 0.01037529, + "balance_loss_clip": 1.25813699, + "balance_loss_mlp": 1.01819372, + "epoch": 0.645062377874643, + "flos": 27867029840640.0, + "grad_norm": 1.8647059992801507, + "language_loss": 0.75230628, + "learning_rate": 1.1821054349740988e-06, + "loss": 0.77691901, + "num_input_tokens_seen": 231502465, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.1932373, + "step": 10729, + "time_per_iteration": 2.9148471355438232 + }, + { + "auxiliary_loss_clip": 0.01432384, + "auxiliary_loss_mlp": 0.01036966, + "balance_loss_clip": 1.26527596, + "balance_loss_mlp": 1.01746297, + "epoch": 0.645122501127311, + "flos": 25312455123840.0, + "grad_norm": 1.5752941509913736, + "language_loss": 0.66752511, + "learning_rate": 1.1817500441069706e-06, + "loss": 0.69221866, + "num_input_tokens_seen": 231522740, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.19506836, + "step": 10730, + "time_per_iteration": 2.909738063812256 + }, + { + "auxiliary_loss_clip": 0.01427763, + "auxiliary_loss_mlp": 0.01038015, + "balance_loss_clip": 1.26152372, + "balance_loss_mlp": 1.01804757, + "epoch": 0.645182624379979, + "flos": 18816692976000.0, + "grad_norm": 1.640571364440011, + "language_loss": 0.65431768, + "learning_rate": 1.1813946842679614e-06, + "loss": 0.67897546, + "num_input_tokens_seen": 231542050, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19970703, + "step": 10731, + "time_per_iteration": 2.942887306213379 + }, + { + "auxiliary_loss_clip": 0.01416444, + "auxiliary_loss_mlp": 0.01038904, + "balance_loss_clip": 1.2531743, + "balance_loss_mlp": 1.01838851, + "epoch": 0.6452427476326469, + "flos": 18341177679360.0, + "grad_norm": 1.6475484631221913, + "language_loss": 0.68493098, + "learning_rate": 1.1810393554705492e-06, + "loss": 0.70948446, + "num_input_tokens_seen": 231560380, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.2052002, + "step": 10732, + "time_per_iteration": 2.902336597442627 + }, + { + "auxiliary_loss_clip": 0.01415075, + "auxiliary_loss_mlp": 0.01039788, + "balance_loss_clip": 1.25189662, + "balance_loss_mlp": 1.02002263, + "epoch": 0.6453028708853149, + "flos": 22795234629120.0, + "grad_norm": 2.5434230556995656, + "language_loss": 0.76631153, + "learning_rate": 1.1806840577282055e-06, + "loss": 0.79086018, + "num_input_tokens_seen": 231580810, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19763184, + "step": 10733, + "time_per_iteration": 2.91213321685791 + }, + { + "auxiliary_loss_clip": 0.01462361, + "auxiliary_loss_mlp": 0.01043164, + "balance_loss_clip": 1.28967249, + "balance_loss_mlp": 1.02184987, + "epoch": 0.6453629941379828, + "flos": 23955143057280.0, + "grad_norm": 2.07320248012528, + "language_loss": 0.68391204, + "learning_rate": 1.1803287910544048e-06, + "loss": 0.70896727, + "num_input_tokens_seen": 231600585, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.21313477, + "step": 10734, + "time_per_iteration": 2.9263014793395996 + }, + { + "auxiliary_loss_clip": 0.01412434, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.25304675, + "balance_loss_mlp": 1.02077889, + "epoch": 0.6454231173906508, + "flos": 17685451768320.0, + "grad_norm": 2.7099464956219537, + "language_loss": 0.74485689, + "learning_rate": 1.1799735554626191e-06, + "loss": 0.76938778, + "num_input_tokens_seen": 231618765, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.1986084, + "step": 10735, + "time_per_iteration": 2.851369857788086 + }, + { + "auxiliary_loss_clip": 0.01425506, + "auxiliary_loss_mlp": 0.01038365, + "balance_loss_clip": 1.25980115, + "balance_loss_mlp": 1.01871943, + "epoch": 0.6454832406433189, + "flos": 23302674771840.0, + "grad_norm": 1.8731594838543324, + "language_loss": 0.75529397, + "learning_rate": 1.1796183509663176e-06, + "loss": 0.77993274, + "num_input_tokens_seen": 231638525, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.1965332, + "step": 10736, + "time_per_iteration": 2.92983078956604 + }, + { + "auxiliary_loss_clip": 0.01459813, + "auxiliary_loss_mlp": 0.01039134, + "balance_loss_clip": 1.28721285, + "balance_loss_mlp": 1.01911902, + "epoch": 0.6455433638959868, + "flos": 20166856364160.0, + "grad_norm": 5.6473538090312205, + "language_loss": 0.71657181, + "learning_rate": 1.1792631775789708e-06, + "loss": 0.74156123, + "num_input_tokens_seen": 231656785, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.20019531, + "step": 10737, + "time_per_iteration": 2.8788201808929443 + }, + { + "auxiliary_loss_clip": 0.01194785, + "auxiliary_loss_mlp": 0.01029265, + "balance_loss_clip": 1.10301113, + "balance_loss_mlp": 1.01343429, + "epoch": 0.6456034871486548, + "flos": 66564685416960.0, + "grad_norm": 0.7991951359951835, + "language_loss": 0.58481252, + "learning_rate": 1.1789080353140464e-06, + "loss": 0.60705304, + "num_input_tokens_seen": 231719075, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.15820312, + "step": 10738, + "time_per_iteration": 3.4525833129882812 + }, + { + "auxiliary_loss_clip": 0.01422634, + "auxiliary_loss_mlp": 0.01032805, + "balance_loss_clip": 1.25806427, + "balance_loss_mlp": 1.01296854, + "epoch": 0.6456636104013227, + "flos": 24216532122240.0, + "grad_norm": 1.6489125173554904, + "language_loss": 0.75407732, + "learning_rate": 1.1785529241850118e-06, + "loss": 0.77863169, + "num_input_tokens_seen": 231737810, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19848633, + "step": 10739, + "time_per_iteration": 2.9349186420440674 + }, + { + "auxiliary_loss_clip": 0.01440272, + "auxiliary_loss_mlp": 0.01034555, + "balance_loss_clip": 1.26998663, + "balance_loss_mlp": 1.01453984, + "epoch": 0.6457237336539907, + "flos": 23634835983360.0, + "grad_norm": 1.8355693402391666, + "language_loss": 0.72498274, + "learning_rate": 1.1781978442053324e-06, + "loss": 0.74973106, + "num_input_tokens_seen": 231756140, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.20031738, + "step": 10740, + "time_per_iteration": 2.901472568511963 + }, + { + "auxiliary_loss_clip": 0.01192819, + "auxiliary_loss_mlp": 0.01019994, + "balance_loss_clip": 1.10255015, + "balance_loss_mlp": 1.00282812, + "epoch": 0.6457838569066586, + "flos": 65879251655040.0, + "grad_norm": 0.6646151101911751, + "language_loss": 0.55369961, + "learning_rate": 1.1778427953884733e-06, + "loss": 0.57582772, + "num_input_tokens_seen": 231823665, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.171875, + "step": 10741, + "time_per_iteration": 3.347278594970703 + }, + { + "auxiliary_loss_clip": 0.01417762, + "auxiliary_loss_mlp": 0.01036869, + "balance_loss_clip": 1.25422955, + "balance_loss_mlp": 1.01709199, + "epoch": 0.6458439801593266, + "flos": 22391984557440.0, + "grad_norm": 1.6712121324950662, + "language_loss": 0.80694562, + "learning_rate": 1.1774877777478977e-06, + "loss": 0.83149195, + "num_input_tokens_seen": 231844500, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19775391, + "step": 10742, + "time_per_iteration": 2.8826849460601807 + }, + { + "auxiliary_loss_clip": 0.01410836, + "auxiliary_loss_mlp": 0.01033235, + "balance_loss_clip": 1.2504127, + "balance_loss_mlp": 1.01286197, + "epoch": 0.6459041034119946, + "flos": 24799676094720.0, + "grad_norm": 1.5408063950607875, + "language_loss": 0.82519627, + "learning_rate": 1.1771327912970678e-06, + "loss": 0.84963703, + "num_input_tokens_seen": 231864510, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.20385742, + "step": 10743, + "time_per_iteration": 4.373667240142822 + }, + { + "auxiliary_loss_clip": 0.01412838, + "auxiliary_loss_mlp": 0.01031969, + "balance_loss_clip": 1.25011587, + "balance_loss_mlp": 1.01213253, + "epoch": 0.6459642266646626, + "flos": 18332581167360.0, + "grad_norm": 3.0013242520569987, + "language_loss": 0.72495508, + "learning_rate": 1.1767778360494453e-06, + "loss": 0.74940318, + "num_input_tokens_seen": 231881555, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19836426, + "step": 10744, + "time_per_iteration": 2.867992639541626 + }, + { + "auxiliary_loss_clip": 0.01418198, + "auxiliary_loss_mlp": 0.01029743, + "balance_loss_clip": 1.25324368, + "balance_loss_mlp": 1.01068163, + "epoch": 0.6460243499173305, + "flos": 43597185275520.0, + "grad_norm": 1.879999344697192, + "language_loss": 0.67918563, + "learning_rate": 1.1764229120184896e-06, + "loss": 0.70366502, + "num_input_tokens_seen": 231905945, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19055176, + "step": 10745, + "time_per_iteration": 3.0845792293548584 + }, + { + "auxiliary_loss_clip": 0.01418678, + "auxiliary_loss_mlp": 0.01037999, + "balance_loss_clip": 1.25362265, + "balance_loss_mlp": 1.01806736, + "epoch": 0.6460844731699985, + "flos": 19253179992960.0, + "grad_norm": 2.7652021153453146, + "language_loss": 0.74870098, + "learning_rate": 1.1760680192176597e-06, + "loss": 0.77326787, + "num_input_tokens_seen": 231922535, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19934082, + "step": 10746, + "time_per_iteration": 2.849994421005249 + }, + { + "auxiliary_loss_clip": 0.01436714, + "auxiliary_loss_mlp": 0.01040501, + "balance_loss_clip": 1.26855433, + "balance_loss_mlp": 1.02004504, + "epoch": 0.6461445964226664, + "flos": 27464639420160.0, + "grad_norm": 1.5501179505607803, + "language_loss": 0.67364407, + "learning_rate": 1.175713157660413e-06, + "loss": 0.69841623, + "num_input_tokens_seen": 231944800, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.20471191, + "step": 10747, + "time_per_iteration": 2.942025661468506 + }, + { + "auxiliary_loss_clip": 0.01435922, + "auxiliary_loss_mlp": 0.01036383, + "balance_loss_clip": 1.26839185, + "balance_loss_mlp": 1.01691604, + "epoch": 0.6462047196753344, + "flos": 20303197505280.0, + "grad_norm": 2.1338332300825718, + "language_loss": 0.67746878, + "learning_rate": 1.1753583273602056e-06, + "loss": 0.70219183, + "num_input_tokens_seen": 231962970, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.19470215, + "step": 10748, + "time_per_iteration": 2.886044979095459 + }, + { + "auxiliary_loss_clip": 0.01435772, + "auxiliary_loss_mlp": 0.0103823, + "balance_loss_clip": 1.26630974, + "balance_loss_mlp": 1.01782131, + "epoch": 0.6462648429280025, + "flos": 22028577171840.0, + "grad_norm": 1.750320588054417, + "language_loss": 0.7685656, + "learning_rate": 1.1750035283304937e-06, + "loss": 0.79330564, + "num_input_tokens_seen": 231981195, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.20397949, + "step": 10749, + "time_per_iteration": 2.911282539367676 + }, + { + "auxiliary_loss_clip": 0.0143869, + "auxiliary_loss_mlp": 0.01034442, + "balance_loss_clip": 1.26956773, + "balance_loss_mlp": 1.01509476, + "epoch": 0.6463249661806704, + "flos": 27792366641280.0, + "grad_norm": 1.7061669104491564, + "language_loss": 0.77456731, + "learning_rate": 1.17464876058473e-06, + "loss": 0.79929864, + "num_input_tokens_seen": 232001735, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.19348145, + "step": 10750, + "time_per_iteration": 2.9091992378234863 + }, + { + "auxiliary_loss_clip": 0.0144583, + "auxiliary_loss_mlp": 0.01035171, + "balance_loss_clip": 1.27479565, + "balance_loss_mlp": 1.01451182, + "epoch": 0.6463850894333384, + "flos": 22059732856320.0, + "grad_norm": 2.2725485857624057, + "language_loss": 0.69880319, + "learning_rate": 1.1742940241363683e-06, + "loss": 0.7236132, + "num_input_tokens_seen": 232019830, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.20666504, + "step": 10751, + "time_per_iteration": 2.8715059757232666 + }, + { + "auxiliary_loss_clip": 0.01452805, + "auxiliary_loss_mlp": 0.01037454, + "balance_loss_clip": 1.2818687, + "balance_loss_mlp": 1.01766539, + "epoch": 0.6464452126860063, + "flos": 21116258144640.0, + "grad_norm": 1.690631577526552, + "language_loss": 0.72108614, + "learning_rate": 1.1739393189988604e-06, + "loss": 0.74598879, + "num_input_tokens_seen": 232039625, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.19787598, + "step": 10752, + "time_per_iteration": 2.898308753967285 + }, + { + "auxiliary_loss_clip": 0.01434749, + "auxiliary_loss_mlp": 0.01037625, + "balance_loss_clip": 1.26514173, + "balance_loss_mlp": 1.0159409, + "epoch": 0.6465053359386743, + "flos": 16035006769920.0, + "grad_norm": 1.6684512044340727, + "language_loss": 0.78774977, + "learning_rate": 1.1735846451856554e-06, + "loss": 0.81247348, + "num_input_tokens_seen": 232055855, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.21691895, + "step": 10753, + "time_per_iteration": 4.322907209396362 + }, + { + "auxiliary_loss_clip": 0.01422832, + "auxiliary_loss_mlp": 0.01040914, + "balance_loss_clip": 1.25817561, + "balance_loss_mlp": 1.0208993, + "epoch": 0.6465654591913422, + "flos": 23407679249280.0, + "grad_norm": 1.6401490286898197, + "language_loss": 0.85832322, + "learning_rate": 1.1732300027102041e-06, + "loss": 0.88296068, + "num_input_tokens_seen": 232073475, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.20007324, + "step": 10754, + "time_per_iteration": 2.8825440406799316 + }, + { + "auxiliary_loss_clip": 0.0142953, + "auxiliary_loss_mlp": 0.0103504, + "balance_loss_clip": 1.26422644, + "balance_loss_mlp": 1.01595438, + "epoch": 0.6466255824440102, + "flos": 15385615130880.0, + "grad_norm": 2.071176243086444, + "language_loss": 0.6096372, + "learning_rate": 1.1728753915859541e-06, + "loss": 0.63428295, + "num_input_tokens_seen": 232091090, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19067383, + "step": 10755, + "time_per_iteration": 2.8428735733032227 + }, + { + "auxiliary_loss_clip": 0.01422637, + "auxiliary_loss_mlp": 0.01035085, + "balance_loss_clip": 1.25785935, + "balance_loss_mlp": 1.01467621, + "epoch": 0.6466857056966782, + "flos": 16261394342400.0, + "grad_norm": 2.08075005219916, + "language_loss": 0.6889869, + "learning_rate": 1.1725208118263518e-06, + "loss": 0.7135641, + "num_input_tokens_seen": 232107320, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.20397949, + "step": 10756, + "time_per_iteration": 2.9133362770080566 + }, + { + "auxiliary_loss_clip": 0.01458343, + "auxiliary_loss_mlp": 0.01035464, + "balance_loss_clip": 1.28515506, + "balance_loss_mlp": 1.01506758, + "epoch": 0.6467458289493462, + "flos": 21188025676800.0, + "grad_norm": 5.817000355244816, + "language_loss": 0.76100391, + "learning_rate": 1.172166263444844e-06, + "loss": 0.78594196, + "num_input_tokens_seen": 232123930, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.20385742, + "step": 10757, + "time_per_iteration": 4.284683465957642 + }, + { + "auxiliary_loss_clip": 0.01414603, + "auxiliary_loss_mlp": 0.01036143, + "balance_loss_clip": 1.25345182, + "balance_loss_mlp": 1.01673615, + "epoch": 0.6468059522020141, + "flos": 17977453580160.0, + "grad_norm": 1.4278562692587216, + "language_loss": 0.74999726, + "learning_rate": 1.1718117464548734e-06, + "loss": 0.77450466, + "num_input_tokens_seen": 232142905, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.1940918, + "step": 10758, + "time_per_iteration": 4.4209043979644775 + }, + { + "auxiliary_loss_clip": 0.01433068, + "auxiliary_loss_mlp": 0.01035103, + "balance_loss_clip": 1.26589131, + "balance_loss_mlp": 1.01440871, + "epoch": 0.6468660754546821, + "flos": 17897949187200.0, + "grad_norm": 1.8996781043691047, + "language_loss": 0.68820763, + "learning_rate": 1.1714572608698845e-06, + "loss": 0.71288931, + "num_input_tokens_seen": 232162230, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.20703125, + "step": 10759, + "time_per_iteration": 2.8946831226348877 + }, + { + "auxiliary_loss_clip": 0.01449413, + "auxiliary_loss_mlp": 0.01039453, + "balance_loss_clip": 1.27751851, + "balance_loss_mlp": 1.01821017, + "epoch": 0.64692619870735, + "flos": 22611087717120.0, + "grad_norm": 1.8233904374703835, + "language_loss": 0.76313412, + "learning_rate": 1.1711028067033197e-06, + "loss": 0.78802276, + "num_input_tokens_seen": 232182700, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.21264648, + "step": 10760, + "time_per_iteration": 2.9149868488311768 + }, + { + "auxiliary_loss_clip": 0.01418029, + "auxiliary_loss_mlp": 0.01032349, + "balance_loss_clip": 1.25472069, + "balance_loss_mlp": 1.01358545, + "epoch": 0.646986321960018, + "flos": 49617975064320.0, + "grad_norm": 1.6561867437683997, + "language_loss": 0.66206241, + "learning_rate": 1.1707483839686194e-06, + "loss": 0.68656617, + "num_input_tokens_seen": 232208235, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.18762207, + "step": 10761, + "time_per_iteration": 3.131577491760254 + }, + { + "auxiliary_loss_clip": 0.01437139, + "auxiliary_loss_mlp": 0.01034077, + "balance_loss_clip": 1.27065694, + "balance_loss_mlp": 1.01372814, + "epoch": 0.6470464452126861, + "flos": 21918550521600.0, + "grad_norm": 2.0321387128391515, + "language_loss": 0.70659357, + "learning_rate": 1.1703939926792235e-06, + "loss": 0.73130572, + "num_input_tokens_seen": 232228720, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.20336914, + "step": 10762, + "time_per_iteration": 2.882758378982544 + }, + { + "auxiliary_loss_clip": 0.01436879, + "auxiliary_loss_mlp": 0.01036877, + "balance_loss_clip": 1.26736486, + "balance_loss_mlp": 1.01754105, + "epoch": 0.647106568465354, + "flos": 18113206538880.0, + "grad_norm": 1.800473494091686, + "language_loss": 0.82701796, + "learning_rate": 1.1700396328485705e-06, + "loss": 0.8517555, + "num_input_tokens_seen": 232244655, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.1932373, + "step": 10763, + "time_per_iteration": 2.8798041343688965 + }, + { + "auxiliary_loss_clip": 0.01192675, + "auxiliary_loss_mlp": 0.01012573, + "balance_loss_clip": 1.10217476, + "balance_loss_mlp": 0.99550217, + "epoch": 0.647166691718022, + "flos": 69510429843840.0, + "grad_norm": 0.7155594614219051, + "language_loss": 0.57803178, + "learning_rate": 1.1696853044900978e-06, + "loss": 0.6000843, + "num_input_tokens_seen": 232308685, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.17089844, + "step": 10764, + "time_per_iteration": 3.549649238586426 + }, + { + "auxiliary_loss_clip": 0.01414743, + "auxiliary_loss_mlp": 0.01031197, + "balance_loss_clip": 1.24978161, + "balance_loss_mlp": 1.01168239, + "epoch": 0.6472268149706899, + "flos": 34108551601920.0, + "grad_norm": 1.9864320295195048, + "language_loss": 0.61113292, + "learning_rate": 1.1693310076172413e-06, + "loss": 0.63559234, + "num_input_tokens_seen": 232327520, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19506836, + "step": 10765, + "time_per_iteration": 2.9923577308654785 + }, + { + "auxiliary_loss_clip": 0.01415168, + "auxiliary_loss_mlp": 0.01030868, + "balance_loss_clip": 1.25358903, + "balance_loss_mlp": 1.0112108, + "epoch": 0.6472869382233579, + "flos": 28122989529600.0, + "grad_norm": 1.904315398402937, + "language_loss": 0.63860655, + "learning_rate": 1.168976742243437e-06, + "loss": 0.66306686, + "num_input_tokens_seen": 232349025, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.1965332, + "step": 10766, + "time_per_iteration": 2.9702703952789307 + }, + { + "auxiliary_loss_clip": 0.01436202, + "auxiliary_loss_mlp": 0.01033868, + "balance_loss_clip": 1.26993954, + "balance_loss_mlp": 1.01320863, + "epoch": 0.6473470614760258, + "flos": 22502373166080.0, + "grad_norm": 1.6639522314277153, + "language_loss": 0.75861239, + "learning_rate": 1.1686225083821174e-06, + "loss": 0.78331304, + "num_input_tokens_seen": 232367835, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.20666504, + "step": 10767, + "time_per_iteration": 2.8857662677764893 + }, + { + "auxiliary_loss_clip": 0.01427218, + "auxiliary_loss_mlp": 0.01038912, + "balance_loss_clip": 1.26001263, + "balance_loss_mlp": 1.01819396, + "epoch": 0.6474071847286939, + "flos": 14547280631040.0, + "grad_norm": 2.032694836779207, + "language_loss": 0.7892257, + "learning_rate": 1.1682683060467153e-06, + "loss": 0.813887, + "num_input_tokens_seen": 232385840, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.20727539, + "step": 10768, + "time_per_iteration": 2.8729171752929688 + }, + { + "auxiliary_loss_clip": 0.0141315, + "auxiliary_loss_mlp": 0.01034349, + "balance_loss_clip": 1.24921846, + "balance_loss_mlp": 1.01544249, + "epoch": 0.6474673079813618, + "flos": 24109039180800.0, + "grad_norm": 1.7751970963889592, + "language_loss": 0.72359508, + "learning_rate": 1.167914135250663e-06, + "loss": 0.74807012, + "num_input_tokens_seen": 232406205, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.18896484, + "step": 10769, + "time_per_iteration": 2.9030165672302246 + }, + { + "auxiliary_loss_clip": 0.01417808, + "auxiliary_loss_mlp": 0.01033088, + "balance_loss_clip": 1.25580001, + "balance_loss_mlp": 1.0129056, + "epoch": 0.6475274312340298, + "flos": 14984491564800.0, + "grad_norm": 4.878751300495201, + "language_loss": 0.73656678, + "learning_rate": 1.1675599960073895e-06, + "loss": 0.76107574, + "num_input_tokens_seen": 232424995, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.2019043, + "step": 10770, + "time_per_iteration": 2.901657819747925 + }, + { + "auxiliary_loss_clip": 0.01444614, + "auxiliary_loss_mlp": 0.01037294, + "balance_loss_clip": 1.27232695, + "balance_loss_mlp": 1.01616979, + "epoch": 0.6475875544866977, + "flos": 25055997742080.0, + "grad_norm": 1.892982682952068, + "language_loss": 0.74211836, + "learning_rate": 1.167205888330325e-06, + "loss": 0.76693738, + "num_input_tokens_seen": 232445870, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.21118164, + "step": 10771, + "time_per_iteration": 2.9919731616973877 + }, + { + "auxiliary_loss_clip": 0.01431304, + "auxiliary_loss_mlp": 0.0103991, + "balance_loss_clip": 1.26576436, + "balance_loss_mlp": 1.02025235, + "epoch": 0.6476476777393657, + "flos": 16480859460480.0, + "grad_norm": 2.005794770355393, + "language_loss": 0.74627662, + "learning_rate": 1.1668518122328958e-06, + "loss": 0.77098876, + "num_input_tokens_seen": 232464285, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.19665527, + "step": 10772, + "time_per_iteration": 2.8533477783203125 + }, + { + "auxiliary_loss_clip": 0.01404887, + "auxiliary_loss_mlp": 0.01031805, + "balance_loss_clip": 1.24385333, + "balance_loss_mlp": 1.01262403, + "epoch": 0.6477078009920336, + "flos": 25823333871360.0, + "grad_norm": 1.5160822614139204, + "language_loss": 0.83856165, + "learning_rate": 1.1664977677285305e-06, + "loss": 0.86292851, + "num_input_tokens_seen": 232485815, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.19165039, + "step": 10773, + "time_per_iteration": 2.964881658554077 + }, + { + "auxiliary_loss_clip": 0.01417148, + "auxiliary_loss_mlp": 0.01035407, + "balance_loss_clip": 1.255059, + "balance_loss_mlp": 1.01584435, + "epoch": 0.6477679242447016, + "flos": 17685135054720.0, + "grad_norm": 1.4715790427150905, + "language_loss": 0.78926563, + "learning_rate": 1.1661437548306524e-06, + "loss": 0.81379122, + "num_input_tokens_seen": 232504875, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19580078, + "step": 10774, + "time_per_iteration": 2.8641586303710938 + }, + { + "auxiliary_loss_clip": 0.01432966, + "auxiliary_loss_mlp": 0.01040306, + "balance_loss_clip": 1.26422036, + "balance_loss_mlp": 1.01890802, + "epoch": 0.6478280474973696, + "flos": 21042047393280.0, + "grad_norm": 2.021756486080307, + "language_loss": 0.70558155, + "learning_rate": 1.1657897735526867e-06, + "loss": 0.73031425, + "num_input_tokens_seen": 232521945, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.21386719, + "step": 10775, + "time_per_iteration": 2.8877298831939697 + }, + { + "auxiliary_loss_clip": 0.01440047, + "auxiliary_loss_mlp": 0.01036979, + "balance_loss_clip": 1.26983941, + "balance_loss_mlp": 1.01734495, + "epoch": 0.6478881707500376, + "flos": 21627046402560.0, + "grad_norm": 1.8142985367208029, + "language_loss": 0.67052728, + "learning_rate": 1.1654358239080574e-06, + "loss": 0.69529754, + "num_input_tokens_seen": 232541500, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.19628906, + "step": 10776, + "time_per_iteration": 2.8824782371520996 + }, + { + "auxiliary_loss_clip": 0.01430314, + "auxiliary_loss_mlp": 0.010359, + "balance_loss_clip": 1.262501, + "balance_loss_mlp": 1.01575327, + "epoch": 0.6479482940027056, + "flos": 18451837756800.0, + "grad_norm": 2.5918182266615015, + "language_loss": 0.7958796, + "learning_rate": 1.1650819059101839e-06, + "loss": 0.82054174, + "num_input_tokens_seen": 232559720, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.20129395, + "step": 10777, + "time_per_iteration": 2.850985527038574 + }, + { + "auxiliary_loss_clip": 0.014208, + "auxiliary_loss_mlp": 0.01034056, + "balance_loss_clip": 1.25569665, + "balance_loss_mlp": 1.01380205, + "epoch": 0.6480084172553735, + "flos": 22174283986560.0, + "grad_norm": 2.9602066145499975, + "language_loss": 0.73872852, + "learning_rate": 1.1647280195724896e-06, + "loss": 0.76327705, + "num_input_tokens_seen": 232579370, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.20251465, + "step": 10778, + "time_per_iteration": 4.284914016723633 + }, + { + "auxiliary_loss_clip": 0.01417071, + "auxiliary_loss_mlp": 0.01034837, + "balance_loss_clip": 1.25377667, + "balance_loss_mlp": 1.01563203, + "epoch": 0.6480685405080415, + "flos": 24326875486080.0, + "grad_norm": 1.5905359241668007, + "language_loss": 0.78661883, + "learning_rate": 1.1643741649083923e-06, + "loss": 0.81113791, + "num_input_tokens_seen": 232600495, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19189453, + "step": 10779, + "time_per_iteration": 2.9020681381225586 + }, + { + "auxiliary_loss_clip": 0.01191475, + "auxiliary_loss_mlp": 0.01018323, + "balance_loss_clip": 1.10114992, + "balance_loss_mlp": 0.99762869, + "epoch": 0.6481286637607094, + "flos": 59920908969600.0, + "grad_norm": 0.719537901766192, + "language_loss": 0.59446484, + "learning_rate": 1.1640203419313095e-06, + "loss": 0.61656284, + "num_input_tokens_seen": 232663165, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.20703125, + "step": 10780, + "time_per_iteration": 3.3689403533935547 + }, + { + "auxiliary_loss_clip": 0.01421398, + "auxiliary_loss_mlp": 0.0103278, + "balance_loss_clip": 1.25754189, + "balance_loss_mlp": 1.01345658, + "epoch": 0.6481887870133775, + "flos": 25495244691840.0, + "grad_norm": 1.7103454507426306, + "language_loss": 0.79703748, + "learning_rate": 1.1636665506546599e-06, + "loss": 0.82157922, + "num_input_tokens_seen": 232683385, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.1932373, + "step": 10781, + "time_per_iteration": 2.927130699157715 + }, + { + "auxiliary_loss_clip": 0.01439785, + "auxiliary_loss_mlp": 0.0103403, + "balance_loss_clip": 1.27131295, + "balance_loss_mlp": 1.01321554, + "epoch": 0.6482489102660454, + "flos": 19937708858880.0, + "grad_norm": 1.9777326793566872, + "language_loss": 0.80099064, + "learning_rate": 1.1633127910918578e-06, + "loss": 0.82572877, + "num_input_tokens_seen": 232699095, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.20812988, + "step": 10782, + "time_per_iteration": 2.8674728870391846 + }, + { + "auxiliary_loss_clip": 0.01431096, + "auxiliary_loss_mlp": 0.01035833, + "balance_loss_clip": 1.26287234, + "balance_loss_mlp": 1.01561546, + "epoch": 0.6483090335187134, + "flos": 26990752936320.0, + "grad_norm": 1.9571870556305826, + "language_loss": 0.64896226, + "learning_rate": 1.1629590632563187e-06, + "loss": 0.67363155, + "num_input_tokens_seen": 232717920, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.20227051, + "step": 10783, + "time_per_iteration": 2.972557306289673 + }, + { + "auxiliary_loss_clip": 0.0143114, + "auxiliary_loss_mlp": 0.01034039, + "balance_loss_clip": 1.26181614, + "balance_loss_mlp": 1.0133208, + "epoch": 0.6483691567713813, + "flos": 25087651119360.0, + "grad_norm": 1.7514355288779386, + "language_loss": 0.8937996, + "learning_rate": 1.1626053671614561e-06, + "loss": 0.91845143, + "num_input_tokens_seen": 232737605, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.20727539, + "step": 10784, + "time_per_iteration": 2.8838706016540527 + }, + { + "auxiliary_loss_clip": 0.01413863, + "auxiliary_loss_mlp": 0.0103325, + "balance_loss_clip": 1.25021887, + "balance_loss_mlp": 1.01266265, + "epoch": 0.6484292800240493, + "flos": 16114239694080.0, + "grad_norm": 2.639473403194717, + "language_loss": 0.74483955, + "learning_rate": 1.1622517028206815e-06, + "loss": 0.76931071, + "num_input_tokens_seen": 232755110, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.20568848, + "step": 10785, + "time_per_iteration": 2.8852479457855225 + }, + { + "auxiliary_loss_clip": 0.01409105, + "auxiliary_loss_mlp": 0.0103315, + "balance_loss_clip": 1.24893081, + "balance_loss_mlp": 1.0133853, + "epoch": 0.6484894032767172, + "flos": 28852202275200.0, + "grad_norm": 1.4394198430908807, + "language_loss": 0.69889665, + "learning_rate": 1.1618980702474071e-06, + "loss": 0.72331917, + "num_input_tokens_seen": 232779040, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.19763184, + "step": 10786, + "time_per_iteration": 2.9695687294006348 + }, + { + "auxiliary_loss_clip": 0.01418454, + "auxiliary_loss_mlp": 0.01036097, + "balance_loss_clip": 1.25453138, + "balance_loss_mlp": 1.01618862, + "epoch": 0.6485495265293852, + "flos": 30239267437440.0, + "grad_norm": 2.059909296579217, + "language_loss": 0.72100306, + "learning_rate": 1.161544469455041e-06, + "loss": 0.74554861, + "num_input_tokens_seen": 232800515, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19885254, + "step": 10787, + "time_per_iteration": 2.9280216693878174 + }, + { + "auxiliary_loss_clip": 0.01424891, + "auxiliary_loss_mlp": 0.01035666, + "balance_loss_clip": 1.2584157, + "balance_loss_mlp": 1.01566315, + "epoch": 0.6486096497820532, + "flos": 20091288268800.0, + "grad_norm": 1.8672290681561614, + "language_loss": 0.8498888, + "learning_rate": 1.1611909004569934e-06, + "loss": 0.87449431, + "num_input_tokens_seen": 232818450, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19995117, + "step": 10788, + "time_per_iteration": 4.353024959564209 + }, + { + "auxiliary_loss_clip": 0.01425277, + "auxiliary_loss_mlp": 0.01033567, + "balance_loss_clip": 1.26053166, + "balance_loss_mlp": 1.01282406, + "epoch": 0.6486697730347212, + "flos": 17137987960320.0, + "grad_norm": 2.4795030154764213, + "language_loss": 0.7812587, + "learning_rate": 1.1608373632666708e-06, + "loss": 0.80584705, + "num_input_tokens_seen": 232834785, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.20751953, + "step": 10789, + "time_per_iteration": 2.843346357345581 + }, + { + "auxiliary_loss_clip": 0.01408988, + "auxiliary_loss_mlp": 0.01029255, + "balance_loss_clip": 1.24729204, + "balance_loss_mlp": 1.01046777, + "epoch": 0.6487298962873892, + "flos": 38926739854080.0, + "grad_norm": 1.800193102271303, + "language_loss": 0.77106726, + "learning_rate": 1.160483857897479e-06, + "loss": 0.79544961, + "num_input_tokens_seen": 232856050, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18762207, + "step": 10790, + "time_per_iteration": 3.004951238632202 + }, + { + "auxiliary_loss_clip": 0.01417563, + "auxiliary_loss_mlp": 0.01032283, + "balance_loss_clip": 1.25521827, + "balance_loss_mlp": 1.0134356, + "epoch": 0.6487900195400571, + "flos": 11955351692160.0, + "grad_norm": 4.246744081048147, + "language_loss": 0.61917138, + "learning_rate": 1.160130384362823e-06, + "loss": 0.64366984, + "num_input_tokens_seen": 232873945, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.18859863, + "step": 10791, + "time_per_iteration": 2.8411202430725098 + }, + { + "auxiliary_loss_clip": 0.01410649, + "auxiliary_loss_mlp": 0.01033591, + "balance_loss_clip": 1.24705398, + "balance_loss_mlp": 1.01337361, + "epoch": 0.6488501427927251, + "flos": 22354177887360.0, + "grad_norm": 1.6409577748980588, + "language_loss": 0.86622524, + "learning_rate": 1.1597769426761082e-06, + "loss": 0.89066768, + "num_input_tokens_seen": 232892160, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.20202637, + "step": 10792, + "time_per_iteration": 2.8722622394561768 + }, + { + "auxiliary_loss_clip": 0.01429189, + "auxiliary_loss_mlp": 0.01030461, + "balance_loss_clip": 1.26101065, + "balance_loss_mlp": 1.01008821, + "epoch": 0.648910266045393, + "flos": 22246322987520.0, + "grad_norm": 1.8794208881363526, + "language_loss": 0.78869754, + "learning_rate": 1.159423532850735e-06, + "loss": 0.81329405, + "num_input_tokens_seen": 232911725, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.20361328, + "step": 10793, + "time_per_iteration": 5.780864715576172 + }, + { + "auxiliary_loss_clip": 0.0143166, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.26503038, + "balance_loss_mlp": 1.01362705, + "epoch": 0.6489703892980611, + "flos": 25312138410240.0, + "grad_norm": 3.5118481304509994, + "language_loss": 0.75215137, + "learning_rate": 1.1590701549001055e-06, + "loss": 0.77680904, + "num_input_tokens_seen": 232929085, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.20483398, + "step": 10794, + "time_per_iteration": 2.914161205291748 + }, + { + "auxiliary_loss_clip": 0.01417889, + "auxiliary_loss_mlp": 0.01034504, + "balance_loss_clip": 1.25306702, + "balance_loss_mlp": 1.0142622, + "epoch": 0.649030512550729, + "flos": 24582699440640.0, + "grad_norm": 1.8203083284019141, + "language_loss": 0.70877624, + "learning_rate": 1.158716808837621e-06, + "loss": 0.73330021, + "num_input_tokens_seen": 232949455, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.20239258, + "step": 10795, + "time_per_iteration": 2.9107322692871094 + }, + { + "auxiliary_loss_clip": 0.01437418, + "auxiliary_loss_mlp": 0.01034162, + "balance_loss_clip": 1.26825929, + "balance_loss_mlp": 1.01344323, + "epoch": 0.649090635803397, + "flos": 26254482001920.0, + "grad_norm": 1.7451116402078855, + "language_loss": 0.54861856, + "learning_rate": 1.158363494676679e-06, + "loss": 0.57333434, + "num_input_tokens_seen": 232969445, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.20715332, + "step": 10796, + "time_per_iteration": 2.891923427581787 + }, + { + "auxiliary_loss_clip": 0.01430087, + "auxiliary_loss_mlp": 0.01033139, + "balance_loss_clip": 1.26389396, + "balance_loss_mlp": 1.01329041, + "epoch": 0.6491507590560649, + "flos": 24948414311040.0, + "grad_norm": 1.4758619001465354, + "language_loss": 0.78632379, + "learning_rate": 1.1580102124306775e-06, + "loss": 0.81095606, + "num_input_tokens_seen": 232988900, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.19824219, + "step": 10797, + "time_per_iteration": 2.9002041816711426 + }, + { + "auxiliary_loss_clip": 0.01409489, + "auxiliary_loss_mlp": 0.01032025, + "balance_loss_clip": 1.24944067, + "balance_loss_mlp": 1.01291537, + "epoch": 0.6492108823087329, + "flos": 19509094437120.0, + "grad_norm": 1.9160505620612323, + "language_loss": 0.7071563, + "learning_rate": 1.1576569621130134e-06, + "loss": 0.73157144, + "num_input_tokens_seen": 233005060, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.19104004, + "step": 10798, + "time_per_iteration": 2.914146661758423 + }, + { + "auxiliary_loss_clip": 0.01418714, + "auxiliary_loss_mlp": 0.0102911, + "balance_loss_clip": 1.25390637, + "balance_loss_mlp": 1.0104183, + "epoch": 0.6492710055614008, + "flos": 19728831024000.0, + "grad_norm": 1.6809956389635154, + "language_loss": 0.77230012, + "learning_rate": 1.1573037437370811e-06, + "loss": 0.79677838, + "num_input_tokens_seen": 233023375, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.18688965, + "step": 10799, + "time_per_iteration": 2.9596080780029297 + }, + { + "auxiliary_loss_clip": 0.01433891, + "auxiliary_loss_mlp": 0.01034547, + "balance_loss_clip": 1.26273155, + "balance_loss_mlp": 1.01382852, + "epoch": 0.6493311288140688, + "flos": 24327870871680.0, + "grad_norm": 2.0868608733626117, + "language_loss": 0.72311419, + "learning_rate": 1.1569505573162755e-06, + "loss": 0.74779856, + "num_input_tokens_seen": 233043130, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.20727539, + "step": 10800, + "time_per_iteration": 2.9081668853759766 + }, + { + "auxiliary_loss_clip": 0.01192534, + "auxiliary_loss_mlp": 0.01016541, + "balance_loss_clip": 1.10063183, + "balance_loss_mlp": 1.00004244, + "epoch": 0.6493912520667368, + "flos": 70964149875840.0, + "grad_norm": 0.7765422169870086, + "language_loss": 0.60327303, + "learning_rate": 1.1565974028639897e-06, + "loss": 0.62536383, + "num_input_tokens_seen": 233110560, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.16503906, + "step": 10801, + "time_per_iteration": 3.4809653759002686 + }, + { + "auxiliary_loss_clip": 0.01438971, + "auxiliary_loss_mlp": 0.01040218, + "balance_loss_clip": 1.27185535, + "balance_loss_mlp": 1.01938057, + "epoch": 0.6494513753194048, + "flos": 25348678225920.0, + "grad_norm": 1.6563104734776464, + "language_loss": 0.79599309, + "learning_rate": 1.156244280393614e-06, + "loss": 0.82078493, + "num_input_tokens_seen": 233130080, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.20837402, + "step": 10802, + "time_per_iteration": 2.9208619594573975 + }, + { + "auxiliary_loss_clip": 0.01419742, + "auxiliary_loss_mlp": 0.0103955, + "balance_loss_clip": 1.25460172, + "balance_loss_mlp": 1.01890302, + "epoch": 0.6495114985720728, + "flos": 24692952314880.0, + "grad_norm": 1.6383561074643163, + "language_loss": 0.75090212, + "learning_rate": 1.155891189918541e-06, + "loss": 0.77549505, + "num_input_tokens_seen": 233150235, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.2064209, + "step": 10803, + "time_per_iteration": 2.913355588912964 + }, + { + "auxiliary_loss_clip": 0.01423097, + "auxiliary_loss_mlp": 0.01031053, + "balance_loss_clip": 1.25746584, + "balance_loss_mlp": 1.01225317, + "epoch": 0.6495716218247407, + "flos": 23659476416640.0, + "grad_norm": 2.6692424920739666, + "language_loss": 0.70923579, + "learning_rate": 1.1555381314521578e-06, + "loss": 0.73377728, + "num_input_tokens_seen": 233166710, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.18811035, + "step": 10804, + "time_per_iteration": 2.8485493659973145 + }, + { + "auxiliary_loss_clip": 0.01404628, + "auxiliary_loss_mlp": 0.010359, + "balance_loss_clip": 1.24214244, + "balance_loss_mlp": 1.01553881, + "epoch": 0.6496317450774087, + "flos": 22356349637760.0, + "grad_norm": 2.17417021306463, + "language_loss": 0.73736525, + "learning_rate": 1.1551851050078537e-06, + "loss": 0.76177061, + "num_input_tokens_seen": 233185445, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.20349121, + "step": 10805, + "time_per_iteration": 2.898684501647949 + }, + { + "auxiliary_loss_clip": 0.01424601, + "auxiliary_loss_mlp": 0.01028534, + "balance_loss_clip": 1.25753832, + "balance_loss_mlp": 1.009794, + "epoch": 0.6496918683300766, + "flos": 30530862046080.0, + "grad_norm": 2.647061799862123, + "language_loss": 0.67076588, + "learning_rate": 1.1548321105990155e-06, + "loss": 0.69529718, + "num_input_tokens_seen": 233205805, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.18725586, + "step": 10806, + "time_per_iteration": 2.922217845916748 + }, + { + "auxiliary_loss_clip": 0.01437755, + "auxiliary_loss_mlp": 0.01028845, + "balance_loss_clip": 1.26798487, + "balance_loss_mlp": 1.00928235, + "epoch": 0.6497519915827447, + "flos": 12466728132480.0, + "grad_norm": 2.6229756591887696, + "language_loss": 0.79886878, + "learning_rate": 1.1544791482390275e-06, + "loss": 0.82353479, + "num_input_tokens_seen": 233224215, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.19555664, + "step": 10807, + "time_per_iteration": 2.8560972213745117 + }, + { + "auxiliary_loss_clip": 0.01195043, + "auxiliary_loss_mlp": 0.01009413, + "balance_loss_clip": 1.10266161, + "balance_loss_mlp": 0.99320096, + "epoch": 0.6498121148354126, + "flos": 69127947135360.0, + "grad_norm": 0.7754770142799517, + "language_loss": 0.58927572, + "learning_rate": 1.1541262179412745e-06, + "loss": 0.61132026, + "num_input_tokens_seen": 233294440, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.16210938, + "step": 10808, + "time_per_iteration": 3.520206928253174 + }, + { + "auxiliary_loss_clip": 0.0142725, + "auxiliary_loss_mlp": 0.01030815, + "balance_loss_clip": 1.26474071, + "balance_loss_mlp": 1.0112288, + "epoch": 0.6498722380880806, + "flos": 36909313130880.0, + "grad_norm": 1.7015686339212714, + "language_loss": 0.63571703, + "learning_rate": 1.1537733197191415e-06, + "loss": 0.66029763, + "num_input_tokens_seen": 233316125, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19580078, + "step": 10809, + "time_per_iteration": 3.0098509788513184 + }, + { + "auxiliary_loss_clip": 0.01421562, + "auxiliary_loss_mlp": 0.01031618, + "balance_loss_clip": 1.25788927, + "balance_loss_mlp": 1.01266336, + "epoch": 0.6499323613407485, + "flos": 29028295612800.0, + "grad_norm": 1.7258266101876474, + "language_loss": 0.81447649, + "learning_rate": 1.153420453586008e-06, + "loss": 0.83900833, + "num_input_tokens_seen": 233336140, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.1895752, + "step": 10810, + "time_per_iteration": 2.918860673904419 + }, + { + "auxiliary_loss_clip": 0.01410708, + "auxiliary_loss_mlp": 0.01036047, + "balance_loss_clip": 1.25034809, + "balance_loss_mlp": 1.01630592, + "epoch": 0.6499924845934165, + "flos": 20128506756480.0, + "grad_norm": 1.6046377297650405, + "language_loss": 0.72510183, + "learning_rate": 1.1530676195552561e-06, + "loss": 0.74956942, + "num_input_tokens_seen": 233356095, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19750977, + "step": 10811, + "time_per_iteration": 2.8658456802368164 + }, + { + "auxiliary_loss_clip": 0.01414286, + "auxiliary_loss_mlp": 0.01030899, + "balance_loss_clip": 1.25513959, + "balance_loss_mlp": 1.01201606, + "epoch": 0.6500526078460844, + "flos": 24430884577920.0, + "grad_norm": 1.7663003169552878, + "language_loss": 0.78361005, + "learning_rate": 1.1527148176402649e-06, + "loss": 0.8080619, + "num_input_tokens_seen": 233376830, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18884277, + "step": 10812, + "time_per_iteration": 2.9117116928100586 + }, + { + "auxiliary_loss_clip": 0.01431202, + "auxiliary_loss_mlp": 0.01033879, + "balance_loss_clip": 1.26549864, + "balance_loss_mlp": 1.01476943, + "epoch": 0.6501127310987524, + "flos": 23341386337920.0, + "grad_norm": 1.8815826554205881, + "language_loss": 0.8551116, + "learning_rate": 1.152362047854413e-06, + "loss": 0.87976247, + "num_input_tokens_seen": 233395275, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.19128418, + "step": 10813, + "time_per_iteration": 4.422515392303467 + }, + { + "auxiliary_loss_clip": 0.01421846, + "auxiliary_loss_mlp": 0.01033571, + "balance_loss_clip": 1.25796974, + "balance_loss_mlp": 1.01487947, + "epoch": 0.6501728543514204, + "flos": 18707118773760.0, + "grad_norm": 1.5676059544669247, + "language_loss": 0.80493176, + "learning_rate": 1.1520093102110764e-06, + "loss": 0.82948595, + "num_input_tokens_seen": 233413345, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.18701172, + "step": 10814, + "time_per_iteration": 2.82851505279541 + }, + { + "auxiliary_loss_clip": 0.01449675, + "auxiliary_loss_mlp": 0.0104044, + "balance_loss_clip": 1.28000116, + "balance_loss_mlp": 1.01992369, + "epoch": 0.6502329776040884, + "flos": 44215375985280.0, + "grad_norm": 1.5060618998302724, + "language_loss": 0.6609658, + "learning_rate": 1.1516566047236328e-06, + "loss": 0.68586695, + "num_input_tokens_seen": 233436105, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.20507812, + "step": 10815, + "time_per_iteration": 3.0241148471832275 + }, + { + "auxiliary_loss_clip": 0.01446317, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.27545822, + "balance_loss_mlp": 1.0143187, + "epoch": 0.6502931008567564, + "flos": 14582915550720.0, + "grad_norm": 1.8931937509034875, + "language_loss": 0.76560485, + "learning_rate": 1.1513039314054546e-06, + "loss": 0.7904278, + "num_input_tokens_seen": 233452320, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.2166748, + "step": 10816, + "time_per_iteration": 2.828721523284912 + }, + { + "auxiliary_loss_clip": 0.01422423, + "auxiliary_loss_mlp": 0.01034014, + "balance_loss_clip": 1.26086211, + "balance_loss_mlp": 1.01408172, + "epoch": 0.6503532241094243, + "flos": 21404278414080.0, + "grad_norm": 1.713786047523561, + "language_loss": 0.73803806, + "learning_rate": 1.1509512902699174e-06, + "loss": 0.76260245, + "num_input_tokens_seen": 233469920, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19934082, + "step": 10817, + "time_per_iteration": 2.8730146884918213 + }, + { + "auxiliary_loss_clip": 0.0142914, + "auxiliary_loss_mlp": 0.01033173, + "balance_loss_clip": 1.26202047, + "balance_loss_mlp": 1.01299095, + "epoch": 0.6504133473620923, + "flos": 74764906151040.0, + "grad_norm": 1.4628871938314734, + "language_loss": 0.7235741, + "learning_rate": 1.1505986813303916e-06, + "loss": 0.7481972, + "num_input_tokens_seen": 233499780, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.20178223, + "step": 10818, + "time_per_iteration": 3.275144338607788 + }, + { + "auxiliary_loss_clip": 0.01441011, + "auxiliary_loss_mlp": 0.01036289, + "balance_loss_clip": 1.27258575, + "balance_loss_mlp": 1.01592851, + "epoch": 0.6504734706147602, + "flos": 19721229897600.0, + "grad_norm": 1.9788829466570557, + "language_loss": 0.65609288, + "learning_rate": 1.150246104600249e-06, + "loss": 0.68086594, + "num_input_tokens_seen": 233518235, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.20349121, + "step": 10819, + "time_per_iteration": 2.853773832321167 + }, + { + "auxiliary_loss_clip": 0.01437776, + "auxiliary_loss_mlp": 0.0103537, + "balance_loss_clip": 1.2690177, + "balance_loss_mlp": 1.01543868, + "epoch": 0.6505335938674283, + "flos": 25567871875200.0, + "grad_norm": 2.9241708587515385, + "language_loss": 0.84463441, + "learning_rate": 1.14989356009286e-06, + "loss": 0.86936587, + "num_input_tokens_seen": 233535215, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.19934082, + "step": 10820, + "time_per_iteration": 2.900480270385742 + }, + { + "auxiliary_loss_clip": 0.01439215, + "auxiliary_loss_mlp": 0.01037823, + "balance_loss_clip": 1.27068043, + "balance_loss_mlp": 1.01753342, + "epoch": 0.6505937171200962, + "flos": 17830298931840.0, + "grad_norm": 2.190884819955326, + "language_loss": 0.78897083, + "learning_rate": 1.1495410478215914e-06, + "loss": 0.81374121, + "num_input_tokens_seen": 233552775, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.20275879, + "step": 10821, + "time_per_iteration": 2.8452041149139404 + }, + { + "auxiliary_loss_clip": 0.01407094, + "auxiliary_loss_mlp": 0.01029954, + "balance_loss_clip": 1.24794054, + "balance_loss_mlp": 1.01207304, + "epoch": 0.6506538403727642, + "flos": 20677961335680.0, + "grad_norm": 1.5892807158149256, + "language_loss": 0.80268013, + "learning_rate": 1.1491885677998126e-06, + "loss": 0.82705057, + "num_input_tokens_seen": 233572080, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.17871094, + "step": 10822, + "time_per_iteration": 2.842895984649658 + }, + { + "auxiliary_loss_clip": 0.01413289, + "auxiliary_loss_mlp": 0.01032864, + "balance_loss_clip": 1.2510196, + "balance_loss_mlp": 1.0125035, + "epoch": 0.6507139636254321, + "flos": 11726928103680.0, + "grad_norm": 1.9889910299914795, + "language_loss": 0.88208371, + "learning_rate": 1.1488361200408883e-06, + "loss": 0.90654522, + "num_input_tokens_seen": 233589155, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.20349121, + "step": 10823, + "time_per_iteration": 4.246881484985352 + }, + { + "auxiliary_loss_clip": 0.01420551, + "auxiliary_loss_mlp": 0.0103419, + "balance_loss_clip": 1.25519848, + "balance_loss_mlp": 1.0142101, + "epoch": 0.6507740868781001, + "flos": 26773278589440.0, + "grad_norm": 1.5621854874731897, + "language_loss": 0.67218816, + "learning_rate": 1.148483704558183e-06, + "loss": 0.6967355, + "num_input_tokens_seen": 233608180, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.1998291, + "step": 10824, + "time_per_iteration": 2.9019312858581543 + }, + { + "auxiliary_loss_clip": 0.01428583, + "auxiliary_loss_mlp": 0.0103378, + "balance_loss_clip": 1.25975156, + "balance_loss_mlp": 1.01395571, + "epoch": 0.650834210130768, + "flos": 16480316522880.0, + "grad_norm": 2.7681032417487708, + "language_loss": 0.88181233, + "learning_rate": 1.1481313213650607e-06, + "loss": 0.90643603, + "num_input_tokens_seen": 233625750, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.19836426, + "step": 10825, + "time_per_iteration": 2.932523250579834 + }, + { + "auxiliary_loss_clip": 0.01425716, + "auxiliary_loss_mlp": 0.01031274, + "balance_loss_clip": 1.25713396, + "balance_loss_mlp": 1.01131821, + "epoch": 0.650894333383436, + "flos": 17137490267520.0, + "grad_norm": 2.99633255532371, + "language_loss": 0.74497783, + "learning_rate": 1.147778970474885e-06, + "loss": 0.76954776, + "num_input_tokens_seen": 233644235, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.19970703, + "step": 10826, + "time_per_iteration": 2.8330492973327637 + }, + { + "auxiliary_loss_clip": 0.01420964, + "auxiliary_loss_mlp": 0.01037567, + "balance_loss_clip": 1.25506878, + "balance_loss_mlp": 1.01785016, + "epoch": 0.650954456636104, + "flos": 18743522855040.0, + "grad_norm": 5.3702311835114145, + "language_loss": 0.70179725, + "learning_rate": 1.1474266519010157e-06, + "loss": 0.72638261, + "num_input_tokens_seen": 233662845, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.19726562, + "step": 10827, + "time_per_iteration": 2.838696241378784 + }, + { + "auxiliary_loss_clip": 0.01437621, + "auxiliary_loss_mlp": 0.01032288, + "balance_loss_clip": 1.26974964, + "balance_loss_mlp": 1.01345277, + "epoch": 0.651014579888772, + "flos": 24537155909760.0, + "grad_norm": 1.8224506955910706, + "language_loss": 0.77761674, + "learning_rate": 1.1470743656568136e-06, + "loss": 0.80231589, + "num_input_tokens_seen": 233681990, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.18847656, + "step": 10828, + "time_per_iteration": 5.685888290405273 + }, + { + "auxiliary_loss_clip": 0.01435235, + "auxiliary_loss_mlp": 0.01035664, + "balance_loss_clip": 1.27029979, + "balance_loss_mlp": 1.01542211, + "epoch": 0.65107470314144, + "flos": 24071730203520.0, + "grad_norm": 2.181255231826281, + "language_loss": 0.89926839, + "learning_rate": 1.1467221117556362e-06, + "loss": 0.92397738, + "num_input_tokens_seen": 233698930, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.20239258, + "step": 10829, + "time_per_iteration": 2.8604989051818848 + }, + { + "auxiliary_loss_clip": 0.01193448, + "auxiliary_loss_mlp": 0.01027671, + "balance_loss_clip": 1.1003089, + "balance_loss_mlp": 1.01193511, + "epoch": 0.6511348263941079, + "flos": 72514205366400.0, + "grad_norm": 0.653131509950109, + "language_loss": 0.55427498, + "learning_rate": 1.1463698902108428e-06, + "loss": 0.57648617, + "num_input_tokens_seen": 233769825, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.15722656, + "step": 10830, + "time_per_iteration": 3.5037951469421387 + }, + { + "auxiliary_loss_clip": 0.0143743, + "auxiliary_loss_mlp": 0.01038154, + "balance_loss_clip": 1.26698542, + "balance_loss_mlp": 1.01811469, + "epoch": 0.6511949496467759, + "flos": 23378107132800.0, + "grad_norm": 1.7547823038219335, + "language_loss": 0.75779724, + "learning_rate": 1.1460177010357878e-06, + "loss": 0.78255308, + "num_input_tokens_seen": 233787095, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.20031738, + "step": 10831, + "time_per_iteration": 2.8659768104553223 + }, + { + "auxiliary_loss_clip": 0.01193968, + "auxiliary_loss_mlp": 0.01020899, + "balance_loss_clip": 1.10237336, + "balance_loss_mlp": 1.00468671, + "epoch": 0.6512550728994438, + "flos": 67364534574720.0, + "grad_norm": 0.6561742051579238, + "language_loss": 0.51044691, + "learning_rate": 1.145665544243828e-06, + "loss": 0.53259552, + "num_input_tokens_seen": 233853050, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.16210938, + "step": 10832, + "time_per_iteration": 3.427170753479004 + }, + { + "auxiliary_loss_clip": 0.01434551, + "auxiliary_loss_mlp": 0.01034416, + "balance_loss_clip": 1.26495671, + "balance_loss_mlp": 1.01488972, + "epoch": 0.6513151961521119, + "flos": 21151440616320.0, + "grad_norm": 1.9948949847177644, + "language_loss": 0.84735477, + "learning_rate": 1.145313419848316e-06, + "loss": 0.87204444, + "num_input_tokens_seen": 233871385, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.19543457, + "step": 10833, + "time_per_iteration": 2.871377468109131 + }, + { + "auxiliary_loss_clip": 0.01428674, + "auxiliary_loss_mlp": 0.01032183, + "balance_loss_clip": 1.26313829, + "balance_loss_mlp": 1.01191747, + "epoch": 0.6513753194047798, + "flos": 15167235888000.0, + "grad_norm": 3.720737024636619, + "language_loss": 0.84840405, + "learning_rate": 1.1449613278626049e-06, + "loss": 0.87301266, + "num_input_tokens_seen": 233888175, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.20288086, + "step": 10834, + "time_per_iteration": 2.8400421142578125 + }, + { + "auxiliary_loss_clip": 0.0143296, + "auxiliary_loss_mlp": 0.01038692, + "balance_loss_clip": 1.2663908, + "balance_loss_mlp": 1.01923752, + "epoch": 0.6514354426574478, + "flos": 30238724499840.0, + "grad_norm": 1.5574545451022714, + "language_loss": 0.77305716, + "learning_rate": 1.1446092683000455e-06, + "loss": 0.79777366, + "num_input_tokens_seen": 233911470, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.19445801, + "step": 10835, + "time_per_iteration": 3.0243070125579834 + }, + { + "auxiliary_loss_clip": 0.01437507, + "auxiliary_loss_mlp": 0.01032977, + "balance_loss_clip": 1.27133191, + "balance_loss_mlp": 1.01309276, + "epoch": 0.6514955659101157, + "flos": 24215898695040.0, + "grad_norm": 1.4570842421290429, + "language_loss": 0.78132236, + "learning_rate": 1.1442572411739882e-06, + "loss": 0.80602717, + "num_input_tokens_seen": 233932135, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19885254, + "step": 10836, + "time_per_iteration": 2.9367830753326416 + }, + { + "auxiliary_loss_clip": 0.01427604, + "auxiliary_loss_mlp": 0.01037212, + "balance_loss_clip": 1.26025009, + "balance_loss_mlp": 1.01750672, + "epoch": 0.6515556891627837, + "flos": 12383332686720.0, + "grad_norm": 2.0505950002557194, + "language_loss": 0.83116281, + "learning_rate": 1.143905246497783e-06, + "loss": 0.85581094, + "num_input_tokens_seen": 233947880, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.19702148, + "step": 10837, + "time_per_iteration": 2.8523926734924316 + }, + { + "auxiliary_loss_clip": 0.01415026, + "auxiliary_loss_mlp": 0.01033557, + "balance_loss_clip": 1.25239158, + "balance_loss_mlp": 1.01229048, + "epoch": 0.6516158124154516, + "flos": 49618518001920.0, + "grad_norm": 2.669869087632107, + "language_loss": 0.59803808, + "learning_rate": 1.1435532842847758e-06, + "loss": 0.6225239, + "num_input_tokens_seen": 233971475, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.21276855, + "step": 10838, + "time_per_iteration": 3.1150951385498047 + }, + { + "auxiliary_loss_clip": 0.01192959, + "auxiliary_loss_mlp": 0.01018813, + "balance_loss_clip": 1.10060692, + "balance_loss_mlp": 1.00202835, + "epoch": 0.6516759356681197, + "flos": 59730834988800.0, + "grad_norm": 0.7304456712723424, + "language_loss": 0.61035275, + "learning_rate": 1.1432013545483147e-06, + "loss": 0.63247049, + "num_input_tokens_seen": 234030690, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.16796875, + "step": 10839, + "time_per_iteration": 3.4331166744232178 + }, + { + "auxiliary_loss_clip": 0.0141848, + "auxiliary_loss_mlp": 0.010339, + "balance_loss_clip": 1.25602102, + "balance_loss_mlp": 1.01543427, + "epoch": 0.6517360589207876, + "flos": 37464785268480.0, + "grad_norm": 1.5900251191658636, + "language_loss": 0.6832785, + "learning_rate": 1.1428494573017439e-06, + "loss": 0.7078023, + "num_input_tokens_seen": 234052470, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18457031, + "step": 10840, + "time_per_iteration": 3.039977550506592 + }, + { + "auxiliary_loss_clip": 0.01412542, + "auxiliary_loss_mlp": 0.01033514, + "balance_loss_clip": 1.24890566, + "balance_loss_mlp": 1.01450038, + "epoch": 0.6517961821734556, + "flos": 25385987203200.0, + "grad_norm": 1.96222045931857, + "language_loss": 0.74697983, + "learning_rate": 1.1424975925584071e-06, + "loss": 0.77144039, + "num_input_tokens_seen": 234071495, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19018555, + "step": 10841, + "time_per_iteration": 2.944181442260742 + }, + { + "auxiliary_loss_clip": 0.01422863, + "auxiliary_loss_mlp": 0.01037086, + "balance_loss_clip": 1.25585675, + "balance_loss_mlp": 1.01704693, + "epoch": 0.6518563054261236, + "flos": 28778670195840.0, + "grad_norm": 2.4693666720722436, + "language_loss": 0.63268518, + "learning_rate": 1.142145760331648e-06, + "loss": 0.65728462, + "num_input_tokens_seen": 234092325, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.20031738, + "step": 10842, + "time_per_iteration": 2.9586222171783447 + }, + { + "auxiliary_loss_clip": 0.01198548, + "auxiliary_loss_mlp": 0.01041145, + "balance_loss_clip": 1.1038866, + "balance_loss_mlp": 1.02416945, + "epoch": 0.6519164286787915, + "flos": 68952921690240.0, + "grad_norm": 0.8242824711016649, + "language_loss": 0.56138325, + "learning_rate": 1.141793960634807e-06, + "loss": 0.58378017, + "num_input_tokens_seen": 234148005, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.16992188, + "step": 10843, + "time_per_iteration": 3.117135763168335 + }, + { + "auxiliary_loss_clip": 0.01447384, + "auxiliary_loss_mlp": 0.01040285, + "balance_loss_clip": 1.27449393, + "balance_loss_mlp": 1.02022254, + "epoch": 0.6519765519314595, + "flos": 20449809216000.0, + "grad_norm": 1.6050341062999347, + "language_loss": 0.83489698, + "learning_rate": 1.1414421934812253e-06, + "loss": 0.8597737, + "num_input_tokens_seen": 234164280, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.20068359, + "step": 10844, + "time_per_iteration": 2.8662898540496826 + }, + { + "auxiliary_loss_clip": 0.01417047, + "auxiliary_loss_mlp": 0.01032184, + "balance_loss_clip": 1.25093913, + "balance_loss_mlp": 1.01008296, + "epoch": 0.6520366751841274, + "flos": 28414765117440.0, + "grad_norm": 1.8549469549301951, + "language_loss": 0.60625607, + "learning_rate": 1.1410904588842421e-06, + "loss": 0.63074839, + "num_input_tokens_seen": 234185090, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.2208252, + "step": 10845, + "time_per_iteration": 2.9218716621398926 + }, + { + "auxiliary_loss_clip": 0.01424742, + "auxiliary_loss_mlp": 0.01035068, + "balance_loss_clip": 1.25708818, + "balance_loss_mlp": 1.01535058, + "epoch": 0.6520967984367955, + "flos": 22283631964800.0, + "grad_norm": 1.7747723740844616, + "language_loss": 0.80064392, + "learning_rate": 1.140738756857194e-06, + "loss": 0.82524204, + "num_input_tokens_seen": 234204050, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.19714355, + "step": 10846, + "time_per_iteration": 2.86114239692688 + }, + { + "auxiliary_loss_clip": 0.01193036, + "auxiliary_loss_mlp": 0.01034025, + "balance_loss_clip": 1.10028601, + "balance_loss_mlp": 1.01495123, + "epoch": 0.6521569216894634, + "flos": 68952803938560.0, + "grad_norm": 0.7104997386678515, + "language_loss": 0.60268444, + "learning_rate": 1.1403870874134192e-06, + "loss": 0.62495506, + "num_input_tokens_seen": 234269790, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.19042969, + "step": 10847, + "time_per_iteration": 3.3894426822662354 + }, + { + "auxiliary_loss_clip": 0.01435802, + "auxiliary_loss_mlp": 0.01039167, + "balance_loss_clip": 1.26622343, + "balance_loss_mlp": 1.01878262, + "epoch": 0.6522170449421314, + "flos": 29141941847040.0, + "grad_norm": 1.4546805032112704, + "language_loss": 0.81260204, + "learning_rate": 1.1400354505662514e-06, + "loss": 0.83735174, + "num_input_tokens_seen": 234290135, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.20385742, + "step": 10848, + "time_per_iteration": 2.9208948612213135 + }, + { + "auxiliary_loss_clip": 0.0141928, + "auxiliary_loss_mlp": 0.01037582, + "balance_loss_clip": 1.25449443, + "balance_loss_mlp": 1.01723289, + "epoch": 0.6522771681947993, + "flos": 26663342428800.0, + "grad_norm": 2.333687245797798, + "language_loss": 0.75752842, + "learning_rate": 1.1396838463290263e-06, + "loss": 0.78209704, + "num_input_tokens_seen": 234309535, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.20324707, + "step": 10849, + "time_per_iteration": 4.293798208236694 + }, + { + "auxiliary_loss_clip": 0.01417804, + "auxiliary_loss_mlp": 0.01034495, + "balance_loss_clip": 1.25579023, + "balance_loss_mlp": 1.01484954, + "epoch": 0.6523372914474673, + "flos": 25750932912000.0, + "grad_norm": 1.3594619538019048, + "language_loss": 0.6864472, + "learning_rate": 1.1393322747150752e-06, + "loss": 0.71097016, + "num_input_tokens_seen": 234328755, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.1965332, + "step": 10850, + "time_per_iteration": 2.929518222808838 + }, + { + "auxiliary_loss_clip": 0.01413865, + "auxiliary_loss_mlp": 0.01032691, + "balance_loss_clip": 1.25128865, + "balance_loss_mlp": 1.01229417, + "epoch": 0.6523974147001352, + "flos": 24838206681600.0, + "grad_norm": 1.946083477422338, + "language_loss": 0.67766792, + "learning_rate": 1.1389807357377313e-06, + "loss": 0.70213342, + "num_input_tokens_seen": 234348655, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.20373535, + "step": 10851, + "time_per_iteration": 2.923574686050415 + }, + { + "auxiliary_loss_clip": 0.01440494, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.27244556, + "balance_loss_mlp": 1.01797962, + "epoch": 0.6524575379528033, + "flos": 26327425898880.0, + "grad_norm": 2.294505267370526, + "language_loss": 0.74465001, + "learning_rate": 1.1386292294103235e-06, + "loss": 0.76942885, + "num_input_tokens_seen": 234367445, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.1940918, + "step": 10852, + "time_per_iteration": 2.9678995609283447 + }, + { + "auxiliary_loss_clip": 0.01435877, + "auxiliary_loss_mlp": 0.01035573, + "balance_loss_clip": 1.2647872, + "balance_loss_mlp": 1.01431775, + "epoch": 0.6525176612054712, + "flos": 19501991003520.0, + "grad_norm": 2.104392173534739, + "language_loss": 0.67071384, + "learning_rate": 1.1382777557461812e-06, + "loss": 0.69542837, + "num_input_tokens_seen": 234384825, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.21264648, + "step": 10853, + "time_per_iteration": 2.8843600749969482 + }, + { + "auxiliary_loss_clip": 0.01192539, + "auxiliary_loss_mlp": 0.01041337, + "balance_loss_clip": 1.10001087, + "balance_loss_mlp": 1.01711369, + "epoch": 0.6525777844581392, + "flos": 71738182235520.0, + "grad_norm": 0.7256362626580298, + "language_loss": 0.63077074, + "learning_rate": 1.137926314758634e-06, + "loss": 0.65310949, + "num_input_tokens_seen": 234450630, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.2421875, + "step": 10854, + "time_per_iteration": 3.4702813625335693 + }, + { + "auxiliary_loss_clip": 0.01435957, + "auxiliary_loss_mlp": 0.01041493, + "balance_loss_clip": 1.26784396, + "balance_loss_mlp": 1.02001131, + "epoch": 0.6526379077108072, + "flos": 26664835507200.0, + "grad_norm": 1.7153043686610199, + "language_loss": 0.78550541, + "learning_rate": 1.1375749064610072e-06, + "loss": 0.81027985, + "num_input_tokens_seen": 234473505, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.21484375, + "step": 10855, + "time_per_iteration": 2.9372026920318604 + }, + { + "auxiliary_loss_clip": 0.01406375, + "auxiliary_loss_mlp": 0.01029897, + "balance_loss_clip": 1.24413419, + "balance_loss_mlp": 1.00888014, + "epoch": 0.6526980309634751, + "flos": 22830643324800.0, + "grad_norm": 1.9806546869009416, + "language_loss": 0.79673129, + "learning_rate": 1.1372235308666256e-06, + "loss": 0.82109398, + "num_input_tokens_seen": 234492485, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.21008301, + "step": 10856, + "time_per_iteration": 2.8697712421417236 + }, + { + "auxiliary_loss_clip": 0.01422167, + "auxiliary_loss_mlp": 0.01035251, + "balance_loss_clip": 1.25660968, + "balance_loss_mlp": 1.01380491, + "epoch": 0.6527581542161431, + "flos": 28376460754560.0, + "grad_norm": 1.6226242877310706, + "language_loss": 0.74486226, + "learning_rate": 1.136872187988815e-06, + "loss": 0.76943648, + "num_input_tokens_seen": 234512645, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.21472168, + "step": 10857, + "time_per_iteration": 2.9095025062561035 + }, + { + "auxiliary_loss_clip": 0.01415923, + "auxiliary_loss_mlp": 0.0103296, + "balance_loss_clip": 1.25067306, + "balance_loss_mlp": 1.0124557, + "epoch": 0.652818277468811, + "flos": 18378350922240.0, + "grad_norm": 2.2092194832885825, + "language_loss": 0.63922381, + "learning_rate": 1.1365208778408965e-06, + "loss": 0.66371262, + "num_input_tokens_seen": 234529310, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.20495605, + "step": 10858, + "time_per_iteration": 4.29092812538147 + }, + { + "auxiliary_loss_clip": 0.01420005, + "auxiliary_loss_mlp": 0.01035841, + "balance_loss_clip": 1.25553942, + "balance_loss_mlp": 1.01515818, + "epoch": 0.6528784007214791, + "flos": 18044017960320.0, + "grad_norm": 1.803031795774203, + "language_loss": 0.79205382, + "learning_rate": 1.1361696004361939e-06, + "loss": 0.81661224, + "num_input_tokens_seen": 234546685, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.20678711, + "step": 10859, + "time_per_iteration": 2.8555378913879395 + }, + { + "auxiliary_loss_clip": 0.01441087, + "auxiliary_loss_mlp": 0.01037279, + "balance_loss_clip": 1.27100599, + "balance_loss_mlp": 1.01636946, + "epoch": 0.652938523974147, + "flos": 22392029802240.0, + "grad_norm": 1.6481859868455946, + "language_loss": 0.68840373, + "learning_rate": 1.1358183557880256e-06, + "loss": 0.71318734, + "num_input_tokens_seen": 234566255, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.20922852, + "step": 10860, + "time_per_iteration": 2.8458690643310547 + }, + { + "auxiliary_loss_clip": 0.0144107, + "auxiliary_loss_mlp": 0.01033627, + "balance_loss_clip": 1.26989388, + "balance_loss_mlp": 1.01288462, + "epoch": 0.652998647226815, + "flos": 16772816027520.0, + "grad_norm": 1.88560659738565, + "language_loss": 0.6794405, + "learning_rate": 1.135467143909712e-06, + "loss": 0.70418751, + "num_input_tokens_seen": 234585405, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.20739746, + "step": 10861, + "time_per_iteration": 2.853938579559326 + }, + { + "auxiliary_loss_clip": 0.01428755, + "auxiliary_loss_mlp": 0.01039433, + "balance_loss_clip": 1.26027691, + "balance_loss_mlp": 1.01681936, + "epoch": 0.6530587704794829, + "flos": 35786080252800.0, + "grad_norm": 1.857848964641663, + "language_loss": 0.65946269, + "learning_rate": 1.135115964814572e-06, + "loss": 0.68414456, + "num_input_tokens_seen": 234608095, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.22607422, + "step": 10862, + "time_per_iteration": 2.9925315380096436 + }, + { + "auxiliary_loss_clip": 0.01425479, + "auxiliary_loss_mlp": 0.01040455, + "balance_loss_clip": 1.2592901, + "balance_loss_mlp": 1.01995063, + "epoch": 0.6531188937321509, + "flos": 19325083259520.0, + "grad_norm": 1.6096630739754856, + "language_loss": 0.77840781, + "learning_rate": 1.13476481851592e-06, + "loss": 0.80306721, + "num_input_tokens_seen": 234627335, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.20495605, + "step": 10863, + "time_per_iteration": 5.694023847579956 + }, + { + "auxiliary_loss_clip": 0.01426554, + "auxiliary_loss_mlp": 0.0103651, + "balance_loss_clip": 1.2610333, + "balance_loss_mlp": 1.01649475, + "epoch": 0.6531790169848188, + "flos": 22904175404160.0, + "grad_norm": 1.9396682627015769, + "language_loss": 0.74735034, + "learning_rate": 1.1344137050270739e-06, + "loss": 0.77198094, + "num_input_tokens_seen": 234646540, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20007324, + "step": 10864, + "time_per_iteration": 2.893057346343994 + }, + { + "auxiliary_loss_clip": 0.01412542, + "auxiliary_loss_mlp": 0.01034214, + "balance_loss_clip": 1.24735689, + "balance_loss_mlp": 1.01355457, + "epoch": 0.6532391402374869, + "flos": 29573813894400.0, + "grad_norm": 2.3514685328647986, + "language_loss": 0.86252463, + "learning_rate": 1.1340626243613458e-06, + "loss": 0.88699222, + "num_input_tokens_seen": 234665470, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.20654297, + "step": 10865, + "time_per_iteration": 2.9253711700439453 + }, + { + "auxiliary_loss_clip": 0.01438733, + "auxiliary_loss_mlp": 0.01032975, + "balance_loss_clip": 1.27010822, + "balance_loss_mlp": 1.01268554, + "epoch": 0.6532992634901548, + "flos": 23114048624640.0, + "grad_norm": 1.6357639514701978, + "language_loss": 0.82271415, + "learning_rate": 1.133711576532051e-06, + "loss": 0.84743118, + "num_input_tokens_seen": 234683955, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.20288086, + "step": 10866, + "time_per_iteration": 2.9799749851226807 + }, + { + "auxiliary_loss_clip": 0.01428661, + "auxiliary_loss_mlp": 0.01034474, + "balance_loss_clip": 1.26403427, + "balance_loss_mlp": 1.01321948, + "epoch": 0.6533593867428228, + "flos": 26078388664320.0, + "grad_norm": 1.3784516528211717, + "language_loss": 0.82866263, + "learning_rate": 1.1333605615524995e-06, + "loss": 0.85329401, + "num_input_tokens_seen": 234704595, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.21240234, + "step": 10867, + "time_per_iteration": 2.935029983520508 + }, + { + "auxiliary_loss_clip": 0.01425595, + "auxiliary_loss_mlp": 0.01033579, + "balance_loss_clip": 1.25810337, + "balance_loss_mlp": 1.01234794, + "epoch": 0.6534195099954908, + "flos": 21221669825280.0, + "grad_norm": 1.935843425736942, + "language_loss": 0.82226646, + "learning_rate": 1.1330095794360016e-06, + "loss": 0.8468582, + "num_input_tokens_seen": 234724090, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.21240234, + "step": 10868, + "time_per_iteration": 2.852872133255005 + }, + { + "auxiliary_loss_clip": 0.01425601, + "auxiliary_loss_mlp": 0.01034889, + "balance_loss_clip": 1.25633276, + "balance_loss_mlp": 1.01409936, + "epoch": 0.6534796332481587, + "flos": 19656158595840.0, + "grad_norm": 1.8941453317831618, + "language_loss": 0.80563748, + "learning_rate": 1.1326586301958675e-06, + "loss": 0.8302424, + "num_input_tokens_seen": 234742560, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.20788574, + "step": 10869, + "time_per_iteration": 2.831394672393799 + }, + { + "auxiliary_loss_clip": 0.01430968, + "auxiliary_loss_mlp": 0.01037695, + "balance_loss_clip": 1.26440501, + "balance_loss_mlp": 1.01719141, + "epoch": 0.6535397565008267, + "flos": 24032566189440.0, + "grad_norm": 2.3409717268772217, + "language_loss": 0.73537004, + "learning_rate": 1.1323077138454063e-06, + "loss": 0.76005667, + "num_input_tokens_seen": 234762315, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.20507812, + "step": 10870, + "time_per_iteration": 2.8762388229370117 + }, + { + "auxiliary_loss_clip": 0.0144319, + "auxiliary_loss_mlp": 0.01039417, + "balance_loss_clip": 1.27534294, + "balance_loss_mlp": 1.01873469, + "epoch": 0.6535998797534947, + "flos": 24612090577920.0, + "grad_norm": 2.2934099595834443, + "language_loss": 0.75951487, + "learning_rate": 1.1319568303979221e-06, + "loss": 0.78434098, + "num_input_tokens_seen": 234781300, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.20666504, + "step": 10871, + "time_per_iteration": 2.949042797088623 + }, + { + "auxiliary_loss_clip": 0.0141425, + "auxiliary_loss_mlp": 0.01033722, + "balance_loss_clip": 1.25110543, + "balance_loss_mlp": 1.01284885, + "epoch": 0.6536600030061627, + "flos": 23373673142400.0, + "grad_norm": 1.4471908706452457, + "language_loss": 0.56129181, + "learning_rate": 1.1316059798667227e-06, + "loss": 0.58577156, + "num_input_tokens_seen": 234801040, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.20874023, + "step": 10872, + "time_per_iteration": 2.883596420288086 + }, + { + "auxiliary_loss_clip": 0.01419092, + "auxiliary_loss_mlp": 0.01035939, + "balance_loss_clip": 1.25451684, + "balance_loss_mlp": 1.01576841, + "epoch": 0.6537201262588306, + "flos": 23889031125120.0, + "grad_norm": 1.611541548932472, + "language_loss": 0.75629711, + "learning_rate": 1.1312551622651112e-06, + "loss": 0.78084737, + "num_input_tokens_seen": 234821415, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20166016, + "step": 10873, + "time_per_iteration": 2.8784608840942383 + }, + { + "auxiliary_loss_clip": 0.01417043, + "auxiliary_loss_mlp": 0.01035159, + "balance_loss_clip": 1.25197649, + "balance_loss_mlp": 1.01454782, + "epoch": 0.6537802495114986, + "flos": 24365949010560.0, + "grad_norm": 1.5203515330787174, + "language_loss": 0.75822592, + "learning_rate": 1.1309043776063917e-06, + "loss": 0.78274792, + "num_input_tokens_seen": 234843795, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.20605469, + "step": 10874, + "time_per_iteration": 2.9199771881103516 + }, + { + "auxiliary_loss_clip": 0.01423461, + "auxiliary_loss_mlp": 0.01034244, + "balance_loss_clip": 1.25811267, + "balance_loss_mlp": 1.01360846, + "epoch": 0.6538403727641665, + "flos": 28007352524160.0, + "grad_norm": 1.6150081202768392, + "language_loss": 0.82164109, + "learning_rate": 1.1305536259038642e-06, + "loss": 0.84621811, + "num_input_tokens_seen": 234862350, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.2064209, + "step": 10875, + "time_per_iteration": 2.9745771884918213 + }, + { + "auxiliary_loss_clip": 0.01429827, + "auxiliary_loss_mlp": 0.01039911, + "balance_loss_clip": 1.26268053, + "balance_loss_mlp": 1.01892948, + "epoch": 0.6539004960168345, + "flos": 27575118518400.0, + "grad_norm": 1.5747355303999402, + "language_loss": 0.70523751, + "learning_rate": 1.1302029071708314e-06, + "loss": 0.72993487, + "num_input_tokens_seen": 234881790, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.2097168, + "step": 10876, + "time_per_iteration": 2.9171407222747803 + }, + { + "auxiliary_loss_clip": 0.0141883, + "auxiliary_loss_mlp": 0.01041342, + "balance_loss_clip": 1.25446188, + "balance_loss_mlp": 1.02003908, + "epoch": 0.6539606192695024, + "flos": 14536105165440.0, + "grad_norm": 1.8843143956521342, + "language_loss": 0.80153811, + "learning_rate": 1.1298522214205908e-06, + "loss": 0.82613987, + "num_input_tokens_seen": 234897775, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.21313477, + "step": 10877, + "time_per_iteration": 2.9932634830474854 + }, + { + "auxiliary_loss_clip": 0.01415507, + "auxiliary_loss_mlp": 0.01034441, + "balance_loss_clip": 1.25114322, + "balance_loss_mlp": 1.01417565, + "epoch": 0.6540207425221705, + "flos": 21626322485760.0, + "grad_norm": 2.659019297056791, + "language_loss": 0.80711776, + "learning_rate": 1.1295015686664408e-06, + "loss": 0.83161724, + "num_input_tokens_seen": 234918395, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20275879, + "step": 10878, + "time_per_iteration": 2.8777878284454346 + }, + { + "auxiliary_loss_clip": 0.01412263, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.24630547, + "balance_loss_mlp": 1.01345539, + "epoch": 0.6540808657748384, + "flos": 17676131339520.0, + "grad_norm": 1.9716899732150166, + "language_loss": 0.85163468, + "learning_rate": 1.1291509489216797e-06, + "loss": 0.87610745, + "num_input_tokens_seen": 234936260, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.2154541, + "step": 10879, + "time_per_iteration": 2.8494739532470703 + }, + { + "auxiliary_loss_clip": 0.01425943, + "auxiliary_loss_mlp": 0.01032771, + "balance_loss_clip": 1.25758433, + "balance_loss_mlp": 1.01144493, + "epoch": 0.6541409890275064, + "flos": 14546466224640.0, + "grad_norm": 2.243738346282337, + "language_loss": 0.73297203, + "learning_rate": 1.128800362199601e-06, + "loss": 0.75755918, + "num_input_tokens_seen": 234952110, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.21325684, + "step": 10880, + "time_per_iteration": 2.811908006668091 + }, + { + "auxiliary_loss_clip": 0.01413216, + "auxiliary_loss_mlp": 0.0104022, + "balance_loss_clip": 1.24982572, + "balance_loss_mlp": 1.01822615, + "epoch": 0.6542011122801744, + "flos": 17174165817600.0, + "grad_norm": 1.9628602878786534, + "language_loss": 0.85292339, + "learning_rate": 1.1284498085135005e-06, + "loss": 0.8774578, + "num_input_tokens_seen": 234970810, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.2199707, + "step": 10881, + "time_per_iteration": 2.817138433456421 + }, + { + "auxiliary_loss_clip": 0.01437976, + "auxiliary_loss_mlp": 0.01036338, + "balance_loss_clip": 1.26881194, + "balance_loss_mlp": 1.01449871, + "epoch": 0.6542612355328423, + "flos": 18195651843840.0, + "grad_norm": 1.798973425548956, + "language_loss": 0.78609115, + "learning_rate": 1.1280992878766699e-06, + "loss": 0.81083429, + "num_input_tokens_seen": 234989565, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.21826172, + "step": 10882, + "time_per_iteration": 2.8899927139282227 + }, + { + "auxiliary_loss_clip": 0.01427355, + "auxiliary_loss_mlp": 0.01041259, + "balance_loss_clip": 1.25829661, + "balance_loss_mlp": 1.01938391, + "epoch": 0.6543213587855103, + "flos": 19802046389760.0, + "grad_norm": 1.6330096991536083, + "language_loss": 0.8274399, + "learning_rate": 1.1277488003024024e-06, + "loss": 0.85212606, + "num_input_tokens_seen": 235007955, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.21875, + "step": 10883, + "time_per_iteration": 4.268296718597412 + }, + { + "auxiliary_loss_clip": 0.01433463, + "auxiliary_loss_mlp": 0.01041628, + "balance_loss_clip": 1.26457882, + "balance_loss_mlp": 1.02048063, + "epoch": 0.6543814820381783, + "flos": 21114810311040.0, + "grad_norm": 2.6330714553624603, + "language_loss": 0.86018312, + "learning_rate": 1.127398345803988e-06, + "loss": 0.88493401, + "num_input_tokens_seen": 235024860, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.21130371, + "step": 10884, + "time_per_iteration": 2.895533800125122 + }, + { + "auxiliary_loss_clip": 0.01442608, + "auxiliary_loss_mlp": 0.01040787, + "balance_loss_clip": 1.27395868, + "balance_loss_mlp": 1.01881647, + "epoch": 0.6544416052908463, + "flos": 20203803383040.0, + "grad_norm": 2.316525657595679, + "language_loss": 0.80916488, + "learning_rate": 1.127047924394715e-06, + "loss": 0.83399886, + "num_input_tokens_seen": 235043815, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.21984863, + "step": 10885, + "time_per_iteration": 2.838981866836548 + }, + { + "auxiliary_loss_clip": 0.0141728, + "auxiliary_loss_mlp": 0.01037996, + "balance_loss_clip": 1.25241303, + "balance_loss_mlp": 1.01659834, + "epoch": 0.6545017285435142, + "flos": 23378831049600.0, + "grad_norm": 2.319521416729139, + "language_loss": 0.72437191, + "learning_rate": 1.1266975360878722e-06, + "loss": 0.74892467, + "num_input_tokens_seen": 235062985, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.21411133, + "step": 10886, + "time_per_iteration": 2.8583364486694336 + }, + { + "auxiliary_loss_clip": 0.01425343, + "auxiliary_loss_mlp": 0.01037187, + "balance_loss_clip": 1.26113105, + "balance_loss_mlp": 1.01746964, + "epoch": 0.6545618517961822, + "flos": 19143877259520.0, + "grad_norm": 1.7428465616588742, + "language_loss": 0.78831184, + "learning_rate": 1.1263471808967468e-06, + "loss": 0.81293714, + "num_input_tokens_seen": 235081670, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.19726562, + "step": 10887, + "time_per_iteration": 2.8325719833374023 + }, + { + "auxiliary_loss_clip": 0.01415011, + "auxiliary_loss_mlp": 0.01035825, + "balance_loss_clip": 1.24978495, + "balance_loss_mlp": 1.01460576, + "epoch": 0.6546219750488501, + "flos": 14946639649920.0, + "grad_norm": 1.7928425492701143, + "language_loss": 0.78947902, + "learning_rate": 1.1259968588346234e-06, + "loss": 0.81398737, + "num_input_tokens_seen": 235098510, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.2121582, + "step": 10888, + "time_per_iteration": 2.7950119972229004 + }, + { + "auxiliary_loss_clip": 0.01413349, + "auxiliary_loss_mlp": 0.01032615, + "balance_loss_clip": 1.25088692, + "balance_loss_mlp": 1.01288557, + "epoch": 0.6546820983015181, + "flos": 36334403712000.0, + "grad_norm": 1.4702230208731475, + "language_loss": 0.66769993, + "learning_rate": 1.1256465699147874e-06, + "loss": 0.69215959, + "num_input_tokens_seen": 235119990, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.1973877, + "step": 10889, + "time_per_iteration": 2.9670400619506836 + }, + { + "auxiliary_loss_clip": 0.0141771, + "auxiliary_loss_mlp": 0.01035274, + "balance_loss_clip": 1.2515049, + "balance_loss_mlp": 1.01471031, + "epoch": 0.654742221554186, + "flos": 20420870526720.0, + "grad_norm": 1.843130604935648, + "language_loss": 0.80480886, + "learning_rate": 1.1252963141505203e-06, + "loss": 0.82933867, + "num_input_tokens_seen": 235139255, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.2052002, + "step": 10890, + "time_per_iteration": 2.889808177947998 + }, + { + "auxiliary_loss_clip": 0.0143737, + "auxiliary_loss_mlp": 0.01035549, + "balance_loss_clip": 1.26833034, + "balance_loss_mlp": 1.01578462, + "epoch": 0.6548023448068541, + "flos": 24874475028480.0, + "grad_norm": 2.0920338205117854, + "language_loss": 0.66574907, + "learning_rate": 1.1249460915551052e-06, + "loss": 0.69047827, + "num_input_tokens_seen": 235158455, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.19763184, + "step": 10891, + "time_per_iteration": 2.946877956390381 + }, + { + "auxiliary_loss_clip": 0.01413685, + "auxiliary_loss_mlp": 0.01040819, + "balance_loss_clip": 1.24916708, + "balance_loss_mlp": 1.02089894, + "epoch": 0.654862468059522, + "flos": 21435841301760.0, + "grad_norm": 1.7686355616921339, + "language_loss": 0.80330014, + "learning_rate": 1.1245959021418214e-06, + "loss": 0.82784516, + "num_input_tokens_seen": 235177350, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19934082, + "step": 10892, + "time_per_iteration": 2.891942262649536 + }, + { + "auxiliary_loss_clip": 0.01445167, + "auxiliary_loss_mlp": 0.01036929, + "balance_loss_clip": 1.27351022, + "balance_loss_mlp": 1.01693773, + "epoch": 0.65492259131219, + "flos": 26588950698240.0, + "grad_norm": 1.8932556186018628, + "language_loss": 0.78890276, + "learning_rate": 1.1242457459239497e-06, + "loss": 0.81372374, + "num_input_tokens_seen": 235196435, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.1998291, + "step": 10893, + "time_per_iteration": 2.8958094120025635 + }, + { + "auxiliary_loss_clip": 0.01425288, + "auxiliary_loss_mlp": 0.0103733, + "balance_loss_clip": 1.25939608, + "balance_loss_mlp": 1.01568162, + "epoch": 0.6549827145648579, + "flos": 21509825829120.0, + "grad_norm": 11.68303263453298, + "language_loss": 0.70960438, + "learning_rate": 1.123895622914766e-06, + "loss": 0.73423064, + "num_input_tokens_seen": 235215430, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.21630859, + "step": 10894, + "time_per_iteration": 4.355911731719971 + }, + { + "auxiliary_loss_clip": 0.01435853, + "auxiliary_loss_mlp": 0.01035992, + "balance_loss_clip": 1.2665472, + "balance_loss_mlp": 1.01482034, + "epoch": 0.6550428378175259, + "flos": 22602626939520.0, + "grad_norm": 2.8524955377829397, + "language_loss": 0.63340902, + "learning_rate": 1.123545533127549e-06, + "loss": 0.65812743, + "num_input_tokens_seen": 235232015, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.21154785, + "step": 10895, + "time_per_iteration": 2.8494338989257812 + }, + { + "auxiliary_loss_clip": 0.01415074, + "auxiliary_loss_mlp": 0.01037211, + "balance_loss_clip": 1.25025749, + "balance_loss_mlp": 1.01699328, + "epoch": 0.655102961070194, + "flos": 12831628596480.0, + "grad_norm": 1.9877234913840465, + "language_loss": 0.80069274, + "learning_rate": 1.1231954765755722e-06, + "loss": 0.82521558, + "num_input_tokens_seen": 235248115, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.20227051, + "step": 10896, + "time_per_iteration": 2.891592264175415 + }, + { + "auxiliary_loss_clip": 0.01408105, + "auxiliary_loss_mlp": 0.01034191, + "balance_loss_clip": 1.24703228, + "balance_loss_mlp": 1.01419997, + "epoch": 0.6551630843228619, + "flos": 24801983579520.0, + "grad_norm": 1.4011637202386802, + "language_loss": 0.71027893, + "learning_rate": 1.1228454532721111e-06, + "loss": 0.73470187, + "num_input_tokens_seen": 235270785, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.1998291, + "step": 10897, + "time_per_iteration": 4.310504674911499 + }, + { + "auxiliary_loss_clip": 0.01432882, + "auxiliary_loss_mlp": 0.01038024, + "balance_loss_clip": 1.26450944, + "balance_loss_mlp": 1.01749635, + "epoch": 0.6552232075755299, + "flos": 16732430403840.0, + "grad_norm": 1.7383432717582983, + "language_loss": 0.76447999, + "learning_rate": 1.1224954632304391e-06, + "loss": 0.7891891, + "num_input_tokens_seen": 235287905, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.20507812, + "step": 10898, + "time_per_iteration": 4.183542013168335 + }, + { + "auxiliary_loss_clip": 0.01426866, + "auxiliary_loss_mlp": 0.01040527, + "balance_loss_clip": 1.26252079, + "balance_loss_mlp": 1.02017808, + "epoch": 0.6552833308281978, + "flos": 22026224442240.0, + "grad_norm": 2.627376176975439, + "language_loss": 0.73955989, + "learning_rate": 1.122145506463827e-06, + "loss": 0.76423377, + "num_input_tokens_seen": 235305525, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20349121, + "step": 10899, + "time_per_iteration": 2.8368682861328125 + }, + { + "auxiliary_loss_clip": 0.01426423, + "auxiliary_loss_mlp": 0.01036002, + "balance_loss_clip": 1.25954413, + "balance_loss_mlp": 1.0157243, + "epoch": 0.6553434540808658, + "flos": 24874158314880.0, + "grad_norm": 1.9327041120856, + "language_loss": 0.57322764, + "learning_rate": 1.1217955829855443e-06, + "loss": 0.59785187, + "num_input_tokens_seen": 235324415, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.20263672, + "step": 10900, + "time_per_iteration": 2.8571484088897705 + }, + { + "auxiliary_loss_clip": 0.01430675, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_clip": 1.26408708, + "balance_loss_mlp": 1.01181853, + "epoch": 0.6554035773335337, + "flos": 23231269198080.0, + "grad_norm": 1.6844271728232898, + "language_loss": 0.77172482, + "learning_rate": 1.1214456928088622e-06, + "loss": 0.7963624, + "num_input_tokens_seen": 235341595, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.21276855, + "step": 10901, + "time_per_iteration": 2.8535618782043457 + }, + { + "auxiliary_loss_clip": 0.01412654, + "auxiliary_loss_mlp": 0.01028207, + "balance_loss_clip": 1.24737358, + "balance_loss_mlp": 1.00888324, + "epoch": 0.6554637005862017, + "flos": 22793786795520.0, + "grad_norm": 2.0953725678229937, + "language_loss": 0.74257326, + "learning_rate": 1.1210958359470463e-06, + "loss": 0.7669819, + "num_input_tokens_seen": 235361700, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.1932373, + "step": 10902, + "time_per_iteration": 2.844329357147217 + }, + { + "auxiliary_loss_clip": 0.01415309, + "auxiliary_loss_mlp": 0.0103523, + "balance_loss_clip": 1.25371838, + "balance_loss_mlp": 1.0147022, + "epoch": 0.6555238238388696, + "flos": 21517245976320.0, + "grad_norm": 1.6853182192299556, + "language_loss": 0.68586099, + "learning_rate": 1.1207460124133645e-06, + "loss": 0.71036637, + "num_input_tokens_seen": 235382065, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.20532227, + "step": 10903, + "time_per_iteration": 2.8493540287017822 + }, + { + "auxiliary_loss_clip": 0.01443678, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.27282429, + "balance_loss_mlp": 1.01498818, + "epoch": 0.6555839470915377, + "flos": 30531902676480.0, + "grad_norm": 2.1376801818427182, + "language_loss": 0.67758071, + "learning_rate": 1.1203962222210832e-06, + "loss": 0.70237505, + "num_input_tokens_seen": 235402130, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.20776367, + "step": 10904, + "time_per_iteration": 2.9194397926330566 + }, + { + "auxiliary_loss_clip": 0.01430339, + "auxiliary_loss_mlp": 0.01039854, + "balance_loss_clip": 1.26272333, + "balance_loss_mlp": 1.01827705, + "epoch": 0.6556440703442056, + "flos": 24653245363200.0, + "grad_norm": 1.8735676526084863, + "language_loss": 0.91115963, + "learning_rate": 1.120046465383464e-06, + "loss": 0.93586159, + "num_input_tokens_seen": 235420435, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.21582031, + "step": 10905, + "time_per_iteration": 2.8457276821136475 + }, + { + "auxiliary_loss_clip": 0.01412486, + "auxiliary_loss_mlp": 0.01033719, + "balance_loss_clip": 1.25120163, + "balance_loss_mlp": 1.01359653, + "epoch": 0.6557041935968736, + "flos": 23742962352000.0, + "grad_norm": 2.038286875798592, + "language_loss": 0.76773643, + "learning_rate": 1.1196967419137721e-06, + "loss": 0.79219848, + "num_input_tokens_seen": 235439960, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.20141602, + "step": 10906, + "time_per_iteration": 2.8617193698883057 + }, + { + "auxiliary_loss_clip": 0.01436738, + "auxiliary_loss_mlp": 0.01039012, + "balance_loss_clip": 1.26837862, + "balance_loss_mlp": 1.01841247, + "epoch": 0.6557643168495415, + "flos": 11108511169920.0, + "grad_norm": 2.9150660672622064, + "language_loss": 0.75986099, + "learning_rate": 1.119347051825267e-06, + "loss": 0.7846185, + "num_input_tokens_seen": 235457495, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.20605469, + "step": 10907, + "time_per_iteration": 2.8545119762420654 + }, + { + "auxiliary_loss_clip": 0.01425166, + "auxiliary_loss_mlp": 0.01036979, + "balance_loss_clip": 1.2580483, + "balance_loss_mlp": 1.0156281, + "epoch": 0.6558244401022095, + "flos": 30203361048960.0, + "grad_norm": 1.8012360576596527, + "language_loss": 0.72607434, + "learning_rate": 1.118997395131211e-06, + "loss": 0.75069571, + "num_input_tokens_seen": 235479525, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.21362305, + "step": 10908, + "time_per_iteration": 2.9261651039123535 + }, + { + "auxiliary_loss_clip": 0.01425162, + "auxiliary_loss_mlp": 0.01036332, + "balance_loss_clip": 1.25962257, + "balance_loss_mlp": 1.01657915, + "epoch": 0.6558845633548775, + "flos": 17940008868480.0, + "grad_norm": 2.0085107427104156, + "language_loss": 0.82917017, + "learning_rate": 1.118647771844861e-06, + "loss": 0.8537851, + "num_input_tokens_seen": 235496305, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19775391, + "step": 10909, + "time_per_iteration": 2.813429355621338 + }, + { + "auxiliary_loss_clip": 0.01438362, + "auxiliary_loss_mlp": 0.01033595, + "balance_loss_clip": 1.26879048, + "balance_loss_mlp": 1.01273286, + "epoch": 0.6559446866075455, + "flos": 21913121145600.0, + "grad_norm": 2.5087385631891914, + "language_loss": 0.64845985, + "learning_rate": 1.1182981819794767e-06, + "loss": 0.67317939, + "num_input_tokens_seen": 235512545, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.20849609, + "step": 10910, + "time_per_iteration": 2.8423595428466797 + }, + { + "auxiliary_loss_clip": 0.01447372, + "auxiliary_loss_mlp": 0.01036053, + "balance_loss_clip": 1.27401447, + "balance_loss_mlp": 1.01455927, + "epoch": 0.6560048098602135, + "flos": 14133714744960.0, + "grad_norm": 3.981958061259692, + "language_loss": 0.77378529, + "learning_rate": 1.117948625548313e-06, + "loss": 0.79861957, + "num_input_tokens_seen": 235526045, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.21472168, + "step": 10911, + "time_per_iteration": 2.787630796432495 + }, + { + "auxiliary_loss_clip": 0.01415227, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.25347352, + "balance_loss_mlp": 1.01484418, + "epoch": 0.6560649331128814, + "flos": 18816964444800.0, + "grad_norm": 1.5029185351344643, + "language_loss": 0.75645769, + "learning_rate": 1.1175991025646265e-06, + "loss": 0.78095376, + "num_input_tokens_seen": 235545285, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.1953125, + "step": 10912, + "time_per_iteration": 2.885342597961426 + }, + { + "auxiliary_loss_clip": 0.01464841, + "auxiliary_loss_mlp": 0.01042067, + "balance_loss_clip": 1.29114854, + "balance_loss_mlp": 1.02126503, + "epoch": 0.6561250563655494, + "flos": 17061876927360.0, + "grad_norm": 1.5914334685633287, + "language_loss": 0.77971268, + "learning_rate": 1.1172496130416697e-06, + "loss": 0.80478173, + "num_input_tokens_seen": 235563150, + "router_z_loss_clip": 1.73925781, + "router_z_loss_mlp": 0.20812988, + "step": 10913, + "time_per_iteration": 2.8213424682617188 + }, + { + "auxiliary_loss_clip": 0.01416043, + "auxiliary_loss_mlp": 0.01034669, + "balance_loss_clip": 1.25336099, + "balance_loss_mlp": 1.01452315, + "epoch": 0.6561851796182173, + "flos": 22647808512000.0, + "grad_norm": 1.9468487033975106, + "language_loss": 0.722067, + "learning_rate": 1.1169001569926961e-06, + "loss": 0.74657416, + "num_input_tokens_seen": 235582535, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.20141602, + "step": 10914, + "time_per_iteration": 2.86246657371521 + }, + { + "auxiliary_loss_clip": 0.01410355, + "auxiliary_loss_mlp": 0.01034919, + "balance_loss_clip": 1.24667466, + "balance_loss_mlp": 1.01490366, + "epoch": 0.6562453028708853, + "flos": 19247117189760.0, + "grad_norm": 2.0814886122969094, + "language_loss": 0.74573159, + "learning_rate": 1.116550734430958e-06, + "loss": 0.77018428, + "num_input_tokens_seen": 235601490, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.20019531, + "step": 10915, + "time_per_iteration": 2.8925211429595947 + }, + { + "auxiliary_loss_clip": 0.01433635, + "auxiliary_loss_mlp": 0.01035461, + "balance_loss_clip": 1.26928699, + "balance_loss_mlp": 1.01489735, + "epoch": 0.6563054261235532, + "flos": 23810748341760.0, + "grad_norm": 1.5846274575889747, + "language_loss": 0.79955143, + "learning_rate": 1.1162013453697042e-06, + "loss": 0.82424235, + "num_input_tokens_seen": 235619165, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20568848, + "step": 10916, + "time_per_iteration": 2.8720598220825195 + }, + { + "auxiliary_loss_clip": 0.0142823, + "auxiliary_loss_mlp": 0.01035417, + "balance_loss_clip": 1.26085591, + "balance_loss_mlp": 1.0157001, + "epoch": 0.6563655493762213, + "flos": 19248157820160.0, + "grad_norm": 1.8326744384420888, + "language_loss": 0.77003074, + "learning_rate": 1.1158519898221831e-06, + "loss": 0.79466724, + "num_input_tokens_seen": 235637115, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.19714355, + "step": 10917, + "time_per_iteration": 2.8184945583343506 + }, + { + "auxiliary_loss_clip": 0.01408797, + "auxiliary_loss_mlp": 0.0103179, + "balance_loss_clip": 1.24483919, + "balance_loss_mlp": 1.01152456, + "epoch": 0.6564256726288892, + "flos": 25567328937600.0, + "grad_norm": 2.5769072291713475, + "language_loss": 0.71373308, + "learning_rate": 1.1155026678016445e-06, + "loss": 0.73813903, + "num_input_tokens_seen": 235656330, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.20263672, + "step": 10918, + "time_per_iteration": 2.874936819076538 + }, + { + "auxiliary_loss_clip": 0.01404562, + "auxiliary_loss_mlp": 0.01030499, + "balance_loss_clip": 1.24620628, + "balance_loss_mlp": 1.01165175, + "epoch": 0.6564857958815572, + "flos": 22210869047040.0, + "grad_norm": 1.490278212800338, + "language_loss": 0.76604438, + "learning_rate": 1.115153379321332e-06, + "loss": 0.79039502, + "num_input_tokens_seen": 235674510, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18835449, + "step": 10919, + "time_per_iteration": 4.2468461990356445 + }, + { + "auxiliary_loss_clip": 0.01199112, + "auxiliary_loss_mlp": 0.01033359, + "balance_loss_clip": 1.10669446, + "balance_loss_mlp": 1.01237857, + "epoch": 0.6565459191342251, + "flos": 58148040245760.0, + "grad_norm": 0.7182174874767043, + "language_loss": 0.53039926, + "learning_rate": 1.1148041243944931e-06, + "loss": 0.552724, + "num_input_tokens_seen": 235735050, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.20996094, + "step": 10920, + "time_per_iteration": 3.393949508666992 + }, + { + "auxiliary_loss_clip": 0.01418591, + "auxiliary_loss_mlp": 0.01036422, + "balance_loss_clip": 1.25648904, + "balance_loss_mlp": 1.01540589, + "epoch": 0.6566060423868931, + "flos": 30821687493120.0, + "grad_norm": 1.5043208159886705, + "language_loss": 0.65994763, + "learning_rate": 1.1144549030343697e-06, + "loss": 0.68449777, + "num_input_tokens_seen": 235757545, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.21020508, + "step": 10921, + "time_per_iteration": 2.999271869659424 + }, + { + "auxiliary_loss_clip": 0.01422168, + "auxiliary_loss_mlp": 0.01031439, + "balance_loss_clip": 1.25773418, + "balance_loss_mlp": 1.00969529, + "epoch": 0.6566661656395612, + "flos": 23377790419200.0, + "grad_norm": 7.77535786273537, + "language_loss": 0.82160378, + "learning_rate": 1.114105715254205e-06, + "loss": 0.84613991, + "num_input_tokens_seen": 235777265, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.21728516, + "step": 10922, + "time_per_iteration": 2.890613317489624 + }, + { + "auxiliary_loss_clip": 0.0143226, + "auxiliary_loss_mlp": 0.01035079, + "balance_loss_clip": 1.26563561, + "balance_loss_mlp": 1.01478922, + "epoch": 0.6567262888922291, + "flos": 25745684515200.0, + "grad_norm": 1.806614570048082, + "language_loss": 0.71799135, + "learning_rate": 1.1137565610672414e-06, + "loss": 0.74266481, + "num_input_tokens_seen": 235796565, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.20275879, + "step": 10923, + "time_per_iteration": 2.8676459789276123 + }, + { + "auxiliary_loss_clip": 0.01426666, + "auxiliary_loss_mlp": 0.01035857, + "balance_loss_clip": 1.2615521, + "balance_loss_mlp": 1.01524556, + "epoch": 0.6567864121448971, + "flos": 17131246485120.0, + "grad_norm": 2.1118114291103667, + "language_loss": 0.81853962, + "learning_rate": 1.1134074404867169e-06, + "loss": 0.84316492, + "num_input_tokens_seen": 235814805, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20593262, + "step": 10924, + "time_per_iteration": 2.847930431365967 + }, + { + "auxiliary_loss_clip": 0.01434463, + "auxiliary_loss_mlp": 0.01030645, + "balance_loss_clip": 1.26806974, + "balance_loss_mlp": 1.01163089, + "epoch": 0.656846535397565, + "flos": 22429565003520.0, + "grad_norm": 1.6692573697672237, + "language_loss": 0.73308015, + "learning_rate": 1.1130583535258717e-06, + "loss": 0.75773126, + "num_input_tokens_seen": 235833405, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.19030762, + "step": 10925, + "time_per_iteration": 2.864029884338379 + }, + { + "auxiliary_loss_clip": 0.01424045, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.25867867, + "balance_loss_mlp": 1.01338983, + "epoch": 0.656906658650233, + "flos": 17711449545600.0, + "grad_norm": 2.409249772435587, + "language_loss": 0.73184466, + "learning_rate": 1.112709300197942e-06, + "loss": 0.75641537, + "num_input_tokens_seen": 235848530, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19641113, + "step": 10926, + "time_per_iteration": 2.797229051589966 + }, + { + "auxiliary_loss_clip": 0.01438318, + "auxiliary_loss_mlp": 0.01034526, + "balance_loss_clip": 1.26833868, + "balance_loss_mlp": 1.01377141, + "epoch": 0.6569667819029009, + "flos": 21184134624000.0, + "grad_norm": 2.719002107901623, + "language_loss": 0.73225021, + "learning_rate": 1.1123602805161656e-06, + "loss": 0.75697863, + "num_input_tokens_seen": 235867225, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.20739746, + "step": 10927, + "time_per_iteration": 2.910715341567993 + }, + { + "auxiliary_loss_clip": 0.01193515, + "auxiliary_loss_mlp": 0.01039131, + "balance_loss_clip": 1.10121405, + "balance_loss_mlp": 1.01814997, + "epoch": 0.6570269051555689, + "flos": 68795107251840.0, + "grad_norm": 0.7321475560030118, + "language_loss": 0.64523602, + "learning_rate": 1.112011294493775e-06, + "loss": 0.66756248, + "num_input_tokens_seen": 235932925, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.20996094, + "step": 10928, + "time_per_iteration": 3.356295347213745 + }, + { + "auxiliary_loss_clip": 0.01422543, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.25977576, + "balance_loss_mlp": 1.01436615, + "epoch": 0.6570870284082369, + "flos": 26329507159680.0, + "grad_norm": 2.0556909202252096, + "language_loss": 0.78022969, + "learning_rate": 1.1116623421440063e-06, + "loss": 0.80480456, + "num_input_tokens_seen": 235952680, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.20568848, + "step": 10929, + "time_per_iteration": 4.334246635437012 + }, + { + "auxiliary_loss_clip": 0.01421912, + "auxiliary_loss_mlp": 0.0103356, + "balance_loss_clip": 1.25808167, + "balance_loss_mlp": 1.01223397, + "epoch": 0.6571471516609049, + "flos": 26185293423360.0, + "grad_norm": 1.9816009271745454, + "language_loss": 0.66150987, + "learning_rate": 1.1113134234800895e-06, + "loss": 0.6860646, + "num_input_tokens_seen": 235972075, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.21325684, + "step": 10930, + "time_per_iteration": 2.8868889808654785 + }, + { + "auxiliary_loss_clip": 0.01416641, + "auxiliary_loss_mlp": 0.010346, + "balance_loss_clip": 1.25208902, + "balance_loss_mlp": 1.01328564, + "epoch": 0.6572072749135728, + "flos": 20386004768640.0, + "grad_norm": 1.6267753237517748, + "language_loss": 0.71718812, + "learning_rate": 1.110964538515258e-06, + "loss": 0.74170053, + "num_input_tokens_seen": 235990340, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.2130127, + "step": 10931, + "time_per_iteration": 2.8707470893859863 + }, + { + "auxiliary_loss_clip": 0.01436546, + "auxiliary_loss_mlp": 0.01035925, + "balance_loss_clip": 1.26904511, + "balance_loss_mlp": 1.0160048, + "epoch": 0.6572673981662408, + "flos": 17137580757120.0, + "grad_norm": 2.092471576048599, + "language_loss": 0.6975342, + "learning_rate": 1.1106156872627393e-06, + "loss": 0.72225893, + "num_input_tokens_seen": 236007470, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.19921875, + "step": 10932, + "time_per_iteration": 2.91862154006958 + }, + { + "auxiliary_loss_clip": 0.01429778, + "auxiliary_loss_mlp": 0.01037166, + "balance_loss_clip": 1.26409256, + "balance_loss_mlp": 1.01675725, + "epoch": 0.6573275214189087, + "flos": 41288344922880.0, + "grad_norm": 1.6028697125647475, + "language_loss": 0.8049227, + "learning_rate": 1.1102668697357626e-06, + "loss": 0.82959211, + "num_input_tokens_seen": 236029030, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20422363, + "step": 10933, + "time_per_iteration": 5.79594087600708 + }, + { + "auxiliary_loss_clip": 0.01438626, + "auxiliary_loss_mlp": 0.01037159, + "balance_loss_clip": 1.27178836, + "balance_loss_mlp": 1.01659513, + "epoch": 0.6573876446715767, + "flos": 22899605679360.0, + "grad_norm": 2.1267757225063137, + "language_loss": 0.74354184, + "learning_rate": 1.1099180859475571e-06, + "loss": 0.7682997, + "num_input_tokens_seen": 236047160, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.20568848, + "step": 10934, + "time_per_iteration": 2.8708555698394775 + }, + { + "auxiliary_loss_clip": 0.01420235, + "auxiliary_loss_mlp": 0.01035115, + "balance_loss_clip": 1.25732684, + "balance_loss_mlp": 1.0150522, + "epoch": 0.6574477679242448, + "flos": 44033808026880.0, + "grad_norm": 1.7868778836997965, + "language_loss": 0.76729262, + "learning_rate": 1.1095693359113454e-06, + "loss": 0.79184616, + "num_input_tokens_seen": 236069215, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.20068359, + "step": 10935, + "time_per_iteration": 3.1085095405578613 + }, + { + "auxiliary_loss_clip": 0.01436045, + "auxiliary_loss_mlp": 0.0104108, + "balance_loss_clip": 1.26868474, + "balance_loss_mlp": 1.01966977, + "epoch": 0.6575078911769127, + "flos": 24582201747840.0, + "grad_norm": 1.740009048185618, + "language_loss": 0.7892909, + "learning_rate": 1.1092206196403538e-06, + "loss": 0.81406212, + "num_input_tokens_seen": 236088335, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.21411133, + "step": 10936, + "time_per_iteration": 2.8994131088256836 + }, + { + "auxiliary_loss_clip": 0.01408279, + "auxiliary_loss_mlp": 0.01038474, + "balance_loss_clip": 1.2456224, + "balance_loss_mlp": 1.01763642, + "epoch": 0.6575680144295807, + "flos": 20934056759040.0, + "grad_norm": 1.8860349856068197, + "language_loss": 0.70102322, + "learning_rate": 1.1088719371478056e-06, + "loss": 0.72549081, + "num_input_tokens_seen": 236108540, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.20849609, + "step": 10937, + "time_per_iteration": 2.8665032386779785 + }, + { + "auxiliary_loss_clip": 0.01428182, + "auxiliary_loss_mlp": 0.01040328, + "balance_loss_clip": 1.26272452, + "balance_loss_mlp": 1.0197643, + "epoch": 0.6576281376822486, + "flos": 10932734545920.0, + "grad_norm": 4.376563727804312, + "language_loss": 0.69918716, + "learning_rate": 1.1085232884469236e-06, + "loss": 0.7238723, + "num_input_tokens_seen": 236124495, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.20556641, + "step": 10938, + "time_per_iteration": 2.809796094894409 + }, + { + "auxiliary_loss_clip": 0.01423112, + "auxiliary_loss_mlp": 0.01037034, + "balance_loss_clip": 1.25768471, + "balance_loss_mlp": 1.01636291, + "epoch": 0.6576882609349166, + "flos": 19290986663040.0, + "grad_norm": 2.633202177144158, + "language_loss": 0.7171613, + "learning_rate": 1.108174673550927e-06, + "loss": 0.74176276, + "num_input_tokens_seen": 236142550, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.20666504, + "step": 10939, + "time_per_iteration": 2.819267749786377 + }, + { + "auxiliary_loss_clip": 0.01426554, + "auxiliary_loss_mlp": 0.01034246, + "balance_loss_clip": 1.25872517, + "balance_loss_mlp": 1.01299095, + "epoch": 0.6577483841875845, + "flos": 20227719899520.0, + "grad_norm": 2.1322291517574286, + "language_loss": 0.78606105, + "learning_rate": 1.107826092473037e-06, + "loss": 0.81066906, + "num_input_tokens_seen": 236156620, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.21264648, + "step": 10940, + "time_per_iteration": 2.8114871978759766 + }, + { + "auxiliary_loss_clip": 0.01433101, + "auxiliary_loss_mlp": 0.01039298, + "balance_loss_clip": 1.26399517, + "balance_loss_mlp": 1.01916349, + "epoch": 0.6578085074402525, + "flos": 34764413247360.0, + "grad_norm": 1.8988660825645325, + "language_loss": 0.69356191, + "learning_rate": 1.107477545226471e-06, + "loss": 0.71828592, + "num_input_tokens_seen": 236177095, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.20141602, + "step": 10941, + "time_per_iteration": 2.9528722763061523 + }, + { + "auxiliary_loss_clip": 0.01428108, + "auxiliary_loss_mlp": 0.01035498, + "balance_loss_clip": 1.26247573, + "balance_loss_mlp": 1.01582789, + "epoch": 0.6578686306929205, + "flos": 23479853984640.0, + "grad_norm": 1.8335024656621393, + "language_loss": 0.69430077, + "learning_rate": 1.1071290318244448e-06, + "loss": 0.7189368, + "num_input_tokens_seen": 236194695, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.19677734, + "step": 10942, + "time_per_iteration": 2.8631181716918945 + }, + { + "auxiliary_loss_clip": 0.01467916, + "auxiliary_loss_mlp": 0.01047891, + "balance_loss_clip": 1.29319787, + "balance_loss_mlp": 1.02499056, + "epoch": 0.6579287539455885, + "flos": 18086077641600.0, + "grad_norm": 15.467182501508834, + "language_loss": 0.72393858, + "learning_rate": 1.1067805522801753e-06, + "loss": 0.74909663, + "num_input_tokens_seen": 236213885, + "router_z_loss_clip": 1.74511719, + "router_z_loss_mlp": 0.22900391, + "step": 10943, + "time_per_iteration": 2.84393048286438 + }, + { + "auxiliary_loss_clip": 0.01424361, + "auxiliary_loss_mlp": 0.01042484, + "balance_loss_clip": 1.25995493, + "balance_loss_mlp": 1.02064514, + "epoch": 0.6579888771982564, + "flos": 28674027676800.0, + "grad_norm": 3.3885259865066084, + "language_loss": 0.59427404, + "learning_rate": 1.1064321066068778e-06, + "loss": 0.6189425, + "num_input_tokens_seen": 236237315, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.21838379, + "step": 10944, + "time_per_iteration": 2.9354195594787598 + }, + { + "auxiliary_loss_clip": 0.0145017, + "auxiliary_loss_mlp": 0.01043919, + "balance_loss_clip": 1.27724457, + "balance_loss_mlp": 1.02253318, + "epoch": 0.6580490004509244, + "flos": 25057536065280.0, + "grad_norm": 1.7617976056341416, + "language_loss": 0.72972548, + "learning_rate": 1.1060836948177646e-06, + "loss": 0.75466633, + "num_input_tokens_seen": 236256345, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.21398926, + "step": 10945, + "time_per_iteration": 2.8798065185546875 + }, + { + "auxiliary_loss_clip": 0.01422608, + "auxiliary_loss_mlp": 0.01036073, + "balance_loss_clip": 1.25793529, + "balance_loss_mlp": 1.01571155, + "epoch": 0.6581091237035923, + "flos": 43524693826560.0, + "grad_norm": 1.7608569874731699, + "language_loss": 0.7111398, + "learning_rate": 1.105735316926046e-06, + "loss": 0.73572665, + "num_input_tokens_seen": 236281890, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.20349121, + "step": 10946, + "time_per_iteration": 3.0531089305877686 + }, + { + "auxiliary_loss_clip": 0.01433517, + "auxiliary_loss_mlp": 0.01049609, + "balance_loss_clip": 1.26891041, + "balance_loss_mlp": 1.02915239, + "epoch": 0.6581692469562603, + "flos": 22424633320320.0, + "grad_norm": 2.1568307659182904, + "language_loss": 0.83176839, + "learning_rate": 1.105386972944934e-06, + "loss": 0.85659963, + "num_input_tokens_seen": 236298370, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.20471191, + "step": 10947, + "time_per_iteration": 2.8723208904266357 + }, + { + "auxiliary_loss_clip": 0.01441986, + "auxiliary_loss_mlp": 0.0103827, + "balance_loss_clip": 1.27377272, + "balance_loss_mlp": 1.01855326, + "epoch": 0.6582293702089284, + "flos": 24869588590080.0, + "grad_norm": 1.6430278450456721, + "language_loss": 0.7779268, + "learning_rate": 1.1050386628876385e-06, + "loss": 0.80272931, + "num_input_tokens_seen": 236317380, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.19702148, + "step": 10948, + "time_per_iteration": 2.8501296043395996 + }, + { + "auxiliary_loss_clip": 0.01423887, + "auxiliary_loss_mlp": 0.01040978, + "balance_loss_clip": 1.25942314, + "balance_loss_mlp": 1.02109396, + "epoch": 0.6582894934615963, + "flos": 23050108442880.0, + "grad_norm": 1.8093567297954019, + "language_loss": 0.79561853, + "learning_rate": 1.1046903867673655e-06, + "loss": 0.8202672, + "num_input_tokens_seen": 236336210, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19885254, + "step": 10949, + "time_per_iteration": 2.8416035175323486 + }, + { + "auxiliary_loss_clip": 0.0118973, + "auxiliary_loss_mlp": 0.01039494, + "balance_loss_clip": 1.09778976, + "balance_loss_mlp": 1.01488972, + "epoch": 0.6583496167142643, + "flos": 72585022757760.0, + "grad_norm": 0.73537135419588, + "language_loss": 0.61849815, + "learning_rate": 1.104342144597323e-06, + "loss": 0.64079034, + "num_input_tokens_seen": 236403090, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.24511719, + "step": 10950, + "time_per_iteration": 3.4317963123321533 + }, + { + "auxiliary_loss_clip": 0.01416749, + "auxiliary_loss_mlp": 0.01039538, + "balance_loss_clip": 1.2545892, + "balance_loss_mlp": 1.01992798, + "epoch": 0.6584097399669322, + "flos": 13086638144640.0, + "grad_norm": 1.9543701806870706, + "language_loss": 0.68041098, + "learning_rate": 1.1039939363907178e-06, + "loss": 0.70497382, + "num_input_tokens_seen": 236420475, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19616699, + "step": 10951, + "time_per_iteration": 2.8191659450531006 + }, + { + "auxiliary_loss_clip": 0.01414353, + "auxiliary_loss_mlp": 0.0103885, + "balance_loss_clip": 1.25105596, + "balance_loss_mlp": 1.01771438, + "epoch": 0.6584698632196002, + "flos": 28704595178880.0, + "grad_norm": 1.3312035394806727, + "language_loss": 0.77161658, + "learning_rate": 1.1036457621607504e-06, + "loss": 0.79614866, + "num_input_tokens_seen": 236441915, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.21130371, + "step": 10952, + "time_per_iteration": 2.94565486907959 + }, + { + "auxiliary_loss_clip": 0.01427658, + "auxiliary_loss_mlp": 0.01044636, + "balance_loss_clip": 1.26346874, + "balance_loss_mlp": 1.02394104, + "epoch": 0.6585299864722681, + "flos": 14327363064960.0, + "grad_norm": 1.700846415542881, + "language_loss": 0.74742758, + "learning_rate": 1.1032976219206257e-06, + "loss": 0.77215058, + "num_input_tokens_seen": 236460340, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.20690918, + "step": 10953, + "time_per_iteration": 4.2381911277771 + }, + { + "auxiliary_loss_clip": 0.01428458, + "auxiliary_loss_mlp": 0.01041229, + "balance_loss_clip": 1.26277781, + "balance_loss_mlp": 1.02064121, + "epoch": 0.6585901097249361, + "flos": 26809365957120.0, + "grad_norm": 1.9506701604816052, + "language_loss": 0.7932055, + "learning_rate": 1.102949515683546e-06, + "loss": 0.81790239, + "num_input_tokens_seen": 236478280, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.20581055, + "step": 10954, + "time_per_iteration": 2.9020566940307617 + }, + { + "auxiliary_loss_clip": 0.01423411, + "auxiliary_loss_mlp": 0.01043206, + "balance_loss_clip": 1.25811172, + "balance_loss_mlp": 1.02242744, + "epoch": 0.658650232977604, + "flos": 18742120266240.0, + "grad_norm": 4.458385453844709, + "language_loss": 0.70659375, + "learning_rate": 1.1026014434627096e-06, + "loss": 0.73125988, + "num_input_tokens_seen": 236493225, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.20751953, + "step": 10955, + "time_per_iteration": 2.80837345123291 + }, + { + "auxiliary_loss_clip": 0.01411492, + "auxiliary_loss_mlp": 0.01047273, + "balance_loss_clip": 1.2508738, + "balance_loss_mlp": 1.02761495, + "epoch": 0.6587103562302721, + "flos": 24764041175040.0, + "grad_norm": 2.349923664367933, + "language_loss": 0.81281447, + "learning_rate": 1.1022534052713172e-06, + "loss": 0.83740211, + "num_input_tokens_seen": 236514420, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19641113, + "step": 10956, + "time_per_iteration": 2.8610870838165283 + }, + { + "auxiliary_loss_clip": 0.01423502, + "auxiliary_loss_mlp": 0.01041715, + "balance_loss_clip": 1.25892186, + "balance_loss_mlp": 1.02156889, + "epoch": 0.65877047948294, + "flos": 22356304392960.0, + "grad_norm": 2.1196313467990384, + "language_loss": 0.82306659, + "learning_rate": 1.1019054011225648e-06, + "loss": 0.84771878, + "num_input_tokens_seen": 236532785, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.20166016, + "step": 10957, + "time_per_iteration": 2.8464512825012207 + }, + { + "auxiliary_loss_clip": 0.01418978, + "auxiliary_loss_mlp": 0.0103811, + "balance_loss_clip": 1.25607157, + "balance_loss_mlp": 1.01934624, + "epoch": 0.658830602735608, + "flos": 45195164288640.0, + "grad_norm": 5.164176164756197, + "language_loss": 0.76823962, + "learning_rate": 1.1015574310296506e-06, + "loss": 0.7928105, + "num_input_tokens_seen": 236553330, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.18774414, + "step": 10958, + "time_per_iteration": 3.078305244445801 + }, + { + "auxiliary_loss_clip": 0.01422619, + "auxiliary_loss_mlp": 0.0104126, + "balance_loss_clip": 1.25912321, + "balance_loss_mlp": 1.02094674, + "epoch": 0.6588907259882759, + "flos": 19911168144000.0, + "grad_norm": 1.8148091744974095, + "language_loss": 0.75646508, + "learning_rate": 1.1012094950057678e-06, + "loss": 0.78110385, + "num_input_tokens_seen": 236572960, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.20324707, + "step": 10959, + "time_per_iteration": 2.843550682067871 + }, + { + "auxiliary_loss_clip": 0.01424443, + "auxiliary_loss_mlp": 0.01035226, + "balance_loss_clip": 1.26029468, + "balance_loss_mlp": 1.01580667, + "epoch": 0.6589508492409439, + "flos": 24144357386880.0, + "grad_norm": 1.900180164673239, + "language_loss": 0.65939224, + "learning_rate": 1.1008615930641107e-06, + "loss": 0.68398887, + "num_input_tokens_seen": 236594090, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19421387, + "step": 10960, + "time_per_iteration": 2.868428945541382 + }, + { + "auxiliary_loss_clip": 0.01449388, + "auxiliary_loss_mlp": 0.01042652, + "balance_loss_clip": 1.27914333, + "balance_loss_mlp": 1.02157569, + "epoch": 0.659010972493612, + "flos": 18231920190720.0, + "grad_norm": 2.09456280934446, + "language_loss": 0.82757664, + "learning_rate": 1.1005137252178734e-06, + "loss": 0.85249704, + "num_input_tokens_seen": 236610190, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.21081543, + "step": 10961, + "time_per_iteration": 2.814931869506836 + }, + { + "auxiliary_loss_clip": 0.01438708, + "auxiliary_loss_mlp": 0.01034402, + "balance_loss_clip": 1.2736609, + "balance_loss_mlp": 1.01536381, + "epoch": 0.6590710957462799, + "flos": 27611522599680.0, + "grad_norm": 1.939296058666974, + "language_loss": 0.75013566, + "learning_rate": 1.1001658914802453e-06, + "loss": 0.77486676, + "num_input_tokens_seen": 236631575, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19042969, + "step": 10962, + "time_per_iteration": 2.9552595615386963 + }, + { + "auxiliary_loss_clip": 0.0144326, + "auxiliary_loss_mlp": 0.0104477, + "balance_loss_clip": 1.27488422, + "balance_loss_mlp": 1.02482593, + "epoch": 0.6591312189989479, + "flos": 20312608423680.0, + "grad_norm": 1.790726759614983, + "language_loss": 0.81044823, + "learning_rate": 1.0998180918644165e-06, + "loss": 0.83532852, + "num_input_tokens_seen": 236649815, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.19946289, + "step": 10963, + "time_per_iteration": 2.9176297187805176 + }, + { + "auxiliary_loss_clip": 0.01420539, + "auxiliary_loss_mlp": 0.01039089, + "balance_loss_clip": 1.25792909, + "balance_loss_mlp": 1.01957393, + "epoch": 0.6591913422516158, + "flos": 12320614114560.0, + "grad_norm": 1.684615616476281, + "language_loss": 0.7916559, + "learning_rate": 1.0994703263835754e-06, + "loss": 0.81625223, + "num_input_tokens_seen": 236668335, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19519043, + "step": 10964, + "time_per_iteration": 4.3206446170806885 + }, + { + "auxiliary_loss_clip": 0.01442578, + "auxiliary_loss_mlp": 0.01035999, + "balance_loss_clip": 1.27364123, + "balance_loss_mlp": 1.01650786, + "epoch": 0.6592514655042838, + "flos": 25895508606720.0, + "grad_norm": 1.6359464937062185, + "language_loss": 0.74770415, + "learning_rate": 1.0991225950509106e-06, + "loss": 0.77248991, + "num_input_tokens_seen": 236688945, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.19494629, + "step": 10965, + "time_per_iteration": 2.874070882797241 + }, + { + "auxiliary_loss_clip": 0.01447937, + "auxiliary_loss_mlp": 0.01043582, + "balance_loss_clip": 1.2761122, + "balance_loss_mlp": 1.02330494, + "epoch": 0.6593115887569517, + "flos": 14071584355200.0, + "grad_norm": 1.9832818141541908, + "language_loss": 0.74094224, + "learning_rate": 1.0987748978796067e-06, + "loss": 0.76585746, + "num_input_tokens_seen": 236707055, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.20288086, + "step": 10966, + "time_per_iteration": 2.8877007961273193 + }, + { + "auxiliary_loss_clip": 0.01427093, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.26167679, + "balance_loss_mlp": 1.01874578, + "epoch": 0.6593717120096197, + "flos": 24728768213760.0, + "grad_norm": 1.593173671543881, + "language_loss": 0.77654445, + "learning_rate": 1.0984272348828487e-06, + "loss": 0.80120718, + "num_input_tokens_seen": 236725900, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.2043457, + "step": 10967, + "time_per_iteration": 4.295374155044556 + }, + { + "auxiliary_loss_clip": 0.01194011, + "auxiliary_loss_mlp": 0.01028162, + "balance_loss_clip": 1.09915709, + "balance_loss_mlp": 1.00260377, + "epoch": 0.6594318352622877, + "flos": 55588913786880.0, + "grad_norm": 0.7844241022478061, + "language_loss": 0.48529333, + "learning_rate": 1.0980796060738221e-06, + "loss": 0.50751507, + "num_input_tokens_seen": 236788415, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.25585938, + "step": 10968, + "time_per_iteration": 4.737373352050781 + }, + { + "auxiliary_loss_clip": 0.01441304, + "auxiliary_loss_mlp": 0.01039106, + "balance_loss_clip": 1.27218175, + "balance_loss_mlp": 1.01795852, + "epoch": 0.6594919585149557, + "flos": 17465805671040.0, + "grad_norm": 1.8681780279323057, + "language_loss": 0.79687047, + "learning_rate": 1.0977320114657058e-06, + "loss": 0.82167459, + "num_input_tokens_seen": 236805155, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.21142578, + "step": 10969, + "time_per_iteration": 2.886795997619629 + }, + { + "auxiliary_loss_clip": 0.01429979, + "auxiliary_loss_mlp": 0.01039079, + "balance_loss_clip": 1.26461673, + "balance_loss_mlp": 1.01926613, + "epoch": 0.6595520817676236, + "flos": 18232327393920.0, + "grad_norm": 3.2074024256710016, + "language_loss": 0.66433144, + "learning_rate": 1.0973844510716817e-06, + "loss": 0.68902194, + "num_input_tokens_seen": 236824360, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19824219, + "step": 10970, + "time_per_iteration": 2.8572988510131836 + }, + { + "auxiliary_loss_clip": 0.01434292, + "auxiliary_loss_mlp": 0.0103608, + "balance_loss_clip": 1.26664495, + "balance_loss_mlp": 1.01619589, + "epoch": 0.6596122050202916, + "flos": 22209783171840.0, + "grad_norm": 1.669875231367687, + "language_loss": 0.7733897, + "learning_rate": 1.0970369249049308e-06, + "loss": 0.79809344, + "num_input_tokens_seen": 236844640, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.19885254, + "step": 10971, + "time_per_iteration": 2.8899338245391846 + }, + { + "auxiliary_loss_clip": 0.0143446, + "auxiliary_loss_mlp": 0.0103776, + "balance_loss_clip": 1.26778173, + "balance_loss_mlp": 1.0189724, + "epoch": 0.6596723282729595, + "flos": 14182018208640.0, + "grad_norm": 2.500499564055094, + "language_loss": 0.71728885, + "learning_rate": 1.096689432978629e-06, + "loss": 0.74201107, + "num_input_tokens_seen": 236861160, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.18786621, + "step": 10972, + "time_per_iteration": 2.7915186882019043 + }, + { + "auxiliary_loss_clip": 0.01425211, + "auxiliary_loss_mlp": 0.01029994, + "balance_loss_clip": 1.26029265, + "balance_loss_mlp": 1.01071823, + "epoch": 0.6597324515256275, + "flos": 30564913397760.0, + "grad_norm": 1.9122364836382735, + "language_loss": 0.56761861, + "learning_rate": 1.0963419753059556e-06, + "loss": 0.59217072, + "num_input_tokens_seen": 236880465, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19287109, + "step": 10973, + "time_per_iteration": 2.9353761672973633 + }, + { + "auxiliary_loss_clip": 0.0145975, + "auxiliary_loss_mlp": 0.0103994, + "balance_loss_clip": 1.28667617, + "balance_loss_mlp": 1.02024639, + "epoch": 0.6597925747782956, + "flos": 17648911952640.0, + "grad_norm": 1.8973052244687596, + "language_loss": 0.79267734, + "learning_rate": 1.0959945519000839e-06, + "loss": 0.81767422, + "num_input_tokens_seen": 236897730, + "router_z_loss_clip": 1.72949219, + "router_z_loss_mlp": 0.19677734, + "step": 10974, + "time_per_iteration": 2.7895748615264893 + }, + { + "auxiliary_loss_clip": 0.01427459, + "auxiliary_loss_mlp": 0.01033654, + "balance_loss_clip": 1.26007366, + "balance_loss_mlp": 1.01350713, + "epoch": 0.6598526980309635, + "flos": 22829059756800.0, + "grad_norm": 2.7104091843030234, + "language_loss": 0.69063509, + "learning_rate": 1.0956471627741906e-06, + "loss": 0.71524626, + "num_input_tokens_seen": 236917300, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.20141602, + "step": 10975, + "time_per_iteration": 2.8518686294555664 + }, + { + "auxiliary_loss_clip": 0.01442859, + "auxiliary_loss_mlp": 0.01032744, + "balance_loss_clip": 1.27613187, + "balance_loss_mlp": 1.01369405, + "epoch": 0.6599128212836315, + "flos": 21077591823360.0, + "grad_norm": 1.6553419306269885, + "language_loss": 0.71289349, + "learning_rate": 1.0952998079414464e-06, + "loss": 0.7376495, + "num_input_tokens_seen": 236935590, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.19055176, + "step": 10976, + "time_per_iteration": 2.8385961055755615 + }, + { + "auxiliary_loss_clip": 0.01421754, + "auxiliary_loss_mlp": 0.01032387, + "balance_loss_clip": 1.25875783, + "balance_loss_mlp": 1.0129559, + "epoch": 0.6599729445362994, + "flos": 22173605314560.0, + "grad_norm": 1.9051431461042905, + "language_loss": 0.68047488, + "learning_rate": 1.0949524874150243e-06, + "loss": 0.70501631, + "num_input_tokens_seen": 236952830, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19421387, + "step": 10977, + "time_per_iteration": 2.9098968505859375 + }, + { + "auxiliary_loss_clip": 0.01442846, + "auxiliary_loss_mlp": 0.01038211, + "balance_loss_clip": 1.27229166, + "balance_loss_mlp": 1.01674151, + "epoch": 0.6600330677889674, + "flos": 18159112028160.0, + "grad_norm": 2.455265698195825, + "language_loss": 0.82163179, + "learning_rate": 1.0946052012080952e-06, + "loss": 0.84644234, + "num_input_tokens_seen": 236971930, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.21459961, + "step": 10978, + "time_per_iteration": 2.8498494625091553 + }, + { + "auxiliary_loss_clip": 0.01434774, + "auxiliary_loss_mlp": 0.01037587, + "balance_loss_clip": 1.2664634, + "balance_loss_mlp": 1.01726174, + "epoch": 0.6600931910416353, + "flos": 18159157272960.0, + "grad_norm": 2.178262625761936, + "language_loss": 0.68207771, + "learning_rate": 1.0942579493338278e-06, + "loss": 0.7068013, + "num_input_tokens_seen": 236989920, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.20324707, + "step": 10979, + "time_per_iteration": 2.808260679244995 + }, + { + "auxiliary_loss_clip": 0.01440701, + "auxiliary_loss_mlp": 0.01033095, + "balance_loss_clip": 1.27222729, + "balance_loss_mlp": 1.01334262, + "epoch": 0.6601533142943034, + "flos": 17429673058560.0, + "grad_norm": 2.1822376973683464, + "language_loss": 0.74279439, + "learning_rate": 1.0939107318053889e-06, + "loss": 0.76753241, + "num_input_tokens_seen": 237006570, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.19750977, + "step": 10980, + "time_per_iteration": 2.816565990447998 + }, + { + "auxiliary_loss_clip": 0.01414933, + "auxiliary_loss_mlp": 0.01033696, + "balance_loss_clip": 1.25301838, + "balance_loss_mlp": 1.01531386, + "epoch": 0.6602134375469713, + "flos": 28231070653440.0, + "grad_norm": 1.641802742037147, + "language_loss": 0.74365091, + "learning_rate": 1.0935635486359459e-06, + "loss": 0.76813722, + "num_input_tokens_seen": 237028415, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18371582, + "step": 10981, + "time_per_iteration": 2.8924028873443604 + }, + { + "auxiliary_loss_clip": 0.01434561, + "auxiliary_loss_mlp": 0.01038725, + "balance_loss_clip": 1.26688528, + "balance_loss_mlp": 1.01871002, + "epoch": 0.6602735607996393, + "flos": 29428423793280.0, + "grad_norm": 1.9165924845988083, + "language_loss": 0.69480109, + "learning_rate": 1.0932163998386647e-06, + "loss": 0.71953398, + "num_input_tokens_seen": 237046595, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.20019531, + "step": 10982, + "time_per_iteration": 2.912083864212036 + }, + { + "auxiliary_loss_clip": 0.01434881, + "auxiliary_loss_mlp": 0.01036362, + "balance_loss_clip": 1.26956725, + "balance_loss_mlp": 1.0164063, + "epoch": 0.6603336840523072, + "flos": 18597227857920.0, + "grad_norm": 1.6036143411135828, + "language_loss": 0.70121622, + "learning_rate": 1.0928692854267075e-06, + "loss": 0.7259286, + "num_input_tokens_seen": 237066150, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19946289, + "step": 10983, + "time_per_iteration": 2.8398022651672363 + }, + { + "auxiliary_loss_clip": 0.01450008, + "auxiliary_loss_mlp": 0.01037503, + "balance_loss_clip": 1.28118396, + "balance_loss_mlp": 1.01679611, + "epoch": 0.6603938073049752, + "flos": 33267728638080.0, + "grad_norm": 1.618533785857711, + "language_loss": 0.70960331, + "learning_rate": 1.092522205413239e-06, + "loss": 0.73447841, + "num_input_tokens_seen": 237087060, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.20703125, + "step": 10984, + "time_per_iteration": 2.949765205383301 + }, + { + "auxiliary_loss_clip": 0.01424403, + "auxiliary_loss_mlp": 0.01041583, + "balance_loss_clip": 1.26177406, + "balance_loss_mlp": 1.0210197, + "epoch": 0.6604539305576431, + "flos": 17393223732480.0, + "grad_norm": 6.6697318406176525, + "language_loss": 0.84982377, + "learning_rate": 1.0921751598114193e-06, + "loss": 0.87448364, + "num_input_tokens_seen": 237103825, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.20568848, + "step": 10985, + "time_per_iteration": 2.8638951778411865 + }, + { + "auxiliary_loss_clip": 0.01433326, + "auxiliary_loss_mlp": 0.01042235, + "balance_loss_clip": 1.26601398, + "balance_loss_mlp": 1.0227797, + "epoch": 0.6605140538103111, + "flos": 21260924328960.0, + "grad_norm": 1.966282765104304, + "language_loss": 0.74863005, + "learning_rate": 1.0918281486344077e-06, + "loss": 0.77338564, + "num_input_tokens_seen": 237121740, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.19433594, + "step": 10986, + "time_per_iteration": 2.8699233531951904 + }, + { + "auxiliary_loss_clip": 0.01425444, + "auxiliary_loss_mlp": 0.01034248, + "balance_loss_clip": 1.26223433, + "balance_loss_mlp": 1.01449454, + "epoch": 0.6605741770629792, + "flos": 13889744928000.0, + "grad_norm": 1.7563460733274847, + "language_loss": 0.79782033, + "learning_rate": 1.0914811718953636e-06, + "loss": 0.8224172, + "num_input_tokens_seen": 237139565, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19750977, + "step": 10987, + "time_per_iteration": 2.8451921939849854 + }, + { + "auxiliary_loss_clip": 0.01195976, + "auxiliary_loss_mlp": 0.01017531, + "balance_loss_clip": 1.10221434, + "balance_loss_mlp": 0.99902993, + "epoch": 0.6606343003156471, + "flos": 69351393795840.0, + "grad_norm": 0.8164921363868355, + "language_loss": 0.54237878, + "learning_rate": 1.0911342296074454e-06, + "loss": 0.5645138, + "num_input_tokens_seen": 237201055, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.18457031, + "step": 10988, + "time_per_iteration": 4.929902076721191 + }, + { + "auxiliary_loss_clip": 0.01426041, + "auxiliary_loss_mlp": 0.01039725, + "balance_loss_clip": 1.26223195, + "balance_loss_mlp": 1.02038968, + "epoch": 0.6606944235683151, + "flos": 27284021602560.0, + "grad_norm": 1.6003259157535057, + "language_loss": 0.78083026, + "learning_rate": 1.0907873217838077e-06, + "loss": 0.80548793, + "num_input_tokens_seen": 237221805, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.1932373, + "step": 10989, + "time_per_iteration": 2.9416205883026123 + }, + { + "auxiliary_loss_clip": 0.01421637, + "auxiliary_loss_mlp": 0.01039019, + "balance_loss_clip": 1.25896358, + "balance_loss_mlp": 1.01936162, + "epoch": 0.660754546820983, + "flos": 13780985132160.0, + "grad_norm": 2.1318961361213535, + "language_loss": 0.77914625, + "learning_rate": 1.0904404484376064e-06, + "loss": 0.80375278, + "num_input_tokens_seen": 237238270, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19677734, + "step": 10990, + "time_per_iteration": 2.8615665435791016 + }, + { + "auxiliary_loss_clip": 0.01436758, + "auxiliary_loss_mlp": 0.01034368, + "balance_loss_clip": 1.2688421, + "balance_loss_mlp": 1.01376879, + "epoch": 0.660814670073651, + "flos": 15713523331200.0, + "grad_norm": 1.7762384470734154, + "language_loss": 0.61083078, + "learning_rate": 1.0900936095819937e-06, + "loss": 0.63554204, + "num_input_tokens_seen": 237255400, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.20593262, + "step": 10991, + "time_per_iteration": 2.8613767623901367 + }, + { + "auxiliary_loss_clip": 0.01425824, + "auxiliary_loss_mlp": 0.01037601, + "balance_loss_clip": 1.25758028, + "balance_loss_mlp": 1.01753831, + "epoch": 0.6608747933263189, + "flos": 20859755518080.0, + "grad_norm": 2.3980015223955466, + "language_loss": 0.69140971, + "learning_rate": 1.0897468052301234e-06, + "loss": 0.71604395, + "num_input_tokens_seen": 237273105, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.20056152, + "step": 10992, + "time_per_iteration": 2.92490816116333 + }, + { + "auxiliary_loss_clip": 0.01432662, + "auxiliary_loss_mlp": 0.01034563, + "balance_loss_clip": 1.26317573, + "balance_loss_mlp": 1.0145601, + "epoch": 0.660934916578987, + "flos": 20642054947200.0, + "grad_norm": 2.8373544852623107, + "language_loss": 0.88522524, + "learning_rate": 1.0894000353951444e-06, + "loss": 0.90989745, + "num_input_tokens_seen": 237292650, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.1998291, + "step": 10993, + "time_per_iteration": 2.870835065841675 + }, + { + "auxiliary_loss_clip": 0.01450593, + "auxiliary_loss_mlp": 0.0103634, + "balance_loss_clip": 1.27811933, + "balance_loss_mlp": 1.0150131, + "epoch": 0.6609950398316549, + "flos": 25123059815040.0, + "grad_norm": 1.6415001594437124, + "language_loss": 0.67099082, + "learning_rate": 1.0890533000902078e-06, + "loss": 0.69586021, + "num_input_tokens_seen": 237312865, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.21325684, + "step": 10994, + "time_per_iteration": 2.8836066722869873 + }, + { + "auxiliary_loss_clip": 0.0142273, + "auxiliary_loss_mlp": 0.01037206, + "balance_loss_clip": 1.25885296, + "balance_loss_mlp": 1.01633215, + "epoch": 0.6610551630843229, + "flos": 18670443223680.0, + "grad_norm": 1.625882352999474, + "language_loss": 0.78141105, + "learning_rate": 1.0887065993284626e-06, + "loss": 0.80601037, + "num_input_tokens_seen": 237331210, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.20874023, + "step": 10995, + "time_per_iteration": 2.866201400756836 + }, + { + "auxiliary_loss_clip": 0.01425038, + "auxiliary_loss_mlp": 0.01030924, + "balance_loss_clip": 1.25981593, + "balance_loss_mlp": 1.01239848, + "epoch": 0.6611152863369908, + "flos": 23268578175360.0, + "grad_norm": 1.83441985039408, + "language_loss": 0.75329894, + "learning_rate": 1.088359933123053e-06, + "loss": 0.77785861, + "num_input_tokens_seen": 237349455, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.18530273, + "step": 10996, + "time_per_iteration": 2.8797202110290527 + }, + { + "auxiliary_loss_clip": 0.01436584, + "auxiliary_loss_mlp": 0.01033895, + "balance_loss_clip": 1.27020442, + "balance_loss_mlp": 1.01497638, + "epoch": 0.6611754095896588, + "flos": 22168809365760.0, + "grad_norm": 1.7580351886437768, + "language_loss": 0.6954245, + "learning_rate": 1.088013301487126e-06, + "loss": 0.72012925, + "num_input_tokens_seen": 237367100, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.18920898, + "step": 10997, + "time_per_iteration": 2.876394748687744 + }, + { + "auxiliary_loss_clip": 0.0143537, + "auxiliary_loss_mlp": 0.01031329, + "balance_loss_clip": 1.26589465, + "balance_loss_mlp": 1.01142073, + "epoch": 0.6612355328423267, + "flos": 13999319130240.0, + "grad_norm": 2.0641019560732836, + "language_loss": 0.69999003, + "learning_rate": 1.0876667044338269e-06, + "loss": 0.724657, + "num_input_tokens_seen": 237384840, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.19897461, + "step": 10998, + "time_per_iteration": 2.9563357830047607 + }, + { + "auxiliary_loss_clip": 0.01200081, + "auxiliary_loss_mlp": 0.01022255, + "balance_loss_clip": 1.10787654, + "balance_loss_mlp": 0.99688685, + "epoch": 0.6612956560949947, + "flos": 61482167187840.0, + "grad_norm": 0.6543628429404955, + "language_loss": 0.51178765, + "learning_rate": 1.087320141976297e-06, + "loss": 0.53401101, + "num_input_tokens_seen": 237443355, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.25390625, + "step": 10999, + "time_per_iteration": 4.768325328826904 + }, + { + "auxiliary_loss_clip": 0.01434388, + "auxiliary_loss_mlp": 0.0103505, + "balance_loss_clip": 1.2638135, + "balance_loss_mlp": 1.01486838, + "epoch": 0.6613557793476627, + "flos": 21626684444160.0, + "grad_norm": 2.525818009311192, + "language_loss": 0.71144152, + "learning_rate": 1.086973614127679e-06, + "loss": 0.73613596, + "num_input_tokens_seen": 237459205, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.20178223, + "step": 11000, + "time_per_iteration": 2.8796279430389404 + }, + { + "auxiliary_loss_clip": 0.01419475, + "auxiliary_loss_mlp": 0.01036396, + "balance_loss_clip": 1.25742531, + "balance_loss_mlp": 1.01807332, + "epoch": 0.6614159026003307, + "flos": 34032938261760.0, + "grad_norm": 2.92556389187988, + "language_loss": 0.66006047, + "learning_rate": 1.0866271209011133e-06, + "loss": 0.68461919, + "num_input_tokens_seen": 237483580, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18334961, + "step": 11001, + "time_per_iteration": 2.993093967437744 + }, + { + "auxiliary_loss_clip": 0.0143039, + "auxiliary_loss_mlp": 0.01033324, + "balance_loss_clip": 1.2664156, + "balance_loss_mlp": 1.01385736, + "epoch": 0.6614760258529987, + "flos": 24107862816000.0, + "grad_norm": 1.8219628087561184, + "language_loss": 0.73362482, + "learning_rate": 1.086280662309739e-06, + "loss": 0.75826198, + "num_input_tokens_seen": 237502860, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19470215, + "step": 11002, + "time_per_iteration": 4.338241338729858 + }, + { + "auxiliary_loss_clip": 0.01423158, + "auxiliary_loss_mlp": 0.010366, + "balance_loss_clip": 1.25859475, + "balance_loss_mlp": 1.01559544, + "epoch": 0.6615361491056666, + "flos": 14912226339840.0, + "grad_norm": 2.351977397410118, + "language_loss": 0.79621828, + "learning_rate": 1.0859342383666928e-06, + "loss": 0.8208158, + "num_input_tokens_seen": 237521030, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.20983887, + "step": 11003, + "time_per_iteration": 4.222936153411865 + }, + { + "auxiliary_loss_clip": 0.01434602, + "auxiliary_loss_mlp": 0.01035447, + "balance_loss_clip": 1.26663327, + "balance_loss_mlp": 1.01476407, + "epoch": 0.6615962723583346, + "flos": 15313349905920.0, + "grad_norm": 2.6619180899722084, + "language_loss": 0.69675767, + "learning_rate": 1.0855878490851119e-06, + "loss": 0.72145814, + "num_input_tokens_seen": 237539585, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.20690918, + "step": 11004, + "time_per_iteration": 2.8871705532073975 + }, + { + "auxiliary_loss_clip": 0.01440277, + "auxiliary_loss_mlp": 0.01034833, + "balance_loss_clip": 1.27059758, + "balance_loss_mlp": 1.01398385, + "epoch": 0.6616563956110025, + "flos": 18741441594240.0, + "grad_norm": 2.66075156555359, + "language_loss": 0.70541143, + "learning_rate": 1.085241494478132e-06, + "loss": 0.7301625, + "num_input_tokens_seen": 237557655, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.20861816, + "step": 11005, + "time_per_iteration": 2.9882431030273438 + }, + { + "auxiliary_loss_clip": 0.01421151, + "auxiliary_loss_mlp": 0.01031078, + "balance_loss_clip": 1.2574718, + "balance_loss_mlp": 1.01091945, + "epoch": 0.6617165188636706, + "flos": 24504823860480.0, + "grad_norm": 1.5577203097400751, + "language_loss": 0.78617454, + "learning_rate": 1.0848951745588855e-06, + "loss": 0.81069684, + "num_input_tokens_seen": 237577000, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.20141602, + "step": 11006, + "time_per_iteration": 2.9194726943969727 + }, + { + "auxiliary_loss_clip": 0.01432857, + "auxiliary_loss_mlp": 0.01033895, + "balance_loss_clip": 1.26720643, + "balance_loss_mlp": 1.01396322, + "epoch": 0.6617766421163385, + "flos": 22389088890240.0, + "grad_norm": 1.4908491551080651, + "language_loss": 0.76952457, + "learning_rate": 1.0845488893405068e-06, + "loss": 0.79419208, + "num_input_tokens_seen": 237597960, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.19934082, + "step": 11007, + "time_per_iteration": 2.9044909477233887 + }, + { + "auxiliary_loss_clip": 0.01428424, + "auxiliary_loss_mlp": 0.01030654, + "balance_loss_clip": 1.26269567, + "balance_loss_mlp": 1.01109195, + "epoch": 0.6618367653690065, + "flos": 20860253210880.0, + "grad_norm": 1.6115861110822407, + "language_loss": 0.79215527, + "learning_rate": 1.0842026388361248e-06, + "loss": 0.81674612, + "num_input_tokens_seen": 237616385, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.19567871, + "step": 11008, + "time_per_iteration": 2.828664779663086 + }, + { + "auxiliary_loss_clip": 0.0144494, + "auxiliary_loss_mlp": 0.01042153, + "balance_loss_clip": 1.27313876, + "balance_loss_mlp": 1.02100563, + "epoch": 0.6618968886216744, + "flos": 17721267667200.0, + "grad_norm": 1.6992891888753698, + "language_loss": 0.82747173, + "learning_rate": 1.0838564230588715e-06, + "loss": 0.85234272, + "num_input_tokens_seen": 237634930, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.21142578, + "step": 11009, + "time_per_iteration": 2.8581507205963135 + }, + { + "auxiliary_loss_clip": 0.01198594, + "auxiliary_loss_mlp": 0.01017489, + "balance_loss_clip": 1.10648942, + "balance_loss_mlp": 1.00108552, + "epoch": 0.6619570118743424, + "flos": 67065067370880.0, + "grad_norm": 1.066420775521907, + "language_loss": 0.67392373, + "learning_rate": 1.0835102420218735e-06, + "loss": 0.6960845, + "num_input_tokens_seen": 237693175, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.1640625, + "step": 11010, + "time_per_iteration": 3.308628559112549 + }, + { + "auxiliary_loss_clip": 0.01433329, + "auxiliary_loss_mlp": 0.0103591, + "balance_loss_clip": 1.26607561, + "balance_loss_mlp": 1.0155375, + "epoch": 0.6620171351270103, + "flos": 18670081265280.0, + "grad_norm": 1.711432934618372, + "language_loss": 0.72070307, + "learning_rate": 1.0831640957382593e-06, + "loss": 0.74539542, + "num_input_tokens_seen": 237713160, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.20361328, + "step": 11011, + "time_per_iteration": 3.003183364868164 + }, + { + "auxiliary_loss_clip": 0.01428206, + "auxiliary_loss_mlp": 0.01036592, + "balance_loss_clip": 1.26215982, + "balance_loss_mlp": 1.0172441, + "epoch": 0.6620772583796783, + "flos": 24181168671360.0, + "grad_norm": 1.4998660404920836, + "language_loss": 0.72886801, + "learning_rate": 1.0828179842211557e-06, + "loss": 0.75351596, + "num_input_tokens_seen": 237733600, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19348145, + "step": 11012, + "time_per_iteration": 2.950166702270508 + }, + { + "auxiliary_loss_clip": 0.01410494, + "auxiliary_loss_mlp": 0.01036309, + "balance_loss_clip": 1.25402236, + "balance_loss_mlp": 1.01852274, + "epoch": 0.6621373816323463, + "flos": 23634157311360.0, + "grad_norm": 2.3463813490119114, + "language_loss": 0.79763502, + "learning_rate": 1.0824719074836845e-06, + "loss": 0.82210302, + "num_input_tokens_seen": 237752135, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.17797852, + "step": 11013, + "time_per_iteration": 2.9012835025787354 + }, + { + "auxiliary_loss_clip": 0.0142602, + "auxiliary_loss_mlp": 0.01031647, + "balance_loss_clip": 1.26254463, + "balance_loss_mlp": 1.01119113, + "epoch": 0.6621975048850143, + "flos": 18451385308800.0, + "grad_norm": 2.0963815646088557, + "language_loss": 0.71115911, + "learning_rate": 1.082125865538971e-06, + "loss": 0.73573577, + "num_input_tokens_seen": 237770735, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.20446777, + "step": 11014, + "time_per_iteration": 2.8598523139953613 + }, + { + "auxiliary_loss_clip": 0.0142071, + "auxiliary_loss_mlp": 0.0103335, + "balance_loss_clip": 1.25853658, + "balance_loss_mlp": 1.01464629, + "epoch": 0.6622576281376823, + "flos": 14071901068800.0, + "grad_norm": 2.004796984183188, + "language_loss": 0.78101063, + "learning_rate": 1.081779858400137e-06, + "loss": 0.80555129, + "num_input_tokens_seen": 237789005, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.18701172, + "step": 11015, + "time_per_iteration": 2.8403983116149902 + }, + { + "auxiliary_loss_clip": 0.01421857, + "auxiliary_loss_mlp": 0.01033733, + "balance_loss_clip": 1.25835729, + "balance_loss_mlp": 1.01381302, + "epoch": 0.6623177513903502, + "flos": 17027689841280.0, + "grad_norm": 1.68257584952964, + "language_loss": 0.82779592, + "learning_rate": 1.0814338860803021e-06, + "loss": 0.85235178, + "num_input_tokens_seen": 237807740, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19921875, + "step": 11016, + "time_per_iteration": 2.847731351852417 + }, + { + "auxiliary_loss_clip": 0.01441387, + "auxiliary_loss_mlp": 0.01032496, + "balance_loss_clip": 1.27216482, + "balance_loss_mlp": 1.01251662, + "epoch": 0.6623778746430182, + "flos": 17278944071040.0, + "grad_norm": 2.0403059928324607, + "language_loss": 0.70672214, + "learning_rate": 1.0810879485925864e-06, + "loss": 0.73146093, + "num_input_tokens_seen": 237826340, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.1998291, + "step": 11017, + "time_per_iteration": 2.847506523132324 + }, + { + "auxiliary_loss_clip": 0.01416786, + "auxiliary_loss_mlp": 0.01037867, + "balance_loss_clip": 1.25371718, + "balance_loss_mlp": 1.01725554, + "epoch": 0.6624379978956861, + "flos": 48808262540160.0, + "grad_norm": 2.129593044946957, + "language_loss": 0.7772975, + "learning_rate": 1.0807420459501084e-06, + "loss": 0.80184406, + "num_input_tokens_seen": 237848305, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.20593262, + "step": 11018, + "time_per_iteration": 3.090596914291382 + }, + { + "auxiliary_loss_clip": 0.01417406, + "auxiliary_loss_mlp": 0.01036929, + "balance_loss_clip": 1.25386882, + "balance_loss_mlp": 1.01619828, + "epoch": 0.6624981211483542, + "flos": 18961585384320.0, + "grad_norm": 1.942744994543718, + "language_loss": 0.84377569, + "learning_rate": 1.0803961781659841e-06, + "loss": 0.86831903, + "num_input_tokens_seen": 237867020, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.20739746, + "step": 11019, + "time_per_iteration": 2.845418691635132 + }, + { + "auxiliary_loss_clip": 0.01425262, + "auxiliary_loss_mlp": 0.01034564, + "balance_loss_clip": 1.2620101, + "balance_loss_mlp": 1.01527584, + "epoch": 0.6625582444010221, + "flos": 23266542159360.0, + "grad_norm": 1.4545264189571288, + "language_loss": 0.72199631, + "learning_rate": 1.080050345253328e-06, + "loss": 0.74659455, + "num_input_tokens_seen": 237886710, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19299316, + "step": 11020, + "time_per_iteration": 2.8669159412384033 + }, + { + "auxiliary_loss_clip": 0.01457278, + "auxiliary_loss_mlp": 0.01037939, + "balance_loss_clip": 1.28368688, + "balance_loss_mlp": 1.01664865, + "epoch": 0.6626183676536901, + "flos": 21404052190080.0, + "grad_norm": 1.7441546112786488, + "language_loss": 0.72823197, + "learning_rate": 1.0797045472252554e-06, + "loss": 0.75318408, + "num_input_tokens_seen": 237904795, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.21289062, + "step": 11021, + "time_per_iteration": 2.853935956954956 + }, + { + "auxiliary_loss_clip": 0.01434594, + "auxiliary_loss_mlp": 0.01037981, + "balance_loss_clip": 1.26913154, + "balance_loss_mlp": 1.01751256, + "epoch": 0.662678490906358, + "flos": 14578572049920.0, + "grad_norm": 3.562046711791596, + "language_loss": 0.84117097, + "learning_rate": 1.0793587840948793e-06, + "loss": 0.8658967, + "num_input_tokens_seen": 237921320, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.20471191, + "step": 11022, + "time_per_iteration": 2.857743740081787 + }, + { + "auxiliary_loss_clip": 0.01457142, + "auxiliary_loss_mlp": 0.01041259, + "balance_loss_clip": 1.28059101, + "balance_loss_mlp": 1.01949143, + "epoch": 0.662738614159026, + "flos": 15999507584640.0, + "grad_norm": 2.317050365163501, + "language_loss": 0.74430817, + "learning_rate": 1.0790130558753099e-06, + "loss": 0.76929218, + "num_input_tokens_seen": 237933525, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.21765137, + "step": 11023, + "time_per_iteration": 2.7965474128723145 + }, + { + "auxiliary_loss_clip": 0.0142859, + "auxiliary_loss_mlp": 0.01034538, + "balance_loss_clip": 1.26393557, + "balance_loss_mlp": 1.0146178, + "epoch": 0.6627987374116939, + "flos": 19545679497600.0, + "grad_norm": 1.8604968575581138, + "language_loss": 0.75302637, + "learning_rate": 1.0786673625796574e-06, + "loss": 0.77765763, + "num_input_tokens_seen": 237953395, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19909668, + "step": 11024, + "time_per_iteration": 4.29595685005188 + }, + { + "auxiliary_loss_clip": 0.01446337, + "auxiliary_loss_mlp": 0.01033466, + "balance_loss_clip": 1.2780205, + "balance_loss_mlp": 1.01280737, + "epoch": 0.662858860664362, + "flos": 15710537174400.0, + "grad_norm": 3.9791488322279176, + "language_loss": 0.69922733, + "learning_rate": 1.0783217042210306e-06, + "loss": 0.72402537, + "num_input_tokens_seen": 237971445, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.20654297, + "step": 11025, + "time_per_iteration": 2.9009432792663574 + }, + { + "auxiliary_loss_clip": 0.01438253, + "auxiliary_loss_mlp": 0.01041853, + "balance_loss_clip": 1.27175021, + "balance_loss_mlp": 1.02124143, + "epoch": 0.6629189839170299, + "flos": 20163191535360.0, + "grad_norm": 2.103361138824014, + "language_loss": 0.79861617, + "learning_rate": 1.0779760808125379e-06, + "loss": 0.82341725, + "num_input_tokens_seen": 237989965, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.20617676, + "step": 11026, + "time_per_iteration": 2.8928418159484863 + }, + { + "auxiliary_loss_clip": 0.01419692, + "auxiliary_loss_mlp": 0.01036541, + "balance_loss_clip": 1.25645065, + "balance_loss_mlp": 1.01681137, + "epoch": 0.6629791071696979, + "flos": 20923198007040.0, + "grad_norm": 1.6561361561026828, + "language_loss": 0.76782769, + "learning_rate": 1.0776304923672842e-06, + "loss": 0.79238999, + "num_input_tokens_seen": 238006820, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19726562, + "step": 11027, + "time_per_iteration": 2.8810386657714844 + }, + { + "auxiliary_loss_clip": 0.0144035, + "auxiliary_loss_mlp": 0.01039401, + "balance_loss_clip": 1.27272677, + "balance_loss_mlp": 1.01837289, + "epoch": 0.6630392304223659, + "flos": 20855819220480.0, + "grad_norm": 4.082304778947198, + "language_loss": 0.70424139, + "learning_rate": 1.0772849388983742e-06, + "loss": 0.72903889, + "num_input_tokens_seen": 238022560, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.21032715, + "step": 11028, + "time_per_iteration": 2.8347034454345703 + }, + { + "auxiliary_loss_clip": 0.01430523, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.26495826, + "balance_loss_mlp": 1.01662052, + "epoch": 0.6630993536750338, + "flos": 21005552822400.0, + "grad_norm": 1.8925748807286094, + "language_loss": 0.80316794, + "learning_rate": 1.0769394204189138e-06, + "loss": 0.82783067, + "num_input_tokens_seen": 238041895, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19140625, + "step": 11029, + "time_per_iteration": 2.892699718475342 + }, + { + "auxiliary_loss_clip": 0.01437052, + "auxiliary_loss_mlp": 0.01037505, + "balance_loss_clip": 1.26869535, + "balance_loss_mlp": 1.01605916, + "epoch": 0.6631594769277018, + "flos": 18267962313600.0, + "grad_norm": 1.8848817858762656, + "language_loss": 0.76795733, + "learning_rate": 1.0765939369420012e-06, + "loss": 0.79270291, + "num_input_tokens_seen": 238060445, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.21447754, + "step": 11030, + "time_per_iteration": 2.8199734687805176 + }, + { + "auxiliary_loss_clip": 0.01455599, + "auxiliary_loss_mlp": 0.01032882, + "balance_loss_clip": 1.2838223, + "balance_loss_mlp": 1.01303411, + "epoch": 0.6632196001803697, + "flos": 17829122567040.0, + "grad_norm": 2.455962154670929, + "language_loss": 0.76641083, + "learning_rate": 1.0762484884807391e-06, + "loss": 0.79129565, + "num_input_tokens_seen": 238077080, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.19824219, + "step": 11031, + "time_per_iteration": 2.8109354972839355 + }, + { + "auxiliary_loss_clip": 0.01441895, + "auxiliary_loss_mlp": 0.01037974, + "balance_loss_clip": 1.2716223, + "balance_loss_mlp": 1.01789951, + "epoch": 0.6632797234330378, + "flos": 12675967925760.0, + "grad_norm": 2.909222146518567, + "language_loss": 0.75884277, + "learning_rate": 1.075903075048228e-06, + "loss": 0.78364146, + "num_input_tokens_seen": 238091045, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.20068359, + "step": 11032, + "time_per_iteration": 2.8275091648101807 + }, + { + "auxiliary_loss_clip": 0.0142474, + "auxiliary_loss_mlp": 0.01032707, + "balance_loss_clip": 1.26031065, + "balance_loss_mlp": 1.01238227, + "epoch": 0.6633398466857057, + "flos": 23594721828480.0, + "grad_norm": 1.7711713044232624, + "language_loss": 0.81153369, + "learning_rate": 1.0755576966575635e-06, + "loss": 0.83610821, + "num_input_tokens_seen": 238110220, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.203125, + "step": 11033, + "time_per_iteration": 2.878173589706421 + }, + { + "auxiliary_loss_clip": 0.01432896, + "auxiliary_loss_mlp": 0.01035414, + "balance_loss_clip": 1.26575732, + "balance_loss_mlp": 1.01594734, + "epoch": 0.6633999699383737, + "flos": 20641512009600.0, + "grad_norm": 1.5941875014603308, + "language_loss": 0.8128767, + "learning_rate": 1.0752123533218451e-06, + "loss": 0.8375597, + "num_input_tokens_seen": 238130400, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.19470215, + "step": 11034, + "time_per_iteration": 2.866323709487915 + }, + { + "auxiliary_loss_clip": 0.01434854, + "auxiliary_loss_mlp": 0.01034336, + "balance_loss_clip": 1.26936364, + "balance_loss_mlp": 1.01522648, + "epoch": 0.6634600931910416, + "flos": 21806668834560.0, + "grad_norm": 1.6795845384947061, + "language_loss": 0.7631079, + "learning_rate": 1.074867045054166e-06, + "loss": 0.78779978, + "num_input_tokens_seen": 238148165, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.19104004, + "step": 11035, + "time_per_iteration": 4.261380195617676 + }, + { + "auxiliary_loss_clip": 0.01441196, + "auxiliary_loss_mlp": 0.0102986, + "balance_loss_clip": 1.26981187, + "balance_loss_mlp": 1.01065516, + "epoch": 0.6635202164437096, + "flos": 18741803552640.0, + "grad_norm": 1.9897433638394464, + "language_loss": 0.83989078, + "learning_rate": 1.074521771867622e-06, + "loss": 0.86460137, + "num_input_tokens_seen": 238166360, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.1920166, + "step": 11036, + "time_per_iteration": 2.8304898738861084 + }, + { + "auxiliary_loss_clip": 0.01191666, + "auxiliary_loss_mlp": 0.01019375, + "balance_loss_clip": 1.10119271, + "balance_loss_mlp": 0.99772704, + "epoch": 0.6635803396963775, + "flos": 60253386894720.0, + "grad_norm": 0.7851179417026694, + "language_loss": 0.52346206, + "learning_rate": 1.0741765337753044e-06, + "loss": 0.54557252, + "num_input_tokens_seen": 238227630, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.21679688, + "step": 11037, + "time_per_iteration": 4.71683144569397 + }, + { + "auxiliary_loss_clip": 0.01427659, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.26044178, + "balance_loss_mlp": 1.01807654, + "epoch": 0.6636404629490456, + "flos": 29178481662720.0, + "grad_norm": 1.805759948702868, + "language_loss": 0.79702127, + "learning_rate": 1.0738313307903052e-06, + "loss": 0.82168621, + "num_input_tokens_seen": 238248435, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.20788574, + "step": 11038, + "time_per_iteration": 4.329996824264526 + }, + { + "auxiliary_loss_clip": 0.01444839, + "auxiliary_loss_mlp": 0.01040309, + "balance_loss_clip": 1.27671313, + "balance_loss_mlp": 1.01907825, + "epoch": 0.6637005862017135, + "flos": 38921400967680.0, + "grad_norm": 2.368792921063246, + "language_loss": 0.64827871, + "learning_rate": 1.073486162925716e-06, + "loss": 0.67313021, + "num_input_tokens_seen": 238268755, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.2121582, + "step": 11039, + "time_per_iteration": 3.0127322673797607 + }, + { + "auxiliary_loss_clip": 0.01438996, + "auxiliary_loss_mlp": 0.01031972, + "balance_loss_clip": 1.26913977, + "balance_loss_mlp": 1.01170635, + "epoch": 0.6637607094543815, + "flos": 22793470081920.0, + "grad_norm": 2.1597634048532144, + "language_loss": 0.65167117, + "learning_rate": 1.0731410301946237e-06, + "loss": 0.67638087, + "num_input_tokens_seen": 238290120, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.20251465, + "step": 11040, + "time_per_iteration": 2.913280487060547 + }, + { + "auxiliary_loss_clip": 0.01414539, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.25004768, + "balance_loss_mlp": 1.01384211, + "epoch": 0.6638208327070495, + "flos": 18123250884480.0, + "grad_norm": 3.244780806413256, + "language_loss": 0.72762805, + "learning_rate": 1.0727959326101161e-06, + "loss": 0.75211012, + "num_input_tokens_seen": 238309290, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.19824219, + "step": 11041, + "time_per_iteration": 2.8591063022613525 + }, + { + "auxiliary_loss_clip": 0.01425889, + "auxiliary_loss_mlp": 0.01043373, + "balance_loss_clip": 1.26080537, + "balance_loss_mlp": 1.02211821, + "epoch": 0.6638809559597174, + "flos": 29437020305280.0, + "grad_norm": 2.529496351712869, + "language_loss": 0.62381572, + "learning_rate": 1.0724508701852806e-06, + "loss": 0.64850837, + "num_input_tokens_seen": 238327280, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.21264648, + "step": 11042, + "time_per_iteration": 2.914703130722046 + }, + { + "auxiliary_loss_clip": 0.01445482, + "auxiliary_loss_mlp": 0.01037642, + "balance_loss_clip": 1.27393341, + "balance_loss_mlp": 1.01716185, + "epoch": 0.6639410792123854, + "flos": 28083916005120.0, + "grad_norm": 2.9119625467128762, + "language_loss": 0.68839514, + "learning_rate": 1.0721058429331998e-06, + "loss": 0.71322638, + "num_input_tokens_seen": 238346330, + "router_z_loss_clip": 1.71484375, + "router_z_loss_mlp": 0.20483398, + "step": 11043, + "time_per_iteration": 2.9312386512756348 + }, + { + "auxiliary_loss_clip": 0.01418789, + "auxiliary_loss_mlp": 0.0103449, + "balance_loss_clip": 1.25883305, + "balance_loss_mlp": 1.01581001, + "epoch": 0.6640012024650533, + "flos": 25567238448000.0, + "grad_norm": 1.573654016784822, + "language_loss": 0.84294999, + "learning_rate": 1.0717608508669587e-06, + "loss": 0.86748278, + "num_input_tokens_seen": 238364650, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18676758, + "step": 11044, + "time_per_iteration": 2.8979432582855225 + }, + { + "auxiliary_loss_clip": 0.01433504, + "auxiliary_loss_mlp": 0.01035455, + "balance_loss_clip": 1.26810455, + "balance_loss_mlp": 1.01496327, + "epoch": 0.6640613257177214, + "flos": 14875279320960.0, + "grad_norm": 2.109222610165583, + "language_loss": 0.70706749, + "learning_rate": 1.0714158939996392e-06, + "loss": 0.73175716, + "num_input_tokens_seen": 238381630, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.20495605, + "step": 11045, + "time_per_iteration": 2.80625581741333 + }, + { + "auxiliary_loss_clip": 0.01437573, + "auxiliary_loss_mlp": 0.01034005, + "balance_loss_clip": 1.26973164, + "balance_loss_mlp": 1.0136677, + "epoch": 0.6641214489703893, + "flos": 23231314442880.0, + "grad_norm": 1.569761347225777, + "language_loss": 0.64933926, + "learning_rate": 1.0710709723443235e-06, + "loss": 0.67405498, + "num_input_tokens_seen": 238402595, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.20336914, + "step": 11046, + "time_per_iteration": 2.914215326309204 + }, + { + "auxiliary_loss_clip": 0.01437028, + "auxiliary_loss_mlp": 0.01030444, + "balance_loss_clip": 1.2700479, + "balance_loss_mlp": 1.01147735, + "epoch": 0.6641815722230573, + "flos": 37756651345920.0, + "grad_norm": 1.8740582499990948, + "language_loss": 0.72281349, + "learning_rate": 1.070726085914088e-06, + "loss": 0.7474882, + "num_input_tokens_seen": 238426860, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.1895752, + "step": 11047, + "time_per_iteration": 3.0393762588500977 + }, + { + "auxiliary_loss_clip": 0.01434977, + "auxiliary_loss_mlp": 0.0103997, + "balance_loss_clip": 1.26913261, + "balance_loss_mlp": 1.01960886, + "epoch": 0.6642416954757252, + "flos": 17940099358080.0, + "grad_norm": 1.970262623586702, + "language_loss": 0.77916253, + "learning_rate": 1.0703812347220126e-06, + "loss": 0.80391204, + "num_input_tokens_seen": 238443990, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.20361328, + "step": 11048, + "time_per_iteration": 2.986861228942871 + }, + { + "auxiliary_loss_clip": 0.01187163, + "auxiliary_loss_mlp": 0.01020708, + "balance_loss_clip": 1.09700966, + "balance_loss_mlp": 0.99925023, + "epoch": 0.6643018187283932, + "flos": 52019169333120.0, + "grad_norm": 0.7444373283764798, + "language_loss": 0.55020601, + "learning_rate": 1.0700364187811745e-06, + "loss": 0.5722847, + "num_input_tokens_seen": 238503045, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.21484375, + "step": 11049, + "time_per_iteration": 3.3650968074798584 + }, + { + "auxiliary_loss_clip": 0.01434832, + "auxiliary_loss_mlp": 0.01033797, + "balance_loss_clip": 1.26963425, + "balance_loss_mlp": 1.01459265, + "epoch": 0.6643619419810611, + "flos": 30238769744640.0, + "grad_norm": 1.6693653236596666, + "language_loss": 0.65030479, + "learning_rate": 1.069691638104648e-06, + "loss": 0.67499113, + "num_input_tokens_seen": 238527320, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19213867, + "step": 11050, + "time_per_iteration": 2.9674363136291504 + }, + { + "auxiliary_loss_clip": 0.01427558, + "auxiliary_loss_mlp": 0.01032006, + "balance_loss_clip": 1.26359713, + "balance_loss_mlp": 1.01327777, + "epoch": 0.6644220652337292, + "flos": 22976395384320.0, + "grad_norm": 2.733315299220215, + "language_loss": 0.7993691, + "learning_rate": 1.0693468927055085e-06, + "loss": 0.82396472, + "num_input_tokens_seen": 238546030, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.18725586, + "step": 11051, + "time_per_iteration": 2.9163060188293457 + }, + { + "auxiliary_loss_clip": 0.01433917, + "auxiliary_loss_mlp": 0.01032752, + "balance_loss_clip": 1.26723409, + "balance_loss_mlp": 1.01342773, + "epoch": 0.6644821884863971, + "flos": 21152119288320.0, + "grad_norm": 14.21130762084279, + "language_loss": 0.86170185, + "learning_rate": 1.0690021825968276e-06, + "loss": 0.88636857, + "num_input_tokens_seen": 238564175, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.19311523, + "step": 11052, + "time_per_iteration": 2.8769726753234863 + }, + { + "auxiliary_loss_clip": 0.01442153, + "auxiliary_loss_mlp": 0.01034838, + "balance_loss_clip": 1.27185023, + "balance_loss_mlp": 1.01463151, + "epoch": 0.6645423117390651, + "flos": 20202265059840.0, + "grad_norm": 4.268282098917331, + "language_loss": 0.75537252, + "learning_rate": 1.0686575077916776e-06, + "loss": 0.78014243, + "num_input_tokens_seen": 238581010, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.2019043, + "step": 11053, + "time_per_iteration": 2.8455793857574463 + }, + { + "auxiliary_loss_clip": 0.01423491, + "auxiliary_loss_mlp": 0.01032482, + "balance_loss_clip": 1.25965595, + "balance_loss_mlp": 1.01367044, + "epoch": 0.6646024349917331, + "flos": 24362419916160.0, + "grad_norm": 1.6150701955792264, + "language_loss": 0.80074596, + "learning_rate": 1.0683128683031278e-06, + "loss": 0.8253057, + "num_input_tokens_seen": 238601365, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.18798828, + "step": 11054, + "time_per_iteration": 2.9354350566864014 + }, + { + "auxiliary_loss_clip": 0.01420702, + "auxiliary_loss_mlp": 0.01033572, + "balance_loss_clip": 1.25705409, + "balance_loss_mlp": 1.01436758, + "epoch": 0.664662558244401, + "flos": 18815878569600.0, + "grad_norm": 1.4932260789753116, + "language_loss": 0.74292928, + "learning_rate": 1.0679682641442472e-06, + "loss": 0.76747203, + "num_input_tokens_seen": 238619850, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19189453, + "step": 11055, + "time_per_iteration": 2.8746349811553955 + }, + { + "auxiliary_loss_clip": 0.01438916, + "auxiliary_loss_mlp": 0.01034905, + "balance_loss_clip": 1.27109814, + "balance_loss_mlp": 1.01467526, + "epoch": 0.664722681497069, + "flos": 18962037832320.0, + "grad_norm": 1.847191410660478, + "language_loss": 0.73381138, + "learning_rate": 1.0676236953281042e-06, + "loss": 0.75854957, + "num_input_tokens_seen": 238637635, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.20239258, + "step": 11056, + "time_per_iteration": 2.8484530448913574 + }, + { + "auxiliary_loss_clip": 0.01427586, + "auxiliary_loss_mlp": 0.01030049, + "balance_loss_clip": 1.26121902, + "balance_loss_mlp": 1.0097115, + "epoch": 0.6647828047497369, + "flos": 19580228542080.0, + "grad_norm": 2.010839557810378, + "language_loss": 0.70980263, + "learning_rate": 1.0672791618677641e-06, + "loss": 0.73437899, + "num_input_tokens_seen": 238656200, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.20324707, + "step": 11057, + "time_per_iteration": 2.847320079803467 + }, + { + "auxiliary_loss_clip": 0.01439613, + "auxiliary_loss_mlp": 0.01038535, + "balance_loss_clip": 1.27221918, + "balance_loss_mlp": 1.01857924, + "epoch": 0.664842928002405, + "flos": 23159999358720.0, + "grad_norm": 1.7382456390597079, + "language_loss": 0.8141911, + "learning_rate": 1.066934663776291e-06, + "loss": 0.83897257, + "num_input_tokens_seen": 238675005, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.19946289, + "step": 11058, + "time_per_iteration": 2.869825601577759 + }, + { + "auxiliary_loss_clip": 0.01190136, + "auxiliary_loss_mlp": 0.0101851, + "balance_loss_clip": 1.10079193, + "balance_loss_mlp": 1.00039053, + "epoch": 0.6649030512550729, + "flos": 65273485282560.0, + "grad_norm": 0.790812221820074, + "language_loss": 0.62647396, + "learning_rate": 1.0665902010667496e-06, + "loss": 0.64856046, + "num_input_tokens_seen": 238731425, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.18164062, + "step": 11059, + "time_per_iteration": 4.655359983444214 + }, + { + "auxiliary_loss_clip": 0.01425894, + "auxiliary_loss_mlp": 0.01031828, + "balance_loss_clip": 1.26113927, + "balance_loss_mlp": 1.01257527, + "epoch": 0.6649631745077409, + "flos": 20204753523840.0, + "grad_norm": 1.8489823462693635, + "language_loss": 0.79375249, + "learning_rate": 1.0662457737522008e-06, + "loss": 0.81832975, + "num_input_tokens_seen": 238752020, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19250488, + "step": 11060, + "time_per_iteration": 2.8769779205322266 + }, + { + "auxiliary_loss_clip": 0.01427172, + "auxiliary_loss_mlp": 0.01037974, + "balance_loss_clip": 1.26157975, + "balance_loss_mlp": 1.01783919, + "epoch": 0.6650232977604088, + "flos": 17247516917760.0, + "grad_norm": 2.1077119501333823, + "language_loss": 0.80245471, + "learning_rate": 1.0659013818457055e-06, + "loss": 0.82710618, + "num_input_tokens_seen": 238769665, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.20129395, + "step": 11061, + "time_per_iteration": 2.8112502098083496 + }, + { + "auxiliary_loss_clip": 0.01428519, + "auxiliary_loss_mlp": 0.01030522, + "balance_loss_clip": 1.26384234, + "balance_loss_mlp": 1.01175857, + "epoch": 0.6650834210130768, + "flos": 10011999985920.0, + "grad_norm": 1.9269218711121516, + "language_loss": 0.57078993, + "learning_rate": 1.0655570253603243e-06, + "loss": 0.59538031, + "num_input_tokens_seen": 238782180, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.1875, + "step": 11062, + "time_per_iteration": 2.8444645404815674 + }, + { + "auxiliary_loss_clip": 0.01434745, + "auxiliary_loss_mlp": 0.01034699, + "balance_loss_clip": 1.26472616, + "balance_loss_mlp": 1.0134201, + "epoch": 0.6651435442657447, + "flos": 10459255265280.0, + "grad_norm": 1.6337774282706161, + "language_loss": 0.76303196, + "learning_rate": 1.0652127043091144e-06, + "loss": 0.78772646, + "num_input_tokens_seen": 238800315, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.21264648, + "step": 11063, + "time_per_iteration": 2.8262743949890137 + }, + { + "auxiliary_loss_clip": 0.01431422, + "auxiliary_loss_mlp": 0.0103486, + "balance_loss_clip": 1.26553988, + "balance_loss_mlp": 1.01578629, + "epoch": 0.6652036675184128, + "flos": 22353906418560.0, + "grad_norm": 1.4060071169018622, + "language_loss": 0.71080381, + "learning_rate": 1.0648684187051316e-06, + "loss": 0.73546666, + "num_input_tokens_seen": 238822250, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.19067383, + "step": 11064, + "time_per_iteration": 2.8775088787078857 + }, + { + "auxiliary_loss_clip": 0.01188688, + "auxiliary_loss_mlp": 0.01017708, + "balance_loss_clip": 1.09863472, + "balance_loss_mlp": 0.99987394, + "epoch": 0.6652637907710807, + "flos": 52934203048320.0, + "grad_norm": 0.8540781495687443, + "language_loss": 0.63121879, + "learning_rate": 1.0645241685614322e-06, + "loss": 0.65328276, + "num_input_tokens_seen": 238877190, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.17871094, + "step": 11065, + "time_per_iteration": 3.3033342361450195 + }, + { + "auxiliary_loss_clip": 0.01437403, + "auxiliary_loss_mlp": 0.01036123, + "balance_loss_clip": 1.26889133, + "balance_loss_mlp": 1.01629865, + "epoch": 0.6653239140237487, + "flos": 23113007994240.0, + "grad_norm": 1.6734835490972844, + "language_loss": 0.63468516, + "learning_rate": 1.0641799538910708e-06, + "loss": 0.65942043, + "num_input_tokens_seen": 238896010, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.19812012, + "step": 11066, + "time_per_iteration": 2.872493028640747 + }, + { + "auxiliary_loss_clip": 0.01435476, + "auxiliary_loss_mlp": 0.01033488, + "balance_loss_clip": 1.26757991, + "balance_loss_mlp": 1.01412868, + "epoch": 0.6653840372764167, + "flos": 25970805233280.0, + "grad_norm": 1.4657565624026951, + "language_loss": 0.70306152, + "learning_rate": 1.0638357747070985e-06, + "loss": 0.72775114, + "num_input_tokens_seen": 238918990, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.19360352, + "step": 11067, + "time_per_iteration": 2.9806556701660156 + }, + { + "auxiliary_loss_clip": 0.01187169, + "auxiliary_loss_mlp": 0.01031889, + "balance_loss_clip": 1.09883559, + "balance_loss_mlp": 1.01529491, + "epoch": 0.6654441605290846, + "flos": 66069307653120.0, + "grad_norm": 0.9304117856200816, + "language_loss": 0.72140974, + "learning_rate": 1.0634916310225684e-06, + "loss": 0.74360031, + "num_input_tokens_seen": 238975735, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.16601562, + "step": 11068, + "time_per_iteration": 3.275740623474121 + }, + { + "auxiliary_loss_clip": 0.01188927, + "auxiliary_loss_mlp": 0.01022641, + "balance_loss_clip": 1.09941053, + "balance_loss_mlp": 1.00394917, + "epoch": 0.6655042837817526, + "flos": 65230945891200.0, + "grad_norm": 0.7084678261711423, + "language_loss": 0.57871044, + "learning_rate": 1.0631475228505285e-06, + "loss": 0.60082614, + "num_input_tokens_seen": 239042360, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.18652344, + "step": 11069, + "time_per_iteration": 3.3919081687927246 + }, + { + "auxiliary_loss_clip": 0.01187625, + "auxiliary_loss_mlp": 0.0102565, + "balance_loss_clip": 1.09844756, + "balance_loss_mlp": 1.00428772, + "epoch": 0.6655644070344205, + "flos": 69039800985600.0, + "grad_norm": 0.7438551325511158, + "language_loss": 0.63504118, + "learning_rate": 1.062803450204029e-06, + "loss": 0.65717393, + "num_input_tokens_seen": 239109410, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.21386719, + "step": 11070, + "time_per_iteration": 4.7253851890563965 + }, + { + "auxiliary_loss_clip": 0.01427435, + "auxiliary_loss_mlp": 0.01030912, + "balance_loss_clip": 1.26114428, + "balance_loss_mlp": 1.01162422, + "epoch": 0.6656245302870886, + "flos": 36328159929600.0, + "grad_norm": 1.6898430311218327, + "language_loss": 0.59392536, + "learning_rate": 1.062459413096116e-06, + "loss": 0.61850882, + "num_input_tokens_seen": 239135345, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.19287109, + "step": 11071, + "time_per_iteration": 3.0616161823272705 + }, + { + "auxiliary_loss_clip": 0.0142618, + "auxiliary_loss_mlp": 0.01035239, + "balance_loss_clip": 1.26210284, + "balance_loss_mlp": 1.01525927, + "epoch": 0.6656846535397565, + "flos": 21803818412160.0, + "grad_norm": 2.1794079979652876, + "language_loss": 0.73726499, + "learning_rate": 1.0621154115398364e-06, + "loss": 0.76187921, + "num_input_tokens_seen": 239154340, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.1998291, + "step": 11072, + "time_per_iteration": 4.270760536193848 + }, + { + "auxiliary_loss_clip": 0.01414358, + "auxiliary_loss_mlp": 0.01040698, + "balance_loss_clip": 1.25299966, + "balance_loss_mlp": 1.02136278, + "epoch": 0.6657447767924245, + "flos": 37501687042560.0, + "grad_norm": 3.4395791240180142, + "language_loss": 0.71579933, + "learning_rate": 1.0617714455482353e-06, + "loss": 0.74034995, + "num_input_tokens_seen": 239177815, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19335938, + "step": 11073, + "time_per_iteration": 4.415718078613281 + }, + { + "auxiliary_loss_clip": 0.01439418, + "auxiliary_loss_mlp": 0.01038606, + "balance_loss_clip": 1.27013576, + "balance_loss_mlp": 1.01834047, + "epoch": 0.6658049000450924, + "flos": 16846664820480.0, + "grad_norm": 2.4268388190033456, + "language_loss": 0.56622148, + "learning_rate": 1.061427515134354e-06, + "loss": 0.59100175, + "num_input_tokens_seen": 239195735, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.20275879, + "step": 11074, + "time_per_iteration": 2.8065924644470215 + }, + { + "auxiliary_loss_clip": 0.01421598, + "auxiliary_loss_mlp": 0.01034177, + "balance_loss_clip": 1.25835156, + "balance_loss_mlp": 1.01501989, + "epoch": 0.6658650232977604, + "flos": 33524095530240.0, + "grad_norm": 1.9496460179119457, + "language_loss": 0.72876608, + "learning_rate": 1.061083620311235e-06, + "loss": 0.75332385, + "num_input_tokens_seen": 239217535, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19177246, + "step": 11075, + "time_per_iteration": 2.9418752193450928 + }, + { + "auxiliary_loss_clip": 0.01415139, + "auxiliary_loss_mlp": 0.01039589, + "balance_loss_clip": 1.25287974, + "balance_loss_mlp": 1.0204674, + "epoch": 0.6659251465504283, + "flos": 37720382999040.0, + "grad_norm": 1.3831723067228412, + "language_loss": 0.66490567, + "learning_rate": 1.0607397610919202e-06, + "loss": 0.68945289, + "num_input_tokens_seen": 239241975, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19116211, + "step": 11076, + "time_per_iteration": 2.992400646209717 + }, + { + "auxiliary_loss_clip": 0.01416411, + "auxiliary_loss_mlp": 0.01035405, + "balance_loss_clip": 1.25280905, + "balance_loss_mlp": 1.01471043, + "epoch": 0.6659852698030964, + "flos": 24902508821760.0, + "grad_norm": 3.7877905383057455, + "language_loss": 0.76426768, + "learning_rate": 1.0603959374894468e-06, + "loss": 0.78878582, + "num_input_tokens_seen": 239262025, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.20703125, + "step": 11077, + "time_per_iteration": 2.9186413288116455 + }, + { + "auxiliary_loss_clip": 0.0142438, + "auxiliary_loss_mlp": 0.01037207, + "balance_loss_clip": 1.25827527, + "balance_loss_mlp": 1.01754987, + "epoch": 0.6660453930557643, + "flos": 24363505791360.0, + "grad_norm": 2.076873971179817, + "language_loss": 0.6736269, + "learning_rate": 1.0600521495168538e-06, + "loss": 0.69824278, + "num_input_tokens_seen": 239282775, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.1965332, + "step": 11078, + "time_per_iteration": 2.9222872257232666 + }, + { + "auxiliary_loss_clip": 0.01449645, + "auxiliary_loss_mlp": 0.01040964, + "balance_loss_clip": 1.27901483, + "balance_loss_mlp": 1.02090108, + "epoch": 0.6661055163084323, + "flos": 10604962080000.0, + "grad_norm": 2.233994828458855, + "language_loss": 0.7056247, + "learning_rate": 1.0597083971871783e-06, + "loss": 0.73053086, + "num_input_tokens_seen": 239299775, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.20068359, + "step": 11079, + "time_per_iteration": 2.892307758331299 + }, + { + "auxiliary_loss_clip": 0.01436707, + "auxiliary_loss_mlp": 0.0103278, + "balance_loss_clip": 1.27179241, + "balance_loss_mlp": 1.01420665, + "epoch": 0.6661656395611003, + "flos": 24067205723520.0, + "grad_norm": 1.5161627584612951, + "language_loss": 0.8069002, + "learning_rate": 1.0593646805134544e-06, + "loss": 0.83159506, + "num_input_tokens_seen": 239319660, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.18579102, + "step": 11080, + "time_per_iteration": 2.845039129257202 + }, + { + "auxiliary_loss_clip": 0.01412919, + "auxiliary_loss_mlp": 0.01032138, + "balance_loss_clip": 1.25300765, + "balance_loss_mlp": 1.01275432, + "epoch": 0.6662257628137682, + "flos": 23045719697280.0, + "grad_norm": 6.023128034946133, + "language_loss": 0.78669322, + "learning_rate": 1.0590209995087157e-06, + "loss": 0.81114382, + "num_input_tokens_seen": 239339215, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19396973, + "step": 11081, + "time_per_iteration": 2.896991491317749 + }, + { + "auxiliary_loss_clip": 0.01439609, + "auxiliary_loss_mlp": 0.01034353, + "balance_loss_clip": 1.27141464, + "balance_loss_mlp": 1.01461172, + "epoch": 0.6662858860664362, + "flos": 24765308029440.0, + "grad_norm": 1.7345664431752867, + "language_loss": 0.80412841, + "learning_rate": 1.0586773541859946e-06, + "loss": 0.82886797, + "num_input_tokens_seen": 239358545, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.19750977, + "step": 11082, + "time_per_iteration": 2.8748488426208496 + }, + { + "auxiliary_loss_clip": 0.01420939, + "auxiliary_loss_mlp": 0.01032628, + "balance_loss_clip": 1.25704396, + "balance_loss_mlp": 1.01344693, + "epoch": 0.6663460093191041, + "flos": 20018118147840.0, + "grad_norm": 1.5053816309873678, + "language_loss": 0.84378737, + "learning_rate": 1.0583337445583234e-06, + "loss": 0.86832297, + "num_input_tokens_seen": 239376665, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19165039, + "step": 11083, + "time_per_iteration": 2.8534185886383057 + }, + { + "auxiliary_loss_clip": 0.01445493, + "auxiliary_loss_mlp": 0.01039286, + "balance_loss_clip": 1.27592373, + "balance_loss_mlp": 1.01929486, + "epoch": 0.6664061325717722, + "flos": 17830253687040.0, + "grad_norm": 2.250099841320157, + "language_loss": 0.86156094, + "learning_rate": 1.057990170638731e-06, + "loss": 0.88640869, + "num_input_tokens_seen": 239394345, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.19995117, + "step": 11084, + "time_per_iteration": 2.84954833984375 + }, + { + "auxiliary_loss_clip": 0.01437354, + "auxiliary_loss_mlp": 0.01039036, + "balance_loss_clip": 1.2681638, + "balance_loss_mlp": 1.01857948, + "epoch": 0.6664662558244401, + "flos": 18085851417600.0, + "grad_norm": 2.5021527052427546, + "language_loss": 0.74320561, + "learning_rate": 1.0576466324402452e-06, + "loss": 0.76796949, + "num_input_tokens_seen": 239410605, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.20458984, + "step": 11085, + "time_per_iteration": 2.8573334217071533 + }, + { + "auxiliary_loss_clip": 0.01419305, + "auxiliary_loss_mlp": 0.01035829, + "balance_loss_clip": 1.25323558, + "balance_loss_mlp": 1.0164696, + "epoch": 0.6665263790771081, + "flos": 21582679236480.0, + "grad_norm": 1.7715154564091538, + "language_loss": 0.80755067, + "learning_rate": 1.057303129975894e-06, + "loss": 0.832102, + "num_input_tokens_seen": 239427155, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.19348145, + "step": 11086, + "time_per_iteration": 2.833106756210327 + }, + { + "auxiliary_loss_clip": 0.01428811, + "auxiliary_loss_mlp": 0.01037576, + "balance_loss_clip": 1.26362872, + "balance_loss_mlp": 1.01708424, + "epoch": 0.666586502329776, + "flos": 24217210794240.0, + "grad_norm": 1.8152405315560656, + "language_loss": 0.75184894, + "learning_rate": 1.056959663258702e-06, + "loss": 0.7765128, + "num_input_tokens_seen": 239445510, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.20495605, + "step": 11087, + "time_per_iteration": 2.8348052501678467 + }, + { + "auxiliary_loss_clip": 0.01425742, + "auxiliary_loss_mlp": 0.01036741, + "balance_loss_clip": 1.26110148, + "balance_loss_mlp": 1.01654732, + "epoch": 0.666646625582444, + "flos": 22210733312640.0, + "grad_norm": 1.7040851295726143, + "language_loss": 0.65510464, + "learning_rate": 1.0566162323016939e-06, + "loss": 0.67972946, + "num_input_tokens_seen": 239464805, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.20202637, + "step": 11088, + "time_per_iteration": 2.936216354370117 + }, + { + "auxiliary_loss_clip": 0.01443676, + "auxiliary_loss_mlp": 0.01031132, + "balance_loss_clip": 1.27609944, + "balance_loss_mlp": 1.01165295, + "epoch": 0.6667067488351119, + "flos": 18269093433600.0, + "grad_norm": 2.3196289007575235, + "language_loss": 0.64906335, + "learning_rate": 1.0562728371178928e-06, + "loss": 0.67381144, + "num_input_tokens_seen": 239483890, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.19470215, + "step": 11089, + "time_per_iteration": 2.848558187484741 + }, + { + "auxiliary_loss_clip": 0.01432778, + "auxiliary_loss_mlp": 0.01032228, + "balance_loss_clip": 1.26963913, + "balance_loss_mlp": 1.0133338, + "epoch": 0.66676687208778, + "flos": 17244983208960.0, + "grad_norm": 2.0685602873936424, + "language_loss": 0.81370699, + "learning_rate": 1.0559294777203221e-06, + "loss": 0.83835709, + "num_input_tokens_seen": 239500080, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.18896484, + "step": 11090, + "time_per_iteration": 2.860603094100952 + }, + { + "auxiliary_loss_clip": 0.01438416, + "auxiliary_loss_mlp": 0.01036716, + "balance_loss_clip": 1.26887405, + "balance_loss_mlp": 1.01754713, + "epoch": 0.6668269953404479, + "flos": 19760801114880.0, + "grad_norm": 2.373551797917491, + "language_loss": 0.79145694, + "learning_rate": 1.0555861541219984e-06, + "loss": 0.8162083, + "num_input_tokens_seen": 239517335, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.19140625, + "step": 11091, + "time_per_iteration": 2.9464988708496094 + }, + { + "auxiliary_loss_clip": 0.01417879, + "auxiliary_loss_mlp": 0.01035345, + "balance_loss_clip": 1.25398231, + "balance_loss_mlp": 1.01619983, + "epoch": 0.6668871185931159, + "flos": 20568160909440.0, + "grad_norm": 2.2474601469819486, + "language_loss": 0.7944392, + "learning_rate": 1.0552428663359425e-06, + "loss": 0.8189714, + "num_input_tokens_seen": 239536240, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19152832, + "step": 11092, + "time_per_iteration": 2.885115623474121 + }, + { + "auxiliary_loss_clip": 0.01189784, + "auxiliary_loss_mlp": 0.01022726, + "balance_loss_clip": 1.09935999, + "balance_loss_mlp": 1.0004096, + "epoch": 0.6669472418457839, + "flos": 58113445956480.0, + "grad_norm": 0.7598091510470407, + "language_loss": 0.57820797, + "learning_rate": 1.0548996143751724e-06, + "loss": 0.60033309, + "num_input_tokens_seen": 239598000, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.22363281, + "step": 11093, + "time_per_iteration": 3.4036717414855957 + }, + { + "auxiliary_loss_clip": 0.01427017, + "auxiliary_loss_mlp": 0.01031902, + "balance_loss_clip": 1.26234508, + "balance_loss_mlp": 1.01257849, + "epoch": 0.6670073650984518, + "flos": 26075583486720.0, + "grad_norm": 1.511615471106695, + "language_loss": 0.76894677, + "learning_rate": 1.054556398252703e-06, + "loss": 0.79353595, + "num_input_tokens_seen": 239617650, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.19311523, + "step": 11094, + "time_per_iteration": 4.3695337772369385 + }, + { + "auxiliary_loss_clip": 0.01420026, + "auxiliary_loss_mlp": 0.01036733, + "balance_loss_clip": 1.25489283, + "balance_loss_mlp": 1.01614535, + "epoch": 0.6670674883511198, + "flos": 32429801341440.0, + "grad_norm": 1.6937028947056387, + "language_loss": 0.73780954, + "learning_rate": 1.05421321798155e-06, + "loss": 0.76237702, + "num_input_tokens_seen": 239639825, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.20593262, + "step": 11095, + "time_per_iteration": 3.0058164596557617 + }, + { + "auxiliary_loss_clip": 0.01435712, + "auxiliary_loss_mlp": 0.01037301, + "balance_loss_clip": 1.27088642, + "balance_loss_mlp": 1.0175482, + "epoch": 0.6671276116037878, + "flos": 18046053976320.0, + "grad_norm": 2.2146656239482807, + "language_loss": 0.74626327, + "learning_rate": 1.053870073574727e-06, + "loss": 0.77099335, + "num_input_tokens_seen": 239656300, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19750977, + "step": 11096, + "time_per_iteration": 2.935943603515625 + }, + { + "auxiliary_loss_clip": 0.0141455, + "auxiliary_loss_mlp": 0.01033361, + "balance_loss_clip": 1.25367665, + "balance_loss_mlp": 1.01416802, + "epoch": 0.6671877348564558, + "flos": 23777058948480.0, + "grad_norm": 2.1124908395191704, + "language_loss": 0.64624739, + "learning_rate": 1.0535269650452456e-06, + "loss": 0.67072642, + "num_input_tokens_seen": 239676655, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19189453, + "step": 11097, + "time_per_iteration": 2.8875958919525146 + }, + { + "auxiliary_loss_clip": 0.01447542, + "auxiliary_loss_mlp": 0.01037467, + "balance_loss_clip": 1.27785182, + "balance_loss_mlp": 1.01799989, + "epoch": 0.6672478581091237, + "flos": 20926908080640.0, + "grad_norm": 1.8053566891702806, + "language_loss": 0.76925945, + "learning_rate": 1.0531838924061158e-06, + "loss": 0.79410952, + "num_input_tokens_seen": 239695430, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.19482422, + "step": 11098, + "time_per_iteration": 2.866103172302246 + }, + { + "auxiliary_loss_clip": 0.01436905, + "auxiliary_loss_mlp": 0.01036486, + "balance_loss_clip": 1.26976681, + "balance_loss_mlp": 1.01657796, + "epoch": 0.6673079813617917, + "flos": 27867482288640.0, + "grad_norm": 1.4783846892718366, + "language_loss": 0.75219268, + "learning_rate": 1.0528408556703476e-06, + "loss": 0.77692658, + "num_input_tokens_seen": 239717070, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.19897461, + "step": 11099, + "time_per_iteration": 2.941845178604126 + }, + { + "auxiliary_loss_clip": 0.01413907, + "auxiliary_loss_mlp": 0.01034464, + "balance_loss_clip": 1.25221837, + "balance_loss_mlp": 1.01423454, + "epoch": 0.6673681046144596, + "flos": 21626955912960.0, + "grad_norm": 2.310717342562936, + "language_loss": 0.78460163, + "learning_rate": 1.0524978548509502e-06, + "loss": 0.80908537, + "num_input_tokens_seen": 239737105, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.20227051, + "step": 11100, + "time_per_iteration": 2.877443313598633 + }, + { + "auxiliary_loss_clip": 0.01423189, + "auxiliary_loss_mlp": 0.01038388, + "balance_loss_clip": 1.26031411, + "balance_loss_mlp": 1.01924324, + "epoch": 0.6674282278671276, + "flos": 20900503100160.0, + "grad_norm": 2.0591868506382935, + "language_loss": 0.61124074, + "learning_rate": 1.0521548899609288e-06, + "loss": 0.63585651, + "num_input_tokens_seen": 239757835, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.19152832, + "step": 11101, + "time_per_iteration": 2.8667876720428467 + }, + { + "auxiliary_loss_clip": 0.01445873, + "auxiliary_loss_mlp": 0.01036866, + "balance_loss_clip": 1.27264512, + "balance_loss_mlp": 1.01650488, + "epoch": 0.6674883511197955, + "flos": 23634971717760.0, + "grad_norm": 1.7262869095556992, + "language_loss": 0.72170943, + "learning_rate": 1.0518119610132884e-06, + "loss": 0.74653685, + "num_input_tokens_seen": 239775425, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.20373535, + "step": 11102, + "time_per_iteration": 2.8671741485595703 + }, + { + "auxiliary_loss_clip": 0.01426863, + "auxiliary_loss_mlp": 0.0103307, + "balance_loss_clip": 1.26144528, + "balance_loss_mlp": 1.01467645, + "epoch": 0.6675484743724636, + "flos": 19619121087360.0, + "grad_norm": 1.5821192058812679, + "language_loss": 0.84880668, + "learning_rate": 1.051469068021034e-06, + "loss": 0.87340599, + "num_input_tokens_seen": 239794605, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.18408203, + "step": 11103, + "time_per_iteration": 2.835686683654785 + }, + { + "auxiliary_loss_clip": 0.0142546, + "auxiliary_loss_mlp": 0.01030231, + "balance_loss_clip": 1.2586199, + "balance_loss_mlp": 1.01124132, + "epoch": 0.6676085976251315, + "flos": 14327589288960.0, + "grad_norm": 1.947674768385336, + "language_loss": 0.78757703, + "learning_rate": 1.0511262109971668e-06, + "loss": 0.81213403, + "num_input_tokens_seen": 239812135, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.18981934, + "step": 11104, + "time_per_iteration": 2.8343565464019775 + }, + { + "auxiliary_loss_clip": 0.01451486, + "auxiliary_loss_mlp": 0.01033545, + "balance_loss_clip": 1.28038096, + "balance_loss_mlp": 1.0146029, + "epoch": 0.6676687208777995, + "flos": 38117977470720.0, + "grad_norm": 1.5517765638126149, + "language_loss": 0.58710819, + "learning_rate": 1.0507833899546889e-06, + "loss": 0.6119585, + "num_input_tokens_seen": 239835845, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.18933105, + "step": 11105, + "time_per_iteration": 4.42592978477478 + }, + { + "auxiliary_loss_clip": 0.01447889, + "auxiliary_loss_mlp": 0.01035514, + "balance_loss_clip": 1.27612472, + "balance_loss_mlp": 1.01560605, + "epoch": 0.6677288441304675, + "flos": 23990506508160.0, + "grad_norm": 1.6536392162727778, + "language_loss": 0.73845309, + "learning_rate": 1.0504406049066e-06, + "loss": 0.76328707, + "num_input_tokens_seen": 239853820, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.19897461, + "step": 11106, + "time_per_iteration": 2.8575658798217773 + }, + { + "auxiliary_loss_clip": 0.01425378, + "auxiliary_loss_mlp": 0.01033317, + "balance_loss_clip": 1.26063943, + "balance_loss_mlp": 1.01408875, + "epoch": 0.6677889673831354, + "flos": 24181394895360.0, + "grad_norm": 1.9614804299874335, + "language_loss": 0.77311718, + "learning_rate": 1.0500978558659e-06, + "loss": 0.7977041, + "num_input_tokens_seen": 239873365, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19226074, + "step": 11107, + "time_per_iteration": 4.29552698135376 + }, + { + "auxiliary_loss_clip": 0.0140686, + "auxiliary_loss_mlp": 0.01031407, + "balance_loss_clip": 1.2471714, + "balance_loss_mlp": 1.01238132, + "epoch": 0.6678490906358034, + "flos": 22319809822080.0, + "grad_norm": 2.144975013857604, + "language_loss": 0.90019464, + "learning_rate": 1.049755142845583e-06, + "loss": 0.92457736, + "num_input_tokens_seen": 239891215, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19030762, + "step": 11108, + "time_per_iteration": 4.261826753616333 + }, + { + "auxiliary_loss_clip": 0.01417365, + "auxiliary_loss_mlp": 0.01029028, + "balance_loss_clip": 1.25531745, + "balance_loss_mlp": 1.01090837, + "epoch": 0.6679092138884714, + "flos": 36911349146880.0, + "grad_norm": 1.3608397724005503, + "language_loss": 0.83412933, + "learning_rate": 1.049412465858646e-06, + "loss": 0.85859323, + "num_input_tokens_seen": 239913490, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18115234, + "step": 11109, + "time_per_iteration": 2.9660942554473877 + }, + { + "auxiliary_loss_clip": 0.01431011, + "auxiliary_loss_mlp": 0.01032025, + "balance_loss_clip": 1.26418543, + "balance_loss_mlp": 1.0125221, + "epoch": 0.6679693371411394, + "flos": 18159383496960.0, + "grad_norm": 2.443556270588572, + "language_loss": 0.6962285, + "learning_rate": 1.0490698249180847e-06, + "loss": 0.72085893, + "num_input_tokens_seen": 239931565, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.19494629, + "step": 11110, + "time_per_iteration": 2.8377926349639893 + }, + { + "auxiliary_loss_clip": 0.01429309, + "auxiliary_loss_mlp": 0.01036573, + "balance_loss_clip": 1.26072001, + "balance_loss_mlp": 1.01591408, + "epoch": 0.6680294603938073, + "flos": 27209448892800.0, + "grad_norm": 1.6177040642839182, + "language_loss": 0.7393024, + "learning_rate": 1.04872722003689e-06, + "loss": 0.7639612, + "num_input_tokens_seen": 239952395, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.20654297, + "step": 11111, + "time_per_iteration": 2.902770519256592 + }, + { + "auxiliary_loss_clip": 0.01425061, + "auxiliary_loss_mlp": 0.01032883, + "balance_loss_clip": 1.2603085, + "balance_loss_mlp": 1.01395226, + "epoch": 0.6680895836464753, + "flos": 21735172771200.0, + "grad_norm": 1.9525374717576611, + "language_loss": 0.66157007, + "learning_rate": 1.0483846512280553e-06, + "loss": 0.68614954, + "num_input_tokens_seen": 239968910, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.18933105, + "step": 11112, + "time_per_iteration": 2.8218181133270264 + }, + { + "auxiliary_loss_clip": 0.01423763, + "auxiliary_loss_mlp": 0.01035072, + "balance_loss_clip": 1.25932693, + "balance_loss_mlp": 1.01639175, + "epoch": 0.6681497068991432, + "flos": 19656022861440.0, + "grad_norm": 1.8321413613679016, + "language_loss": 0.63838267, + "learning_rate": 1.048042118504569e-06, + "loss": 0.66297102, + "num_input_tokens_seen": 239987680, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.18688965, + "step": 11113, + "time_per_iteration": 2.841856002807617 + }, + { + "auxiliary_loss_clip": 0.01415407, + "auxiliary_loss_mlp": 0.01031076, + "balance_loss_clip": 1.25457859, + "balance_loss_mlp": 1.01190758, + "epoch": 0.6682098301518112, + "flos": 17427682287360.0, + "grad_norm": 1.810489436877256, + "language_loss": 0.66314411, + "learning_rate": 1.047699621879422e-06, + "loss": 0.68760896, + "num_input_tokens_seen": 240005790, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19177246, + "step": 11114, + "time_per_iteration": 2.826577663421631 + }, + { + "auxiliary_loss_clip": 0.01422759, + "auxiliary_loss_mlp": 0.01036515, + "balance_loss_clip": 1.25887764, + "balance_loss_mlp": 1.01723886, + "epoch": 0.6682699534044791, + "flos": 22608599253120.0, + "grad_norm": 1.4731775282690613, + "language_loss": 0.79131973, + "learning_rate": 1.0473571613655998e-06, + "loss": 0.81591249, + "num_input_tokens_seen": 240025895, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19287109, + "step": 11115, + "time_per_iteration": 2.8473198413848877 + }, + { + "auxiliary_loss_clip": 0.01419837, + "auxiliary_loss_mlp": 0.01030962, + "balance_loss_clip": 1.25423908, + "balance_loss_mlp": 1.01280665, + "epoch": 0.6683300766571472, + "flos": 24874520273280.0, + "grad_norm": 1.8069303436138708, + "language_loss": 0.80151606, + "learning_rate": 1.0470147369760896e-06, + "loss": 0.82602406, + "num_input_tokens_seen": 240044880, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.18164062, + "step": 11116, + "time_per_iteration": 2.8971760272979736 + }, + { + "auxiliary_loss_clip": 0.0144262, + "auxiliary_loss_mlp": 0.010422, + "balance_loss_clip": 1.27498519, + "balance_loss_mlp": 1.02139771, + "epoch": 0.6683901999098151, + "flos": 27138269543040.0, + "grad_norm": 1.5765453631664295, + "language_loss": 0.8031112, + "learning_rate": 1.0466723487238768e-06, + "loss": 0.82795942, + "num_input_tokens_seen": 240065785, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.20812988, + "step": 11117, + "time_per_iteration": 2.9236955642700195 + }, + { + "auxiliary_loss_clip": 0.01425277, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.25905538, + "balance_loss_mlp": 1.01329517, + "epoch": 0.6684503231624831, + "flos": 20748326279040.0, + "grad_norm": 3.3414870050218224, + "language_loss": 0.66314435, + "learning_rate": 1.0463299966219441e-06, + "loss": 0.68773127, + "num_input_tokens_seen": 240085130, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.2010498, + "step": 11118, + "time_per_iteration": 2.8796873092651367 + }, + { + "auxiliary_loss_clip": 0.0142745, + "auxiliary_loss_mlp": 0.01032902, + "balance_loss_clip": 1.26247168, + "balance_loss_mlp": 1.01450777, + "epoch": 0.668510446415151, + "flos": 21772165034880.0, + "grad_norm": 2.590461052957698, + "language_loss": 0.69603598, + "learning_rate": 1.0459876806832727e-06, + "loss": 0.72063947, + "num_input_tokens_seen": 240105495, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.18395996, + "step": 11119, + "time_per_iteration": 2.914313554763794 + }, + { + "auxiliary_loss_clip": 0.01435133, + "auxiliary_loss_mlp": 0.01033867, + "balance_loss_clip": 1.26823545, + "balance_loss_mlp": 1.01490128, + "epoch": 0.668570569667819, + "flos": 30203632517760.0, + "grad_norm": 1.7042042395058659, + "language_loss": 0.67934179, + "learning_rate": 1.0456454009208448e-06, + "loss": 0.70403177, + "num_input_tokens_seen": 240125455, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.18981934, + "step": 11120, + "time_per_iteration": 2.9352755546569824 + }, + { + "auxiliary_loss_clip": 0.01436231, + "auxiliary_loss_mlp": 0.01043004, + "balance_loss_clip": 1.26949275, + "balance_loss_mlp": 1.02328658, + "epoch": 0.668630692920487, + "flos": 24181349650560.0, + "grad_norm": 2.992925473320502, + "language_loss": 0.73173988, + "learning_rate": 1.045303157347638e-06, + "loss": 0.75653219, + "num_input_tokens_seen": 240143870, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.19714355, + "step": 11121, + "time_per_iteration": 2.9509148597717285 + }, + { + "auxiliary_loss_clip": 0.01427967, + "auxiliary_loss_mlp": 0.01035461, + "balance_loss_clip": 1.26000786, + "balance_loss_mlp": 1.01633954, + "epoch": 0.668690816173155, + "flos": 17466077139840.0, + "grad_norm": 2.845159035875399, + "language_loss": 0.70949292, + "learning_rate": 1.0449609499766316e-06, + "loss": 0.73412722, + "num_input_tokens_seen": 240161020, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.19128418, + "step": 11122, + "time_per_iteration": 2.849296808242798 + }, + { + "auxiliary_loss_clip": 0.01433778, + "auxiliary_loss_mlp": 0.01037961, + "balance_loss_clip": 1.26626718, + "balance_loss_mlp": 1.01785016, + "epoch": 0.668750939425823, + "flos": 25014797712000.0, + "grad_norm": 1.5690782188907024, + "language_loss": 0.72035563, + "learning_rate": 1.0446187788208015e-06, + "loss": 0.74507302, + "num_input_tokens_seen": 240179820, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.20092773, + "step": 11123, + "time_per_iteration": 2.937446117401123 + }, + { + "auxiliary_loss_clip": 0.01437597, + "auxiliary_loss_mlp": 0.01040395, + "balance_loss_clip": 1.26995027, + "balance_loss_mlp": 1.02116632, + "epoch": 0.6688110626784909, + "flos": 24107229388800.0, + "grad_norm": 1.5262061150681672, + "language_loss": 0.79938734, + "learning_rate": 1.0442766438931244e-06, + "loss": 0.82416719, + "num_input_tokens_seen": 240200130, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.19226074, + "step": 11124, + "time_per_iteration": 2.8772456645965576 + }, + { + "auxiliary_loss_clip": 0.01435564, + "auxiliary_loss_mlp": 0.01039784, + "balance_loss_clip": 1.27037001, + "balance_loss_mlp": 1.01974511, + "epoch": 0.6688711859311589, + "flos": 21768816919680.0, + "grad_norm": 1.6291473084014227, + "language_loss": 0.75172162, + "learning_rate": 1.0439345452065716e-06, + "loss": 0.77647507, + "num_input_tokens_seen": 240217945, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.20056152, + "step": 11125, + "time_per_iteration": 2.8896243572235107 + }, + { + "auxiliary_loss_clip": 0.01431576, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.26520371, + "balance_loss_mlp": 1.01170266, + "epoch": 0.6689313091838268, + "flos": 22939900813440.0, + "grad_norm": 2.279715372351084, + "language_loss": 0.67352307, + "learning_rate": 1.043592482774116e-06, + "loss": 0.69815016, + "num_input_tokens_seen": 240237220, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.19445801, + "step": 11126, + "time_per_iteration": 2.864891529083252 + }, + { + "auxiliary_loss_clip": 0.01435276, + "auxiliary_loss_mlp": 0.01032935, + "balance_loss_clip": 1.26801658, + "balance_loss_mlp": 1.01425517, + "epoch": 0.6689914324364948, + "flos": 20895797640960.0, + "grad_norm": 11.424251048117114, + "language_loss": 0.72065276, + "learning_rate": 1.0432504566087305e-06, + "loss": 0.74533486, + "num_input_tokens_seen": 240256000, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.18664551, + "step": 11127, + "time_per_iteration": 2.8378522396087646 + }, + { + "auxiliary_loss_clip": 0.01445193, + "auxiliary_loss_mlp": 0.0103719, + "balance_loss_clip": 1.27281356, + "balance_loss_mlp": 1.01661444, + "epoch": 0.6690515556891627, + "flos": 22758694813440.0, + "grad_norm": 1.9837402075798045, + "language_loss": 0.8187238, + "learning_rate": 1.0429084667233827e-06, + "loss": 0.84354758, + "num_input_tokens_seen": 240275845, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.20568848, + "step": 11128, + "time_per_iteration": 2.8787641525268555 + }, + { + "auxiliary_loss_clip": 0.01439455, + "auxiliary_loss_mlp": 0.01032247, + "balance_loss_clip": 1.27046871, + "balance_loss_mlp": 1.01261365, + "epoch": 0.6691116789418308, + "flos": 23341929275520.0, + "grad_norm": 2.800676560715508, + "language_loss": 0.81197667, + "learning_rate": 1.0425665131310427e-06, + "loss": 0.83669364, + "num_input_tokens_seen": 240294095, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.19628906, + "step": 11129, + "time_per_iteration": 4.28815484046936 + }, + { + "auxiliary_loss_clip": 0.01408137, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.24715698, + "balance_loss_mlp": 1.01268435, + "epoch": 0.6691718021944987, + "flos": 32458332827520.0, + "grad_norm": 2.188447262640093, + "language_loss": 0.71206796, + "learning_rate": 1.0422245958446762e-06, + "loss": 0.73646975, + "num_input_tokens_seen": 240313460, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.19335938, + "step": 11130, + "time_per_iteration": 2.9213359355926514 + }, + { + "auxiliary_loss_clip": 0.01424473, + "auxiliary_loss_mlp": 0.01041801, + "balance_loss_clip": 1.26271057, + "balance_loss_mlp": 1.02235782, + "epoch": 0.6692319254471667, + "flos": 23741876476800.0, + "grad_norm": 1.5446444510145168, + "language_loss": 0.70957112, + "learning_rate": 1.0418827148772486e-06, + "loss": 0.7342338, + "num_input_tokens_seen": 240333540, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19458008, + "step": 11131, + "time_per_iteration": 2.875110626220703 + }, + { + "auxiliary_loss_clip": 0.01430636, + "auxiliary_loss_mlp": 0.01031992, + "balance_loss_clip": 1.26214588, + "balance_loss_mlp": 1.01234651, + "epoch": 0.6692920486998346, + "flos": 14435806147200.0, + "grad_norm": 3.0602631240362146, + "language_loss": 0.66106319, + "learning_rate": 1.0415408702417243e-06, + "loss": 0.68568945, + "num_input_tokens_seen": 240350085, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.1965332, + "step": 11132, + "time_per_iteration": 2.826151132583618 + }, + { + "auxiliary_loss_clip": 0.0143404, + "auxiliary_loss_mlp": 0.01032607, + "balance_loss_clip": 1.26581311, + "balance_loss_mlp": 1.01197171, + "epoch": 0.6693521719525026, + "flos": 21517562689920.0, + "grad_norm": 2.052981968941779, + "language_loss": 0.75095028, + "learning_rate": 1.0411990619510661e-06, + "loss": 0.77561677, + "num_input_tokens_seen": 240370015, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.2064209, + "step": 11133, + "time_per_iteration": 2.942416191101074 + }, + { + "auxiliary_loss_clip": 0.01446412, + "auxiliary_loss_mlp": 0.01036951, + "balance_loss_clip": 1.27625751, + "balance_loss_mlp": 1.01778162, + "epoch": 0.6694122952051706, + "flos": 25416871418880.0, + "grad_norm": 2.057315250537488, + "language_loss": 0.66910547, + "learning_rate": 1.0408572900182363e-06, + "loss": 0.69393909, + "num_input_tokens_seen": 240390770, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.19177246, + "step": 11134, + "time_per_iteration": 2.9374921321868896 + }, + { + "auxiliary_loss_clip": 0.01444776, + "auxiliary_loss_mlp": 0.01036, + "balance_loss_clip": 1.27344465, + "balance_loss_mlp": 1.01479292, + "epoch": 0.6694724184578386, + "flos": 25671564253440.0, + "grad_norm": 1.728706186414479, + "language_loss": 0.77416945, + "learning_rate": 1.0405155544561943e-06, + "loss": 0.7989772, + "num_input_tokens_seen": 240409590, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.21203613, + "step": 11135, + "time_per_iteration": 2.8583390712738037 + }, + { + "auxiliary_loss_clip": 0.01422894, + "auxiliary_loss_mlp": 0.01033213, + "balance_loss_clip": 1.25998902, + "balance_loss_mlp": 1.01334095, + "epoch": 0.6695325417105066, + "flos": 17717467104000.0, + "grad_norm": 1.6313023345218096, + "language_loss": 0.74407864, + "learning_rate": 1.040173855277898e-06, + "loss": 0.76863974, + "num_input_tokens_seen": 240428180, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19873047, + "step": 11136, + "time_per_iteration": 2.8936526775360107 + }, + { + "auxiliary_loss_clip": 0.0144637, + "auxiliary_loss_mlp": 0.01032864, + "balance_loss_clip": 1.27596021, + "balance_loss_mlp": 1.01358747, + "epoch": 0.6695926649631745, + "flos": 24470184326400.0, + "grad_norm": 1.811101687926553, + "language_loss": 0.63008344, + "learning_rate": 1.0398321924963061e-06, + "loss": 0.65487576, + "num_input_tokens_seen": 240447815, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.19274902, + "step": 11137, + "time_per_iteration": 2.8636484146118164 + }, + { + "auxiliary_loss_clip": 0.0143336, + "auxiliary_loss_mlp": 0.01035733, + "balance_loss_clip": 1.26798069, + "balance_loss_mlp": 1.01582503, + "epoch": 0.6696527882158425, + "flos": 24290878608000.0, + "grad_norm": 2.4163579637871435, + "language_loss": 0.66850686, + "learning_rate": 1.0394905661243724e-06, + "loss": 0.69319779, + "num_input_tokens_seen": 240468635, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19897461, + "step": 11138, + "time_per_iteration": 2.8650753498077393 + }, + { + "auxiliary_loss_clip": 0.01408496, + "auxiliary_loss_mlp": 0.01031492, + "balance_loss_clip": 1.24764049, + "balance_loss_mlp": 1.01264477, + "epoch": 0.6697129114685104, + "flos": 23013070934400.0, + "grad_norm": 1.5618162247398246, + "language_loss": 0.73391861, + "learning_rate": 1.039148976175053e-06, + "loss": 0.75831848, + "num_input_tokens_seen": 240488550, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18847656, + "step": 11139, + "time_per_iteration": 2.846755266189575 + }, + { + "auxiliary_loss_clip": 0.01404749, + "auxiliary_loss_mlp": 0.01033597, + "balance_loss_clip": 1.24578691, + "balance_loss_mlp": 1.01494038, + "epoch": 0.6697730347211784, + "flos": 22648396694400.0, + "grad_norm": 2.036397201057483, + "language_loss": 0.71757936, + "learning_rate": 1.0388074226613016e-06, + "loss": 0.74196279, + "num_input_tokens_seen": 240508330, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18652344, + "step": 11140, + "time_per_iteration": 4.451754808425903 + }, + { + "auxiliary_loss_clip": 0.01431053, + "auxiliary_loss_mlp": 0.01032576, + "balance_loss_clip": 1.26204634, + "balance_loss_mlp": 1.01277506, + "epoch": 0.6698331579738463, + "flos": 28889149294080.0, + "grad_norm": 1.7601380381257328, + "language_loss": 0.75750124, + "learning_rate": 1.0384659055960691e-06, + "loss": 0.78213751, + "num_input_tokens_seen": 240528470, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.19799805, + "step": 11141, + "time_per_iteration": 3.000377893447876 + }, + { + "auxiliary_loss_clip": 0.01429784, + "auxiliary_loss_mlp": 0.01034426, + "balance_loss_clip": 1.2630012, + "balance_loss_mlp": 1.01488733, + "epoch": 0.6698932812265144, + "flos": 24217798976640.0, + "grad_norm": 2.0901854843769887, + "language_loss": 0.82884026, + "learning_rate": 1.0381244249923052e-06, + "loss": 0.85348237, + "num_input_tokens_seen": 240547815, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.19543457, + "step": 11142, + "time_per_iteration": 4.307194471359253 + }, + { + "auxiliary_loss_clip": 0.01409314, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.24594402, + "balance_loss_mlp": 1.01417947, + "epoch": 0.6699534044791823, + "flos": 22100254214400.0, + "grad_norm": 1.7102415254973589, + "language_loss": 0.7097699, + "learning_rate": 1.037782980862959e-06, + "loss": 0.73419368, + "num_input_tokens_seen": 240567765, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.1887207, + "step": 11143, + "time_per_iteration": 4.281297206878662 + }, + { + "auxiliary_loss_clip": 0.01406355, + "auxiliary_loss_mlp": 0.01031036, + "balance_loss_clip": 1.24522066, + "balance_loss_mlp": 1.01289189, + "epoch": 0.6700135277318503, + "flos": 25203378614400.0, + "grad_norm": 1.5262272589293764, + "language_loss": 0.70961374, + "learning_rate": 1.0374415732209796e-06, + "loss": 0.73398763, + "num_input_tokens_seen": 240590750, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.18151855, + "step": 11144, + "time_per_iteration": 2.8696465492248535 + }, + { + "auxiliary_loss_clip": 0.01429515, + "auxiliary_loss_mlp": 0.0103423, + "balance_loss_clip": 1.26533186, + "balance_loss_mlp": 1.01359534, + "epoch": 0.6700736509845182, + "flos": 23450508092160.0, + "grad_norm": 1.8308704367636814, + "language_loss": 0.75090277, + "learning_rate": 1.0371002020793114e-06, + "loss": 0.77554023, + "num_input_tokens_seen": 240608875, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.20617676, + "step": 11145, + "time_per_iteration": 2.847994804382324 + }, + { + "auxiliary_loss_clip": 0.01427899, + "auxiliary_loss_mlp": 0.01032928, + "balance_loss_clip": 1.26041603, + "balance_loss_mlp": 1.01284111, + "epoch": 0.6701337742371862, + "flos": 24400814768640.0, + "grad_norm": 4.131606129087925, + "language_loss": 0.71430516, + "learning_rate": 1.0367588674509008e-06, + "loss": 0.73891342, + "num_input_tokens_seen": 240628565, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.20080566, + "step": 11146, + "time_per_iteration": 2.8610198497772217 + }, + { + "auxiliary_loss_clip": 0.01400117, + "auxiliary_loss_mlp": 0.01038616, + "balance_loss_clip": 1.24055982, + "balance_loss_mlp": 1.01944685, + "epoch": 0.6701938974898543, + "flos": 14801882976000.0, + "grad_norm": 1.877995160589152, + "language_loss": 0.79297107, + "learning_rate": 1.0364175693486905e-06, + "loss": 0.81735837, + "num_input_tokens_seen": 240646325, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19152832, + "step": 11147, + "time_per_iteration": 2.8161816596984863 + }, + { + "auxiliary_loss_clip": 0.01422822, + "auxiliary_loss_mlp": 0.01034922, + "balance_loss_clip": 1.25895739, + "balance_loss_mlp": 1.0158844, + "epoch": 0.6702540207425222, + "flos": 20162829576960.0, + "grad_norm": 2.2741456182990536, + "language_loss": 0.70779908, + "learning_rate": 1.0360763077856218e-06, + "loss": 0.73237652, + "num_input_tokens_seen": 240666145, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19042969, + "step": 11148, + "time_per_iteration": 2.8232264518737793 + }, + { + "auxiliary_loss_clip": 0.01424379, + "auxiliary_loss_mlp": 0.01035499, + "balance_loss_clip": 1.2595582, + "balance_loss_mlp": 1.01668763, + "epoch": 0.6703141439951902, + "flos": 21223796330880.0, + "grad_norm": 2.0008857823438957, + "language_loss": 0.70817751, + "learning_rate": 1.035735082774636e-06, + "loss": 0.73277628, + "num_input_tokens_seen": 240685570, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.18811035, + "step": 11149, + "time_per_iteration": 2.8280773162841797 + }, + { + "auxiliary_loss_clip": 0.01423313, + "auxiliary_loss_mlp": 0.01030208, + "balance_loss_clip": 1.25677514, + "balance_loss_mlp": 1.011325, + "epoch": 0.6703742672478581, + "flos": 23122961850240.0, + "grad_norm": 1.9701891807864087, + "language_loss": 0.74783486, + "learning_rate": 1.0353938943286727e-06, + "loss": 0.7723701, + "num_input_tokens_seen": 240706945, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.18884277, + "step": 11150, + "time_per_iteration": 2.8882875442504883 + }, + { + "auxiliary_loss_clip": 0.01426883, + "auxiliary_loss_mlp": 0.01032155, + "balance_loss_clip": 1.26075721, + "balance_loss_mlp": 1.01342726, + "epoch": 0.6704343905005261, + "flos": 22539229695360.0, + "grad_norm": 2.7833942384166677, + "language_loss": 0.79441345, + "learning_rate": 1.035052742460671e-06, + "loss": 0.81900382, + "num_input_tokens_seen": 240727990, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.18737793, + "step": 11151, + "time_per_iteration": 2.910689115524292 + }, + { + "auxiliary_loss_clip": 0.01183419, + "auxiliary_loss_mlp": 0.01040254, + "balance_loss_clip": 1.09327173, + "balance_loss_mlp": 1.0198456, + "epoch": 0.670494513753194, + "flos": 64827469595520.0, + "grad_norm": 0.8078575823219534, + "language_loss": 0.55508566, + "learning_rate": 1.0347116271835643e-06, + "loss": 0.57732236, + "num_input_tokens_seen": 240790380, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.20410156, + "step": 11152, + "time_per_iteration": 3.4526185989379883 + }, + { + "auxiliary_loss_clip": 0.01421727, + "auxiliary_loss_mlp": 0.01031638, + "balance_loss_clip": 1.25489056, + "balance_loss_mlp": 1.0127914, + "epoch": 0.670554637005862, + "flos": 23521415973120.0, + "grad_norm": 2.333353269989664, + "language_loss": 0.81221437, + "learning_rate": 1.0343705485102896e-06, + "loss": 0.83674806, + "num_input_tokens_seen": 240811545, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.18835449, + "step": 11153, + "time_per_iteration": 2.8935024738311768 + }, + { + "auxiliary_loss_clip": 0.01425813, + "auxiliary_loss_mlp": 0.01034375, + "balance_loss_clip": 1.25877655, + "balance_loss_mlp": 1.01527762, + "epoch": 0.67061476025853, + "flos": 19472735600640.0, + "grad_norm": 1.4726488605530594, + "language_loss": 0.7674309, + "learning_rate": 1.0340295064537814e-06, + "loss": 0.79203284, + "num_input_tokens_seen": 240831380, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.19091797, + "step": 11154, + "time_per_iteration": 2.879734992980957 + }, + { + "auxiliary_loss_clip": 0.01438656, + "auxiliary_loss_mlp": 0.01039044, + "balance_loss_clip": 1.26845026, + "balance_loss_mlp": 1.01939797, + "epoch": 0.670674883511198, + "flos": 20529177874560.0, + "grad_norm": 1.400748324236154, + "language_loss": 0.76500577, + "learning_rate": 1.0336885010269702e-06, + "loss": 0.78978276, + "num_input_tokens_seen": 240851855, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.19628906, + "step": 11155, + "time_per_iteration": 2.8642945289611816 + }, + { + "auxiliary_loss_clip": 0.01440746, + "auxiliary_loss_mlp": 0.01036279, + "balance_loss_clip": 1.27413249, + "balance_loss_mlp": 1.01710987, + "epoch": 0.6707350067638659, + "flos": 25494973223040.0, + "grad_norm": 1.7676936066436593, + "language_loss": 0.8259865, + "learning_rate": 1.0333475322427878e-06, + "loss": 0.85075676, + "num_input_tokens_seen": 240869980, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.19165039, + "step": 11156, + "time_per_iteration": 2.89217209815979 + }, + { + "auxiliary_loss_clip": 0.01409426, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.248281, + "balance_loss_mlp": 1.01617122, + "epoch": 0.6707951300165339, + "flos": 22283677209600.0, + "grad_norm": 2.367773109414181, + "language_loss": 0.75600553, + "learning_rate": 1.033006600114165e-06, + "loss": 0.78045356, + "num_input_tokens_seen": 240888680, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.1920166, + "step": 11157, + "time_per_iteration": 2.853684425354004 + }, + { + "auxiliary_loss_clip": 0.01433207, + "auxiliary_loss_mlp": 0.01035631, + "balance_loss_clip": 1.265378, + "balance_loss_mlp": 1.01619959, + "epoch": 0.6708552532692018, + "flos": 23994307071360.0, + "grad_norm": 3.29698040121345, + "language_loss": 0.74919713, + "learning_rate": 1.0326657046540282e-06, + "loss": 0.77388549, + "num_input_tokens_seen": 240909050, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.19433594, + "step": 11158, + "time_per_iteration": 2.8713324069976807 + }, + { + "auxiliary_loss_clip": 0.01427007, + "auxiliary_loss_mlp": 0.01034512, + "balance_loss_clip": 1.25854897, + "balance_loss_mlp": 1.01413965, + "epoch": 0.6709153765218698, + "flos": 24948595290240.0, + "grad_norm": 1.5062742102445699, + "language_loss": 0.82238287, + "learning_rate": 1.0323248458753044e-06, + "loss": 0.8469981, + "num_input_tokens_seen": 240930035, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.20373535, + "step": 11159, + "time_per_iteration": 2.8994321823120117 + }, + { + "auxiliary_loss_clip": 0.01438357, + "auxiliary_loss_mlp": 0.0103283, + "balance_loss_clip": 1.27000368, + "balance_loss_mlp": 1.01368499, + "epoch": 0.6709754997745379, + "flos": 17539202016000.0, + "grad_norm": 3.228606841439439, + "language_loss": 0.77728015, + "learning_rate": 1.0319840237909193e-06, + "loss": 0.80199206, + "num_input_tokens_seen": 240948895, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.19140625, + "step": 11160, + "time_per_iteration": 2.8161447048187256 + }, + { + "auxiliary_loss_clip": 0.01421054, + "auxiliary_loss_mlp": 0.01033552, + "balance_loss_clip": 1.25675631, + "balance_loss_mlp": 1.013978, + "epoch": 0.6710356230272058, + "flos": 22101023376000.0, + "grad_norm": 1.8934254798651293, + "language_loss": 0.73954624, + "learning_rate": 1.0316432384137978e-06, + "loss": 0.76409227, + "num_input_tokens_seen": 240967770, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19567871, + "step": 11161, + "time_per_iteration": 2.8638856410980225 + }, + { + "auxiliary_loss_clip": 0.01434315, + "auxiliary_loss_mlp": 0.01038863, + "balance_loss_clip": 1.26466453, + "balance_loss_mlp": 1.01949191, + "epoch": 0.6710957462798738, + "flos": 24216984570240.0, + "grad_norm": 1.7723136089099039, + "language_loss": 0.69033521, + "learning_rate": 1.0313024897568618e-06, + "loss": 0.71506703, + "num_input_tokens_seen": 240988985, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.19360352, + "step": 11162, + "time_per_iteration": 2.9104793071746826 + }, + { + "auxiliary_loss_clip": 0.01421996, + "auxiliary_loss_mlp": 0.01036032, + "balance_loss_clip": 1.25759673, + "balance_loss_mlp": 1.0175786, + "epoch": 0.6711558695325417, + "flos": 19101998557440.0, + "grad_norm": 1.7946894933192514, + "language_loss": 0.70537663, + "learning_rate": 1.030961777833032e-06, + "loss": 0.72995687, + "num_input_tokens_seen": 241005455, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.18457031, + "step": 11163, + "time_per_iteration": 2.7993149757385254 + }, + { + "auxiliary_loss_clip": 0.01424824, + "auxiliary_loss_mlp": 0.01033821, + "balance_loss_clip": 1.26312685, + "balance_loss_mlp": 1.01498556, + "epoch": 0.6712159927852097, + "flos": 25569138729600.0, + "grad_norm": 1.5811604148597977, + "language_loss": 0.76072586, + "learning_rate": 1.0306211026552291e-06, + "loss": 0.78531229, + "num_input_tokens_seen": 241026175, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18835449, + "step": 11164, + "time_per_iteration": 4.275823593139648 + }, + { + "auxiliary_loss_clip": 0.01423045, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.25825715, + "balance_loss_mlp": 1.01299644, + "epoch": 0.6712761160378776, + "flos": 22236912069120.0, + "grad_norm": 2.0024064867941354, + "language_loss": 0.6588763, + "learning_rate": 1.0302804642363704e-06, + "loss": 0.68342394, + "num_input_tokens_seen": 241044040, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.18725586, + "step": 11165, + "time_per_iteration": 2.821869134902954 + }, + { + "auxiliary_loss_clip": 0.01421354, + "auxiliary_loss_mlp": 0.01035711, + "balance_loss_clip": 1.2573483, + "balance_loss_mlp": 1.01625609, + "epoch": 0.6713362392905456, + "flos": 22465697616000.0, + "grad_norm": 2.3670901914803384, + "language_loss": 0.72164333, + "learning_rate": 1.0299398625893738e-06, + "loss": 0.74621403, + "num_input_tokens_seen": 241063615, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19458008, + "step": 11166, + "time_per_iteration": 2.8316218852996826 + }, + { + "auxiliary_loss_clip": 0.01424416, + "auxiliary_loss_mlp": 0.01033016, + "balance_loss_clip": 1.26177442, + "balance_loss_mlp": 1.0142529, + "epoch": 0.6713963625432136, + "flos": 25641313464960.0, + "grad_norm": 1.877940402667431, + "language_loss": 0.77945936, + "learning_rate": 1.0295992977271546e-06, + "loss": 0.80403364, + "num_input_tokens_seen": 241082520, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18762207, + "step": 11167, + "time_per_iteration": 2.8686482906341553 + }, + { + "auxiliary_loss_clip": 0.01428548, + "auxiliary_loss_mlp": 0.01030395, + "balance_loss_clip": 1.26118827, + "balance_loss_mlp": 1.01142883, + "epoch": 0.6714564857958816, + "flos": 35019875243520.0, + "grad_norm": 1.7129522664165657, + "language_loss": 0.69379824, + "learning_rate": 1.029258769662629e-06, + "loss": 0.71838772, + "num_input_tokens_seen": 241103505, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.1895752, + "step": 11168, + "time_per_iteration": 2.9571127891540527 + }, + { + "auxiliary_loss_clip": 0.01438528, + "auxiliary_loss_mlp": 0.01041922, + "balance_loss_clip": 1.27015233, + "balance_loss_mlp": 1.02027404, + "epoch": 0.6715166090485495, + "flos": 26289393004800.0, + "grad_norm": 1.9746275895954053, + "language_loss": 0.74588192, + "learning_rate": 1.0289182784087068e-06, + "loss": 0.77068645, + "num_input_tokens_seen": 241122885, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.21655273, + "step": 11169, + "time_per_iteration": 2.848971366882324 + }, + { + "auxiliary_loss_clip": 0.01440244, + "auxiliary_loss_mlp": 0.01033073, + "balance_loss_clip": 1.27111816, + "balance_loss_mlp": 1.01361799, + "epoch": 0.6715767323012175, + "flos": 15932536001280.0, + "grad_norm": 2.1919190267895057, + "language_loss": 0.7668069, + "learning_rate": 1.0285778239783005e-06, + "loss": 0.79154009, + "num_input_tokens_seen": 241140865, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.19433594, + "step": 11170, + "time_per_iteration": 2.7861461639404297 + }, + { + "auxiliary_loss_clip": 0.01434538, + "auxiliary_loss_mlp": 0.01031963, + "balance_loss_clip": 1.26590717, + "balance_loss_mlp": 1.01250851, + "epoch": 0.6716368555538854, + "flos": 17499811777920.0, + "grad_norm": 2.3456806494638127, + "language_loss": 0.75897145, + "learning_rate": 1.0282374063843212e-06, + "loss": 0.78363645, + "num_input_tokens_seen": 241158225, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.19445801, + "step": 11171, + "time_per_iteration": 2.827265501022339 + }, + { + "auxiliary_loss_clip": 0.01438658, + "auxiliary_loss_mlp": 0.01033438, + "balance_loss_clip": 1.26980472, + "balance_loss_mlp": 1.0141499, + "epoch": 0.6716969788065534, + "flos": 16769648891520.0, + "grad_norm": 1.6024656472006908, + "language_loss": 0.86998653, + "learning_rate": 1.0278970256396762e-06, + "loss": 0.8947075, + "num_input_tokens_seen": 241175215, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.19299316, + "step": 11172, + "time_per_iteration": 2.866748571395874 + }, + { + "auxiliary_loss_clip": 0.01425849, + "auxiliary_loss_mlp": 0.01034252, + "balance_loss_clip": 1.25934291, + "balance_loss_mlp": 1.01406956, + "epoch": 0.6717571020592215, + "flos": 22719757023360.0, + "grad_norm": 4.216270137151523, + "language_loss": 0.63878965, + "learning_rate": 1.0275566817572733e-06, + "loss": 0.6633907, + "num_input_tokens_seen": 241195250, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.20178223, + "step": 11173, + "time_per_iteration": 2.861457586288452 + }, + { + "auxiliary_loss_clip": 0.01458236, + "auxiliary_loss_mlp": 0.01038276, + "balance_loss_clip": 1.28311241, + "balance_loss_mlp": 1.01796293, + "epoch": 0.6718172253118894, + "flos": 18743341875840.0, + "grad_norm": 2.539078668148549, + "language_loss": 0.72649062, + "learning_rate": 1.02721637475002e-06, + "loss": 0.75145566, + "num_input_tokens_seen": 241210720, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.203125, + "step": 11174, + "time_per_iteration": 2.816046953201294 + }, + { + "auxiliary_loss_clip": 0.01420469, + "auxiliary_loss_mlp": 0.01035256, + "balance_loss_clip": 1.25847411, + "balance_loss_mlp": 1.015885, + "epoch": 0.6718773485645574, + "flos": 15640896147840.0, + "grad_norm": 2.127704778584871, + "language_loss": 0.69573224, + "learning_rate": 1.0268761046308178e-06, + "loss": 0.72028953, + "num_input_tokens_seen": 241227395, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19360352, + "step": 11175, + "time_per_iteration": 4.3781023025512695 + }, + { + "auxiliary_loss_clip": 0.01413576, + "auxiliary_loss_mlp": 0.01033161, + "balance_loss_clip": 1.25185823, + "balance_loss_mlp": 1.01454067, + "epoch": 0.6719374718172253, + "flos": 19364292518400.0, + "grad_norm": 1.8740866203121913, + "language_loss": 0.74763626, + "learning_rate": 1.0265358714125714e-06, + "loss": 0.77210361, + "num_input_tokens_seen": 241246355, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18615723, + "step": 11176, + "time_per_iteration": 2.9132983684539795 + }, + { + "auxiliary_loss_clip": 0.01431314, + "auxiliary_loss_mlp": 0.01033277, + "balance_loss_clip": 1.26257896, + "balance_loss_mlp": 1.01475179, + "epoch": 0.6719975950698933, + "flos": 21991403928960.0, + "grad_norm": 1.8377965480122378, + "language_loss": 0.7437501, + "learning_rate": 1.026195675108182e-06, + "loss": 0.76839602, + "num_input_tokens_seen": 241264180, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.18518066, + "step": 11177, + "time_per_iteration": 4.2998597621917725 + }, + { + "auxiliary_loss_clip": 0.01423326, + "auxiliary_loss_mlp": 0.01034633, + "balance_loss_clip": 1.25719023, + "balance_loss_mlp": 1.01486778, + "epoch": 0.6720577183225612, + "flos": 25238877799680.0, + "grad_norm": 2.0055842456422597, + "language_loss": 0.78036511, + "learning_rate": 1.025855515730551e-06, + "loss": 0.80494475, + "num_input_tokens_seen": 241282245, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.19763184, + "step": 11178, + "time_per_iteration": 2.880328893661499 + }, + { + "auxiliary_loss_clip": 0.01442148, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.27287006, + "balance_loss_mlp": 1.01281381, + "epoch": 0.6721178415752292, + "flos": 16954564965120.0, + "grad_norm": 2.6189697790450728, + "language_loss": 0.71288818, + "learning_rate": 1.0255153932925766e-06, + "loss": 0.7376259, + "num_input_tokens_seen": 241300745, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.18823242, + "step": 11179, + "time_per_iteration": 4.227755546569824 + }, + { + "auxiliary_loss_clip": 0.01429492, + "auxiliary_loss_mlp": 0.01031804, + "balance_loss_clip": 1.26429701, + "balance_loss_mlp": 1.01239693, + "epoch": 0.6721779648278972, + "flos": 21550799635200.0, + "grad_norm": 1.4999145056035152, + "language_loss": 0.74815279, + "learning_rate": 1.0251753078071557e-06, + "loss": 0.77276576, + "num_input_tokens_seen": 241319320, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19396973, + "step": 11180, + "time_per_iteration": 2.887552261352539 + }, + { + "auxiliary_loss_clip": 0.01414608, + "auxiliary_loss_mlp": 0.01032544, + "balance_loss_clip": 1.25117397, + "balance_loss_mlp": 1.01274359, + "epoch": 0.6722380880805652, + "flos": 22616698072320.0, + "grad_norm": 1.5216115196224718, + "language_loss": 0.75898337, + "learning_rate": 1.0248352592871848e-06, + "loss": 0.7834549, + "num_input_tokens_seen": 241342225, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.19812012, + "step": 11181, + "time_per_iteration": 2.943776845932007 + }, + { + "auxiliary_loss_clip": 0.01421305, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.25448966, + "balance_loss_mlp": 1.01232052, + "epoch": 0.6722982113332331, + "flos": 15933938590080.0, + "grad_norm": 1.873958847421389, + "language_loss": 0.75246596, + "learning_rate": 1.0244952477455585e-06, + "loss": 0.77699566, + "num_input_tokens_seen": 241358240, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.19335938, + "step": 11182, + "time_per_iteration": 2.8862056732177734 + }, + { + "auxiliary_loss_clip": 0.0141601, + "auxiliary_loss_mlp": 0.01037266, + "balance_loss_clip": 1.25324702, + "balance_loss_mlp": 1.01794171, + "epoch": 0.6723583345859011, + "flos": 20606239048320.0, + "grad_norm": 3.1872858020171977, + "language_loss": 0.70487446, + "learning_rate": 1.0241552731951699e-06, + "loss": 0.72940719, + "num_input_tokens_seen": 241378420, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19335938, + "step": 11183, + "time_per_iteration": 2.8479068279266357 + }, + { + "auxiliary_loss_clip": 0.01420587, + "auxiliary_loss_mlp": 0.01036485, + "balance_loss_clip": 1.25617456, + "balance_loss_mlp": 1.01666093, + "epoch": 0.672418457838569, + "flos": 21735851443200.0, + "grad_norm": 1.530763082606305, + "language_loss": 0.78599769, + "learning_rate": 1.0238153356489112e-06, + "loss": 0.81056839, + "num_input_tokens_seen": 241397185, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19812012, + "step": 11184, + "time_per_iteration": 2.823882579803467 + }, + { + "auxiliary_loss_clip": 0.01455416, + "auxiliary_loss_mlp": 0.01035006, + "balance_loss_clip": 1.28180718, + "balance_loss_mlp": 1.0151813, + "epoch": 0.672478581091237, + "flos": 21480298957440.0, + "grad_norm": 2.445310921564665, + "language_loss": 0.67221904, + "learning_rate": 1.0234754351196743e-06, + "loss": 0.69712329, + "num_input_tokens_seen": 241415785, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.19812012, + "step": 11185, + "time_per_iteration": 2.834935188293457 + }, + { + "auxiliary_loss_clip": 0.01426319, + "auxiliary_loss_mlp": 0.01037009, + "balance_loss_clip": 1.26090682, + "balance_loss_mlp": 1.01667237, + "epoch": 0.6725387043439051, + "flos": 30858996470400.0, + "grad_norm": 1.7372146392619452, + "language_loss": 0.80933475, + "learning_rate": 1.023135571620345e-06, + "loss": 0.83396804, + "num_input_tokens_seen": 241437390, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.20336914, + "step": 11186, + "time_per_iteration": 2.9240360260009766 + }, + { + "auxiliary_loss_clip": 0.01411298, + "auxiliary_loss_mlp": 0.01034592, + "balance_loss_clip": 1.24991298, + "balance_loss_mlp": 1.01597142, + "epoch": 0.672598827596573, + "flos": 24065260197120.0, + "grad_norm": 2.2527238905003832, + "language_loss": 0.81020319, + "learning_rate": 1.022795745163813e-06, + "loss": 0.83466208, + "num_input_tokens_seen": 241458085, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18615723, + "step": 11187, + "time_per_iteration": 2.890038013458252 + }, + { + "auxiliary_loss_clip": 0.01459355, + "auxiliary_loss_mlp": 0.01039299, + "balance_loss_clip": 1.28602266, + "balance_loss_mlp": 1.01952231, + "epoch": 0.672658950849241, + "flos": 21881920216320.0, + "grad_norm": 2.036410489214374, + "language_loss": 0.71433043, + "learning_rate": 1.022455955762965e-06, + "loss": 0.73931694, + "num_input_tokens_seen": 241476880, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.19775391, + "step": 11188, + "time_per_iteration": 2.862586498260498 + }, + { + "auxiliary_loss_clip": 0.01416514, + "auxiliary_loss_mlp": 0.01036496, + "balance_loss_clip": 1.2546593, + "balance_loss_mlp": 1.01794672, + "epoch": 0.6727190741019089, + "flos": 23232581297280.0, + "grad_norm": 1.7101983885023246, + "language_loss": 0.76592708, + "learning_rate": 1.0221162034306842e-06, + "loss": 0.79045719, + "num_input_tokens_seen": 241496535, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.1854248, + "step": 11189, + "time_per_iteration": 2.930891752243042 + }, + { + "auxiliary_loss_clip": 0.01436147, + "auxiliary_loss_mlp": 0.01033499, + "balance_loss_clip": 1.26652646, + "balance_loss_mlp": 1.0140326, + "epoch": 0.6727791973545769, + "flos": 15787055410560.0, + "grad_norm": 2.295697020851425, + "language_loss": 0.76120287, + "learning_rate": 1.0217764881798562e-06, + "loss": 0.78589934, + "num_input_tokens_seen": 241513465, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.19470215, + "step": 11190, + "time_per_iteration": 2.816148042678833 + }, + { + "auxiliary_loss_clip": 0.01414421, + "auxiliary_loss_mlp": 0.01032545, + "balance_loss_clip": 1.25032699, + "balance_loss_mlp": 1.01311409, + "epoch": 0.6728393206072448, + "flos": 21259295516160.0, + "grad_norm": 1.5104859132541189, + "language_loss": 0.77421969, + "learning_rate": 1.0214368100233612e-06, + "loss": 0.79868937, + "num_input_tokens_seen": 241534125, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19421387, + "step": 11191, + "time_per_iteration": 2.8385205268859863 + }, + { + "auxiliary_loss_clip": 0.01418455, + "auxiliary_loss_mlp": 0.01032103, + "balance_loss_clip": 1.25611889, + "balance_loss_mlp": 1.01313663, + "epoch": 0.6728994438599128, + "flos": 32135718268800.0, + "grad_norm": 1.6843919186816652, + "language_loss": 0.86885458, + "learning_rate": 1.0210971689740802e-06, + "loss": 0.89336014, + "num_input_tokens_seen": 241556340, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18969727, + "step": 11192, + "time_per_iteration": 3.0052175521850586 + }, + { + "auxiliary_loss_clip": 0.01432333, + "auxiliary_loss_mlp": 0.01030843, + "balance_loss_clip": 1.26506472, + "balance_loss_mlp": 1.01075613, + "epoch": 0.6729595671125808, + "flos": 23122735626240.0, + "grad_norm": 2.2802828924453307, + "language_loss": 0.76717865, + "learning_rate": 1.0207575650448923e-06, + "loss": 0.79181039, + "num_input_tokens_seen": 241575185, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.20080566, + "step": 11193, + "time_per_iteration": 2.8555662631988525 + }, + { + "auxiliary_loss_clip": 0.01420961, + "auxiliary_loss_mlp": 0.01034553, + "balance_loss_clip": 1.25622106, + "balance_loss_mlp": 1.01513338, + "epoch": 0.6730196903652488, + "flos": 14619636345600.0, + "grad_norm": 2.109629072564097, + "language_loss": 0.8001405, + "learning_rate": 1.0204179982486758e-06, + "loss": 0.82469571, + "num_input_tokens_seen": 241592970, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19396973, + "step": 11194, + "time_per_iteration": 2.828916311264038 + }, + { + "auxiliary_loss_clip": 0.0142019, + "auxiliary_loss_mlp": 0.01031664, + "balance_loss_clip": 1.25319254, + "balance_loss_mlp": 1.01307917, + "epoch": 0.6730798136179167, + "flos": 21115850941440.0, + "grad_norm": 1.821957805780065, + "language_loss": 0.90292907, + "learning_rate": 1.0200784685983075e-06, + "loss": 0.92744762, + "num_input_tokens_seen": 241610245, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.18566895, + "step": 11195, + "time_per_iteration": 2.841773509979248 + }, + { + "auxiliary_loss_clip": 0.01423592, + "auxiliary_loss_mlp": 0.01040116, + "balance_loss_clip": 1.25889146, + "balance_loss_mlp": 1.02088737, + "epoch": 0.6731399368705847, + "flos": 28998090069120.0, + "grad_norm": 1.713372310781971, + "language_loss": 0.73345435, + "learning_rate": 1.019738976106662e-06, + "loss": 0.75809145, + "num_input_tokens_seen": 241630350, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.19250488, + "step": 11196, + "time_per_iteration": 2.877742290496826 + }, + { + "auxiliary_loss_clip": 0.01182368, + "auxiliary_loss_mlp": 0.01084762, + "balance_loss_clip": 1.09578657, + "balance_loss_mlp": 1.06158817, + "epoch": 0.6732000601232526, + "flos": 64774641651840.0, + "grad_norm": 0.7984003756862422, + "language_loss": 0.56571376, + "learning_rate": 1.0193995207866123e-06, + "loss": 0.58838511, + "num_input_tokens_seen": 241692380, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.23144531, + "step": 11197, + "time_per_iteration": 3.2934730052948 + }, + { + "auxiliary_loss_clip": 0.01416828, + "auxiliary_loss_mlp": 0.01032812, + "balance_loss_clip": 1.2569983, + "balance_loss_mlp": 1.01414371, + "epoch": 0.6732601833759206, + "flos": 17210479409280.0, + "grad_norm": 2.597663402591247, + "language_loss": 0.7622714, + "learning_rate": 1.0190601026510312e-06, + "loss": 0.78676778, + "num_input_tokens_seen": 241710430, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18664551, + "step": 11198, + "time_per_iteration": 2.8126072883605957 + }, + { + "auxiliary_loss_clip": 0.01435448, + "auxiliary_loss_mlp": 0.01036413, + "balance_loss_clip": 1.26756454, + "balance_loss_mlp": 1.01668358, + "epoch": 0.6733203066285887, + "flos": 18667502311680.0, + "grad_norm": 1.9467753556814487, + "language_loss": 0.82441616, + "learning_rate": 1.0187207217127892e-06, + "loss": 0.8491348, + "num_input_tokens_seen": 241724775, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.1973877, + "step": 11199, + "time_per_iteration": 4.250103235244751 + }, + { + "auxiliary_loss_clip": 0.01436669, + "auxiliary_loss_mlp": 0.01036138, + "balance_loss_clip": 1.26785219, + "balance_loss_mlp": 1.01650429, + "epoch": 0.6733804298812566, + "flos": 35822982026880.0, + "grad_norm": 1.67463560400295, + "language_loss": 0.72247314, + "learning_rate": 1.0183813779847552e-06, + "loss": 0.7472012, + "num_input_tokens_seen": 241744440, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.19628906, + "step": 11200, + "time_per_iteration": 2.9659039974212646 + }, + { + "auxiliary_loss_clip": 0.01437149, + "auxiliary_loss_mlp": 0.01038224, + "balance_loss_clip": 1.26954925, + "balance_loss_mlp": 1.01906657, + "epoch": 0.6734405531339246, + "flos": 61658559256320.0, + "grad_norm": 1.4378163095899341, + "language_loss": 0.64952421, + "learning_rate": 1.0180420714797987e-06, + "loss": 0.6742779, + "num_input_tokens_seen": 241771705, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.19140625, + "step": 11201, + "time_per_iteration": 3.241058349609375 + }, + { + "auxiliary_loss_clip": 0.01439939, + "auxiliary_loss_mlp": 0.01039105, + "balance_loss_clip": 1.27089405, + "balance_loss_mlp": 1.01886332, + "epoch": 0.6735006763865925, + "flos": 20532164031360.0, + "grad_norm": 1.7219050916038503, + "language_loss": 0.64179194, + "learning_rate": 1.0177028022107856e-06, + "loss": 0.66658235, + "num_input_tokens_seen": 241790830, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.20239258, + "step": 11202, + "time_per_iteration": 2.846827983856201 + }, + { + "auxiliary_loss_clip": 0.0142671, + "auxiliary_loss_mlp": 0.01034229, + "balance_loss_clip": 1.26033819, + "balance_loss_mlp": 1.01498878, + "epoch": 0.6735607996392605, + "flos": 13927099150080.0, + "grad_norm": 2.094537201504983, + "language_loss": 0.75993222, + "learning_rate": 1.0173635701905796e-06, + "loss": 0.78454167, + "num_input_tokens_seen": 241808165, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.19226074, + "step": 11203, + "time_per_iteration": 2.8294644355773926 + }, + { + "auxiliary_loss_clip": 0.01450899, + "auxiliary_loss_mlp": 0.01037255, + "balance_loss_clip": 1.27760839, + "balance_loss_mlp": 1.01753736, + "epoch": 0.6736209228919284, + "flos": 18816557241600.0, + "grad_norm": 1.671165476643503, + "language_loss": 0.68456972, + "learning_rate": 1.0170243754320456e-06, + "loss": 0.7094512, + "num_input_tokens_seen": 241826925, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.19702148, + "step": 11204, + "time_per_iteration": 2.81169056892395 + }, + { + "auxiliary_loss_clip": 0.01439559, + "auxiliary_loss_mlp": 0.0103462, + "balance_loss_clip": 1.27003956, + "balance_loss_mlp": 1.01497412, + "epoch": 0.6736810461445965, + "flos": 20382068471040.0, + "grad_norm": 2.1371447813929247, + "language_loss": 0.74405366, + "learning_rate": 1.0166852179480465e-06, + "loss": 0.76879549, + "num_input_tokens_seen": 241845525, + "router_z_loss_clip": 1.69726562, + "router_z_loss_mlp": 0.19628906, + "step": 11205, + "time_per_iteration": 2.827439069747925 + }, + { + "auxiliary_loss_clip": 0.01408871, + "auxiliary_loss_mlp": 0.01032396, + "balance_loss_clip": 1.247684, + "balance_loss_mlp": 1.01408553, + "epoch": 0.6737411693972644, + "flos": 30019530850560.0, + "grad_norm": 4.385556712548779, + "language_loss": 0.72109318, + "learning_rate": 1.0163460977514416e-06, + "loss": 0.74550581, + "num_input_tokens_seen": 241866815, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.1829834, + "step": 11206, + "time_per_iteration": 2.9045214653015137 + }, + { + "auxiliary_loss_clip": 0.01466514, + "auxiliary_loss_mlp": 0.01040026, + "balance_loss_clip": 1.29115605, + "balance_loss_mlp": 1.01973677, + "epoch": 0.6738012926499324, + "flos": 25458297672960.0, + "grad_norm": 1.7783335739800734, + "language_loss": 0.68024772, + "learning_rate": 1.016007014855092e-06, + "loss": 0.70531309, + "num_input_tokens_seen": 241887050, + "router_z_loss_clip": 1.75488281, + "router_z_loss_mlp": 0.20275879, + "step": 11207, + "time_per_iteration": 2.8745791912078857 + }, + { + "auxiliary_loss_clip": 0.01417726, + "auxiliary_loss_mlp": 0.01037723, + "balance_loss_clip": 1.25631964, + "balance_loss_mlp": 1.01785088, + "epoch": 0.6738614159026003, + "flos": 20786540152320.0, + "grad_norm": 2.1827011863755263, + "language_loss": 0.74552751, + "learning_rate": 1.0156679692718553e-06, + "loss": 0.77008206, + "num_input_tokens_seen": 241904280, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.1986084, + "step": 11208, + "time_per_iteration": 2.82566237449646 + }, + { + "auxiliary_loss_clip": 0.01432331, + "auxiliary_loss_mlp": 0.0104412, + "balance_loss_clip": 1.26353407, + "balance_loss_mlp": 1.02341294, + "epoch": 0.6739215391552683, + "flos": 19574708676480.0, + "grad_norm": 1.9458324391368427, + "language_loss": 0.76215959, + "learning_rate": 1.0153289610145867e-06, + "loss": 0.78692412, + "num_input_tokens_seen": 241919190, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.20703125, + "step": 11209, + "time_per_iteration": 2.803452968597412 + }, + { + "auxiliary_loss_clip": 0.01405612, + "auxiliary_loss_mlp": 0.01036199, + "balance_loss_clip": 1.2455771, + "balance_loss_mlp": 1.01737571, + "epoch": 0.6739816624079362, + "flos": 24398597773440.0, + "grad_norm": 1.6456170043600975, + "language_loss": 0.67357314, + "learning_rate": 1.0149899900961428e-06, + "loss": 0.69799125, + "num_input_tokens_seen": 241940525, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18811035, + "step": 11210, + "time_per_iteration": 4.2679266929626465 + }, + { + "auxiliary_loss_clip": 0.01411303, + "auxiliary_loss_mlp": 0.01032979, + "balance_loss_clip": 1.25014257, + "balance_loss_mlp": 1.01525259, + "epoch": 0.6740417856606042, + "flos": 22538279554560.0, + "grad_norm": 2.5708709556369653, + "language_loss": 0.80498546, + "learning_rate": 1.014651056529377e-06, + "loss": 0.82942826, + "num_input_tokens_seen": 241959290, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.17724609, + "step": 11211, + "time_per_iteration": 4.3047380447387695 + }, + { + "auxiliary_loss_clip": 0.01404781, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.24246323, + "balance_loss_mlp": 1.01646554, + "epoch": 0.6741019089132723, + "flos": 25786432097280.0, + "grad_norm": 1.3964461783003972, + "language_loss": 0.76820254, + "learning_rate": 1.014312160327143e-06, + "loss": 0.7926079, + "num_input_tokens_seen": 241980715, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19262695, + "step": 11212, + "time_per_iteration": 2.8905327320098877 + }, + { + "auxiliary_loss_clip": 0.01415639, + "auxiliary_loss_mlp": 0.01031356, + "balance_loss_clip": 1.24910808, + "balance_loss_mlp": 1.01219964, + "epoch": 0.6741620321659402, + "flos": 21115534227840.0, + "grad_norm": 1.6968079917243384, + "language_loss": 0.78912944, + "learning_rate": 1.0139733015022905e-06, + "loss": 0.81359935, + "num_input_tokens_seen": 241999985, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.19165039, + "step": 11213, + "time_per_iteration": 2.852665901184082 + }, + { + "auxiliary_loss_clip": 0.01435311, + "auxiliary_loss_mlp": 0.0103997, + "balance_loss_clip": 1.26730454, + "balance_loss_mlp": 1.02067053, + "epoch": 0.6742221554186082, + "flos": 20750090826240.0, + "grad_norm": 4.413172105669341, + "language_loss": 0.68297416, + "learning_rate": 1.0136344800676685e-06, + "loss": 0.70772696, + "num_input_tokens_seen": 242018990, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.19311523, + "step": 11214, + "time_per_iteration": 4.2738142013549805 + }, + { + "auxiliary_loss_clip": 0.01434555, + "auxiliary_loss_mlp": 0.01044656, + "balance_loss_clip": 1.26626837, + "balance_loss_mlp": 1.02571392, + "epoch": 0.6742822786712761, + "flos": 37786902134400.0, + "grad_norm": 1.613315426621431, + "language_loss": 0.73166674, + "learning_rate": 1.0132956960361263e-06, + "loss": 0.75645888, + "num_input_tokens_seen": 242039340, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.1895752, + "step": 11215, + "time_per_iteration": 2.975663661956787 + }, + { + "auxiliary_loss_clip": 0.01429765, + "auxiliary_loss_mlp": 0.01033821, + "balance_loss_clip": 1.26153159, + "balance_loss_mlp": 1.01561785, + "epoch": 0.6743424019239441, + "flos": 37276656814080.0, + "grad_norm": 2.185111848608758, + "language_loss": 0.67444539, + "learning_rate": 1.0129569494205096e-06, + "loss": 0.69908118, + "num_input_tokens_seen": 242062215, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.18225098, + "step": 11216, + "time_per_iteration": 2.9590272903442383 + }, + { + "auxiliary_loss_clip": 0.01183531, + "auxiliary_loss_mlp": 0.01037541, + "balance_loss_clip": 1.09696531, + "balance_loss_mlp": 1.017609, + "epoch": 0.674402525176612, + "flos": 66032514351360.0, + "grad_norm": 0.681121460388815, + "language_loss": 0.56298029, + "learning_rate": 1.0126182402336646e-06, + "loss": 0.58519101, + "num_input_tokens_seen": 242131130, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.19921875, + "step": 11217, + "time_per_iteration": 3.4389703273773193 + }, + { + "auxiliary_loss_clip": 0.01416925, + "auxiliary_loss_mlp": 0.01032233, + "balance_loss_clip": 1.25432634, + "balance_loss_mlp": 1.01388645, + "epoch": 0.67446264842928, + "flos": 26470418025600.0, + "grad_norm": 1.7387756178730809, + "language_loss": 0.75140107, + "learning_rate": 1.0122795684884363e-06, + "loss": 0.77589273, + "num_input_tokens_seen": 242149720, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.18347168, + "step": 11218, + "time_per_iteration": 2.936734437942505 + }, + { + "auxiliary_loss_clip": 0.0143286, + "auxiliary_loss_mlp": 0.0103675, + "balance_loss_clip": 1.26549768, + "balance_loss_mlp": 1.0169611, + "epoch": 0.674522771681948, + "flos": 23742871862400.0, + "grad_norm": 1.6161481067398233, + "language_loss": 0.66513628, + "learning_rate": 1.0119409341976639e-06, + "loss": 0.68983233, + "num_input_tokens_seen": 242168875, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.19787598, + "step": 11219, + "time_per_iteration": 2.908695936203003 + }, + { + "auxiliary_loss_clip": 0.01429585, + "auxiliary_loss_mlp": 0.01032793, + "balance_loss_clip": 1.26283193, + "balance_loss_mlp": 1.01418471, + "epoch": 0.674582894934616, + "flos": 24765172295040.0, + "grad_norm": 1.6903957586421063, + "language_loss": 0.75382268, + "learning_rate": 1.0116023373741904e-06, + "loss": 0.77844656, + "num_input_tokens_seen": 242188465, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.18603516, + "step": 11220, + "time_per_iteration": 2.884932279586792 + }, + { + "auxiliary_loss_clip": 0.01413808, + "auxiliary_loss_mlp": 0.01035485, + "balance_loss_clip": 1.24881911, + "balance_loss_mlp": 1.01546955, + "epoch": 0.6746430181872839, + "flos": 24837301785600.0, + "grad_norm": 1.5911339138647658, + "language_loss": 0.71196866, + "learning_rate": 1.0112637780308554e-06, + "loss": 0.73646158, + "num_input_tokens_seen": 242208675, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.20031738, + "step": 11221, + "time_per_iteration": 2.9021379947662354 + }, + { + "auxiliary_loss_clip": 0.0142158, + "auxiliary_loss_mlp": 0.01031913, + "balance_loss_clip": 1.25726807, + "balance_loss_mlp": 1.01375723, + "epoch": 0.6747031414399519, + "flos": 16882299740160.0, + "grad_norm": 1.7259143951453721, + "language_loss": 0.58972275, + "learning_rate": 1.010925256180498e-06, + "loss": 0.61425769, + "num_input_tokens_seen": 242227440, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.18151855, + "step": 11222, + "time_per_iteration": 2.8484365940093994 + }, + { + "auxiliary_loss_clip": 0.01429236, + "auxiliary_loss_mlp": 0.01032485, + "balance_loss_clip": 1.26164019, + "balance_loss_mlp": 1.01150453, + "epoch": 0.6747632646926198, + "flos": 22795460853120.0, + "grad_norm": 2.2933928167687645, + "language_loss": 0.77352387, + "learning_rate": 1.0105867718359528e-06, + "loss": 0.79814106, + "num_input_tokens_seen": 242245240, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.20959473, + "step": 11223, + "time_per_iteration": 2.8326826095581055 + }, + { + "auxiliary_loss_clip": 0.0141933, + "auxiliary_loss_mlp": 0.01031217, + "balance_loss_clip": 1.25479794, + "balance_loss_mlp": 1.01215553, + "epoch": 0.6748233879452878, + "flos": 20055291390720.0, + "grad_norm": 1.6041890913579429, + "language_loss": 0.7598027, + "learning_rate": 1.0102483250100574e-06, + "loss": 0.7843082, + "num_input_tokens_seen": 242263435, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.1907959, + "step": 11224, + "time_per_iteration": 2.827742576599121 + }, + { + "auxiliary_loss_clip": 0.01419417, + "auxiliary_loss_mlp": 0.01032942, + "balance_loss_clip": 1.25589454, + "balance_loss_mlp": 1.01415491, + "epoch": 0.6748835111979558, + "flos": 23013161424000.0, + "grad_norm": 1.5643717431182482, + "language_loss": 0.63308573, + "learning_rate": 1.0099099157156445e-06, + "loss": 0.65760928, + "num_input_tokens_seen": 242282765, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.18774414, + "step": 11225, + "time_per_iteration": 2.90899658203125 + }, + { + "auxiliary_loss_clip": 0.01405717, + "auxiliary_loss_mlp": 0.0103321, + "balance_loss_clip": 1.24597096, + "balance_loss_mlp": 1.0149951, + "epoch": 0.6749436344506238, + "flos": 12203348296320.0, + "grad_norm": 1.6692321059844535, + "language_loss": 0.64526415, + "learning_rate": 1.0095715439655462e-06, + "loss": 0.66965348, + "num_input_tokens_seen": 242298980, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18225098, + "step": 11226, + "time_per_iteration": 2.817673444747925 + }, + { + "auxiliary_loss_clip": 0.01423934, + "auxiliary_loss_mlp": 0.01031587, + "balance_loss_clip": 1.25685787, + "balance_loss_mlp": 1.01240575, + "epoch": 0.6750037577032918, + "flos": 11880914716800.0, + "grad_norm": 2.3332411981327725, + "language_loss": 0.72527331, + "learning_rate": 1.0092332097725945e-06, + "loss": 0.74982852, + "num_input_tokens_seen": 242315420, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.19189453, + "step": 11227, + "time_per_iteration": 2.8295962810516357 + }, + { + "auxiliary_loss_clip": 0.01410637, + "auxiliary_loss_mlp": 0.01034281, + "balance_loss_clip": 1.24777675, + "balance_loss_mlp": 1.01434946, + "epoch": 0.6750638809559597, + "flos": 17028504247680.0, + "grad_norm": 1.8972293351223453, + "language_loss": 0.72733188, + "learning_rate": 1.0088949131496183e-06, + "loss": 0.75178099, + "num_input_tokens_seen": 242332805, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19934082, + "step": 11228, + "time_per_iteration": 2.826847791671753 + }, + { + "auxiliary_loss_clip": 0.01186252, + "auxiliary_loss_mlp": 0.01023069, + "balance_loss_clip": 1.09869015, + "balance_loss_mlp": 1.00294614, + "epoch": 0.6751240042086277, + "flos": 70984781504640.0, + "grad_norm": 0.7623513704984965, + "language_loss": 0.53358984, + "learning_rate": 1.0085566541094482e-06, + "loss": 0.55568302, + "num_input_tokens_seen": 242396160, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.20117188, + "step": 11229, + "time_per_iteration": 3.395169496536255 + }, + { + "auxiliary_loss_clip": 0.0140433, + "auxiliary_loss_mlp": 0.01031647, + "balance_loss_clip": 1.24352229, + "balance_loss_mlp": 1.01246583, + "epoch": 0.6751841274612956, + "flos": 22685569937280.0, + "grad_norm": 1.7516031160688172, + "language_loss": 0.81049269, + "learning_rate": 1.0082184326649072e-06, + "loss": 0.83485246, + "num_input_tokens_seen": 242414660, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19177246, + "step": 11230, + "time_per_iteration": 2.8386027812957764 + }, + { + "auxiliary_loss_clip": 0.01411586, + "auxiliary_loss_mlp": 0.0103061, + "balance_loss_clip": 1.24990475, + "balance_loss_mlp": 1.01132226, + "epoch": 0.6752442507139637, + "flos": 21298685754240.0, + "grad_norm": 1.4240900585739449, + "language_loss": 0.6667136, + "learning_rate": 1.0078802488288228e-06, + "loss": 0.69113553, + "num_input_tokens_seen": 242434225, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19299316, + "step": 11231, + "time_per_iteration": 2.8295722007751465 + }, + { + "auxiliary_loss_clip": 0.0144464, + "auxiliary_loss_mlp": 0.01040115, + "balance_loss_clip": 1.2732935, + "balance_loss_mlp": 1.01919436, + "epoch": 0.6753043739666316, + "flos": 28268651099520.0, + "grad_norm": 1.7217560347602512, + "language_loss": 0.67598951, + "learning_rate": 1.0075421026140198e-06, + "loss": 0.70083708, + "num_input_tokens_seen": 242454355, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.20947266, + "step": 11232, + "time_per_iteration": 2.9281342029571533 + }, + { + "auxiliary_loss_clip": 0.01411476, + "auxiliary_loss_mlp": 0.01033027, + "balance_loss_clip": 1.24823356, + "balance_loss_mlp": 1.01328576, + "epoch": 0.6753644972192996, + "flos": 21370091328000.0, + "grad_norm": 1.6148316312993396, + "language_loss": 0.73143458, + "learning_rate": 1.0072039940333188e-06, + "loss": 0.75587958, + "num_input_tokens_seen": 242474935, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.1973877, + "step": 11233, + "time_per_iteration": 2.8647220134735107 + }, + { + "auxiliary_loss_clip": 0.01434422, + "auxiliary_loss_mlp": 0.01034598, + "balance_loss_clip": 1.26741576, + "balance_loss_mlp": 1.0143559, + "epoch": 0.6754246204719675, + "flos": 26553225288960.0, + "grad_norm": 1.7509896986781204, + "language_loss": 0.77492702, + "learning_rate": 1.0068659230995418e-06, + "loss": 0.79961717, + "num_input_tokens_seen": 242495530, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.20227051, + "step": 11234, + "time_per_iteration": 4.352690696716309 + }, + { + "auxiliary_loss_clip": 0.01420271, + "auxiliary_loss_mlp": 0.01036692, + "balance_loss_clip": 1.25596452, + "balance_loss_mlp": 1.01696312, + "epoch": 0.6754847437246355, + "flos": 25567645651200.0, + "grad_norm": 2.412404365666908, + "language_loss": 0.76004773, + "learning_rate": 1.0065278898255101e-06, + "loss": 0.78461742, + "num_input_tokens_seen": 242514550, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19726562, + "step": 11235, + "time_per_iteration": 2.872056245803833 + }, + { + "auxiliary_loss_clip": 0.01184674, + "auxiliary_loss_mlp": 0.0102464, + "balance_loss_clip": 1.096071, + "balance_loss_mlp": 1.00384998, + "epoch": 0.6755448669773034, + "flos": 59539539398400.0, + "grad_norm": 0.7833881521914292, + "language_loss": 0.51474053, + "learning_rate": 1.0061898942240387e-06, + "loss": 0.5368337, + "num_input_tokens_seen": 242569200, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.20800781, + "step": 11236, + "time_per_iteration": 3.3188884258270264 + }, + { + "auxiliary_loss_clip": 0.0141174, + "auxiliary_loss_mlp": 0.01034469, + "balance_loss_clip": 1.24856281, + "balance_loss_mlp": 1.01451349, + "epoch": 0.6756049902299714, + "flos": 23304891767040.0, + "grad_norm": 2.1207822869406265, + "language_loss": 0.76327699, + "learning_rate": 1.0058519363079464e-06, + "loss": 0.78773904, + "num_input_tokens_seen": 242586950, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19946289, + "step": 11237, + "time_per_iteration": 2.854684829711914 + }, + { + "auxiliary_loss_clip": 0.01425802, + "auxiliary_loss_mlp": 0.01036903, + "balance_loss_clip": 1.26123214, + "balance_loss_mlp": 1.01722193, + "epoch": 0.6756651134826394, + "flos": 31587575788800.0, + "grad_norm": 1.5751279412275998, + "language_loss": 0.77369905, + "learning_rate": 1.0055140160900482e-06, + "loss": 0.79832608, + "num_input_tokens_seen": 242607380, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.19665527, + "step": 11238, + "time_per_iteration": 2.9037444591522217 + }, + { + "auxiliary_loss_clip": 0.01431533, + "auxiliary_loss_mlp": 0.01036266, + "balance_loss_clip": 1.26186264, + "balance_loss_mlp": 1.01590514, + "epoch": 0.6757252367353074, + "flos": 27283885868160.0, + "grad_norm": 2.0203373056200893, + "language_loss": 0.67585051, + "learning_rate": 1.0051761335831587e-06, + "loss": 0.7005285, + "num_input_tokens_seen": 242628025, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.20361328, + "step": 11239, + "time_per_iteration": 2.9243950843811035 + }, + { + "auxiliary_loss_clip": 0.01406469, + "auxiliary_loss_mlp": 0.01032235, + "balance_loss_clip": 1.24608469, + "balance_loss_mlp": 1.01373351, + "epoch": 0.6757853599879754, + "flos": 16838113553280.0, + "grad_norm": 1.9121899035772907, + "language_loss": 0.83539724, + "learning_rate": 1.0048382888000898e-06, + "loss": 0.85978431, + "num_input_tokens_seen": 242643825, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18505859, + "step": 11240, + "time_per_iteration": 2.8092992305755615 + }, + { + "auxiliary_loss_clip": 0.01445857, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.27341461, + "balance_loss_mlp": 1.01463604, + "epoch": 0.6758454832406433, + "flos": 23230002343680.0, + "grad_norm": 2.3485874225806853, + "language_loss": 0.7570582, + "learning_rate": 1.0045004817536525e-06, + "loss": 0.78186691, + "num_input_tokens_seen": 242661820, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.20361328, + "step": 11241, + "time_per_iteration": 2.825873374938965 + }, + { + "auxiliary_loss_clip": 0.01419395, + "auxiliary_loss_mlp": 0.01040986, + "balance_loss_clip": 1.2553283, + "balance_loss_mlp": 1.02104235, + "epoch": 0.6759056064933113, + "flos": 16298160382080.0, + "grad_norm": 2.970177172062965, + "language_loss": 0.81076372, + "learning_rate": 1.0041627124566572e-06, + "loss": 0.83536756, + "num_input_tokens_seen": 242679890, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19946289, + "step": 11242, + "time_per_iteration": 2.881956100463867 + }, + { + "auxiliary_loss_clip": 0.01424884, + "auxiliary_loss_mlp": 0.01035146, + "balance_loss_clip": 1.2588433, + "balance_loss_mlp": 1.01640606, + "epoch": 0.6759657297459792, + "flos": 25933541500800.0, + "grad_norm": 1.9179393313317445, + "language_loss": 0.72970343, + "learning_rate": 1.0038249809219109e-06, + "loss": 0.75430369, + "num_input_tokens_seen": 242699495, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.18737793, + "step": 11243, + "time_per_iteration": 2.8492431640625 + }, + { + "auxiliary_loss_clip": 0.01418868, + "auxiliary_loss_mlp": 0.01042146, + "balance_loss_clip": 1.25369167, + "balance_loss_mlp": 1.02244043, + "epoch": 0.6760258529986473, + "flos": 23010627715200.0, + "grad_norm": 1.9155133160127673, + "language_loss": 0.73504001, + "learning_rate": 1.003487287162221e-06, + "loss": 0.75965011, + "num_input_tokens_seen": 242719500, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19689941, + "step": 11244, + "time_per_iteration": 2.8308210372924805 + }, + { + "auxiliary_loss_clip": 0.01428323, + "auxiliary_loss_mlp": 0.01041663, + "balance_loss_clip": 1.2618525, + "balance_loss_mlp": 1.02175498, + "epoch": 0.6760859762513152, + "flos": 20969058251520.0, + "grad_norm": 3.532575152755126, + "language_loss": 0.86466956, + "learning_rate": 1.003149631190393e-06, + "loss": 0.88936937, + "num_input_tokens_seen": 242738325, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.19897461, + "step": 11245, + "time_per_iteration": 4.291662931442261 + }, + { + "auxiliary_loss_clip": 0.01445695, + "auxiliary_loss_mlp": 0.0104084, + "balance_loss_clip": 1.27391768, + "balance_loss_mlp": 1.02090788, + "epoch": 0.6761460995039832, + "flos": 23633388149760.0, + "grad_norm": 1.7049506634949008, + "language_loss": 0.73788965, + "learning_rate": 1.0028120130192327e-06, + "loss": 0.76275498, + "num_input_tokens_seen": 242756620, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.19946289, + "step": 11246, + "time_per_iteration": 4.291633129119873 + }, + { + "auxiliary_loss_clip": 0.01417204, + "auxiliary_loss_mlp": 0.01030165, + "balance_loss_clip": 1.25241327, + "balance_loss_mlp": 1.01093626, + "epoch": 0.6762062227566511, + "flos": 20778984270720.0, + "grad_norm": 1.691443914578042, + "language_loss": 0.88532627, + "learning_rate": 1.002474432661539e-06, + "loss": 0.90979993, + "num_input_tokens_seen": 242774505, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19213867, + "step": 11247, + "time_per_iteration": 2.8198928833007812 + }, + { + "auxiliary_loss_clip": 0.01189441, + "auxiliary_loss_mlp": 0.01040809, + "balance_loss_clip": 1.09864092, + "balance_loss_mlp": 1.0174439, + "epoch": 0.6762663460093191, + "flos": 52847296490880.0, + "grad_norm": 0.8179205197727337, + "language_loss": 0.54051077, + "learning_rate": 1.002136890130115e-06, + "loss": 0.56281328, + "num_input_tokens_seen": 242828645, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.23339844, + "step": 11248, + "time_per_iteration": 3.3671419620513916 + }, + { + "auxiliary_loss_clip": 0.01407438, + "auxiliary_loss_mlp": 0.01037734, + "balance_loss_clip": 1.24825287, + "balance_loss_mlp": 1.01897001, + "epoch": 0.676326469261987, + "flos": 23706874984320.0, + "grad_norm": 1.8698015779887829, + "language_loss": 0.74731672, + "learning_rate": 1.001799385437761e-06, + "loss": 0.77176839, + "num_input_tokens_seen": 242850100, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18786621, + "step": 11249, + "time_per_iteration": 4.355906963348389 + }, + { + "auxiliary_loss_clip": 0.01433759, + "auxiliary_loss_mlp": 0.01036197, + "balance_loss_clip": 1.26473391, + "balance_loss_mlp": 1.01622963, + "epoch": 0.676386592514655, + "flos": 14071720089600.0, + "grad_norm": 1.874217154506343, + "language_loss": 0.7533493, + "learning_rate": 1.0014619185972732e-06, + "loss": 0.77804887, + "num_input_tokens_seen": 242867775, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.19970703, + "step": 11250, + "time_per_iteration": 2.807476282119751 + }, + { + "auxiliary_loss_clip": 0.01423059, + "auxiliary_loss_mlp": 0.01038797, + "balance_loss_clip": 1.25720739, + "balance_loss_mlp": 1.01946092, + "epoch": 0.676446715767323, + "flos": 20421865912320.0, + "grad_norm": 2.2866520160158794, + "language_loss": 0.75870132, + "learning_rate": 1.0011244896214497e-06, + "loss": 0.78331989, + "num_input_tokens_seen": 242886865, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.19335938, + "step": 11251, + "time_per_iteration": 2.8696303367614746 + }, + { + "auxiliary_loss_clip": 0.01417089, + "auxiliary_loss_mlp": 0.0102999, + "balance_loss_clip": 1.25420523, + "balance_loss_mlp": 1.01036847, + "epoch": 0.676506839019991, + "flos": 21298188061440.0, + "grad_norm": 1.9198920694562533, + "language_loss": 0.70911598, + "learning_rate": 1.0007870985230873e-06, + "loss": 0.73358679, + "num_input_tokens_seen": 242906705, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19616699, + "step": 11252, + "time_per_iteration": 2.8438289165496826 + }, + { + "auxiliary_loss_clip": 0.01419404, + "auxiliary_loss_mlp": 0.01033298, + "balance_loss_clip": 1.25584233, + "balance_loss_mlp": 1.01458263, + "epoch": 0.676566962272659, + "flos": 29943917510400.0, + "grad_norm": 1.8048824894933624, + "language_loss": 0.67342532, + "learning_rate": 1.0004497453149765e-06, + "loss": 0.69795233, + "num_input_tokens_seen": 242925215, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.18725586, + "step": 11253, + "time_per_iteration": 2.9105162620544434 + }, + { + "auxiliary_loss_clip": 0.01431653, + "auxiliary_loss_mlp": 0.0103737, + "balance_loss_clip": 1.26293802, + "balance_loss_mlp": 1.01693738, + "epoch": 0.6766270855253269, + "flos": 17940189847680.0, + "grad_norm": 1.6998826911945832, + "language_loss": 0.77629697, + "learning_rate": 1.0001124300099115e-06, + "loss": 0.80098718, + "num_input_tokens_seen": 242944750, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.2043457, + "step": 11254, + "time_per_iteration": 2.8724215030670166 + }, + { + "auxiliary_loss_clip": 0.01430969, + "auxiliary_loss_mlp": 0.0103055, + "balance_loss_clip": 1.26363599, + "balance_loss_mlp": 1.01195312, + "epoch": 0.6766872087779949, + "flos": 23113279463040.0, + "grad_norm": 1.8736773460946687, + "language_loss": 0.72942448, + "learning_rate": 9.997751526206835e-07, + "loss": 0.75403965, + "num_input_tokens_seen": 242963860, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.18591309, + "step": 11255, + "time_per_iteration": 2.8525168895721436 + }, + { + "auxiliary_loss_clip": 0.01431815, + "auxiliary_loss_mlp": 0.01042239, + "balance_loss_clip": 1.26300859, + "balance_loss_mlp": 1.02284384, + "epoch": 0.6767473320306628, + "flos": 26224185968640.0, + "grad_norm": 2.364548215164128, + "language_loss": 0.76398909, + "learning_rate": 9.994379131600828e-07, + "loss": 0.78872967, + "num_input_tokens_seen": 242983050, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.1940918, + "step": 11256, + "time_per_iteration": 2.901160478591919 + }, + { + "auxiliary_loss_clip": 0.01414126, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.24965012, + "balance_loss_mlp": 1.01609063, + "epoch": 0.6768074552833309, + "flos": 18377762739840.0, + "grad_norm": 2.2596065018432254, + "language_loss": 0.66147494, + "learning_rate": 9.991007116408965e-07, + "loss": 0.68597448, + "num_input_tokens_seen": 243001125, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19714355, + "step": 11257, + "time_per_iteration": 2.8886899948120117 + }, + { + "auxiliary_loss_clip": 0.01404796, + "auxiliary_loss_mlp": 0.01036163, + "balance_loss_clip": 1.24260807, + "balance_loss_mlp": 1.01674402, + "epoch": 0.6768675785359988, + "flos": 23050425156480.0, + "grad_norm": 1.4335510845529764, + "language_loss": 0.76275277, + "learning_rate": 9.987635480759109e-07, + "loss": 0.7871623, + "num_input_tokens_seen": 243021865, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19421387, + "step": 11258, + "time_per_iteration": 2.836700677871704 + }, + { + "auxiliary_loss_clip": 0.01410011, + "auxiliary_loss_mlp": 0.01033055, + "balance_loss_clip": 1.24983335, + "balance_loss_mlp": 1.01388586, + "epoch": 0.6769277017886668, + "flos": 33049485129600.0, + "grad_norm": 1.6548869124540608, + "language_loss": 0.67584044, + "learning_rate": 9.984264224779127e-07, + "loss": 0.70027107, + "num_input_tokens_seen": 243042970, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19177246, + "step": 11259, + "time_per_iteration": 2.933305501937866 + }, + { + "auxiliary_loss_clip": 0.01408811, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.2444905, + "balance_loss_mlp": 1.0158658, + "epoch": 0.6769878250413347, + "flos": 20857719502080.0, + "grad_norm": 2.0513872845514602, + "language_loss": 0.8618716, + "learning_rate": 9.980893348596839e-07, + "loss": 0.88631701, + "num_input_tokens_seen": 243058470, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.1986084, + "step": 11260, + "time_per_iteration": 2.8243842124938965 + }, + { + "auxiliary_loss_clip": 0.01439266, + "auxiliary_loss_mlp": 0.010358, + "balance_loss_clip": 1.26843059, + "balance_loss_mlp": 1.01639295, + "epoch": 0.6770479482940027, + "flos": 15604311087360.0, + "grad_norm": 2.3538293372455614, + "language_loss": 0.77888483, + "learning_rate": 9.977522852340081e-07, + "loss": 0.80363554, + "num_input_tokens_seen": 243076630, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.19396973, + "step": 11261, + "time_per_iteration": 2.8419673442840576 + }, + { + "auxiliary_loss_clip": 0.01422731, + "auxiliary_loss_mlp": 0.01038474, + "balance_loss_clip": 1.25638485, + "balance_loss_mlp": 1.0176723, + "epoch": 0.6771080715466706, + "flos": 18629288438400.0, + "grad_norm": 2.003262749703773, + "language_loss": 0.88828522, + "learning_rate": 9.97415273613666e-07, + "loss": 0.91289723, + "num_input_tokens_seen": 243092260, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.20800781, + "step": 11262, + "time_per_iteration": 2.902888298034668 + }, + { + "auxiliary_loss_clip": 0.01425626, + "auxiliary_loss_mlp": 0.01032566, + "balance_loss_clip": 1.25867224, + "balance_loss_mlp": 1.01327753, + "epoch": 0.6771681947993387, + "flos": 12503177458560.0, + "grad_norm": 2.147705495014056, + "language_loss": 0.74895895, + "learning_rate": 9.97078300011439e-07, + "loss": 0.77354091, + "num_input_tokens_seen": 243109405, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.19287109, + "step": 11263, + "time_per_iteration": 2.818920373916626 + }, + { + "auxiliary_loss_clip": 0.01436703, + "auxiliary_loss_mlp": 0.01040618, + "balance_loss_clip": 1.26639748, + "balance_loss_mlp": 1.02010179, + "epoch": 0.6772283180520066, + "flos": 22247182638720.0, + "grad_norm": 1.9039323514292321, + "language_loss": 0.68283838, + "learning_rate": 9.967413644401016e-07, + "loss": 0.70761162, + "num_input_tokens_seen": 243128135, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.20532227, + "step": 11264, + "time_per_iteration": 2.8019471168518066 + }, + { + "auxiliary_loss_clip": 0.01404884, + "auxiliary_loss_mlp": 0.0103407, + "balance_loss_clip": 1.24166894, + "balance_loss_mlp": 1.0135541, + "epoch": 0.6772884413046746, + "flos": 16151593916160.0, + "grad_norm": 1.9360543372571062, + "language_loss": 0.74073637, + "learning_rate": 9.964044669124324e-07, + "loss": 0.76512593, + "num_input_tokens_seen": 243146785, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.20544434, + "step": 11265, + "time_per_iteration": 2.82668137550354 + }, + { + "auxiliary_loss_clip": 0.01415711, + "auxiliary_loss_mlp": 0.01034104, + "balance_loss_clip": 1.25258005, + "balance_loss_mlp": 1.01420784, + "epoch": 0.6773485645573426, + "flos": 19145189358720.0, + "grad_norm": 1.489109419946897, + "language_loss": 0.62357032, + "learning_rate": 9.96067607441207e-07, + "loss": 0.64806849, + "num_input_tokens_seen": 243165275, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19873047, + "step": 11266, + "time_per_iteration": 2.8105502128601074 + }, + { + "auxiliary_loss_clip": 0.01412707, + "auxiliary_loss_mlp": 0.01032067, + "balance_loss_clip": 1.24838138, + "balance_loss_mlp": 1.01237392, + "epoch": 0.6774086878100105, + "flos": 14144980700160.0, + "grad_norm": 1.7634799577808051, + "language_loss": 0.71933043, + "learning_rate": 9.957307860391976e-07, + "loss": 0.74377823, + "num_input_tokens_seen": 243182845, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.19702148, + "step": 11267, + "time_per_iteration": 2.834014415740967 + }, + { + "auxiliary_loss_clip": 0.01414252, + "auxiliary_loss_mlp": 0.01031803, + "balance_loss_clip": 1.24974597, + "balance_loss_mlp": 1.01238418, + "epoch": 0.6774688110626785, + "flos": 22206389811840.0, + "grad_norm": 2.457054654739568, + "language_loss": 0.71318364, + "learning_rate": 9.953940027191785e-07, + "loss": 0.7376442, + "num_input_tokens_seen": 243201475, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.19421387, + "step": 11268, + "time_per_iteration": 2.936436414718628 + }, + { + "auxiliary_loss_clip": 0.0142622, + "auxiliary_loss_mlp": 0.01031883, + "balance_loss_clip": 1.26145279, + "balance_loss_mlp": 1.01237988, + "epoch": 0.6775289343153464, + "flos": 23050470401280.0, + "grad_norm": 2.648316782781714, + "language_loss": 0.77631009, + "learning_rate": 9.950572574939194e-07, + "loss": 0.80089104, + "num_input_tokens_seen": 243221850, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19494629, + "step": 11269, + "time_per_iteration": 4.366933584213257 + }, + { + "auxiliary_loss_clip": 0.01426562, + "auxiliary_loss_mlp": 0.01039, + "balance_loss_clip": 1.25877237, + "balance_loss_mlp": 1.0179714, + "epoch": 0.6775890575680145, + "flos": 18301923175680.0, + "grad_norm": 2.179088861607088, + "language_loss": 0.7477001, + "learning_rate": 9.94720550376189e-07, + "loss": 0.77235574, + "num_input_tokens_seen": 243239855, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.21020508, + "step": 11270, + "time_per_iteration": 2.8500144481658936 + }, + { + "auxiliary_loss_clip": 0.01416498, + "auxiliary_loss_mlp": 0.01034806, + "balance_loss_clip": 1.25231647, + "balance_loss_mlp": 1.0136826, + "epoch": 0.6776491808206824, + "flos": 25347004168320.0, + "grad_norm": 1.6856467433249251, + "language_loss": 0.73650515, + "learning_rate": 9.94383881378756e-07, + "loss": 0.76101828, + "num_input_tokens_seen": 243260085, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.21130371, + "step": 11271, + "time_per_iteration": 2.848832368850708 + }, + { + "auxiliary_loss_clip": 0.01426492, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.26029384, + "balance_loss_mlp": 1.01294637, + "epoch": 0.6777093040733504, + "flos": 26038591223040.0, + "grad_norm": 2.542409789613877, + "language_loss": 0.68876714, + "learning_rate": 9.94047250514387e-07, + "loss": 0.71335548, + "num_input_tokens_seen": 243280065, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.1940918, + "step": 11272, + "time_per_iteration": 2.868018865585327 + }, + { + "auxiliary_loss_clip": 0.0143334, + "auxiliary_loss_mlp": 0.01038677, + "balance_loss_clip": 1.26392078, + "balance_loss_mlp": 1.01682615, + "epoch": 0.6777694273260183, + "flos": 18012455072640.0, + "grad_norm": 1.8400628996928945, + "language_loss": 0.74637687, + "learning_rate": 9.937106577958481e-07, + "loss": 0.77109706, + "num_input_tokens_seen": 243297775, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.21862793, + "step": 11273, + "time_per_iteration": 2.8254570960998535 + }, + { + "auxiliary_loss_clip": 0.01408234, + "auxiliary_loss_mlp": 0.01045717, + "balance_loss_clip": 1.24513459, + "balance_loss_mlp": 1.02438998, + "epoch": 0.6778295505786863, + "flos": 23451639212160.0, + "grad_norm": 2.275979459116119, + "language_loss": 0.70764017, + "learning_rate": 9.933741032359015e-07, + "loss": 0.73217964, + "num_input_tokens_seen": 243315760, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.21325684, + "step": 11274, + "time_per_iteration": 2.8286259174346924 + }, + { + "auxiliary_loss_clip": 0.01424382, + "auxiliary_loss_mlp": 0.01032593, + "balance_loss_clip": 1.25731289, + "balance_loss_mlp": 1.01226783, + "epoch": 0.6778896738313542, + "flos": 19107608912640.0, + "grad_norm": 1.5338748678282261, + "language_loss": 0.66405052, + "learning_rate": 9.930375868473093e-07, + "loss": 0.68862027, + "num_input_tokens_seen": 243335715, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.203125, + "step": 11275, + "time_per_iteration": 2.850635528564453 + }, + { + "auxiliary_loss_clip": 0.01424107, + "auxiliary_loss_mlp": 0.0103493, + "balance_loss_clip": 1.25921357, + "balance_loss_mlp": 1.01508141, + "epoch": 0.6779497970840223, + "flos": 26115018969600.0, + "grad_norm": 2.1696697171242723, + "language_loss": 0.73176479, + "learning_rate": 9.927011086428335e-07, + "loss": 0.75635523, + "num_input_tokens_seen": 243356935, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.1986084, + "step": 11276, + "time_per_iteration": 2.8668644428253174 + }, + { + "auxiliary_loss_clip": 0.01410471, + "auxiliary_loss_mlp": 0.010328, + "balance_loss_clip": 1.24869335, + "balance_loss_mlp": 1.01317835, + "epoch": 0.6780099203366902, + "flos": 19728740534400.0, + "grad_norm": 1.8867708060739383, + "language_loss": 0.7746889, + "learning_rate": 9.923646686352317e-07, + "loss": 0.79912162, + "num_input_tokens_seen": 243375625, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19628906, + "step": 11277, + "time_per_iteration": 2.8433282375335693 + }, + { + "auxiliary_loss_clip": 0.01433158, + "auxiliary_loss_mlp": 0.01037282, + "balance_loss_clip": 1.26425588, + "balance_loss_mlp": 1.01720738, + "epoch": 0.6780700435893582, + "flos": 18221378152320.0, + "grad_norm": 2.507708213798488, + "language_loss": 0.84744966, + "learning_rate": 9.920282668372627e-07, + "loss": 0.872154, + "num_input_tokens_seen": 243390195, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.20068359, + "step": 11278, + "time_per_iteration": 2.789520025253296 + }, + { + "auxiliary_loss_clip": 0.01401856, + "auxiliary_loss_mlp": 0.01031372, + "balance_loss_clip": 1.24120605, + "balance_loss_mlp": 1.01204848, + "epoch": 0.6781301668420262, + "flos": 25387661260800.0, + "grad_norm": 2.246124082331041, + "language_loss": 0.70408392, + "learning_rate": 9.916919032616844e-07, + "loss": 0.7284162, + "num_input_tokens_seen": 243411690, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19335938, + "step": 11279, + "time_per_iteration": 2.861539602279663 + }, + { + "auxiliary_loss_clip": 0.01416446, + "auxiliary_loss_mlp": 0.01034813, + "balance_loss_clip": 1.25167966, + "balance_loss_mlp": 1.01421368, + "epoch": 0.6781902900946941, + "flos": 24029987235840.0, + "grad_norm": 1.9754614764232779, + "language_loss": 0.74934232, + "learning_rate": 9.913555779212485e-07, + "loss": 0.77385497, + "num_input_tokens_seen": 243430280, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.20605469, + "step": 11280, + "time_per_iteration": 4.315319776535034 + }, + { + "auxiliary_loss_clip": 0.01424775, + "auxiliary_loss_mlp": 0.01032699, + "balance_loss_clip": 1.25485694, + "balance_loss_mlp": 1.01193249, + "epoch": 0.6782504133473621, + "flos": 19656384819840.0, + "grad_norm": 1.8023470798250691, + "language_loss": 0.71094191, + "learning_rate": 9.910192908287104e-07, + "loss": 0.73551673, + "num_input_tokens_seen": 243448690, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.20751953, + "step": 11281, + "time_per_iteration": 2.882263660430908 + }, + { + "auxiliary_loss_clip": 0.01416484, + "auxiliary_loss_mlp": 0.01035797, + "balance_loss_clip": 1.25445724, + "balance_loss_mlp": 1.01541185, + "epoch": 0.67831053660003, + "flos": 24942577731840.0, + "grad_norm": 1.4810746510706485, + "language_loss": 0.64675957, + "learning_rate": 9.906830419968217e-07, + "loss": 0.67128235, + "num_input_tokens_seen": 243470695, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.20397949, + "step": 11282, + "time_per_iteration": 4.3315064907073975 + }, + { + "auxiliary_loss_clip": 0.01432217, + "auxiliary_loss_mlp": 0.01040536, + "balance_loss_clip": 1.26153445, + "balance_loss_mlp": 1.01984131, + "epoch": 0.6783706598526981, + "flos": 31219870147200.0, + "grad_norm": 1.7792129745308065, + "language_loss": 0.75265372, + "learning_rate": 9.90346831438334e-07, + "loss": 0.77738118, + "num_input_tokens_seen": 243493345, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.20690918, + "step": 11283, + "time_per_iteration": 2.901803970336914 + }, + { + "auxiliary_loss_clip": 0.01417343, + "auxiliary_loss_mlp": 0.01031262, + "balance_loss_clip": 1.25205636, + "balance_loss_mlp": 1.01187837, + "epoch": 0.678430783105366, + "flos": 35454054775680.0, + "grad_norm": 1.5782463981378694, + "language_loss": 0.574682, + "learning_rate": 9.900106591659948e-07, + "loss": 0.59916806, + "num_input_tokens_seen": 243515670, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19396973, + "step": 11284, + "time_per_iteration": 4.358064651489258 + }, + { + "auxiliary_loss_clip": 0.01416248, + "auxiliary_loss_mlp": 0.01033405, + "balance_loss_clip": 1.25118554, + "balance_loss_mlp": 1.01325858, + "epoch": 0.678490906358034, + "flos": 14436937267200.0, + "grad_norm": 2.1218315328407322, + "language_loss": 0.76242232, + "learning_rate": 9.896745251925535e-07, + "loss": 0.78691888, + "num_input_tokens_seen": 243533625, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.20141602, + "step": 11285, + "time_per_iteration": 2.8184762001037598 + }, + { + "auxiliary_loss_clip": 0.01402456, + "auxiliary_loss_mlp": 0.01036449, + "balance_loss_clip": 1.24122655, + "balance_loss_mlp": 1.0160048, + "epoch": 0.6785510296107019, + "flos": 24320586458880.0, + "grad_norm": 1.69964895994125, + "language_loss": 0.66680771, + "learning_rate": 9.893384295307557e-07, + "loss": 0.6911968, + "num_input_tokens_seen": 243553040, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.20446777, + "step": 11286, + "time_per_iteration": 2.874673843383789 + }, + { + "auxiliary_loss_clip": 0.01413247, + "auxiliary_loss_mlp": 0.01030331, + "balance_loss_clip": 1.2466743, + "balance_loss_mlp": 1.01006532, + "epoch": 0.6786111528633699, + "flos": 26987947758720.0, + "grad_norm": 2.31654858611157, + "language_loss": 0.53541523, + "learning_rate": 9.890023721933447e-07, + "loss": 0.55985099, + "num_input_tokens_seen": 243572590, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.20263672, + "step": 11287, + "time_per_iteration": 2.8719193935394287 + }, + { + "auxiliary_loss_clip": 0.01410818, + "auxiliary_loss_mlp": 0.01036149, + "balance_loss_clip": 1.24626613, + "balance_loss_mlp": 1.01599073, + "epoch": 0.6786712761160378, + "flos": 24328549543680.0, + "grad_norm": 1.4435068111251812, + "language_loss": 0.776425, + "learning_rate": 9.886663531930655e-07, + "loss": 0.80089462, + "num_input_tokens_seen": 243594140, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20153809, + "step": 11288, + "time_per_iteration": 2.887211322784424 + }, + { + "auxiliary_loss_clip": 0.01423731, + "auxiliary_loss_mlp": 0.01041052, + "balance_loss_clip": 1.25886631, + "balance_loss_mlp": 1.02122712, + "epoch": 0.6787313993687059, + "flos": 22940896199040.0, + "grad_norm": 2.6862971389687655, + "language_loss": 0.74131191, + "learning_rate": 9.883303725426593e-07, + "loss": 0.76595974, + "num_input_tokens_seen": 243615170, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.19824219, + "step": 11289, + "time_per_iteration": 2.8789894580841064 + }, + { + "auxiliary_loss_clip": 0.01412804, + "auxiliary_loss_mlp": 0.01035915, + "balance_loss_clip": 1.24759984, + "balance_loss_mlp": 1.01632881, + "epoch": 0.6787915226213738, + "flos": 26879278452480.0, + "grad_norm": 1.5116264527097434, + "language_loss": 0.80383253, + "learning_rate": 9.879944302548682e-07, + "loss": 0.82831973, + "num_input_tokens_seen": 243635675, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19580078, + "step": 11290, + "time_per_iteration": 2.867624521255493 + }, + { + "auxiliary_loss_clip": 0.01401587, + "auxiliary_loss_mlp": 0.01032726, + "balance_loss_clip": 1.23988724, + "balance_loss_mlp": 1.01275802, + "epoch": 0.6788516458740418, + "flos": 20017846679040.0, + "grad_norm": 1.5201554448099261, + "language_loss": 0.75504929, + "learning_rate": 9.87658526342428e-07, + "loss": 0.77939236, + "num_input_tokens_seen": 243654950, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19970703, + "step": 11291, + "time_per_iteration": 2.8425180912017822 + }, + { + "auxiliary_loss_clip": 0.0141025, + "auxiliary_loss_mlp": 0.01034892, + "balance_loss_clip": 1.24379349, + "balance_loss_mlp": 1.01453137, + "epoch": 0.6789117691267098, + "flos": 28737877368960.0, + "grad_norm": 1.8344090428464768, + "language_loss": 0.76031041, + "learning_rate": 9.873226608180785e-07, + "loss": 0.78476191, + "num_input_tokens_seen": 243674970, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.20349121, + "step": 11292, + "time_per_iteration": 2.9313571453094482 + }, + { + "auxiliary_loss_clip": 0.01409303, + "auxiliary_loss_mlp": 0.01033104, + "balance_loss_clip": 1.24418664, + "balance_loss_mlp": 1.01317191, + "epoch": 0.6789718923793777, + "flos": 23413696807680.0, + "grad_norm": 5.073850271019591, + "language_loss": 0.85112441, + "learning_rate": 9.869868336945556e-07, + "loss": 0.87554848, + "num_input_tokens_seen": 243693440, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19946289, + "step": 11293, + "time_per_iteration": 2.9133214950561523 + }, + { + "auxiliary_loss_clip": 0.01440673, + "auxiliary_loss_mlp": 0.01036128, + "balance_loss_clip": 1.26857722, + "balance_loss_mlp": 1.01557612, + "epoch": 0.6790320156320457, + "flos": 20458541462400.0, + "grad_norm": 2.6195683866197585, + "language_loss": 0.80698299, + "learning_rate": 9.866510449845929e-07, + "loss": 0.83175099, + "num_input_tokens_seen": 243710055, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.20544434, + "step": 11294, + "time_per_iteration": 2.840186595916748 + }, + { + "auxiliary_loss_clip": 0.01415216, + "auxiliary_loss_mlp": 0.0102968, + "balance_loss_clip": 1.25090504, + "balance_loss_mlp": 1.01057029, + "epoch": 0.6790921388847136, + "flos": 24177006149760.0, + "grad_norm": 1.7551487779009975, + "language_loss": 0.79491174, + "learning_rate": 9.86315294700924e-07, + "loss": 0.81936073, + "num_input_tokens_seen": 243728635, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.19104004, + "step": 11295, + "time_per_iteration": 2.8824830055236816 + }, + { + "auxiliary_loss_clip": 0.0139699, + "auxiliary_loss_mlp": 0.01033543, + "balance_loss_clip": 1.23799169, + "balance_loss_mlp": 1.01463664, + "epoch": 0.6791522621373817, + "flos": 21918098073600.0, + "grad_norm": 1.7527298955205517, + "language_loss": 0.71848845, + "learning_rate": 9.859795828562823e-07, + "loss": 0.7427938, + "num_input_tokens_seen": 243748330, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18896484, + "step": 11296, + "time_per_iteration": 2.8827760219573975 + }, + { + "auxiliary_loss_clip": 0.01419055, + "auxiliary_loss_mlp": 0.01031683, + "balance_loss_clip": 1.25424206, + "balance_loss_mlp": 1.01158476, + "epoch": 0.6792123853900496, + "flos": 24837166051200.0, + "grad_norm": 4.1947304036601425, + "language_loss": 0.70672762, + "learning_rate": 9.856439094633949e-07, + "loss": 0.73123503, + "num_input_tokens_seen": 243769380, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.20092773, + "step": 11297, + "time_per_iteration": 2.88236927986145 + }, + { + "auxiliary_loss_clip": 0.01420344, + "auxiliary_loss_mlp": 0.01033465, + "balance_loss_clip": 1.25125313, + "balance_loss_mlp": 1.01342642, + "epoch": 0.6792725086427176, + "flos": 17575153649280.0, + "grad_norm": 2.5828254571242244, + "language_loss": 0.67163193, + "learning_rate": 9.853082745349918e-07, + "loss": 0.69617009, + "num_input_tokens_seen": 243785510, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.20043945, + "step": 11298, + "time_per_iteration": 2.797438621520996 + }, + { + "auxiliary_loss_clip": 0.01420188, + "auxiliary_loss_mlp": 0.01034472, + "balance_loss_clip": 1.25458765, + "balance_loss_mlp": 1.01562512, + "epoch": 0.6793326318953855, + "flos": 26952810531840.0, + "grad_norm": 1.6536026558085015, + "language_loss": 0.72178936, + "learning_rate": 9.84972678083801e-07, + "loss": 0.74633586, + "num_input_tokens_seen": 243805545, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.18847656, + "step": 11299, + "time_per_iteration": 2.8717901706695557 + }, + { + "auxiliary_loss_clip": 0.01414142, + "auxiliary_loss_mlp": 0.01039952, + "balance_loss_clip": 1.2487762, + "balance_loss_mlp": 1.01810062, + "epoch": 0.6793927551480535, + "flos": 24329454439680.0, + "grad_norm": 1.2905766205020102, + "language_loss": 0.77893174, + "learning_rate": 9.846371201225488e-07, + "loss": 0.80347264, + "num_input_tokens_seen": 243825185, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.21862793, + "step": 11300, + "time_per_iteration": 2.8312931060791016 + }, + { + "auxiliary_loss_clip": 0.01408104, + "auxiliary_loss_mlp": 0.01040589, + "balance_loss_clip": 1.24368, + "balance_loss_mlp": 1.02112222, + "epoch": 0.6794528784007214, + "flos": 11443884762240.0, + "grad_norm": 2.290388361300239, + "language_loss": 0.64418095, + "learning_rate": 9.843016006639577e-07, + "loss": 0.66866791, + "num_input_tokens_seen": 243841600, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19470215, + "step": 11301, + "time_per_iteration": 2.7834506034851074 + }, + { + "auxiliary_loss_clip": 0.01400104, + "auxiliary_loss_mlp": 0.0103946, + "balance_loss_clip": 1.23513389, + "balance_loss_mlp": 1.01909947, + "epoch": 0.6795130016533895, + "flos": 25240913815680.0, + "grad_norm": 1.6460007763224784, + "language_loss": 0.8314954, + "learning_rate": 9.839661197207525e-07, + "loss": 0.85589105, + "num_input_tokens_seen": 243862250, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.20361328, + "step": 11302, + "time_per_iteration": 2.8881211280822754 + }, + { + "auxiliary_loss_clip": 0.01406566, + "auxiliary_loss_mlp": 0.01038712, + "balance_loss_clip": 1.241009, + "balance_loss_mlp": 1.0186013, + "epoch": 0.6795731249060574, + "flos": 18305588004480.0, + "grad_norm": 2.0350837504847443, + "language_loss": 0.70770973, + "learning_rate": 9.83630677305654e-07, + "loss": 0.73216254, + "num_input_tokens_seen": 243880560, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.2010498, + "step": 11303, + "time_per_iteration": 2.8204898834228516 + }, + { + "auxiliary_loss_clip": 0.01429952, + "auxiliary_loss_mlp": 0.01038269, + "balance_loss_clip": 1.26030445, + "balance_loss_mlp": 1.01828992, + "epoch": 0.6796332481587254, + "flos": 20309350798080.0, + "grad_norm": 2.0066774561287226, + "language_loss": 0.71171033, + "learning_rate": 9.832952734313813e-07, + "loss": 0.7363925, + "num_input_tokens_seen": 243900635, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.1998291, + "step": 11304, + "time_per_iteration": 4.337868928909302 + }, + { + "auxiliary_loss_clip": 0.0142428, + "auxiliary_loss_mlp": 0.01036077, + "balance_loss_clip": 1.25822258, + "balance_loss_mlp": 1.01594305, + "epoch": 0.6796933714113934, + "flos": 23597391271680.0, + "grad_norm": 2.1984984510686476, + "language_loss": 0.7349211, + "learning_rate": 9.829599081106536e-07, + "loss": 0.75952458, + "num_input_tokens_seen": 243920160, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.20129395, + "step": 11305, + "time_per_iteration": 3.025583267211914 + }, + { + "auxiliary_loss_clip": 0.01422252, + "auxiliary_loss_mlp": 0.01037169, + "balance_loss_clip": 1.25661349, + "balance_loss_mlp": 1.01727271, + "epoch": 0.6797534946640613, + "flos": 27129808765440.0, + "grad_norm": 4.476429731091663, + "language_loss": 0.66965985, + "learning_rate": 9.826245813561882e-07, + "loss": 0.69425404, + "num_input_tokens_seen": 243939015, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.19897461, + "step": 11306, + "time_per_iteration": 2.9169840812683105 + }, + { + "auxiliary_loss_clip": 0.01418554, + "auxiliary_loss_mlp": 0.01037385, + "balance_loss_clip": 1.2548852, + "balance_loss_mlp": 1.01732218, + "epoch": 0.6798136179167293, + "flos": 22137608436480.0, + "grad_norm": 1.7728283200996082, + "language_loss": 0.8078481, + "learning_rate": 9.822892931807021e-07, + "loss": 0.83240747, + "num_input_tokens_seen": 243958470, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.20068359, + "step": 11307, + "time_per_iteration": 2.8640127182006836 + }, + { + "auxiliary_loss_clip": 0.01411669, + "auxiliary_loss_mlp": 0.01041282, + "balance_loss_clip": 1.24837208, + "balance_loss_mlp": 1.02114761, + "epoch": 0.6798737411693972, + "flos": 17496282683520.0, + "grad_norm": 1.5135174022101265, + "language_loss": 0.89179122, + "learning_rate": 9.819540435969066e-07, + "loss": 0.91632068, + "num_input_tokens_seen": 243975450, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.20117188, + "step": 11308, + "time_per_iteration": 2.835096597671509 + }, + { + "auxiliary_loss_clip": 0.01418785, + "auxiliary_loss_mlp": 0.01039877, + "balance_loss_clip": 1.25387931, + "balance_loss_mlp": 1.02062416, + "epoch": 0.6799338644220653, + "flos": 22902275122560.0, + "grad_norm": 1.8940799923383618, + "language_loss": 0.72461164, + "learning_rate": 9.816188326175154e-07, + "loss": 0.74919826, + "num_input_tokens_seen": 243994355, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19250488, + "step": 11309, + "time_per_iteration": 2.866344928741455 + }, + { + "auxiliary_loss_clip": 0.01413669, + "auxiliary_loss_mlp": 0.01045021, + "balance_loss_clip": 1.25029159, + "balance_loss_mlp": 1.02519643, + "epoch": 0.6799939876747332, + "flos": 23189526230400.0, + "grad_norm": 1.8218838367208725, + "language_loss": 0.85491765, + "learning_rate": 9.812836602552411e-07, + "loss": 0.87950456, + "num_input_tokens_seen": 244011620, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19824219, + "step": 11310, + "time_per_iteration": 2.8506929874420166 + }, + { + "auxiliary_loss_clip": 0.01405995, + "auxiliary_loss_mlp": 0.01041056, + "balance_loss_clip": 1.24520338, + "balance_loss_mlp": 1.02180409, + "epoch": 0.6800541109274012, + "flos": 19509275416320.0, + "grad_norm": 1.990362932452939, + "language_loss": 0.8334223, + "learning_rate": 9.80948526522792e-07, + "loss": 0.85789287, + "num_input_tokens_seen": 244029925, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.19262695, + "step": 11311, + "time_per_iteration": 2.9016757011413574 + }, + { + "auxiliary_loss_clip": 0.01427994, + "auxiliary_loss_mlp": 0.01042755, + "balance_loss_clip": 1.25593483, + "balance_loss_mlp": 1.02104735, + "epoch": 0.6801142341800691, + "flos": 22288970851200.0, + "grad_norm": 1.5600797293940212, + "language_loss": 0.7687189, + "learning_rate": 9.806134314328767e-07, + "loss": 0.79342639, + "num_input_tokens_seen": 244051225, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.21716309, + "step": 11312, + "time_per_iteration": 2.879802703857422 + }, + { + "auxiliary_loss_clip": 0.01192916, + "auxiliary_loss_mlp": 0.01047497, + "balance_loss_clip": 1.10115099, + "balance_loss_mlp": 1.02756488, + "epoch": 0.6801743574327371, + "flos": 68745265464960.0, + "grad_norm": 0.6716363526172427, + "language_loss": 0.57251465, + "learning_rate": 9.802783749982038e-07, + "loss": 0.59491879, + "num_input_tokens_seen": 244115930, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.19921875, + "step": 11313, + "time_per_iteration": 3.5011777877807617 + }, + { + "auxiliary_loss_clip": 0.01417921, + "auxiliary_loss_mlp": 0.01034992, + "balance_loss_clip": 1.25159705, + "balance_loss_mlp": 1.01581168, + "epoch": 0.680234480685405, + "flos": 29472383756160.0, + "grad_norm": 1.8200620466883983, + "language_loss": 0.69567293, + "learning_rate": 9.799433572314754e-07, + "loss": 0.72020209, + "num_input_tokens_seen": 244137320, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.19189453, + "step": 11314, + "time_per_iteration": 2.9185609817504883 + }, + { + "auxiliary_loss_clip": 0.01402121, + "auxiliary_loss_mlp": 0.01033399, + "balance_loss_clip": 1.2393415, + "balance_loss_mlp": 1.01469517, + "epoch": 0.6802946039380731, + "flos": 15923351306880.0, + "grad_norm": 1.8537638267312848, + "language_loss": 0.81952071, + "learning_rate": 9.796083781453972e-07, + "loss": 0.84387589, + "num_input_tokens_seen": 244152755, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.18701172, + "step": 11315, + "time_per_iteration": 4.370807886123657 + }, + { + "auxiliary_loss_clip": 0.01409394, + "auxiliary_loss_mlp": 0.01033196, + "balance_loss_clip": 1.24407387, + "balance_loss_mlp": 1.01318073, + "epoch": 0.680354727190741, + "flos": 22028984375040.0, + "grad_norm": 1.5204901192791644, + "language_loss": 0.70376772, + "learning_rate": 9.792734377526718e-07, + "loss": 0.72819364, + "num_input_tokens_seen": 244171480, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.20019531, + "step": 11316, + "time_per_iteration": 4.3096232414245605 + }, + { + "auxiliary_loss_clip": 0.01417803, + "auxiliary_loss_mlp": 0.01036457, + "balance_loss_clip": 1.25259864, + "balance_loss_mlp": 1.01701427, + "epoch": 0.680414850443409, + "flos": 18450480412800.0, + "grad_norm": 1.9107274499295817, + "language_loss": 0.67876339, + "learning_rate": 9.789385360660003e-07, + "loss": 0.70330596, + "num_input_tokens_seen": 244187920, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19445801, + "step": 11317, + "time_per_iteration": 2.812840700149536 + }, + { + "auxiliary_loss_clip": 0.01423562, + "auxiliary_loss_mlp": 0.0103471, + "balance_loss_clip": 1.25664115, + "balance_loss_mlp": 1.01599371, + "epoch": 0.680474973696077, + "flos": 26369576069760.0, + "grad_norm": 1.5373803792298903, + "language_loss": 0.75489831, + "learning_rate": 9.78603673098082e-07, + "loss": 0.77948105, + "num_input_tokens_seen": 244209565, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.18713379, + "step": 11318, + "time_per_iteration": 2.8647563457489014 + }, + { + "auxiliary_loss_clip": 0.01402003, + "auxiliary_loss_mlp": 0.01034891, + "balance_loss_clip": 1.24110436, + "balance_loss_mlp": 1.01616287, + "epoch": 0.6805350969487449, + "flos": 18342354044160.0, + "grad_norm": 1.6791396269098764, + "language_loss": 0.69236344, + "learning_rate": 9.782688488616143e-07, + "loss": 0.71673238, + "num_input_tokens_seen": 244228015, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18713379, + "step": 11319, + "time_per_iteration": 4.2774646282196045 + }, + { + "auxiliary_loss_clip": 0.01399259, + "auxiliary_loss_mlp": 0.01036792, + "balance_loss_clip": 1.23784161, + "balance_loss_mlp": 1.01719356, + "epoch": 0.6805952202014129, + "flos": 19946893553280.0, + "grad_norm": 1.7308414052713077, + "language_loss": 0.77625227, + "learning_rate": 9.779340633692945e-07, + "loss": 0.80061281, + "num_input_tokens_seen": 244245615, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19592285, + "step": 11320, + "time_per_iteration": 2.880239486694336 + }, + { + "auxiliary_loss_clip": 0.01417559, + "auxiliary_loss_mlp": 0.0103337, + "balance_loss_clip": 1.25336313, + "balance_loss_mlp": 1.01290202, + "epoch": 0.6806553434540809, + "flos": 25234127095680.0, + "grad_norm": 4.747781951337733, + "language_loss": 0.75302511, + "learning_rate": 9.77599316633817e-07, + "loss": 0.77753437, + "num_input_tokens_seen": 244263625, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.20446777, + "step": 11321, + "time_per_iteration": 2.8584327697753906 + }, + { + "auxiliary_loss_clip": 0.01427436, + "auxiliary_loss_mlp": 0.01038421, + "balance_loss_clip": 1.2621448, + "balance_loss_mlp": 1.01882339, + "epoch": 0.6807154667067489, + "flos": 17794618767360.0, + "grad_norm": 2.962288114201844, + "language_loss": 0.73654282, + "learning_rate": 9.772646086678758e-07, + "loss": 0.76120132, + "num_input_tokens_seen": 244282745, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19616699, + "step": 11322, + "time_per_iteration": 2.8288662433624268 + }, + { + "auxiliary_loss_clip": 0.01417954, + "auxiliary_loss_mlp": 0.01032686, + "balance_loss_clip": 1.25216615, + "balance_loss_mlp": 1.01296902, + "epoch": 0.6807755899594168, + "flos": 22209964151040.0, + "grad_norm": 1.7395559834495082, + "language_loss": 0.79751027, + "learning_rate": 9.769299394841638e-07, + "loss": 0.82201672, + "num_input_tokens_seen": 244303770, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.19714355, + "step": 11323, + "time_per_iteration": 2.848283052444458 + }, + { + "auxiliary_loss_clip": 0.01188626, + "auxiliary_loss_mlp": 0.0102797, + "balance_loss_clip": 1.09888208, + "balance_loss_mlp": 1.00679862, + "epoch": 0.6808357132120848, + "flos": 68658702883200.0, + "grad_norm": 0.7501361763494518, + "language_loss": 0.57144701, + "learning_rate": 9.765953090953714e-07, + "loss": 0.59361291, + "num_input_tokens_seen": 244355910, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.21191406, + "step": 11324, + "time_per_iteration": 3.137296438217163 + }, + { + "auxiliary_loss_clip": 0.01416895, + "auxiliary_loss_mlp": 0.01037566, + "balance_loss_clip": 1.25171161, + "balance_loss_mlp": 1.01801586, + "epoch": 0.6808958364647527, + "flos": 23854255856640.0, + "grad_norm": 1.7617262681605776, + "language_loss": 0.69228351, + "learning_rate": 9.76260717514186e-07, + "loss": 0.71682811, + "num_input_tokens_seen": 244376610, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19543457, + "step": 11325, + "time_per_iteration": 2.880560874938965 + }, + { + "auxiliary_loss_clip": 0.01420184, + "auxiliary_loss_mlp": 0.01033362, + "balance_loss_clip": 1.25124097, + "balance_loss_mlp": 1.01261997, + "epoch": 0.6809559597174207, + "flos": 17720498505600.0, + "grad_norm": 2.5098364653162775, + "language_loss": 0.72076178, + "learning_rate": 9.759261647532974e-07, + "loss": 0.74529719, + "num_input_tokens_seen": 244393000, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.20739746, + "step": 11326, + "time_per_iteration": 2.8082377910614014 + }, + { + "auxiliary_loss_clip": 0.01413551, + "auxiliary_loss_mlp": 0.01030049, + "balance_loss_clip": 1.24955285, + "balance_loss_mlp": 1.01159573, + "epoch": 0.6810160829700886, + "flos": 22502056452480.0, + "grad_norm": 1.7403943873680507, + "language_loss": 0.7383846, + "learning_rate": 9.75591650825392e-07, + "loss": 0.7628206, + "num_input_tokens_seen": 244409515, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.18469238, + "step": 11327, + "time_per_iteration": 2.8523879051208496 + }, + { + "auxiliary_loss_clip": 0.0140605, + "auxiliary_loss_mlp": 0.01029747, + "balance_loss_clip": 1.24355483, + "balance_loss_mlp": 1.01068592, + "epoch": 0.6810762062227567, + "flos": 16840602017280.0, + "grad_norm": 1.8352902294798372, + "language_loss": 0.77808857, + "learning_rate": 9.752571757431526e-07, + "loss": 0.80244648, + "num_input_tokens_seen": 244427165, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.1907959, + "step": 11328, + "time_per_iteration": 2.842034101486206 + }, + { + "auxiliary_loss_clip": 0.01416045, + "auxiliary_loss_mlp": 0.01030878, + "balance_loss_clip": 1.24939597, + "balance_loss_mlp": 1.0115068, + "epoch": 0.6811363294754246, + "flos": 12722552087040.0, + "grad_norm": 2.7640090119118135, + "language_loss": 0.65034783, + "learning_rate": 9.74922739519265e-07, + "loss": 0.67481709, + "num_input_tokens_seen": 244445705, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.19384766, + "step": 11329, + "time_per_iteration": 2.8196916580200195 + }, + { + "auxiliary_loss_clip": 0.01412156, + "auxiliary_loss_mlp": 0.0103375, + "balance_loss_clip": 1.24676633, + "balance_loss_mlp": 1.01392508, + "epoch": 0.6811964527280926, + "flos": 17720815219200.0, + "grad_norm": 1.8233035201920738, + "language_loss": 0.79505861, + "learning_rate": 9.745883421664096e-07, + "loss": 0.81951767, + "num_input_tokens_seen": 244460415, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19812012, + "step": 11330, + "time_per_iteration": 2.806715965270996 + }, + { + "auxiliary_loss_clip": 0.0141624, + "auxiliary_loss_mlp": 0.01031497, + "balance_loss_clip": 1.25203395, + "balance_loss_mlp": 1.01193476, + "epoch": 0.6812565759807605, + "flos": 24874248804480.0, + "grad_norm": 1.7618028653151885, + "language_loss": 0.65100932, + "learning_rate": 9.742539836972665e-07, + "loss": 0.67548668, + "num_input_tokens_seen": 244480555, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19543457, + "step": 11331, + "time_per_iteration": 2.8617892265319824 + }, + { + "auxiliary_loss_clip": 0.01407402, + "auxiliary_loss_mlp": 0.01030316, + "balance_loss_clip": 1.24339044, + "balance_loss_mlp": 1.0111357, + "epoch": 0.6813166992334285, + "flos": 17175070713600.0, + "grad_norm": 2.076479242953104, + "language_loss": 0.72693253, + "learning_rate": 9.739196641245148e-07, + "loss": 0.75130963, + "num_input_tokens_seen": 244498540, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19177246, + "step": 11332, + "time_per_iteration": 2.8538248538970947 + }, + { + "auxiliary_loss_clip": 0.01415031, + "auxiliary_loss_mlp": 0.01038868, + "balance_loss_clip": 1.24915206, + "balance_loss_mlp": 1.01928163, + "epoch": 0.6813768224860965, + "flos": 18852825588480.0, + "grad_norm": 1.8249742139039362, + "language_loss": 0.76205456, + "learning_rate": 9.735853834608326e-07, + "loss": 0.78659356, + "num_input_tokens_seen": 244517015, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.19580078, + "step": 11333, + "time_per_iteration": 2.832982301712036 + }, + { + "auxiliary_loss_clip": 0.01433956, + "auxiliary_loss_mlp": 0.01030748, + "balance_loss_clip": 1.26528454, + "balance_loss_mlp": 1.0113405, + "epoch": 0.6814369457387645, + "flos": 24542811509760.0, + "grad_norm": 1.4119152260756354, + "language_loss": 0.72356653, + "learning_rate": 9.732511417188963e-07, + "loss": 0.74821359, + "num_input_tokens_seen": 244537450, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.19396973, + "step": 11334, + "time_per_iteration": 2.8665213584899902 + }, + { + "auxiliary_loss_clip": 0.01403669, + "auxiliary_loss_mlp": 0.01036497, + "balance_loss_clip": 1.24317086, + "balance_loss_mlp": 1.01641035, + "epoch": 0.6814970689914325, + "flos": 18232010680320.0, + "grad_norm": 1.9416177926258882, + "language_loss": 0.86827052, + "learning_rate": 9.729169389113791e-07, + "loss": 0.89267224, + "num_input_tokens_seen": 244555640, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.20092773, + "step": 11335, + "time_per_iteration": 2.831590414047241 + }, + { + "auxiliary_loss_clip": 0.01393246, + "auxiliary_loss_mlp": 0.01030527, + "balance_loss_clip": 1.2348001, + "balance_loss_mlp": 1.01207328, + "epoch": 0.6815571922441004, + "flos": 25239782695680.0, + "grad_norm": 1.693000867777012, + "language_loss": 0.82713437, + "learning_rate": 9.725827750509542e-07, + "loss": 0.85137206, + "num_input_tokens_seen": 244574005, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18444824, + "step": 11336, + "time_per_iteration": 2.853888750076294 + }, + { + "auxiliary_loss_clip": 0.01397939, + "auxiliary_loss_mlp": 0.01031976, + "balance_loss_clip": 1.23819137, + "balance_loss_mlp": 1.0129739, + "epoch": 0.6816173154967684, + "flos": 19463505661440.0, + "grad_norm": 1.7927051071948732, + "language_loss": 0.82308495, + "learning_rate": 9.72248650150294e-07, + "loss": 0.84738415, + "num_input_tokens_seen": 244591395, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.19006348, + "step": 11337, + "time_per_iteration": 2.813971519470215 + }, + { + "auxiliary_loss_clip": 0.01405631, + "auxiliary_loss_mlp": 0.01034285, + "balance_loss_clip": 1.24481392, + "balance_loss_mlp": 1.014902, + "epoch": 0.6816774387494363, + "flos": 17940506561280.0, + "grad_norm": 1.667103102443736, + "language_loss": 0.73164666, + "learning_rate": 9.719145642220673e-07, + "loss": 0.75604582, + "num_input_tokens_seen": 244610400, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19372559, + "step": 11338, + "time_per_iteration": 2.841360330581665 + }, + { + "auxiliary_loss_clip": 0.01407761, + "auxiliary_loss_mlp": 0.01036407, + "balance_loss_clip": 1.24468839, + "balance_loss_mlp": 1.01623702, + "epoch": 0.6817375620021043, + "flos": 22242205710720.0, + "grad_norm": 1.527636114026428, + "language_loss": 0.77953243, + "learning_rate": 9.715805172789435e-07, + "loss": 0.80397415, + "num_input_tokens_seen": 244630400, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.20153809, + "step": 11339, + "time_per_iteration": 4.257450580596924 + }, + { + "auxiliary_loss_clip": 0.01415712, + "auxiliary_loss_mlp": 0.01036821, + "balance_loss_clip": 1.25078011, + "balance_loss_mlp": 1.01781929, + "epoch": 0.6817976852547722, + "flos": 25385353776000.0, + "grad_norm": 2.009139066165595, + "language_loss": 0.71589959, + "learning_rate": 9.712465093335901e-07, + "loss": 0.74042493, + "num_input_tokens_seen": 244649155, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19006348, + "step": 11340, + "time_per_iteration": 2.877794027328491 + }, + { + "auxiliary_loss_clip": 0.01419154, + "auxiliary_loss_mlp": 0.01035316, + "balance_loss_clip": 1.25154507, + "balance_loss_mlp": 1.0154438, + "epoch": 0.6818578085074403, + "flos": 22273994822400.0, + "grad_norm": 2.126685703879322, + "language_loss": 0.8457756, + "learning_rate": 9.709125403986722e-07, + "loss": 0.87032032, + "num_input_tokens_seen": 244665470, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.19873047, + "step": 11341, + "time_per_iteration": 2.8404483795166016 + }, + { + "auxiliary_loss_clip": 0.01413706, + "auxiliary_loss_mlp": 0.01038408, + "balance_loss_clip": 1.24951577, + "balance_loss_mlp": 1.01848817, + "epoch": 0.6819179317601082, + "flos": 19327616968320.0, + "grad_norm": 1.693491141917834, + "language_loss": 0.69249678, + "learning_rate": 9.705786104868531e-07, + "loss": 0.71701789, + "num_input_tokens_seen": 244684390, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19909668, + "step": 11342, + "time_per_iteration": 2.842381000518799 + }, + { + "auxiliary_loss_clip": 0.01405223, + "auxiliary_loss_mlp": 0.01038537, + "balance_loss_clip": 1.24338627, + "balance_loss_mlp": 1.01889133, + "epoch": 0.6819780550127762, + "flos": 21113588701440.0, + "grad_norm": 3.1791654456655682, + "language_loss": 0.75823903, + "learning_rate": 9.702447196107963e-07, + "loss": 0.7826767, + "num_input_tokens_seen": 244703370, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19641113, + "step": 11343, + "time_per_iteration": 2.8489999771118164 + }, + { + "auxiliary_loss_clip": 0.01426484, + "auxiliary_loss_mlp": 0.01037476, + "balance_loss_clip": 1.26137269, + "balance_loss_mlp": 1.01713848, + "epoch": 0.6820381782654441, + "flos": 29728524424320.0, + "grad_norm": 1.5202200310610858, + "language_loss": 0.80320418, + "learning_rate": 9.699108677831639e-07, + "loss": 0.82784379, + "num_input_tokens_seen": 244723325, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.20336914, + "step": 11344, + "time_per_iteration": 2.9074172973632812 + }, + { + "auxiliary_loss_clip": 0.01411322, + "auxiliary_loss_mlp": 0.01042052, + "balance_loss_clip": 1.24622333, + "balance_loss_mlp": 1.02244198, + "epoch": 0.6820983015181121, + "flos": 29254140247680.0, + "grad_norm": 1.7696620538954222, + "language_loss": 0.67171717, + "learning_rate": 9.695770550166136e-07, + "loss": 0.69625092, + "num_input_tokens_seen": 244745650, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19616699, + "step": 11345, + "time_per_iteration": 2.929905891418457 + }, + { + "auxiliary_loss_clip": 0.01424098, + "auxiliary_loss_mlp": 0.01041462, + "balance_loss_clip": 1.25756848, + "balance_loss_mlp": 1.02024317, + "epoch": 0.6821584247707801, + "flos": 18878913855360.0, + "grad_norm": 2.687360309479774, + "language_loss": 0.66486585, + "learning_rate": 9.692432813238054e-07, + "loss": 0.68952149, + "num_input_tokens_seen": 244760270, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.21228027, + "step": 11346, + "time_per_iteration": 2.843095064163208 + }, + { + "auxiliary_loss_clip": 0.01431491, + "auxiliary_loss_mlp": 0.01034448, + "balance_loss_clip": 1.26354384, + "balance_loss_mlp": 1.01486158, + "epoch": 0.6822185480234481, + "flos": 21334501653120.0, + "grad_norm": 2.122754981547926, + "language_loss": 0.78987813, + "learning_rate": 9.689095467173952e-07, + "loss": 0.81453741, + "num_input_tokens_seen": 244779565, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.19592285, + "step": 11347, + "time_per_iteration": 2.830878734588623 + }, + { + "auxiliary_loss_clip": 0.01192343, + "auxiliary_loss_mlp": 0.01054307, + "balance_loss_clip": 1.10237622, + "balance_loss_mlp": 1.03103733, + "epoch": 0.6822786712761161, + "flos": 63515049649920.0, + "grad_norm": 0.7354952465953686, + "language_loss": 0.52479404, + "learning_rate": 9.685758512100378e-07, + "loss": 0.54726052, + "num_input_tokens_seen": 244838480, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.23242188, + "step": 11348, + "time_per_iteration": 3.420215368270874 + }, + { + "auxiliary_loss_clip": 0.01407983, + "auxiliary_loss_mlp": 0.01036279, + "balance_loss_clip": 1.24671841, + "balance_loss_mlp": 1.0170027, + "epoch": 0.682338794528784, + "flos": 21078722943360.0, + "grad_norm": 1.6981471416983351, + "language_loss": 0.80386746, + "learning_rate": 9.682421948143873e-07, + "loss": 0.82831007, + "num_input_tokens_seen": 244855265, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19250488, + "step": 11349, + "time_per_iteration": 2.898846387863159 + }, + { + "auxiliary_loss_clip": 0.01449503, + "auxiliary_loss_mlp": 0.01037626, + "balance_loss_clip": 1.27485037, + "balance_loss_mlp": 1.01503563, + "epoch": 0.682398917781452, + "flos": 36296913755520.0, + "grad_norm": 1.6327830586349878, + "language_loss": 0.7441256, + "learning_rate": 9.67908577543096e-07, + "loss": 0.76899695, + "num_input_tokens_seen": 244875555, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.22583008, + "step": 11350, + "time_per_iteration": 2.959620952606201 + }, + { + "auxiliary_loss_clip": 0.01412246, + "auxiliary_loss_mlp": 0.01039772, + "balance_loss_clip": 1.24895024, + "balance_loss_mlp": 1.01981688, + "epoch": 0.6824590410341199, + "flos": 24868864673280.0, + "grad_norm": 1.51180371437338, + "language_loss": 0.79801989, + "learning_rate": 9.675749994088161e-07, + "loss": 0.82254004, + "num_input_tokens_seen": 244895270, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19946289, + "step": 11351, + "time_per_iteration": 4.35706901550293 + }, + { + "auxiliary_loss_clip": 0.01402412, + "auxiliary_loss_mlp": 0.01036246, + "balance_loss_clip": 1.24095845, + "balance_loss_mlp": 1.01689816, + "epoch": 0.6825191642867879, + "flos": 22461761318400.0, + "grad_norm": 1.6445584036160426, + "language_loss": 0.74340034, + "learning_rate": 9.672414604241954e-07, + "loss": 0.76778698, + "num_input_tokens_seen": 244914535, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19335938, + "step": 11352, + "time_per_iteration": 2.864840030670166 + }, + { + "auxiliary_loss_clip": 0.01426894, + "auxiliary_loss_mlp": 0.01038084, + "balance_loss_clip": 1.26002502, + "balance_loss_mlp": 1.01725841, + "epoch": 0.6825792875394558, + "flos": 29436386878080.0, + "grad_norm": 1.4722488044985225, + "language_loss": 0.80755448, + "learning_rate": 9.669079606018814e-07, + "loss": 0.83220422, + "num_input_tokens_seen": 244936095, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.20812988, + "step": 11353, + "time_per_iteration": 2.909999132156372 + }, + { + "auxiliary_loss_clip": 0.01411443, + "auxiliary_loss_mlp": 0.01036214, + "balance_loss_clip": 1.247159, + "balance_loss_mlp": 1.01591277, + "epoch": 0.6826394107921239, + "flos": 18780334139520.0, + "grad_norm": 1.7851784987753812, + "language_loss": 0.79011291, + "learning_rate": 9.665744999545218e-07, + "loss": 0.8145895, + "num_input_tokens_seen": 244955290, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20300293, + "step": 11354, + "time_per_iteration": 4.3294806480407715 + }, + { + "auxiliary_loss_clip": 0.01411261, + "auxiliary_loss_mlp": 0.01034424, + "balance_loss_clip": 1.24900389, + "balance_loss_mlp": 1.01484954, + "epoch": 0.6826995340447918, + "flos": 16626204316800.0, + "grad_norm": 1.9388504804637146, + "language_loss": 0.62242436, + "learning_rate": 9.662410784947599e-07, + "loss": 0.64688128, + "num_input_tokens_seen": 244972935, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19567871, + "step": 11355, + "time_per_iteration": 2.8175766468048096 + }, + { + "auxiliary_loss_clip": 0.01419734, + "auxiliary_loss_mlp": 0.01036555, + "balance_loss_clip": 1.25528455, + "balance_loss_mlp": 1.01732683, + "epoch": 0.6827596572974598, + "flos": 20857583767680.0, + "grad_norm": 1.7827505902360983, + "language_loss": 0.83182585, + "learning_rate": 9.659076962352398e-07, + "loss": 0.85638875, + "num_input_tokens_seen": 244989440, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19213867, + "step": 11356, + "time_per_iteration": 2.8358891010284424 + }, + { + "auxiliary_loss_clip": 0.01435578, + "auxiliary_loss_mlp": 0.01035991, + "balance_loss_clip": 1.26781249, + "balance_loss_mlp": 1.01500988, + "epoch": 0.6828197805501277, + "flos": 22758513834240.0, + "grad_norm": 1.6493584496785942, + "language_loss": 0.79033369, + "learning_rate": 9.655743531886052e-07, + "loss": 0.81504941, + "num_input_tokens_seen": 245007830, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.20983887, + "step": 11357, + "time_per_iteration": 2.840134620666504 + }, + { + "auxiliary_loss_clip": 0.0119034, + "auxiliary_loss_mlp": 0.01018051, + "balance_loss_clip": 1.09993482, + "balance_loss_mlp": 0.99869156, + "epoch": 0.6828799038027957, + "flos": 71681119263360.0, + "grad_norm": 0.837571503406661, + "language_loss": 0.59637719, + "learning_rate": 9.65241049367493e-07, + "loss": 0.61846113, + "num_input_tokens_seen": 245070720, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.19335938, + "step": 11358, + "time_per_iteration": 3.3921425342559814 + }, + { + "auxiliary_loss_clip": 0.01431307, + "auxiliary_loss_mlp": 0.01039424, + "balance_loss_clip": 1.26227689, + "balance_loss_mlp": 1.01869345, + "epoch": 0.6829400270554637, + "flos": 19838812429440.0, + "grad_norm": 1.9706328565152895, + "language_loss": 0.80131733, + "learning_rate": 9.64907784784544e-07, + "loss": 0.82602465, + "num_input_tokens_seen": 245089070, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.20739746, + "step": 11359, + "time_per_iteration": 2.8380579948425293 + }, + { + "auxiliary_loss_clip": 0.01410689, + "auxiliary_loss_mlp": 0.01035978, + "balance_loss_clip": 1.24510074, + "balance_loss_mlp": 1.0162493, + "epoch": 0.6830001503081317, + "flos": 21990544277760.0, + "grad_norm": 1.829718318151842, + "language_loss": 0.81931925, + "learning_rate": 9.645745594523958e-07, + "loss": 0.84378594, + "num_input_tokens_seen": 245106500, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.1973877, + "step": 11360, + "time_per_iteration": 2.867671012878418 + }, + { + "auxiliary_loss_clip": 0.01418965, + "auxiliary_loss_mlp": 0.01037796, + "balance_loss_clip": 1.25239229, + "balance_loss_mlp": 1.01788783, + "epoch": 0.6830602735607997, + "flos": 24327554158080.0, + "grad_norm": 2.0633055028659455, + "language_loss": 0.75336808, + "learning_rate": 9.642413733836844e-07, + "loss": 0.77793562, + "num_input_tokens_seen": 245125260, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.19909668, + "step": 11361, + "time_per_iteration": 2.8632469177246094 + }, + { + "auxiliary_loss_clip": 0.01194176, + "auxiliary_loss_mlp": 0.01015649, + "balance_loss_clip": 1.10301065, + "balance_loss_mlp": 0.99466801, + "epoch": 0.6831203968134676, + "flos": 57716077708800.0, + "grad_norm": 0.8737415376383233, + "language_loss": 0.5978626, + "learning_rate": 9.639082265910437e-07, + "loss": 0.61996078, + "num_input_tokens_seen": 245188730, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.20996094, + "step": 11362, + "time_per_iteration": 3.394331932067871 + }, + { + "auxiliary_loss_clip": 0.01421572, + "auxiliary_loss_mlp": 0.01031652, + "balance_loss_clip": 1.25545752, + "balance_loss_mlp": 1.01105285, + "epoch": 0.6831805200661356, + "flos": 14395556257920.0, + "grad_norm": 2.034658829975799, + "language_loss": 0.76486272, + "learning_rate": 9.635751190871074e-07, + "loss": 0.78939492, + "num_input_tokens_seen": 245205065, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.20605469, + "step": 11363, + "time_per_iteration": 2.8349788188934326 + }, + { + "auxiliary_loss_clip": 0.01420069, + "auxiliary_loss_mlp": 0.01038688, + "balance_loss_clip": 1.25722647, + "balance_loss_mlp": 1.01770711, + "epoch": 0.6832406433188035, + "flos": 22830552835200.0, + "grad_norm": 2.158969431532805, + "language_loss": 0.89530939, + "learning_rate": 9.632420508845063e-07, + "loss": 0.91989696, + "num_input_tokens_seen": 245224265, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.2097168, + "step": 11364, + "time_per_iteration": 2.8377461433410645 + }, + { + "auxiliary_loss_clip": 0.01416923, + "auxiliary_loss_mlp": 0.01031007, + "balance_loss_clip": 1.25262904, + "balance_loss_mlp": 1.01173115, + "epoch": 0.6833007665714715, + "flos": 17569407559680.0, + "grad_norm": 2.0146292247373316, + "language_loss": 0.88783824, + "learning_rate": 9.629090219958697e-07, + "loss": 0.91231751, + "num_input_tokens_seen": 245243360, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19299316, + "step": 11365, + "time_per_iteration": 2.856091260910034 + }, + { + "auxiliary_loss_clip": 0.01424807, + "auxiliary_loss_mlp": 0.01037834, + "balance_loss_clip": 1.25606549, + "balance_loss_mlp": 1.01817596, + "epoch": 0.6833608898241395, + "flos": 22453617254400.0, + "grad_norm": 2.214664570192883, + "language_loss": 0.82291067, + "learning_rate": 9.625760324338272e-07, + "loss": 0.8475371, + "num_input_tokens_seen": 245256350, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.19665527, + "step": 11366, + "time_per_iteration": 2.803571939468384 + }, + { + "auxiliary_loss_clip": 0.01420101, + "auxiliary_loss_mlp": 0.01034053, + "balance_loss_clip": 1.25383639, + "balance_loss_mlp": 1.01409745, + "epoch": 0.6834210130768075, + "flos": 24545209484160.0, + "grad_norm": 1.5523599795780185, + "language_loss": 0.77474838, + "learning_rate": 9.622430822110062e-07, + "loss": 0.79928994, + "num_input_tokens_seen": 245277575, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19946289, + "step": 11367, + "time_per_iteration": 2.8568115234375 + }, + { + "auxiliary_loss_clip": 0.01414585, + "auxiliary_loss_mlp": 0.0104091, + "balance_loss_clip": 1.2486347, + "balance_loss_mlp": 1.01994085, + "epoch": 0.6834811363294754, + "flos": 20056603489920.0, + "grad_norm": 2.347942801304673, + "language_loss": 0.70204437, + "learning_rate": 9.619101713400312e-07, + "loss": 0.72659928, + "num_input_tokens_seen": 245296615, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.20996094, + "step": 11368, + "time_per_iteration": 2.85105037689209 + }, + { + "auxiliary_loss_clip": 0.01409319, + "auxiliary_loss_mlp": 0.01032041, + "balance_loss_clip": 1.2453804, + "balance_loss_mlp": 1.01153696, + "epoch": 0.6835412595821434, + "flos": 24801576376320.0, + "grad_norm": 1.6990733400158802, + "language_loss": 0.74046516, + "learning_rate": 9.615772998335261e-07, + "loss": 0.76487875, + "num_input_tokens_seen": 245316275, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.20495605, + "step": 11369, + "time_per_iteration": 2.8680148124694824 + }, + { + "auxiliary_loss_clip": 0.01424637, + "auxiliary_loss_mlp": 0.01032973, + "balance_loss_clip": 1.25873351, + "balance_loss_mlp": 1.01425743, + "epoch": 0.6836013828348113, + "flos": 19509818353920.0, + "grad_norm": 2.0127459016453084, + "language_loss": 0.80092072, + "learning_rate": 9.612444677041138e-07, + "loss": 0.82549679, + "num_input_tokens_seen": 245334595, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.18713379, + "step": 11370, + "time_per_iteration": 2.823411703109741 + }, + { + "auxiliary_loss_clip": 0.01196237, + "auxiliary_loss_mlp": 0.01035117, + "balance_loss_clip": 1.10512924, + "balance_loss_mlp": 1.01280117, + "epoch": 0.6836615060874793, + "flos": 58394652243840.0, + "grad_norm": 0.7502959758835724, + "language_loss": 0.59776866, + "learning_rate": 9.609116749644162e-07, + "loss": 0.62008226, + "num_input_tokens_seen": 245389750, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.22363281, + "step": 11371, + "time_per_iteration": 3.2548840045928955 + }, + { + "auxiliary_loss_clip": 0.01401145, + "auxiliary_loss_mlp": 0.010332, + "balance_loss_clip": 1.24071646, + "balance_loss_mlp": 1.01326835, + "epoch": 0.6837216293401474, + "flos": 12174500096640.0, + "grad_norm": 1.5231223832905636, + "language_loss": 0.64295727, + "learning_rate": 9.605789216270511e-07, + "loss": 0.6673007, + "num_input_tokens_seen": 245407530, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19909668, + "step": 11372, + "time_per_iteration": 2.8188834190368652 + }, + { + "auxiliary_loss_clip": 0.01421086, + "auxiliary_loss_mlp": 0.01035254, + "balance_loss_clip": 1.25661433, + "balance_loss_mlp": 1.01505971, + "epoch": 0.6837817525928153, + "flos": 22137789415680.0, + "grad_norm": 1.4723562932300103, + "language_loss": 0.72156835, + "learning_rate": 9.602462077046375e-07, + "loss": 0.74613166, + "num_input_tokens_seen": 245427000, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.2019043, + "step": 11373, + "time_per_iteration": 2.9459218978881836 + }, + { + "auxiliary_loss_clip": 0.01193737, + "auxiliary_loss_mlp": 0.01038649, + "balance_loss_clip": 1.10475469, + "balance_loss_mlp": 1.01738191, + "epoch": 0.6838418758454833, + "flos": 65038700160000.0, + "grad_norm": 1.3176128894358552, + "language_loss": 0.56629759, + "learning_rate": 9.599135332097935e-07, + "loss": 0.58862144, + "num_input_tokens_seen": 245491620, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.21289062, + "step": 11374, + "time_per_iteration": 4.770488739013672 + }, + { + "auxiliary_loss_clip": 0.01429828, + "auxiliary_loss_mlp": 0.01032897, + "balance_loss_clip": 1.26313651, + "balance_loss_mlp": 1.0129174, + "epoch": 0.6839019990981512, + "flos": 21040192356480.0, + "grad_norm": 1.651389627724949, + "language_loss": 0.74670857, + "learning_rate": 9.595808981551312e-07, + "loss": 0.77133584, + "num_input_tokens_seen": 245511285, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.1998291, + "step": 11375, + "time_per_iteration": 2.8765532970428467 + }, + { + "auxiliary_loss_clip": 0.01413373, + "auxiliary_loss_mlp": 0.01035719, + "balance_loss_clip": 1.24968266, + "balance_loss_mlp": 1.01619256, + "epoch": 0.6839621223508192, + "flos": 24946106826240.0, + "grad_norm": 1.7605529255279193, + "language_loss": 0.71050501, + "learning_rate": 9.592483025532651e-07, + "loss": 0.73499596, + "num_input_tokens_seen": 245532910, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19506836, + "step": 11376, + "time_per_iteration": 2.8973183631896973 + }, + { + "auxiliary_loss_clip": 0.01430295, + "auxiliary_loss_mlp": 0.01035959, + "balance_loss_clip": 1.26129889, + "balance_loss_mlp": 1.01544333, + "epoch": 0.6840222456034871, + "flos": 26369847538560.0, + "grad_norm": 3.156834145757527, + "language_loss": 0.7471242, + "learning_rate": 9.58915746416808e-07, + "loss": 0.77178669, + "num_input_tokens_seen": 245550540, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.2052002, + "step": 11377, + "time_per_iteration": 2.8828487396240234 + }, + { + "auxiliary_loss_clip": 0.01192218, + "auxiliary_loss_mlp": 0.01017067, + "balance_loss_clip": 1.09976661, + "balance_loss_mlp": 0.99570447, + "epoch": 0.6840823688561551, + "flos": 66020660213760.0, + "grad_norm": 0.7173425014327345, + "language_loss": 0.56907749, + "learning_rate": 9.585832297583707e-07, + "loss": 0.59117031, + "num_input_tokens_seen": 245619570, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.21386719, + "step": 11378, + "time_per_iteration": 3.3827879428863525 + }, + { + "auxiliary_loss_clip": 0.01423493, + "auxiliary_loss_mlp": 0.01036665, + "balance_loss_clip": 1.25672734, + "balance_loss_mlp": 1.01622057, + "epoch": 0.684142492108823, + "flos": 21407536039680.0, + "grad_norm": 1.850259942164225, + "language_loss": 0.79478884, + "learning_rate": 9.58250752590561e-07, + "loss": 0.81939036, + "num_input_tokens_seen": 245637980, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.20422363, + "step": 11379, + "time_per_iteration": 2.829346179962158 + }, + { + "auxiliary_loss_clip": 0.01398697, + "auxiliary_loss_mlp": 0.01031805, + "balance_loss_clip": 1.24173927, + "balance_loss_mlp": 1.01382828, + "epoch": 0.6842026153614911, + "flos": 18809227584000.0, + "grad_norm": 1.6704715384123423, + "language_loss": 0.69692218, + "learning_rate": 9.57918314925988e-07, + "loss": 0.72122717, + "num_input_tokens_seen": 245655690, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.17980957, + "step": 11380, + "time_per_iteration": 2.8209009170532227 + }, + { + "auxiliary_loss_clip": 0.01417444, + "auxiliary_loss_mlp": 0.01034829, + "balance_loss_clip": 1.25380528, + "balance_loss_mlp": 1.01426578, + "epoch": 0.684262738614159, + "flos": 19655932371840.0, + "grad_norm": 1.8694414179270298, + "language_loss": 0.78388149, + "learning_rate": 9.575859167772568e-07, + "loss": 0.80840421, + "num_input_tokens_seen": 245671525, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.20556641, + "step": 11381, + "time_per_iteration": 2.823035955429077 + }, + { + "auxiliary_loss_clip": 0.011927, + "auxiliary_loss_mlp": 0.01020071, + "balance_loss_clip": 1.10436273, + "balance_loss_mlp": 1.00309551, + "epoch": 0.684322861866827, + "flos": 62380188858240.0, + "grad_norm": 0.8724389944568539, + "language_loss": 0.67242014, + "learning_rate": 9.572535581569713e-07, + "loss": 0.69454789, + "num_input_tokens_seen": 245724115, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.16992188, + "step": 11382, + "time_per_iteration": 3.1425552368164062 + }, + { + "auxiliary_loss_clip": 0.01193617, + "auxiliary_loss_mlp": 0.01026903, + "balance_loss_clip": 1.10307646, + "balance_loss_mlp": 1.006971, + "epoch": 0.6843829851194949, + "flos": 65837101484160.0, + "grad_norm": 0.8247509223965037, + "language_loss": 0.58169037, + "learning_rate": 9.569212390777356e-07, + "loss": 0.6038956, + "num_input_tokens_seen": 245789245, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.19921875, + "step": 11383, + "time_per_iteration": 3.3082969188690186 + }, + { + "auxiliary_loss_clip": 0.01402596, + "auxiliary_loss_mlp": 0.01033478, + "balance_loss_clip": 1.24087059, + "balance_loss_mlp": 1.01416576, + "epoch": 0.6844431083721629, + "flos": 27866667882240.0, + "grad_norm": 2.1599723540735907, + "language_loss": 0.80578417, + "learning_rate": 9.565889595521517e-07, + "loss": 0.83014488, + "num_input_tokens_seen": 245812420, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19287109, + "step": 11384, + "time_per_iteration": 2.9528353214263916 + }, + { + "auxiliary_loss_clip": 0.01427916, + "auxiliary_loss_mlp": 0.01041169, + "balance_loss_clip": 1.26023901, + "balance_loss_mlp": 1.02140439, + "epoch": 0.684503231624831, + "flos": 18262894896000.0, + "grad_norm": 4.634426974526997, + "language_loss": 0.78065002, + "learning_rate": 9.562567195928187e-07, + "loss": 0.80534089, + "num_input_tokens_seen": 245829135, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.19775391, + "step": 11385, + "time_per_iteration": 4.291656970977783 + }, + { + "auxiliary_loss_clip": 0.01438674, + "auxiliary_loss_mlp": 0.0103753, + "balance_loss_clip": 1.26697147, + "balance_loss_mlp": 1.01565492, + "epoch": 0.6845633548774989, + "flos": 17648685728640.0, + "grad_norm": 1.924418198835301, + "language_loss": 0.85293174, + "learning_rate": 9.55924519212335e-07, + "loss": 0.87769377, + "num_input_tokens_seen": 245847140, + "router_z_loss_clip": 1.71777344, + "router_z_loss_mlp": 0.21862793, + "step": 11386, + "time_per_iteration": 4.2574169635772705 + }, + { + "auxiliary_loss_clip": 0.01429882, + "auxiliary_loss_mlp": 0.01036225, + "balance_loss_clip": 1.26523602, + "balance_loss_mlp": 1.01704454, + "epoch": 0.6846234781301669, + "flos": 20815750310400.0, + "grad_norm": 1.9940439389694726, + "language_loss": 0.82987022, + "learning_rate": 9.555923584232984e-07, + "loss": 0.85453129, + "num_input_tokens_seen": 245862855, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19189453, + "step": 11387, + "time_per_iteration": 2.860929012298584 + }, + { + "auxiliary_loss_clip": 0.01416958, + "auxiliary_loss_mlp": 0.01028698, + "balance_loss_clip": 1.25397682, + "balance_loss_mlp": 1.00980365, + "epoch": 0.6846836013828348, + "flos": 36114893349120.0, + "grad_norm": 1.7122298095762598, + "language_loss": 0.72651255, + "learning_rate": 9.552602372383047e-07, + "loss": 0.75096911, + "num_input_tokens_seen": 245885415, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.18884277, + "step": 11388, + "time_per_iteration": 2.9708025455474854 + }, + { + "auxiliary_loss_clip": 0.01409428, + "auxiliary_loss_mlp": 0.0103089, + "balance_loss_clip": 1.24758518, + "balance_loss_mlp": 1.01181602, + "epoch": 0.6847437246355028, + "flos": 43157893080960.0, + "grad_norm": 1.8848650139314924, + "language_loss": 0.63578302, + "learning_rate": 9.549281556699469e-07, + "loss": 0.66018617, + "num_input_tokens_seen": 245906285, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.1907959, + "step": 11389, + "time_per_iteration": 4.4353296756744385 + }, + { + "auxiliary_loss_clip": 0.0119265, + "auxiliary_loss_mlp": 0.01031226, + "balance_loss_clip": 1.1008637, + "balance_loss_mlp": 1.00910115, + "epoch": 0.6848038478881707, + "flos": 71693272131840.0, + "grad_norm": 0.762036034021924, + "language_loss": 0.56042606, + "learning_rate": 9.54596113730818e-07, + "loss": 0.58266485, + "num_input_tokens_seen": 245967620, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.22167969, + "step": 11390, + "time_per_iteration": 3.504220724105835 + }, + { + "auxiliary_loss_clip": 0.01421072, + "auxiliary_loss_mlp": 0.01033605, + "balance_loss_clip": 1.25708961, + "balance_loss_mlp": 1.01343465, + "epoch": 0.6848639711408387, + "flos": 19947300756480.0, + "grad_norm": 2.374242102342309, + "language_loss": 0.88700098, + "learning_rate": 9.542641114335109e-07, + "loss": 0.91154778, + "num_input_tokens_seen": 245985075, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.20166016, + "step": 11391, + "time_per_iteration": 2.8390703201293945 + }, + { + "auxiliary_loss_clip": 0.01427064, + "auxiliary_loss_mlp": 0.01033893, + "balance_loss_clip": 1.26075053, + "balance_loss_mlp": 1.01444995, + "epoch": 0.6849240943935067, + "flos": 26878102087680.0, + "grad_norm": 1.5077263447067288, + "language_loss": 0.79306757, + "learning_rate": 9.539321487906117e-07, + "loss": 0.81767714, + "num_input_tokens_seen": 246003560, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19445801, + "step": 11392, + "time_per_iteration": 2.880298614501953 + }, + { + "auxiliary_loss_clip": 0.01403029, + "auxiliary_loss_mlp": 0.01031825, + "balance_loss_clip": 1.24284279, + "balance_loss_mlp": 1.01254869, + "epoch": 0.6849842176461747, + "flos": 13743268951680.0, + "grad_norm": 2.2869530563890574, + "language_loss": 0.71639073, + "learning_rate": 9.536002258147104e-07, + "loss": 0.74073923, + "num_input_tokens_seen": 246019600, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19274902, + "step": 11393, + "time_per_iteration": 2.8313510417938232 + }, + { + "auxiliary_loss_clip": 0.01433394, + "auxiliary_loss_mlp": 0.01037198, + "balance_loss_clip": 1.26529694, + "balance_loss_mlp": 1.01653957, + "epoch": 0.6850443408988426, + "flos": 24983913496320.0, + "grad_norm": 1.5798956781833566, + "language_loss": 0.65180409, + "learning_rate": 9.532683425183936e-07, + "loss": 0.67651004, + "num_input_tokens_seen": 246038920, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.20666504, + "step": 11394, + "time_per_iteration": 2.8793210983276367 + }, + { + "auxiliary_loss_clip": 0.01416699, + "auxiliary_loss_mlp": 0.01033073, + "balance_loss_clip": 1.25227857, + "balance_loss_mlp": 1.01297438, + "epoch": 0.6851044641515106, + "flos": 27755329132800.0, + "grad_norm": 1.582633987785243, + "language_loss": 0.81388593, + "learning_rate": 9.529364989142468e-07, + "loss": 0.83838367, + "num_input_tokens_seen": 246060490, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.2010498, + "step": 11395, + "time_per_iteration": 2.9071643352508545 + }, + { + "auxiliary_loss_clip": 0.01409358, + "auxiliary_loss_mlp": 0.01036972, + "balance_loss_clip": 1.24584436, + "balance_loss_mlp": 1.01730263, + "epoch": 0.6851645874041785, + "flos": 24361288796160.0, + "grad_norm": 1.6855074482080372, + "language_loss": 0.73543203, + "learning_rate": 9.526046950148527e-07, + "loss": 0.75989532, + "num_input_tokens_seen": 246081465, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19677734, + "step": 11396, + "time_per_iteration": 3.0137746334075928 + }, + { + "auxiliary_loss_clip": 0.01428098, + "auxiliary_loss_mlp": 0.01034266, + "balance_loss_clip": 1.25969243, + "balance_loss_mlp": 1.01427484, + "epoch": 0.6852247106568465, + "flos": 15084157155840.0, + "grad_norm": 3.4375290130100624, + "language_loss": 0.79814321, + "learning_rate": 9.522729308327931e-07, + "loss": 0.8227669, + "num_input_tokens_seen": 246096110, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.19995117, + "step": 11397, + "time_per_iteration": 2.7934505939483643 + }, + { + "auxiliary_loss_clip": 0.01413534, + "auxiliary_loss_mlp": 0.01035014, + "balance_loss_clip": 1.24882579, + "balance_loss_mlp": 1.01529622, + "epoch": 0.6852848339095146, + "flos": 18779112529920.0, + "grad_norm": 2.3377544509179398, + "language_loss": 0.72068691, + "learning_rate": 9.519412063806493e-07, + "loss": 0.74517238, + "num_input_tokens_seen": 246114785, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19714355, + "step": 11398, + "time_per_iteration": 2.809738874435425 + }, + { + "auxiliary_loss_clip": 0.01403804, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.24252272, + "balance_loss_mlp": 1.01470423, + "epoch": 0.6853449571621825, + "flos": 27865672496640.0, + "grad_norm": 1.6707693985181886, + "language_loss": 0.70993274, + "learning_rate": 9.516095216709996e-07, + "loss": 0.73431301, + "num_input_tokens_seen": 246136375, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19506836, + "step": 11399, + "time_per_iteration": 2.8674776554107666 + }, + { + "auxiliary_loss_clip": 0.01421791, + "auxiliary_loss_mlp": 0.01035721, + "balance_loss_clip": 1.2567637, + "balance_loss_mlp": 1.01433468, + "epoch": 0.6854050804148505, + "flos": 18159654965760.0, + "grad_norm": 1.8885796057061492, + "language_loss": 0.70736367, + "learning_rate": 9.512778767164217e-07, + "loss": 0.73193878, + "num_input_tokens_seen": 246155090, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.21386719, + "step": 11400, + "time_per_iteration": 2.818117141723633 + }, + { + "auxiliary_loss_clip": 0.01449566, + "auxiliary_loss_mlp": 0.01040821, + "balance_loss_clip": 1.27371597, + "balance_loss_mlp": 1.01910067, + "epoch": 0.6854652036675184, + "flos": 16334881176960.0, + "grad_norm": 1.8558560047374948, + "language_loss": 0.79465592, + "learning_rate": 9.509462715294927e-07, + "loss": 0.81955981, + "num_input_tokens_seen": 246172645, + "router_z_loss_clip": 1.7578125, + "router_z_loss_mlp": 0.21716309, + "step": 11401, + "time_per_iteration": 2.8080966472625732 + }, + { + "auxiliary_loss_clip": 0.01399403, + "auxiliary_loss_mlp": 0.01032455, + "balance_loss_clip": 1.23841286, + "balance_loss_mlp": 1.01264215, + "epoch": 0.6855253269201864, + "flos": 14949399582720.0, + "grad_norm": 2.309034715925553, + "language_loss": 0.76203853, + "learning_rate": 9.50614706122786e-07, + "loss": 0.78635716, + "num_input_tokens_seen": 246189055, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.19824219, + "step": 11402, + "time_per_iteration": 2.8289482593536377 + }, + { + "auxiliary_loss_clip": 0.01432232, + "auxiliary_loss_mlp": 0.01035199, + "balance_loss_clip": 1.26373744, + "balance_loss_mlp": 1.01600683, + "epoch": 0.6855854501728543, + "flos": 23047665223680.0, + "grad_norm": 1.9942861748283576, + "language_loss": 0.73448181, + "learning_rate": 9.502831805088742e-07, + "loss": 0.75915611, + "num_input_tokens_seen": 246207990, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.19189453, + "step": 11403, + "time_per_iteration": 2.8671300411224365 + }, + { + "auxiliary_loss_clip": 0.01416536, + "auxiliary_loss_mlp": 0.01034231, + "balance_loss_clip": 1.25402367, + "balance_loss_mlp": 1.01484764, + "epoch": 0.6856455734255223, + "flos": 13259473856640.0, + "grad_norm": 1.9455043368014764, + "language_loss": 0.81970185, + "learning_rate": 9.499516947003294e-07, + "loss": 0.84420949, + "num_input_tokens_seen": 246221595, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19384766, + "step": 11404, + "time_per_iteration": 2.8450887203216553 + }, + { + "auxiliary_loss_clip": 0.01413536, + "auxiliary_loss_mlp": 0.01034806, + "balance_loss_clip": 1.2511766, + "balance_loss_mlp": 1.01419425, + "epoch": 0.6857056966781903, + "flos": 23344462984320.0, + "grad_norm": 1.9813554131380395, + "language_loss": 0.78153074, + "learning_rate": 9.496202487097222e-07, + "loss": 0.80601418, + "num_input_tokens_seen": 246242970, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.20605469, + "step": 11405, + "time_per_iteration": 2.8757519721984863 + }, + { + "auxiliary_loss_clip": 0.01194498, + "auxiliary_loss_mlp": 0.01031431, + "balance_loss_clip": 1.10332489, + "balance_loss_mlp": 1.01388347, + "epoch": 0.6857658199308583, + "flos": 61880576065920.0, + "grad_norm": 0.8049479921220468, + "language_loss": 0.61093795, + "learning_rate": 9.492888425496199e-07, + "loss": 0.63319719, + "num_input_tokens_seen": 246300405, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.17578125, + "step": 11406, + "time_per_iteration": 3.3820865154266357 + }, + { + "auxiliary_loss_clip": 0.01419469, + "auxiliary_loss_mlp": 0.01036641, + "balance_loss_clip": 1.2540884, + "balance_loss_mlp": 1.01622081, + "epoch": 0.6858259431835262, + "flos": 16663287070080.0, + "grad_norm": 1.8468916723564053, + "language_loss": 0.77284855, + "learning_rate": 9.489574762325907e-07, + "loss": 0.79740965, + "num_input_tokens_seen": 246318780, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.2043457, + "step": 11407, + "time_per_iteration": 2.8436667919158936 + }, + { + "auxiliary_loss_clip": 0.0141712, + "auxiliary_loss_mlp": 0.01039965, + "balance_loss_clip": 1.25104141, + "balance_loss_mlp": 1.01897264, + "epoch": 0.6858860664361942, + "flos": 21883322805120.0, + "grad_norm": 2.2803989841797208, + "language_loss": 0.71508265, + "learning_rate": 9.486261497711991e-07, + "loss": 0.73965353, + "num_input_tokens_seen": 246339405, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.21008301, + "step": 11408, + "time_per_iteration": 2.8680248260498047 + }, + { + "auxiliary_loss_clip": 0.01431874, + "auxiliary_loss_mlp": 0.0103232, + "balance_loss_clip": 1.26326621, + "balance_loss_mlp": 1.01255476, + "epoch": 0.6859461896888621, + "flos": 15275905194240.0, + "grad_norm": 1.7861817460428806, + "language_loss": 0.7110045, + "learning_rate": 9.482948631780087e-07, + "loss": 0.73564649, + "num_input_tokens_seen": 246357055, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.19763184, + "step": 11409, + "time_per_iteration": 4.254801034927368 + }, + { + "auxiliary_loss_clip": 0.01399181, + "auxiliary_loss_mlp": 0.01036348, + "balance_loss_clip": 1.24082446, + "balance_loss_mlp": 1.01690459, + "epoch": 0.6860063129415301, + "flos": 18628383542400.0, + "grad_norm": 1.5752649018181286, + "language_loss": 0.78323799, + "learning_rate": 9.479636164655825e-07, + "loss": 0.80759323, + "num_input_tokens_seen": 246374050, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.19458008, + "step": 11410, + "time_per_iteration": 2.826809883117676 + }, + { + "auxiliary_loss_clip": 0.01429942, + "auxiliary_loss_mlp": 0.01039401, + "balance_loss_clip": 1.26119101, + "balance_loss_mlp": 1.01781166, + "epoch": 0.6860664361941982, + "flos": 23961929777280.0, + "grad_norm": 1.7047333789951864, + "language_loss": 0.71906257, + "learning_rate": 9.476324096464821e-07, + "loss": 0.74375594, + "num_input_tokens_seen": 246392910, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.21582031, + "step": 11411, + "time_per_iteration": 2.879160165786743 + }, + { + "auxiliary_loss_clip": 0.01414156, + "auxiliary_loss_mlp": 0.01033852, + "balance_loss_clip": 1.24912667, + "balance_loss_mlp": 1.0133481, + "epoch": 0.6861265594468661, + "flos": 20416255557120.0, + "grad_norm": 4.606155308293427, + "language_loss": 0.71147579, + "learning_rate": 9.473012427332654e-07, + "loss": 0.73595583, + "num_input_tokens_seen": 246411540, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.20495605, + "step": 11412, + "time_per_iteration": 2.827428102493286 + }, + { + "auxiliary_loss_clip": 0.01413005, + "auxiliary_loss_mlp": 0.01033108, + "balance_loss_clip": 1.24969316, + "balance_loss_mlp": 1.01362967, + "epoch": 0.6861866826995341, + "flos": 11433749927040.0, + "grad_norm": 2.664258490837861, + "language_loss": 0.72852969, + "learning_rate": 9.469701157384919e-07, + "loss": 0.75299084, + "num_input_tokens_seen": 246423295, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19482422, + "step": 11413, + "time_per_iteration": 2.7663071155548096 + }, + { + "auxiliary_loss_clip": 0.01414454, + "auxiliary_loss_mlp": 0.0103403, + "balance_loss_clip": 1.2504977, + "balance_loss_mlp": 1.01456308, + "epoch": 0.686246805952202, + "flos": 16006113325440.0, + "grad_norm": 1.777551643976596, + "language_loss": 0.74144149, + "learning_rate": 9.466390286747164e-07, + "loss": 0.7659263, + "num_input_tokens_seen": 246441045, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19470215, + "step": 11414, + "time_per_iteration": 2.8261961936950684 + }, + { + "auxiliary_loss_clip": 0.01421569, + "auxiliary_loss_mlp": 0.01033779, + "balance_loss_clip": 1.25663614, + "balance_loss_mlp": 1.01396632, + "epoch": 0.68630692920487, + "flos": 19835735783040.0, + "grad_norm": 2.1798679656259847, + "language_loss": 0.87367862, + "learning_rate": 9.46307981554495e-07, + "loss": 0.8982321, + "num_input_tokens_seen": 246456905, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19799805, + "step": 11415, + "time_per_iteration": 2.7976410388946533 + }, + { + "auxiliary_loss_clip": 0.01420121, + "auxiliary_loss_mlp": 0.0103686, + "balance_loss_clip": 1.25367224, + "balance_loss_mlp": 1.01716638, + "epoch": 0.6863670524575379, + "flos": 26297310844800.0, + "grad_norm": 2.9473699952047014, + "language_loss": 0.67895639, + "learning_rate": 9.459769743903801e-07, + "loss": 0.70352614, + "num_input_tokens_seen": 246477545, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.19689941, + "step": 11416, + "time_per_iteration": 2.8731725215911865 + }, + { + "auxiliary_loss_clip": 0.01399716, + "auxiliary_loss_mlp": 0.01035065, + "balance_loss_clip": 1.23839867, + "balance_loss_mlp": 1.01543164, + "epoch": 0.686427175710206, + "flos": 19182815049600.0, + "grad_norm": 1.2901647788251638, + "language_loss": 0.76774132, + "learning_rate": 9.456460071949237e-07, + "loss": 0.7920891, + "num_input_tokens_seen": 246496705, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19628906, + "step": 11417, + "time_per_iteration": 2.835952043533325 + }, + { + "auxiliary_loss_clip": 0.01422048, + "auxiliary_loss_mlp": 0.01032861, + "balance_loss_clip": 1.25745201, + "balance_loss_mlp": 1.01338172, + "epoch": 0.6864872989628739, + "flos": 18925950464640.0, + "grad_norm": 1.7845369982584198, + "language_loss": 0.78678626, + "learning_rate": 9.45315079980678e-07, + "loss": 0.81133533, + "num_input_tokens_seen": 246514860, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19494629, + "step": 11418, + "time_per_iteration": 2.805837631225586 + }, + { + "auxiliary_loss_clip": 0.01421798, + "auxiliary_loss_mlp": 0.01028717, + "balance_loss_clip": 1.25579739, + "balance_loss_mlp": 1.01045418, + "epoch": 0.6865474222155419, + "flos": 25966778446080.0, + "grad_norm": 3.327932352547946, + "language_loss": 0.77192676, + "learning_rate": 9.449841927601887e-07, + "loss": 0.7964319, + "num_input_tokens_seen": 246536145, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.18249512, + "step": 11419, + "time_per_iteration": 2.891226291656494 + }, + { + "auxiliary_loss_clip": 0.01425333, + "auxiliary_loss_mlp": 0.0103738, + "balance_loss_clip": 1.26094007, + "balance_loss_mlp": 1.01784205, + "epoch": 0.6866075454682098, + "flos": 18487110718080.0, + "grad_norm": 1.715498495558561, + "language_loss": 0.71800172, + "learning_rate": 9.446533455460044e-07, + "loss": 0.74262887, + "num_input_tokens_seen": 246553265, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.1953125, + "step": 11420, + "time_per_iteration": 2.7929141521453857 + }, + { + "auxiliary_loss_clip": 0.01416785, + "auxiliary_loss_mlp": 0.01030789, + "balance_loss_clip": 1.25243533, + "balance_loss_mlp": 1.01159596, + "epoch": 0.6866676687208778, + "flos": 34253127296640.0, + "grad_norm": 1.5388659707580634, + "language_loss": 0.75209141, + "learning_rate": 9.443225383506712e-07, + "loss": 0.7765671, + "num_input_tokens_seen": 246575130, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19189453, + "step": 11421, + "time_per_iteration": 5.7138237953186035 + }, + { + "auxiliary_loss_clip": 0.0140632, + "auxiliary_loss_mlp": 0.0103418, + "balance_loss_clip": 1.24578762, + "balance_loss_mlp": 1.01420057, + "epoch": 0.6867277919735457, + "flos": 21730693536000.0, + "grad_norm": 1.9214824542861868, + "language_loss": 0.78082597, + "learning_rate": 9.439917711867338e-07, + "loss": 0.80523092, + "num_input_tokens_seen": 246593095, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.1998291, + "step": 11422, + "time_per_iteration": 2.8612911701202393 + }, + { + "auxiliary_loss_clip": 0.01406479, + "auxiliary_loss_mlp": 0.0103506, + "balance_loss_clip": 1.24356675, + "balance_loss_mlp": 1.01479459, + "epoch": 0.6867879152262137, + "flos": 24108767712000.0, + "grad_norm": 1.9996874949663241, + "language_loss": 0.78082955, + "learning_rate": 9.436610440667334e-07, + "loss": 0.80524492, + "num_input_tokens_seen": 246612165, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.20263672, + "step": 11423, + "time_per_iteration": 2.871936798095703 + }, + { + "auxiliary_loss_clip": 0.01430057, + "auxiliary_loss_mlp": 0.01037032, + "balance_loss_clip": 1.26348114, + "balance_loss_mlp": 1.01683831, + "epoch": 0.6868480384788818, + "flos": 21625643813760.0, + "grad_norm": 1.4340012243197235, + "language_loss": 0.73613453, + "learning_rate": 9.433303570032129e-07, + "loss": 0.76080543, + "num_input_tokens_seen": 246632065, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.20202637, + "step": 11424, + "time_per_iteration": 4.302043199539185 + }, + { + "auxiliary_loss_clip": 0.01414965, + "auxiliary_loss_mlp": 0.01030616, + "balance_loss_clip": 1.24990249, + "balance_loss_mlp": 1.01114917, + "epoch": 0.6869081617315497, + "flos": 26297220355200.0, + "grad_norm": 2.3496310112638397, + "language_loss": 0.6574313, + "learning_rate": 9.429997100087112e-07, + "loss": 0.68188715, + "num_input_tokens_seen": 246651245, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19458008, + "step": 11425, + "time_per_iteration": 2.8721606731414795 + }, + { + "auxiliary_loss_clip": 0.01403667, + "auxiliary_loss_mlp": 0.01033804, + "balance_loss_clip": 1.24218416, + "balance_loss_mlp": 1.01477838, + "epoch": 0.6869682849842177, + "flos": 21114810311040.0, + "grad_norm": 1.4362502166581714, + "language_loss": 0.72112399, + "learning_rate": 9.426691030957657e-07, + "loss": 0.74549872, + "num_input_tokens_seen": 246672225, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19030762, + "step": 11426, + "time_per_iteration": 2.848940849304199 + }, + { + "auxiliary_loss_clip": 0.01422968, + "auxiliary_loss_mlp": 0.0103357, + "balance_loss_clip": 1.25669432, + "balance_loss_mlp": 1.0136385, + "epoch": 0.6870284082368856, + "flos": 17101583879040.0, + "grad_norm": 2.6090214809142624, + "language_loss": 0.86032474, + "learning_rate": 9.423385362769136e-07, + "loss": 0.88489014, + "num_input_tokens_seen": 246688385, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19934082, + "step": 11427, + "time_per_iteration": 2.8583896160125732 + }, + { + "auxiliary_loss_clip": 0.01414956, + "auxiliary_loss_mlp": 0.01034962, + "balance_loss_clip": 1.25237012, + "balance_loss_mlp": 1.01413655, + "epoch": 0.6870885314895536, + "flos": 27319520787840.0, + "grad_norm": 1.4513710202089727, + "language_loss": 0.76885122, + "learning_rate": 9.420080095646909e-07, + "loss": 0.7933504, + "num_input_tokens_seen": 246710730, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.20825195, + "step": 11428, + "time_per_iteration": 2.90556001663208 + }, + { + "auxiliary_loss_clip": 0.01439396, + "auxiliary_loss_mlp": 0.01038684, + "balance_loss_clip": 1.27045488, + "balance_loss_mlp": 1.01781023, + "epoch": 0.6871486547422215, + "flos": 20824527801600.0, + "grad_norm": 1.7754507598131672, + "language_loss": 0.74260628, + "learning_rate": 9.4167752297163e-07, + "loss": 0.76738703, + "num_input_tokens_seen": 246730350, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.20861816, + "step": 11429, + "time_per_iteration": 2.8790361881256104 + }, + { + "auxiliary_loss_clip": 0.01415402, + "auxiliary_loss_mlp": 0.01032012, + "balance_loss_clip": 1.24930525, + "balance_loss_mlp": 1.0129149, + "epoch": 0.6872087779948896, + "flos": 30166323540480.0, + "grad_norm": 1.9537445211606002, + "language_loss": 0.83708566, + "learning_rate": 9.413470765102643e-07, + "loss": 0.86155975, + "num_input_tokens_seen": 246751700, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19104004, + "step": 11430, + "time_per_iteration": 2.9140877723693848 + }, + { + "auxiliary_loss_clip": 0.01410341, + "auxiliary_loss_mlp": 0.01032745, + "balance_loss_clip": 1.24688721, + "balance_loss_mlp": 1.01267028, + "epoch": 0.6872689012475575, + "flos": 20714410661760.0, + "grad_norm": 1.9604863977757747, + "language_loss": 0.71054196, + "learning_rate": 9.410166701931225e-07, + "loss": 0.73497283, + "num_input_tokens_seen": 246769860, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.20068359, + "step": 11431, + "time_per_iteration": 2.8622169494628906 + }, + { + "auxiliary_loss_clip": 0.01415542, + "auxiliary_loss_mlp": 0.01034309, + "balance_loss_clip": 1.25113273, + "balance_loss_mlp": 1.01442444, + "epoch": 0.6873290245002255, + "flos": 25531965486720.0, + "grad_norm": 1.894136916271802, + "language_loss": 0.81182325, + "learning_rate": 9.406863040327355e-07, + "loss": 0.83632171, + "num_input_tokens_seen": 246789905, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.1986084, + "step": 11432, + "time_per_iteration": 2.9425241947174072 + }, + { + "auxiliary_loss_clip": 0.01408554, + "auxiliary_loss_mlp": 0.0103731, + "balance_loss_clip": 1.24834359, + "balance_loss_mlp": 1.01753354, + "epoch": 0.6873891477528934, + "flos": 25202247494400.0, + "grad_norm": 2.0719760836853496, + "language_loss": 0.68694746, + "learning_rate": 9.403559780416295e-07, + "loss": 0.71140611, + "num_input_tokens_seen": 246808815, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19775391, + "step": 11433, + "time_per_iteration": 2.942978858947754 + }, + { + "auxiliary_loss_clip": 0.01419137, + "auxiliary_loss_mlp": 0.01034382, + "balance_loss_clip": 1.25452626, + "balance_loss_mlp": 1.01393771, + "epoch": 0.6874492710055614, + "flos": 35165808282240.0, + "grad_norm": 1.885315769550077, + "language_loss": 0.73621964, + "learning_rate": 9.400256922323309e-07, + "loss": 0.76075488, + "num_input_tokens_seen": 246829775, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20458984, + "step": 11434, + "time_per_iteration": 2.9723289012908936 + }, + { + "auxiliary_loss_clip": 0.01419027, + "auxiliary_loss_mlp": 0.01037795, + "balance_loss_clip": 1.25523448, + "balance_loss_mlp": 1.01794648, + "epoch": 0.6875093942582293, + "flos": 17831294317440.0, + "grad_norm": 1.7365168044879218, + "language_loss": 0.81087482, + "learning_rate": 9.396954466173657e-07, + "loss": 0.83544308, + "num_input_tokens_seen": 246848045, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19848633, + "step": 11435, + "time_per_iteration": 2.819511651992798 + }, + { + "auxiliary_loss_clip": 0.0142133, + "auxiliary_loss_mlp": 0.01033837, + "balance_loss_clip": 1.25414336, + "balance_loss_mlp": 1.01184297, + "epoch": 0.6875695175108973, + "flos": 20714365416960.0, + "grad_norm": 1.9927434697875628, + "language_loss": 0.81620181, + "learning_rate": 9.393652412092538e-07, + "loss": 0.8407535, + "num_input_tokens_seen": 246866095, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.21948242, + "step": 11436, + "time_per_iteration": 2.8404290676116943 + }, + { + "auxiliary_loss_clip": 0.01393354, + "auxiliary_loss_mlp": 0.01034489, + "balance_loss_clip": 1.23554969, + "balance_loss_mlp": 1.01568961, + "epoch": 0.6876296407635654, + "flos": 25384856083200.0, + "grad_norm": 1.7862468802751503, + "language_loss": 0.82633305, + "learning_rate": 9.390350760205183e-07, + "loss": 0.85061151, + "num_input_tokens_seen": 246883975, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.18798828, + "step": 11437, + "time_per_iteration": 2.8769543170928955 + }, + { + "auxiliary_loss_clip": 0.01458965, + "auxiliary_loss_mlp": 0.01037829, + "balance_loss_clip": 1.28490341, + "balance_loss_mlp": 1.01740813, + "epoch": 0.6876897640162333, + "flos": 23232852766080.0, + "grad_norm": 2.516898874743628, + "language_loss": 0.78659666, + "learning_rate": 9.387049510636793e-07, + "loss": 0.81156456, + "num_input_tokens_seen": 246901560, + "router_z_loss_clip": 1.74023438, + "router_z_loss_mlp": 0.20422363, + "step": 11438, + "time_per_iteration": 2.837991714477539 + }, + { + "auxiliary_loss_clip": 0.01397615, + "auxiliary_loss_mlp": 0.01035921, + "balance_loss_clip": 1.23858023, + "balance_loss_mlp": 1.01583445, + "epoch": 0.6877498872689013, + "flos": 27135373875840.0, + "grad_norm": 1.5443776326978516, + "language_loss": 0.73385751, + "learning_rate": 9.383748663512554e-07, + "loss": 0.7581929, + "num_input_tokens_seen": 246922655, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.20080566, + "step": 11439, + "time_per_iteration": 2.954479932785034 + }, + { + "auxiliary_loss_clip": 0.01411794, + "auxiliary_loss_mlp": 0.0103551, + "balance_loss_clip": 1.24912846, + "balance_loss_mlp": 1.01512516, + "epoch": 0.6878100105215692, + "flos": 11588731925760.0, + "grad_norm": 2.1067853640506895, + "language_loss": 0.76213574, + "learning_rate": 9.380448218957623e-07, + "loss": 0.78660882, + "num_input_tokens_seen": 246940100, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.20385742, + "step": 11440, + "time_per_iteration": 2.858375072479248 + }, + { + "auxiliary_loss_clip": 0.01405229, + "auxiliary_loss_mlp": 0.01038004, + "balance_loss_clip": 1.24516082, + "balance_loss_mlp": 1.01767898, + "epoch": 0.6878701337742372, + "flos": 20312879892480.0, + "grad_norm": 1.7464982199734727, + "language_loss": 0.72706294, + "learning_rate": 9.377148177097167e-07, + "loss": 0.75149524, + "num_input_tokens_seen": 246958545, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.20324707, + "step": 11441, + "time_per_iteration": 2.844583034515381 + }, + { + "auxiliary_loss_clip": 0.0142717, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.25780249, + "balance_loss_mlp": 1.01296878, + "epoch": 0.6879302570269051, + "flos": 13846780350720.0, + "grad_norm": 1.742699604484308, + "language_loss": 0.67420954, + "learning_rate": 9.373848538056317e-07, + "loss": 0.69881707, + "num_input_tokens_seen": 246974805, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.20617676, + "step": 11442, + "time_per_iteration": 2.8520867824554443 + }, + { + "auxiliary_loss_clip": 0.01421231, + "auxiliary_loss_mlp": 0.01038119, + "balance_loss_clip": 1.25734007, + "balance_loss_mlp": 1.01874745, + "epoch": 0.6879903802795732, + "flos": 21334592142720.0, + "grad_norm": 9.465640541212256, + "language_loss": 0.7055434, + "learning_rate": 9.370549301960189e-07, + "loss": 0.73013699, + "num_input_tokens_seen": 246992505, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19360352, + "step": 11443, + "time_per_iteration": 2.937373399734497 + }, + { + "auxiliary_loss_clip": 0.01422473, + "auxiliary_loss_mlp": 0.01036633, + "balance_loss_clip": 1.25759912, + "balance_loss_mlp": 1.01572394, + "epoch": 0.6880505035322411, + "flos": 25162042849920.0, + "grad_norm": 1.4573042767276814, + "language_loss": 0.76867545, + "learning_rate": 9.367250468933893e-07, + "loss": 0.79326648, + "num_input_tokens_seen": 247013370, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.20910645, + "step": 11444, + "time_per_iteration": 4.379446506500244 + }, + { + "auxiliary_loss_clip": 0.01406808, + "auxiliary_loss_mlp": 0.01029482, + "balance_loss_clip": 1.24533784, + "balance_loss_mlp": 1.01013458, + "epoch": 0.6881106267849091, + "flos": 23224301498880.0, + "grad_norm": 2.0507362206188393, + "language_loss": 0.77369654, + "learning_rate": 9.363952039102536e-07, + "loss": 0.79805946, + "num_input_tokens_seen": 247029855, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19335938, + "step": 11445, + "time_per_iteration": 2.8553643226623535 + }, + { + "auxiliary_loss_clip": 0.01193045, + "auxiliary_loss_mlp": 0.01031515, + "balance_loss_clip": 1.09947145, + "balance_loss_mlp": 1.01091564, + "epoch": 0.688170750037577, + "flos": 48505103245440.0, + "grad_norm": 0.8256906310845094, + "language_loss": 0.58371955, + "learning_rate": 9.360654012591183e-07, + "loss": 0.60596514, + "num_input_tokens_seen": 247085030, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.20605469, + "step": 11446, + "time_per_iteration": 3.4447128772735596 + }, + { + "auxiliary_loss_clip": 0.01432086, + "auxiliary_loss_mlp": 0.01036229, + "balance_loss_clip": 1.26278996, + "balance_loss_mlp": 1.01614189, + "epoch": 0.688230873290245, + "flos": 22793832040320.0, + "grad_norm": 1.480043388170362, + "language_loss": 0.76028907, + "learning_rate": 9.357356389524886e-07, + "loss": 0.78497225, + "num_input_tokens_seen": 247104840, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.20080566, + "step": 11447, + "time_per_iteration": 2.9029901027679443 + }, + { + "auxiliary_loss_clip": 0.01415265, + "auxiliary_loss_mlp": 0.01031339, + "balance_loss_clip": 1.25057161, + "balance_loss_mlp": 1.01143086, + "epoch": 0.6882909965429129, + "flos": 22465878595200.0, + "grad_norm": 1.929946783971295, + "language_loss": 0.7397123, + "learning_rate": 9.354059170028705e-07, + "loss": 0.76417834, + "num_input_tokens_seen": 247121905, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19909668, + "step": 11448, + "time_per_iteration": 2.8681349754333496 + }, + { + "auxiliary_loss_clip": 0.01431963, + "auxiliary_loss_mlp": 0.01036163, + "balance_loss_clip": 1.26095009, + "balance_loss_mlp": 1.01576591, + "epoch": 0.688351119795581, + "flos": 26225452823040.0, + "grad_norm": 1.7278142835160022, + "language_loss": 0.75615418, + "learning_rate": 9.350762354227673e-07, + "loss": 0.78083551, + "num_input_tokens_seen": 247142375, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.20410156, + "step": 11449, + "time_per_iteration": 2.9387545585632324 + }, + { + "auxiliary_loss_clip": 0.01419625, + "auxiliary_loss_mlp": 0.01038551, + "balance_loss_clip": 1.25581753, + "balance_loss_mlp": 1.01971626, + "epoch": 0.6884112430482489, + "flos": 22575543287040.0, + "grad_norm": 1.8079484956595933, + "language_loss": 0.7086755, + "learning_rate": 9.34746594224679e-07, + "loss": 0.73325729, + "num_input_tokens_seen": 247161095, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.18847656, + "step": 11450, + "time_per_iteration": 2.838945150375366 + }, + { + "auxiliary_loss_clip": 0.01434683, + "auxiliary_loss_mlp": 0.01037182, + "balance_loss_clip": 1.26417983, + "balance_loss_mlp": 1.01766801, + "epoch": 0.6884713663009169, + "flos": 17348223139200.0, + "grad_norm": 2.552228083450065, + "language_loss": 0.77123988, + "learning_rate": 9.344169934211068e-07, + "loss": 0.79595852, + "num_input_tokens_seen": 247178565, + "router_z_loss_clip": 1.70605469, + "router_z_loss_mlp": 0.1953125, + "step": 11451, + "time_per_iteration": 2.855661392211914 + }, + { + "auxiliary_loss_clip": 0.01438604, + "auxiliary_loss_mlp": 0.01034105, + "balance_loss_clip": 1.27227402, + "balance_loss_mlp": 1.01541305, + "epoch": 0.6885314895535849, + "flos": 26482543632000.0, + "grad_norm": 3.6074601365033874, + "language_loss": 0.69334584, + "learning_rate": 9.340874330245505e-07, + "loss": 0.71807289, + "num_input_tokens_seen": 247202345, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.18688965, + "step": 11452, + "time_per_iteration": 2.905358076095581 + }, + { + "auxiliary_loss_clip": 0.01417755, + "auxiliary_loss_mlp": 0.0104207, + "balance_loss_clip": 1.2540319, + "balance_loss_mlp": 1.02048075, + "epoch": 0.6885916128062528, + "flos": 20531168645760.0, + "grad_norm": 1.5889071208768317, + "language_loss": 0.7258296, + "learning_rate": 9.337579130475042e-07, + "loss": 0.75042784, + "num_input_tokens_seen": 247219240, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.21606445, + "step": 11453, + "time_per_iteration": 2.895993232727051 + }, + { + "auxiliary_loss_clip": 0.01197839, + "auxiliary_loss_mlp": 0.01026535, + "balance_loss_clip": 1.10306668, + "balance_loss_mlp": 1.00889242, + "epoch": 0.6886517360589208, + "flos": 70745272940160.0, + "grad_norm": 0.791370476608203, + "language_loss": 0.50627553, + "learning_rate": 9.334284335024644e-07, + "loss": 0.52851927, + "num_input_tokens_seen": 247272010, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.17675781, + "step": 11454, + "time_per_iteration": 3.2211222648620605 + }, + { + "auxiliary_loss_clip": 0.01401461, + "auxiliary_loss_mlp": 0.01037232, + "balance_loss_clip": 1.24435687, + "balance_loss_mlp": 1.0174551, + "epoch": 0.6887118593115887, + "flos": 17902202198400.0, + "grad_norm": 2.216523756908758, + "language_loss": 0.76650625, + "learning_rate": 9.330989944019263e-07, + "loss": 0.7908932, + "num_input_tokens_seen": 247290630, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.19775391, + "step": 11455, + "time_per_iteration": 4.342930793762207 + }, + { + "auxiliary_loss_clip": 0.01424119, + "auxiliary_loss_mlp": 0.01032077, + "balance_loss_clip": 1.25532103, + "balance_loss_mlp": 1.01175177, + "epoch": 0.6887719825642568, + "flos": 17460873987840.0, + "grad_norm": 2.487589292932272, + "language_loss": 0.74023169, + "learning_rate": 9.327695957583803e-07, + "loss": 0.76479363, + "num_input_tokens_seen": 247304800, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.20324707, + "step": 11456, + "time_per_iteration": 4.33600378036499 + }, + { + "auxiliary_loss_clip": 0.01406299, + "auxiliary_loss_mlp": 0.01036438, + "balance_loss_clip": 1.24537146, + "balance_loss_mlp": 1.0170188, + "epoch": 0.6888321058169247, + "flos": 23079092376960.0, + "grad_norm": 1.581828812706011, + "language_loss": 0.81525612, + "learning_rate": 9.32440237584319e-07, + "loss": 0.83968347, + "num_input_tokens_seen": 247323450, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.1940918, + "step": 11457, + "time_per_iteration": 2.86034893989563 + }, + { + "auxiliary_loss_clip": 0.01423471, + "auxiliary_loss_mlp": 0.01037282, + "balance_loss_clip": 1.2569356, + "balance_loss_mlp": 1.01739812, + "epoch": 0.6888922290695927, + "flos": 23379554966400.0, + "grad_norm": 3.8513499491618863, + "language_loss": 0.77070963, + "learning_rate": 9.321109198922301e-07, + "loss": 0.79531717, + "num_input_tokens_seen": 247343845, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.19885254, + "step": 11458, + "time_per_iteration": 2.88289213180542 + }, + { + "auxiliary_loss_clip": 0.01413875, + "auxiliary_loss_mlp": 0.01033921, + "balance_loss_clip": 1.24993682, + "balance_loss_mlp": 1.01390564, + "epoch": 0.6889523523222606, + "flos": 17638550893440.0, + "grad_norm": 2.208074354601784, + "language_loss": 0.68264824, + "learning_rate": 9.31781642694603e-07, + "loss": 0.70712614, + "num_input_tokens_seen": 247356650, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.20019531, + "step": 11459, + "time_per_iteration": 4.207055330276489 + }, + { + "auxiliary_loss_clip": 0.01414837, + "auxiliary_loss_mlp": 0.01035976, + "balance_loss_clip": 1.25095415, + "balance_loss_mlp": 1.01690257, + "epoch": 0.6890124755749286, + "flos": 25238968289280.0, + "grad_norm": 1.940103381934279, + "language_loss": 0.69378811, + "learning_rate": 9.314524060039221e-07, + "loss": 0.71829617, + "num_input_tokens_seen": 247377340, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19067383, + "step": 11460, + "time_per_iteration": 2.886648178100586 + }, + { + "auxiliary_loss_clip": 0.01446431, + "auxiliary_loss_mlp": 0.01034952, + "balance_loss_clip": 1.27329493, + "balance_loss_mlp": 1.01441216, + "epoch": 0.6890725988275965, + "flos": 20239935995520.0, + "grad_norm": 1.7365235566031958, + "language_loss": 0.77682483, + "learning_rate": 9.311232098326731e-07, + "loss": 0.80163872, + "num_input_tokens_seen": 247395805, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.20532227, + "step": 11461, + "time_per_iteration": 2.8734724521636963 + }, + { + "auxiliary_loss_clip": 0.01409199, + "auxiliary_loss_mlp": 0.01038741, + "balance_loss_clip": 1.24572182, + "balance_loss_mlp": 1.01896417, + "epoch": 0.6891327220802645, + "flos": 14542710906240.0, + "grad_norm": 2.1344804064488514, + "language_loss": 0.70971727, + "learning_rate": 9.307940541933401e-07, + "loss": 0.73419666, + "num_input_tokens_seen": 247413165, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19787598, + "step": 11462, + "time_per_iteration": 2.8903589248657227 + }, + { + "auxiliary_loss_clip": 0.0142049, + "auxiliary_loss_mlp": 0.01034772, + "balance_loss_clip": 1.25483322, + "balance_loss_mlp": 1.01482832, + "epoch": 0.6891928453329325, + "flos": 21148228235520.0, + "grad_norm": 1.5419600521956216, + "language_loss": 0.87569976, + "learning_rate": 9.304649390984034e-07, + "loss": 0.90025234, + "num_input_tokens_seen": 247433140, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.19934082, + "step": 11463, + "time_per_iteration": 2.844409942626953 + }, + { + "auxiliary_loss_clip": 0.01403144, + "auxiliary_loss_mlp": 0.01036355, + "balance_loss_clip": 1.2440865, + "balance_loss_mlp": 1.01759124, + "epoch": 0.6892529685856005, + "flos": 17867562664320.0, + "grad_norm": 1.8824791479645122, + "language_loss": 0.69240832, + "learning_rate": 9.301358645603428e-07, + "loss": 0.71680331, + "num_input_tokens_seen": 247451265, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18762207, + "step": 11464, + "time_per_iteration": 2.852980852127075 + }, + { + "auxiliary_loss_clip": 0.01421881, + "auxiliary_loss_mlp": 0.01036742, + "balance_loss_clip": 1.25749278, + "balance_loss_mlp": 1.01747787, + "epoch": 0.6893130918382685, + "flos": 29946858422400.0, + "grad_norm": 1.877675396092746, + "language_loss": 0.65858805, + "learning_rate": 9.298068305916373e-07, + "loss": 0.68317431, + "num_input_tokens_seen": 247471645, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19262695, + "step": 11465, + "time_per_iteration": 2.9475553035736084 + }, + { + "auxiliary_loss_clip": 0.01433739, + "auxiliary_loss_mlp": 0.01035703, + "balance_loss_clip": 1.26457763, + "balance_loss_mlp": 1.01605725, + "epoch": 0.6893732150909364, + "flos": 24399004976640.0, + "grad_norm": 1.381228478429248, + "language_loss": 0.73352861, + "learning_rate": 9.294778372047649e-07, + "loss": 0.75822306, + "num_input_tokens_seen": 247491170, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.1965332, + "step": 11466, + "time_per_iteration": 2.8867552280426025 + }, + { + "auxiliary_loss_clip": 0.01415244, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.25247812, + "balance_loss_mlp": 1.01451886, + "epoch": 0.6894333383436044, + "flos": 16991692963200.0, + "grad_norm": 1.641887869424099, + "language_loss": 0.73157299, + "learning_rate": 9.291488844121995e-07, + "loss": 0.75605595, + "num_input_tokens_seen": 247509005, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.18518066, + "step": 11467, + "time_per_iteration": 2.8365461826324463 + }, + { + "auxiliary_loss_clip": 0.01425248, + "auxiliary_loss_mlp": 0.01038583, + "balance_loss_clip": 1.2566402, + "balance_loss_mlp": 1.01742351, + "epoch": 0.6894934615962723, + "flos": 18993826944000.0, + "grad_norm": 2.0850227159159282, + "language_loss": 0.81967729, + "learning_rate": 9.288199722264156e-07, + "loss": 0.84431565, + "num_input_tokens_seen": 247527050, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.21154785, + "step": 11468, + "time_per_iteration": 2.8566792011260986 + }, + { + "auxiliary_loss_clip": 0.01438342, + "auxiliary_loss_mlp": 0.01035411, + "balance_loss_clip": 1.27060807, + "balance_loss_mlp": 1.01555085, + "epoch": 0.6895535848489404, + "flos": 34544812394880.0, + "grad_norm": 1.6383637453091446, + "language_loss": 0.6677351, + "learning_rate": 9.284911006598875e-07, + "loss": 0.69247264, + "num_input_tokens_seen": 247547765, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.19836426, + "step": 11469, + "time_per_iteration": 2.9751250743865967 + }, + { + "auxiliary_loss_clip": 0.01196261, + "auxiliary_loss_mlp": 0.01027665, + "balance_loss_clip": 1.10245728, + "balance_loss_mlp": 1.00334585, + "epoch": 0.6896137081016083, + "flos": 50102828772480.0, + "grad_norm": 0.8016572162458612, + "language_loss": 0.55194318, + "learning_rate": 9.281622697250824e-07, + "loss": 0.57418245, + "num_input_tokens_seen": 247603515, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.24316406, + "step": 11470, + "time_per_iteration": 3.259658098220825 + }, + { + "auxiliary_loss_clip": 0.01417874, + "auxiliary_loss_mlp": 0.01035392, + "balance_loss_clip": 1.25548768, + "balance_loss_mlp": 1.0183928, + "epoch": 0.6896738313542763, + "flos": 19947798449280.0, + "grad_norm": 1.643029044928726, + "language_loss": 0.78992152, + "learning_rate": 9.278334794344715e-07, + "loss": 0.8144542, + "num_input_tokens_seen": 247622110, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.17004395, + "step": 11471, + "time_per_iteration": 2.8288698196411133 + }, + { + "auxiliary_loss_clip": 0.01417747, + "auxiliary_loss_mlp": 0.01034887, + "balance_loss_clip": 1.25425148, + "balance_loss_mlp": 1.01564622, + "epoch": 0.6897339546069442, + "flos": 21735398995200.0, + "grad_norm": 1.8185206373165055, + "language_loss": 0.79388702, + "learning_rate": 9.275047298005232e-07, + "loss": 0.81841338, + "num_input_tokens_seen": 247641905, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19250488, + "step": 11472, + "time_per_iteration": 2.8700990676879883 + }, + { + "auxiliary_loss_clip": 0.01416512, + "auxiliary_loss_mlp": 0.01035295, + "balance_loss_clip": 1.25292611, + "balance_loss_mlp": 1.01690102, + "epoch": 0.6897940778596122, + "flos": 19835464314240.0, + "grad_norm": 1.7148096304408054, + "language_loss": 0.76183617, + "learning_rate": 9.271760208357024e-07, + "loss": 0.78635418, + "num_input_tokens_seen": 247660945, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.18383789, + "step": 11473, + "time_per_iteration": 2.886295795440674 + }, + { + "auxiliary_loss_clip": 0.01422748, + "auxiliary_loss_mlp": 0.01033073, + "balance_loss_clip": 1.25565577, + "balance_loss_mlp": 1.01405942, + "epoch": 0.6898542011122801, + "flos": 17318651022720.0, + "grad_norm": 1.8900747504141944, + "language_loss": 0.76621515, + "learning_rate": 9.268473525524751e-07, + "loss": 0.79077339, + "num_input_tokens_seen": 247678395, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.19030762, + "step": 11474, + "time_per_iteration": 2.8892338275909424 + }, + { + "auxiliary_loss_clip": 0.01425176, + "auxiliary_loss_mlp": 0.01036802, + "balance_loss_clip": 1.2596606, + "balance_loss_mlp": 1.01667905, + "epoch": 0.6899143243649482, + "flos": 24764810336640.0, + "grad_norm": 2.739696129090592, + "language_loss": 0.75623763, + "learning_rate": 9.26518724963303e-07, + "loss": 0.78085744, + "num_input_tokens_seen": 247698380, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.2010498, + "step": 11475, + "time_per_iteration": 2.869035482406616 + }, + { + "auxiliary_loss_clip": 0.01412416, + "auxiliary_loss_mlp": 0.01037395, + "balance_loss_clip": 1.24987221, + "balance_loss_mlp": 1.01731992, + "epoch": 0.6899744476176161, + "flos": 17242449500160.0, + "grad_norm": 2.455696143370247, + "language_loss": 0.89666349, + "learning_rate": 9.261901380806491e-07, + "loss": 0.92116159, + "num_input_tokens_seen": 247716370, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.20068359, + "step": 11476, + "time_per_iteration": 2.972435235977173 + }, + { + "auxiliary_loss_clip": 0.01403671, + "auxiliary_loss_mlp": 0.01038071, + "balance_loss_clip": 1.24268508, + "balance_loss_mlp": 1.01798439, + "epoch": 0.6900345708702841, + "flos": 25421576878080.0, + "grad_norm": 1.3680420303569714, + "language_loss": 0.70747238, + "learning_rate": 9.258615919169724e-07, + "loss": 0.73188984, + "num_input_tokens_seen": 247737335, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.20092773, + "step": 11477, + "time_per_iteration": 2.9140870571136475 + }, + { + "auxiliary_loss_clip": 0.01434833, + "auxiliary_loss_mlp": 0.01036735, + "balance_loss_clip": 1.26677549, + "balance_loss_mlp": 1.01686287, + "epoch": 0.6900946941229521, + "flos": 23442952210560.0, + "grad_norm": 2.604171037995578, + "language_loss": 0.69121814, + "learning_rate": 9.255330864847313e-07, + "loss": 0.7159338, + "num_input_tokens_seen": 247756680, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.19873047, + "step": 11478, + "time_per_iteration": 2.873448371887207 + }, + { + "auxiliary_loss_clip": 0.01425507, + "auxiliary_loss_mlp": 0.01036453, + "balance_loss_clip": 1.26045954, + "balance_loss_mlp": 1.01706982, + "epoch": 0.69015481737562, + "flos": 17828624874240.0, + "grad_norm": 3.0850416853522753, + "language_loss": 0.7688185, + "learning_rate": 9.252046217963843e-07, + "loss": 0.79343808, + "num_input_tokens_seen": 247774265, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19360352, + "step": 11479, + "time_per_iteration": 2.8568553924560547 + }, + { + "auxiliary_loss_clip": 0.01425752, + "auxiliary_loss_mlp": 0.0103437, + "balance_loss_clip": 1.25863659, + "balance_loss_mlp": 1.01486731, + "epoch": 0.690214940628288, + "flos": 17465262733440.0, + "grad_norm": 1.6370902301624772, + "language_loss": 0.79747081, + "learning_rate": 9.248761978643856e-07, + "loss": 0.82207209, + "num_input_tokens_seen": 247792395, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.19506836, + "step": 11480, + "time_per_iteration": 4.222980976104736 + }, + { + "auxiliary_loss_clip": 0.01411946, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.24879932, + "balance_loss_mlp": 1.01251626, + "epoch": 0.6902750638809559, + "flos": 29577795436800.0, + "grad_norm": 1.5494769463191953, + "language_loss": 0.75867736, + "learning_rate": 9.245478147011885e-07, + "loss": 0.78311974, + "num_input_tokens_seen": 247811985, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.19787598, + "step": 11481, + "time_per_iteration": 2.919602870941162 + }, + { + "auxiliary_loss_clip": 0.01411984, + "auxiliary_loss_mlp": 0.01030519, + "balance_loss_clip": 1.25016999, + "balance_loss_mlp": 1.01076639, + "epoch": 0.690335187133624, + "flos": 25568188588800.0, + "grad_norm": 1.8308437893742748, + "language_loss": 0.70403075, + "learning_rate": 9.24219472319246e-07, + "loss": 0.72845578, + "num_input_tokens_seen": 247831880, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19763184, + "step": 11482, + "time_per_iteration": 2.9909377098083496 + }, + { + "auxiliary_loss_clip": 0.01422336, + "auxiliary_loss_mlp": 0.01032855, + "balance_loss_clip": 1.25682449, + "balance_loss_mlp": 1.01309061, + "epoch": 0.6903953103862919, + "flos": 22497712951680.0, + "grad_norm": 1.7183319178861016, + "language_loss": 0.83213073, + "learning_rate": 9.238911707310096e-07, + "loss": 0.85668266, + "num_input_tokens_seen": 247851170, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19775391, + "step": 11483, + "time_per_iteration": 2.898716688156128 + }, + { + "auxiliary_loss_clip": 0.01429648, + "auxiliary_loss_mlp": 0.01033375, + "balance_loss_clip": 1.26322508, + "balance_loss_mlp": 1.01432502, + "epoch": 0.6904554336389599, + "flos": 26110720713600.0, + "grad_norm": 1.827642030192868, + "language_loss": 0.66744953, + "learning_rate": 9.235629099489273e-07, + "loss": 0.69207978, + "num_input_tokens_seen": 247868950, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.19042969, + "step": 11484, + "time_per_iteration": 2.9045732021331787 + }, + { + "auxiliary_loss_clip": 0.01413459, + "auxiliary_loss_mlp": 0.01035404, + "balance_loss_clip": 1.25050211, + "balance_loss_mlp": 1.01512623, + "epoch": 0.6905155568916278, + "flos": 31183194597120.0, + "grad_norm": 1.451736122530799, + "language_loss": 0.74110901, + "learning_rate": 9.232346899854479e-07, + "loss": 0.76559758, + "num_input_tokens_seen": 247889805, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.20288086, + "step": 11485, + "time_per_iteration": 2.9417450428009033 + }, + { + "auxiliary_loss_clip": 0.01422432, + "auxiliary_loss_mlp": 0.01034259, + "balance_loss_clip": 1.25569272, + "balance_loss_mlp": 1.01451778, + "epoch": 0.6905756801442958, + "flos": 17648685728640.0, + "grad_norm": 1.7150928143543336, + "language_loss": 0.85963172, + "learning_rate": 9.22906510853017e-07, + "loss": 0.88419867, + "num_input_tokens_seen": 247908585, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.19726562, + "step": 11486, + "time_per_iteration": 2.8780910968780518 + }, + { + "auxiliary_loss_clip": 0.01428404, + "auxiliary_loss_mlp": 0.01038116, + "balance_loss_clip": 1.26383901, + "balance_loss_mlp": 1.01798117, + "epoch": 0.6906358033969637, + "flos": 22353227746560.0, + "grad_norm": 1.4816921104649559, + "language_loss": 0.73479903, + "learning_rate": 9.225783725640786e-07, + "loss": 0.7594642, + "num_input_tokens_seen": 247928480, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.20129395, + "step": 11487, + "time_per_iteration": 2.8461549282073975 + }, + { + "auxiliary_loss_clip": 0.01192356, + "auxiliary_loss_mlp": 0.01028794, + "balance_loss_clip": 1.10023618, + "balance_loss_mlp": 1.01296282, + "epoch": 0.6906959266496318, + "flos": 69781456051200.0, + "grad_norm": 0.9085412171193092, + "language_loss": 0.6676228, + "learning_rate": 9.222502751310759e-07, + "loss": 0.68983436, + "num_input_tokens_seen": 247988855, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.15820312, + "step": 11488, + "time_per_iteration": 3.430896043777466 + }, + { + "auxiliary_loss_clip": 0.01434725, + "auxiliary_loss_mlp": 0.01035691, + "balance_loss_clip": 1.2638278, + "balance_loss_mlp": 1.01446021, + "epoch": 0.6907560499022997, + "flos": 21444256834560.0, + "grad_norm": 1.7887823726769068, + "language_loss": 0.75670117, + "learning_rate": 9.219222185664519e-07, + "loss": 0.78140533, + "num_input_tokens_seen": 248007685, + "router_z_loss_clip": 1.70898438, + "router_z_loss_mlp": 0.21228027, + "step": 11489, + "time_per_iteration": 2.8884353637695312 + }, + { + "auxiliary_loss_clip": 0.01422959, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.25693786, + "balance_loss_mlp": 1.01368487, + "epoch": 0.6908161731549677, + "flos": 14400306961920.0, + "grad_norm": 2.4483266871402787, + "language_loss": 0.63156998, + "learning_rate": 9.215942028826445e-07, + "loss": 0.6561389, + "num_input_tokens_seen": 248025145, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.20251465, + "step": 11490, + "time_per_iteration": 2.8524436950683594 + }, + { + "auxiliary_loss_clip": 0.01424642, + "auxiliary_loss_mlp": 0.01039608, + "balance_loss_clip": 1.25988007, + "balance_loss_mlp": 1.02041554, + "epoch": 0.6908762964076357, + "flos": 20020516122240.0, + "grad_norm": 1.6966380347488528, + "language_loss": 0.73350829, + "learning_rate": 9.212662280920937e-07, + "loss": 0.75815082, + "num_input_tokens_seen": 248043750, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.19189453, + "step": 11491, + "time_per_iteration": 4.285810708999634 + }, + { + "auxiliary_loss_clip": 0.0140708, + "auxiliary_loss_mlp": 0.01039282, + "balance_loss_clip": 1.24444604, + "balance_loss_mlp": 1.01956439, + "epoch": 0.6909364196603036, + "flos": 28781294394240.0, + "grad_norm": 1.5839146341973016, + "language_loss": 0.70703357, + "learning_rate": 9.20938294207235e-07, + "loss": 0.73149729, + "num_input_tokens_seen": 248065765, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19714355, + "step": 11492, + "time_per_iteration": 2.9245502948760986 + }, + { + "auxiliary_loss_clip": 0.01439922, + "auxiliary_loss_mlp": 0.01034971, + "balance_loss_clip": 1.26869774, + "balance_loss_mlp": 1.01469374, + "epoch": 0.6909965429129716, + "flos": 22538234309760.0, + "grad_norm": 1.8900265630439628, + "language_loss": 0.75072229, + "learning_rate": 9.206104012405049e-07, + "loss": 0.77547121, + "num_input_tokens_seen": 248083810, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.20263672, + "step": 11493, + "time_per_iteration": 2.8609297275543213 + }, + { + "auxiliary_loss_clip": 0.01416907, + "auxiliary_loss_mlp": 0.01031095, + "balance_loss_clip": 1.2539438, + "balance_loss_mlp": 1.01152086, + "epoch": 0.6910566661656395, + "flos": 18415162206720.0, + "grad_norm": 2.6639915784882002, + "language_loss": 0.75337625, + "learning_rate": 9.20282549204336e-07, + "loss": 0.77785623, + "num_input_tokens_seen": 248103185, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19567871, + "step": 11494, + "time_per_iteration": 4.247552871704102 + }, + { + "auxiliary_loss_clip": 0.01414452, + "auxiliary_loss_mlp": 0.01035345, + "balance_loss_clip": 1.25163484, + "balance_loss_mlp": 1.01614082, + "epoch": 0.6911167894183076, + "flos": 30786233552640.0, + "grad_norm": 1.7982988022462836, + "language_loss": 0.6875689, + "learning_rate": 9.19954738111161e-07, + "loss": 0.71206689, + "num_input_tokens_seen": 248125665, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19226074, + "step": 11495, + "time_per_iteration": 2.9460017681121826 + }, + { + "auxiliary_loss_clip": 0.01418877, + "auxiliary_loss_mlp": 0.01032583, + "balance_loss_clip": 1.25440109, + "balance_loss_mlp": 1.01238942, + "epoch": 0.6911769126709755, + "flos": 13743721399680.0, + "grad_norm": 1.7015774997540176, + "language_loss": 0.7443217, + "learning_rate": 9.196269679734119e-07, + "loss": 0.76883632, + "num_input_tokens_seen": 248142545, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.2019043, + "step": 11496, + "time_per_iteration": 2.8511016368865967 + }, + { + "auxiliary_loss_clip": 0.01411692, + "auxiliary_loss_mlp": 0.01034689, + "balance_loss_clip": 1.24856687, + "balance_loss_mlp": 1.01522207, + "epoch": 0.6912370359236435, + "flos": 17575877566080.0, + "grad_norm": 2.1807626533664477, + "language_loss": 0.8087281, + "learning_rate": 9.19299238803515e-07, + "loss": 0.83319187, + "num_input_tokens_seen": 248160225, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19470215, + "step": 11497, + "time_per_iteration": 2.849093198776245 + }, + { + "auxiliary_loss_clip": 0.01442545, + "auxiliary_loss_mlp": 0.01040962, + "balance_loss_clip": 1.27411556, + "balance_loss_mlp": 1.0209583, + "epoch": 0.6912971591763114, + "flos": 22100887641600.0, + "grad_norm": 1.6630973593936271, + "language_loss": 0.81303293, + "learning_rate": 9.189715506138993e-07, + "loss": 0.83786798, + "num_input_tokens_seen": 248180430, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.19995117, + "step": 11498, + "time_per_iteration": 2.858480930328369 + }, + { + "auxiliary_loss_clip": 0.01412634, + "auxiliary_loss_mlp": 0.01034731, + "balance_loss_clip": 1.25133586, + "balance_loss_mlp": 1.01526451, + "epoch": 0.6913572824289794, + "flos": 29983579217280.0, + "grad_norm": 1.437489176500461, + "language_loss": 0.86470455, + "learning_rate": 9.186439034169915e-07, + "loss": 0.88917816, + "num_input_tokens_seen": 248202365, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19482422, + "step": 11499, + "time_per_iteration": 2.9318525791168213 + }, + { + "auxiliary_loss_clip": 0.01413351, + "auxiliary_loss_mlp": 0.01035674, + "balance_loss_clip": 1.25205779, + "balance_loss_mlp": 1.01544476, + "epoch": 0.6914174056816473, + "flos": 20458631952000.0, + "grad_norm": 1.5401437617659615, + "language_loss": 0.76227963, + "learning_rate": 9.183162972252145e-07, + "loss": 0.78676987, + "num_input_tokens_seen": 248221750, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.20214844, + "step": 11500, + "time_per_iteration": 2.8698318004608154 + }, + { + "auxiliary_loss_clip": 0.01410302, + "auxiliary_loss_mlp": 0.01032383, + "balance_loss_clip": 1.24669898, + "balance_loss_mlp": 1.01264215, + "epoch": 0.6914775289343154, + "flos": 21290994138240.0, + "grad_norm": 1.8487118461668168, + "language_loss": 0.78360963, + "learning_rate": 9.179887320509921e-07, + "loss": 0.80803645, + "num_input_tokens_seen": 248239535, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19750977, + "step": 11501, + "time_per_iteration": 2.855605125427246 + }, + { + "auxiliary_loss_clip": 0.01431971, + "auxiliary_loss_mlp": 0.01036526, + "balance_loss_clip": 1.26463759, + "balance_loss_mlp": 1.01599789, + "epoch": 0.6915376521869833, + "flos": 23888578677120.0, + "grad_norm": 1.950313524832114, + "language_loss": 0.7464937, + "learning_rate": 9.176612079067458e-07, + "loss": 0.77117866, + "num_input_tokens_seen": 248259055, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.20532227, + "step": 11502, + "time_per_iteration": 2.9430646896362305 + }, + { + "auxiliary_loss_clip": 0.01422544, + "auxiliary_loss_mlp": 0.01034666, + "balance_loss_clip": 1.2562263, + "balance_loss_mlp": 1.01474643, + "epoch": 0.6915977754396513, + "flos": 11517733555200.0, + "grad_norm": 1.9267561851167225, + "language_loss": 0.7478534, + "learning_rate": 9.173337248048953e-07, + "loss": 0.77242547, + "num_input_tokens_seen": 248276765, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.19897461, + "step": 11503, + "time_per_iteration": 2.878425359725952 + }, + { + "auxiliary_loss_clip": 0.01410026, + "auxiliary_loss_mlp": 0.01036037, + "balance_loss_clip": 1.2461406, + "balance_loss_mlp": 1.01581931, + "epoch": 0.6916578986923193, + "flos": 22611449675520.0, + "grad_norm": 1.665251067969717, + "language_loss": 0.77711236, + "learning_rate": 9.170062827578575e-07, + "loss": 0.80157304, + "num_input_tokens_seen": 248295310, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.20227051, + "step": 11504, + "time_per_iteration": 2.8435680866241455 + }, + { + "auxiliary_loss_clip": 0.01427026, + "auxiliary_loss_mlp": 0.01033985, + "balance_loss_clip": 1.26119685, + "balance_loss_mlp": 1.01393414, + "epoch": 0.6917180219449872, + "flos": 23487862314240.0, + "grad_norm": 1.633742850568159, + "language_loss": 0.74890661, + "learning_rate": 9.166788817780499e-07, + "loss": 0.77351671, + "num_input_tokens_seen": 248315230, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.20068359, + "step": 11505, + "time_per_iteration": 2.9157793521881104 + }, + { + "auxiliary_loss_clip": 0.01407985, + "auxiliary_loss_mlp": 0.01039809, + "balance_loss_clip": 1.24547231, + "balance_loss_mlp": 1.01863742, + "epoch": 0.6917781451976552, + "flos": 23743052841600.0, + "grad_norm": 1.9231582935462288, + "language_loss": 0.88382256, + "learning_rate": 9.163515218778886e-07, + "loss": 0.90830052, + "num_input_tokens_seen": 248332980, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.21166992, + "step": 11506, + "time_per_iteration": 2.9012980461120605 + }, + { + "auxiliary_loss_clip": 0.01423405, + "auxiliary_loss_mlp": 0.01029078, + "balance_loss_clip": 1.2603333, + "balance_loss_mlp": 1.00982523, + "epoch": 0.6918382684503231, + "flos": 31479087461760.0, + "grad_norm": 2.4652077092913856, + "language_loss": 0.71464038, + "learning_rate": 9.160242030697856e-07, + "loss": 0.73916513, + "num_input_tokens_seen": 248352865, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19226074, + "step": 11507, + "time_per_iteration": 2.94651460647583 + }, + { + "auxiliary_loss_clip": 0.01414103, + "auxiliary_loss_mlp": 0.01036278, + "balance_loss_clip": 1.24792552, + "balance_loss_mlp": 1.01653695, + "epoch": 0.6918983917029912, + "flos": 21659830899840.0, + "grad_norm": 1.8109641794310387, + "language_loss": 0.7775116, + "learning_rate": 9.156969253661538e-07, + "loss": 0.80201542, + "num_input_tokens_seen": 248371125, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.1973877, + "step": 11508, + "time_per_iteration": 2.8526206016540527 + }, + { + "auxiliary_loss_clip": 0.01402263, + "auxiliary_loss_mlp": 0.0103413, + "balance_loss_clip": 1.24267399, + "balance_loss_mlp": 1.01554513, + "epoch": 0.6919585149556591, + "flos": 25559320608000.0, + "grad_norm": 1.5726520508848267, + "language_loss": 0.75539982, + "learning_rate": 9.153696887794027e-07, + "loss": 0.77976376, + "num_input_tokens_seen": 248390455, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18579102, + "step": 11509, + "time_per_iteration": 2.9178733825683594 + }, + { + "auxiliary_loss_clip": 0.01412375, + "auxiliary_loss_mlp": 0.01031044, + "balance_loss_clip": 1.25117087, + "balance_loss_mlp": 1.01157713, + "epoch": 0.6920186382083271, + "flos": 23670335168640.0, + "grad_norm": 1.584852631886142, + "language_loss": 0.64949286, + "learning_rate": 9.150424933219425e-07, + "loss": 0.67392701, + "num_input_tokens_seen": 248411305, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19482422, + "step": 11510, + "time_per_iteration": 2.8686139583587646 + }, + { + "auxiliary_loss_clip": 0.01436568, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.26673269, + "balance_loss_mlp": 1.01367903, + "epoch": 0.692078761460995, + "flos": 19071159586560.0, + "grad_norm": 1.7155852493044226, + "language_loss": 0.76367795, + "learning_rate": 9.147153390061788e-07, + "loss": 0.78838587, + "num_input_tokens_seen": 248430190, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.20544434, + "step": 11511, + "time_per_iteration": 2.8608882427215576 + }, + { + "auxiliary_loss_clip": 0.0141611, + "auxiliary_loss_mlp": 0.01036444, + "balance_loss_clip": 1.25467062, + "balance_loss_mlp": 1.01679814, + "epoch": 0.692138884713663, + "flos": 29035263312000.0, + "grad_norm": 1.460031943751676, + "language_loss": 0.63010156, + "learning_rate": 9.143882258445184e-07, + "loss": 0.65462708, + "num_input_tokens_seen": 248450830, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19641113, + "step": 11512, + "time_per_iteration": 2.939884901046753 + }, + { + "auxiliary_loss_clip": 0.01422611, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.25752854, + "balance_loss_mlp": 1.01243877, + "epoch": 0.6921990079663309, + "flos": 14766248056320.0, + "grad_norm": 1.7153673109183003, + "language_loss": 0.83424389, + "learning_rate": 9.140611538493666e-07, + "loss": 0.85880697, + "num_input_tokens_seen": 248468585, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.21252441, + "step": 11513, + "time_per_iteration": 2.8554141521453857 + }, + { + "auxiliary_loss_clip": 0.01409229, + "auxiliary_loss_mlp": 0.01032987, + "balance_loss_clip": 1.24781132, + "balance_loss_mlp": 1.01410425, + "epoch": 0.692259131218999, + "flos": 23852355575040.0, + "grad_norm": 1.4766165990608944, + "language_loss": 0.79045165, + "learning_rate": 9.137341230331233e-07, + "loss": 0.81487381, + "num_input_tokens_seen": 248490535, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18908691, + "step": 11514, + "time_per_iteration": 2.9367685317993164 + }, + { + "auxiliary_loss_clip": 0.01432773, + "auxiliary_loss_mlp": 0.01035179, + "balance_loss_clip": 1.26368988, + "balance_loss_mlp": 1.01667798, + "epoch": 0.6923192544716669, + "flos": 19144329707520.0, + "grad_norm": 2.4254410388687666, + "language_loss": 0.76046562, + "learning_rate": 9.134071334081907e-07, + "loss": 0.78514516, + "num_input_tokens_seen": 248508575, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.18505859, + "step": 11515, + "time_per_iteration": 4.238208293914795 + }, + { + "auxiliary_loss_clip": 0.01407129, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.24721622, + "balance_loss_mlp": 1.01507187, + "epoch": 0.6923793777243349, + "flos": 28086359224320.0, + "grad_norm": 3.835398272501269, + "language_loss": 0.54029846, + "learning_rate": 9.130801849869694e-07, + "loss": 0.56471133, + "num_input_tokens_seen": 248527025, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.1907959, + "step": 11516, + "time_per_iteration": 2.883131742477417 + }, + { + "auxiliary_loss_clip": 0.0139416, + "auxiliary_loss_mlp": 0.01036557, + "balance_loss_clip": 1.23863995, + "balance_loss_mlp": 1.01565993, + "epoch": 0.6924395009770029, + "flos": 16589890725120.0, + "grad_norm": 1.6606665628964752, + "language_loss": 0.73663795, + "learning_rate": 9.127532777818557e-07, + "loss": 0.76094508, + "num_input_tokens_seen": 248544275, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.20874023, + "step": 11517, + "time_per_iteration": 2.859261989593506 + }, + { + "auxiliary_loss_clip": 0.0141775, + "auxiliary_loss_mlp": 0.01037971, + "balance_loss_clip": 1.25177467, + "balance_loss_mlp": 1.01747918, + "epoch": 0.6924996242296708, + "flos": 16664237210880.0, + "grad_norm": 2.220289450751826, + "language_loss": 0.77055717, + "learning_rate": 9.124264118052465e-07, + "loss": 0.7951144, + "num_input_tokens_seen": 248561870, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.20495605, + "step": 11518, + "time_per_iteration": 2.8119728565216064 + }, + { + "auxiliary_loss_clip": 0.01419768, + "auxiliary_loss_mlp": 0.01035167, + "balance_loss_clip": 1.25250494, + "balance_loss_mlp": 1.01488996, + "epoch": 0.6925597474823388, + "flos": 34768304300160.0, + "grad_norm": 1.5654027942273196, + "language_loss": 0.65115154, + "learning_rate": 9.120995870695376e-07, + "loss": 0.6757009, + "num_input_tokens_seen": 248588190, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.20300293, + "step": 11519, + "time_per_iteration": 3.1147053241729736 + }, + { + "auxiliary_loss_clip": 0.01412039, + "auxiliary_loss_mlp": 0.01035499, + "balance_loss_clip": 1.24823368, + "balance_loss_mlp": 1.01577032, + "epoch": 0.6926198707350067, + "flos": 21881739237120.0, + "grad_norm": 1.8652921045561242, + "language_loss": 0.63665485, + "learning_rate": 9.117728035871212e-07, + "loss": 0.66113025, + "num_input_tokens_seen": 248606460, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19714355, + "step": 11520, + "time_per_iteration": 2.856724739074707 + }, + { + "auxiliary_loss_clip": 0.01428194, + "auxiliary_loss_mlp": 0.01039615, + "balance_loss_clip": 1.25764477, + "balance_loss_mlp": 1.01925445, + "epoch": 0.6926799939876748, + "flos": 13014146695680.0, + "grad_norm": 1.94453896916085, + "language_loss": 0.78638542, + "learning_rate": 9.114460613703887e-07, + "loss": 0.81106347, + "num_input_tokens_seen": 248623715, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.20361328, + "step": 11521, + "time_per_iteration": 2.840867757797241 + }, + { + "auxiliary_loss_clip": 0.01421952, + "auxiliary_loss_mlp": 0.01035941, + "balance_loss_clip": 1.25470459, + "balance_loss_mlp": 1.01569915, + "epoch": 0.6927401172403427, + "flos": 16769286933120.0, + "grad_norm": 2.642113157457962, + "language_loss": 0.82672524, + "learning_rate": 9.111193604317304e-07, + "loss": 0.85130417, + "num_input_tokens_seen": 248640575, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.20251465, + "step": 11522, + "time_per_iteration": 2.8892040252685547 + }, + { + "auxiliary_loss_clip": 0.01411317, + "auxiliary_loss_mlp": 0.01035924, + "balance_loss_clip": 1.24941134, + "balance_loss_mlp": 1.01646888, + "epoch": 0.6928002404930107, + "flos": 25717424497920.0, + "grad_norm": 4.596860724422841, + "language_loss": 0.77193308, + "learning_rate": 9.107927007835361e-07, + "loss": 0.79640543, + "num_input_tokens_seen": 248663535, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19458008, + "step": 11523, + "time_per_iteration": 2.9565186500549316 + }, + { + "auxiliary_loss_clip": 0.01409531, + "auxiliary_loss_mlp": 0.01034404, + "balance_loss_clip": 1.24894953, + "balance_loss_mlp": 1.01493716, + "epoch": 0.6928603637456786, + "flos": 18597499326720.0, + "grad_norm": 2.0026078095730684, + "language_loss": 0.69324327, + "learning_rate": 9.104660824381915e-07, + "loss": 0.7176826, + "num_input_tokens_seen": 248681125, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19482422, + "step": 11524, + "time_per_iteration": 2.8525989055633545 + }, + { + "auxiliary_loss_clip": 0.01448712, + "auxiliary_loss_mlp": 0.0103876, + "balance_loss_clip": 1.27844167, + "balance_loss_mlp": 1.01786244, + "epoch": 0.6929204869983466, + "flos": 22211185760640.0, + "grad_norm": 1.696181760597718, + "language_loss": 0.65454435, + "learning_rate": 9.101395054080815e-07, + "loss": 0.67941904, + "num_input_tokens_seen": 248700555, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.2088623, + "step": 11525, + "time_per_iteration": 4.511396169662476 + }, + { + "auxiliary_loss_clip": 0.01414946, + "auxiliary_loss_mlp": 0.01036051, + "balance_loss_clip": 1.25217628, + "balance_loss_mlp": 1.01695323, + "epoch": 0.6929806102510145, + "flos": 17903740521600.0, + "grad_norm": 6.671254979082253, + "language_loss": 0.70550275, + "learning_rate": 9.098129697055907e-07, + "loss": 0.73001266, + "num_input_tokens_seen": 248716095, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19116211, + "step": 11526, + "time_per_iteration": 4.2730138301849365 + }, + { + "auxiliary_loss_clip": 0.01420849, + "auxiliary_loss_mlp": 0.0103485, + "balance_loss_clip": 1.25834346, + "balance_loss_mlp": 1.01570511, + "epoch": 0.6930407335036826, + "flos": 19764556433280.0, + "grad_norm": 3.6940942231272533, + "language_loss": 0.77337497, + "learning_rate": 9.094864753431022e-07, + "loss": 0.79793197, + "num_input_tokens_seen": 248735330, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19116211, + "step": 11527, + "time_per_iteration": 2.8586692810058594 + }, + { + "auxiliary_loss_clip": 0.01421883, + "auxiliary_loss_mlp": 0.01033573, + "balance_loss_clip": 1.25806952, + "balance_loss_mlp": 1.01491714, + "epoch": 0.6931008567563505, + "flos": 21554419219200.0, + "grad_norm": 1.6003826348188137, + "language_loss": 0.79996765, + "learning_rate": 9.091600223329952e-07, + "loss": 0.82452226, + "num_input_tokens_seen": 248754530, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.18676758, + "step": 11528, + "time_per_iteration": 2.8595242500305176 + }, + { + "auxiliary_loss_clip": 0.01398813, + "auxiliary_loss_mlp": 0.01035321, + "balance_loss_clip": 1.24055183, + "balance_loss_mlp": 1.01589036, + "epoch": 0.6931609800090185, + "flos": 26261178232320.0, + "grad_norm": 1.4328690850478691, + "language_loss": 0.7617197, + "learning_rate": 9.088336106876491e-07, + "loss": 0.78606105, + "num_input_tokens_seen": 248775825, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.19458008, + "step": 11529, + "time_per_iteration": 4.297672510147095 + }, + { + "auxiliary_loss_clip": 0.01406404, + "auxiliary_loss_mlp": 0.01034434, + "balance_loss_clip": 1.24655402, + "balance_loss_mlp": 1.01562285, + "epoch": 0.6932211032616865, + "flos": 32356178772480.0, + "grad_norm": 1.6421911408478045, + "language_loss": 0.72885263, + "learning_rate": 9.085072404194436e-07, + "loss": 0.75326103, + "num_input_tokens_seen": 248796180, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18811035, + "step": 11530, + "time_per_iteration": 2.942888021469116 + }, + { + "auxiliary_loss_clip": 0.0143876, + "auxiliary_loss_mlp": 0.0103902, + "balance_loss_clip": 1.26934695, + "balance_loss_mlp": 1.01905251, + "epoch": 0.6932812265143544, + "flos": 22057832574720.0, + "grad_norm": 1.6443777603459018, + "language_loss": 0.78956223, + "learning_rate": 9.081809115407513e-07, + "loss": 0.81434, + "num_input_tokens_seen": 248814735, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.19958496, + "step": 11531, + "time_per_iteration": 2.859013080596924 + }, + { + "auxiliary_loss_clip": 0.01400493, + "auxiliary_loss_mlp": 0.01030407, + "balance_loss_clip": 1.24056506, + "balance_loss_mlp": 1.01264513, + "epoch": 0.6933413497670224, + "flos": 26269503275520.0, + "grad_norm": 1.5171325623270502, + "language_loss": 0.69988596, + "learning_rate": 9.078546240639484e-07, + "loss": 0.724195, + "num_input_tokens_seen": 248839140, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.1776123, + "step": 11532, + "time_per_iteration": 2.9338858127593994 + }, + { + "auxiliary_loss_clip": 0.01422148, + "auxiliary_loss_mlp": 0.01034042, + "balance_loss_clip": 1.25697434, + "balance_loss_mlp": 1.01504004, + "epoch": 0.6934014730196904, + "flos": 19582400292480.0, + "grad_norm": 1.7763343982458926, + "language_loss": 0.67731297, + "learning_rate": 9.075283780014082e-07, + "loss": 0.70187485, + "num_input_tokens_seen": 248858300, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.18994141, + "step": 11533, + "time_per_iteration": 2.8527259826660156 + }, + { + "auxiliary_loss_clip": 0.01414653, + "auxiliary_loss_mlp": 0.01040195, + "balance_loss_clip": 1.24988341, + "balance_loss_mlp": 1.0203824, + "epoch": 0.6934615962723584, + "flos": 22127292622080.0, + "grad_norm": 2.6071219783977666, + "language_loss": 0.60166979, + "learning_rate": 9.072021733655007e-07, + "loss": 0.62621832, + "num_input_tokens_seen": 248876310, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.19824219, + "step": 11534, + "time_per_iteration": 2.8354551792144775 + }, + { + "auxiliary_loss_clip": 0.01416504, + "auxiliary_loss_mlp": 0.01034418, + "balance_loss_clip": 1.251917, + "balance_loss_mlp": 1.0155828, + "epoch": 0.6935217195250263, + "flos": 21370679510400.0, + "grad_norm": 2.0187063830774257, + "language_loss": 0.72222698, + "learning_rate": 9.068760101685971e-07, + "loss": 0.74673617, + "num_input_tokens_seen": 248895650, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.18811035, + "step": 11535, + "time_per_iteration": 2.8536369800567627 + }, + { + "auxiliary_loss_clip": 0.01190588, + "auxiliary_loss_mlp": 0.0102969, + "balance_loss_clip": 1.09715915, + "balance_loss_mlp": 1.00689733, + "epoch": 0.6935818427776943, + "flos": 64098030625920.0, + "grad_norm": 0.7119678788099219, + "language_loss": 0.59073544, + "learning_rate": 9.065498884230638e-07, + "loss": 0.61293823, + "num_input_tokens_seen": 248963920, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.22753906, + "step": 11536, + "time_per_iteration": 3.4670650959014893 + }, + { + "auxiliary_loss_clip": 0.0143116, + "auxiliary_loss_mlp": 0.01038453, + "balance_loss_clip": 1.26291907, + "balance_loss_mlp": 1.01903331, + "epoch": 0.6936419660303622, + "flos": 20312110730880.0, + "grad_norm": 2.736900967720685, + "language_loss": 0.73388344, + "learning_rate": 9.062238081412692e-07, + "loss": 0.75857955, + "num_input_tokens_seen": 248983380, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.19421387, + "step": 11537, + "time_per_iteration": 2.916876792907715 + }, + { + "auxiliary_loss_clip": 0.0119341, + "auxiliary_loss_mlp": 0.01043554, + "balance_loss_clip": 1.09981036, + "balance_loss_mlp": 1.02343106, + "epoch": 0.6937020892830302, + "flos": 67212674467200.0, + "grad_norm": 0.7567591928939326, + "language_loss": 0.55652022, + "learning_rate": 9.058977693355767e-07, + "loss": 0.57888985, + "num_input_tokens_seen": 249044680, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.20117188, + "step": 11538, + "time_per_iteration": 3.306994915008545 + }, + { + "auxiliary_loss_clip": 0.01393765, + "auxiliary_loss_mlp": 0.01034246, + "balance_loss_clip": 1.23800468, + "balance_loss_mlp": 1.01489866, + "epoch": 0.6937622125356981, + "flos": 23888669166720.0, + "grad_norm": 1.4150927838528042, + "language_loss": 0.78072834, + "learning_rate": 9.055717720183505e-07, + "loss": 0.80500847, + "num_input_tokens_seen": 249061060, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.19372559, + "step": 11539, + "time_per_iteration": 2.872532844543457 + }, + { + "auxiliary_loss_clip": 0.01399047, + "auxiliary_loss_mlp": 0.01035022, + "balance_loss_clip": 1.23848033, + "balance_loss_mlp": 1.01656866, + "epoch": 0.6938223357883662, + "flos": 28742311359360.0, + "grad_norm": 1.882130286265113, + "language_loss": 0.65135419, + "learning_rate": 9.05245816201953e-07, + "loss": 0.67569494, + "num_input_tokens_seen": 249081430, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.18469238, + "step": 11540, + "time_per_iteration": 2.9064559936523438 + }, + { + "auxiliary_loss_clip": 0.01404639, + "auxiliary_loss_mlp": 0.01031552, + "balance_loss_clip": 1.24405408, + "balance_loss_mlp": 1.01281273, + "epoch": 0.6938824590410341, + "flos": 28666019347200.0, + "grad_norm": 1.4242389762395766, + "language_loss": 0.87361914, + "learning_rate": 9.049199018987437e-07, + "loss": 0.89798105, + "num_input_tokens_seen": 249103020, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.1875, + "step": 11541, + "time_per_iteration": 2.9272260665893555 + }, + { + "auxiliary_loss_clip": 0.0142086, + "auxiliary_loss_mlp": 0.01032123, + "balance_loss_clip": 1.25544405, + "balance_loss_mlp": 1.01289487, + "epoch": 0.6939425822937021, + "flos": 18990705052800.0, + "grad_norm": 1.627054342232854, + "language_loss": 0.84971368, + "learning_rate": 9.04594029121081e-07, + "loss": 0.87424356, + "num_input_tokens_seen": 249120810, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19213867, + "step": 11542, + "time_per_iteration": 2.8440754413604736 + }, + { + "auxiliary_loss_clip": 0.01426519, + "auxiliary_loss_mlp": 0.01033763, + "balance_loss_clip": 1.25882506, + "balance_loss_mlp": 1.01339006, + "epoch": 0.6940027055463701, + "flos": 23086195810560.0, + "grad_norm": 1.7254424088509535, + "language_loss": 0.75854158, + "learning_rate": 9.04268197881323e-07, + "loss": 0.78314441, + "num_input_tokens_seen": 249138050, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.20373535, + "step": 11543, + "time_per_iteration": 2.8757340908050537 + }, + { + "auxiliary_loss_clip": 0.0140357, + "auxiliary_loss_mlp": 0.01036133, + "balance_loss_clip": 1.24144828, + "balance_loss_mlp": 1.01729798, + "epoch": 0.694062828799038, + "flos": 18195606599040.0, + "grad_norm": 1.673010160996948, + "language_loss": 0.77237636, + "learning_rate": 9.039424081918241e-07, + "loss": 0.79677337, + "num_input_tokens_seen": 249155570, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18823242, + "step": 11544, + "time_per_iteration": 2.8544371128082275 + }, + { + "auxiliary_loss_clip": 0.01419219, + "auxiliary_loss_mlp": 0.01038098, + "balance_loss_clip": 1.2540338, + "balance_loss_mlp": 1.01855993, + "epoch": 0.694122952051706, + "flos": 17830434666240.0, + "grad_norm": 2.1143586300817825, + "language_loss": 0.72668087, + "learning_rate": 9.036166600649388e-07, + "loss": 0.75125408, + "num_input_tokens_seen": 249172960, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.1953125, + "step": 11545, + "time_per_iteration": 2.8734676837921143 + }, + { + "auxiliary_loss_clip": 0.01411196, + "auxiliary_loss_mlp": 0.01034471, + "balance_loss_clip": 1.2514019, + "balance_loss_mlp": 1.01611304, + "epoch": 0.694183075304374, + "flos": 21225244164480.0, + "grad_norm": 1.6609241108856527, + "language_loss": 0.80157793, + "learning_rate": 9.0329095351302e-07, + "loss": 0.82603455, + "num_input_tokens_seen": 249192450, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.18359375, + "step": 11546, + "time_per_iteration": 2.8647756576538086 + }, + { + "auxiliary_loss_clip": 0.01421093, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.25876832, + "balance_loss_mlp": 1.01512241, + "epoch": 0.694243198557042, + "flos": 24071006286720.0, + "grad_norm": 1.3232305189538418, + "language_loss": 0.79056168, + "learning_rate": 9.029652885484194e-07, + "loss": 0.81511611, + "num_input_tokens_seen": 249214320, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19226074, + "step": 11547, + "time_per_iteration": 2.9062814712524414 + }, + { + "auxiliary_loss_clip": 0.01409402, + "auxiliary_loss_mlp": 0.01033629, + "balance_loss_clip": 1.24866664, + "balance_loss_mlp": 1.01397145, + "epoch": 0.6943033218097099, + "flos": 21151666840320.0, + "grad_norm": 1.924840607787628, + "language_loss": 0.81193364, + "learning_rate": 9.026396651834834e-07, + "loss": 0.83636403, + "num_input_tokens_seen": 249230925, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19641113, + "step": 11548, + "time_per_iteration": 2.8612732887268066 + }, + { + "auxiliary_loss_clip": 0.01190447, + "auxiliary_loss_mlp": 0.01023646, + "balance_loss_clip": 1.09759545, + "balance_loss_mlp": 1.0044775, + "epoch": 0.6943634450623779, + "flos": 57841306611840.0, + "grad_norm": 0.7191427407658184, + "language_loss": 0.53762555, + "learning_rate": 9.023140834305613e-07, + "loss": 0.55976647, + "num_input_tokens_seen": 249293975, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.19140625, + "step": 11549, + "time_per_iteration": 3.3805699348449707 + }, + { + "auxiliary_loss_clip": 0.01411938, + "auxiliary_loss_mlp": 0.01035411, + "balance_loss_clip": 1.24828565, + "balance_loss_mlp": 1.01613498, + "epoch": 0.6944235683150458, + "flos": 30602674823040.0, + "grad_norm": 1.4481334915468063, + "language_loss": 0.74226582, + "learning_rate": 9.01988543302e-07, + "loss": 0.76673937, + "num_input_tokens_seen": 249315285, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19262695, + "step": 11550, + "time_per_iteration": 4.355464458465576 + }, + { + "auxiliary_loss_clip": 0.01427867, + "auxiliary_loss_mlp": 0.01038799, + "balance_loss_clip": 1.26167488, + "balance_loss_mlp": 1.0182476, + "epoch": 0.6944836915677138, + "flos": 19729012003200.0, + "grad_norm": 1.8200297336056745, + "language_loss": 0.74701703, + "learning_rate": 9.016630448101425e-07, + "loss": 0.77168369, + "num_input_tokens_seen": 249333505, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.20544434, + "step": 11551, + "time_per_iteration": 2.8476433753967285 + }, + { + "auxiliary_loss_clip": 0.01418893, + "auxiliary_loss_mlp": 0.0103752, + "balance_loss_clip": 1.25397611, + "balance_loss_mlp": 1.01673007, + "epoch": 0.6945438148203817, + "flos": 24874384538880.0, + "grad_norm": 1.491489764583699, + "language_loss": 0.847417, + "learning_rate": 9.01337587967333e-07, + "loss": 0.87198108, + "num_input_tokens_seen": 249354180, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.20800781, + "step": 11552, + "time_per_iteration": 2.887401580810547 + }, + { + "auxiliary_loss_clip": 0.01414157, + "auxiliary_loss_mlp": 0.01033419, + "balance_loss_clip": 1.25196934, + "balance_loss_mlp": 1.01384521, + "epoch": 0.6946039380730498, + "flos": 33339224701440.0, + "grad_norm": 1.484045762046424, + "language_loss": 0.67850244, + "learning_rate": 9.010121727859117e-07, + "loss": 0.70297819, + "num_input_tokens_seen": 249377035, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19567871, + "step": 11553, + "time_per_iteration": 2.9933483600616455 + }, + { + "auxiliary_loss_clip": 0.01449058, + "auxiliary_loss_mlp": 0.01037866, + "balance_loss_clip": 1.27978349, + "balance_loss_mlp": 1.01750541, + "epoch": 0.6946640613257177, + "flos": 20860931882880.0, + "grad_norm": 1.501643820900227, + "language_loss": 0.79716694, + "learning_rate": 9.006867992782195e-07, + "loss": 0.82203621, + "num_input_tokens_seen": 249396155, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.20361328, + "step": 11554, + "time_per_iteration": 2.8578603267669678 + }, + { + "auxiliary_loss_clip": 0.01415834, + "auxiliary_loss_mlp": 0.01033167, + "balance_loss_clip": 1.25051248, + "balance_loss_mlp": 1.01344967, + "epoch": 0.6947241845783857, + "flos": 19364383008000.0, + "grad_norm": 1.8223108437981177, + "language_loss": 0.73686749, + "learning_rate": 9.003614674565934e-07, + "loss": 0.76135749, + "num_input_tokens_seen": 249414555, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19726562, + "step": 11555, + "time_per_iteration": 2.8619163036346436 + }, + { + "auxiliary_loss_clip": 0.0140834, + "auxiliary_loss_mlp": 0.0103419, + "balance_loss_clip": 1.24578285, + "balance_loss_mlp": 1.0156405, + "epoch": 0.6947843078310536, + "flos": 27130215968640.0, + "grad_norm": 1.7558027533526908, + "language_loss": 0.78586274, + "learning_rate": 9.000361773333705e-07, + "loss": 0.81028807, + "num_input_tokens_seen": 249433570, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18566895, + "step": 11556, + "time_per_iteration": 2.906740188598633 + }, + { + "auxiliary_loss_clip": 0.01413189, + "auxiliary_loss_mlp": 0.01036855, + "balance_loss_clip": 1.24902654, + "balance_loss_mlp": 1.01823425, + "epoch": 0.6948444310837216, + "flos": 28596876013440.0, + "grad_norm": 2.154351242295796, + "language_loss": 0.6140281, + "learning_rate": 8.997109289208869e-07, + "loss": 0.63852859, + "num_input_tokens_seen": 249453735, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.1862793, + "step": 11557, + "time_per_iteration": 2.9419431686401367 + }, + { + "auxiliary_loss_clip": 0.01407099, + "auxiliary_loss_mlp": 0.01033223, + "balance_loss_clip": 1.24635673, + "balance_loss_mlp": 1.01349354, + "epoch": 0.6949045543363896, + "flos": 15677707432320.0, + "grad_norm": 1.632112169473129, + "language_loss": 0.86133695, + "learning_rate": 8.993857222314752e-07, + "loss": 0.88574016, + "num_input_tokens_seen": 249470805, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19726562, + "step": 11558, + "time_per_iteration": 2.8617608547210693 + }, + { + "auxiliary_loss_clip": 0.01428025, + "auxiliary_loss_mlp": 0.01034854, + "balance_loss_clip": 1.2601881, + "balance_loss_mlp": 1.01457667, + "epoch": 0.6949646775890576, + "flos": 23270116498560.0, + "grad_norm": 1.6732830219327683, + "language_loss": 0.70763361, + "learning_rate": 8.990605572774664e-07, + "loss": 0.73226249, + "num_input_tokens_seen": 249491150, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.20251465, + "step": 11559, + "time_per_iteration": 2.897136688232422 + }, + { + "auxiliary_loss_clip": 0.01422667, + "auxiliary_loss_mlp": 0.01031794, + "balance_loss_clip": 1.25952196, + "balance_loss_mlp": 1.01322103, + "epoch": 0.6950248008417256, + "flos": 22392708474240.0, + "grad_norm": 1.4725532868891718, + "language_loss": 0.79994047, + "learning_rate": 8.987354340711921e-07, + "loss": 0.82448506, + "num_input_tokens_seen": 249511560, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.18566895, + "step": 11560, + "time_per_iteration": 5.729364395141602 + }, + { + "auxiliary_loss_clip": 0.01403789, + "auxiliary_loss_mlp": 0.01031633, + "balance_loss_clip": 1.24362016, + "balance_loss_mlp": 1.01209426, + "epoch": 0.6950849240943935, + "flos": 23487862314240.0, + "grad_norm": 2.198515972028677, + "language_loss": 0.77475113, + "learning_rate": 8.9841035262498e-07, + "loss": 0.79910529, + "num_input_tokens_seen": 249531910, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.19567871, + "step": 11561, + "time_per_iteration": 3.024867296218872 + }, + { + "auxiliary_loss_clip": 0.01409153, + "auxiliary_loss_mlp": 0.01034378, + "balance_loss_clip": 1.24721599, + "balance_loss_mlp": 1.01488733, + "epoch": 0.6951450473470615, + "flos": 17429446834560.0, + "grad_norm": 3.7916738230705946, + "language_loss": 0.78571546, + "learning_rate": 8.980853129511577e-07, + "loss": 0.8101508, + "num_input_tokens_seen": 249550300, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19494629, + "step": 11562, + "time_per_iteration": 2.884848117828369 + }, + { + "auxiliary_loss_clip": 0.0141977, + "auxiliary_loss_mlp": 0.01037479, + "balance_loss_clip": 1.25421715, + "balance_loss_mlp": 1.01726127, + "epoch": 0.6952051705997294, + "flos": 20495533726080.0, + "grad_norm": 1.804438702355151, + "language_loss": 0.6956681, + "learning_rate": 8.977603150620515e-07, + "loss": 0.72024059, + "num_input_tokens_seen": 249567740, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20214844, + "step": 11563, + "time_per_iteration": 4.334613084793091 + }, + { + "auxiliary_loss_clip": 0.01398817, + "auxiliary_loss_mlp": 0.01029456, + "balance_loss_clip": 1.24095058, + "balance_loss_mlp": 1.01069295, + "epoch": 0.6952652938523974, + "flos": 13997192624640.0, + "grad_norm": 2.1098769041553496, + "language_loss": 0.74289036, + "learning_rate": 8.974353589699846e-07, + "loss": 0.76717311, + "num_input_tokens_seen": 249582700, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.1875, + "step": 11564, + "time_per_iteration": 2.825165033340454 + }, + { + "auxiliary_loss_clip": 0.01460631, + "auxiliary_loss_mlp": 0.01044366, + "balance_loss_clip": 1.28482485, + "balance_loss_mlp": 1.02342081, + "epoch": 0.6953254171050653, + "flos": 30965629760640.0, + "grad_norm": 1.7936143228763208, + "language_loss": 0.72776115, + "learning_rate": 8.971104446872785e-07, + "loss": 0.75281107, + "num_input_tokens_seen": 249602920, + "router_z_loss_clip": 1.75878906, + "router_z_loss_mlp": 0.20959473, + "step": 11565, + "time_per_iteration": 2.9423716068267822 + }, + { + "auxiliary_loss_clip": 0.01192125, + "auxiliary_loss_mlp": 0.01021262, + "balance_loss_clip": 1.09758139, + "balance_loss_mlp": 0.99713439, + "epoch": 0.6953855403577334, + "flos": 61698148456320.0, + "grad_norm": 0.9180875469951174, + "language_loss": 0.58500409, + "learning_rate": 8.96785572226255e-07, + "loss": 0.60713798, + "num_input_tokens_seen": 249660400, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.24121094, + "step": 11566, + "time_per_iteration": 3.2167866230010986 + }, + { + "auxiliary_loss_clip": 0.01415894, + "auxiliary_loss_mlp": 0.01031172, + "balance_loss_clip": 1.24939406, + "balance_loss_mlp": 1.011657, + "epoch": 0.6954456636104013, + "flos": 23049294036480.0, + "grad_norm": 1.7981596322877396, + "language_loss": 0.74527478, + "learning_rate": 8.964607415992338e-07, + "loss": 0.76974547, + "num_input_tokens_seen": 249679335, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.19519043, + "step": 11567, + "time_per_iteration": 2.9325613975524902 + }, + { + "auxiliary_loss_clip": 0.01406107, + "auxiliary_loss_mlp": 0.01036206, + "balance_loss_clip": 1.24564838, + "balance_loss_mlp": 1.01604831, + "epoch": 0.6955057868630693, + "flos": 23930004931200.0, + "grad_norm": 1.2530817917592652, + "language_loss": 0.77079207, + "learning_rate": 8.961359528185313e-07, + "loss": 0.79521519, + "num_input_tokens_seen": 249701805, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.20153809, + "step": 11568, + "time_per_iteration": 2.920728921890259 + }, + { + "auxiliary_loss_clip": 0.01411872, + "auxiliary_loss_mlp": 0.01034403, + "balance_loss_clip": 1.25039351, + "balance_loss_mlp": 1.01527023, + "epoch": 0.6955659101157372, + "flos": 22603079387520.0, + "grad_norm": 21.18210745015226, + "language_loss": 0.73111677, + "learning_rate": 8.958112058964649e-07, + "loss": 0.75557959, + "num_input_tokens_seen": 249720550, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19128418, + "step": 11569, + "time_per_iteration": 2.869680166244507 + }, + { + "auxiliary_loss_clip": 0.01416498, + "auxiliary_loss_mlp": 0.0103383, + "balance_loss_clip": 1.253263, + "balance_loss_mlp": 1.01356459, + "epoch": 0.6956260333684052, + "flos": 24583468602240.0, + "grad_norm": 1.6132259343313964, + "language_loss": 0.77597332, + "learning_rate": 8.954865008453471e-07, + "loss": 0.80047655, + "num_input_tokens_seen": 249740325, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.20275879, + "step": 11570, + "time_per_iteration": 2.89581561088562 + }, + { + "auxiliary_loss_clip": 0.01425862, + "auxiliary_loss_mlp": 0.01032853, + "balance_loss_clip": 1.25917101, + "balance_loss_mlp": 1.01249194, + "epoch": 0.6956861566210732, + "flos": 25856435082240.0, + "grad_norm": 2.5290420895878842, + "language_loss": 0.75267249, + "learning_rate": 8.95161837677493e-07, + "loss": 0.77725971, + "num_input_tokens_seen": 249760570, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.20336914, + "step": 11571, + "time_per_iteration": 2.9328274726867676 + }, + { + "auxiliary_loss_clip": 0.01400806, + "auxiliary_loss_mlp": 0.01030836, + "balance_loss_clip": 1.24352205, + "balance_loss_mlp": 1.01185846, + "epoch": 0.6957462798737412, + "flos": 15308961160320.0, + "grad_norm": 1.9621282340486275, + "language_loss": 0.75222391, + "learning_rate": 8.948372164052118e-07, + "loss": 0.77654034, + "num_input_tokens_seen": 249778290, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.18969727, + "step": 11572, + "time_per_iteration": 2.9461464881896973 + }, + { + "auxiliary_loss_clip": 0.01402679, + "auxiliary_loss_mlp": 0.01031332, + "balance_loss_clip": 1.24054754, + "balance_loss_mlp": 1.01250887, + "epoch": 0.6958064031264092, + "flos": 36260509674240.0, + "grad_norm": 2.1107299289905868, + "language_loss": 0.70815361, + "learning_rate": 8.94512637040814e-07, + "loss": 0.7324937, + "num_input_tokens_seen": 249800925, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18823242, + "step": 11573, + "time_per_iteration": 2.97440242767334 + }, + { + "auxiliary_loss_clip": 0.01449149, + "auxiliary_loss_mlp": 0.01037108, + "balance_loss_clip": 1.27911365, + "balance_loss_mlp": 1.01796281, + "epoch": 0.6958665263790771, + "flos": 19218178500480.0, + "grad_norm": 1.7643323474626194, + "language_loss": 0.75421327, + "learning_rate": 8.941880995966095e-07, + "loss": 0.77907586, + "num_input_tokens_seen": 249820500, + "router_z_loss_clip": 1.70410156, + "router_z_loss_mlp": 0.19140625, + "step": 11574, + "time_per_iteration": 2.8527088165283203 + }, + { + "auxiliary_loss_clip": 0.01426993, + "auxiliary_loss_mlp": 0.01033502, + "balance_loss_clip": 1.25898457, + "balance_loss_mlp": 1.01383209, + "epoch": 0.6959266496317451, + "flos": 21805085266560.0, + "grad_norm": 1.8627069301052535, + "language_loss": 0.7486707, + "learning_rate": 8.938636040849014e-07, + "loss": 0.77327561, + "num_input_tokens_seen": 249839845, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.1965332, + "step": 11575, + "time_per_iteration": 2.904594898223877 + }, + { + "auxiliary_loss_clip": 0.01415333, + "auxiliary_loss_mlp": 0.01031756, + "balance_loss_clip": 1.25115585, + "balance_loss_mlp": 1.01188374, + "epoch": 0.695986772884413, + "flos": 20567934685440.0, + "grad_norm": 5.212018950050875, + "language_loss": 0.80247915, + "learning_rate": 8.935391505179966e-07, + "loss": 0.82695001, + "num_input_tokens_seen": 249857400, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.1986084, + "step": 11576, + "time_per_iteration": 2.8874452114105225 + }, + { + "auxiliary_loss_clip": 0.01422273, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.25421858, + "balance_loss_mlp": 1.0112735, + "epoch": 0.696046896137081, + "flos": 14943608248320.0, + "grad_norm": 2.9301157685993227, + "language_loss": 0.58093488, + "learning_rate": 8.932147389081985e-07, + "loss": 0.60547578, + "num_input_tokens_seen": 249871645, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.20544434, + "step": 11577, + "time_per_iteration": 2.8688881397247314 + }, + { + "auxiliary_loss_clip": 0.01392523, + "auxiliary_loss_mlp": 0.0102725, + "balance_loss_clip": 1.23465312, + "balance_loss_mlp": 1.00870132, + "epoch": 0.696107019389749, + "flos": 30753177586560.0, + "grad_norm": 1.525627461832965, + "language_loss": 0.77225506, + "learning_rate": 8.928903692678081e-07, + "loss": 0.79645282, + "num_input_tokens_seen": 249894215, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18554688, + "step": 11578, + "time_per_iteration": 2.959563970565796 + }, + { + "auxiliary_loss_clip": 0.01412472, + "auxiliary_loss_mlp": 0.01036358, + "balance_loss_clip": 1.24885392, + "balance_loss_mlp": 1.01711726, + "epoch": 0.696167142642417, + "flos": 20786359173120.0, + "grad_norm": 2.10111104071756, + "language_loss": 0.80965036, + "learning_rate": 8.925660416091254e-07, + "loss": 0.83413863, + "num_input_tokens_seen": 249912850, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19226074, + "step": 11579, + "time_per_iteration": 2.856285333633423 + }, + { + "auxiliary_loss_clip": 0.01404115, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.2432549, + "balance_loss_mlp": 1.01177967, + "epoch": 0.6962272658950849, + "flos": 22575407552640.0, + "grad_norm": 1.8470977396234243, + "language_loss": 0.73661661, + "learning_rate": 8.922417559444502e-07, + "loss": 0.76097608, + "num_input_tokens_seen": 249932650, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.20056152, + "step": 11580, + "time_per_iteration": 2.907959461212158 + }, + { + "auxiliary_loss_clip": 0.0141804, + "auxiliary_loss_mlp": 0.01034052, + "balance_loss_clip": 1.25282085, + "balance_loss_mlp": 1.01381052, + "epoch": 0.6962873891477529, + "flos": 22210280864640.0, + "grad_norm": 2.1301365349624586, + "language_loss": 0.66443908, + "learning_rate": 8.919175122860787e-07, + "loss": 0.68895996, + "num_input_tokens_seen": 249951205, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.20239258, + "step": 11581, + "time_per_iteration": 2.8702609539031982 + }, + { + "auxiliary_loss_clip": 0.01407929, + "auxiliary_loss_mlp": 0.01030752, + "balance_loss_clip": 1.24570751, + "balance_loss_mlp": 1.01103461, + "epoch": 0.6963475124004208, + "flos": 12495938290560.0, + "grad_norm": 2.1583851598034185, + "language_loss": 0.77506554, + "learning_rate": 8.915933106463056e-07, + "loss": 0.7994523, + "num_input_tokens_seen": 249967045, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19726562, + "step": 11582, + "time_per_iteration": 2.815110206604004 + }, + { + "auxiliary_loss_clip": 0.01399586, + "auxiliary_loss_mlp": 0.01030273, + "balance_loss_clip": 1.23732138, + "balance_loss_mlp": 1.01170015, + "epoch": 0.6964076356530888, + "flos": 17173894348800.0, + "grad_norm": 1.9723614810778611, + "language_loss": 0.70353228, + "learning_rate": 8.91269151037425e-07, + "loss": 0.72783089, + "num_input_tokens_seen": 249984565, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.18591309, + "step": 11583, + "time_per_iteration": 2.87984561920166 + }, + { + "auxiliary_loss_clip": 0.01416077, + "auxiliary_loss_mlp": 0.01032655, + "balance_loss_clip": 1.25348353, + "balance_loss_mlp": 1.01331949, + "epoch": 0.6964677589057569, + "flos": 19946667329280.0, + "grad_norm": 2.6918858748377397, + "language_loss": 0.8313309, + "learning_rate": 8.909450334717301e-07, + "loss": 0.85581815, + "num_input_tokens_seen": 250004235, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19335938, + "step": 11584, + "time_per_iteration": 2.886331558227539 + }, + { + "auxiliary_loss_clip": 0.01425192, + "auxiliary_loss_mlp": 0.01034404, + "balance_loss_clip": 1.2594167, + "balance_loss_mlp": 1.01383996, + "epoch": 0.6965278821584248, + "flos": 22794374977920.0, + "grad_norm": 2.4428956119151994, + "language_loss": 0.8055625, + "learning_rate": 8.906209579615107e-07, + "loss": 0.83015847, + "num_input_tokens_seen": 250017645, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.20556641, + "step": 11585, + "time_per_iteration": 4.268278121948242 + }, + { + "auxiliary_loss_clip": 0.01401509, + "auxiliary_loss_mlp": 0.01034546, + "balance_loss_clip": 1.24200964, + "balance_loss_mlp": 1.01457882, + "epoch": 0.6965880054110928, + "flos": 20057146427520.0, + "grad_norm": 1.54150622458124, + "language_loss": 0.77927184, + "learning_rate": 8.90296924519055e-07, + "loss": 0.80363238, + "num_input_tokens_seen": 250037640, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19958496, + "step": 11586, + "time_per_iteration": 2.866168260574341 + }, + { + "auxiliary_loss_clip": 0.01388448, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.23185706, + "balance_loss_mlp": 1.01255548, + "epoch": 0.6966481286637607, + "flos": 21918279052800.0, + "grad_norm": 2.378691278223498, + "language_loss": 0.79677951, + "learning_rate": 8.899729331566519e-07, + "loss": 0.8209902, + "num_input_tokens_seen": 250056490, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.20031738, + "step": 11587, + "time_per_iteration": 2.86370587348938 + }, + { + "auxiliary_loss_clip": 0.01392564, + "auxiliary_loss_mlp": 0.01033066, + "balance_loss_clip": 1.23526812, + "balance_loss_mlp": 1.01361156, + "epoch": 0.6967082519164287, + "flos": 15641710554240.0, + "grad_norm": 1.7844293310055668, + "language_loss": 0.73884082, + "learning_rate": 8.896489838865857e-07, + "loss": 0.76309711, + "num_input_tokens_seen": 250074285, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.19458008, + "step": 11588, + "time_per_iteration": 2.8338394165039062 + }, + { + "auxiliary_loss_clip": 0.01402915, + "auxiliary_loss_mlp": 0.01032013, + "balance_loss_clip": 1.24036002, + "balance_loss_mlp": 1.01280868, + "epoch": 0.6967683751690966, + "flos": 24035235632640.0, + "grad_norm": 1.6276920243715087, + "language_loss": 0.76201957, + "learning_rate": 8.893250767211413e-07, + "loss": 0.78636885, + "num_input_tokens_seen": 250093350, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19189453, + "step": 11589, + "time_per_iteration": 2.877988815307617 + }, + { + "auxiliary_loss_clip": 0.01403306, + "auxiliary_loss_mlp": 0.01031798, + "balance_loss_clip": 1.24108028, + "balance_loss_mlp": 1.01216412, + "epoch": 0.6968284984217646, + "flos": 31035813724800.0, + "grad_norm": 1.7184693856345636, + "language_loss": 0.64411879, + "learning_rate": 8.890012116726012e-07, + "loss": 0.66846985, + "num_input_tokens_seen": 250114170, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19628906, + "step": 11590, + "time_per_iteration": 2.961817502975464 + }, + { + "auxiliary_loss_clip": 0.01197105, + "auxiliary_loss_mlp": 0.01036011, + "balance_loss_clip": 1.10387516, + "balance_loss_mlp": 1.01407611, + "epoch": 0.6968886216744326, + "flos": 67653866943360.0, + "grad_norm": 0.7557423256790824, + "language_loss": 0.61292011, + "learning_rate": 8.88677388753248e-07, + "loss": 0.63525122, + "num_input_tokens_seen": 250178250, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.21972656, + "step": 11591, + "time_per_iteration": 3.438796281814575 + }, + { + "auxiliary_loss_clip": 0.01417755, + "auxiliary_loss_mlp": 0.01037903, + "balance_loss_clip": 1.25490856, + "balance_loss_mlp": 1.01692247, + "epoch": 0.6969487449271006, + "flos": 24874520273280.0, + "grad_norm": 1.5822946820983526, + "language_loss": 0.70120966, + "learning_rate": 8.883536079753582e-07, + "loss": 0.72576618, + "num_input_tokens_seen": 250198420, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.20983887, + "step": 11592, + "time_per_iteration": 2.895056962966919 + }, + { + "auxiliary_loss_clip": 0.01416798, + "auxiliary_loss_mlp": 0.01033823, + "balance_loss_clip": 1.25516486, + "balance_loss_mlp": 1.01461887, + "epoch": 0.6970088681797685, + "flos": 28779756071040.0, + "grad_norm": 1.5709667561399687, + "language_loss": 0.63153285, + "learning_rate": 8.880298693512109e-07, + "loss": 0.65603912, + "num_input_tokens_seen": 250220650, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.1920166, + "step": 11593, + "time_per_iteration": 2.9703571796417236 + }, + { + "auxiliary_loss_clip": 0.01393949, + "auxiliary_loss_mlp": 0.01034739, + "balance_loss_clip": 1.23544121, + "balance_loss_mlp": 1.01522446, + "epoch": 0.6970689914324365, + "flos": 27320108970240.0, + "grad_norm": 1.3746198203990576, + "language_loss": 0.54402649, + "learning_rate": 8.877061728930832e-07, + "loss": 0.56831336, + "num_input_tokens_seen": 250241750, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.19519043, + "step": 11594, + "time_per_iteration": 2.9341232776641846 + }, + { + "auxiliary_loss_clip": 0.01415082, + "auxiliary_loss_mlp": 0.01032751, + "balance_loss_clip": 1.25183177, + "balance_loss_mlp": 1.01388025, + "epoch": 0.6971291146851044, + "flos": 19145822785920.0, + "grad_norm": 2.2862345065269007, + "language_loss": 0.78269649, + "learning_rate": 8.87382518613248e-07, + "loss": 0.8071748, + "num_input_tokens_seen": 250259445, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.18847656, + "step": 11595, + "time_per_iteration": 5.66045069694519 + }, + { + "auxiliary_loss_clip": 0.01415119, + "auxiliary_loss_mlp": 0.01036329, + "balance_loss_clip": 1.25012541, + "balance_loss_mlp": 1.01571798, + "epoch": 0.6971892379377724, + "flos": 14618505225600.0, + "grad_norm": 2.377005486997459, + "language_loss": 0.72564244, + "learning_rate": 8.870589065239793e-07, + "loss": 0.75015694, + "num_input_tokens_seen": 250275640, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.20617676, + "step": 11596, + "time_per_iteration": 2.8214449882507324 + }, + { + "auxiliary_loss_clip": 0.01418165, + "auxiliary_loss_mlp": 0.01038049, + "balance_loss_clip": 1.25483084, + "balance_loss_mlp": 1.01816463, + "epoch": 0.6972493611904405, + "flos": 22317049889280.0, + "grad_norm": 1.9424288102484915, + "language_loss": 0.77003741, + "learning_rate": 8.867353366375492e-07, + "loss": 0.79459953, + "num_input_tokens_seen": 250296435, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19897461, + "step": 11597, + "time_per_iteration": 2.868818998336792 + }, + { + "auxiliary_loss_clip": 0.01410358, + "auxiliary_loss_mlp": 0.0103783, + "balance_loss_clip": 1.24716246, + "balance_loss_mlp": 1.01794577, + "epoch": 0.6973094844431084, + "flos": 17429220610560.0, + "grad_norm": 1.9498856828740008, + "language_loss": 0.75558686, + "learning_rate": 8.864118089662267e-07, + "loss": 0.78006876, + "num_input_tokens_seen": 250314035, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19885254, + "step": 11598, + "time_per_iteration": 4.342143297195435 + }, + { + "auxiliary_loss_clip": 0.01415996, + "auxiliary_loss_mlp": 0.01038033, + "balance_loss_clip": 1.24936819, + "balance_loss_mlp": 1.01701653, + "epoch": 0.6973696076957764, + "flos": 27246848359680.0, + "grad_norm": 1.856523923701185, + "language_loss": 0.90272582, + "learning_rate": 8.860883235222791e-07, + "loss": 0.92726606, + "num_input_tokens_seen": 250332995, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.21020508, + "step": 11599, + "time_per_iteration": 2.9140918254852295 + }, + { + "auxiliary_loss_clip": 0.01434615, + "auxiliary_loss_mlp": 0.01042479, + "balance_loss_clip": 1.26459241, + "balance_loss_mlp": 1.02098608, + "epoch": 0.6974297309484443, + "flos": 22028260458240.0, + "grad_norm": 2.1045856265300253, + "language_loss": 0.70103025, + "learning_rate": 8.85764880317974e-07, + "loss": 0.72580123, + "num_input_tokens_seen": 250352120, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.21508789, + "step": 11600, + "time_per_iteration": 2.8804945945739746 + }, + { + "auxiliary_loss_clip": 0.01418331, + "auxiliary_loss_mlp": 0.01038709, + "balance_loss_clip": 1.25303447, + "balance_loss_mlp": 1.01720393, + "epoch": 0.6974898542011123, + "flos": 28378225301760.0, + "grad_norm": 3.1482055349078406, + "language_loss": 0.77510166, + "learning_rate": 8.854414793655771e-07, + "loss": 0.79967201, + "num_input_tokens_seen": 250371705, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.21496582, + "step": 11601, + "time_per_iteration": 2.9103167057037354 + }, + { + "auxiliary_loss_clip": 0.01393156, + "auxiliary_loss_mlp": 0.01036314, + "balance_loss_clip": 1.235201, + "balance_loss_mlp": 1.01629889, + "epoch": 0.6975499774537802, + "flos": 15240541743360.0, + "grad_norm": 2.075832261308193, + "language_loss": 0.72846448, + "learning_rate": 8.851181206773508e-07, + "loss": 0.75275922, + "num_input_tokens_seen": 250390485, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.20007324, + "step": 11602, + "time_per_iteration": 2.877328634262085 + }, + { + "auxiliary_loss_clip": 0.01416007, + "auxiliary_loss_mlp": 0.01035803, + "balance_loss_clip": 1.25377595, + "balance_loss_mlp": 1.01597881, + "epoch": 0.6976101007064482, + "flos": 22165913698560.0, + "grad_norm": 2.165426834815899, + "language_loss": 0.77131742, + "learning_rate": 8.847948042655567e-07, + "loss": 0.79583549, + "num_input_tokens_seen": 250407020, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19836426, + "step": 11603, + "time_per_iteration": 2.8663036823272705 + }, + { + "auxiliary_loss_clip": 0.01414069, + "auxiliary_loss_mlp": 0.01031449, + "balance_loss_clip": 1.25172579, + "balance_loss_mlp": 1.01151681, + "epoch": 0.6976702239591162, + "flos": 22283767699200.0, + "grad_norm": 1.6632891208023308, + "language_loss": 0.62852293, + "learning_rate": 8.844715301424557e-07, + "loss": 0.65297812, + "num_input_tokens_seen": 250425880, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19946289, + "step": 11604, + "time_per_iteration": 2.8941104412078857 + }, + { + "auxiliary_loss_clip": 0.01416865, + "auxiliary_loss_mlp": 0.01035924, + "balance_loss_clip": 1.25286019, + "balance_loss_mlp": 1.01452613, + "epoch": 0.6977303472117842, + "flos": 25859647463040.0, + "grad_norm": 2.084829839459537, + "language_loss": 0.82437348, + "learning_rate": 8.841482983203057e-07, + "loss": 0.84890133, + "num_input_tokens_seen": 250442925, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.21411133, + "step": 11605, + "time_per_iteration": 2.862170696258545 + }, + { + "auxiliary_loss_clip": 0.01408673, + "auxiliary_loss_mlp": 0.01036374, + "balance_loss_clip": 1.24698782, + "balance_loss_mlp": 1.01589382, + "epoch": 0.6977904704644521, + "flos": 20969374965120.0, + "grad_norm": 1.5835472155412509, + "language_loss": 0.70817077, + "learning_rate": 8.838251088113638e-07, + "loss": 0.73262119, + "num_input_tokens_seen": 250461220, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.20483398, + "step": 11606, + "time_per_iteration": 2.849299669265747 + }, + { + "auxiliary_loss_clip": 0.01425068, + "auxiliary_loss_mlp": 0.0103804, + "balance_loss_clip": 1.25779796, + "balance_loss_mlp": 1.01723766, + "epoch": 0.6978505937171201, + "flos": 22065343211520.0, + "grad_norm": 1.8984628354700894, + "language_loss": 0.83273292, + "learning_rate": 8.835019616278856e-07, + "loss": 0.85736394, + "num_input_tokens_seen": 250480975, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.20776367, + "step": 11607, + "time_per_iteration": 2.8632678985595703 + }, + { + "auxiliary_loss_clip": 0.01417999, + "auxiliary_loss_mlp": 0.0103455, + "balance_loss_clip": 1.24976802, + "balance_loss_mlp": 1.01371205, + "epoch": 0.697910716969788, + "flos": 20052305233920.0, + "grad_norm": 1.8612738692910307, + "language_loss": 0.80197072, + "learning_rate": 8.831788567821265e-07, + "loss": 0.82649618, + "num_input_tokens_seen": 250497980, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.20849609, + "step": 11608, + "time_per_iteration": 2.8244423866271973 + }, + { + "auxiliary_loss_clip": 0.01407437, + "auxiliary_loss_mlp": 0.01033212, + "balance_loss_clip": 1.24325442, + "balance_loss_mlp": 1.01398349, + "epoch": 0.697970840222456, + "flos": 15896765347200.0, + "grad_norm": 1.811677087541155, + "language_loss": 0.91009283, + "learning_rate": 8.828557942863357e-07, + "loss": 0.93449938, + "num_input_tokens_seen": 250511910, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19226074, + "step": 11609, + "time_per_iteration": 2.82832932472229 + }, + { + "auxiliary_loss_clip": 0.01424716, + "auxiliary_loss_mlp": 0.01041834, + "balance_loss_clip": 1.25756359, + "balance_loss_mlp": 1.02209353, + "epoch": 0.698030963475124, + "flos": 21225651367680.0, + "grad_norm": 2.3761796428148982, + "language_loss": 0.65071034, + "learning_rate": 8.82532774152765e-07, + "loss": 0.67537582, + "num_input_tokens_seen": 250531090, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.1973877, + "step": 11610, + "time_per_iteration": 2.912391185760498 + }, + { + "auxiliary_loss_clip": 0.01400616, + "auxiliary_loss_mlp": 0.01035589, + "balance_loss_clip": 1.23968589, + "balance_loss_mlp": 1.01537097, + "epoch": 0.698091086727792, + "flos": 33771187238400.0, + "grad_norm": 1.6015528451985452, + "language_loss": 0.84929907, + "learning_rate": 8.822097963936643e-07, + "loss": 0.8736611, + "num_input_tokens_seen": 250551565, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.20202637, + "step": 11611, + "time_per_iteration": 2.945190906524658 + }, + { + "auxiliary_loss_clip": 0.01416897, + "auxiliary_loss_mlp": 0.01036296, + "balance_loss_clip": 1.25239563, + "balance_loss_mlp": 1.01672149, + "epoch": 0.69815120998046, + "flos": 15896267654400.0, + "grad_norm": 7.381770290059103, + "language_loss": 0.71795309, + "learning_rate": 8.818868610212793e-07, + "loss": 0.74248505, + "num_input_tokens_seen": 250569625, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19567871, + "step": 11612, + "time_per_iteration": 2.8099870681762695 + }, + { + "auxiliary_loss_clip": 0.0141527, + "auxiliary_loss_mlp": 0.01035446, + "balance_loss_clip": 1.25457048, + "balance_loss_mlp": 1.01518106, + "epoch": 0.6982113332331279, + "flos": 18954798664320.0, + "grad_norm": 2.177734498504229, + "language_loss": 0.81784254, + "learning_rate": 8.815639680478573e-07, + "loss": 0.84234965, + "num_input_tokens_seen": 250586960, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.20263672, + "step": 11613, + "time_per_iteration": 2.884998083114624 + }, + { + "auxiliary_loss_clip": 0.01406248, + "auxiliary_loss_mlp": 0.01031896, + "balance_loss_clip": 1.24493241, + "balance_loss_mlp": 1.0132513, + "epoch": 0.6982714564857959, + "flos": 24400362320640.0, + "grad_norm": 1.852322606324556, + "language_loss": 0.75786591, + "learning_rate": 8.812411174856411e-07, + "loss": 0.78224736, + "num_input_tokens_seen": 250605080, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18652344, + "step": 11614, + "time_per_iteration": 2.8704352378845215 + }, + { + "auxiliary_loss_clip": 0.01407096, + "auxiliary_loss_mlp": 0.01033762, + "balance_loss_clip": 1.24382794, + "balance_loss_mlp": 1.0145936, + "epoch": 0.6983315797384638, + "flos": 20093233795200.0, + "grad_norm": 2.003664963122446, + "language_loss": 0.78541744, + "learning_rate": 8.809183093468746e-07, + "loss": 0.80982596, + "num_input_tokens_seen": 250623965, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19165039, + "step": 11615, + "time_per_iteration": 2.8477370738983154 + }, + { + "auxiliary_loss_clip": 0.0139569, + "auxiliary_loss_mlp": 0.01033954, + "balance_loss_clip": 1.23656058, + "balance_loss_mlp": 1.01412928, + "epoch": 0.6983917029911318, + "flos": 13519234108800.0, + "grad_norm": 1.9392046652887585, + "language_loss": 0.73901016, + "learning_rate": 8.80595543643797e-07, + "loss": 0.76330656, + "num_input_tokens_seen": 250640675, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19836426, + "step": 11616, + "time_per_iteration": 2.851538896560669 + }, + { + "auxiliary_loss_clip": 0.01398559, + "auxiliary_loss_mlp": 0.01035449, + "balance_loss_clip": 1.24079967, + "balance_loss_mlp": 1.01595831, + "epoch": 0.6984518262437998, + "flos": 22028667661440.0, + "grad_norm": 1.544601557231382, + "language_loss": 0.84533191, + "learning_rate": 8.802728203886487e-07, + "loss": 0.86967194, + "num_input_tokens_seen": 250660295, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.19494629, + "step": 11617, + "time_per_iteration": 2.8751742839813232 + }, + { + "auxiliary_loss_clip": 0.01417522, + "auxiliary_loss_mlp": 0.0104024, + "balance_loss_clip": 1.25104856, + "balance_loss_mlp": 1.02120256, + "epoch": 0.6985119494964678, + "flos": 18779881691520.0, + "grad_norm": 2.233722183223116, + "language_loss": 0.5973202, + "learning_rate": 8.799501395936682e-07, + "loss": 0.62189782, + "num_input_tokens_seen": 250678155, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19042969, + "step": 11618, + "time_per_iteration": 2.847720146179199 + }, + { + "auxiliary_loss_clip": 0.01409183, + "auxiliary_loss_mlp": 0.01035086, + "balance_loss_clip": 1.24663866, + "balance_loss_mlp": 1.01533329, + "epoch": 0.6985720727491357, + "flos": 22393070432640.0, + "grad_norm": 1.8109490547067972, + "language_loss": 0.83839869, + "learning_rate": 8.796275012710903e-07, + "loss": 0.86284137, + "num_input_tokens_seen": 250697230, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19763184, + "step": 11619, + "time_per_iteration": 2.9151976108551025 + }, + { + "auxiliary_loss_clip": 0.01399248, + "auxiliary_loss_mlp": 0.01032111, + "balance_loss_clip": 1.24000633, + "balance_loss_mlp": 1.01313281, + "epoch": 0.6986321960018037, + "flos": 39582646744320.0, + "grad_norm": 1.610226917748209, + "language_loss": 0.68016613, + "learning_rate": 8.793049054331494e-07, + "loss": 0.70447969, + "num_input_tokens_seen": 250719865, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18994141, + "step": 11620, + "time_per_iteration": 4.376590728759766 + }, + { + "auxiliary_loss_clip": 0.01415467, + "auxiliary_loss_mlp": 0.01036554, + "balance_loss_clip": 1.25205469, + "balance_loss_mlp": 1.0163238, + "epoch": 0.6986923192544716, + "flos": 17976729663360.0, + "grad_norm": 1.9811520462645826, + "language_loss": 0.7402699, + "learning_rate": 8.789823520920794e-07, + "loss": 0.76479006, + "num_input_tokens_seen": 250736565, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.20239258, + "step": 11621, + "time_per_iteration": 2.8261077404022217 + }, + { + "auxiliary_loss_clip": 0.01422199, + "auxiliary_loss_mlp": 0.01036309, + "balance_loss_clip": 1.25561857, + "balance_loss_mlp": 1.01621079, + "epoch": 0.6987524425071396, + "flos": 25605588055680.0, + "grad_norm": 1.6537132566848827, + "language_loss": 0.68947327, + "learning_rate": 8.7865984126011e-07, + "loss": 0.71405834, + "num_input_tokens_seen": 250757235, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.2010498, + "step": 11622, + "time_per_iteration": 2.929243803024292 + }, + { + "auxiliary_loss_clip": 0.01401153, + "auxiliary_loss_mlp": 0.01033399, + "balance_loss_clip": 1.24101758, + "balance_loss_mlp": 1.01361012, + "epoch": 0.6988125657598077, + "flos": 17539021036800.0, + "grad_norm": 1.7798032163391981, + "language_loss": 0.63381648, + "learning_rate": 8.783373729494721e-07, + "loss": 0.658162, + "num_input_tokens_seen": 250775585, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19787598, + "step": 11623, + "time_per_iteration": 2.857217788696289 + }, + { + "auxiliary_loss_clip": 0.01423101, + "auxiliary_loss_mlp": 0.01035233, + "balance_loss_clip": 1.25501466, + "balance_loss_mlp": 1.01508665, + "epoch": 0.6988726890124756, + "flos": 39180165834240.0, + "grad_norm": 2.3799195819647556, + "language_loss": 0.6158663, + "learning_rate": 8.780149471723932e-07, + "loss": 0.64044964, + "num_input_tokens_seen": 250795725, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.20141602, + "step": 11624, + "time_per_iteration": 2.967169761657715 + }, + { + "auxiliary_loss_clip": 0.01421499, + "auxiliary_loss_mlp": 0.0103828, + "balance_loss_clip": 1.2564944, + "balance_loss_mlp": 1.01838362, + "epoch": 0.6989328122651436, + "flos": 20203079466240.0, + "grad_norm": 1.5990677176861823, + "language_loss": 0.79219019, + "learning_rate": 8.776925639411017e-07, + "loss": 0.81678796, + "num_input_tokens_seen": 250814555, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19885254, + "step": 11625, + "time_per_iteration": 2.8308730125427246 + }, + { + "auxiliary_loss_clip": 0.01406658, + "auxiliary_loss_mlp": 0.01036264, + "balance_loss_clip": 1.24697661, + "balance_loss_mlp": 1.01618862, + "epoch": 0.6989929355178115, + "flos": 21844792218240.0, + "grad_norm": 1.9622815122803738, + "language_loss": 0.66617846, + "learning_rate": 8.773702232678188e-07, + "loss": 0.69060767, + "num_input_tokens_seen": 250833105, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.20080566, + "step": 11626, + "time_per_iteration": 2.8384556770324707 + }, + { + "auxiliary_loss_clip": 0.01406999, + "auxiliary_loss_mlp": 0.01036948, + "balance_loss_clip": 1.24447083, + "balance_loss_mlp": 1.01758885, + "epoch": 0.6990530587704795, + "flos": 26334031639680.0, + "grad_norm": 1.8010160344620985, + "language_loss": 0.71419901, + "learning_rate": 8.770479251647697e-07, + "loss": 0.73863852, + "num_input_tokens_seen": 250852570, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19348145, + "step": 11627, + "time_per_iteration": 2.8734607696533203 + }, + { + "auxiliary_loss_clip": 0.01392414, + "auxiliary_loss_mlp": 0.0103785, + "balance_loss_clip": 1.23510885, + "balance_loss_mlp": 1.01827621, + "epoch": 0.6991131820231474, + "flos": 19838586205440.0, + "grad_norm": 3.6805060860766714, + "language_loss": 0.63503051, + "learning_rate": 8.767256696441768e-07, + "loss": 0.65933317, + "num_input_tokens_seen": 250870500, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.19567871, + "step": 11628, + "time_per_iteration": 2.8216824531555176 + }, + { + "auxiliary_loss_clip": 0.01417994, + "auxiliary_loss_mlp": 0.01036911, + "balance_loss_clip": 1.25343192, + "balance_loss_mlp": 1.01683581, + "epoch": 0.6991733052758154, + "flos": 33997936769280.0, + "grad_norm": 4.283192159656449, + "language_loss": 0.68752337, + "learning_rate": 8.764034567182581e-07, + "loss": 0.71207237, + "num_input_tokens_seen": 250892745, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.20092773, + "step": 11629, + "time_per_iteration": 2.9502267837524414 + }, + { + "auxiliary_loss_clip": 0.01407847, + "auxiliary_loss_mlp": 0.01039797, + "balance_loss_clip": 1.24638462, + "balance_loss_mlp": 1.01830363, + "epoch": 0.6992334285284834, + "flos": 15641665309440.0, + "grad_norm": 1.5360720419650575, + "language_loss": 0.72862047, + "learning_rate": 8.760812863992337e-07, + "loss": 0.75309694, + "num_input_tokens_seen": 250910225, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.21496582, + "step": 11630, + "time_per_iteration": 4.261090040206909 + }, + { + "auxiliary_loss_clip": 0.01407244, + "auxiliary_loss_mlp": 0.01037094, + "balance_loss_clip": 1.24739695, + "balance_loss_mlp": 1.01753187, + "epoch": 0.6992935517811514, + "flos": 21736394380800.0, + "grad_norm": 1.6494894225888876, + "language_loss": 0.7479291, + "learning_rate": 8.757591586993196e-07, + "loss": 0.77237248, + "num_input_tokens_seen": 250929715, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19555664, + "step": 11631, + "time_per_iteration": 4.259166955947876 + }, + { + "auxiliary_loss_clip": 0.01419538, + "auxiliary_loss_mlp": 0.01033562, + "balance_loss_clip": 1.25327992, + "balance_loss_mlp": 1.01345158, + "epoch": 0.6993536750338193, + "flos": 20123484583680.0, + "grad_norm": 2.2520313960465175, + "language_loss": 0.8997699, + "learning_rate": 8.7543707363073e-07, + "loss": 0.92430085, + "num_input_tokens_seen": 250944230, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.2010498, + "step": 11632, + "time_per_iteration": 2.829986095428467 + }, + { + "auxiliary_loss_clip": 0.01421703, + "auxiliary_loss_mlp": 0.01041923, + "balance_loss_clip": 1.2573278, + "balance_loss_mlp": 1.0217886, + "epoch": 0.6994137982864873, + "flos": 22018759050240.0, + "grad_norm": 1.5746183664921196, + "language_loss": 0.80207133, + "learning_rate": 8.751150312056792e-07, + "loss": 0.8267076, + "num_input_tokens_seen": 250961865, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.20141602, + "step": 11633, + "time_per_iteration": 4.217574119567871 + }, + { + "auxiliary_loss_clip": 0.01429632, + "auxiliary_loss_mlp": 0.01036786, + "balance_loss_clip": 1.26162481, + "balance_loss_mlp": 1.01673508, + "epoch": 0.6994739215391552, + "flos": 25529658001920.0, + "grad_norm": 1.8917895637267577, + "language_loss": 0.6789993, + "learning_rate": 8.747930314363794e-07, + "loss": 0.70366347, + "num_input_tokens_seen": 250982025, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.20043945, + "step": 11634, + "time_per_iteration": 2.876605749130249 + }, + { + "auxiliary_loss_clip": 0.012001, + "auxiliary_loss_mlp": 0.01043487, + "balance_loss_clip": 1.1072793, + "balance_loss_mlp": 1.01525867, + "epoch": 0.6995340447918232, + "flos": 59158730747520.0, + "grad_norm": 1.0270482839022725, + "language_loss": 0.53184134, + "learning_rate": 8.744710743350412e-07, + "loss": 0.55427718, + "num_input_tokens_seen": 251046900, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.28320312, + "step": 11635, + "time_per_iteration": 3.476649522781372 + }, + { + "auxiliary_loss_clip": 0.01413378, + "auxiliary_loss_mlp": 0.01034417, + "balance_loss_clip": 1.25036299, + "balance_loss_mlp": 1.01549864, + "epoch": 0.6995941680444913, + "flos": 17976865397760.0, + "grad_norm": 1.601496346120854, + "language_loss": 0.82677889, + "learning_rate": 8.741491599138726e-07, + "loss": 0.85125685, + "num_input_tokens_seen": 251065050, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.18908691, + "step": 11636, + "time_per_iteration": 2.8381102085113525 + }, + { + "auxiliary_loss_clip": 0.0141215, + "auxiliary_loss_mlp": 0.01032904, + "balance_loss_clip": 1.24719501, + "balance_loss_mlp": 1.01263833, + "epoch": 0.6996542912971592, + "flos": 21989865605760.0, + "grad_norm": 2.355358243778302, + "language_loss": 0.83908296, + "learning_rate": 8.738272881850801e-07, + "loss": 0.8635335, + "num_input_tokens_seen": 251083355, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.20263672, + "step": 11637, + "time_per_iteration": 2.854966402053833 + }, + { + "auxiliary_loss_clip": 0.01410193, + "auxiliary_loss_mlp": 0.01039873, + "balance_loss_clip": 1.24778628, + "balance_loss_mlp": 1.01905942, + "epoch": 0.6997144145498272, + "flos": 11691971856000.0, + "grad_norm": 2.1893906400359775, + "language_loss": 0.68182689, + "learning_rate": 8.735054591608704e-07, + "loss": 0.70632756, + "num_input_tokens_seen": 251096420, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.20812988, + "step": 11638, + "time_per_iteration": 2.816181182861328 + }, + { + "auxiliary_loss_clip": 0.01424682, + "auxiliary_loss_mlp": 0.01039425, + "balance_loss_clip": 1.25607443, + "balance_loss_mlp": 1.01726377, + "epoch": 0.6997745378024951, + "flos": 29619312180480.0, + "grad_norm": 2.9199563216956492, + "language_loss": 0.78100514, + "learning_rate": 8.731836728534459e-07, + "loss": 0.80564618, + "num_input_tokens_seen": 251115410, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.22143555, + "step": 11639, + "time_per_iteration": 2.9106521606445312 + }, + { + "auxiliary_loss_clip": 0.01419318, + "auxiliary_loss_mlp": 0.0103783, + "balance_loss_clip": 1.25484812, + "balance_loss_mlp": 1.0176239, + "epoch": 0.6998346610551631, + "flos": 20896385823360.0, + "grad_norm": 2.320189263007193, + "language_loss": 0.83546048, + "learning_rate": 8.728619292750093e-07, + "loss": 0.86003196, + "num_input_tokens_seen": 251133530, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20214844, + "step": 11640, + "time_per_iteration": 2.8706612586975098 + }, + { + "auxiliary_loss_clip": 0.01402383, + "auxiliary_loss_mlp": 0.01035139, + "balance_loss_clip": 1.24002838, + "balance_loss_mlp": 1.01514757, + "epoch": 0.699894784307831, + "flos": 27174673624320.0, + "grad_norm": 1.663077428157904, + "language_loss": 0.76240885, + "learning_rate": 8.725402284377619e-07, + "loss": 0.78678405, + "num_input_tokens_seen": 251153985, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.20007324, + "step": 11641, + "time_per_iteration": 2.9198415279388428 + }, + { + "auxiliary_loss_clip": 0.01414219, + "auxiliary_loss_mlp": 0.01029808, + "balance_loss_clip": 1.25026226, + "balance_loss_mlp": 1.0092442, + "epoch": 0.699954907560499, + "flos": 20933559066240.0, + "grad_norm": 1.8581892058001863, + "language_loss": 0.78628612, + "learning_rate": 8.722185703539022e-07, + "loss": 0.8107264, + "num_input_tokens_seen": 251173225, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.20581055, + "step": 11642, + "time_per_iteration": 2.8708479404449463 + }, + { + "auxiliary_loss_clip": 0.01423029, + "auxiliary_loss_mlp": 0.01037299, + "balance_loss_clip": 1.25544488, + "balance_loss_mlp": 1.01596034, + "epoch": 0.700015030813167, + "flos": 28669503196800.0, + "grad_norm": 3.0795421260976266, + "language_loss": 0.75672638, + "learning_rate": 8.718969550356266e-07, + "loss": 0.78132963, + "num_input_tokens_seen": 251192485, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.21337891, + "step": 11643, + "time_per_iteration": 2.9185147285461426 + }, + { + "auxiliary_loss_clip": 0.01417869, + "auxiliary_loss_mlp": 0.0103392, + "balance_loss_clip": 1.25165319, + "balance_loss_mlp": 1.0132848, + "epoch": 0.700075154065835, + "flos": 29217193228800.0, + "grad_norm": 1.433249195726307, + "language_loss": 0.60669029, + "learning_rate": 8.715753824951315e-07, + "loss": 0.63120818, + "num_input_tokens_seen": 251214965, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.20617676, + "step": 11644, + "time_per_iteration": 2.9078919887542725 + }, + { + "auxiliary_loss_clip": 0.01409098, + "auxiliary_loss_mlp": 0.01037157, + "balance_loss_clip": 1.24673533, + "balance_loss_mlp": 1.01625943, + "epoch": 0.7001352773185029, + "flos": 23122826115840.0, + "grad_norm": 2.3539099497609626, + "language_loss": 0.81623888, + "learning_rate": 8.712538527446119e-07, + "loss": 0.84070146, + "num_input_tokens_seen": 251234500, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.20898438, + "step": 11645, + "time_per_iteration": 2.851039171218872 + }, + { + "auxiliary_loss_clip": 0.01397883, + "auxiliary_loss_mlp": 0.01034456, + "balance_loss_clip": 1.23689103, + "balance_loss_mlp": 1.01396394, + "epoch": 0.7001954005711709, + "flos": 21332329902720.0, + "grad_norm": 1.911168273091155, + "language_loss": 0.69521481, + "learning_rate": 8.709323657962584e-07, + "loss": 0.71953821, + "num_input_tokens_seen": 251254360, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.20495605, + "step": 11646, + "time_per_iteration": 2.829630136489868 + }, + { + "auxiliary_loss_clip": 0.01396332, + "auxiliary_loss_mlp": 0.01034597, + "balance_loss_clip": 1.23584652, + "balance_loss_mlp": 1.01439154, + "epoch": 0.7002555238238388, + "flos": 24546702562560.0, + "grad_norm": 1.645878462705099, + "language_loss": 0.71881169, + "learning_rate": 8.706109216622635e-07, + "loss": 0.74312097, + "num_input_tokens_seen": 251274790, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.20214844, + "step": 11647, + "time_per_iteration": 2.9959559440612793 + }, + { + "auxiliary_loss_clip": 0.0142113, + "auxiliary_loss_mlp": 0.01038449, + "balance_loss_clip": 1.25510192, + "balance_loss_mlp": 1.01771832, + "epoch": 0.7003156470765068, + "flos": 39071044080000.0, + "grad_norm": 1.6034074218353362, + "language_loss": 0.72463167, + "learning_rate": 8.702895203548155e-07, + "loss": 0.7492274, + "num_input_tokens_seen": 251296275, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.20727539, + "step": 11648, + "time_per_iteration": 3.015843391418457 + }, + { + "auxiliary_loss_clip": 0.01403745, + "auxiliary_loss_mlp": 0.01038335, + "balance_loss_clip": 1.24258184, + "balance_loss_mlp": 1.0177238, + "epoch": 0.7003757703291749, + "flos": 28815933928320.0, + "grad_norm": 1.631057113467804, + "language_loss": 0.77763641, + "learning_rate": 8.699681618861014e-07, + "loss": 0.80205727, + "num_input_tokens_seen": 251317375, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.20605469, + "step": 11649, + "time_per_iteration": 2.949418306350708 + }, + { + "auxiliary_loss_clip": 0.01401753, + "auxiliary_loss_mlp": 0.01036616, + "balance_loss_clip": 1.23973846, + "balance_loss_mlp": 1.01596904, + "epoch": 0.7004358935818428, + "flos": 15960479304960.0, + "grad_norm": 1.6911973467186812, + "language_loss": 0.79332536, + "learning_rate": 8.69646846268308e-07, + "loss": 0.81770909, + "num_input_tokens_seen": 251333570, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.20654297, + "step": 11650, + "time_per_iteration": 2.7975709438323975 + }, + { + "auxiliary_loss_clip": 0.01407922, + "auxiliary_loss_mlp": 0.01034971, + "balance_loss_clip": 1.24522853, + "balance_loss_mlp": 1.01468134, + "epoch": 0.7004960168345108, + "flos": 20421549198720.0, + "grad_norm": 2.1309457612072773, + "language_loss": 0.78996515, + "learning_rate": 8.693255735136194e-07, + "loss": 0.81439412, + "num_input_tokens_seen": 251351070, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.203125, + "step": 11651, + "time_per_iteration": 2.8309266567230225 + }, + { + "auxiliary_loss_clip": 0.0142303, + "auxiliary_loss_mlp": 0.01033338, + "balance_loss_clip": 1.25618362, + "balance_loss_mlp": 1.01304865, + "epoch": 0.7005561400871787, + "flos": 17356412448000.0, + "grad_norm": 2.0288439151830255, + "language_loss": 0.70358759, + "learning_rate": 8.690043436342198e-07, + "loss": 0.72815132, + "num_input_tokens_seen": 251370005, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.20288086, + "step": 11652, + "time_per_iteration": 2.8250958919525146 + }, + { + "auxiliary_loss_clip": 0.01413711, + "auxiliary_loss_mlp": 0.01037803, + "balance_loss_clip": 1.25041938, + "balance_loss_mlp": 1.01645219, + "epoch": 0.7006162633398467, + "flos": 25313224285440.0, + "grad_norm": 1.4662833090541683, + "language_loss": 0.75230718, + "learning_rate": 8.686831566422874e-07, + "loss": 0.77682233, + "num_input_tokens_seen": 251391210, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.21337891, + "step": 11653, + "time_per_iteration": 3.0283145904541016 + }, + { + "auxiliary_loss_clip": 0.01407983, + "auxiliary_loss_mlp": 0.01033669, + "balance_loss_clip": 1.24201953, + "balance_loss_mlp": 1.01225853, + "epoch": 0.7006763865925146, + "flos": 20679047210880.0, + "grad_norm": 2.15765240031465, + "language_loss": 0.7136907, + "learning_rate": 8.68362012550003e-07, + "loss": 0.7381072, + "num_input_tokens_seen": 251411505, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.21386719, + "step": 11654, + "time_per_iteration": 2.89398193359375 + }, + { + "auxiliary_loss_clip": 0.01410185, + "auxiliary_loss_mlp": 0.01033621, + "balance_loss_clip": 1.24557126, + "balance_loss_mlp": 1.0116744, + "epoch": 0.7007365098451827, + "flos": 20055743838720.0, + "grad_norm": 2.6906783904275984, + "language_loss": 0.73850405, + "learning_rate": 8.680409113695453e-07, + "loss": 0.76294208, + "num_input_tokens_seen": 251428975, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.21948242, + "step": 11655, + "time_per_iteration": 4.318296909332275 + }, + { + "auxiliary_loss_clip": 0.01434892, + "auxiliary_loss_mlp": 0.01039741, + "balance_loss_clip": 1.26287019, + "balance_loss_mlp": 1.01773477, + "epoch": 0.7007966330978506, + "flos": 20787173579520.0, + "grad_norm": 1.8732173570998834, + "language_loss": 0.7081793, + "learning_rate": 8.677198531130889e-07, + "loss": 0.73292565, + "num_input_tokens_seen": 251446940, + "router_z_loss_clip": 1.71875, + "router_z_loss_mlp": 0.2199707, + "step": 11656, + "time_per_iteration": 2.859370708465576 + }, + { + "auxiliary_loss_clip": 0.01403286, + "auxiliary_loss_mlp": 0.01033065, + "balance_loss_clip": 1.24236202, + "balance_loss_mlp": 1.01334763, + "epoch": 0.7008567563505186, + "flos": 29648929541760.0, + "grad_norm": 1.6541060634991553, + "language_loss": 0.78577822, + "learning_rate": 8.673988377928092e-07, + "loss": 0.81014174, + "num_input_tokens_seen": 251466205, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19702148, + "step": 11657, + "time_per_iteration": 2.9331865310668945 + }, + { + "auxiliary_loss_clip": 0.01422517, + "auxiliary_loss_mlp": 0.01039731, + "balance_loss_clip": 1.25240135, + "balance_loss_mlp": 1.01808286, + "epoch": 0.7009168796031865, + "flos": 17100769472640.0, + "grad_norm": 2.41577996669732, + "language_loss": 0.79259002, + "learning_rate": 8.670778654208797e-07, + "loss": 0.81721246, + "num_input_tokens_seen": 251484820, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.21655273, + "step": 11658, + "time_per_iteration": 2.8667688369750977 + }, + { + "auxiliary_loss_clip": 0.01400143, + "auxiliary_loss_mlp": 0.01032053, + "balance_loss_clip": 1.24096799, + "balance_loss_mlp": 1.01047647, + "epoch": 0.7009770028558545, + "flos": 20458677196800.0, + "grad_norm": 1.707573332022692, + "language_loss": 0.83248377, + "learning_rate": 8.667569360094713e-07, + "loss": 0.85680568, + "num_input_tokens_seen": 251502670, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.21582031, + "step": 11659, + "time_per_iteration": 2.8578951358795166 + }, + { + "auxiliary_loss_clip": 0.01397199, + "auxiliary_loss_mlp": 0.0103542, + "balance_loss_clip": 1.23641944, + "balance_loss_mlp": 1.01406968, + "epoch": 0.7010371261085224, + "flos": 19254582581760.0, + "grad_norm": 4.813987693673104, + "language_loss": 0.70703208, + "learning_rate": 8.664360495707526e-07, + "loss": 0.73135829, + "num_input_tokens_seen": 251521630, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.21362305, + "step": 11660, + "time_per_iteration": 2.83294939994812 + }, + { + "auxiliary_loss_clip": 0.01426492, + "auxiliary_loss_mlp": 0.0103735, + "balance_loss_clip": 1.25962543, + "balance_loss_mlp": 1.01654828, + "epoch": 0.7010972493611904, + "flos": 22137789415680.0, + "grad_norm": 1.6719302674615144, + "language_loss": 0.81556308, + "learning_rate": 8.661152061168924e-07, + "loss": 0.8402015, + "num_input_tokens_seen": 251540105, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.20800781, + "step": 11661, + "time_per_iteration": 2.878554344177246 + }, + { + "auxiliary_loss_clip": 0.01406032, + "auxiliary_loss_mlp": 0.01038149, + "balance_loss_clip": 1.24309111, + "balance_loss_mlp": 1.01808619, + "epoch": 0.7011573726138585, + "flos": 31402750204800.0, + "grad_norm": 1.756705381159017, + "language_loss": 0.79865682, + "learning_rate": 8.657944056600579e-07, + "loss": 0.82309866, + "num_input_tokens_seen": 251560530, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.20056152, + "step": 11662, + "time_per_iteration": 2.9275314807891846 + }, + { + "auxiliary_loss_clip": 0.01421803, + "auxiliary_loss_mlp": 0.01033182, + "balance_loss_clip": 1.2561208, + "balance_loss_mlp": 1.01191545, + "epoch": 0.7012174958665264, + "flos": 18159745455360.0, + "grad_norm": 1.7548390969245795, + "language_loss": 0.84271652, + "learning_rate": 8.654736482124134e-07, + "loss": 0.86726642, + "num_input_tokens_seen": 251577930, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.21264648, + "step": 11663, + "time_per_iteration": 2.844663381576538 + }, + { + "auxiliary_loss_clip": 0.01190019, + "auxiliary_loss_mlp": 0.0102965, + "balance_loss_clip": 1.10105753, + "balance_loss_mlp": 1.00561702, + "epoch": 0.7012776191191944, + "flos": 60679558097280.0, + "grad_norm": 0.8223345229736677, + "language_loss": 0.53843594, + "learning_rate": 8.651529337861209e-07, + "loss": 0.56063259, + "num_input_tokens_seen": 251638820, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.24023438, + "step": 11664, + "time_per_iteration": 4.7706732749938965 + }, + { + "auxiliary_loss_clip": 0.01415324, + "auxiliary_loss_mlp": 0.01032461, + "balance_loss_clip": 1.24937117, + "balance_loss_mlp": 1.01305401, + "epoch": 0.7013377423718623, + "flos": 27209991830400.0, + "grad_norm": 2.1675194380833545, + "language_loss": 0.80285597, + "learning_rate": 8.64832262393344e-07, + "loss": 0.82733387, + "num_input_tokens_seen": 251658070, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.1940918, + "step": 11665, + "time_per_iteration": 2.919515609741211 + }, + { + "auxiliary_loss_clip": 0.01399618, + "auxiliary_loss_mlp": 0.0103369, + "balance_loss_clip": 1.23837996, + "balance_loss_mlp": 1.01415181, + "epoch": 0.7013978656245303, + "flos": 16551857831040.0, + "grad_norm": 2.0369617925332704, + "language_loss": 0.77442813, + "learning_rate": 8.645116340462404e-07, + "loss": 0.79876125, + "num_input_tokens_seen": 251671575, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.1953125, + "step": 11666, + "time_per_iteration": 4.236992597579956 + }, + { + "auxiliary_loss_clip": 0.01412076, + "auxiliary_loss_mlp": 0.01033528, + "balance_loss_clip": 1.24825573, + "balance_loss_mlp": 1.01369143, + "epoch": 0.7014579888771982, + "flos": 23152850680320.0, + "grad_norm": 1.7288921220098765, + "language_loss": 0.8197701, + "learning_rate": 8.641910487569695e-07, + "loss": 0.84422612, + "num_input_tokens_seen": 251689350, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19836426, + "step": 11667, + "time_per_iteration": 2.862044095993042 + }, + { + "auxiliary_loss_clip": 0.01408383, + "auxiliary_loss_mlp": 0.01038499, + "balance_loss_clip": 1.24573076, + "balance_loss_mlp": 1.01789927, + "epoch": 0.7015181121298663, + "flos": 25092854271360.0, + "grad_norm": 2.215272923488967, + "language_loss": 0.65766925, + "learning_rate": 8.638705065376879e-07, + "loss": 0.68213809, + "num_input_tokens_seen": 251704635, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.20593262, + "step": 11668, + "time_per_iteration": 4.185842275619507 + }, + { + "auxiliary_loss_clip": 0.01413112, + "auxiliary_loss_mlp": 0.01030946, + "balance_loss_clip": 1.24686217, + "balance_loss_mlp": 1.01107371, + "epoch": 0.7015782353825342, + "flos": 23337450040320.0, + "grad_norm": 5.253937859876017, + "language_loss": 0.77483636, + "learning_rate": 8.635500074005519e-07, + "loss": 0.79927695, + "num_input_tokens_seen": 251723035, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19885254, + "step": 11669, + "time_per_iteration": 2.881883144378662 + }, + { + "auxiliary_loss_clip": 0.01193032, + "auxiliary_loss_mlp": 0.01042639, + "balance_loss_clip": 1.10291481, + "balance_loss_mlp": 1.02251625, + "epoch": 0.7016383586352022, + "flos": 70429083143040.0, + "grad_norm": 0.7155546492751289, + "language_loss": 0.54497766, + "learning_rate": 8.632295513577122e-07, + "loss": 0.56733435, + "num_input_tokens_seen": 251791630, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.20117188, + "step": 11670, + "time_per_iteration": 3.502450704574585 + }, + { + "auxiliary_loss_clip": 0.01403051, + "auxiliary_loss_mlp": 0.01034738, + "balance_loss_clip": 1.24179316, + "balance_loss_mlp": 1.01323247, + "epoch": 0.7016984818878701, + "flos": 19801820165760.0, + "grad_norm": 1.7319518615407454, + "language_loss": 0.82856905, + "learning_rate": 8.629091384213218e-07, + "loss": 0.85294694, + "num_input_tokens_seen": 251809840, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.21520996, + "step": 11671, + "time_per_iteration": 2.9164483547210693 + }, + { + "auxiliary_loss_clip": 0.01431932, + "auxiliary_loss_mlp": 0.01037377, + "balance_loss_clip": 1.26534271, + "balance_loss_mlp": 1.0161221, + "epoch": 0.7017586051405381, + "flos": 12904798717440.0, + "grad_norm": 2.6161132259047637, + "language_loss": 0.76140004, + "learning_rate": 8.625887686035313e-07, + "loss": 0.78609312, + "num_input_tokens_seen": 251827550, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.21252441, + "step": 11672, + "time_per_iteration": 2.866234540939331 + }, + { + "auxiliary_loss_clip": 0.01406019, + "auxiliary_loss_mlp": 0.01034719, + "balance_loss_clip": 1.2431767, + "balance_loss_mlp": 1.01531208, + "epoch": 0.701818728393206, + "flos": 18341992085760.0, + "grad_norm": 1.651082831586969, + "language_loss": 0.87152731, + "learning_rate": 8.622684419164883e-07, + "loss": 0.8959347, + "num_input_tokens_seen": 251844880, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.1940918, + "step": 11673, + "time_per_iteration": 2.833664655685425 + }, + { + "auxiliary_loss_clip": 0.01400827, + "auxiliary_loss_mlp": 0.01037659, + "balance_loss_clip": 1.23972988, + "balance_loss_mlp": 1.017048, + "epoch": 0.701878851645874, + "flos": 17393133242880.0, + "grad_norm": 4.248773406523166, + "language_loss": 0.74119234, + "learning_rate": 8.619481583723399e-07, + "loss": 0.7655772, + "num_input_tokens_seen": 251861025, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.20593262, + "step": 11674, + "time_per_iteration": 2.853548526763916 + }, + { + "auxiliary_loss_clip": 0.01410705, + "auxiliary_loss_mlp": 0.01035129, + "balance_loss_clip": 1.2502054, + "balance_loss_mlp": 1.01543605, + "epoch": 0.701938974898542, + "flos": 23926204368000.0, + "grad_norm": 3.4524403735306817, + "language_loss": 0.72892404, + "learning_rate": 8.616279179832329e-07, + "loss": 0.75338233, + "num_input_tokens_seen": 251880175, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19714355, + "step": 11675, + "time_per_iteration": 2.9173691272735596 + }, + { + "auxiliary_loss_clip": 0.01420978, + "auxiliary_loss_mlp": 0.01032849, + "balance_loss_clip": 1.25578523, + "balance_loss_mlp": 1.01376379, + "epoch": 0.70199909815121, + "flos": 21804768552960.0, + "grad_norm": 2.241672086597585, + "language_loss": 0.51960242, + "learning_rate": 8.613077207613078e-07, + "loss": 0.5441407, + "num_input_tokens_seen": 251899005, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.1907959, + "step": 11676, + "time_per_iteration": 2.8823370933532715 + }, + { + "auxiliary_loss_clip": 0.01189565, + "auxiliary_loss_mlp": 0.01013055, + "balance_loss_clip": 1.09878421, + "balance_loss_mlp": 0.99121612, + "epoch": 0.702059221403878, + "flos": 71748452805120.0, + "grad_norm": 0.7227030089320855, + "language_loss": 0.59223044, + "learning_rate": 8.609875667187079e-07, + "loss": 0.61425662, + "num_input_tokens_seen": 251966790, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.21875, + "step": 11677, + "time_per_iteration": 3.4079089164733887 + }, + { + "auxiliary_loss_clip": 0.01412288, + "auxiliary_loss_mlp": 0.01032557, + "balance_loss_clip": 1.24859357, + "balance_loss_mlp": 1.0131259, + "epoch": 0.7021193446565459, + "flos": 28122763305600.0, + "grad_norm": 2.3892905589112168, + "language_loss": 0.63287854, + "learning_rate": 8.606674558675737e-07, + "loss": 0.65732694, + "num_input_tokens_seen": 251989315, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19433594, + "step": 11678, + "time_per_iteration": 2.915574312210083 + }, + { + "auxiliary_loss_clip": 0.01400816, + "auxiliary_loss_mlp": 0.01033761, + "balance_loss_clip": 1.24021089, + "balance_loss_mlp": 1.01472354, + "epoch": 0.7021794679092139, + "flos": 22933928499840.0, + "grad_norm": 1.6368688558288498, + "language_loss": 0.79681289, + "learning_rate": 8.603473882200444e-07, + "loss": 0.82115865, + "num_input_tokens_seen": 252006620, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19055176, + "step": 11679, + "time_per_iteration": 2.8836517333984375 + }, + { + "auxiliary_loss_clip": 0.01405147, + "auxiliary_loss_mlp": 0.01033065, + "balance_loss_clip": 1.24641526, + "balance_loss_mlp": 1.01328802, + "epoch": 0.7022395911618818, + "flos": 18087027782400.0, + "grad_norm": 2.153153118456413, + "language_loss": 0.72105956, + "learning_rate": 8.600273637882567e-07, + "loss": 0.74544168, + "num_input_tokens_seen": 252024570, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.19775391, + "step": 11680, + "time_per_iteration": 2.822939157485962 + }, + { + "auxiliary_loss_clip": 0.01427027, + "auxiliary_loss_mlp": 0.01035452, + "balance_loss_clip": 1.26002657, + "balance_loss_mlp": 1.01532912, + "epoch": 0.7022997144145499, + "flos": 16042879365120.0, + "grad_norm": 1.5261468878983375, + "language_loss": 0.75447381, + "learning_rate": 8.597073825843446e-07, + "loss": 0.77909863, + "num_input_tokens_seen": 252042775, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.2010498, + "step": 11681, + "time_per_iteration": 2.8128974437713623 + }, + { + "auxiliary_loss_clip": 0.01403503, + "auxiliary_loss_mlp": 0.01034732, + "balance_loss_clip": 1.24368072, + "balance_loss_mlp": 1.01532435, + "epoch": 0.7023598376672178, + "flos": 26479828944000.0, + "grad_norm": 1.648678717570838, + "language_loss": 0.77546906, + "learning_rate": 8.593874446204434e-07, + "loss": 0.79985136, + "num_input_tokens_seen": 252063690, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.19421387, + "step": 11682, + "time_per_iteration": 2.869854211807251 + }, + { + "auxiliary_loss_clip": 0.01427581, + "auxiliary_loss_mlp": 0.01040762, + "balance_loss_clip": 1.26024902, + "balance_loss_mlp": 1.02142632, + "epoch": 0.7024199609198858, + "flos": 17064772594560.0, + "grad_norm": 3.966231573072461, + "language_loss": 0.74434125, + "learning_rate": 8.590675499086841e-07, + "loss": 0.76902473, + "num_input_tokens_seen": 252080335, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.19335938, + "step": 11683, + "time_per_iteration": 2.8685128688812256 + }, + { + "auxiliary_loss_clip": 0.0140757, + "auxiliary_loss_mlp": 0.01035405, + "balance_loss_clip": 1.24597049, + "balance_loss_mlp": 1.01461482, + "epoch": 0.7024800841725537, + "flos": 25860190400640.0, + "grad_norm": 1.7573304380246906, + "language_loss": 0.72474766, + "learning_rate": 8.587476984611976e-07, + "loss": 0.74917746, + "num_input_tokens_seen": 252101075, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.20788574, + "step": 11684, + "time_per_iteration": 2.8778140544891357 + }, + { + "auxiliary_loss_clip": 0.01411621, + "auxiliary_loss_mlp": 0.01040365, + "balance_loss_clip": 1.24965727, + "balance_loss_mlp": 1.021101, + "epoch": 0.7025402074252217, + "flos": 23523316254720.0, + "grad_norm": 2.500963492075203, + "language_loss": 0.73608088, + "learning_rate": 8.584278902901128e-07, + "loss": 0.76060081, + "num_input_tokens_seen": 252120510, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19274902, + "step": 11685, + "time_per_iteration": 2.8646981716156006 + }, + { + "auxiliary_loss_clip": 0.01404218, + "auxiliary_loss_mlp": 0.01033109, + "balance_loss_clip": 1.24011731, + "balance_loss_mlp": 1.01337957, + "epoch": 0.7026003306778896, + "flos": 20159029013760.0, + "grad_norm": 1.6875490952471681, + "language_loss": 0.85681832, + "learning_rate": 8.581081254075582e-07, + "loss": 0.88119161, + "num_input_tokens_seen": 252137590, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19726562, + "step": 11686, + "time_per_iteration": 2.8408091068267822 + }, + { + "auxiliary_loss_clip": 0.01187607, + "auxiliary_loss_mlp": 0.01022744, + "balance_loss_clip": 1.09771466, + "balance_loss_mlp": 1.00061917, + "epoch": 0.7026604539305576, + "flos": 64801318101120.0, + "grad_norm": 0.9793892573988293, + "language_loss": 0.70025963, + "learning_rate": 8.577884038256566e-07, + "loss": 0.72236311, + "num_input_tokens_seen": 252199830, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.22167969, + "step": 11687, + "time_per_iteration": 3.490342855453491 + }, + { + "auxiliary_loss_clip": 0.01417553, + "auxiliary_loss_mlp": 0.01039112, + "balance_loss_clip": 1.25483561, + "balance_loss_mlp": 1.01983595, + "epoch": 0.7027205771832256, + "flos": 21881422523520.0, + "grad_norm": 2.1625600301802823, + "language_loss": 0.77631295, + "learning_rate": 8.574687255565329e-07, + "loss": 0.8008796, + "num_input_tokens_seen": 252217200, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19287109, + "step": 11688, + "time_per_iteration": 2.8764820098876953 + }, + { + "auxiliary_loss_clip": 0.01399665, + "auxiliary_loss_mlp": 0.01039287, + "balance_loss_clip": 1.23807633, + "balance_loss_mlp": 1.01964164, + "epoch": 0.7027807004358936, + "flos": 23378242867200.0, + "grad_norm": 2.6774911839172546, + "language_loss": 0.69065332, + "learning_rate": 8.571490906123107e-07, + "loss": 0.71504283, + "num_input_tokens_seen": 252236105, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19641113, + "step": 11689, + "time_per_iteration": 2.8516480922698975 + }, + { + "auxiliary_loss_clip": 0.01419094, + "auxiliary_loss_mlp": 0.01039099, + "balance_loss_clip": 1.25319457, + "balance_loss_mlp": 1.0180707, + "epoch": 0.7028408236885616, + "flos": 15312580744320.0, + "grad_norm": 2.4239166753422317, + "language_loss": 0.80775821, + "learning_rate": 8.568294990051086e-07, + "loss": 0.83234006, + "num_input_tokens_seen": 252253315, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.21044922, + "step": 11690, + "time_per_iteration": 4.319056749343872 + }, + { + "auxiliary_loss_clip": 0.01406557, + "auxiliary_loss_mlp": 0.01034425, + "balance_loss_clip": 1.24412131, + "balance_loss_mlp": 1.01483881, + "epoch": 0.7029009469412295, + "flos": 22028396192640.0, + "grad_norm": 1.6948643174418232, + "language_loss": 0.76519263, + "learning_rate": 8.56509950747047e-07, + "loss": 0.7896024, + "num_input_tokens_seen": 252272765, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19580078, + "step": 11691, + "time_per_iteration": 2.8429200649261475 + }, + { + "auxiliary_loss_clip": 0.01416196, + "auxiliary_loss_mlp": 0.01035585, + "balance_loss_clip": 1.25401664, + "balance_loss_mlp": 1.01626086, + "epoch": 0.7029610701938975, + "flos": 21845606624640.0, + "grad_norm": 1.6535280273734518, + "language_loss": 0.82285202, + "learning_rate": 8.561904458502429e-07, + "loss": 0.84736979, + "num_input_tokens_seen": 252290510, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.1932373, + "step": 11692, + "time_per_iteration": 2.836735725402832 + }, + { + "auxiliary_loss_clip": 0.01411024, + "auxiliary_loss_mlp": 0.01040428, + "balance_loss_clip": 1.24962151, + "balance_loss_mlp": 1.01939917, + "epoch": 0.7030211934465654, + "flos": 19145053624320.0, + "grad_norm": 1.6144173124374728, + "language_loss": 0.77259868, + "learning_rate": 8.558709843268111e-07, + "loss": 0.79711318, + "num_input_tokens_seen": 252309365, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.21044922, + "step": 11693, + "time_per_iteration": 2.9013922214508057 + }, + { + "auxiliary_loss_clip": 0.01414215, + "auxiliary_loss_mlp": 0.01037368, + "balance_loss_clip": 1.25190973, + "balance_loss_mlp": 1.01730514, + "epoch": 0.7030813166992335, + "flos": 38560436801280.0, + "grad_norm": 1.3709085605453615, + "language_loss": 0.68684542, + "learning_rate": 8.55551566188866e-07, + "loss": 0.71136117, + "num_input_tokens_seen": 252333010, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.20056152, + "step": 11694, + "time_per_iteration": 2.9837522506713867 + }, + { + "auxiliary_loss_clip": 0.01414567, + "auxiliary_loss_mlp": 0.01039002, + "balance_loss_clip": 1.25157261, + "balance_loss_mlp": 1.01944017, + "epoch": 0.7031414399519014, + "flos": 14729165303040.0, + "grad_norm": 2.3111271743218755, + "language_loss": 0.76335579, + "learning_rate": 8.552321914485203e-07, + "loss": 0.78789151, + "num_input_tokens_seen": 252351330, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19567871, + "step": 11695, + "time_per_iteration": 2.8471624851226807 + }, + { + "auxiliary_loss_clip": 0.01432059, + "auxiliary_loss_mlp": 0.01042139, + "balance_loss_clip": 1.26558352, + "balance_loss_mlp": 1.02229059, + "epoch": 0.7032015632045694, + "flos": 14035225518720.0, + "grad_norm": 17.38644250736391, + "language_loss": 0.74414468, + "learning_rate": 8.549128601178852e-07, + "loss": 0.76888669, + "num_input_tokens_seen": 252369580, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.19836426, + "step": 11696, + "time_per_iteration": 2.8867242336273193 + }, + { + "auxiliary_loss_clip": 0.01413803, + "auxiliary_loss_mlp": 0.01042704, + "balance_loss_clip": 1.24972343, + "balance_loss_mlp": 1.02248621, + "epoch": 0.7032616864572373, + "flos": 27648876821760.0, + "grad_norm": 1.6190303156366626, + "language_loss": 0.76160038, + "learning_rate": 8.545935722090693e-07, + "loss": 0.78616548, + "num_input_tokens_seen": 252390525, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.20227051, + "step": 11697, + "time_per_iteration": 2.9000773429870605 + }, + { + "auxiliary_loss_clip": 0.01417369, + "auxiliary_loss_mlp": 0.01043241, + "balance_loss_clip": 1.25195742, + "balance_loss_mlp": 1.02148509, + "epoch": 0.7033218097099053, + "flos": 17976186725760.0, + "grad_norm": 3.4961286415473505, + "language_loss": 0.80649567, + "learning_rate": 8.542743277341793e-07, + "loss": 0.83110183, + "num_input_tokens_seen": 252407470, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.2175293, + "step": 11698, + "time_per_iteration": 2.9439756870269775 + }, + { + "auxiliary_loss_clip": 0.01410451, + "auxiliary_loss_mlp": 0.01045086, + "balance_loss_clip": 1.24644291, + "balance_loss_mlp": 1.02501118, + "epoch": 0.7033819329625732, + "flos": 19511537656320.0, + "grad_norm": 4.074828101394712, + "language_loss": 0.85388255, + "learning_rate": 8.539551267053222e-07, + "loss": 0.87843788, + "num_input_tokens_seen": 252427025, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.20068359, + "step": 11699, + "time_per_iteration": 4.319744825363159 + }, + { + "auxiliary_loss_clip": 0.01410417, + "auxiliary_loss_mlp": 0.01035446, + "balance_loss_clip": 1.25079608, + "balance_loss_mlp": 1.01508546, + "epoch": 0.7034420562152413, + "flos": 23998152879360.0, + "grad_norm": 2.4309047385389304, + "language_loss": 0.8030771, + "learning_rate": 8.53635969134601e-07, + "loss": 0.82753575, + "num_input_tokens_seen": 252445410, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.20361328, + "step": 11700, + "time_per_iteration": 2.854562759399414 + }, + { + "auxiliary_loss_clip": 0.01405744, + "auxiliary_loss_mlp": 0.01037116, + "balance_loss_clip": 1.24136543, + "balance_loss_mlp": 1.01696944, + "epoch": 0.7035021794679092, + "flos": 35056957996800.0, + "grad_norm": 3.494171653738354, + "language_loss": 0.75031948, + "learning_rate": 8.533168550341186e-07, + "loss": 0.77474809, + "num_input_tokens_seen": 252463905, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20153809, + "step": 11701, + "time_per_iteration": 4.414593696594238 + }, + { + "auxiliary_loss_clip": 0.01420456, + "auxiliary_loss_mlp": 0.01038795, + "balance_loss_clip": 1.2551415, + "balance_loss_mlp": 1.01880336, + "epoch": 0.7035623027205772, + "flos": 11004230609280.0, + "grad_norm": 2.1619829796653094, + "language_loss": 0.85422468, + "learning_rate": 8.529977844159769e-07, + "loss": 0.8788172, + "num_input_tokens_seen": 252478655, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19970703, + "step": 11702, + "time_per_iteration": 2.791396379470825 + }, + { + "auxiliary_loss_clip": 0.01420438, + "auxiliary_loss_mlp": 0.01037859, + "balance_loss_clip": 1.25510776, + "balance_loss_mlp": 1.01823735, + "epoch": 0.7036224259732452, + "flos": 23634383535360.0, + "grad_norm": 2.0744044274512112, + "language_loss": 0.61591256, + "learning_rate": 8.526787572922738e-07, + "loss": 0.64049554, + "num_input_tokens_seen": 252498740, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19616699, + "step": 11703, + "time_per_iteration": 4.252660274505615 + }, + { + "auxiliary_loss_clip": 0.01412125, + "auxiliary_loss_mlp": 0.01041043, + "balance_loss_clip": 1.24862981, + "balance_loss_mlp": 1.02128959, + "epoch": 0.7036825492259131, + "flos": 31698688314240.0, + "grad_norm": 1.8687060685859946, + "language_loss": 0.62279016, + "learning_rate": 8.523597736751067e-07, + "loss": 0.64732182, + "num_input_tokens_seen": 252517800, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.1973877, + "step": 11704, + "time_per_iteration": 2.8988168239593506 + }, + { + "auxiliary_loss_clip": 0.01403541, + "auxiliary_loss_mlp": 0.01040308, + "balance_loss_clip": 1.24465919, + "balance_loss_mlp": 1.02084112, + "epoch": 0.7037426724785811, + "flos": 30205668533760.0, + "grad_norm": 2.1428627602599772, + "language_loss": 0.71592206, + "learning_rate": 8.520408335765719e-07, + "loss": 0.74036056, + "num_input_tokens_seen": 252539620, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.19458008, + "step": 11705, + "time_per_iteration": 2.928816080093384 + }, + { + "auxiliary_loss_clip": 0.01413515, + "auxiliary_loss_mlp": 0.01040077, + "balance_loss_clip": 1.25224042, + "balance_loss_mlp": 1.0196321, + "epoch": 0.703802795731249, + "flos": 24320948417280.0, + "grad_norm": 1.7883723472515363, + "language_loss": 0.62965024, + "learning_rate": 8.517219370087645e-07, + "loss": 0.65418613, + "num_input_tokens_seen": 252557300, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.20446777, + "step": 11706, + "time_per_iteration": 2.8349080085754395 + }, + { + "auxiliary_loss_clip": 0.01418618, + "auxiliary_loss_mlp": 0.01036619, + "balance_loss_clip": 1.25459421, + "balance_loss_mlp": 1.01715183, + "epoch": 0.7038629189839171, + "flos": 22539274940160.0, + "grad_norm": 3.5377052393858017, + "language_loss": 0.68881404, + "learning_rate": 8.514030839837756e-07, + "loss": 0.71336645, + "num_input_tokens_seen": 252576715, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19470215, + "step": 11707, + "time_per_iteration": 2.848904609680176 + }, + { + "auxiliary_loss_clip": 0.01414101, + "auxiliary_loss_mlp": 0.01033441, + "balance_loss_clip": 1.25257897, + "balance_loss_mlp": 1.0144155, + "epoch": 0.703923042236585, + "flos": 26261766414720.0, + "grad_norm": 1.7081140210224062, + "language_loss": 0.7699312, + "learning_rate": 8.510842745136974e-07, + "loss": 0.79440665, + "num_input_tokens_seen": 252596190, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19006348, + "step": 11708, + "time_per_iteration": 2.87967848777771 + }, + { + "auxiliary_loss_clip": 0.01411475, + "auxiliary_loss_mlp": 0.01037084, + "balance_loss_clip": 1.2511673, + "balance_loss_mlp": 1.01910698, + "epoch": 0.703983165489253, + "flos": 19399520234880.0, + "grad_norm": 3.8356073239700734, + "language_loss": 0.72602439, + "learning_rate": 8.50765508610619e-07, + "loss": 0.75050998, + "num_input_tokens_seen": 252613410, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.17993164, + "step": 11709, + "time_per_iteration": 2.838552951812744 + }, + { + "auxiliary_loss_clip": 0.01413225, + "auxiliary_loss_mlp": 0.01037748, + "balance_loss_clip": 1.25123358, + "balance_loss_mlp": 1.01844764, + "epoch": 0.7040432887419209, + "flos": 16690461212160.0, + "grad_norm": 1.987681309692948, + "language_loss": 0.79986364, + "learning_rate": 8.504467862866267e-07, + "loss": 0.82437336, + "num_input_tokens_seen": 252629150, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19311523, + "step": 11710, + "time_per_iteration": 2.8572070598602295 + }, + { + "auxiliary_loss_clip": 0.01408297, + "auxiliary_loss_mlp": 0.01037079, + "balance_loss_clip": 1.24529731, + "balance_loss_mlp": 1.01768374, + "epoch": 0.7041034119945889, + "flos": 21151078657920.0, + "grad_norm": 1.736698838977467, + "language_loss": 0.78114671, + "learning_rate": 8.501281075538076e-07, + "loss": 0.80560046, + "num_input_tokens_seen": 252648225, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19384766, + "step": 11711, + "time_per_iteration": 2.8947036266326904 + }, + { + "auxiliary_loss_clip": 0.01404777, + "auxiliary_loss_mlp": 0.01031534, + "balance_loss_clip": 1.24361348, + "balance_loss_mlp": 1.01280642, + "epoch": 0.7041635352472568, + "flos": 16919201514240.0, + "grad_norm": 2.028345937355536, + "language_loss": 0.74927068, + "learning_rate": 8.498094724242457e-07, + "loss": 0.77363372, + "num_input_tokens_seen": 252665380, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.18725586, + "step": 11712, + "time_per_iteration": 2.880039691925049 + }, + { + "auxiliary_loss_clip": 0.01193571, + "auxiliary_loss_mlp": 0.01026956, + "balance_loss_clip": 1.10209513, + "balance_loss_mlp": 1.0015887, + "epoch": 0.7042236584999249, + "flos": 71715306349440.0, + "grad_norm": 0.884630296077185, + "language_loss": 0.64648455, + "learning_rate": 8.494908809100247e-07, + "loss": 0.66868985, + "num_input_tokens_seen": 252727950, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.25390625, + "step": 11713, + "time_per_iteration": 3.4457130432128906 + }, + { + "auxiliary_loss_clip": 0.01403795, + "auxiliary_loss_mlp": 0.01032837, + "balance_loss_clip": 1.2432003, + "balance_loss_mlp": 1.01383495, + "epoch": 0.7042837817525928, + "flos": 28670000889600.0, + "grad_norm": 2.8965384301266623, + "language_loss": 0.73136985, + "learning_rate": 8.49172333023225e-07, + "loss": 0.75573611, + "num_input_tokens_seen": 252746770, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18994141, + "step": 11714, + "time_per_iteration": 2.923923969268799 + }, + { + "auxiliary_loss_clip": 0.01410852, + "auxiliary_loss_mlp": 0.01038415, + "balance_loss_clip": 1.24939799, + "balance_loss_mlp": 1.01875746, + "epoch": 0.7043439050052608, + "flos": 19762520417280.0, + "grad_norm": 1.6801676124325584, + "language_loss": 0.800547, + "learning_rate": 8.488538287759248e-07, + "loss": 0.82503968, + "num_input_tokens_seen": 252765610, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.1965332, + "step": 11715, + "time_per_iteration": 2.8640058040618896 + }, + { + "auxiliary_loss_clip": 0.01409913, + "auxiliary_loss_mlp": 0.01036048, + "balance_loss_clip": 1.24860334, + "balance_loss_mlp": 1.01779664, + "epoch": 0.7044040282579288, + "flos": 11543867066880.0, + "grad_norm": 2.503575400345042, + "language_loss": 0.71389914, + "learning_rate": 8.485353681802037e-07, + "loss": 0.73835874, + "num_input_tokens_seen": 252781610, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18237305, + "step": 11716, + "time_per_iteration": 2.8554110527038574 + }, + { + "auxiliary_loss_clip": 0.01428531, + "auxiliary_loss_mlp": 0.01037886, + "balance_loss_clip": 1.26072347, + "balance_loss_mlp": 1.01777494, + "epoch": 0.7044641515105967, + "flos": 33669078428160.0, + "grad_norm": 3.2845259292163127, + "language_loss": 0.67345643, + "learning_rate": 8.482169512481358e-07, + "loss": 0.69812053, + "num_input_tokens_seen": 252800600, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.20092773, + "step": 11717, + "time_per_iteration": 2.9919707775115967 + }, + { + "auxiliary_loss_clip": 0.01406047, + "auxiliary_loss_mlp": 0.01032895, + "balance_loss_clip": 1.2429173, + "balance_loss_mlp": 1.01385736, + "epoch": 0.7045242747632647, + "flos": 26735019471360.0, + "grad_norm": 1.405422833554187, + "language_loss": 0.74788904, + "learning_rate": 8.478985779917967e-07, + "loss": 0.77227843, + "num_input_tokens_seen": 252822310, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19055176, + "step": 11718, + "time_per_iteration": 2.8866968154907227 + }, + { + "auxiliary_loss_clip": 0.0141199, + "auxiliary_loss_mlp": 0.01033094, + "balance_loss_clip": 1.25163507, + "balance_loss_mlp": 1.01466393, + "epoch": 0.7045843980159326, + "flos": 26809049243520.0, + "grad_norm": 1.572311629146511, + "language_loss": 0.80300188, + "learning_rate": 8.475802484232606e-07, + "loss": 0.82745272, + "num_input_tokens_seen": 252842355, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18444824, + "step": 11719, + "time_per_iteration": 2.8984062671661377 + }, + { + "auxiliary_loss_clip": 0.01406029, + "auxiliary_loss_mlp": 0.01037942, + "balance_loss_clip": 1.24532437, + "balance_loss_mlp": 1.01835537, + "epoch": 0.7046445212686007, + "flos": 41590164856320.0, + "grad_norm": 1.6332817000678286, + "language_loss": 0.6667093, + "learning_rate": 8.472619625545951e-07, + "loss": 0.691149, + "num_input_tokens_seen": 252866785, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19592285, + "step": 11720, + "time_per_iteration": 3.0373404026031494 + }, + { + "auxiliary_loss_clip": 0.01433605, + "auxiliary_loss_mlp": 0.01036759, + "balance_loss_clip": 1.26589108, + "balance_loss_mlp": 1.01601672, + "epoch": 0.7047046445212686, + "flos": 15568676167680.0, + "grad_norm": 2.3595170420570684, + "language_loss": 0.80970562, + "learning_rate": 8.46943720397872e-07, + "loss": 0.83440924, + "num_input_tokens_seen": 252881870, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.20739746, + "step": 11721, + "time_per_iteration": 2.822758674621582 + }, + { + "auxiliary_loss_clip": 0.01186225, + "auxiliary_loss_mlp": 0.0103376, + "balance_loss_clip": 1.09772635, + "balance_loss_mlp": 1.01478148, + "epoch": 0.7047647677739366, + "flos": 70445597495040.0, + "grad_norm": 0.7684669782251707, + "language_loss": 0.64795172, + "learning_rate": 8.466255219651582e-07, + "loss": 0.67015159, + "num_input_tokens_seen": 252951300, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.18945312, + "step": 11722, + "time_per_iteration": 3.481835126876831 + }, + { + "auxiliary_loss_clip": 0.01406788, + "auxiliary_loss_mlp": 0.01030517, + "balance_loss_clip": 1.24529648, + "balance_loss_mlp": 1.01156294, + "epoch": 0.7048248910266045, + "flos": 23670651882240.0, + "grad_norm": 1.7148933353035734, + "language_loss": 0.66130352, + "learning_rate": 8.463073672685211e-07, + "loss": 0.68567657, + "num_input_tokens_seen": 252971400, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18969727, + "step": 11723, + "time_per_iteration": 2.9002490043640137 + }, + { + "auxiliary_loss_clip": 0.01415262, + "auxiliary_loss_mlp": 0.01033417, + "balance_loss_clip": 1.25239074, + "balance_loss_mlp": 1.01341367, + "epoch": 0.7048850142792725, + "flos": 21406993102080.0, + "grad_norm": 1.6594072748101552, + "language_loss": 0.8178637, + "learning_rate": 8.459892563200235e-07, + "loss": 0.84235048, + "num_input_tokens_seen": 252989475, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.19995117, + "step": 11724, + "time_per_iteration": 2.8698086738586426 + }, + { + "auxiliary_loss_clip": 0.01428634, + "auxiliary_loss_mlp": 0.01038691, + "balance_loss_clip": 1.2638073, + "balance_loss_mlp": 1.01898551, + "epoch": 0.7049451375319404, + "flos": 21656844743040.0, + "grad_norm": 1.6869796783430473, + "language_loss": 0.73291326, + "learning_rate": 8.456711891317296e-07, + "loss": 0.75758654, + "num_input_tokens_seen": 253007220, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.19702148, + "step": 11725, + "time_per_iteration": 4.262852430343628 + }, + { + "auxiliary_loss_clip": 0.01421264, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.25537157, + "balance_loss_mlp": 1.0135386, + "epoch": 0.7050052607846085, + "flos": 14875098341760.0, + "grad_norm": 2.105783292139515, + "language_loss": 0.78501236, + "learning_rate": 8.453531657156998e-07, + "loss": 0.80956519, + "num_input_tokens_seen": 253025410, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.20471191, + "step": 11726, + "time_per_iteration": 2.8376142978668213 + }, + { + "auxiliary_loss_clip": 0.014071, + "auxiliary_loss_mlp": 0.0103331, + "balance_loss_clip": 1.24508309, + "balance_loss_mlp": 1.01474953, + "epoch": 0.7050653840372764, + "flos": 19250329570560.0, + "grad_norm": 1.8262484046073542, + "language_loss": 0.71200049, + "learning_rate": 8.450351860839931e-07, + "loss": 0.73640454, + "num_input_tokens_seen": 253043305, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18554688, + "step": 11727, + "time_per_iteration": 2.832303524017334 + }, + { + "auxiliary_loss_clip": 0.01383133, + "auxiliary_loss_mlp": 0.01031834, + "balance_loss_clip": 1.22804761, + "balance_loss_mlp": 1.01303434, + "epoch": 0.7051255072899444, + "flos": 27791461745280.0, + "grad_norm": 1.7950727569978053, + "language_loss": 0.69593376, + "learning_rate": 8.44717250248668e-07, + "loss": 0.72008342, + "num_input_tokens_seen": 253062790, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.18786621, + "step": 11728, + "time_per_iteration": 2.9273786544799805 + }, + { + "auxiliary_loss_clip": 0.0140916, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.24708557, + "balance_loss_mlp": 1.01307404, + "epoch": 0.7051856305426124, + "flos": 27903660145920.0, + "grad_norm": 2.3055897850703673, + "language_loss": 0.73758757, + "learning_rate": 8.443993582217803e-07, + "loss": 0.76200169, + "num_input_tokens_seen": 253082055, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19189453, + "step": 11729, + "time_per_iteration": 2.898818016052246 + }, + { + "auxiliary_loss_clip": 0.01436788, + "auxiliary_loss_mlp": 0.01035405, + "balance_loss_clip": 1.26734591, + "balance_loss_mlp": 1.01603317, + "epoch": 0.7052457537952803, + "flos": 25053780746880.0, + "grad_norm": 1.7279837797542845, + "language_loss": 0.78894711, + "learning_rate": 8.440815100153862e-07, + "loss": 0.81366903, + "num_input_tokens_seen": 253102575, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.19372559, + "step": 11730, + "time_per_iteration": 2.8742361068725586 + }, + { + "auxiliary_loss_clip": 0.01424425, + "auxiliary_loss_mlp": 0.01036681, + "balance_loss_clip": 1.25833583, + "balance_loss_mlp": 1.01685667, + "epoch": 0.7053058770479483, + "flos": 21881739237120.0, + "grad_norm": 3.4886426485115583, + "language_loss": 0.63622737, + "learning_rate": 8.437637056415359e-07, + "loss": 0.66083848, + "num_input_tokens_seen": 253121290, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.19824219, + "step": 11731, + "time_per_iteration": 2.860541343688965 + }, + { + "auxiliary_loss_clip": 0.01426384, + "auxiliary_loss_mlp": 0.01034362, + "balance_loss_clip": 1.26062632, + "balance_loss_mlp": 1.01431084, + "epoch": 0.7053660003006162, + "flos": 16407282136320.0, + "grad_norm": 2.802976265484762, + "language_loss": 0.7533263, + "learning_rate": 8.434459451122815e-07, + "loss": 0.77793384, + "num_input_tokens_seen": 253139720, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.20056152, + "step": 11732, + "time_per_iteration": 2.9166829586029053 + }, + { + "auxiliary_loss_clip": 0.01414149, + "auxiliary_loss_mlp": 0.01030312, + "balance_loss_clip": 1.25400519, + "balance_loss_mlp": 1.01121449, + "epoch": 0.7054261235532843, + "flos": 22721974018560.0, + "grad_norm": 1.3902654141309376, + "language_loss": 0.71744335, + "learning_rate": 8.431282284396735e-07, + "loss": 0.74188799, + "num_input_tokens_seen": 253160250, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19104004, + "step": 11733, + "time_per_iteration": 2.9336273670196533 + }, + { + "auxiliary_loss_clip": 0.01400993, + "auxiliary_loss_mlp": 0.01035707, + "balance_loss_clip": 1.2391181, + "balance_loss_mlp": 1.01614475, + "epoch": 0.7054862468059522, + "flos": 13597652626560.0, + "grad_norm": 1.935950732763059, + "language_loss": 0.7504701, + "learning_rate": 8.428105556357583e-07, + "loss": 0.77483708, + "num_input_tokens_seen": 253178710, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19567871, + "step": 11734, + "time_per_iteration": 4.294881820678711 + }, + { + "auxiliary_loss_clip": 0.0143066, + "auxiliary_loss_mlp": 0.01034966, + "balance_loss_clip": 1.26177359, + "balance_loss_mlp": 1.01517701, + "epoch": 0.7055463700586202, + "flos": 15887263939200.0, + "grad_norm": 2.7269571064788662, + "language_loss": 0.69534481, + "learning_rate": 8.424929267125829e-07, + "loss": 0.72000104, + "num_input_tokens_seen": 253194805, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.19787598, + "step": 11735, + "time_per_iteration": 2.832582473754883 + }, + { + "auxiliary_loss_clip": 0.01417112, + "auxiliary_loss_mlp": 0.01037626, + "balance_loss_clip": 1.25207067, + "balance_loss_mlp": 1.01660919, + "epoch": 0.7056064933112881, + "flos": 23086603013760.0, + "grad_norm": 1.9021344881974143, + "language_loss": 0.73380101, + "learning_rate": 8.421753416821933e-07, + "loss": 0.75834835, + "num_input_tokens_seen": 253213895, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.21020508, + "step": 11736, + "time_per_iteration": 4.346101999282837 + }, + { + "auxiliary_loss_clip": 0.01400002, + "auxiliary_loss_mlp": 0.01029902, + "balance_loss_clip": 1.24053478, + "balance_loss_mlp": 1.01070881, + "epoch": 0.7056666165639561, + "flos": 24066798520320.0, + "grad_norm": 2.138126957856501, + "language_loss": 0.69515967, + "learning_rate": 8.41857800556629e-07, + "loss": 0.7194587, + "num_input_tokens_seen": 253231620, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19177246, + "step": 11737, + "time_per_iteration": 2.898561477661133 + }, + { + "auxiliary_loss_clip": 0.01408293, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.24414933, + "balance_loss_mlp": 1.01370347, + "epoch": 0.705726739816624, + "flos": 17502074017920.0, + "grad_norm": 2.2072242051407343, + "language_loss": 0.68314618, + "learning_rate": 8.415403033479332e-07, + "loss": 0.70757258, + "num_input_tokens_seen": 253249590, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.20654297, + "step": 11738, + "time_per_iteration": 4.328527450561523 + }, + { + "auxiliary_loss_clip": 0.01414858, + "auxiliary_loss_mlp": 0.01031814, + "balance_loss_clip": 1.25126946, + "balance_loss_mlp": 1.01197767, + "epoch": 0.7057868630692921, + "flos": 51367633205760.0, + "grad_norm": 1.5643705708665274, + "language_loss": 0.75804138, + "learning_rate": 8.41222850068145e-07, + "loss": 0.78250802, + "num_input_tokens_seen": 253273870, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19848633, + "step": 11739, + "time_per_iteration": 3.124157190322876 + }, + { + "auxiliary_loss_clip": 0.01404199, + "auxiliary_loss_mlp": 0.01030516, + "balance_loss_clip": 1.24482381, + "balance_loss_mlp": 1.01113307, + "epoch": 0.70584698632196, + "flos": 26113797360000.0, + "grad_norm": 2.5288766372304265, + "language_loss": 0.72054297, + "learning_rate": 8.409054407293032e-07, + "loss": 0.74489009, + "num_input_tokens_seen": 253293720, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19384766, + "step": 11740, + "time_per_iteration": 2.8951213359832764 + }, + { + "auxiliary_loss_clip": 0.01402454, + "auxiliary_loss_mlp": 0.01035382, + "balance_loss_clip": 1.24163568, + "balance_loss_mlp": 1.01612949, + "epoch": 0.705907109574628, + "flos": 21553016630400.0, + "grad_norm": 1.6552722094670702, + "language_loss": 0.82697344, + "learning_rate": 8.405880753434434e-07, + "loss": 0.85135174, + "num_input_tokens_seen": 253313700, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19250488, + "step": 11741, + "time_per_iteration": 2.870028018951416 + }, + { + "auxiliary_loss_clip": 0.01411901, + "auxiliary_loss_mlp": 0.01035587, + "balance_loss_clip": 1.24784136, + "balance_loss_mlp": 1.01489282, + "epoch": 0.705967232827296, + "flos": 22721069122560.0, + "grad_norm": 1.843197305765223, + "language_loss": 0.78849417, + "learning_rate": 8.402707539225993e-07, + "loss": 0.81296903, + "num_input_tokens_seen": 253332425, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.20703125, + "step": 11742, + "time_per_iteration": 2.8915622234344482 + }, + { + "auxiliary_loss_clip": 0.0141321, + "auxiliary_loss_mlp": 0.01029735, + "balance_loss_clip": 1.24659872, + "balance_loss_mlp": 1.01016116, + "epoch": 0.7060273560799639, + "flos": 28702151959680.0, + "grad_norm": 1.647994686042008, + "language_loss": 0.65101457, + "learning_rate": 8.39953476478805e-07, + "loss": 0.67544401, + "num_input_tokens_seen": 253353620, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.19580078, + "step": 11743, + "time_per_iteration": 2.898725748062134 + }, + { + "auxiliary_loss_clip": 0.01420202, + "auxiliary_loss_mlp": 0.01035153, + "balance_loss_clip": 1.25422978, + "balance_loss_mlp": 1.01512587, + "epoch": 0.7060874793326319, + "flos": 15714744940800.0, + "grad_norm": 2.03241198710867, + "language_loss": 0.66191149, + "learning_rate": 8.396362430240902e-07, + "loss": 0.68646502, + "num_input_tokens_seen": 253370930, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.20007324, + "step": 11744, + "time_per_iteration": 2.8372607231140137 + }, + { + "auxiliary_loss_clip": 0.01401648, + "auxiliary_loss_mlp": 0.0103353, + "balance_loss_clip": 1.24249518, + "balance_loss_mlp": 1.01320493, + "epoch": 0.7061476025852998, + "flos": 21516657793920.0, + "grad_norm": 1.7808625434823844, + "language_loss": 0.64245808, + "learning_rate": 8.393190535704857e-07, + "loss": 0.66680986, + "num_input_tokens_seen": 253389810, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.20336914, + "step": 11745, + "time_per_iteration": 2.8310179710388184 + }, + { + "auxiliary_loss_clip": 0.01418098, + "auxiliary_loss_mlp": 0.01029459, + "balance_loss_clip": 1.25367987, + "balance_loss_mlp": 1.01087427, + "epoch": 0.7062077258379679, + "flos": 28192992514560.0, + "grad_norm": 2.9863061585485418, + "language_loss": 0.72117251, + "learning_rate": 8.390019081300188e-07, + "loss": 0.74564803, + "num_input_tokens_seen": 253408685, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.18591309, + "step": 11746, + "time_per_iteration": 2.9126250743865967 + }, + { + "auxiliary_loss_clip": 0.01408586, + "auxiliary_loss_mlp": 0.01035867, + "balance_loss_clip": 1.24501097, + "balance_loss_mlp": 1.01595926, + "epoch": 0.7062678490906358, + "flos": 27864812845440.0, + "grad_norm": 1.4385372297867605, + "language_loss": 0.79875076, + "learning_rate": 8.386848067147175e-07, + "loss": 0.82319528, + "num_input_tokens_seen": 253429685, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19897461, + "step": 11747, + "time_per_iteration": 2.929713487625122 + }, + { + "auxiliary_loss_clip": 0.01406238, + "auxiliary_loss_mlp": 0.01034273, + "balance_loss_clip": 1.24664235, + "balance_loss_mlp": 1.01612914, + "epoch": 0.7063279723433038, + "flos": 23195136585600.0, + "grad_norm": 2.4654901022563904, + "language_loss": 0.66212523, + "learning_rate": 8.383677493366031e-07, + "loss": 0.68653035, + "num_input_tokens_seen": 253448260, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18139648, + "step": 11748, + "time_per_iteration": 2.8790979385375977 + }, + { + "auxiliary_loss_clip": 0.01404546, + "auxiliary_loss_mlp": 0.0103742, + "balance_loss_clip": 1.24289703, + "balance_loss_mlp": 1.01745248, + "epoch": 0.7063880955959717, + "flos": 20197016663040.0, + "grad_norm": 1.922223982414972, + "language_loss": 0.80742395, + "learning_rate": 8.380507360077003e-07, + "loss": 0.83184361, + "num_input_tokens_seen": 253467725, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19958496, + "step": 11749, + "time_per_iteration": 2.890880823135376 + }, + { + "auxiliary_loss_clip": 0.01183998, + "auxiliary_loss_mlp": 0.01024149, + "balance_loss_clip": 1.09615552, + "balance_loss_mlp": 1.00335932, + "epoch": 0.7064482188486397, + "flos": 63694310123520.0, + "grad_norm": 0.788223546880592, + "language_loss": 0.53999597, + "learning_rate": 8.377337667400304e-07, + "loss": 0.5620774, + "num_input_tokens_seen": 253526940, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.20800781, + "step": 11750, + "time_per_iteration": 3.3300135135650635 + }, + { + "auxiliary_loss_clip": 0.01417563, + "auxiliary_loss_mlp": 0.0104013, + "balance_loss_clip": 1.25427067, + "balance_loss_mlp": 1.01914966, + "epoch": 0.7065083421013076, + "flos": 25201930780800.0, + "grad_norm": 1.8281972416865868, + "language_loss": 0.79724866, + "learning_rate": 8.37416841545612e-07, + "loss": 0.82182556, + "num_input_tokens_seen": 253546160, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.2097168, + "step": 11751, + "time_per_iteration": 2.923722743988037 + }, + { + "auxiliary_loss_clip": 0.01402445, + "auxiliary_loss_mlp": 0.01033376, + "balance_loss_clip": 1.24208701, + "balance_loss_mlp": 1.01277637, + "epoch": 0.7065684653539757, + "flos": 22904084914560.0, + "grad_norm": 2.0577866301889087, + "language_loss": 0.68710268, + "learning_rate": 8.370999604364634e-07, + "loss": 0.71146089, + "num_input_tokens_seen": 253565505, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.20593262, + "step": 11752, + "time_per_iteration": 2.891237258911133 + }, + { + "auxiliary_loss_clip": 0.01399654, + "auxiliary_loss_mlp": 0.01039409, + "balance_loss_clip": 1.23932624, + "balance_loss_mlp": 1.01904845, + "epoch": 0.7066285886066436, + "flos": 23560760966400.0, + "grad_norm": 2.050684827106445, + "language_loss": 0.77145177, + "learning_rate": 8.367831234246025e-07, + "loss": 0.79584241, + "num_input_tokens_seen": 253585125, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.20361328, + "step": 11753, + "time_per_iteration": 2.8983354568481445 + }, + { + "auxiliary_loss_clip": 0.0140446, + "auxiliary_loss_mlp": 0.01033919, + "balance_loss_clip": 1.24517012, + "balance_loss_mlp": 1.01397562, + "epoch": 0.7066887118593116, + "flos": 21079175391360.0, + "grad_norm": 1.5815891673636249, + "language_loss": 0.71838897, + "learning_rate": 8.364663305220405e-07, + "loss": 0.74277276, + "num_input_tokens_seen": 253604815, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19934082, + "step": 11754, + "time_per_iteration": 2.8926970958709717 + }, + { + "auxiliary_loss_clip": 0.01411502, + "auxiliary_loss_mlp": 0.01035663, + "balance_loss_clip": 1.24801362, + "balance_loss_mlp": 1.01543307, + "epoch": 0.7067488351119796, + "flos": 21185582457600.0, + "grad_norm": 1.6786550188019733, + "language_loss": 0.89655709, + "learning_rate": 8.361495817407919e-07, + "loss": 0.92102873, + "num_input_tokens_seen": 253622855, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.20227051, + "step": 11755, + "time_per_iteration": 2.844797372817993 + }, + { + "auxiliary_loss_clip": 0.01397534, + "auxiliary_loss_mlp": 0.01038477, + "balance_loss_clip": 1.23670208, + "balance_loss_mlp": 1.01893818, + "epoch": 0.7068089583646475, + "flos": 20458993910400.0, + "grad_norm": 2.155656507534046, + "language_loss": 0.80343235, + "learning_rate": 8.358328770928678e-07, + "loss": 0.82779247, + "num_input_tokens_seen": 253642760, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19543457, + "step": 11756, + "time_per_iteration": 2.934812068939209 + }, + { + "auxiliary_loss_clip": 0.01192317, + "auxiliary_loss_mlp": 0.01034323, + "balance_loss_clip": 1.10141516, + "balance_loss_mlp": 1.01315165, + "epoch": 0.7068690816173155, + "flos": 59134570024320.0, + "grad_norm": 0.8268467830823323, + "language_loss": 0.60444289, + "learning_rate": 8.355162165902785e-07, + "loss": 0.62670934, + "num_input_tokens_seen": 253695685, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.21191406, + "step": 11757, + "time_per_iteration": 3.1590664386749268 + }, + { + "auxiliary_loss_clip": 0.0140247, + "auxiliary_loss_mlp": 0.0103799, + "balance_loss_clip": 1.24282217, + "balance_loss_mlp": 1.01743817, + "epoch": 0.7069292048699835, + "flos": 16259765529600.0, + "grad_norm": 1.6434649756400264, + "language_loss": 0.81035513, + "learning_rate": 8.351996002450307e-07, + "loss": 0.83475971, + "num_input_tokens_seen": 253713305, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.20556641, + "step": 11758, + "time_per_iteration": 2.821810007095337 + }, + { + "auxiliary_loss_clip": 0.01399951, + "auxiliary_loss_mlp": 0.010368, + "balance_loss_clip": 1.23936677, + "balance_loss_mlp": 1.0164752, + "epoch": 0.7069893281226515, + "flos": 41187819680640.0, + "grad_norm": 1.653778407348568, + "language_loss": 0.78193629, + "learning_rate": 8.348830280691304e-07, + "loss": 0.80630386, + "num_input_tokens_seen": 253736100, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.20324707, + "step": 11759, + "time_per_iteration": 3.057382822036743 + }, + { + "auxiliary_loss_clip": 0.01402874, + "auxiliary_loss_mlp": 0.01035131, + "balance_loss_clip": 1.24147832, + "balance_loss_mlp": 1.01465034, + "epoch": 0.7070494513753194, + "flos": 24218025200640.0, + "grad_norm": 1.6736212201766554, + "language_loss": 0.68621469, + "learning_rate": 8.34566500074583e-07, + "loss": 0.71059477, + "num_input_tokens_seen": 253757350, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.20495605, + "step": 11760, + "time_per_iteration": 4.348189830780029 + }, + { + "auxiliary_loss_clip": 0.01411618, + "auxiliary_loss_mlp": 0.01032196, + "balance_loss_clip": 1.24783456, + "balance_loss_mlp": 1.01290846, + "epoch": 0.7071095746279874, + "flos": 20193261344640.0, + "grad_norm": 1.8818082682733772, + "language_loss": 0.80562747, + "learning_rate": 8.342500162733899e-07, + "loss": 0.83006561, + "num_input_tokens_seen": 253772855, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19287109, + "step": 11761, + "time_per_iteration": 2.838639497756958 + }, + { + "auxiliary_loss_clip": 0.0140517, + "auxiliary_loss_mlp": 0.01037944, + "balance_loss_clip": 1.24359477, + "balance_loss_mlp": 1.01715398, + "epoch": 0.7071696978806553, + "flos": 18191172608640.0, + "grad_norm": 2.5281474533569, + "language_loss": 0.76317644, + "learning_rate": 8.33933576677553e-07, + "loss": 0.78760755, + "num_input_tokens_seen": 253790360, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.20800781, + "step": 11762, + "time_per_iteration": 2.854710817337036 + }, + { + "auxiliary_loss_clip": 0.01408965, + "auxiliary_loss_mlp": 0.01035156, + "balance_loss_clip": 1.24877143, + "balance_loss_mlp": 1.01475883, + "epoch": 0.7072298211333233, + "flos": 24141778433280.0, + "grad_norm": 6.554440055568179, + "language_loss": 0.77714741, + "learning_rate": 8.336171812990724e-07, + "loss": 0.80158854, + "num_input_tokens_seen": 253810585, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.20410156, + "step": 11763, + "time_per_iteration": 2.874415397644043 + }, + { + "auxiliary_loss_clip": 0.01408824, + "auxiliary_loss_mlp": 0.01041755, + "balance_loss_clip": 1.24583542, + "balance_loss_mlp": 1.02077377, + "epoch": 0.7072899443859912, + "flos": 27209765606400.0, + "grad_norm": 2.275166719996912, + "language_loss": 0.79459578, + "learning_rate": 8.333008301499453e-07, + "loss": 0.81910157, + "num_input_tokens_seen": 253829080, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.20983887, + "step": 11764, + "time_per_iteration": 2.8755879402160645 + }, + { + "auxiliary_loss_clip": 0.01425363, + "auxiliary_loss_mlp": 0.01040948, + "balance_loss_clip": 1.25942218, + "balance_loss_mlp": 1.01950228, + "epoch": 0.7073500676386593, + "flos": 16444274400000.0, + "grad_norm": 1.569428299007325, + "language_loss": 0.80529594, + "learning_rate": 8.32984523242167e-07, + "loss": 0.82995903, + "num_input_tokens_seen": 253846780, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.21435547, + "step": 11765, + "time_per_iteration": 2.9497225284576416 + }, + { + "auxiliary_loss_clip": 0.01408723, + "auxiliary_loss_mlp": 0.01036173, + "balance_loss_clip": 1.24681878, + "balance_loss_mlp": 1.0173018, + "epoch": 0.7074101908913272, + "flos": 27685461882240.0, + "grad_norm": 1.6753842986842695, + "language_loss": 0.69270205, + "learning_rate": 8.326682605877324e-07, + "loss": 0.71715105, + "num_input_tokens_seen": 253867075, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.18859863, + "step": 11766, + "time_per_iteration": 2.915818452835083 + }, + { + "auxiliary_loss_clip": 0.01414421, + "auxiliary_loss_mlp": 0.01034247, + "balance_loss_clip": 1.25077176, + "balance_loss_mlp": 1.01414788, + "epoch": 0.7074703141439952, + "flos": 22248585227520.0, + "grad_norm": 1.8922378048457638, + "language_loss": 0.64852607, + "learning_rate": 8.323520421986352e-07, + "loss": 0.67301273, + "num_input_tokens_seen": 253885790, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.20080566, + "step": 11767, + "time_per_iteration": 2.8752670288085938 + }, + { + "auxiliary_loss_clip": 0.01413585, + "auxiliary_loss_mlp": 0.01036433, + "balance_loss_clip": 1.25078201, + "balance_loss_mlp": 1.01591706, + "epoch": 0.7075304373966632, + "flos": 29655263813760.0, + "grad_norm": 2.2242424684608952, + "language_loss": 0.53755611, + "learning_rate": 8.320358680868646e-07, + "loss": 0.5620563, + "num_input_tokens_seen": 253907070, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.20532227, + "step": 11768, + "time_per_iteration": 2.9267711639404297 + }, + { + "auxiliary_loss_clip": 0.0139897, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.23894095, + "balance_loss_mlp": 1.01434898, + "epoch": 0.7075905606493311, + "flos": 19764601678080.0, + "grad_norm": 1.6240328521323, + "language_loss": 0.76344121, + "learning_rate": 8.317197382644119e-07, + "loss": 0.78777063, + "num_input_tokens_seen": 253927290, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19592285, + "step": 11769, + "time_per_iteration": 4.284012079238892 + }, + { + "auxiliary_loss_clip": 0.01190797, + "auxiliary_loss_mlp": 0.01022673, + "balance_loss_clip": 1.09979534, + "balance_loss_mlp": 0.9978776, + "epoch": 0.7076506839019991, + "flos": 65744023651200.0, + "grad_norm": 0.8530176786815913, + "language_loss": 0.62014562, + "learning_rate": 8.314036527432637e-07, + "loss": 0.64228034, + "num_input_tokens_seen": 253983440, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.24804688, + "step": 11770, + "time_per_iteration": 3.282191038131714 + }, + { + "auxiliary_loss_clip": 0.01426554, + "auxiliary_loss_mlp": 0.010352, + "balance_loss_clip": 1.2617445, + "balance_loss_mlp": 1.01591206, + "epoch": 0.707710807154667, + "flos": 23774841953280.0, + "grad_norm": 2.0932235567286517, + "language_loss": 0.770118, + "learning_rate": 8.310876115354055e-07, + "loss": 0.79473555, + "num_input_tokens_seen": 254003825, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.19274902, + "step": 11771, + "time_per_iteration": 4.325823545455933 + }, + { + "auxiliary_loss_clip": 0.0140007, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.24212635, + "balance_loss_mlp": 1.01286376, + "epoch": 0.7077709304073351, + "flos": 21261195797760.0, + "grad_norm": 1.4970080731864575, + "language_loss": 0.71892321, + "learning_rate": 8.307716146528221e-07, + "loss": 0.74325407, + "num_input_tokens_seen": 254023345, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.20166016, + "step": 11772, + "time_per_iteration": 4.419597864151001 + }, + { + "auxiliary_loss_clip": 0.01419149, + "auxiliary_loss_mlp": 0.01037373, + "balance_loss_clip": 1.25296295, + "balance_loss_mlp": 1.01700032, + "epoch": 0.707831053660003, + "flos": 20750498029440.0, + "grad_norm": 1.79749341623192, + "language_loss": 0.70174378, + "learning_rate": 8.30455662107496e-07, + "loss": 0.726309, + "num_input_tokens_seen": 254041815, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.20373535, + "step": 11773, + "time_per_iteration": 2.9243147373199463 + }, + { + "auxiliary_loss_clip": 0.01422279, + "auxiliary_loss_mlp": 0.01031682, + "balance_loss_clip": 1.25956452, + "balance_loss_mlp": 1.01300168, + "epoch": 0.707891176912671, + "flos": 21990996725760.0, + "grad_norm": 1.3916900871239033, + "language_loss": 0.71309465, + "learning_rate": 8.301397539114095e-07, + "loss": 0.73763418, + "num_input_tokens_seen": 254062065, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.18701172, + "step": 11774, + "time_per_iteration": 2.9795069694519043 + }, + { + "auxiliary_loss_clip": 0.01398138, + "auxiliary_loss_mlp": 0.01030159, + "balance_loss_clip": 1.24090397, + "balance_loss_mlp": 1.01134777, + "epoch": 0.7079513001653389, + "flos": 21078768188160.0, + "grad_norm": 1.4642857897857584, + "language_loss": 0.74824667, + "learning_rate": 8.298238900765407e-07, + "loss": 0.7725296, + "num_input_tokens_seen": 254080605, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18811035, + "step": 11775, + "time_per_iteration": 2.9243435859680176 + }, + { + "auxiliary_loss_clip": 0.01423568, + "auxiliary_loss_mlp": 0.01032542, + "balance_loss_clip": 1.25987411, + "balance_loss_mlp": 1.01336169, + "epoch": 0.7080114234180069, + "flos": 18049673560320.0, + "grad_norm": 1.7006219682496255, + "language_loss": 0.88204038, + "learning_rate": 8.295080706148665e-07, + "loss": 0.90660143, + "num_input_tokens_seen": 254098710, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19165039, + "step": 11776, + "time_per_iteration": 2.859626293182373 + }, + { + "auxiliary_loss_clip": 0.0139818, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.23714292, + "balance_loss_mlp": 1.01477623, + "epoch": 0.7080715466706748, + "flos": 15130650827520.0, + "grad_norm": 1.880657216374079, + "language_loss": 0.75473243, + "learning_rate": 8.291922955383641e-07, + "loss": 0.7790581, + "num_input_tokens_seen": 254117200, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19616699, + "step": 11777, + "time_per_iteration": 2.847273349761963 + }, + { + "auxiliary_loss_clip": 0.01438163, + "auxiliary_loss_mlp": 0.01036968, + "balance_loss_clip": 1.27117062, + "balance_loss_mlp": 1.01628566, + "epoch": 0.7081316699233429, + "flos": 14429381385600.0, + "grad_norm": 2.213838816948703, + "language_loss": 0.8301847, + "learning_rate": 8.288765648590066e-07, + "loss": 0.854936, + "num_input_tokens_seen": 254132115, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.20678711, + "step": 11778, + "time_per_iteration": 2.84698486328125 + }, + { + "auxiliary_loss_clip": 0.01392073, + "auxiliary_loss_mlp": 0.0103599, + "balance_loss_clip": 1.2355355, + "balance_loss_mlp": 1.0166657, + "epoch": 0.7081917931760108, + "flos": 23232943255680.0, + "grad_norm": 1.6728728316160362, + "language_loss": 0.8549999, + "learning_rate": 8.285608785887673e-07, + "loss": 0.87928057, + "num_input_tokens_seen": 254152285, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.19311523, + "step": 11779, + "time_per_iteration": 2.894827365875244 + }, + { + "auxiliary_loss_clip": 0.01426492, + "auxiliary_loss_mlp": 0.01037757, + "balance_loss_clip": 1.26249778, + "balance_loss_mlp": 1.01812291, + "epoch": 0.7082519164286788, + "flos": 39322072085760.0, + "grad_norm": 2.9668855089516324, + "language_loss": 0.72078484, + "learning_rate": 8.28245236739618e-07, + "loss": 0.74542737, + "num_input_tokens_seen": 254172805, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19616699, + "step": 11780, + "time_per_iteration": 3.018651247024536 + }, + { + "auxiliary_loss_clip": 0.01409564, + "auxiliary_loss_mlp": 0.01032657, + "balance_loss_clip": 1.24958575, + "balance_loss_mlp": 1.01411963, + "epoch": 0.7083120396813467, + "flos": 21660690551040.0, + "grad_norm": 1.7498551831105367, + "language_loss": 0.73551118, + "learning_rate": 8.279296393235256e-07, + "loss": 0.75993335, + "num_input_tokens_seen": 254191890, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18530273, + "step": 11781, + "time_per_iteration": 2.9287118911743164 + }, + { + "auxiliary_loss_clip": 0.01405946, + "auxiliary_loss_mlp": 0.01034058, + "balance_loss_clip": 1.24643672, + "balance_loss_mlp": 1.01448393, + "epoch": 0.7083721629340147, + "flos": 17576103790080.0, + "grad_norm": 1.521327501725006, + "language_loss": 0.7850275, + "learning_rate": 8.276140863524585e-07, + "loss": 0.8094275, + "num_input_tokens_seen": 254210150, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19567871, + "step": 11782, + "time_per_iteration": 2.911020517349243 + }, + { + "auxiliary_loss_clip": 0.01404715, + "auxiliary_loss_mlp": 0.01031315, + "balance_loss_clip": 1.24437165, + "balance_loss_mlp": 1.01253915, + "epoch": 0.7084322861866827, + "flos": 29362266616320.0, + "grad_norm": 1.4229243759035972, + "language_loss": 0.70330679, + "learning_rate": 8.272985778383828e-07, + "loss": 0.72766709, + "num_input_tokens_seen": 254233015, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18774414, + "step": 11783, + "time_per_iteration": 2.9125282764434814 + }, + { + "auxiliary_loss_clip": 0.01423866, + "auxiliary_loss_mlp": 0.01034931, + "balance_loss_clip": 1.25917959, + "balance_loss_mlp": 1.01502299, + "epoch": 0.7084924094393507, + "flos": 20204120096640.0, + "grad_norm": 1.6642713925502632, + "language_loss": 0.7974844, + "learning_rate": 8.269831137932632e-07, + "loss": 0.82207233, + "num_input_tokens_seen": 254251345, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19897461, + "step": 11784, + "time_per_iteration": 2.875748634338379 + }, + { + "auxiliary_loss_clip": 0.0141859, + "auxiliary_loss_mlp": 0.01037647, + "balance_loss_clip": 1.25678015, + "balance_loss_mlp": 1.01812005, + "epoch": 0.7085525326920187, + "flos": 23487681335040.0, + "grad_norm": 1.549524927161694, + "language_loss": 0.77660084, + "learning_rate": 8.266676942290609e-07, + "loss": 0.8011632, + "num_input_tokens_seen": 254269905, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.1953125, + "step": 11785, + "time_per_iteration": 2.8858065605163574 + }, + { + "auxiliary_loss_clip": 0.01398938, + "auxiliary_loss_mlp": 0.010324, + "balance_loss_clip": 1.2385782, + "balance_loss_mlp": 1.01307619, + "epoch": 0.7086126559446866, + "flos": 25970081316480.0, + "grad_norm": 1.5871358556149409, + "language_loss": 0.78446662, + "learning_rate": 8.26352319157738e-07, + "loss": 0.80878007, + "num_input_tokens_seen": 254289990, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.1932373, + "step": 11786, + "time_per_iteration": 2.8944664001464844 + }, + { + "auxiliary_loss_clip": 0.01409853, + "auxiliary_loss_mlp": 0.01033813, + "balance_loss_clip": 1.24651456, + "balance_loss_mlp": 1.01506162, + "epoch": 0.7086727791973546, + "flos": 26736195836160.0, + "grad_norm": 2.1533953575037246, + "language_loss": 0.79202056, + "learning_rate": 8.260369885912526e-07, + "loss": 0.81645727, + "num_input_tokens_seen": 254309085, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.18762207, + "step": 11787, + "time_per_iteration": 2.8816089630126953 + }, + { + "auxiliary_loss_clip": 0.01408094, + "auxiliary_loss_mlp": 0.010343, + "balance_loss_clip": 1.24683976, + "balance_loss_mlp": 1.01464224, + "epoch": 0.7087329024500225, + "flos": 21691800990720.0, + "grad_norm": 2.173076644330915, + "language_loss": 0.77427161, + "learning_rate": 8.257217025415615e-07, + "loss": 0.79869562, + "num_input_tokens_seen": 254327045, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.1965332, + "step": 11788, + "time_per_iteration": 2.8515329360961914 + }, + { + "auxiliary_loss_clip": 0.01431719, + "auxiliary_loss_mlp": 0.01034031, + "balance_loss_clip": 1.26323068, + "balance_loss_mlp": 1.0136342, + "epoch": 0.7087930257026905, + "flos": 17940008868480.0, + "grad_norm": 2.065315075328281, + "language_loss": 0.68838525, + "learning_rate": 8.254064610206212e-07, + "loss": 0.71304274, + "num_input_tokens_seen": 254344585, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.20373535, + "step": 11789, + "time_per_iteration": 2.851297378540039 + }, + { + "auxiliary_loss_clip": 0.01422854, + "auxiliary_loss_mlp": 0.01032368, + "balance_loss_clip": 1.25873613, + "balance_loss_mlp": 1.01312721, + "epoch": 0.7088531489553584, + "flos": 18919525703040.0, + "grad_norm": 1.5366379091233429, + "language_loss": 0.78047806, + "learning_rate": 8.250912640403858e-07, + "loss": 0.80503023, + "num_input_tokens_seen": 254362470, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19238281, + "step": 11790, + "time_per_iteration": 2.8657472133636475 + }, + { + "auxiliary_loss_clip": 0.01422419, + "auxiliary_loss_mlp": 0.01031616, + "balance_loss_clip": 1.25583386, + "balance_loss_mlp": 1.01202965, + "epoch": 0.7089132722080265, + "flos": 27392283705600.0, + "grad_norm": 1.6425898750439292, + "language_loss": 0.7162562, + "learning_rate": 8.247761116128085e-07, + "loss": 0.74079657, + "num_input_tokens_seen": 254383190, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.19580078, + "step": 11791, + "time_per_iteration": 2.9373066425323486 + }, + { + "auxiliary_loss_clip": 0.01403646, + "auxiliary_loss_mlp": 0.01035738, + "balance_loss_clip": 1.24317336, + "balance_loss_mlp": 1.01569855, + "epoch": 0.7089733954606944, + "flos": 22172836152960.0, + "grad_norm": 1.4678239754939364, + "language_loss": 0.82489872, + "learning_rate": 8.244610037498376e-07, + "loss": 0.84929252, + "num_input_tokens_seen": 254403115, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.20031738, + "step": 11792, + "time_per_iteration": 2.9464361667633057 + }, + { + "auxiliary_loss_clip": 0.01423384, + "auxiliary_loss_mlp": 0.01031686, + "balance_loss_clip": 1.25747061, + "balance_loss_mlp": 1.01229072, + "epoch": 0.7090335187133624, + "flos": 24436540177920.0, + "grad_norm": 1.9648879748906931, + "language_loss": 0.65740097, + "learning_rate": 8.241459404634232e-07, + "loss": 0.68195164, + "num_input_tokens_seen": 254421875, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.19384766, + "step": 11793, + "time_per_iteration": 2.903441905975342 + }, + { + "auxiliary_loss_clip": 0.01402949, + "auxiliary_loss_mlp": 0.01030735, + "balance_loss_clip": 1.24321103, + "balance_loss_mlp": 1.01250768, + "epoch": 0.7090936419660303, + "flos": 21845244666240.0, + "grad_norm": 2.2120338988846435, + "language_loss": 0.71449792, + "learning_rate": 8.238309217655133e-07, + "loss": 0.73883474, + "num_input_tokens_seen": 254440765, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18225098, + "step": 11794, + "time_per_iteration": 4.319464683532715 + }, + { + "auxiliary_loss_clip": 0.01401193, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.24149632, + "balance_loss_mlp": 1.0149169, + "epoch": 0.7091537652186983, + "flos": 20091514492800.0, + "grad_norm": 1.8462024111891113, + "language_loss": 0.76108068, + "learning_rate": 8.23515947668052e-07, + "loss": 0.7854259, + "num_input_tokens_seen": 254459480, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18408203, + "step": 11795, + "time_per_iteration": 2.8404104709625244 + }, + { + "auxiliary_loss_clip": 0.01403571, + "auxiliary_loss_mlp": 0.01034354, + "balance_loss_clip": 1.24279165, + "balance_loss_mlp": 1.01472008, + "epoch": 0.7092138884713663, + "flos": 13159310572800.0, + "grad_norm": 2.286084851191497, + "language_loss": 0.76313311, + "learning_rate": 8.232010181829838e-07, + "loss": 0.78751236, + "num_input_tokens_seen": 254473985, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19628906, + "step": 11796, + "time_per_iteration": 2.8594119548797607 + }, + { + "auxiliary_loss_clip": 0.01421877, + "auxiliary_loss_mlp": 0.01036859, + "balance_loss_clip": 1.25595331, + "balance_loss_mlp": 1.01643801, + "epoch": 0.7092740117240343, + "flos": 21654175299840.0, + "grad_norm": 1.8115005294003268, + "language_loss": 0.75103605, + "learning_rate": 8.228861333222523e-07, + "loss": 0.77562344, + "num_input_tokens_seen": 254492135, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.20422363, + "step": 11797, + "time_per_iteration": 2.8479998111724854 + }, + { + "auxiliary_loss_clip": 0.01411739, + "auxiliary_loss_mlp": 0.0103133, + "balance_loss_clip": 1.24872947, + "balance_loss_mlp": 1.01201773, + "epoch": 0.7093341349767023, + "flos": 21042318862080.0, + "grad_norm": 1.3739851719368212, + "language_loss": 0.80235708, + "learning_rate": 8.225712930977953e-07, + "loss": 0.82678777, + "num_input_tokens_seen": 254512865, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19335938, + "step": 11798, + "time_per_iteration": 2.872982978820801 + }, + { + "auxiliary_loss_clip": 0.01401521, + "auxiliary_loss_mlp": 0.01038058, + "balance_loss_clip": 1.24083066, + "balance_loss_mlp": 1.01850808, + "epoch": 0.7093942582293702, + "flos": 22027717520640.0, + "grad_norm": 1.807580606646496, + "language_loss": 0.67374372, + "learning_rate": 8.222564975215529e-07, + "loss": 0.69813955, + "num_input_tokens_seen": 254532605, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19543457, + "step": 11799, + "time_per_iteration": 2.9210784435272217 + }, + { + "auxiliary_loss_clip": 0.01412672, + "auxiliary_loss_mlp": 0.01030983, + "balance_loss_clip": 1.25026357, + "balance_loss_mlp": 1.01113486, + "epoch": 0.7094543814820382, + "flos": 27247119828480.0, + "grad_norm": 1.6006384064006411, + "language_loss": 0.82592738, + "learning_rate": 8.219417466054622e-07, + "loss": 0.85036391, + "num_input_tokens_seen": 254553780, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19848633, + "step": 11800, + "time_per_iteration": 2.8932902812957764 + }, + { + "auxiliary_loss_clip": 0.01404107, + "auxiliary_loss_mlp": 0.01029074, + "balance_loss_clip": 1.24351132, + "balance_loss_mlp": 1.0107038, + "epoch": 0.7095145047347061, + "flos": 12095493396480.0, + "grad_norm": 1.7885518676166097, + "language_loss": 0.87658906, + "learning_rate": 8.21627040361459e-07, + "loss": 0.90092087, + "num_input_tokens_seen": 254567510, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18371582, + "step": 11801, + "time_per_iteration": 2.809185028076172 + }, + { + "auxiliary_loss_clip": 0.01405728, + "auxiliary_loss_mlp": 0.01033949, + "balance_loss_clip": 1.24366117, + "balance_loss_mlp": 1.01385045, + "epoch": 0.7095746279873741, + "flos": 19391376170880.0, + "grad_norm": 1.731042699542604, + "language_loss": 0.76814914, + "learning_rate": 8.213123788014758e-07, + "loss": 0.79254586, + "num_input_tokens_seen": 254585565, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.2010498, + "step": 11802, + "time_per_iteration": 2.8582847118377686 + }, + { + "auxiliary_loss_clip": 0.01399615, + "auxiliary_loss_mlp": 0.01037095, + "balance_loss_clip": 1.23841238, + "balance_loss_mlp": 1.01736534, + "epoch": 0.709634751240042, + "flos": 21370408041600.0, + "grad_norm": 2.176398794974906, + "language_loss": 0.8272692, + "learning_rate": 8.209977619374462e-07, + "loss": 0.85163629, + "num_input_tokens_seen": 254603465, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19726562, + "step": 11803, + "time_per_iteration": 4.325204372406006 + }, + { + "auxiliary_loss_clip": 0.01411566, + "auxiliary_loss_mlp": 0.01035153, + "balance_loss_clip": 1.24718463, + "balance_loss_mlp": 1.01444674, + "epoch": 0.7096948744927101, + "flos": 13924791665280.0, + "grad_norm": 2.674285770278976, + "language_loss": 0.69000149, + "learning_rate": 8.206831897812995e-07, + "loss": 0.71446866, + "num_input_tokens_seen": 254620500, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20703125, + "step": 11804, + "time_per_iteration": 2.8424248695373535 + }, + { + "auxiliary_loss_clip": 0.0139273, + "auxiliary_loss_mlp": 0.0103164, + "balance_loss_clip": 1.23674464, + "balance_loss_mlp": 1.01262593, + "epoch": 0.709754997745378, + "flos": 30310130073600.0, + "grad_norm": 1.8350930384779145, + "language_loss": 0.78987736, + "learning_rate": 8.203686623449637e-07, + "loss": 0.81412101, + "num_input_tokens_seen": 254638565, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.19006348, + "step": 11805, + "time_per_iteration": 2.960129737854004 + }, + { + "auxiliary_loss_clip": 0.01409011, + "auxiliary_loss_mlp": 0.01029663, + "balance_loss_clip": 1.24708223, + "balance_loss_mlp": 1.01056552, + "epoch": 0.709815120998046, + "flos": 18524329205760.0, + "grad_norm": 1.852044868558423, + "language_loss": 0.7944535, + "learning_rate": 8.200541796403667e-07, + "loss": 0.81884021, + "num_input_tokens_seen": 254657505, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19104004, + "step": 11806, + "time_per_iteration": 4.2590553760528564 + }, + { + "auxiliary_loss_clip": 0.01403555, + "auxiliary_loss_mlp": 0.01033653, + "balance_loss_clip": 1.24223411, + "balance_loss_mlp": 1.01373315, + "epoch": 0.7098752442507139, + "flos": 22282591334400.0, + "grad_norm": 2.116401135140007, + "language_loss": 0.57561421, + "learning_rate": 8.197397416794332e-07, + "loss": 0.59998631, + "num_input_tokens_seen": 254674730, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.19909668, + "step": 11807, + "time_per_iteration": 4.238116502761841 + }, + { + "auxiliary_loss_clip": 0.0141841, + "auxiliary_loss_mlp": 0.01037047, + "balance_loss_clip": 1.2507441, + "balance_loss_mlp": 1.01663876, + "epoch": 0.7099353675033819, + "flos": 19283295047040.0, + "grad_norm": 1.9009266081524119, + "language_loss": 0.6910795, + "learning_rate": 8.194253484740882e-07, + "loss": 0.71563411, + "num_input_tokens_seen": 254691665, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.20410156, + "step": 11808, + "time_per_iteration": 2.852538585662842 + }, + { + "auxiliary_loss_clip": 0.01419456, + "auxiliary_loss_mlp": 0.01034494, + "balance_loss_clip": 1.25398815, + "balance_loss_mlp": 1.01646924, + "epoch": 0.70999549075605, + "flos": 21918641011200.0, + "grad_norm": 2.3483660339635346, + "language_loss": 0.70925504, + "learning_rate": 8.191110000362513e-07, + "loss": 0.73379451, + "num_input_tokens_seen": 254711610, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.18029785, + "step": 11809, + "time_per_iteration": 2.8739731311798096 + }, + { + "auxiliary_loss_clip": 0.01187624, + "auxiliary_loss_mlp": 0.01033222, + "balance_loss_clip": 1.10149312, + "balance_loss_mlp": 1.0155791, + "epoch": 0.7100556140087179, + "flos": 70484336323200.0, + "grad_norm": 0.7562727968825211, + "language_loss": 0.59464979, + "learning_rate": 8.187966963778435e-07, + "loss": 0.61685824, + "num_input_tokens_seen": 254772615, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.17675781, + "step": 11810, + "time_per_iteration": 3.4314229488372803 + }, + { + "auxiliary_loss_clip": 0.01409339, + "auxiliary_loss_mlp": 0.01032738, + "balance_loss_clip": 1.24863768, + "balance_loss_mlp": 1.01379585, + "epoch": 0.7101157372613859, + "flos": 23049972708480.0, + "grad_norm": 1.607523570812524, + "language_loss": 0.75042045, + "learning_rate": 8.18482437510784e-07, + "loss": 0.77484119, + "num_input_tokens_seen": 254791375, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18945312, + "step": 11811, + "time_per_iteration": 2.888417959213257 + }, + { + "auxiliary_loss_clip": 0.013955, + "auxiliary_loss_mlp": 0.01031173, + "balance_loss_clip": 1.23806787, + "balance_loss_mlp": 1.01217127, + "epoch": 0.7101758605140538, + "flos": 23195724768000.0, + "grad_norm": 1.9162205372167527, + "language_loss": 0.84048653, + "learning_rate": 8.181682234469882e-07, + "loss": 0.86475325, + "num_input_tokens_seen": 254809300, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.19018555, + "step": 11812, + "time_per_iteration": 2.9091482162475586 + }, + { + "auxiliary_loss_clip": 0.01412247, + "auxiliary_loss_mlp": 0.01031417, + "balance_loss_clip": 1.24863172, + "balance_loss_mlp": 1.01183152, + "epoch": 0.7102359837667218, + "flos": 23706648760320.0, + "grad_norm": 1.4408822391913676, + "language_loss": 0.70896459, + "learning_rate": 8.178540541983716e-07, + "loss": 0.7334013, + "num_input_tokens_seen": 254829325, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19592285, + "step": 11813, + "time_per_iteration": 2.932417154312134 + }, + { + "auxiliary_loss_clip": 0.0139947, + "auxiliary_loss_mlp": 0.01028568, + "balance_loss_clip": 1.24050987, + "balance_loss_mlp": 1.00978017, + "epoch": 0.7102961070193897, + "flos": 19400606110080.0, + "grad_norm": 2.168805359282903, + "language_loss": 0.82723457, + "learning_rate": 8.175399297768495e-07, + "loss": 0.85151494, + "num_input_tokens_seen": 254847690, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18786621, + "step": 11814, + "time_per_iteration": 2.8755788803100586 + }, + { + "auxiliary_loss_clip": 0.01407613, + "auxiliary_loss_mlp": 0.01032321, + "balance_loss_clip": 1.24616921, + "balance_loss_mlp": 1.0123769, + "epoch": 0.7103562302720577, + "flos": 21517607934720.0, + "grad_norm": 1.7874839952207309, + "language_loss": 0.76916826, + "learning_rate": 8.172258501943301e-07, + "loss": 0.79356754, + "num_input_tokens_seen": 254865960, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19946289, + "step": 11815, + "time_per_iteration": 2.8501358032226562 + }, + { + "auxiliary_loss_clip": 0.01399472, + "auxiliary_loss_mlp": 0.01033668, + "balance_loss_clip": 1.24162591, + "balance_loss_mlp": 1.01478493, + "epoch": 0.7104163535247257, + "flos": 14542394192640.0, + "grad_norm": 1.6207973324635574, + "language_loss": 0.7935372, + "learning_rate": 8.16911815462725e-07, + "loss": 0.81786859, + "num_input_tokens_seen": 254882815, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.1887207, + "step": 11816, + "time_per_iteration": 2.8523595333099365 + }, + { + "auxiliary_loss_clip": 0.01389799, + "auxiliary_loss_mlp": 0.01031609, + "balance_loss_clip": 1.23066807, + "balance_loss_mlp": 1.01319122, + "epoch": 0.7104764767773937, + "flos": 11407616415360.0, + "grad_norm": 1.8688220483959654, + "language_loss": 0.87151116, + "learning_rate": 8.165978255939426e-07, + "loss": 0.89572525, + "num_input_tokens_seen": 254898705, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.1842041, + "step": 11817, + "time_per_iteration": 2.944854259490967 + }, + { + "auxiliary_loss_clip": 0.01391731, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.23256505, + "balance_loss_mlp": 1.01389039, + "epoch": 0.7105366000300616, + "flos": 11697355987200.0, + "grad_norm": 2.1614272031044024, + "language_loss": 0.85144889, + "learning_rate": 8.162838805998897e-07, + "loss": 0.87570298, + "num_input_tokens_seen": 254913665, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.19775391, + "step": 11818, + "time_per_iteration": 2.79404354095459 + }, + { + "auxiliary_loss_clip": 0.01406855, + "auxiliary_loss_mlp": 0.01031476, + "balance_loss_clip": 1.24559844, + "balance_loss_mlp": 1.0119729, + "epoch": 0.7105967232827296, + "flos": 19363342377600.0, + "grad_norm": 2.394402306909922, + "language_loss": 0.77036285, + "learning_rate": 8.159699804924709e-07, + "loss": 0.79474616, + "num_input_tokens_seen": 254932140, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.19494629, + "step": 11819, + "time_per_iteration": 2.8299973011016846 + }, + { + "auxiliary_loss_clip": 0.014152, + "auxiliary_loss_mlp": 0.01034839, + "balance_loss_clip": 1.25453067, + "balance_loss_mlp": 1.0146091, + "epoch": 0.7106568465353975, + "flos": 22940850954240.0, + "grad_norm": 2.7472545006669105, + "language_loss": 0.71525937, + "learning_rate": 8.156561252835883e-07, + "loss": 0.7397598, + "num_input_tokens_seen": 254951580, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.20227051, + "step": 11820, + "time_per_iteration": 2.8683018684387207 + }, + { + "auxiliary_loss_clip": 0.01398024, + "auxiliary_loss_mlp": 0.01032716, + "balance_loss_clip": 1.23795247, + "balance_loss_mlp": 1.0130347, + "epoch": 0.7107169697880655, + "flos": 19109192480640.0, + "grad_norm": 1.7506375107820273, + "language_loss": 0.76327211, + "learning_rate": 8.153423149851449e-07, + "loss": 0.78757954, + "num_input_tokens_seen": 254969425, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.19665527, + "step": 11821, + "time_per_iteration": 2.8369550704956055 + }, + { + "auxiliary_loss_clip": 0.01193156, + "auxiliary_loss_mlp": 0.01030648, + "balance_loss_clip": 1.1041801, + "balance_loss_mlp": 1.00880933, + "epoch": 0.7107770930407336, + "flos": 63665823882240.0, + "grad_norm": 0.7789398491004937, + "language_loss": 0.55221355, + "learning_rate": 8.150285496090388e-07, + "loss": 0.57445157, + "num_input_tokens_seen": 255032680, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.21875, + "step": 11822, + "time_per_iteration": 3.3936574459075928 + }, + { + "auxiliary_loss_clip": 0.01390486, + "auxiliary_loss_mlp": 0.01031516, + "balance_loss_clip": 1.23539829, + "balance_loss_mlp": 1.00978374, + "epoch": 0.7108372162934015, + "flos": 22064619294720.0, + "grad_norm": 1.9876100442588547, + "language_loss": 0.61242706, + "learning_rate": 8.147148291671688e-07, + "loss": 0.63664711, + "num_input_tokens_seen": 255054400, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.21728516, + "step": 11823, + "time_per_iteration": 2.8972604274749756 + }, + { + "auxiliary_loss_clip": 0.01411937, + "auxiliary_loss_mlp": 0.01029071, + "balance_loss_clip": 1.2513746, + "balance_loss_mlp": 1.01034307, + "epoch": 0.7108973395460695, + "flos": 19144420197120.0, + "grad_norm": 1.913044897778204, + "language_loss": 0.7247231, + "learning_rate": 8.144011536714322e-07, + "loss": 0.74913323, + "num_input_tokens_seen": 255072785, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18737793, + "step": 11824, + "time_per_iteration": 2.9166276454925537 + }, + { + "auxiliary_loss_clip": 0.01391368, + "auxiliary_loss_mlp": 0.01028088, + "balance_loss_clip": 1.23563528, + "balance_loss_mlp": 1.01086199, + "epoch": 0.7109574627987374, + "flos": 17903333318400.0, + "grad_norm": 1.7598402544006317, + "language_loss": 0.7332952, + "learning_rate": 8.140875231337223e-07, + "loss": 0.75748974, + "num_input_tokens_seen": 255091820, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.17224121, + "step": 11825, + "time_per_iteration": 2.83345627784729 + }, + { + "auxiliary_loss_clip": 0.01413768, + "auxiliary_loss_mlp": 0.01032756, + "balance_loss_clip": 1.25174618, + "balance_loss_mlp": 1.01339591, + "epoch": 0.7110175860514054, + "flos": 28989719781120.0, + "grad_norm": 1.6479384859953843, + "language_loss": 0.7999922, + "learning_rate": 8.137739375659321e-07, + "loss": 0.82445747, + "num_input_tokens_seen": 255111720, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19360352, + "step": 11826, + "time_per_iteration": 2.9245052337646484 + }, + { + "auxiliary_loss_clip": 0.01397812, + "auxiliary_loss_mlp": 0.01034, + "balance_loss_clip": 1.23958826, + "balance_loss_mlp": 1.01448596, + "epoch": 0.7110777093040733, + "flos": 26183483631360.0, + "grad_norm": 1.7744475549604568, + "language_loss": 0.83987868, + "learning_rate": 8.134603969799527e-07, + "loss": 0.86419678, + "num_input_tokens_seen": 255133495, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.19506836, + "step": 11827, + "time_per_iteration": 2.9219415187835693 + }, + { + "auxiliary_loss_clip": 0.01415764, + "auxiliary_loss_mlp": 0.01034658, + "balance_loss_clip": 1.25394297, + "balance_loss_mlp": 1.01488113, + "epoch": 0.7111378325567413, + "flos": 26881178734080.0, + "grad_norm": 2.346649839393538, + "language_loss": 0.62836397, + "learning_rate": 8.131469013876748e-07, + "loss": 0.65286815, + "num_input_tokens_seen": 255156880, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19787598, + "step": 11828, + "time_per_iteration": 2.9100098609924316 + }, + { + "auxiliary_loss_clip": 0.01410016, + "auxiliary_loss_mlp": 0.01034791, + "balance_loss_clip": 1.24919558, + "balance_loss_mlp": 1.01600373, + "epoch": 0.7111979558094093, + "flos": 27283297685760.0, + "grad_norm": 1.4601626564670322, + "language_loss": 0.72529471, + "learning_rate": 8.128334508009846e-07, + "loss": 0.74974275, + "num_input_tokens_seen": 255178920, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.18762207, + "step": 11829, + "time_per_iteration": 4.338937520980835 + }, + { + "auxiliary_loss_clip": 0.01404786, + "auxiliary_loss_mlp": 0.01035074, + "balance_loss_clip": 1.24493194, + "balance_loss_mlp": 1.01658416, + "epoch": 0.7112580790620773, + "flos": 25058033758080.0, + "grad_norm": 1.8193718901468916, + "language_loss": 0.80793399, + "learning_rate": 8.125200452317697e-07, + "loss": 0.83233261, + "num_input_tokens_seen": 255198095, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18493652, + "step": 11830, + "time_per_iteration": 2.873239755630493 + }, + { + "auxiliary_loss_clip": 0.01411435, + "auxiliary_loss_mlp": 0.01033331, + "balance_loss_clip": 1.24951124, + "balance_loss_mlp": 1.0138042, + "epoch": 0.7113182023147452, + "flos": 21654989706240.0, + "grad_norm": 1.8866192635889636, + "language_loss": 0.84762609, + "learning_rate": 8.122066846919138e-07, + "loss": 0.87207377, + "num_input_tokens_seen": 255215860, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19519043, + "step": 11831, + "time_per_iteration": 2.8795948028564453 + }, + { + "auxiliary_loss_clip": 0.01405388, + "auxiliary_loss_mlp": 0.01031555, + "balance_loss_clip": 1.24365842, + "balance_loss_mlp": 1.01241004, + "epoch": 0.7113783255674132, + "flos": 21006141004800.0, + "grad_norm": 1.9264576829828444, + "language_loss": 0.78577507, + "learning_rate": 8.118933691932985e-07, + "loss": 0.81014454, + "num_input_tokens_seen": 255235425, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19140625, + "step": 11832, + "time_per_iteration": 2.924132823944092 + }, + { + "auxiliary_loss_clip": 0.01186846, + "auxiliary_loss_mlp": 0.01035383, + "balance_loss_clip": 1.09856963, + "balance_loss_mlp": 1.01373446, + "epoch": 0.7114384488200811, + "flos": 66798429909120.0, + "grad_norm": 0.7558065806242333, + "language_loss": 0.56696212, + "learning_rate": 8.115800987478059e-07, + "loss": 0.5891844, + "num_input_tokens_seen": 255291680, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.21679688, + "step": 11833, + "time_per_iteration": 3.2933030128479004 + }, + { + "auxiliary_loss_clip": 0.01407303, + "auxiliary_loss_mlp": 0.01036109, + "balance_loss_clip": 1.24675202, + "balance_loss_mlp": 1.01757193, + "epoch": 0.7114985720727491, + "flos": 25021041494400.0, + "grad_norm": 2.222167741792351, + "language_loss": 0.71385181, + "learning_rate": 8.11266873367315e-07, + "loss": 0.7382859, + "num_input_tokens_seen": 255313880, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.1854248, + "step": 11834, + "time_per_iteration": 2.9071874618530273 + }, + { + "auxiliary_loss_clip": 0.01407916, + "auxiliary_loss_mlp": 0.01033645, + "balance_loss_clip": 1.24551654, + "balance_loss_mlp": 1.01353455, + "epoch": 0.7115586953254172, + "flos": 21479982243840.0, + "grad_norm": 1.948160052068014, + "language_loss": 0.80479467, + "learning_rate": 8.10953693063704e-07, + "loss": 0.82921028, + "num_input_tokens_seen": 255332390, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.20129395, + "step": 11835, + "time_per_iteration": 2.833233594894409 + }, + { + "auxiliary_loss_clip": 0.0140024, + "auxiliary_loss_mlp": 0.01030778, + "balance_loss_clip": 1.24151754, + "balance_loss_mlp": 1.01214528, + "epoch": 0.7116188185780851, + "flos": 28634637438720.0, + "grad_norm": 1.5341531583328798, + "language_loss": 0.76644242, + "learning_rate": 8.10640557848848e-07, + "loss": 0.79075259, + "num_input_tokens_seen": 255354025, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18640137, + "step": 11836, + "time_per_iteration": 2.946895122528076 + }, + { + "auxiliary_loss_clip": 0.01392042, + "auxiliary_loss_mlp": 0.01035906, + "balance_loss_clip": 1.23338938, + "balance_loss_mlp": 1.01535404, + "epoch": 0.7116789418307531, + "flos": 25302501267840.0, + "grad_norm": 1.8194935761325404, + "language_loss": 0.706725, + "learning_rate": 8.103274677346208e-07, + "loss": 0.73100448, + "num_input_tokens_seen": 255371400, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.20556641, + "step": 11837, + "time_per_iteration": 2.905081033706665 + }, + { + "auxiliary_loss_clip": 0.01425406, + "auxiliary_loss_mlp": 0.01038472, + "balance_loss_clip": 1.25980091, + "balance_loss_mlp": 1.01825428, + "epoch": 0.711739065083421, + "flos": 25568414812800.0, + "grad_norm": 3.979016252730088, + "language_loss": 0.62088335, + "learning_rate": 8.100144227328958e-07, + "loss": 0.64552212, + "num_input_tokens_seen": 255390710, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.20214844, + "step": 11838, + "time_per_iteration": 4.289820909500122 + }, + { + "auxiliary_loss_clip": 0.01410323, + "auxiliary_loss_mlp": 0.01035454, + "balance_loss_clip": 1.24958062, + "balance_loss_mlp": 1.01543832, + "epoch": 0.711799188336089, + "flos": 26152237457280.0, + "grad_norm": 3.850926765531573, + "language_loss": 0.67637575, + "learning_rate": 8.097014228555426e-07, + "loss": 0.70083356, + "num_input_tokens_seen": 255408790, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.20007324, + "step": 11839, + "time_per_iteration": 2.870102643966675 + }, + { + "auxiliary_loss_clip": 0.01404379, + "auxiliary_loss_mlp": 0.01035083, + "balance_loss_clip": 1.2435813, + "balance_loss_mlp": 1.01721334, + "epoch": 0.7118593115887569, + "flos": 21150128517120.0, + "grad_norm": 3.4084073019820083, + "language_loss": 0.85051638, + "learning_rate": 8.093884681144305e-07, + "loss": 0.87491095, + "num_input_tokens_seen": 255426280, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.17871094, + "step": 11840, + "time_per_iteration": 2.8398663997650146 + }, + { + "auxiliary_loss_clip": 0.01412978, + "auxiliary_loss_mlp": 0.0103556, + "balance_loss_clip": 1.24884391, + "balance_loss_mlp": 1.01572382, + "epoch": 0.711919434841425, + "flos": 14983858137600.0, + "grad_norm": 1.9961299719453303, + "language_loss": 0.77519131, + "learning_rate": 8.090755585214277e-07, + "loss": 0.79967666, + "num_input_tokens_seen": 255442935, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19836426, + "step": 11841, + "time_per_iteration": 4.194320201873779 + }, + { + "auxiliary_loss_clip": 0.01408016, + "auxiliary_loss_mlp": 0.01039461, + "balance_loss_clip": 1.24658108, + "balance_loss_mlp": 1.01966047, + "epoch": 0.7119795580940929, + "flos": 16517806479360.0, + "grad_norm": 3.518422326737597, + "language_loss": 0.75939703, + "learning_rate": 8.087626940883994e-07, + "loss": 0.78387177, + "num_input_tokens_seen": 255460925, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19799805, + "step": 11842, + "time_per_iteration": 4.2595531940460205 + }, + { + "auxiliary_loss_clip": 0.01190632, + "auxiliary_loss_mlp": 0.01029048, + "balance_loss_clip": 1.10297632, + "balance_loss_mlp": 1.00978363, + "epoch": 0.7120396813467609, + "flos": 66602854045440.0, + "grad_norm": 0.789500535665854, + "language_loss": 0.61747146, + "learning_rate": 8.084498748272082e-07, + "loss": 0.63966835, + "num_input_tokens_seen": 255521360, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.19238281, + "step": 11843, + "time_per_iteration": 3.3120384216308594 + }, + { + "auxiliary_loss_clip": 0.01404889, + "auxiliary_loss_mlp": 0.01033701, + "balance_loss_clip": 1.24419332, + "balance_loss_mlp": 1.01498497, + "epoch": 0.7120998045994288, + "flos": 26444013045120.0, + "grad_norm": 1.5499140814693286, + "language_loss": 0.80936527, + "learning_rate": 8.081371007497171e-07, + "loss": 0.83375114, + "num_input_tokens_seen": 255541435, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18725586, + "step": 11844, + "time_per_iteration": 2.918930768966675 + }, + { + "auxiliary_loss_clip": 0.01407723, + "auxiliary_loss_mlp": 0.01034148, + "balance_loss_clip": 1.24605083, + "balance_loss_mlp": 1.01474082, + "epoch": 0.7121599278520968, + "flos": 16434682502400.0, + "grad_norm": 2.3064723080222937, + "language_loss": 0.80285144, + "learning_rate": 8.078243718677873e-07, + "loss": 0.82727009, + "num_input_tokens_seen": 255558505, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19421387, + "step": 11845, + "time_per_iteration": 2.8584022521972656 + }, + { + "auxiliary_loss_clip": 0.01398073, + "auxiliary_loss_mlp": 0.01035543, + "balance_loss_clip": 1.24062586, + "balance_loss_mlp": 1.01527739, + "epoch": 0.7122200511047647, + "flos": 28961731232640.0, + "grad_norm": 2.672711427524689, + "language_loss": 0.78549361, + "learning_rate": 8.075116881932762e-07, + "loss": 0.80982971, + "num_input_tokens_seen": 255577815, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.20263672, + "step": 11846, + "time_per_iteration": 2.8994081020355225 + }, + { + "auxiliary_loss_clip": 0.0140795, + "auxiliary_loss_mlp": 0.0103728, + "balance_loss_clip": 1.24657774, + "balance_loss_mlp": 1.01809907, + "epoch": 0.7122801743574327, + "flos": 16480497502080.0, + "grad_norm": 2.269678257843906, + "language_loss": 0.58850449, + "learning_rate": 8.071990497380421e-07, + "loss": 0.61295676, + "num_input_tokens_seen": 255595885, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19177246, + "step": 11847, + "time_per_iteration": 2.851761817932129 + }, + { + "auxiliary_loss_clip": 0.01397099, + "auxiliary_loss_mlp": 0.01033628, + "balance_loss_clip": 1.24120998, + "balance_loss_mlp": 1.01426864, + "epoch": 0.7123402976101008, + "flos": 20640697603200.0, + "grad_norm": 1.7278890015027006, + "language_loss": 0.71740377, + "learning_rate": 8.068864565139395e-07, + "loss": 0.74171102, + "num_input_tokens_seen": 255616750, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.19360352, + "step": 11848, + "time_per_iteration": 2.9037811756134033 + }, + { + "auxiliary_loss_clip": 0.01190016, + "auxiliary_loss_mlp": 0.01023502, + "balance_loss_clip": 1.10170627, + "balance_loss_mlp": 1.00652623, + "epoch": 0.7124004208627687, + "flos": 62353467164160.0, + "grad_norm": 0.8213386957329786, + "language_loss": 0.63143504, + "learning_rate": 8.065739085328211e-07, + "loss": 0.65357023, + "num_input_tokens_seen": 255677900, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.16992188, + "step": 11849, + "time_per_iteration": 3.376105546951294 + }, + { + "auxiliary_loss_clip": 0.01408122, + "auxiliary_loss_mlp": 0.01032592, + "balance_loss_clip": 1.24641776, + "balance_loss_mlp": 1.01318514, + "epoch": 0.7124605441154367, + "flos": 39690411154560.0, + "grad_norm": 1.511069003726596, + "language_loss": 0.64597327, + "learning_rate": 8.0626140580654e-07, + "loss": 0.67038041, + "num_input_tokens_seen": 255699140, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.1940918, + "step": 11850, + "time_per_iteration": 3.000702142715454 + }, + { + "auxiliary_loss_clip": 0.01411647, + "auxiliary_loss_mlp": 0.01031708, + "balance_loss_clip": 1.2489084, + "balance_loss_mlp": 1.01222944, + "epoch": 0.7125206673681046, + "flos": 28193037759360.0, + "grad_norm": 1.4712276327350127, + "language_loss": 0.70499325, + "learning_rate": 8.05948948346946e-07, + "loss": 0.72942674, + "num_input_tokens_seen": 255719640, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19470215, + "step": 11851, + "time_per_iteration": 2.9218590259552 + }, + { + "auxiliary_loss_clip": 0.01398348, + "auxiliary_loss_mlp": 0.0103202, + "balance_loss_clip": 1.24028885, + "balance_loss_mlp": 1.01285124, + "epoch": 0.7125807906207726, + "flos": 26188053356160.0, + "grad_norm": 1.5070641829515175, + "language_loss": 0.83676845, + "learning_rate": 8.056365361658882e-07, + "loss": 0.86107218, + "num_input_tokens_seen": 255740450, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.19177246, + "step": 11852, + "time_per_iteration": 3.062297821044922 + }, + { + "auxiliary_loss_clip": 0.01417025, + "auxiliary_loss_mlp": 0.01033848, + "balance_loss_clip": 1.25301218, + "balance_loss_mlp": 1.01396346, + "epoch": 0.7126409138734405, + "flos": 17164121472000.0, + "grad_norm": 2.732363642488351, + "language_loss": 0.73547226, + "learning_rate": 8.053241692752126e-07, + "loss": 0.75998104, + "num_input_tokens_seen": 255758070, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19873047, + "step": 11853, + "time_per_iteration": 2.8431310653686523 + }, + { + "auxiliary_loss_clip": 0.01383354, + "auxiliary_loss_mlp": 0.01033205, + "balance_loss_clip": 1.22905171, + "balance_loss_mlp": 1.01414311, + "epoch": 0.7127010371261085, + "flos": 18779022040320.0, + "grad_norm": 2.253506757084599, + "language_loss": 0.93205488, + "learning_rate": 8.050118476867635e-07, + "loss": 0.95622039, + "num_input_tokens_seen": 255775685, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.1907959, + "step": 11854, + "time_per_iteration": 2.8059003353118896 + }, + { + "auxiliary_loss_clip": 0.0139778, + "auxiliary_loss_mlp": 0.01033383, + "balance_loss_clip": 1.24005544, + "balance_loss_mlp": 1.0132966, + "epoch": 0.7127611603787765, + "flos": 20386230992640.0, + "grad_norm": 1.786060219151393, + "language_loss": 0.80135965, + "learning_rate": 8.046995714123856e-07, + "loss": 0.82567132, + "num_input_tokens_seen": 255794750, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.20092773, + "step": 11855, + "time_per_iteration": 2.8763790130615234 + }, + { + "auxiliary_loss_clip": 0.01395153, + "auxiliary_loss_mlp": 0.01034094, + "balance_loss_clip": 1.23622298, + "balance_loss_mlp": 1.01374483, + "epoch": 0.7128212836314445, + "flos": 20458722441600.0, + "grad_norm": 2.34966046539848, + "language_loss": 0.73573393, + "learning_rate": 8.043873404639192e-07, + "loss": 0.7600264, + "num_input_tokens_seen": 255813325, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.20361328, + "step": 11856, + "time_per_iteration": 2.839785099029541 + }, + { + "auxiliary_loss_clip": 0.01409536, + "auxiliary_loss_mlp": 0.0103422, + "balance_loss_clip": 1.24723554, + "balance_loss_mlp": 1.01428807, + "epoch": 0.7128814068841124, + "flos": 23451322498560.0, + "grad_norm": 1.5141864167140535, + "language_loss": 0.70929384, + "learning_rate": 8.040751548532046e-07, + "loss": 0.73373145, + "num_input_tokens_seen": 255832470, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19934082, + "step": 11857, + "time_per_iteration": 2.8646161556243896 + }, + { + "auxiliary_loss_clip": 0.01398043, + "auxiliary_loss_mlp": 0.0103204, + "balance_loss_clip": 1.24019492, + "balance_loss_mlp": 1.01194096, + "epoch": 0.7129415301367804, + "flos": 18231965435520.0, + "grad_norm": 2.3923283681083847, + "language_loss": 0.85416102, + "learning_rate": 8.03763014592081e-07, + "loss": 0.87846184, + "num_input_tokens_seen": 255849740, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.20092773, + "step": 11858, + "time_per_iteration": 2.8267552852630615 + }, + { + "auxiliary_loss_clip": 0.01418148, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.25435662, + "balance_loss_mlp": 1.01316392, + "epoch": 0.7130016533894483, + "flos": 15532453065600.0, + "grad_norm": 2.0907860286477384, + "language_loss": 0.80838251, + "learning_rate": 8.034509196923829e-07, + "loss": 0.83289874, + "num_input_tokens_seen": 255866975, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.203125, + "step": 11859, + "time_per_iteration": 2.831976890563965 + }, + { + "auxiliary_loss_clip": 0.01398739, + "auxiliary_loss_mlp": 0.01033157, + "balance_loss_clip": 1.23986495, + "balance_loss_mlp": 1.01363027, + "epoch": 0.7130617766421163, + "flos": 57134499321600.0, + "grad_norm": 1.1965964578326465, + "language_loss": 0.69304812, + "learning_rate": 8.031388701659456e-07, + "loss": 0.71736705, + "num_input_tokens_seen": 255892915, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.19519043, + "step": 11860, + "time_per_iteration": 3.274263381958008 + }, + { + "auxiliary_loss_clip": 0.01410848, + "auxiliary_loss_mlp": 0.0103362, + "balance_loss_clip": 1.2491014, + "balance_loss_mlp": 1.01170897, + "epoch": 0.7131218998947844, + "flos": 19796752748160.0, + "grad_norm": 2.024056274445619, + "language_loss": 0.65422463, + "learning_rate": 8.028268660246023e-07, + "loss": 0.67866933, + "num_input_tokens_seen": 255911480, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.21887207, + "step": 11861, + "time_per_iteration": 2.871811628341675 + }, + { + "auxiliary_loss_clip": 0.01433189, + "auxiliary_loss_mlp": 0.01034653, + "balance_loss_clip": 1.26795816, + "balance_loss_mlp": 1.01510251, + "epoch": 0.7131820231474523, + "flos": 26663297184000.0, + "grad_norm": 1.6391349259497663, + "language_loss": 0.67823303, + "learning_rate": 8.025149072801849e-07, + "loss": 0.70291144, + "num_input_tokens_seen": 255931140, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19543457, + "step": 11862, + "time_per_iteration": 2.9294931888580322 + }, + { + "auxiliary_loss_clip": 0.01400144, + "auxiliary_loss_mlp": 0.01032486, + "balance_loss_clip": 1.24153602, + "balance_loss_mlp": 1.01434231, + "epoch": 0.7132421464001203, + "flos": 29217600432000.0, + "grad_norm": 10.683300890013431, + "language_loss": 0.67187977, + "learning_rate": 8.022029939445214e-07, + "loss": 0.69620597, + "num_input_tokens_seen": 255951665, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18127441, + "step": 11863, + "time_per_iteration": 2.9357428550720215 + }, + { + "auxiliary_loss_clip": 0.01432506, + "auxiliary_loss_mlp": 0.01032364, + "balance_loss_clip": 1.2655915, + "balance_loss_mlp": 1.01256323, + "epoch": 0.7133022696527882, + "flos": 23083390632960.0, + "grad_norm": 1.9489122789512379, + "language_loss": 0.66719675, + "learning_rate": 8.018911260294414e-07, + "loss": 0.69184542, + "num_input_tokens_seen": 255970055, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.19812012, + "step": 11864, + "time_per_iteration": 2.873563289642334 + }, + { + "auxiliary_loss_clip": 0.01419756, + "auxiliary_loss_mlp": 0.01039982, + "balance_loss_clip": 1.2553072, + "balance_loss_mlp": 1.01974058, + "epoch": 0.7133623929054562, + "flos": 17466031895040.0, + "grad_norm": 3.868180829100113, + "language_loss": 0.87023079, + "learning_rate": 8.015793035467697e-07, + "loss": 0.89482808, + "num_input_tokens_seen": 255987720, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20239258, + "step": 11865, + "time_per_iteration": 4.279973268508911 + }, + { + "auxiliary_loss_clip": 0.01404038, + "auxiliary_loss_mlp": 0.01035338, + "balance_loss_clip": 1.24174476, + "balance_loss_mlp": 1.01520324, + "epoch": 0.7134225161581241, + "flos": 19545950966400.0, + "grad_norm": 4.332891898787362, + "language_loss": 0.75812638, + "learning_rate": 8.012675265083304e-07, + "loss": 0.78252017, + "num_input_tokens_seen": 256005490, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.20141602, + "step": 11866, + "time_per_iteration": 2.915315866470337 + }, + { + "auxiliary_loss_clip": 0.0141467, + "auxiliary_loss_mlp": 0.01038588, + "balance_loss_clip": 1.25101566, + "balance_loss_mlp": 1.01753592, + "epoch": 0.7134826394107922, + "flos": 26261268721920.0, + "grad_norm": 2.079792507830645, + "language_loss": 0.71375346, + "learning_rate": 8.009557949259464e-07, + "loss": 0.73828608, + "num_input_tokens_seen": 256026030, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.21057129, + "step": 11867, + "time_per_iteration": 2.90033221244812 + }, + { + "auxiliary_loss_clip": 0.01391539, + "auxiliary_loss_mlp": 0.01031837, + "balance_loss_clip": 1.23447824, + "balance_loss_mlp": 1.01202416, + "epoch": 0.7135427626634601, + "flos": 15823821450240.0, + "grad_norm": 2.76461652209277, + "language_loss": 0.72139978, + "learning_rate": 8.006441088114397e-07, + "loss": 0.7456336, + "num_input_tokens_seen": 256043680, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.19812012, + "step": 11868, + "time_per_iteration": 2.836662530899048 + }, + { + "auxiliary_loss_clip": 0.01407043, + "auxiliary_loss_mlp": 0.01031867, + "balance_loss_clip": 1.24438369, + "balance_loss_mlp": 1.01214957, + "epoch": 0.7136028859161281, + "flos": 18232553617920.0, + "grad_norm": 3.326572851249641, + "language_loss": 0.66349816, + "learning_rate": 8.003324681766286e-07, + "loss": 0.68788731, + "num_input_tokens_seen": 256059705, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19726562, + "step": 11869, + "time_per_iteration": 2.7960124015808105 + }, + { + "auxiliary_loss_clip": 0.01406026, + "auxiliary_loss_mlp": 0.01033132, + "balance_loss_clip": 1.24264121, + "balance_loss_mlp": 1.01420164, + "epoch": 0.713663009168796, + "flos": 24325110938880.0, + "grad_norm": 1.6296236606908086, + "language_loss": 0.78629661, + "learning_rate": 8.000208730333298e-07, + "loss": 0.81068814, + "num_input_tokens_seen": 256079785, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.18920898, + "step": 11870, + "time_per_iteration": 2.883226156234741 + }, + { + "auxiliary_loss_clip": 0.01399258, + "auxiliary_loss_mlp": 0.01035984, + "balance_loss_clip": 1.23968148, + "balance_loss_mlp": 1.01502693, + "epoch": 0.713723132421464, + "flos": 26548836543360.0, + "grad_norm": 1.80721373865855, + "language_loss": 0.81634521, + "learning_rate": 7.997093233933597e-07, + "loss": 0.84069765, + "num_input_tokens_seen": 256099000, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.20959473, + "step": 11871, + "time_per_iteration": 2.981794834136963 + }, + { + "auxiliary_loss_clip": 0.01414344, + "auxiliary_loss_mlp": 0.01037023, + "balance_loss_clip": 1.25026703, + "balance_loss_mlp": 1.01624489, + "epoch": 0.7137832556741319, + "flos": 19875352245120.0, + "grad_norm": 1.7133764718588806, + "language_loss": 0.79838604, + "learning_rate": 7.993978192685331e-07, + "loss": 0.82289976, + "num_input_tokens_seen": 256117985, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.20788574, + "step": 11872, + "time_per_iteration": 4.350321292877197 + }, + { + "auxiliary_loss_clip": 0.01424136, + "auxiliary_loss_mlp": 0.01035141, + "balance_loss_clip": 1.25863242, + "balance_loss_mlp": 1.01544738, + "epoch": 0.7138433789267999, + "flos": 21698813934720.0, + "grad_norm": 2.335229049983905, + "language_loss": 0.84478468, + "learning_rate": 7.990863606706606e-07, + "loss": 0.86937743, + "num_input_tokens_seen": 256134350, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.19689941, + "step": 11873, + "time_per_iteration": 2.8230140209198 + }, + { + "auxiliary_loss_clip": 0.01386963, + "auxiliary_loss_mlp": 0.01031457, + "balance_loss_clip": 1.2291398, + "balance_loss_mlp": 1.0120374, + "epoch": 0.713903502179468, + "flos": 17611602975360.0, + "grad_norm": 2.097731776972648, + "language_loss": 0.86895907, + "learning_rate": 7.987749476115539e-07, + "loss": 0.89314324, + "num_input_tokens_seen": 256150610, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.19421387, + "step": 11874, + "time_per_iteration": 2.844759702682495 + }, + { + "auxiliary_loss_clip": 0.01410857, + "auxiliary_loss_mlp": 0.01031252, + "balance_loss_clip": 1.24772763, + "balance_loss_mlp": 1.01066399, + "epoch": 0.7139636254321359, + "flos": 18049266357120.0, + "grad_norm": 1.6959782848896736, + "language_loss": 0.83386809, + "learning_rate": 7.984635801030228e-07, + "loss": 0.85828912, + "num_input_tokens_seen": 256168620, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.20593262, + "step": 11875, + "time_per_iteration": 2.807542562484741 + }, + { + "auxiliary_loss_clip": 0.01427178, + "auxiliary_loss_mlp": 0.01038426, + "balance_loss_clip": 1.25876153, + "balance_loss_mlp": 1.01751614, + "epoch": 0.7140237486848039, + "flos": 23341522072320.0, + "grad_norm": 1.947763786666601, + "language_loss": 0.7066797, + "learning_rate": 7.981522581568721e-07, + "loss": 0.73133576, + "num_input_tokens_seen": 256186700, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.20910645, + "step": 11876, + "time_per_iteration": 4.235002756118774 + }, + { + "auxiliary_loss_clip": 0.01407453, + "auxiliary_loss_mlp": 0.01032375, + "balance_loss_clip": 1.24519241, + "balance_loss_mlp": 1.01266956, + "epoch": 0.7140838719374718, + "flos": 16845895658880.0, + "grad_norm": 1.9910910894419696, + "language_loss": 0.79182315, + "learning_rate": 7.978409817849079e-07, + "loss": 0.81622148, + "num_input_tokens_seen": 256205390, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19714355, + "step": 11877, + "time_per_iteration": 4.257075071334839 + }, + { + "auxiliary_loss_clip": 0.01399459, + "auxiliary_loss_mlp": 0.01033589, + "balance_loss_clip": 1.24019647, + "balance_loss_mlp": 1.01448011, + "epoch": 0.7141439951901398, + "flos": 21151847819520.0, + "grad_norm": 1.8978381561413629, + "language_loss": 0.70500004, + "learning_rate": 7.97529750998934e-07, + "loss": 0.72933048, + "num_input_tokens_seen": 256224575, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19104004, + "step": 11878, + "time_per_iteration": 2.825953960418701 + }, + { + "auxiliary_loss_clip": 0.01392228, + "auxiliary_loss_mlp": 0.01032775, + "balance_loss_clip": 1.23268843, + "balance_loss_mlp": 1.01359463, + "epoch": 0.7142041184428077, + "flos": 24728496744960.0, + "grad_norm": 1.784277592129552, + "language_loss": 0.68398649, + "learning_rate": 7.972185658107535e-07, + "loss": 0.70823652, + "num_input_tokens_seen": 256242130, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.19177246, + "step": 11879, + "time_per_iteration": 2.860764503479004 + }, + { + "auxiliary_loss_clip": 0.01409106, + "auxiliary_loss_mlp": 0.01032801, + "balance_loss_clip": 1.24622464, + "balance_loss_mlp": 1.01199913, + "epoch": 0.7142642416954758, + "flos": 21918233808000.0, + "grad_norm": 1.539876365836101, + "language_loss": 0.70437062, + "learning_rate": 7.969074262321646e-07, + "loss": 0.72878969, + "num_input_tokens_seen": 256261920, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.20800781, + "step": 11880, + "time_per_iteration": 2.861961841583252 + }, + { + "auxiliary_loss_clip": 0.01415807, + "auxiliary_loss_mlp": 0.01032435, + "balance_loss_clip": 1.25077081, + "balance_loss_mlp": 1.01225281, + "epoch": 0.7143243649481437, + "flos": 20813307091200.0, + "grad_norm": 2.886495936349849, + "language_loss": 0.81842834, + "learning_rate": 7.965963322749674e-07, + "loss": 0.84291077, + "num_input_tokens_seen": 256277970, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.2019043, + "step": 11881, + "time_per_iteration": 2.8490889072418213 + }, + { + "auxiliary_loss_clip": 0.01407322, + "auxiliary_loss_mlp": 0.01035744, + "balance_loss_clip": 1.24687004, + "balance_loss_mlp": 1.01636076, + "epoch": 0.7143844882008117, + "flos": 27246395911680.0, + "grad_norm": 1.4940297895247827, + "language_loss": 0.6476984, + "learning_rate": 7.962852839509579e-07, + "loss": 0.67212903, + "num_input_tokens_seen": 256298205, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19384766, + "step": 11882, + "time_per_iteration": 2.920353889465332 + }, + { + "auxiliary_loss_clip": 0.01411356, + "auxiliary_loss_mlp": 0.01033851, + "balance_loss_clip": 1.24781466, + "balance_loss_mlp": 1.01426506, + "epoch": 0.7144446114534796, + "flos": 17938199076480.0, + "grad_norm": 1.79216006948654, + "language_loss": 0.69777381, + "learning_rate": 7.959742812719304e-07, + "loss": 0.7222259, + "num_input_tokens_seen": 256316685, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19592285, + "step": 11883, + "time_per_iteration": 2.87205171585083 + }, + { + "auxiliary_loss_clip": 0.01401513, + "auxiliary_loss_mlp": 0.01037488, + "balance_loss_clip": 1.24269962, + "balance_loss_mlp": 1.01576829, + "epoch": 0.7145047347061476, + "flos": 20750769498240.0, + "grad_norm": 2.537575158326298, + "language_loss": 0.78807414, + "learning_rate": 7.956633242496788e-07, + "loss": 0.81246418, + "num_input_tokens_seen": 256334205, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.21728516, + "step": 11884, + "time_per_iteration": 2.8900973796844482 + }, + { + "auxiliary_loss_clip": 0.01421604, + "auxiliary_loss_mlp": 0.01033979, + "balance_loss_clip": 1.25225091, + "balance_loss_mlp": 1.01353455, + "epoch": 0.7145648579588155, + "flos": 21188478124800.0, + "grad_norm": 2.252166128658085, + "language_loss": 0.74720728, + "learning_rate": 7.953524128959954e-07, + "loss": 0.77176309, + "num_input_tokens_seen": 256353340, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.20446777, + "step": 11885, + "time_per_iteration": 2.8393328189849854 + }, + { + "auxiliary_loss_clip": 0.01186402, + "auxiliary_loss_mlp": 0.0103642, + "balance_loss_clip": 1.09880733, + "balance_loss_mlp": 1.01429498, + "epoch": 0.7146249812114835, + "flos": 64816610843520.0, + "grad_norm": 0.903022298548721, + "language_loss": 0.66419291, + "learning_rate": 7.95041547222669e-07, + "loss": 0.68642116, + "num_input_tokens_seen": 256411550, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.22167969, + "step": 11886, + "time_per_iteration": 3.3299286365509033 + }, + { + "auxiliary_loss_clip": 0.01392891, + "auxiliary_loss_mlp": 0.01035765, + "balance_loss_clip": 1.23418307, + "balance_loss_mlp": 1.01614285, + "epoch": 0.7146851044641516, + "flos": 18122979415680.0, + "grad_norm": 1.634049749147538, + "language_loss": 0.75023335, + "learning_rate": 7.947307272414874e-07, + "loss": 0.77451992, + "num_input_tokens_seen": 256430360, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.19616699, + "step": 11887, + "time_per_iteration": 2.8364665508270264 + }, + { + "auxiliary_loss_clip": 0.01403136, + "auxiliary_loss_mlp": 0.01032176, + "balance_loss_clip": 1.241817, + "balance_loss_mlp": 1.01367474, + "epoch": 0.7147452277168195, + "flos": 19252953768960.0, + "grad_norm": 1.6784574291172119, + "language_loss": 0.72914195, + "learning_rate": 7.944199529642372e-07, + "loss": 0.7534951, + "num_input_tokens_seen": 256449750, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.18493652, + "step": 11888, + "time_per_iteration": 2.8615500926971436 + }, + { + "auxiliary_loss_clip": 0.01406495, + "auxiliary_loss_mlp": 0.01034769, + "balance_loss_clip": 1.24182856, + "balance_loss_mlp": 1.0141573, + "epoch": 0.7148053509694875, + "flos": 23774118036480.0, + "grad_norm": 4.224813764072091, + "language_loss": 0.84830153, + "learning_rate": 7.941092244027041e-07, + "loss": 0.87271416, + "num_input_tokens_seen": 256467330, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.20629883, + "step": 11889, + "time_per_iteration": 2.8795015811920166 + }, + { + "auxiliary_loss_clip": 0.0141782, + "auxiliary_loss_mlp": 0.01030579, + "balance_loss_clip": 1.25433016, + "balance_loss_mlp": 1.01080203, + "epoch": 0.7148654742221554, + "flos": 22493776654080.0, + "grad_norm": 1.766416341720488, + "language_loss": 0.76729727, + "learning_rate": 7.937985415686695e-07, + "loss": 0.79178119, + "num_input_tokens_seen": 256485705, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19750977, + "step": 11890, + "time_per_iteration": 2.873389482498169 + }, + { + "auxiliary_loss_clip": 0.01405066, + "auxiliary_loss_mlp": 0.01032658, + "balance_loss_clip": 1.24605179, + "balance_loss_mlp": 1.01394236, + "epoch": 0.7149255974748234, + "flos": 24689468465280.0, + "grad_norm": 1.720158408868567, + "language_loss": 0.74482131, + "learning_rate": 7.934879044739147e-07, + "loss": 0.7691986, + "num_input_tokens_seen": 256504755, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18725586, + "step": 11891, + "time_per_iteration": 2.87640380859375 + }, + { + "auxiliary_loss_clip": 0.01410914, + "auxiliary_loss_mlp": 0.01037126, + "balance_loss_clip": 1.24821889, + "balance_loss_mlp": 1.01746893, + "epoch": 0.7149857207274913, + "flos": 18414709758720.0, + "grad_norm": 2.439270402080044, + "language_loss": 0.68986821, + "learning_rate": 7.931773131302211e-07, + "loss": 0.71434855, + "num_input_tokens_seen": 256523670, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.1965332, + "step": 11892, + "time_per_iteration": 2.9135901927948 + }, + { + "auxiliary_loss_clip": 0.01419555, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.25399947, + "balance_loss_mlp": 1.01375151, + "epoch": 0.7150458439801594, + "flos": 24979434261120.0, + "grad_norm": 1.8175041671100156, + "language_loss": 0.73977792, + "learning_rate": 7.928667675493632e-07, + "loss": 0.76430833, + "num_input_tokens_seen": 256542225, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.19714355, + "step": 11893, + "time_per_iteration": 2.8651673793792725 + }, + { + "auxiliary_loss_clip": 0.01415811, + "auxiliary_loss_mlp": 0.01032036, + "balance_loss_clip": 1.24989748, + "balance_loss_mlp": 1.01198494, + "epoch": 0.7151059672328273, + "flos": 16699464927360.0, + "grad_norm": 2.1910436129154345, + "language_loss": 0.67871201, + "learning_rate": 7.925562677431185e-07, + "loss": 0.70319045, + "num_input_tokens_seen": 256560730, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.20043945, + "step": 11894, + "time_per_iteration": 2.8403236865997314 + }, + { + "auxiliary_loss_clip": 0.01414271, + "auxiliary_loss_mlp": 0.01034918, + "balance_loss_clip": 1.24994528, + "balance_loss_mlp": 1.01562977, + "epoch": 0.7151660904854953, + "flos": 27283297685760.0, + "grad_norm": 1.8265521324639244, + "language_loss": 0.78251493, + "learning_rate": 7.922458137232613e-07, + "loss": 0.80700684, + "num_input_tokens_seen": 256580505, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.19287109, + "step": 11895, + "time_per_iteration": 2.8903191089630127 + }, + { + "auxiliary_loss_clip": 0.01415059, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.25162816, + "balance_loss_mlp": 1.01212716, + "epoch": 0.7152262137381632, + "flos": 18341268168960.0, + "grad_norm": 1.846639313658224, + "language_loss": 0.70365179, + "learning_rate": 7.919354055015643e-07, + "loss": 0.72812414, + "num_input_tokens_seen": 256597330, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.20056152, + "step": 11896, + "time_per_iteration": 2.8300161361694336 + }, + { + "auxiliary_loss_clip": 0.01421131, + "auxiliary_loss_mlp": 0.0103478, + "balance_loss_clip": 1.25608957, + "balance_loss_mlp": 1.01451457, + "epoch": 0.7152863369908312, + "flos": 21809428767360.0, + "grad_norm": 1.6110674992239924, + "language_loss": 0.87358558, + "learning_rate": 7.91625043089798e-07, + "loss": 0.89814472, + "num_input_tokens_seen": 256616030, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.20251465, + "step": 11897, + "time_per_iteration": 2.8574132919311523 + }, + { + "auxiliary_loss_clip": 0.01404436, + "auxiliary_loss_mlp": 0.01036843, + "balance_loss_clip": 1.24529409, + "balance_loss_mlp": 1.01718569, + "epoch": 0.7153464602434991, + "flos": 22167180552960.0, + "grad_norm": 1.8534004569680465, + "language_loss": 0.7898466, + "learning_rate": 7.913147264997304e-07, + "loss": 0.81425941, + "num_input_tokens_seen": 256635570, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.19665527, + "step": 11898, + "time_per_iteration": 2.8330564498901367 + }, + { + "auxiliary_loss_clip": 0.01421493, + "auxiliary_loss_mlp": 0.01032171, + "balance_loss_clip": 1.25576329, + "balance_loss_mlp": 1.01209617, + "epoch": 0.7154065834961671, + "flos": 24726415484160.0, + "grad_norm": 1.8488722559881847, + "language_loss": 0.73680377, + "learning_rate": 7.910044557431302e-07, + "loss": 0.76134038, + "num_input_tokens_seen": 256655290, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.20068359, + "step": 11899, + "time_per_iteration": 2.900855541229248 + }, + { + "auxiliary_loss_clip": 0.01408006, + "auxiliary_loss_mlp": 0.0103781, + "balance_loss_clip": 1.24612391, + "balance_loss_mlp": 1.01661444, + "epoch": 0.7154667067488351, + "flos": 22611721144320.0, + "grad_norm": 2.9786356583343414, + "language_loss": 0.76535571, + "learning_rate": 7.906942308317614e-07, + "loss": 0.78981388, + "num_input_tokens_seen": 256671605, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.21203613, + "step": 11900, + "time_per_iteration": 4.311737775802612 + }, + { + "auxiliary_loss_clip": 0.01419782, + "auxiliary_loss_mlp": 0.01035938, + "balance_loss_clip": 1.2564503, + "balance_loss_mlp": 1.01630402, + "epoch": 0.7155268300015031, + "flos": 18780560363520.0, + "grad_norm": 2.427425861595831, + "language_loss": 0.8170265, + "learning_rate": 7.903840517773886e-07, + "loss": 0.84158373, + "num_input_tokens_seen": 256689680, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19628906, + "step": 11901, + "time_per_iteration": 2.837871789932251 + }, + { + "auxiliary_loss_clip": 0.01424194, + "auxiliary_loss_mlp": 0.01039631, + "balance_loss_clip": 1.25726807, + "balance_loss_mlp": 1.01886439, + "epoch": 0.7155869532541711, + "flos": 18305497514880.0, + "grad_norm": 1.9337695962026336, + "language_loss": 0.82645464, + "learning_rate": 7.900739185917744e-07, + "loss": 0.85109282, + "num_input_tokens_seen": 256707760, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.2076416, + "step": 11902, + "time_per_iteration": 2.9118950366973877 + }, + { + "auxiliary_loss_clip": 0.01410632, + "auxiliary_loss_mlp": 0.01030234, + "balance_loss_clip": 1.24833226, + "balance_loss_mlp": 1.01156545, + "epoch": 0.715647076506839, + "flos": 11987412272640.0, + "grad_norm": 1.6864729313550786, + "language_loss": 0.68447858, + "learning_rate": 7.897638312866785e-07, + "loss": 0.70888722, + "num_input_tokens_seen": 256724150, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18664551, + "step": 11903, + "time_per_iteration": 2.915066957473755 + }, + { + "auxiliary_loss_clip": 0.01407538, + "auxiliary_loss_mlp": 0.01036105, + "balance_loss_clip": 1.24663043, + "balance_loss_mlp": 1.01703119, + "epoch": 0.715707199759507, + "flos": 18960589998720.0, + "grad_norm": 1.840805857497778, + "language_loss": 0.76499647, + "learning_rate": 7.894537898738589e-07, + "loss": 0.78943288, + "num_input_tokens_seen": 256742780, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.1907959, + "step": 11904, + "time_per_iteration": 2.861319065093994 + }, + { + "auxiliary_loss_clip": 0.01414866, + "auxiliary_loss_mlp": 0.01035212, + "balance_loss_clip": 1.2528559, + "balance_loss_mlp": 1.01463652, + "epoch": 0.7157673230121749, + "flos": 15312761723520.0, + "grad_norm": 1.8237726050077288, + "language_loss": 0.72345072, + "learning_rate": 7.891437943650727e-07, + "loss": 0.74795151, + "num_input_tokens_seen": 256761355, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.20556641, + "step": 11905, + "time_per_iteration": 2.827944755554199 + }, + { + "auxiliary_loss_clip": 0.01409157, + "auxiliary_loss_mlp": 0.01034465, + "balance_loss_clip": 1.24757349, + "balance_loss_mlp": 1.01470041, + "epoch": 0.715827446264843, + "flos": 23232264583680.0, + "grad_norm": 2.7150081064598237, + "language_loss": 0.78781867, + "learning_rate": 7.88833844772076e-07, + "loss": 0.81225491, + "num_input_tokens_seen": 256781335, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19775391, + "step": 11906, + "time_per_iteration": 2.877495527267456 + }, + { + "auxiliary_loss_clip": 0.01191011, + "auxiliary_loss_mlp": 0.01034008, + "balance_loss_clip": 1.09997535, + "balance_loss_mlp": 1.01493478, + "epoch": 0.7158875695175109, + "flos": 61002715593600.0, + "grad_norm": 0.7330394394348556, + "language_loss": 0.55317008, + "learning_rate": 7.885239411066205e-07, + "loss": 0.57542032, + "num_input_tokens_seen": 256838890, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.19042969, + "step": 11907, + "time_per_iteration": 3.300475597381592 + }, + { + "auxiliary_loss_clip": 0.01411785, + "auxiliary_loss_mlp": 0.01031207, + "balance_loss_clip": 1.24897766, + "balance_loss_mlp": 1.01199019, + "epoch": 0.7159476927701789, + "flos": 17137626001920.0, + "grad_norm": 1.878257541844724, + "language_loss": 0.70335257, + "learning_rate": 7.882140833804593e-07, + "loss": 0.72778255, + "num_input_tokens_seen": 256858145, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19226074, + "step": 11908, + "time_per_iteration": 4.324196100234985 + }, + { + "auxiliary_loss_clip": 0.01417564, + "auxiliary_loss_mlp": 0.01036428, + "balance_loss_clip": 1.25380576, + "balance_loss_mlp": 1.01671135, + "epoch": 0.7160078160228468, + "flos": 22500427639680.0, + "grad_norm": 1.6449469300210346, + "language_loss": 0.72413468, + "learning_rate": 7.879042716053415e-07, + "loss": 0.74867463, + "num_input_tokens_seen": 256878545, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19714355, + "step": 11909, + "time_per_iteration": 2.967806816101074 + }, + { + "auxiliary_loss_clip": 0.0141392, + "auxiliary_loss_mlp": 0.01034345, + "balance_loss_clip": 1.24906731, + "balance_loss_mlp": 1.01428199, + "epoch": 0.7160679392755148, + "flos": 30602991536640.0, + "grad_norm": 1.529464293499408, + "language_loss": 0.75419581, + "learning_rate": 7.875945057930144e-07, + "loss": 0.77867842, + "num_input_tokens_seen": 256899920, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.20056152, + "step": 11910, + "time_per_iteration": 2.9416685104370117 + }, + { + "auxiliary_loss_clip": 0.01414893, + "auxiliary_loss_mlp": 0.01033397, + "balance_loss_clip": 1.25217104, + "balance_loss_mlp": 1.01439524, + "epoch": 0.7161280625281827, + "flos": 21333325288320.0, + "grad_norm": 1.4232623975088865, + "language_loss": 0.77048266, + "learning_rate": 7.872847859552251e-07, + "loss": 0.79496557, + "num_input_tokens_seen": 256918460, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.18981934, + "step": 11911, + "time_per_iteration": 4.314856290817261 + }, + { + "auxiliary_loss_clip": 0.01409735, + "auxiliary_loss_mlp": 0.01034828, + "balance_loss_clip": 1.24570322, + "balance_loss_mlp": 1.01281047, + "epoch": 0.7161881857808508, + "flos": 61880467593600.0, + "grad_norm": 2.7533493904714916, + "language_loss": 0.59954733, + "learning_rate": 7.869751121037192e-07, + "loss": 0.62399298, + "num_input_tokens_seen": 256942015, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.22021484, + "step": 11912, + "time_per_iteration": 4.713810443878174 + }, + { + "auxiliary_loss_clip": 0.01416724, + "auxiliary_loss_mlp": 0.0103475, + "balance_loss_clip": 1.25602674, + "balance_loss_mlp": 1.01508045, + "epoch": 0.7162483090335187, + "flos": 20821451155200.0, + "grad_norm": 1.608731673717409, + "language_loss": 0.7919693, + "learning_rate": 7.866654842502376e-07, + "loss": 0.81648397, + "num_input_tokens_seen": 256961065, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19665527, + "step": 11913, + "time_per_iteration": 2.8514459133148193 + }, + { + "auxiliary_loss_clip": 0.01401724, + "auxiliary_loss_mlp": 0.01031946, + "balance_loss_clip": 1.24287868, + "balance_loss_mlp": 1.01373076, + "epoch": 0.7163084322861867, + "flos": 24108179529600.0, + "grad_norm": 1.7280935967464837, + "language_loss": 0.75623542, + "learning_rate": 7.863559024065234e-07, + "loss": 0.78057218, + "num_input_tokens_seen": 256982165, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18212891, + "step": 11914, + "time_per_iteration": 2.8875882625579834 + }, + { + "auxiliary_loss_clip": 0.01404715, + "auxiliary_loss_mlp": 0.01038108, + "balance_loss_clip": 1.2457844, + "balance_loss_mlp": 1.0180335, + "epoch": 0.7163685555388547, + "flos": 20089659456000.0, + "grad_norm": 1.5871907883057572, + "language_loss": 0.74246073, + "learning_rate": 7.860463665843143e-07, + "loss": 0.76688892, + "num_input_tokens_seen": 256999825, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.20056152, + "step": 11915, + "time_per_iteration": 2.820852756500244 + }, + { + "auxiliary_loss_clip": 0.01404723, + "auxiliary_loss_mlp": 0.01033648, + "balance_loss_clip": 1.24206364, + "balance_loss_mlp": 1.01458657, + "epoch": 0.7164286787915226, + "flos": 17465850915840.0, + "grad_norm": 1.8230808300916983, + "language_loss": 0.81562912, + "learning_rate": 7.85736876795349e-07, + "loss": 0.84001285, + "num_input_tokens_seen": 257017450, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19067383, + "step": 11916, + "time_per_iteration": 2.804250955581665 + }, + { + "auxiliary_loss_clip": 0.0141386, + "auxiliary_loss_mlp": 0.01037763, + "balance_loss_clip": 1.25003958, + "balance_loss_mlp": 1.01880872, + "epoch": 0.7164888020441906, + "flos": 19728333331200.0, + "grad_norm": 1.7335578239662366, + "language_loss": 0.69299877, + "learning_rate": 7.854274330513626e-07, + "loss": 0.71751493, + "num_input_tokens_seen": 257035465, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.1895752, + "step": 11917, + "time_per_iteration": 2.829460859298706 + }, + { + "auxiliary_loss_clip": 0.01406175, + "auxiliary_loss_mlp": 0.01033005, + "balance_loss_clip": 1.24465692, + "balance_loss_mlp": 1.01382422, + "epoch": 0.7165489252968585, + "flos": 21480479936640.0, + "grad_norm": 1.5469212283260947, + "language_loss": 0.76277441, + "learning_rate": 7.851180353640896e-07, + "loss": 0.78716624, + "num_input_tokens_seen": 257053750, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19177246, + "step": 11918, + "time_per_iteration": 2.8464229106903076 + }, + { + "auxiliary_loss_clip": 0.01183686, + "auxiliary_loss_mlp": 0.01057226, + "balance_loss_clip": 1.09661198, + "balance_loss_mlp": 1.037485, + "epoch": 0.7166090485495266, + "flos": 69961214217600.0, + "grad_norm": 0.6497160109527855, + "language_loss": 0.54031402, + "learning_rate": 7.848086837452639e-07, + "loss": 0.56272316, + "num_input_tokens_seen": 257121215, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.19726562, + "step": 11919, + "time_per_iteration": 3.40771484375 + }, + { + "auxiliary_loss_clip": 0.01423608, + "auxiliary_loss_mlp": 0.01033138, + "balance_loss_clip": 1.25840592, + "balance_loss_mlp": 1.0142312, + "epoch": 0.7166691718021945, + "flos": 27355155707520.0, + "grad_norm": 1.555952148634013, + "language_loss": 0.69523454, + "learning_rate": 7.844993782066132e-07, + "loss": 0.71980196, + "num_input_tokens_seen": 257143370, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.18908691, + "step": 11920, + "time_per_iteration": 2.9170873165130615 + }, + { + "auxiliary_loss_clip": 0.01405825, + "auxiliary_loss_mlp": 0.01038316, + "balance_loss_clip": 1.2434814, + "balance_loss_mlp": 1.01726341, + "epoch": 0.7167292950548625, + "flos": 30420971130240.0, + "grad_norm": 3.1501183048146926, + "language_loss": 0.75372148, + "learning_rate": 7.841901187598678e-07, + "loss": 0.77816284, + "num_input_tokens_seen": 257162160, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.21044922, + "step": 11921, + "time_per_iteration": 2.9334871768951416 + }, + { + "auxiliary_loss_clip": 0.01424426, + "auxiliary_loss_mlp": 0.01036892, + "balance_loss_clip": 1.25668526, + "balance_loss_mlp": 1.01532757, + "epoch": 0.7167894183075304, + "flos": 14578526805120.0, + "grad_norm": 1.9252039936106935, + "language_loss": 0.77435267, + "learning_rate": 7.83880905416755e-07, + "loss": 0.79896587, + "num_input_tokens_seen": 257179300, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.2154541, + "step": 11922, + "time_per_iteration": 2.8455607891082764 + }, + { + "auxiliary_loss_clip": 0.01183262, + "auxiliary_loss_mlp": 0.01029378, + "balance_loss_clip": 1.09440303, + "balance_loss_mlp": 1.01049519, + "epoch": 0.7168495415601984, + "flos": 64138126798080.0, + "grad_norm": 0.7515881654348368, + "language_loss": 0.55144417, + "learning_rate": 7.83571738189001e-07, + "loss": 0.57357049, + "num_input_tokens_seen": 257235470, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.18847656, + "step": 11923, + "time_per_iteration": 3.110062837600708 + }, + { + "auxiliary_loss_clip": 0.0141849, + "auxiliary_loss_mlp": 0.01034743, + "balance_loss_clip": 1.2548188, + "balance_loss_mlp": 1.01520443, + "epoch": 0.7169096648128663, + "flos": 24692273642880.0, + "grad_norm": 1.520170233637924, + "language_loss": 0.77851999, + "learning_rate": 7.832626170883279e-07, + "loss": 0.80305231, + "num_input_tokens_seen": 257255850, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.1953125, + "step": 11924, + "time_per_iteration": 2.8854012489318848 + }, + { + "auxiliary_loss_clip": 0.01402354, + "auxiliary_loss_mlp": 0.01035257, + "balance_loss_clip": 1.24308622, + "balance_loss_mlp": 1.01532507, + "epoch": 0.7169697880655344, + "flos": 20677192174080.0, + "grad_norm": 1.7793255485975457, + "language_loss": 0.69067156, + "learning_rate": 7.829535421264588e-07, + "loss": 0.71504772, + "num_input_tokens_seen": 257275425, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19934082, + "step": 11925, + "time_per_iteration": 2.912184476852417 + }, + { + "auxiliary_loss_clip": 0.01391624, + "auxiliary_loss_mlp": 0.01031455, + "balance_loss_clip": 1.23378241, + "balance_loss_mlp": 1.01158273, + "epoch": 0.7170299113182023, + "flos": 21042545086080.0, + "grad_norm": 1.8286109576521405, + "language_loss": 0.78056443, + "learning_rate": 7.826445133151133e-07, + "loss": 0.80479521, + "num_input_tokens_seen": 257295740, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.1986084, + "step": 11926, + "time_per_iteration": 2.9376730918884277 + }, + { + "auxiliary_loss_clip": 0.01420328, + "auxiliary_loss_mlp": 0.01039548, + "balance_loss_clip": 1.25257766, + "balance_loss_mlp": 1.01977122, + "epoch": 0.7170900345708703, + "flos": 22903587221760.0, + "grad_norm": 2.239676771118708, + "language_loss": 0.78257018, + "learning_rate": 7.823355306660093e-07, + "loss": 0.80716896, + "num_input_tokens_seen": 257315970, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.19763184, + "step": 11927, + "time_per_iteration": 2.8626770973205566 + }, + { + "auxiliary_loss_clip": 0.0140415, + "auxiliary_loss_mlp": 0.01033555, + "balance_loss_clip": 1.2444514, + "balance_loss_mlp": 1.01387405, + "epoch": 0.7171501578235383, + "flos": 15525892569600.0, + "grad_norm": 1.770847482660701, + "language_loss": 0.70304894, + "learning_rate": 7.820265941908642e-07, + "loss": 0.72742593, + "num_input_tokens_seen": 257334230, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.19689941, + "step": 11928, + "time_per_iteration": 2.859679698944092 + }, + { + "auxiliary_loss_clip": 0.01397982, + "auxiliary_loss_mlp": 0.0103485, + "balance_loss_clip": 1.24009359, + "balance_loss_mlp": 1.01478684, + "epoch": 0.7172102810762062, + "flos": 26115471417600.0, + "grad_norm": 1.7849279086300927, + "language_loss": 0.65657985, + "learning_rate": 7.817177039013931e-07, + "loss": 0.68090808, + "num_input_tokens_seen": 257352145, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.20068359, + "step": 11929, + "time_per_iteration": 2.8804585933685303 + }, + { + "auxiliary_loss_clip": 0.01411887, + "auxiliary_loss_mlp": 0.01036341, + "balance_loss_clip": 1.24760294, + "balance_loss_mlp": 1.01593304, + "epoch": 0.7172704043288742, + "flos": 21516522059520.0, + "grad_norm": 2.4474650673239697, + "language_loss": 0.70722395, + "learning_rate": 7.81408859809308e-07, + "loss": 0.73170626, + "num_input_tokens_seen": 257371460, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20410156, + "step": 11930, + "time_per_iteration": 2.8679916858673096 + }, + { + "auxiliary_loss_clip": 0.01416221, + "auxiliary_loss_mlp": 0.01031752, + "balance_loss_clip": 1.25300384, + "balance_loss_mlp": 1.01204658, + "epoch": 0.7173305275815421, + "flos": 18780288894720.0, + "grad_norm": 1.8573097687497073, + "language_loss": 0.81279081, + "learning_rate": 7.811000619263219e-07, + "loss": 0.8372705, + "num_input_tokens_seen": 257390800, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19702148, + "step": 11931, + "time_per_iteration": 2.825329303741455 + }, + { + "auxiliary_loss_clip": 0.01402138, + "auxiliary_loss_mlp": 0.01034119, + "balance_loss_clip": 1.24139953, + "balance_loss_mlp": 1.01397276, + "epoch": 0.7173906508342102, + "flos": 16188405200640.0, + "grad_norm": 2.2343783371955346, + "language_loss": 0.79368877, + "learning_rate": 7.80791310264143e-07, + "loss": 0.81805134, + "num_input_tokens_seen": 257407495, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.20153809, + "step": 11932, + "time_per_iteration": 2.8248519897460938 + }, + { + "auxiliary_loss_clip": 0.01396911, + "auxiliary_loss_mlp": 0.0103465, + "balance_loss_clip": 1.23842323, + "balance_loss_mlp": 1.01498055, + "epoch": 0.7174507740868781, + "flos": 26624857086720.0, + "grad_norm": 1.4297053803581805, + "language_loss": 0.75664586, + "learning_rate": 7.804826048344803e-07, + "loss": 0.78096151, + "num_input_tokens_seen": 257429675, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.19665527, + "step": 11933, + "time_per_iteration": 2.9148833751678467 + }, + { + "auxiliary_loss_clip": 0.01422091, + "auxiliary_loss_mlp": 0.01037048, + "balance_loss_clip": 1.25367987, + "balance_loss_mlp": 1.01605487, + "epoch": 0.7175108973395461, + "flos": 18439666905600.0, + "grad_norm": 2.477630120927513, + "language_loss": 0.69965661, + "learning_rate": 7.801739456490388e-07, + "loss": 0.72424799, + "num_input_tokens_seen": 257442765, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.20983887, + "step": 11934, + "time_per_iteration": 2.780611038208008 + }, + { + "auxiliary_loss_clip": 0.01401477, + "auxiliary_loss_mlp": 0.01034893, + "balance_loss_clip": 1.24060798, + "balance_loss_mlp": 1.01596236, + "epoch": 0.717571020592214, + "flos": 23925525696000.0, + "grad_norm": 2.012680900594361, + "language_loss": 0.86727953, + "learning_rate": 7.798653327195237e-07, + "loss": 0.89164329, + "num_input_tokens_seen": 257459310, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18920898, + "step": 11935, + "time_per_iteration": 4.320261240005493 + }, + { + "auxiliary_loss_clip": 0.01407355, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.24599719, + "balance_loss_mlp": 1.0154258, + "epoch": 0.717631143844882, + "flos": 38272009328640.0, + "grad_norm": 1.484989665030357, + "language_loss": 0.74825472, + "learning_rate": 7.795567660576388e-07, + "loss": 0.77267349, + "num_input_tokens_seen": 257484750, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19104004, + "step": 11936, + "time_per_iteration": 3.012108087539673 + }, + { + "auxiliary_loss_clip": 0.01190263, + "auxiliary_loss_mlp": 0.01031247, + "balance_loss_clip": 1.09942698, + "balance_loss_mlp": 1.01064801, + "epoch": 0.7176912670975499, + "flos": 65548583521920.0, + "grad_norm": 0.7665729671020776, + "language_loss": 0.56004906, + "learning_rate": 7.79248245675082e-07, + "loss": 0.58226418, + "num_input_tokens_seen": 257543110, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.20605469, + "step": 11937, + "time_per_iteration": 3.3353137969970703 + }, + { + "auxiliary_loss_clip": 0.01416213, + "auxiliary_loss_mlp": 0.0103942, + "balance_loss_clip": 1.2520237, + "balance_loss_mlp": 1.01878428, + "epoch": 0.717751390350218, + "flos": 31292497330560.0, + "grad_norm": 2.0319996180933044, + "language_loss": 0.55127305, + "learning_rate": 7.789397715835542e-07, + "loss": 0.57582939, + "num_input_tokens_seen": 257567410, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.2064209, + "step": 11938, + "time_per_iteration": 2.9410409927368164 + }, + { + "auxiliary_loss_clip": 0.01387362, + "auxiliary_loss_mlp": 0.01032881, + "balance_loss_clip": 1.22966456, + "balance_loss_mlp": 1.01324713, + "epoch": 0.7178115136028859, + "flos": 19866981957120.0, + "grad_norm": 1.5977408577492318, + "language_loss": 0.7728405, + "learning_rate": 7.786313437947527e-07, + "loss": 0.79704297, + "num_input_tokens_seen": 257586270, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.19641113, + "step": 11939, + "time_per_iteration": 2.850473403930664 + }, + { + "auxiliary_loss_clip": 0.01188825, + "auxiliary_loss_mlp": 0.01032526, + "balance_loss_clip": 1.09815538, + "balance_loss_mlp": 1.01240385, + "epoch": 0.7178716368555539, + "flos": 64381119212160.0, + "grad_norm": 0.7648896488320684, + "language_loss": 0.61561894, + "learning_rate": 7.783229623203738e-07, + "loss": 0.63783246, + "num_input_tokens_seen": 257647415, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.20117188, + "step": 11940, + "time_per_iteration": 3.2522809505462646 + }, + { + "auxiliary_loss_clip": 0.01402972, + "auxiliary_loss_mlp": 0.01035821, + "balance_loss_clip": 1.24359143, + "balance_loss_mlp": 1.01602077, + "epoch": 0.7179317601082219, + "flos": 26774364464640.0, + "grad_norm": 1.4178391115991138, + "language_loss": 0.59386587, + "learning_rate": 7.780146271721097e-07, + "loss": 0.61825383, + "num_input_tokens_seen": 257669795, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19812012, + "step": 11941, + "time_per_iteration": 2.916381359100342 + }, + { + "auxiliary_loss_clip": 0.01405698, + "auxiliary_loss_mlp": 0.01037646, + "balance_loss_clip": 1.24585986, + "balance_loss_mlp": 1.01814294, + "epoch": 0.7179918833608898, + "flos": 23524311640320.0, + "grad_norm": 3.619366730279992, + "language_loss": 0.80015951, + "learning_rate": 7.777063383616543e-07, + "loss": 0.82459295, + "num_input_tokens_seen": 257687415, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.19494629, + "step": 11942, + "time_per_iteration": 4.421423673629761 + }, + { + "auxiliary_loss_clip": 0.01403905, + "auxiliary_loss_mlp": 0.01038017, + "balance_loss_clip": 1.24336493, + "balance_loss_mlp": 1.01791883, + "epoch": 0.7180520066135578, + "flos": 17174527776000.0, + "grad_norm": 2.1376282864449916, + "language_loss": 0.66494781, + "learning_rate": 7.773980959006968e-07, + "loss": 0.689367, + "num_input_tokens_seen": 257706215, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.20117188, + "step": 11943, + "time_per_iteration": 2.8056344985961914 + }, + { + "auxiliary_loss_clip": 0.01413432, + "auxiliary_loss_mlp": 0.01035356, + "balance_loss_clip": 1.25107765, + "balance_loss_mlp": 1.01644897, + "epoch": 0.7181121298662257, + "flos": 17575606097280.0, + "grad_norm": 1.8009987264788514, + "language_loss": 0.79411644, + "learning_rate": 7.770898998009254e-07, + "loss": 0.81860435, + "num_input_tokens_seen": 257724740, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18908691, + "step": 11944, + "time_per_iteration": 2.856820821762085 + }, + { + "auxiliary_loss_clip": 0.01414695, + "auxiliary_loss_mlp": 0.01036497, + "balance_loss_clip": 1.2493329, + "balance_loss_mlp": 1.01530194, + "epoch": 0.7181722531188938, + "flos": 11955125468160.0, + "grad_norm": 4.4741333562509915, + "language_loss": 0.64307666, + "learning_rate": 7.767817500740277e-07, + "loss": 0.66758859, + "num_input_tokens_seen": 257742060, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.21203613, + "step": 11945, + "time_per_iteration": 2.9800782203674316 + }, + { + "auxiliary_loss_clip": 0.01188921, + "auxiliary_loss_mlp": 0.01026288, + "balance_loss_clip": 1.09900093, + "balance_loss_mlp": 1.00950325, + "epoch": 0.7182323763715617, + "flos": 65533155045120.0, + "grad_norm": 0.6985574208126251, + "language_loss": 0.51131666, + "learning_rate": 7.76473646731689e-07, + "loss": 0.53346878, + "num_input_tokens_seen": 257802250, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.16796875, + "step": 11946, + "time_per_iteration": 4.611027956008911 + }, + { + "auxiliary_loss_clip": 0.01414503, + "auxiliary_loss_mlp": 0.01036455, + "balance_loss_clip": 1.25044847, + "balance_loss_mlp": 1.01606989, + "epoch": 0.7182924996242297, + "flos": 20640561868800.0, + "grad_norm": 1.8058395600311008, + "language_loss": 0.75577855, + "learning_rate": 7.761655897855925e-07, + "loss": 0.7802881, + "num_input_tokens_seen": 257821155, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.20373535, + "step": 11947, + "time_per_iteration": 4.2395124435424805 + }, + { + "auxiliary_loss_clip": 0.01396334, + "auxiliary_loss_mlp": 0.01030308, + "balance_loss_clip": 1.23650312, + "balance_loss_mlp": 1.0101254, + "epoch": 0.7183526228768976, + "flos": 16224990261120.0, + "grad_norm": 1.70415399700799, + "language_loss": 0.73311377, + "learning_rate": 7.758575792474187e-07, + "loss": 0.75738019, + "num_input_tokens_seen": 257839905, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.20178223, + "step": 11948, + "time_per_iteration": 2.8384287357330322 + }, + { + "auxiliary_loss_clip": 0.01414435, + "auxiliary_loss_mlp": 0.01035688, + "balance_loss_clip": 1.25273705, + "balance_loss_mlp": 1.01598322, + "epoch": 0.7184127461295656, + "flos": 22241753262720.0, + "grad_norm": 1.7335591290150743, + "language_loss": 0.71859181, + "learning_rate": 7.755496151288483e-07, + "loss": 0.74309301, + "num_input_tokens_seen": 257860055, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19714355, + "step": 11949, + "time_per_iteration": 2.8595261573791504 + }, + { + "auxiliary_loss_clip": 0.01403923, + "auxiliary_loss_mlp": 0.0103688, + "balance_loss_clip": 1.24301326, + "balance_loss_mlp": 1.01717448, + "epoch": 0.7184728693822335, + "flos": 27355924869120.0, + "grad_norm": 1.9562058886406282, + "language_loss": 0.77017343, + "learning_rate": 7.752416974415598e-07, + "loss": 0.79458141, + "num_input_tokens_seen": 257879315, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19714355, + "step": 11950, + "time_per_iteration": 2.910169839859009 + }, + { + "auxiliary_loss_clip": 0.01413935, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.25131297, + "balance_loss_mlp": 1.01525354, + "epoch": 0.7185329926349016, + "flos": 16517218296960.0, + "grad_norm": 4.185579789541104, + "language_loss": 0.68369591, + "learning_rate": 7.749338261972282e-07, + "loss": 0.70818907, + "num_input_tokens_seen": 257896570, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.20117188, + "step": 11951, + "time_per_iteration": 2.971205234527588 + }, + { + "auxiliary_loss_clip": 0.01403403, + "auxiliary_loss_mlp": 0.01036664, + "balance_loss_clip": 1.24063289, + "balance_loss_mlp": 1.0164938, + "epoch": 0.7185931158875695, + "flos": 23961522574080.0, + "grad_norm": 1.7824014643524613, + "language_loss": 0.79126966, + "learning_rate": 7.746260014075286e-07, + "loss": 0.81567037, + "num_input_tokens_seen": 257916855, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.20178223, + "step": 11952, + "time_per_iteration": 2.8984999656677246 + }, + { + "auxiliary_loss_clip": 0.01424631, + "auxiliary_loss_mlp": 0.01036171, + "balance_loss_clip": 1.25963545, + "balance_loss_mlp": 1.01651323, + "epoch": 0.7186532391402375, + "flos": 26553270533760.0, + "grad_norm": 1.7233292350133835, + "language_loss": 0.7569766, + "learning_rate": 7.743182230841352e-07, + "loss": 0.78158462, + "num_input_tokens_seen": 257937140, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.1965332, + "step": 11953, + "time_per_iteration": 2.9015767574310303 + }, + { + "auxiliary_loss_clip": 0.01409007, + "auxiliary_loss_mlp": 0.01036586, + "balance_loss_clip": 1.24630761, + "balance_loss_mlp": 1.01683259, + "epoch": 0.7187133623929055, + "flos": 22393341901440.0, + "grad_norm": 1.6223293660386822, + "language_loss": 0.73666042, + "learning_rate": 7.740104912387164e-07, + "loss": 0.76111633, + "num_input_tokens_seen": 257956785, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.1973877, + "step": 11954, + "time_per_iteration": 2.819594144821167 + }, + { + "auxiliary_loss_clip": 0.01418447, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.25658059, + "balance_loss_mlp": 1.01381707, + "epoch": 0.7187734856455734, + "flos": 15788277020160.0, + "grad_norm": 1.6843273353271118, + "language_loss": 0.75254118, + "learning_rate": 7.737028058829425e-07, + "loss": 0.77706254, + "num_input_tokens_seen": 257975455, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19885254, + "step": 11955, + "time_per_iteration": 2.833716869354248 + }, + { + "auxiliary_loss_clip": 0.01416304, + "auxiliary_loss_mlp": 0.01033423, + "balance_loss_clip": 1.25380397, + "balance_loss_mlp": 1.01397979, + "epoch": 0.7188336088982414, + "flos": 31772265638400.0, + "grad_norm": 1.7658769922709923, + "language_loss": 0.74556792, + "learning_rate": 7.733951670284817e-07, + "loss": 0.77006519, + "num_input_tokens_seen": 257996850, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19433594, + "step": 11956, + "time_per_iteration": 2.8969931602478027 + }, + { + "auxiliary_loss_clip": 0.01415626, + "auxiliary_loss_mlp": 0.01036029, + "balance_loss_clip": 1.2512387, + "balance_loss_mlp": 1.0160377, + "epoch": 0.7188937321509093, + "flos": 21473783706240.0, + "grad_norm": 1.6232005260749167, + "language_loss": 0.71731782, + "learning_rate": 7.730875746869987e-07, + "loss": 0.74183434, + "num_input_tokens_seen": 258016145, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19970703, + "step": 11957, + "time_per_iteration": 2.884120225906372 + }, + { + "auxiliary_loss_clip": 0.01421283, + "auxiliary_loss_mlp": 0.01039016, + "balance_loss_clip": 1.25543368, + "balance_loss_mlp": 1.01798761, + "epoch": 0.7189538554035774, + "flos": 27282890482560.0, + "grad_norm": 1.8326406325553835, + "language_loss": 0.73848474, + "learning_rate": 7.727800288701582e-07, + "loss": 0.76308769, + "num_input_tokens_seen": 258035420, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.21032715, + "step": 11958, + "time_per_iteration": 2.864164352416992 + }, + { + "auxiliary_loss_clip": 0.01398971, + "auxiliary_loss_mlp": 0.01034562, + "balance_loss_clip": 1.23928618, + "balance_loss_mlp": 1.01438034, + "epoch": 0.7190139786562453, + "flos": 21590778055680.0, + "grad_norm": 3.359923473503292, + "language_loss": 0.8439188, + "learning_rate": 7.724725295896215e-07, + "loss": 0.86825413, + "num_input_tokens_seen": 258053520, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.2019043, + "step": 11959, + "time_per_iteration": 2.843977689743042 + }, + { + "auxiliary_loss_clip": 0.01411469, + "auxiliary_loss_mlp": 0.0103419, + "balance_loss_clip": 1.24691939, + "balance_loss_mlp": 1.01406765, + "epoch": 0.7190741019089133, + "flos": 26731716600960.0, + "grad_norm": 1.5488925325362566, + "language_loss": 0.82252312, + "learning_rate": 7.7216507685705e-07, + "loss": 0.84697974, + "num_input_tokens_seen": 258073020, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.20117188, + "step": 11960, + "time_per_iteration": 2.8928382396698 + }, + { + "auxiliary_loss_clip": 0.01399797, + "auxiliary_loss_mlp": 0.01038603, + "balance_loss_clip": 1.23925972, + "balance_loss_mlp": 1.01867175, + "epoch": 0.7191342251615812, + "flos": 26116059600000.0, + "grad_norm": 3.339234815421593, + "language_loss": 0.78627932, + "learning_rate": 7.718576706841013e-07, + "loss": 0.81066334, + "num_input_tokens_seen": 258093155, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.19934082, + "step": 11961, + "time_per_iteration": 2.8886313438415527 + }, + { + "auxiliary_loss_clip": 0.01397397, + "auxiliary_loss_mlp": 0.0103562, + "balance_loss_clip": 1.24015141, + "balance_loss_mlp": 1.01674974, + "epoch": 0.7191943484142492, + "flos": 22977526504320.0, + "grad_norm": 1.4693010261022863, + "language_loss": 0.75775218, + "learning_rate": 7.715503110824326e-07, + "loss": 0.78208232, + "num_input_tokens_seen": 258113905, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.1887207, + "step": 11962, + "time_per_iteration": 2.862276315689087 + }, + { + "auxiliary_loss_clip": 0.01405204, + "auxiliary_loss_mlp": 0.01031777, + "balance_loss_clip": 1.24363804, + "balance_loss_mlp": 1.01178527, + "epoch": 0.7192544716669171, + "flos": 22575633776640.0, + "grad_norm": 1.7206152717062972, + "language_loss": 0.75658238, + "learning_rate": 7.712429980637001e-07, + "loss": 0.78095222, + "num_input_tokens_seen": 258132820, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.20007324, + "step": 11963, + "time_per_iteration": 2.8644957542419434 + }, + { + "auxiliary_loss_clip": 0.01428831, + "auxiliary_loss_mlp": 0.01036684, + "balance_loss_clip": 1.26173472, + "balance_loss_mlp": 1.01531005, + "epoch": 0.7193145949195852, + "flos": 18989619177600.0, + "grad_norm": 5.676084484264525, + "language_loss": 0.81498903, + "learning_rate": 7.709357316395564e-07, + "loss": 0.83964419, + "num_input_tokens_seen": 258148055, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.21374512, + "step": 11964, + "time_per_iteration": 2.83955717086792 + }, + { + "auxiliary_loss_clip": 0.01396375, + "auxiliary_loss_mlp": 0.01035955, + "balance_loss_clip": 1.23724771, + "balance_loss_mlp": 1.01537979, + "epoch": 0.7193747181722531, + "flos": 18013269479040.0, + "grad_norm": 3.0495531227293853, + "language_loss": 0.7539072, + "learning_rate": 7.70628511821652e-07, + "loss": 0.77823049, + "num_input_tokens_seen": 258165995, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.20568848, + "step": 11965, + "time_per_iteration": 2.8270442485809326 + }, + { + "auxiliary_loss_clip": 0.01421231, + "auxiliary_loss_mlp": 0.01033885, + "balance_loss_clip": 1.25594354, + "balance_loss_mlp": 1.01271331, + "epoch": 0.7194348414249211, + "flos": 24400136096640.0, + "grad_norm": 1.548098992307941, + "language_loss": 0.77869529, + "learning_rate": 7.703213386216377e-07, + "loss": 0.80324638, + "num_input_tokens_seen": 258186165, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.21166992, + "step": 11966, + "time_per_iteration": 2.9676949977874756 + }, + { + "auxiliary_loss_clip": 0.01409455, + "auxiliary_loss_mlp": 0.01031296, + "balance_loss_clip": 1.24638867, + "balance_loss_mlp": 1.01076853, + "epoch": 0.7194949646775891, + "flos": 22173514824960.0, + "grad_norm": 1.7279475819709809, + "language_loss": 0.73880231, + "learning_rate": 7.700142120511619e-07, + "loss": 0.76320988, + "num_input_tokens_seen": 258204595, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.20532227, + "step": 11967, + "time_per_iteration": 2.829530954360962 + }, + { + "auxiliary_loss_clip": 0.01391086, + "auxiliary_loss_mlp": 0.01033307, + "balance_loss_clip": 1.23617303, + "balance_loss_mlp": 1.01438808, + "epoch": 0.719555087930257, + "flos": 20275978118400.0, + "grad_norm": 1.9138378673063465, + "language_loss": 0.8255083, + "learning_rate": 7.6970713212187e-07, + "loss": 0.84975225, + "num_input_tokens_seen": 258223110, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.18896484, + "step": 11968, + "time_per_iteration": 2.812560558319092 + }, + { + "auxiliary_loss_clip": 0.01401217, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.2404201, + "balance_loss_mlp": 1.01501048, + "epoch": 0.719615211182925, + "flos": 24726732197760.0, + "grad_norm": 1.9204034875874365, + "language_loss": 0.77093595, + "learning_rate": 7.69400098845407e-07, + "loss": 0.79529756, + "num_input_tokens_seen": 258242660, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19921875, + "step": 11969, + "time_per_iteration": 2.9130172729492188 + }, + { + "auxiliary_loss_clip": 0.01406549, + "auxiliary_loss_mlp": 0.01031339, + "balance_loss_clip": 1.2435987, + "balance_loss_mlp": 1.01084745, + "epoch": 0.719675334435593, + "flos": 20018842064640.0, + "grad_norm": 1.4923261589684718, + "language_loss": 0.71483958, + "learning_rate": 7.69093112233417e-07, + "loss": 0.73921841, + "num_input_tokens_seen": 258261850, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.20495605, + "step": 11970, + "time_per_iteration": 4.249422550201416 + }, + { + "auxiliary_loss_clip": 0.01187313, + "auxiliary_loss_mlp": 0.01025098, + "balance_loss_clip": 1.09744716, + "balance_loss_mlp": 1.00373578, + "epoch": 0.719735457688261, + "flos": 44224850131200.0, + "grad_norm": 1.0233104450737351, + "language_loss": 0.60879397, + "learning_rate": 7.68786172297538e-07, + "loss": 0.63091803, + "num_input_tokens_seen": 258312570, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.21386719, + "step": 11971, + "time_per_iteration": 3.2314682006835938 + }, + { + "auxiliary_loss_clip": 0.01418368, + "auxiliary_loss_mlp": 0.01034936, + "balance_loss_clip": 1.25016236, + "balance_loss_mlp": 1.01393151, + "epoch": 0.7197955809409289, + "flos": 16812296755200.0, + "grad_norm": 4.465347360436265, + "language_loss": 0.80638188, + "learning_rate": 7.684792790494105e-07, + "loss": 0.83091497, + "num_input_tokens_seen": 258331600, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.21008301, + "step": 11972, + "time_per_iteration": 2.883413553237915 + }, + { + "auxiliary_loss_clip": 0.01417634, + "auxiliary_loss_mlp": 0.01038683, + "balance_loss_clip": 1.25393271, + "balance_loss_mlp": 1.01802373, + "epoch": 0.7198557041935969, + "flos": 24546159624960.0, + "grad_norm": 1.5912028382832362, + "language_loss": 0.76938438, + "learning_rate": 7.681724325006733e-07, + "loss": 0.79394758, + "num_input_tokens_seen": 258351785, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.20654297, + "step": 11973, + "time_per_iteration": 2.874927520751953 + }, + { + "auxiliary_loss_clip": 0.0119325, + "auxiliary_loss_mlp": 0.01028102, + "balance_loss_clip": 1.10216999, + "balance_loss_mlp": 1.00311589, + "epoch": 0.7199158274462648, + "flos": 70739029157760.0, + "grad_norm": 0.8654038751513043, + "language_loss": 0.57360011, + "learning_rate": 7.6786563266296e-07, + "loss": 0.59581363, + "num_input_tokens_seen": 258404035, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.25, + "step": 11974, + "time_per_iteration": 3.129164457321167 + }, + { + "auxiliary_loss_clip": 0.01407582, + "auxiliary_loss_mlp": 0.01035491, + "balance_loss_clip": 1.24397469, + "balance_loss_mlp": 1.01486754, + "epoch": 0.7199759506989328, + "flos": 29359099480320.0, + "grad_norm": 3.7020224967308764, + "language_loss": 0.62099618, + "learning_rate": 7.675588795479062e-07, + "loss": 0.64542693, + "num_input_tokens_seen": 258424850, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.20617676, + "step": 11975, + "time_per_iteration": 2.9228219985961914 + }, + { + "auxiliary_loss_clip": 0.01394496, + "auxiliary_loss_mlp": 0.01033691, + "balance_loss_clip": 1.23315108, + "balance_loss_mlp": 1.01416409, + "epoch": 0.7200360739516007, + "flos": 24650123472000.0, + "grad_norm": 2.2323457217044087, + "language_loss": 0.68472576, + "learning_rate": 7.672521731671425e-07, + "loss": 0.70900762, + "num_input_tokens_seen": 258445485, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19519043, + "step": 11976, + "time_per_iteration": 2.9086384773254395 + }, + { + "auxiliary_loss_clip": 0.0142428, + "auxiliary_loss_mlp": 0.01043575, + "balance_loss_clip": 1.26061988, + "balance_loss_mlp": 1.02398932, + "epoch": 0.7200961972042688, + "flos": 20822401296000.0, + "grad_norm": 1.7199583212527683, + "language_loss": 0.67593133, + "learning_rate": 7.669455135323004e-07, + "loss": 0.70060992, + "num_input_tokens_seen": 258464505, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19592285, + "step": 11977, + "time_per_iteration": 2.898306369781494 + }, + { + "auxiliary_loss_clip": 0.01421175, + "auxiliary_loss_mlp": 0.01039523, + "balance_loss_clip": 1.2557354, + "balance_loss_mlp": 1.01818407, + "epoch": 0.7201563204569367, + "flos": 31257676817280.0, + "grad_norm": 1.5296240665929364, + "language_loss": 0.76113594, + "learning_rate": 7.666389006550074e-07, + "loss": 0.78574288, + "num_input_tokens_seen": 258487190, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.21337891, + "step": 11978, + "time_per_iteration": 4.454826831817627 + }, + { + "auxiliary_loss_clip": 0.01405635, + "auxiliary_loss_mlp": 0.01035, + "balance_loss_clip": 1.24525166, + "balance_loss_mlp": 1.01462758, + "epoch": 0.7202164437096047, + "flos": 26662663756800.0, + "grad_norm": 1.9988329201155917, + "language_loss": 0.79179418, + "learning_rate": 7.663323345468908e-07, + "loss": 0.81620055, + "num_input_tokens_seen": 258503790, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.20373535, + "step": 11979, + "time_per_iteration": 2.8819196224212646 + }, + { + "auxiliary_loss_clip": 0.01404029, + "auxiliary_loss_mlp": 0.0103683, + "balance_loss_clip": 1.2433418, + "balance_loss_mlp": 1.01633775, + "epoch": 0.7202765669622727, + "flos": 25971076702080.0, + "grad_norm": 2.1941896831564662, + "language_loss": 0.65738094, + "learning_rate": 7.660258152195767e-07, + "loss": 0.68178952, + "num_input_tokens_seen": 258527335, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.20495605, + "step": 11980, + "time_per_iteration": 2.946620464324951 + }, + { + "auxiliary_loss_clip": 0.01411356, + "auxiliary_loss_mlp": 0.0104237, + "balance_loss_clip": 1.24884248, + "balance_loss_mlp": 1.02178264, + "epoch": 0.7203366902149406, + "flos": 28524158340480.0, + "grad_norm": 1.875498176911067, + "language_loss": 0.67897373, + "learning_rate": 7.657193426846871e-07, + "loss": 0.703511, + "num_input_tokens_seen": 258546690, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.20581055, + "step": 11981, + "time_per_iteration": 4.352744817733765 + }, + { + "auxiliary_loss_clip": 0.01410584, + "auxiliary_loss_mlp": 0.01038824, + "balance_loss_clip": 1.24653459, + "balance_loss_mlp": 1.01855862, + "epoch": 0.7203968134676086, + "flos": 21115986675840.0, + "grad_norm": 1.85972514342076, + "language_loss": 0.74541646, + "learning_rate": 7.65412916953843e-07, + "loss": 0.76991051, + "num_input_tokens_seen": 258566340, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.20251465, + "step": 11982, + "time_per_iteration": 4.274494886398315 + }, + { + "auxiliary_loss_clip": 0.01407002, + "auxiliary_loss_mlp": 0.01042566, + "balance_loss_clip": 1.24355578, + "balance_loss_mlp": 1.02295637, + "epoch": 0.7204569367202766, + "flos": 18341132434560.0, + "grad_norm": 2.132663860023864, + "language_loss": 0.6653893, + "learning_rate": 7.65106538038665e-07, + "loss": 0.68988496, + "num_input_tokens_seen": 258584455, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.19616699, + "step": 11983, + "time_per_iteration": 2.811666250228882 + }, + { + "auxiliary_loss_clip": 0.01400626, + "auxiliary_loss_mlp": 0.01042962, + "balance_loss_clip": 1.23788953, + "balance_loss_mlp": 1.02120638, + "epoch": 0.7205170599729446, + "flos": 23264687122560.0, + "grad_norm": 1.5090609352616173, + "language_loss": 0.67136019, + "learning_rate": 7.648002059507715e-07, + "loss": 0.69579607, + "num_input_tokens_seen": 258604725, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.21765137, + "step": 11984, + "time_per_iteration": 2.849229574203491 + }, + { + "auxiliary_loss_clip": 0.01426418, + "auxiliary_loss_mlp": 0.01043063, + "balance_loss_clip": 1.26206231, + "balance_loss_mlp": 1.02197552, + "epoch": 0.7205771832256125, + "flos": 20130588017280.0, + "grad_norm": 1.5254404313229588, + "language_loss": 0.74783057, + "learning_rate": 7.644939207017771e-07, + "loss": 0.77252537, + "num_input_tokens_seen": 258622885, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.21105957, + "step": 11985, + "time_per_iteration": 2.904045581817627 + }, + { + "auxiliary_loss_clip": 0.01414378, + "auxiliary_loss_mlp": 0.01039601, + "balance_loss_clip": 1.253443, + "balance_loss_mlp": 1.02022958, + "epoch": 0.7206373064782805, + "flos": 27713178961920.0, + "grad_norm": 1.7009508826955413, + "language_loss": 0.6375941, + "learning_rate": 7.641876823032977e-07, + "loss": 0.66213393, + "num_input_tokens_seen": 258644305, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19372559, + "step": 11986, + "time_per_iteration": 2.8900091648101807 + }, + { + "auxiliary_loss_clip": 0.01411913, + "auxiliary_loss_mlp": 0.01038863, + "balance_loss_clip": 1.24756312, + "balance_loss_mlp": 1.01801324, + "epoch": 0.7206974297309484, + "flos": 17977272600960.0, + "grad_norm": 1.5502530157132937, + "language_loss": 0.72902298, + "learning_rate": 7.638814907669455e-07, + "loss": 0.75353074, + "num_input_tokens_seen": 258661775, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.20837402, + "step": 11987, + "time_per_iteration": 2.978158950805664 + }, + { + "auxiliary_loss_clip": 0.01411927, + "auxiliary_loss_mlp": 0.01036883, + "balance_loss_clip": 1.24633837, + "balance_loss_mlp": 1.01764274, + "epoch": 0.7207575529836164, + "flos": 16992054921600.0, + "grad_norm": 1.7315809203291026, + "language_loss": 0.79001915, + "learning_rate": 7.635753461043301e-07, + "loss": 0.81450725, + "num_input_tokens_seen": 258679830, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.19238281, + "step": 11988, + "time_per_iteration": 2.830598831176758 + }, + { + "auxiliary_loss_clip": 0.01412157, + "auxiliary_loss_mlp": 0.01037223, + "balance_loss_clip": 1.24923849, + "balance_loss_mlp": 1.01557446, + "epoch": 0.7208176762362843, + "flos": 18735152567040.0, + "grad_norm": 1.629029539285493, + "language_loss": 0.78936625, + "learning_rate": 7.632692483270618e-07, + "loss": 0.81385994, + "num_input_tokens_seen": 258697415, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.21643066, + "step": 11989, + "time_per_iteration": 2.854494333267212 + }, + { + "auxiliary_loss_clip": 0.01391031, + "auxiliary_loss_mlp": 0.01035395, + "balance_loss_clip": 1.23122692, + "balance_loss_mlp": 1.01502216, + "epoch": 0.7208777994889524, + "flos": 18743839568640.0, + "grad_norm": 1.8314500504593056, + "language_loss": 0.83624172, + "learning_rate": 7.629631974467481e-07, + "loss": 0.86050606, + "num_input_tokens_seen": 258716755, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.20373535, + "step": 11990, + "time_per_iteration": 2.8515193462371826 + }, + { + "auxiliary_loss_clip": 0.0140189, + "auxiliary_loss_mlp": 0.01036898, + "balance_loss_clip": 1.23954356, + "balance_loss_mlp": 1.01718116, + "epoch": 0.7209379227416203, + "flos": 14801611507200.0, + "grad_norm": 2.9078216756949993, + "language_loss": 0.76712954, + "learning_rate": 7.626571934749931e-07, + "loss": 0.79151744, + "num_input_tokens_seen": 258733270, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19726562, + "step": 11991, + "time_per_iteration": 2.871149778366089 + }, + { + "auxiliary_loss_clip": 0.01387204, + "auxiliary_loss_mlp": 0.01034627, + "balance_loss_clip": 1.22974777, + "balance_loss_mlp": 1.01496959, + "epoch": 0.7209980459942883, + "flos": 29647255484160.0, + "grad_norm": 1.4490511274849567, + "language_loss": 0.73038077, + "learning_rate": 7.623512364234022e-07, + "loss": 0.75459909, + "num_input_tokens_seen": 258755270, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.1965332, + "step": 11992, + "time_per_iteration": 2.9448790550231934 + }, + { + "auxiliary_loss_clip": 0.01409751, + "auxiliary_loss_mlp": 0.01031535, + "balance_loss_clip": 1.24419761, + "balance_loss_mlp": 1.01196074, + "epoch": 0.7210581692469563, + "flos": 23487636090240.0, + "grad_norm": 3.115065893758335, + "language_loss": 0.67493081, + "learning_rate": 7.620453263035755e-07, + "loss": 0.69934368, + "num_input_tokens_seen": 258775340, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19580078, + "step": 11993, + "time_per_iteration": 2.917584180831909 + }, + { + "auxiliary_loss_clip": 0.01413524, + "auxiliary_loss_mlp": 0.01032776, + "balance_loss_clip": 1.25084138, + "balance_loss_mlp": 1.01227176, + "epoch": 0.7211182924996242, + "flos": 26109861062400.0, + "grad_norm": 1.7395404404584849, + "language_loss": 0.67173314, + "learning_rate": 7.61739463127115e-07, + "loss": 0.69619608, + "num_input_tokens_seen": 258794580, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.20495605, + "step": 11994, + "time_per_iteration": 2.93839955329895 + }, + { + "auxiliary_loss_clip": 0.01414187, + "auxiliary_loss_mlp": 0.01033569, + "balance_loss_clip": 1.25010037, + "balance_loss_mlp": 1.01226604, + "epoch": 0.7211784157522922, + "flos": 17720950953600.0, + "grad_norm": 1.7648009416578843, + "language_loss": 0.67556012, + "learning_rate": 7.614336469056172e-07, + "loss": 0.70003766, + "num_input_tokens_seen": 258812330, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.2130127, + "step": 11995, + "time_per_iteration": 2.803344488143921 + }, + { + "auxiliary_loss_clip": 0.01394544, + "auxiliary_loss_mlp": 0.01030843, + "balance_loss_clip": 1.23583674, + "balance_loss_mlp": 1.0098505, + "epoch": 0.7212385390049602, + "flos": 24433780245120.0, + "grad_norm": 1.6905210023185984, + "language_loss": 0.80081379, + "learning_rate": 7.6112787765068e-07, + "loss": 0.82506764, + "num_input_tokens_seen": 258831770, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.20983887, + "step": 11996, + "time_per_iteration": 2.8795478343963623 + }, + { + "auxiliary_loss_clip": 0.01414615, + "auxiliary_loss_mlp": 0.01033279, + "balance_loss_clip": 1.25080538, + "balance_loss_mlp": 1.01401424, + "epoch": 0.7212986622576282, + "flos": 28158895918080.0, + "grad_norm": 2.650346266504728, + "language_loss": 0.82489169, + "learning_rate": 7.60822155373899e-07, + "loss": 0.8493706, + "num_input_tokens_seen": 258849090, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19238281, + "step": 11997, + "time_per_iteration": 2.893066167831421 + }, + { + "auxiliary_loss_clip": 0.01413551, + "auxiliary_loss_mlp": 0.01035863, + "balance_loss_clip": 1.24763346, + "balance_loss_mlp": 1.0142746, + "epoch": 0.7213587855102961, + "flos": 21845923338240.0, + "grad_norm": 2.142757407226953, + "language_loss": 0.68171251, + "learning_rate": 7.605164800868646e-07, + "loss": 0.70620668, + "num_input_tokens_seen": 258868230, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.21594238, + "step": 11998, + "time_per_iteration": 2.8460843563079834 + }, + { + "auxiliary_loss_clip": 0.01409046, + "auxiliary_loss_mlp": 0.010336, + "balance_loss_clip": 1.24684238, + "balance_loss_mlp": 1.014503, + "epoch": 0.7214189087629641, + "flos": 14619817324800.0, + "grad_norm": 2.024940766347357, + "language_loss": 0.72797835, + "learning_rate": 7.602108518011696e-07, + "loss": 0.75240487, + "num_input_tokens_seen": 258885525, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19104004, + "step": 11999, + "time_per_iteration": 2.8100616931915283 + }, + { + "auxiliary_loss_clip": 0.01407658, + "auxiliary_loss_mlp": 0.01033014, + "balance_loss_clip": 1.24523973, + "balance_loss_mlp": 1.01279616, + "epoch": 0.721479032015632, + "flos": 19400289396480.0, + "grad_norm": 5.687019692259008, + "language_loss": 0.84398949, + "learning_rate": 7.599052705284039e-07, + "loss": 0.86839616, + "num_input_tokens_seen": 258903245, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.20202637, + "step": 12000, + "time_per_iteration": 2.8108911514282227 + }, + { + "auxiliary_loss_clip": 0.01412044, + "auxiliary_loss_mlp": 0.01037035, + "balance_loss_clip": 1.249228, + "balance_loss_mlp": 1.01774728, + "epoch": 0.7215391552683, + "flos": 18521478783360.0, + "grad_norm": 2.054086719660786, + "language_loss": 0.78311515, + "learning_rate": 7.59599736280154e-07, + "loss": 0.80760592, + "num_input_tokens_seen": 258921245, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19287109, + "step": 12001, + "time_per_iteration": 2.8439884185791016 + }, + { + "auxiliary_loss_clip": 0.01395886, + "auxiliary_loss_mlp": 0.0103748, + "balance_loss_clip": 1.23792863, + "balance_loss_mlp": 1.01740551, + "epoch": 0.721599278520968, + "flos": 23269256847360.0, + "grad_norm": 1.6650387846520809, + "language_loss": 0.82773691, + "learning_rate": 7.592942490680066e-07, + "loss": 0.85207057, + "num_input_tokens_seen": 258939425, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.20068359, + "step": 12002, + "time_per_iteration": 2.9147582054138184 + }, + { + "auxiliary_loss_clip": 0.01417614, + "auxiliary_loss_mlp": 0.0103466, + "balance_loss_clip": 1.25387943, + "balance_loss_mlp": 1.01458466, + "epoch": 0.721659401773636, + "flos": 39212678862720.0, + "grad_norm": 1.9342406308144207, + "language_loss": 0.63340199, + "learning_rate": 7.589888089035462e-07, + "loss": 0.65792465, + "num_input_tokens_seen": 258960710, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.20068359, + "step": 12003, + "time_per_iteration": 3.043604612350464 + }, + { + "auxiliary_loss_clip": 0.01423032, + "auxiliary_loss_mlp": 0.01037613, + "balance_loss_clip": 1.25745952, + "balance_loss_mlp": 1.01738334, + "epoch": 0.7217195250263039, + "flos": 14948268462720.0, + "grad_norm": 2.4014697920967123, + "language_loss": 0.69294143, + "learning_rate": 7.586834157983544e-07, + "loss": 0.71754789, + "num_input_tokens_seen": 258978475, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.20239258, + "step": 12004, + "time_per_iteration": 2.837273359298706 + }, + { + "auxiliary_loss_clip": 0.01188606, + "auxiliary_loss_mlp": 0.01027737, + "balance_loss_clip": 1.10000408, + "balance_loss_mlp": 1.00103366, + "epoch": 0.7217796482789719, + "flos": 70900662142080.0, + "grad_norm": 0.8579453607702863, + "language_loss": 0.54102492, + "learning_rate": 7.583780697640112e-07, + "loss": 0.56318831, + "num_input_tokens_seen": 259037520, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 0.26757812, + "step": 12005, + "time_per_iteration": 4.741447687149048 + }, + { + "auxiliary_loss_clip": 0.01407996, + "auxiliary_loss_mlp": 0.01039505, + "balance_loss_clip": 1.24528956, + "balance_loss_mlp": 1.01889348, + "epoch": 0.7218397715316398, + "flos": 37465690164480.0, + "grad_norm": 1.5219384071488482, + "language_loss": 0.63641822, + "learning_rate": 7.580727708120962e-07, + "loss": 0.6608932, + "num_input_tokens_seen": 259061325, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.20617676, + "step": 12006, + "time_per_iteration": 2.9969661235809326 + }, + { + "auxiliary_loss_clip": 0.01425169, + "auxiliary_loss_mlp": 0.01037282, + "balance_loss_clip": 1.25974584, + "balance_loss_mlp": 1.01769638, + "epoch": 0.7218998947843078, + "flos": 22720842898560.0, + "grad_norm": 2.4274958122345627, + "language_loss": 0.91629338, + "learning_rate": 7.577675189541865e-07, + "loss": 0.94091791, + "num_input_tokens_seen": 259078135, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19580078, + "step": 12007, + "time_per_iteration": 2.8227572441101074 + }, + { + "auxiliary_loss_clip": 0.01410285, + "auxiliary_loss_mlp": 0.01033976, + "balance_loss_clip": 1.24690282, + "balance_loss_mlp": 1.01280403, + "epoch": 0.7219600180369758, + "flos": 12174862055040.0, + "grad_norm": 1.6699485456748728, + "language_loss": 0.6443758, + "learning_rate": 7.574623142018568e-07, + "loss": 0.66881835, + "num_input_tokens_seen": 259095910, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.21166992, + "step": 12008, + "time_per_iteration": 2.842782497406006 + }, + { + "auxiliary_loss_clip": 0.01416329, + "auxiliary_loss_mlp": 0.0103019, + "balance_loss_clip": 1.2509433, + "balance_loss_mlp": 1.01067567, + "epoch": 0.7220201412896438, + "flos": 22605522606720.0, + "grad_norm": 2.7769349905231806, + "language_loss": 0.78757954, + "learning_rate": 7.57157156566681e-07, + "loss": 0.81204474, + "num_input_tokens_seen": 259114225, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19519043, + "step": 12009, + "time_per_iteration": 2.863978624343872 + }, + { + "auxiliary_loss_clip": 0.01424908, + "auxiliary_loss_mlp": 0.01033129, + "balance_loss_clip": 1.25892437, + "balance_loss_mlp": 1.0129354, + "epoch": 0.7220802645423118, + "flos": 26728458975360.0, + "grad_norm": 4.611855756354056, + "language_loss": 0.64231282, + "learning_rate": 7.568520460602297e-07, + "loss": 0.66689318, + "num_input_tokens_seen": 259134660, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.20178223, + "step": 12010, + "time_per_iteration": 2.8872697353363037 + }, + { + "auxiliary_loss_clip": 0.01412831, + "auxiliary_loss_mlp": 0.01030601, + "balance_loss_clip": 1.2507031, + "balance_loss_mlp": 1.01100266, + "epoch": 0.7221403877949797, + "flos": 24429889192320.0, + "grad_norm": 1.8721531601217651, + "language_loss": 0.78275502, + "learning_rate": 7.565469826940742e-07, + "loss": 0.80718935, + "num_input_tokens_seen": 259153300, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19592285, + "step": 12011, + "time_per_iteration": 2.8706040382385254 + }, + { + "auxiliary_loss_clip": 0.01408116, + "auxiliary_loss_mlp": 0.0103219, + "balance_loss_clip": 1.24746251, + "balance_loss_mlp": 1.01368868, + "epoch": 0.7222005110476477, + "flos": 23525035557120.0, + "grad_norm": 1.7440084794151987, + "language_loss": 0.80270994, + "learning_rate": 7.56241966479781e-07, + "loss": 0.82711303, + "num_input_tokens_seen": 259172115, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18505859, + "step": 12012, + "time_per_iteration": 2.9151716232299805 + }, + { + "auxiliary_loss_clip": 0.01405224, + "auxiliary_loss_mlp": 0.01031337, + "balance_loss_clip": 1.24251616, + "balance_loss_mlp": 1.01221561, + "epoch": 0.7222606343003156, + "flos": 23122961850240.0, + "grad_norm": 2.79803032023028, + "language_loss": 0.76562667, + "learning_rate": 7.559369974289171e-07, + "loss": 0.78999227, + "num_input_tokens_seen": 259191345, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19128418, + "step": 12013, + "time_per_iteration": 4.249547481536865 + }, + { + "auxiliary_loss_clip": 0.01409403, + "auxiliary_loss_mlp": 0.01031887, + "balance_loss_clip": 1.24942422, + "balance_loss_mlp": 1.01253963, + "epoch": 0.7223207575529836, + "flos": 24361424530560.0, + "grad_norm": 1.5831642681304265, + "language_loss": 0.76705694, + "learning_rate": 7.556320755530484e-07, + "loss": 0.79146981, + "num_input_tokens_seen": 259211700, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19372559, + "step": 12014, + "time_per_iteration": 2.8735532760620117 + }, + { + "auxiliary_loss_clip": 0.01418448, + "auxiliary_loss_mlp": 0.01030714, + "balance_loss_clip": 1.2543242, + "balance_loss_mlp": 1.01149714, + "epoch": 0.7223808808056515, + "flos": 28342002199680.0, + "grad_norm": 1.6127434477910607, + "language_loss": 0.87092865, + "learning_rate": 7.553272008637346e-07, + "loss": 0.89542025, + "num_input_tokens_seen": 259233825, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19213867, + "step": 12015, + "time_per_iteration": 2.911015272140503 + }, + { + "auxiliary_loss_clip": 0.01402768, + "auxiliary_loss_mlp": 0.0103022, + "balance_loss_clip": 1.24239564, + "balance_loss_mlp": 1.01155138, + "epoch": 0.7224410040583196, + "flos": 21079356370560.0, + "grad_norm": 1.9720433764767953, + "language_loss": 0.7864846, + "learning_rate": 7.55022373372538e-07, + "loss": 0.8108145, + "num_input_tokens_seen": 259253055, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18640137, + "step": 12016, + "time_per_iteration": 4.259311199188232 + }, + { + "auxiliary_loss_clip": 0.01403797, + "auxiliary_loss_mlp": 0.01036484, + "balance_loss_clip": 1.24371839, + "balance_loss_mlp": 1.01707625, + "epoch": 0.7225011273109875, + "flos": 26806244065920.0, + "grad_norm": 1.4568209385192235, + "language_loss": 0.78258085, + "learning_rate": 7.547175930910186e-07, + "loss": 0.80698371, + "num_input_tokens_seen": 259273420, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.1940918, + "step": 12017, + "time_per_iteration": 4.2947492599487305 + }, + { + "auxiliary_loss_clip": 0.0140183, + "auxiliary_loss_mlp": 0.0103511, + "balance_loss_clip": 1.24254203, + "balance_loss_mlp": 1.01620293, + "epoch": 0.7225612505636555, + "flos": 23593273994880.0, + "grad_norm": 1.7472856614962702, + "language_loss": 0.74275994, + "learning_rate": 7.54412860030732e-07, + "loss": 0.7671293, + "num_input_tokens_seen": 259291000, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18908691, + "step": 12018, + "time_per_iteration": 2.8729517459869385 + }, + { + "auxiliary_loss_clip": 0.01395998, + "auxiliary_loss_mlp": 0.0103437, + "balance_loss_clip": 1.23990178, + "balance_loss_mlp": 1.01594067, + "epoch": 0.7226213738163234, + "flos": 20787490293120.0, + "grad_norm": 1.7054495311733995, + "language_loss": 0.78101659, + "learning_rate": 7.541081742032347e-07, + "loss": 0.80532032, + "num_input_tokens_seen": 259312390, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.18408203, + "step": 12019, + "time_per_iteration": 2.869986057281494 + }, + { + "auxiliary_loss_clip": 0.01402797, + "auxiliary_loss_mlp": 0.01030722, + "balance_loss_clip": 1.24206614, + "balance_loss_mlp": 1.01062346, + "epoch": 0.7226814970689914, + "flos": 32648859256320.0, + "grad_norm": 1.710176812401023, + "language_loss": 0.74427223, + "learning_rate": 7.53803535620081e-07, + "loss": 0.76860738, + "num_input_tokens_seen": 259332645, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.2010498, + "step": 12020, + "time_per_iteration": 2.932291269302368 + }, + { + "auxiliary_loss_clip": 0.01416302, + "auxiliary_loss_mlp": 0.01037192, + "balance_loss_clip": 1.25216436, + "balance_loss_mlp": 1.0183804, + "epoch": 0.7227416203216595, + "flos": 22464249782400.0, + "grad_norm": 1.6280409385602646, + "language_loss": 0.77928722, + "learning_rate": 7.534989442928219e-07, + "loss": 0.8038221, + "num_input_tokens_seen": 259353810, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.18798828, + "step": 12021, + "time_per_iteration": 2.950657367706299 + }, + { + "auxiliary_loss_clip": 0.01407597, + "auxiliary_loss_mlp": 0.01033843, + "balance_loss_clip": 1.2469461, + "balance_loss_mlp": 1.01425672, + "epoch": 0.7228017435743274, + "flos": 21662138384640.0, + "grad_norm": 3.6048549861755186, + "language_loss": 0.69408977, + "learning_rate": 7.531944002330073e-07, + "loss": 0.71850419, + "num_input_tokens_seen": 259372460, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19567871, + "step": 12022, + "time_per_iteration": 2.874795913696289 + }, + { + "auxiliary_loss_clip": 0.01411404, + "auxiliary_loss_mlp": 0.01032984, + "balance_loss_clip": 1.24838376, + "balance_loss_mlp": 1.01323056, + "epoch": 0.7228618668269954, + "flos": 29545418142720.0, + "grad_norm": 1.6569712483134886, + "language_loss": 0.69744623, + "learning_rate": 7.528899034521858e-07, + "loss": 0.72189009, + "num_input_tokens_seen": 259393275, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.1973877, + "step": 12023, + "time_per_iteration": 2.871166467666626 + }, + { + "auxiliary_loss_clip": 0.01397819, + "auxiliary_loss_mlp": 0.01032519, + "balance_loss_clip": 1.23706746, + "balance_loss_mlp": 1.01367211, + "epoch": 0.7229219900796633, + "flos": 27465227602560.0, + "grad_norm": 1.5445384399234414, + "language_loss": 0.71695876, + "learning_rate": 7.525854539619052e-07, + "loss": 0.7412622, + "num_input_tokens_seen": 259416205, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18847656, + "step": 12024, + "time_per_iteration": 2.9208128452301025 + }, + { + "auxiliary_loss_clip": 0.0140036, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.23931527, + "balance_loss_mlp": 1.01577961, + "epoch": 0.7229821133323313, + "flos": 16297934158080.0, + "grad_norm": 2.4517953666256416, + "language_loss": 0.76369905, + "learning_rate": 7.522810517737089e-07, + "loss": 0.78804958, + "num_input_tokens_seen": 259433115, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18908691, + "step": 12025, + "time_per_iteration": 2.7892825603485107 + }, + { + "auxiliary_loss_clip": 0.01386349, + "auxiliary_loss_mlp": 0.01032546, + "balance_loss_clip": 1.22927475, + "balance_loss_mlp": 1.01307952, + "epoch": 0.7230422365849992, + "flos": 20422001646720.0, + "grad_norm": 1.962605814506806, + "language_loss": 0.76594263, + "learning_rate": 7.519766968991395e-07, + "loss": 0.79013157, + "num_input_tokens_seen": 259450475, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.19470215, + "step": 12026, + "time_per_iteration": 2.807366132736206 + }, + { + "auxiliary_loss_clip": 0.01413783, + "auxiliary_loss_mlp": 0.01033087, + "balance_loss_clip": 1.24979866, + "balance_loss_mlp": 1.01408517, + "epoch": 0.7231023598376672, + "flos": 25604547425280.0, + "grad_norm": 1.8726564749492303, + "language_loss": 0.68944657, + "learning_rate": 7.516723893497388e-07, + "loss": 0.71391523, + "num_input_tokens_seen": 259469355, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19006348, + "step": 12027, + "time_per_iteration": 2.8627994060516357 + }, + { + "auxiliary_loss_clip": 0.01427606, + "auxiliary_loss_mlp": 0.01035391, + "balance_loss_clip": 1.26254284, + "balance_loss_mlp": 1.01610267, + "epoch": 0.7231624830903352, + "flos": 25158966203520.0, + "grad_norm": 1.864985347356744, + "language_loss": 0.79984581, + "learning_rate": 7.513681291370469e-07, + "loss": 0.82447577, + "num_input_tokens_seen": 259486565, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19274902, + "step": 12028, + "time_per_iteration": 2.8669936656951904 + }, + { + "auxiliary_loss_clip": 0.01410981, + "auxiliary_loss_mlp": 0.01029986, + "balance_loss_clip": 1.24924839, + "balance_loss_mlp": 1.01125801, + "epoch": 0.7232226063430032, + "flos": 21735715708800.0, + "grad_norm": 1.7632658502273728, + "language_loss": 0.83167505, + "learning_rate": 7.510639162726e-07, + "loss": 0.8560847, + "num_input_tokens_seen": 259505070, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.18701172, + "step": 12029, + "time_per_iteration": 2.8251454830169678 + }, + { + "auxiliary_loss_clip": 0.011813, + "auxiliary_loss_mlp": 0.01022149, + "balance_loss_clip": 1.09431612, + "balance_loss_mlp": 1.00278938, + "epoch": 0.7232827295956711, + "flos": 68470773390720.0, + "grad_norm": 0.8091300271856494, + "language_loss": 0.61822057, + "learning_rate": 7.507597507679347e-07, + "loss": 0.64025509, + "num_input_tokens_seen": 259569135, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.19335938, + "step": 12030, + "time_per_iteration": 3.392035484313965 + }, + { + "auxiliary_loss_clip": 0.01395277, + "auxiliary_loss_mlp": 0.01033788, + "balance_loss_clip": 1.23654819, + "balance_loss_mlp": 1.0139401, + "epoch": 0.7233428528483391, + "flos": 20202038835840.0, + "grad_norm": 1.9154288511415865, + "language_loss": 0.78551698, + "learning_rate": 7.504556326345859e-07, + "loss": 0.8098076, + "num_input_tokens_seen": 259587035, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.1986084, + "step": 12031, + "time_per_iteration": 3.0072810649871826 + }, + { + "auxiliary_loss_clip": 0.01415907, + "auxiliary_loss_mlp": 0.01035155, + "balance_loss_clip": 1.25275087, + "balance_loss_mlp": 1.01585531, + "epoch": 0.723402976101007, + "flos": 23959531802880.0, + "grad_norm": 1.7331401658647874, + "language_loss": 0.81855488, + "learning_rate": 7.501515618840834e-07, + "loss": 0.8430655, + "num_input_tokens_seen": 259606140, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19311523, + "step": 12032, + "time_per_iteration": 2.9443275928497314 + }, + { + "auxiliary_loss_clip": 0.01428835, + "auxiliary_loss_mlp": 0.01035305, + "balance_loss_clip": 1.26106203, + "balance_loss_mlp": 1.01563597, + "epoch": 0.723463099353675, + "flos": 20823079968000.0, + "grad_norm": 2.3935625001880734, + "language_loss": 0.76231217, + "learning_rate": 7.498475385279592e-07, + "loss": 0.78695351, + "num_input_tokens_seen": 259624275, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.19677734, + "step": 12033, + "time_per_iteration": 2.854008436203003 + }, + { + "auxiliary_loss_clip": 0.01402616, + "auxiliary_loss_mlp": 0.01030879, + "balance_loss_clip": 1.24281251, + "balance_loss_mlp": 1.01154339, + "epoch": 0.723523222606343, + "flos": 19107111219840.0, + "grad_norm": 1.8155031183393504, + "language_loss": 0.75352263, + "learning_rate": 7.495435625777423e-07, + "loss": 0.7778576, + "num_input_tokens_seen": 259643465, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.1932373, + "step": 12034, + "time_per_iteration": 2.845275640487671 + }, + { + "auxiliary_loss_clip": 0.0139792, + "auxiliary_loss_mlp": 0.01029664, + "balance_loss_clip": 1.23757076, + "balance_loss_mlp": 1.01175892, + "epoch": 0.723583345859011, + "flos": 26518630999680.0, + "grad_norm": 1.9432312333607356, + "language_loss": 0.81827462, + "learning_rate": 7.492396340449578e-07, + "loss": 0.84255052, + "num_input_tokens_seen": 259662500, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.17895508, + "step": 12035, + "time_per_iteration": 2.865886688232422 + }, + { + "auxiliary_loss_clip": 0.01420269, + "auxiliary_loss_mlp": 0.01032608, + "balance_loss_clip": 1.25588369, + "balance_loss_mlp": 1.01346326, + "epoch": 0.723643469111679, + "flos": 16042336427520.0, + "grad_norm": 2.1505970593620884, + "language_loss": 0.61385995, + "learning_rate": 7.489357529411326e-07, + "loss": 0.63838875, + "num_input_tokens_seen": 259680140, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.19140625, + "step": 12036, + "time_per_iteration": 2.8170084953308105 + }, + { + "auxiliary_loss_clip": 0.01390756, + "auxiliary_loss_mlp": 0.01034651, + "balance_loss_clip": 1.23423684, + "balance_loss_mlp": 1.01573217, + "epoch": 0.7237035923643469, + "flos": 21955814254080.0, + "grad_norm": 1.5022210200663448, + "language_loss": 0.68810284, + "learning_rate": 7.486319192777883e-07, + "loss": 0.71235693, + "num_input_tokens_seen": 259700160, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.18920898, + "step": 12037, + "time_per_iteration": 2.918562650680542 + }, + { + "auxiliary_loss_clip": 0.01398413, + "auxiliary_loss_mlp": 0.0103266, + "balance_loss_clip": 1.23878431, + "balance_loss_mlp": 1.01316905, + "epoch": 0.7237637156170149, + "flos": 23592685812480.0, + "grad_norm": 2.086594719906185, + "language_loss": 0.72854757, + "learning_rate": 7.483281330664479e-07, + "loss": 0.75285828, + "num_input_tokens_seen": 259720525, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19482422, + "step": 12038, + "time_per_iteration": 2.8516435623168945 + }, + { + "auxiliary_loss_clip": 0.0140955, + "auxiliary_loss_mlp": 0.01034929, + "balance_loss_clip": 1.24810529, + "balance_loss_mlp": 1.01446033, + "epoch": 0.7238238388696828, + "flos": 20604384011520.0, + "grad_norm": 1.622583060990797, + "language_loss": 0.72841787, + "learning_rate": 7.480243943186293e-07, + "loss": 0.75286269, + "num_input_tokens_seen": 259738680, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.20483398, + "step": 12039, + "time_per_iteration": 2.837143659591675 + }, + { + "auxiliary_loss_clip": 0.01413971, + "auxiliary_loss_mlp": 0.01033734, + "balance_loss_clip": 1.25184774, + "balance_loss_mlp": 1.01464868, + "epoch": 0.7238839621223508, + "flos": 24217708487040.0, + "grad_norm": 1.69866560192204, + "language_loss": 0.77008325, + "learning_rate": 7.477207030458513e-07, + "loss": 0.79456031, + "num_input_tokens_seen": 259758790, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.1907959, + "step": 12040, + "time_per_iteration": 4.357841491699219 + }, + { + "auxiliary_loss_clip": 0.01411384, + "auxiliary_loss_mlp": 0.01036301, + "balance_loss_clip": 1.24786246, + "balance_loss_mlp": 1.0163815, + "epoch": 0.7239440853750188, + "flos": 14217788862720.0, + "grad_norm": 1.693898473480938, + "language_loss": 0.78029168, + "learning_rate": 7.474170592596301e-07, + "loss": 0.8047685, + "num_input_tokens_seen": 259777370, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19921875, + "step": 12041, + "time_per_iteration": 2.820542573928833 + }, + { + "auxiliary_loss_clip": 0.0140452, + "auxiliary_loss_mlp": 0.01032713, + "balance_loss_clip": 1.24233985, + "balance_loss_mlp": 1.01365137, + "epoch": 0.7240042086276868, + "flos": 21624376959360.0, + "grad_norm": 2.2020509346659085, + "language_loss": 0.64625037, + "learning_rate": 7.471134629714797e-07, + "loss": 0.67062271, + "num_input_tokens_seen": 259794665, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.1907959, + "step": 12042, + "time_per_iteration": 2.8402535915374756 + }, + { + "auxiliary_loss_clip": 0.01418086, + "auxiliary_loss_mlp": 0.01038944, + "balance_loss_clip": 1.25254607, + "balance_loss_mlp": 1.01755786, + "epoch": 0.7240643318803547, + "flos": 23341929275520.0, + "grad_norm": 1.877949027570416, + "language_loss": 0.84038532, + "learning_rate": 7.468099141929116e-07, + "loss": 0.86495554, + "num_input_tokens_seen": 259811110, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.21386719, + "step": 12043, + "time_per_iteration": 2.8376688957214355 + }, + { + "auxiliary_loss_clip": 0.01408623, + "auxiliary_loss_mlp": 0.01034706, + "balance_loss_clip": 1.2462883, + "balance_loss_mlp": 1.01548982, + "epoch": 0.7241244551330227, + "flos": 24035552346240.0, + "grad_norm": 2.3987181235540227, + "language_loss": 0.64688647, + "learning_rate": 7.465064129354379e-07, + "loss": 0.67131972, + "num_input_tokens_seen": 259831080, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.1920166, + "step": 12044, + "time_per_iteration": 2.856257200241089 + }, + { + "auxiliary_loss_clip": 0.01426527, + "auxiliary_loss_mlp": 0.01035842, + "balance_loss_clip": 1.26321828, + "balance_loss_mlp": 1.0163871, + "epoch": 0.7241845783856906, + "flos": 18738591171840.0, + "grad_norm": 1.5626507505605252, + "language_loss": 0.82471317, + "learning_rate": 7.462029592105658e-07, + "loss": 0.84933686, + "num_input_tokens_seen": 259850135, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19470215, + "step": 12045, + "time_per_iteration": 2.8290116786956787 + }, + { + "auxiliary_loss_clip": 0.01397085, + "auxiliary_loss_mlp": 0.01036997, + "balance_loss_clip": 1.23965049, + "balance_loss_mlp": 1.01774478, + "epoch": 0.7242447016383586, + "flos": 19507691848320.0, + "grad_norm": 1.8716094106937367, + "language_loss": 0.72667968, + "learning_rate": 7.458995530298034e-07, + "loss": 0.75102055, + "num_input_tokens_seen": 259868185, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.19262695, + "step": 12046, + "time_per_iteration": 2.866175651550293 + }, + { + "auxiliary_loss_clip": 0.01415925, + "auxiliary_loss_mlp": 0.01031017, + "balance_loss_clip": 1.25313616, + "balance_loss_mlp": 1.01210999, + "epoch": 0.7243048248910267, + "flos": 22173560069760.0, + "grad_norm": 1.7745662094740648, + "language_loss": 0.71631861, + "learning_rate": 7.455961944046553e-07, + "loss": 0.74078798, + "num_input_tokens_seen": 259887055, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.18920898, + "step": 12047, + "time_per_iteration": 4.250116586685181 + }, + { + "auxiliary_loss_clip": 0.014291, + "auxiliary_loss_mlp": 0.0103912, + "balance_loss_clip": 1.26338506, + "balance_loss_mlp": 1.01978469, + "epoch": 0.7243649481436946, + "flos": 27684240272640.0, + "grad_norm": 1.5183386016287255, + "language_loss": 0.70175636, + "learning_rate": 7.45292883346627e-07, + "loss": 0.72643858, + "num_input_tokens_seen": 259908295, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.1932373, + "step": 12048, + "time_per_iteration": 2.86251163482666 + }, + { + "auxiliary_loss_clip": 0.01179235, + "auxiliary_loss_mlp": 0.01036243, + "balance_loss_clip": 1.09257674, + "balance_loss_mlp": 1.00610721, + "epoch": 0.7244250713963626, + "flos": 63274246968960.0, + "grad_norm": 0.8422359325511274, + "language_loss": 0.53720915, + "learning_rate": 7.449896198672168e-07, + "loss": 0.55936396, + "num_input_tokens_seen": 259968475, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.30078125, + "step": 12049, + "time_per_iteration": 3.374913215637207 + }, + { + "auxiliary_loss_clip": 0.01439719, + "auxiliary_loss_mlp": 0.0103779, + "balance_loss_clip": 1.26955998, + "balance_loss_mlp": 1.01808429, + "epoch": 0.7244851946490305, + "flos": 17975960501760.0, + "grad_norm": 2.280897325512806, + "language_loss": 0.60312122, + "learning_rate": 7.446864039779258e-07, + "loss": 0.62789637, + "num_input_tokens_seen": 259984865, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.19714355, + "step": 12050, + "time_per_iteration": 2.8094446659088135 + }, + { + "auxiliary_loss_clip": 0.01182486, + "auxiliary_loss_mlp": 0.01028033, + "balance_loss_clip": 1.09600496, + "balance_loss_mlp": 1.00829172, + "epoch": 0.7245453179016985, + "flos": 70975506320640.0, + "grad_norm": 0.7134482866064371, + "language_loss": 0.53333354, + "learning_rate": 7.443832356902528e-07, + "loss": 0.55543876, + "num_input_tokens_seen": 260046735, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.19726562, + "step": 12051, + "time_per_iteration": 4.63675594329834 + }, + { + "auxiliary_loss_clip": 0.0140225, + "auxiliary_loss_mlp": 0.01040474, + "balance_loss_clip": 1.24338293, + "balance_loss_mlp": 1.02184165, + "epoch": 0.7246054411543664, + "flos": 24578355939840.0, + "grad_norm": 1.455416704483892, + "language_loss": 0.72455442, + "learning_rate": 7.440801150156927e-07, + "loss": 0.74898171, + "num_input_tokens_seen": 260067950, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18640137, + "step": 12052, + "time_per_iteration": 4.333528518676758 + }, + { + "auxiliary_loss_clip": 0.01414557, + "auxiliary_loss_mlp": 0.01033003, + "balance_loss_clip": 1.25219882, + "balance_loss_mlp": 1.01409626, + "epoch": 0.7246655644070344, + "flos": 32349799255680.0, + "grad_norm": 2.1303566387877546, + "language_loss": 0.74858767, + "learning_rate": 7.437770419657415e-07, + "loss": 0.77306324, + "num_input_tokens_seen": 260087730, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.18920898, + "step": 12053, + "time_per_iteration": 2.9229955673217773 + }, + { + "auxiliary_loss_clip": 0.01415915, + "auxiliary_loss_mlp": 0.01033552, + "balance_loss_clip": 1.25422001, + "balance_loss_mlp": 1.01502705, + "epoch": 0.7247256876597024, + "flos": 21882689377920.0, + "grad_norm": 2.0861571585412824, + "language_loss": 0.78634149, + "learning_rate": 7.434740165518898e-07, + "loss": 0.81083614, + "num_input_tokens_seen": 260107760, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18530273, + "step": 12054, + "time_per_iteration": 2.8486745357513428 + }, + { + "auxiliary_loss_clip": 0.01404524, + "auxiliary_loss_mlp": 0.01037453, + "balance_loss_clip": 1.24520266, + "balance_loss_mlp": 1.01923764, + "epoch": 0.7247858109123704, + "flos": 16220782494720.0, + "grad_norm": 3.8904672516423573, + "language_loss": 0.6929121, + "learning_rate": 7.431710387856301e-07, + "loss": 0.71733183, + "num_input_tokens_seen": 260123660, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18212891, + "step": 12055, + "time_per_iteration": 2.810206413269043 + }, + { + "auxiliary_loss_clip": 0.01411611, + "auxiliary_loss_mlp": 0.01038958, + "balance_loss_clip": 1.25005722, + "balance_loss_mlp": 1.02000415, + "epoch": 0.7248459341650383, + "flos": 20860615169280.0, + "grad_norm": 1.6443746038613802, + "language_loss": 0.74373245, + "learning_rate": 7.428681086784496e-07, + "loss": 0.76823819, + "num_input_tokens_seen": 260142690, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18969727, + "step": 12056, + "time_per_iteration": 2.837925672531128 + }, + { + "auxiliary_loss_clip": 0.01406083, + "auxiliary_loss_mlp": 0.01034294, + "balance_loss_clip": 1.24828172, + "balance_loss_mlp": 1.01487434, + "epoch": 0.7249060574177063, + "flos": 25932908073600.0, + "grad_norm": 1.6577191573565986, + "language_loss": 0.71441722, + "learning_rate": 7.425652262418368e-07, + "loss": 0.73882103, + "num_input_tokens_seen": 260162590, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.19433594, + "step": 12057, + "time_per_iteration": 2.893820285797119 + }, + { + "auxiliary_loss_clip": 0.0143304, + "auxiliary_loss_mlp": 0.01036588, + "balance_loss_clip": 1.26708436, + "balance_loss_mlp": 1.01724076, + "epoch": 0.7249661806703742, + "flos": 17353878739200.0, + "grad_norm": 1.711166086286174, + "language_loss": 0.62850428, + "learning_rate": 7.42262391487277e-07, + "loss": 0.65320051, + "num_input_tokens_seen": 260181065, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.19372559, + "step": 12058, + "time_per_iteration": 2.8284049034118652 + }, + { + "auxiliary_loss_clip": 0.01412743, + "auxiliary_loss_mlp": 0.0103667, + "balance_loss_clip": 1.25152874, + "balance_loss_mlp": 1.01788259, + "epoch": 0.7250263039230422, + "flos": 19583848126080.0, + "grad_norm": 1.8336679143849142, + "language_loss": 0.75636399, + "learning_rate": 7.419596044262535e-07, + "loss": 0.78085804, + "num_input_tokens_seen": 260200330, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18786621, + "step": 12059, + "time_per_iteration": 2.844123125076294 + }, + { + "auxiliary_loss_clip": 0.01396785, + "auxiliary_loss_mlp": 0.01038338, + "balance_loss_clip": 1.23951936, + "balance_loss_mlp": 1.01910949, + "epoch": 0.7250864271757103, + "flos": 21985522104960.0, + "grad_norm": 1.832719681295629, + "language_loss": 0.79991531, + "learning_rate": 7.416568650702472e-07, + "loss": 0.82426649, + "num_input_tokens_seen": 260219975, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.19226074, + "step": 12060, + "time_per_iteration": 2.8727757930755615 + }, + { + "auxiliary_loss_clip": 0.01415924, + "auxiliary_loss_mlp": 0.01037471, + "balance_loss_clip": 1.25310087, + "balance_loss_mlp": 1.01738453, + "epoch": 0.7251465504283782, + "flos": 25024253875200.0, + "grad_norm": 1.7539549735417825, + "language_loss": 0.7678085, + "learning_rate": 7.413541734307393e-07, + "loss": 0.79234242, + "num_input_tokens_seen": 260242025, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.20080566, + "step": 12061, + "time_per_iteration": 2.8641445636749268 + }, + { + "auxiliary_loss_clip": 0.01407065, + "auxiliary_loss_mlp": 0.01033333, + "balance_loss_clip": 1.24979484, + "balance_loss_mlp": 1.01483166, + "epoch": 0.7252066736810462, + "flos": 16698650520960.0, + "grad_norm": 1.6139273163158219, + "language_loss": 0.81719947, + "learning_rate": 7.410515295192068e-07, + "loss": 0.84160352, + "num_input_tokens_seen": 260260015, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18518066, + "step": 12062, + "time_per_iteration": 2.9163150787353516 + }, + { + "auxiliary_loss_clip": 0.01431512, + "auxiliary_loss_mlp": 0.01033101, + "balance_loss_clip": 1.26574516, + "balance_loss_mlp": 1.01386058, + "epoch": 0.7252667969337141, + "flos": 25714121627520.0, + "grad_norm": 2.5367337008644695, + "language_loss": 0.70038974, + "learning_rate": 7.407489333471262e-07, + "loss": 0.72503585, + "num_input_tokens_seen": 260278635, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.19238281, + "step": 12063, + "time_per_iteration": 2.85710072517395 + }, + { + "auxiliary_loss_clip": 0.01399619, + "auxiliary_loss_mlp": 0.01034349, + "balance_loss_clip": 1.24214125, + "balance_loss_mlp": 1.01589549, + "epoch": 0.7253269201863821, + "flos": 18269093433600.0, + "grad_norm": 1.415242393153993, + "language_loss": 0.70703, + "learning_rate": 7.40446384925973e-07, + "loss": 0.73136961, + "num_input_tokens_seen": 260298510, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.18457031, + "step": 12064, + "time_per_iteration": 2.8255996704101562 + }, + { + "auxiliary_loss_clip": 0.01410075, + "auxiliary_loss_mlp": 0.01033066, + "balance_loss_clip": 1.24934292, + "balance_loss_mlp": 1.01405203, + "epoch": 0.72538704343905, + "flos": 20421322974720.0, + "grad_norm": 2.022143255212664, + "language_loss": 0.91515481, + "learning_rate": 7.401438842672192e-07, + "loss": 0.93958628, + "num_input_tokens_seen": 260317405, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19018555, + "step": 12065, + "time_per_iteration": 2.822315216064453 + }, + { + "auxiliary_loss_clip": 0.01186727, + "auxiliary_loss_mlp": 0.01023285, + "balance_loss_clip": 1.09768438, + "balance_loss_mlp": 1.00430667, + "epoch": 0.725447166691718, + "flos": 70185520529280.0, + "grad_norm": 0.6503065049822433, + "language_loss": 0.56102222, + "learning_rate": 7.398414313823349e-07, + "loss": 0.58312231, + "num_input_tokens_seen": 260388085, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.18945312, + "step": 12066, + "time_per_iteration": 3.4587628841400146 + }, + { + "auxiliary_loss_clip": 0.01404011, + "auxiliary_loss_mlp": 0.01038178, + "balance_loss_clip": 1.24470413, + "balance_loss_mlp": 1.01997483, + "epoch": 0.725507289944386, + "flos": 27063606343680.0, + "grad_norm": 1.643452986987224, + "language_loss": 0.76999986, + "learning_rate": 7.395390262827897e-07, + "loss": 0.79442173, + "num_input_tokens_seen": 260406165, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18200684, + "step": 12067, + "time_per_iteration": 2.8817858695983887 + }, + { + "auxiliary_loss_clip": 0.01180954, + "auxiliary_loss_mlp": 0.01018115, + "balance_loss_clip": 1.09326386, + "balance_loss_mlp": 0.99970931, + "epoch": 0.725567413197054, + "flos": 62953034999040.0, + "grad_norm": 0.7238393188290771, + "language_loss": 0.57083803, + "learning_rate": 7.392366689800515e-07, + "loss": 0.59282869, + "num_input_tokens_seen": 260461365, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.18359375, + "step": 12068, + "time_per_iteration": 3.184457540512085 + }, + { + "auxiliary_loss_clip": 0.01184163, + "auxiliary_loss_mlp": 0.0102139, + "balance_loss_clip": 1.09544706, + "balance_loss_mlp": 0.99840659, + "epoch": 0.7256275364497219, + "flos": 60326647505280.0, + "grad_norm": 0.6583360732542883, + "language_loss": 0.55471879, + "learning_rate": 7.389343594855848e-07, + "loss": 0.57677436, + "num_input_tokens_seen": 260523795, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 0.22949219, + "step": 12069, + "time_per_iteration": 3.302109956741333 + }, + { + "auxiliary_loss_clip": 0.01402023, + "auxiliary_loss_mlp": 0.010323, + "balance_loss_clip": 1.24440563, + "balance_loss_mlp": 1.01403689, + "epoch": 0.7256876597023899, + "flos": 24509167361280.0, + "grad_norm": 1.7452482634964892, + "language_loss": 0.80218726, + "learning_rate": 7.38632097810854e-07, + "loss": 0.82653052, + "num_input_tokens_seen": 260544765, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18261719, + "step": 12070, + "time_per_iteration": 2.914280414581299 + }, + { + "auxiliary_loss_clip": 0.01397697, + "auxiliary_loss_mlp": 0.01031843, + "balance_loss_clip": 1.2419008, + "balance_loss_mlp": 1.01335323, + "epoch": 0.7257477829550578, + "flos": 24363415301760.0, + "grad_norm": 1.9333670830833847, + "language_loss": 0.72951251, + "learning_rate": 7.383298839673197e-07, + "loss": 0.7538079, + "num_input_tokens_seen": 260564340, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.18481445, + "step": 12071, + "time_per_iteration": 2.907585859298706 + }, + { + "auxiliary_loss_clip": 0.01405866, + "auxiliary_loss_mlp": 0.0104111, + "balance_loss_clip": 1.24755287, + "balance_loss_mlp": 1.02301383, + "epoch": 0.7258079062077258, + "flos": 17211203326080.0, + "grad_norm": 1.8102499453797956, + "language_loss": 0.70559937, + "learning_rate": 7.380277179664436e-07, + "loss": 0.73006916, + "num_input_tokens_seen": 260582565, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.1809082, + "step": 12072, + "time_per_iteration": 2.882935047149658 + }, + { + "auxiliary_loss_clip": 0.01418105, + "auxiliary_loss_mlp": 0.01040688, + "balance_loss_clip": 1.25266051, + "balance_loss_mlp": 1.01977849, + "epoch": 0.7258680294603939, + "flos": 21590416097280.0, + "grad_norm": 1.9230776985328575, + "language_loss": 0.78880638, + "learning_rate": 7.377255998196821e-07, + "loss": 0.81339431, + "num_input_tokens_seen": 260601700, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.20935059, + "step": 12073, + "time_per_iteration": 2.8940987586975098 + }, + { + "auxiliary_loss_clip": 0.01408608, + "auxiliary_loss_mlp": 0.01031671, + "balance_loss_clip": 1.24945807, + "balance_loss_mlp": 1.01315808, + "epoch": 0.7259281527130618, + "flos": 34867426953600.0, + "grad_norm": 1.4153469374095564, + "language_loss": 0.70550179, + "learning_rate": 7.374235295384923e-07, + "loss": 0.72990453, + "num_input_tokens_seen": 260623040, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18518066, + "step": 12074, + "time_per_iteration": 3.077580213546753 + }, + { + "auxiliary_loss_clip": 0.01416064, + "auxiliary_loss_mlp": 0.01037089, + "balance_loss_clip": 1.2521708, + "balance_loss_mlp": 1.01811051, + "epoch": 0.7259882759657298, + "flos": 25413975751680.0, + "grad_norm": 2.5704217843644397, + "language_loss": 0.74729222, + "learning_rate": 7.371215071343302e-07, + "loss": 0.77182376, + "num_input_tokens_seen": 260642735, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.18969727, + "step": 12075, + "time_per_iteration": 4.337449312210083 + }, + { + "auxiliary_loss_clip": 0.01406029, + "auxiliary_loss_mlp": 0.01038863, + "balance_loss_clip": 1.24400449, + "balance_loss_mlp": 1.01858556, + "epoch": 0.7260483992183977, + "flos": 62974037865600.0, + "grad_norm": 1.5067812408678491, + "language_loss": 0.6439321, + "learning_rate": 7.368195326186458e-07, + "loss": 0.66838104, + "num_input_tokens_seen": 260669935, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.20275879, + "step": 12076, + "time_per_iteration": 3.226510524749756 + }, + { + "auxiliary_loss_clip": 0.01413699, + "auxiliary_loss_mlp": 0.01032804, + "balance_loss_clip": 1.25066912, + "balance_loss_mlp": 1.01325345, + "epoch": 0.7261085224710657, + "flos": 26478381110400.0, + "grad_norm": 1.7505039069783277, + "language_loss": 0.79500055, + "learning_rate": 7.365176060028912e-07, + "loss": 0.81946558, + "num_input_tokens_seen": 260689605, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19555664, + "step": 12077, + "time_per_iteration": 2.9068408012390137 + }, + { + "auxiliary_loss_clip": 0.01181929, + "auxiliary_loss_mlp": 0.01024077, + "balance_loss_clip": 1.09180522, + "balance_loss_mlp": 1.00347793, + "epoch": 0.7261686457237336, + "flos": 66800511169920.0, + "grad_norm": 0.8941141302257031, + "language_loss": 0.64993656, + "learning_rate": 7.362157272985163e-07, + "loss": 0.67199671, + "num_input_tokens_seen": 260748265, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.20605469, + "step": 12078, + "time_per_iteration": 3.337397336959839 + }, + { + "auxiliary_loss_clip": 0.0118073, + "auxiliary_loss_mlp": 0.01029967, + "balance_loss_clip": 1.09211516, + "balance_loss_mlp": 1.00402725, + "epoch": 0.7262287689764017, + "flos": 70032393567360.0, + "grad_norm": 0.7065851911831613, + "language_loss": 0.59259927, + "learning_rate": 7.359138965169671e-07, + "loss": 0.61470628, + "num_input_tokens_seen": 260816715, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.25976562, + "step": 12079, + "time_per_iteration": 3.478893995285034 + }, + { + "auxiliary_loss_clip": 0.01406836, + "auxiliary_loss_mlp": 0.01034528, + "balance_loss_clip": 1.24666286, + "balance_loss_mlp": 1.01566923, + "epoch": 0.7262888922290696, + "flos": 23815951493760.0, + "grad_norm": 1.8534978875254196, + "language_loss": 0.66255862, + "learning_rate": 7.356121136696895e-07, + "loss": 0.68697226, + "num_input_tokens_seen": 260836765, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18859863, + "step": 12080, + "time_per_iteration": 2.898808717727661 + }, + { + "auxiliary_loss_clip": 0.01414856, + "auxiliary_loss_mlp": 0.01034238, + "balance_loss_clip": 1.25012696, + "balance_loss_mlp": 1.01514077, + "epoch": 0.7263490154817376, + "flos": 19509773109120.0, + "grad_norm": 3.0433775755611703, + "language_loss": 0.7188797, + "learning_rate": 7.35310378768128e-07, + "loss": 0.74337065, + "num_input_tokens_seen": 260854610, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.19116211, + "step": 12081, + "time_per_iteration": 2.849839210510254 + }, + { + "auxiliary_loss_clip": 0.01414945, + "auxiliary_loss_mlp": 0.01036743, + "balance_loss_clip": 1.25033009, + "balance_loss_mlp": 1.01714444, + "epoch": 0.7264091387344055, + "flos": 16293997860480.0, + "grad_norm": 1.7181843117330031, + "language_loss": 0.81794405, + "learning_rate": 7.350086918237237e-07, + "loss": 0.84246099, + "num_input_tokens_seen": 260871620, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19580078, + "step": 12082, + "time_per_iteration": 4.314535140991211 + }, + { + "auxiliary_loss_clip": 0.01428349, + "auxiliary_loss_mlp": 0.01035284, + "balance_loss_clip": 1.25840044, + "balance_loss_mlp": 1.01559043, + "epoch": 0.7264692619870735, + "flos": 24362510405760.0, + "grad_norm": 2.385260865079037, + "language_loss": 0.78007758, + "learning_rate": 7.347070528479158e-07, + "loss": 0.80471396, + "num_input_tokens_seen": 260890490, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.19689941, + "step": 12083, + "time_per_iteration": 2.874288320541382 + }, + { + "auxiliary_loss_clip": 0.0142581, + "auxiliary_loss_mlp": 0.01032539, + "balance_loss_clip": 1.26089549, + "balance_loss_mlp": 1.01323855, + "epoch": 0.7265293852397414, + "flos": 25130479962240.0, + "grad_norm": 1.754378378815766, + "language_loss": 0.73349011, + "learning_rate": 7.344054618521433e-07, + "loss": 0.75807357, + "num_input_tokens_seen": 260909700, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.19311523, + "step": 12084, + "time_per_iteration": 2.88417387008667 + }, + { + "auxiliary_loss_clip": 0.01416784, + "auxiliary_loss_mlp": 0.01036759, + "balance_loss_clip": 1.25307953, + "balance_loss_mlp": 1.01823401, + "epoch": 0.7265895084924094, + "flos": 22648532428800.0, + "grad_norm": 1.7297975480036487, + "language_loss": 0.78691012, + "learning_rate": 7.34103918847843e-07, + "loss": 0.81144559, + "num_input_tokens_seen": 260929090, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.18518066, + "step": 12085, + "time_per_iteration": 2.864600419998169 + }, + { + "auxiliary_loss_clip": 0.01412951, + "auxiliary_loss_mlp": 0.01035583, + "balance_loss_clip": 1.2490859, + "balance_loss_mlp": 1.01666451, + "epoch": 0.7266496317450775, + "flos": 23378785804800.0, + "grad_norm": 2.2531652984469006, + "language_loss": 0.72975284, + "learning_rate": 7.338024238464493e-07, + "loss": 0.75423825, + "num_input_tokens_seen": 260946615, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.18933105, + "step": 12086, + "time_per_iteration": 4.280728816986084 + }, + { + "auxiliary_loss_clip": 0.01408735, + "auxiliary_loss_mlp": 0.01034923, + "balance_loss_clip": 1.24794793, + "balance_loss_mlp": 1.01564682, + "epoch": 0.7267097549977454, + "flos": 28086313979520.0, + "grad_norm": 3.7160703579365384, + "language_loss": 0.70417094, + "learning_rate": 7.335009768593938e-07, + "loss": 0.72860742, + "num_input_tokens_seen": 260968515, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19287109, + "step": 12087, + "time_per_iteration": 4.308889389038086 + }, + { + "auxiliary_loss_clip": 0.01421186, + "auxiliary_loss_mlp": 0.01037347, + "balance_loss_clip": 1.25609303, + "balance_loss_mlp": 1.01684272, + "epoch": 0.7267698782504134, + "flos": 22204444285440.0, + "grad_norm": 2.2006842732751752, + "language_loss": 0.79678547, + "learning_rate": 7.331995778981088e-07, + "loss": 0.82137084, + "num_input_tokens_seen": 260986790, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.20507812, + "step": 12088, + "time_per_iteration": 2.8395471572875977 + }, + { + "auxiliary_loss_clip": 0.01419101, + "auxiliary_loss_mlp": 0.01036152, + "balance_loss_clip": 1.2543112, + "balance_loss_mlp": 1.01645911, + "epoch": 0.7268300015030813, + "flos": 18523967247360.0, + "grad_norm": 1.709143396029944, + "language_loss": 0.74509984, + "learning_rate": 7.328982269740221e-07, + "loss": 0.76965237, + "num_input_tokens_seen": 261004925, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19702148, + "step": 12089, + "time_per_iteration": 2.809056043624878 + }, + { + "auxiliary_loss_clip": 0.0141726, + "auxiliary_loss_mlp": 0.01036796, + "balance_loss_clip": 1.25462699, + "balance_loss_mlp": 1.01717412, + "epoch": 0.7268901247557493, + "flos": 23996116863360.0, + "grad_norm": 1.723145008026815, + "language_loss": 0.72060287, + "learning_rate": 7.325969240985616e-07, + "loss": 0.74514341, + "num_input_tokens_seen": 261023895, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19616699, + "step": 12090, + "time_per_iteration": 2.8941128253936768 + }, + { + "auxiliary_loss_clip": 0.01421769, + "auxiliary_loss_mlp": 0.01034379, + "balance_loss_clip": 1.25796962, + "balance_loss_mlp": 1.01484013, + "epoch": 0.7269502480084172, + "flos": 32100852510720.0, + "grad_norm": 15.1584887312411, + "language_loss": 0.77583146, + "learning_rate": 7.322956692831528e-07, + "loss": 0.80039299, + "num_input_tokens_seen": 261045445, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19519043, + "step": 12091, + "time_per_iteration": 3.0046520233154297 + }, + { + "auxiliary_loss_clip": 0.01407496, + "auxiliary_loss_mlp": 0.01033111, + "balance_loss_clip": 1.2459836, + "balance_loss_mlp": 1.01341784, + "epoch": 0.7270103712610853, + "flos": 19072200216960.0, + "grad_norm": 1.9493405540081374, + "language_loss": 0.72054982, + "learning_rate": 7.319944625392205e-07, + "loss": 0.7449559, + "num_input_tokens_seen": 261064275, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19689941, + "step": 12092, + "time_per_iteration": 2.832512617111206 + }, + { + "auxiliary_loss_clip": 0.01413373, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.25040007, + "balance_loss_mlp": 1.01383924, + "epoch": 0.7270704945137532, + "flos": 34546395962880.0, + "grad_norm": 1.89889620844141, + "language_loss": 0.61968446, + "learning_rate": 7.31693303878184e-07, + "loss": 0.64415389, + "num_input_tokens_seen": 261083310, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19726562, + "step": 12093, + "time_per_iteration": 2.9706997871398926 + }, + { + "auxiliary_loss_clip": 0.01411273, + "auxiliary_loss_mlp": 0.01033613, + "balance_loss_clip": 1.25014806, + "balance_loss_mlp": 1.01382422, + "epoch": 0.7271306177664212, + "flos": 21517698424320.0, + "grad_norm": 1.5250103690673458, + "language_loss": 0.7599386, + "learning_rate": 7.313921933114644e-07, + "loss": 0.78438747, + "num_input_tokens_seen": 261103460, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19799805, + "step": 12094, + "time_per_iteration": 2.8583364486694336 + }, + { + "auxiliary_loss_clip": 0.01402848, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.24422359, + "balance_loss_mlp": 1.01490164, + "epoch": 0.7271907410190891, + "flos": 22282410355200.0, + "grad_norm": 1.7692817442696116, + "language_loss": 0.85461414, + "learning_rate": 7.310911308504808e-07, + "loss": 0.8789795, + "num_input_tokens_seen": 261121375, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18786621, + "step": 12095, + "time_per_iteration": 2.8835935592651367 + }, + { + "auxiliary_loss_clip": 0.0141097, + "auxiliary_loss_mlp": 0.01036722, + "balance_loss_clip": 1.2480402, + "balance_loss_mlp": 1.01717186, + "epoch": 0.7272508642717571, + "flos": 22903360997760.0, + "grad_norm": 1.9639040722498762, + "language_loss": 0.78669798, + "learning_rate": 7.307901165066479e-07, + "loss": 0.81117487, + "num_input_tokens_seen": 261141105, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.19555664, + "step": 12096, + "time_per_iteration": 2.8442108631134033 + }, + { + "auxiliary_loss_clip": 0.01419332, + "auxiliary_loss_mlp": 0.01035869, + "balance_loss_clip": 1.25614762, + "balance_loss_mlp": 1.01764154, + "epoch": 0.727310987524425, + "flos": 11663304635520.0, + "grad_norm": 1.9538289875191748, + "language_loss": 0.73364484, + "learning_rate": 7.30489150291381e-07, + "loss": 0.75819683, + "num_input_tokens_seen": 261159255, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.18237305, + "step": 12097, + "time_per_iteration": 2.856088638305664 + }, + { + "auxiliary_loss_clip": 0.01417987, + "auxiliary_loss_mlp": 0.01035828, + "balance_loss_clip": 1.25347376, + "balance_loss_mlp": 1.01701677, + "epoch": 0.727371110777093, + "flos": 24546069135360.0, + "grad_norm": 1.8447795624926446, + "language_loss": 0.77224135, + "learning_rate": 7.301882322160935e-07, + "loss": 0.79677951, + "num_input_tokens_seen": 261177960, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.18811035, + "step": 12098, + "time_per_iteration": 2.8744685649871826 + }, + { + "auxiliary_loss_clip": 0.01422314, + "auxiliary_loss_mlp": 0.01031123, + "balance_loss_clip": 1.25561213, + "balance_loss_mlp": 1.01208472, + "epoch": 0.7274312340297611, + "flos": 74763639296640.0, + "grad_norm": 1.7198908600579652, + "language_loss": 0.68032181, + "learning_rate": 7.298873622921952e-07, + "loss": 0.70485616, + "num_input_tokens_seen": 261205660, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.19030762, + "step": 12099, + "time_per_iteration": 3.3003385066986084 + }, + { + "auxiliary_loss_clip": 0.01425701, + "auxiliary_loss_mlp": 0.01035844, + "balance_loss_clip": 1.25758123, + "balance_loss_mlp": 1.01507759, + "epoch": 0.727491357282429, + "flos": 22352187116160.0, + "grad_norm": 1.5200208234118708, + "language_loss": 0.73209786, + "learning_rate": 7.29586540531095e-07, + "loss": 0.75671327, + "num_input_tokens_seen": 261225185, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.20776367, + "step": 12100, + "time_per_iteration": 2.8768422603607178 + }, + { + "auxiliary_loss_clip": 0.0142332, + "auxiliary_loss_mlp": 0.01039499, + "balance_loss_clip": 1.26048779, + "balance_loss_mlp": 1.02096188, + "epoch": 0.727551480535097, + "flos": 23308692330240.0, + "grad_norm": 1.591788990465889, + "language_loss": 0.7490207, + "learning_rate": 7.292857669442005e-07, + "loss": 0.77364886, + "num_input_tokens_seen": 261247965, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.18554688, + "step": 12101, + "time_per_iteration": 2.913317918777466 + }, + { + "auxiliary_loss_clip": 0.01405009, + "auxiliary_loss_mlp": 0.01033635, + "balance_loss_clip": 1.24582517, + "balance_loss_mlp": 1.01561046, + "epoch": 0.7276116037877649, + "flos": 21480525181440.0, + "grad_norm": 1.8114734862873358, + "language_loss": 0.83265293, + "learning_rate": 7.289850415429177e-07, + "loss": 0.85703939, + "num_input_tokens_seen": 261267585, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18041992, + "step": 12102, + "time_per_iteration": 2.839599132537842 + }, + { + "auxiliary_loss_clip": 0.01410603, + "auxiliary_loss_mlp": 0.01029339, + "balance_loss_clip": 1.24871075, + "balance_loss_mlp": 1.01111174, + "epoch": 0.7276717270404329, + "flos": 21472335872640.0, + "grad_norm": 2.0843500721336867, + "language_loss": 0.82358956, + "learning_rate": 7.286843643386495e-07, + "loss": 0.84798896, + "num_input_tokens_seen": 261285200, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18225098, + "step": 12103, + "time_per_iteration": 2.8556008338928223 + }, + { + "auxiliary_loss_clip": 0.01417471, + "auxiliary_loss_mlp": 0.01032636, + "balance_loss_clip": 1.25344324, + "balance_loss_mlp": 1.01324105, + "epoch": 0.7277318502931008, + "flos": 16846076638080.0, + "grad_norm": 1.7784000298165807, + "language_loss": 0.67555165, + "learning_rate": 7.283837353427968e-07, + "loss": 0.70005274, + "num_input_tokens_seen": 261303645, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19396973, + "step": 12104, + "time_per_iteration": 2.8202319145202637 + }, + { + "auxiliary_loss_clip": 0.01408679, + "auxiliary_loss_mlp": 0.01033517, + "balance_loss_clip": 1.2482059, + "balance_loss_mlp": 1.01407421, + "epoch": 0.7277919735457689, + "flos": 33413390208000.0, + "grad_norm": 3.2835331604082496, + "language_loss": 0.6641196, + "learning_rate": 7.280831545667611e-07, + "loss": 0.68854153, + "num_input_tokens_seen": 261323265, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19421387, + "step": 12105, + "time_per_iteration": 2.998450994491577 + }, + { + "auxiliary_loss_clip": 0.01409732, + "auxiliary_loss_mlp": 0.01035398, + "balance_loss_clip": 1.24873066, + "balance_loss_mlp": 1.01608634, + "epoch": 0.7278520967984368, + "flos": 19215373322880.0, + "grad_norm": 2.1219062688945454, + "language_loss": 0.7602756, + "learning_rate": 7.27782622021939e-07, + "loss": 0.78472698, + "num_input_tokens_seen": 261339745, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19311523, + "step": 12106, + "time_per_iteration": 2.823988914489746 + }, + { + "auxiliary_loss_clip": 0.01433028, + "auxiliary_loss_mlp": 0.0103996, + "balance_loss_clip": 1.26654911, + "balance_loss_mlp": 1.0203501, + "epoch": 0.7279122200511048, + "flos": 34107918174720.0, + "grad_norm": 1.9990180625704166, + "language_loss": 0.70888186, + "learning_rate": 7.274821377197273e-07, + "loss": 0.73361164, + "num_input_tokens_seen": 261359310, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.19604492, + "step": 12107, + "time_per_iteration": 2.9482038021087646 + }, + { + "auxiliary_loss_clip": 0.01408053, + "auxiliary_loss_mlp": 0.01031595, + "balance_loss_clip": 1.24613404, + "balance_loss_mlp": 1.01271248, + "epoch": 0.7279723433037727, + "flos": 54617459806080.0, + "grad_norm": 1.4084801597393757, + "language_loss": 0.75769401, + "learning_rate": 7.271817016715205e-07, + "loss": 0.78209049, + "num_input_tokens_seen": 261384640, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.18884277, + "step": 12108, + "time_per_iteration": 3.154111623764038 + }, + { + "auxiliary_loss_clip": 0.01426693, + "auxiliary_loss_mlp": 0.01034947, + "balance_loss_clip": 1.26113629, + "balance_loss_mlp": 1.01581442, + "epoch": 0.7280324665564407, + "flos": 36151161696000.0, + "grad_norm": 1.982993180602593, + "language_loss": 0.67644757, + "learning_rate": 7.268813138887124e-07, + "loss": 0.70106399, + "num_input_tokens_seen": 261405290, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.19140625, + "step": 12109, + "time_per_iteration": 4.440367221832275 + }, + { + "auxiliary_loss_clip": 0.01407537, + "auxiliary_loss_mlp": 0.01036245, + "balance_loss_clip": 1.24718308, + "balance_loss_mlp": 1.01614666, + "epoch": 0.7280925898091086, + "flos": 11625724189440.0, + "grad_norm": 2.116810314290769, + "language_loss": 0.63821995, + "learning_rate": 7.265809743826912e-07, + "loss": 0.66265786, + "num_input_tokens_seen": 261419710, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.20068359, + "step": 12110, + "time_per_iteration": 2.851145029067993 + }, + { + "auxiliary_loss_clip": 0.01423477, + "auxiliary_loss_mlp": 0.01033762, + "balance_loss_clip": 1.25726283, + "balance_loss_mlp": 1.01411653, + "epoch": 0.7281527130617766, + "flos": 34290391029120.0, + "grad_norm": 1.9238147650186237, + "language_loss": 0.59912622, + "learning_rate": 7.26280683164847e-07, + "loss": 0.62369859, + "num_input_tokens_seen": 261442385, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.19628906, + "step": 12111, + "time_per_iteration": 3.0161428451538086 + }, + { + "auxiliary_loss_clip": 0.01420623, + "auxiliary_loss_mlp": 0.01033317, + "balance_loss_clip": 1.25482738, + "balance_loss_mlp": 1.01477981, + "epoch": 0.7282128363144446, + "flos": 13926058519680.0, + "grad_norm": 2.0023844953482457, + "language_loss": 0.74952918, + "learning_rate": 7.259804402465677e-07, + "loss": 0.77406865, + "num_input_tokens_seen": 261459805, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.18530273, + "step": 12112, + "time_per_iteration": 2.8429298400878906 + }, + { + "auxiliary_loss_clip": 0.01404313, + "auxiliary_loss_mlp": 0.01032685, + "balance_loss_clip": 1.24272633, + "balance_loss_mlp": 1.01401722, + "epoch": 0.7282729595671126, + "flos": 20787490293120.0, + "grad_norm": 2.6319206592044053, + "language_loss": 0.67358816, + "learning_rate": 7.25680245639237e-07, + "loss": 0.69795811, + "num_input_tokens_seen": 261477175, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18688965, + "step": 12113, + "time_per_iteration": 2.8669586181640625 + }, + { + "auxiliary_loss_clip": 0.01411925, + "auxiliary_loss_mlp": 0.01032458, + "balance_loss_clip": 1.24713337, + "balance_loss_mlp": 1.01283574, + "epoch": 0.7283330828197806, + "flos": 16333885791360.0, + "grad_norm": 1.8998578281642216, + "language_loss": 0.74103618, + "learning_rate": 7.253800993542399e-07, + "loss": 0.76548004, + "num_input_tokens_seen": 261494990, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19628906, + "step": 12114, + "time_per_iteration": 2.85542368888855 + }, + { + "auxiliary_loss_clip": 0.01407939, + "auxiliary_loss_mlp": 0.01031595, + "balance_loss_clip": 1.246508, + "balance_loss_mlp": 1.01167536, + "epoch": 0.7283932060724485, + "flos": 27501586439040.0, + "grad_norm": 2.1578201335609597, + "language_loss": 0.6893152, + "learning_rate": 7.250800014029564e-07, + "loss": 0.71371061, + "num_input_tokens_seen": 261514445, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19897461, + "step": 12115, + "time_per_iteration": 2.910413980484009 + }, + { + "auxiliary_loss_clip": 0.0143702, + "auxiliary_loss_mlp": 0.01035007, + "balance_loss_clip": 1.26935303, + "balance_loss_mlp": 1.01623154, + "epoch": 0.7284533293251165, + "flos": 18376676864640.0, + "grad_norm": 1.7033770539496362, + "language_loss": 0.60604846, + "learning_rate": 7.247799517967674e-07, + "loss": 0.63076866, + "num_input_tokens_seen": 261533565, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.18774414, + "step": 12116, + "time_per_iteration": 2.9810190200805664 + }, + { + "auxiliary_loss_clip": 0.01417136, + "auxiliary_loss_mlp": 0.01034657, + "balance_loss_clip": 1.25482607, + "balance_loss_mlp": 1.01643038, + "epoch": 0.7285134525777844, + "flos": 21735444240000.0, + "grad_norm": 2.0422862025745525, + "language_loss": 0.73812866, + "learning_rate": 7.2447995054705e-07, + "loss": 0.76264656, + "num_input_tokens_seen": 261553795, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18237305, + "step": 12117, + "time_per_iteration": 4.372398853302002 + }, + { + "auxiliary_loss_clip": 0.01419597, + "auxiliary_loss_mlp": 0.01029852, + "balance_loss_clip": 1.25538826, + "balance_loss_mlp": 1.01091027, + "epoch": 0.7285735758304525, + "flos": 20751267191040.0, + "grad_norm": 2.1055684838085362, + "language_loss": 0.71546394, + "learning_rate": 7.241799976651807e-07, + "loss": 0.73995841, + "num_input_tokens_seen": 261572565, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.1895752, + "step": 12118, + "time_per_iteration": 2.8421473503112793 + }, + { + "auxiliary_loss_clip": 0.01408354, + "auxiliary_loss_mlp": 0.01034928, + "balance_loss_clip": 1.25039721, + "balance_loss_mlp": 1.01692724, + "epoch": 0.7286336990831204, + "flos": 17319782142720.0, + "grad_norm": 1.7135764188869396, + "language_loss": 0.84919155, + "learning_rate": 7.238800931625346e-07, + "loss": 0.87362444, + "num_input_tokens_seen": 261590910, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18017578, + "step": 12119, + "time_per_iteration": 2.816567897796631 + }, + { + "auxiliary_loss_clip": 0.01419516, + "auxiliary_loss_mlp": 0.01029402, + "balance_loss_clip": 1.25578833, + "balance_loss_mlp": 1.01144958, + "epoch": 0.7286938223357884, + "flos": 19795666872960.0, + "grad_norm": 2.0797298924209744, + "language_loss": 0.82270837, + "learning_rate": 7.235802370504831e-07, + "loss": 0.84719753, + "num_input_tokens_seen": 261606005, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.1796875, + "step": 12120, + "time_per_iteration": 2.8061304092407227 + }, + { + "auxiliary_loss_clip": 0.01425594, + "auxiliary_loss_mlp": 0.01034151, + "balance_loss_clip": 1.26171851, + "balance_loss_mlp": 1.01535201, + "epoch": 0.7287539455884563, + "flos": 15349211049600.0, + "grad_norm": 1.760982644813853, + "language_loss": 0.79007769, + "learning_rate": 7.232804293403963e-07, + "loss": 0.81467521, + "num_input_tokens_seen": 261622305, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.18811035, + "step": 12121, + "time_per_iteration": 4.422580718994141 + }, + { + "auxiliary_loss_clip": 0.01424479, + "auxiliary_loss_mlp": 0.01033462, + "balance_loss_clip": 1.25682759, + "balance_loss_mlp": 1.01423335, + "epoch": 0.7288140688411243, + "flos": 25203514348800.0, + "grad_norm": 3.239788959764501, + "language_loss": 0.696751, + "learning_rate": 7.229806700436441e-07, + "loss": 0.7213304, + "num_input_tokens_seen": 261642465, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.19238281, + "step": 12122, + "time_per_iteration": 4.2991838455200195 + }, + { + "auxiliary_loss_clip": 0.01399012, + "auxiliary_loss_mlp": 0.01031775, + "balance_loss_clip": 1.2399919, + "balance_loss_mlp": 1.01414371, + "epoch": 0.7288741920937922, + "flos": 23993764133760.0, + "grad_norm": 1.8654465831727127, + "language_loss": 0.87941277, + "learning_rate": 7.226809591715923e-07, + "loss": 0.90372062, + "num_input_tokens_seen": 261661420, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.17651367, + "step": 12123, + "time_per_iteration": 2.847494602203369 + }, + { + "auxiliary_loss_clip": 0.01412646, + "auxiliary_loss_mlp": 0.01030935, + "balance_loss_clip": 1.2519753, + "balance_loss_mlp": 1.01198101, + "epoch": 0.7289343153464602, + "flos": 22754532291840.0, + "grad_norm": 1.6651048585489334, + "language_loss": 0.83103585, + "learning_rate": 7.223812967356065e-07, + "loss": 0.85547161, + "num_input_tokens_seen": 261680865, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.18945312, + "step": 12124, + "time_per_iteration": 2.8753581047058105 + }, + { + "auxiliary_loss_clip": 0.0141429, + "auxiliary_loss_mlp": 0.01032789, + "balance_loss_clip": 1.25169301, + "balance_loss_mlp": 1.01440692, + "epoch": 0.7289944385991282, + "flos": 24910833864960.0, + "grad_norm": 1.7157243554116082, + "language_loss": 0.6789794, + "learning_rate": 7.220816827470499e-07, + "loss": 0.7034502, + "num_input_tokens_seen": 261701455, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.18395996, + "step": 12125, + "time_per_iteration": 2.9622833728790283 + }, + { + "auxiliary_loss_clip": 0.01428598, + "auxiliary_loss_mlp": 0.01037385, + "balance_loss_clip": 1.26022947, + "balance_loss_mlp": 1.0175848, + "epoch": 0.7290545618517962, + "flos": 22977571749120.0, + "grad_norm": 1.7420205968090947, + "language_loss": 0.76263785, + "learning_rate": 7.217821172172855e-07, + "loss": 0.78729773, + "num_input_tokens_seen": 261721260, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.19812012, + "step": 12126, + "time_per_iteration": 2.869610548019409 + }, + { + "auxiliary_loss_clip": 0.01181649, + "auxiliary_loss_mlp": 0.01017469, + "balance_loss_clip": 1.09454322, + "balance_loss_mlp": 0.99887258, + "epoch": 0.7291146851044642, + "flos": 61932680092800.0, + "grad_norm": 0.8233413291365862, + "language_loss": 0.58688235, + "learning_rate": 7.2148260015767e-07, + "loss": 0.60887361, + "num_input_tokens_seen": 261779370, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.18554688, + "step": 12127, + "time_per_iteration": 3.2875189781188965 + }, + { + "auxiliary_loss_clip": 0.01409088, + "auxiliary_loss_mlp": 0.01030131, + "balance_loss_clip": 1.24860859, + "balance_loss_mlp": 1.0116775, + "epoch": 0.7291748083571321, + "flos": 23341295848320.0, + "grad_norm": 19.484785675667176, + "language_loss": 0.69433421, + "learning_rate": 7.21183131579562e-07, + "loss": 0.7187264, + "num_input_tokens_seen": 261798050, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18469238, + "step": 12128, + "time_per_iteration": 2.893202066421509 + }, + { + "auxiliary_loss_clip": 0.01417356, + "auxiliary_loss_mlp": 0.01033036, + "balance_loss_clip": 1.25291896, + "balance_loss_mlp": 1.01415348, + "epoch": 0.7292349316098001, + "flos": 28341775975680.0, + "grad_norm": 1.797524535678285, + "language_loss": 0.65705597, + "learning_rate": 7.20883711494319e-07, + "loss": 0.68155986, + "num_input_tokens_seen": 261817660, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.18884277, + "step": 12129, + "time_per_iteration": 2.953606128692627 + }, + { + "auxiliary_loss_clip": 0.01406248, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.24720204, + "balance_loss_mlp": 1.01458251, + "epoch": 0.729295054862468, + "flos": 24142411860480.0, + "grad_norm": 1.9549774498277668, + "language_loss": 0.75055939, + "learning_rate": 7.205843399132927e-07, + "loss": 0.77495605, + "num_input_tokens_seen": 261837935, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18823242, + "step": 12130, + "time_per_iteration": 2.9776265621185303 + }, + { + "auxiliary_loss_clip": 0.01416233, + "auxiliary_loss_mlp": 0.01031676, + "balance_loss_clip": 1.2529645, + "balance_loss_mlp": 1.01291203, + "epoch": 0.7293551781151361, + "flos": 22825440172800.0, + "grad_norm": 1.9229561389820833, + "language_loss": 0.70550907, + "learning_rate": 7.202850168478374e-07, + "loss": 0.7299881, + "num_input_tokens_seen": 261857575, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.18774414, + "step": 12131, + "time_per_iteration": 2.926750898361206 + }, + { + "auxiliary_loss_clip": 0.01403484, + "auxiliary_loss_mlp": 0.01035525, + "balance_loss_clip": 1.24338436, + "balance_loss_mlp": 1.0172379, + "epoch": 0.729415301367804, + "flos": 22136658295680.0, + "grad_norm": 1.585555452488032, + "language_loss": 0.78036499, + "learning_rate": 7.199857423093025e-07, + "loss": 0.80475509, + "num_input_tokens_seen": 261877265, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.18286133, + "step": 12132, + "time_per_iteration": 2.886493444442749 + }, + { + "auxiliary_loss_clip": 0.01413249, + "auxiliary_loss_mlp": 0.01033715, + "balance_loss_clip": 1.25308466, + "balance_loss_mlp": 1.01508296, + "epoch": 0.729475424620472, + "flos": 12357334909440.0, + "grad_norm": 2.070208347483394, + "language_loss": 0.80141282, + "learning_rate": 7.196865163090358e-07, + "loss": 0.82588243, + "num_input_tokens_seen": 261893695, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18640137, + "step": 12133, + "time_per_iteration": 2.8525891304016113 + }, + { + "auxiliary_loss_clip": 0.01404097, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.24352729, + "balance_loss_mlp": 1.01297355, + "epoch": 0.7295355478731399, + "flos": 22204172816640.0, + "grad_norm": 1.7565451632547386, + "language_loss": 0.72792494, + "learning_rate": 7.193873388583846e-07, + "loss": 0.75227964, + "num_input_tokens_seen": 261911825, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.1842041, + "step": 12134, + "time_per_iteration": 2.9668548107147217 + }, + { + "auxiliary_loss_clip": 0.01433548, + "auxiliary_loss_mlp": 0.01037699, + "balance_loss_clip": 1.2691853, + "balance_loss_mlp": 1.01847076, + "epoch": 0.7295956711258079, + "flos": 23232490807680.0, + "grad_norm": 1.7177579327943748, + "language_loss": 0.72156924, + "learning_rate": 7.190882099686939e-07, + "loss": 0.74628174, + "num_input_tokens_seen": 261931190, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19238281, + "step": 12135, + "time_per_iteration": 3.0090444087982178 + }, + { + "auxiliary_loss_clip": 0.01423921, + "auxiliary_loss_mlp": 0.01039198, + "balance_loss_clip": 1.25890887, + "balance_loss_mlp": 1.02054191, + "epoch": 0.7296557943784758, + "flos": 31881794595840.0, + "grad_norm": 2.060822482533133, + "language_loss": 0.63160539, + "learning_rate": 7.187891296513075e-07, + "loss": 0.65623659, + "num_input_tokens_seen": 261951240, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.18664551, + "step": 12136, + "time_per_iteration": 3.0272276401519775 + }, + { + "auxiliary_loss_clip": 0.0141774, + "auxiliary_loss_mlp": 0.01035752, + "balance_loss_clip": 1.25519276, + "balance_loss_mlp": 1.01626086, + "epoch": 0.7297159176311439, + "flos": 26663070960000.0, + "grad_norm": 1.7380209593505629, + "language_loss": 0.75714183, + "learning_rate": 7.184900979175654e-07, + "loss": 0.78167677, + "num_input_tokens_seen": 261971605, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19494629, + "step": 12137, + "time_per_iteration": 2.953033447265625 + }, + { + "auxiliary_loss_clip": 0.0142125, + "auxiliary_loss_mlp": 0.0103698, + "balance_loss_clip": 1.25770795, + "balance_loss_mlp": 1.01673853, + "epoch": 0.7297760408838118, + "flos": 24759154736640.0, + "grad_norm": 1.8072777112899314, + "language_loss": 0.74538445, + "learning_rate": 7.181911147788069e-07, + "loss": 0.76996672, + "num_input_tokens_seen": 261990830, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.20239258, + "step": 12138, + "time_per_iteration": 2.882807731628418 + }, + { + "auxiliary_loss_clip": 0.01407229, + "auxiliary_loss_mlp": 0.01030859, + "balance_loss_clip": 1.24546206, + "balance_loss_mlp": 1.01223886, + "epoch": 0.7298361641364798, + "flos": 18081915120000.0, + "grad_norm": 3.920479000594158, + "language_loss": 0.72374189, + "learning_rate": 7.178921802463702e-07, + "loss": 0.74812281, + "num_input_tokens_seen": 262008190, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18615723, + "step": 12139, + "time_per_iteration": 2.836174249649048 + }, + { + "auxiliary_loss_clip": 0.01401452, + "auxiliary_loss_mlp": 0.01033217, + "balance_loss_clip": 1.2439878, + "balance_loss_mlp": 1.01521599, + "epoch": 0.7298962873891478, + "flos": 29906925246720.0, + "grad_norm": 1.5141390556737533, + "language_loss": 0.73797715, + "learning_rate": 7.175932943315898e-07, + "loss": 0.76232386, + "num_input_tokens_seen": 262030460, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18017578, + "step": 12140, + "time_per_iteration": 2.8881592750549316 + }, + { + "auxiliary_loss_clip": 0.0142193, + "auxiliary_loss_mlp": 0.01038619, + "balance_loss_clip": 1.25717688, + "balance_loss_mlp": 1.0193429, + "epoch": 0.7299564106418157, + "flos": 32277760254720.0, + "grad_norm": 1.4608758156402475, + "language_loss": 0.55840242, + "learning_rate": 7.172944570458003e-07, + "loss": 0.58300787, + "num_input_tokens_seen": 262050830, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19274902, + "step": 12141, + "time_per_iteration": 2.9648938179016113 + }, + { + "auxiliary_loss_clip": 0.01409371, + "auxiliary_loss_mlp": 0.01029178, + "balance_loss_clip": 1.25119138, + "balance_loss_mlp": 1.01087952, + "epoch": 0.7300165338944837, + "flos": 22940715219840.0, + "grad_norm": 1.602406960157579, + "language_loss": 0.73512506, + "learning_rate": 7.169956684003342e-07, + "loss": 0.75951058, + "num_input_tokens_seen": 262071245, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.1829834, + "step": 12142, + "time_per_iteration": 2.8505566120147705 + }, + { + "auxiliary_loss_clip": 0.01413953, + "auxiliary_loss_mlp": 0.01032746, + "balance_loss_clip": 1.25235128, + "balance_loss_mlp": 1.0150075, + "epoch": 0.7300766571471516, + "flos": 19838359981440.0, + "grad_norm": 2.9292595640583485, + "language_loss": 0.74345177, + "learning_rate": 7.16696928406521e-07, + "loss": 0.76791871, + "num_input_tokens_seen": 262087525, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.17736816, + "step": 12143, + "time_per_iteration": 2.7940311431884766 + }, + { + "auxiliary_loss_clip": 0.01420326, + "auxiliary_loss_mlp": 0.01032793, + "balance_loss_clip": 1.25857794, + "balance_loss_mlp": 1.01352918, + "epoch": 0.7301367803998197, + "flos": 24357216764160.0, + "grad_norm": 1.8394217001251587, + "language_loss": 0.6771335, + "learning_rate": 7.163982370756882e-07, + "loss": 0.70166475, + "num_input_tokens_seen": 262107355, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19274902, + "step": 12144, + "time_per_iteration": 2.8618180751800537 + }, + { + "auxiliary_loss_clip": 0.01435822, + "auxiliary_loss_mlp": 0.01036297, + "balance_loss_clip": 1.2707839, + "balance_loss_mlp": 1.01723516, + "epoch": 0.7301969036524876, + "flos": 15312897457920.0, + "grad_norm": 1.6799023123971732, + "language_loss": 0.79869401, + "learning_rate": 7.160995944191627e-07, + "loss": 0.82341516, + "num_input_tokens_seen": 262125645, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19067383, + "step": 12145, + "time_per_iteration": 4.263585329055786 + }, + { + "auxiliary_loss_clip": 0.01410192, + "auxiliary_loss_mlp": 0.0103621, + "balance_loss_clip": 1.24928892, + "balance_loss_mlp": 1.01686215, + "epoch": 0.7302570269051556, + "flos": 23516077086720.0, + "grad_norm": 1.7196227530170864, + "language_loss": 0.92188263, + "learning_rate": 7.158010004482702e-07, + "loss": 0.94634664, + "num_input_tokens_seen": 262144075, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.19360352, + "step": 12146, + "time_per_iteration": 2.862212896347046 + }, + { + "auxiliary_loss_clip": 0.01409193, + "auxiliary_loss_mlp": 0.01035599, + "balance_loss_clip": 1.25016677, + "balance_loss_mlp": 1.01747906, + "epoch": 0.7303171501578235, + "flos": 20533068927360.0, + "grad_norm": 1.518822057580962, + "language_loss": 0.62655491, + "learning_rate": 7.155024551743316e-07, + "loss": 0.65100288, + "num_input_tokens_seen": 262165940, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18115234, + "step": 12147, + "time_per_iteration": 2.8818328380584717 + }, + { + "auxiliary_loss_clip": 0.01423427, + "auxiliary_loss_mlp": 0.01037096, + "balance_loss_clip": 1.25861001, + "balance_loss_mlp": 1.01747429, + "epoch": 0.7303772734104915, + "flos": 18341720616960.0, + "grad_norm": 2.0495099550884066, + "language_loss": 0.75881886, + "learning_rate": 7.152039586086693e-07, + "loss": 0.78342414, + "num_input_tokens_seen": 262184520, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.19604492, + "step": 12148, + "time_per_iteration": 2.846372365951538 + }, + { + "auxiliary_loss_clip": 0.01181039, + "auxiliary_loss_mlp": 0.01022885, + "balance_loss_clip": 1.09290576, + "balance_loss_mlp": 1.00629067, + "epoch": 0.7304373966631594, + "flos": 60683992087680.0, + "grad_norm": 0.6928270853999937, + "language_loss": 0.56796145, + "learning_rate": 7.149055107626017e-07, + "loss": 0.59000063, + "num_input_tokens_seen": 262247070, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.16601562, + "step": 12149, + "time_per_iteration": 3.3290328979492188 + }, + { + "auxiliary_loss_clip": 0.01434526, + "auxiliary_loss_mlp": 0.01033884, + "balance_loss_clip": 1.26920295, + "balance_loss_mlp": 1.0140717, + "epoch": 0.7304975199158275, + "flos": 19837455085440.0, + "grad_norm": 1.62426805894834, + "language_loss": 0.74807513, + "learning_rate": 7.146071116474451e-07, + "loss": 0.7727592, + "num_input_tokens_seen": 262266605, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19799805, + "step": 12150, + "time_per_iteration": 2.8794026374816895 + }, + { + "auxiliary_loss_clip": 0.01422095, + "auxiliary_loss_mlp": 0.01033803, + "balance_loss_clip": 1.2570622, + "balance_loss_mlp": 1.01509881, + "epoch": 0.7305576431684954, + "flos": 13231621042560.0, + "grad_norm": 1.9351863608951152, + "language_loss": 0.84214652, + "learning_rate": 7.143087612745158e-07, + "loss": 0.86670554, + "num_input_tokens_seen": 262283880, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.18688965, + "step": 12151, + "time_per_iteration": 4.4292590618133545 + }, + { + "auxiliary_loss_clip": 0.01417669, + "auxiliary_loss_mlp": 0.01037771, + "balance_loss_clip": 1.25407624, + "balance_loss_mlp": 1.01812577, + "epoch": 0.7306177664211634, + "flos": 24070961041920.0, + "grad_norm": 1.9967447859671437, + "language_loss": 0.78904712, + "learning_rate": 7.14010459655127e-07, + "loss": 0.81360149, + "num_input_tokens_seen": 262304155, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19641113, + "step": 12152, + "time_per_iteration": 2.9217722415924072 + }, + { + "auxiliary_loss_clip": 0.01428311, + "auxiliary_loss_mlp": 0.01038541, + "balance_loss_clip": 1.26574302, + "balance_loss_mlp": 1.01982534, + "epoch": 0.7306778896738314, + "flos": 27100462872960.0, + "grad_norm": 1.5392357848382765, + "language_loss": 0.80104268, + "learning_rate": 7.137122068005919e-07, + "loss": 0.82571125, + "num_input_tokens_seen": 262325660, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.18713379, + "step": 12153, + "time_per_iteration": 2.9134867191314697 + }, + { + "auxiliary_loss_clip": 0.01430217, + "auxiliary_loss_mlp": 0.01035274, + "balance_loss_clip": 1.26344872, + "balance_loss_mlp": 1.01580751, + "epoch": 0.7307380129264993, + "flos": 16699329192960.0, + "grad_norm": 1.6421420223235073, + "language_loss": 0.68434024, + "learning_rate": 7.134140027222173e-07, + "loss": 0.70899516, + "num_input_tokens_seen": 262344075, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.19482422, + "step": 12154, + "time_per_iteration": 2.845033884048462 + }, + { + "auxiliary_loss_clip": 0.01420636, + "auxiliary_loss_mlp": 0.01030948, + "balance_loss_clip": 1.25512922, + "balance_loss_mlp": 1.01213646, + "epoch": 0.7307981361791673, + "flos": 21735715708800.0, + "grad_norm": 4.492142219477114, + "language_loss": 0.66357714, + "learning_rate": 7.131158474313128e-07, + "loss": 0.68809301, + "num_input_tokens_seen": 262363305, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.18811035, + "step": 12155, + "time_per_iteration": 2.849754810333252 + }, + { + "auxiliary_loss_clip": 0.01396345, + "auxiliary_loss_mlp": 0.01033052, + "balance_loss_clip": 1.23734665, + "balance_loss_mlp": 1.01478887, + "epoch": 0.7308582594318352, + "flos": 18049673560320.0, + "grad_norm": 1.6665244227894938, + "language_loss": 0.82763767, + "learning_rate": 7.128177409391851e-07, + "loss": 0.85193169, + "num_input_tokens_seen": 262380730, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.18273926, + "step": 12156, + "time_per_iteration": 4.268205642700195 + }, + { + "auxiliary_loss_clip": 0.01410951, + "auxiliary_loss_mlp": 0.01033328, + "balance_loss_clip": 1.24986553, + "balance_loss_mlp": 1.01516032, + "epoch": 0.7309183826845033, + "flos": 13852933643520.0, + "grad_norm": 3.8533759638416707, + "language_loss": 0.76273942, + "learning_rate": 7.125196832571367e-07, + "loss": 0.78718221, + "num_input_tokens_seen": 262395480, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18164062, + "step": 12157, + "time_per_iteration": 2.8005106449127197 + }, + { + "auxiliary_loss_clip": 0.01400901, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.24288297, + "balance_loss_mlp": 1.01449132, + "epoch": 0.7309785059371712, + "flos": 17028006554880.0, + "grad_norm": 1.9562726464332005, + "language_loss": 0.73538417, + "learning_rate": 7.122216743964713e-07, + "loss": 0.75972795, + "num_input_tokens_seen": 262413340, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18994141, + "step": 12158, + "time_per_iteration": 4.2931084632873535 + }, + { + "auxiliary_loss_clip": 0.01422029, + "auxiliary_loss_mlp": 0.01035163, + "balance_loss_clip": 1.25957143, + "balance_loss_mlp": 1.01585102, + "epoch": 0.7310386291898392, + "flos": 26512839665280.0, + "grad_norm": 1.5417582188937977, + "language_loss": 0.86456656, + "learning_rate": 7.119237143684896e-07, + "loss": 0.88913852, + "num_input_tokens_seen": 262433455, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19299316, + "step": 12159, + "time_per_iteration": 2.883857011795044 + }, + { + "auxiliary_loss_clip": 0.01428436, + "auxiliary_loss_mlp": 0.01033637, + "balance_loss_clip": 1.26024854, + "balance_loss_mlp": 1.01381254, + "epoch": 0.7310987524425071, + "flos": 16954610209920.0, + "grad_norm": 3.2173478691015887, + "language_loss": 0.74050498, + "learning_rate": 7.116258031844895e-07, + "loss": 0.76512569, + "num_input_tokens_seen": 262450335, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.19836426, + "step": 12160, + "time_per_iteration": 2.8384621143341064 + }, + { + "auxiliary_loss_clip": 0.01437303, + "auxiliary_loss_mlp": 0.01034716, + "balance_loss_clip": 1.26873279, + "balance_loss_mlp": 1.01447415, + "epoch": 0.7311588756951751, + "flos": 13853340846720.0, + "grad_norm": 2.0370377509305384, + "language_loss": 0.73872417, + "learning_rate": 7.113279408557675e-07, + "loss": 0.76344442, + "num_input_tokens_seen": 262468240, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.20251465, + "step": 12161, + "time_per_iteration": 2.8398118019104004 + }, + { + "auxiliary_loss_clip": 0.01452272, + "auxiliary_loss_mlp": 0.01033002, + "balance_loss_clip": 1.28069627, + "balance_loss_mlp": 1.01271296, + "epoch": 0.731218998947843, + "flos": 28779801315840.0, + "grad_norm": 1.990172930420111, + "language_loss": 0.7049396, + "learning_rate": 7.110301273936192e-07, + "loss": 0.72979236, + "num_input_tokens_seen": 262487045, + "router_z_loss_clip": 1.71679688, + "router_z_loss_mlp": 0.20288086, + "step": 12162, + "time_per_iteration": 2.9155490398406982 + }, + { + "auxiliary_loss_clip": 0.0142564, + "auxiliary_loss_mlp": 0.0103046, + "balance_loss_clip": 1.26104748, + "balance_loss_mlp": 1.01080287, + "epoch": 0.7312791222005111, + "flos": 27100055669760.0, + "grad_norm": 1.664013755551982, + "language_loss": 0.67635882, + "learning_rate": 7.107323628093382e-07, + "loss": 0.70091987, + "num_input_tokens_seen": 262504855, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19665527, + "step": 12163, + "time_per_iteration": 2.8756847381591797 + }, + { + "auxiliary_loss_clip": 0.01413078, + "auxiliary_loss_mlp": 0.01033392, + "balance_loss_clip": 1.250265, + "balance_loss_mlp": 1.01398504, + "epoch": 0.731339245453179, + "flos": 20934056759040.0, + "grad_norm": 1.7353501559267221, + "language_loss": 0.69496351, + "learning_rate": 7.104346471142153e-07, + "loss": 0.71942818, + "num_input_tokens_seen": 262524920, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19396973, + "step": 12164, + "time_per_iteration": 2.940541982650757 + }, + { + "auxiliary_loss_clip": 0.01396911, + "auxiliary_loss_mlp": 0.0102904, + "balance_loss_clip": 1.24070716, + "balance_loss_mlp": 1.01088452, + "epoch": 0.731399368705847, + "flos": 23086014831360.0, + "grad_norm": 1.5842497379038467, + "language_loss": 0.74066412, + "learning_rate": 7.101369803195391e-07, + "loss": 0.76492357, + "num_input_tokens_seen": 262545725, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.18151855, + "step": 12165, + "time_per_iteration": 2.9039058685302734 + }, + { + "auxiliary_loss_clip": 0.01420772, + "auxiliary_loss_mlp": 0.01035349, + "balance_loss_clip": 1.25656939, + "balance_loss_mlp": 1.01607335, + "epoch": 0.731459491958515, + "flos": 23592142874880.0, + "grad_norm": 1.7521254493911536, + "language_loss": 0.77537119, + "learning_rate": 7.098393624365988e-07, + "loss": 0.79993248, + "num_input_tokens_seen": 262565480, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19262695, + "step": 12166, + "time_per_iteration": 2.8808186054229736 + }, + { + "auxiliary_loss_clip": 0.01409605, + "auxiliary_loss_mlp": 0.01034906, + "balance_loss_clip": 1.24981046, + "balance_loss_mlp": 1.01655996, + "epoch": 0.7315196152111829, + "flos": 22388591197440.0, + "grad_norm": 1.5528189795522882, + "language_loss": 0.80287784, + "learning_rate": 7.095417934766781e-07, + "loss": 0.82732296, + "num_input_tokens_seen": 262584145, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18359375, + "step": 12167, + "time_per_iteration": 2.8568427562713623 + }, + { + "auxiliary_loss_clip": 0.01408556, + "auxiliary_loss_mlp": 0.01033302, + "balance_loss_clip": 1.24723852, + "balance_loss_mlp": 1.01393044, + "epoch": 0.7315797384638509, + "flos": 26188189090560.0, + "grad_norm": 1.7444877027876589, + "language_loss": 0.77444202, + "learning_rate": 7.092442734510622e-07, + "loss": 0.79886061, + "num_input_tokens_seen": 262604045, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19372559, + "step": 12168, + "time_per_iteration": 2.90659236907959 + }, + { + "auxiliary_loss_clip": 0.0143656, + "auxiliary_loss_mlp": 0.01033494, + "balance_loss_clip": 1.27019918, + "balance_loss_mlp": 1.01314545, + "epoch": 0.7316398617165188, + "flos": 21516114856320.0, + "grad_norm": 1.4564037975460493, + "language_loss": 0.82541478, + "learning_rate": 7.089468023710326e-07, + "loss": 0.85011536, + "num_input_tokens_seen": 262624540, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.20349121, + "step": 12169, + "time_per_iteration": 2.8382625579833984 + }, + { + "auxiliary_loss_clip": 0.01428547, + "auxiliary_loss_mlp": 0.01040736, + "balance_loss_clip": 1.26373148, + "balance_loss_mlp": 1.02134013, + "epoch": 0.7316999849691869, + "flos": 30494865168000.0, + "grad_norm": 1.8102388019274824, + "language_loss": 0.70870894, + "learning_rate": 7.08649380247871e-07, + "loss": 0.73340178, + "num_input_tokens_seen": 262644545, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19384766, + "step": 12170, + "time_per_iteration": 2.9168505668640137 + }, + { + "auxiliary_loss_clip": 0.0141598, + "auxiliary_loss_mlp": 0.01030855, + "balance_loss_clip": 1.2526803, + "balance_loss_mlp": 1.01076829, + "epoch": 0.7317601082218548, + "flos": 21553831036800.0, + "grad_norm": 1.9089536986116789, + "language_loss": 0.70784169, + "learning_rate": 7.083520070928533e-07, + "loss": 0.73231006, + "num_input_tokens_seen": 262662570, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.20092773, + "step": 12171, + "time_per_iteration": 2.8841397762298584 + }, + { + "auxiliary_loss_clip": 0.01413559, + "auxiliary_loss_mlp": 0.0103768, + "balance_loss_clip": 1.25127327, + "balance_loss_mlp": 1.01866627, + "epoch": 0.7318202314745228, + "flos": 33263611361280.0, + "grad_norm": 1.755393847573121, + "language_loss": 0.6641283, + "learning_rate": 7.080546829172564e-07, + "loss": 0.68864071, + "num_input_tokens_seen": 262683245, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19018555, + "step": 12172, + "time_per_iteration": 2.9791955947875977 + }, + { + "auxiliary_loss_clip": 0.01419949, + "auxiliary_loss_mlp": 0.01028646, + "balance_loss_clip": 1.2567836, + "balance_loss_mlp": 1.01008463, + "epoch": 0.7318803547271907, + "flos": 20166675384960.0, + "grad_norm": 2.961478202580039, + "language_loss": 0.63451481, + "learning_rate": 7.077574077323564e-07, + "loss": 0.65900075, + "num_input_tokens_seen": 262701585, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.18554688, + "step": 12173, + "time_per_iteration": 2.857858180999756 + }, + { + "auxiliary_loss_clip": 0.01425286, + "auxiliary_loss_mlp": 0.01032608, + "balance_loss_clip": 1.2632643, + "balance_loss_mlp": 1.01383281, + "epoch": 0.7319404779798587, + "flos": 20568296643840.0, + "grad_norm": 2.7758347313704363, + "language_loss": 0.74811161, + "learning_rate": 7.074601815494243e-07, + "loss": 0.77269053, + "num_input_tokens_seen": 262719295, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18774414, + "step": 12174, + "time_per_iteration": 2.8775856494903564 + }, + { + "auxiliary_loss_clip": 0.01420713, + "auxiliary_loss_mlp": 0.01033806, + "balance_loss_clip": 1.26109838, + "balance_loss_mlp": 1.0153048, + "epoch": 0.7320006012325266, + "flos": 28707400356480.0, + "grad_norm": 3.9170360773339654, + "language_loss": 0.81483513, + "learning_rate": 7.071630043797317e-07, + "loss": 0.83938032, + "num_input_tokens_seen": 262739995, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18505859, + "step": 12175, + "time_per_iteration": 2.907259702682495 + }, + { + "auxiliary_loss_clip": 0.01414866, + "auxiliary_loss_mlp": 0.01034989, + "balance_loss_clip": 1.25256133, + "balance_loss_mlp": 1.01616573, + "epoch": 0.7320607244851947, + "flos": 16371375747840.0, + "grad_norm": 2.106907101039173, + "language_loss": 0.77791011, + "learning_rate": 7.068658762345488e-07, + "loss": 0.8024087, + "num_input_tokens_seen": 262757680, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18823242, + "step": 12176, + "time_per_iteration": 2.8151955604553223 + }, + { + "auxiliary_loss_clip": 0.01420999, + "auxiliary_loss_mlp": 0.01033912, + "balance_loss_clip": 1.25871789, + "balance_loss_mlp": 1.01483858, + "epoch": 0.7321208477378626, + "flos": 20963719365120.0, + "grad_norm": 1.5133117362291926, + "language_loss": 0.77051294, + "learning_rate": 7.065687971251399e-07, + "loss": 0.79506207, + "num_input_tokens_seen": 262776990, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19067383, + "step": 12177, + "time_per_iteration": 2.873093366622925 + }, + { + "auxiliary_loss_clip": 0.01403997, + "auxiliary_loss_mlp": 0.01037777, + "balance_loss_clip": 1.24365258, + "balance_loss_mlp": 1.01927543, + "epoch": 0.7321809709905306, + "flos": 13853069377920.0, + "grad_norm": 2.045475000563159, + "language_loss": 0.75048345, + "learning_rate": 7.06271767062772e-07, + "loss": 0.77490115, + "num_input_tokens_seen": 262795440, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18505859, + "step": 12178, + "time_per_iteration": 2.8613827228546143 + }, + { + "auxiliary_loss_clip": 0.0142279, + "auxiliary_loss_mlp": 0.01030931, + "balance_loss_clip": 1.2565943, + "balance_loss_mlp": 1.01244116, + "epoch": 0.7322410942431986, + "flos": 26991341118720.0, + "grad_norm": 2.0570167547969436, + "language_loss": 0.83367783, + "learning_rate": 7.059747860587084e-07, + "loss": 0.85821503, + "num_input_tokens_seen": 262816385, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.18493652, + "step": 12179, + "time_per_iteration": 4.4028708934783936 + }, + { + "auxiliary_loss_clip": 0.01402261, + "auxiliary_loss_mlp": 0.01034289, + "balance_loss_clip": 1.24561453, + "balance_loss_mlp": 1.01705098, + "epoch": 0.7323012174958665, + "flos": 17648730973440.0, + "grad_norm": 2.153217302235769, + "language_loss": 0.75452822, + "learning_rate": 7.056778541242115e-07, + "loss": 0.77889371, + "num_input_tokens_seen": 262834955, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.17236328, + "step": 12180, + "time_per_iteration": 2.8169541358947754 + }, + { + "auxiliary_loss_clip": 0.014332, + "auxiliary_loss_mlp": 0.01033884, + "balance_loss_clip": 1.2652601, + "balance_loss_mlp": 1.01409554, + "epoch": 0.7323613407485345, + "flos": 32355409610880.0, + "grad_norm": 2.3264812745166603, + "language_loss": 0.79871941, + "learning_rate": 7.053809712705396e-07, + "loss": 0.82339025, + "num_input_tokens_seen": 262853555, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.19787598, + "step": 12181, + "time_per_iteration": 2.934889078140259 + }, + { + "auxiliary_loss_clip": 0.01434202, + "auxiliary_loss_mlp": 0.01035648, + "balance_loss_clip": 1.26843023, + "balance_loss_mlp": 1.01768303, + "epoch": 0.7324214640012024, + "flos": 18370252103040.0, + "grad_norm": 2.040871381491611, + "language_loss": 0.72379398, + "learning_rate": 7.050841375089506e-07, + "loss": 0.74849242, + "num_input_tokens_seen": 262870975, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.17956543, + "step": 12182, + "time_per_iteration": 2.8124849796295166 + }, + { + "auxiliary_loss_clip": 0.01426889, + "auxiliary_loss_mlp": 0.01035579, + "balance_loss_clip": 1.26397872, + "balance_loss_mlp": 1.01694655, + "epoch": 0.7324815872538705, + "flos": 30825035608320.0, + "grad_norm": 1.6740843411565123, + "language_loss": 0.72198987, + "learning_rate": 7.047873528507015e-07, + "loss": 0.74661458, + "num_input_tokens_seen": 262892635, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.1862793, + "step": 12183, + "time_per_iteration": 2.9112558364868164 + }, + { + "auxiliary_loss_clip": 0.01431883, + "auxiliary_loss_mlp": 0.01034339, + "balance_loss_clip": 1.26551509, + "balance_loss_mlp": 1.01470518, + "epoch": 0.7325417105065384, + "flos": 21514531288320.0, + "grad_norm": 2.668768259784539, + "language_loss": 0.73316371, + "learning_rate": 7.04490617307045e-07, + "loss": 0.75782597, + "num_input_tokens_seen": 262910725, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.19628906, + "step": 12184, + "time_per_iteration": 2.8837087154388428 + }, + { + "auxiliary_loss_clip": 0.01183818, + "auxiliary_loss_mlp": 0.01031556, + "balance_loss_clip": 1.0960803, + "balance_loss_mlp": 1.01391339, + "epoch": 0.7326018337592064, + "flos": 67288423541760.0, + "grad_norm": 0.7638295535750713, + "language_loss": 0.65274191, + "learning_rate": 7.041939308892344e-07, + "loss": 0.67489564, + "num_input_tokens_seen": 262974150, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.17675781, + "step": 12185, + "time_per_iteration": 3.3399131298065186 + }, + { + "auxiliary_loss_clip": 0.01421726, + "auxiliary_loss_mlp": 0.01033426, + "balance_loss_clip": 1.25642347, + "balance_loss_mlp": 1.01394737, + "epoch": 0.7326619570118743, + "flos": 22867409364480.0, + "grad_norm": 1.8778363764887303, + "language_loss": 0.81531566, + "learning_rate": 7.038972936085197e-07, + "loss": 0.83986723, + "num_input_tokens_seen": 262993370, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19470215, + "step": 12186, + "time_per_iteration": 2.874206066131592 + }, + { + "auxiliary_loss_clip": 0.01421805, + "auxiliary_loss_mlp": 0.01032202, + "balance_loss_clip": 1.25532889, + "balance_loss_mlp": 1.01296163, + "epoch": 0.7327220802645423, + "flos": 23336952347520.0, + "grad_norm": 2.219260085424728, + "language_loss": 0.74260879, + "learning_rate": 7.036007054761508e-07, + "loss": 0.76714885, + "num_input_tokens_seen": 263012665, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.19238281, + "step": 12187, + "time_per_iteration": 4.278978109359741 + }, + { + "auxiliary_loss_clip": 0.01419982, + "auxiliary_loss_mlp": 0.01037804, + "balance_loss_clip": 1.25594997, + "balance_loss_mlp": 1.0190165, + "epoch": 0.7327822035172102, + "flos": 23189842944000.0, + "grad_norm": 1.5967643417105928, + "language_loss": 0.89733565, + "learning_rate": 7.033041665033716e-07, + "loss": 0.9219135, + "num_input_tokens_seen": 263031475, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.18786621, + "step": 12188, + "time_per_iteration": 2.8587846755981445 + }, + { + "auxiliary_loss_clip": 0.01431644, + "auxiliary_loss_mlp": 0.01033676, + "balance_loss_clip": 1.2651583, + "balance_loss_mlp": 1.01436424, + "epoch": 0.7328423267698783, + "flos": 21075782031360.0, + "grad_norm": 1.9830401615224187, + "language_loss": 0.75746024, + "learning_rate": 7.030076767014284e-07, + "loss": 0.78211343, + "num_input_tokens_seen": 263051445, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.1932373, + "step": 12189, + "time_per_iteration": 2.892582654953003 + }, + { + "auxiliary_loss_clip": 0.01424124, + "auxiliary_loss_mlp": 0.0103034, + "balance_loss_clip": 1.25799358, + "balance_loss_mlp": 1.0115881, + "epoch": 0.7329024500225462, + "flos": 21699628341120.0, + "grad_norm": 1.5157209163800363, + "language_loss": 0.82544619, + "learning_rate": 7.027112360815648e-07, + "loss": 0.84999084, + "num_input_tokens_seen": 263070835, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.18737793, + "step": 12190, + "time_per_iteration": 2.8556103706359863 + }, + { + "auxiliary_loss_clip": 0.01428534, + "auxiliary_loss_mlp": 0.01036312, + "balance_loss_clip": 1.2644434, + "balance_loss_mlp": 1.01666617, + "epoch": 0.7329625732752142, + "flos": 24173024607360.0, + "grad_norm": 5.460922811155115, + "language_loss": 0.7250489, + "learning_rate": 7.024148446550204e-07, + "loss": 0.74969739, + "num_input_tokens_seen": 263090070, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.1965332, + "step": 12191, + "time_per_iteration": 4.3539440631866455 + }, + { + "auxiliary_loss_clip": 0.01412756, + "auxiliary_loss_mlp": 0.01033626, + "balance_loss_clip": 1.25056529, + "balance_loss_mlp": 1.0154103, + "epoch": 0.7330226965278822, + "flos": 30089262366720.0, + "grad_norm": 1.4700610688828695, + "language_loss": 0.69313133, + "learning_rate": 7.021185024330361e-07, + "loss": 0.71759522, + "num_input_tokens_seen": 263110030, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18237305, + "step": 12192, + "time_per_iteration": 2.9408326148986816 + }, + { + "auxiliary_loss_clip": 0.01417447, + "auxiliary_loss_mlp": 0.01035071, + "balance_loss_clip": 1.25473011, + "balance_loss_mlp": 1.01618826, + "epoch": 0.7330828197805501, + "flos": 23378876294400.0, + "grad_norm": 1.5099641975298483, + "language_loss": 0.74161553, + "learning_rate": 7.01822209426848e-07, + "loss": 0.76614076, + "num_input_tokens_seen": 263129735, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.18884277, + "step": 12193, + "time_per_iteration": 4.236151933670044 + }, + { + "auxiliary_loss_clip": 0.01431983, + "auxiliary_loss_mlp": 0.01039718, + "balance_loss_clip": 1.26601362, + "balance_loss_mlp": 1.01986909, + "epoch": 0.7331429430332181, + "flos": 21042680820480.0, + "grad_norm": 3.115842133322399, + "language_loss": 0.77708405, + "learning_rate": 7.015259656476911e-07, + "loss": 0.80180109, + "num_input_tokens_seen": 263149100, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.19836426, + "step": 12194, + "time_per_iteration": 2.855551242828369 + }, + { + "auxiliary_loss_clip": 0.01422459, + "auxiliary_loss_mlp": 0.01030701, + "balance_loss_clip": 1.26168334, + "balance_loss_mlp": 1.01199722, + "epoch": 0.733203066285886, + "flos": 14656130916480.0, + "grad_norm": 1.7887486590778239, + "language_loss": 0.71235657, + "learning_rate": 7.012297711067998e-07, + "loss": 0.73688817, + "num_input_tokens_seen": 263166620, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18701172, + "step": 12195, + "time_per_iteration": 2.8642587661743164 + }, + { + "auxiliary_loss_clip": 0.01414597, + "auxiliary_loss_mlp": 0.01033619, + "balance_loss_clip": 1.25147915, + "balance_loss_mlp": 1.01485586, + "epoch": 0.7332631895385541, + "flos": 17174165817600.0, + "grad_norm": 1.813232833088303, + "language_loss": 0.72967637, + "learning_rate": 7.009336258154057e-07, + "loss": 0.7541585, + "num_input_tokens_seen": 263184780, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.18774414, + "step": 12196, + "time_per_iteration": 2.8394932746887207 + }, + { + "auxiliary_loss_clip": 0.01410346, + "auxiliary_loss_mlp": 0.01032361, + "balance_loss_clip": 1.24886489, + "balance_loss_mlp": 1.01289451, + "epoch": 0.733323312791222, + "flos": 28669955644800.0, + "grad_norm": 4.167459497462062, + "language_loss": 0.72292292, + "learning_rate": 7.006375297847394e-07, + "loss": 0.74735004, + "num_input_tokens_seen": 263204625, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19470215, + "step": 12197, + "time_per_iteration": 2.897859573364258 + }, + { + "auxiliary_loss_clip": 0.01441295, + "auxiliary_loss_mlp": 0.01038893, + "balance_loss_clip": 1.27169442, + "balance_loss_mlp": 1.01945019, + "epoch": 0.73338343604389, + "flos": 16627244947200.0, + "grad_norm": 3.318957471792519, + "language_loss": 0.78810734, + "learning_rate": 7.003414830260282e-07, + "loss": 0.81290925, + "num_input_tokens_seen": 263221565, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.19445801, + "step": 12198, + "time_per_iteration": 2.8272902965545654 + }, + { + "auxiliary_loss_clip": 0.0141534, + "auxiliary_loss_mlp": 0.01031844, + "balance_loss_clip": 1.25269508, + "balance_loss_mlp": 1.0133549, + "epoch": 0.7334435592965579, + "flos": 21151938309120.0, + "grad_norm": 1.8188677972355827, + "language_loss": 0.7538327, + "learning_rate": 7.000454855504974e-07, + "loss": 0.77830452, + "num_input_tokens_seen": 263240620, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.18481445, + "step": 12199, + "time_per_iteration": 2.848068952560425 + }, + { + "auxiliary_loss_clip": 0.01425221, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.25864565, + "balance_loss_mlp": 1.01849341, + "epoch": 0.7335036825492259, + "flos": 17133282501120.0, + "grad_norm": 2.326550013928157, + "language_loss": 0.77063072, + "learning_rate": 6.997495373693729e-07, + "loss": 0.79525936, + "num_input_tokens_seen": 263254365, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.19140625, + "step": 12200, + "time_per_iteration": 2.785109758377075 + }, + { + "auxiliary_loss_clip": 0.01411451, + "auxiliary_loss_mlp": 0.01034111, + "balance_loss_clip": 1.24942482, + "balance_loss_mlp": 1.01507354, + "epoch": 0.7335638058018938, + "flos": 23742147945600.0, + "grad_norm": 1.5730928035021519, + "language_loss": 0.6242879, + "learning_rate": 6.994536384938754e-07, + "loss": 0.64874351, + "num_input_tokens_seen": 263275880, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19030762, + "step": 12201, + "time_per_iteration": 2.9968905448913574 + }, + { + "auxiliary_loss_clip": 0.01409568, + "auxiliary_loss_mlp": 0.01028331, + "balance_loss_clip": 1.24926198, + "balance_loss_mlp": 1.00991321, + "epoch": 0.7336239290545619, + "flos": 34946614632960.0, + "grad_norm": 1.5758784423194212, + "language_loss": 0.5288465, + "learning_rate": 6.991577889352264e-07, + "loss": 0.55322552, + "num_input_tokens_seen": 263298315, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.1842041, + "step": 12202, + "time_per_iteration": 3.031066417694092 + }, + { + "auxiliary_loss_clip": 0.01410533, + "auxiliary_loss_mlp": 0.01031748, + "balance_loss_clip": 1.25046718, + "balance_loss_mlp": 1.01330662, + "epoch": 0.7336840523072298, + "flos": 21112231357440.0, + "grad_norm": 1.9040689816356613, + "language_loss": 0.69198108, + "learning_rate": 6.98861988704645e-07, + "loss": 0.71640384, + "num_input_tokens_seen": 263318615, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18444824, + "step": 12203, + "time_per_iteration": 2.9745562076568604 + }, + { + "auxiliary_loss_clip": 0.01444809, + "auxiliary_loss_mlp": 0.0103704, + "balance_loss_clip": 1.27709675, + "balance_loss_mlp": 1.01865745, + "epoch": 0.7337441755598978, + "flos": 24035009408640.0, + "grad_norm": 2.0738469384554135, + "language_loss": 0.66661823, + "learning_rate": 6.985662378133474e-07, + "loss": 0.69143671, + "num_input_tokens_seen": 263336705, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.18383789, + "step": 12204, + "time_per_iteration": 3.015991687774658 + }, + { + "auxiliary_loss_clip": 0.01403621, + "auxiliary_loss_mlp": 0.01033003, + "balance_loss_clip": 1.24504113, + "balance_loss_mlp": 1.01553845, + "epoch": 0.7338042988125658, + "flos": 22721657304960.0, + "grad_norm": 1.9622609295074442, + "language_loss": 0.77987564, + "learning_rate": 6.982705362725479e-07, + "loss": 0.8042419, + "num_input_tokens_seen": 263355065, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.17480469, + "step": 12205, + "time_per_iteration": 2.977876663208008 + }, + { + "auxiliary_loss_clip": 0.01408619, + "auxiliary_loss_mlp": 0.01027109, + "balance_loss_clip": 1.249789, + "balance_loss_mlp": 1.00947785, + "epoch": 0.7338644220652337, + "flos": 21370905734400.0, + "grad_norm": 1.5779325729850755, + "language_loss": 0.80506128, + "learning_rate": 6.979748840934601e-07, + "loss": 0.82941848, + "num_input_tokens_seen": 263374460, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.1763916, + "step": 12206, + "time_per_iteration": 2.8631017208099365 + }, + { + "auxiliary_loss_clip": 0.01412823, + "auxiliary_loss_mlp": 0.01029561, + "balance_loss_clip": 1.24944293, + "balance_loss_mlp": 1.01110768, + "epoch": 0.7339245453179017, + "flos": 30932619039360.0, + "grad_norm": 1.9412489634940304, + "language_loss": 0.72037524, + "learning_rate": 6.976792812872958e-07, + "loss": 0.74479914, + "num_input_tokens_seen": 263393610, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.18469238, + "step": 12207, + "time_per_iteration": 3.004380464553833 + }, + { + "auxiliary_loss_clip": 0.01182144, + "auxiliary_loss_mlp": 0.01023238, + "balance_loss_clip": 1.09413695, + "balance_loss_mlp": 1.00206661, + "epoch": 0.7339846685705697, + "flos": 67926368246400.0, + "grad_norm": 0.7977854237602594, + "language_loss": 0.54840302, + "learning_rate": 6.97383727865263e-07, + "loss": 0.57045686, + "num_input_tokens_seen": 263450340, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.21191406, + "step": 12208, + "time_per_iteration": 3.44126033782959 + }, + { + "auxiliary_loss_clip": 0.0142219, + "auxiliary_loss_mlp": 0.01031245, + "balance_loss_clip": 1.25859594, + "balance_loss_mlp": 1.01344717, + "epoch": 0.7340447918232377, + "flos": 22246911169920.0, + "grad_norm": 1.5545001626144244, + "language_loss": 0.8091886, + "learning_rate": 6.970882238385703e-07, + "loss": 0.83372295, + "num_input_tokens_seen": 263471735, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.17822266, + "step": 12209, + "time_per_iteration": 2.8994128704071045 + }, + { + "auxiliary_loss_clip": 0.0140508, + "auxiliary_loss_mlp": 0.01030838, + "balance_loss_clip": 1.24523449, + "balance_loss_mlp": 1.01238394, + "epoch": 0.7341049150759056, + "flos": 23774208526080.0, + "grad_norm": 1.4988498983032987, + "language_loss": 0.79433399, + "learning_rate": 6.96792769218423e-07, + "loss": 0.81869316, + "num_input_tokens_seen": 263493245, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18444824, + "step": 12210, + "time_per_iteration": 2.8393685817718506 + }, + { + "auxiliary_loss_clip": 0.01408763, + "auxiliary_loss_mlp": 0.01028623, + "balance_loss_clip": 1.24821687, + "balance_loss_mlp": 1.00983596, + "epoch": 0.7341650383285736, + "flos": 17244983208960.0, + "grad_norm": 1.6106678190251569, + "language_loss": 0.7728278, + "learning_rate": 6.964973640160236e-07, + "loss": 0.79720163, + "num_input_tokens_seen": 263511660, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18786621, + "step": 12211, + "time_per_iteration": 2.946786403656006 + }, + { + "auxiliary_loss_clip": 0.01415564, + "auxiliary_loss_mlp": 0.01032518, + "balance_loss_clip": 1.2542367, + "balance_loss_mlp": 1.01457667, + "epoch": 0.7342251615812415, + "flos": 23414330234880.0, + "grad_norm": 1.9314416231516227, + "language_loss": 0.72692382, + "learning_rate": 6.962020082425748e-07, + "loss": 0.75140464, + "num_input_tokens_seen": 263530875, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.17944336, + "step": 12212, + "time_per_iteration": 2.8833909034729004 + }, + { + "auxiliary_loss_clip": 0.01418858, + "auxiliary_loss_mlp": 0.0103431, + "balance_loss_clip": 1.25567532, + "balance_loss_mlp": 1.01568925, + "epoch": 0.7342852848339095, + "flos": 22757382714240.0, + "grad_norm": 1.4886691017838554, + "language_loss": 0.69618988, + "learning_rate": 6.959067019092766e-07, + "loss": 0.7207216, + "num_input_tokens_seen": 263551585, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.18603516, + "step": 12213, + "time_per_iteration": 2.891525983810425 + }, + { + "auxiliary_loss_clip": 0.0118434, + "auxiliary_loss_mlp": 0.0101742, + "balance_loss_clip": 1.09507346, + "balance_loss_mlp": 1.00025403, + "epoch": 0.7343454080865774, + "flos": 53970076920960.0, + "grad_norm": 0.7189568660841992, + "language_loss": 0.54287273, + "learning_rate": 6.956114450273276e-07, + "loss": 0.56489033, + "num_input_tokens_seen": 263609545, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.171875, + "step": 12214, + "time_per_iteration": 4.682013273239136 + }, + { + "auxiliary_loss_clip": 0.01426852, + "auxiliary_loss_mlp": 0.01027795, + "balance_loss_clip": 1.26112127, + "balance_loss_mlp": 1.00981867, + "epoch": 0.7344055313392455, + "flos": 12174454851840.0, + "grad_norm": 2.8370277099407204, + "language_loss": 0.71373463, + "learning_rate": 6.953162376079233e-07, + "loss": 0.73828113, + "num_input_tokens_seen": 263627880, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.17980957, + "step": 12215, + "time_per_iteration": 2.8478610515594482 + }, + { + "auxiliary_loss_clip": 0.01409256, + "auxiliary_loss_mlp": 0.01030142, + "balance_loss_clip": 1.24950206, + "balance_loss_mlp": 1.01264167, + "epoch": 0.7344656545919134, + "flos": 18558833005440.0, + "grad_norm": 1.6602652850301425, + "language_loss": 0.72997713, + "learning_rate": 6.950210796622573e-07, + "loss": 0.75437117, + "num_input_tokens_seen": 263645665, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.17492676, + "step": 12216, + "time_per_iteration": 2.888833522796631 + }, + { + "auxiliary_loss_clip": 0.01441175, + "auxiliary_loss_mlp": 0.01040904, + "balance_loss_clip": 1.2710309, + "balance_loss_mlp": 1.02099597, + "epoch": 0.7345257778445814, + "flos": 23672687898240.0, + "grad_norm": 1.7651383828225713, + "language_loss": 0.78770089, + "learning_rate": 6.947259712015236e-07, + "loss": 0.8125217, + "num_input_tokens_seen": 263668170, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.19909668, + "step": 12217, + "time_per_iteration": 2.9105124473571777 + }, + { + "auxiliary_loss_clip": 0.01404016, + "auxiliary_loss_mlp": 0.01032018, + "balance_loss_clip": 1.24358582, + "balance_loss_mlp": 1.01362419, + "epoch": 0.7345859010972494, + "flos": 13816665296640.0, + "grad_norm": 1.8904346109585348, + "language_loss": 0.79478109, + "learning_rate": 6.94430912236911e-07, + "loss": 0.81914145, + "num_input_tokens_seen": 263684190, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18383789, + "step": 12218, + "time_per_iteration": 2.8076541423797607 + }, + { + "auxiliary_loss_clip": 0.01408233, + "auxiliary_loss_mlp": 0.01035821, + "balance_loss_clip": 1.24873471, + "balance_loss_mlp": 1.01695013, + "epoch": 0.7346460243499173, + "flos": 22282772313600.0, + "grad_norm": 3.272117043162063, + "language_loss": 0.73298669, + "learning_rate": 6.941359027796092e-07, + "loss": 0.75742722, + "num_input_tokens_seen": 263702095, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18859863, + "step": 12219, + "time_per_iteration": 2.8672537803649902 + }, + { + "auxiliary_loss_clip": 0.0140541, + "auxiliary_loss_mlp": 0.01037337, + "balance_loss_clip": 1.24709904, + "balance_loss_mlp": 1.01934862, + "epoch": 0.7347061476025853, + "flos": 23264868101760.0, + "grad_norm": 2.1786606809324827, + "language_loss": 0.75042439, + "learning_rate": 6.938409428408061e-07, + "loss": 0.77485186, + "num_input_tokens_seen": 263721385, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.17980957, + "step": 12220, + "time_per_iteration": 2.9258995056152344 + }, + { + "auxiliary_loss_clip": 0.01422885, + "auxiliary_loss_mlp": 0.01035419, + "balance_loss_clip": 1.25726867, + "balance_loss_mlp": 1.01633334, + "epoch": 0.7347662708552533, + "flos": 15275814704640.0, + "grad_norm": 4.677673971840966, + "language_loss": 0.6670211, + "learning_rate": 6.93546032431684e-07, + "loss": 0.69160414, + "num_input_tokens_seen": 263737835, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.19091797, + "step": 12221, + "time_per_iteration": 2.851973056793213 + }, + { + "auxiliary_loss_clip": 0.0142313, + "auxiliary_loss_mlp": 0.01036069, + "balance_loss_clip": 1.26042795, + "balance_loss_mlp": 1.01871204, + "epoch": 0.7348263941079213, + "flos": 24869860058880.0, + "grad_norm": 5.868286735877517, + "language_loss": 0.70613265, + "learning_rate": 6.932511715634273e-07, + "loss": 0.73072469, + "num_input_tokens_seen": 263756480, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.17370605, + "step": 12222, + "time_per_iteration": 4.33062744140625 + }, + { + "auxiliary_loss_clip": 0.01416289, + "auxiliary_loss_mlp": 0.01034338, + "balance_loss_clip": 1.25556409, + "balance_loss_mlp": 1.01704121, + "epoch": 0.7348865173605892, + "flos": 24362600895360.0, + "grad_norm": 1.5084475815124314, + "language_loss": 0.66920483, + "learning_rate": 6.92956360247217e-07, + "loss": 0.69371104, + "num_input_tokens_seen": 263776440, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.1730957, + "step": 12223, + "time_per_iteration": 2.9391605854034424 + }, + { + "auxiliary_loss_clip": 0.0140984, + "auxiliary_loss_mlp": 0.01032634, + "balance_loss_clip": 1.24813056, + "balance_loss_mlp": 1.01402521, + "epoch": 0.7349466406132572, + "flos": 20012507792640.0, + "grad_norm": 2.0089764764572373, + "language_loss": 0.73394555, + "learning_rate": 6.926615984942332e-07, + "loss": 0.75837028, + "num_input_tokens_seen": 263793700, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18603516, + "step": 12224, + "time_per_iteration": 2.8297414779663086 + }, + { + "auxiliary_loss_clip": 0.01422018, + "auxiliary_loss_mlp": 0.01034701, + "balance_loss_clip": 1.25857091, + "balance_loss_mlp": 1.01541281, + "epoch": 0.7350067638659251, + "flos": 29837012751360.0, + "grad_norm": 1.6755698051363455, + "language_loss": 0.73207664, + "learning_rate": 6.92366886315652e-07, + "loss": 0.75664383, + "num_input_tokens_seen": 263814620, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.19287109, + "step": 12225, + "time_per_iteration": 2.9107179641723633 + }, + { + "auxiliary_loss_clip": 0.01427414, + "auxiliary_loss_mlp": 0.01038788, + "balance_loss_clip": 1.25953746, + "balance_loss_mlp": 1.01934457, + "epoch": 0.7350668871185931, + "flos": 21874726293120.0, + "grad_norm": 1.6624366318660295, + "language_loss": 0.76676399, + "learning_rate": 6.920722237226501e-07, + "loss": 0.791426, + "num_input_tokens_seen": 263832725, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.19433594, + "step": 12226, + "time_per_iteration": 4.346354246139526 + }, + { + "auxiliary_loss_clip": 0.0141891, + "auxiliary_loss_mlp": 0.01031293, + "balance_loss_clip": 1.25616336, + "balance_loss_mlp": 1.01309013, + "epoch": 0.735127010371261, + "flos": 22576629162240.0, + "grad_norm": 1.6653111628269737, + "language_loss": 0.67198575, + "learning_rate": 6.917776107264008e-07, + "loss": 0.69648784, + "num_input_tokens_seen": 263853850, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.18200684, + "step": 12227, + "time_per_iteration": 4.163660526275635 + }, + { + "auxiliary_loss_clip": 0.0143013, + "auxiliary_loss_mlp": 0.01034014, + "balance_loss_clip": 1.26499081, + "balance_loss_mlp": 1.01538217, + "epoch": 0.7351871336239291, + "flos": 25895191893120.0, + "grad_norm": 2.2856690738781578, + "language_loss": 0.64357382, + "learning_rate": 6.914830473380749e-07, + "loss": 0.66821527, + "num_input_tokens_seen": 263874760, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.1862793, + "step": 12228, + "time_per_iteration": 2.9297831058502197 + }, + { + "auxiliary_loss_clip": 0.01412152, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.24964869, + "balance_loss_mlp": 1.01594281, + "epoch": 0.735247256876597, + "flos": 17941411457280.0, + "grad_norm": 1.9135418719853745, + "language_loss": 0.64123833, + "learning_rate": 6.911885335688427e-07, + "loss": 0.66569978, + "num_input_tokens_seen": 263893390, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.18029785, + "step": 12229, + "time_per_iteration": 2.90708065032959 + }, + { + "auxiliary_loss_clip": 0.01416365, + "auxiliary_loss_mlp": 0.01034542, + "balance_loss_clip": 1.25083709, + "balance_loss_mlp": 1.01502705, + "epoch": 0.735307380129265, + "flos": 28886525095680.0, + "grad_norm": 1.8734556462241183, + "language_loss": 0.74193907, + "learning_rate": 6.908940694298726e-07, + "loss": 0.7664482, + "num_input_tokens_seen": 263911180, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.19519043, + "step": 12230, + "time_per_iteration": 2.928051471710205 + }, + { + "auxiliary_loss_clip": 0.01418229, + "auxiliary_loss_mlp": 0.01033858, + "balance_loss_clip": 1.2539376, + "balance_loss_mlp": 1.01496375, + "epoch": 0.7353675033819329, + "flos": 13633468525440.0, + "grad_norm": 2.1408634448657, + "language_loss": 0.73646963, + "learning_rate": 6.90599654932332e-07, + "loss": 0.7609905, + "num_input_tokens_seen": 263928975, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.18884277, + "step": 12231, + "time_per_iteration": 2.8995578289031982 + }, + { + "auxiliary_loss_clip": 0.01426972, + "auxiliary_loss_mlp": 0.01036747, + "balance_loss_clip": 1.26177144, + "balance_loss_mlp": 1.01732802, + "epoch": 0.7354276266346009, + "flos": 19472328397440.0, + "grad_norm": 2.899890881830692, + "language_loss": 0.65607405, + "learning_rate": 6.903052900873823e-07, + "loss": 0.68071127, + "num_input_tokens_seen": 263944495, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19396973, + "step": 12232, + "time_per_iteration": 2.8283371925354004 + }, + { + "auxiliary_loss_clip": 0.01413643, + "auxiliary_loss_mlp": 0.01037655, + "balance_loss_clip": 1.25064731, + "balance_loss_mlp": 1.01862907, + "epoch": 0.735487749887269, + "flos": 15778549388160.0, + "grad_norm": 2.2239155700182964, + "language_loss": 0.76442516, + "learning_rate": 6.900109749061874e-07, + "loss": 0.78893816, + "num_input_tokens_seen": 263961325, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19006348, + "step": 12233, + "time_per_iteration": 2.8176674842834473 + }, + { + "auxiliary_loss_clip": 0.01413733, + "auxiliary_loss_mlp": 0.01032655, + "balance_loss_clip": 1.25017416, + "balance_loss_mlp": 1.01285481, + "epoch": 0.7355478731399369, + "flos": 18269998329600.0, + "grad_norm": 1.560039952349905, + "language_loss": 0.74821484, + "learning_rate": 6.897167093999079e-07, + "loss": 0.77267873, + "num_input_tokens_seen": 263980445, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19812012, + "step": 12234, + "time_per_iteration": 2.837233066558838 + }, + { + "auxiliary_loss_clip": 0.01429202, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.26345897, + "balance_loss_mlp": 1.01604033, + "epoch": 0.7356079963926049, + "flos": 26553315778560.0, + "grad_norm": 2.1742725849651414, + "language_loss": 0.60806161, + "learning_rate": 6.894224935797017e-07, + "loss": 0.63269895, + "num_input_tokens_seen": 263999330, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.18469238, + "step": 12235, + "time_per_iteration": 2.8840866088867188 + }, + { + "auxiliary_loss_clip": 0.01402766, + "auxiliary_loss_mlp": 0.01034249, + "balance_loss_clip": 1.24308801, + "balance_loss_mlp": 1.0152235, + "epoch": 0.7356681196452728, + "flos": 10785398918400.0, + "grad_norm": 2.3129606938394076, + "language_loss": 0.87187696, + "learning_rate": 6.891283274567259e-07, + "loss": 0.89624709, + "num_input_tokens_seen": 264014150, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19030762, + "step": 12236, + "time_per_iteration": 2.818617343902588 + }, + { + "auxiliary_loss_clip": 0.01417099, + "auxiliary_loss_mlp": 0.01031976, + "balance_loss_clip": 1.25281239, + "balance_loss_mlp": 1.01312888, + "epoch": 0.7357282428979408, + "flos": 19728061862400.0, + "grad_norm": 1.6391001768455928, + "language_loss": 0.70133388, + "learning_rate": 6.888342110421364e-07, + "loss": 0.72582459, + "num_input_tokens_seen": 264033140, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.18859863, + "step": 12237, + "time_per_iteration": 2.8453164100646973 + }, + { + "auxiliary_loss_clip": 0.01421794, + "auxiliary_loss_mlp": 0.01034029, + "balance_loss_clip": 1.25807977, + "balance_loss_mlp": 1.01568282, + "epoch": 0.7357883661506087, + "flos": 19473504762240.0, + "grad_norm": 1.9391936545276878, + "language_loss": 0.73563975, + "learning_rate": 6.885401443470839e-07, + "loss": 0.760198, + "num_input_tokens_seen": 264052105, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.18347168, + "step": 12238, + "time_per_iteration": 2.845498561859131 + }, + { + "auxiliary_loss_clip": 0.014551, + "auxiliary_loss_mlp": 0.01032736, + "balance_loss_clip": 1.28323746, + "balance_loss_mlp": 1.0130899, + "epoch": 0.7358484894032767, + "flos": 27133699818240.0, + "grad_norm": 1.6910244009889452, + "language_loss": 0.73260468, + "learning_rate": 6.882461273827205e-07, + "loss": 0.75748307, + "num_input_tokens_seen": 264070690, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.1965332, + "step": 12239, + "time_per_iteration": 2.946596384048462 + }, + { + "auxiliary_loss_clip": 0.01401344, + "auxiliary_loss_mlp": 0.01035158, + "balance_loss_clip": 1.24282598, + "balance_loss_mlp": 1.01621532, + "epoch": 0.7359086126559446, + "flos": 24513918065280.0, + "grad_norm": 1.393882199598283, + "language_loss": 0.79548323, + "learning_rate": 6.879521601601954e-07, + "loss": 0.81984824, + "num_input_tokens_seen": 264094225, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18933105, + "step": 12240, + "time_per_iteration": 2.910717725753784 + }, + { + "auxiliary_loss_clip": 0.01405814, + "auxiliary_loss_mlp": 0.01037357, + "balance_loss_clip": 1.24570513, + "balance_loss_mlp": 1.0174253, + "epoch": 0.7359687359086127, + "flos": 23341703051520.0, + "grad_norm": 1.8277993682149685, + "language_loss": 0.84409958, + "learning_rate": 6.876582426906565e-07, + "loss": 0.86853129, + "num_input_tokens_seen": 264113190, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19921875, + "step": 12241, + "time_per_iteration": 2.865825891494751 + }, + { + "auxiliary_loss_clip": 0.01401639, + "auxiliary_loss_mlp": 0.01032785, + "balance_loss_clip": 1.24251878, + "balance_loss_mlp": 1.01416445, + "epoch": 0.7360288591612806, + "flos": 20202988976640.0, + "grad_norm": 1.8617281255384062, + "language_loss": 0.79468137, + "learning_rate": 6.873643749852484e-07, + "loss": 0.81902564, + "num_input_tokens_seen": 264132050, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.1862793, + "step": 12242, + "time_per_iteration": 2.8347976207733154 + }, + { + "auxiliary_loss_clip": 0.01407578, + "auxiliary_loss_mlp": 0.01029228, + "balance_loss_clip": 1.24867463, + "balance_loss_mlp": 1.01055956, + "epoch": 0.7360889824139486, + "flos": 24983551537920.0, + "grad_norm": 1.7498059033327418, + "language_loss": 0.80127871, + "learning_rate": 6.870705570551145e-07, + "loss": 0.82564676, + "num_input_tokens_seen": 264152800, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18652344, + "step": 12243, + "time_per_iteration": 2.9074203968048096 + }, + { + "auxiliary_loss_clip": 0.01429183, + "auxiliary_loss_mlp": 0.01031086, + "balance_loss_clip": 1.26185572, + "balance_loss_mlp": 1.0117141, + "epoch": 0.7361491056666165, + "flos": 15020533687680.0, + "grad_norm": 5.777180210163534, + "language_loss": 0.75494832, + "learning_rate": 6.867767889113969e-07, + "loss": 0.77955103, + "num_input_tokens_seen": 264169650, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.19360352, + "step": 12244, + "time_per_iteration": 2.98274564743042 + }, + { + "auxiliary_loss_clip": 0.0142676, + "auxiliary_loss_mlp": 0.01035732, + "balance_loss_clip": 1.26214671, + "balance_loss_mlp": 1.01696849, + "epoch": 0.7362092289192845, + "flos": 22940850954240.0, + "grad_norm": 1.7032348932661139, + "language_loss": 0.69747078, + "learning_rate": 6.864830705652347e-07, + "loss": 0.72209573, + "num_input_tokens_seen": 264190530, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.18762207, + "step": 12245, + "time_per_iteration": 2.970857620239258 + }, + { + "auxiliary_loss_clip": 0.01402441, + "auxiliary_loss_mlp": 0.0103497, + "balance_loss_clip": 1.24581945, + "balance_loss_mlp": 1.01539636, + "epoch": 0.7362693521719526, + "flos": 20712012687360.0, + "grad_norm": 1.6140317245914846, + "language_loss": 0.74226308, + "learning_rate": 6.861894020277658e-07, + "loss": 0.76663721, + "num_input_tokens_seen": 264210820, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.19567871, + "step": 12246, + "time_per_iteration": 2.882201910018921 + }, + { + "auxiliary_loss_clip": 0.01399891, + "auxiliary_loss_mlp": 0.01030378, + "balance_loss_clip": 1.24269986, + "balance_loss_mlp": 1.01169801, + "epoch": 0.7363294754246205, + "flos": 13118155787520.0, + "grad_norm": 1.9950178838034276, + "language_loss": 0.74152613, + "learning_rate": 6.858957833101266e-07, + "loss": 0.76582879, + "num_input_tokens_seen": 264227430, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18676758, + "step": 12247, + "time_per_iteration": 2.884002685546875 + }, + { + "auxiliary_loss_clip": 0.01403033, + "auxiliary_loss_mlp": 0.01033136, + "balance_loss_clip": 1.244524, + "balance_loss_mlp": 1.01494491, + "epoch": 0.7363895986772885, + "flos": 14035451742720.0, + "grad_norm": 1.5585204651088596, + "language_loss": 0.74803984, + "learning_rate": 6.856022144234526e-07, + "loss": 0.77240157, + "num_input_tokens_seen": 264245230, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18200684, + "step": 12248, + "time_per_iteration": 2.854557991027832 + }, + { + "auxiliary_loss_clip": 0.01425114, + "auxiliary_loss_mlp": 0.01040083, + "balance_loss_clip": 1.26047111, + "balance_loss_mlp": 1.02060461, + "epoch": 0.7364497219299564, + "flos": 19729690675200.0, + "grad_norm": 2.296185857367207, + "language_loss": 0.73177695, + "learning_rate": 6.853086953788727e-07, + "loss": 0.75642896, + "num_input_tokens_seen": 264263945, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19482422, + "step": 12249, + "time_per_iteration": 2.827298879623413 + }, + { + "auxiliary_loss_clip": 0.01411134, + "auxiliary_loss_mlp": 0.01032425, + "balance_loss_clip": 1.24877596, + "balance_loss_mlp": 1.01379228, + "epoch": 0.7365098451826244, + "flos": 21371403427200.0, + "grad_norm": 1.8170360875173526, + "language_loss": 0.77639294, + "learning_rate": 6.850152261875189e-07, + "loss": 0.80082852, + "num_input_tokens_seen": 264281500, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.1862793, + "step": 12250, + "time_per_iteration": 4.34359073638916 + }, + { + "auxiliary_loss_clip": 0.01421135, + "auxiliary_loss_mlp": 0.01038504, + "balance_loss_clip": 1.25637805, + "balance_loss_mlp": 1.01916838, + "epoch": 0.7365699684352923, + "flos": 23378785804800.0, + "grad_norm": 1.5896066097467292, + "language_loss": 0.7186631, + "learning_rate": 6.8472180686052e-07, + "loss": 0.74325949, + "num_input_tokens_seen": 264301625, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.1932373, + "step": 12251, + "time_per_iteration": 2.865645408630371 + }, + { + "auxiliary_loss_clip": 0.01409703, + "auxiliary_loss_mlp": 0.01032153, + "balance_loss_clip": 1.24939156, + "balance_loss_mlp": 1.01346111, + "epoch": 0.7366300916879603, + "flos": 59544634078080.0, + "grad_norm": 1.492416236789137, + "language_loss": 0.65935707, + "learning_rate": 6.844284374090015e-07, + "loss": 0.68377566, + "num_input_tokens_seen": 264323975, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18688965, + "step": 12252, + "time_per_iteration": 3.162604331970215 + }, + { + "auxiliary_loss_clip": 0.01433691, + "auxiliary_loss_mlp": 0.01038824, + "balance_loss_clip": 1.27008629, + "balance_loss_mlp": 1.02037001, + "epoch": 0.7366902149406283, + "flos": 20932925639040.0, + "grad_norm": 1.5685067232738175, + "language_loss": 0.79521799, + "learning_rate": 6.841351178440884e-07, + "loss": 0.81994313, + "num_input_tokens_seen": 264343785, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.18444824, + "step": 12253, + "time_per_iteration": 2.841643810272217 + }, + { + "auxiliary_loss_clip": 0.0141019, + "auxiliary_loss_mlp": 0.01032163, + "balance_loss_clip": 1.25079918, + "balance_loss_mlp": 1.01394808, + "epoch": 0.7367503381932963, + "flos": 17357181609600.0, + "grad_norm": 3.259277797893367, + "language_loss": 0.76925021, + "learning_rate": 6.83841848176905e-07, + "loss": 0.79367375, + "num_input_tokens_seen": 264361130, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18212891, + "step": 12254, + "time_per_iteration": 2.831362247467041 + }, + { + "auxiliary_loss_clip": 0.01415919, + "auxiliary_loss_mlp": 0.01036054, + "balance_loss_clip": 1.25413477, + "balance_loss_mlp": 1.01745713, + "epoch": 0.7368104614459642, + "flos": 17830163197440.0, + "grad_norm": 2.6427606885279227, + "language_loss": 0.69866812, + "learning_rate": 6.835486284185692e-07, + "loss": 0.7231878, + "num_input_tokens_seen": 264376965, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.18591309, + "step": 12255, + "time_per_iteration": 2.841979742050171 + }, + { + "auxiliary_loss_clip": 0.01420826, + "auxiliary_loss_mlp": 0.01033984, + "balance_loss_clip": 1.25621819, + "balance_loss_mlp": 1.01512516, + "epoch": 0.7368705846986322, + "flos": 24616298344320.0, + "grad_norm": 2.928222090632513, + "language_loss": 0.75753665, + "learning_rate": 6.832554585802012e-07, + "loss": 0.7820847, + "num_input_tokens_seen": 264396310, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.1887207, + "step": 12256, + "time_per_iteration": 2.8728294372558594 + }, + { + "auxiliary_loss_clip": 0.01419014, + "auxiliary_loss_mlp": 0.01035209, + "balance_loss_clip": 1.25689411, + "balance_loss_mlp": 1.01575375, + "epoch": 0.7369307079513001, + "flos": 34983606896640.0, + "grad_norm": 1.7184798826014815, + "language_loss": 0.73974454, + "learning_rate": 6.829623386729182e-07, + "loss": 0.76428676, + "num_input_tokens_seen": 264418085, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19445801, + "step": 12257, + "time_per_iteration": 4.456803560256958 + }, + { + "auxiliary_loss_clip": 0.01412985, + "auxiliary_loss_mlp": 0.01035744, + "balance_loss_clip": 1.25197005, + "balance_loss_mlp": 1.01715922, + "epoch": 0.7369908312039681, + "flos": 21224339268480.0, + "grad_norm": 1.490086311210768, + "language_loss": 0.78580344, + "learning_rate": 6.826692687078362e-07, + "loss": 0.81029075, + "num_input_tokens_seen": 264437595, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18591309, + "step": 12258, + "time_per_iteration": 2.8747777938842773 + }, + { + "auxiliary_loss_clip": 0.01423703, + "auxiliary_loss_mlp": 0.01035552, + "balance_loss_clip": 1.26033556, + "balance_loss_mlp": 1.01655006, + "epoch": 0.7370509544566362, + "flos": 23634293045760.0, + "grad_norm": 1.9948486547070594, + "language_loss": 0.67184222, + "learning_rate": 6.823762486960674e-07, + "loss": 0.6964348, + "num_input_tokens_seen": 264457385, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19006348, + "step": 12259, + "time_per_iteration": 2.9185097217559814 + }, + { + "auxiliary_loss_clip": 0.01410273, + "auxiliary_loss_mlp": 0.01037073, + "balance_loss_clip": 1.24910831, + "balance_loss_mlp": 1.0191915, + "epoch": 0.7371110777093041, + "flos": 24838659129600.0, + "grad_norm": 1.5986435996779478, + "language_loss": 0.73472512, + "learning_rate": 6.820832786487225e-07, + "loss": 0.75919861, + "num_input_tokens_seen": 264477205, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.17883301, + "step": 12260, + "time_per_iteration": 2.880825996398926 + }, + { + "auxiliary_loss_clip": 0.01420284, + "auxiliary_loss_mlp": 0.01031181, + "balance_loss_clip": 1.25789273, + "balance_loss_mlp": 1.01263177, + "epoch": 0.7371712009619721, + "flos": 23160044603520.0, + "grad_norm": 1.6405907234592003, + "language_loss": 0.74108493, + "learning_rate": 6.817903585769125e-07, + "loss": 0.76559961, + "num_input_tokens_seen": 264497195, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.1854248, + "step": 12261, + "time_per_iteration": 4.340795040130615 + }, + { + "auxiliary_loss_clip": 0.01427477, + "auxiliary_loss_mlp": 0.01035813, + "balance_loss_clip": 1.26266193, + "balance_loss_mlp": 1.01623905, + "epoch": 0.73723132421464, + "flos": 23123414298240.0, + "grad_norm": 2.703160835708504, + "language_loss": 0.6791203, + "learning_rate": 6.814974884917438e-07, + "loss": 0.70375323, + "num_input_tokens_seen": 264516950, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19580078, + "step": 12262, + "time_per_iteration": 4.216994285583496 + }, + { + "auxiliary_loss_clip": 0.01415233, + "auxiliary_loss_mlp": 0.01035839, + "balance_loss_clip": 1.2510761, + "balance_loss_mlp": 1.01678944, + "epoch": 0.737291447467308, + "flos": 19280625603840.0, + "grad_norm": 1.7414839880143522, + "language_loss": 0.89146221, + "learning_rate": 6.81204668404322e-07, + "loss": 0.91597283, + "num_input_tokens_seen": 264532675, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19055176, + "step": 12263, + "time_per_iteration": 2.815094470977783 + }, + { + "auxiliary_loss_clip": 0.01394559, + "auxiliary_loss_mlp": 0.01033285, + "balance_loss_clip": 1.239375, + "balance_loss_mlp": 1.01474833, + "epoch": 0.7373515707199759, + "flos": 25128941639040.0, + "grad_norm": 2.2730082145431134, + "language_loss": 0.67599541, + "learning_rate": 6.809118983257522e-07, + "loss": 0.70027387, + "num_input_tokens_seen": 264555635, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.1854248, + "step": 12264, + "time_per_iteration": 2.8930203914642334 + }, + { + "auxiliary_loss_clip": 0.01396977, + "auxiliary_loss_mlp": 0.01034578, + "balance_loss_clip": 1.23827887, + "balance_loss_mlp": 1.01529014, + "epoch": 0.737411693972644, + "flos": 32419259303040.0, + "grad_norm": 2.273165584209643, + "language_loss": 0.80861163, + "learning_rate": 6.806191782671356e-07, + "loss": 0.83292711, + "num_input_tokens_seen": 264573140, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.19287109, + "step": 12265, + "time_per_iteration": 2.974125862121582 + }, + { + "auxiliary_loss_clip": 0.01427105, + "auxiliary_loss_mlp": 0.01040315, + "balance_loss_clip": 1.25790954, + "balance_loss_mlp": 1.02071667, + "epoch": 0.7374718172253119, + "flos": 24326332548480.0, + "grad_norm": 1.6793279427237544, + "language_loss": 0.75292754, + "learning_rate": 6.803265082395711e-07, + "loss": 0.77760178, + "num_input_tokens_seen": 264591610, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.19604492, + "step": 12266, + "time_per_iteration": 2.9120073318481445 + }, + { + "auxiliary_loss_clip": 0.01402888, + "auxiliary_loss_mlp": 0.01037937, + "balance_loss_clip": 1.24123251, + "balance_loss_mlp": 1.0190779, + "epoch": 0.7375319404779799, + "flos": 27165941377920.0, + "grad_norm": 1.6811440159969364, + "language_loss": 0.73718643, + "learning_rate": 6.800338882541576e-07, + "loss": 0.76159465, + "num_input_tokens_seen": 264611170, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18859863, + "step": 12267, + "time_per_iteration": 2.9189398288726807 + }, + { + "auxiliary_loss_clip": 0.01408236, + "auxiliary_loss_mlp": 0.0103282, + "balance_loss_clip": 1.24686205, + "balance_loss_mlp": 1.01510525, + "epoch": 0.7375920637306478, + "flos": 18889274914560.0, + "grad_norm": 1.9449508796864363, + "language_loss": 0.83921498, + "learning_rate": 6.797413183219923e-07, + "loss": 0.86362553, + "num_input_tokens_seen": 264629365, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.17712402, + "step": 12268, + "time_per_iteration": 2.8338892459869385 + }, + { + "auxiliary_loss_clip": 0.01411975, + "auxiliary_loss_mlp": 0.01039949, + "balance_loss_clip": 1.2516439, + "balance_loss_mlp": 1.02007663, + "epoch": 0.7376521869833158, + "flos": 15678340859520.0, + "grad_norm": 1.8040314810680593, + "language_loss": 0.73939347, + "learning_rate": 6.794487984541677e-07, + "loss": 0.76391268, + "num_input_tokens_seen": 264647915, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19848633, + "step": 12269, + "time_per_iteration": 2.837789535522461 + }, + { + "auxiliary_loss_clip": 0.01429826, + "auxiliary_loss_mlp": 0.01033796, + "balance_loss_clip": 1.26405501, + "balance_loss_mlp": 1.01456714, + "epoch": 0.7377123102359837, + "flos": 36984655002240.0, + "grad_norm": 1.8224886734937134, + "language_loss": 0.71189225, + "learning_rate": 6.791563286617776e-07, + "loss": 0.7365284, + "num_input_tokens_seen": 264669620, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.19226074, + "step": 12270, + "time_per_iteration": 2.95424222946167 + }, + { + "auxiliary_loss_clip": 0.01418725, + "auxiliary_loss_mlp": 0.01033906, + "balance_loss_clip": 1.25648284, + "balance_loss_mlp": 1.01584625, + "epoch": 0.7377724334886517, + "flos": 24506362183680.0, + "grad_norm": 1.7578735608230298, + "language_loss": 0.70206964, + "learning_rate": 6.788639089559119e-07, + "loss": 0.72659594, + "num_input_tokens_seen": 264689345, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.18066406, + "step": 12271, + "time_per_iteration": 2.9211912155151367 + }, + { + "auxiliary_loss_clip": 0.01420443, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.25699341, + "balance_loss_mlp": 1.01369238, + "epoch": 0.7378325567413198, + "flos": 24400905258240.0, + "grad_norm": 1.966691295126329, + "language_loss": 0.68347317, + "learning_rate": 6.785715393476586e-07, + "loss": 0.70800084, + "num_input_tokens_seen": 264707625, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.18652344, + "step": 12272, + "time_per_iteration": 2.8777613639831543 + }, + { + "auxiliary_loss_clip": 0.01404961, + "auxiliary_loss_mlp": 0.01032649, + "balance_loss_clip": 1.24741316, + "balance_loss_mlp": 1.01452959, + "epoch": 0.7378926799939877, + "flos": 17424198437760.0, + "grad_norm": 1.7362303338535952, + "language_loss": 0.79002523, + "learning_rate": 6.782792198481049e-07, + "loss": 0.81440127, + "num_input_tokens_seen": 264725575, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18127441, + "step": 12273, + "time_per_iteration": 2.8311896324157715 + }, + { + "auxiliary_loss_clip": 0.01411538, + "auxiliary_loss_mlp": 0.01029928, + "balance_loss_clip": 1.25106096, + "balance_loss_mlp": 1.01159406, + "epoch": 0.7379528032466557, + "flos": 18482224279680.0, + "grad_norm": 2.428615854670239, + "language_loss": 0.83942449, + "learning_rate": 6.779869504683355e-07, + "loss": 0.86383915, + "num_input_tokens_seen": 264742855, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18334961, + "step": 12274, + "time_per_iteration": 2.8109068870544434 + }, + { + "auxiliary_loss_clip": 0.01441764, + "auxiliary_loss_mlp": 0.01034233, + "balance_loss_clip": 1.27394307, + "balance_loss_mlp": 1.0145638, + "epoch": 0.7380129264993236, + "flos": 17831022848640.0, + "grad_norm": 1.9828585061869195, + "language_loss": 0.74530923, + "learning_rate": 6.776947312194341e-07, + "loss": 0.77006924, + "num_input_tokens_seen": 264761155, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.19702148, + "step": 12275, + "time_per_iteration": 2.82187557220459 + }, + { + "auxiliary_loss_clip": 0.01431633, + "auxiliary_loss_mlp": 0.01035152, + "balance_loss_clip": 1.26488137, + "balance_loss_mlp": 1.01563704, + "epoch": 0.7380730497519916, + "flos": 23006284214400.0, + "grad_norm": 2.132395001244171, + "language_loss": 0.743047, + "learning_rate": 6.774025621124813e-07, + "loss": 0.76771486, + "num_input_tokens_seen": 264780660, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.19519043, + "step": 12276, + "time_per_iteration": 2.8650801181793213 + }, + { + "auxiliary_loss_clip": 0.01419768, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.25607252, + "balance_loss_mlp": 1.0120734, + "epoch": 0.7381331730046595, + "flos": 20275842384000.0, + "grad_norm": 18.377763326026084, + "language_loss": 0.78679311, + "learning_rate": 6.771104431585551e-07, + "loss": 0.81129408, + "num_input_tokens_seen": 264798850, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.18249512, + "step": 12277, + "time_per_iteration": 2.847001314163208 + }, + { + "auxiliary_loss_clip": 0.01408918, + "auxiliary_loss_mlp": 0.01041135, + "balance_loss_clip": 1.24966884, + "balance_loss_mlp": 1.02108395, + "epoch": 0.7381932962573275, + "flos": 19763742026880.0, + "grad_norm": 1.9725636820814512, + "language_loss": 0.79761916, + "learning_rate": 6.768183743687338e-07, + "loss": 0.82211965, + "num_input_tokens_seen": 264816795, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.20031738, + "step": 12278, + "time_per_iteration": 2.841799736022949 + }, + { + "auxiliary_loss_clip": 0.0142724, + "auxiliary_loss_mlp": 0.01030615, + "balance_loss_clip": 1.26233554, + "balance_loss_mlp": 1.0121969, + "epoch": 0.7382534195099955, + "flos": 17312678709120.0, + "grad_norm": 2.0142004637053885, + "language_loss": 0.72689509, + "learning_rate": 6.765263557540921e-07, + "loss": 0.75147367, + "num_input_tokens_seen": 264834105, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.1842041, + "step": 12279, + "time_per_iteration": 2.8217525482177734 + }, + { + "auxiliary_loss_clip": 0.01409451, + "auxiliary_loss_mlp": 0.01036835, + "balance_loss_clip": 1.24528933, + "balance_loss_mlp": 1.01811874, + "epoch": 0.7383135427626635, + "flos": 18706530591360.0, + "grad_norm": 2.1790011736892634, + "language_loss": 0.86340767, + "learning_rate": 6.762343873257034e-07, + "loss": 0.88787055, + "num_input_tokens_seen": 264850895, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.18701172, + "step": 12280, + "time_per_iteration": 2.813356399536133 + }, + { + "auxiliary_loss_clip": 0.01416476, + "auxiliary_loss_mlp": 0.01031829, + "balance_loss_clip": 1.25304699, + "balance_loss_mlp": 1.01380467, + "epoch": 0.7383736660153314, + "flos": 20889689592960.0, + "grad_norm": 1.9826232726380792, + "language_loss": 0.72732401, + "learning_rate": 6.759424690946408e-07, + "loss": 0.75180703, + "num_input_tokens_seen": 264869505, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.18029785, + "step": 12281, + "time_per_iteration": 2.8505444526672363 + }, + { + "auxiliary_loss_clip": 0.01420477, + "auxiliary_loss_mlp": 0.01032753, + "balance_loss_clip": 1.25624442, + "balance_loss_mlp": 1.01327443, + "epoch": 0.7384337892679994, + "flos": 20671762798080.0, + "grad_norm": 1.8463024864431228, + "language_loss": 0.61857188, + "learning_rate": 6.756506010719711e-07, + "loss": 0.64310426, + "num_input_tokens_seen": 264886915, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19458008, + "step": 12282, + "time_per_iteration": 2.861659049987793 + }, + { + "auxiliary_loss_clip": 0.0144043, + "auxiliary_loss_mlp": 0.01035517, + "balance_loss_clip": 1.27408004, + "balance_loss_mlp": 1.0161221, + "epoch": 0.7384939125206673, + "flos": 29181739288320.0, + "grad_norm": 1.6179466868841768, + "language_loss": 0.68873805, + "learning_rate": 6.753587832687632e-07, + "loss": 0.71349752, + "num_input_tokens_seen": 264910350, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19396973, + "step": 12283, + "time_per_iteration": 2.8977510929107666 + }, + { + "auxiliary_loss_clip": 0.01403993, + "auxiliary_loss_mlp": 0.01031701, + "balance_loss_clip": 1.24346673, + "balance_loss_mlp": 1.01330709, + "epoch": 0.7385540357733353, + "flos": 36324721324800.0, + "grad_norm": 1.619596029707286, + "language_loss": 0.76425427, + "learning_rate": 6.750670156960832e-07, + "loss": 0.78861117, + "num_input_tokens_seen": 264930705, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18395996, + "step": 12284, + "time_per_iteration": 2.9703495502471924 + }, + { + "auxiliary_loss_clip": 0.01413967, + "auxiliary_loss_mlp": 0.01030846, + "balance_loss_clip": 1.25028634, + "balance_loss_mlp": 1.01147437, + "epoch": 0.7386141590260034, + "flos": 20312155975680.0, + "grad_norm": 1.9356438299836443, + "language_loss": 0.700683, + "learning_rate": 6.747752983649954e-07, + "loss": 0.72513115, + "num_input_tokens_seen": 264946975, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19384766, + "step": 12285, + "time_per_iteration": 4.349000692367554 + }, + { + "auxiliary_loss_clip": 0.01425344, + "auxiliary_loss_mlp": 0.01032608, + "balance_loss_clip": 1.25770271, + "balance_loss_mlp": 1.01345086, + "epoch": 0.7386742822786713, + "flos": 25494746999040.0, + "grad_norm": 1.8097841715885963, + "language_loss": 0.80465549, + "learning_rate": 6.744836312865602e-07, + "loss": 0.82923508, + "num_input_tokens_seen": 264967665, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.19165039, + "step": 12286, + "time_per_iteration": 2.8831522464752197 + }, + { + "auxiliary_loss_clip": 0.0141212, + "auxiliary_loss_mlp": 0.01031329, + "balance_loss_clip": 1.25099111, + "balance_loss_mlp": 1.01233876, + "epoch": 0.7387344055313393, + "flos": 13779899256960.0, + "grad_norm": 2.016514368881021, + "language_loss": 0.66385293, + "learning_rate": 6.741920144718396e-07, + "loss": 0.68828738, + "num_input_tokens_seen": 264985480, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.18994141, + "step": 12287, + "time_per_iteration": 2.9350907802581787 + }, + { + "auxiliary_loss_clip": 0.01412014, + "auxiliary_loss_mlp": 0.01028403, + "balance_loss_clip": 1.25275576, + "balance_loss_mlp": 1.01083159, + "epoch": 0.7387945287840072, + "flos": 27866713127040.0, + "grad_norm": 1.8706210737274642, + "language_loss": 0.77518797, + "learning_rate": 6.739004479318903e-07, + "loss": 0.7995922, + "num_input_tokens_seen": 265004790, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.17578125, + "step": 12288, + "time_per_iteration": 2.9579830169677734 + }, + { + "auxiliary_loss_clip": 0.01436682, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.26872134, + "balance_loss_mlp": 1.01246095, + "epoch": 0.7388546520366752, + "flos": 44247708034560.0, + "grad_norm": 2.2231821292804836, + "language_loss": 0.58918333, + "learning_rate": 6.736089316777684e-07, + "loss": 0.61387384, + "num_input_tokens_seen": 265028790, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.19909668, + "step": 12289, + "time_per_iteration": 3.059521436691284 + }, + { + "auxiliary_loss_clip": 0.011846, + "auxiliary_loss_mlp": 0.01026612, + "balance_loss_clip": 1.09366012, + "balance_loss_mlp": 1.00706124, + "epoch": 0.7389147752893431, + "flos": 70710769140480.0, + "grad_norm": 0.6527582765421391, + "language_loss": 0.49295568, + "learning_rate": 6.733174657205287e-07, + "loss": 0.51506782, + "num_input_tokens_seen": 265096660, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.1953125, + "step": 12290, + "time_per_iteration": 3.4660584926605225 + }, + { + "auxiliary_loss_clip": 0.01418417, + "auxiliary_loss_mlp": 0.01034381, + "balance_loss_clip": 1.25576758, + "balance_loss_mlp": 1.01460397, + "epoch": 0.7389748985420111, + "flos": 26006168684160.0, + "grad_norm": 1.7022549012832766, + "language_loss": 0.68287933, + "learning_rate": 6.730260500712237e-07, + "loss": 0.70740736, + "num_input_tokens_seen": 265116375, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19787598, + "step": 12291, + "time_per_iteration": 2.9051895141601562 + }, + { + "auxiliary_loss_clip": 0.01187271, + "auxiliary_loss_mlp": 0.01028818, + "balance_loss_clip": 1.09477425, + "balance_loss_mlp": 1.00592947, + "epoch": 0.7390350217946791, + "flos": 54428127724800.0, + "grad_norm": 0.9818288655487112, + "language_loss": 0.60856783, + "learning_rate": 6.727346847409052e-07, + "loss": 0.63072872, + "num_input_tokens_seen": 265161230, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.22851562, + "step": 12292, + "time_per_iteration": 4.397026062011719 + }, + { + "auxiliary_loss_clip": 0.01410937, + "auxiliary_loss_mlp": 0.01034826, + "balance_loss_clip": 1.24969029, + "balance_loss_mlp": 1.01619315, + "epoch": 0.7390951450473471, + "flos": 32209612306560.0, + "grad_norm": 4.572650551942778, + "language_loss": 0.6798774, + "learning_rate": 6.724433697406191e-07, + "loss": 0.70433497, + "num_input_tokens_seen": 265182515, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.1862793, + "step": 12293, + "time_per_iteration": 2.922081470489502 + }, + { + "auxiliary_loss_clip": 0.0141659, + "auxiliary_loss_mlp": 0.01033616, + "balance_loss_clip": 1.25561142, + "balance_loss_mlp": 1.01506734, + "epoch": 0.739155268300015, + "flos": 16691230373760.0, + "grad_norm": 2.015856095135241, + "language_loss": 0.84392464, + "learning_rate": 6.721521050814134e-07, + "loss": 0.86842668, + "num_input_tokens_seen": 265198160, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.18566895, + "step": 12294, + "time_per_iteration": 2.831770658493042 + }, + { + "auxiliary_loss_clip": 0.01396171, + "auxiliary_loss_mlp": 0.01034187, + "balance_loss_clip": 1.23779655, + "balance_loss_mlp": 1.01616299, + "epoch": 0.739215391552683, + "flos": 31662284232960.0, + "grad_norm": 1.521367400222795, + "language_loss": 0.7352947, + "learning_rate": 6.718608907743337e-07, + "loss": 0.75959826, + "num_input_tokens_seen": 265218480, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18017578, + "step": 12295, + "time_per_iteration": 2.9653072357177734 + }, + { + "auxiliary_loss_clip": 0.0139818, + "auxiliary_loss_mlp": 0.01031531, + "balance_loss_clip": 1.24053049, + "balance_loss_mlp": 1.01381636, + "epoch": 0.7392755148053509, + "flos": 29731193867520.0, + "grad_norm": 1.7043833177612684, + "language_loss": 0.78650451, + "learning_rate": 6.715697268304215e-07, + "loss": 0.81080163, + "num_input_tokens_seen": 265240165, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.17724609, + "step": 12296, + "time_per_iteration": 4.393824577331543 + }, + { + "auxiliary_loss_clip": 0.01405057, + "auxiliary_loss_mlp": 0.01032741, + "balance_loss_clip": 1.24434376, + "balance_loss_mlp": 1.01240373, + "epoch": 0.7393356380580189, + "flos": 37064611843200.0, + "grad_norm": 2.19197187306483, + "language_loss": 0.66925871, + "learning_rate": 6.712786132607182e-07, + "loss": 0.69363666, + "num_input_tokens_seen": 265263295, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.20336914, + "step": 12297, + "time_per_iteration": 4.379637956619263 + }, + { + "auxiliary_loss_clip": 0.01413138, + "auxiliary_loss_mlp": 0.01040769, + "balance_loss_clip": 1.25110602, + "balance_loss_mlp": 1.02061105, + "epoch": 0.739395761310687, + "flos": 19729238227200.0, + "grad_norm": 1.6462966092568079, + "language_loss": 0.6920808, + "learning_rate": 6.709875500762645e-07, + "loss": 0.71661985, + "num_input_tokens_seen": 265282740, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.20153809, + "step": 12298, + "time_per_iteration": 2.8607101440429688 + }, + { + "auxiliary_loss_clip": 0.01400032, + "auxiliary_loss_mlp": 0.01032229, + "balance_loss_clip": 1.23823452, + "balance_loss_mlp": 1.01289308, + "epoch": 0.7394558845633549, + "flos": 11808875715840.0, + "grad_norm": 2.054188490422839, + "language_loss": 0.75022185, + "learning_rate": 6.706965372880946e-07, + "loss": 0.77454448, + "num_input_tokens_seen": 265300175, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19311523, + "step": 12299, + "time_per_iteration": 2.8427700996398926 + }, + { + "auxiliary_loss_clip": 0.01182417, + "auxiliary_loss_mlp": 0.01020248, + "balance_loss_clip": 1.09152699, + "balance_loss_mlp": 1.00107944, + "epoch": 0.7395160078160229, + "flos": 66225239792640.0, + "grad_norm": 0.7254375466448695, + "language_loss": 0.60897899, + "learning_rate": 6.704055749072455e-07, + "loss": 0.63100564, + "num_input_tokens_seen": 265363275, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.19140625, + "step": 12300, + "time_per_iteration": 3.4057888984680176 + }, + { + "auxiliary_loss_clip": 0.0142756, + "auxiliary_loss_mlp": 0.01036598, + "balance_loss_clip": 1.26423287, + "balance_loss_mlp": 1.01791763, + "epoch": 0.7395761310686908, + "flos": 21259114536960.0, + "grad_norm": 2.2543270365074157, + "language_loss": 0.80989438, + "learning_rate": 6.7011466294475e-07, + "loss": 0.83453596, + "num_input_tokens_seen": 265382935, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.18688965, + "step": 12301, + "time_per_iteration": 2.919325590133667 + }, + { + "auxiliary_loss_clip": 0.01409491, + "auxiliary_loss_mlp": 0.01035684, + "balance_loss_clip": 1.24898767, + "balance_loss_mlp": 1.01765943, + "epoch": 0.7396362543213588, + "flos": 25965647326080.0, + "grad_norm": 1.8059820533148325, + "language_loss": 0.73761201, + "learning_rate": 6.698238014116406e-07, + "loss": 0.76206374, + "num_input_tokens_seen": 265403245, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18029785, + "step": 12302, + "time_per_iteration": 2.996809959411621 + }, + { + "auxiliary_loss_clip": 0.01431176, + "auxiliary_loss_mlp": 0.01032863, + "balance_loss_clip": 1.26675153, + "balance_loss_mlp": 1.01429033, + "epoch": 0.7396963775740267, + "flos": 27388573632000.0, + "grad_norm": 1.772804015936782, + "language_loss": 0.74811864, + "learning_rate": 6.695329903189451e-07, + "loss": 0.77275908, + "num_input_tokens_seen": 265423105, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.18579102, + "step": 12303, + "time_per_iteration": 2.997849941253662 + }, + { + "auxiliary_loss_clip": 0.01398254, + "auxiliary_loss_mlp": 0.01036382, + "balance_loss_clip": 1.23931646, + "balance_loss_mlp": 1.01760674, + "epoch": 0.7397565008266948, + "flos": 25531060590720.0, + "grad_norm": 1.7584017220154173, + "language_loss": 0.5465228, + "learning_rate": 6.692422296776927e-07, + "loss": 0.57086915, + "num_input_tokens_seen": 265443445, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18786621, + "step": 12304, + "time_per_iteration": 2.9331576824188232 + }, + { + "auxiliary_loss_clip": 0.01413915, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.25161481, + "balance_loss_mlp": 1.01515031, + "epoch": 0.7398166240793627, + "flos": 23737035283200.0, + "grad_norm": 2.7564766129847404, + "language_loss": 0.8501901, + "learning_rate": 6.689515194989084e-07, + "loss": 0.87466955, + "num_input_tokens_seen": 265462085, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.1887207, + "step": 12305, + "time_per_iteration": 2.8845834732055664 + }, + { + "auxiliary_loss_clip": 0.01183945, + "auxiliary_loss_mlp": 0.0103553, + "balance_loss_clip": 1.09274328, + "balance_loss_mlp": 1.01426268, + "epoch": 0.7398767473320307, + "flos": 67300820616960.0, + "grad_norm": 0.8730437643560194, + "language_loss": 0.5767504, + "learning_rate": 6.68660859793615e-07, + "loss": 0.59894514, + "num_input_tokens_seen": 265521190, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.21289062, + "step": 12306, + "time_per_iteration": 3.3787713050842285 + }, + { + "auxiliary_loss_clip": 0.01425999, + "auxiliary_loss_mlp": 0.0103556, + "balance_loss_clip": 1.26222086, + "balance_loss_mlp": 1.01631987, + "epoch": 0.7399368705846986, + "flos": 22029029619840.0, + "grad_norm": 4.111389201480146, + "language_loss": 0.82069016, + "learning_rate": 6.683702505728355e-07, + "loss": 0.8453058, + "num_input_tokens_seen": 265539705, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19238281, + "step": 12307, + "time_per_iteration": 2.8456714153289795 + }, + { + "auxiliary_loss_clip": 0.01389714, + "auxiliary_loss_mlp": 0.01033349, + "balance_loss_clip": 1.23355806, + "balance_loss_mlp": 1.01437044, + "epoch": 0.7399969938373666, + "flos": 14182063453440.0, + "grad_norm": 2.0290937653841263, + "language_loss": 0.70641738, + "learning_rate": 6.680796918475893e-07, + "loss": 0.73064804, + "num_input_tokens_seen": 265555855, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.18981934, + "step": 12308, + "time_per_iteration": 2.7859859466552734 + }, + { + "auxiliary_loss_clip": 0.01397576, + "auxiliary_loss_mlp": 0.01033839, + "balance_loss_clip": 1.23913252, + "balance_loss_mlp": 1.01511168, + "epoch": 0.7400571170900345, + "flos": 25312409879040.0, + "grad_norm": 1.746643621004893, + "language_loss": 0.82020968, + "learning_rate": 6.67789183628896e-07, + "loss": 0.84452385, + "num_input_tokens_seen": 265575455, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18725586, + "step": 12309, + "time_per_iteration": 2.887418746948242 + }, + { + "auxiliary_loss_clip": 0.01413956, + "auxiliary_loss_mlp": 0.01034882, + "balance_loss_clip": 1.24812531, + "balance_loss_mlp": 1.01558208, + "epoch": 0.7401172403427025, + "flos": 22721702549760.0, + "grad_norm": 1.7347838967002225, + "language_loss": 0.73469567, + "learning_rate": 6.674987259277692e-07, + "loss": 0.75918406, + "num_input_tokens_seen": 265595250, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.19287109, + "step": 12310, + "time_per_iteration": 2.8542439937591553 + }, + { + "auxiliary_loss_clip": 0.01410552, + "auxiliary_loss_mlp": 0.01036714, + "balance_loss_clip": 1.24765444, + "balance_loss_mlp": 1.01692569, + "epoch": 0.7401773635953706, + "flos": 18073952035200.0, + "grad_norm": 2.8220062772374583, + "language_loss": 0.89167738, + "learning_rate": 6.672083187552239e-07, + "loss": 0.91615009, + "num_input_tokens_seen": 265606945, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19787598, + "step": 12311, + "time_per_iteration": 2.847985029220581 + }, + { + "auxiliary_loss_clip": 0.01418992, + "auxiliary_loss_mlp": 0.01033293, + "balance_loss_clip": 1.25623202, + "balance_loss_mlp": 1.01458895, + "epoch": 0.7402374868480385, + "flos": 22722697935360.0, + "grad_norm": 5.703740698690624, + "language_loss": 0.80499482, + "learning_rate": 6.669179621222738e-07, + "loss": 0.82951772, + "num_input_tokens_seen": 265626115, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.18701172, + "step": 12312, + "time_per_iteration": 2.88779354095459 + }, + { + "auxiliary_loss_clip": 0.01415479, + "auxiliary_loss_mlp": 0.01036915, + "balance_loss_clip": 1.25583017, + "balance_loss_mlp": 1.01890278, + "epoch": 0.7402976101007065, + "flos": 22866911671680.0, + "grad_norm": 3.5478286811357926, + "language_loss": 0.79215813, + "learning_rate": 6.666276560399273e-07, + "loss": 0.8166821, + "num_input_tokens_seen": 265646520, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18029785, + "step": 12313, + "time_per_iteration": 2.8872907161712646 + }, + { + "auxiliary_loss_clip": 0.01423825, + "auxiliary_loss_mlp": 0.010368, + "balance_loss_clip": 1.25863004, + "balance_loss_mlp": 1.01754713, + "epoch": 0.7403577333533744, + "flos": 12352538960640.0, + "grad_norm": 4.705946349383261, + "language_loss": 0.7955879, + "learning_rate": 6.663374005191937e-07, + "loss": 0.82019424, + "num_input_tokens_seen": 265661875, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19250488, + "step": 12314, + "time_per_iteration": 2.796251058578491 + }, + { + "auxiliary_loss_clip": 0.01188084, + "auxiliary_loss_mlp": 0.01037502, + "balance_loss_clip": 1.09516883, + "balance_loss_mlp": 1.01461434, + "epoch": 0.7404178566060424, + "flos": 60356626824960.0, + "grad_norm": 0.8442621537771297, + "language_loss": 0.55187768, + "learning_rate": 6.660471955710809e-07, + "loss": 0.57413357, + "num_input_tokens_seen": 265721255, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.22851562, + "step": 12315, + "time_per_iteration": 3.3425121307373047 + }, + { + "auxiliary_loss_clip": 0.01401938, + "auxiliary_loss_mlp": 0.0103449, + "balance_loss_clip": 1.24462521, + "balance_loss_mlp": 1.01594114, + "epoch": 0.7404779798587103, + "flos": 32027275186560.0, + "grad_norm": 6.959521302000672, + "language_loss": 0.80163866, + "learning_rate": 6.65757041206591e-07, + "loss": 0.82600296, + "num_input_tokens_seen": 265743970, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18554688, + "step": 12316, + "time_per_iteration": 2.994468927383423 + }, + { + "auxiliary_loss_clip": 0.01408668, + "auxiliary_loss_mlp": 0.01035724, + "balance_loss_clip": 1.24676311, + "balance_loss_mlp": 1.01656723, + "epoch": 0.7405381031113784, + "flos": 12895704512640.0, + "grad_norm": 1.6659485955295927, + "language_loss": 0.75577164, + "learning_rate": 6.654669374367275e-07, + "loss": 0.78021562, + "num_input_tokens_seen": 265760890, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19165039, + "step": 12317, + "time_per_iteration": 2.8533871173858643 + }, + { + "auxiliary_loss_clip": 0.01393314, + "auxiliary_loss_mlp": 0.01034524, + "balance_loss_clip": 1.23784077, + "balance_loss_mlp": 1.01584339, + "epoch": 0.7405982263640463, + "flos": 20238533406720.0, + "grad_norm": 1.6414440496231775, + "language_loss": 0.81714094, + "learning_rate": 6.651768842724917e-07, + "loss": 0.84141928, + "num_input_tokens_seen": 265779600, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.18676758, + "step": 12318, + "time_per_iteration": 2.849893569946289 + }, + { + "auxiliary_loss_clip": 0.01429775, + "auxiliary_loss_mlp": 0.01034211, + "balance_loss_clip": 1.263906, + "balance_loss_mlp": 1.01532841, + "epoch": 0.7406583496167143, + "flos": 17576330014080.0, + "grad_norm": 1.9090661659578239, + "language_loss": 0.77303183, + "learning_rate": 6.648868817248827e-07, + "loss": 0.79767168, + "num_input_tokens_seen": 265797030, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.1887207, + "step": 12319, + "time_per_iteration": 2.8331501483917236 + }, + { + "auxiliary_loss_clip": 0.01417886, + "auxiliary_loss_mlp": 0.01037703, + "balance_loss_clip": 1.25721192, + "balance_loss_mlp": 1.01985741, + "epoch": 0.7407184728693822, + "flos": 18304728353280.0, + "grad_norm": 1.9450824980585146, + "language_loss": 0.64778107, + "learning_rate": 6.64596929804897e-07, + "loss": 0.67233694, + "num_input_tokens_seen": 265815055, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.17858887, + "step": 12320, + "time_per_iteration": 4.27356481552124 + }, + { + "auxiliary_loss_clip": 0.0143047, + "auxiliary_loss_mlp": 0.01040031, + "balance_loss_clip": 1.26300597, + "balance_loss_mlp": 1.02090955, + "epoch": 0.7407785961220502, + "flos": 16699193458560.0, + "grad_norm": 2.5668337041249893, + "language_loss": 0.83266634, + "learning_rate": 6.643070285235288e-07, + "loss": 0.85737133, + "num_input_tokens_seen": 265828480, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.19116211, + "step": 12321, + "time_per_iteration": 2.8514344692230225 + }, + { + "auxiliary_loss_clip": 0.01438891, + "auxiliary_loss_mlp": 0.01044328, + "balance_loss_clip": 1.27027154, + "balance_loss_mlp": 1.02370441, + "epoch": 0.7408387193747181, + "flos": 22097810995200.0, + "grad_norm": 2.863712750778219, + "language_loss": 0.72625357, + "learning_rate": 6.640171778917727e-07, + "loss": 0.75108582, + "num_input_tokens_seen": 265845825, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.20617676, + "step": 12322, + "time_per_iteration": 2.853593349456787 + }, + { + "auxiliary_loss_clip": 0.01430742, + "auxiliary_loss_mlp": 0.01036757, + "balance_loss_clip": 1.26744723, + "balance_loss_mlp": 1.01755238, + "epoch": 0.7408988426273861, + "flos": 24245651790720.0, + "grad_norm": 1.6678358556559554, + "language_loss": 0.64957619, + "learning_rate": 6.637273779206183e-07, + "loss": 0.67425108, + "num_input_tokens_seen": 265866335, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19213867, + "step": 12323, + "time_per_iteration": 2.8764781951904297 + }, + { + "auxiliary_loss_clip": 0.01423444, + "auxiliary_loss_mlp": 0.01037501, + "balance_loss_clip": 1.25933659, + "balance_loss_mlp": 1.01817703, + "epoch": 0.7409589658800542, + "flos": 29034901353600.0, + "grad_norm": 2.999977588561279, + "language_loss": 0.76648688, + "learning_rate": 6.634376286210559e-07, + "loss": 0.79109633, + "num_input_tokens_seen": 265888945, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.1932373, + "step": 12324, + "time_per_iteration": 2.9832944869995117 + }, + { + "auxiliary_loss_clip": 0.01404371, + "auxiliary_loss_mlp": 0.01029692, + "balance_loss_clip": 1.24275231, + "balance_loss_mlp": 1.01165533, + "epoch": 0.7410190891327221, + "flos": 19359994262400.0, + "grad_norm": 1.7831133196316835, + "language_loss": 0.74866199, + "learning_rate": 6.63147930004073e-07, + "loss": 0.77300262, + "num_input_tokens_seen": 265908030, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18029785, + "step": 12325, + "time_per_iteration": 2.868947982788086 + }, + { + "auxiliary_loss_clip": 0.01439465, + "auxiliary_loss_mlp": 0.01033811, + "balance_loss_clip": 1.27175105, + "balance_loss_mlp": 1.0144279, + "epoch": 0.7410792123853901, + "flos": 22757970896640.0, + "grad_norm": 2.308260222666618, + "language_loss": 0.68497086, + "learning_rate": 6.628582820806545e-07, + "loss": 0.70970356, + "num_input_tokens_seen": 265927030, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.19384766, + "step": 12326, + "time_per_iteration": 2.8803138732910156 + }, + { + "auxiliary_loss_clip": 0.01411352, + "auxiliary_loss_mlp": 0.01034392, + "balance_loss_clip": 1.24878025, + "balance_loss_mlp": 1.01510382, + "epoch": 0.741139335638058, + "flos": 25382865312000.0, + "grad_norm": 1.6693700574285433, + "language_loss": 0.89859152, + "learning_rate": 6.625686848617835e-07, + "loss": 0.92304897, + "num_input_tokens_seen": 265945490, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19287109, + "step": 12327, + "time_per_iteration": 4.4053733348846436 + }, + { + "auxiliary_loss_clip": 0.01418345, + "auxiliary_loss_mlp": 0.01034615, + "balance_loss_clip": 1.25640631, + "balance_loss_mlp": 1.01563716, + "epoch": 0.741199458890726, + "flos": 18594196456320.0, + "grad_norm": 2.0697500109566396, + "language_loss": 0.86342466, + "learning_rate": 6.62279138358442e-07, + "loss": 0.88795429, + "num_input_tokens_seen": 265963265, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18969727, + "step": 12328, + "time_per_iteration": 2.794187307357788 + }, + { + "auxiliary_loss_clip": 0.01410329, + "auxiliary_loss_mlp": 0.01030934, + "balance_loss_clip": 1.25216913, + "balance_loss_mlp": 1.01212287, + "epoch": 0.7412595821433939, + "flos": 22137155988480.0, + "grad_norm": 1.8631287896488757, + "language_loss": 0.67365092, + "learning_rate": 6.619896425816103e-07, + "loss": 0.69806349, + "num_input_tokens_seen": 265982270, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.18811035, + "step": 12329, + "time_per_iteration": 2.898951768875122 + }, + { + "auxiliary_loss_clip": 0.01440159, + "auxiliary_loss_mlp": 0.01037498, + "balance_loss_clip": 1.27233911, + "balance_loss_mlp": 1.01902092, + "epoch": 0.741319705396062, + "flos": 29181829777920.0, + "grad_norm": 1.848621068870848, + "language_loss": 0.67618108, + "learning_rate": 6.617001975422647e-07, + "loss": 0.70095766, + "num_input_tokens_seen": 266003835, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.18469238, + "step": 12330, + "time_per_iteration": 2.920916795730591 + }, + { + "auxiliary_loss_clip": 0.01439149, + "auxiliary_loss_mlp": 0.01032807, + "balance_loss_clip": 1.27095985, + "balance_loss_mlp": 1.01275659, + "epoch": 0.7413798286487299, + "flos": 20677146929280.0, + "grad_norm": 1.9197923745612766, + "language_loss": 0.86196762, + "learning_rate": 6.614108032513823e-07, + "loss": 0.88668716, + "num_input_tokens_seen": 266021595, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.20043945, + "step": 12331, + "time_per_iteration": 2.84601092338562 + }, + { + "auxiliary_loss_clip": 0.01422403, + "auxiliary_loss_mlp": 0.01033308, + "balance_loss_clip": 1.25925469, + "balance_loss_mlp": 1.01436591, + "epoch": 0.7414399519013979, + "flos": 16407508360320.0, + "grad_norm": 1.8523985164435879, + "language_loss": 0.70802605, + "learning_rate": 6.611214597199364e-07, + "loss": 0.73258317, + "num_input_tokens_seen": 266039860, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.18933105, + "step": 12332, + "time_per_iteration": 4.2664573192596436 + }, + { + "auxiliary_loss_clip": 0.01417633, + "auxiliary_loss_mlp": 0.01034196, + "balance_loss_clip": 1.25454223, + "balance_loss_mlp": 1.01492012, + "epoch": 0.7415000751540658, + "flos": 25641358709760.0, + "grad_norm": 1.9480122837622453, + "language_loss": 0.64148915, + "learning_rate": 6.608321669588984e-07, + "loss": 0.66600752, + "num_input_tokens_seen": 266058050, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19274902, + "step": 12333, + "time_per_iteration": 2.8744170665740967 + }, + { + "auxiliary_loss_clip": 0.01400912, + "auxiliary_loss_mlp": 0.01034932, + "balance_loss_clip": 1.24421024, + "balance_loss_mlp": 1.01679993, + "epoch": 0.7415601984067338, + "flos": 24510705684480.0, + "grad_norm": 1.8845113706596734, + "language_loss": 0.72070193, + "learning_rate": 6.605429249792387e-07, + "loss": 0.74506032, + "num_input_tokens_seen": 266078060, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.18139648, + "step": 12334, + "time_per_iteration": 2.9569578170776367 + }, + { + "auxiliary_loss_clip": 0.01414539, + "auxiliary_loss_mlp": 0.01032151, + "balance_loss_clip": 1.25235093, + "balance_loss_mlp": 1.01295829, + "epoch": 0.7416203216594017, + "flos": 20897064495360.0, + "grad_norm": 1.698431540795265, + "language_loss": 0.83077323, + "learning_rate": 6.602537337919257e-07, + "loss": 0.85524017, + "num_input_tokens_seen": 266097110, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19189453, + "step": 12335, + "time_per_iteration": 2.856623411178589 + }, + { + "auxiliary_loss_clip": 0.01412101, + "auxiliary_loss_mlp": 0.01035615, + "balance_loss_clip": 1.24959731, + "balance_loss_mlp": 1.0162555, + "epoch": 0.7416804449120697, + "flos": 15630489843840.0, + "grad_norm": 2.325837658113184, + "language_loss": 0.7553941, + "learning_rate": 6.599645934079259e-07, + "loss": 0.77987123, + "num_input_tokens_seen": 266110870, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19360352, + "step": 12336, + "time_per_iteration": 2.8298897743225098 + }, + { + "auxiliary_loss_clip": 0.01429851, + "auxiliary_loss_mlp": 0.01033344, + "balance_loss_clip": 1.26488042, + "balance_loss_mlp": 1.01405609, + "epoch": 0.7417405681647377, + "flos": 17127174453120.0, + "grad_norm": 2.4183565263123565, + "language_loss": 0.74393857, + "learning_rate": 6.596755038382029e-07, + "loss": 0.76857042, + "num_input_tokens_seen": 266127845, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19299316, + "step": 12337, + "time_per_iteration": 2.914642333984375 + }, + { + "auxiliary_loss_clip": 0.01421162, + "auxiliary_loss_mlp": 0.01036581, + "balance_loss_clip": 1.26070869, + "balance_loss_mlp": 1.01688731, + "epoch": 0.7418006914174057, + "flos": 18889682117760.0, + "grad_norm": 1.8389820607296536, + "language_loss": 0.77357626, + "learning_rate": 6.593864650937186e-07, + "loss": 0.79815364, + "num_input_tokens_seen": 266145400, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19702148, + "step": 12338, + "time_per_iteration": 2.843045949935913 + }, + { + "auxiliary_loss_clip": 0.0140247, + "auxiliary_loss_mlp": 0.01034181, + "balance_loss_clip": 1.24413705, + "balance_loss_mlp": 1.01504827, + "epoch": 0.7418608146700737, + "flos": 21590913790080.0, + "grad_norm": 1.7601472700520027, + "language_loss": 0.73132598, + "learning_rate": 6.590974771854345e-07, + "loss": 0.75569248, + "num_input_tokens_seen": 266164430, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.19140625, + "step": 12339, + "time_per_iteration": 2.8659756183624268 + }, + { + "auxiliary_loss_clip": 0.01410714, + "auxiliary_loss_mlp": 0.01030717, + "balance_loss_clip": 1.25006342, + "balance_loss_mlp": 1.01169133, + "epoch": 0.7419209379227416, + "flos": 22349155714560.0, + "grad_norm": 2.2148277440735042, + "language_loss": 0.8066709, + "learning_rate": 6.588085401243077e-07, + "loss": 0.83108521, + "num_input_tokens_seen": 266183855, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19030762, + "step": 12340, + "time_per_iteration": 2.8562278747558594 + }, + { + "auxiliary_loss_clip": 0.01417169, + "auxiliary_loss_mlp": 0.01036126, + "balance_loss_clip": 1.25523257, + "balance_loss_mlp": 1.01724362, + "epoch": 0.7419810611754096, + "flos": 16770599032320.0, + "grad_norm": 1.6191961816927123, + "language_loss": 0.76110655, + "learning_rate": 6.585196539212958e-07, + "loss": 0.78563946, + "num_input_tokens_seen": 266202085, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.18884277, + "step": 12341, + "time_per_iteration": 2.9123871326446533 + }, + { + "auxiliary_loss_clip": 0.01388035, + "auxiliary_loss_mlp": 0.01033007, + "balance_loss_clip": 1.2353611, + "balance_loss_mlp": 1.01415956, + "epoch": 0.7420411844280775, + "flos": 26223959744640.0, + "grad_norm": 1.3842235463918187, + "language_loss": 0.80918121, + "learning_rate": 6.582308185873535e-07, + "loss": 0.83339161, + "num_input_tokens_seen": 266223445, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.18859863, + "step": 12342, + "time_per_iteration": 2.8887994289398193 + }, + { + "auxiliary_loss_clip": 0.01420557, + "auxiliary_loss_mlp": 0.01033284, + "balance_loss_clip": 1.25908589, + "balance_loss_mlp": 1.01440084, + "epoch": 0.7421013076807456, + "flos": 68549517901440.0, + "grad_norm": 1.6692967429468768, + "language_loss": 0.77735126, + "learning_rate": 6.57942034133433e-07, + "loss": 0.80188966, + "num_input_tokens_seen": 266246575, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18896484, + "step": 12343, + "time_per_iteration": 3.2489981651306152 + }, + { + "auxiliary_loss_clip": 0.01423122, + "auxiliary_loss_mlp": 0.01037726, + "balance_loss_clip": 1.26092124, + "balance_loss_mlp": 1.01871252, + "epoch": 0.7421614309334135, + "flos": 24435906750720.0, + "grad_norm": 1.6838364016589684, + "language_loss": 0.68467498, + "learning_rate": 6.576533005704843e-07, + "loss": 0.70928347, + "num_input_tokens_seen": 266266055, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19018555, + "step": 12344, + "time_per_iteration": 2.90736722946167 + }, + { + "auxiliary_loss_clip": 0.01417635, + "auxiliary_loss_mlp": 0.01035629, + "balance_loss_clip": 1.2537508, + "balance_loss_mlp": 1.01531553, + "epoch": 0.7422215541860815, + "flos": 12318261384960.0, + "grad_norm": 2.1467725525525005, + "language_loss": 0.822716, + "learning_rate": 6.573646179094572e-07, + "loss": 0.84724861, + "num_input_tokens_seen": 266282240, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.20324707, + "step": 12345, + "time_per_iteration": 2.835594654083252 + }, + { + "auxiliary_loss_clip": 0.01415312, + "auxiliary_loss_mlp": 0.01035019, + "balance_loss_clip": 1.25179958, + "balance_loss_mlp": 1.01579094, + "epoch": 0.7422816774387494, + "flos": 19654891741440.0, + "grad_norm": 1.872232406259468, + "language_loss": 0.7158764, + "learning_rate": 6.570759861612988e-07, + "loss": 0.74037969, + "num_input_tokens_seen": 266300980, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.19226074, + "step": 12346, + "time_per_iteration": 2.869725465774536 + }, + { + "auxiliary_loss_clip": 0.01414257, + "auxiliary_loss_mlp": 0.01036977, + "balance_loss_clip": 1.25189555, + "balance_loss_mlp": 1.01854718, + "epoch": 0.7423418006914174, + "flos": 32028949244160.0, + "grad_norm": 1.650952445424816, + "language_loss": 0.74303687, + "learning_rate": 6.56787405336953e-07, + "loss": 0.76754916, + "num_input_tokens_seen": 266322215, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.1842041, + "step": 12347, + "time_per_iteration": 2.919619560241699 + }, + { + "auxiliary_loss_clip": 0.01436178, + "auxiliary_loss_mlp": 0.01034419, + "balance_loss_clip": 1.26925254, + "balance_loss_mlp": 1.01569128, + "epoch": 0.7424019239440853, + "flos": 18926357667840.0, + "grad_norm": 2.509920738737155, + "language_loss": 0.81810284, + "learning_rate": 6.564988754473642e-07, + "loss": 0.84280884, + "num_input_tokens_seen": 266341600, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.18737793, + "step": 12348, + "time_per_iteration": 2.863982915878296 + }, + { + "auxiliary_loss_clip": 0.01412118, + "auxiliary_loss_mlp": 0.01037672, + "balance_loss_clip": 1.25234926, + "balance_loss_mlp": 1.01745379, + "epoch": 0.7424620471967533, + "flos": 35888686755840.0, + "grad_norm": 2.180804768214526, + "language_loss": 0.72664416, + "learning_rate": 6.562103965034724e-07, + "loss": 0.75114214, + "num_input_tokens_seen": 266362895, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.20227051, + "step": 12349, + "time_per_iteration": 2.9970381259918213 + }, + { + "auxiliary_loss_clip": 0.0143521, + "auxiliary_loss_mlp": 0.01037681, + "balance_loss_clip": 1.2669363, + "balance_loss_mlp": 1.01823783, + "epoch": 0.7425221704494213, + "flos": 27028061913600.0, + "grad_norm": 3.727844880351325, + "language_loss": 0.80231309, + "learning_rate": 6.559219685162165e-07, + "loss": 0.82704198, + "num_input_tokens_seen": 266384015, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.19445801, + "step": 12350, + "time_per_iteration": 2.8791427612304688 + }, + { + "auxiliary_loss_clip": 0.01414114, + "auxiliary_loss_mlp": 0.01040334, + "balance_loss_clip": 1.25197041, + "balance_loss_mlp": 1.02130866, + "epoch": 0.7425822937020893, + "flos": 34180273889280.0, + "grad_norm": 2.2393207188878925, + "language_loss": 0.76136231, + "learning_rate": 6.556335914965343e-07, + "loss": 0.78590685, + "num_input_tokens_seen": 266405990, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19042969, + "step": 12351, + "time_per_iteration": 2.972874641418457 + }, + { + "auxiliary_loss_clip": 0.0141083, + "auxiliary_loss_mlp": 0.01032387, + "balance_loss_clip": 1.24891758, + "balance_loss_mlp": 1.01402879, + "epoch": 0.7426424169547573, + "flos": 21292306237440.0, + "grad_norm": 2.0974388389177725, + "language_loss": 0.82288557, + "learning_rate": 6.553452654553611e-07, + "loss": 0.84731776, + "num_input_tokens_seen": 266424260, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.18347168, + "step": 12352, + "time_per_iteration": 2.878365993499756 + }, + { + "auxiliary_loss_clip": 0.01424678, + "auxiliary_loss_mlp": 0.01042932, + "balance_loss_clip": 1.26173162, + "balance_loss_mlp": 1.02362061, + "epoch": 0.7427025402074252, + "flos": 22456784390400.0, + "grad_norm": 1.8453228719656483, + "language_loss": 0.72051215, + "learning_rate": 6.550569904036307e-07, + "loss": 0.74518824, + "num_input_tokens_seen": 266444580, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.1932373, + "step": 12353, + "time_per_iteration": 2.8827974796295166 + }, + { + "auxiliary_loss_clip": 0.01430082, + "auxiliary_loss_mlp": 0.01042886, + "balance_loss_clip": 1.26807785, + "balance_loss_mlp": 1.02425396, + "epoch": 0.7427626634600932, + "flos": 22533800319360.0, + "grad_norm": 1.7755614579392558, + "language_loss": 0.72725743, + "learning_rate": 6.547687663522739e-07, + "loss": 0.7519871, + "num_input_tokens_seen": 266465640, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18652344, + "step": 12354, + "time_per_iteration": 2.8784947395324707 + }, + { + "auxiliary_loss_clip": 0.01192199, + "auxiliary_loss_mlp": 0.01046777, + "balance_loss_clip": 1.09828961, + "balance_loss_mlp": 1.02179086, + "epoch": 0.7428227867127611, + "flos": 67237088676480.0, + "grad_norm": 0.7022558752732885, + "language_loss": 0.59568858, + "learning_rate": 6.544805933122199e-07, + "loss": 0.61807835, + "num_input_tokens_seen": 266531950, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.25, + "step": 12355, + "time_per_iteration": 4.897478818893433 + }, + { + "auxiliary_loss_clip": 0.01412087, + "auxiliary_loss_mlp": 0.01034138, + "balance_loss_clip": 1.24912906, + "balance_loss_mlp": 1.01598203, + "epoch": 0.7428829099654292, + "flos": 14729572506240.0, + "grad_norm": 1.7611123231224195, + "language_loss": 0.68806618, + "learning_rate": 6.541924712943971e-07, + "loss": 0.71252847, + "num_input_tokens_seen": 266550665, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.18151855, + "step": 12356, + "time_per_iteration": 2.9124960899353027 + }, + { + "auxiliary_loss_clip": 0.01427537, + "auxiliary_loss_mlp": 0.01039373, + "balance_loss_clip": 1.262465, + "balance_loss_mlp": 1.02084804, + "epoch": 0.7429430332180971, + "flos": 48661967543040.0, + "grad_norm": 1.6117739783379834, + "language_loss": 0.7217496, + "learning_rate": 6.539044003097301e-07, + "loss": 0.74641871, + "num_input_tokens_seen": 266572455, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.18530273, + "step": 12357, + "time_per_iteration": 3.084669589996338 + }, + { + "auxiliary_loss_clip": 0.01407128, + "auxiliary_loss_mlp": 0.01034435, + "balance_loss_clip": 1.24932694, + "balance_loss_mlp": 1.01476574, + "epoch": 0.7430031564707651, + "flos": 16772861272320.0, + "grad_norm": 1.9563614403971479, + "language_loss": 0.66162258, + "learning_rate": 6.53616380369143e-07, + "loss": 0.6860382, + "num_input_tokens_seen": 266590895, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.19665527, + "step": 12358, + "time_per_iteration": 2.9129788875579834 + }, + { + "auxiliary_loss_clip": 0.01421337, + "auxiliary_loss_mlp": 0.01035163, + "balance_loss_clip": 1.25559676, + "balance_loss_mlp": 1.01598263, + "epoch": 0.743063279723433, + "flos": 23879077269120.0, + "grad_norm": 1.6697154887376184, + "language_loss": 0.81314027, + "learning_rate": 6.533284114835591e-07, + "loss": 0.83770525, + "num_input_tokens_seen": 266607660, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.19189453, + "step": 12359, + "time_per_iteration": 2.890619993209839 + }, + { + "auxiliary_loss_clip": 0.01413339, + "auxiliary_loss_mlp": 0.01034832, + "balance_loss_clip": 1.25071132, + "balance_loss_mlp": 1.01566327, + "epoch": 0.743123402976101, + "flos": 14399673534720.0, + "grad_norm": 2.0201834929421167, + "language_loss": 0.69287097, + "learning_rate": 6.530404936638956e-07, + "loss": 0.71735269, + "num_input_tokens_seen": 266624260, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19165039, + "step": 12360, + "time_per_iteration": 2.825568199157715 + }, + { + "auxiliary_loss_clip": 0.01399944, + "auxiliary_loss_mlp": 0.01036085, + "balance_loss_clip": 1.23969972, + "balance_loss_mlp": 1.01615334, + "epoch": 0.7431835262287689, + "flos": 27465861029760.0, + "grad_norm": 1.5775027276536586, + "language_loss": 0.73396301, + "learning_rate": 6.527526269210715e-07, + "loss": 0.75832325, + "num_input_tokens_seen": 266644210, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.19934082, + "step": 12361, + "time_per_iteration": 2.921623945236206 + }, + { + "auxiliary_loss_clip": 0.01428053, + "auxiliary_loss_mlp": 0.01039474, + "balance_loss_clip": 1.26282525, + "balance_loss_mlp": 1.02023363, + "epoch": 0.743243649481437, + "flos": 20969284475520.0, + "grad_norm": 2.472894507033422, + "language_loss": 0.56504977, + "learning_rate": 6.524648112660027e-07, + "loss": 0.58972502, + "num_input_tokens_seen": 266664230, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19250488, + "step": 12362, + "time_per_iteration": 4.260235071182251 + }, + { + "auxiliary_loss_clip": 0.01408017, + "auxiliary_loss_mlp": 0.01037569, + "balance_loss_clip": 1.24571538, + "balance_loss_mlp": 1.01843631, + "epoch": 0.7433037727341049, + "flos": 22793243857920.0, + "grad_norm": 2.426340331258866, + "language_loss": 0.78387481, + "learning_rate": 6.521770467096039e-07, + "loss": 0.80833066, + "num_input_tokens_seen": 266683270, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19128418, + "step": 12363, + "time_per_iteration": 2.840024948120117 + }, + { + "auxiliary_loss_clip": 0.01415923, + "auxiliary_loss_mlp": 0.01037619, + "balance_loss_clip": 1.25406098, + "balance_loss_mlp": 1.01784241, + "epoch": 0.7433638959867729, + "flos": 22205937363840.0, + "grad_norm": 3.6329263873769566, + "language_loss": 0.7876094, + "learning_rate": 6.518893332627862e-07, + "loss": 0.81214482, + "num_input_tokens_seen": 266701235, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19775391, + "step": 12364, + "time_per_iteration": 2.8636791706085205 + }, + { + "auxiliary_loss_clip": 0.01414459, + "auxiliary_loss_mlp": 0.01032731, + "balance_loss_clip": 1.25142562, + "balance_loss_mlp": 1.01468241, + "epoch": 0.7434240192394409, + "flos": 23307696944640.0, + "grad_norm": 1.6610116981770697, + "language_loss": 0.78937995, + "learning_rate": 6.516016709364604e-07, + "loss": 0.81385189, + "num_input_tokens_seen": 266721495, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.18054199, + "step": 12365, + "time_per_iteration": 2.9113433361053467 + }, + { + "auxiliary_loss_clip": 0.01424816, + "auxiliary_loss_mlp": 0.01035583, + "balance_loss_clip": 1.25938153, + "balance_loss_mlp": 1.01692653, + "epoch": 0.7434841424921088, + "flos": 54026352748800.0, + "grad_norm": 1.6727700641878358, + "language_loss": 0.7762441, + "learning_rate": 6.513140597415346e-07, + "loss": 0.80084807, + "num_input_tokens_seen": 266747400, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.18652344, + "step": 12366, + "time_per_iteration": 3.1428966522216797 + }, + { + "auxiliary_loss_clip": 0.01407859, + "auxiliary_loss_mlp": 0.01036999, + "balance_loss_clip": 1.24995518, + "balance_loss_mlp": 1.0184381, + "epoch": 0.7435442657447768, + "flos": 21443894876160.0, + "grad_norm": 1.3571170789658358, + "language_loss": 0.71712971, + "learning_rate": 6.510264996889141e-07, + "loss": 0.74157834, + "num_input_tokens_seen": 266767630, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18566895, + "step": 12367, + "time_per_iteration": 4.29633903503418 + }, + { + "auxiliary_loss_clip": 0.01428456, + "auxiliary_loss_mlp": 0.01036011, + "balance_loss_clip": 1.26103115, + "balance_loss_mlp": 1.01669884, + "epoch": 0.7436043889974447, + "flos": 24510298481280.0, + "grad_norm": 2.685428334170787, + "language_loss": 0.74950349, + "learning_rate": 6.507389907895038e-07, + "loss": 0.77414811, + "num_input_tokens_seen": 266788015, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.19311523, + "step": 12368, + "time_per_iteration": 2.8901779651641846 + }, + { + "auxiliary_loss_clip": 0.01416159, + "auxiliary_loss_mlp": 0.01035023, + "balance_loss_clip": 1.25665188, + "balance_loss_mlp": 1.01574659, + "epoch": 0.7436645122501128, + "flos": 40713299769600.0, + "grad_norm": 1.6780653907453773, + "language_loss": 0.70147812, + "learning_rate": 6.50451533054207e-07, + "loss": 0.72599, + "num_input_tokens_seen": 266809010, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19287109, + "step": 12369, + "time_per_iteration": 3.003420352935791 + }, + { + "auxiliary_loss_clip": 0.01423082, + "auxiliary_loss_mlp": 0.0103433, + "balance_loss_clip": 1.25953209, + "balance_loss_mlp": 1.01654387, + "epoch": 0.7437246355027807, + "flos": 18916132343040.0, + "grad_norm": 1.7543962371186423, + "language_loss": 0.7617029, + "learning_rate": 6.501641264939233e-07, + "loss": 0.786277, + "num_input_tokens_seen": 266825390, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.17785645, + "step": 12370, + "time_per_iteration": 2.8291265964508057 + }, + { + "auxiliary_loss_clip": 0.01405053, + "auxiliary_loss_mlp": 0.01042296, + "balance_loss_clip": 1.24549818, + "balance_loss_mlp": 1.02297199, + "epoch": 0.7437847587554487, + "flos": 21553876281600.0, + "grad_norm": 1.4398465071214595, + "language_loss": 0.79356098, + "learning_rate": 6.498767711195503e-07, + "loss": 0.81803453, + "num_input_tokens_seen": 266844675, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.1932373, + "step": 12371, + "time_per_iteration": 2.87307071685791 + }, + { + "auxiliary_loss_clip": 0.014063, + "auxiliary_loss_mlp": 0.01032445, + "balance_loss_clip": 1.24434066, + "balance_loss_mlp": 1.01313329, + "epoch": 0.7438448820081166, + "flos": 27794040698880.0, + "grad_norm": 1.6463991832597595, + "language_loss": 0.7018463, + "learning_rate": 6.495894669419857e-07, + "loss": 0.72623378, + "num_input_tokens_seen": 266865160, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.1932373, + "step": 12372, + "time_per_iteration": 2.986769437789917 + }, + { + "auxiliary_loss_clip": 0.01415131, + "auxiliary_loss_mlp": 0.01034609, + "balance_loss_clip": 1.25368142, + "balance_loss_mlp": 1.01504683, + "epoch": 0.7439050052607846, + "flos": 17976865397760.0, + "grad_norm": 16.3557695350612, + "language_loss": 0.7599203, + "learning_rate": 6.493022139721245e-07, + "loss": 0.78441775, + "num_input_tokens_seen": 266883285, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19567871, + "step": 12373, + "time_per_iteration": 2.858896255493164 + }, + { + "auxiliary_loss_clip": 0.01418629, + "auxiliary_loss_mlp": 0.01035444, + "balance_loss_clip": 1.25416684, + "balance_loss_mlp": 1.01561952, + "epoch": 0.7439651285134525, + "flos": 22967029710720.0, + "grad_norm": 2.7296032138113984, + "language_loss": 0.77884924, + "learning_rate": 6.49015012220858e-07, + "loss": 0.80339003, + "num_input_tokens_seen": 266900960, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19824219, + "step": 12374, + "time_per_iteration": 2.848813056945801 + }, + { + "auxiliary_loss_clip": 0.0141966, + "auxiliary_loss_mlp": 0.01029933, + "balance_loss_clip": 1.25618505, + "balance_loss_mlp": 1.01116991, + "epoch": 0.7440252517661206, + "flos": 18815969059200.0, + "grad_norm": 2.027007177767614, + "language_loss": 0.77115446, + "learning_rate": 6.487278616990774e-07, + "loss": 0.79565042, + "num_input_tokens_seen": 266917710, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.1875, + "step": 12375, + "time_per_iteration": 2.864499568939209 + }, + { + "auxiliary_loss_clip": 0.01406757, + "auxiliary_loss_mlp": 0.01029424, + "balance_loss_clip": 1.24755228, + "balance_loss_mlp": 1.01187599, + "epoch": 0.7440853750187885, + "flos": 20275887628800.0, + "grad_norm": 1.9214664695588923, + "language_loss": 0.77738023, + "learning_rate": 6.484407624176733e-07, + "loss": 0.80174202, + "num_input_tokens_seen": 266934220, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.17541504, + "step": 12376, + "time_per_iteration": 2.852778434753418 + }, + { + "auxiliary_loss_clip": 0.01414107, + "auxiliary_loss_mlp": 0.01030472, + "balance_loss_clip": 1.25209737, + "balance_loss_mlp": 1.01187539, + "epoch": 0.7441454982714565, + "flos": 25348044798720.0, + "grad_norm": 1.702528103193998, + "language_loss": 0.80324715, + "learning_rate": 6.481537143875296e-07, + "loss": 0.82769299, + "num_input_tokens_seen": 266955210, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18603516, + "step": 12377, + "time_per_iteration": 2.88028883934021 + }, + { + "auxiliary_loss_clip": 0.01409442, + "auxiliary_loss_mlp": 0.01030989, + "balance_loss_clip": 1.24575043, + "balance_loss_mlp": 1.01202238, + "epoch": 0.7442056215241245, + "flos": 64500385080960.0, + "grad_norm": 1.9033425628688072, + "language_loss": 0.67023903, + "learning_rate": 6.478667176195322e-07, + "loss": 0.69464338, + "num_input_tokens_seen": 266976555, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.18969727, + "step": 12378, + "time_per_iteration": 3.258007287979126 + }, + { + "auxiliary_loss_clip": 0.01413974, + "auxiliary_loss_mlp": 0.01031986, + "balance_loss_clip": 1.25086832, + "balance_loss_mlp": 1.01255453, + "epoch": 0.7442657447767924, + "flos": 31297926706560.0, + "grad_norm": 1.706474175499525, + "language_loss": 0.72443175, + "learning_rate": 6.475797721245648e-07, + "loss": 0.74889135, + "num_input_tokens_seen": 266997640, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19433594, + "step": 12379, + "time_per_iteration": 3.0003767013549805 + }, + { + "auxiliary_loss_clip": 0.01411358, + "auxiliary_loss_mlp": 0.01041926, + "balance_loss_clip": 1.24922895, + "balance_loss_mlp": 1.02229261, + "epoch": 0.7443258680294604, + "flos": 20816157513600.0, + "grad_norm": 2.105034497859544, + "language_loss": 0.65577471, + "learning_rate": 6.472928779135085e-07, + "loss": 0.68030757, + "num_input_tokens_seen": 267016165, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19641113, + "step": 12380, + "time_per_iteration": 2.84525465965271 + }, + { + "auxiliary_loss_clip": 0.01425183, + "auxiliary_loss_mlp": 0.01032968, + "balance_loss_clip": 1.26089191, + "balance_loss_mlp": 1.01407337, + "epoch": 0.7443859912821283, + "flos": 22209873661440.0, + "grad_norm": 2.119726839706107, + "language_loss": 0.79809523, + "learning_rate": 6.470060349972411e-07, + "loss": 0.82267672, + "num_input_tokens_seen": 267034075, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.18884277, + "step": 12381, + "time_per_iteration": 2.849461317062378 + }, + { + "auxiliary_loss_clip": 0.01419912, + "auxiliary_loss_mlp": 0.01035147, + "balance_loss_clip": 1.25469899, + "balance_loss_mlp": 1.01584673, + "epoch": 0.7444461145347964, + "flos": 22027943744640.0, + "grad_norm": 1.8886249265056847, + "language_loss": 0.73398781, + "learning_rate": 6.467192433866411e-07, + "loss": 0.75853837, + "num_input_tokens_seen": 267053645, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.1932373, + "step": 12382, + "time_per_iteration": 2.877836227416992 + }, + { + "auxiliary_loss_clip": 0.01187381, + "auxiliary_loss_mlp": 0.01023044, + "balance_loss_clip": 1.09712005, + "balance_loss_mlp": 1.00034618, + "epoch": 0.7445062377874643, + "flos": 70595150117760.0, + "grad_norm": 0.6525247743979238, + "language_loss": 0.5463044, + "learning_rate": 6.464325030925831e-07, + "loss": 0.56840861, + "num_input_tokens_seen": 267121830, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.2265625, + "step": 12383, + "time_per_iteration": 3.506392240524292 + }, + { + "auxiliary_loss_clip": 0.01405987, + "auxiliary_loss_mlp": 0.01031124, + "balance_loss_clip": 1.24368739, + "balance_loss_mlp": 1.01178885, + "epoch": 0.7445663610401323, + "flos": 22174736434560.0, + "grad_norm": 1.9919169845610232, + "language_loss": 0.77191985, + "learning_rate": 6.461458141259395e-07, + "loss": 0.79629099, + "num_input_tokens_seen": 267141145, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19311523, + "step": 12384, + "time_per_iteration": 2.891589403152466 + }, + { + "auxiliary_loss_clip": 0.01409938, + "auxiliary_loss_mlp": 0.01031847, + "balance_loss_clip": 1.24840021, + "balance_loss_mlp": 1.01280951, + "epoch": 0.7446264842928002, + "flos": 24180851957760.0, + "grad_norm": 2.561943740346893, + "language_loss": 0.7986424, + "learning_rate": 6.458591764975823e-07, + "loss": 0.82306015, + "num_input_tokens_seen": 267159280, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19030762, + "step": 12385, + "time_per_iteration": 2.8676953315734863 + }, + { + "auxiliary_loss_clip": 0.01434053, + "auxiliary_loss_mlp": 0.0103638, + "balance_loss_clip": 1.26795053, + "balance_loss_mlp": 1.01677048, + "epoch": 0.7446866075454682, + "flos": 24145352772480.0, + "grad_norm": 1.5433496581272748, + "language_loss": 0.82270491, + "learning_rate": 6.455725902183813e-07, + "loss": 0.84740919, + "num_input_tokens_seen": 267179390, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19604492, + "step": 12386, + "time_per_iteration": 2.8980581760406494 + }, + { + "auxiliary_loss_clip": 0.01402242, + "auxiliary_loss_mlp": 0.01035311, + "balance_loss_clip": 1.2436223, + "balance_loss_mlp": 1.01651204, + "epoch": 0.7447467307981361, + "flos": 23558001033600.0, + "grad_norm": 1.640762722585074, + "language_loss": 0.71532476, + "learning_rate": 6.452860552992037e-07, + "loss": 0.73970032, + "num_input_tokens_seen": 267198165, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18798828, + "step": 12387, + "time_per_iteration": 2.8678295612335205 + }, + { + "auxiliary_loss_clip": 0.01409945, + "auxiliary_loss_mlp": 0.01034063, + "balance_loss_clip": 1.24831963, + "balance_loss_mlp": 1.01507282, + "epoch": 0.7448068540508042, + "flos": 19575885041280.0, + "grad_norm": 1.9846426790703335, + "language_loss": 0.71165502, + "learning_rate": 6.449995717509138e-07, + "loss": 0.73609519, + "num_input_tokens_seen": 267214520, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18994141, + "step": 12388, + "time_per_iteration": 2.825624942779541 + }, + { + "auxiliary_loss_clip": 0.01422212, + "auxiliary_loss_mlp": 0.01034601, + "balance_loss_clip": 1.25961232, + "balance_loss_mlp": 1.01629019, + "epoch": 0.7448669773034721, + "flos": 21850900266240.0, + "grad_norm": 1.5568825969977613, + "language_loss": 0.85709751, + "learning_rate": 6.447131395843761e-07, + "loss": 0.88166559, + "num_input_tokens_seen": 267236555, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.1829834, + "step": 12389, + "time_per_iteration": 2.900312662124634 + }, + { + "auxiliary_loss_clip": 0.01422447, + "auxiliary_loss_mlp": 0.01036242, + "balance_loss_clip": 1.25814331, + "balance_loss_mlp": 1.01801538, + "epoch": 0.7449271005561401, + "flos": 25166250616320.0, + "grad_norm": 1.848644425089653, + "language_loss": 0.79472899, + "learning_rate": 6.444267588104526e-07, + "loss": 0.81931585, + "num_input_tokens_seen": 267254800, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.18200684, + "step": 12390, + "time_per_iteration": 4.25848126411438 + }, + { + "auxiliary_loss_clip": 0.01423208, + "auxiliary_loss_mlp": 0.01031613, + "balance_loss_clip": 1.25896335, + "balance_loss_mlp": 1.01188397, + "epoch": 0.7449872238088081, + "flos": 22283541475200.0, + "grad_norm": 1.8842490975296704, + "language_loss": 0.85842049, + "learning_rate": 6.441404294400014e-07, + "loss": 0.88296866, + "num_input_tokens_seen": 267274610, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19702148, + "step": 12391, + "time_per_iteration": 2.843517303466797 + }, + { + "auxiliary_loss_clip": 0.01413806, + "auxiliary_loss_mlp": 0.01033592, + "balance_loss_clip": 1.25145555, + "balance_loss_mlp": 1.01540041, + "epoch": 0.745047347061476, + "flos": 20604519745920.0, + "grad_norm": 1.739986805176527, + "language_loss": 0.74665713, + "learning_rate": 6.438541514838811e-07, + "loss": 0.77113104, + "num_input_tokens_seen": 267292600, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.18188477, + "step": 12392, + "time_per_iteration": 2.8697316646575928 + }, + { + "auxiliary_loss_clip": 0.01393598, + "auxiliary_loss_mlp": 0.0103669, + "balance_loss_clip": 1.23508334, + "balance_loss_mlp": 1.01719952, + "epoch": 0.745107470314144, + "flos": 22137744170880.0, + "grad_norm": 1.7308083067822082, + "language_loss": 0.77594614, + "learning_rate": 6.435679249529487e-07, + "loss": 0.80024904, + "num_input_tokens_seen": 267311295, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.19506836, + "step": 12393, + "time_per_iteration": 2.8552327156066895 + }, + { + "auxiliary_loss_clip": 0.01408701, + "auxiliary_loss_mlp": 0.01034685, + "balance_loss_clip": 1.24812794, + "balance_loss_mlp": 1.01476526, + "epoch": 0.745167593566812, + "flos": 22246594456320.0, + "grad_norm": 1.7856018795121198, + "language_loss": 0.7294488, + "learning_rate": 6.432817498580552e-07, + "loss": 0.75388265, + "num_input_tokens_seen": 267328390, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19921875, + "step": 12394, + "time_per_iteration": 2.878673791885376 + }, + { + "auxiliary_loss_clip": 0.01415026, + "auxiliary_loss_mlp": 0.01036784, + "balance_loss_clip": 1.25303447, + "balance_loss_mlp": 1.01653039, + "epoch": 0.74522771681948, + "flos": 20675563361280.0, + "grad_norm": 1.7321572282548086, + "language_loss": 0.82443249, + "learning_rate": 6.429956262100535e-07, + "loss": 0.84895062, + "num_input_tokens_seen": 267348185, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.20251465, + "step": 12395, + "time_per_iteration": 2.862405776977539 + }, + { + "auxiliary_loss_clip": 0.01424372, + "auxiliary_loss_mlp": 0.0104087, + "balance_loss_clip": 1.25900412, + "balance_loss_mlp": 1.02134299, + "epoch": 0.7452878400721479, + "flos": 21117208285440.0, + "grad_norm": 2.0151656426585096, + "language_loss": 0.71648419, + "learning_rate": 6.427095540197937e-07, + "loss": 0.74113655, + "num_input_tokens_seen": 267367010, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19519043, + "step": 12396, + "time_per_iteration": 2.862217426300049 + }, + { + "auxiliary_loss_clip": 0.0142175, + "auxiliary_loss_mlp": 0.01036111, + "balance_loss_clip": 1.25728858, + "balance_loss_mlp": 1.01811051, + "epoch": 0.7453479633248159, + "flos": 26699384551680.0, + "grad_norm": 21.648990472832654, + "language_loss": 0.6931594, + "learning_rate": 6.424235332981245e-07, + "loss": 0.71773803, + "num_input_tokens_seen": 267386605, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.18005371, + "step": 12397, + "time_per_iteration": 4.316742420196533 + }, + { + "auxiliary_loss_clip": 0.01400273, + "auxiliary_loss_mlp": 0.01038909, + "balance_loss_clip": 1.2407918, + "balance_loss_mlp": 1.02043152, + "epoch": 0.7454080865774838, + "flos": 17024025012480.0, + "grad_norm": 1.8599539699797933, + "language_loss": 0.77836478, + "learning_rate": 6.421375640558908e-07, + "loss": 0.80275655, + "num_input_tokens_seen": 267404135, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.18469238, + "step": 12398, + "time_per_iteration": 2.8079679012298584 + }, + { + "auxiliary_loss_clip": 0.01401558, + "auxiliary_loss_mlp": 0.01031673, + "balance_loss_clip": 1.24294901, + "balance_loss_mlp": 1.0133028, + "epoch": 0.7454682098301518, + "flos": 21333642001920.0, + "grad_norm": 2.137313638042295, + "language_loss": 0.7874065, + "learning_rate": 6.418516463039363e-07, + "loss": 0.81173879, + "num_input_tokens_seen": 267423120, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18359375, + "step": 12399, + "time_per_iteration": 2.827789306640625 + }, + { + "auxiliary_loss_clip": 0.01400515, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.24381447, + "balance_loss_mlp": 1.01849365, + "epoch": 0.7455283330828197, + "flos": 17867336440320.0, + "grad_norm": 1.8894970603406782, + "language_loss": 0.75089407, + "learning_rate": 6.415657800531038e-07, + "loss": 0.77526915, + "num_input_tokens_seen": 267441250, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.18505859, + "step": 12400, + "time_per_iteration": 2.833164930343628 + }, + { + "auxiliary_loss_clip": 0.01413932, + "auxiliary_loss_mlp": 0.01033835, + "balance_loss_clip": 1.25294542, + "balance_loss_mlp": 1.01567924, + "epoch": 0.7455884563354878, + "flos": 30786595511040.0, + "grad_norm": 2.1793405617033264, + "language_loss": 0.83034569, + "learning_rate": 6.412799653142327e-07, + "loss": 0.85482335, + "num_input_tokens_seen": 267462820, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.1817627, + "step": 12401, + "time_per_iteration": 2.9080002307891846 + }, + { + "auxiliary_loss_clip": 0.01417335, + "auxiliary_loss_mlp": 0.01037564, + "balance_loss_clip": 1.25496268, + "balance_loss_mlp": 1.01888359, + "epoch": 0.7456485795881557, + "flos": 23195996236800.0, + "grad_norm": 2.1770639125542415, + "language_loss": 0.65878344, + "learning_rate": 6.409942020981611e-07, + "loss": 0.68333244, + "num_input_tokens_seen": 267483065, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18688965, + "step": 12402, + "time_per_iteration": 5.671504974365234 + }, + { + "auxiliary_loss_clip": 0.01397322, + "auxiliary_loss_mlp": 0.01036035, + "balance_loss_clip": 1.23875809, + "balance_loss_mlp": 1.01832032, + "epoch": 0.7457087028408237, + "flos": 38741280842880.0, + "grad_norm": 1.828119628898687, + "language_loss": 0.73615944, + "learning_rate": 6.407084904157265e-07, + "loss": 0.76049304, + "num_input_tokens_seen": 267504825, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.17712402, + "step": 12403, + "time_per_iteration": 2.9931159019470215 + }, + { + "auxiliary_loss_clip": 0.0118575, + "auxiliary_loss_mlp": 0.0101928, + "balance_loss_clip": 1.09561729, + "balance_loss_mlp": 0.99848998, + "epoch": 0.7457688260934917, + "flos": 56068166419200.0, + "grad_norm": 0.8300283445998068, + "language_loss": 0.58841026, + "learning_rate": 6.404228302777621e-07, + "loss": 0.61046058, + "num_input_tokens_seen": 267559260, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.20800781, + "step": 12404, + "time_per_iteration": 3.1960744857788086 + }, + { + "auxiliary_loss_clip": 0.01414699, + "auxiliary_loss_mlp": 0.01036429, + "balance_loss_clip": 1.2522999, + "balance_loss_mlp": 1.01834524, + "epoch": 0.7458289493461596, + "flos": 20124615703680.0, + "grad_norm": 2.1684712575019, + "language_loss": 0.78246772, + "learning_rate": 6.401372216950995e-07, + "loss": 0.80697906, + "num_input_tokens_seen": 267578720, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.18103027, + "step": 12405, + "time_per_iteration": 2.8594865798950195 + }, + { + "auxiliary_loss_clip": 0.01403994, + "auxiliary_loss_mlp": 0.01035854, + "balance_loss_clip": 1.24613237, + "balance_loss_mlp": 1.01819932, + "epoch": 0.7458890725988276, + "flos": 20202581773440.0, + "grad_norm": 1.632367130969013, + "language_loss": 0.69074482, + "learning_rate": 6.398516646785698e-07, + "loss": 0.71514332, + "num_input_tokens_seen": 267598250, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.1763916, + "step": 12406, + "time_per_iteration": 2.850078582763672 + }, + { + "auxiliary_loss_clip": 0.0143367, + "auxiliary_loss_mlp": 0.01037783, + "balance_loss_clip": 1.26566553, + "balance_loss_mlp": 1.01848328, + "epoch": 0.7459491958514956, + "flos": 17027418372480.0, + "grad_norm": 1.736225851077095, + "language_loss": 0.65576166, + "learning_rate": 6.39566159239002e-07, + "loss": 0.68047619, + "num_input_tokens_seen": 267615430, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.19299316, + "step": 12407, + "time_per_iteration": 2.823737144470215 + }, + { + "auxiliary_loss_clip": 0.01421171, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.25609052, + "balance_loss_mlp": 1.01739657, + "epoch": 0.7460093191041636, + "flos": 25088601260160.0, + "grad_norm": 1.6771873185307231, + "language_loss": 0.72925317, + "learning_rate": 6.392807053872212e-07, + "loss": 0.75381577, + "num_input_tokens_seen": 267635075, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.17687988, + "step": 12408, + "time_per_iteration": 2.857102870941162 + }, + { + "auxiliary_loss_clip": 0.01428728, + "auxiliary_loss_mlp": 0.01031845, + "balance_loss_clip": 1.26149833, + "balance_loss_mlp": 1.01316535, + "epoch": 0.7460694423568315, + "flos": 21918641011200.0, + "grad_norm": 2.8389834353835957, + "language_loss": 0.73848861, + "learning_rate": 6.38995303134053e-07, + "loss": 0.76309437, + "num_input_tokens_seen": 267654105, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.18688965, + "step": 12409, + "time_per_iteration": 2.812295436859131 + }, + { + "auxiliary_loss_clip": 0.01388776, + "auxiliary_loss_mlp": 0.01035016, + "balance_loss_clip": 1.23271835, + "balance_loss_mlp": 1.01707554, + "epoch": 0.7461295656094995, + "flos": 21225787102080.0, + "grad_norm": 1.7243309574980383, + "language_loss": 0.66266942, + "learning_rate": 6.38709952490319e-07, + "loss": 0.68690729, + "num_input_tokens_seen": 267673090, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.17944336, + "step": 12410, + "time_per_iteration": 2.854914665222168 + }, + { + "auxiliary_loss_clip": 0.01400741, + "auxiliary_loss_mlp": 0.0103285, + "balance_loss_clip": 1.24225402, + "balance_loss_mlp": 1.01488519, + "epoch": 0.7461896888621674, + "flos": 22357209288960.0, + "grad_norm": 1.9873608715473716, + "language_loss": 0.85184252, + "learning_rate": 6.384246534668396e-07, + "loss": 0.8761785, + "num_input_tokens_seen": 267690605, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.17980957, + "step": 12411, + "time_per_iteration": 2.8333091735839844 + }, + { + "auxiliary_loss_clip": 0.01422786, + "auxiliary_loss_mlp": 0.01034111, + "balance_loss_clip": 1.26006007, + "balance_loss_mlp": 1.01565766, + "epoch": 0.7462498121148354, + "flos": 25493661123840.0, + "grad_norm": 1.5098641832571382, + "language_loss": 0.78844297, + "learning_rate": 6.381394060744339e-07, + "loss": 0.81301188, + "num_input_tokens_seen": 267710540, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18457031, + "step": 12412, + "time_per_iteration": 2.885653495788574 + }, + { + "auxiliary_loss_clip": 0.01416886, + "auxiliary_loss_mlp": 0.0103777, + "balance_loss_clip": 1.25365114, + "balance_loss_mlp": 1.01895916, + "epoch": 0.7463099353675033, + "flos": 33960944505600.0, + "grad_norm": 1.9682233779148723, + "language_loss": 0.63739884, + "learning_rate": 6.378542103239188e-07, + "loss": 0.66194546, + "num_input_tokens_seen": 267730780, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.18823242, + "step": 12413, + "time_per_iteration": 2.981565237045288 + }, + { + "auxiliary_loss_clip": 0.0118709, + "auxiliary_loss_mlp": 0.01050313, + "balance_loss_clip": 1.09891713, + "balance_loss_mlp": 1.03095293, + "epoch": 0.7463700586201714, + "flos": 62796650411520.0, + "grad_norm": 0.7319914071522214, + "language_loss": 0.54885775, + "learning_rate": 6.375690662261082e-07, + "loss": 0.57123178, + "num_input_tokens_seen": 267794240, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.19335938, + "step": 12414, + "time_per_iteration": 3.376025438308716 + }, + { + "auxiliary_loss_clip": 0.01414079, + "auxiliary_loss_mlp": 0.01037885, + "balance_loss_clip": 1.25108087, + "balance_loss_mlp": 1.01848948, + "epoch": 0.7464301818728393, + "flos": 33444364913280.0, + "grad_norm": 2.2259934739346146, + "language_loss": 0.55266225, + "learning_rate": 6.372839737918154e-07, + "loss": 0.57718194, + "num_input_tokens_seen": 267817190, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19396973, + "step": 12415, + "time_per_iteration": 3.049652099609375 + }, + { + "auxiliary_loss_clip": 0.01411719, + "auxiliary_loss_mlp": 0.01030955, + "balance_loss_clip": 1.25036025, + "balance_loss_mlp": 1.01234663, + "epoch": 0.7464903051255073, + "flos": 26881088244480.0, + "grad_norm": 1.7984070905232146, + "language_loss": 0.75314361, + "learning_rate": 6.369989330318506e-07, + "loss": 0.77757037, + "num_input_tokens_seen": 267836245, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18615723, + "step": 12416, + "time_per_iteration": 2.920586585998535 + }, + { + "auxiliary_loss_clip": 0.01413942, + "auxiliary_loss_mlp": 0.01037114, + "balance_loss_clip": 1.25119388, + "balance_loss_mlp": 1.01787412, + "epoch": 0.7465504283781753, + "flos": 44101548771840.0, + "grad_norm": 1.5758525380809827, + "language_loss": 0.6967513, + "learning_rate": 6.367139439570233e-07, + "loss": 0.72126186, + "num_input_tokens_seen": 267858310, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19238281, + "step": 12417, + "time_per_iteration": 3.02443528175354 + }, + { + "auxiliary_loss_clip": 0.01437034, + "auxiliary_loss_mlp": 0.0103736, + "balance_loss_clip": 1.27149677, + "balance_loss_mlp": 1.0179528, + "epoch": 0.7466105516308432, + "flos": 19683875675520.0, + "grad_norm": 1.9309556353316306, + "language_loss": 0.74573654, + "learning_rate": 6.364290065781392e-07, + "loss": 0.77048051, + "num_input_tokens_seen": 267876345, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.19396973, + "step": 12418, + "time_per_iteration": 2.857792854309082 + }, + { + "auxiliary_loss_clip": 0.01410325, + "auxiliary_loss_mlp": 0.0103053, + "balance_loss_clip": 1.24816966, + "balance_loss_mlp": 1.01196885, + "epoch": 0.7466706748835112, + "flos": 20530128015360.0, + "grad_norm": 1.764446461653002, + "language_loss": 0.70307153, + "learning_rate": 6.361441209060039e-07, + "loss": 0.72748005, + "num_input_tokens_seen": 267896740, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18554688, + "step": 12419, + "time_per_iteration": 2.9096386432647705 + }, + { + "auxiliary_loss_clip": 0.01402567, + "auxiliary_loss_mlp": 0.01032051, + "balance_loss_clip": 1.24591088, + "balance_loss_mlp": 1.01327562, + "epoch": 0.7467307981361792, + "flos": 21700487992320.0, + "grad_norm": 2.2235562490390572, + "language_loss": 0.75670457, + "learning_rate": 6.358592869514216e-07, + "loss": 0.7810508, + "num_input_tokens_seen": 267914765, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.18786621, + "step": 12420, + "time_per_iteration": 2.8544178009033203 + }, + { + "auxiliary_loss_clip": 0.01420522, + "auxiliary_loss_mlp": 0.01032393, + "balance_loss_clip": 1.25684428, + "balance_loss_mlp": 1.0144403, + "epoch": 0.7467909213888472, + "flos": 19583259943680.0, + "grad_norm": 1.853853336872252, + "language_loss": 0.68170172, + "learning_rate": 6.355745047251904e-07, + "loss": 0.70623082, + "num_input_tokens_seen": 267934085, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.17944336, + "step": 12421, + "time_per_iteration": 2.9223642349243164 + }, + { + "auxiliary_loss_clip": 0.0141924, + "auxiliary_loss_mlp": 0.01033284, + "balance_loss_clip": 1.25375295, + "balance_loss_mlp": 1.01397181, + "epoch": 0.7468510446415151, + "flos": 23705517640320.0, + "grad_norm": 7.435867422069504, + "language_loss": 0.73099977, + "learning_rate": 6.352897742381107e-07, + "loss": 0.75552499, + "num_input_tokens_seen": 267955170, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.19311523, + "step": 12422, + "time_per_iteration": 2.9388985633850098 + }, + { + "auxiliary_loss_clip": 0.01404551, + "auxiliary_loss_mlp": 0.01032897, + "balance_loss_clip": 1.24514616, + "balance_loss_mlp": 1.01404977, + "epoch": 0.7469111678941831, + "flos": 29327989040640.0, + "grad_norm": 1.855367404192207, + "language_loss": 0.7547822, + "learning_rate": 6.350050955009796e-07, + "loss": 0.77915668, + "num_input_tokens_seen": 267974980, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18847656, + "step": 12423, + "time_per_iteration": 2.9306602478027344 + }, + { + "auxiliary_loss_clip": 0.01399715, + "auxiliary_loss_mlp": 0.01030787, + "balance_loss_clip": 1.24092412, + "balance_loss_mlp": 1.012012, + "epoch": 0.746971291146851, + "flos": 21808614360960.0, + "grad_norm": 1.2706046628813827, + "language_loss": 0.68110698, + "learning_rate": 6.347204685245929e-07, + "loss": 0.70541203, + "num_input_tokens_seen": 267994985, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18774414, + "step": 12424, + "time_per_iteration": 2.9095659255981445 + }, + { + "auxiliary_loss_clip": 0.01413785, + "auxiliary_loss_mlp": 0.01038851, + "balance_loss_clip": 1.24972582, + "balance_loss_mlp": 1.0190624, + "epoch": 0.747031414399519, + "flos": 36258292679040.0, + "grad_norm": 1.9628583166154159, + "language_loss": 0.7546314, + "learning_rate": 6.344358933197418e-07, + "loss": 0.77915776, + "num_input_tokens_seen": 268014985, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19799805, + "step": 12425, + "time_per_iteration": 4.485883712768555 + }, + { + "auxiliary_loss_clip": 0.01420271, + "auxiliary_loss_mlp": 0.01034299, + "balance_loss_clip": 1.25596809, + "balance_loss_mlp": 1.01546431, + "epoch": 0.7470915376521869, + "flos": 19984564488960.0, + "grad_norm": 3.1666902296725707, + "language_loss": 0.70832789, + "learning_rate": 6.341513698972194e-07, + "loss": 0.73287356, + "num_input_tokens_seen": 268034395, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.18847656, + "step": 12426, + "time_per_iteration": 2.849729537963867 + }, + { + "auxiliary_loss_clip": 0.01406485, + "auxiliary_loss_mlp": 0.01034815, + "balance_loss_clip": 1.24765337, + "balance_loss_mlp": 1.01655173, + "epoch": 0.747151660904855, + "flos": 20093957712000.0, + "grad_norm": 1.5438295780642828, + "language_loss": 0.65624946, + "learning_rate": 6.338668982678139e-07, + "loss": 0.68066251, + "num_input_tokens_seen": 268054485, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18249512, + "step": 12427, + "time_per_iteration": 2.8377249240875244 + }, + { + "auxiliary_loss_clip": 0.01417989, + "auxiliary_loss_mlp": 0.01030708, + "balance_loss_clip": 1.25456917, + "balance_loss_mlp": 1.01161098, + "epoch": 0.7472117841575229, + "flos": 16299201012480.0, + "grad_norm": 1.6622124188583554, + "language_loss": 0.75165904, + "learning_rate": 6.335824784423118e-07, + "loss": 0.77614599, + "num_input_tokens_seen": 268072250, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19091797, + "step": 12428, + "time_per_iteration": 2.8006911277770996 + }, + { + "auxiliary_loss_clip": 0.01425474, + "auxiliary_loss_mlp": 0.0103416, + "balance_loss_clip": 1.25749207, + "balance_loss_mlp": 1.01459742, + "epoch": 0.7472719074101909, + "flos": 21397898897280.0, + "grad_norm": 1.9599299798523406, + "language_loss": 0.58765745, + "learning_rate": 6.33298110431499e-07, + "loss": 0.61225373, + "num_input_tokens_seen": 268089840, + "router_z_loss_clip": 1.6796875, + "router_z_loss_mlp": 0.19543457, + "step": 12429, + "time_per_iteration": 2.8095595836639404 + }, + { + "auxiliary_loss_clip": 0.0142623, + "auxiliary_loss_mlp": 0.01036347, + "balance_loss_clip": 1.26104736, + "balance_loss_mlp": 1.01763082, + "epoch": 0.7473320306628589, + "flos": 29655354303360.0, + "grad_norm": 1.8103043213659085, + "language_loss": 0.62075961, + "learning_rate": 6.330137942461595e-07, + "loss": 0.64538538, + "num_input_tokens_seen": 268109360, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.18737793, + "step": 12430, + "time_per_iteration": 2.9137394428253174 + }, + { + "auxiliary_loss_clip": 0.01401762, + "auxiliary_loss_mlp": 0.01033666, + "balance_loss_clip": 1.24218225, + "balance_loss_mlp": 1.0141629, + "epoch": 0.7473921539155268, + "flos": 24147298298880.0, + "grad_norm": 1.4739379564644057, + "language_loss": 0.76552403, + "learning_rate": 6.327295298970734e-07, + "loss": 0.78987825, + "num_input_tokens_seen": 268131840, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19494629, + "step": 12431, + "time_per_iteration": 2.881517171859741 + }, + { + "auxiliary_loss_clip": 0.01412408, + "auxiliary_loss_mlp": 0.01034866, + "balance_loss_clip": 1.24947333, + "balance_loss_mlp": 1.01704478, + "epoch": 0.7474522771681948, + "flos": 17495423032320.0, + "grad_norm": 1.6993995715478654, + "language_loss": 0.76391143, + "learning_rate": 6.32445317395021e-07, + "loss": 0.7883842, + "num_input_tokens_seen": 268148300, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.17834473, + "step": 12432, + "time_per_iteration": 4.340936899185181 + }, + { + "auxiliary_loss_clip": 0.01423771, + "auxiliary_loss_mlp": 0.01032662, + "balance_loss_clip": 1.25619495, + "balance_loss_mlp": 1.01269436, + "epoch": 0.7475124004208628, + "flos": 16736004743040.0, + "grad_norm": 2.136116144373634, + "language_loss": 0.70875347, + "learning_rate": 6.321611567507787e-07, + "loss": 0.73331785, + "num_input_tokens_seen": 268166450, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.1998291, + "step": 12433, + "time_per_iteration": 2.7950756549835205 + }, + { + "auxiliary_loss_clip": 0.01419201, + "auxiliary_loss_mlp": 0.01030465, + "balance_loss_clip": 1.25547147, + "balance_loss_mlp": 1.01097405, + "epoch": 0.7475725236735308, + "flos": 19730007388800.0, + "grad_norm": 1.9679652025593597, + "language_loss": 0.68080568, + "learning_rate": 6.318770479751232e-07, + "loss": 0.70530236, + "num_input_tokens_seen": 268186165, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19506836, + "step": 12434, + "time_per_iteration": 2.879580020904541 + }, + { + "auxiliary_loss_clip": 0.01390043, + "auxiliary_loss_mlp": 0.01032289, + "balance_loss_clip": 1.23540735, + "balance_loss_mlp": 1.01400208, + "epoch": 0.7476326469261987, + "flos": 26297039376000.0, + "grad_norm": 1.4300783671726351, + "language_loss": 0.80337906, + "learning_rate": 6.315929910788263e-07, + "loss": 0.82760239, + "num_input_tokens_seen": 268208145, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.18286133, + "step": 12435, + "time_per_iteration": 2.8806560039520264 + }, + { + "auxiliary_loss_clip": 0.0141638, + "auxiliary_loss_mlp": 0.01029075, + "balance_loss_clip": 1.25047469, + "balance_loss_mlp": 1.01007354, + "epoch": 0.7476927701788667, + "flos": 31844304639360.0, + "grad_norm": 2.0165663518604773, + "language_loss": 0.68123138, + "learning_rate": 6.313089860726604e-07, + "loss": 0.70568597, + "num_input_tokens_seen": 268228345, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.19006348, + "step": 12436, + "time_per_iteration": 2.9192166328430176 + }, + { + "auxiliary_loss_clip": 0.01423814, + "auxiliary_loss_mlp": 0.01035046, + "balance_loss_clip": 1.25691223, + "balance_loss_mlp": 1.01623452, + "epoch": 0.7477528934315346, + "flos": 31807991047680.0, + "grad_norm": 1.7405568607138195, + "language_loss": 0.71402609, + "learning_rate": 6.31025032967396e-07, + "loss": 0.73861462, + "num_input_tokens_seen": 268250260, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.18823242, + "step": 12437, + "time_per_iteration": 5.654440879821777 + }, + { + "auxiliary_loss_clip": 0.01394719, + "auxiliary_loss_mlp": 0.01033309, + "balance_loss_clip": 1.23681843, + "balance_loss_mlp": 1.01537991, + "epoch": 0.7478130166842026, + "flos": 20380620637440.0, + "grad_norm": 9.056087163221031, + "language_loss": 0.68013227, + "learning_rate": 6.307411317737986e-07, + "loss": 0.70441258, + "num_input_tokens_seen": 268268440, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.17932129, + "step": 12438, + "time_per_iteration": 2.8613014221191406 + }, + { + "auxiliary_loss_clip": 0.01407596, + "auxiliary_loss_mlp": 0.01037099, + "balance_loss_clip": 1.24610472, + "balance_loss_mlp": 1.01769221, + "epoch": 0.7478731399368705, + "flos": 18157664194560.0, + "grad_norm": 1.8752179293142412, + "language_loss": 0.81450802, + "learning_rate": 6.304572825026344e-07, + "loss": 0.83895499, + "num_input_tokens_seen": 268285765, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19396973, + "step": 12439, + "time_per_iteration": 2.8371241092681885 + }, + { + "auxiliary_loss_clip": 0.01405286, + "auxiliary_loss_mlp": 0.01033701, + "balance_loss_clip": 1.24535048, + "balance_loss_mlp": 1.01540196, + "epoch": 0.7479332631895386, + "flos": 15276221907840.0, + "grad_norm": 1.9973296407947154, + "language_loss": 0.72161031, + "learning_rate": 6.301734851646674e-07, + "loss": 0.74600017, + "num_input_tokens_seen": 268304015, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18286133, + "step": 12440, + "time_per_iteration": 2.8294785022735596 + }, + { + "auxiliary_loss_clip": 0.01407069, + "auxiliary_loss_mlp": 0.01030912, + "balance_loss_clip": 1.2479012, + "balance_loss_mlp": 1.01268518, + "epoch": 0.7479933864422065, + "flos": 21152481246720.0, + "grad_norm": 1.6475253314475296, + "language_loss": 0.74829066, + "learning_rate": 6.298897397706597e-07, + "loss": 0.77267051, + "num_input_tokens_seen": 268323290, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.18225098, + "step": 12441, + "time_per_iteration": 2.898759603500366 + }, + { + "auxiliary_loss_clip": 0.01414562, + "auxiliary_loss_mlp": 0.01036717, + "balance_loss_clip": 1.25026703, + "balance_loss_mlp": 1.01614189, + "epoch": 0.7480535096948745, + "flos": 14400080737920.0, + "grad_norm": 2.370559785513862, + "language_loss": 0.83114088, + "learning_rate": 6.296060463313698e-07, + "loss": 0.85565364, + "num_input_tokens_seen": 268339490, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.20568848, + "step": 12442, + "time_per_iteration": 2.8207619190216064 + }, + { + "auxiliary_loss_clip": 0.01434538, + "auxiliary_loss_mlp": 0.01038321, + "balance_loss_clip": 1.26800227, + "balance_loss_mlp": 1.01866364, + "epoch": 0.7481136329475425, + "flos": 27356151093120.0, + "grad_norm": 2.046024760819203, + "language_loss": 0.64662063, + "learning_rate": 6.293224048575565e-07, + "loss": 0.67134911, + "num_input_tokens_seen": 268359865, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.1965332, + "step": 12443, + "time_per_iteration": 2.9087905883789062 + }, + { + "auxiliary_loss_clip": 0.01401251, + "auxiliary_loss_mlp": 0.01027491, + "balance_loss_clip": 1.24161208, + "balance_loss_mlp": 1.00947905, + "epoch": 0.7481737562002104, + "flos": 19539526204800.0, + "grad_norm": 2.080946538257685, + "language_loss": 0.72449958, + "learning_rate": 6.29038815359975e-07, + "loss": 0.74878699, + "num_input_tokens_seen": 268377065, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18017578, + "step": 12444, + "time_per_iteration": 2.8500118255615234 + }, + { + "auxiliary_loss_clip": 0.01413554, + "auxiliary_loss_mlp": 0.01033811, + "balance_loss_clip": 1.25098252, + "balance_loss_mlp": 1.01483321, + "epoch": 0.7482338794528784, + "flos": 21769450346880.0, + "grad_norm": 1.3767583578565141, + "language_loss": 0.69907916, + "learning_rate": 6.287552778493786e-07, + "loss": 0.72355282, + "num_input_tokens_seen": 268396935, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18981934, + "step": 12445, + "time_per_iteration": 2.8494691848754883 + }, + { + "auxiliary_loss_clip": 0.0140363, + "auxiliary_loss_mlp": 0.0102928, + "balance_loss_clip": 1.24384379, + "balance_loss_mlp": 1.01092184, + "epoch": 0.7482940027055464, + "flos": 18706530591360.0, + "grad_norm": 1.6055861593137224, + "language_loss": 0.74861014, + "learning_rate": 6.28471792336519e-07, + "loss": 0.77293921, + "num_input_tokens_seen": 268414460, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18383789, + "step": 12446, + "time_per_iteration": 2.820281982421875 + }, + { + "auxiliary_loss_clip": 0.01419575, + "auxiliary_loss_mlp": 0.01030601, + "balance_loss_clip": 1.25488079, + "balance_loss_mlp": 1.01136041, + "epoch": 0.7483541259582144, + "flos": 16006656263040.0, + "grad_norm": 1.9523759639281335, + "language_loss": 0.73769248, + "learning_rate": 6.281883588321475e-07, + "loss": 0.76219422, + "num_input_tokens_seen": 268432225, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.19250488, + "step": 12447, + "time_per_iteration": 2.8095130920410156 + }, + { + "auxiliary_loss_clip": 0.01400243, + "auxiliary_loss_mlp": 0.01031659, + "balance_loss_clip": 1.23899543, + "balance_loss_mlp": 1.01393259, + "epoch": 0.7484142492108823, + "flos": 25567147958400.0, + "grad_norm": 2.6412419409881323, + "language_loss": 0.72665811, + "learning_rate": 6.279049773470109e-07, + "loss": 0.75097716, + "num_input_tokens_seen": 268449270, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.17736816, + "step": 12448, + "time_per_iteration": 2.8531925678253174 + }, + { + "auxiliary_loss_clip": 0.01412069, + "auxiliary_loss_mlp": 0.01034951, + "balance_loss_clip": 1.24785984, + "balance_loss_mlp": 1.01685452, + "epoch": 0.7484743724635503, + "flos": 22896574277760.0, + "grad_norm": 2.0624012992412735, + "language_loss": 0.73983645, + "learning_rate": 6.276216478918543e-07, + "loss": 0.76430666, + "num_input_tokens_seen": 268467250, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.18103027, + "step": 12449, + "time_per_iteration": 2.845534086227417 + }, + { + "auxiliary_loss_clip": 0.014295, + "auxiliary_loss_mlp": 0.01035281, + "balance_loss_clip": 1.26185703, + "balance_loss_mlp": 1.01561117, + "epoch": 0.7485344957162182, + "flos": 25310916800640.0, + "grad_norm": 2.17326506757167, + "language_loss": 0.6204015, + "learning_rate": 6.273383704774225e-07, + "loss": 0.64504933, + "num_input_tokens_seen": 268487270, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.19677734, + "step": 12450, + "time_per_iteration": 2.8838274478912354 + }, + { + "auxiliary_loss_clip": 0.01404035, + "auxiliary_loss_mlp": 0.01031248, + "balance_loss_clip": 1.24711871, + "balance_loss_mlp": 1.01331925, + "epoch": 0.7485946189688862, + "flos": 27064465994880.0, + "grad_norm": 2.073065104731006, + "language_loss": 0.71523398, + "learning_rate": 6.270551451144577e-07, + "loss": 0.73958683, + "num_input_tokens_seen": 268508020, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.17944336, + "step": 12451, + "time_per_iteration": 2.8766801357269287 + }, + { + "auxiliary_loss_clip": 0.01430645, + "auxiliary_loss_mlp": 0.01038237, + "balance_loss_clip": 1.26162481, + "balance_loss_mlp": 1.02014112, + "epoch": 0.7486547422215541, + "flos": 26918080508160.0, + "grad_norm": 2.177203382172859, + "language_loss": 0.81084663, + "learning_rate": 6.267719718136988e-07, + "loss": 0.83553547, + "num_input_tokens_seen": 268527375, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.1809082, + "step": 12452, + "time_per_iteration": 2.908031463623047 + }, + { + "auxiliary_loss_clip": 0.01443904, + "auxiliary_loss_mlp": 0.01032966, + "balance_loss_clip": 1.27540493, + "balance_loss_mlp": 1.01404786, + "epoch": 0.7487148654742222, + "flos": 22356485372160.0, + "grad_norm": 2.2577179174956994, + "language_loss": 0.72265023, + "learning_rate": 6.264888505858843e-07, + "loss": 0.74741894, + "num_input_tokens_seen": 268544870, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.18933105, + "step": 12453, + "time_per_iteration": 2.8831496238708496 + }, + { + "auxiliary_loss_clip": 0.01421519, + "auxiliary_loss_mlp": 0.01032628, + "balance_loss_clip": 1.2584517, + "balance_loss_mlp": 1.01409125, + "epoch": 0.7487749887268901, + "flos": 23049022567680.0, + "grad_norm": 1.6193462415975595, + "language_loss": 0.74787021, + "learning_rate": 6.262057814417517e-07, + "loss": 0.7724117, + "num_input_tokens_seen": 268564580, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.1854248, + "step": 12454, + "time_per_iteration": 2.906676769256592 + }, + { + "auxiliary_loss_clip": 0.01181161, + "auxiliary_loss_mlp": 0.01022783, + "balance_loss_clip": 1.09303331, + "balance_loss_mlp": 1.00218344, + "epoch": 0.7488351119795581, + "flos": 71556750011520.0, + "grad_norm": 0.7365398134440438, + "language_loss": 0.59396625, + "learning_rate": 6.259227643920322e-07, + "loss": 0.61600566, + "num_input_tokens_seen": 268629550, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.20605469, + "step": 12455, + "time_per_iteration": 3.5278584957122803 + }, + { + "auxiliary_loss_clip": 0.01412237, + "auxiliary_loss_mlp": 0.01033699, + "balance_loss_clip": 1.25330496, + "balance_loss_mlp": 1.01463771, + "epoch": 0.748895235232226, + "flos": 17203692689280.0, + "grad_norm": 1.5847272867481101, + "language_loss": 0.80740643, + "learning_rate": 6.256397994474592e-07, + "loss": 0.83186573, + "num_input_tokens_seen": 268646645, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19067383, + "step": 12456, + "time_per_iteration": 2.8248488903045654 + }, + { + "auxiliary_loss_clip": 0.01183989, + "auxiliary_loss_mlp": 0.01036206, + "balance_loss_clip": 1.09545088, + "balance_loss_mlp": 1.01961231, + "epoch": 0.748955358484894, + "flos": 59008770921600.0, + "grad_norm": 0.8359277753263036, + "language_loss": 0.61452705, + "learning_rate": 6.25356886618763e-07, + "loss": 0.636729, + "num_input_tokens_seen": 268702275, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.16601562, + "step": 12457, + "time_per_iteration": 3.2244930267333984 + }, + { + "auxiliary_loss_clip": 0.01417941, + "auxiliary_loss_mlp": 0.01040594, + "balance_loss_clip": 1.25357568, + "balance_loss_mlp": 1.02110362, + "epoch": 0.749015481737562, + "flos": 11366823588480.0, + "grad_norm": 4.7665354542432254, + "language_loss": 0.67612457, + "learning_rate": 6.250740259166711e-07, + "loss": 0.70070994, + "num_input_tokens_seen": 268716265, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.19494629, + "step": 12458, + "time_per_iteration": 2.8123645782470703 + }, + { + "auxiliary_loss_clip": 0.01404671, + "auxiliary_loss_mlp": 0.01035255, + "balance_loss_clip": 1.2448318, + "balance_loss_mlp": 1.01613414, + "epoch": 0.74907560499023, + "flos": 21116574858240.0, + "grad_norm": 1.7365549635788728, + "language_loss": 0.80780137, + "learning_rate": 6.247912173519106e-07, + "loss": 0.83220065, + "num_input_tokens_seen": 268734330, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.19128418, + "step": 12459, + "time_per_iteration": 3.03592848777771 + }, + { + "auxiliary_loss_clip": 0.01417162, + "auxiliary_loss_mlp": 0.01035106, + "balance_loss_clip": 1.25575912, + "balance_loss_mlp": 1.01629424, + "epoch": 0.749135728242898, + "flos": 22276845244800.0, + "grad_norm": 1.7394107446926732, + "language_loss": 0.81163335, + "learning_rate": 6.245084609352043e-07, + "loss": 0.83615601, + "num_input_tokens_seen": 268753500, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18798828, + "step": 12460, + "time_per_iteration": 4.249496698379517 + }, + { + "auxiliary_loss_clip": 0.01408095, + "auxiliary_loss_mlp": 0.0103228, + "balance_loss_clip": 1.2466855, + "balance_loss_mlp": 1.0129087, + "epoch": 0.7491958514955659, + "flos": 24067250968320.0, + "grad_norm": 2.0175209933907445, + "language_loss": 0.86343521, + "learning_rate": 6.242257566772755e-07, + "loss": 0.88783896, + "num_input_tokens_seen": 268772055, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19384766, + "step": 12461, + "time_per_iteration": 2.889167308807373 + }, + { + "auxiliary_loss_clip": 0.01402459, + "auxiliary_loss_mlp": 0.01034891, + "balance_loss_clip": 1.24443913, + "balance_loss_mlp": 1.01667607, + "epoch": 0.7492559747482339, + "flos": 24501113786880.0, + "grad_norm": 1.749516115506894, + "language_loss": 0.70605892, + "learning_rate": 6.239431045888435e-07, + "loss": 0.73043239, + "num_input_tokens_seen": 268792265, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.18212891, + "step": 12462, + "time_per_iteration": 2.9687438011169434 + }, + { + "auxiliary_loss_clip": 0.01410557, + "auxiliary_loss_mlp": 0.01034095, + "balance_loss_clip": 1.24848032, + "balance_loss_mlp": 1.01477075, + "epoch": 0.7493160980009018, + "flos": 27756731721600.0, + "grad_norm": 1.8041319594063903, + "language_loss": 0.71324146, + "learning_rate": 6.236605046806267e-07, + "loss": 0.737688, + "num_input_tokens_seen": 268812735, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19311523, + "step": 12463, + "time_per_iteration": 2.9328815937042236 + }, + { + "auxiliary_loss_clip": 0.01420127, + "auxiliary_loss_mlp": 0.01037194, + "balance_loss_clip": 1.25823998, + "balance_loss_mlp": 1.01845407, + "epoch": 0.7493762212535698, + "flos": 30238181562240.0, + "grad_norm": 1.9868968205806885, + "language_loss": 0.78278756, + "learning_rate": 6.233779569633419e-07, + "loss": 0.80736077, + "num_input_tokens_seen": 268833090, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.18737793, + "step": 12464, + "time_per_iteration": 2.942558765411377 + }, + { + "auxiliary_loss_clip": 0.01404969, + "auxiliary_loss_mlp": 0.0103742, + "balance_loss_clip": 1.24323022, + "balance_loss_mlp": 1.01821542, + "epoch": 0.7494363445062378, + "flos": 21954456910080.0, + "grad_norm": 1.7153416336742429, + "language_loss": 0.7946583, + "learning_rate": 6.230954614477034e-07, + "loss": 0.8190822, + "num_input_tokens_seen": 268851880, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19189453, + "step": 12465, + "time_per_iteration": 2.8788678646087646 + }, + { + "auxiliary_loss_clip": 0.01433178, + "auxiliary_loss_mlp": 0.01038882, + "balance_loss_clip": 1.26351845, + "balance_loss_mlp": 1.01865244, + "epoch": 0.7494964677589058, + "flos": 12496752696960.0, + "grad_norm": 2.276132614050069, + "language_loss": 0.75717998, + "learning_rate": 6.22813018144422e-07, + "loss": 0.78190053, + "num_input_tokens_seen": 268867910, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.20239258, + "step": 12466, + "time_per_iteration": 2.806399345397949 + }, + { + "auxiliary_loss_clip": 0.01430047, + "auxiliary_loss_mlp": 0.01035987, + "balance_loss_clip": 1.26632547, + "balance_loss_mlp": 1.01791489, + "epoch": 0.7495565910115737, + "flos": 21663088525440.0, + "grad_norm": 5.872717285421449, + "language_loss": 0.67423278, + "learning_rate": 6.22530627064209e-07, + "loss": 0.69889319, + "num_input_tokens_seen": 268887260, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.18078613, + "step": 12467, + "time_per_iteration": 4.319119691848755 + }, + { + "auxiliary_loss_clip": 0.01424094, + "auxiliary_loss_mlp": 0.01031731, + "balance_loss_clip": 1.25921059, + "balance_loss_mlp": 1.01356375, + "epoch": 0.7496167142642417, + "flos": 15277036314240.0, + "grad_norm": 4.558684364239522, + "language_loss": 0.77088392, + "learning_rate": 6.222482882177735e-07, + "loss": 0.79544222, + "num_input_tokens_seen": 268902520, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.1817627, + "step": 12468, + "time_per_iteration": 2.802034616470337 + }, + { + "auxiliary_loss_clip": 0.01408395, + "auxiliary_loss_mlp": 0.01035677, + "balance_loss_clip": 1.24729466, + "balance_loss_mlp": 1.01560271, + "epoch": 0.7496768375169096, + "flos": 22065207477120.0, + "grad_norm": 2.1606248057545914, + "language_loss": 0.70485413, + "learning_rate": 6.219660016158201e-07, + "loss": 0.72929484, + "num_input_tokens_seen": 268920970, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.20080566, + "step": 12469, + "time_per_iteration": 2.8313517570495605 + }, + { + "auxiliary_loss_clip": 0.01421744, + "auxiliary_loss_mlp": 0.01035892, + "balance_loss_clip": 1.25757396, + "balance_loss_mlp": 1.01672351, + "epoch": 0.7497369607695776, + "flos": 19064961048960.0, + "grad_norm": 1.8523547203377615, + "language_loss": 0.6943329, + "learning_rate": 6.216837672690543e-07, + "loss": 0.71890926, + "num_input_tokens_seen": 268936600, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19165039, + "step": 12470, + "time_per_iteration": 2.826503038406372 + }, + { + "auxiliary_loss_clip": 0.01433175, + "auxiliary_loss_mlp": 0.01036708, + "balance_loss_clip": 1.26328349, + "balance_loss_mlp": 1.01647806, + "epoch": 0.7497970840222457, + "flos": 21627679829760.0, + "grad_norm": 1.9883853060365262, + "language_loss": 0.75543773, + "learning_rate": 6.214015851881793e-07, + "loss": 0.78013653, + "num_input_tokens_seen": 268956560, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.20214844, + "step": 12471, + "time_per_iteration": 2.8243792057037354 + }, + { + "auxiliary_loss_clip": 0.01406215, + "auxiliary_loss_mlp": 0.01031249, + "balance_loss_clip": 1.24384594, + "balance_loss_mlp": 1.01262891, + "epoch": 0.7498572072749136, + "flos": 13743540420480.0, + "grad_norm": 2.2023509364873792, + "language_loss": 0.7826823, + "learning_rate": 6.211194553838929e-07, + "loss": 0.8070569, + "num_input_tokens_seen": 268973945, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.18615723, + "step": 12472, + "time_per_iteration": 5.638099908828735 + }, + { + "auxiliary_loss_clip": 0.01409006, + "auxiliary_loss_mlp": 0.01032731, + "balance_loss_clip": 1.24921799, + "balance_loss_mlp": 1.01364541, + "epoch": 0.7499173305275816, + "flos": 22976893077120.0, + "grad_norm": 1.5428894868137881, + "language_loss": 0.84730339, + "learning_rate": 6.208373778668951e-07, + "loss": 0.87172079, + "num_input_tokens_seen": 268993245, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.1907959, + "step": 12473, + "time_per_iteration": 2.8864071369171143 + }, + { + "auxiliary_loss_clip": 0.01427815, + "auxiliary_loss_mlp": 0.01036756, + "balance_loss_clip": 1.26152599, + "balance_loss_mlp": 1.01767087, + "epoch": 0.7499774537802495, + "flos": 22748921936640.0, + "grad_norm": 2.3523608007598207, + "language_loss": 0.75137985, + "learning_rate": 6.205553526478829e-07, + "loss": 0.77602553, + "num_input_tokens_seen": 269012125, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.19091797, + "step": 12474, + "time_per_iteration": 2.8588924407958984 + }, + { + "auxiliary_loss_clip": 0.01443316, + "auxiliary_loss_mlp": 0.01035803, + "balance_loss_clip": 1.27391481, + "balance_loss_mlp": 1.01697969, + "epoch": 0.7500375770329175, + "flos": 18305768983680.0, + "grad_norm": 2.98924882898071, + "language_loss": 0.75214255, + "learning_rate": 6.202733797375492e-07, + "loss": 0.77693373, + "num_input_tokens_seen": 269030545, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.18823242, + "step": 12475, + "time_per_iteration": 2.8488049507141113 + }, + { + "auxiliary_loss_clip": 0.01436151, + "auxiliary_loss_mlp": 0.0103678, + "balance_loss_clip": 1.26526606, + "balance_loss_mlp": 1.01688433, + "epoch": 0.7500977002855854, + "flos": 19178833507200.0, + "grad_norm": 2.6536037971998057, + "language_loss": 0.8059541, + "learning_rate": 6.199914591465878e-07, + "loss": 0.83068335, + "num_input_tokens_seen": 269048180, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.19885254, + "step": 12476, + "time_per_iteration": 2.868831157684326 + }, + { + "auxiliary_loss_clip": 0.01413067, + "auxiliary_loss_mlp": 0.01035379, + "balance_loss_clip": 1.24987328, + "balance_loss_mlp": 1.01682997, + "epoch": 0.7501578235382534, + "flos": 22174057762560.0, + "grad_norm": 1.8075033734966004, + "language_loss": 0.78118849, + "learning_rate": 6.19709590885688e-07, + "loss": 0.80567294, + "num_input_tokens_seen": 269068600, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.1854248, + "step": 12477, + "time_per_iteration": 2.8602864742279053 + }, + { + "auxiliary_loss_clip": 0.01181937, + "auxiliary_loss_mlp": 0.01018882, + "balance_loss_clip": 1.09420013, + "balance_loss_mlp": 0.99542159, + "epoch": 0.7502179467909214, + "flos": 64489861025280.0, + "grad_norm": 0.8036009265908917, + "language_loss": 0.54449284, + "learning_rate": 6.194277749655394e-07, + "loss": 0.56650102, + "num_input_tokens_seen": 269119045, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.234375, + "step": 12478, + "time_per_iteration": 3.3315060138702393 + }, + { + "auxiliary_loss_clip": 0.01404429, + "auxiliary_loss_mlp": 0.01033422, + "balance_loss_clip": 1.24420953, + "balance_loss_mlp": 1.01474166, + "epoch": 0.7502780700435894, + "flos": 20486122807680.0, + "grad_norm": 1.8850877143869944, + "language_loss": 0.80480963, + "learning_rate": 6.191460113968272e-07, + "loss": 0.82918817, + "num_input_tokens_seen": 269136755, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18676758, + "step": 12479, + "time_per_iteration": 2.901224374771118 + }, + { + "auxiliary_loss_clip": 0.01433399, + "auxiliary_loss_mlp": 0.01034732, + "balance_loss_clip": 1.26472259, + "balance_loss_mlp": 1.01580131, + "epoch": 0.7503381932962573, + "flos": 20454016982400.0, + "grad_norm": 2.285845219170473, + "language_loss": 0.64078653, + "learning_rate": 6.188643001902369e-07, + "loss": 0.66546786, + "num_input_tokens_seen": 269156120, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.18945312, + "step": 12480, + "time_per_iteration": 2.858152389526367 + }, + { + "auxiliary_loss_clip": 0.01404371, + "auxiliary_loss_mlp": 0.01039639, + "balance_loss_clip": 1.24722457, + "balance_loss_mlp": 1.02149498, + "epoch": 0.7503983165489253, + "flos": 22391939312640.0, + "grad_norm": 1.6800236434957445, + "language_loss": 0.78439772, + "learning_rate": 6.185826413564512e-07, + "loss": 0.80883777, + "num_input_tokens_seen": 269175650, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18139648, + "step": 12481, + "time_per_iteration": 2.939526081085205 + }, + { + "auxiliary_loss_clip": 0.01407523, + "auxiliary_loss_mlp": 0.0103748, + "balance_loss_clip": 1.24410844, + "balance_loss_mlp": 1.01794147, + "epoch": 0.7504584398015932, + "flos": 24910109948160.0, + "grad_norm": 1.663056736199344, + "language_loss": 0.72127694, + "learning_rate": 6.183010349061501e-07, + "loss": 0.74572694, + "num_input_tokens_seen": 269197080, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19555664, + "step": 12482, + "time_per_iteration": 2.924591064453125 + }, + { + "auxiliary_loss_clip": 0.01410539, + "auxiliary_loss_mlp": 0.01037556, + "balance_loss_clip": 1.24814248, + "balance_loss_mlp": 1.01841068, + "epoch": 0.7505185630542612, + "flos": 25896323013120.0, + "grad_norm": 1.673825100827398, + "language_loss": 0.70297801, + "learning_rate": 6.180194808500118e-07, + "loss": 0.72745895, + "num_input_tokens_seen": 269218600, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19128418, + "step": 12483, + "time_per_iteration": 2.8641340732574463 + }, + { + "auxiliary_loss_clip": 0.01418913, + "auxiliary_loss_mlp": 0.01031665, + "balance_loss_clip": 1.25615156, + "balance_loss_mlp": 1.01337838, + "epoch": 0.7505786863069293, + "flos": 23152941169920.0, + "grad_norm": 1.869204626047582, + "language_loss": 0.75522268, + "learning_rate": 6.177379791987131e-07, + "loss": 0.77972847, + "num_input_tokens_seen": 269239245, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.18286133, + "step": 12484, + "time_per_iteration": 2.8730931282043457 + }, + { + "auxiliary_loss_clip": 0.01406213, + "auxiliary_loss_mlp": 0.01035445, + "balance_loss_clip": 1.24513841, + "balance_loss_mlp": 1.01572776, + "epoch": 0.7506388095595972, + "flos": 16992326390400.0, + "grad_norm": 1.8574209210712411, + "language_loss": 0.85759228, + "learning_rate": 6.174565299629295e-07, + "loss": 0.88200879, + "num_input_tokens_seen": 269258520, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.19726562, + "step": 12485, + "time_per_iteration": 2.82473087310791 + }, + { + "auxiliary_loss_clip": 0.01421938, + "auxiliary_loss_mlp": 0.01035255, + "balance_loss_clip": 1.26175427, + "balance_loss_mlp": 1.01632428, + "epoch": 0.7506989328122652, + "flos": 22354856559360.0, + "grad_norm": 1.5313888610017428, + "language_loss": 0.78860283, + "learning_rate": 6.171751331533323e-07, + "loss": 0.81317472, + "num_input_tokens_seen": 269278320, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.18933105, + "step": 12486, + "time_per_iteration": 2.888272523880005 + }, + { + "auxiliary_loss_clip": 0.01419554, + "auxiliary_loss_mlp": 0.01033294, + "balance_loss_clip": 1.25517821, + "balance_loss_mlp": 1.01455426, + "epoch": 0.7507590560649331, + "flos": 25787201258880.0, + "grad_norm": 2.2463821732140676, + "language_loss": 0.73488593, + "learning_rate": 6.168937887805932e-07, + "loss": 0.75941443, + "num_input_tokens_seen": 269298025, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.18737793, + "step": 12487, + "time_per_iteration": 2.889967679977417 + }, + { + "auxiliary_loss_clip": 0.01424755, + "auxiliary_loss_mlp": 0.0103382, + "balance_loss_clip": 1.26058578, + "balance_loss_mlp": 1.01527143, + "epoch": 0.7508191793176011, + "flos": 24289973712000.0, + "grad_norm": 2.7295280587991484, + "language_loss": 0.68765944, + "learning_rate": 6.166124968553801e-07, + "loss": 0.71224517, + "num_input_tokens_seen": 269316770, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.18566895, + "step": 12488, + "time_per_iteration": 2.9213786125183105 + }, + { + "auxiliary_loss_clip": 0.01422514, + "auxiliary_loss_mlp": 0.01036047, + "balance_loss_clip": 1.25930071, + "balance_loss_mlp": 1.01780844, + "epoch": 0.750879302570269, + "flos": 19908589190400.0, + "grad_norm": 19.09172723996267, + "language_loss": 0.7771998, + "learning_rate": 6.163312573883592e-07, + "loss": 0.80178541, + "num_input_tokens_seen": 269334755, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.18249512, + "step": 12489, + "time_per_iteration": 2.810661554336548 + }, + { + "auxiliary_loss_clip": 0.01407254, + "auxiliary_loss_mlp": 0.01040023, + "balance_loss_clip": 1.24829388, + "balance_loss_mlp": 1.02180779, + "epoch": 0.750939425822937, + "flos": 29217690921600.0, + "grad_norm": 2.446715817527074, + "language_loss": 0.75747764, + "learning_rate": 6.160500703901956e-07, + "loss": 0.78195041, + "num_input_tokens_seen": 269353810, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18212891, + "step": 12490, + "time_per_iteration": 2.8904542922973633 + }, + { + "auxiliary_loss_clip": 0.01408268, + "auxiliary_loss_mlp": 0.01036466, + "balance_loss_clip": 1.24685693, + "balance_loss_mlp": 1.01874006, + "epoch": 0.750999549075605, + "flos": 21152209777920.0, + "grad_norm": 1.4919482846402805, + "language_loss": 0.79025286, + "learning_rate": 6.157689358715527e-07, + "loss": 0.81470025, + "num_input_tokens_seen": 269372910, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.17724609, + "step": 12491, + "time_per_iteration": 2.8387835025787354 + }, + { + "auxiliary_loss_clip": 0.01400794, + "auxiliary_loss_mlp": 0.01036834, + "balance_loss_clip": 1.24235666, + "balance_loss_mlp": 1.01869023, + "epoch": 0.751059672328273, + "flos": 23557593830400.0, + "grad_norm": 1.9104272516959286, + "language_loss": 0.76931787, + "learning_rate": 6.154878538430899e-07, + "loss": 0.79369414, + "num_input_tokens_seen": 269391545, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18151855, + "step": 12492, + "time_per_iteration": 2.934488296508789 + }, + { + "auxiliary_loss_clip": 0.01405663, + "auxiliary_loss_mlp": 0.01032985, + "balance_loss_clip": 1.24428499, + "balance_loss_mlp": 1.01496065, + "epoch": 0.7511197955809409, + "flos": 18999120585600.0, + "grad_norm": 1.8108941772907208, + "language_loss": 0.71372783, + "learning_rate": 6.152068243154671e-07, + "loss": 0.73811424, + "num_input_tokens_seen": 269408530, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18029785, + "step": 12493, + "time_per_iteration": 2.864041805267334 + }, + { + "auxiliary_loss_clip": 0.01410937, + "auxiliary_loss_mlp": 0.01034579, + "balance_loss_clip": 1.24829936, + "balance_loss_mlp": 1.01520801, + "epoch": 0.7511799188336089, + "flos": 22055163131520.0, + "grad_norm": 1.9020213367678784, + "language_loss": 0.80652827, + "learning_rate": 6.149258472993395e-07, + "loss": 0.83098346, + "num_input_tokens_seen": 269425930, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19372559, + "step": 12494, + "time_per_iteration": 2.869568347930908 + }, + { + "auxiliary_loss_clip": 0.0141618, + "auxiliary_loss_mlp": 0.01034394, + "balance_loss_clip": 1.25413525, + "balance_loss_mlp": 1.01560616, + "epoch": 0.7512400420862768, + "flos": 16474525188480.0, + "grad_norm": 2.0697022343619693, + "language_loss": 0.79491669, + "learning_rate": 6.146449228053634e-07, + "loss": 0.81942242, + "num_input_tokens_seen": 269443945, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18798828, + "step": 12495, + "time_per_iteration": 4.344980955123901 + }, + { + "auxiliary_loss_clip": 0.01402283, + "auxiliary_loss_mlp": 0.01035513, + "balance_loss_clip": 1.24266148, + "balance_loss_mlp": 1.01668978, + "epoch": 0.7513001653389448, + "flos": 20457862790400.0, + "grad_norm": 1.8181371972359652, + "language_loss": 0.71882701, + "learning_rate": 6.143640508441898e-07, + "loss": 0.74320501, + "num_input_tokens_seen": 269463625, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18835449, + "step": 12496, + "time_per_iteration": 2.8645927906036377 + }, + { + "auxiliary_loss_clip": 0.01415671, + "auxiliary_loss_mlp": 0.01035255, + "balance_loss_clip": 1.25391245, + "balance_loss_mlp": 1.01634836, + "epoch": 0.7513602885916129, + "flos": 23487183642240.0, + "grad_norm": 1.6829236524801938, + "language_loss": 0.78587139, + "learning_rate": 6.140832314264705e-07, + "loss": 0.8103807, + "num_input_tokens_seen": 269483415, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18896484, + "step": 12497, + "time_per_iteration": 2.861813545227051 + }, + { + "auxiliary_loss_clip": 0.01406156, + "auxiliary_loss_mlp": 0.01034992, + "balance_loss_clip": 1.24427915, + "balance_loss_mlp": 1.01591873, + "epoch": 0.7514204118442808, + "flos": 26808506305920.0, + "grad_norm": 1.5058327432914824, + "language_loss": 0.77313495, + "learning_rate": 6.13802464562855e-07, + "loss": 0.79754639, + "num_input_tokens_seen": 269504635, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.1907959, + "step": 12498, + "time_per_iteration": 2.9465792179107666 + }, + { + "auxiliary_loss_clip": 0.0141105, + "auxiliary_loss_mlp": 0.01036519, + "balance_loss_clip": 1.25241709, + "balance_loss_mlp": 1.01862574, + "epoch": 0.7514805350969488, + "flos": 19875307000320.0, + "grad_norm": 1.8126640823116762, + "language_loss": 0.74799573, + "learning_rate": 6.135217502639878e-07, + "loss": 0.77247143, + "num_input_tokens_seen": 269523955, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.17907715, + "step": 12499, + "time_per_iteration": 2.9235429763793945 + }, + { + "auxiliary_loss_clip": 0.01407623, + "auxiliary_loss_mlp": 0.01033441, + "balance_loss_clip": 1.24798286, + "balance_loss_mlp": 1.01563096, + "epoch": 0.7515406583496167, + "flos": 24582246992640.0, + "grad_norm": 1.8435611398797505, + "language_loss": 0.79755712, + "learning_rate": 6.132410885405148e-07, + "loss": 0.82196772, + "num_input_tokens_seen": 269544410, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.17810059, + "step": 12500, + "time_per_iteration": 2.9319345951080322 + }, + { + "auxiliary_loss_clip": 0.01442549, + "auxiliary_loss_mlp": 0.01034631, + "balance_loss_clip": 1.27135444, + "balance_loss_mlp": 1.01518822, + "epoch": 0.7516007816022847, + "flos": 20129773610880.0, + "grad_norm": 2.2486830955849, + "language_loss": 0.74562824, + "learning_rate": 6.129604794030794e-07, + "loss": 0.77040005, + "num_input_tokens_seen": 269563315, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.19433594, + "step": 12501, + "time_per_iteration": 2.8921523094177246 + }, + { + "auxiliary_loss_clip": 0.01402263, + "auxiliary_loss_mlp": 0.01033321, + "balance_loss_clip": 1.24165356, + "balance_loss_mlp": 1.01514173, + "epoch": 0.7516609048549526, + "flos": 22795098894720.0, + "grad_norm": 16.350561167693222, + "language_loss": 0.7968235, + "learning_rate": 6.126799228623207e-07, + "loss": 0.82117939, + "num_input_tokens_seen": 269583950, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18188477, + "step": 12502, + "time_per_iteration": 4.372798442840576 + }, + { + "auxiliary_loss_clip": 0.01422679, + "auxiliary_loss_mlp": 0.01034136, + "balance_loss_clip": 1.25887966, + "balance_loss_mlp": 1.01589704, + "epoch": 0.7517210281076206, + "flos": 10641094692480.0, + "grad_norm": 2.062031127974115, + "language_loss": 0.71065092, + "learning_rate": 6.123994189288786e-07, + "loss": 0.73521906, + "num_input_tokens_seen": 269600120, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.18237305, + "step": 12503, + "time_per_iteration": 2.795586109161377 + }, + { + "auxiliary_loss_clip": 0.01182892, + "auxiliary_loss_mlp": 0.01016496, + "balance_loss_clip": 1.09642649, + "balance_loss_mlp": 0.99541944, + "epoch": 0.7517811513602886, + "flos": 66082202421120.0, + "grad_norm": 0.9770093400966515, + "language_loss": 0.64069164, + "learning_rate": 6.121189676133903e-07, + "loss": 0.66268551, + "num_input_tokens_seen": 269659815, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.2109375, + "step": 12504, + "time_per_iteration": 3.258028745651245 + }, + { + "auxiliary_loss_clip": 0.01399552, + "auxiliary_loss_mlp": 0.01032785, + "balance_loss_clip": 1.24218059, + "balance_loss_mlp": 1.01492715, + "epoch": 0.7518412746129566, + "flos": 37282267169280.0, + "grad_norm": 2.646396450129587, + "language_loss": 0.69074917, + "learning_rate": 6.118385689264896e-07, + "loss": 0.71507251, + "num_input_tokens_seen": 269684565, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.17858887, + "step": 12505, + "time_per_iteration": 3.008297920227051 + }, + { + "auxiliary_loss_clip": 0.0118732, + "auxiliary_loss_mlp": 0.01022075, + "balance_loss_clip": 1.09891701, + "balance_loss_mlp": 1.00204754, + "epoch": 0.7519013978656245, + "flos": 60550275144960.0, + "grad_norm": 1.05462818756991, + "language_loss": 0.55179429, + "learning_rate": 6.11558222878809e-07, + "loss": 0.5738883, + "num_input_tokens_seen": 269752325, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.20019531, + "step": 12506, + "time_per_iteration": 4.853604316711426 + }, + { + "auxiliary_loss_clip": 0.01425654, + "auxiliary_loss_mlp": 0.0103392, + "balance_loss_clip": 1.26159692, + "balance_loss_mlp": 1.01532364, + "epoch": 0.7519615211182925, + "flos": 18816104793600.0, + "grad_norm": 2.1468944798636262, + "language_loss": 0.79460728, + "learning_rate": 6.112779294809796e-07, + "loss": 0.81920302, + "num_input_tokens_seen": 269770630, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.18603516, + "step": 12507, + "time_per_iteration": 4.247362852096558 + }, + { + "auxiliary_loss_clip": 0.01407648, + "auxiliary_loss_mlp": 0.0103308, + "balance_loss_clip": 1.2488935, + "balance_loss_mlp": 1.01471031, + "epoch": 0.7520216443709604, + "flos": 14583548977920.0, + "grad_norm": 3.8521031066597944, + "language_loss": 0.71968645, + "learning_rate": 6.10997688743631e-07, + "loss": 0.74409372, + "num_input_tokens_seen": 269787280, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18383789, + "step": 12508, + "time_per_iteration": 2.826523542404175 + }, + { + "auxiliary_loss_clip": 0.01413725, + "auxiliary_loss_mlp": 0.01032, + "balance_loss_clip": 1.25178623, + "balance_loss_mlp": 1.01295018, + "epoch": 0.7520817676236284, + "flos": 17065632245760.0, + "grad_norm": 1.5966007704382137, + "language_loss": 0.72159648, + "learning_rate": 6.107175006773885e-07, + "loss": 0.7460537, + "num_input_tokens_seen": 269805205, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19030762, + "step": 12509, + "time_per_iteration": 2.8108487129211426 + }, + { + "auxiliary_loss_clip": 0.01424354, + "auxiliary_loss_mlp": 0.01038884, + "balance_loss_clip": 1.25738788, + "balance_loss_mlp": 1.01981044, + "epoch": 0.7521418908762965, + "flos": 25677446077440.0, + "grad_norm": 2.0924255195168864, + "language_loss": 0.6277138, + "learning_rate": 6.104373652928785e-07, + "loss": 0.65234613, + "num_input_tokens_seen": 269824820, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.19067383, + "step": 12510, + "time_per_iteration": 2.8786427974700928 + }, + { + "auxiliary_loss_clip": 0.01412679, + "auxiliary_loss_mlp": 0.0103822, + "balance_loss_clip": 1.25413978, + "balance_loss_mlp": 1.01911092, + "epoch": 0.7522020141289644, + "flos": 20896566802560.0, + "grad_norm": 1.7097508539425792, + "language_loss": 0.82820797, + "learning_rate": 6.10157282600722e-07, + "loss": 0.85271698, + "num_input_tokens_seen": 269842825, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.19104004, + "step": 12511, + "time_per_iteration": 2.8683831691741943 + }, + { + "auxiliary_loss_clip": 0.01420667, + "auxiliary_loss_mlp": 0.01034861, + "balance_loss_clip": 1.2552228, + "balance_loss_mlp": 1.01535869, + "epoch": 0.7522621373816324, + "flos": 12647798398080.0, + "grad_norm": 1.9365320376541595, + "language_loss": 0.76910347, + "learning_rate": 6.098772526115412e-07, + "loss": 0.79365873, + "num_input_tokens_seen": 269859000, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19506836, + "step": 12512, + "time_per_iteration": 2.8434855937957764 + }, + { + "auxiliary_loss_clip": 0.01389766, + "auxiliary_loss_mlp": 0.01033713, + "balance_loss_clip": 1.2345593, + "balance_loss_mlp": 1.01573634, + "epoch": 0.7523222606343003, + "flos": 25636200802560.0, + "grad_norm": 2.107700431577854, + "language_loss": 0.82717276, + "learning_rate": 6.095972753359537e-07, + "loss": 0.85140753, + "num_input_tokens_seen": 269878895, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.17993164, + "step": 12513, + "time_per_iteration": 2.893735408782959 + }, + { + "auxiliary_loss_clip": 0.01425641, + "auxiliary_loss_mlp": 0.01036922, + "balance_loss_clip": 1.26160192, + "balance_loss_mlp": 1.01762211, + "epoch": 0.7523823838869683, + "flos": 20458586707200.0, + "grad_norm": 1.687500583729265, + "language_loss": 0.75861132, + "learning_rate": 6.093173507845771e-07, + "loss": 0.78323698, + "num_input_tokens_seen": 269897280, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19311523, + "step": 12514, + "time_per_iteration": 2.836536407470703 + }, + { + "auxiliary_loss_clip": 0.01395902, + "auxiliary_loss_mlp": 0.01031111, + "balance_loss_clip": 1.24012041, + "balance_loss_mlp": 1.01275277, + "epoch": 0.7524425071396362, + "flos": 14728803344640.0, + "grad_norm": 1.9857301811744628, + "language_loss": 0.68684578, + "learning_rate": 6.090374789680271e-07, + "loss": 0.7111159, + "num_input_tokens_seen": 269914640, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.18371582, + "step": 12515, + "time_per_iteration": 2.817331075668335 + }, + { + "auxiliary_loss_clip": 0.01415851, + "auxiliary_loss_mlp": 0.01031553, + "balance_loss_clip": 1.25352597, + "balance_loss_mlp": 1.01383877, + "epoch": 0.7525026303923043, + "flos": 30604394125440.0, + "grad_norm": 2.10976560304191, + "language_loss": 0.70994449, + "learning_rate": 6.087576598969137e-07, + "loss": 0.73441851, + "num_input_tokens_seen": 269934960, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.17712402, + "step": 12516, + "time_per_iteration": 2.98014235496521 + }, + { + "auxiliary_loss_clip": 0.01411969, + "auxiliary_loss_mlp": 0.01034802, + "balance_loss_clip": 1.25277019, + "balance_loss_mlp": 1.01608574, + "epoch": 0.7525627536449722, + "flos": 24802843230720.0, + "grad_norm": 1.6851932802490854, + "language_loss": 0.89709735, + "learning_rate": 6.084778935818495e-07, + "loss": 0.92156506, + "num_input_tokens_seen": 269956655, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18713379, + "step": 12517, + "time_per_iteration": 2.865992546081543 + }, + { + "auxiliary_loss_clip": 0.01436706, + "auxiliary_loss_mlp": 0.01038358, + "balance_loss_clip": 1.27056086, + "balance_loss_mlp": 1.02026176, + "epoch": 0.7526228768976402, + "flos": 20790204981120.0, + "grad_norm": 1.701226716471453, + "language_loss": 0.75208861, + "learning_rate": 6.081981800334437e-07, + "loss": 0.77683926, + "num_input_tokens_seen": 269976835, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.18103027, + "step": 12518, + "time_per_iteration": 2.8460352420806885 + }, + { + "auxiliary_loss_clip": 0.01181817, + "auxiliary_loss_mlp": 0.01032747, + "balance_loss_clip": 1.09595895, + "balance_loss_mlp": 1.01510429, + "epoch": 0.7526830001503081, + "flos": 66588375709440.0, + "grad_norm": 0.7125369424839948, + "language_loss": 0.55726403, + "learning_rate": 6.079185192623017e-07, + "loss": 0.57940966, + "num_input_tokens_seen": 270040630, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.17675781, + "step": 12519, + "time_per_iteration": 3.415013074874878 + }, + { + "auxiliary_loss_clip": 0.01413972, + "auxiliary_loss_mlp": 0.01032448, + "balance_loss_clip": 1.25207484, + "balance_loss_mlp": 1.01464975, + "epoch": 0.7527431234029761, + "flos": 23488540986240.0, + "grad_norm": 1.4694622336932446, + "language_loss": 0.77733541, + "learning_rate": 6.07638911279029e-07, + "loss": 0.80179954, + "num_input_tokens_seen": 270059695, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.17797852, + "step": 12520, + "time_per_iteration": 2.9047603607177734 + }, + { + "auxiliary_loss_clip": 0.01404201, + "auxiliary_loss_mlp": 0.01039412, + "balance_loss_clip": 1.24467158, + "balance_loss_mlp": 1.02138722, + "epoch": 0.752803246655644, + "flos": 22058420757120.0, + "grad_norm": 2.082117532418783, + "language_loss": 0.74630105, + "learning_rate": 6.07359356094229e-07, + "loss": 0.77073717, + "num_input_tokens_seen": 270078420, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18029785, + "step": 12521, + "time_per_iteration": 2.8940083980560303 + }, + { + "auxiliary_loss_clip": 0.01430344, + "auxiliary_loss_mlp": 0.01031915, + "balance_loss_clip": 1.26285565, + "balance_loss_mlp": 1.01268613, + "epoch": 0.752863369908312, + "flos": 30165916337280.0, + "grad_norm": 8.102361819746235, + "language_loss": 0.67711568, + "learning_rate": 6.070798537185016e-07, + "loss": 0.70173836, + "num_input_tokens_seen": 270097040, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.19238281, + "step": 12522, + "time_per_iteration": 2.9213716983795166 + }, + { + "auxiliary_loss_clip": 0.01432435, + "auxiliary_loss_mlp": 0.01041932, + "balance_loss_clip": 1.26638496, + "balance_loss_mlp": 1.02320385, + "epoch": 0.7529234931609801, + "flos": 24577315309440.0, + "grad_norm": 1.558275238503345, + "language_loss": 0.79491937, + "learning_rate": 6.068004041624453e-07, + "loss": 0.81966305, + "num_input_tokens_seen": 270116365, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.1875, + "step": 12523, + "time_per_iteration": 2.8782238960266113 + }, + { + "auxiliary_loss_clip": 0.01404199, + "auxiliary_loss_mlp": 0.01031313, + "balance_loss_clip": 1.2454524, + "balance_loss_mlp": 1.01288319, + "epoch": 0.752983616413648, + "flos": 23122780871040.0, + "grad_norm": 1.8380797841665848, + "language_loss": 0.81043601, + "learning_rate": 6.065210074366571e-07, + "loss": 0.83479112, + "num_input_tokens_seen": 270135395, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18432617, + "step": 12524, + "time_per_iteration": 2.8407185077667236 + }, + { + "auxiliary_loss_clip": 0.01398818, + "auxiliary_loss_mlp": 0.01031818, + "balance_loss_clip": 1.24028397, + "balance_loss_mlp": 1.01304257, + "epoch": 0.753043739666316, + "flos": 24327780382080.0, + "grad_norm": 1.8163588976383853, + "language_loss": 0.74491155, + "learning_rate": 6.062416635517326e-07, + "loss": 0.76921797, + "num_input_tokens_seen": 270156425, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18774414, + "step": 12525, + "time_per_iteration": 2.874983310699463 + }, + { + "auxiliary_loss_clip": 0.01400925, + "auxiliary_loss_mlp": 0.01036158, + "balance_loss_clip": 1.24067724, + "balance_loss_mlp": 1.01641726, + "epoch": 0.7531038629189839, + "flos": 24253931589120.0, + "grad_norm": 1.7696786625543217, + "language_loss": 0.7282328, + "learning_rate": 6.059623725182641e-07, + "loss": 0.75260365, + "num_input_tokens_seen": 270176905, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.19750977, + "step": 12526, + "time_per_iteration": 2.8781533241271973 + }, + { + "auxiliary_loss_clip": 0.01405112, + "auxiliary_loss_mlp": 0.01031845, + "balance_loss_clip": 1.24510753, + "balance_loss_mlp": 1.01422548, + "epoch": 0.7531639861716519, + "flos": 30200555871360.0, + "grad_norm": 1.9582785262952493, + "language_loss": 0.72628415, + "learning_rate": 6.056831343468414e-07, + "loss": 0.75065374, + "num_input_tokens_seen": 270196640, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.17614746, + "step": 12527, + "time_per_iteration": 2.907971143722534 + }, + { + "auxiliary_loss_clip": 0.01413627, + "auxiliary_loss_mlp": 0.01028882, + "balance_loss_clip": 1.25331557, + "balance_loss_mlp": 1.01088166, + "epoch": 0.7532241094243198, + "flos": 18232282149120.0, + "grad_norm": 1.8334075499491955, + "language_loss": 0.8164748, + "learning_rate": 6.054039490480539e-07, + "loss": 0.84089994, + "num_input_tokens_seen": 270213905, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.17993164, + "step": 12528, + "time_per_iteration": 2.805462121963501 + }, + { + "auxiliary_loss_clip": 0.01411241, + "auxiliary_loss_mlp": 0.01033664, + "balance_loss_clip": 1.24841654, + "balance_loss_mlp": 1.01525807, + "epoch": 0.7532842326769879, + "flos": 20889372879360.0, + "grad_norm": 1.8088842202638404, + "language_loss": 0.85692602, + "learning_rate": 6.051248166324892e-07, + "loss": 0.88137507, + "num_input_tokens_seen": 270231995, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.18408203, + "step": 12529, + "time_per_iteration": 2.8273956775665283 + }, + { + "auxiliary_loss_clip": 0.01439548, + "auxiliary_loss_mlp": 0.01035293, + "balance_loss_clip": 1.27128124, + "balance_loss_mlp": 1.01587391, + "epoch": 0.7533443559296558, + "flos": 18088113657600.0, + "grad_norm": 1.7717014266723807, + "language_loss": 0.75264949, + "learning_rate": 6.048457371107303e-07, + "loss": 0.77739787, + "num_input_tokens_seen": 270251480, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.1940918, + "step": 12530, + "time_per_iteration": 4.264735460281372 + }, + { + "auxiliary_loss_clip": 0.01182346, + "auxiliary_loss_mlp": 0.01034903, + "balance_loss_clip": 1.0975858, + "balance_loss_mlp": 1.01678336, + "epoch": 0.7534044791823238, + "flos": 50280958126080.0, + "grad_norm": 0.8366184450679832, + "language_loss": 0.63655579, + "learning_rate": 6.045667104933612e-07, + "loss": 0.6587283, + "num_input_tokens_seen": 270306480, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.18164062, + "step": 12531, + "time_per_iteration": 3.2316484451293945 + }, + { + "auxiliary_loss_clip": 0.0142052, + "auxiliary_loss_mlp": 0.01030067, + "balance_loss_clip": 1.25538111, + "balance_loss_mlp": 1.01137519, + "epoch": 0.7534646024349917, + "flos": 20860072231680.0, + "grad_norm": 1.8568019660964725, + "language_loss": 0.70818043, + "learning_rate": 6.042877367909633e-07, + "loss": 0.73268628, + "num_input_tokens_seen": 270324595, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.18688965, + "step": 12532, + "time_per_iteration": 2.8829541206359863 + }, + { + "auxiliary_loss_clip": 0.01393005, + "auxiliary_loss_mlp": 0.01033302, + "balance_loss_clip": 1.23567224, + "balance_loss_mlp": 1.01627898, + "epoch": 0.7535247256876597, + "flos": 23081083148160.0, + "grad_norm": 1.689841383636595, + "language_loss": 0.78231835, + "learning_rate": 6.040088160141132e-07, + "loss": 0.80658138, + "num_input_tokens_seen": 270344375, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.17041016, + "step": 12533, + "time_per_iteration": 2.863807439804077 + }, + { + "auxiliary_loss_clip": 0.01183801, + "auxiliary_loss_mlp": 0.01017532, + "balance_loss_clip": 1.09644008, + "balance_loss_mlp": 1.00007951, + "epoch": 0.7535848489403276, + "flos": 58655181657600.0, + "grad_norm": 0.787424033101597, + "language_loss": 0.57396382, + "learning_rate": 6.037299481733886e-07, + "loss": 0.59597719, + "num_input_tokens_seen": 270405235, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.17480469, + "step": 12534, + "time_per_iteration": 3.331066131591797 + }, + { + "auxiliary_loss_clip": 0.0140619, + "auxiliary_loss_mlp": 0.0103194, + "balance_loss_clip": 1.24534941, + "balance_loss_mlp": 1.0131768, + "epoch": 0.7536449721929956, + "flos": 26588995943040.0, + "grad_norm": 1.6432545054121002, + "language_loss": 0.71813965, + "learning_rate": 6.03451133279365e-07, + "loss": 0.74252093, + "num_input_tokens_seen": 270425820, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18762207, + "step": 12535, + "time_per_iteration": 2.9111478328704834 + }, + { + "auxiliary_loss_clip": 0.01418365, + "auxiliary_loss_mlp": 0.01034669, + "balance_loss_clip": 1.25318992, + "balance_loss_mlp": 1.01516604, + "epoch": 0.7537050954456637, + "flos": 25746182208000.0, + "grad_norm": 1.6705233688828167, + "language_loss": 0.81315279, + "learning_rate": 6.031723713426135e-07, + "loss": 0.83768314, + "num_input_tokens_seen": 270447120, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19482422, + "step": 12536, + "time_per_iteration": 2.9483072757720947 + }, + { + "auxiliary_loss_clip": 0.01409988, + "auxiliary_loss_mlp": 0.01031468, + "balance_loss_clip": 1.25033593, + "balance_loss_mlp": 1.01395607, + "epoch": 0.7537652186983316, + "flos": 30235964567040.0, + "grad_norm": 1.8477353485007515, + "language_loss": 0.7576167, + "learning_rate": 6.028936623737067e-07, + "loss": 0.7820313, + "num_input_tokens_seen": 270468680, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.17492676, + "step": 12537, + "time_per_iteration": 4.42828631401062 + }, + { + "auxiliary_loss_clip": 0.01422663, + "auxiliary_loss_mlp": 0.01035315, + "balance_loss_clip": 1.25891995, + "balance_loss_mlp": 1.0171001, + "epoch": 0.7538253419509996, + "flos": 12648974762880.0, + "grad_norm": 1.7579458617504538, + "language_loss": 0.75034875, + "learning_rate": 6.026150063832111e-07, + "loss": 0.77492857, + "num_input_tokens_seen": 270486310, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.18225098, + "step": 12538, + "time_per_iteration": 2.8212201595306396 + }, + { + "auxiliary_loss_clip": 0.01419065, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.25491667, + "balance_loss_mlp": 1.01501703, + "epoch": 0.7538854652036675, + "flos": 23196539174400.0, + "grad_norm": 1.4749150972183194, + "language_loss": 0.68112135, + "learning_rate": 6.023364033816956e-07, + "loss": 0.70565176, + "num_input_tokens_seen": 270507210, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.18945312, + "step": 12539, + "time_per_iteration": 2.8677146434783936 + }, + { + "auxiliary_loss_clip": 0.01405761, + "auxiliary_loss_mlp": 0.01036558, + "balance_loss_clip": 1.24667215, + "balance_loss_mlp": 1.01723433, + "epoch": 0.7539455884563355, + "flos": 23196855888000.0, + "grad_norm": 1.605872604071508, + "language_loss": 0.75441939, + "learning_rate": 6.020578533797229e-07, + "loss": 0.77884257, + "num_input_tokens_seen": 270525250, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.1932373, + "step": 12540, + "time_per_iteration": 2.855470657348633 + }, + { + "auxiliary_loss_clip": 0.01412358, + "auxiliary_loss_mlp": 0.01035154, + "balance_loss_clip": 1.2488296, + "balance_loss_mlp": 1.01666439, + "epoch": 0.7540057117090034, + "flos": 13187118142080.0, + "grad_norm": 2.2496803400617136, + "language_loss": 0.73728019, + "learning_rate": 6.017793563878566e-07, + "loss": 0.76175529, + "num_input_tokens_seen": 270539295, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.18493652, + "step": 12541, + "time_per_iteration": 4.260338068008423 + }, + { + "auxiliary_loss_clip": 0.01407454, + "auxiliary_loss_mlp": 0.01031442, + "balance_loss_clip": 1.24615538, + "balance_loss_mlp": 1.01254725, + "epoch": 0.7540658349616715, + "flos": 45494314778880.0, + "grad_norm": 1.779009623857626, + "language_loss": 0.7288987, + "learning_rate": 6.015009124166576e-07, + "loss": 0.75328767, + "num_input_tokens_seen": 270562815, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.18884277, + "step": 12542, + "time_per_iteration": 4.425582647323608 + }, + { + "auxiliary_loss_clip": 0.01403163, + "auxiliary_loss_mlp": 0.01027816, + "balance_loss_clip": 1.24333334, + "balance_loss_mlp": 1.00958872, + "epoch": 0.7541259582143394, + "flos": 19938478020480.0, + "grad_norm": 1.7828380713955987, + "language_loss": 0.85685062, + "learning_rate": 6.012225214766844e-07, + "loss": 0.88116038, + "num_input_tokens_seen": 270579055, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18225098, + "step": 12543, + "time_per_iteration": 2.9303622245788574 + }, + { + "auxiliary_loss_clip": 0.01405055, + "auxiliary_loss_mlp": 0.01031914, + "balance_loss_clip": 1.24627209, + "balance_loss_mlp": 1.01405692, + "epoch": 0.7541860814670074, + "flos": 27209539382400.0, + "grad_norm": 2.0552790162767716, + "language_loss": 0.74510562, + "learning_rate": 6.009441835784927e-07, + "loss": 0.76947534, + "num_input_tokens_seen": 270599080, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.17858887, + "step": 12544, + "time_per_iteration": 2.912008047103882 + }, + { + "auxiliary_loss_clip": 0.0139537, + "auxiliary_loss_mlp": 0.01029888, + "balance_loss_clip": 1.23677099, + "balance_loss_mlp": 1.01291299, + "epoch": 0.7542462047196753, + "flos": 21333958715520.0, + "grad_norm": 1.8254743662878836, + "language_loss": 0.69305813, + "learning_rate": 6.006658987326383e-07, + "loss": 0.71731067, + "num_input_tokens_seen": 270618715, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.1697998, + "step": 12545, + "time_per_iteration": 2.832503318786621 + }, + { + "auxiliary_loss_clip": 0.01413448, + "auxiliary_loss_mlp": 0.01030057, + "balance_loss_clip": 1.25073099, + "balance_loss_mlp": 1.01188982, + "epoch": 0.7543063279723433, + "flos": 11946664690560.0, + "grad_norm": 1.7722481917770914, + "language_loss": 0.70085895, + "learning_rate": 6.003876669496728e-07, + "loss": 0.72529399, + "num_input_tokens_seen": 270635695, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.1817627, + "step": 12546, + "time_per_iteration": 2.836237668991089 + }, + { + "auxiliary_loss_clip": 0.01419487, + "auxiliary_loss_mlp": 0.01032603, + "balance_loss_clip": 1.25458789, + "balance_loss_mlp": 1.01422048, + "epoch": 0.7543664512250112, + "flos": 22830145632000.0, + "grad_norm": 6.860032061353637, + "language_loss": 0.74859416, + "learning_rate": 6.00109488240147e-07, + "loss": 0.77311504, + "num_input_tokens_seen": 270654325, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.18395996, + "step": 12547, + "time_per_iteration": 2.84859561920166 + }, + { + "auxiliary_loss_clip": 0.01398604, + "auxiliary_loss_mlp": 0.01034935, + "balance_loss_clip": 1.23734343, + "balance_loss_mlp": 1.0168153, + "epoch": 0.7544265744776792, + "flos": 20933830535040.0, + "grad_norm": 1.756312767150609, + "language_loss": 0.68202412, + "learning_rate": 5.998313626146099e-07, + "loss": 0.70635951, + "num_input_tokens_seen": 270674260, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.18115234, + "step": 12548, + "time_per_iteration": 2.939211368560791 + }, + { + "auxiliary_loss_clip": 0.01418706, + "auxiliary_loss_mlp": 0.0103652, + "balance_loss_clip": 1.25360286, + "balance_loss_mlp": 1.01844811, + "epoch": 0.7544866977303473, + "flos": 15203956682880.0, + "grad_norm": 1.7149477740098071, + "language_loss": 0.87481385, + "learning_rate": 5.995532900836088e-07, + "loss": 0.89936614, + "num_input_tokens_seen": 270692200, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.18066406, + "step": 12549, + "time_per_iteration": 2.8127150535583496 + }, + { + "auxiliary_loss_clip": 0.01386178, + "auxiliary_loss_mlp": 0.010338, + "balance_loss_clip": 1.23091137, + "balance_loss_mlp": 1.01566768, + "epoch": 0.7545468209830152, + "flos": 27093630908160.0, + "grad_norm": 1.9616979264033383, + "language_loss": 0.78106558, + "learning_rate": 5.992752706576865e-07, + "loss": 0.80526531, + "num_input_tokens_seen": 270709675, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.18139648, + "step": 12550, + "time_per_iteration": 2.8859975337982178 + }, + { + "auxiliary_loss_clip": 0.01423197, + "auxiliary_loss_mlp": 0.01037183, + "balance_loss_clip": 1.26005888, + "balance_loss_mlp": 1.01938546, + "epoch": 0.7546069442356832, + "flos": 26883395729280.0, + "grad_norm": 1.499304719982298, + "language_loss": 0.70070052, + "learning_rate": 5.98997304347386e-07, + "loss": 0.72530437, + "num_input_tokens_seen": 270733055, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.17797852, + "step": 12551, + "time_per_iteration": 2.933253765106201 + }, + { + "auxiliary_loss_clip": 0.01412982, + "auxiliary_loss_mlp": 0.01031324, + "balance_loss_clip": 1.25181699, + "balance_loss_mlp": 1.01301301, + "epoch": 0.7546670674883511, + "flos": 15751827694080.0, + "grad_norm": 1.9570946640751115, + "language_loss": 0.87056875, + "learning_rate": 5.987193911632487e-07, + "loss": 0.8950119, + "num_input_tokens_seen": 270749275, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18310547, + "step": 12552, + "time_per_iteration": 2.8008408546447754 + }, + { + "auxiliary_loss_clip": 0.01415135, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.25214171, + "balance_loss_mlp": 1.01639557, + "epoch": 0.7547271907410191, + "flos": 23488314762240.0, + "grad_norm": 1.769343980059863, + "language_loss": 0.79857737, + "learning_rate": 5.98441531115812e-07, + "loss": 0.82307315, + "num_input_tokens_seen": 270768230, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.18041992, + "step": 12553, + "time_per_iteration": 2.8795533180236816 + }, + { + "auxiliary_loss_clip": 0.01410233, + "auxiliary_loss_mlp": 0.01030432, + "balance_loss_clip": 1.24926329, + "balance_loss_mlp": 1.01166844, + "epoch": 0.754787313993687, + "flos": 31735363864320.0, + "grad_norm": 2.4478203707279675, + "language_loss": 0.63670516, + "learning_rate": 5.981637242156135e-07, + "loss": 0.66111183, + "num_input_tokens_seen": 270786285, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18762207, + "step": 12554, + "time_per_iteration": 2.920714855194092 + }, + { + "auxiliary_loss_clip": 0.01406221, + "auxiliary_loss_mlp": 0.0103783, + "balance_loss_clip": 1.2438519, + "balance_loss_mlp": 1.01854157, + "epoch": 0.7548474372463551, + "flos": 27574439846400.0, + "grad_norm": 1.6098107064623632, + "language_loss": 0.73874444, + "learning_rate": 5.978859704731864e-07, + "loss": 0.76318491, + "num_input_tokens_seen": 270805505, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19299316, + "step": 12555, + "time_per_iteration": 2.921966314315796 + }, + { + "auxiliary_loss_clip": 0.01420432, + "auxiliary_loss_mlp": 0.01032171, + "balance_loss_clip": 1.25776148, + "balance_loss_mlp": 1.01377678, + "epoch": 0.754907560499023, + "flos": 19328159905920.0, + "grad_norm": 1.8633113786365774, + "language_loss": 0.79764259, + "learning_rate": 5.976082698990645e-07, + "loss": 0.82216859, + "num_input_tokens_seen": 270824610, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.18383789, + "step": 12556, + "time_per_iteration": 2.908280372619629 + }, + { + "auxiliary_loss_clip": 0.01183451, + "auxiliary_loss_mlp": 0.01019499, + "balance_loss_clip": 1.09563875, + "balance_loss_mlp": 0.99947149, + "epoch": 0.754967683751691, + "flos": 69777474508800.0, + "grad_norm": 0.7044004611801478, + "language_loss": 0.50420934, + "learning_rate": 5.973306225037769e-07, + "loss": 0.5262388, + "num_input_tokens_seen": 270886155, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.20019531, + "step": 12557, + "time_per_iteration": 3.336458921432495 + }, + { + "auxiliary_loss_clip": 0.01423004, + "auxiliary_loss_mlp": 0.01038098, + "balance_loss_clip": 1.25887918, + "balance_loss_mlp": 1.01867855, + "epoch": 0.7550278070043589, + "flos": 24431608494720.0, + "grad_norm": 3.3870597843389367, + "language_loss": 0.72497433, + "learning_rate": 5.970530282978525e-07, + "loss": 0.74958539, + "num_input_tokens_seen": 270905325, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19421387, + "step": 12558, + "time_per_iteration": 2.926224946975708 + }, + { + "auxiliary_loss_clip": 0.01404222, + "auxiliary_loss_mlp": 0.01034325, + "balance_loss_clip": 1.24366164, + "balance_loss_mlp": 1.01625299, + "epoch": 0.7550879302570269, + "flos": 32647320933120.0, + "grad_norm": 1.8054958584933598, + "language_loss": 0.81105626, + "learning_rate": 5.967754872918187e-07, + "loss": 0.83544183, + "num_input_tokens_seen": 270927535, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18078613, + "step": 12559, + "time_per_iteration": 2.952582597732544 + }, + { + "auxiliary_loss_clip": 0.01425587, + "auxiliary_loss_mlp": 0.01031069, + "balance_loss_clip": 1.25949979, + "balance_loss_mlp": 1.01204348, + "epoch": 0.7551480535096948, + "flos": 21804723308160.0, + "grad_norm": 1.6990951724146872, + "language_loss": 0.7919628, + "learning_rate": 5.96497999496199e-07, + "loss": 0.81652939, + "num_input_tokens_seen": 270946920, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.19018555, + "step": 12560, + "time_per_iteration": 2.8748464584350586 + }, + { + "auxiliary_loss_clip": 0.01406611, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.24733305, + "balance_loss_mlp": 1.0142312, + "epoch": 0.7552081767623628, + "flos": 18524283960960.0, + "grad_norm": 1.5450851279325317, + "language_loss": 0.71421862, + "learning_rate": 5.96220564921515e-07, + "loss": 0.73860383, + "num_input_tokens_seen": 270965705, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.17675781, + "step": 12561, + "time_per_iteration": 2.8078396320343018 + }, + { + "auxiliary_loss_clip": 0.01419741, + "auxiliary_loss_mlp": 0.01032473, + "balance_loss_clip": 1.25612235, + "balance_loss_mlp": 1.01393569, + "epoch": 0.7552683000150308, + "flos": 27645890664960.0, + "grad_norm": 4.560237584765932, + "language_loss": 0.76271147, + "learning_rate": 5.959431835782889e-07, + "loss": 0.78723359, + "num_input_tokens_seen": 270986550, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.18530273, + "step": 12562, + "time_per_iteration": 2.9052159786224365 + }, + { + "auxiliary_loss_clip": 0.01408703, + "auxiliary_loss_mlp": 0.01035405, + "balance_loss_clip": 1.2476213, + "balance_loss_mlp": 1.0153302, + "epoch": 0.7553284232676988, + "flos": 20312563178880.0, + "grad_norm": 1.988043047010246, + "language_loss": 0.76611686, + "learning_rate": 5.956658554770371e-07, + "loss": 0.79055786, + "num_input_tokens_seen": 271006250, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.20068359, + "step": 12563, + "time_per_iteration": 2.8394618034362793 + }, + { + "auxiliary_loss_clip": 0.01456183, + "auxiliary_loss_mlp": 0.01038823, + "balance_loss_clip": 1.28453994, + "balance_loss_mlp": 1.01904631, + "epoch": 0.7553885465203668, + "flos": 33268497799680.0, + "grad_norm": 2.7248944395883465, + "language_loss": 0.67545807, + "learning_rate": 5.953885806282768e-07, + "loss": 0.70040816, + "num_input_tokens_seen": 271025575, + "router_z_loss_clip": 1.71972656, + "router_z_loss_mlp": 0.19775391, + "step": 12564, + "time_per_iteration": 2.950690746307373 + }, + { + "auxiliary_loss_clip": 0.01422344, + "auxiliary_loss_mlp": 0.01037049, + "balance_loss_clip": 1.25567698, + "balance_loss_mlp": 1.0180949, + "epoch": 0.7554486697730347, + "flos": 21626186751360.0, + "grad_norm": 2.9496874089931846, + "language_loss": 0.68906271, + "learning_rate": 5.951113590425228e-07, + "loss": 0.71365666, + "num_input_tokens_seen": 271045805, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.18933105, + "step": 12565, + "time_per_iteration": 4.282850742340088 + }, + { + "auxiliary_loss_clip": 0.01423468, + "auxiliary_loss_mlp": 0.01035502, + "balance_loss_clip": 1.25501037, + "balance_loss_mlp": 1.01646423, + "epoch": 0.7555087930257027, + "flos": 27643583180160.0, + "grad_norm": 1.4972850710576684, + "language_loss": 0.75458127, + "learning_rate": 5.94834190730287e-07, + "loss": 0.77917093, + "num_input_tokens_seen": 271066065, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.19030762, + "step": 12566, + "time_per_iteration": 2.8908095359802246 + }, + { + "auxiliary_loss_clip": 0.01431671, + "auxiliary_loss_mlp": 0.01039042, + "balance_loss_clip": 1.2639221, + "balance_loss_mlp": 1.02030241, + "epoch": 0.7555689162783706, + "flos": 23631759336960.0, + "grad_norm": 3.1021557168173026, + "language_loss": 0.74362117, + "learning_rate": 5.945570757020789e-07, + "loss": 0.76832831, + "num_input_tokens_seen": 271085870, + "router_z_loss_clip": 1.67480469, + "router_z_loss_mlp": 0.18737793, + "step": 12567, + "time_per_iteration": 2.8888962268829346 + }, + { + "auxiliary_loss_clip": 0.0141983, + "auxiliary_loss_mlp": 0.01030968, + "balance_loss_clip": 1.25693011, + "balance_loss_mlp": 1.01363468, + "epoch": 0.7556290395310387, + "flos": 24873751111680.0, + "grad_norm": 1.8628773628798716, + "language_loss": 0.63698506, + "learning_rate": 5.942800139684073e-07, + "loss": 0.66149306, + "num_input_tokens_seen": 271104260, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.17346191, + "step": 12568, + "time_per_iteration": 2.882469892501831 + }, + { + "auxiliary_loss_clip": 0.01415473, + "auxiliary_loss_mlp": 0.01035311, + "balance_loss_clip": 1.25450218, + "balance_loss_mlp": 1.01704764, + "epoch": 0.7556891627837066, + "flos": 43559288115840.0, + "grad_norm": 2.0132734911539387, + "language_loss": 0.67123801, + "learning_rate": 5.940030055397789e-07, + "loss": 0.69574583, + "num_input_tokens_seen": 271125745, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18273926, + "step": 12569, + "time_per_iteration": 3.054701566696167 + }, + { + "auxiliary_loss_clip": 0.01421676, + "auxiliary_loss_mlp": 0.01035487, + "balance_loss_clip": 1.25353074, + "balance_loss_mlp": 1.01631784, + "epoch": 0.7557492860363746, + "flos": 26662075574400.0, + "grad_norm": 1.628883311413023, + "language_loss": 0.67600596, + "learning_rate": 5.93726050426697e-07, + "loss": 0.70057762, + "num_input_tokens_seen": 271147145, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.19189453, + "step": 12570, + "time_per_iteration": 2.8827171325683594 + }, + { + "auxiliary_loss_clip": 0.01422827, + "auxiliary_loss_mlp": 0.01034087, + "balance_loss_clip": 1.25829399, + "balance_loss_mlp": 1.01531136, + "epoch": 0.7558094092890425, + "flos": 55201463429760.0, + "grad_norm": 1.9508263863980602, + "language_loss": 0.72497034, + "learning_rate": 5.934491486396647e-07, + "loss": 0.74953943, + "num_input_tokens_seen": 271170865, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.18786621, + "step": 12571, + "time_per_iteration": 3.119129180908203 + }, + { + "auxiliary_loss_clip": 0.01421011, + "auxiliary_loss_mlp": 0.01036275, + "balance_loss_clip": 1.25363505, + "balance_loss_mlp": 1.01798844, + "epoch": 0.7558695325417105, + "flos": 23998967285760.0, + "grad_norm": 2.247437391273483, + "language_loss": 0.7429074, + "learning_rate": 5.931723001891811e-07, + "loss": 0.76748025, + "num_input_tokens_seen": 271191450, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.18286133, + "step": 12572, + "time_per_iteration": 2.883322238922119 + }, + { + "auxiliary_loss_clip": 0.01435482, + "auxiliary_loss_mlp": 0.01038278, + "balance_loss_clip": 1.26962304, + "balance_loss_mlp": 1.01961029, + "epoch": 0.7559296557943784, + "flos": 14619455366400.0, + "grad_norm": 1.970319854285585, + "language_loss": 0.76883429, + "learning_rate": 5.928955050857456e-07, + "loss": 0.79357189, + "num_input_tokens_seen": 271207335, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.18664551, + "step": 12573, + "time_per_iteration": 4.197461128234863 + }, + { + "auxiliary_loss_clip": 0.0141905, + "auxiliary_loss_mlp": 0.01035041, + "balance_loss_clip": 1.25229669, + "balance_loss_mlp": 1.01655114, + "epoch": 0.7559897790470465, + "flos": 18559375943040.0, + "grad_norm": 1.4974926185495152, + "language_loss": 0.69936991, + "learning_rate": 5.926187633398527e-07, + "loss": 0.72391075, + "num_input_tokens_seen": 271226895, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.18481445, + "step": 12574, + "time_per_iteration": 2.835620403289795 + }, + { + "auxiliary_loss_clip": 0.01413554, + "auxiliary_loss_mlp": 0.01029897, + "balance_loss_clip": 1.25167799, + "balance_loss_mlp": 1.01124048, + "epoch": 0.7560499022997144, + "flos": 17976910642560.0, + "grad_norm": 2.0954321921720487, + "language_loss": 0.72516572, + "learning_rate": 5.923420749619974e-07, + "loss": 0.74960023, + "num_input_tokens_seen": 271244375, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.18652344, + "step": 12575, + "time_per_iteration": 2.8251450061798096 + }, + { + "auxiliary_loss_clip": 0.01402177, + "auxiliary_loss_mlp": 0.01033884, + "balance_loss_clip": 1.24033666, + "balance_loss_mlp": 1.01596737, + "epoch": 0.7561100255523824, + "flos": 15745131463680.0, + "grad_norm": 2.583739146535471, + "language_loss": 0.72361529, + "learning_rate": 5.92065439962673e-07, + "loss": 0.74797583, + "num_input_tokens_seen": 271259530, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.17919922, + "step": 12576, + "time_per_iteration": 2.8344364166259766 + }, + { + "auxiliary_loss_clip": 0.01412906, + "auxiliary_loss_mlp": 0.01031609, + "balance_loss_clip": 1.2518152, + "balance_loss_mlp": 1.01309586, + "epoch": 0.7561701488050504, + "flos": 15896584368000.0, + "grad_norm": 1.90847509990466, + "language_loss": 0.67739773, + "learning_rate": 5.917888583523669e-07, + "loss": 0.70184278, + "num_input_tokens_seen": 271276835, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.18518066, + "step": 12577, + "time_per_iteration": 5.604647874832153 + }, + { + "auxiliary_loss_clip": 0.01401605, + "auxiliary_loss_mlp": 0.01035288, + "balance_loss_clip": 1.24193382, + "balance_loss_mlp": 1.01657236, + "epoch": 0.7562302720577183, + "flos": 20348741036160.0, + "grad_norm": 1.8557066963813003, + "language_loss": 0.78660214, + "learning_rate": 5.915123301415685e-07, + "loss": 0.81097108, + "num_input_tokens_seen": 271296275, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18737793, + "step": 12578, + "time_per_iteration": 2.834484577178955 + }, + { + "auxiliary_loss_clip": 0.01409822, + "auxiliary_loss_mlp": 0.01033184, + "balance_loss_clip": 1.24605799, + "balance_loss_mlp": 1.014552, + "epoch": 0.7562903953103863, + "flos": 20821586889600.0, + "grad_norm": 1.6037588359154342, + "language_loss": 0.76298243, + "learning_rate": 5.912358553407641e-07, + "loss": 0.78741241, + "num_input_tokens_seen": 271315685, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.1862793, + "step": 12579, + "time_per_iteration": 2.860870838165283 + }, + { + "auxiliary_loss_clip": 0.01428766, + "auxiliary_loss_mlp": 0.01035282, + "balance_loss_clip": 1.25942397, + "balance_loss_mlp": 1.01569629, + "epoch": 0.7563505185630542, + "flos": 37611080265600.0, + "grad_norm": 1.9590962283197713, + "language_loss": 0.62993252, + "learning_rate": 5.90959433960437e-07, + "loss": 0.65457296, + "num_input_tokens_seen": 271336790, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.19592285, + "step": 12580, + "time_per_iteration": 2.974163055419922 + }, + { + "auxiliary_loss_clip": 0.01411968, + "auxiliary_loss_mlp": 0.01031347, + "balance_loss_clip": 1.25016284, + "balance_loss_mlp": 1.01233268, + "epoch": 0.7564106418157223, + "flos": 20240886136320.0, + "grad_norm": 1.595368022370854, + "language_loss": 0.75741661, + "learning_rate": 5.906830660110691e-07, + "loss": 0.7818498, + "num_input_tokens_seen": 271355470, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19018555, + "step": 12581, + "time_per_iteration": 2.874105930328369 + }, + { + "auxiliary_loss_clip": 0.01423397, + "auxiliary_loss_mlp": 0.01034586, + "balance_loss_clip": 1.25801659, + "balance_loss_mlp": 1.01528573, + "epoch": 0.7564707650683902, + "flos": 24765850967040.0, + "grad_norm": 1.782824686918283, + "language_loss": 0.6349957, + "learning_rate": 5.904067515031412e-07, + "loss": 0.65957558, + "num_input_tokens_seen": 271375810, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.19299316, + "step": 12582, + "time_per_iteration": 2.8719866275787354 + }, + { + "auxiliary_loss_clip": 0.01188436, + "auxiliary_loss_mlp": 0.0101452, + "balance_loss_clip": 1.10093331, + "balance_loss_mlp": 0.9977358, + "epoch": 0.7565308883210582, + "flos": 48553180485120.0, + "grad_norm": 0.9470573987589211, + "language_loss": 0.60659432, + "learning_rate": 5.901304904471307e-07, + "loss": 0.62862384, + "num_input_tokens_seen": 271424775, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.16796875, + "step": 12583, + "time_per_iteration": 3.091330051422119 + }, + { + "auxiliary_loss_clip": 0.01418013, + "auxiliary_loss_mlp": 0.01034682, + "balance_loss_clip": 1.25577903, + "balance_loss_mlp": 1.01594198, + "epoch": 0.7565910115737261, + "flos": 12502725010560.0, + "grad_norm": 2.158671006752946, + "language_loss": 0.80159295, + "learning_rate": 5.898542828535125e-07, + "loss": 0.8261199, + "num_input_tokens_seen": 271440500, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18737793, + "step": 12584, + "time_per_iteration": 2.8028736114501953 + }, + { + "auxiliary_loss_clip": 0.01411763, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.2528497, + "balance_loss_mlp": 1.01347625, + "epoch": 0.7566511348263941, + "flos": 21181419936000.0, + "grad_norm": 1.8055809347775813, + "language_loss": 0.78339148, + "learning_rate": 5.895781287327612e-07, + "loss": 0.8078258, + "num_input_tokens_seen": 271458180, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18188477, + "step": 12585, + "time_per_iteration": 2.8593153953552246 + }, + { + "auxiliary_loss_clip": 0.01433128, + "auxiliary_loss_mlp": 0.01039158, + "balance_loss_clip": 1.26749468, + "balance_loss_mlp": 1.01939285, + "epoch": 0.756711258079062, + "flos": 21762889850880.0, + "grad_norm": 1.835518427174297, + "language_loss": 0.84419775, + "learning_rate": 5.893020280953493e-07, + "loss": 0.86892056, + "num_input_tokens_seen": 271475730, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.19750977, + "step": 12586, + "time_per_iteration": 2.8272526264190674 + }, + { + "auxiliary_loss_clip": 0.01420079, + "auxiliary_loss_mlp": 0.01033158, + "balance_loss_clip": 1.25383985, + "balance_loss_mlp": 1.0150975, + "epoch": 0.75677138133173, + "flos": 22393160922240.0, + "grad_norm": 2.062512645665064, + "language_loss": 0.84396672, + "learning_rate": 5.890259809517459e-07, + "loss": 0.86849916, + "num_input_tokens_seen": 271495030, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.18066406, + "step": 12587, + "time_per_iteration": 2.8536458015441895 + }, + { + "auxiliary_loss_clip": 0.01407588, + "auxiliary_loss_mlp": 0.0103027, + "balance_loss_clip": 1.2455579, + "balance_loss_mlp": 1.0115304, + "epoch": 0.756831504584398, + "flos": 22718716392960.0, + "grad_norm": 1.603572541410736, + "language_loss": 0.71718216, + "learning_rate": 5.88749987312418e-07, + "loss": 0.74156082, + "num_input_tokens_seen": 271515355, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18737793, + "step": 12588, + "time_per_iteration": 2.88269305229187 + }, + { + "auxiliary_loss_clip": 0.01431615, + "auxiliary_loss_mlp": 0.0103603, + "balance_loss_clip": 1.26513934, + "balance_loss_mlp": 1.01605082, + "epoch": 0.756891627837066, + "flos": 24109220160000.0, + "grad_norm": 1.8144591871861817, + "language_loss": 0.69924521, + "learning_rate": 5.884740471878327e-07, + "loss": 0.72392166, + "num_input_tokens_seen": 271535090, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.1998291, + "step": 12589, + "time_per_iteration": 2.854109764099121 + }, + { + "auxiliary_loss_clip": 0.01405298, + "auxiliary_loss_mlp": 0.01031442, + "balance_loss_clip": 1.24426627, + "balance_loss_mlp": 1.01182008, + "epoch": 0.756951751089734, + "flos": 19756774327680.0, + "grad_norm": 1.7334953193335612, + "language_loss": 0.92790139, + "learning_rate": 5.881981605884522e-07, + "loss": 0.95226872, + "num_input_tokens_seen": 271551075, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.19616699, + "step": 12590, + "time_per_iteration": 2.8259172439575195 + }, + { + "auxiliary_loss_clip": 0.01408252, + "auxiliary_loss_mlp": 0.01030312, + "balance_loss_clip": 1.24747944, + "balance_loss_mlp": 1.0117631, + "epoch": 0.7570118743424019, + "flos": 35092638161280.0, + "grad_norm": 1.832671500446489, + "language_loss": 0.66461241, + "learning_rate": 5.879223275247391e-07, + "loss": 0.6889981, + "num_input_tokens_seen": 271571035, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18554688, + "step": 12591, + "time_per_iteration": 3.029036045074463 + }, + { + "auxiliary_loss_clip": 0.01415874, + "auxiliary_loss_mlp": 0.0102865, + "balance_loss_clip": 1.25588071, + "balance_loss_mlp": 1.01155531, + "epoch": 0.7570719975950699, + "flos": 25605859524480.0, + "grad_norm": 1.4576266971863459, + "language_loss": 0.74319935, + "learning_rate": 5.876465480071528e-07, + "loss": 0.76764458, + "num_input_tokens_seen": 271592950, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.17077637, + "step": 12592, + "time_per_iteration": 2.953808307647705 + }, + { + "auxiliary_loss_clip": 0.01418678, + "auxiliary_loss_mlp": 0.01036869, + "balance_loss_clip": 1.25456643, + "balance_loss_mlp": 1.01891637, + "epoch": 0.7571321208477378, + "flos": 10823296078080.0, + "grad_norm": 2.8871062127715357, + "language_loss": 0.72134143, + "learning_rate": 5.873708220461522e-07, + "loss": 0.74589688, + "num_input_tokens_seen": 271608835, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.17956543, + "step": 12593, + "time_per_iteration": 2.8308708667755127 + }, + { + "auxiliary_loss_clip": 0.01431902, + "auxiliary_loss_mlp": 0.01031744, + "balance_loss_clip": 1.26540351, + "balance_loss_mlp": 1.01315904, + "epoch": 0.7571922441004059, + "flos": 18269319657600.0, + "grad_norm": 15.552558455580714, + "language_loss": 0.67777169, + "learning_rate": 5.870951496521903e-07, + "loss": 0.70240819, + "num_input_tokens_seen": 271627730, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.18603516, + "step": 12594, + "time_per_iteration": 2.847644567489624 + }, + { + "auxiliary_loss_clip": 0.01433819, + "auxiliary_loss_mlp": 0.01039057, + "balance_loss_clip": 1.26753092, + "balance_loss_mlp": 1.02057922, + "epoch": 0.7572523673530738, + "flos": 22900058127360.0, + "grad_norm": 1.6079319046225837, + "language_loss": 0.81031311, + "learning_rate": 5.86819530835722e-07, + "loss": 0.83504188, + "num_input_tokens_seen": 271646415, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.18481445, + "step": 12595, + "time_per_iteration": 2.905236005783081 + }, + { + "auxiliary_loss_clip": 0.01405531, + "auxiliary_loss_mlp": 0.01034436, + "balance_loss_clip": 1.24433386, + "balance_loss_mlp": 1.01569593, + "epoch": 0.7573124906057418, + "flos": 21006186249600.0, + "grad_norm": 6.383171596471471, + "language_loss": 0.72473657, + "learning_rate": 5.865439656071993e-07, + "loss": 0.74913621, + "num_input_tokens_seen": 271666240, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18737793, + "step": 12596, + "time_per_iteration": 2.8789401054382324 + }, + { + "auxiliary_loss_clip": 0.01396539, + "auxiliary_loss_mlp": 0.01030309, + "balance_loss_clip": 1.23849642, + "balance_loss_mlp": 1.0126183, + "epoch": 0.7573726138584097, + "flos": 20895978620160.0, + "grad_norm": 1.5238765871205333, + "language_loss": 0.81258017, + "learning_rate": 5.862684539770706e-07, + "loss": 0.83684862, + "num_input_tokens_seen": 271686370, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.17700195, + "step": 12597, + "time_per_iteration": 2.8578648567199707 + }, + { + "auxiliary_loss_clip": 0.01434192, + "auxiliary_loss_mlp": 0.01036017, + "balance_loss_clip": 1.2681551, + "balance_loss_mlp": 1.01712215, + "epoch": 0.7574327371110777, + "flos": 24540006332160.0, + "grad_norm": 9.088442408206694, + "language_loss": 0.83413863, + "learning_rate": 5.859929959557835e-07, + "loss": 0.85884082, + "num_input_tokens_seen": 271705050, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.18908691, + "step": 12598, + "time_per_iteration": 2.8461194038391113 + }, + { + "auxiliary_loss_clip": 0.01408118, + "auxiliary_loss_mlp": 0.0103183, + "balance_loss_clip": 1.24752402, + "balance_loss_mlp": 1.01372206, + "epoch": 0.7574928603637456, + "flos": 23374125590400.0, + "grad_norm": 1.6021123016321979, + "language_loss": 0.63609302, + "learning_rate": 5.857175915537845e-07, + "loss": 0.66049254, + "num_input_tokens_seen": 271724915, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18115234, + "step": 12599, + "time_per_iteration": 2.8466861248016357 + }, + { + "auxiliary_loss_clip": 0.01423093, + "auxiliary_loss_mlp": 0.01034588, + "balance_loss_clip": 1.25511897, + "balance_loss_mlp": 1.01471579, + "epoch": 0.7575529836164137, + "flos": 13524075302400.0, + "grad_norm": 3.5232209373469736, + "language_loss": 0.64941937, + "learning_rate": 5.854422407815161e-07, + "loss": 0.67399615, + "num_input_tokens_seen": 271742410, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.19885254, + "step": 12600, + "time_per_iteration": 4.217883586883545 + }, + { + "auxiliary_loss_clip": 0.01404703, + "auxiliary_loss_mlp": 0.01035342, + "balance_loss_clip": 1.2460382, + "balance_loss_mlp": 1.01684093, + "epoch": 0.7576131068690816, + "flos": 19656158595840.0, + "grad_norm": 1.6662831224253276, + "language_loss": 0.66432536, + "learning_rate": 5.851669436494191e-07, + "loss": 0.68872577, + "num_input_tokens_seen": 271761425, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18505859, + "step": 12601, + "time_per_iteration": 2.839223861694336 + }, + { + "auxiliary_loss_clip": 0.01407604, + "auxiliary_loss_mlp": 0.01032084, + "balance_loss_clip": 1.2476294, + "balance_loss_mlp": 1.01407146, + "epoch": 0.7576732301217496, + "flos": 20058006078720.0, + "grad_norm": 1.884894320662196, + "language_loss": 0.6865803, + "learning_rate": 5.848917001679335e-07, + "loss": 0.7109772, + "num_input_tokens_seen": 271780875, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18017578, + "step": 12602, + "time_per_iteration": 2.8498661518096924 + }, + { + "auxiliary_loss_clip": 0.01406306, + "auxiliary_loss_mlp": 0.01030001, + "balance_loss_clip": 1.244802, + "balance_loss_mlp": 1.0111537, + "epoch": 0.7577333533744176, + "flos": 15385388906880.0, + "grad_norm": 2.1628673986665565, + "language_loss": 0.67812526, + "learning_rate": 5.846165103474967e-07, + "loss": 0.70248842, + "num_input_tokens_seen": 271799490, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18847656, + "step": 12603, + "time_per_iteration": 2.7887380123138428 + }, + { + "auxiliary_loss_clip": 0.01393825, + "auxiliary_loss_mlp": 0.01028475, + "balance_loss_clip": 1.23481131, + "balance_loss_mlp": 1.01177382, + "epoch": 0.7577934766270855, + "flos": 17903876256000.0, + "grad_norm": 2.1908115006354505, + "language_loss": 0.62859905, + "learning_rate": 5.843413741985439e-07, + "loss": 0.65282202, + "num_input_tokens_seen": 271817040, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.16687012, + "step": 12604, + "time_per_iteration": 2.830157995223999 + }, + { + "auxiliary_loss_clip": 0.01411384, + "auxiliary_loss_mlp": 0.01036545, + "balance_loss_clip": 1.25124002, + "balance_loss_mlp": 1.01738763, + "epoch": 0.7578535998797535, + "flos": 21623064860160.0, + "grad_norm": 2.243013084775463, + "language_loss": 0.80394733, + "learning_rate": 5.840662917315076e-07, + "loss": 0.8284266, + "num_input_tokens_seen": 271835480, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19165039, + "step": 12605, + "time_per_iteration": 2.872981309890747 + }, + { + "auxiliary_loss_clip": 0.01418847, + "auxiliary_loss_mlp": 0.01032284, + "balance_loss_clip": 1.25395513, + "balance_loss_mlp": 1.01290107, + "epoch": 0.7579137231324214, + "flos": 18487472676480.0, + "grad_norm": 3.620458141928992, + "language_loss": 0.8128804, + "learning_rate": 5.837912629568198e-07, + "loss": 0.83739173, + "num_input_tokens_seen": 271849835, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19384766, + "step": 12606, + "time_per_iteration": 2.7820231914520264 + }, + { + "auxiliary_loss_clip": 0.01389476, + "auxiliary_loss_mlp": 0.01029994, + "balance_loss_clip": 1.23383343, + "balance_loss_mlp": 1.01312637, + "epoch": 0.7579738463850895, + "flos": 23264596632960.0, + "grad_norm": 1.33561557151199, + "language_loss": 0.73275709, + "learning_rate": 5.835162878849087e-07, + "loss": 0.75695181, + "num_input_tokens_seen": 271869560, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.16870117, + "step": 12607, + "time_per_iteration": 4.290786266326904 + }, + { + "auxiliary_loss_clip": 0.01423061, + "auxiliary_loss_mlp": 0.01033247, + "balance_loss_clip": 1.25587595, + "balance_loss_mlp": 1.01484084, + "epoch": 0.7580339696377574, + "flos": 14034954049920.0, + "grad_norm": 2.183159638557979, + "language_loss": 0.75489652, + "learning_rate": 5.83241366526202e-07, + "loss": 0.7794596, + "num_input_tokens_seen": 271887950, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.18395996, + "step": 12608, + "time_per_iteration": 2.8395910263061523 + }, + { + "auxiliary_loss_clip": 0.01401465, + "auxiliary_loss_mlp": 0.01033374, + "balance_loss_clip": 1.24032211, + "balance_loss_mlp": 1.01495647, + "epoch": 0.7580940928904254, + "flos": 25093713922560.0, + "grad_norm": 1.5391926009607988, + "language_loss": 0.72265172, + "learning_rate": 5.829664988911245e-07, + "loss": 0.7470001, + "num_input_tokens_seen": 271907700, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18432617, + "step": 12609, + "time_per_iteration": 2.865010976791382 + }, + { + "auxiliary_loss_clip": 0.01410967, + "auxiliary_loss_mlp": 0.01031744, + "balance_loss_clip": 1.2490077, + "balance_loss_mlp": 1.01206255, + "epoch": 0.7581542161430933, + "flos": 23845523610240.0, + "grad_norm": 1.6925968741414323, + "language_loss": 0.81943715, + "learning_rate": 5.826916849901007e-07, + "loss": 0.84386426, + "num_input_tokens_seen": 271926840, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19665527, + "step": 12610, + "time_per_iteration": 2.8378775119781494 + }, + { + "auxiliary_loss_clip": 0.01435611, + "auxiliary_loss_mlp": 0.0103356, + "balance_loss_clip": 1.26855922, + "balance_loss_mlp": 1.01517797, + "epoch": 0.7582143393957613, + "flos": 22247227883520.0, + "grad_norm": 2.031526260121046, + "language_loss": 0.71286619, + "learning_rate": 5.824169248335488e-07, + "loss": 0.73755789, + "num_input_tokens_seen": 271946465, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.18371582, + "step": 12611, + "time_per_iteration": 2.819511890411377 + }, + { + "auxiliary_loss_clip": 0.0140783, + "auxiliary_loss_mlp": 0.01029029, + "balance_loss_clip": 1.24705386, + "balance_loss_mlp": 1.01015806, + "epoch": 0.7582744626484292, + "flos": 21116439123840.0, + "grad_norm": 1.5687099746473676, + "language_loss": 0.71315581, + "learning_rate": 5.821422184318893e-07, + "loss": 0.73752439, + "num_input_tokens_seen": 271967295, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.1887207, + "step": 12612, + "time_per_iteration": 5.584877014160156 + }, + { + "auxiliary_loss_clip": 0.01413673, + "auxiliary_loss_mlp": 0.01034188, + "balance_loss_clip": 1.24919796, + "balance_loss_mlp": 1.01549625, + "epoch": 0.7583345859010973, + "flos": 24614624286720.0, + "grad_norm": 1.5038917366981355, + "language_loss": 0.6041292, + "learning_rate": 5.818675657955397e-07, + "loss": 0.62860781, + "num_input_tokens_seen": 271987960, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.18688965, + "step": 12613, + "time_per_iteration": 2.8911592960357666 + }, + { + "auxiliary_loss_clip": 0.01417145, + "auxiliary_loss_mlp": 0.01035806, + "balance_loss_clip": 1.25414324, + "balance_loss_mlp": 1.0178535, + "epoch": 0.7583947091537652, + "flos": 33559775694720.0, + "grad_norm": 1.4761006788361861, + "language_loss": 0.60985392, + "learning_rate": 5.815929669349135e-07, + "loss": 0.63438344, + "num_input_tokens_seen": 272011780, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.17956543, + "step": 12614, + "time_per_iteration": 2.9609375 + }, + { + "auxiliary_loss_clip": 0.01423706, + "auxiliary_loss_mlp": 0.01032827, + "balance_loss_clip": 1.25789571, + "balance_loss_mlp": 1.01470721, + "epoch": 0.7584548324064332, + "flos": 20130723751680.0, + "grad_norm": 4.767740383882963, + "language_loss": 0.73572576, + "learning_rate": 5.813184218604246e-07, + "loss": 0.7602911, + "num_input_tokens_seen": 272030825, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.18127441, + "step": 12615, + "time_per_iteration": 2.8483150005340576 + }, + { + "auxiliary_loss_clip": 0.01194399, + "auxiliary_loss_mlp": 0.01024854, + "balance_loss_clip": 1.10613525, + "balance_loss_mlp": 1.00272834, + "epoch": 0.7585149556591012, + "flos": 70435915107840.0, + "grad_norm": 0.8077764123816247, + "language_loss": 0.67803764, + "learning_rate": 5.810439305824828e-07, + "loss": 0.70023012, + "num_input_tokens_seen": 272095825, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.22167969, + "step": 12616, + "time_per_iteration": 3.4209842681884766 + }, + { + "auxiliary_loss_clip": 0.01425909, + "auxiliary_loss_mlp": 0.01031317, + "balance_loss_clip": 1.26133502, + "balance_loss_mlp": 1.01252961, + "epoch": 0.7585750789117691, + "flos": 16152408322560.0, + "grad_norm": 2.0661337867857292, + "language_loss": 0.84972751, + "learning_rate": 5.807694931114979e-07, + "loss": 0.87429976, + "num_input_tokens_seen": 272113950, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.18786621, + "step": 12617, + "time_per_iteration": 2.803428888320923 + }, + { + "auxiliary_loss_clip": 0.01411987, + "auxiliary_loss_mlp": 0.01030965, + "balance_loss_clip": 1.24891305, + "balance_loss_mlp": 1.01297665, + "epoch": 0.7586352021644371, + "flos": 17501983528320.0, + "grad_norm": 2.275970480764086, + "language_loss": 0.75665677, + "learning_rate": 5.804951094578757e-07, + "loss": 0.78108627, + "num_input_tokens_seen": 272130315, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.17993164, + "step": 12618, + "time_per_iteration": 2.829324245452881 + }, + { + "auxiliary_loss_clip": 0.01435735, + "auxiliary_loss_mlp": 0.01031202, + "balance_loss_clip": 1.26617646, + "balance_loss_mlp": 1.01200879, + "epoch": 0.758695325417105, + "flos": 17284192467840.0, + "grad_norm": 2.1944487957927, + "language_loss": 0.77965081, + "learning_rate": 5.802207796320209e-07, + "loss": 0.80432022, + "num_input_tokens_seen": 272149080, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.19189453, + "step": 12619, + "time_per_iteration": 2.899750232696533 + }, + { + "auxiliary_loss_clip": 0.01398546, + "auxiliary_loss_mlp": 0.01035439, + "balance_loss_clip": 1.23962569, + "balance_loss_mlp": 1.01734316, + "epoch": 0.7587554486697731, + "flos": 29507249514240.0, + "grad_norm": 3.2161316892126184, + "language_loss": 0.83359545, + "learning_rate": 5.79946503644337e-07, + "loss": 0.85793531, + "num_input_tokens_seen": 272168285, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18103027, + "step": 12620, + "time_per_iteration": 2.9023165702819824 + }, + { + "auxiliary_loss_clip": 0.01417457, + "auxiliary_loss_mlp": 0.01034451, + "balance_loss_clip": 1.25093079, + "balance_loss_mlp": 1.01468611, + "epoch": 0.758815571922441, + "flos": 16107724442880.0, + "grad_norm": 2.0221723286571134, + "language_loss": 0.83216965, + "learning_rate": 5.796722815052242e-07, + "loss": 0.85668874, + "num_input_tokens_seen": 272184585, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.19775391, + "step": 12621, + "time_per_iteration": 2.8102781772613525 + }, + { + "auxiliary_loss_clip": 0.01403691, + "auxiliary_loss_mlp": 0.01035081, + "balance_loss_clip": 1.24350095, + "balance_loss_mlp": 1.01625848, + "epoch": 0.758875695175109, + "flos": 16152317832960.0, + "grad_norm": 2.292377578030205, + "language_loss": 0.74763656, + "learning_rate": 5.7939811322508e-07, + "loss": 0.77202421, + "num_input_tokens_seen": 272200205, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18823242, + "step": 12622, + "time_per_iteration": 2.883671998977661 + }, + { + "auxiliary_loss_clip": 0.01191752, + "auxiliary_loss_mlp": 0.01023299, + "balance_loss_clip": 1.1036787, + "balance_loss_mlp": 0.99831259, + "epoch": 0.7589358184277769, + "flos": 68493061094400.0, + "grad_norm": 0.8138841223924056, + "language_loss": 0.60898542, + "learning_rate": 5.791239988143024e-07, + "loss": 0.63113594, + "num_input_tokens_seen": 272259670, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.25, + "step": 12623, + "time_per_iteration": 3.3693575859069824 + }, + { + "auxiliary_loss_clip": 0.01408603, + "auxiliary_loss_mlp": 0.01035191, + "balance_loss_clip": 1.24982953, + "balance_loss_mlp": 1.01641536, + "epoch": 0.7589959416804449, + "flos": 20056829713920.0, + "grad_norm": 2.952302849139921, + "language_loss": 0.68223, + "learning_rate": 5.788499382832847e-07, + "loss": 0.70666796, + "num_input_tokens_seen": 272277925, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18774414, + "step": 12624, + "time_per_iteration": 2.8182222843170166 + }, + { + "auxiliary_loss_clip": 0.01399432, + "auxiliary_loss_mlp": 0.01028251, + "balance_loss_clip": 1.23936534, + "balance_loss_mlp": 1.00872445, + "epoch": 0.7590560649331128, + "flos": 18781374769920.0, + "grad_norm": 1.847920696469423, + "language_loss": 0.76990879, + "learning_rate": 5.785759316424196e-07, + "loss": 0.79418558, + "num_input_tokens_seen": 272296010, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.1953125, + "step": 12625, + "time_per_iteration": 2.8462681770324707 + }, + { + "auxiliary_loss_clip": 0.01402313, + "auxiliary_loss_mlp": 0.01036267, + "balance_loss_clip": 1.24349689, + "balance_loss_mlp": 1.01770568, + "epoch": 0.7591161881857809, + "flos": 29837284220160.0, + "grad_norm": 2.5943295346450657, + "language_loss": 0.64041746, + "learning_rate": 5.783019789020977e-07, + "loss": 0.66480321, + "num_input_tokens_seen": 272318330, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18554688, + "step": 12626, + "time_per_iteration": 2.9676458835601807 + }, + { + "auxiliary_loss_clip": 0.01417785, + "auxiliary_loss_mlp": 0.01040903, + "balance_loss_clip": 1.25352311, + "balance_loss_mlp": 1.02118576, + "epoch": 0.7591763114384488, + "flos": 20312291710080.0, + "grad_norm": 1.8999620641309276, + "language_loss": 0.74419069, + "learning_rate": 5.780280800727084e-07, + "loss": 0.76877755, + "num_input_tokens_seen": 272335265, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19726562, + "step": 12627, + "time_per_iteration": 2.849762439727783 + }, + { + "auxiliary_loss_clip": 0.01416165, + "auxiliary_loss_mlp": 0.0103943, + "balance_loss_clip": 1.25345445, + "balance_loss_mlp": 1.0203799, + "epoch": 0.7592364346911168, + "flos": 20823351436800.0, + "grad_norm": 6.467921756175341, + "language_loss": 0.69990492, + "learning_rate": 5.777542351646356e-07, + "loss": 0.7244609, + "num_input_tokens_seen": 272354795, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19030762, + "step": 12628, + "time_per_iteration": 2.8572897911071777 + }, + { + "auxiliary_loss_clip": 0.01439298, + "auxiliary_loss_mlp": 0.01034994, + "balance_loss_clip": 1.26936245, + "balance_loss_mlp": 1.01488304, + "epoch": 0.7592965579437848, + "flos": 21261376776960.0, + "grad_norm": 1.9266052835957244, + "language_loss": 0.64209324, + "learning_rate": 5.774804441882648e-07, + "loss": 0.66683614, + "num_input_tokens_seen": 272372875, + "router_z_loss_clip": 1.69921875, + "router_z_loss_mlp": 0.2010498, + "step": 12629, + "time_per_iteration": 2.8586268424987793 + }, + { + "auxiliary_loss_clip": 0.01398083, + "auxiliary_loss_mlp": 0.01029402, + "balance_loss_clip": 1.23957062, + "balance_loss_mlp": 1.01138914, + "epoch": 0.7593566811964527, + "flos": 26224774151040.0, + "grad_norm": 1.5249017991487936, + "language_loss": 0.78680682, + "learning_rate": 5.772067071539786e-07, + "loss": 0.81108171, + "num_input_tokens_seen": 272394715, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18017578, + "step": 12630, + "time_per_iteration": 2.893017053604126 + }, + { + "auxiliary_loss_clip": 0.0119059, + "auxiliary_loss_mlp": 0.01039483, + "balance_loss_clip": 1.10210133, + "balance_loss_mlp": 1.01592731, + "epoch": 0.7594168044491207, + "flos": 71269634638080.0, + "grad_norm": 0.8240228899201695, + "language_loss": 0.61506993, + "learning_rate": 5.769330240721562e-07, + "loss": 0.63737065, + "num_input_tokens_seen": 272458775, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.23535156, + "step": 12631, + "time_per_iteration": 3.3978114128112793 + }, + { + "auxiliary_loss_clip": 0.01434373, + "auxiliary_loss_mlp": 0.01035427, + "balance_loss_clip": 1.26620519, + "balance_loss_mlp": 1.01549518, + "epoch": 0.7594769277017887, + "flos": 26624178414720.0, + "grad_norm": 1.71696188408971, + "language_loss": 0.74681705, + "learning_rate": 5.766593949531767e-07, + "loss": 0.77151507, + "num_input_tokens_seen": 272479355, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.19946289, + "step": 12632, + "time_per_iteration": 2.931518077850342 + }, + { + "auxiliary_loss_clip": 0.01408864, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.24658537, + "balance_loss_mlp": 1.01443255, + "epoch": 0.7595370509544567, + "flos": 17603051708160.0, + "grad_norm": 1.8888811388528501, + "language_loss": 0.75602293, + "learning_rate": 5.763858198074154e-07, + "loss": 0.78043664, + "num_input_tokens_seen": 272493555, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.18078613, + "step": 12633, + "time_per_iteration": 2.8170325756073 + }, + { + "auxiliary_loss_clip": 0.01416247, + "auxiliary_loss_mlp": 0.01032444, + "balance_loss_clip": 1.25455427, + "balance_loss_mlp": 1.01459813, + "epoch": 0.7595971742071246, + "flos": 18011640666240.0, + "grad_norm": 1.9993429851470306, + "language_loss": 0.74354339, + "learning_rate": 5.76112298645246e-07, + "loss": 0.76803029, + "num_input_tokens_seen": 272508925, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.17858887, + "step": 12634, + "time_per_iteration": 2.8799753189086914 + }, + { + "auxiliary_loss_clip": 0.01417038, + "auxiliary_loss_mlp": 0.01035933, + "balance_loss_clip": 1.25525141, + "balance_loss_mlp": 1.01743126, + "epoch": 0.7596572974597926, + "flos": 28852111785600.0, + "grad_norm": 1.6470954075719269, + "language_loss": 0.65076387, + "learning_rate": 5.758388314770408e-07, + "loss": 0.67529356, + "num_input_tokens_seen": 272528805, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18505859, + "step": 12635, + "time_per_iteration": 4.389769792556763 + }, + { + "auxiliary_loss_clip": 0.01412397, + "auxiliary_loss_mlp": 0.01031045, + "balance_loss_clip": 1.24756336, + "balance_loss_mlp": 1.01198387, + "epoch": 0.7597174207124605, + "flos": 14290913738880.0, + "grad_norm": 1.9645075640377239, + "language_loss": 0.69538093, + "learning_rate": 5.7556541831317e-07, + "loss": 0.71981537, + "num_input_tokens_seen": 272546655, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19067383, + "step": 12636, + "time_per_iteration": 2.8139450550079346 + }, + { + "auxiliary_loss_clip": 0.01427563, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.26139307, + "balance_loss_mlp": 1.01466405, + "epoch": 0.7597775439651285, + "flos": 21698813934720.0, + "grad_norm": 1.937469722712686, + "language_loss": 0.8213262, + "learning_rate": 5.752920591640018e-07, + "loss": 0.84593928, + "num_input_tokens_seen": 272564010, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.1907959, + "step": 12637, + "time_per_iteration": 2.837063789367676 + }, + { + "auxiliary_loss_clip": 0.01407154, + "auxiliary_loss_mlp": 0.01032099, + "balance_loss_clip": 1.24376845, + "balance_loss_mlp": 1.01322818, + "epoch": 0.7598376672177964, + "flos": 36114983838720.0, + "grad_norm": 1.6908693189912851, + "language_loss": 0.67408586, + "learning_rate": 5.750187540399017e-07, + "loss": 0.69847846, + "num_input_tokens_seen": 272585840, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.18884277, + "step": 12638, + "time_per_iteration": 2.9468774795532227 + }, + { + "auxiliary_loss_clip": 0.01409595, + "auxiliary_loss_mlp": 0.01034084, + "balance_loss_clip": 1.24662983, + "balance_loss_mlp": 1.0143671, + "epoch": 0.7598977904704645, + "flos": 18341358658560.0, + "grad_norm": 2.030081164266426, + "language_loss": 0.66438717, + "learning_rate": 5.747455029512323e-07, + "loss": 0.688824, + "num_input_tokens_seen": 272602300, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19714355, + "step": 12639, + "time_per_iteration": 2.8283374309539795 + }, + { + "auxiliary_loss_clip": 0.0140942, + "auxiliary_loss_mlp": 0.01031282, + "balance_loss_clip": 1.24897373, + "balance_loss_mlp": 1.01313806, + "epoch": 0.7599579137231324, + "flos": 20202038835840.0, + "grad_norm": 2.0819240337110556, + "language_loss": 0.71192986, + "learning_rate": 5.744723059083572e-07, + "loss": 0.73633683, + "num_input_tokens_seen": 272619595, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18139648, + "step": 12640, + "time_per_iteration": 2.839646816253662 + }, + { + "auxiliary_loss_clip": 0.01426385, + "auxiliary_loss_mlp": 0.0103207, + "balance_loss_clip": 1.26039171, + "balance_loss_mlp": 1.01282978, + "epoch": 0.7600180369758004, + "flos": 24035552346240.0, + "grad_norm": 1.7040892854093315, + "language_loss": 0.67131305, + "learning_rate": 5.741991629216343e-07, + "loss": 0.69589764, + "num_input_tokens_seen": 272638825, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.19262695, + "step": 12641, + "time_per_iteration": 2.8556411266326904 + }, + { + "auxiliary_loss_clip": 0.01429756, + "auxiliary_loss_mlp": 0.01033842, + "balance_loss_clip": 1.26390696, + "balance_loss_mlp": 1.01500738, + "epoch": 0.7600781602284684, + "flos": 18998849116800.0, + "grad_norm": 2.788778984661137, + "language_loss": 0.68056262, + "learning_rate": 5.73926074001422e-07, + "loss": 0.70519859, + "num_input_tokens_seen": 272657240, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.18847656, + "step": 12642, + "time_per_iteration": 4.244689226150513 + }, + { + "auxiliary_loss_clip": 0.01404659, + "auxiliary_loss_mlp": 0.01031178, + "balance_loss_clip": 1.24614942, + "balance_loss_mlp": 1.01297462, + "epoch": 0.7601382834811363, + "flos": 26078614888320.0, + "grad_norm": 1.9190858013371213, + "language_loss": 0.76856124, + "learning_rate": 5.736530391580765e-07, + "loss": 0.79291958, + "num_input_tokens_seen": 272677520, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18200684, + "step": 12643, + "time_per_iteration": 2.8637728691101074 + }, + { + "auxiliary_loss_clip": 0.01415975, + "auxiliary_loss_mlp": 0.01034983, + "balance_loss_clip": 1.25047255, + "balance_loss_mlp": 1.01551664, + "epoch": 0.7601984067338043, + "flos": 18853685239680.0, + "grad_norm": 1.7150142868682605, + "language_loss": 0.79246938, + "learning_rate": 5.733800584019508e-07, + "loss": 0.81697899, + "num_input_tokens_seen": 272696770, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19445801, + "step": 12644, + "time_per_iteration": 2.8438901901245117 + }, + { + "auxiliary_loss_clip": 0.01415067, + "auxiliary_loss_mlp": 0.0103122, + "balance_loss_clip": 1.25175428, + "balance_loss_mlp": 1.01397085, + "epoch": 0.7602585299864723, + "flos": 24657588864000.0, + "grad_norm": 1.4730743984121433, + "language_loss": 0.8119247, + "learning_rate": 5.731071317433957e-07, + "loss": 0.83638757, + "num_input_tokens_seen": 272718340, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.17260742, + "step": 12645, + "time_per_iteration": 2.8678648471832275 + }, + { + "auxiliary_loss_clip": 0.01419995, + "auxiliary_loss_mlp": 0.01035535, + "balance_loss_clip": 1.25514841, + "balance_loss_mlp": 1.01594853, + "epoch": 0.7603186532391403, + "flos": 23852672288640.0, + "grad_norm": 1.8397127178949644, + "language_loss": 0.73452801, + "learning_rate": 5.728342591927611e-07, + "loss": 0.75908339, + "num_input_tokens_seen": 272739575, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19580078, + "step": 12646, + "time_per_iteration": 2.872244119644165 + }, + { + "auxiliary_loss_clip": 0.01404413, + "auxiliary_loss_mlp": 0.01031091, + "balance_loss_clip": 1.24367237, + "balance_loss_mlp": 1.01247001, + "epoch": 0.7603787764918082, + "flos": 22209964151040.0, + "grad_norm": 2.580906585253498, + "language_loss": 0.67885613, + "learning_rate": 5.725614407603949e-07, + "loss": 0.70321119, + "num_input_tokens_seen": 272758710, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18615723, + "step": 12647, + "time_per_iteration": 4.375565767288208 + }, + { + "auxiliary_loss_clip": 0.01189928, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.1019448, + "balance_loss_mlp": 1.0090332, + "epoch": 0.7604388997444762, + "flos": 54114290657280.0, + "grad_norm": 0.6747503241843853, + "language_loss": 0.48971501, + "learning_rate": 5.722886764566415e-07, + "loss": 0.51194304, + "num_input_tokens_seen": 272814855, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.23828125, + "step": 12648, + "time_per_iteration": 3.329648971557617 + }, + { + "auxiliary_loss_clip": 0.01403949, + "auxiliary_loss_mlp": 0.01032079, + "balance_loss_clip": 1.24527597, + "balance_loss_mlp": 1.01373243, + "epoch": 0.7604990229971441, + "flos": 19691205333120.0, + "grad_norm": 1.6894041442041148, + "language_loss": 0.77208334, + "learning_rate": 5.720159662918451e-07, + "loss": 0.79644364, + "num_input_tokens_seen": 272834400, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18359375, + "step": 12649, + "time_per_iteration": 2.988821506500244 + }, + { + "auxiliary_loss_clip": 0.01414204, + "auxiliary_loss_mlp": 0.01032626, + "balance_loss_clip": 1.2527324, + "balance_loss_mlp": 1.01374364, + "epoch": 0.7605591462498121, + "flos": 25238832554880.0, + "grad_norm": 1.513830844266189, + "language_loss": 0.69108212, + "learning_rate": 5.717433102763462e-07, + "loss": 0.71555042, + "num_input_tokens_seen": 272854760, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.1887207, + "step": 12650, + "time_per_iteration": 2.936736583709717 + }, + { + "auxiliary_loss_clip": 0.01190687, + "auxiliary_loss_mlp": 0.01031157, + "balance_loss_clip": 1.10297883, + "balance_loss_mlp": 1.00636184, + "epoch": 0.76061926950248, + "flos": 66814401323520.0, + "grad_norm": 0.752988453502928, + "language_loss": 0.62708414, + "learning_rate": 5.714707084204838e-07, + "loss": 0.64930254, + "num_input_tokens_seen": 272919030, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.24804688, + "step": 12651, + "time_per_iteration": 3.3325462341308594 + }, + { + "auxiliary_loss_clip": 0.01401934, + "auxiliary_loss_mlp": 0.01030122, + "balance_loss_clip": 1.24241209, + "balance_loss_mlp": 1.01201367, + "epoch": 0.7606793927551481, + "flos": 25349402142720.0, + "grad_norm": 4.328515868522785, + "language_loss": 0.71997809, + "learning_rate": 5.711981607345951e-07, + "loss": 0.74429864, + "num_input_tokens_seen": 272938925, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.1809082, + "step": 12652, + "time_per_iteration": 2.9037299156188965 + }, + { + "auxiliary_loss_clip": 0.01410814, + "auxiliary_loss_mlp": 0.01038879, + "balance_loss_clip": 1.24767518, + "balance_loss_mlp": 1.0191617, + "epoch": 0.760739516007816, + "flos": 18232825086720.0, + "grad_norm": 2.2887124960910747, + "language_loss": 0.80273175, + "learning_rate": 5.709256672290152e-07, + "loss": 0.82722867, + "num_input_tokens_seen": 272954945, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.19714355, + "step": 12653, + "time_per_iteration": 2.821322441101074 + }, + { + "auxiliary_loss_clip": 0.01422185, + "auxiliary_loss_mlp": 0.01033621, + "balance_loss_clip": 1.25606608, + "balance_loss_mlp": 1.01500106, + "epoch": 0.760799639260484, + "flos": 22567806426240.0, + "grad_norm": 1.5416680078023959, + "language_loss": 0.80905485, + "learning_rate": 5.706532279140785e-07, + "loss": 0.83361292, + "num_input_tokens_seen": 272972855, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.1862793, + "step": 12654, + "time_per_iteration": 2.9069652557373047 + }, + { + "auxiliary_loss_clip": 0.01414048, + "auxiliary_loss_mlp": 0.0103319, + "balance_loss_clip": 1.25030494, + "balance_loss_mlp": 1.01436675, + "epoch": 0.760859762513152, + "flos": 22319402618880.0, + "grad_norm": 2.1456540154096038, + "language_loss": 0.7971471, + "learning_rate": 5.703808428001136e-07, + "loss": 0.82161945, + "num_input_tokens_seen": 272989895, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.18823242, + "step": 12655, + "time_per_iteration": 2.8112499713897705 + }, + { + "auxiliary_loss_clip": 0.01403209, + "auxiliary_loss_mlp": 0.01028773, + "balance_loss_clip": 1.2440002, + "balance_loss_mlp": 1.01120138, + "epoch": 0.7609198857658199, + "flos": 24874791742080.0, + "grad_norm": 1.908968951095025, + "language_loss": 0.6924051, + "learning_rate": 5.701085118974505e-07, + "loss": 0.71672487, + "num_input_tokens_seen": 273011695, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.17565918, + "step": 12656, + "time_per_iteration": 2.91989803314209 + }, + { + "auxiliary_loss_clip": 0.01426977, + "auxiliary_loss_mlp": 0.01030034, + "balance_loss_clip": 1.25993109, + "balance_loss_mlp": 1.01111519, + "epoch": 0.760980009018488, + "flos": 16845533700480.0, + "grad_norm": 2.3775659643914793, + "language_loss": 0.743855, + "learning_rate": 5.698362352164164e-07, + "loss": 0.76842511, + "num_input_tokens_seen": 273028815, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.18933105, + "step": 12657, + "time_per_iteration": 2.840183973312378 + }, + { + "auxiliary_loss_clip": 0.01190766, + "auxiliary_loss_mlp": 0.01037213, + "balance_loss_clip": 1.10218787, + "balance_loss_mlp": 1.0114634, + "epoch": 0.7610401322711559, + "flos": 61257906120960.0, + "grad_norm": 0.8609407686553608, + "language_loss": 0.64843702, + "learning_rate": 5.695640127673347e-07, + "loss": 0.67071676, + "num_input_tokens_seen": 273084080, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.2578125, + "step": 12658, + "time_per_iteration": 3.283670663833618 + }, + { + "auxiliary_loss_clip": 0.01396216, + "auxiliary_loss_mlp": 0.0103272, + "balance_loss_clip": 1.23841739, + "balance_loss_mlp": 1.01337254, + "epoch": 0.7611002555238239, + "flos": 19648783693440.0, + "grad_norm": 1.8732084668414295, + "language_loss": 0.80090892, + "learning_rate": 5.692918445605293e-07, + "loss": 0.82519829, + "num_input_tokens_seen": 273102295, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.19335938, + "step": 12659, + "time_per_iteration": 2.852208137512207 + }, + { + "auxiliary_loss_clip": 0.01407401, + "auxiliary_loss_mlp": 0.01031679, + "balance_loss_clip": 1.24584961, + "balance_loss_mlp": 1.01307094, + "epoch": 0.7611603787764918, + "flos": 26884029156480.0, + "grad_norm": 1.5014736384359049, + "language_loss": 0.69320959, + "learning_rate": 5.690197306063209e-07, + "loss": 0.71760035, + "num_input_tokens_seen": 273123400, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18603516, + "step": 12660, + "time_per_iteration": 2.8886780738830566 + }, + { + "auxiliary_loss_clip": 0.014118, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.24881268, + "balance_loss_mlp": 1.01480997, + "epoch": 0.7612205020291598, + "flos": 27355608155520.0, + "grad_norm": 1.599430104310088, + "language_loss": 0.7083993, + "learning_rate": 5.687476709150281e-07, + "loss": 0.73285049, + "num_input_tokens_seen": 273145150, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.18518066, + "step": 12661, + "time_per_iteration": 2.9272115230560303 + }, + { + "auxiliary_loss_clip": 0.01408814, + "auxiliary_loss_mlp": 0.01032687, + "balance_loss_clip": 1.24676466, + "balance_loss_mlp": 1.01443624, + "epoch": 0.7612806252818277, + "flos": 29326495962240.0, + "grad_norm": 1.6645355091139196, + "language_loss": 0.84363729, + "learning_rate": 5.68475665496966e-07, + "loss": 0.86805236, + "num_input_tokens_seen": 273165180, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18237305, + "step": 12662, + "time_per_iteration": 2.9490137100219727 + }, + { + "auxiliary_loss_clip": 0.01412647, + "auxiliary_loss_mlp": 0.01039091, + "balance_loss_clip": 1.24981081, + "balance_loss_mlp": 1.02087557, + "epoch": 0.7613407485344957, + "flos": 19035388932480.0, + "grad_norm": 1.612728730211622, + "language_loss": 0.69444978, + "learning_rate": 5.682037143624505e-07, + "loss": 0.7189672, + "num_input_tokens_seen": 273184005, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.18212891, + "step": 12663, + "time_per_iteration": 2.8309402465820312 + }, + { + "auxiliary_loss_clip": 0.0140537, + "auxiliary_loss_mlp": 0.0102968, + "balance_loss_clip": 1.24534678, + "balance_loss_mlp": 1.01158428, + "epoch": 0.7614008717871636, + "flos": 23265863487360.0, + "grad_norm": 1.5169621458826605, + "language_loss": 0.70432025, + "learning_rate": 5.67931817521794e-07, + "loss": 0.72867072, + "num_input_tokens_seen": 273203565, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.1809082, + "step": 12664, + "time_per_iteration": 2.839362144470215 + }, + { + "auxiliary_loss_clip": 0.01428921, + "auxiliary_loss_mlp": 0.01041855, + "balance_loss_clip": 1.26227736, + "balance_loss_mlp": 1.02259052, + "epoch": 0.7614609950398317, + "flos": 21589873159680.0, + "grad_norm": 2.3163050921061616, + "language_loss": 0.79863906, + "learning_rate": 5.676599749853066e-07, + "loss": 0.82334685, + "num_input_tokens_seen": 273221645, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.19274902, + "step": 12665, + "time_per_iteration": 2.8316965103149414 + }, + { + "auxiliary_loss_clip": 0.01408739, + "auxiliary_loss_mlp": 0.010359, + "balance_loss_clip": 1.24912846, + "balance_loss_mlp": 1.01719642, + "epoch": 0.7615211182924996, + "flos": 29289729922560.0, + "grad_norm": 1.5987782550868137, + "language_loss": 0.88682783, + "learning_rate": 5.673881867632959e-07, + "loss": 0.91127425, + "num_input_tokens_seen": 273242040, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18688965, + "step": 12666, + "time_per_iteration": 2.9033985137939453 + }, + { + "auxiliary_loss_clip": 0.01417777, + "auxiliary_loss_mlp": 0.01032374, + "balance_loss_clip": 1.25522661, + "balance_loss_mlp": 1.01351476, + "epoch": 0.7615812415451676, + "flos": 13268930019840.0, + "grad_norm": 2.442720645794966, + "language_loss": 0.84415805, + "learning_rate": 5.671164528660693e-07, + "loss": 0.8686595, + "num_input_tokens_seen": 273257365, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.1887207, + "step": 12667, + "time_per_iteration": 2.80924391746521 + }, + { + "auxiliary_loss_clip": 0.01397138, + "auxiliary_loss_mlp": 0.01030486, + "balance_loss_clip": 1.2399857, + "balance_loss_mlp": 1.01345146, + "epoch": 0.7616413647978356, + "flos": 18593065336320.0, + "grad_norm": 1.7642901655144163, + "language_loss": 0.7934407, + "learning_rate": 5.668447733039296e-07, + "loss": 0.81771696, + "num_input_tokens_seen": 273274710, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.17041016, + "step": 12668, + "time_per_iteration": 2.8206419944763184 + }, + { + "auxiliary_loss_clip": 0.01403202, + "auxiliary_loss_mlp": 0.01032952, + "balance_loss_clip": 1.24262404, + "balance_loss_mlp": 1.01422405, + "epoch": 0.7617014880505035, + "flos": 18525641304960.0, + "grad_norm": 1.8031309787516752, + "language_loss": 0.6475516, + "learning_rate": 5.6657314808718e-07, + "loss": 0.67191315, + "num_input_tokens_seen": 273292870, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18725586, + "step": 12669, + "time_per_iteration": 2.8475418090820312 + }, + { + "auxiliary_loss_clip": 0.0141927, + "auxiliary_loss_mlp": 0.01038286, + "balance_loss_clip": 1.2544744, + "balance_loss_mlp": 1.01886702, + "epoch": 0.7617616113031715, + "flos": 24984049230720.0, + "grad_norm": 1.8533959803170799, + "language_loss": 0.66995114, + "learning_rate": 5.663015772261202e-07, + "loss": 0.69452667, + "num_input_tokens_seen": 273312375, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19421387, + "step": 12670, + "time_per_iteration": 2.872419834136963 + }, + { + "auxiliary_loss_clip": 0.01422903, + "auxiliary_loss_mlp": 0.01034896, + "balance_loss_clip": 1.25678051, + "balance_loss_mlp": 1.01447594, + "epoch": 0.7618217345558395, + "flos": 23305525194240.0, + "grad_norm": 2.87357185274531, + "language_loss": 0.73461282, + "learning_rate": 5.660300607310493e-07, + "loss": 0.75919092, + "num_input_tokens_seen": 273332590, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.20410156, + "step": 12671, + "time_per_iteration": 4.285600900650024 + }, + { + "auxiliary_loss_clip": 0.01401917, + "auxiliary_loss_mlp": 0.01033809, + "balance_loss_clip": 1.24187565, + "balance_loss_mlp": 1.01592731, + "epoch": 0.7618818578085075, + "flos": 25493615879040.0, + "grad_norm": 1.991022301907854, + "language_loss": 0.73671198, + "learning_rate": 5.657585986122613e-07, + "loss": 0.76106924, + "num_input_tokens_seen": 273352885, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.17871094, + "step": 12672, + "time_per_iteration": 2.9119551181793213 + }, + { + "auxiliary_loss_clip": 0.01186617, + "auxiliary_loss_mlp": 0.01024904, + "balance_loss_clip": 1.09929812, + "balance_loss_mlp": 1.00335121, + "epoch": 0.7619419810611754, + "flos": 61177813545600.0, + "grad_norm": 0.764569102270039, + "language_loss": 0.56760049, + "learning_rate": 5.654871908800506e-07, + "loss": 0.58971566, + "num_input_tokens_seen": 273411730, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.21582031, + "step": 12673, + "time_per_iteration": 3.310997724533081 + }, + { + "auxiliary_loss_clip": 0.01399427, + "auxiliary_loss_mlp": 0.0103568, + "balance_loss_clip": 1.23829269, + "balance_loss_mlp": 1.01673818, + "epoch": 0.7620021043138434, + "flos": 23269166357760.0, + "grad_norm": 2.0041517898950514, + "language_loss": 0.75374138, + "learning_rate": 5.652158375447102e-07, + "loss": 0.7780925, + "num_input_tokens_seen": 273430020, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18945312, + "step": 12674, + "time_per_iteration": 2.884979248046875 + }, + { + "auxiliary_loss_clip": 0.01401975, + "auxiliary_loss_mlp": 0.01033857, + "balance_loss_clip": 1.24338019, + "balance_loss_mlp": 1.01526022, + "epoch": 0.7620622275665113, + "flos": 25093351964160.0, + "grad_norm": 2.262420639250545, + "language_loss": 0.73137534, + "learning_rate": 5.649445386165286e-07, + "loss": 0.75573361, + "num_input_tokens_seen": 273448690, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18591309, + "step": 12675, + "time_per_iteration": 2.885685443878174 + }, + { + "auxiliary_loss_clip": 0.01413241, + "auxiliary_loss_mlp": 0.01031248, + "balance_loss_clip": 1.25309765, + "balance_loss_mlp": 1.01271129, + "epoch": 0.7621223508191793, + "flos": 20164096431360.0, + "grad_norm": 2.1045791581715676, + "language_loss": 0.72916526, + "learning_rate": 5.646732941057936e-07, + "loss": 0.75361013, + "num_input_tokens_seen": 273465190, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.1854248, + "step": 12676, + "time_per_iteration": 2.849888324737549 + }, + { + "auxiliary_loss_clip": 0.01426572, + "auxiliary_loss_mlp": 0.01036879, + "balance_loss_clip": 1.25770068, + "balance_loss_mlp": 1.01830602, + "epoch": 0.7621824740718472, + "flos": 18008156816640.0, + "grad_norm": 2.8232089680317025, + "language_loss": 0.54947722, + "learning_rate": 5.644021040227927e-07, + "loss": 0.5741117, + "num_input_tokens_seen": 273478620, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.18554688, + "step": 12677, + "time_per_iteration": 4.306567430496216 + }, + { + "auxiliary_loss_clip": 0.01411749, + "auxiliary_loss_mlp": 0.01034409, + "balance_loss_clip": 1.24972188, + "balance_loss_mlp": 1.01551402, + "epoch": 0.7622425973245153, + "flos": 21735625219200.0, + "grad_norm": 2.1486598294880905, + "language_loss": 0.7967158, + "learning_rate": 5.641309683778064e-07, + "loss": 0.82117736, + "num_input_tokens_seen": 273497635, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18884277, + "step": 12678, + "time_per_iteration": 2.8339550495147705 + }, + { + "auxiliary_loss_clip": 0.01406955, + "auxiliary_loss_mlp": 0.01034724, + "balance_loss_clip": 1.24427974, + "balance_loss_mlp": 1.01582944, + "epoch": 0.7623027205771832, + "flos": 19727880883200.0, + "grad_norm": 1.7605249262247913, + "language_loss": 0.78073406, + "learning_rate": 5.638598871811175e-07, + "loss": 0.80515087, + "num_input_tokens_seen": 273513955, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.18896484, + "step": 12679, + "time_per_iteration": 2.832075834274292 + }, + { + "auxiliary_loss_clip": 0.0140663, + "auxiliary_loss_mlp": 0.01030632, + "balance_loss_clip": 1.24339938, + "balance_loss_mlp": 1.01251233, + "epoch": 0.7623628438298512, + "flos": 23999645957760.0, + "grad_norm": 1.4562442673763532, + "language_loss": 0.80524266, + "learning_rate": 5.635888604430059e-07, + "loss": 0.82961524, + "num_input_tokens_seen": 273533970, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.18115234, + "step": 12680, + "time_per_iteration": 2.87117862701416 + }, + { + "auxiliary_loss_clip": 0.01413849, + "auxiliary_loss_mlp": 0.01032432, + "balance_loss_clip": 1.25283051, + "balance_loss_mlp": 1.01323938, + "epoch": 0.7624229670825191, + "flos": 22355761455360.0, + "grad_norm": 2.4494817700981923, + "language_loss": 0.6389221, + "learning_rate": 5.633178881737493e-07, + "loss": 0.66338491, + "num_input_tokens_seen": 273553090, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19189453, + "step": 12681, + "time_per_iteration": 2.821491241455078 + }, + { + "auxiliary_loss_clip": 0.01399796, + "auxiliary_loss_mlp": 0.01035028, + "balance_loss_clip": 1.24050367, + "balance_loss_mlp": 1.01314092, + "epoch": 0.7624830903351871, + "flos": 22722245487360.0, + "grad_norm": 2.565114427231154, + "language_loss": 0.77015972, + "learning_rate": 5.63046970383622e-07, + "loss": 0.79450798, + "num_input_tokens_seen": 273572460, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.21887207, + "step": 12682, + "time_per_iteration": 4.3697521686553955 + }, + { + "auxiliary_loss_clip": 0.01399986, + "auxiliary_loss_mlp": 0.01030373, + "balance_loss_clip": 1.24117351, + "balance_loss_mlp": 1.01249135, + "epoch": 0.7625432135878552, + "flos": 25604818894080.0, + "grad_norm": 1.54903505855677, + "language_loss": 0.68538773, + "learning_rate": 5.627761070828974e-07, + "loss": 0.70969129, + "num_input_tokens_seen": 273592815, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.17895508, + "step": 12683, + "time_per_iteration": 2.8910129070281982 + }, + { + "auxiliary_loss_clip": 0.01400332, + "auxiliary_loss_mlp": 0.01036216, + "balance_loss_clip": 1.23935294, + "balance_loss_mlp": 1.01692784, + "epoch": 0.7626033368405231, + "flos": 23998152879360.0, + "grad_norm": 2.3070600189162485, + "language_loss": 0.83782005, + "learning_rate": 5.625052982818472e-07, + "loss": 0.8621856, + "num_input_tokens_seen": 273611790, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19287109, + "step": 12684, + "time_per_iteration": 2.8531558513641357 + }, + { + "auxiliary_loss_clip": 0.01419215, + "auxiliary_loss_mlp": 0.01033776, + "balance_loss_clip": 1.25698888, + "balance_loss_mlp": 1.01429725, + "epoch": 0.7626634600931911, + "flos": 12605376758400.0, + "grad_norm": 2.2752651794423095, + "language_loss": 0.83569348, + "learning_rate": 5.622345439907396e-07, + "loss": 0.86022335, + "num_input_tokens_seen": 273628340, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19470215, + "step": 12685, + "time_per_iteration": 2.818706750869751 + }, + { + "auxiliary_loss_clip": 0.01415935, + "auxiliary_loss_mlp": 0.01029868, + "balance_loss_clip": 1.25334954, + "balance_loss_mlp": 1.011271, + "epoch": 0.762723583345859, + "flos": 26333669681280.0, + "grad_norm": 3.0395347593132294, + "language_loss": 0.77791262, + "learning_rate": 5.619638442198422e-07, + "loss": 0.80237067, + "num_input_tokens_seen": 273646585, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18603516, + "step": 12686, + "time_per_iteration": 2.8823633193969727 + }, + { + "auxiliary_loss_clip": 0.01408762, + "auxiliary_loss_mlp": 0.01035136, + "balance_loss_clip": 1.24430323, + "balance_loss_mlp": 1.01633716, + "epoch": 0.762783706598527, + "flos": 21916785974400.0, + "grad_norm": 1.689747716230049, + "language_loss": 0.7321105, + "learning_rate": 5.616931989794198e-07, + "loss": 0.75654948, + "num_input_tokens_seen": 273665410, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.18786621, + "step": 12687, + "time_per_iteration": 2.879981517791748 + }, + { + "auxiliary_loss_clip": 0.01411284, + "auxiliary_loss_mlp": 0.01034147, + "balance_loss_clip": 1.25065935, + "balance_loss_mlp": 1.01481104, + "epoch": 0.7628438298511949, + "flos": 15347853705600.0, + "grad_norm": 1.7555698991841302, + "language_loss": 0.65299356, + "learning_rate": 5.614226082797369e-07, + "loss": 0.67744792, + "num_input_tokens_seen": 273683035, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.19335938, + "step": 12688, + "time_per_iteration": 2.8217406272888184 + }, + { + "auxiliary_loss_clip": 0.01400132, + "auxiliary_loss_mlp": 0.01029352, + "balance_loss_clip": 1.2415632, + "balance_loss_mlp": 1.01081514, + "epoch": 0.7629039531038629, + "flos": 13014191940480.0, + "grad_norm": 1.8453518394461212, + "language_loss": 0.71569264, + "learning_rate": 5.611520721310515e-07, + "loss": 0.73998749, + "num_input_tokens_seen": 273700130, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18530273, + "step": 12689, + "time_per_iteration": 2.8758504390716553 + }, + { + "auxiliary_loss_clip": 0.01428654, + "auxiliary_loss_mlp": 0.01032494, + "balance_loss_clip": 1.26175499, + "balance_loss_mlp": 1.01376617, + "epoch": 0.7629640763565309, + "flos": 26181402370560.0, + "grad_norm": 1.7041280321196504, + "language_loss": 0.7103675, + "learning_rate": 5.608815905436238e-07, + "loss": 0.73497903, + "num_input_tokens_seen": 273720310, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.18737793, + "step": 12690, + "time_per_iteration": 2.9213674068450928 + }, + { + "auxiliary_loss_clip": 0.01406999, + "auxiliary_loss_mlp": 0.01030082, + "balance_loss_clip": 1.24590039, + "balance_loss_mlp": 1.01239157, + "epoch": 0.7630241996091989, + "flos": 36807430544640.0, + "grad_norm": 1.4828477984733357, + "language_loss": 0.70401013, + "learning_rate": 5.606111635277109e-07, + "loss": 0.72838092, + "num_input_tokens_seen": 273744475, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.17675781, + "step": 12691, + "time_per_iteration": 2.9760804176330566 + }, + { + "auxiliary_loss_clip": 0.01411939, + "auxiliary_loss_mlp": 0.01030251, + "balance_loss_clip": 1.25025344, + "balance_loss_mlp": 1.01283431, + "epoch": 0.7630843228618668, + "flos": 21845154176640.0, + "grad_norm": 1.669100928945534, + "language_loss": 0.828336, + "learning_rate": 5.603407910935662e-07, + "loss": 0.85275793, + "num_input_tokens_seen": 273764635, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.17431641, + "step": 12692, + "time_per_iteration": 2.873231887817383 + }, + { + "auxiliary_loss_clip": 0.01418776, + "auxiliary_loss_mlp": 0.01035649, + "balance_loss_clip": 1.25605297, + "balance_loss_mlp": 1.01745796, + "epoch": 0.7631444461145348, + "flos": 12648069866880.0, + "grad_norm": 5.7568399503158485, + "language_loss": 0.78128642, + "learning_rate": 5.600704732514438e-07, + "loss": 0.80583066, + "num_input_tokens_seen": 273780115, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.18188477, + "step": 12693, + "time_per_iteration": 2.9675512313842773 + }, + { + "auxiliary_loss_clip": 0.01427037, + "auxiliary_loss_mlp": 0.01037003, + "balance_loss_clip": 1.26184702, + "balance_loss_mlp": 1.01698744, + "epoch": 0.7632045693672027, + "flos": 16845307476480.0, + "grad_norm": 2.2406780430554982, + "language_loss": 0.73821074, + "learning_rate": 5.598002100115933e-07, + "loss": 0.76285112, + "num_input_tokens_seen": 273796605, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.20007324, + "step": 12694, + "time_per_iteration": 2.8677423000335693 + }, + { + "auxiliary_loss_clip": 0.01413751, + "auxiliary_loss_mlp": 0.01029184, + "balance_loss_clip": 1.25238204, + "balance_loss_mlp": 1.01084971, + "epoch": 0.7632646926198707, + "flos": 22027310317440.0, + "grad_norm": 2.666657533146488, + "language_loss": 0.70972693, + "learning_rate": 5.595300013842625e-07, + "loss": 0.73415631, + "num_input_tokens_seen": 273816515, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.18334961, + "step": 12695, + "time_per_iteration": 2.839151620864868 + }, + { + "auxiliary_loss_clip": 0.01406595, + "auxiliary_loss_mlp": 0.01030387, + "balance_loss_clip": 1.24585319, + "balance_loss_mlp": 1.01208818, + "epoch": 0.7633248158725388, + "flos": 23124816887040.0, + "grad_norm": 2.445171762730224, + "language_loss": 0.73227024, + "learning_rate": 5.592598473796985e-07, + "loss": 0.75664002, + "num_input_tokens_seen": 273837060, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.18273926, + "step": 12696, + "time_per_iteration": 2.880070447921753 + }, + { + "auxiliary_loss_clip": 0.0141556, + "auxiliary_loss_mlp": 0.01031094, + "balance_loss_clip": 1.25192583, + "balance_loss_mlp": 1.01192498, + "epoch": 0.7633849391252067, + "flos": 10897099626240.0, + "grad_norm": 2.0647433863501523, + "language_loss": 0.72334319, + "learning_rate": 5.589897480081453e-07, + "loss": 0.74780977, + "num_input_tokens_seen": 273853365, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19165039, + "step": 12697, + "time_per_iteration": 2.8222146034240723 + }, + { + "auxiliary_loss_clip": 0.0140338, + "auxiliary_loss_mlp": 0.01030609, + "balance_loss_clip": 1.24505877, + "balance_loss_mlp": 1.01160693, + "epoch": 0.7634450623778747, + "flos": 21003471561600.0, + "grad_norm": 2.3135528913712426, + "language_loss": 0.67914712, + "learning_rate": 5.587197032798461e-07, + "loss": 0.70348704, + "num_input_tokens_seen": 273870750, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18994141, + "step": 12698, + "time_per_iteration": 2.8458311557769775 + }, + { + "auxiliary_loss_clip": 0.01403996, + "auxiliary_loss_mlp": 0.01030764, + "balance_loss_clip": 1.24168444, + "balance_loss_mlp": 1.01128483, + "epoch": 0.7635051856305426, + "flos": 18891853868160.0, + "grad_norm": 2.104630723177676, + "language_loss": 0.73216397, + "learning_rate": 5.5844971320504e-07, + "loss": 0.75651157, + "num_input_tokens_seen": 273890890, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19470215, + "step": 12699, + "time_per_iteration": 2.8765902519226074 + }, + { + "auxiliary_loss_clip": 0.01392429, + "auxiliary_loss_mlp": 0.01032634, + "balance_loss_clip": 1.23492348, + "balance_loss_mlp": 1.01440716, + "epoch": 0.7635653088832106, + "flos": 34800183901440.0, + "grad_norm": 2.138823316965592, + "language_loss": 0.74145281, + "learning_rate": 5.581797777939648e-07, + "loss": 0.76570344, + "num_input_tokens_seen": 273914015, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18249512, + "step": 12700, + "time_per_iteration": 2.9625439643859863 + }, + { + "auxiliary_loss_clip": 0.01407248, + "auxiliary_loss_mlp": 0.01034846, + "balance_loss_clip": 1.24517226, + "balance_loss_mlp": 1.01621389, + "epoch": 0.7636254321358785, + "flos": 23187037766400.0, + "grad_norm": 2.601566684221268, + "language_loss": 0.69795811, + "learning_rate": 5.579098970568574e-07, + "loss": 0.72237903, + "num_input_tokens_seen": 273927415, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18640137, + "step": 12701, + "time_per_iteration": 2.8201279640197754 + }, + { + "auxiliary_loss_clip": 0.01403601, + "auxiliary_loss_mlp": 0.01037622, + "balance_loss_clip": 1.24260402, + "balance_loss_mlp": 1.01907349, + "epoch": 0.7636855553885465, + "flos": 21335316059520.0, + "grad_norm": 1.923229713149261, + "language_loss": 0.65403557, + "learning_rate": 5.576400710039508e-07, + "loss": 0.67844784, + "num_input_tokens_seen": 273946690, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18530273, + "step": 12702, + "time_per_iteration": 2.849137544631958 + }, + { + "auxiliary_loss_clip": 0.01414866, + "auxiliary_loss_mlp": 0.01035776, + "balance_loss_clip": 1.2514112, + "balance_loss_mlp": 1.01647592, + "epoch": 0.7637456786412145, + "flos": 28669865155200.0, + "grad_norm": 1.9799337398349477, + "language_loss": 0.66483706, + "learning_rate": 5.57370299645477e-07, + "loss": 0.68934345, + "num_input_tokens_seen": 273966870, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19299316, + "step": 12703, + "time_per_iteration": 2.9443461894989014 + }, + { + "auxiliary_loss_clip": 0.01408644, + "auxiliary_loss_mlp": 0.0102842, + "balance_loss_clip": 1.24712467, + "balance_loss_mlp": 1.01039577, + "epoch": 0.7638058018938825, + "flos": 21917057443200.0, + "grad_norm": 1.9344879909988062, + "language_loss": 0.84562081, + "learning_rate": 5.571005829916668e-07, + "loss": 0.86999154, + "num_input_tokens_seen": 273986360, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18017578, + "step": 12704, + "time_per_iteration": 2.845228910446167 + }, + { + "auxiliary_loss_clip": 0.01404236, + "auxiliary_loss_mlp": 0.01033544, + "balance_loss_clip": 1.24338686, + "balance_loss_mlp": 1.01488793, + "epoch": 0.7638659251465504, + "flos": 29656123464960.0, + "grad_norm": 1.7550061312769576, + "language_loss": 0.68241131, + "learning_rate": 5.568309210527469e-07, + "loss": 0.70678914, + "num_input_tokens_seen": 274009745, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.18652344, + "step": 12705, + "time_per_iteration": 2.880814790725708 + }, + { + "auxiliary_loss_clip": 0.01400261, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.24115944, + "balance_loss_mlp": 1.015113, + "epoch": 0.7639260483992184, + "flos": 26152463681280.0, + "grad_norm": 1.6324087042250581, + "language_loss": 0.75231093, + "learning_rate": 5.565613138389427e-07, + "loss": 0.77665132, + "num_input_tokens_seen": 274028775, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18688965, + "step": 12706, + "time_per_iteration": 4.278339147567749 + }, + { + "auxiliary_loss_clip": 0.01398908, + "auxiliary_loss_mlp": 0.01032039, + "balance_loss_clip": 1.23965526, + "balance_loss_mlp": 1.01338291, + "epoch": 0.7639861716518863, + "flos": 20166222936960.0, + "grad_norm": 6.341129685971686, + "language_loss": 0.79136121, + "learning_rate": 5.562917613604781e-07, + "loss": 0.81567067, + "num_input_tokens_seen": 274047520, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18664551, + "step": 12707, + "time_per_iteration": 2.845256805419922 + }, + { + "auxiliary_loss_clip": 0.01413424, + "auxiliary_loss_mlp": 0.01029151, + "balance_loss_clip": 1.25081158, + "balance_loss_mlp": 1.01060188, + "epoch": 0.7640462949045543, + "flos": 18591029320320.0, + "grad_norm": 1.8003022386835217, + "language_loss": 0.80592871, + "learning_rate": 5.560222636275751e-07, + "loss": 0.83035445, + "num_input_tokens_seen": 274065350, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.1854248, + "step": 12708, + "time_per_iteration": 2.92230224609375 + }, + { + "auxiliary_loss_clip": 0.01186316, + "auxiliary_loss_mlp": 0.01031837, + "balance_loss_clip": 1.09819865, + "balance_loss_mlp": 1.01104677, + "epoch": 0.7641064181572224, + "flos": 68354548202880.0, + "grad_norm": 0.8218417430016498, + "language_loss": 0.56588143, + "learning_rate": 5.557528206504521e-07, + "loss": 0.588063, + "num_input_tokens_seen": 274122315, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.20800781, + "step": 12709, + "time_per_iteration": 3.3683218955993652 + }, + { + "auxiliary_loss_clip": 0.01399386, + "auxiliary_loss_mlp": 0.01039148, + "balance_loss_clip": 1.23798096, + "balance_loss_mlp": 1.01878691, + "epoch": 0.7641665414098903, + "flos": 17978313231360.0, + "grad_norm": 1.738426294155675, + "language_loss": 0.641559, + "learning_rate": 5.554834324393271e-07, + "loss": 0.6659444, + "num_input_tokens_seen": 274140555, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.20361328, + "step": 12710, + "time_per_iteration": 2.8442046642303467 + }, + { + "auxiliary_loss_clip": 0.01416728, + "auxiliary_loss_mlp": 0.01032732, + "balance_loss_clip": 1.25178456, + "balance_loss_mlp": 1.01325357, + "epoch": 0.7642266646625583, + "flos": 21262100693760.0, + "grad_norm": 2.0511692054306203, + "language_loss": 0.66001225, + "learning_rate": 5.552140990044154e-07, + "loss": 0.68450689, + "num_input_tokens_seen": 274161125, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19482422, + "step": 12711, + "time_per_iteration": 2.916053056716919 + }, + { + "auxiliary_loss_clip": 0.0140631, + "auxiliary_loss_mlp": 0.01028732, + "balance_loss_clip": 1.24452579, + "balance_loss_mlp": 1.01054049, + "epoch": 0.7642867879152262, + "flos": 22758151875840.0, + "grad_norm": 1.4699619172574667, + "language_loss": 0.73591125, + "learning_rate": 5.549448203559293e-07, + "loss": 0.76026165, + "num_input_tokens_seen": 274180835, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.18188477, + "step": 12712, + "time_per_iteration": 4.277302503585815 + }, + { + "auxiliary_loss_clip": 0.01398545, + "auxiliary_loss_mlp": 0.01030222, + "balance_loss_clip": 1.24002886, + "balance_loss_mlp": 1.01218593, + "epoch": 0.7643469111678942, + "flos": 23342788926720.0, + "grad_norm": 2.392841579516698, + "language_loss": 0.81207693, + "learning_rate": 5.546755965040804e-07, + "loss": 0.83636463, + "num_input_tokens_seen": 274201190, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18029785, + "step": 12713, + "time_per_iteration": 2.904714822769165 + }, + { + "auxiliary_loss_clip": 0.01419658, + "auxiliary_loss_mlp": 0.01035563, + "balance_loss_clip": 1.25574982, + "balance_loss_mlp": 1.01648951, + "epoch": 0.7644070344205621, + "flos": 19864538737920.0, + "grad_norm": 7.4534062194144886, + "language_loss": 0.84001911, + "learning_rate": 5.544064274590776e-07, + "loss": 0.86457139, + "num_input_tokens_seen": 274217595, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19091797, + "step": 12714, + "time_per_iteration": 2.8254282474517822 + }, + { + "auxiliary_loss_clip": 0.01405858, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.24312687, + "balance_loss_mlp": 1.01553619, + "epoch": 0.7644671576732301, + "flos": 22100706662400.0, + "grad_norm": 1.6251951420537383, + "language_loss": 0.73442614, + "learning_rate": 5.541373132311287e-07, + "loss": 0.75883317, + "num_input_tokens_seen": 274237885, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.1932373, + "step": 12715, + "time_per_iteration": 2.8599483966827393 + }, + { + "auxiliary_loss_clip": 0.01400242, + "auxiliary_loss_mlp": 0.01033889, + "balance_loss_clip": 1.23996735, + "balance_loss_mlp": 1.01431453, + "epoch": 0.7645272809258981, + "flos": 25491760842240.0, + "grad_norm": 1.6315110861018363, + "language_loss": 0.63892943, + "learning_rate": 5.538682538304376e-07, + "loss": 0.66327077, + "num_input_tokens_seen": 274258820, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.19604492, + "step": 12716, + "time_per_iteration": 4.282771110534668 + }, + { + "auxiliary_loss_clip": 0.01423141, + "auxiliary_loss_mlp": 0.01032557, + "balance_loss_clip": 1.2574017, + "balance_loss_mlp": 1.01307869, + "epoch": 0.7645874041785661, + "flos": 21551433062400.0, + "grad_norm": 2.006292824749454, + "language_loss": 0.80173397, + "learning_rate": 5.535992492672068e-07, + "loss": 0.82629097, + "num_input_tokens_seen": 274278835, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.19482422, + "step": 12717, + "time_per_iteration": 4.252044677734375 + }, + { + "auxiliary_loss_clip": 0.01394237, + "auxiliary_loss_mlp": 0.01038017, + "balance_loss_clip": 1.23620737, + "balance_loss_mlp": 1.01865721, + "epoch": 0.764647527431234, + "flos": 20640516624000.0, + "grad_norm": 2.5220871710642623, + "language_loss": 0.67484957, + "learning_rate": 5.53330299551638e-07, + "loss": 0.69917214, + "num_input_tokens_seen": 274297110, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.19348145, + "step": 12718, + "time_per_iteration": 2.8416671752929688 + }, + { + "auxiliary_loss_clip": 0.01401097, + "auxiliary_loss_mlp": 0.01031508, + "balance_loss_clip": 1.24179149, + "balance_loss_mlp": 1.01335287, + "epoch": 0.764707650683902, + "flos": 21444030610560.0, + "grad_norm": 1.9131418645788234, + "language_loss": 0.78200352, + "learning_rate": 5.530614046939286e-07, + "loss": 0.80632961, + "num_input_tokens_seen": 274315610, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18139648, + "step": 12719, + "time_per_iteration": 2.8364920616149902 + }, + { + "auxiliary_loss_clip": 0.01409417, + "auxiliary_loss_mlp": 0.01030984, + "balance_loss_clip": 1.24718368, + "balance_loss_mlp": 1.01237535, + "epoch": 0.7647677739365699, + "flos": 22721521570560.0, + "grad_norm": 2.092901508736322, + "language_loss": 0.70849037, + "learning_rate": 5.527925647042754e-07, + "loss": 0.73289442, + "num_input_tokens_seen": 274333975, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18603516, + "step": 12720, + "time_per_iteration": 2.9127211570739746 + }, + { + "auxiliary_loss_clip": 0.01412858, + "auxiliary_loss_mlp": 0.010323, + "balance_loss_clip": 1.25101924, + "balance_loss_mlp": 1.01348901, + "epoch": 0.7648278971892379, + "flos": 21333913470720.0, + "grad_norm": 1.956286699537998, + "language_loss": 0.74803376, + "learning_rate": 5.52523779592875e-07, + "loss": 0.77248538, + "num_input_tokens_seen": 274353695, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18811035, + "step": 12721, + "time_per_iteration": 2.9930717945098877 + }, + { + "auxiliary_loss_clip": 0.01406509, + "auxiliary_loss_mlp": 0.01031191, + "balance_loss_clip": 1.24449444, + "balance_loss_mlp": 1.01283288, + "epoch": 0.764888020441906, + "flos": 20676965950080.0, + "grad_norm": 1.797970698467843, + "language_loss": 0.74265468, + "learning_rate": 5.522550493699163e-07, + "loss": 0.76703173, + "num_input_tokens_seen": 274371120, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18359375, + "step": 12722, + "time_per_iteration": 2.832766056060791 + }, + { + "auxiliary_loss_clip": 0.01392635, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.23388958, + "balance_loss_mlp": 1.01606059, + "epoch": 0.7649481436945739, + "flos": 25093397208960.0, + "grad_norm": 2.636789129093383, + "language_loss": 0.74439037, + "learning_rate": 5.519863740455912e-07, + "loss": 0.76866525, + "num_input_tokens_seen": 274389665, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18786621, + "step": 12723, + "time_per_iteration": 2.8806347846984863 + }, + { + "auxiliary_loss_clip": 0.01410106, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.24525011, + "balance_loss_mlp": 1.01299715, + "epoch": 0.7650082669472419, + "flos": 24911919740160.0, + "grad_norm": 2.186625810914325, + "language_loss": 0.7425586, + "learning_rate": 5.517177536300881e-07, + "loss": 0.7669751, + "num_input_tokens_seen": 274408750, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.18566895, + "step": 12724, + "time_per_iteration": 2.8547439575195312 + }, + { + "auxiliary_loss_clip": 0.01402226, + "auxiliary_loss_mlp": 0.01031622, + "balance_loss_clip": 1.24327266, + "balance_loss_mlp": 1.01281047, + "epoch": 0.7650683901999098, + "flos": 14655497489280.0, + "grad_norm": 1.9218402187409904, + "language_loss": 0.85044861, + "learning_rate": 5.514491881335935e-07, + "loss": 0.87478715, + "num_input_tokens_seen": 274424600, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18798828, + "step": 12725, + "time_per_iteration": 2.821133852005005 + }, + { + "auxiliary_loss_clip": 0.01392504, + "auxiliary_loss_mlp": 0.01031793, + "balance_loss_clip": 1.23403811, + "balance_loss_mlp": 1.01299334, + "epoch": 0.7651285134525778, + "flos": 26361793964160.0, + "grad_norm": 1.760049644319288, + "language_loss": 0.78347474, + "learning_rate": 5.511806775662901e-07, + "loss": 0.80771774, + "num_input_tokens_seen": 274443075, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18798828, + "step": 12726, + "time_per_iteration": 2.8860785961151123 + }, + { + "auxiliary_loss_clip": 0.0141282, + "auxiliary_loss_mlp": 0.01034242, + "balance_loss_clip": 1.25137281, + "balance_loss_mlp": 1.01498997, + "epoch": 0.7651886367052457, + "flos": 26656962912000.0, + "grad_norm": 2.037810448808766, + "language_loss": 0.71051371, + "learning_rate": 5.509122219383615e-07, + "loss": 0.7349844, + "num_input_tokens_seen": 274463240, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19250488, + "step": 12727, + "time_per_iteration": 2.9650168418884277 + }, + { + "auxiliary_loss_clip": 0.01391564, + "auxiliary_loss_mlp": 0.01034171, + "balance_loss_clip": 1.2342937, + "balance_loss_mlp": 1.01463306, + "epoch": 0.7652487599579137, + "flos": 25713895403520.0, + "grad_norm": 1.698395938408037, + "language_loss": 0.80450881, + "learning_rate": 5.506438212599864e-07, + "loss": 0.82876617, + "num_input_tokens_seen": 274482750, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.19519043, + "step": 12728, + "time_per_iteration": 2.895752191543579 + }, + { + "auxiliary_loss_clip": 0.01417207, + "auxiliary_loss_mlp": 0.01037484, + "balance_loss_clip": 1.25388956, + "balance_loss_mlp": 1.01830316, + "epoch": 0.7653088832105817, + "flos": 28597464195840.0, + "grad_norm": 1.7209301654881508, + "language_loss": 0.57152545, + "learning_rate": 5.503754755413424e-07, + "loss": 0.59607244, + "num_input_tokens_seen": 274503545, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19177246, + "step": 12729, + "time_per_iteration": 2.9119129180908203 + }, + { + "auxiliary_loss_clip": 0.01402366, + "auxiliary_loss_mlp": 0.01036058, + "balance_loss_clip": 1.24242258, + "balance_loss_mlp": 1.01717496, + "epoch": 0.7653690064632497, + "flos": 23376930768000.0, + "grad_norm": 2.3451645255769384, + "language_loss": 0.78386605, + "learning_rate": 5.501071847926055e-07, + "loss": 0.80825031, + "num_input_tokens_seen": 274523825, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.1887207, + "step": 12730, + "time_per_iteration": 2.909264087677002 + }, + { + "auxiliary_loss_clip": 0.01412304, + "auxiliary_loss_mlp": 0.01039651, + "balance_loss_clip": 1.24985492, + "balance_loss_mlp": 1.02049375, + "epoch": 0.7654291297159176, + "flos": 15781128341760.0, + "grad_norm": 1.991223923347525, + "language_loss": 0.70263696, + "learning_rate": 5.498389490239495e-07, + "loss": 0.72715652, + "num_input_tokens_seen": 274541625, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19177246, + "step": 12731, + "time_per_iteration": 2.811901092529297 + }, + { + "auxiliary_loss_clip": 0.01407151, + "auxiliary_loss_mlp": 0.0103668, + "balance_loss_clip": 1.24575996, + "balance_loss_mlp": 1.01730847, + "epoch": 0.7654892529685856, + "flos": 18041031803520.0, + "grad_norm": 2.752650430306556, + "language_loss": 0.70863175, + "learning_rate": 5.495707682455471e-07, + "loss": 0.73307008, + "num_input_tokens_seen": 274557580, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19384766, + "step": 12732, + "time_per_iteration": 2.9116008281707764 + }, + { + "auxiliary_loss_clip": 0.01404961, + "auxiliary_loss_mlp": 0.01030816, + "balance_loss_clip": 1.24213231, + "balance_loss_mlp": 1.01211238, + "epoch": 0.7655493762212535, + "flos": 27247662766080.0, + "grad_norm": 1.4746180353195881, + "language_loss": 0.78681993, + "learning_rate": 5.493026424675653e-07, + "loss": 0.81117773, + "num_input_tokens_seen": 274578135, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.18701172, + "step": 12733, + "time_per_iteration": 2.928375720977783 + }, + { + "auxiliary_loss_clip": 0.01399683, + "auxiliary_loss_mlp": 0.01035345, + "balance_loss_clip": 1.24224973, + "balance_loss_mlp": 1.0174036, + "epoch": 0.7656094994739215, + "flos": 20782875323520.0, + "grad_norm": 1.6268419670035998, + "language_loss": 0.77862525, + "learning_rate": 5.490345717001726e-07, + "loss": 0.80297554, + "num_input_tokens_seen": 274595655, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.17944336, + "step": 12734, + "time_per_iteration": 2.8362410068511963 + }, + { + "auxiliary_loss_clip": 0.01416571, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.2516166, + "balance_loss_mlp": 1.01711178, + "epoch": 0.7656696227265896, + "flos": 23049565505280.0, + "grad_norm": 1.8853887936743, + "language_loss": 0.73838872, + "learning_rate": 5.48766555953535e-07, + "loss": 0.7629261, + "num_input_tokens_seen": 274616305, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.20056152, + "step": 12735, + "time_per_iteration": 2.8671698570251465 + }, + { + "auxiliary_loss_clip": 0.01404244, + "auxiliary_loss_mlp": 0.01035331, + "balance_loss_clip": 1.24263322, + "balance_loss_mlp": 1.01641226, + "epoch": 0.7657297459792575, + "flos": 27536768910720.0, + "grad_norm": 1.397290564991527, + "language_loss": 0.73208427, + "learning_rate": 5.484985952378145e-07, + "loss": 0.75647998, + "num_input_tokens_seen": 274638110, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18896484, + "step": 12736, + "time_per_iteration": 2.9463255405426025 + }, + { + "auxiliary_loss_clip": 0.01419235, + "auxiliary_loss_mlp": 0.01039332, + "balance_loss_clip": 1.25605154, + "balance_loss_mlp": 1.01951921, + "epoch": 0.7657898692319255, + "flos": 17137218798720.0, + "grad_norm": 2.093482346143955, + "language_loss": 0.77698588, + "learning_rate": 5.482306895631728e-07, + "loss": 0.80157149, + "num_input_tokens_seen": 274656565, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.19824219, + "step": 12737, + "time_per_iteration": 2.8669357299804688 + }, + { + "auxiliary_loss_clip": 0.01397208, + "auxiliary_loss_mlp": 0.01037166, + "balance_loss_clip": 1.23660326, + "balance_loss_mlp": 1.01828313, + "epoch": 0.7658499924845934, + "flos": 21474462378240.0, + "grad_norm": 1.6361366748726105, + "language_loss": 0.77768862, + "learning_rate": 5.479628389397699e-07, + "loss": 0.80203235, + "num_input_tokens_seen": 274674215, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18884277, + "step": 12738, + "time_per_iteration": 2.838177442550659 + }, + { + "auxiliary_loss_clip": 0.01413176, + "auxiliary_loss_mlp": 0.01034935, + "balance_loss_clip": 1.24858868, + "balance_loss_mlp": 1.01627922, + "epoch": 0.7659101157372614, + "flos": 29508516368640.0, + "grad_norm": 2.506770874554801, + "language_loss": 0.63570589, + "learning_rate": 5.476950433777603e-07, + "loss": 0.66018701, + "num_input_tokens_seen": 274693445, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.18652344, + "step": 12739, + "time_per_iteration": 2.9059417247772217 + }, + { + "auxiliary_loss_clip": 0.01397954, + "auxiliary_loss_mlp": 0.0103874, + "balance_loss_clip": 1.23703063, + "balance_loss_mlp": 1.01965415, + "epoch": 0.7659702389899293, + "flos": 18561050000640.0, + "grad_norm": 1.9034500342387892, + "language_loss": 0.80358171, + "learning_rate": 5.474273028873004e-07, + "loss": 0.82794869, + "num_input_tokens_seen": 274712815, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.1907959, + "step": 12740, + "time_per_iteration": 2.803060293197632 + }, + { + "auxiliary_loss_clip": 0.01399915, + "auxiliary_loss_mlp": 0.01033864, + "balance_loss_clip": 1.23991966, + "balance_loss_mlp": 1.01535058, + "epoch": 0.7660303622425974, + "flos": 23559403622400.0, + "grad_norm": 1.6838082912296088, + "language_loss": 0.66214216, + "learning_rate": 5.471596174785429e-07, + "loss": 0.68647993, + "num_input_tokens_seen": 274732690, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.18530273, + "step": 12741, + "time_per_iteration": 4.263515949249268 + }, + { + "auxiliary_loss_clip": 0.01404374, + "auxiliary_loss_mlp": 0.01032101, + "balance_loss_clip": 1.24490142, + "balance_loss_mlp": 1.01202607, + "epoch": 0.7660904854952653, + "flos": 18926086199040.0, + "grad_norm": 1.6371855362171621, + "language_loss": 0.76640928, + "learning_rate": 5.468919871616386e-07, + "loss": 0.79077399, + "num_input_tokens_seen": 274752460, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.20080566, + "step": 12742, + "time_per_iteration": 2.877699136734009 + }, + { + "auxiliary_loss_clip": 0.01392951, + "auxiliary_loss_mlp": 0.0103182, + "balance_loss_clip": 1.23649693, + "balance_loss_mlp": 1.01365256, + "epoch": 0.7661506087479333, + "flos": 23157556139520.0, + "grad_norm": 1.3748534017207428, + "language_loss": 0.77090824, + "learning_rate": 5.46624411946736e-07, + "loss": 0.795156, + "num_input_tokens_seen": 274773070, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.1817627, + "step": 12743, + "time_per_iteration": 2.967599868774414 + }, + { + "auxiliary_loss_clip": 0.01401871, + "auxiliary_loss_mlp": 0.01034309, + "balance_loss_clip": 1.24114227, + "balance_loss_mlp": 1.01558113, + "epoch": 0.7662107320006012, + "flos": 17574520222080.0, + "grad_norm": 1.997736621863005, + "language_loss": 0.75412136, + "learning_rate": 5.463568918439805e-07, + "loss": 0.77848321, + "num_input_tokens_seen": 274790220, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18713379, + "step": 12744, + "time_per_iteration": 2.8230140209198 + }, + { + "auxiliary_loss_clip": 0.01411315, + "auxiliary_loss_mlp": 0.01037156, + "balance_loss_clip": 1.2483356, + "balance_loss_mlp": 1.01748669, + "epoch": 0.7662708552532692, + "flos": 22311530023680.0, + "grad_norm": 2.437989169501319, + "language_loss": 0.72114837, + "learning_rate": 5.460894268635181e-07, + "loss": 0.74563307, + "num_input_tokens_seen": 274805095, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19665527, + "step": 12745, + "time_per_iteration": 2.835333824157715 + }, + { + "auxiliary_loss_clip": 0.01401782, + "auxiliary_loss_mlp": 0.01039867, + "balance_loss_clip": 1.24045241, + "balance_loss_mlp": 1.02093601, + "epoch": 0.7663309785059371, + "flos": 15750470350080.0, + "grad_norm": 2.4880099141697842, + "language_loss": 0.78274566, + "learning_rate": 5.458220170154896e-07, + "loss": 0.80716211, + "num_input_tokens_seen": 274821800, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18933105, + "step": 12746, + "time_per_iteration": 2.8525919914245605 + }, + { + "auxiliary_loss_clip": 0.01188714, + "auxiliary_loss_mlp": 0.01050977, + "balance_loss_clip": 1.10334241, + "balance_loss_mlp": 1.02761197, + "epoch": 0.7663911017586051, + "flos": 62196195663360.0, + "grad_norm": 0.6795747674224933, + "language_loss": 0.56796539, + "learning_rate": 5.455546623100362e-07, + "loss": 0.59036231, + "num_input_tokens_seen": 274886970, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.23339844, + "step": 12747, + "time_per_iteration": 4.801854372024536 + }, + { + "auxiliary_loss_clip": 0.0140251, + "auxiliary_loss_mlp": 0.01036958, + "balance_loss_clip": 1.24372649, + "balance_loss_mlp": 1.01974463, + "epoch": 0.7664512250112732, + "flos": 26517409390080.0, + "grad_norm": 2.425647172387313, + "language_loss": 0.72614336, + "learning_rate": 5.452873627572956e-07, + "loss": 0.75053805, + "num_input_tokens_seen": 274907240, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.17211914, + "step": 12748, + "time_per_iteration": 2.8943965435028076 + }, + { + "auxiliary_loss_clip": 0.01398833, + "auxiliary_loss_mlp": 0.01027221, + "balance_loss_clip": 1.23826551, + "balance_loss_mlp": 1.00827861, + "epoch": 0.7665113482639411, + "flos": 16257231820800.0, + "grad_norm": 2.0161611956465455, + "language_loss": 0.70450675, + "learning_rate": 5.450201183674052e-07, + "loss": 0.72876728, + "num_input_tokens_seen": 274924650, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18933105, + "step": 12749, + "time_per_iteration": 2.8052446842193604 + }, + { + "auxiliary_loss_clip": 0.01403074, + "auxiliary_loss_mlp": 0.01034259, + "balance_loss_clip": 1.2405175, + "balance_loss_mlp": 1.01481605, + "epoch": 0.7665714715166091, + "flos": 27209086934400.0, + "grad_norm": 1.7040117287892105, + "language_loss": 0.74020934, + "learning_rate": 5.447529291504967e-07, + "loss": 0.76458269, + "num_input_tokens_seen": 274944550, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19458008, + "step": 12750, + "time_per_iteration": 2.927014112472534 + }, + { + "auxiliary_loss_clip": 0.01388173, + "auxiliary_loss_mlp": 0.01030194, + "balance_loss_clip": 1.23168039, + "balance_loss_mlp": 1.01214623, + "epoch": 0.766631594769277, + "flos": 21077275109760.0, + "grad_norm": 1.9516509449801305, + "language_loss": 0.76692468, + "learning_rate": 5.444857951167026e-07, + "loss": 0.79110837, + "num_input_tokens_seen": 274961330, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.18054199, + "step": 12751, + "time_per_iteration": 4.208454847335815 + }, + { + "auxiliary_loss_clip": 0.01391472, + "auxiliary_loss_mlp": 0.01033743, + "balance_loss_clip": 1.23362184, + "balance_loss_mlp": 1.01459837, + "epoch": 0.766691718021945, + "flos": 24108722467200.0, + "grad_norm": 1.8310705094854403, + "language_loss": 0.6252712, + "learning_rate": 5.442187162761537e-07, + "loss": 0.64952332, + "num_input_tokens_seen": 274981655, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.19140625, + "step": 12752, + "time_per_iteration": 4.2787926197052 + }, + { + "auxiliary_loss_clip": 0.01414863, + "auxiliary_loss_mlp": 0.0103323, + "balance_loss_clip": 1.25264764, + "balance_loss_mlp": 1.0136795, + "epoch": 0.7667518412746129, + "flos": 23451096274560.0, + "grad_norm": 2.080690145097475, + "language_loss": 0.70161468, + "learning_rate": 5.439516926389767e-07, + "loss": 0.72609568, + "num_input_tokens_seen": 274999970, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19567871, + "step": 12753, + "time_per_iteration": 2.8480403423309326 + }, + { + "auxiliary_loss_clip": 0.01407329, + "auxiliary_loss_mlp": 0.01034504, + "balance_loss_clip": 1.24695945, + "balance_loss_mlp": 1.0163486, + "epoch": 0.766811964527281, + "flos": 18157483215360.0, + "grad_norm": 2.385092777182043, + "language_loss": 0.62560874, + "learning_rate": 5.436847242152971e-07, + "loss": 0.65002704, + "num_input_tokens_seen": 275015805, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.1817627, + "step": 12754, + "time_per_iteration": 2.821772336959839 + }, + { + "auxiliary_loss_clip": 0.01404177, + "auxiliary_loss_mlp": 0.01029578, + "balance_loss_clip": 1.24551606, + "balance_loss_mlp": 1.01032555, + "epoch": 0.7668720877799489, + "flos": 19545317539200.0, + "grad_norm": 2.0865983866385815, + "language_loss": 0.80607361, + "learning_rate": 5.434178110152401e-07, + "loss": 0.83041114, + "num_input_tokens_seen": 275031810, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.19238281, + "step": 12755, + "time_per_iteration": 2.8071374893188477 + }, + { + "auxiliary_loss_clip": 0.01404143, + "auxiliary_loss_mlp": 0.0102992, + "balance_loss_clip": 1.24462748, + "balance_loss_mlp": 1.01098967, + "epoch": 0.7669322110326169, + "flos": 22684529306880.0, + "grad_norm": 1.823294851921207, + "language_loss": 0.71481425, + "learning_rate": 5.431509530489242e-07, + "loss": 0.73915488, + "num_input_tokens_seen": 275049325, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18920898, + "step": 12756, + "time_per_iteration": 2.8444745540618896 + }, + { + "auxiliary_loss_clip": 0.01410849, + "auxiliary_loss_mlp": 0.01034229, + "balance_loss_clip": 1.25036216, + "balance_loss_mlp": 1.01596642, + "epoch": 0.7669923342852848, + "flos": 26480733840000.0, + "grad_norm": 1.5143147931872047, + "language_loss": 0.70239735, + "learning_rate": 5.428841503264706e-07, + "loss": 0.72684813, + "num_input_tokens_seen": 275070865, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18261719, + "step": 12757, + "time_per_iteration": 2.95697283744812 + }, + { + "auxiliary_loss_clip": 0.01395915, + "auxiliary_loss_mlp": 0.01041896, + "balance_loss_clip": 1.23653436, + "balance_loss_mlp": 1.02127337, + "epoch": 0.7670524575379528, + "flos": 22866232999680.0, + "grad_norm": 1.9663767677185664, + "language_loss": 0.77446169, + "learning_rate": 5.426174028579955e-07, + "loss": 0.79883981, + "num_input_tokens_seen": 275088015, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.20629883, + "step": 12758, + "time_per_iteration": 2.8309836387634277 + }, + { + "auxiliary_loss_clip": 0.01395682, + "auxiliary_loss_mlp": 0.01034435, + "balance_loss_clip": 1.23903072, + "balance_loss_mlp": 1.01662493, + "epoch": 0.7671125807906207, + "flos": 22461444604800.0, + "grad_norm": 1.608406565505334, + "language_loss": 0.76974517, + "learning_rate": 5.423507106536156e-07, + "loss": 0.79404628, + "num_input_tokens_seen": 275106975, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.17797852, + "step": 12759, + "time_per_iteration": 2.8280529975891113 + }, + { + "auxiliary_loss_clip": 0.01410795, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.24836969, + "balance_loss_mlp": 1.01331472, + "epoch": 0.7671727040432887, + "flos": 35385092421120.0, + "grad_norm": 2.3245940972762957, + "language_loss": 0.68531024, + "learning_rate": 5.420840737234425e-07, + "loss": 0.70973253, + "num_input_tokens_seen": 275129560, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.18139648, + "step": 12760, + "time_per_iteration": 2.9355599880218506 + }, + { + "auxiliary_loss_clip": 0.01410036, + "auxiliary_loss_mlp": 0.01033901, + "balance_loss_clip": 1.24744272, + "balance_loss_mlp": 1.01386178, + "epoch": 0.7672328272959568, + "flos": 22505902260480.0, + "grad_norm": 1.4626931825978964, + "language_loss": 0.79777592, + "learning_rate": 5.418174920775871e-07, + "loss": 0.82221532, + "num_input_tokens_seen": 275151180, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.20031738, + "step": 12761, + "time_per_iteration": 2.884333372116089 + }, + { + "auxiliary_loss_clip": 0.01386912, + "auxiliary_loss_mlp": 0.01029813, + "balance_loss_clip": 1.23005366, + "balance_loss_mlp": 1.01150274, + "epoch": 0.7672929505486247, + "flos": 22824580521600.0, + "grad_norm": 2.8363527759582365, + "language_loss": 0.66501915, + "learning_rate": 5.415509657261589e-07, + "loss": 0.68918645, + "num_input_tokens_seen": 275170605, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.1829834, + "step": 12762, + "time_per_iteration": 2.832902431488037 + }, + { + "auxiliary_loss_clip": 0.01410623, + "auxiliary_loss_mlp": 0.01036982, + "balance_loss_clip": 1.2480278, + "balance_loss_mlp": 1.0156672, + "epoch": 0.7673530738012927, + "flos": 20348333832960.0, + "grad_norm": 1.7223475564353614, + "language_loss": 0.74805903, + "learning_rate": 5.412844946792639e-07, + "loss": 0.77253509, + "num_input_tokens_seen": 275188750, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.21313477, + "step": 12763, + "time_per_iteration": 2.9580090045928955 + }, + { + "auxiliary_loss_clip": 0.01405924, + "auxiliary_loss_mlp": 0.01035692, + "balance_loss_clip": 1.24618328, + "balance_loss_mlp": 1.01688039, + "epoch": 0.7674131970539606, + "flos": 34946976591360.0, + "grad_norm": 1.442533821166884, + "language_loss": 0.71563649, + "learning_rate": 5.410180789470067e-07, + "loss": 0.74005264, + "num_input_tokens_seen": 275211365, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18811035, + "step": 12764, + "time_per_iteration": 2.989431858062744 + }, + { + "auxiliary_loss_clip": 0.01399538, + "auxiliary_loss_mlp": 0.01035745, + "balance_loss_clip": 1.24027145, + "balance_loss_mlp": 1.01769626, + "epoch": 0.7674733203066286, + "flos": 28340328142080.0, + "grad_norm": 1.425184812545941, + "language_loss": 0.6997205, + "learning_rate": 5.40751718539491e-07, + "loss": 0.72407335, + "num_input_tokens_seen": 275231670, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18054199, + "step": 12765, + "time_per_iteration": 2.892854928970337 + }, + { + "auxiliary_loss_clip": 0.0139858, + "auxiliary_loss_mlp": 0.01030897, + "balance_loss_clip": 1.24086189, + "balance_loss_mlp": 1.01336098, + "epoch": 0.7675334435592965, + "flos": 16298386606080.0, + "grad_norm": 2.148916602136365, + "language_loss": 0.61058736, + "learning_rate": 5.404854134668162e-07, + "loss": 0.63488221, + "num_input_tokens_seen": 275249425, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.17529297, + "step": 12766, + "time_per_iteration": 2.8286032676696777 + }, + { + "auxiliary_loss_clip": 0.01192818, + "auxiliary_loss_mlp": 0.01018511, + "balance_loss_clip": 1.1035428, + "balance_loss_mlp": 0.99810201, + "epoch": 0.7675935668119646, + "flos": 64859665910400.0, + "grad_norm": 0.7349148886378912, + "language_loss": 0.60805774, + "learning_rate": 5.402191637390803e-07, + "loss": 0.63017106, + "num_input_tokens_seen": 275312485, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.20410156, + "step": 12767, + "time_per_iteration": 3.460118532180786 + }, + { + "auxiliary_loss_clip": 0.01400919, + "auxiliary_loss_mlp": 0.01032004, + "balance_loss_clip": 1.24242663, + "balance_loss_mlp": 1.01434875, + "epoch": 0.7676536900646325, + "flos": 22685886650880.0, + "grad_norm": 1.68323071124163, + "language_loss": 0.70196617, + "learning_rate": 5.399529693663801e-07, + "loss": 0.72629541, + "num_input_tokens_seen": 275331680, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.17651367, + "step": 12768, + "time_per_iteration": 2.886561870574951 + }, + { + "auxiliary_loss_clip": 0.01434743, + "auxiliary_loss_mlp": 0.01038404, + "balance_loss_clip": 1.26934481, + "balance_loss_mlp": 1.01967573, + "epoch": 0.7677138133173005, + "flos": 26950095843840.0, + "grad_norm": 2.223792538464322, + "language_loss": 0.71228409, + "learning_rate": 5.3968683035881e-07, + "loss": 0.7370156, + "num_input_tokens_seen": 275351615, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.18725586, + "step": 12769, + "time_per_iteration": 2.932403802871704 + }, + { + "auxiliary_loss_clip": 0.01418011, + "auxiliary_loss_mlp": 0.0103401, + "balance_loss_clip": 1.25383496, + "balance_loss_mlp": 1.01518655, + "epoch": 0.7677739365699684, + "flos": 23808531346560.0, + "grad_norm": 2.1169885356919242, + "language_loss": 0.81303322, + "learning_rate": 5.394207467264611e-07, + "loss": 0.8375535, + "num_input_tokens_seen": 275368815, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.18835449, + "step": 12770, + "time_per_iteration": 2.9230520725250244 + }, + { + "auxiliary_loss_clip": 0.01408723, + "auxiliary_loss_mlp": 0.01032235, + "balance_loss_clip": 1.25135541, + "balance_loss_mlp": 1.01400805, + "epoch": 0.7678340598226364, + "flos": 34467796465920.0, + "grad_norm": 1.6782993195981655, + "language_loss": 0.78907233, + "learning_rate": 5.391547184794245e-07, + "loss": 0.81348193, + "num_input_tokens_seen": 275389345, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18212891, + "step": 12771, + "time_per_iteration": 2.9584872722625732 + }, + { + "auxiliary_loss_clip": 0.01400064, + "auxiliary_loss_mlp": 0.01037937, + "balance_loss_clip": 1.23923707, + "balance_loss_mlp": 1.01874411, + "epoch": 0.7678941830753043, + "flos": 23852219840640.0, + "grad_norm": 1.450060724155299, + "language_loss": 0.68706989, + "learning_rate": 5.388887456277876e-07, + "loss": 0.71144986, + "num_input_tokens_seen": 275411240, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19189453, + "step": 12772, + "time_per_iteration": 2.8646507263183594 + }, + { + "auxiliary_loss_clip": 0.01389494, + "auxiliary_loss_mlp": 0.01028774, + "balance_loss_clip": 1.23501587, + "balance_loss_mlp": 1.01128626, + "epoch": 0.7679543063279723, + "flos": 25421893591680.0, + "grad_norm": 1.6044283510639992, + "language_loss": 0.73979634, + "learning_rate": 5.386228281816349e-07, + "loss": 0.76397902, + "num_input_tokens_seen": 275432010, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.17492676, + "step": 12773, + "time_per_iteration": 2.9028186798095703 + }, + { + "auxiliary_loss_clip": 0.01397247, + "auxiliary_loss_mlp": 0.01031042, + "balance_loss_clip": 1.24160707, + "balance_loss_mlp": 1.01323211, + "epoch": 0.7680144295806404, + "flos": 27973346417280.0, + "grad_norm": 1.9490750044625822, + "language_loss": 0.82098019, + "learning_rate": 5.383569661510512e-07, + "loss": 0.84526306, + "num_input_tokens_seen": 275453710, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.17810059, + "step": 12774, + "time_per_iteration": 2.9146907329559326 + }, + { + "auxiliary_loss_clip": 0.01393823, + "auxiliary_loss_mlp": 0.01028738, + "balance_loss_clip": 1.23658514, + "balance_loss_mlp": 1.0110836, + "epoch": 0.7680745528333083, + "flos": 20422589829120.0, + "grad_norm": 1.609649060764939, + "language_loss": 0.71059978, + "learning_rate": 5.380911595461177e-07, + "loss": 0.73482543, + "num_input_tokens_seen": 275472915, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.17651367, + "step": 12775, + "time_per_iteration": 4.322916507720947 + }, + { + "auxiliary_loss_clip": 0.01191205, + "auxiliary_loss_mlp": 0.0103823, + "balance_loss_clip": 1.10229623, + "balance_loss_mlp": 1.01505554, + "epoch": 0.7681346760859763, + "flos": 68435274205440.0, + "grad_norm": 0.6978242550233952, + "language_loss": 0.56920683, + "learning_rate": 5.378254083769147e-07, + "loss": 0.59150118, + "num_input_tokens_seen": 275534785, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.23144531, + "step": 12776, + "time_per_iteration": 3.38327956199646 + }, + { + "auxiliary_loss_clip": 0.01402492, + "auxiliary_loss_mlp": 0.01035085, + "balance_loss_clip": 1.24438405, + "balance_loss_mlp": 1.01698911, + "epoch": 0.7681947993386442, + "flos": 21261331532160.0, + "grad_norm": 1.7299012545211407, + "language_loss": 0.74467599, + "learning_rate": 5.375597126535188e-07, + "loss": 0.76905179, + "num_input_tokens_seen": 275553205, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.1809082, + "step": 12777, + "time_per_iteration": 2.8337197303771973 + }, + { + "auxiliary_loss_clip": 0.01417176, + "auxiliary_loss_mlp": 0.01039917, + "balance_loss_clip": 1.2567457, + "balance_loss_mlp": 1.0212723, + "epoch": 0.7682549225913122, + "flos": 21407671774080.0, + "grad_norm": 2.1193447219060615, + "language_loss": 0.71785462, + "learning_rate": 5.372940723860043e-07, + "loss": 0.7424255, + "num_input_tokens_seen": 275571490, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18640137, + "step": 12778, + "time_per_iteration": 2.8349545001983643 + }, + { + "auxiliary_loss_clip": 0.0140062, + "auxiliary_loss_mlp": 0.01029573, + "balance_loss_clip": 1.24113727, + "balance_loss_mlp": 1.01218009, + "epoch": 0.7683150458439801, + "flos": 23049158302080.0, + "grad_norm": 1.7730406764156998, + "language_loss": 0.71230704, + "learning_rate": 5.37028487584446e-07, + "loss": 0.73660898, + "num_input_tokens_seen": 275589665, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.1739502, + "step": 12779, + "time_per_iteration": 2.85711669921875 + }, + { + "auxiliary_loss_clip": 0.01409353, + "auxiliary_loss_mlp": 0.01030852, + "balance_loss_clip": 1.24896002, + "balance_loss_mlp": 1.01244593, + "epoch": 0.7683751690966482, + "flos": 67354065043200.0, + "grad_norm": 1.7973171249310764, + "language_loss": 0.59459984, + "learning_rate": 5.367629582589133e-07, + "loss": 0.61900187, + "num_input_tokens_seen": 275615605, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18395996, + "step": 12780, + "time_per_iteration": 3.243476390838623 + }, + { + "auxiliary_loss_clip": 0.01418238, + "auxiliary_loss_mlp": 0.01033871, + "balance_loss_clip": 1.25418174, + "balance_loss_mlp": 1.01410592, + "epoch": 0.7684352923493161, + "flos": 21808931074560.0, + "grad_norm": 1.8979079757864064, + "language_loss": 0.68658757, + "learning_rate": 5.364974844194759e-07, + "loss": 0.71110857, + "num_input_tokens_seen": 275634965, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19763184, + "step": 12781, + "time_per_iteration": 2.8873727321624756 + }, + { + "auxiliary_loss_clip": 0.01416209, + "auxiliary_loss_mlp": 0.01031781, + "balance_loss_clip": 1.25452757, + "balance_loss_mlp": 1.0129106, + "epoch": 0.7684954156019841, + "flos": 25858380608640.0, + "grad_norm": 1.6249555011065762, + "language_loss": 0.8020525, + "learning_rate": 5.362320660762016e-07, + "loss": 0.82653248, + "num_input_tokens_seen": 275655785, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18884277, + "step": 12782, + "time_per_iteration": 4.345340013504028 + }, + { + "auxiliary_loss_clip": 0.0140794, + "auxiliary_loss_mlp": 0.01032024, + "balance_loss_clip": 1.24556243, + "balance_loss_mlp": 1.01242566, + "epoch": 0.768555538854652, + "flos": 25458342917760.0, + "grad_norm": 1.9343739365142263, + "language_loss": 0.67821908, + "learning_rate": 5.35966703239153e-07, + "loss": 0.70261872, + "num_input_tokens_seen": 275676160, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19616699, + "step": 12783, + "time_per_iteration": 2.9172325134277344 + }, + { + "auxiliary_loss_clip": 0.01410305, + "auxiliary_loss_mlp": 0.01031317, + "balance_loss_clip": 1.24937844, + "balance_loss_mlp": 1.01235032, + "epoch": 0.76861566210732, + "flos": 19655887127040.0, + "grad_norm": 2.4194890133328197, + "language_loss": 0.69978172, + "learning_rate": 5.357013959183938e-07, + "loss": 0.72419786, + "num_input_tokens_seen": 275695660, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18969727, + "step": 12784, + "time_per_iteration": 2.84894061088562 + }, + { + "auxiliary_loss_clip": 0.01404988, + "auxiliary_loss_mlp": 0.01034868, + "balance_loss_clip": 1.24538469, + "balance_loss_mlp": 1.0171535, + "epoch": 0.7686757853599879, + "flos": 22429203045120.0, + "grad_norm": 1.647415591246203, + "language_loss": 0.81241667, + "learning_rate": 5.354361441239843e-07, + "loss": 0.83681524, + "num_input_tokens_seen": 275714025, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.17687988, + "step": 12785, + "time_per_iteration": 2.860949993133545 + }, + { + "auxiliary_loss_clip": 0.01406846, + "auxiliary_loss_mlp": 0.01031334, + "balance_loss_clip": 1.24572659, + "balance_loss_mlp": 1.01210523, + "epoch": 0.768735908612656, + "flos": 47790396097920.0, + "grad_norm": 1.5316550827449233, + "language_loss": 0.78156984, + "learning_rate": 5.351709478659836e-07, + "loss": 0.80595165, + "num_input_tokens_seen": 275737300, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.19238281, + "step": 12786, + "time_per_iteration": 4.514078378677368 + }, + { + "auxiliary_loss_clip": 0.01396396, + "auxiliary_loss_mlp": 0.01031644, + "balance_loss_clip": 1.23692441, + "balance_loss_mlp": 1.01364374, + "epoch": 0.7687960318653239, + "flos": 30275309560320.0, + "grad_norm": 1.9887703443100189, + "language_loss": 0.59352839, + "learning_rate": 5.349058071544468e-07, + "loss": 0.61780876, + "num_input_tokens_seen": 275757895, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.18017578, + "step": 12787, + "time_per_iteration": 4.139708042144775 + }, + { + "auxiliary_loss_clip": 0.01401725, + "auxiliary_loss_mlp": 0.01031968, + "balance_loss_clip": 1.24319553, + "balance_loss_mlp": 1.01383615, + "epoch": 0.7688561551179919, + "flos": 19582943230080.0, + "grad_norm": 1.732209617760251, + "language_loss": 0.76478291, + "learning_rate": 5.346407219994292e-07, + "loss": 0.78911984, + "num_input_tokens_seen": 275776745, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18115234, + "step": 12788, + "time_per_iteration": 2.850006580352783 + }, + { + "auxiliary_loss_clip": 0.01405862, + "auxiliary_loss_mlp": 0.01034469, + "balance_loss_clip": 1.24546814, + "balance_loss_mlp": 1.01631379, + "epoch": 0.7689162783706599, + "flos": 22794013019520.0, + "grad_norm": 1.7137851900376513, + "language_loss": 0.67545211, + "learning_rate": 5.343756924109821e-07, + "loss": 0.69985539, + "num_input_tokens_seen": 275797205, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18164062, + "step": 12789, + "time_per_iteration": 2.879929780960083 + }, + { + "auxiliary_loss_clip": 0.01412704, + "auxiliary_loss_mlp": 0.0103115, + "balance_loss_clip": 1.25058854, + "balance_loss_mlp": 1.01192153, + "epoch": 0.7689764016233278, + "flos": 34217492376960.0, + "grad_norm": 1.7058385445896214, + "language_loss": 0.69403952, + "learning_rate": 5.341107183991553e-07, + "loss": 0.71847808, + "num_input_tokens_seen": 275817935, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19238281, + "step": 12790, + "time_per_iteration": 2.9817512035369873 + }, + { + "auxiliary_loss_clip": 0.01401355, + "auxiliary_loss_mlp": 0.01030561, + "balance_loss_clip": 1.2420373, + "balance_loss_mlp": 1.01219094, + "epoch": 0.7690365248759958, + "flos": 17283287571840.0, + "grad_norm": 1.9637571035898815, + "language_loss": 0.69847864, + "learning_rate": 5.338457999739969e-07, + "loss": 0.72279775, + "num_input_tokens_seen": 275837145, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.18395996, + "step": 12791, + "time_per_iteration": 2.825559139251709 + }, + { + "auxiliary_loss_clip": 0.01408235, + "auxiliary_loss_mlp": 0.0103479, + "balance_loss_clip": 1.24923992, + "balance_loss_mlp": 1.01641929, + "epoch": 0.7690966481286637, + "flos": 18232236904320.0, + "grad_norm": 1.740819684190571, + "language_loss": 0.80600226, + "learning_rate": 5.335809371455526e-07, + "loss": 0.83043247, + "num_input_tokens_seen": 275855705, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18371582, + "step": 12792, + "time_per_iteration": 2.858492851257324 + }, + { + "auxiliary_loss_clip": 0.01415337, + "auxiliary_loss_mlp": 0.01031139, + "balance_loss_clip": 1.25005054, + "balance_loss_mlp": 1.01244736, + "epoch": 0.7691567713813318, + "flos": 21546003686400.0, + "grad_norm": 1.7734934749712326, + "language_loss": 0.73866391, + "learning_rate": 5.333161299238673e-07, + "loss": 0.76312864, + "num_input_tokens_seen": 275873930, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.18676758, + "step": 12793, + "time_per_iteration": 2.860342025756836 + }, + { + "auxiliary_loss_clip": 0.0141416, + "auxiliary_loss_mlp": 0.0103239, + "balance_loss_clip": 1.25117731, + "balance_loss_mlp": 1.01375794, + "epoch": 0.7692168946339997, + "flos": 39393296680320.0, + "grad_norm": 1.7431414748901366, + "language_loss": 0.6437006, + "learning_rate": 5.330513783189803e-07, + "loss": 0.66816616, + "num_input_tokens_seen": 275895895, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.1862793, + "step": 12794, + "time_per_iteration": 2.986795663833618 + }, + { + "auxiliary_loss_clip": 0.01417163, + "auxiliary_loss_mlp": 0.01036586, + "balance_loss_clip": 1.25392795, + "balance_loss_mlp": 1.01750028, + "epoch": 0.7692770178866677, + "flos": 25020905760000.0, + "grad_norm": 1.6315849625207577, + "language_loss": 0.76469994, + "learning_rate": 5.327866823409319e-07, + "loss": 0.7892375, + "num_input_tokens_seen": 275917825, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.1907959, + "step": 12795, + "time_per_iteration": 2.8748109340667725 + }, + { + "auxiliary_loss_clip": 0.01413662, + "auxiliary_loss_mlp": 0.01030347, + "balance_loss_clip": 1.25205684, + "balance_loss_mlp": 1.01166725, + "epoch": 0.7693371411393356, + "flos": 24726686952960.0, + "grad_norm": 1.6206897875977369, + "language_loss": 0.72078347, + "learning_rate": 5.325220419997601e-07, + "loss": 0.74522352, + "num_input_tokens_seen": 275937890, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18664551, + "step": 12796, + "time_per_iteration": 2.8698031902313232 + }, + { + "auxiliary_loss_clip": 0.0141114, + "auxiliary_loss_mlp": 0.01028249, + "balance_loss_clip": 1.24997342, + "balance_loss_mlp": 1.01039124, + "epoch": 0.7693972643920036, + "flos": 15933350407680.0, + "grad_norm": 1.825093296871905, + "language_loss": 0.66065049, + "learning_rate": 5.32257457305499e-07, + "loss": 0.68504441, + "num_input_tokens_seen": 275954495, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.1784668, + "step": 12797, + "time_per_iteration": 2.842780828475952 + }, + { + "auxiliary_loss_clip": 0.01408112, + "auxiliary_loss_mlp": 0.01038502, + "balance_loss_clip": 1.24592865, + "balance_loss_mlp": 1.01791477, + "epoch": 0.7694573876446715, + "flos": 25415559319680.0, + "grad_norm": 1.7350722714510556, + "language_loss": 0.91854507, + "learning_rate": 5.319929282681823e-07, + "loss": 0.94301128, + "num_input_tokens_seen": 275972395, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.20593262, + "step": 12798, + "time_per_iteration": 2.8760392665863037 + }, + { + "auxiliary_loss_clip": 0.01413149, + "auxiliary_loss_mlp": 0.01030749, + "balance_loss_clip": 1.25164962, + "balance_loss_mlp": 1.01225936, + "epoch": 0.7695175108973396, + "flos": 16662879866880.0, + "grad_norm": 1.8395056853499427, + "language_loss": 0.82906508, + "learning_rate": 5.317284548978418e-07, + "loss": 0.85350406, + "num_input_tokens_seen": 275989020, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18481445, + "step": 12799, + "time_per_iteration": 2.790595769882202 + }, + { + "auxiliary_loss_clip": 0.01423398, + "auxiliary_loss_mlp": 0.0103075, + "balance_loss_clip": 1.2596941, + "balance_loss_mlp": 1.01214182, + "epoch": 0.7695776341500075, + "flos": 13634554400640.0, + "grad_norm": 2.601764090360966, + "language_loss": 0.78460765, + "learning_rate": 5.314640372045045e-07, + "loss": 0.80914903, + "num_input_tokens_seen": 276006525, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.18591309, + "step": 12800, + "time_per_iteration": 2.8040971755981445 + }, + { + "auxiliary_loss_clip": 0.01428761, + "auxiliary_loss_mlp": 0.01030947, + "balance_loss_clip": 1.26064312, + "balance_loss_mlp": 1.01059818, + "epoch": 0.7696377574026755, + "flos": 24286535107200.0, + "grad_norm": 1.9057333586769767, + "language_loss": 0.84238899, + "learning_rate": 5.31199675198198e-07, + "loss": 0.8669861, + "num_input_tokens_seen": 276027130, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.20361328, + "step": 12801, + "time_per_iteration": 2.8361923694610596 + }, + { + "auxiliary_loss_clip": 0.01404531, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_clip": 1.24355912, + "balance_loss_mlp": 1.01464391, + "epoch": 0.7696978806553435, + "flos": 20932925639040.0, + "grad_norm": 1.852748684212862, + "language_loss": 0.72558135, + "learning_rate": 5.30935368888947e-07, + "loss": 0.74995756, + "num_input_tokens_seen": 276045715, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18432617, + "step": 12802, + "time_per_iteration": 2.8337087631225586 + }, + { + "auxiliary_loss_clip": 0.01383355, + "auxiliary_loss_mlp": 0.01029442, + "balance_loss_clip": 1.2273941, + "balance_loss_mlp": 1.01063108, + "epoch": 0.7697580039080114, + "flos": 22939855568640.0, + "grad_norm": 2.4271224356371106, + "language_loss": 0.77266896, + "learning_rate": 5.306711182867747e-07, + "loss": 0.79679692, + "num_input_tokens_seen": 276065375, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.18798828, + "step": 12803, + "time_per_iteration": 2.9112470149993896 + }, + { + "auxiliary_loss_clip": 0.01191323, + "auxiliary_loss_mlp": 0.01027833, + "balance_loss_clip": 1.1029954, + "balance_loss_mlp": 1.00494528, + "epoch": 0.7698181271606794, + "flos": 68748839804160.0, + "grad_norm": 0.7396126573789003, + "language_loss": 0.55878556, + "learning_rate": 5.304069234017001e-07, + "loss": 0.5809772, + "num_input_tokens_seen": 276131405, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.22851562, + "step": 12804, + "time_per_iteration": 3.367241382598877 + }, + { + "auxiliary_loss_clip": 0.01194097, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.1049726, + "balance_loss_mlp": 1.01039016, + "epoch": 0.7698782504133473, + "flos": 67442002951680.0, + "grad_norm": 0.7368160992069855, + "language_loss": 0.54101014, + "learning_rate": 5.301427842437429e-07, + "loss": 0.56327534, + "num_input_tokens_seen": 276200755, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.22070312, + "step": 12805, + "time_per_iteration": 3.4386563301086426 + }, + { + "auxiliary_loss_clip": 0.01404919, + "auxiliary_loss_mlp": 0.01034801, + "balance_loss_clip": 1.2464087, + "balance_loss_mlp": 1.0150125, + "epoch": 0.7699383736660154, + "flos": 22498210644480.0, + "grad_norm": 5.022265834698802, + "language_loss": 0.73916399, + "learning_rate": 5.298787008229187e-07, + "loss": 0.76356113, + "num_input_tokens_seen": 276217880, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.19799805, + "step": 12806, + "time_per_iteration": 2.8956682682037354 + }, + { + "auxiliary_loss_clip": 0.01398942, + "auxiliary_loss_mlp": 0.01033856, + "balance_loss_clip": 1.23946881, + "balance_loss_mlp": 1.01494956, + "epoch": 0.7699984969186833, + "flos": 21548718374400.0, + "grad_norm": 1.9152635676784635, + "language_loss": 0.75444597, + "learning_rate": 5.296146731492408e-07, + "loss": 0.77877396, + "num_input_tokens_seen": 276234810, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18896484, + "step": 12807, + "time_per_iteration": 2.846027135848999 + }, + { + "auxiliary_loss_clip": 0.01420067, + "auxiliary_loss_mlp": 0.01035381, + "balance_loss_clip": 1.25600529, + "balance_loss_mlp": 1.01621246, + "epoch": 0.7700586201713513, + "flos": 21727345420800.0, + "grad_norm": 2.961570059474174, + "language_loss": 0.81495607, + "learning_rate": 5.293507012327218e-07, + "loss": 0.83951056, + "num_input_tokens_seen": 276252850, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19165039, + "step": 12808, + "time_per_iteration": 2.829636573791504 + }, + { + "auxiliary_loss_clip": 0.014155, + "auxiliary_loss_mlp": 0.01037662, + "balance_loss_clip": 1.25082159, + "balance_loss_mlp": 1.01838541, + "epoch": 0.7701187434240192, + "flos": 27867753757440.0, + "grad_norm": 2.657967422883669, + "language_loss": 0.80193138, + "learning_rate": 5.290867850833718e-07, + "loss": 0.82646304, + "num_input_tokens_seen": 276272525, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.19274902, + "step": 12809, + "time_per_iteration": 2.8995964527130127 + }, + { + "auxiliary_loss_clip": 0.01392872, + "auxiliary_loss_mlp": 0.01028937, + "balance_loss_clip": 1.23448944, + "balance_loss_mlp": 1.01132965, + "epoch": 0.7701788666766872, + "flos": 28633008625920.0, + "grad_norm": 1.5632715822604455, + "language_loss": 0.70570064, + "learning_rate": 5.288229247111993e-07, + "loss": 0.72991872, + "num_input_tokens_seen": 276294210, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.17614746, + "step": 12810, + "time_per_iteration": 4.401693820953369 + }, + { + "auxiliary_loss_clip": 0.0140972, + "auxiliary_loss_mlp": 0.01034196, + "balance_loss_clip": 1.24627531, + "balance_loss_mlp": 1.0138464, + "epoch": 0.7702389899293551, + "flos": 14254554902400.0, + "grad_norm": 2.4259814534654076, + "language_loss": 0.79507029, + "learning_rate": 5.285591201262079e-07, + "loss": 0.81950939, + "num_input_tokens_seen": 276310290, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.20349121, + "step": 12811, + "time_per_iteration": 2.8016722202301025 + }, + { + "auxiliary_loss_clip": 0.01191011, + "auxiliary_loss_mlp": 0.01024106, + "balance_loss_clip": 1.10287786, + "balance_loss_mlp": 1.00465095, + "epoch": 0.7702991131820232, + "flos": 70604678787840.0, + "grad_norm": 0.8155455763012609, + "language_loss": 0.56720567, + "learning_rate": 5.28295371338402e-07, + "loss": 0.58935678, + "num_input_tokens_seen": 276371715, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.19433594, + "step": 12812, + "time_per_iteration": 3.3664188385009766 + }, + { + "auxiliary_loss_clip": 0.01411281, + "auxiliary_loss_mlp": 0.01031571, + "balance_loss_clip": 1.2488817, + "balance_loss_mlp": 1.01242566, + "epoch": 0.7703592364346911, + "flos": 25489996295040.0, + "grad_norm": 1.582022072044726, + "language_loss": 0.72502553, + "learning_rate": 5.280316783577836e-07, + "loss": 0.74945402, + "num_input_tokens_seen": 276389895, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19140625, + "step": 12813, + "time_per_iteration": 2.8728551864624023 + }, + { + "auxiliary_loss_clip": 0.01406276, + "auxiliary_loss_mlp": 0.01031825, + "balance_loss_clip": 1.24533987, + "balance_loss_mlp": 1.01296639, + "epoch": 0.7704193596873591, + "flos": 19290172256640.0, + "grad_norm": 1.6547571617312808, + "language_loss": 0.67868578, + "learning_rate": 5.27768041194351e-07, + "loss": 0.70306683, + "num_input_tokens_seen": 276408990, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18847656, + "step": 12814, + "time_per_iteration": 2.834472179412842 + }, + { + "auxiliary_loss_clip": 0.0140127, + "auxiliary_loss_mlp": 0.01033636, + "balance_loss_clip": 1.24149752, + "balance_loss_mlp": 1.01502776, + "epoch": 0.7704794829400271, + "flos": 23668661111040.0, + "grad_norm": 3.1529219819671446, + "language_loss": 0.66793787, + "learning_rate": 5.275044598581018e-07, + "loss": 0.69228691, + "num_input_tokens_seen": 276428190, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18615723, + "step": 12815, + "time_per_iteration": 2.894047260284424 + }, + { + "auxiliary_loss_clip": 0.01402334, + "auxiliary_loss_mlp": 0.01034432, + "balance_loss_clip": 1.24069047, + "balance_loss_mlp": 1.01477432, + "epoch": 0.770539606192695, + "flos": 18998577648000.0, + "grad_norm": 2.4803119383066385, + "language_loss": 0.66539001, + "learning_rate": 5.272409343590322e-07, + "loss": 0.6897577, + "num_input_tokens_seen": 276446855, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19677734, + "step": 12816, + "time_per_iteration": 2.8270721435546875 + }, + { + "auxiliary_loss_clip": 0.01422185, + "auxiliary_loss_mlp": 0.01034651, + "balance_loss_clip": 1.25859022, + "balance_loss_mlp": 1.01604211, + "epoch": 0.770599729445363, + "flos": 11835506920320.0, + "grad_norm": 2.6438352732834707, + "language_loss": 0.73354208, + "learning_rate": 5.26977464707133e-07, + "loss": 0.7581104, + "num_input_tokens_seen": 276462000, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.18615723, + "step": 12817, + "time_per_iteration": 2.7835729122161865 + }, + { + "auxiliary_loss_clip": 0.01405531, + "auxiliary_loss_mlp": 0.01031967, + "balance_loss_clip": 1.24444675, + "balance_loss_mlp": 1.01383519, + "epoch": 0.770659852698031, + "flos": 17831792010240.0, + "grad_norm": 4.43763307180003, + "language_loss": 0.62213111, + "learning_rate": 5.267140509123957e-07, + "loss": 0.64650607, + "num_input_tokens_seen": 276481190, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18139648, + "step": 12818, + "time_per_iteration": 4.276930093765259 + }, + { + "auxiliary_loss_clip": 0.0140831, + "auxiliary_loss_mlp": 0.01027807, + "balance_loss_clip": 1.2513411, + "balance_loss_mlp": 1.00998497, + "epoch": 0.770719975950699, + "flos": 21882055950720.0, + "grad_norm": 1.7265032021681979, + "language_loss": 0.68078458, + "learning_rate": 5.264506929848093e-07, + "loss": 0.70514578, + "num_input_tokens_seen": 276499520, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.17834473, + "step": 12819, + "time_per_iteration": 2.83488392829895 + }, + { + "auxiliary_loss_clip": 0.01410425, + "auxiliary_loss_mlp": 0.0103153, + "balance_loss_clip": 1.24768114, + "balance_loss_mlp": 1.01293325, + "epoch": 0.7707800992033669, + "flos": 21335089835520.0, + "grad_norm": 1.6130608419056032, + "language_loss": 0.58157355, + "learning_rate": 5.261873909343608e-07, + "loss": 0.60599315, + "num_input_tokens_seen": 276519110, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.18603516, + "step": 12820, + "time_per_iteration": 2.8096301555633545 + }, + { + "auxiliary_loss_clip": 0.01405881, + "auxiliary_loss_mlp": 0.0102939, + "balance_loss_clip": 1.245309, + "balance_loss_mlp": 1.010674, + "epoch": 0.7708402224560349, + "flos": 28189961112960.0, + "grad_norm": 2.4298513974448617, + "language_loss": 0.81753182, + "learning_rate": 5.259241447710343e-07, + "loss": 0.84188455, + "num_input_tokens_seen": 276538805, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18701172, + "step": 12821, + "time_per_iteration": 2.9492671489715576 + }, + { + "auxiliary_loss_clip": 0.01408424, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_clip": 1.24817467, + "balance_loss_mlp": 1.01396906, + "epoch": 0.7709003457087028, + "flos": 15385343662080.0, + "grad_norm": 2.1026370360533093, + "language_loss": 0.6941129, + "learning_rate": 5.256609545048114e-07, + "loss": 0.71853554, + "num_input_tokens_seen": 276554770, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.1986084, + "step": 12822, + "time_per_iteration": 5.659339904785156 + }, + { + "auxiliary_loss_clip": 0.01392994, + "auxiliary_loss_mlp": 0.01035625, + "balance_loss_clip": 1.23607969, + "balance_loss_mlp": 1.0169692, + "epoch": 0.7709604689613708, + "flos": 30632473163520.0, + "grad_norm": 1.6665474498593613, + "language_loss": 0.72808671, + "learning_rate": 5.253978201456733e-07, + "loss": 0.75237292, + "num_input_tokens_seen": 276574535, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.18664551, + "step": 12823, + "time_per_iteration": 2.8972065448760986 + }, + { + "auxiliary_loss_clip": 0.01418669, + "auxiliary_loss_mlp": 0.01034801, + "balance_loss_clip": 1.25296569, + "balance_loss_mlp": 1.01533389, + "epoch": 0.7710205922140387, + "flos": 20310481918080.0, + "grad_norm": 2.0015474202086456, + "language_loss": 0.76665854, + "learning_rate": 5.251347417035969e-07, + "loss": 0.79119325, + "num_input_tokens_seen": 276592925, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.19482422, + "step": 12824, + "time_per_iteration": 2.8395004272460938 + }, + { + "auxiliary_loss_clip": 0.01407299, + "auxiliary_loss_mlp": 0.01030569, + "balance_loss_clip": 1.24720407, + "balance_loss_mlp": 1.01237786, + "epoch": 0.7710807154667068, + "flos": 19653172439040.0, + "grad_norm": 2.29079044249999, + "language_loss": 0.73371273, + "learning_rate": 5.248717191885592e-07, + "loss": 0.75809145, + "num_input_tokens_seen": 276610540, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.18188477, + "step": 12825, + "time_per_iteration": 2.853884220123291 + }, + { + "auxiliary_loss_clip": 0.01381758, + "auxiliary_loss_mlp": 0.01034387, + "balance_loss_clip": 1.22751594, + "balance_loss_mlp": 1.0172925, + "epoch": 0.7711408387193747, + "flos": 20014679543040.0, + "grad_norm": 1.3557029017062532, + "language_loss": 0.74471474, + "learning_rate": 5.246087526105343e-07, + "loss": 0.76887619, + "num_input_tokens_seen": 276629200, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.17102051, + "step": 12826, + "time_per_iteration": 2.814441204071045 + }, + { + "auxiliary_loss_clip": 0.01400668, + "auxiliary_loss_mlp": 0.01033607, + "balance_loss_clip": 1.23765802, + "balance_loss_mlp": 1.01425982, + "epoch": 0.7712009619720427, + "flos": 24981470277120.0, + "grad_norm": 1.58223404994639, + "language_loss": 0.8224957, + "learning_rate": 5.243458419794933e-07, + "loss": 0.84683847, + "num_input_tokens_seen": 276648655, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19348145, + "step": 12827, + "time_per_iteration": 2.882451057434082 + }, + { + "auxiliary_loss_clip": 0.01194414, + "auxiliary_loss_mlp": 0.01026785, + "balance_loss_clip": 1.10544038, + "balance_loss_mlp": 1.00704408, + "epoch": 0.7712610852247107, + "flos": 63280716975360.0, + "grad_norm": 0.8582474124169457, + "language_loss": 0.55192542, + "learning_rate": 5.240829873054051e-07, + "loss": 0.57413739, + "num_input_tokens_seen": 276716500, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.19726562, + "step": 12828, + "time_per_iteration": 3.504800796508789 + }, + { + "auxiliary_loss_clip": 0.01386234, + "auxiliary_loss_mlp": 0.01032854, + "balance_loss_clip": 1.22937012, + "balance_loss_mlp": 1.01417363, + "epoch": 0.7713212084773786, + "flos": 18707164018560.0, + "grad_norm": 5.330907224468873, + "language_loss": 0.7072047, + "learning_rate": 5.23820188598238e-07, + "loss": 0.73139554, + "num_input_tokens_seen": 276733535, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.18676758, + "step": 12829, + "time_per_iteration": 2.8252997398376465 + }, + { + "auxiliary_loss_clip": 0.01411577, + "auxiliary_loss_mlp": 0.0103329, + "balance_loss_clip": 1.24775314, + "balance_loss_mlp": 1.01404977, + "epoch": 0.7713813317300466, + "flos": 14181430026240.0, + "grad_norm": 2.6473450501136577, + "language_loss": 0.80394006, + "learning_rate": 5.235574458679579e-07, + "loss": 0.82838869, + "num_input_tokens_seen": 276749575, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19238281, + "step": 12830, + "time_per_iteration": 2.821965456008911 + }, + { + "auxiliary_loss_clip": 0.01400921, + "auxiliary_loss_mlp": 0.01035052, + "balance_loss_clip": 1.23694754, + "balance_loss_mlp": 1.01590741, + "epoch": 0.7714414549827145, + "flos": 25715297992320.0, + "grad_norm": 1.812954315076191, + "language_loss": 0.78421295, + "learning_rate": 5.232947591245269e-07, + "loss": 0.80857265, + "num_input_tokens_seen": 276769460, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19152832, + "step": 12831, + "time_per_iteration": 2.863431215286255 + }, + { + "auxiliary_loss_clip": 0.01402147, + "auxiliary_loss_mlp": 0.01032215, + "balance_loss_clip": 1.24187994, + "balance_loss_mlp": 1.01333189, + "epoch": 0.7715015782353826, + "flos": 30567582840960.0, + "grad_norm": 2.265148412997826, + "language_loss": 0.61269808, + "learning_rate": 5.230321283779071e-07, + "loss": 0.63704169, + "num_input_tokens_seen": 276790820, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.1887207, + "step": 12832, + "time_per_iteration": 2.901512384414673 + }, + { + "auxiliary_loss_clip": 0.01416509, + "auxiliary_loss_mlp": 0.01033934, + "balance_loss_clip": 1.25405037, + "balance_loss_mlp": 1.01470602, + "epoch": 0.7715617014880505, + "flos": 20238804875520.0, + "grad_norm": 1.705116077266607, + "language_loss": 0.79660481, + "learning_rate": 5.227695536380572e-07, + "loss": 0.82110929, + "num_input_tokens_seen": 276811345, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19226074, + "step": 12833, + "time_per_iteration": 2.8835997581481934 + }, + { + "auxiliary_loss_clip": 0.0118329, + "auxiliary_loss_mlp": 0.01027781, + "balance_loss_clip": 1.09530413, + "balance_loss_mlp": 1.00727749, + "epoch": 0.7716218247407185, + "flos": 63690011867520.0, + "grad_norm": 0.8607344608980417, + "language_loss": 0.55551767, + "learning_rate": 5.22507034914933e-07, + "loss": 0.57762837, + "num_input_tokens_seen": 276870950, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.20507812, + "step": 12834, + "time_per_iteration": 3.3282384872436523 + }, + { + "auxiliary_loss_clip": 0.01407871, + "auxiliary_loss_mlp": 0.01030152, + "balance_loss_clip": 1.24567378, + "balance_loss_mlp": 1.01224685, + "epoch": 0.7716819479933864, + "flos": 19801231983360.0, + "grad_norm": 1.980396611448822, + "language_loss": 0.73991501, + "learning_rate": 5.222445722184903e-07, + "loss": 0.76429522, + "num_input_tokens_seen": 276890760, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.17907715, + "step": 12835, + "time_per_iteration": 2.8459134101867676 + }, + { + "auxiliary_loss_clip": 0.01415156, + "auxiliary_loss_mlp": 0.01038341, + "balance_loss_clip": 1.25190246, + "balance_loss_mlp": 1.01886189, + "epoch": 0.7717420712460544, + "flos": 18451340064000.0, + "grad_norm": 1.8718804345924962, + "language_loss": 0.71372497, + "learning_rate": 5.219821655586814e-07, + "loss": 0.73825991, + "num_input_tokens_seen": 276909625, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19482422, + "step": 12836, + "time_per_iteration": 2.834233283996582 + }, + { + "auxiliary_loss_clip": 0.01398257, + "auxiliary_loss_mlp": 0.01035134, + "balance_loss_clip": 1.24053526, + "balance_loss_mlp": 1.01588213, + "epoch": 0.7718021944987223, + "flos": 35203072014720.0, + "grad_norm": 2.635408883332644, + "language_loss": 0.60660714, + "learning_rate": 5.217198149454575e-07, + "loss": 0.63094103, + "num_input_tokens_seen": 276930760, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.19250488, + "step": 12837, + "time_per_iteration": 2.987755060195923 + }, + { + "auxiliary_loss_clip": 0.01187782, + "auxiliary_loss_mlp": 0.01040288, + "balance_loss_clip": 1.09966731, + "balance_loss_mlp": 1.01615953, + "epoch": 0.7718623177513904, + "flos": 67956275059200.0, + "grad_norm": 0.8538491014193986, + "language_loss": 0.55818582, + "learning_rate": 5.214575203887666e-07, + "loss": 0.58046651, + "num_input_tokens_seen": 276989580, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.24121094, + "step": 12838, + "time_per_iteration": 3.242119312286377 + }, + { + "auxiliary_loss_clip": 0.01406267, + "auxiliary_loss_mlp": 0.01033578, + "balance_loss_clip": 1.2469883, + "balance_loss_mlp": 1.01482606, + "epoch": 0.7719224410040583, + "flos": 18588857569920.0, + "grad_norm": 3.0293397793300985, + "language_loss": 0.6994105, + "learning_rate": 5.211952818985538e-07, + "loss": 0.72380894, + "num_input_tokens_seen": 277005450, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18762207, + "step": 12839, + "time_per_iteration": 2.824303388595581 + }, + { + "auxiliary_loss_clip": 0.01403817, + "auxiliary_loss_mlp": 0.01035474, + "balance_loss_clip": 1.24482298, + "balance_loss_mlp": 1.01682961, + "epoch": 0.7719825642567263, + "flos": 23086150565760.0, + "grad_norm": 1.8188304737358343, + "language_loss": 0.81145251, + "learning_rate": 5.209330994847647e-07, + "loss": 0.83584547, + "num_input_tokens_seen": 277023055, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18640137, + "step": 12840, + "time_per_iteration": 2.842425584793091 + }, + { + "auxiliary_loss_clip": 0.01410858, + "auxiliary_loss_mlp": 0.01033893, + "balance_loss_clip": 1.24964416, + "balance_loss_mlp": 1.01526022, + "epoch": 0.7720426875093943, + "flos": 20348650546560.0, + "grad_norm": 1.671582064062619, + "language_loss": 0.80051064, + "learning_rate": 5.206709731573402e-07, + "loss": 0.82495821, + "num_input_tokens_seen": 277041150, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18640137, + "step": 12841, + "time_per_iteration": 2.853870153427124 + }, + { + "auxiliary_loss_clip": 0.01406611, + "auxiliary_loss_mlp": 0.01033789, + "balance_loss_clip": 1.24525023, + "balance_loss_mlp": 1.015836, + "epoch": 0.7721028107620622, + "flos": 23891836302720.0, + "grad_norm": 1.3840247019846976, + "language_loss": 0.76970279, + "learning_rate": 5.204089029262208e-07, + "loss": 0.79410672, + "num_input_tokens_seen": 277063895, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.1796875, + "step": 12842, + "time_per_iteration": 2.899282455444336 + }, + { + "auxiliary_loss_clip": 0.01414974, + "auxiliary_loss_mlp": 0.01033543, + "balance_loss_clip": 1.25206411, + "balance_loss_mlp": 1.01523256, + "epoch": 0.7721629340147302, + "flos": 26662889980800.0, + "grad_norm": 1.8691851572291214, + "language_loss": 0.69549954, + "learning_rate": 5.201468888013445e-07, + "loss": 0.71998471, + "num_input_tokens_seen": 277084045, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.1829834, + "step": 12843, + "time_per_iteration": 2.898672580718994 + }, + { + "auxiliary_loss_clip": 0.01421102, + "auxiliary_loss_mlp": 0.01030881, + "balance_loss_clip": 1.25505328, + "balance_loss_mlp": 1.0118196, + "epoch": 0.7722230572673981, + "flos": 21188794838400.0, + "grad_norm": 2.2500207691125302, + "language_loss": 0.74403131, + "learning_rate": 5.198849307926465e-07, + "loss": 0.76855111, + "num_input_tokens_seen": 277102625, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.19067383, + "step": 12844, + "time_per_iteration": 2.8157546520233154 + }, + { + "auxiliary_loss_clip": 0.01399684, + "auxiliary_loss_mlp": 0.01034437, + "balance_loss_clip": 1.2416997, + "balance_loss_mlp": 1.0159483, + "epoch": 0.7722831805200662, + "flos": 27976196839680.0, + "grad_norm": 1.6461528938584453, + "language_loss": 0.7220279, + "learning_rate": 5.196230289100596e-07, + "loss": 0.74636912, + "num_input_tokens_seen": 277123210, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18493652, + "step": 12845, + "time_per_iteration": 2.8905839920043945 + }, + { + "auxiliary_loss_clip": 0.01396909, + "auxiliary_loss_mlp": 0.01035547, + "balance_loss_clip": 1.23910141, + "balance_loss_mlp": 1.0170691, + "epoch": 0.7723433037727341, + "flos": 33888724525440.0, + "grad_norm": 1.7125732304986534, + "language_loss": 0.65468359, + "learning_rate": 5.193611831635159e-07, + "loss": 0.67900813, + "num_input_tokens_seen": 277144895, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18469238, + "step": 12846, + "time_per_iteration": 4.394457101821899 + }, + { + "auxiliary_loss_clip": 0.01191548, + "auxiliary_loss_mlp": 0.01023154, + "balance_loss_clip": 1.10113072, + "balance_loss_mlp": 1.00446177, + "epoch": 0.7724034270254021, + "flos": 62879186206080.0, + "grad_norm": 0.7890366956393436, + "language_loss": 0.61851889, + "learning_rate": 5.19099393562945e-07, + "loss": 0.64066589, + "num_input_tokens_seen": 277205160, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.18652344, + "step": 12847, + "time_per_iteration": 3.318760395050049 + }, + { + "auxiliary_loss_clip": 0.01402479, + "auxiliary_loss_mlp": 0.01032916, + "balance_loss_clip": 1.24201107, + "balance_loss_mlp": 1.01451027, + "epoch": 0.77246355027807, + "flos": 23306068131840.0, + "grad_norm": 1.636822252845186, + "language_loss": 0.80271322, + "learning_rate": 5.188376601182732e-07, + "loss": 0.82706714, + "num_input_tokens_seen": 277223005, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18395996, + "step": 12848, + "time_per_iteration": 2.8582839965820312 + }, + { + "auxiliary_loss_clip": 0.01415863, + "auxiliary_loss_mlp": 0.01032764, + "balance_loss_clip": 1.25074041, + "balance_loss_mlp": 1.01485884, + "epoch": 0.772523673530738, + "flos": 20131085710080.0, + "grad_norm": 1.528400076222253, + "language_loss": 0.72917598, + "learning_rate": 5.185759828394261e-07, + "loss": 0.75366223, + "num_input_tokens_seen": 277241785, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.17895508, + "step": 12849, + "time_per_iteration": 2.880981206893921 + }, + { + "auxiliary_loss_clip": 0.01411911, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.25156248, + "balance_loss_mlp": 1.01473999, + "epoch": 0.7725837967834059, + "flos": 17828624874240.0, + "grad_norm": 1.8509400998446042, + "language_loss": 0.78776729, + "learning_rate": 5.183143617363261e-07, + "loss": 0.8122105, + "num_input_tokens_seen": 277259050, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.17675781, + "step": 12850, + "time_per_iteration": 2.865903615951538 + }, + { + "auxiliary_loss_clip": 0.0141755, + "auxiliary_loss_mlp": 0.01033151, + "balance_loss_clip": 1.25367367, + "balance_loss_mlp": 1.0149473, + "epoch": 0.772643920036074, + "flos": 27210399033600.0, + "grad_norm": 1.5332649998210048, + "language_loss": 0.80468416, + "learning_rate": 5.180527968188935e-07, + "loss": 0.82919115, + "num_input_tokens_seen": 277278235, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.18212891, + "step": 12851, + "time_per_iteration": 2.8891830444335938 + }, + { + "auxiliary_loss_clip": 0.01397737, + "auxiliary_loss_mlp": 0.01031241, + "balance_loss_clip": 1.24008679, + "balance_loss_mlp": 1.01312101, + "epoch": 0.7727040432887419, + "flos": 21589375466880.0, + "grad_norm": 1.4253847718918558, + "language_loss": 0.74217695, + "learning_rate": 5.177912880970474e-07, + "loss": 0.76646674, + "num_input_tokens_seen": 277298355, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18139648, + "step": 12852, + "time_per_iteration": 2.83357572555542 + }, + { + "auxiliary_loss_clip": 0.01391553, + "auxiliary_loss_mlp": 0.01032656, + "balance_loss_clip": 1.23369789, + "balance_loss_mlp": 1.01377344, + "epoch": 0.7727641665414099, + "flos": 22246775435520.0, + "grad_norm": 1.9314652279984856, + "language_loss": 0.82790244, + "learning_rate": 5.17529835580704e-07, + "loss": 0.85214448, + "num_input_tokens_seen": 277316095, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.1887207, + "step": 12853, + "time_per_iteration": 4.344544172286987 + }, + { + "auxiliary_loss_clip": 0.01186047, + "auxiliary_loss_mlp": 0.0102747, + "balance_loss_clip": 1.0985899, + "balance_loss_mlp": 1.00982654, + "epoch": 0.7728242897940779, + "flos": 54863863562880.0, + "grad_norm": 0.8239972945649137, + "language_loss": 0.54649103, + "learning_rate": 5.172684392797786e-07, + "loss": 0.56862622, + "num_input_tokens_seen": 277380130, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.17675781, + "step": 12854, + "time_per_iteration": 3.408353805541992 + }, + { + "auxiliary_loss_clip": 0.01420935, + "auxiliary_loss_mlp": 0.01037412, + "balance_loss_clip": 1.25696588, + "balance_loss_mlp": 1.0175401, + "epoch": 0.7728844130467458, + "flos": 34475940529920.0, + "grad_norm": 1.5948628609225892, + "language_loss": 0.72890079, + "learning_rate": 5.170070992041826e-07, + "loss": 0.75348425, + "num_input_tokens_seen": 277404015, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19873047, + "step": 12855, + "time_per_iteration": 2.9575035572052 + }, + { + "auxiliary_loss_clip": 0.01411467, + "auxiliary_loss_mlp": 0.01031313, + "balance_loss_clip": 1.25110567, + "balance_loss_mlp": 1.01288295, + "epoch": 0.7729445362994138, + "flos": 18925588506240.0, + "grad_norm": 2.453061147726039, + "language_loss": 0.69114137, + "learning_rate": 5.167458153638254e-07, + "loss": 0.71556914, + "num_input_tokens_seen": 277421375, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18432617, + "step": 12856, + "time_per_iteration": 2.805394411087036 + }, + { + "auxiliary_loss_clip": 0.01402437, + "auxiliary_loss_mlp": 0.01028546, + "balance_loss_clip": 1.24208808, + "balance_loss_mlp": 1.00983036, + "epoch": 0.7730046595520818, + "flos": 22210145130240.0, + "grad_norm": 1.8924085117425742, + "language_loss": 0.80150884, + "learning_rate": 5.164845877686162e-07, + "loss": 0.82581866, + "num_input_tokens_seen": 277440170, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18713379, + "step": 12857, + "time_per_iteration": 4.295732736587524 + }, + { + "auxiliary_loss_clip": 0.0139525, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.23886681, + "balance_loss_mlp": 1.01533866, + "epoch": 0.7730647828047498, + "flos": 13560026935680.0, + "grad_norm": 2.452016914246388, + "language_loss": 0.78909129, + "learning_rate": 5.162234164284591e-07, + "loss": 0.81338984, + "num_input_tokens_seen": 277456880, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.19274902, + "step": 12858, + "time_per_iteration": 2.8240108489990234 + }, + { + "auxiliary_loss_clip": 0.01407886, + "auxiliary_loss_mlp": 0.01032705, + "balance_loss_clip": 1.24599791, + "balance_loss_mlp": 1.01451421, + "epoch": 0.7731249060574177, + "flos": 21984752943360.0, + "grad_norm": 1.7847964278596005, + "language_loss": 0.77638829, + "learning_rate": 5.159623013532591e-07, + "loss": 0.80079424, + "num_input_tokens_seen": 277475365, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18200684, + "step": 12859, + "time_per_iteration": 2.826036214828491 + }, + { + "auxiliary_loss_clip": 0.01403587, + "auxiliary_loss_mlp": 0.01033257, + "balance_loss_clip": 1.24727583, + "balance_loss_mlp": 1.01485109, + "epoch": 0.7731850293100857, + "flos": 22612128347520.0, + "grad_norm": 1.4274511925819988, + "language_loss": 0.68554693, + "learning_rate": 5.157012425529186e-07, + "loss": 0.7099154, + "num_input_tokens_seen": 277494975, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.18395996, + "step": 12860, + "time_per_iteration": 2.883443832397461 + }, + { + "auxiliary_loss_clip": 0.01418261, + "auxiliary_loss_mlp": 0.01036986, + "balance_loss_clip": 1.25267243, + "balance_loss_mlp": 1.01833022, + "epoch": 0.7732451525627536, + "flos": 14105952420480.0, + "grad_norm": 2.338921187850327, + "language_loss": 0.76285827, + "learning_rate": 5.154402400373343e-07, + "loss": 0.7874108, + "num_input_tokens_seen": 277510520, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.18664551, + "step": 12861, + "time_per_iteration": 2.857778787612915 + }, + { + "auxiliary_loss_clip": 0.01420045, + "auxiliary_loss_mlp": 0.01030487, + "balance_loss_clip": 1.25649738, + "balance_loss_mlp": 1.01252246, + "epoch": 0.7733052758154216, + "flos": 21479846509440.0, + "grad_norm": 1.570606637619816, + "language_loss": 0.7554003, + "learning_rate": 5.15179293816405e-07, + "loss": 0.77990568, + "num_input_tokens_seen": 277530505, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.1796875, + "step": 12862, + "time_per_iteration": 2.909231185913086 + }, + { + "auxiliary_loss_clip": 0.01396884, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.23909283, + "balance_loss_mlp": 1.01394212, + "epoch": 0.7733653990680895, + "flos": 21403373518080.0, + "grad_norm": 1.5202912941199171, + "language_loss": 0.83944178, + "learning_rate": 5.149184039000256e-07, + "loss": 0.86373734, + "num_input_tokens_seen": 277550810, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18737793, + "step": 12863, + "time_per_iteration": 2.9152114391326904 + }, + { + "auxiliary_loss_clip": 0.01401871, + "auxiliary_loss_mlp": 0.01030225, + "balance_loss_clip": 1.24260831, + "balance_loss_mlp": 1.01240349, + "epoch": 0.7734255223207576, + "flos": 17685044565120.0, + "grad_norm": 1.9051990193551385, + "language_loss": 0.74207211, + "learning_rate": 5.146575702980898e-07, + "loss": 0.76639307, + "num_input_tokens_seen": 277567680, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.17822266, + "step": 12864, + "time_per_iteration": 2.9077816009521484 + }, + { + "auxiliary_loss_clip": 0.01408091, + "auxiliary_loss_mlp": 0.0103057, + "balance_loss_clip": 1.2476697, + "balance_loss_mlp": 1.01343989, + "epoch": 0.7734856455734255, + "flos": 25242361649280.0, + "grad_norm": 1.7492293814113908, + "language_loss": 0.8309375, + "learning_rate": 5.143967930204871e-07, + "loss": 0.85532415, + "num_input_tokens_seen": 277588970, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.17114258, + "step": 12865, + "time_per_iteration": 2.897696018218994 + }, + { + "auxiliary_loss_clip": 0.01422875, + "auxiliary_loss_mlp": 0.0103335, + "balance_loss_clip": 1.25755203, + "balance_loss_mlp": 1.0137043, + "epoch": 0.7735457688260935, + "flos": 23441594866560.0, + "grad_norm": 2.0882221303424138, + "language_loss": 0.72544312, + "learning_rate": 5.141360720771077e-07, + "loss": 0.75000536, + "num_input_tokens_seen": 277605450, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19628906, + "step": 12866, + "time_per_iteration": 2.8843281269073486 + }, + { + "auxiliary_loss_clip": 0.01412908, + "auxiliary_loss_mlp": 0.01030208, + "balance_loss_clip": 1.25153685, + "balance_loss_mlp": 1.01211178, + "epoch": 0.7736058920787615, + "flos": 18737233827840.0, + "grad_norm": 2.3386266845781343, + "language_loss": 0.65964878, + "learning_rate": 5.138754074778371e-07, + "loss": 0.68407989, + "num_input_tokens_seen": 277622530, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.18103027, + "step": 12867, + "time_per_iteration": 2.7909903526306152 + }, + { + "auxiliary_loss_clip": 0.01394424, + "auxiliary_loss_mlp": 0.01034607, + "balance_loss_clip": 1.23578787, + "balance_loss_mlp": 1.01609325, + "epoch": 0.7736660153314294, + "flos": 22903587221760.0, + "grad_norm": 1.5177113890457223, + "language_loss": 0.7123813, + "learning_rate": 5.136147992325595e-07, + "loss": 0.73667163, + "num_input_tokens_seen": 277642700, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18518066, + "step": 12868, + "time_per_iteration": 2.842097759246826 + }, + { + "auxiliary_loss_clip": 0.0141998, + "auxiliary_loss_mlp": 0.01031423, + "balance_loss_clip": 1.25731146, + "balance_loss_mlp": 1.01230156, + "epoch": 0.7737261385840974, + "flos": 13806892419840.0, + "grad_norm": 1.9626503488805822, + "language_loss": 0.78537834, + "learning_rate": 5.133542473511578e-07, + "loss": 0.80989236, + "num_input_tokens_seen": 277660005, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19116211, + "step": 12869, + "time_per_iteration": 2.8037359714508057 + }, + { + "auxiliary_loss_clip": 0.01400459, + "auxiliary_loss_mlp": 0.01034163, + "balance_loss_clip": 1.24254501, + "balance_loss_mlp": 1.01498246, + "epoch": 0.7737862618367654, + "flos": 28742447093760.0, + "grad_norm": 1.5303188553265503, + "language_loss": 0.74835062, + "learning_rate": 5.130937518435124e-07, + "loss": 0.77269685, + "num_input_tokens_seen": 277682890, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.19165039, + "step": 12870, + "time_per_iteration": 2.942793130874634 + }, + { + "auxiliary_loss_clip": 0.01415355, + "auxiliary_loss_mlp": 0.01032426, + "balance_loss_clip": 1.25303686, + "balance_loss_mlp": 1.01335311, + "epoch": 0.7738463850894334, + "flos": 17027192148480.0, + "grad_norm": 2.472342749312631, + "language_loss": 0.76694071, + "learning_rate": 5.12833312719501e-07, + "loss": 0.79141855, + "num_input_tokens_seen": 277699330, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19091797, + "step": 12871, + "time_per_iteration": 2.793790578842163 + }, + { + "auxiliary_loss_clip": 0.01400639, + "auxiliary_loss_mlp": 0.01034133, + "balance_loss_clip": 1.24076998, + "balance_loss_mlp": 1.01501179, + "epoch": 0.7739065083421013, + "flos": 20713686744960.0, + "grad_norm": 1.574473737219808, + "language_loss": 0.6963681, + "learning_rate": 5.12572929988999e-07, + "loss": 0.72071576, + "num_input_tokens_seen": 277718750, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.19116211, + "step": 12872, + "time_per_iteration": 2.824270486831665 + }, + { + "auxiliary_loss_clip": 0.01409717, + "auxiliary_loss_mlp": 0.0103417, + "balance_loss_clip": 1.24749494, + "balance_loss_mlp": 1.01439297, + "epoch": 0.7739666315947693, + "flos": 20705135477760.0, + "grad_norm": 1.9495597432679974, + "language_loss": 0.85515273, + "learning_rate": 5.123126036618804e-07, + "loss": 0.87959158, + "num_input_tokens_seen": 277734645, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19763184, + "step": 12873, + "time_per_iteration": 2.8269100189208984 + }, + { + "auxiliary_loss_clip": 0.01406359, + "auxiliary_loss_mlp": 0.01037054, + "balance_loss_clip": 1.24660957, + "balance_loss_mlp": 1.01852906, + "epoch": 0.7740267548474372, + "flos": 29582817609600.0, + "grad_norm": 2.589736076076167, + "language_loss": 0.66572565, + "learning_rate": 5.120523337480174e-07, + "loss": 0.6901598, + "num_input_tokens_seen": 277755535, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18518066, + "step": 12874, + "time_per_iteration": 2.922607660293579 + }, + { + "auxiliary_loss_clip": 0.01406759, + "auxiliary_loss_mlp": 0.01031479, + "balance_loss_clip": 1.24698579, + "balance_loss_mlp": 1.01340723, + "epoch": 0.7740868781001052, + "flos": 23669475517440.0, + "grad_norm": 1.705741267761658, + "language_loss": 0.63018864, + "learning_rate": 5.117921202572785e-07, + "loss": 0.65457106, + "num_input_tokens_seen": 277775585, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18078613, + "step": 12875, + "time_per_iteration": 2.8653311729431152 + }, + { + "auxiliary_loss_clip": 0.01406907, + "auxiliary_loss_mlp": 0.01033276, + "balance_loss_clip": 1.24439144, + "balance_loss_mlp": 1.01476264, + "epoch": 0.7741470013527731, + "flos": 24727999052160.0, + "grad_norm": 1.9197820823430194, + "language_loss": 0.65760988, + "learning_rate": 5.115319631995318e-07, + "loss": 0.68201172, + "num_input_tokens_seen": 277794795, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18505859, + "step": 12876, + "time_per_iteration": 2.8641226291656494 + }, + { + "auxiliary_loss_clip": 0.01398711, + "auxiliary_loss_mlp": 0.01035493, + "balance_loss_clip": 1.24095464, + "balance_loss_mlp": 1.01744461, + "epoch": 0.7742071246054412, + "flos": 21881648747520.0, + "grad_norm": 10.772510389288765, + "language_loss": 0.72669393, + "learning_rate": 5.112718625846433e-07, + "loss": 0.75103593, + "num_input_tokens_seen": 277813235, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.18054199, + "step": 12877, + "time_per_iteration": 2.8400442600250244 + }, + { + "auxiliary_loss_clip": 0.01416097, + "auxiliary_loss_mlp": 0.01034391, + "balance_loss_clip": 1.25183821, + "balance_loss_mlp": 1.01574719, + "epoch": 0.7742672478581091, + "flos": 22684438817280.0, + "grad_norm": 2.0635081434209788, + "language_loss": 0.83884025, + "learning_rate": 5.110118184224736e-07, + "loss": 0.86334509, + "num_input_tokens_seen": 277832560, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.18652344, + "step": 12878, + "time_per_iteration": 2.829723596572876 + }, + { + "auxiliary_loss_clip": 0.01403167, + "auxiliary_loss_mlp": 0.01032724, + "balance_loss_clip": 1.24294448, + "balance_loss_mlp": 1.01477146, + "epoch": 0.7743273711107771, + "flos": 18849477473280.0, + "grad_norm": 1.7196884352332735, + "language_loss": 0.7386834, + "learning_rate": 5.10751830722885e-07, + "loss": 0.76304233, + "num_input_tokens_seen": 277850120, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.17956543, + "step": 12879, + "time_per_iteration": 2.8248355388641357 + }, + { + "auxiliary_loss_clip": 0.01397095, + "auxiliary_loss_mlp": 0.01030791, + "balance_loss_clip": 1.24001825, + "balance_loss_mlp": 1.01233697, + "epoch": 0.7743874943634451, + "flos": 28740682546560.0, + "grad_norm": 1.635856356933866, + "language_loss": 0.80459201, + "learning_rate": 5.104918994957364e-07, + "loss": 0.82887089, + "num_input_tokens_seen": 277871020, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.18457031, + "step": 12880, + "time_per_iteration": 2.8781039714813232 + }, + { + "auxiliary_loss_clip": 0.01398448, + "auxiliary_loss_mlp": 0.0103383, + "balance_loss_clip": 1.23990774, + "balance_loss_mlp": 1.0152092, + "epoch": 0.774447617616113, + "flos": 21920043600000.0, + "grad_norm": 1.4731922130262374, + "language_loss": 0.70960265, + "learning_rate": 5.102320247508847e-07, + "loss": 0.73392546, + "num_input_tokens_seen": 277891525, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18640137, + "step": 12881, + "time_per_iteration": 4.263557434082031 + }, + { + "auxiliary_loss_clip": 0.01424369, + "auxiliary_loss_mlp": 0.01040946, + "balance_loss_clip": 1.25917172, + "balance_loss_mlp": 1.0202992, + "epoch": 0.774507740868781, + "flos": 19510180312320.0, + "grad_norm": 1.9324012921377238, + "language_loss": 0.85210252, + "learning_rate": 5.099722064981832e-07, + "loss": 0.87675571, + "num_input_tokens_seen": 277910425, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.20629883, + "step": 12882, + "time_per_iteration": 2.8392956256866455 + }, + { + "auxiliary_loss_clip": 0.01182439, + "auxiliary_loss_mlp": 0.01023652, + "balance_loss_clip": 1.09456658, + "balance_loss_mlp": 1.00362444, + "epoch": 0.774567864121449, + "flos": 59458650399360.0, + "grad_norm": 0.7902750420997748, + "language_loss": 0.60439098, + "learning_rate": 5.097124447474858e-07, + "loss": 0.62645185, + "num_input_tokens_seen": 277972795, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.20019531, + "step": 12883, + "time_per_iteration": 3.306915760040283 + }, + { + "auxiliary_loss_clip": 0.01407711, + "auxiliary_loss_mlp": 0.01034291, + "balance_loss_clip": 1.24720836, + "balance_loss_mlp": 1.01514626, + "epoch": 0.774627987374117, + "flos": 13233295100160.0, + "grad_norm": 1.648033509030687, + "language_loss": 0.7360152, + "learning_rate": 5.094527395086416e-07, + "loss": 0.76043522, + "num_input_tokens_seen": 277990675, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19140625, + "step": 12884, + "time_per_iteration": 2.818967342376709 + }, + { + "auxiliary_loss_clip": 0.01399085, + "auxiliary_loss_mlp": 0.01032836, + "balance_loss_clip": 1.24254286, + "balance_loss_mlp": 1.01468003, + "epoch": 0.7746881106267849, + "flos": 21403056804480.0, + "grad_norm": 1.6017123188197557, + "language_loss": 0.81852895, + "learning_rate": 5.091930907914986e-07, + "loss": 0.84284818, + "num_input_tokens_seen": 278010050, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.18164062, + "step": 12885, + "time_per_iteration": 2.8797390460968018 + }, + { + "auxiliary_loss_clip": 0.01387622, + "auxiliary_loss_mlp": 0.01029868, + "balance_loss_clip": 1.23049092, + "balance_loss_mlp": 1.01277351, + "epoch": 0.7747482338794529, + "flos": 25640227589760.0, + "grad_norm": 1.710636094915488, + "language_loss": 0.64842707, + "learning_rate": 5.089334986059029e-07, + "loss": 0.67260194, + "num_input_tokens_seen": 278030660, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.17089844, + "step": 12886, + "time_per_iteration": 2.9133424758911133 + }, + { + "auxiliary_loss_clip": 0.01403154, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.2408886, + "balance_loss_mlp": 1.01495147, + "epoch": 0.7748083571321208, + "flos": 11554454350080.0, + "grad_norm": 2.093812379848307, + "language_loss": 0.70476377, + "learning_rate": 5.086739629616987e-07, + "loss": 0.72911429, + "num_input_tokens_seen": 278047645, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.16955566, + "step": 12887, + "time_per_iteration": 2.8770742416381836 + }, + { + "auxiliary_loss_clip": 0.01384822, + "auxiliary_loss_mlp": 0.01029109, + "balance_loss_clip": 1.22812843, + "balance_loss_mlp": 1.01157284, + "epoch": 0.7748684803847888, + "flos": 19071747768960.0, + "grad_norm": 1.653601347565621, + "language_loss": 0.7120955, + "learning_rate": 5.084144838687275e-07, + "loss": 0.73623478, + "num_input_tokens_seen": 278066170, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.17541504, + "step": 12888, + "time_per_iteration": 4.24017596244812 + }, + { + "auxiliary_loss_clip": 0.01409166, + "auxiliary_loss_mlp": 0.01031819, + "balance_loss_clip": 1.24681699, + "balance_loss_mlp": 1.01270974, + "epoch": 0.7749286036374567, + "flos": 22283315251200.0, + "grad_norm": 1.5595073930722814, + "language_loss": 0.82871187, + "learning_rate": 5.081550613368279e-07, + "loss": 0.8531217, + "num_input_tokens_seen": 278085545, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19116211, + "step": 12889, + "time_per_iteration": 2.8465945720672607 + }, + { + "auxiliary_loss_clip": 0.01404989, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.24442959, + "balance_loss_mlp": 1.01511014, + "epoch": 0.7749887268901248, + "flos": 20201812611840.0, + "grad_norm": 1.9255146811125796, + "language_loss": 0.80247045, + "learning_rate": 5.07895695375838e-07, + "loss": 0.82685554, + "num_input_tokens_seen": 278102995, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18395996, + "step": 12890, + "time_per_iteration": 2.8774359226226807 + }, + { + "auxiliary_loss_clip": 0.014116, + "auxiliary_loss_mlp": 0.01033927, + "balance_loss_clip": 1.25043631, + "balance_loss_mlp": 1.01483011, + "epoch": 0.7750488501427927, + "flos": 20346705020160.0, + "grad_norm": 1.6611897094374393, + "language_loss": 0.67177778, + "learning_rate": 5.076363859955932e-07, + "loss": 0.69623303, + "num_input_tokens_seen": 278121460, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.1907959, + "step": 12891, + "time_per_iteration": 2.8410561084747314 + }, + { + "auxiliary_loss_clip": 0.0142054, + "auxiliary_loss_mlp": 0.0103039, + "balance_loss_clip": 1.2580502, + "balance_loss_mlp": 1.01174533, + "epoch": 0.7751089733954607, + "flos": 28375193900160.0, + "grad_norm": 1.4249943814396275, + "language_loss": 0.79196048, + "learning_rate": 5.073771332059257e-07, + "loss": 0.81646979, + "num_input_tokens_seen": 278143905, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.1862793, + "step": 12892, + "time_per_iteration": 5.785657644271851 + }, + { + "auxiliary_loss_clip": 0.01416146, + "auxiliary_loss_mlp": 0.01030276, + "balance_loss_clip": 1.25153244, + "balance_loss_mlp": 1.01092863, + "epoch": 0.7751690966481286, + "flos": 16951940766720.0, + "grad_norm": 1.9238127274112011, + "language_loss": 0.6787461, + "learning_rate": 5.071179370166669e-07, + "loss": 0.70321041, + "num_input_tokens_seen": 278160850, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19335938, + "step": 12893, + "time_per_iteration": 2.847093343734741 + }, + { + "auxiliary_loss_clip": 0.01180029, + "auxiliary_loss_mlp": 0.01019842, + "balance_loss_clip": 1.09286535, + "balance_loss_mlp": 1.00200808, + "epoch": 0.7752292199007966, + "flos": 65702389155840.0, + "grad_norm": 0.8115400954314361, + "language_loss": 0.58626282, + "learning_rate": 5.068587974376468e-07, + "loss": 0.60826153, + "num_input_tokens_seen": 278219950, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.17871094, + "step": 12894, + "time_per_iteration": 3.3785808086395264 + }, + { + "auxiliary_loss_clip": 0.01411436, + "auxiliary_loss_mlp": 0.01035125, + "balance_loss_clip": 1.24907136, + "balance_loss_mlp": 1.01657593, + "epoch": 0.7752893431534646, + "flos": 20604338766720.0, + "grad_norm": 2.9036921514873475, + "language_loss": 0.78758126, + "learning_rate": 5.065997144786895e-07, + "loss": 0.81204683, + "num_input_tokens_seen": 278237805, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.1854248, + "step": 12895, + "time_per_iteration": 2.82365083694458 + }, + { + "auxiliary_loss_clip": 0.01404765, + "auxiliary_loss_mlp": 0.01033832, + "balance_loss_clip": 1.24524379, + "balance_loss_mlp": 1.01475835, + "epoch": 0.7753494664061326, + "flos": 20495036033280.0, + "grad_norm": 1.7159670496360433, + "language_loss": 0.68481636, + "learning_rate": 5.063406881496209e-07, + "loss": 0.70920229, + "num_input_tokens_seen": 278257660, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19067383, + "step": 12896, + "time_per_iteration": 2.866715669631958 + }, + { + "auxiliary_loss_clip": 0.01414593, + "auxiliary_loss_mlp": 0.01033581, + "balance_loss_clip": 1.25368404, + "balance_loss_mlp": 1.01475775, + "epoch": 0.7754095896588006, + "flos": 20275389936000.0, + "grad_norm": 2.1715762160471135, + "language_loss": 0.69713795, + "learning_rate": 5.060817184602629e-07, + "loss": 0.72161967, + "num_input_tokens_seen": 278275110, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18823242, + "step": 12897, + "time_per_iteration": 2.894561767578125 + }, + { + "auxiliary_loss_clip": 0.01405168, + "auxiliary_loss_mlp": 0.01034799, + "balance_loss_clip": 1.24291015, + "balance_loss_mlp": 1.01610732, + "epoch": 0.7754697129114685, + "flos": 23341476827520.0, + "grad_norm": 1.7155598508936065, + "language_loss": 0.75748509, + "learning_rate": 5.058228054204364e-07, + "loss": 0.78188479, + "num_input_tokens_seen": 278293035, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.18701172, + "step": 12898, + "time_per_iteration": 2.8963704109191895 + }, + { + "auxiliary_loss_clip": 0.01408941, + "auxiliary_loss_mlp": 0.01032849, + "balance_loss_clip": 1.24713421, + "balance_loss_mlp": 1.01377547, + "epoch": 0.7755298361641365, + "flos": 17356502937600.0, + "grad_norm": 1.9640894237361135, + "language_loss": 0.70487875, + "learning_rate": 5.055639490399588e-07, + "loss": 0.72929668, + "num_input_tokens_seen": 278311010, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19067383, + "step": 12899, + "time_per_iteration": 2.8076064586639404 + }, + { + "auxiliary_loss_clip": 0.01410123, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.24884725, + "balance_loss_mlp": 1.01614261, + "epoch": 0.7755899594168044, + "flos": 19655253699840.0, + "grad_norm": 1.7865752410334947, + "language_loss": 0.75947958, + "learning_rate": 5.053051493286453e-07, + "loss": 0.78394055, + "num_input_tokens_seen": 278329900, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19824219, + "step": 12900, + "time_per_iteration": 2.842414617538452 + }, + { + "auxiliary_loss_clip": 0.01393273, + "auxiliary_loss_mlp": 0.01032571, + "balance_loss_clip": 1.23631525, + "balance_loss_mlp": 1.01482034, + "epoch": 0.7756500826694724, + "flos": 27425113447680.0, + "grad_norm": 1.5490326160186982, + "language_loss": 0.7846278, + "learning_rate": 5.050464062963113e-07, + "loss": 0.80888623, + "num_input_tokens_seen": 278349980, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.17749023, + "step": 12901, + "time_per_iteration": 2.9282703399658203 + }, + { + "auxiliary_loss_clip": 0.0140564, + "auxiliary_loss_mlp": 0.01032055, + "balance_loss_clip": 1.246948, + "balance_loss_mlp": 1.01330352, + "epoch": 0.7757102059221404, + "flos": 28742175624960.0, + "grad_norm": 1.6164464857059373, + "language_loss": 0.77544475, + "learning_rate": 5.047877199527666e-07, + "loss": 0.79982167, + "num_input_tokens_seen": 278372485, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18737793, + "step": 12902, + "time_per_iteration": 2.888806104660034 + }, + { + "auxiliary_loss_clip": 0.01401961, + "auxiliary_loss_mlp": 0.01030691, + "balance_loss_clip": 1.24158609, + "balance_loss_mlp": 1.01185608, + "epoch": 0.7757703291748084, + "flos": 22495631690880.0, + "grad_norm": 1.7845514698238587, + "language_loss": 0.74036175, + "learning_rate": 5.045290903078215e-07, + "loss": 0.76468825, + "num_input_tokens_seen": 278391660, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18823242, + "step": 12903, + "time_per_iteration": 2.9202637672424316 + }, + { + "auxiliary_loss_clip": 0.01397406, + "auxiliary_loss_mlp": 0.0102913, + "balance_loss_clip": 1.23860967, + "balance_loss_mlp": 1.01024699, + "epoch": 0.7758304524274763, + "flos": 21439098927360.0, + "grad_norm": 3.515074531139347, + "language_loss": 0.76816487, + "learning_rate": 5.042705173712835e-07, + "loss": 0.79243016, + "num_input_tokens_seen": 278409125, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18884277, + "step": 12904, + "time_per_iteration": 2.8438947200775146 + }, + { + "auxiliary_loss_clip": 0.01387382, + "auxiliary_loss_mlp": 0.01028133, + "balance_loss_clip": 1.23108673, + "balance_loss_mlp": 1.00989437, + "epoch": 0.7758905756801443, + "flos": 23669656496640.0, + "grad_norm": 2.444856560142635, + "language_loss": 0.69312501, + "learning_rate": 5.040120011529576e-07, + "loss": 0.71728009, + "num_input_tokens_seen": 278429450, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.18237305, + "step": 12905, + "time_per_iteration": 2.9289638996124268 + }, + { + "auxiliary_loss_clip": 0.01394269, + "auxiliary_loss_mlp": 0.01029395, + "balance_loss_clip": 1.23834491, + "balance_loss_mlp": 1.01063156, + "epoch": 0.7759506989328122, + "flos": 28377003692160.0, + "grad_norm": 1.831198117330667, + "language_loss": 0.67560363, + "learning_rate": 5.037535416626459e-07, + "loss": 0.69984019, + "num_input_tokens_seen": 278449925, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.18762207, + "step": 12906, + "time_per_iteration": 2.909403085708618 + }, + { + "auxiliary_loss_clip": 0.01399577, + "auxiliary_loss_mlp": 0.01028905, + "balance_loss_clip": 1.23958075, + "balance_loss_mlp": 1.01030803, + "epoch": 0.7760108221854802, + "flos": 14910959485440.0, + "grad_norm": 2.271311031902538, + "language_loss": 0.81858087, + "learning_rate": 5.034951389101498e-07, + "loss": 0.84286571, + "num_input_tokens_seen": 278467255, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18603516, + "step": 12907, + "time_per_iteration": 2.904597759246826 + }, + { + "auxiliary_loss_clip": 0.01391529, + "auxiliary_loss_mlp": 0.01033654, + "balance_loss_clip": 1.23651075, + "balance_loss_mlp": 1.01444948, + "epoch": 0.7760709454381483, + "flos": 14800435142400.0, + "grad_norm": 1.9781690676265724, + "language_loss": 0.67480612, + "learning_rate": 5.032367929052685e-07, + "loss": 0.69905794, + "num_input_tokens_seen": 278484250, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.1920166, + "step": 12908, + "time_per_iteration": 2.819546699523926 + }, + { + "auxiliary_loss_clip": 0.01399182, + "auxiliary_loss_mlp": 0.01032662, + "balance_loss_clip": 1.23831058, + "balance_loss_mlp": 1.01401782, + "epoch": 0.7761310686908162, + "flos": 17387613377280.0, + "grad_norm": 1.660729500401547, + "language_loss": 0.70856047, + "learning_rate": 5.029785036577976e-07, + "loss": 0.73287892, + "num_input_tokens_seen": 278502740, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18652344, + "step": 12909, + "time_per_iteration": 2.871462821960449 + }, + { + "auxiliary_loss_clip": 0.01390466, + "auxiliary_loss_mlp": 0.01039097, + "balance_loss_clip": 1.2337296, + "balance_loss_mlp": 1.02051258, + "epoch": 0.7761911919434842, + "flos": 25567464672000.0, + "grad_norm": 1.7011111376816004, + "language_loss": 0.68184507, + "learning_rate": 5.027202711775324e-07, + "loss": 0.70614076, + "num_input_tokens_seen": 278523890, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.18591309, + "step": 12910, + "time_per_iteration": 2.9592645168304443 + }, + { + "auxiliary_loss_clip": 0.01407324, + "auxiliary_loss_mlp": 0.01034038, + "balance_loss_clip": 1.24603701, + "balance_loss_mlp": 1.01582265, + "epoch": 0.7762513151961521, + "flos": 23188711824000.0, + "grad_norm": 1.6409684090076275, + "language_loss": 0.72162968, + "learning_rate": 5.024620954742646e-07, + "loss": 0.74604332, + "num_input_tokens_seen": 278543185, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18225098, + "step": 12911, + "time_per_iteration": 2.8720638751983643 + }, + { + "auxiliary_loss_clip": 0.01414866, + "auxiliary_loss_mlp": 0.01031731, + "balance_loss_clip": 1.25287592, + "balance_loss_mlp": 1.01221645, + "epoch": 0.7763114384488201, + "flos": 21699673585920.0, + "grad_norm": 2.4355548093365713, + "language_loss": 0.64253592, + "learning_rate": 5.022039765577836e-07, + "loss": 0.6670019, + "num_input_tokens_seen": 278559220, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19494629, + "step": 12912, + "time_per_iteration": 2.8300845623016357 + }, + { + "auxiliary_loss_clip": 0.01186086, + "auxiliary_loss_mlp": 0.01021148, + "balance_loss_clip": 1.09646487, + "balance_loss_mlp": 1.00178862, + "epoch": 0.776371561701488, + "flos": 69060975552000.0, + "grad_norm": 0.7691832355903192, + "language_loss": 0.53233808, + "learning_rate": 5.019459144378779e-07, + "loss": 0.5544104, + "num_input_tokens_seen": 278618185, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.19335938, + "step": 12913, + "time_per_iteration": 3.420860767364502 + }, + { + "auxiliary_loss_clip": 0.01399962, + "auxiliary_loss_mlp": 0.01031996, + "balance_loss_clip": 1.24038529, + "balance_loss_mlp": 1.01254129, + "epoch": 0.776431684954156, + "flos": 22904356383360.0, + "grad_norm": 8.809323581750837, + "language_loss": 0.62608898, + "learning_rate": 5.016879091243338e-07, + "loss": 0.65040857, + "num_input_tokens_seen": 278636210, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.19458008, + "step": 12914, + "time_per_iteration": 2.847818374633789 + }, + { + "auxiliary_loss_clip": 0.01395522, + "auxiliary_loss_mlp": 0.01033794, + "balance_loss_clip": 1.2362144, + "balance_loss_mlp": 1.01442289, + "epoch": 0.776491808206824, + "flos": 20269915315200.0, + "grad_norm": 2.1197376985735747, + "language_loss": 0.82912111, + "learning_rate": 5.014299606269339e-07, + "loss": 0.8534143, + "num_input_tokens_seen": 278653305, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.19348145, + "step": 12915, + "time_per_iteration": 4.3609397411346436 + }, + { + "auxiliary_loss_clip": 0.01407485, + "auxiliary_loss_mlp": 0.01036332, + "balance_loss_clip": 1.243343, + "balance_loss_mlp": 1.01645947, + "epoch": 0.776551931459492, + "flos": 26769885229440.0, + "grad_norm": 3.88367252532672, + "language_loss": 0.75204778, + "learning_rate": 5.011720689554603e-07, + "loss": 0.77648592, + "num_input_tokens_seen": 278671850, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19873047, + "step": 12916, + "time_per_iteration": 2.8761372566223145 + }, + { + "auxiliary_loss_clip": 0.01404914, + "auxiliary_loss_mlp": 0.01034154, + "balance_loss_clip": 1.24298143, + "balance_loss_mlp": 1.01521134, + "epoch": 0.7766120547121599, + "flos": 52682930835840.0, + "grad_norm": 1.4204809177215751, + "language_loss": 0.65994239, + "learning_rate": 5.009142341196919e-07, + "loss": 0.68433303, + "num_input_tokens_seen": 278697860, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.1895752, + "step": 12917, + "time_per_iteration": 3.153193712234497 + }, + { + "auxiliary_loss_clip": 0.01401549, + "auxiliary_loss_mlp": 0.01034739, + "balance_loss_clip": 1.24165678, + "balance_loss_mlp": 1.01564169, + "epoch": 0.7766721779648279, + "flos": 25167291246720.0, + "grad_norm": 1.4470181056708658, + "language_loss": 0.64872062, + "learning_rate": 5.006564561294065e-07, + "loss": 0.67308342, + "num_input_tokens_seen": 278720655, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19116211, + "step": 12918, + "time_per_iteration": 2.93001651763916 + }, + { + "auxiliary_loss_clip": 0.01399099, + "auxiliary_loss_mlp": 0.01027931, + "balance_loss_clip": 1.24001288, + "balance_loss_mlp": 1.0092988, + "epoch": 0.7767323012174958, + "flos": 23769141108480.0, + "grad_norm": 2.0456692122967, + "language_loss": 0.73853737, + "learning_rate": 5.003987349943777e-07, + "loss": 0.76280761, + "num_input_tokens_seen": 278737375, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.1862793, + "step": 12919, + "time_per_iteration": 2.8447794914245605 + }, + { + "auxiliary_loss_clip": 0.01412471, + "auxiliary_loss_mlp": 0.01031809, + "balance_loss_clip": 1.24922848, + "balance_loss_mlp": 1.01239014, + "epoch": 0.7767924244701638, + "flos": 22095684489600.0, + "grad_norm": 1.7103816970787662, + "language_loss": 0.79628801, + "learning_rate": 5.001410707243792e-07, + "loss": 0.82073087, + "num_input_tokens_seen": 278756510, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19421387, + "step": 12920, + "time_per_iteration": 2.863222360610962 + }, + { + "auxiliary_loss_clip": 0.01403743, + "auxiliary_loss_mlp": 0.01032681, + "balance_loss_clip": 1.24236584, + "balance_loss_mlp": 1.01365542, + "epoch": 0.7768525477228319, + "flos": 21991811132160.0, + "grad_norm": 1.5147490201452205, + "language_loss": 0.71430427, + "learning_rate": 4.998834633291829e-07, + "loss": 0.7386685, + "num_input_tokens_seen": 278775410, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19018555, + "step": 12921, + "time_per_iteration": 2.8834228515625 + }, + { + "auxiliary_loss_clip": 0.01411289, + "auxiliary_loss_mlp": 0.01033825, + "balance_loss_clip": 1.24811506, + "balance_loss_mlp": 1.01425111, + "epoch": 0.7769126709754998, + "flos": 21803501698560.0, + "grad_norm": 1.6706662816536082, + "language_loss": 0.7694155, + "learning_rate": 4.996259128185547e-07, + "loss": 0.79386663, + "num_input_tokens_seen": 278794260, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19580078, + "step": 12922, + "time_per_iteration": 2.86824369430542 + }, + { + "auxiliary_loss_clip": 0.01399293, + "auxiliary_loss_mlp": 0.01033788, + "balance_loss_clip": 1.24095106, + "balance_loss_mlp": 1.01519096, + "epoch": 0.7769727942281678, + "flos": 20057689365120.0, + "grad_norm": 1.7932728023572104, + "language_loss": 0.81364036, + "learning_rate": 4.993684192022625e-07, + "loss": 0.83797121, + "num_input_tokens_seen": 278813290, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18591309, + "step": 12923, + "time_per_iteration": 2.8691940307617188 + }, + { + "auxiliary_loss_clip": 0.01398589, + "auxiliary_loss_mlp": 0.01032021, + "balance_loss_clip": 1.24010944, + "balance_loss_mlp": 1.01466453, + "epoch": 0.7770329174808357, + "flos": 21696099246720.0, + "grad_norm": 1.9396210118800121, + "language_loss": 0.92447531, + "learning_rate": 4.991109824900699e-07, + "loss": 0.94878137, + "num_input_tokens_seen": 278830610, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.17358398, + "step": 12924, + "time_per_iteration": 4.334291219711304 + }, + { + "auxiliary_loss_clip": 0.01399095, + "auxiliary_loss_mlp": 0.01028877, + "balance_loss_clip": 1.23842335, + "balance_loss_mlp": 1.01041198, + "epoch": 0.7770930407335037, + "flos": 25860461869440.0, + "grad_norm": 1.9375200640683818, + "language_loss": 0.66922653, + "learning_rate": 4.988536026917401e-07, + "loss": 0.69350624, + "num_input_tokens_seen": 278849530, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18469238, + "step": 12925, + "time_per_iteration": 2.8780159950256348 + }, + { + "auxiliary_loss_clip": 0.01413769, + "auxiliary_loss_mlp": 0.01033119, + "balance_loss_clip": 1.24999297, + "balance_loss_mlp": 1.01480806, + "epoch": 0.7771531639861716, + "flos": 24357262008960.0, + "grad_norm": 1.9302850055585379, + "language_loss": 0.72299391, + "learning_rate": 4.985962798170314e-07, + "loss": 0.74746281, + "num_input_tokens_seen": 278869005, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.18310547, + "step": 12926, + "time_per_iteration": 2.9007248878479004 + }, + { + "auxiliary_loss_clip": 0.0140978, + "auxiliary_loss_mlp": 0.01028382, + "balance_loss_clip": 1.24740434, + "balance_loss_mlp": 1.00990427, + "epoch": 0.7772132872388396, + "flos": 25640996751360.0, + "grad_norm": 2.7102063772777587, + "language_loss": 0.66797996, + "learning_rate": 4.983390138757027e-07, + "loss": 0.69236153, + "num_input_tokens_seen": 278888790, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18481445, + "step": 12927, + "time_per_iteration": 5.581921815872192 + }, + { + "auxiliary_loss_clip": 0.01399915, + "auxiliary_loss_mlp": 0.01040382, + "balance_loss_clip": 1.23862267, + "balance_loss_mlp": 1.0192343, + "epoch": 0.7772734104915076, + "flos": 26078479153920.0, + "grad_norm": 1.7177301025951173, + "language_loss": 0.72823524, + "learning_rate": 4.980818048775093e-07, + "loss": 0.75263822, + "num_input_tokens_seen": 278908150, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.21142578, + "step": 12928, + "time_per_iteration": 2.88481068611145 + }, + { + "auxiliary_loss_clip": 0.01386972, + "auxiliary_loss_mlp": 0.01030345, + "balance_loss_clip": 1.22883511, + "balance_loss_mlp": 1.01254702, + "epoch": 0.7773335337441756, + "flos": 22934109479040.0, + "grad_norm": 1.7365717101075782, + "language_loss": 0.74834478, + "learning_rate": 4.978246528322036e-07, + "loss": 0.77251792, + "num_input_tokens_seen": 278927425, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.17810059, + "step": 12929, + "time_per_iteration": 2.8613927364349365 + }, + { + "auxiliary_loss_clip": 0.01403881, + "auxiliary_loss_mlp": 0.01032503, + "balance_loss_clip": 1.24206829, + "balance_loss_mlp": 1.0137279, + "epoch": 0.7773936569968435, + "flos": 20786404417920.0, + "grad_norm": 2.1459134323343565, + "language_loss": 0.78053153, + "learning_rate": 4.975675577495377e-07, + "loss": 0.8048954, + "num_input_tokens_seen": 278946475, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18774414, + "step": 12930, + "time_per_iteration": 2.841214418411255 + }, + { + "auxiliary_loss_clip": 0.01411855, + "auxiliary_loss_mlp": 0.01034634, + "balance_loss_clip": 1.25208688, + "balance_loss_mlp": 1.01397562, + "epoch": 0.7774537802495115, + "flos": 20380982595840.0, + "grad_norm": 1.707436002694152, + "language_loss": 0.80083668, + "learning_rate": 4.973105196392613e-07, + "loss": 0.82530153, + "num_input_tokens_seen": 278964345, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.20654297, + "step": 12931, + "time_per_iteration": 2.8196892738342285 + }, + { + "auxiliary_loss_clip": 0.01187964, + "auxiliary_loss_mlp": 0.01047396, + "balance_loss_clip": 1.09740114, + "balance_loss_mlp": 1.02860916, + "epoch": 0.7775139035021794, + "flos": 53941409700480.0, + "grad_norm": 0.8130677247362975, + "language_loss": 0.59784985, + "learning_rate": 4.970535385111199e-07, + "loss": 0.6202035, + "num_input_tokens_seen": 279022380, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.1875, + "step": 12932, + "time_per_iteration": 3.305485725402832 + }, + { + "auxiliary_loss_clip": 0.0140387, + "auxiliary_loss_mlp": 0.01033784, + "balance_loss_clip": 1.24128795, + "balance_loss_mlp": 1.01503217, + "epoch": 0.7775740267548474, + "flos": 28854826473600.0, + "grad_norm": 2.085943463338824, + "language_loss": 0.76518631, + "learning_rate": 4.967966143748595e-07, + "loss": 0.78956276, + "num_input_tokens_seen": 279044275, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.1875, + "step": 12933, + "time_per_iteration": 2.9381344318389893 + }, + { + "auxiliary_loss_clip": 0.01393511, + "auxiliary_loss_mlp": 0.01030514, + "balance_loss_clip": 1.23433685, + "balance_loss_mlp": 1.01111913, + "epoch": 0.7776341500075155, + "flos": 21882689377920.0, + "grad_norm": 2.6204597020687426, + "language_loss": 0.74065655, + "learning_rate": 4.965397472402215e-07, + "loss": 0.76489681, + "num_input_tokens_seen": 279063375, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19396973, + "step": 12934, + "time_per_iteration": 2.9044363498687744 + }, + { + "auxiliary_loss_clip": 0.01407446, + "auxiliary_loss_mlp": 0.01032343, + "balance_loss_clip": 1.24561453, + "balance_loss_mlp": 1.01241159, + "epoch": 0.7776942732601834, + "flos": 20239438302720.0, + "grad_norm": 2.3637344195400773, + "language_loss": 0.70880842, + "learning_rate": 4.962829371169475e-07, + "loss": 0.73320627, + "num_input_tokens_seen": 279082680, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19934082, + "step": 12935, + "time_per_iteration": 2.863309621810913 + }, + { + "auxiliary_loss_clip": 0.01408723, + "auxiliary_loss_mlp": 0.01034041, + "balance_loss_clip": 1.24709415, + "balance_loss_mlp": 1.01471734, + "epoch": 0.7777543965128514, + "flos": 22240712632320.0, + "grad_norm": 1.655110887743921, + "language_loss": 0.84023738, + "learning_rate": 4.960261840147746e-07, + "loss": 0.86466497, + "num_input_tokens_seen": 279099805, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19335938, + "step": 12936, + "time_per_iteration": 2.849276304244995 + }, + { + "auxiliary_loss_clip": 0.01408111, + "auxiliary_loss_mlp": 0.01027521, + "balance_loss_clip": 1.24424469, + "balance_loss_mlp": 1.00928164, + "epoch": 0.7778145197655193, + "flos": 14510152632960.0, + "grad_norm": 2.053210204800827, + "language_loss": 0.69038153, + "learning_rate": 4.957694879434397e-07, + "loss": 0.71473783, + "num_input_tokens_seen": 279117975, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.18249512, + "step": 12937, + "time_per_iteration": 2.8591666221618652 + }, + { + "auxiliary_loss_clip": 0.01409464, + "auxiliary_loss_mlp": 0.0103047, + "balance_loss_clip": 1.24660182, + "balance_loss_mlp": 1.01134872, + "epoch": 0.7778746430181873, + "flos": 21149676069120.0, + "grad_norm": 1.4955594043905462, + "language_loss": 0.87539577, + "learning_rate": 4.955128489126777e-07, + "loss": 0.89979511, + "num_input_tokens_seen": 279137255, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19128418, + "step": 12938, + "time_per_iteration": 2.859717845916748 + }, + { + "auxiliary_loss_clip": 0.0139337, + "auxiliary_loss_mlp": 0.01031411, + "balance_loss_clip": 1.23325539, + "balance_loss_mlp": 1.01231384, + "epoch": 0.7779347662708552, + "flos": 20275978118400.0, + "grad_norm": 2.063790364266617, + "language_loss": 0.86109728, + "learning_rate": 4.95256266932218e-07, + "loss": 0.8853451, + "num_input_tokens_seen": 279154500, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19104004, + "step": 12939, + "time_per_iteration": 2.827425479888916 + }, + { + "auxiliary_loss_clip": 0.01403351, + "auxiliary_loss_mlp": 0.01028961, + "balance_loss_clip": 1.2458595, + "balance_loss_mlp": 1.00938725, + "epoch": 0.7779948895235232, + "flos": 19218540458880.0, + "grad_norm": 1.6384197250057164, + "language_loss": 0.70029455, + "learning_rate": 4.949997420117915e-07, + "loss": 0.72461772, + "num_input_tokens_seen": 279173635, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.19580078, + "step": 12940, + "time_per_iteration": 2.831784963607788 + }, + { + "auxiliary_loss_clip": 0.01408528, + "auxiliary_loss_mlp": 0.01033473, + "balance_loss_clip": 1.24606466, + "balance_loss_mlp": 1.01484108, + "epoch": 0.7780550127761912, + "flos": 23925208982400.0, + "grad_norm": 1.533306348141608, + "language_loss": 0.78259188, + "learning_rate": 4.947432741611255e-07, + "loss": 0.8070119, + "num_input_tokens_seen": 279194430, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.18640137, + "step": 12941, + "time_per_iteration": 2.8709566593170166 + }, + { + "auxiliary_loss_clip": 0.01407493, + "auxiliary_loss_mlp": 0.01038971, + "balance_loss_clip": 1.24196196, + "balance_loss_mlp": 1.01773989, + "epoch": 0.7781151360288592, + "flos": 32428670221440.0, + "grad_norm": 2.4282514478124124, + "language_loss": 0.74448013, + "learning_rate": 4.944868633899462e-07, + "loss": 0.76894474, + "num_input_tokens_seen": 279212920, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.21240234, + "step": 12942, + "time_per_iteration": 2.9307773113250732 + }, + { + "auxiliary_loss_clip": 0.01385679, + "auxiliary_loss_mlp": 0.01033378, + "balance_loss_clip": 1.22930598, + "balance_loss_mlp": 1.01434016, + "epoch": 0.7781752592815271, + "flos": 22356621106560.0, + "grad_norm": 1.8541942870245243, + "language_loss": 0.67977583, + "learning_rate": 4.942305097079751e-07, + "loss": 0.70396638, + "num_input_tokens_seen": 279232310, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.19042969, + "step": 12943, + "time_per_iteration": 2.8550002574920654 + }, + { + "auxiliary_loss_clip": 0.01192954, + "auxiliary_loss_mlp": 0.01030374, + "balance_loss_clip": 1.09987271, + "balance_loss_mlp": 1.0073905, + "epoch": 0.7782353825341951, + "flos": 70489539475200.0, + "grad_norm": 0.78305372188722, + "language_loss": 0.58594441, + "learning_rate": 4.939742131249347e-07, + "loss": 0.60817766, + "num_input_tokens_seen": 279295375, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.22949219, + "step": 12944, + "time_per_iteration": 3.5229294300079346 + }, + { + "auxiliary_loss_clip": 0.01402089, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.24046481, + "balance_loss_mlp": 1.01230288, + "epoch": 0.778295505786863, + "flos": 19071928748160.0, + "grad_norm": 13.09127850311997, + "language_loss": 0.68005621, + "learning_rate": 4.937179736505428e-07, + "loss": 0.7043997, + "num_input_tokens_seen": 279313660, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19958496, + "step": 12945, + "time_per_iteration": 2.8445849418640137 + }, + { + "auxiliary_loss_clip": 0.01399312, + "auxiliary_loss_mlp": 0.01035642, + "balance_loss_clip": 1.23858118, + "balance_loss_mlp": 1.016258, + "epoch": 0.778355629039531, + "flos": 21010484505600.0, + "grad_norm": 2.6435631212804602, + "language_loss": 0.69987553, + "learning_rate": 4.93461791294516e-07, + "loss": 0.72422504, + "num_input_tokens_seen": 279334495, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19384766, + "step": 12946, + "time_per_iteration": 2.86156964302063 + }, + { + "auxiliary_loss_clip": 0.01414371, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.25164771, + "balance_loss_mlp": 1.01487184, + "epoch": 0.7784157522921991, + "flos": 21408169466880.0, + "grad_norm": 2.5102933487474703, + "language_loss": 0.66183341, + "learning_rate": 4.932056660665689e-07, + "loss": 0.68632096, + "num_input_tokens_seen": 279352985, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19506836, + "step": 12947, + "time_per_iteration": 2.8440563678741455 + }, + { + "auxiliary_loss_clip": 0.013984, + "auxiliary_loss_mlp": 0.01032418, + "balance_loss_clip": 1.23947525, + "balance_loss_mlp": 1.01324916, + "epoch": 0.778475875544867, + "flos": 20823668150400.0, + "grad_norm": 2.2010231519347347, + "language_loss": 0.66081274, + "learning_rate": 4.929495979764147e-07, + "loss": 0.68512094, + "num_input_tokens_seen": 279371360, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19177246, + "step": 12948, + "time_per_iteration": 2.829538583755493 + }, + { + "auxiliary_loss_clip": 0.01395583, + "auxiliary_loss_mlp": 0.0103079, + "balance_loss_clip": 1.23671436, + "balance_loss_mlp": 1.01188302, + "epoch": 0.778535998797535, + "flos": 14363359943040.0, + "grad_norm": 1.7954011018244624, + "language_loss": 0.76123285, + "learning_rate": 4.926935870337625e-07, + "loss": 0.78549665, + "num_input_tokens_seen": 279389400, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18896484, + "step": 12949, + "time_per_iteration": 2.8459153175354004 + }, + { + "auxiliary_loss_clip": 0.0142143, + "auxiliary_loss_mlp": 0.01034495, + "balance_loss_clip": 1.25796533, + "balance_loss_mlp": 1.01505184, + "epoch": 0.7785961220502029, + "flos": 19219219130880.0, + "grad_norm": 1.433289863327755, + "language_loss": 0.69351196, + "learning_rate": 4.924376332483202e-07, + "loss": 0.71807122, + "num_input_tokens_seen": 279409715, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19458008, + "step": 12950, + "time_per_iteration": 2.9117207527160645 + }, + { + "auxiliary_loss_clip": 0.01418873, + "auxiliary_loss_mlp": 0.01032694, + "balance_loss_clip": 1.25449967, + "balance_loss_mlp": 1.01395464, + "epoch": 0.7786562453028709, + "flos": 25749666057600.0, + "grad_norm": 1.6224947516160673, + "language_loss": 0.72752893, + "learning_rate": 4.921817366297938e-07, + "loss": 0.75204456, + "num_input_tokens_seen": 279427705, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.18762207, + "step": 12951, + "time_per_iteration": 4.233428955078125 + }, + { + "auxiliary_loss_clip": 0.01402635, + "auxiliary_loss_mlp": 0.01032599, + "balance_loss_clip": 1.24416804, + "balance_loss_mlp": 1.01406157, + "epoch": 0.7787163685555388, + "flos": 25750525708800.0, + "grad_norm": 1.6032221459014897, + "language_loss": 0.66260016, + "learning_rate": 4.919258971878877e-07, + "loss": 0.68695247, + "num_input_tokens_seen": 279448215, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.1854248, + "step": 12952, + "time_per_iteration": 2.862560987472534 + }, + { + "auxiliary_loss_clip": 0.01378662, + "auxiliary_loss_mlp": 0.01033863, + "balance_loss_clip": 1.22561955, + "balance_loss_mlp": 1.01537347, + "epoch": 0.7787764918082068, + "flos": 22758061386240.0, + "grad_norm": 1.5432141301126079, + "language_loss": 0.81803787, + "learning_rate": 4.916701149323022e-07, + "loss": 0.84216309, + "num_input_tokens_seen": 279466260, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.18493652, + "step": 12953, + "time_per_iteration": 2.880923271179199 + }, + { + "auxiliary_loss_clip": 0.01407447, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.24497068, + "balance_loss_mlp": 1.01257372, + "epoch": 0.7788366150608748, + "flos": 15198120103680.0, + "grad_norm": 3.3145636123312854, + "language_loss": 0.77881086, + "learning_rate": 4.91414389872737e-07, + "loss": 0.80320239, + "num_input_tokens_seen": 279484520, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19152832, + "step": 12954, + "time_per_iteration": 2.843950033187866 + }, + { + "auxiliary_loss_clip": 0.01416348, + "auxiliary_loss_mlp": 0.01028465, + "balance_loss_clip": 1.25120449, + "balance_loss_mlp": 1.00997615, + "epoch": 0.7788967383135428, + "flos": 21218864647680.0, + "grad_norm": 1.545202496930838, + "language_loss": 0.73318964, + "learning_rate": 4.911587220188905e-07, + "loss": 0.7576378, + "num_input_tokens_seen": 279503130, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.18481445, + "step": 12955, + "time_per_iteration": 2.813735008239746 + }, + { + "auxiliary_loss_clip": 0.01404127, + "auxiliary_loss_mlp": 0.01032744, + "balance_loss_clip": 1.24253106, + "balance_loss_mlp": 1.01290751, + "epoch": 0.7789568615662107, + "flos": 21691303297920.0, + "grad_norm": 1.6548950002426, + "language_loss": 0.69277, + "learning_rate": 4.909031113804551e-07, + "loss": 0.71713865, + "num_input_tokens_seen": 279521930, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19824219, + "step": 12956, + "time_per_iteration": 2.8721396923065186 + }, + { + "auxiliary_loss_clip": 0.01403684, + "auxiliary_loss_mlp": 0.01033778, + "balance_loss_clip": 1.24297619, + "balance_loss_mlp": 1.01536012, + "epoch": 0.7790169848188787, + "flos": 26371793064960.0, + "grad_norm": 1.53842506214568, + "language_loss": 0.76611096, + "learning_rate": 4.906475579671252e-07, + "loss": 0.79048556, + "num_input_tokens_seen": 279542375, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18432617, + "step": 12957, + "time_per_iteration": 2.9441137313842773 + }, + { + "auxiliary_loss_clip": 0.01397612, + "auxiliary_loss_mlp": 0.01031026, + "balance_loss_clip": 1.23756099, + "balance_loss_mlp": 1.0126555, + "epoch": 0.7790771080715466, + "flos": 25526083662720.0, + "grad_norm": 1.8625265923912484, + "language_loss": 0.78205442, + "learning_rate": 4.903920617885917e-07, + "loss": 0.80634075, + "num_input_tokens_seen": 279561885, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18383789, + "step": 12958, + "time_per_iteration": 4.347756624221802 + }, + { + "auxiliary_loss_clip": 0.01399235, + "auxiliary_loss_mlp": 0.01032737, + "balance_loss_clip": 1.23855996, + "balance_loss_mlp": 1.0132463, + "epoch": 0.7791372313242146, + "flos": 16042743630720.0, + "grad_norm": 2.220206722293401, + "language_loss": 0.72488797, + "learning_rate": 4.901366228545418e-07, + "loss": 0.74920774, + "num_input_tokens_seen": 279579965, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19506836, + "step": 12959, + "time_per_iteration": 2.8351247310638428 + }, + { + "auxiliary_loss_clip": 0.01400103, + "auxiliary_loss_mlp": 0.01036337, + "balance_loss_clip": 1.23984671, + "balance_loss_mlp": 1.01712012, + "epoch": 0.7791973545768827, + "flos": 23852808023040.0, + "grad_norm": 1.6383521964250731, + "language_loss": 0.78242111, + "learning_rate": 4.898812411746632e-07, + "loss": 0.80678552, + "num_input_tokens_seen": 279599030, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.19213867, + "step": 12960, + "time_per_iteration": 2.8474748134613037 + }, + { + "auxiliary_loss_clip": 0.01413326, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.2514745, + "balance_loss_mlp": 1.01618087, + "epoch": 0.7792574778295506, + "flos": 24178499228160.0, + "grad_norm": 1.8550942028120934, + "language_loss": 0.76104397, + "learning_rate": 4.896259167586385e-07, + "loss": 0.78552806, + "num_input_tokens_seen": 279614400, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18920898, + "step": 12961, + "time_per_iteration": 2.834251880645752 + }, + { + "auxiliary_loss_clip": 0.0138865, + "auxiliary_loss_mlp": 0.01037461, + "balance_loss_clip": 1.23371911, + "balance_loss_mlp": 1.01882839, + "epoch": 0.7793176010822186, + "flos": 21473421747840.0, + "grad_norm": 1.5323738741869304, + "language_loss": 0.73918855, + "learning_rate": 4.893706496161511e-07, + "loss": 0.76344967, + "num_input_tokens_seen": 279633745, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.18640137, + "step": 12962, + "time_per_iteration": 5.5876686573028564 + }, + { + "auxiliary_loss_clip": 0.01390097, + "auxiliary_loss_mlp": 0.01031736, + "balance_loss_clip": 1.2317785, + "balance_loss_mlp": 1.01305556, + "epoch": 0.7793777243348865, + "flos": 20676151543680.0, + "grad_norm": 1.8368777317580933, + "language_loss": 0.70875698, + "learning_rate": 4.891154397568795e-07, + "loss": 0.73297524, + "num_input_tokens_seen": 279651165, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18676758, + "step": 12963, + "time_per_iteration": 2.8474674224853516 + }, + { + "auxiliary_loss_clip": 0.01402998, + "auxiliary_loss_mlp": 0.01034882, + "balance_loss_clip": 1.24561644, + "balance_loss_mlp": 1.01504564, + "epoch": 0.7794378475875545, + "flos": 27137183667840.0, + "grad_norm": 2.411470811887559, + "language_loss": 0.64249909, + "learning_rate": 4.888602871905019e-07, + "loss": 0.66687781, + "num_input_tokens_seen": 279671175, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.19836426, + "step": 12964, + "time_per_iteration": 2.872852325439453 + }, + { + "auxiliary_loss_clip": 0.01405611, + "auxiliary_loss_mlp": 0.01034989, + "balance_loss_clip": 1.24306488, + "balance_loss_mlp": 1.01664305, + "epoch": 0.7794979708402224, + "flos": 28085725797120.0, + "grad_norm": 1.5361110320124767, + "language_loss": 0.76802999, + "learning_rate": 4.88605191926694e-07, + "loss": 0.792436, + "num_input_tokens_seen": 279688675, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.18347168, + "step": 12965, + "time_per_iteration": 2.8604252338409424 + }, + { + "auxiliary_loss_clip": 0.01382127, + "auxiliary_loss_mlp": 0.01036251, + "balance_loss_clip": 1.22839141, + "balance_loss_mlp": 1.01716566, + "epoch": 0.7795580940928905, + "flos": 26880681041280.0, + "grad_norm": 3.1888611739312784, + "language_loss": 0.73282957, + "learning_rate": 4.883501539751289e-07, + "loss": 0.75701338, + "num_input_tokens_seen": 279710245, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.1907959, + "step": 12966, + "time_per_iteration": 2.912429094314575 + }, + { + "auxiliary_loss_clip": 0.01393035, + "auxiliary_loss_mlp": 0.0102884, + "balance_loss_clip": 1.23828745, + "balance_loss_mlp": 1.01135206, + "epoch": 0.7796182173455584, + "flos": 23844211511040.0, + "grad_norm": 1.5776844912275862, + "language_loss": 0.74684435, + "learning_rate": 4.880951733454768e-07, + "loss": 0.77106309, + "num_input_tokens_seen": 279729045, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.17492676, + "step": 12967, + "time_per_iteration": 2.849928140640259 + }, + { + "auxiliary_loss_clip": 0.0139797, + "auxiliary_loss_mlp": 0.01031061, + "balance_loss_clip": 1.23745298, + "balance_loss_mlp": 1.01257157, + "epoch": 0.7796783405982264, + "flos": 19801774920960.0, + "grad_norm": 2.2392398712681008, + "language_loss": 0.73602057, + "learning_rate": 4.878402500474073e-07, + "loss": 0.76031089, + "num_input_tokens_seen": 279748350, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18481445, + "step": 12968, + "time_per_iteration": 2.862865924835205 + }, + { + "auxiliary_loss_clip": 0.01396531, + "auxiliary_loss_mlp": 0.01033452, + "balance_loss_clip": 1.23833764, + "balance_loss_mlp": 1.01477194, + "epoch": 0.7797384638508943, + "flos": 15458559027840.0, + "grad_norm": 1.817578737414817, + "language_loss": 0.62199211, + "learning_rate": 4.875853840905874e-07, + "loss": 0.64629197, + "num_input_tokens_seen": 279765620, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18676758, + "step": 12969, + "time_per_iteration": 2.8153748512268066 + }, + { + "auxiliary_loss_clip": 0.01375265, + "auxiliary_loss_mlp": 0.01031271, + "balance_loss_clip": 1.22070956, + "balance_loss_mlp": 1.01328206, + "epoch": 0.7797985871035623, + "flos": 20932427946240.0, + "grad_norm": 2.1063751260244343, + "language_loss": 0.71360373, + "learning_rate": 4.873305754846811e-07, + "loss": 0.73766911, + "num_input_tokens_seen": 279782485, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.17993164, + "step": 12970, + "time_per_iteration": 2.8249034881591797 + }, + { + "auxiliary_loss_clip": 0.01401899, + "auxiliary_loss_mlp": 0.01035551, + "balance_loss_clip": 1.24266505, + "balance_loss_mlp": 1.01576185, + "epoch": 0.7798587103562302, + "flos": 36950332181760.0, + "grad_norm": 1.7603112424268743, + "language_loss": 0.72598779, + "learning_rate": 4.870758242393507e-07, + "loss": 0.75036234, + "num_input_tokens_seen": 279804170, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.19775391, + "step": 12971, + "time_per_iteration": 2.9629738330841064 + }, + { + "auxiliary_loss_clip": 0.01413932, + "auxiliary_loss_mlp": 0.0103439, + "balance_loss_clip": 1.24824786, + "balance_loss_mlp": 1.0152812, + "epoch": 0.7799188336088982, + "flos": 22429519758720.0, + "grad_norm": 3.8867979568180018, + "language_loss": 0.75040442, + "learning_rate": 4.868211303642578e-07, + "loss": 0.77488768, + "num_input_tokens_seen": 279823730, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.19091797, + "step": 12972, + "time_per_iteration": 2.86275053024292 + }, + { + "auxiliary_loss_clip": 0.01405619, + "auxiliary_loss_mlp": 0.01030576, + "balance_loss_clip": 1.24479675, + "balance_loss_mlp": 1.01187181, + "epoch": 0.7799789568615663, + "flos": 18889863096960.0, + "grad_norm": 1.751668256941028, + "language_loss": 0.72695756, + "learning_rate": 4.865664938690584e-07, + "loss": 0.75131947, + "num_input_tokens_seen": 279843035, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.18713379, + "step": 12973, + "time_per_iteration": 2.8891513347625732 + }, + { + "auxiliary_loss_clip": 0.01395706, + "auxiliary_loss_mlp": 0.01031, + "balance_loss_clip": 1.23709476, + "balance_loss_mlp": 1.0125109, + "epoch": 0.7800390801142342, + "flos": 20270820211200.0, + "grad_norm": 1.7938447069518013, + "language_loss": 0.78306472, + "learning_rate": 4.863119147634089e-07, + "loss": 0.80733186, + "num_input_tokens_seen": 279861450, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18493652, + "step": 12974, + "time_per_iteration": 2.8275368213653564 + }, + { + "auxiliary_loss_clip": 0.01401878, + "auxiliary_loss_mlp": 0.01031727, + "balance_loss_clip": 1.24311996, + "balance_loss_mlp": 1.01277232, + "epoch": 0.7800992033669022, + "flos": 16698876744960.0, + "grad_norm": 1.6201087187802843, + "language_loss": 0.69994426, + "learning_rate": 4.86057393056964e-07, + "loss": 0.72428024, + "num_input_tokens_seen": 279878660, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18969727, + "step": 12975, + "time_per_iteration": 2.824751615524292 + }, + { + "auxiliary_loss_clip": 0.01395474, + "auxiliary_loss_mlp": 0.01031671, + "balance_loss_clip": 1.23736954, + "balance_loss_mlp": 1.01392031, + "epoch": 0.7801593266195701, + "flos": 18593653518720.0, + "grad_norm": 1.8371547915935698, + "language_loss": 0.82985628, + "learning_rate": 4.858029287593739e-07, + "loss": 0.85412776, + "num_input_tokens_seen": 279895685, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.1776123, + "step": 12976, + "time_per_iteration": 2.843679666519165 + }, + { + "auxiliary_loss_clip": 0.01403505, + "auxiliary_loss_mlp": 0.01032642, + "balance_loss_clip": 1.24148643, + "balance_loss_mlp": 1.01362848, + "epoch": 0.7802194498722381, + "flos": 25496194832640.0, + "grad_norm": 1.4074638019666617, + "language_loss": 0.66324306, + "learning_rate": 4.85548521880289e-07, + "loss": 0.68760455, + "num_input_tokens_seen": 279917240, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19030762, + "step": 12977, + "time_per_iteration": 2.944409132003784 + }, + { + "auxiliary_loss_clip": 0.01398068, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.23974121, + "balance_loss_mlp": 1.01414573, + "epoch": 0.780279573124906, + "flos": 31188940686720.0, + "grad_norm": 1.3753715698584794, + "language_loss": 0.75424886, + "learning_rate": 4.852941724293554e-07, + "loss": 0.77855694, + "num_input_tokens_seen": 279938665, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18615723, + "step": 12978, + "time_per_iteration": 2.9356861114501953 + }, + { + "auxiliary_loss_clip": 0.01414495, + "auxiliary_loss_mlp": 0.01035478, + "balance_loss_clip": 1.24973762, + "balance_loss_mlp": 1.0162735, + "epoch": 0.780339696377574, + "flos": 26955570464640.0, + "grad_norm": 4.014597087676522, + "language_loss": 0.61995471, + "learning_rate": 4.85039880416219e-07, + "loss": 0.64445448, + "num_input_tokens_seen": 279957965, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.1920166, + "step": 12979, + "time_per_iteration": 2.893747568130493 + }, + { + "auxiliary_loss_clip": 0.01396486, + "auxiliary_loss_mlp": 0.01030023, + "balance_loss_clip": 1.23687923, + "balance_loss_mlp": 1.01148605, + "epoch": 0.780399819630242, + "flos": 27967102634880.0, + "grad_norm": 2.381779203198935, + "language_loss": 0.78247046, + "learning_rate": 4.847856458505217e-07, + "loss": 0.80673552, + "num_input_tokens_seen": 279977490, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.1854248, + "step": 12980, + "time_per_iteration": 2.917649507522583 + }, + { + "auxiliary_loss_clip": 0.01410458, + "auxiliary_loss_mlp": 0.01037768, + "balance_loss_clip": 1.24794722, + "balance_loss_mlp": 1.01786029, + "epoch": 0.78045994288291, + "flos": 22495631690880.0, + "grad_norm": 1.9981213780639615, + "language_loss": 0.77858138, + "learning_rate": 4.845314687419046e-07, + "loss": 0.80306363, + "num_input_tokens_seen": 279994220, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19897461, + "step": 12981, + "time_per_iteration": 2.9370310306549072 + }, + { + "auxiliary_loss_clip": 0.01394291, + "auxiliary_loss_mlp": 0.01033498, + "balance_loss_clip": 1.23474479, + "balance_loss_mlp": 1.01434159, + "epoch": 0.7805200661355779, + "flos": 20860796148480.0, + "grad_norm": 1.9711577020420064, + "language_loss": 0.74030173, + "learning_rate": 4.842773491000067e-07, + "loss": 0.76457965, + "num_input_tokens_seen": 280012590, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.19152832, + "step": 12982, + "time_per_iteration": 2.910907745361328 + }, + { + "auxiliary_loss_clip": 0.01400368, + "auxiliary_loss_mlp": 0.01031858, + "balance_loss_clip": 1.24050689, + "balance_loss_mlp": 1.01414323, + "epoch": 0.7805801893882459, + "flos": 25676903139840.0, + "grad_norm": 1.9527304102802039, + "language_loss": 0.73905575, + "learning_rate": 4.840232869344636e-07, + "loss": 0.76337808, + "num_input_tokens_seen": 280033700, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.17724609, + "step": 12983, + "time_per_iteration": 2.8859875202178955 + }, + { + "auxiliary_loss_clip": 0.01405111, + "auxiliary_loss_mlp": 0.01030685, + "balance_loss_clip": 1.2457881, + "balance_loss_mlp": 1.01211262, + "epoch": 0.7806403126409138, + "flos": 11335215456000.0, + "grad_norm": 1.9913788675929787, + "language_loss": 0.75110692, + "learning_rate": 4.837692822549086e-07, + "loss": 0.77546477, + "num_input_tokens_seen": 280052215, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18566895, + "step": 12984, + "time_per_iteration": 2.8306057453155518 + }, + { + "auxiliary_loss_clip": 0.01398076, + "auxiliary_loss_mlp": 0.01031365, + "balance_loss_clip": 1.23818421, + "balance_loss_mlp": 1.01323295, + "epoch": 0.7807004358935818, + "flos": 19582852740480.0, + "grad_norm": 1.8753534586343803, + "language_loss": 0.82323694, + "learning_rate": 4.835153350709746e-07, + "loss": 0.84753144, + "num_input_tokens_seen": 280070525, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18139648, + "step": 12985, + "time_per_iteration": 2.8380794525146484 + }, + { + "auxiliary_loss_clip": 0.01394676, + "auxiliary_loss_mlp": 0.01034839, + "balance_loss_clip": 1.23551536, + "balance_loss_mlp": 1.01547968, + "epoch": 0.7807605591462499, + "flos": 19145279848320.0, + "grad_norm": 2.6271716510012415, + "language_loss": 0.77886826, + "learning_rate": 4.832614453922915e-07, + "loss": 0.80316341, + "num_input_tokens_seen": 280089855, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.19372559, + "step": 12986, + "time_per_iteration": 4.284211158752441 + }, + { + "auxiliary_loss_clip": 0.0139503, + "auxiliary_loss_mlp": 0.0102947, + "balance_loss_clip": 1.23478603, + "balance_loss_mlp": 1.01075411, + "epoch": 0.7808206823989178, + "flos": 32386112847360.0, + "grad_norm": 1.5963310433853592, + "language_loss": 0.75058842, + "learning_rate": 4.830076132284859e-07, + "loss": 0.77483344, + "num_input_tokens_seen": 280109960, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18725586, + "step": 12987, + "time_per_iteration": 2.9402637481689453 + }, + { + "auxiliary_loss_clip": 0.0118706, + "auxiliary_loss_mlp": 0.01031483, + "balance_loss_clip": 1.09814739, + "balance_loss_mlp": 1.01059747, + "epoch": 0.7808808056515858, + "flos": 55081699868160.0, + "grad_norm": 0.7283969219330716, + "language_loss": 0.55091417, + "learning_rate": 4.82753838589184e-07, + "loss": 0.57309961, + "num_input_tokens_seen": 280169805, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.20898438, + "step": 12988, + "time_per_iteration": 3.362290859222412 + }, + { + "auxiliary_loss_clip": 0.01383568, + "auxiliary_loss_mlp": 0.0103204, + "balance_loss_clip": 1.22782731, + "balance_loss_mlp": 1.01432562, + "epoch": 0.7809409289042537, + "flos": 12867082536960.0, + "grad_norm": 3.1877027573436476, + "language_loss": 0.81424177, + "learning_rate": 4.82500121484009e-07, + "loss": 0.83839786, + "num_input_tokens_seen": 280184630, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.17700195, + "step": 12989, + "time_per_iteration": 2.793565034866333 + }, + { + "auxiliary_loss_clip": 0.01385369, + "auxiliary_loss_mlp": 0.01031694, + "balance_loss_clip": 1.22976828, + "balance_loss_mlp": 1.01260829, + "epoch": 0.7810010521569217, + "flos": 21696687429120.0, + "grad_norm": 1.4934553134017414, + "language_loss": 0.70886892, + "learning_rate": 4.822464619225806e-07, + "loss": 0.73303962, + "num_input_tokens_seen": 280203880, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.1907959, + "step": 12990, + "time_per_iteration": 2.9377901554107666 + }, + { + "auxiliary_loss_clip": 0.01398851, + "auxiliary_loss_mlp": 0.0103376, + "balance_loss_clip": 1.23883104, + "balance_loss_mlp": 1.01356614, + "epoch": 0.7810611754095896, + "flos": 16764038536320.0, + "grad_norm": 2.972776726195489, + "language_loss": 0.7879836, + "learning_rate": 4.819928599145184e-07, + "loss": 0.81230968, + "num_input_tokens_seen": 280220460, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.20202637, + "step": 12991, + "time_per_iteration": 2.8548505306243896 + }, + { + "auxiliary_loss_clip": 0.01399475, + "auxiliary_loss_mlp": 0.01037227, + "balance_loss_clip": 1.2380898, + "balance_loss_mlp": 1.01783168, + "epoch": 0.7811212986622577, + "flos": 43523019768960.0, + "grad_norm": 1.7988634875313585, + "language_loss": 0.66935432, + "learning_rate": 4.817393154694398e-07, + "loss": 0.69372129, + "num_input_tokens_seen": 280242680, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19396973, + "step": 12992, + "time_per_iteration": 3.0471010208129883 + }, + { + "auxiliary_loss_clip": 0.01405538, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.24336326, + "balance_loss_mlp": 1.01383686, + "epoch": 0.7811814219149256, + "flos": 21766509434880.0, + "grad_norm": 1.79085967750718, + "language_loss": 0.61930048, + "learning_rate": 4.814858285969578e-07, + "loss": 0.64367706, + "num_input_tokens_seen": 280260655, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.18273926, + "step": 12993, + "time_per_iteration": 4.325740098953247 + }, + { + "auxiliary_loss_clip": 0.01392028, + "auxiliary_loss_mlp": 0.01030398, + "balance_loss_clip": 1.23493564, + "balance_loss_mlp": 1.01200414, + "epoch": 0.7812415451675936, + "flos": 24072273141120.0, + "grad_norm": 1.5457258450376856, + "language_loss": 0.69388568, + "learning_rate": 4.812323993066862e-07, + "loss": 0.71810997, + "num_input_tokens_seen": 280281185, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18395996, + "step": 12994, + "time_per_iteration": 2.8494036197662354 + }, + { + "auxiliary_loss_clip": 0.01395785, + "auxiliary_loss_mlp": 0.01033381, + "balance_loss_clip": 1.23669672, + "balance_loss_mlp": 1.01480854, + "epoch": 0.7813016684202615, + "flos": 18999075340800.0, + "grad_norm": 2.1098154628228367, + "language_loss": 0.70483458, + "learning_rate": 4.809790276082335e-07, + "loss": 0.72912621, + "num_input_tokens_seen": 280298255, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18579102, + "step": 12995, + "time_per_iteration": 2.8093745708465576 + }, + { + "auxiliary_loss_clip": 0.01371715, + "auxiliary_loss_mlp": 0.0103059, + "balance_loss_clip": 1.21897769, + "balance_loss_mlp": 1.01304245, + "epoch": 0.7813617916729295, + "flos": 25270621666560.0, + "grad_norm": 1.6194757677352019, + "language_loss": 0.75604224, + "learning_rate": 4.807257135112088e-07, + "loss": 0.7800653, + "num_input_tokens_seen": 280319000, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.17565918, + "step": 12996, + "time_per_iteration": 2.8690731525421143 + }, + { + "auxiliary_loss_clip": 0.01416956, + "auxiliary_loss_mlp": 0.01030072, + "balance_loss_clip": 1.25253415, + "balance_loss_mlp": 1.01053321, + "epoch": 0.7814219149255974, + "flos": 17974603157760.0, + "grad_norm": 2.8501421898219523, + "language_loss": 0.69705385, + "learning_rate": 4.804724570252167e-07, + "loss": 0.72152418, + "num_input_tokens_seen": 280336375, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.1953125, + "step": 12997, + "time_per_iteration": 5.647717714309692 + }, + { + "auxiliary_loss_clip": 0.01411277, + "auxiliary_loss_mlp": 0.01033515, + "balance_loss_clip": 1.24642229, + "balance_loss_mlp": 1.01444173, + "epoch": 0.7814820381782654, + "flos": 25787336993280.0, + "grad_norm": 1.7618086406920521, + "language_loss": 0.83031404, + "learning_rate": 4.802192581598614e-07, + "loss": 0.85476196, + "num_input_tokens_seen": 280358760, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.19067383, + "step": 12998, + "time_per_iteration": 2.881556749343872 + }, + { + "auxiliary_loss_clip": 0.01404129, + "auxiliary_loss_mlp": 0.01032906, + "balance_loss_clip": 1.24162447, + "balance_loss_mlp": 1.01312971, + "epoch": 0.7815421614309335, + "flos": 20528680181760.0, + "grad_norm": 2.1304975275250118, + "language_loss": 0.75691235, + "learning_rate": 4.799661169247453e-07, + "loss": 0.78128266, + "num_input_tokens_seen": 280377085, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19775391, + "step": 12999, + "time_per_iteration": 2.8413801193237305 + }, + { + "auxiliary_loss_clip": 0.01408344, + "auxiliary_loss_mlp": 0.01036262, + "balance_loss_clip": 1.24572933, + "balance_loss_mlp": 1.01704502, + "epoch": 0.7816022846836014, + "flos": 21297464144640.0, + "grad_norm": 1.4709608665299407, + "language_loss": 0.85027063, + "learning_rate": 4.797130333294652e-07, + "loss": 0.8747167, + "num_input_tokens_seen": 280395465, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.1920166, + "step": 13000, + "time_per_iteration": 2.851088762283325 + }, + { + "auxiliary_loss_clip": 0.01407115, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.24545383, + "balance_loss_mlp": 1.01213837, + "epoch": 0.7816624079362694, + "flos": 19217771297280.0, + "grad_norm": 2.10151247921773, + "language_loss": 0.66874242, + "learning_rate": 4.794600073836192e-07, + "loss": 0.69313258, + "num_input_tokens_seen": 280412775, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19775391, + "step": 13001, + "time_per_iteration": 2.785374641418457 + }, + { + "auxiliary_loss_clip": 0.01409438, + "auxiliary_loss_mlp": 0.01033839, + "balance_loss_clip": 1.24666584, + "balance_loss_mlp": 1.01562381, + "epoch": 0.7817225311889373, + "flos": 26115969110400.0, + "grad_norm": 1.521587902980314, + "language_loss": 0.6775111, + "learning_rate": 4.792070390968027e-07, + "loss": 0.70194387, + "num_input_tokens_seen": 280432905, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18225098, + "step": 13002, + "time_per_iteration": 2.8679866790771484 + }, + { + "auxiliary_loss_clip": 0.01408384, + "auxiliary_loss_mlp": 0.01038007, + "balance_loss_clip": 1.24580884, + "balance_loss_mlp": 1.01842046, + "epoch": 0.7817826544416053, + "flos": 21260652860160.0, + "grad_norm": 2.148328675645518, + "language_loss": 0.74268848, + "learning_rate": 4.78954128478607e-07, + "loss": 0.76715243, + "num_input_tokens_seen": 280450785, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19555664, + "step": 13003, + "time_per_iteration": 2.8665497303009033 + }, + { + "auxiliary_loss_clip": 0.01389252, + "auxiliary_loss_mlp": 0.010376, + "balance_loss_clip": 1.23072743, + "balance_loss_mlp": 1.01803756, + "epoch": 0.7818427776942732, + "flos": 19940197322880.0, + "grad_norm": 3.5561849624486754, + "language_loss": 0.62639415, + "learning_rate": 4.787012755386233e-07, + "loss": 0.65066266, + "num_input_tokens_seen": 280468400, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.19555664, + "step": 13004, + "time_per_iteration": 2.8020849227905273 + }, + { + "auxiliary_loss_clip": 0.01375966, + "auxiliary_loss_mlp": 0.0103094, + "balance_loss_clip": 1.22148705, + "balance_loss_mlp": 1.01290345, + "epoch": 0.7819029009469413, + "flos": 11371031354880.0, + "grad_norm": 4.260272313095162, + "language_loss": 0.83851421, + "learning_rate": 4.784484802864403e-07, + "loss": 0.86258328, + "num_input_tokens_seen": 280483930, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.18041992, + "step": 13005, + "time_per_iteration": 2.803419828414917 + }, + { + "auxiliary_loss_clip": 0.01397976, + "auxiliary_loss_mlp": 0.01035463, + "balance_loss_clip": 1.237921, + "balance_loss_mlp": 1.01691413, + "epoch": 0.7819630241996092, + "flos": 24289792732800.0, + "grad_norm": 2.7749163928906517, + "language_loss": 0.73403215, + "learning_rate": 4.781957427316432e-07, + "loss": 0.75836658, + "num_input_tokens_seen": 280503465, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.18566895, + "step": 13006, + "time_per_iteration": 2.866915225982666 + }, + { + "auxiliary_loss_clip": 0.01404715, + "auxiliary_loss_mlp": 0.01032222, + "balance_loss_clip": 1.24203992, + "balance_loss_mlp": 1.01379168, + "epoch": 0.7820231474522772, + "flos": 22718625903360.0, + "grad_norm": 1.558104661022574, + "language_loss": 0.72805524, + "learning_rate": 4.779430628838157e-07, + "loss": 0.7524246, + "num_input_tokens_seen": 280523375, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.18432617, + "step": 13007, + "time_per_iteration": 2.8378140926361084 + }, + { + "auxiliary_loss_clip": 0.01401215, + "auxiliary_loss_mlp": 0.01030949, + "balance_loss_clip": 1.23960781, + "balance_loss_mlp": 1.01192307, + "epoch": 0.7820832707049451, + "flos": 20056965448320.0, + "grad_norm": 2.170121632712835, + "language_loss": 0.70121992, + "learning_rate": 4.776904407525397e-07, + "loss": 0.72554159, + "num_input_tokens_seen": 280542920, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19018555, + "step": 13008, + "time_per_iteration": 2.8372271060943604 + }, + { + "auxiliary_loss_clip": 0.01411852, + "auxiliary_loss_mlp": 0.01032787, + "balance_loss_clip": 1.25099647, + "balance_loss_mlp": 1.0137012, + "epoch": 0.7821433939576131, + "flos": 27174356910720.0, + "grad_norm": 3.7471443092366563, + "language_loss": 0.7061227, + "learning_rate": 4.774378763473954e-07, + "loss": 0.73056906, + "num_input_tokens_seen": 280561700, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19091797, + "step": 13009, + "time_per_iteration": 2.885308265686035 + }, + { + "auxiliary_loss_clip": 0.01392274, + "auxiliary_loss_mlp": 0.01034348, + "balance_loss_clip": 1.23347044, + "balance_loss_mlp": 1.01469088, + "epoch": 0.782203517210281, + "flos": 22612445061120.0, + "grad_norm": 1.77202396738767, + "language_loss": 0.82575548, + "learning_rate": 4.771853696779586e-07, + "loss": 0.85002172, + "num_input_tokens_seen": 280580605, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.1965332, + "step": 13010, + "time_per_iteration": 2.8283424377441406 + }, + { + "auxiliary_loss_clip": 0.01397207, + "auxiliary_loss_mlp": 0.01032522, + "balance_loss_clip": 1.23939121, + "balance_loss_mlp": 1.01383042, + "epoch": 0.782263640462949, + "flos": 29071803127680.0, + "grad_norm": 1.4449851263979883, + "language_loss": 0.63146597, + "learning_rate": 4.76932920753806e-07, + "loss": 0.65576327, + "num_input_tokens_seen": 280601495, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18676758, + "step": 13011, + "time_per_iteration": 2.9336981773376465 + }, + { + "auxiliary_loss_clip": 0.01403595, + "auxiliary_loss_mlp": 0.01029083, + "balance_loss_clip": 1.2452898, + "balance_loss_mlp": 1.01138091, + "epoch": 0.782323763715617, + "flos": 25309921415040.0, + "grad_norm": 1.6994300097498223, + "language_loss": 0.70273566, + "learning_rate": 4.7668052958450913e-07, + "loss": 0.7270624, + "num_input_tokens_seen": 280622760, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.17712402, + "step": 13012, + "time_per_iteration": 2.910227060317993 + }, + { + "auxiliary_loss_clip": 0.01188184, + "auxiliary_loss_mlp": 0.01025388, + "balance_loss_clip": 1.09841585, + "balance_loss_mlp": 1.00784063, + "epoch": 0.782383886968285, + "flos": 65228774140800.0, + "grad_norm": 0.7057369109183197, + "language_loss": 0.55032551, + "learning_rate": 4.764281961796395e-07, + "loss": 0.57246125, + "num_input_tokens_seen": 280687115, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.17578125, + "step": 13013, + "time_per_iteration": 3.445112943649292 + }, + { + "auxiliary_loss_clip": 0.01413286, + "auxiliary_loss_mlp": 0.01032869, + "balance_loss_clip": 1.25016546, + "balance_loss_mlp": 1.01416552, + "epoch": 0.782444010220953, + "flos": 18414528779520.0, + "grad_norm": 1.9350609671886692, + "language_loss": 0.66058284, + "learning_rate": 4.76175920548765e-07, + "loss": 0.68504441, + "num_input_tokens_seen": 280705000, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.18688965, + "step": 13014, + "time_per_iteration": 2.8648433685302734 + }, + { + "auxiliary_loss_clip": 0.01187829, + "auxiliary_loss_mlp": 0.01026315, + "balance_loss_clip": 1.09857166, + "balance_loss_mlp": 1.00810003, + "epoch": 0.7825041334736209, + "flos": 63989361319680.0, + "grad_norm": 0.7216210942972047, + "language_loss": 0.5847705, + "learning_rate": 4.759237027014524e-07, + "loss": 0.60691196, + "num_input_tokens_seen": 280773525, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.18261719, + "step": 13015, + "time_per_iteration": 3.3883609771728516 + }, + { + "auxiliary_loss_clip": 0.01394898, + "auxiliary_loss_mlp": 0.01035185, + "balance_loss_clip": 1.23588991, + "balance_loss_mlp": 1.01632619, + "epoch": 0.7825642567262889, + "flos": 20349012504960.0, + "grad_norm": 1.665599260359687, + "language_loss": 0.75652218, + "learning_rate": 4.756715426472666e-07, + "loss": 0.78082299, + "num_input_tokens_seen": 280791915, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18847656, + "step": 13016, + "time_per_iteration": 2.83048415184021 + }, + { + "auxiliary_loss_clip": 0.01401928, + "auxiliary_loss_mlp": 0.01032987, + "balance_loss_clip": 1.24022222, + "balance_loss_mlp": 1.01321018, + "epoch": 0.7826243799789568, + "flos": 20271679862400.0, + "grad_norm": 1.89620983116403, + "language_loss": 0.75568002, + "learning_rate": 4.7541944039576766e-07, + "loss": 0.78002918, + "num_input_tokens_seen": 280811460, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19775391, + "step": 13017, + "time_per_iteration": 2.8532748222351074 + }, + { + "auxiliary_loss_clip": 0.01404532, + "auxiliary_loss_mlp": 0.01035188, + "balance_loss_clip": 1.2421999, + "balance_loss_mlp": 1.01539922, + "epoch": 0.7826845032316249, + "flos": 21140400885120.0, + "grad_norm": 1.9854552585107719, + "language_loss": 0.76432729, + "learning_rate": 4.7516739595651636e-07, + "loss": 0.78872454, + "num_input_tokens_seen": 280825415, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19763184, + "step": 13018, + "time_per_iteration": 2.8410983085632324 + }, + { + "auxiliary_loss_clip": 0.01401185, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.24076712, + "balance_loss_mlp": 1.01308477, + "epoch": 0.7827446264842928, + "flos": 22502508900480.0, + "grad_norm": 1.7139443538740933, + "language_loss": 0.77774203, + "learning_rate": 4.749154093390708e-07, + "loss": 0.80207515, + "num_input_tokens_seen": 280845335, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.19030762, + "step": 13019, + "time_per_iteration": 2.9361085891723633 + }, + { + "auxiliary_loss_clip": 0.01391661, + "auxiliary_loss_mlp": 0.01031478, + "balance_loss_clip": 1.23201227, + "balance_loss_mlp": 1.01201153, + "epoch": 0.7828047497369608, + "flos": 28852518988800.0, + "grad_norm": 1.4017092587619229, + "language_loss": 0.6789971, + "learning_rate": 4.746634805529852e-07, + "loss": 0.70322847, + "num_input_tokens_seen": 280867145, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19482422, + "step": 13020, + "time_per_iteration": 2.9403278827667236 + }, + { + "auxiliary_loss_clip": 0.0141425, + "auxiliary_loss_mlp": 0.01035069, + "balance_loss_clip": 1.25452352, + "balance_loss_mlp": 1.01615095, + "epoch": 0.7828648729896287, + "flos": 23267944748160.0, + "grad_norm": 1.9142440350307939, + "language_loss": 0.6351428, + "learning_rate": 4.7441160960781325e-07, + "loss": 0.6596359, + "num_input_tokens_seen": 280886185, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18945312, + "step": 13021, + "time_per_iteration": 4.308583736419678 + }, + { + "auxiliary_loss_clip": 0.01402818, + "auxiliary_loss_mlp": 0.01037934, + "balance_loss_clip": 1.24253356, + "balance_loss_mlp": 1.01833558, + "epoch": 0.7829249962422967, + "flos": 25276774959360.0, + "grad_norm": 1.8221266527442146, + "language_loss": 0.69956446, + "learning_rate": 4.7415979651310636e-07, + "loss": 0.72397202, + "num_input_tokens_seen": 280907665, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19580078, + "step": 13022, + "time_per_iteration": 2.8561339378356934 + }, + { + "auxiliary_loss_clip": 0.01189093, + "auxiliary_loss_mlp": 0.010325, + "balance_loss_clip": 1.10087299, + "balance_loss_mlp": 1.01266384, + "epoch": 0.7829851194949646, + "flos": 70753145535360.0, + "grad_norm": 0.9201245487460189, + "language_loss": 0.56132513, + "learning_rate": 4.739080412784131e-07, + "loss": 0.5835411, + "num_input_tokens_seen": 280971405, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.19824219, + "step": 13023, + "time_per_iteration": 3.4867708683013916 + }, + { + "auxiliary_loss_clip": 0.01382799, + "auxiliary_loss_mlp": 0.01032353, + "balance_loss_clip": 1.22768664, + "balance_loss_mlp": 1.0148654, + "epoch": 0.7830452427476327, + "flos": 25670387888640.0, + "grad_norm": 1.6293329751410373, + "language_loss": 0.67138052, + "learning_rate": 4.736563439132792e-07, + "loss": 0.69553208, + "num_input_tokens_seen": 280989615, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.17504883, + "step": 13024, + "time_per_iteration": 2.8926727771759033 + }, + { + "auxiliary_loss_clip": 0.01412354, + "auxiliary_loss_mlp": 0.01032662, + "balance_loss_clip": 1.24921727, + "balance_loss_mlp": 1.01301646, + "epoch": 0.7831053660003006, + "flos": 22794691691520.0, + "grad_norm": 1.7695032788532197, + "language_loss": 0.77935326, + "learning_rate": 4.734047044272498e-07, + "loss": 0.80380332, + "num_input_tokens_seen": 281009450, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19628906, + "step": 13025, + "time_per_iteration": 2.899077892303467 + }, + { + "auxiliary_loss_clip": 0.01394259, + "auxiliary_loss_mlp": 0.01031426, + "balance_loss_clip": 1.23616874, + "balance_loss_mlp": 1.01297259, + "epoch": 0.7831654892529686, + "flos": 25823333871360.0, + "grad_norm": 1.7283572912796994, + "language_loss": 0.79432505, + "learning_rate": 4.731531228298673e-07, + "loss": 0.81858194, + "num_input_tokens_seen": 281028120, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18457031, + "step": 13026, + "time_per_iteration": 2.890242576599121 + }, + { + "auxiliary_loss_clip": 0.01397609, + "auxiliary_loss_mlp": 0.01029669, + "balance_loss_clip": 1.23795974, + "balance_loss_mlp": 1.01126337, + "epoch": 0.7832256125056366, + "flos": 20779843921920.0, + "grad_norm": 1.9161527871519994, + "language_loss": 0.76276159, + "learning_rate": 4.729015991306715e-07, + "loss": 0.78703439, + "num_input_tokens_seen": 281042130, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18408203, + "step": 13027, + "time_per_iteration": 2.829946279525757 + }, + { + "auxiliary_loss_clip": 0.01383651, + "auxiliary_loss_mlp": 0.01031072, + "balance_loss_clip": 1.22722173, + "balance_loss_mlp": 1.01246309, + "epoch": 0.7832857357583045, + "flos": 21516386325120.0, + "grad_norm": 1.775939269878815, + "language_loss": 0.71456814, + "learning_rate": 4.726501333391997e-07, + "loss": 0.73871529, + "num_input_tokens_seen": 281060945, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.1862793, + "step": 13028, + "time_per_iteration": 4.281008720397949 + }, + { + "auxiliary_loss_clip": 0.0140888, + "auxiliary_loss_mlp": 0.01034941, + "balance_loss_clip": 1.24558187, + "balance_loss_mlp": 1.01531923, + "epoch": 0.7833458590109725, + "flos": 18086982537600.0, + "grad_norm": 2.3574613039435888, + "language_loss": 0.69524658, + "learning_rate": 4.7239872546498774e-07, + "loss": 0.71968472, + "num_input_tokens_seen": 281079270, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19604492, + "step": 13029, + "time_per_iteration": 2.8206748962402344 + }, + { + "auxiliary_loss_clip": 0.01408725, + "auxiliary_loss_mlp": 0.01034409, + "balance_loss_clip": 1.24500763, + "balance_loss_mlp": 1.01498985, + "epoch": 0.7834059822636404, + "flos": 28299263846400.0, + "grad_norm": 1.7009069705774704, + "language_loss": 0.81088704, + "learning_rate": 4.721473755175698e-07, + "loss": 0.83531833, + "num_input_tokens_seen": 281099500, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.1940918, + "step": 13030, + "time_per_iteration": 2.898679733276367 + }, + { + "auxiliary_loss_clip": 0.01414065, + "auxiliary_loss_mlp": 0.01029343, + "balance_loss_clip": 1.2493906, + "balance_loss_mlp": 1.00970936, + "epoch": 0.7834661055163085, + "flos": 31698733559040.0, + "grad_norm": 1.6134550990267615, + "language_loss": 0.71673876, + "learning_rate": 4.71896083506476e-07, + "loss": 0.74117279, + "num_input_tokens_seen": 281121250, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19641113, + "step": 13031, + "time_per_iteration": 2.920527696609497 + }, + { + "auxiliary_loss_clip": 0.01407041, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.24472368, + "balance_loss_mlp": 1.01493382, + "epoch": 0.7835262287689764, + "flos": 12941021819520.0, + "grad_norm": 2.25309099742245, + "language_loss": 0.79505622, + "learning_rate": 4.7164484944123574e-07, + "loss": 0.81946015, + "num_input_tokens_seen": 281138760, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18432617, + "step": 13032, + "time_per_iteration": 4.302996873855591 + }, + { + "auxiliary_loss_clip": 0.0141984, + "auxiliary_loss_mlp": 0.01036373, + "balance_loss_clip": 1.2566843, + "balance_loss_mlp": 1.01638126, + "epoch": 0.7835863520216444, + "flos": 16151684405760.0, + "grad_norm": 4.473564124739625, + "language_loss": 0.63702285, + "learning_rate": 4.7139367333137726e-07, + "loss": 0.66158497, + "num_input_tokens_seen": 281157420, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19995117, + "step": 13033, + "time_per_iteration": 2.8294851779937744 + }, + { + "auxiliary_loss_clip": 0.01402582, + "auxiliary_loss_mlp": 0.01033552, + "balance_loss_clip": 1.24131632, + "balance_loss_mlp": 1.012321, + "epoch": 0.7836464752743123, + "flos": 11517281107200.0, + "grad_norm": 1.5936208561337228, + "language_loss": 0.7254467, + "learning_rate": 4.7114255518642255e-07, + "loss": 0.74980807, + "num_input_tokens_seen": 281174620, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.21240234, + "step": 13034, + "time_per_iteration": 2.831049919128418 + }, + { + "auxiliary_loss_clip": 0.01411032, + "auxiliary_loss_mlp": 0.01034149, + "balance_loss_clip": 1.24850523, + "balance_loss_mlp": 1.01445532, + "epoch": 0.7837065985269803, + "flos": 18232915576320.0, + "grad_norm": 1.7518187347945404, + "language_loss": 0.72614825, + "learning_rate": 4.7089149501589555e-07, + "loss": 0.75059998, + "num_input_tokens_seen": 281193865, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19702148, + "step": 13035, + "time_per_iteration": 2.969496250152588 + }, + { + "auxiliary_loss_clip": 0.01409551, + "auxiliary_loss_mlp": 0.01034858, + "balance_loss_clip": 1.24790406, + "balance_loss_mlp": 1.01510501, + "epoch": 0.7837667217796482, + "flos": 24765262784640.0, + "grad_norm": 2.8138327502982383, + "language_loss": 0.66971612, + "learning_rate": 4.7064049282931664e-07, + "loss": 0.69416022, + "num_input_tokens_seen": 281212250, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19750977, + "step": 13036, + "time_per_iteration": 2.8700625896453857 + }, + { + "auxiliary_loss_clip": 0.01428801, + "auxiliary_loss_mlp": 0.01031779, + "balance_loss_clip": 1.26164758, + "balance_loss_mlp": 1.01214504, + "epoch": 0.7838268450323163, + "flos": 22393432391040.0, + "grad_norm": 2.412661510933924, + "language_loss": 0.73452461, + "learning_rate": 4.703895486362031e-07, + "loss": 0.75913048, + "num_input_tokens_seen": 281230850, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.19628906, + "step": 13037, + "time_per_iteration": 2.8770968914031982 + }, + { + "auxiliary_loss_clip": 0.01403867, + "auxiliary_loss_mlp": 0.01034089, + "balance_loss_clip": 1.24175763, + "balance_loss_mlp": 1.01527798, + "epoch": 0.7838869682849842, + "flos": 19509727864320.0, + "grad_norm": 2.0718239100503633, + "language_loss": 0.61009365, + "learning_rate": 4.701386624460717e-07, + "loss": 0.6344732, + "num_input_tokens_seen": 281249810, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18823242, + "step": 13038, + "time_per_iteration": 2.875861406326294 + }, + { + "auxiliary_loss_clip": 0.01398795, + "auxiliary_loss_mlp": 0.01030304, + "balance_loss_clip": 1.23929453, + "balance_loss_mlp": 1.01205325, + "epoch": 0.7839470915376522, + "flos": 32906357268480.0, + "grad_norm": 2.8675947189716857, + "language_loss": 0.69062352, + "learning_rate": 4.698878342684349e-07, + "loss": 0.7149145, + "num_input_tokens_seen": 281273730, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18273926, + "step": 13039, + "time_per_iteration": 2.969456911087036 + }, + { + "auxiliary_loss_clip": 0.01386743, + "auxiliary_loss_mlp": 0.01032391, + "balance_loss_clip": 1.22880268, + "balance_loss_mlp": 1.01515365, + "epoch": 0.7840072147903202, + "flos": 29687098170240.0, + "grad_norm": 2.0604243724699125, + "language_loss": 0.69867575, + "learning_rate": 4.6963706411280537e-07, + "loss": 0.72286713, + "num_input_tokens_seen": 281293670, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.17236328, + "step": 13040, + "time_per_iteration": 2.975078821182251 + }, + { + "auxiliary_loss_clip": 0.01403668, + "auxiliary_loss_mlp": 0.01033121, + "balance_loss_clip": 1.24003434, + "balance_loss_mlp": 1.01355839, + "epoch": 0.7840673380429881, + "flos": 18195832823040.0, + "grad_norm": 1.5399012042993476, + "language_loss": 0.68511176, + "learning_rate": 4.6938635198869116e-07, + "loss": 0.70947963, + "num_input_tokens_seen": 281313070, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19567871, + "step": 13041, + "time_per_iteration": 2.8367114067077637 + }, + { + "auxiliary_loss_clip": 0.01184433, + "auxiliary_loss_mlp": 0.01032059, + "balance_loss_clip": 1.09728742, + "balance_loss_mlp": 1.0136528, + "epoch": 0.7841274612956561, + "flos": 66376556962560.0, + "grad_norm": 0.6738887200192273, + "language_loss": 0.57451963, + "learning_rate": 4.691356979055998e-07, + "loss": 0.59668458, + "num_input_tokens_seen": 281374880, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.18359375, + "step": 13042, + "time_per_iteration": 3.318455696105957 + }, + { + "auxiliary_loss_clip": 0.01405228, + "auxiliary_loss_mlp": 0.01031418, + "balance_loss_clip": 1.24361157, + "balance_loss_mlp": 1.01282167, + "epoch": 0.784187584548324, + "flos": 26658998928000.0, + "grad_norm": 3.3301492896633484, + "language_loss": 0.84695703, + "learning_rate": 4.688851018730369e-07, + "loss": 0.87132347, + "num_input_tokens_seen": 281392620, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18591309, + "step": 13043, + "time_per_iteration": 2.8806488513946533 + }, + { + "auxiliary_loss_clip": 0.01398449, + "auxiliary_loss_mlp": 0.01032219, + "balance_loss_clip": 1.2403301, + "balance_loss_mlp": 1.01425385, + "epoch": 0.7842477078009921, + "flos": 25751430604800.0, + "grad_norm": 1.403675687803688, + "language_loss": 0.88573456, + "learning_rate": 4.6863456390050425e-07, + "loss": 0.91004121, + "num_input_tokens_seen": 281413140, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.1796875, + "step": 13044, + "time_per_iteration": 2.899879217147827 + }, + { + "auxiliary_loss_clip": 0.01432976, + "auxiliary_loss_mlp": 0.01032569, + "balance_loss_clip": 1.26501918, + "balance_loss_mlp": 1.01357937, + "epoch": 0.78430783105366, + "flos": 21990951480960.0, + "grad_norm": 1.6464969611380285, + "language_loss": 0.79901791, + "learning_rate": 4.6838408399750195e-07, + "loss": 0.82367337, + "num_input_tokens_seen": 281430860, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.18994141, + "step": 13045, + "time_per_iteration": 2.8622372150421143 + }, + { + "auxiliary_loss_clip": 0.01407077, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.24754608, + "balance_loss_mlp": 1.0142653, + "epoch": 0.784367954306328, + "flos": 23852898512640.0, + "grad_norm": 1.6663942649260395, + "language_loss": 0.73004389, + "learning_rate": 4.6813366217352925e-07, + "loss": 0.75444448, + "num_input_tokens_seen": 281451385, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18713379, + "step": 13046, + "time_per_iteration": 2.8690426349639893 + }, + { + "auxiliary_loss_clip": 0.01386438, + "auxiliary_loss_mlp": 0.010355, + "balance_loss_clip": 1.2293644, + "balance_loss_mlp": 1.01575923, + "epoch": 0.7844280775589959, + "flos": 24837347030400.0, + "grad_norm": 1.605668286714682, + "language_loss": 0.63567019, + "learning_rate": 4.678832984380809e-07, + "loss": 0.65988958, + "num_input_tokens_seen": 281472255, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.19763184, + "step": 13047, + "time_per_iteration": 2.8654730319976807 + }, + { + "auxiliary_loss_clip": 0.01382317, + "auxiliary_loss_mlp": 0.01034238, + "balance_loss_clip": 1.22654295, + "balance_loss_mlp": 1.01584435, + "epoch": 0.7844882008116639, + "flos": 22465878595200.0, + "grad_norm": 1.4418588881948526, + "language_loss": 0.73733646, + "learning_rate": 4.676329928006515e-07, + "loss": 0.76150203, + "num_input_tokens_seen": 281492860, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.18383789, + "step": 13048, + "time_per_iteration": 2.8380441665649414 + }, + { + "auxiliary_loss_clip": 0.01424723, + "auxiliary_loss_mlp": 0.01031837, + "balance_loss_clip": 1.26026511, + "balance_loss_mlp": 1.01289487, + "epoch": 0.7845483240643318, + "flos": 26115154704000.0, + "grad_norm": 1.6811489555815828, + "language_loss": 0.7584542, + "learning_rate": 4.6738274527073243e-07, + "loss": 0.78301984, + "num_input_tokens_seen": 281511815, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.18945312, + "step": 13049, + "time_per_iteration": 2.907701253890991 + }, + { + "auxiliary_loss_clip": 0.01419098, + "auxiliary_loss_mlp": 0.01036778, + "balance_loss_clip": 1.25229669, + "balance_loss_mlp": 1.01640534, + "epoch": 0.7846084473169999, + "flos": 19363704336000.0, + "grad_norm": 2.5697764316261487, + "language_loss": 0.73680598, + "learning_rate": 4.6713255585781454e-07, + "loss": 0.7613647, + "num_input_tokens_seen": 281530090, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.20373535, + "step": 13050, + "time_per_iteration": 2.8363893032073975 + }, + { + "auxiliary_loss_clip": 0.01396841, + "auxiliary_loss_mlp": 0.01033962, + "balance_loss_clip": 1.23773789, + "balance_loss_mlp": 1.01516294, + "epoch": 0.7846685705696678, + "flos": 23334825841920.0, + "grad_norm": 3.388389562284319, + "language_loss": 0.74128026, + "learning_rate": 4.668824245713825e-07, + "loss": 0.76558828, + "num_input_tokens_seen": 281547075, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18786621, + "step": 13051, + "time_per_iteration": 2.865374803543091 + }, + { + "auxiliary_loss_clip": 0.01405715, + "auxiliary_loss_mlp": 0.01035715, + "balance_loss_clip": 1.24363518, + "balance_loss_mlp": 1.01603317, + "epoch": 0.7847286938223358, + "flos": 35823253495680.0, + "grad_norm": 1.7512106253690365, + "language_loss": 0.73840845, + "learning_rate": 4.666323514209227e-07, + "loss": 0.76282275, + "num_input_tokens_seen": 281568080, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19677734, + "step": 13052, + "time_per_iteration": 2.9775054454803467 + }, + { + "auxiliary_loss_clip": 0.01386784, + "auxiliary_loss_mlp": 0.01030939, + "balance_loss_clip": 1.23174763, + "balance_loss_mlp": 1.0134151, + "epoch": 0.7847888170750038, + "flos": 18487201207680.0, + "grad_norm": 1.794222135962841, + "language_loss": 0.70359498, + "learning_rate": 4.663823364159183e-07, + "loss": 0.72777224, + "num_input_tokens_seen": 281586925, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.1751709, + "step": 13053, + "time_per_iteration": 2.826277732849121 + }, + { + "auxiliary_loss_clip": 0.01398459, + "auxiliary_loss_mlp": 0.01029984, + "balance_loss_clip": 1.2396102, + "balance_loss_mlp": 1.01277018, + "epoch": 0.7848489403276717, + "flos": 25130072759040.0, + "grad_norm": 2.0448443434757313, + "language_loss": 0.71073198, + "learning_rate": 4.6613237956584893e-07, + "loss": 0.73501641, + "num_input_tokens_seen": 281603915, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.17224121, + "step": 13054, + "time_per_iteration": 2.8972856998443604 + }, + { + "auxiliary_loss_clip": 0.01420279, + "auxiliary_loss_mlp": 0.01033939, + "balance_loss_clip": 1.25541186, + "balance_loss_mlp": 1.01493692, + "epoch": 0.7849090635803397, + "flos": 26512884910080.0, + "grad_norm": 1.6419634943706152, + "language_loss": 0.76551646, + "learning_rate": 4.658824808801938e-07, + "loss": 0.79005867, + "num_input_tokens_seen": 281624220, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.19006348, + "step": 13055, + "time_per_iteration": 2.8905715942382812 + }, + { + "auxiliary_loss_clip": 0.01408879, + "auxiliary_loss_mlp": 0.01033076, + "balance_loss_clip": 1.24388599, + "balance_loss_mlp": 1.01345444, + "epoch": 0.7849691868330076, + "flos": 20969465454720.0, + "grad_norm": 1.656475693331985, + "language_loss": 0.75763351, + "learning_rate": 4.656326403684283e-07, + "loss": 0.78205311, + "num_input_tokens_seen": 281642325, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.19628906, + "step": 13056, + "time_per_iteration": 4.253546237945557 + }, + { + "auxiliary_loss_clip": 0.01406331, + "auxiliary_loss_mlp": 0.01031326, + "balance_loss_clip": 1.24618399, + "balance_loss_mlp": 1.01250243, + "epoch": 0.7850293100856757, + "flos": 26078569643520.0, + "grad_norm": 1.6879934161918249, + "language_loss": 0.704813, + "learning_rate": 4.6538285804002744e-07, + "loss": 0.72918952, + "num_input_tokens_seen": 281663065, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18823242, + "step": 13057, + "time_per_iteration": 2.9047679901123047 + }, + { + "auxiliary_loss_clip": 0.01408453, + "auxiliary_loss_mlp": 0.01030628, + "balance_loss_clip": 1.24576151, + "balance_loss_mlp": 1.01138759, + "epoch": 0.7850894333383436, + "flos": 22502056452480.0, + "grad_norm": 2.5217608712053177, + "language_loss": 0.77057707, + "learning_rate": 4.6513313390446175e-07, + "loss": 0.79496789, + "num_input_tokens_seen": 281681005, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19226074, + "step": 13058, + "time_per_iteration": 2.850862979888916 + }, + { + "auxiliary_loss_clip": 0.01402839, + "auxiliary_loss_mlp": 0.01035585, + "balance_loss_clip": 1.24262595, + "balance_loss_mlp": 1.01690507, + "epoch": 0.7851495565910116, + "flos": 20568432378240.0, + "grad_norm": 2.1789705463054316, + "language_loss": 0.71500045, + "learning_rate": 4.6488346797120146e-07, + "loss": 0.73938465, + "num_input_tokens_seen": 281697965, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18688965, + "step": 13059, + "time_per_iteration": 2.788881778717041 + }, + { + "auxiliary_loss_clip": 0.0142461, + "auxiliary_loss_mlp": 0.01038854, + "balance_loss_clip": 1.25557542, + "balance_loss_mlp": 1.01890993, + "epoch": 0.7852096798436795, + "flos": 15933395652480.0, + "grad_norm": 1.9498435699233132, + "language_loss": 0.77097058, + "learning_rate": 4.646338602497144e-07, + "loss": 0.79560524, + "num_input_tokens_seen": 281716035, + "router_z_loss_clip": 1.69140625, + "router_z_loss_mlp": 0.19934082, + "step": 13060, + "time_per_iteration": 2.8147923946380615 + }, + { + "auxiliary_loss_clip": 0.01397575, + "auxiliary_loss_mlp": 0.01035521, + "balance_loss_clip": 1.23653913, + "balance_loss_mlp": 1.01653135, + "epoch": 0.7852698030963475, + "flos": 19071566789760.0, + "grad_norm": 3.96583942742496, + "language_loss": 0.77340651, + "learning_rate": 4.643843107494654e-07, + "loss": 0.79773748, + "num_input_tokens_seen": 281732815, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18981934, + "step": 13061, + "time_per_iteration": 2.8011252880096436 + }, + { + "auxiliary_loss_clip": 0.0139907, + "auxiliary_loss_mlp": 0.01036152, + "balance_loss_clip": 1.23739719, + "balance_loss_mlp": 1.0175792, + "epoch": 0.7853299263490154, + "flos": 24655055155200.0, + "grad_norm": 1.9189128849551842, + "language_loss": 0.75165868, + "learning_rate": 4.641348194799164e-07, + "loss": 0.77601087, + "num_input_tokens_seen": 281751980, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18566895, + "step": 13062, + "time_per_iteration": 2.893202781677246 + }, + { + "auxiliary_loss_clip": 0.01394754, + "auxiliary_loss_mlp": 0.01035173, + "balance_loss_clip": 1.23618174, + "balance_loss_mlp": 1.01463366, + "epoch": 0.7853900496016835, + "flos": 22028124723840.0, + "grad_norm": 1.6606820501597521, + "language_loss": 0.69770932, + "learning_rate": 4.638853864505297e-07, + "loss": 0.72200853, + "num_input_tokens_seen": 281772670, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.20556641, + "step": 13063, + "time_per_iteration": 4.311232328414917 + }, + { + "auxiliary_loss_clip": 0.01400353, + "auxiliary_loss_mlp": 0.01034588, + "balance_loss_clip": 1.24291229, + "balance_loss_mlp": 1.01476359, + "epoch": 0.7854501728543514, + "flos": 30239719885440.0, + "grad_norm": 2.103788531023688, + "language_loss": 0.74097103, + "learning_rate": 4.636360116707625e-07, + "loss": 0.76532048, + "num_input_tokens_seen": 281792930, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.19812012, + "step": 13064, + "time_per_iteration": 3.012084722518921 + }, + { + "auxiliary_loss_clip": 0.01403289, + "auxiliary_loss_mlp": 0.0103604, + "balance_loss_clip": 1.24060977, + "balance_loss_mlp": 1.01660895, + "epoch": 0.7855102961070194, + "flos": 18853006567680.0, + "grad_norm": 1.7592552135700514, + "language_loss": 0.68472248, + "learning_rate": 4.633866951500718e-07, + "loss": 0.70911574, + "num_input_tokens_seen": 281811805, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19421387, + "step": 13065, + "time_per_iteration": 2.8086533546447754 + }, + { + "auxiliary_loss_clip": 0.01402434, + "auxiliary_loss_mlp": 0.010354, + "balance_loss_clip": 1.24323988, + "balance_loss_mlp": 1.01707721, + "epoch": 0.7855704193596874, + "flos": 22320217025280.0, + "grad_norm": 3.080608069282332, + "language_loss": 0.77254295, + "learning_rate": 4.6313743689791196e-07, + "loss": 0.79692125, + "num_input_tokens_seen": 281831885, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18322754, + "step": 13066, + "time_per_iteration": 2.867990732192993 + }, + { + "auxiliary_loss_clip": 0.01184875, + "auxiliary_loss_mlp": 0.01029396, + "balance_loss_clip": 1.09634936, + "balance_loss_mlp": 1.00889194, + "epoch": 0.7856305426123553, + "flos": 60035143386240.0, + "grad_norm": 0.7077892430351624, + "language_loss": 0.53409147, + "learning_rate": 4.628882369237346e-07, + "loss": 0.55623418, + "num_input_tokens_seen": 281900310, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.20507812, + "step": 13067, + "time_per_iteration": 4.882134437561035 + }, + { + "auxiliary_loss_clip": 0.01400917, + "auxiliary_loss_mlp": 0.01032932, + "balance_loss_clip": 1.23863149, + "balance_loss_mlp": 1.01427579, + "epoch": 0.7856906658650233, + "flos": 21877802939520.0, + "grad_norm": 1.6778242546905209, + "language_loss": 0.68169677, + "learning_rate": 4.62639095236989e-07, + "loss": 0.70603526, + "num_input_tokens_seen": 281918870, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.18652344, + "step": 13068, + "time_per_iteration": 2.859661340713501 + }, + { + "auxiliary_loss_clip": 0.0139457, + "auxiliary_loss_mlp": 0.01033814, + "balance_loss_clip": 1.23609638, + "balance_loss_mlp": 1.01454949, + "epoch": 0.7857507891176913, + "flos": 23633388149760.0, + "grad_norm": 1.9383701587717874, + "language_loss": 0.68951464, + "learning_rate": 4.6239001184712267e-07, + "loss": 0.71379852, + "num_input_tokens_seen": 281936905, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.19250488, + "step": 13069, + "time_per_iteration": 2.866055965423584 + }, + { + "auxiliary_loss_clip": 0.01400808, + "auxiliary_loss_mlp": 0.01032991, + "balance_loss_clip": 1.23929238, + "balance_loss_mlp": 1.01485932, + "epoch": 0.7858109123703593, + "flos": 25530970101120.0, + "grad_norm": 1.4688811108640505, + "language_loss": 0.77373844, + "learning_rate": 4.6214098676358195e-07, + "loss": 0.79807651, + "num_input_tokens_seen": 281955625, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18139648, + "step": 13070, + "time_per_iteration": 2.896535634994507 + }, + { + "auxiliary_loss_clip": 0.01388255, + "auxiliary_loss_mlp": 0.01032927, + "balance_loss_clip": 1.23127556, + "balance_loss_mlp": 1.01415193, + "epoch": 0.7858710356230272, + "flos": 17466393853440.0, + "grad_norm": 1.788181622523246, + "language_loss": 0.66943705, + "learning_rate": 4.618920199958083e-07, + "loss": 0.69364882, + "num_input_tokens_seen": 281973285, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.18786621, + "step": 13071, + "time_per_iteration": 2.8158957958221436 + }, + { + "auxiliary_loss_clip": 0.01415319, + "auxiliary_loss_mlp": 0.01032216, + "balance_loss_clip": 1.25089049, + "balance_loss_mlp": 1.01339293, + "epoch": 0.7859311588756952, + "flos": 24690056647680.0, + "grad_norm": 1.6242134549229874, + "language_loss": 0.74359584, + "learning_rate": 4.616431115532442e-07, + "loss": 0.76807117, + "num_input_tokens_seen": 281991410, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.18811035, + "step": 13072, + "time_per_iteration": 2.863069772720337 + }, + { + "auxiliary_loss_clip": 0.01414685, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.25091362, + "balance_loss_mlp": 1.01330018, + "epoch": 0.7859912821283631, + "flos": 21809247788160.0, + "grad_norm": 1.6748413796446775, + "language_loss": 0.71869552, + "learning_rate": 4.613942614453268e-07, + "loss": 0.74318254, + "num_input_tokens_seen": 282010845, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.20739746, + "step": 13073, + "time_per_iteration": 2.8655266761779785 + }, + { + "auxiliary_loss_clip": 0.01400071, + "auxiliary_loss_mlp": 0.01034423, + "balance_loss_clip": 1.2383852, + "balance_loss_mlp": 1.01490891, + "epoch": 0.7860514053810311, + "flos": 20856316913280.0, + "grad_norm": 1.985768155984696, + "language_loss": 0.7745887, + "learning_rate": 4.611454696814938e-07, + "loss": 0.79893363, + "num_input_tokens_seen": 282029635, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.1953125, + "step": 13074, + "time_per_iteration": 2.8562986850738525 + }, + { + "auxiliary_loss_clip": 0.01393024, + "auxiliary_loss_mlp": 0.01031909, + "balance_loss_clip": 1.23570776, + "balance_loss_mlp": 1.01335979, + "epoch": 0.786111528633699, + "flos": 24326196814080.0, + "grad_norm": 1.7162570867431068, + "language_loss": 0.75493431, + "learning_rate": 4.608967362711782e-07, + "loss": 0.77918363, + "num_input_tokens_seen": 282050285, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18554688, + "step": 13075, + "time_per_iteration": 2.85612416267395 + }, + { + "auxiliary_loss_clip": 0.01413589, + "auxiliary_loss_mlp": 0.01032472, + "balance_loss_clip": 1.25087941, + "balance_loss_mlp": 1.0142808, + "epoch": 0.7861716518863671, + "flos": 24363958239360.0, + "grad_norm": 1.5892777648182057, + "language_loss": 0.69289535, + "learning_rate": 4.6064806122381283e-07, + "loss": 0.71735591, + "num_input_tokens_seen": 282071040, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.1817627, + "step": 13076, + "time_per_iteration": 2.872288703918457 + }, + { + "auxiliary_loss_clip": 0.01390455, + "auxiliary_loss_mlp": 0.01037641, + "balance_loss_clip": 1.23190761, + "balance_loss_mlp": 1.01935458, + "epoch": 0.786231775139035, + "flos": 14029117470720.0, + "grad_norm": 2.127097694202951, + "language_loss": 0.80834281, + "learning_rate": 4.603994445488282e-07, + "loss": 0.83262384, + "num_input_tokens_seen": 282086610, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18273926, + "step": 13077, + "time_per_iteration": 2.799344539642334 + }, + { + "auxiliary_loss_clip": 0.01400855, + "auxiliary_loss_mlp": 0.01035053, + "balance_loss_clip": 1.24138165, + "balance_loss_mlp": 1.01520455, + "epoch": 0.786291898391703, + "flos": 33735959521920.0, + "grad_norm": 6.254359433186231, + "language_loss": 0.71338087, + "learning_rate": 4.6015088625564956e-07, + "loss": 0.73773992, + "num_input_tokens_seen": 282107440, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19848633, + "step": 13078, + "time_per_iteration": 2.9313602447509766 + }, + { + "auxiliary_loss_clip": 0.01392739, + "auxiliary_loss_mlp": 0.01033458, + "balance_loss_clip": 1.23523808, + "balance_loss_mlp": 1.01432514, + "epoch": 0.786352021644371, + "flos": 25822338485760.0, + "grad_norm": 1.4650103859296424, + "language_loss": 0.82143408, + "learning_rate": 4.599023863537039e-07, + "loss": 0.84569603, + "num_input_tokens_seen": 282127290, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.19140625, + "step": 13079, + "time_per_iteration": 2.9809300899505615 + }, + { + "auxiliary_loss_clip": 0.01386161, + "auxiliary_loss_mlp": 0.01029304, + "balance_loss_clip": 1.23141932, + "balance_loss_mlp": 1.01080251, + "epoch": 0.7864121448970389, + "flos": 28921979036160.0, + "grad_norm": 1.483621496917262, + "language_loss": 0.68893397, + "learning_rate": 4.596539448524146e-07, + "loss": 0.71308857, + "num_input_tokens_seen": 282147505, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.18493652, + "step": 13080, + "time_per_iteration": 2.9410922527313232 + }, + { + "auxiliary_loss_clip": 0.01406732, + "auxiliary_loss_mlp": 0.01036376, + "balance_loss_clip": 1.24535275, + "balance_loss_mlp": 1.01698077, + "epoch": 0.7864722681497069, + "flos": 19218314234880.0, + "grad_norm": 2.121933409377488, + "language_loss": 0.70145488, + "learning_rate": 4.594055617612016e-07, + "loss": 0.72588599, + "num_input_tokens_seen": 282166450, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19396973, + "step": 13081, + "time_per_iteration": 2.8512074947357178 + }, + { + "auxiliary_loss_clip": 0.01408318, + "auxiliary_loss_mlp": 0.01033274, + "balance_loss_clip": 1.24693096, + "balance_loss_mlp": 1.01439118, + "epoch": 0.7865323914023749, + "flos": 21881558257920.0, + "grad_norm": 1.5175661549401527, + "language_loss": 0.69180393, + "learning_rate": 4.591572370894838e-07, + "loss": 0.71621984, + "num_input_tokens_seen": 282186465, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18896484, + "step": 13082, + "time_per_iteration": 2.849046230316162 + }, + { + "auxiliary_loss_clip": 0.01392288, + "auxiliary_loss_mlp": 0.01028237, + "balance_loss_clip": 1.23461747, + "balance_loss_mlp": 1.00986743, + "epoch": 0.7865925146550429, + "flos": 25531286814720.0, + "grad_norm": 3.187113904525805, + "language_loss": 0.66893369, + "learning_rate": 4.589089708466789e-07, + "loss": 0.69313896, + "num_input_tokens_seen": 282207180, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.18371582, + "step": 13083, + "time_per_iteration": 2.8761391639709473 + }, + { + "auxiliary_loss_clip": 0.01412692, + "auxiliary_loss_mlp": 0.01035481, + "balance_loss_clip": 1.24761415, + "balance_loss_mlp": 1.01566815, + "epoch": 0.7866526379077108, + "flos": 19106296813440.0, + "grad_norm": 2.0643848136477256, + "language_loss": 0.7601493, + "learning_rate": 4.5866076304220015e-07, + "loss": 0.78463101, + "num_input_tokens_seen": 282225865, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.19799805, + "step": 13084, + "time_per_iteration": 2.8028526306152344 + }, + { + "auxiliary_loss_clip": 0.01395379, + "auxiliary_loss_mlp": 0.01031825, + "balance_loss_clip": 1.2374624, + "balance_loss_mlp": 1.01347899, + "epoch": 0.7867127611603788, + "flos": 16180713584640.0, + "grad_norm": 1.9698061447107538, + "language_loss": 0.70769948, + "learning_rate": 4.584126136854591e-07, + "loss": 0.7319715, + "num_input_tokens_seen": 282242895, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18371582, + "step": 13085, + "time_per_iteration": 2.7927777767181396 + }, + { + "auxiliary_loss_clip": 0.01417847, + "auxiliary_loss_mlp": 0.01035669, + "balance_loss_clip": 1.25138068, + "balance_loss_mlp": 1.01629806, + "epoch": 0.7867728844130467, + "flos": 20782468120320.0, + "grad_norm": 3.4155729288041896, + "language_loss": 0.73158485, + "learning_rate": 4.5816452278586617e-07, + "loss": 0.75611997, + "num_input_tokens_seen": 282260425, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.19372559, + "step": 13086, + "time_per_iteration": 2.838545799255371 + }, + { + "auxiliary_loss_clip": 0.01401976, + "auxiliary_loss_mlp": 0.01030272, + "balance_loss_clip": 1.24236131, + "balance_loss_mlp": 1.01190186, + "epoch": 0.7868330076657147, + "flos": 21769540836480.0, + "grad_norm": 1.6063665131774179, + "language_loss": 0.74972248, + "learning_rate": 4.5791649035282965e-07, + "loss": 0.77404487, + "num_input_tokens_seen": 282279335, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18371582, + "step": 13087, + "time_per_iteration": 2.8449931144714355 + }, + { + "auxiliary_loss_clip": 0.01394097, + "auxiliary_loss_mlp": 0.01032114, + "balance_loss_clip": 1.23562491, + "balance_loss_mlp": 1.01346946, + "epoch": 0.7868931309183826, + "flos": 25711180715520.0, + "grad_norm": 1.5678719455008807, + "language_loss": 0.7197752, + "learning_rate": 4.5766851639575456e-07, + "loss": 0.74403727, + "num_input_tokens_seen": 282299905, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18652344, + "step": 13088, + "time_per_iteration": 2.8436849117279053 + }, + { + "auxiliary_loss_clip": 0.01181542, + "auxiliary_loss_mlp": 0.01027951, + "balance_loss_clip": 1.0937407, + "balance_loss_mlp": 1.00363278, + "epoch": 0.7869532541710507, + "flos": 64678550400000.0, + "grad_norm": 0.6751855620452716, + "language_loss": 0.55575746, + "learning_rate": 4.574206009240431e-07, + "loss": 0.57785243, + "num_input_tokens_seen": 282367620, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.24316406, + "step": 13089, + "time_per_iteration": 3.48530650138855 + }, + { + "auxiliary_loss_clip": 0.01181507, + "auxiliary_loss_mlp": 0.01018094, + "balance_loss_clip": 1.09483123, + "balance_loss_mlp": 0.99739915, + "epoch": 0.7870133774237186, + "flos": 67487817951360.0, + "grad_norm": 0.7217355566410324, + "language_loss": 0.50065136, + "learning_rate": 4.571727439470976e-07, + "loss": 0.52264738, + "num_input_tokens_seen": 282435695, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.20703125, + "step": 13090, + "time_per_iteration": 3.3805007934570312 + }, + { + "auxiliary_loss_clip": 0.01390389, + "auxiliary_loss_mlp": 0.01030103, + "balance_loss_clip": 1.23374987, + "balance_loss_mlp": 1.01216257, + "epoch": 0.7870735006763866, + "flos": 26079610273920.0, + "grad_norm": 1.4921356497422589, + "language_loss": 0.8393603, + "learning_rate": 4.5692494547431583e-07, + "loss": 0.86356521, + "num_input_tokens_seen": 282456025, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.17956543, + "step": 13091, + "time_per_iteration": 4.403404474258423 + }, + { + "auxiliary_loss_clip": 0.01187454, + "auxiliary_loss_mlp": 0.01024258, + "balance_loss_clip": 1.09759355, + "balance_loss_mlp": 1.00442147, + "epoch": 0.7871336239290546, + "flos": 70321952160000.0, + "grad_norm": 0.7520196810134033, + "language_loss": 0.64070886, + "learning_rate": 4.566772055150947e-07, + "loss": 0.662826, + "num_input_tokens_seen": 282520995, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.19824219, + "step": 13092, + "time_per_iteration": 3.3163554668426514 + }, + { + "auxiliary_loss_clip": 0.01403762, + "auxiliary_loss_mlp": 0.01034015, + "balance_loss_clip": 1.24259901, + "balance_loss_mlp": 1.01539421, + "epoch": 0.7871937471817225, + "flos": 15787010165760.0, + "grad_norm": 1.939181008655669, + "language_loss": 0.79935181, + "learning_rate": 4.564295240788285e-07, + "loss": 0.82372952, + "num_input_tokens_seen": 282539355, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18615723, + "step": 13093, + "time_per_iteration": 2.834933280944824 + }, + { + "auxiliary_loss_clip": 0.01395844, + "auxiliary_loss_mlp": 0.01031001, + "balance_loss_clip": 1.23792887, + "balance_loss_mlp": 1.01217842, + "epoch": 0.7872538704343905, + "flos": 20494900298880.0, + "grad_norm": 1.8817966158010948, + "language_loss": 0.76454651, + "learning_rate": 4.561819011749106e-07, + "loss": 0.78881496, + "num_input_tokens_seen": 282555735, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.18811035, + "step": 13094, + "time_per_iteration": 2.8988912105560303 + }, + { + "auxiliary_loss_clip": 0.01409796, + "auxiliary_loss_mlp": 0.01034073, + "balance_loss_clip": 1.24651718, + "balance_loss_mlp": 1.01496387, + "epoch": 0.7873139936870585, + "flos": 25093578188160.0, + "grad_norm": 1.6197846547070032, + "language_loss": 0.7989338, + "learning_rate": 4.5593433681272884e-07, + "loss": 0.82337248, + "num_input_tokens_seen": 282574550, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19104004, + "step": 13095, + "time_per_iteration": 2.926340341567993 + }, + { + "auxiliary_loss_clip": 0.01417056, + "auxiliary_loss_mlp": 0.01029894, + "balance_loss_clip": 1.25155473, + "balance_loss_mlp": 1.01127374, + "epoch": 0.7873741169397265, + "flos": 30895400551680.0, + "grad_norm": 3.0243484426360236, + "language_loss": 0.68693143, + "learning_rate": 4.556868310016715e-07, + "loss": 0.71140093, + "num_input_tokens_seen": 282596520, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.1862793, + "step": 13096, + "time_per_iteration": 2.977992057800293 + }, + { + "auxiliary_loss_clip": 0.01382064, + "auxiliary_loss_mlp": 0.0102753, + "balance_loss_clip": 1.22714734, + "balance_loss_mlp": 1.01024508, + "epoch": 0.7874342401923944, + "flos": 46808571778560.0, + "grad_norm": 1.9263557640852986, + "language_loss": 0.71184075, + "learning_rate": 4.55439383751125e-07, + "loss": 0.7359367, + "num_input_tokens_seen": 282620560, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.17285156, + "step": 13097, + "time_per_iteration": 3.134122133255005 + }, + { + "auxiliary_loss_clip": 0.01419948, + "auxiliary_loss_mlp": 0.01032669, + "balance_loss_clip": 1.2558012, + "balance_loss_mlp": 1.01418006, + "epoch": 0.7874943634450624, + "flos": 23594495604480.0, + "grad_norm": 3.0813082465038497, + "language_loss": 0.80984259, + "learning_rate": 4.5519199507047126e-07, + "loss": 0.83436877, + "num_input_tokens_seen": 282639830, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.18493652, + "step": 13098, + "time_per_iteration": 4.278926610946655 + }, + { + "auxiliary_loss_clip": 0.01405798, + "auxiliary_loss_mlp": 0.01031628, + "balance_loss_clip": 1.24676895, + "balance_loss_mlp": 1.0125308, + "epoch": 0.7875544866977303, + "flos": 20200183799040.0, + "grad_norm": 1.7960888943148634, + "language_loss": 0.74524385, + "learning_rate": 4.5494466496909177e-07, + "loss": 0.76961815, + "num_input_tokens_seen": 282660130, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.19104004, + "step": 13099, + "time_per_iteration": 2.855069875717163 + }, + { + "auxiliary_loss_clip": 0.01402784, + "auxiliary_loss_mlp": 0.01031274, + "balance_loss_clip": 1.24291217, + "balance_loss_mlp": 1.01259398, + "epoch": 0.7876146099503983, + "flos": 22613078488320.0, + "grad_norm": 1.575075372643208, + "language_loss": 0.78598326, + "learning_rate": 4.5469739345636603e-07, + "loss": 0.81032383, + "num_input_tokens_seen": 282681125, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18676758, + "step": 13100, + "time_per_iteration": 2.8431742191314697 + }, + { + "auxiliary_loss_clip": 0.01417921, + "auxiliary_loss_mlp": 0.01030985, + "balance_loss_clip": 1.25002408, + "balance_loss_mlp": 1.0117805, + "epoch": 0.7876747332030662, + "flos": 10712636000640.0, + "grad_norm": 2.4086443667305133, + "language_loss": 0.67027497, + "learning_rate": 4.5445018054167007e-07, + "loss": 0.69476408, + "num_input_tokens_seen": 282696690, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.19189453, + "step": 13101, + "time_per_iteration": 2.7791900634765625 + }, + { + "auxiliary_loss_clip": 0.01403883, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.24338758, + "balance_loss_mlp": 1.01231444, + "epoch": 0.7877348564557343, + "flos": 38413508376960.0, + "grad_norm": 1.5930540822563457, + "language_loss": 0.78630197, + "learning_rate": 4.5420302623437745e-07, + "loss": 0.81064951, + "num_input_tokens_seen": 282721210, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18566895, + "step": 13102, + "time_per_iteration": 4.4363086223602295 + }, + { + "auxiliary_loss_clip": 0.01407143, + "auxiliary_loss_mlp": 0.01034371, + "balance_loss_clip": 1.24549472, + "balance_loss_mlp": 1.01579869, + "epoch": 0.7877949797084022, + "flos": 18337829564160.0, + "grad_norm": 1.9713081738052707, + "language_loss": 0.82978702, + "learning_rate": 4.5395593054386093e-07, + "loss": 0.85420215, + "num_input_tokens_seen": 282738505, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.18579102, + "step": 13103, + "time_per_iteration": 2.8238611221313477 + }, + { + "auxiliary_loss_clip": 0.01418275, + "auxiliary_loss_mlp": 0.01031397, + "balance_loss_clip": 1.25459528, + "balance_loss_mlp": 1.01266885, + "epoch": 0.7878551029610702, + "flos": 25816366172160.0, + "grad_norm": 1.8704237441301041, + "language_loss": 0.81146783, + "learning_rate": 4.537088934794913e-07, + "loss": 0.83596456, + "num_input_tokens_seen": 282756895, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.18737793, + "step": 13104, + "time_per_iteration": 2.903510570526123 + }, + { + "auxiliary_loss_clip": 0.01408668, + "auxiliary_loss_mlp": 0.01033762, + "balance_loss_clip": 1.24754274, + "balance_loss_mlp": 1.01455712, + "epoch": 0.7879152262137382, + "flos": 22351960892160.0, + "grad_norm": 1.744454142235192, + "language_loss": 0.74557817, + "learning_rate": 4.5346191505063515e-07, + "loss": 0.77000248, + "num_input_tokens_seen": 282774955, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19189453, + "step": 13105, + "time_per_iteration": 2.8472750186920166 + }, + { + "auxiliary_loss_clip": 0.01407141, + "auxiliary_loss_mlp": 0.010346, + "balance_loss_clip": 1.24304557, + "balance_loss_mlp": 1.016433, + "epoch": 0.7879753494664061, + "flos": 24794744411520.0, + "grad_norm": 2.218426667926628, + "language_loss": 0.76321751, + "learning_rate": 4.5321499526665776e-07, + "loss": 0.78763491, + "num_input_tokens_seen": 282793165, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.1817627, + "step": 13106, + "time_per_iteration": 3.0162904262542725 + }, + { + "auxiliary_loss_clip": 0.0140882, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.24734163, + "balance_loss_mlp": 1.01347733, + "epoch": 0.7880354727190741, + "flos": 16917663191040.0, + "grad_norm": 2.9886489591470133, + "language_loss": 0.7395466, + "learning_rate": 4.5296813413692337e-07, + "loss": 0.76396108, + "num_input_tokens_seen": 282809820, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19140625, + "step": 13107, + "time_per_iteration": 2.8826143741607666 + }, + { + "auxiliary_loss_clip": 0.01395267, + "auxiliary_loss_mlp": 0.01034481, + "balance_loss_clip": 1.23766005, + "balance_loss_mlp": 1.01482368, + "epoch": 0.7880955959717421, + "flos": 22239038574720.0, + "grad_norm": 1.5431232318482726, + "language_loss": 0.73333645, + "learning_rate": 4.5272133167079165e-07, + "loss": 0.75763392, + "num_input_tokens_seen": 282828600, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.1965332, + "step": 13108, + "time_per_iteration": 2.8822319507598877 + }, + { + "auxiliary_loss_clip": 0.01184848, + "auxiliary_loss_mlp": 0.01028541, + "balance_loss_clip": 1.09639215, + "balance_loss_mlp": 1.00183856, + "epoch": 0.7881557192244101, + "flos": 69212229494400.0, + "grad_norm": 0.8855643149798529, + "language_loss": 0.60293037, + "learning_rate": 4.5247458787762216e-07, + "loss": 0.62506431, + "num_input_tokens_seen": 282882775, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.26757812, + "step": 13109, + "time_per_iteration": 3.3394501209259033 + }, + { + "auxiliary_loss_clip": 0.01405864, + "auxiliary_loss_mlp": 0.01036575, + "balance_loss_clip": 1.24818766, + "balance_loss_mlp": 1.01776385, + "epoch": 0.788215842477078, + "flos": 24945971091840.0, + "grad_norm": 1.7970172585502624, + "language_loss": 0.73347068, + "learning_rate": 4.5222790276677126e-07, + "loss": 0.75789505, + "num_input_tokens_seen": 282902680, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18798828, + "step": 13110, + "time_per_iteration": 2.8715012073516846 + }, + { + "auxiliary_loss_clip": 0.01392286, + "auxiliary_loss_mlp": 0.01033463, + "balance_loss_clip": 1.23588586, + "balance_loss_mlp": 1.01443744, + "epoch": 0.788275965729746, + "flos": 26118231350400.0, + "grad_norm": 1.7989786419543992, + "language_loss": 0.75328046, + "learning_rate": 4.5198127634759455e-07, + "loss": 0.77753794, + "num_input_tokens_seen": 282923625, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.19030762, + "step": 13111, + "time_per_iteration": 2.9268946647644043 + }, + { + "auxiliary_loss_clip": 0.01398857, + "auxiliary_loss_mlp": 0.01032074, + "balance_loss_clip": 1.2387197, + "balance_loss_mlp": 1.01292861, + "epoch": 0.7883360889824139, + "flos": 21224610737280.0, + "grad_norm": 2.0195089402529303, + "language_loss": 0.62934369, + "learning_rate": 4.5173470862944206e-07, + "loss": 0.65365303, + "num_input_tokens_seen": 282941955, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19152832, + "step": 13112, + "time_per_iteration": 2.8421576023101807 + }, + { + "auxiliary_loss_clip": 0.01397307, + "auxiliary_loss_mlp": 0.0103088, + "balance_loss_clip": 1.2367866, + "balance_loss_mlp": 1.0117588, + "epoch": 0.7883962122350819, + "flos": 21152526491520.0, + "grad_norm": 1.583519804162614, + "language_loss": 0.68047154, + "learning_rate": 4.514881996216644e-07, + "loss": 0.7047534, + "num_input_tokens_seen": 282961280, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19128418, + "step": 13113, + "time_per_iteration": 2.832814931869507 + }, + { + "auxiliary_loss_clip": 0.01397917, + "auxiliary_loss_mlp": 0.0103087, + "balance_loss_clip": 1.2395916, + "balance_loss_mlp": 1.0121783, + "epoch": 0.7884563354877498, + "flos": 15310861441920.0, + "grad_norm": 4.208869199240977, + "language_loss": 0.59512091, + "learning_rate": 4.5124174933361e-07, + "loss": 0.61940885, + "num_input_tokens_seen": 282978210, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18701172, + "step": 13114, + "time_per_iteration": 2.8384997844696045 + }, + { + "auxiliary_loss_clip": 0.01412641, + "auxiliary_loss_mlp": 0.01033966, + "balance_loss_clip": 1.24961758, + "balance_loss_mlp": 1.01483262, + "epoch": 0.7885164587404179, + "flos": 24398733507840.0, + "grad_norm": 1.663883766622936, + "language_loss": 0.67282438, + "learning_rate": 4.5099535777462306e-07, + "loss": 0.69729042, + "num_input_tokens_seen": 282998845, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19116211, + "step": 13115, + "time_per_iteration": 2.916074752807617 + }, + { + "auxiliary_loss_clip": 0.01401339, + "auxiliary_loss_mlp": 0.01031978, + "balance_loss_clip": 1.24017358, + "balance_loss_mlp": 1.01239216, + "epoch": 0.7885765819930858, + "flos": 14392343877120.0, + "grad_norm": 2.1095630452075493, + "language_loss": 0.88872176, + "learning_rate": 4.50749024954048e-07, + "loss": 0.91305494, + "num_input_tokens_seen": 283015200, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19580078, + "step": 13116, + "time_per_iteration": 2.81485915184021 + }, + { + "auxiliary_loss_clip": 0.0143052, + "auxiliary_loss_mlp": 0.01036832, + "balance_loss_clip": 1.26200366, + "balance_loss_mlp": 1.01773429, + "epoch": 0.7886367052457538, + "flos": 18269093433600.0, + "grad_norm": 1.9617815308627462, + "language_loss": 0.73685533, + "learning_rate": 4.505027508812245e-07, + "loss": 0.76152885, + "num_input_tokens_seen": 283033680, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.19091797, + "step": 13117, + "time_per_iteration": 2.876619577407837 + }, + { + "auxiliary_loss_clip": 0.01394531, + "auxiliary_loss_mlp": 0.01029964, + "balance_loss_clip": 1.23755896, + "balance_loss_mlp": 1.01183236, + "epoch": 0.7886968284984217, + "flos": 15313621374720.0, + "grad_norm": 1.4983411461230447, + "language_loss": 0.81000149, + "learning_rate": 4.502565355654926e-07, + "loss": 0.8342464, + "num_input_tokens_seen": 283050620, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.18139648, + "step": 13118, + "time_per_iteration": 2.878735065460205 + }, + { + "auxiliary_loss_clip": 0.01407737, + "auxiliary_loss_mlp": 0.0103093, + "balance_loss_clip": 1.24851215, + "balance_loss_mlp": 1.0116415, + "epoch": 0.7887569517510897, + "flos": 21225425143680.0, + "grad_norm": 1.9753034576956758, + "language_loss": 0.7397542, + "learning_rate": 4.500103790161878e-07, + "loss": 0.7641409, + "num_input_tokens_seen": 283070215, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19299316, + "step": 13119, + "time_per_iteration": 2.859299898147583 + }, + { + "auxiliary_loss_clip": 0.01403499, + "auxiliary_loss_mlp": 0.01031578, + "balance_loss_clip": 1.24210012, + "balance_loss_mlp": 1.01246846, + "epoch": 0.7888170750037578, + "flos": 22721566815360.0, + "grad_norm": 1.9493997572483794, + "language_loss": 0.7310946, + "learning_rate": 4.4976428124264454e-07, + "loss": 0.75544536, + "num_input_tokens_seen": 283091485, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19104004, + "step": 13120, + "time_per_iteration": 2.882517099380493 + }, + { + "auxiliary_loss_clip": 0.01409898, + "auxiliary_loss_mlp": 0.01036451, + "balance_loss_clip": 1.24904084, + "balance_loss_mlp": 1.01713943, + "epoch": 0.7888771982564257, + "flos": 36442846794240.0, + "grad_norm": 1.5329161960345008, + "language_loss": 0.79328454, + "learning_rate": 4.4951824225419564e-07, + "loss": 0.81774801, + "num_input_tokens_seen": 283115040, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19311523, + "step": 13121, + "time_per_iteration": 2.9987456798553467 + }, + { + "auxiliary_loss_clip": 0.0139102, + "auxiliary_loss_mlp": 0.01031687, + "balance_loss_clip": 1.23349535, + "balance_loss_mlp": 1.01289988, + "epoch": 0.7889373215090937, + "flos": 27321918762240.0, + "grad_norm": 1.3902463288001532, + "language_loss": 0.80629271, + "learning_rate": 4.4927226206017057e-07, + "loss": 0.8305198, + "num_input_tokens_seen": 283136925, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.18774414, + "step": 13122, + "time_per_iteration": 2.9769225120544434 + }, + { + "auxiliary_loss_clip": 0.01413012, + "auxiliary_loss_mlp": 0.01028139, + "balance_loss_clip": 1.24985075, + "balance_loss_mlp": 1.00991225, + "epoch": 0.7889974447617616, + "flos": 19838631450240.0, + "grad_norm": 2.5935306288793583, + "language_loss": 0.79072869, + "learning_rate": 4.4902634066989597e-07, + "loss": 0.81514019, + "num_input_tokens_seen": 283155725, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.18237305, + "step": 13123, + "time_per_iteration": 2.8771793842315674 + }, + { + "auxiliary_loss_clip": 0.01407222, + "auxiliary_loss_mlp": 0.01032216, + "balance_loss_clip": 1.24439275, + "balance_loss_mlp": 1.01355982, + "epoch": 0.7890575680144296, + "flos": 17279170295040.0, + "grad_norm": 2.0654404777079205, + "language_loss": 0.6775887, + "learning_rate": 4.487804780926985e-07, + "loss": 0.70198309, + "num_input_tokens_seen": 283173845, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.18652344, + "step": 13124, + "time_per_iteration": 2.8167896270751953 + }, + { + "auxiliary_loss_clip": 0.01420293, + "auxiliary_loss_mlp": 0.01033748, + "balance_loss_clip": 1.25524139, + "balance_loss_mlp": 1.01538992, + "epoch": 0.7891176912670975, + "flos": 27611703578880.0, + "grad_norm": 2.4496303760940434, + "language_loss": 0.7376523, + "learning_rate": 4.4853467433790036e-07, + "loss": 0.76219273, + "num_input_tokens_seen": 283191985, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.18383789, + "step": 13125, + "time_per_iteration": 2.9604265689849854 + }, + { + "auxiliary_loss_clip": 0.01405206, + "auxiliary_loss_mlp": 0.01032488, + "balance_loss_clip": 1.24139702, + "balance_loss_mlp": 1.01355767, + "epoch": 0.7891778145197655, + "flos": 22722109752960.0, + "grad_norm": 2.1977709569571147, + "language_loss": 0.73349631, + "learning_rate": 4.4828892941482267e-07, + "loss": 0.75787318, + "num_input_tokens_seen": 283210855, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.18920898, + "step": 13126, + "time_per_iteration": 4.367655515670776 + }, + { + "auxiliary_loss_clip": 0.01409055, + "auxiliary_loss_mlp": 0.01030547, + "balance_loss_clip": 1.24649835, + "balance_loss_mlp": 1.0114851, + "epoch": 0.7892379377724335, + "flos": 17319555918720.0, + "grad_norm": 2.223271789224908, + "language_loss": 0.77672559, + "learning_rate": 4.480432433327845e-07, + "loss": 0.80112159, + "num_input_tokens_seen": 283229665, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19067383, + "step": 13127, + "time_per_iteration": 2.8758678436279297 + }, + { + "auxiliary_loss_clip": 0.01395145, + "auxiliary_loss_mlp": 0.01035743, + "balance_loss_clip": 1.23907828, + "balance_loss_mlp": 1.01709878, + "epoch": 0.7892980610251015, + "flos": 25786703566080.0, + "grad_norm": 3.832518336375528, + "language_loss": 0.85890484, + "learning_rate": 4.47797616101103e-07, + "loss": 0.88321376, + "num_input_tokens_seen": 283248615, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.18615723, + "step": 13128, + "time_per_iteration": 2.8777527809143066 + }, + { + "auxiliary_loss_clip": 0.01405736, + "auxiliary_loss_mlp": 0.01037959, + "balance_loss_clip": 1.246171, + "balance_loss_mlp": 1.01908827, + "epoch": 0.7893581842777694, + "flos": 21589918404480.0, + "grad_norm": 1.9144395087568669, + "language_loss": 0.70285076, + "learning_rate": 4.475520477290904e-07, + "loss": 0.72728771, + "num_input_tokens_seen": 283267135, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18859863, + "step": 13129, + "time_per_iteration": 2.862499475479126 + }, + { + "auxiliary_loss_clip": 0.01183807, + "auxiliary_loss_mlp": 0.01026785, + "balance_loss_clip": 1.0916419, + "balance_loss_mlp": 0.99970025, + "epoch": 0.7894183075304374, + "flos": 69049528617600.0, + "grad_norm": 0.7194667768663862, + "language_loss": 0.61630964, + "learning_rate": 4.473065382260597e-07, + "loss": 0.63841558, + "num_input_tokens_seen": 283328940, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.27148438, + "step": 13130, + "time_per_iteration": 3.3502655029296875 + }, + { + "auxiliary_loss_clip": 0.01408183, + "auxiliary_loss_mlp": 0.01029988, + "balance_loss_clip": 1.24634767, + "balance_loss_mlp": 1.01093817, + "epoch": 0.7894784307831053, + "flos": 24253886344320.0, + "grad_norm": 4.821647133248566, + "language_loss": 0.74241775, + "learning_rate": 4.4706108760132124e-07, + "loss": 0.76679945, + "num_input_tokens_seen": 283350000, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19055176, + "step": 13131, + "time_per_iteration": 2.9093801975250244 + }, + { + "auxiliary_loss_clip": 0.01446175, + "auxiliary_loss_mlp": 0.01035677, + "balance_loss_clip": 1.27344298, + "balance_loss_mlp": 1.01665068, + "epoch": 0.7895385540357733, + "flos": 20276204342400.0, + "grad_norm": 2.6704034172239477, + "language_loss": 0.70941299, + "learning_rate": 4.4681569586418153e-07, + "loss": 0.73423147, + "num_input_tokens_seen": 283368020, + "router_z_loss_clip": 1.72558594, + "router_z_loss_mlp": 0.19030762, + "step": 13132, + "time_per_iteration": 2.8548989295959473 + }, + { + "auxiliary_loss_clip": 0.01405836, + "auxiliary_loss_mlp": 0.01039112, + "balance_loss_clip": 1.24321532, + "balance_loss_mlp": 1.01851273, + "epoch": 0.7895986772884414, + "flos": 21006502963200.0, + "grad_norm": 2.053478760963436, + "language_loss": 0.63007629, + "learning_rate": 4.465703630239468e-07, + "loss": 0.6545257, + "num_input_tokens_seen": 283387030, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.20581055, + "step": 13133, + "time_per_iteration": 4.370396375656128 + }, + { + "auxiliary_loss_clip": 0.01420051, + "auxiliary_loss_mlp": 0.01036168, + "balance_loss_clip": 1.25486302, + "balance_loss_mlp": 1.01533008, + "epoch": 0.7896588005411093, + "flos": 18666416436480.0, + "grad_norm": 2.299738570329337, + "language_loss": 0.80583155, + "learning_rate": 4.463250890899195e-07, + "loss": 0.83039373, + "num_input_tokens_seen": 283402090, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.20837402, + "step": 13134, + "time_per_iteration": 2.860684633255005 + }, + { + "auxiliary_loss_clip": 0.01409755, + "auxiliary_loss_mlp": 0.010341, + "balance_loss_clip": 1.24739587, + "balance_loss_mlp": 1.0158608, + "epoch": 0.7897189237937773, + "flos": 18415116961920.0, + "grad_norm": 1.84131816972958, + "language_loss": 0.80728316, + "learning_rate": 4.460798740713998e-07, + "loss": 0.83172166, + "num_input_tokens_seen": 283421035, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.18237305, + "step": 13135, + "time_per_iteration": 2.8761332035064697 + }, + { + "auxiliary_loss_clip": 0.01396051, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.23638904, + "balance_loss_mlp": 1.01379466, + "epoch": 0.7897790470464452, + "flos": 23741876476800.0, + "grad_norm": 1.5044080583111246, + "language_loss": 0.72690171, + "learning_rate": 4.4583471797768733e-07, + "loss": 0.75119352, + "num_input_tokens_seen": 283441830, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19335938, + "step": 13136, + "time_per_iteration": 2.8541975021362305 + }, + { + "auxiliary_loss_clip": 0.01421436, + "auxiliary_loss_mlp": 0.0103475, + "balance_loss_clip": 1.25418782, + "balance_loss_mlp": 1.01539075, + "epoch": 0.7898391702991132, + "flos": 15925975505280.0, + "grad_norm": 2.011809003530409, + "language_loss": 0.71973455, + "learning_rate": 4.455896208180778e-07, + "loss": 0.74429643, + "num_input_tokens_seen": 283459540, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.19348145, + "step": 13137, + "time_per_iteration": 5.683560132980347 + }, + { + "auxiliary_loss_clip": 0.0139441, + "auxiliary_loss_mlp": 0.0103878, + "balance_loss_clip": 1.23619914, + "balance_loss_mlp": 1.01956296, + "epoch": 0.7898992935517811, + "flos": 19838676695040.0, + "grad_norm": 1.6990068612306333, + "language_loss": 0.74920261, + "learning_rate": 4.4534458260186645e-07, + "loss": 0.77353448, + "num_input_tokens_seen": 283478790, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.19238281, + "step": 13138, + "time_per_iteration": 2.8559727668762207 + }, + { + "auxiliary_loss_clip": 0.01400315, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.24054599, + "balance_loss_mlp": 1.01327229, + "epoch": 0.7899594168044491, + "flos": 16224764037120.0, + "grad_norm": 2.9882428534371495, + "language_loss": 0.69143927, + "learning_rate": 4.4509960333834426e-07, + "loss": 0.71576136, + "num_input_tokens_seen": 283495720, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18615723, + "step": 13139, + "time_per_iteration": 2.790332317352295 + }, + { + "auxiliary_loss_clip": 0.01181143, + "auxiliary_loss_mlp": 0.01036522, + "balance_loss_clip": 1.09111667, + "balance_loss_mlp": 1.01306152, + "epoch": 0.790019540057117, + "flos": 68365904647680.0, + "grad_norm": 0.8436019008232135, + "language_loss": 0.60291696, + "learning_rate": 4.448546830368003e-07, + "loss": 0.62509358, + "num_input_tokens_seen": 283558795, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.234375, + "step": 13140, + "time_per_iteration": 3.4267210960388184 + }, + { + "auxiliary_loss_clip": 0.01417216, + "auxiliary_loss_mlp": 0.01035537, + "balance_loss_clip": 1.2556777, + "balance_loss_mlp": 1.01637959, + "epoch": 0.7900796633097851, + "flos": 30344045690880.0, + "grad_norm": 1.51829752592844, + "language_loss": 0.770459, + "learning_rate": 4.4460982170652304e-07, + "loss": 0.79498649, + "num_input_tokens_seen": 283579305, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19152832, + "step": 13141, + "time_per_iteration": 2.9079947471618652 + }, + { + "auxiliary_loss_clip": 0.01411276, + "auxiliary_loss_mlp": 0.01038082, + "balance_loss_clip": 1.24683428, + "balance_loss_mlp": 1.01817441, + "epoch": 0.790139786562453, + "flos": 22136794030080.0, + "grad_norm": 2.0135132679004792, + "language_loss": 0.68994081, + "learning_rate": 4.4436501935679694e-07, + "loss": 0.71443439, + "num_input_tokens_seen": 283597840, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19909668, + "step": 13142, + "time_per_iteration": 2.8320469856262207 + }, + { + "auxiliary_loss_clip": 0.01180452, + "auxiliary_loss_mlp": 0.01032645, + "balance_loss_clip": 1.09011459, + "balance_loss_mlp": 1.01032948, + "epoch": 0.790199909815121, + "flos": 58235100520320.0, + "grad_norm": 1.2373978371991885, + "language_loss": 0.60003251, + "learning_rate": 4.441202759969049e-07, + "loss": 0.62216347, + "num_input_tokens_seen": 283647950, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.22363281, + "step": 13143, + "time_per_iteration": 3.0765933990478516 + }, + { + "auxiliary_loss_clip": 0.01416476, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.25278449, + "balance_loss_mlp": 1.01365948, + "epoch": 0.7902600330677889, + "flos": 34546893655680.0, + "grad_norm": 1.4596886939115634, + "language_loss": 0.74733305, + "learning_rate": 4.4387559163612875e-07, + "loss": 0.77182293, + "num_input_tokens_seen": 283670645, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.18859863, + "step": 13144, + "time_per_iteration": 3.0011978149414062 + }, + { + "auxiliary_loss_clip": 0.01408943, + "auxiliary_loss_mlp": 0.01034807, + "balance_loss_clip": 1.24678051, + "balance_loss_mlp": 1.01593614, + "epoch": 0.7903201563204569, + "flos": 22356621106560.0, + "grad_norm": 3.321935167621783, + "language_loss": 0.84239352, + "learning_rate": 4.4363096628374605e-07, + "loss": 0.86683106, + "num_input_tokens_seen": 283688830, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.1887207, + "step": 13145, + "time_per_iteration": 2.9080007076263428 + }, + { + "auxiliary_loss_clip": 0.01384292, + "auxiliary_loss_mlp": 0.01031804, + "balance_loss_clip": 1.22722793, + "balance_loss_mlp": 1.01453066, + "epoch": 0.790380279573125, + "flos": 22063442929920.0, + "grad_norm": 2.8307370625736734, + "language_loss": 0.7358157, + "learning_rate": 4.4338639994903235e-07, + "loss": 0.75997669, + "num_input_tokens_seen": 283708625, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.17272949, + "step": 13146, + "time_per_iteration": 2.973395347595215 + }, + { + "auxiliary_loss_clip": 0.01409847, + "auxiliary_loss_mlp": 0.01037364, + "balance_loss_clip": 1.24598622, + "balance_loss_mlp": 1.01808763, + "epoch": 0.7904404028257929, + "flos": 20312155975680.0, + "grad_norm": 2.124659270269472, + "language_loss": 0.76710016, + "learning_rate": 4.4314189264126246e-07, + "loss": 0.79157227, + "num_input_tokens_seen": 283725710, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19299316, + "step": 13147, + "time_per_iteration": 2.8957834243774414 + }, + { + "auxiliary_loss_clip": 0.01396334, + "auxiliary_loss_mlp": 0.01036236, + "balance_loss_clip": 1.23676586, + "balance_loss_mlp": 1.0173887, + "epoch": 0.7905005260784609, + "flos": 20017937168640.0, + "grad_norm": 2.3954272797524236, + "language_loss": 0.72488105, + "learning_rate": 4.428974443697087e-07, + "loss": 0.74920678, + "num_input_tokens_seen": 283744150, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18847656, + "step": 13148, + "time_per_iteration": 2.8731532096862793 + }, + { + "auxiliary_loss_clip": 0.01412503, + "auxiliary_loss_mlp": 0.01036442, + "balance_loss_clip": 1.24941218, + "balance_loss_mlp": 1.01705861, + "epoch": 0.7905606493311288, + "flos": 26917039877760.0, + "grad_norm": 1.906188222676574, + "language_loss": 0.71959454, + "learning_rate": 4.4265305514363913e-07, + "loss": 0.744084, + "num_input_tokens_seen": 283764170, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19384766, + "step": 13149, + "time_per_iteration": 2.9207262992858887 + }, + { + "auxiliary_loss_clip": 0.01409236, + "auxiliary_loss_mlp": 0.01036535, + "balance_loss_clip": 1.24598837, + "balance_loss_mlp": 1.01610255, + "epoch": 0.7906207725837968, + "flos": 23706784494720.0, + "grad_norm": 1.9337503226556743, + "language_loss": 0.65471315, + "learning_rate": 4.424087249723225e-07, + "loss": 0.67917091, + "num_input_tokens_seen": 283784305, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.2043457, + "step": 13150, + "time_per_iteration": 2.8461062908172607 + }, + { + "auxiliary_loss_clip": 0.01411307, + "auxiliary_loss_mlp": 0.01033637, + "balance_loss_clip": 1.24906743, + "balance_loss_mlp": 1.01425338, + "epoch": 0.7906808958364647, + "flos": 20858262439680.0, + "grad_norm": 1.9555739119724718, + "language_loss": 0.70785117, + "learning_rate": 4.421644538650231e-07, + "loss": 0.73230058, + "num_input_tokens_seen": 283804040, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19396973, + "step": 13151, + "time_per_iteration": 2.839892864227295 + }, + { + "auxiliary_loss_clip": 0.01418297, + "auxiliary_loss_mlp": 0.0103522, + "balance_loss_clip": 1.25471997, + "balance_loss_mlp": 1.01595616, + "epoch": 0.7907410190891327, + "flos": 40750201543680.0, + "grad_norm": 1.3722799824834713, + "language_loss": 0.70692933, + "learning_rate": 4.4192024183100306e-07, + "loss": 0.73146445, + "num_input_tokens_seen": 283827120, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19262695, + "step": 13152, + "time_per_iteration": 3.0044891834259033 + }, + { + "auxiliary_loss_clip": 0.01410187, + "auxiliary_loss_mlp": 0.01034745, + "balance_loss_clip": 1.24863911, + "balance_loss_mlp": 1.01585031, + "epoch": 0.7908011423418007, + "flos": 13268975264640.0, + "grad_norm": 2.05821725722843, + "language_loss": 0.73932326, + "learning_rate": 4.4167608887952367e-07, + "loss": 0.76377261, + "num_input_tokens_seen": 283844820, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18896484, + "step": 13153, + "time_per_iteration": 2.8173136711120605 + }, + { + "auxiliary_loss_clip": 0.01410963, + "auxiliary_loss_mlp": 0.0103216, + "balance_loss_clip": 1.250283, + "balance_loss_mlp": 1.01326489, + "epoch": 0.7908612655944687, + "flos": 19764013495680.0, + "grad_norm": 1.7237798766394241, + "language_loss": 0.79206467, + "learning_rate": 4.4143199501984306e-07, + "loss": 0.81649595, + "num_input_tokens_seen": 283862870, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.18908691, + "step": 13154, + "time_per_iteration": 2.8330025672912598 + }, + { + "auxiliary_loss_clip": 0.01425989, + "auxiliary_loss_mlp": 0.01033624, + "balance_loss_clip": 1.25730419, + "balance_loss_mlp": 1.01405025, + "epoch": 0.7909213888471366, + "flos": 21297554634240.0, + "grad_norm": 4.448787687740204, + "language_loss": 0.70871878, + "learning_rate": 4.411879602612185e-07, + "loss": 0.73331499, + "num_input_tokens_seen": 283882405, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.19555664, + "step": 13155, + "time_per_iteration": 2.8646368980407715 + }, + { + "auxiliary_loss_clip": 0.01419962, + "auxiliary_loss_mlp": 0.01034471, + "balance_loss_clip": 1.25636077, + "balance_loss_mlp": 1.01533818, + "epoch": 0.7909815120998046, + "flos": 22539184450560.0, + "grad_norm": 1.6651271864144412, + "language_loss": 0.77502751, + "learning_rate": 4.4094398461290174e-07, + "loss": 0.79957181, + "num_input_tokens_seen": 283902070, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19140625, + "step": 13156, + "time_per_iteration": 2.825650930404663 + }, + { + "auxiliary_loss_clip": 0.01402693, + "auxiliary_loss_mlp": 0.01032755, + "balance_loss_clip": 1.24104881, + "balance_loss_mlp": 1.01375306, + "epoch": 0.7910416353524725, + "flos": 26739860664960.0, + "grad_norm": 2.0404942755576925, + "language_loss": 0.66375816, + "learning_rate": 4.4070006808414526e-07, + "loss": 0.68811262, + "num_input_tokens_seen": 283924100, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19006348, + "step": 13157, + "time_per_iteration": 2.8835389614105225 + }, + { + "auxiliary_loss_clip": 0.01416466, + "auxiliary_loss_mlp": 0.01033973, + "balance_loss_clip": 1.25177956, + "balance_loss_mlp": 1.01379132, + "epoch": 0.7911017586051405, + "flos": 24655643337600.0, + "grad_norm": 2.793494836542453, + "language_loss": 0.74993968, + "learning_rate": 4.4045621068419894e-07, + "loss": 0.77444398, + "num_input_tokens_seen": 283944955, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.20178223, + "step": 13158, + "time_per_iteration": 2.8694803714752197 + }, + { + "auxiliary_loss_clip": 0.01392419, + "auxiliary_loss_mlp": 0.01035435, + "balance_loss_clip": 1.23459029, + "balance_loss_mlp": 1.01702857, + "epoch": 0.7911618818578086, + "flos": 17574158263680.0, + "grad_norm": 2.576151492569059, + "language_loss": 0.68523359, + "learning_rate": 4.40212412422309e-07, + "loss": 0.70951211, + "num_input_tokens_seen": 283963125, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18395996, + "step": 13159, + "time_per_iteration": 2.828059434890747 + }, + { + "auxiliary_loss_clip": 0.01398619, + "auxiliary_loss_mlp": 0.01032571, + "balance_loss_clip": 1.23818159, + "balance_loss_mlp": 1.01380706, + "epoch": 0.7912220051104765, + "flos": 16728901309440.0, + "grad_norm": 1.9181800479059365, + "language_loss": 0.67978263, + "learning_rate": 4.399686733077206e-07, + "loss": 0.70409459, + "num_input_tokens_seen": 283982850, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18762207, + "step": 13160, + "time_per_iteration": 4.280477523803711 + }, + { + "auxiliary_loss_clip": 0.013879, + "auxiliary_loss_mlp": 0.01030969, + "balance_loss_clip": 1.23261452, + "balance_loss_mlp": 1.01340914, + "epoch": 0.7912821283631445, + "flos": 13706593401600.0, + "grad_norm": 2.1502519696768574, + "language_loss": 0.73218083, + "learning_rate": 4.3972499334967694e-07, + "loss": 0.75636953, + "num_input_tokens_seen": 283998275, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.17565918, + "step": 13161, + "time_per_iteration": 2.8082079887390137 + }, + { + "auxiliary_loss_clip": 0.01395333, + "auxiliary_loss_mlp": 0.01031281, + "balance_loss_clip": 1.23659706, + "balance_loss_mlp": 1.01215959, + "epoch": 0.7913422516158124, + "flos": 23779547412480.0, + "grad_norm": 1.8162508539690134, + "language_loss": 0.7410934, + "learning_rate": 4.39481372557418e-07, + "loss": 0.76535952, + "num_input_tokens_seen": 284018750, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.19128418, + "step": 13162, + "time_per_iteration": 2.8707776069641113 + }, + { + "auxiliary_loss_clip": 0.01421036, + "auxiliary_loss_mlp": 0.01031472, + "balance_loss_clip": 1.25523901, + "balance_loss_mlp": 1.01274443, + "epoch": 0.7914023748684804, + "flos": 19947843694080.0, + "grad_norm": 1.7038615507579982, + "language_loss": 0.72782254, + "learning_rate": 4.392378109401811e-07, + "loss": 0.75234759, + "num_input_tokens_seen": 284037850, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.18725586, + "step": 13163, + "time_per_iteration": 2.796349287033081 + }, + { + "auxiliary_loss_clip": 0.01400566, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.24086487, + "balance_loss_mlp": 1.01115251, + "epoch": 0.7914624981211483, + "flos": 20604519745920.0, + "grad_norm": 1.9181859606666187, + "language_loss": 0.70454097, + "learning_rate": 4.3899430850720296e-07, + "loss": 0.72885716, + "num_input_tokens_seen": 284056380, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19885254, + "step": 13164, + "time_per_iteration": 2.8394362926483154 + }, + { + "auxiliary_loss_clip": 0.01400034, + "auxiliary_loss_mlp": 0.01035046, + "balance_loss_clip": 1.24113321, + "balance_loss_mlp": 1.01544833, + "epoch": 0.7915226213738163, + "flos": 21809654991360.0, + "grad_norm": 1.7922628726138288, + "language_loss": 0.67050624, + "learning_rate": 4.387508652677177e-07, + "loss": 0.694857, + "num_input_tokens_seen": 284074945, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19604492, + "step": 13165, + "time_per_iteration": 2.954888105392456 + }, + { + "auxiliary_loss_clip": 0.01388108, + "auxiliary_loss_mlp": 0.01031358, + "balance_loss_clip": 1.23080039, + "balance_loss_mlp": 1.01286817, + "epoch": 0.7915827446264843, + "flos": 16296667303680.0, + "grad_norm": 2.2926026842653866, + "language_loss": 0.73131597, + "learning_rate": 4.385074812309557e-07, + "loss": 0.75551069, + "num_input_tokens_seen": 284092070, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18493652, + "step": 13166, + "time_per_iteration": 2.813096284866333 + }, + { + "auxiliary_loss_clip": 0.01397987, + "auxiliary_loss_mlp": 0.01034718, + "balance_loss_clip": 1.23840797, + "balance_loss_mlp": 1.01428521, + "epoch": 0.7916428678791523, + "flos": 25713578689920.0, + "grad_norm": 1.5996349383257311, + "language_loss": 0.78094697, + "learning_rate": 4.382641564061462e-07, + "loss": 0.80527401, + "num_input_tokens_seen": 284112255, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.2043457, + "step": 13167, + "time_per_iteration": 2.9601874351501465 + }, + { + "auxiliary_loss_clip": 0.01392816, + "auxiliary_loss_mlp": 0.01032282, + "balance_loss_clip": 1.23496222, + "balance_loss_mlp": 1.01404262, + "epoch": 0.7917029911318202, + "flos": 23889076369920.0, + "grad_norm": 1.6243061132833714, + "language_loss": 0.84426802, + "learning_rate": 4.3802089080251713e-07, + "loss": 0.86851901, + "num_input_tokens_seen": 284132330, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18249512, + "step": 13168, + "time_per_iteration": 4.280240297317505 + }, + { + "auxiliary_loss_clip": 0.01404729, + "auxiliary_loss_mlp": 0.01037103, + "balance_loss_clip": 1.2432971, + "balance_loss_mlp": 1.01758838, + "epoch": 0.7917631143844882, + "flos": 21654944461440.0, + "grad_norm": 1.77450893405643, + "language_loss": 0.73309529, + "learning_rate": 4.3777768442929155e-07, + "loss": 0.75751358, + "num_input_tokens_seen": 284150640, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19506836, + "step": 13169, + "time_per_iteration": 2.8296756744384766 + }, + { + "auxiliary_loss_clip": 0.01420191, + "auxiliary_loss_mlp": 0.01041806, + "balance_loss_clip": 1.25557947, + "balance_loss_mlp": 1.02217197, + "epoch": 0.7918232376371561, + "flos": 38888661715200.0, + "grad_norm": 1.6275570288010899, + "language_loss": 0.67600018, + "learning_rate": 4.3753453729569287e-07, + "loss": 0.70062006, + "num_input_tokens_seen": 284171910, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19616699, + "step": 13170, + "time_per_iteration": 3.0673794746398926 + }, + { + "auxiliary_loss_clip": 0.01400194, + "auxiliary_loss_mlp": 0.01031609, + "balance_loss_clip": 1.23907876, + "balance_loss_mlp": 1.01365578, + "epoch": 0.7918833608898241, + "flos": 20784866094720.0, + "grad_norm": 1.6055142056580742, + "language_loss": 0.71968305, + "learning_rate": 4.372914494109412e-07, + "loss": 0.74400115, + "num_input_tokens_seen": 284191340, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.17956543, + "step": 13171, + "time_per_iteration": 2.842642307281494 + }, + { + "auxiliary_loss_clip": 0.01402512, + "auxiliary_loss_mlp": 0.01033377, + "balance_loss_clip": 1.24136353, + "balance_loss_mlp": 1.01516199, + "epoch": 0.7919434841424922, + "flos": 33922775877120.0, + "grad_norm": 1.8860067673785397, + "language_loss": 0.68087256, + "learning_rate": 4.370484207842553e-07, + "loss": 0.70523143, + "num_input_tokens_seen": 284212495, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18212891, + "step": 13172, + "time_per_iteration": 4.533926725387573 + }, + { + "auxiliary_loss_clip": 0.01397557, + "auxiliary_loss_mlp": 0.01033922, + "balance_loss_clip": 1.23638105, + "balance_loss_mlp": 1.01458585, + "epoch": 0.7920036073951601, + "flos": 21073338812160.0, + "grad_norm": 2.090175880612048, + "language_loss": 0.79872406, + "learning_rate": 4.3680545142484893e-07, + "loss": 0.82303888, + "num_input_tokens_seen": 284230825, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19335938, + "step": 13173, + "time_per_iteration": 2.897432327270508 + }, + { + "auxiliary_loss_clip": 0.01404878, + "auxiliary_loss_mlp": 0.01033455, + "balance_loss_clip": 1.24240565, + "balance_loss_mlp": 1.01510859, + "epoch": 0.7920637306478281, + "flos": 23665539219840.0, + "grad_norm": 2.0252304779881922, + "language_loss": 0.77689534, + "learning_rate": 4.365625413419365e-07, + "loss": 0.80127859, + "num_input_tokens_seen": 284250365, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.18334961, + "step": 13174, + "time_per_iteration": 2.88288950920105 + }, + { + "auxiliary_loss_clip": 0.01394412, + "auxiliary_loss_mlp": 0.01034756, + "balance_loss_clip": 1.23533607, + "balance_loss_mlp": 1.01611161, + "epoch": 0.792123853900496, + "flos": 27206010288000.0, + "grad_norm": 1.8321112930921333, + "language_loss": 0.72266459, + "learning_rate": 4.363196905447297e-07, + "loss": 0.74695635, + "num_input_tokens_seen": 284269635, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18652344, + "step": 13175, + "time_per_iteration": 2.8836381435394287 + }, + { + "auxiliary_loss_clip": 0.01407785, + "auxiliary_loss_mlp": 0.01034528, + "balance_loss_clip": 1.2461524, + "balance_loss_mlp": 1.01501369, + "epoch": 0.792183977153164, + "flos": 19107925626240.0, + "grad_norm": 2.0657255494935116, + "language_loss": 0.60511816, + "learning_rate": 4.360768990424364e-07, + "loss": 0.62954128, + "num_input_tokens_seen": 284288380, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19506836, + "step": 13176, + "time_per_iteration": 2.8110954761505127 + }, + { + "auxiliary_loss_clip": 0.01392103, + "auxiliary_loss_mlp": 0.01035331, + "balance_loss_clip": 1.23574853, + "balance_loss_mlp": 1.01554251, + "epoch": 0.7922441004058319, + "flos": 17137716491520.0, + "grad_norm": 1.8596129923946556, + "language_loss": 0.74512452, + "learning_rate": 4.3583416684426376e-07, + "loss": 0.76939881, + "num_input_tokens_seen": 284306920, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.19775391, + "step": 13177, + "time_per_iteration": 2.827244281768799 + }, + { + "auxiliary_loss_clip": 0.01404377, + "auxiliary_loss_mlp": 0.01032729, + "balance_loss_clip": 1.2458086, + "balance_loss_mlp": 1.01376271, + "epoch": 0.7923042236585, + "flos": 17830570400640.0, + "grad_norm": 1.8281009577180232, + "language_loss": 0.64802253, + "learning_rate": 4.355914939594174e-07, + "loss": 0.67239356, + "num_input_tokens_seen": 284324700, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18969727, + "step": 13178, + "time_per_iteration": 2.8260326385498047 + }, + { + "auxiliary_loss_clip": 0.01403499, + "auxiliary_loss_mlp": 0.01032601, + "balance_loss_clip": 1.24205625, + "balance_loss_mlp": 1.01429009, + "epoch": 0.7923643469111679, + "flos": 29947356115200.0, + "grad_norm": 2.362481135009082, + "language_loss": 0.69361758, + "learning_rate": 4.3534888039709726e-07, + "loss": 0.7179786, + "num_input_tokens_seen": 284345985, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18322754, + "step": 13179, + "time_per_iteration": 2.9049313068389893 + }, + { + "auxiliary_loss_clip": 0.01393639, + "auxiliary_loss_mlp": 0.01032195, + "balance_loss_clip": 1.23342586, + "balance_loss_mlp": 1.01345503, + "epoch": 0.7924244701638359, + "flos": 22685026999680.0, + "grad_norm": 2.0149103280749907, + "language_loss": 0.74820715, + "learning_rate": 4.3510632616650444e-07, + "loss": 0.77246547, + "num_input_tokens_seen": 284364475, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.1875, + "step": 13180, + "time_per_iteration": 2.854595422744751 + }, + { + "auxiliary_loss_clip": 0.01408467, + "auxiliary_loss_mlp": 0.01033828, + "balance_loss_clip": 1.24522781, + "balance_loss_mlp": 1.01422977, + "epoch": 0.7924845934165038, + "flos": 17977272600960.0, + "grad_norm": 2.294296105556513, + "language_loss": 0.82487768, + "learning_rate": 4.3486383127683646e-07, + "loss": 0.84930062, + "num_input_tokens_seen": 284382125, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19616699, + "step": 13181, + "time_per_iteration": 2.814661741256714 + }, + { + "auxiliary_loss_clip": 0.01388243, + "auxiliary_loss_mlp": 0.01036224, + "balance_loss_clip": 1.22960806, + "balance_loss_mlp": 1.01636398, + "epoch": 0.7925447166691718, + "flos": 23487364621440.0, + "grad_norm": 1.8611054589738132, + "language_loss": 0.78680831, + "learning_rate": 4.346213957372895e-07, + "loss": 0.81105304, + "num_input_tokens_seen": 284401585, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.1986084, + "step": 13182, + "time_per_iteration": 2.849520444869995 + }, + { + "auxiliary_loss_clip": 0.01417278, + "auxiliary_loss_mlp": 0.0103772, + "balance_loss_clip": 1.25096703, + "balance_loss_mlp": 1.01864672, + "epoch": 0.7926048399218397, + "flos": 20456912649600.0, + "grad_norm": 2.221762866295825, + "language_loss": 0.75491172, + "learning_rate": 4.34379019557056e-07, + "loss": 0.77946168, + "num_input_tokens_seen": 284419125, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.1907959, + "step": 13183, + "time_per_iteration": 2.8210959434509277 + }, + { + "auxiliary_loss_clip": 0.01398339, + "auxiliary_loss_mlp": 0.01032323, + "balance_loss_clip": 1.23877013, + "balance_loss_mlp": 1.01352382, + "epoch": 0.7926649631745077, + "flos": 37174231290240.0, + "grad_norm": 1.7071848944929464, + "language_loss": 0.68921286, + "learning_rate": 4.341367027453264e-07, + "loss": 0.71351945, + "num_input_tokens_seen": 284440445, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18811035, + "step": 13184, + "time_per_iteration": 2.9569835662841797 + }, + { + "auxiliary_loss_clip": 0.01413681, + "auxiliary_loss_mlp": 0.0103175, + "balance_loss_clip": 1.25013804, + "balance_loss_mlp": 1.01264024, + "epoch": 0.7927250864271758, + "flos": 17027237393280.0, + "grad_norm": 1.9679779153109054, + "language_loss": 0.71353328, + "learning_rate": 4.338944453112907e-07, + "loss": 0.73798758, + "num_input_tokens_seen": 284459370, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19104004, + "step": 13185, + "time_per_iteration": 2.8615000247955322 + }, + { + "auxiliary_loss_clip": 0.01400075, + "auxiliary_loss_mlp": 0.01031594, + "balance_loss_clip": 1.23778105, + "balance_loss_mlp": 1.01341498, + "epoch": 0.7927852096798437, + "flos": 17758259930880.0, + "grad_norm": 10.31688541981603, + "language_loss": 0.66104364, + "learning_rate": 4.3365224726413375e-07, + "loss": 0.68536037, + "num_input_tokens_seen": 284477525, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.18188477, + "step": 13186, + "time_per_iteration": 2.9377281665802 + }, + { + "auxiliary_loss_clip": 0.013876, + "auxiliary_loss_mlp": 0.01034852, + "balance_loss_clip": 1.2290777, + "balance_loss_mlp": 1.01542068, + "epoch": 0.7928453329325117, + "flos": 23848057319040.0, + "grad_norm": 1.4825015731640327, + "language_loss": 0.77625597, + "learning_rate": 4.334101086130408e-07, + "loss": 0.80048048, + "num_input_tokens_seen": 284496590, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.19445801, + "step": 13187, + "time_per_iteration": 2.886777639389038 + }, + { + "auxiliary_loss_clip": 0.01400319, + "auxiliary_loss_mlp": 0.01033617, + "balance_loss_clip": 1.24077559, + "balance_loss_mlp": 1.01496112, + "epoch": 0.7929054561851796, + "flos": 17463271962240.0, + "grad_norm": 1.999573854177103, + "language_loss": 0.7290529, + "learning_rate": 4.3316802936719334e-07, + "loss": 0.75339228, + "num_input_tokens_seen": 284511470, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18664551, + "step": 13188, + "time_per_iteration": 2.8092637062072754 + }, + { + "auxiliary_loss_clip": 0.01407193, + "auxiliary_loss_mlp": 0.01039315, + "balance_loss_clip": 1.24319744, + "balance_loss_mlp": 1.01914477, + "epoch": 0.7929655794378476, + "flos": 21991358684160.0, + "grad_norm": 2.1513763927081686, + "language_loss": 0.64211547, + "learning_rate": 4.329260095357725e-07, + "loss": 0.66658056, + "num_input_tokens_seen": 284531125, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.20178223, + "step": 13189, + "time_per_iteration": 2.8658435344696045 + }, + { + "auxiliary_loss_clip": 0.01397687, + "auxiliary_loss_mlp": 0.01029583, + "balance_loss_clip": 1.23750854, + "balance_loss_mlp": 1.01042593, + "epoch": 0.7930257026905155, + "flos": 17282608899840.0, + "grad_norm": 1.8065356601772888, + "language_loss": 0.73053253, + "learning_rate": 4.3268404912795307e-07, + "loss": 0.75480527, + "num_input_tokens_seen": 284549340, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19140625, + "step": 13190, + "time_per_iteration": 2.8388171195983887 + }, + { + "auxiliary_loss_clip": 0.01396891, + "auxiliary_loss_mlp": 0.01032212, + "balance_loss_clip": 1.23919296, + "balance_loss_mlp": 1.0142467, + "epoch": 0.7930858259431836, + "flos": 27310064624640.0, + "grad_norm": 2.7094537506629135, + "language_loss": 0.73473883, + "learning_rate": 4.3244214815291166e-07, + "loss": 0.75902981, + "num_input_tokens_seen": 284567060, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.17956543, + "step": 13191, + "time_per_iteration": 2.8924832344055176 + }, + { + "auxiliary_loss_clip": 0.01401804, + "auxiliary_loss_mlp": 0.01038692, + "balance_loss_clip": 1.24108315, + "balance_loss_mlp": 1.01940405, + "epoch": 0.7931459491958515, + "flos": 19872909025920.0, + "grad_norm": 1.7392862851351796, + "language_loss": 0.69570041, + "learning_rate": 4.322003066198219e-07, + "loss": 0.72010541, + "num_input_tokens_seen": 284586600, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19274902, + "step": 13192, + "time_per_iteration": 2.8502604961395264 + }, + { + "auxiliary_loss_clip": 0.01414783, + "auxiliary_loss_mlp": 0.01035304, + "balance_loss_clip": 1.25192881, + "balance_loss_mlp": 1.01631439, + "epoch": 0.7932060724485195, + "flos": 23157058446720.0, + "grad_norm": 1.9322576501059043, + "language_loss": 0.7556476, + "learning_rate": 4.3195852453785274e-07, + "loss": 0.78014845, + "num_input_tokens_seen": 284605715, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.18994141, + "step": 13193, + "time_per_iteration": 2.939967393875122 + }, + { + "auxiliary_loss_clip": 0.01393939, + "auxiliary_loss_mlp": 0.01035538, + "balance_loss_clip": 1.2340486, + "balance_loss_mlp": 1.01543915, + "epoch": 0.7932661957011874, + "flos": 29946903667200.0, + "grad_norm": 1.644386691094131, + "language_loss": 0.72673452, + "learning_rate": 4.317168019161741e-07, + "loss": 0.75102925, + "num_input_tokens_seen": 284628540, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.20092773, + "step": 13194, + "time_per_iteration": 2.8973097801208496 + }, + { + "auxiliary_loss_clip": 0.01414286, + "auxiliary_loss_mlp": 0.01033588, + "balance_loss_clip": 1.24880528, + "balance_loss_mlp": 1.01381063, + "epoch": 0.7933263189538554, + "flos": 22567806426240.0, + "grad_norm": 2.5712499962936546, + "language_loss": 0.70860493, + "learning_rate": 4.314751387639517e-07, + "loss": 0.73308367, + "num_input_tokens_seen": 284646040, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.19787598, + "step": 13195, + "time_per_iteration": 2.846968412399292 + }, + { + "auxiliary_loss_clip": 0.01394902, + "auxiliary_loss_mlp": 0.01032221, + "balance_loss_clip": 1.23511231, + "balance_loss_mlp": 1.01311159, + "epoch": 0.7933864422065233, + "flos": 25488774685440.0, + "grad_norm": 1.4688099150118168, + "language_loss": 0.78165507, + "learning_rate": 4.3123353509034844e-07, + "loss": 0.80592626, + "num_input_tokens_seen": 284665110, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19104004, + "step": 13196, + "time_per_iteration": 4.371555805206299 + }, + { + "auxiliary_loss_clip": 0.01417405, + "auxiliary_loss_mlp": 0.01037769, + "balance_loss_clip": 1.25412297, + "balance_loss_mlp": 1.01852846, + "epoch": 0.7934465654591913, + "flos": 33596089286400.0, + "grad_norm": 1.9101247984088534, + "language_loss": 0.69080579, + "learning_rate": 4.309919909045268e-07, + "loss": 0.71535754, + "num_input_tokens_seen": 284686515, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.19250488, + "step": 13197, + "time_per_iteration": 2.9264822006225586 + }, + { + "auxiliary_loss_clip": 0.01397776, + "auxiliary_loss_mlp": 0.01030475, + "balance_loss_clip": 1.23803759, + "balance_loss_mlp": 1.0112468, + "epoch": 0.7935066887118594, + "flos": 31445443313280.0, + "grad_norm": 1.7005880860943656, + "language_loss": 0.66152132, + "learning_rate": 4.30750506215646e-07, + "loss": 0.68580383, + "num_input_tokens_seen": 284707300, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19226074, + "step": 13198, + "time_per_iteration": 2.9265859127044678 + }, + { + "auxiliary_loss_clip": 0.01400807, + "auxiliary_loss_mlp": 0.01037452, + "balance_loss_clip": 1.23846543, + "balance_loss_mlp": 1.01669741, + "epoch": 0.7935668119645273, + "flos": 14690408492160.0, + "grad_norm": 2.039838745056827, + "language_loss": 0.74258578, + "learning_rate": 4.30509081032864e-07, + "loss": 0.76696837, + "num_input_tokens_seen": 284723545, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.2076416, + "step": 13199, + "time_per_iteration": 2.827786684036255 + }, + { + "auxiliary_loss_clip": 0.01412847, + "auxiliary_loss_mlp": 0.01035106, + "balance_loss_clip": 1.25139618, + "balance_loss_mlp": 1.01610398, + "epoch": 0.7936269352171953, + "flos": 18013269479040.0, + "grad_norm": 1.841506396335372, + "language_loss": 0.81233013, + "learning_rate": 4.302677153653349e-07, + "loss": 0.83680964, + "num_input_tokens_seen": 284742650, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19006348, + "step": 13200, + "time_per_iteration": 2.825831174850464 + }, + { + "auxiliary_loss_clip": 0.01390297, + "auxiliary_loss_mlp": 0.01033671, + "balance_loss_clip": 1.23537064, + "balance_loss_mlp": 1.01422834, + "epoch": 0.7936870584698632, + "flos": 18889546383360.0, + "grad_norm": 1.6423996170618573, + "language_loss": 0.78093421, + "learning_rate": 4.3002640922221077e-07, + "loss": 0.80517387, + "num_input_tokens_seen": 284760955, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.19445801, + "step": 13201, + "time_per_iteration": 2.834867477416992 + }, + { + "auxiliary_loss_clip": 0.01395213, + "auxiliary_loss_mlp": 0.01037465, + "balance_loss_clip": 1.23586833, + "balance_loss_mlp": 1.01820087, + "epoch": 0.7937471817225312, + "flos": 23377247481600.0, + "grad_norm": 1.6072687672762493, + "language_loss": 0.67429101, + "learning_rate": 4.2978516261264296e-07, + "loss": 0.69861782, + "num_input_tokens_seen": 284780745, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19262695, + "step": 13202, + "time_per_iteration": 2.8812999725341797 + }, + { + "auxiliary_loss_clip": 0.01402189, + "auxiliary_loss_mlp": 0.01033861, + "balance_loss_clip": 1.24027252, + "balance_loss_mlp": 1.01507354, + "epoch": 0.7938073049751991, + "flos": 22684665041280.0, + "grad_norm": 1.9616246797932737, + "language_loss": 0.75297928, + "learning_rate": 4.2954397554577884e-07, + "loss": 0.77733982, + "num_input_tokens_seen": 284799000, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18798828, + "step": 13203, + "time_per_iteration": 4.264508247375488 + }, + { + "auxiliary_loss_clip": 0.01407366, + "auxiliary_loss_mlp": 0.01034564, + "balance_loss_clip": 1.24497986, + "balance_loss_mlp": 1.0155977, + "epoch": 0.7938674282278672, + "flos": 22860305930880.0, + "grad_norm": 1.9545093505133189, + "language_loss": 0.6720528, + "learning_rate": 4.293028480307643e-07, + "loss": 0.69647205, + "num_input_tokens_seen": 284817450, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.18981934, + "step": 13204, + "time_per_iteration": 2.8519256114959717 + }, + { + "auxiliary_loss_clip": 0.01390921, + "auxiliary_loss_mlp": 0.0103024, + "balance_loss_clip": 1.23128951, + "balance_loss_mlp": 1.01202476, + "epoch": 0.7939275514805351, + "flos": 27022904006400.0, + "grad_norm": 1.3627280460936382, + "language_loss": 0.79843754, + "learning_rate": 4.290617800767438e-07, + "loss": 0.82264912, + "num_input_tokens_seen": 284838865, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18225098, + "step": 13205, + "time_per_iteration": 2.9623944759368896 + }, + { + "auxiliary_loss_clip": 0.01396951, + "auxiliary_loss_mlp": 0.01035479, + "balance_loss_clip": 1.23822927, + "balance_loss_mlp": 1.01557064, + "epoch": 0.7939876747332031, + "flos": 21152888449920.0, + "grad_norm": 1.779260183489354, + "language_loss": 0.78298801, + "learning_rate": 4.28820771692858e-07, + "loss": 0.80731231, + "num_input_tokens_seen": 284857975, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.19897461, + "step": 13206, + "time_per_iteration": 2.945782423019409 + }, + { + "auxiliary_loss_clip": 0.01416442, + "auxiliary_loss_mlp": 0.01033281, + "balance_loss_clip": 1.25192726, + "balance_loss_mlp": 1.01253843, + "epoch": 0.794047797985871, + "flos": 23298014557440.0, + "grad_norm": 2.1726778832580966, + "language_loss": 0.79911202, + "learning_rate": 4.285798228882456e-07, + "loss": 0.82360923, + "num_input_tokens_seen": 284877145, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.20715332, + "step": 13207, + "time_per_iteration": 4.3649115562438965 + }, + { + "auxiliary_loss_clip": 0.014017, + "auxiliary_loss_mlp": 0.01039277, + "balance_loss_clip": 1.24150646, + "balance_loss_mlp": 1.02017951, + "epoch": 0.794107921238539, + "flos": 24618786808320.0, + "grad_norm": 1.7916487385926858, + "language_loss": 0.84398955, + "learning_rate": 4.2833893367204375e-07, + "loss": 0.86839926, + "num_input_tokens_seen": 284895560, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19104004, + "step": 13208, + "time_per_iteration": 2.9645464420318604 + }, + { + "auxiliary_loss_clip": 0.01184335, + "auxiliary_loss_mlp": 0.01026061, + "balance_loss_clip": 1.09130323, + "balance_loss_mlp": 1.00355399, + "epoch": 0.7941680444912069, + "flos": 64126200153600.0, + "grad_norm": 0.7225737517523693, + "language_loss": 0.58323455, + "learning_rate": 4.280981040533875e-07, + "loss": 0.60533845, + "num_input_tokens_seen": 284963135, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.22460938, + "step": 13209, + "time_per_iteration": 3.454373359680176 + }, + { + "auxiliary_loss_clip": 0.01420409, + "auxiliary_loss_mlp": 0.01036939, + "balance_loss_clip": 1.25391078, + "balance_loss_mlp": 1.01656592, + "epoch": 0.794228167743875, + "flos": 24399412179840.0, + "grad_norm": 2.2981187421525173, + "language_loss": 0.64016068, + "learning_rate": 4.2785733404140825e-07, + "loss": 0.66473413, + "num_input_tokens_seen": 284981755, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.20385742, + "step": 13210, + "time_per_iteration": 2.8909783363342285 + }, + { + "auxiliary_loss_clip": 0.01401812, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.24311686, + "balance_loss_mlp": 1.01520348, + "epoch": 0.794288290996543, + "flos": 28524339319680.0, + "grad_norm": 1.701178463083485, + "language_loss": 0.69494903, + "learning_rate": 4.2761662364523676e-07, + "loss": 0.71931159, + "num_input_tokens_seen": 285003060, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.19226074, + "step": 13211, + "time_per_iteration": 2.925614833831787 + }, + { + "auxiliary_loss_clip": 0.01406839, + "auxiliary_loss_mlp": 0.01037016, + "balance_loss_clip": 1.24249315, + "balance_loss_mlp": 1.01669097, + "epoch": 0.7943484142492109, + "flos": 25933315276800.0, + "grad_norm": 2.052002291558341, + "language_loss": 0.73279762, + "learning_rate": 4.2737597287400074e-07, + "loss": 0.75723624, + "num_input_tokens_seen": 285021640, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.20336914, + "step": 13212, + "time_per_iteration": 2.9228127002716064 + }, + { + "auxiliary_loss_clip": 0.01394043, + "auxiliary_loss_mlp": 0.01030094, + "balance_loss_clip": 1.23897028, + "balance_loss_mlp": 1.01096082, + "epoch": 0.7944085375018789, + "flos": 23925932899200.0, + "grad_norm": 1.6840048480512937, + "language_loss": 0.81189346, + "learning_rate": 4.271353817368246e-07, + "loss": 0.83613485, + "num_input_tokens_seen": 285040490, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.19128418, + "step": 13213, + "time_per_iteration": 2.866795301437378 + }, + { + "auxiliary_loss_clip": 0.01412572, + "auxiliary_loss_mlp": 0.01036382, + "balance_loss_clip": 1.24875236, + "balance_loss_mlp": 1.01656902, + "epoch": 0.7944686607545468, + "flos": 20239574037120.0, + "grad_norm": 2.0142548273966536, + "language_loss": 0.67880762, + "learning_rate": 4.268948502428327e-07, + "loss": 0.70329714, + "num_input_tokens_seen": 285059270, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19799805, + "step": 13214, + "time_per_iteration": 2.828404426574707 + }, + { + "auxiliary_loss_clip": 0.01388405, + "auxiliary_loss_mlp": 0.01032573, + "balance_loss_clip": 1.23167729, + "balance_loss_mlp": 1.01441693, + "epoch": 0.7945287840072148, + "flos": 21990996725760.0, + "grad_norm": 1.7219054150412112, + "language_loss": 0.73300099, + "learning_rate": 4.2665437840114535e-07, + "loss": 0.75721073, + "num_input_tokens_seen": 285075390, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.18151855, + "step": 13215, + "time_per_iteration": 2.8291025161743164 + }, + { + "auxiliary_loss_clip": 0.01398246, + "auxiliary_loss_mlp": 0.01031719, + "balance_loss_clip": 1.24046552, + "balance_loss_mlp": 1.01218057, + "epoch": 0.7945889072598827, + "flos": 26409373511040.0, + "grad_norm": 1.5518921628164828, + "language_loss": 0.79656738, + "learning_rate": 4.2641396622088253e-07, + "loss": 0.820867, + "num_input_tokens_seen": 285096290, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.1953125, + "step": 13216, + "time_per_iteration": 2.9038772583007812 + }, + { + "auxiliary_loss_clip": 0.01410338, + "auxiliary_loss_mlp": 0.01031407, + "balance_loss_clip": 1.24876857, + "balance_loss_mlp": 1.01261973, + "epoch": 0.7946490305125508, + "flos": 25820800162560.0, + "grad_norm": 1.7092256504923542, + "language_loss": 0.74444008, + "learning_rate": 4.261736137111598e-07, + "loss": 0.76885748, + "num_input_tokens_seen": 285116020, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18798828, + "step": 13217, + "time_per_iteration": 2.8707785606384277 + }, + { + "auxiliary_loss_clip": 0.01395524, + "auxiliary_loss_mlp": 0.01035599, + "balance_loss_clip": 1.238675, + "balance_loss_mlp": 1.01654935, + "epoch": 0.7947091537652187, + "flos": 15969437775360.0, + "grad_norm": 1.7249471626209976, + "language_loss": 0.74181449, + "learning_rate": 4.259333208810907e-07, + "loss": 0.76612568, + "num_input_tokens_seen": 285133510, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.19055176, + "step": 13218, + "time_per_iteration": 2.809525728225708 + }, + { + "auxiliary_loss_clip": 0.01418024, + "auxiliary_loss_mlp": 0.01034117, + "balance_loss_clip": 1.2531594, + "balance_loss_mlp": 1.01473355, + "epoch": 0.7947692770178867, + "flos": 18597001633920.0, + "grad_norm": 1.955040649114494, + "language_loss": 0.84067714, + "learning_rate": 4.2569308773978817e-07, + "loss": 0.86519861, + "num_input_tokens_seen": 285151690, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19384766, + "step": 13219, + "time_per_iteration": 2.8192687034606934 + }, + { + "auxiliary_loss_clip": 0.01421688, + "auxiliary_loss_mlp": 0.01034112, + "balance_loss_clip": 1.25615549, + "balance_loss_mlp": 1.01447845, + "epoch": 0.7948294002705546, + "flos": 20450171174400.0, + "grad_norm": 1.7637282539829946, + "language_loss": 0.76430631, + "learning_rate": 4.2545291429636123e-07, + "loss": 0.78886431, + "num_input_tokens_seen": 285170485, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19628906, + "step": 13220, + "time_per_iteration": 2.8452131748199463 + }, + { + "auxiliary_loss_clip": 0.01421163, + "auxiliary_loss_mlp": 0.01037565, + "balance_loss_clip": 1.25567639, + "balance_loss_mlp": 1.01840794, + "epoch": 0.7948895235232226, + "flos": 38195536337280.0, + "grad_norm": 1.6277391725459955, + "language_loss": 0.72934854, + "learning_rate": 4.252128005599176e-07, + "loss": 0.75393581, + "num_input_tokens_seen": 285191050, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19165039, + "step": 13221, + "time_per_iteration": 3.0420186519622803 + }, + { + "auxiliary_loss_clip": 0.01402991, + "auxiliary_loss_mlp": 0.01033673, + "balance_loss_clip": 1.24548435, + "balance_loss_mlp": 1.0146358, + "epoch": 0.7949496467758905, + "flos": 15568223719680.0, + "grad_norm": 2.0207509239734303, + "language_loss": 0.76240504, + "learning_rate": 4.249727465395634e-07, + "loss": 0.78677171, + "num_input_tokens_seen": 285208750, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.19042969, + "step": 13222, + "time_per_iteration": 2.802612781524658 + }, + { + "auxiliary_loss_clip": 0.01182224, + "auxiliary_loss_mlp": 0.01022005, + "balance_loss_clip": 1.09122252, + "balance_loss_mlp": 0.9984495, + "epoch": 0.7950097700285585, + "flos": 70926252716160.0, + "grad_norm": 0.7728219635981586, + "language_loss": 0.67168772, + "learning_rate": 4.247327522443993e-07, + "loss": 0.69373, + "num_input_tokens_seen": 285264605, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.23535156, + "step": 13223, + "time_per_iteration": 3.1903345584869385 + }, + { + "auxiliary_loss_clip": 0.01397485, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.23614466, + "balance_loss_mlp": 1.01689315, + "epoch": 0.7950698932812266, + "flos": 23962246490880.0, + "grad_norm": 1.6633361261182191, + "language_loss": 0.72039998, + "learning_rate": 4.2449281768352717e-07, + "loss": 0.74473643, + "num_input_tokens_seen": 285283940, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19250488, + "step": 13224, + "time_per_iteration": 2.898922920227051 + }, + { + "auxiliary_loss_clip": 0.01186334, + "auxiliary_loss_mlp": 0.0102556, + "balance_loss_clip": 1.09301329, + "balance_loss_mlp": 1.0039115, + "epoch": 0.7951300165338945, + "flos": 60309771194880.0, + "grad_norm": 0.6702092418642943, + "language_loss": 0.55040693, + "learning_rate": 4.2425294286604527e-07, + "loss": 0.57252586, + "num_input_tokens_seen": 285349525, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.21679688, + "step": 13225, + "time_per_iteration": 3.3522980213165283 + }, + { + "auxiliary_loss_clip": 0.0138591, + "auxiliary_loss_mlp": 0.01029846, + "balance_loss_clip": 1.22908032, + "balance_loss_mlp": 1.01135683, + "epoch": 0.7951901397865625, + "flos": 22828924022400.0, + "grad_norm": 1.8804770480151354, + "language_loss": 0.65793741, + "learning_rate": 4.2401312780105034e-07, + "loss": 0.68209499, + "num_input_tokens_seen": 285367355, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.18493652, + "step": 13226, + "time_per_iteration": 2.9450154304504395 + }, + { + "auxiliary_loss_clip": 0.0141149, + "auxiliary_loss_mlp": 0.01039701, + "balance_loss_clip": 1.24934304, + "balance_loss_mlp": 1.02037668, + "epoch": 0.7952502630392304, + "flos": 35708204672640.0, + "grad_norm": 2.188620469654787, + "language_loss": 0.70580125, + "learning_rate": 4.237733724976349e-07, + "loss": 0.73031318, + "num_input_tokens_seen": 285386190, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19311523, + "step": 13227, + "time_per_iteration": 2.961012601852417 + }, + { + "auxiliary_loss_clip": 0.01396341, + "auxiliary_loss_mlp": 0.010354, + "balance_loss_clip": 1.23811364, + "balance_loss_mlp": 1.01648211, + "epoch": 0.7953103862918984, + "flos": 25640861016960.0, + "grad_norm": 2.024395469589045, + "language_loss": 0.70730519, + "learning_rate": 4.2353367696489184e-07, + "loss": 0.7316227, + "num_input_tokens_seen": 285406150, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18933105, + "step": 13228, + "time_per_iteration": 2.9083805084228516 + }, + { + "auxiliary_loss_clip": 0.01411004, + "auxiliary_loss_mlp": 0.01036731, + "balance_loss_clip": 1.24759865, + "balance_loss_mlp": 1.01774132, + "epoch": 0.7953705095445663, + "flos": 40566733303680.0, + "grad_norm": 1.4588173432157578, + "language_loss": 0.7151854, + "learning_rate": 4.232940412119095e-07, + "loss": 0.73966277, + "num_input_tokens_seen": 285429900, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.18981934, + "step": 13229, + "time_per_iteration": 3.0630245208740234 + }, + { + "auxiliary_loss_clip": 0.01422885, + "auxiliary_loss_mlp": 0.01034025, + "balance_loss_clip": 1.25694394, + "balance_loss_mlp": 1.0155592, + "epoch": 0.7954306327972344, + "flos": 27648243394560.0, + "grad_norm": 2.0472774328758256, + "language_loss": 0.72387516, + "learning_rate": 4.2305446524777457e-07, + "loss": 0.74844426, + "num_input_tokens_seen": 285452555, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.18481445, + "step": 13230, + "time_per_iteration": 2.9199440479278564 + }, + { + "auxiliary_loss_clip": 0.01185271, + "auxiliary_loss_mlp": 0.01018789, + "balance_loss_clip": 1.09497464, + "balance_loss_mlp": 0.99818945, + "epoch": 0.7954907560499023, + "flos": 59537340385920.0, + "grad_norm": 0.8918167198737608, + "language_loss": 0.63586503, + "learning_rate": 4.2281494908157247e-07, + "loss": 0.65790558, + "num_input_tokens_seen": 285515700, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.20605469, + "step": 13231, + "time_per_iteration": 4.875218152999878 + }, + { + "auxiliary_loss_clip": 0.01398469, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.23853719, + "balance_loss_mlp": 1.01590061, + "epoch": 0.7955508793025703, + "flos": 20130135569280.0, + "grad_norm": 1.5484346043210993, + "language_loss": 0.69868237, + "learning_rate": 4.2257549272238566e-07, + "loss": 0.72301567, + "num_input_tokens_seen": 285533910, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.1895752, + "step": 13232, + "time_per_iteration": 2.8007094860076904 + }, + { + "auxiliary_loss_clip": 0.01396758, + "auxiliary_loss_mlp": 0.01028791, + "balance_loss_clip": 1.23744047, + "balance_loss_mlp": 1.01046824, + "epoch": 0.7956110025552382, + "flos": 26516594983680.0, + "grad_norm": 1.5926178486358917, + "language_loss": 0.78725058, + "learning_rate": 4.223360961792952e-07, + "loss": 0.81150609, + "num_input_tokens_seen": 285554080, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18334961, + "step": 13233, + "time_per_iteration": 2.8794307708740234 + }, + { + "auxiliary_loss_clip": 0.01413425, + "auxiliary_loss_mlp": 0.01036236, + "balance_loss_clip": 1.25088644, + "balance_loss_mlp": 1.01685286, + "epoch": 0.7956711258079062, + "flos": 22575814755840.0, + "grad_norm": 1.870837900095762, + "language_loss": 0.7901178, + "learning_rate": 4.220967594613769e-07, + "loss": 0.81461436, + "num_input_tokens_seen": 285572325, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19384766, + "step": 13234, + "time_per_iteration": 2.8657515048980713 + }, + { + "auxiliary_loss_clip": 0.01395766, + "auxiliary_loss_mlp": 0.01030043, + "balance_loss_clip": 1.23649096, + "balance_loss_mlp": 1.01226902, + "epoch": 0.7957312490605741, + "flos": 17386572746880.0, + "grad_norm": 1.8780614939952491, + "language_loss": 0.71234792, + "learning_rate": 4.218574825777077e-07, + "loss": 0.736606, + "num_input_tokens_seen": 285589770, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.17773438, + "step": 13235, + "time_per_iteration": 2.8679332733154297 + }, + { + "auxiliary_loss_clip": 0.01405271, + "auxiliary_loss_mlp": 0.01032778, + "balance_loss_clip": 1.2432363, + "balance_loss_mlp": 1.01259565, + "epoch": 0.7957913723132422, + "flos": 22501468270080.0, + "grad_norm": 1.7680316176325024, + "language_loss": 0.68436146, + "learning_rate": 4.2161826553736145e-07, + "loss": 0.70874196, + "num_input_tokens_seen": 285610065, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.20178223, + "step": 13236, + "time_per_iteration": 2.913508415222168 + }, + { + "auxiliary_loss_clip": 0.01393211, + "auxiliary_loss_mlp": 0.01030145, + "balance_loss_clip": 1.23367381, + "balance_loss_mlp": 1.01159573, + "epoch": 0.7958514955659101, + "flos": 22648351449600.0, + "grad_norm": 1.7280168756465975, + "language_loss": 0.7573992, + "learning_rate": 4.2137910834940826e-07, + "loss": 0.78163278, + "num_input_tokens_seen": 285628480, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18554688, + "step": 13237, + "time_per_iteration": 2.849341630935669 + }, + { + "auxiliary_loss_clip": 0.01407541, + "auxiliary_loss_mlp": 0.01038355, + "balance_loss_clip": 1.24599612, + "balance_loss_mlp": 1.0184356, + "epoch": 0.7959116188185781, + "flos": 20713958213760.0, + "grad_norm": 1.9014453575501817, + "language_loss": 0.72672468, + "learning_rate": 4.211400110229175e-07, + "loss": 0.75118363, + "num_input_tokens_seen": 285647805, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19909668, + "step": 13238, + "time_per_iteration": 4.229424238204956 + }, + { + "auxiliary_loss_clip": 0.01409904, + "auxiliary_loss_mlp": 0.01029185, + "balance_loss_clip": 1.24572587, + "balance_loss_mlp": 1.01168537, + "epoch": 0.7959717420712461, + "flos": 19033669630080.0, + "grad_norm": 2.0546732464113133, + "language_loss": 0.74735898, + "learning_rate": 4.2090097356695684e-07, + "loss": 0.77174985, + "num_input_tokens_seen": 285665505, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.17504883, + "step": 13239, + "time_per_iteration": 2.808567762374878 + }, + { + "auxiliary_loss_clip": 0.01405062, + "auxiliary_loss_mlp": 0.01033867, + "balance_loss_clip": 1.24230301, + "balance_loss_mlp": 1.01465046, + "epoch": 0.796031865323914, + "flos": 26367087605760.0, + "grad_norm": 3.221169611220296, + "language_loss": 0.70325577, + "learning_rate": 4.2066199599058814e-07, + "loss": 0.72764504, + "num_input_tokens_seen": 285685855, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.1920166, + "step": 13240, + "time_per_iteration": 2.8738486766815186 + }, + { + "auxiliary_loss_clip": 0.01187707, + "auxiliary_loss_mlp": 0.01017627, + "balance_loss_clip": 1.09629679, + "balance_loss_mlp": 0.9950245, + "epoch": 0.796091988576582, + "flos": 62096874048000.0, + "grad_norm": 0.8885041855905748, + "language_loss": 0.58782405, + "learning_rate": 4.2042307830287526e-07, + "loss": 0.60987735, + "num_input_tokens_seen": 285735710, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.22558594, + "step": 13241, + "time_per_iteration": 3.1371231079101562 + }, + { + "auxiliary_loss_clip": 0.01411643, + "auxiliary_loss_mlp": 0.01035319, + "balance_loss_clip": 1.25100815, + "balance_loss_mlp": 1.01712775, + "epoch": 0.7961521118292499, + "flos": 39034594753920.0, + "grad_norm": 4.131947185764265, + "language_loss": 0.65623599, + "learning_rate": 4.201842205128772e-07, + "loss": 0.68070567, + "num_input_tokens_seen": 285757045, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18188477, + "step": 13242, + "time_per_iteration": 3.0218987464904785 + }, + { + "auxiliary_loss_clip": 0.0140678, + "auxiliary_loss_mlp": 0.01037504, + "balance_loss_clip": 1.2445246, + "balance_loss_mlp": 1.01839447, + "epoch": 0.796212235081918, + "flos": 21772979441280.0, + "grad_norm": 1.7332283403050783, + "language_loss": 0.76824605, + "learning_rate": 4.199454226296526e-07, + "loss": 0.79268891, + "num_input_tokens_seen": 285776050, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19116211, + "step": 13243, + "time_per_iteration": 4.4210405349731445 + }, + { + "auxiliary_loss_clip": 0.01402089, + "auxiliary_loss_mlp": 0.01034296, + "balance_loss_clip": 1.23929465, + "balance_loss_mlp": 1.01392341, + "epoch": 0.7962723583345859, + "flos": 21188794838400.0, + "grad_norm": 2.291292089819298, + "language_loss": 0.7970767, + "learning_rate": 4.1970668466225565e-07, + "loss": 0.82144058, + "num_input_tokens_seen": 285796830, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.20373535, + "step": 13244, + "time_per_iteration": 2.8988592624664307 + }, + { + "auxiliary_loss_clip": 0.01404974, + "auxiliary_loss_mlp": 0.01033267, + "balance_loss_clip": 1.24122405, + "balance_loss_mlp": 1.01469398, + "epoch": 0.7963324815872539, + "flos": 17137445022720.0, + "grad_norm": 2.417821148948565, + "language_loss": 0.6985274, + "learning_rate": 4.1946800661973934e-07, + "loss": 0.72290981, + "num_input_tokens_seen": 285814755, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.18591309, + "step": 13245, + "time_per_iteration": 2.874469041824341 + }, + { + "auxiliary_loss_clip": 0.01405238, + "auxiliary_loss_mlp": 0.01037641, + "balance_loss_clip": 1.24390292, + "balance_loss_mlp": 1.0187937, + "epoch": 0.7963926048399218, + "flos": 21407400305280.0, + "grad_norm": 1.391528315711335, + "language_loss": 0.79606742, + "learning_rate": 4.192293885111549e-07, + "loss": 0.8204962, + "num_input_tokens_seen": 285834255, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18847656, + "step": 13246, + "time_per_iteration": 2.8558104038238525 + }, + { + "auxiliary_loss_clip": 0.01412459, + "auxiliary_loss_mlp": 0.01039004, + "balance_loss_clip": 1.24821126, + "balance_loss_mlp": 1.01987052, + "epoch": 0.7964527280925898, + "flos": 25193062800000.0, + "grad_norm": 1.9368216300400465, + "language_loss": 0.66860163, + "learning_rate": 4.1899083034555007e-07, + "loss": 0.69311631, + "num_input_tokens_seen": 285853540, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.19128418, + "step": 13247, + "time_per_iteration": 2.864593267440796 + }, + { + "auxiliary_loss_clip": 0.01401426, + "auxiliary_loss_mlp": 0.01033106, + "balance_loss_clip": 1.24315238, + "balance_loss_mlp": 1.01453292, + "epoch": 0.7965128513452577, + "flos": 27027157017600.0, + "grad_norm": 1.9813205772606521, + "language_loss": 0.72694278, + "learning_rate": 4.1875233213197123e-07, + "loss": 0.75128812, + "num_input_tokens_seen": 285872705, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.18579102, + "step": 13248, + "time_per_iteration": 2.9105472564697266 + }, + { + "auxiliary_loss_clip": 0.01412747, + "auxiliary_loss_mlp": 0.01034353, + "balance_loss_clip": 1.24760914, + "balance_loss_mlp": 1.01468325, + "epoch": 0.7965729745979258, + "flos": 24428667582720.0, + "grad_norm": 2.0671284721643066, + "language_loss": 0.7641809, + "learning_rate": 4.1851389387946255e-07, + "loss": 0.78865188, + "num_input_tokens_seen": 285890290, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19677734, + "step": 13249, + "time_per_iteration": 2.842820882797241 + }, + { + "auxiliary_loss_clip": 0.01398635, + "auxiliary_loss_mlp": 0.010317, + "balance_loss_clip": 1.23889685, + "balance_loss_mlp": 1.01322269, + "epoch": 0.7966330978505937, + "flos": 18848844046080.0, + "grad_norm": 3.1168721366569816, + "language_loss": 0.62194723, + "learning_rate": 4.1827551559706674e-07, + "loss": 0.64625061, + "num_input_tokens_seen": 285909190, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18493652, + "step": 13250, + "time_per_iteration": 2.862680435180664 + }, + { + "auxiliary_loss_clip": 0.01397947, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.23784089, + "balance_loss_mlp": 1.01095164, + "epoch": 0.7966932211032617, + "flos": 13160260713600.0, + "grad_norm": 5.950341315173901, + "language_loss": 0.73937345, + "learning_rate": 4.180371972938206e-07, + "loss": 0.76365435, + "num_input_tokens_seen": 285927570, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19189453, + "step": 13251, + "time_per_iteration": 2.94284987449646 + }, + { + "auxiliary_loss_clip": 0.0141653, + "auxiliary_loss_mlp": 0.01037786, + "balance_loss_clip": 1.25211668, + "balance_loss_mlp": 1.01622128, + "epoch": 0.7967533443559297, + "flos": 23959893761280.0, + "grad_norm": 1.7336423675468802, + "language_loss": 0.73754013, + "learning_rate": 4.177989389787624e-07, + "loss": 0.76208329, + "num_input_tokens_seen": 285945810, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.21569824, + "step": 13252, + "time_per_iteration": 2.849231481552124 + }, + { + "auxiliary_loss_clip": 0.0139357, + "auxiliary_loss_mlp": 0.01035141, + "balance_loss_clip": 1.23594499, + "balance_loss_mlp": 1.01598418, + "epoch": 0.7968134676085976, + "flos": 30379952079360.0, + "grad_norm": 1.7973329785108136, + "language_loss": 0.67353249, + "learning_rate": 4.175607406609278e-07, + "loss": 0.69781959, + "num_input_tokens_seen": 285964235, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.19152832, + "step": 13253, + "time_per_iteration": 2.9045536518096924 + }, + { + "auxiliary_loss_clip": 0.01411506, + "auxiliary_loss_mlp": 0.01034814, + "balance_loss_clip": 1.24890065, + "balance_loss_mlp": 1.01540637, + "epoch": 0.7968735908612656, + "flos": 23085200424960.0, + "grad_norm": 1.5761818291393073, + "language_loss": 0.68036878, + "learning_rate": 4.1732260234934767e-07, + "loss": 0.70483196, + "num_input_tokens_seen": 285983710, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19421387, + "step": 13254, + "time_per_iteration": 2.8448596000671387 + }, + { + "auxiliary_loss_clip": 0.01418212, + "auxiliary_loss_mlp": 0.0103998, + "balance_loss_clip": 1.2564069, + "balance_loss_mlp": 1.02128768, + "epoch": 0.7969337141139335, + "flos": 23591599937280.0, + "grad_norm": 1.800961943067236, + "language_loss": 0.70118439, + "learning_rate": 4.1708452405305314e-07, + "loss": 0.7257663, + "num_input_tokens_seen": 286003425, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18688965, + "step": 13255, + "time_per_iteration": 2.897002696990967 + }, + { + "auxiliary_loss_clip": 0.01399022, + "auxiliary_loss_mlp": 0.01034012, + "balance_loss_clip": 1.24006164, + "balance_loss_mlp": 1.01514125, + "epoch": 0.7969938373666016, + "flos": 19765416084480.0, + "grad_norm": 1.8944943288054252, + "language_loss": 0.79627419, + "learning_rate": 4.168465057810733e-07, + "loss": 0.8206045, + "num_input_tokens_seen": 286020130, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18884277, + "step": 13256, + "time_per_iteration": 2.969757318496704 + }, + { + "auxiliary_loss_clip": 0.01408854, + "auxiliary_loss_mlp": 0.01031679, + "balance_loss_clip": 1.24612904, + "balance_loss_mlp": 1.01261735, + "epoch": 0.7970539606192695, + "flos": 24144764590080.0, + "grad_norm": 1.598665652955987, + "language_loss": 0.66440415, + "learning_rate": 4.166085475424315e-07, + "loss": 0.68880951, + "num_input_tokens_seen": 286040230, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19067383, + "step": 13257, + "time_per_iteration": 2.8490302562713623 + }, + { + "auxiliary_loss_clip": 0.01434841, + "auxiliary_loss_mlp": 0.01035336, + "balance_loss_clip": 1.26830828, + "balance_loss_mlp": 1.01689422, + "epoch": 0.7971140838719375, + "flos": 17977860783360.0, + "grad_norm": 3.236489136504159, + "language_loss": 0.73731267, + "learning_rate": 4.163706493461523e-07, + "loss": 0.76201439, + "num_input_tokens_seen": 286059475, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.18457031, + "step": 13258, + "time_per_iteration": 2.8159732818603516 + }, + { + "auxiliary_loss_clip": 0.01404991, + "auxiliary_loss_mlp": 0.01033192, + "balance_loss_clip": 1.24158025, + "balance_loss_mlp": 1.01414251, + "epoch": 0.7971742071246054, + "flos": 19178290569600.0, + "grad_norm": 1.7377220730474514, + "language_loss": 0.69556797, + "learning_rate": 4.1613281120125655e-07, + "loss": 0.71994984, + "num_input_tokens_seen": 286077820, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19042969, + "step": 13259, + "time_per_iteration": 2.8283636569976807 + }, + { + "auxiliary_loss_clip": 0.01399275, + "auxiliary_loss_mlp": 0.01031215, + "balance_loss_clip": 1.2403295, + "balance_loss_mlp": 1.01282096, + "epoch": 0.7972343303772734, + "flos": 27137409891840.0, + "grad_norm": 1.622953234984002, + "language_loss": 0.74181032, + "learning_rate": 4.158950331167641e-07, + "loss": 0.76611519, + "num_input_tokens_seen": 286097285, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18395996, + "step": 13260, + "time_per_iteration": 2.8672029972076416 + }, + { + "auxiliary_loss_clip": 0.01390697, + "auxiliary_loss_mlp": 0.01032216, + "balance_loss_clip": 1.23162365, + "balance_loss_mlp": 1.01400137, + "epoch": 0.7972944536299413, + "flos": 21006593452800.0, + "grad_norm": 3.133663783291427, + "language_loss": 0.78948921, + "learning_rate": 4.1565731510169065e-07, + "loss": 0.81371838, + "num_input_tokens_seen": 286116000, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18200684, + "step": 13261, + "time_per_iteration": 2.873385190963745 + }, + { + "auxiliary_loss_clip": 0.01376204, + "auxiliary_loss_mlp": 0.01029857, + "balance_loss_clip": 1.22323489, + "balance_loss_mlp": 1.01235712, + "epoch": 0.7973545768826094, + "flos": 21590008894080.0, + "grad_norm": 1.5419340529151206, + "language_loss": 0.7637623, + "learning_rate": 4.154196571650501e-07, + "loss": 0.7878229, + "num_input_tokens_seen": 286135110, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.17504883, + "step": 13262, + "time_per_iteration": 2.854612112045288 + }, + { + "auxiliary_loss_clip": 0.01416179, + "auxiliary_loss_mlp": 0.01034025, + "balance_loss_clip": 1.24953008, + "balance_loss_mlp": 1.0133419, + "epoch": 0.7974147001352773, + "flos": 20568296643840.0, + "grad_norm": 2.27672191806721, + "language_loss": 0.71654081, + "learning_rate": 4.1518205931585524e-07, + "loss": 0.74104285, + "num_input_tokens_seen": 286152835, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.20678711, + "step": 13263, + "time_per_iteration": 2.8338117599487305 + }, + { + "auxiliary_loss_clip": 0.01426213, + "auxiliary_loss_mlp": 0.01038194, + "balance_loss_clip": 1.25859606, + "balance_loss_mlp": 1.01860762, + "epoch": 0.7974748233879453, + "flos": 21006774432000.0, + "grad_norm": 1.8106810051212017, + "language_loss": 0.72356218, + "learning_rate": 4.149445215631153e-07, + "loss": 0.74820626, + "num_input_tokens_seen": 286171785, + "router_z_loss_clip": 1.67578125, + "router_z_loss_mlp": 0.19592285, + "step": 13264, + "time_per_iteration": 2.8476834297180176 + }, + { + "auxiliary_loss_clip": 0.01388235, + "auxiliary_loss_mlp": 0.01033399, + "balance_loss_clip": 1.2312746, + "balance_loss_mlp": 1.01557755, + "epoch": 0.7975349466406133, + "flos": 22575452797440.0, + "grad_norm": 2.421825716571987, + "language_loss": 0.77565849, + "learning_rate": 4.1470704391583776e-07, + "loss": 0.79987478, + "num_input_tokens_seen": 286190420, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.17822266, + "step": 13265, + "time_per_iteration": 2.8510687351226807 + }, + { + "auxiliary_loss_clip": 0.01419756, + "auxiliary_loss_mlp": 0.01036552, + "balance_loss_clip": 1.25434637, + "balance_loss_mlp": 1.01737118, + "epoch": 0.7975950698932812, + "flos": 21699583096320.0, + "grad_norm": 1.7862720124012819, + "language_loss": 0.75793487, + "learning_rate": 4.144696263830285e-07, + "loss": 0.78249788, + "num_input_tokens_seen": 286210105, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.19165039, + "step": 13266, + "time_per_iteration": 4.291946887969971 + }, + { + "auxiliary_loss_clip": 0.01402497, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.24305773, + "balance_loss_mlp": 1.01365888, + "epoch": 0.7976551931459492, + "flos": 19612696325760.0, + "grad_norm": 4.168652815933306, + "language_loss": 0.8449589, + "learning_rate": 4.1423226897369015e-07, + "loss": 0.86930686, + "num_input_tokens_seen": 286228180, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18664551, + "step": 13267, + "time_per_iteration": 2.8800418376922607 + }, + { + "auxiliary_loss_clip": 0.01399145, + "auxiliary_loss_mlp": 0.01032265, + "balance_loss_clip": 1.2401967, + "balance_loss_mlp": 1.01340628, + "epoch": 0.7977153163986171, + "flos": 21697139877120.0, + "grad_norm": 1.601632516319317, + "language_loss": 0.77319175, + "learning_rate": 4.139949716968223e-07, + "loss": 0.79750586, + "num_input_tokens_seen": 286247305, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18835449, + "step": 13268, + "time_per_iteration": 2.8882741928100586 + }, + { + "auxiliary_loss_clip": 0.01407597, + "auxiliary_loss_mlp": 0.01035185, + "balance_loss_clip": 1.24764872, + "balance_loss_mlp": 1.01638532, + "epoch": 0.7977754396512852, + "flos": 23487138397440.0, + "grad_norm": 1.6707782944322322, + "language_loss": 0.78623521, + "learning_rate": 4.1375773456142403e-07, + "loss": 0.81066298, + "num_input_tokens_seen": 286268145, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18811035, + "step": 13269, + "time_per_iteration": 2.885432004928589 + }, + { + "auxiliary_loss_clip": 0.01394269, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.23748016, + "balance_loss_mlp": 1.01527214, + "epoch": 0.7978355629039531, + "flos": 22392301271040.0, + "grad_norm": 2.2996677297106265, + "language_loss": 0.82722688, + "learning_rate": 4.135205575764922e-07, + "loss": 0.85150087, + "num_input_tokens_seen": 286286775, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.17871094, + "step": 13270, + "time_per_iteration": 2.865199565887451 + }, + { + "auxiliary_loss_clip": 0.01398148, + "auxiliary_loss_mlp": 0.01033876, + "balance_loss_clip": 1.2376045, + "balance_loss_mlp": 1.0149219, + "epoch": 0.7978956861566211, + "flos": 20275932873600.0, + "grad_norm": 1.7828878033480218, + "language_loss": 0.60792458, + "learning_rate": 4.1328344075101905e-07, + "loss": 0.63224477, + "num_input_tokens_seen": 286305590, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.1895752, + "step": 13271, + "time_per_iteration": 2.8499033451080322 + }, + { + "auxiliary_loss_clip": 0.0142633, + "auxiliary_loss_mlp": 0.01031387, + "balance_loss_clip": 1.26023734, + "balance_loss_mlp": 1.0120635, + "epoch": 0.797955809409289, + "flos": 28124482608000.0, + "grad_norm": 1.536301934465478, + "language_loss": 0.74077737, + "learning_rate": 4.130463840939975e-07, + "loss": 0.76535457, + "num_input_tokens_seen": 286328050, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.19311523, + "step": 13272, + "time_per_iteration": 4.34929895401001 + }, + { + "auxiliary_loss_clip": 0.01402225, + "auxiliary_loss_mlp": 0.01035763, + "balance_loss_clip": 1.2432462, + "balance_loss_mlp": 1.01676154, + "epoch": 0.798015932661957, + "flos": 15567952250880.0, + "grad_norm": 1.8591890248152174, + "language_loss": 0.72395533, + "learning_rate": 4.128093876144161e-07, + "loss": 0.74833524, + "num_input_tokens_seen": 286345265, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18981934, + "step": 13273, + "time_per_iteration": 2.7964675426483154 + }, + { + "auxiliary_loss_clip": 0.01415621, + "auxiliary_loss_mlp": 0.01032908, + "balance_loss_clip": 1.25213075, + "balance_loss_mlp": 1.01334584, + "epoch": 0.7980760559146249, + "flos": 23961703553280.0, + "grad_norm": 1.851210603569782, + "language_loss": 0.76823199, + "learning_rate": 4.1257245132126117e-07, + "loss": 0.79271734, + "num_input_tokens_seen": 286364465, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19567871, + "step": 13274, + "time_per_iteration": 2.8522191047668457 + }, + { + "auxiliary_loss_clip": 0.01388233, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.23365355, + "balance_loss_mlp": 1.01368916, + "epoch": 0.798136179167293, + "flos": 28049774163840.0, + "grad_norm": 1.395743994326899, + "language_loss": 0.78209221, + "learning_rate": 4.12335575223518e-07, + "loss": 0.80628681, + "num_input_tokens_seen": 286385565, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.17553711, + "step": 13275, + "time_per_iteration": 2.914494037628174 + }, + { + "auxiliary_loss_clip": 0.01404373, + "auxiliary_loss_mlp": 0.01039, + "balance_loss_clip": 1.24106503, + "balance_loss_mlp": 1.01949692, + "epoch": 0.7981963024199609, + "flos": 35996360676480.0, + "grad_norm": 1.9699397959520755, + "language_loss": 0.63994145, + "learning_rate": 4.1209875933016877e-07, + "loss": 0.66437519, + "num_input_tokens_seen": 286403950, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19519043, + "step": 13276, + "time_per_iteration": 2.936234951019287 + }, + { + "auxiliary_loss_clip": 0.0139197, + "auxiliary_loss_mlp": 0.01031614, + "balance_loss_clip": 1.23520362, + "balance_loss_mlp": 1.0133388, + "epoch": 0.7982564256726289, + "flos": 25895915809920.0, + "grad_norm": 2.2587716022534106, + "language_loss": 0.61781532, + "learning_rate": 4.118620036501945e-07, + "loss": 0.64205116, + "num_input_tokens_seen": 286426160, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.18273926, + "step": 13277, + "time_per_iteration": 4.273302316665649 + }, + { + "auxiliary_loss_clip": 0.01421657, + "auxiliary_loss_mlp": 0.01036707, + "balance_loss_clip": 1.25654674, + "balance_loss_mlp": 1.01858723, + "epoch": 0.7983165489252969, + "flos": 25750163750400.0, + "grad_norm": 2.0819229394265997, + "language_loss": 0.80471265, + "learning_rate": 4.1162530819257227e-07, + "loss": 0.82929623, + "num_input_tokens_seen": 286446610, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.18103027, + "step": 13278, + "time_per_iteration": 4.296995162963867 + }, + { + "auxiliary_loss_clip": 0.01401165, + "auxiliary_loss_mlp": 0.0103428, + "balance_loss_clip": 1.23896527, + "balance_loss_mlp": 1.01441932, + "epoch": 0.7983766721779648, + "flos": 21918233808000.0, + "grad_norm": 1.8176660452942959, + "language_loss": 0.63939524, + "learning_rate": 4.113886729662768e-07, + "loss": 0.66374964, + "num_input_tokens_seen": 286465460, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19873047, + "step": 13279, + "time_per_iteration": 2.8215928077697754 + }, + { + "auxiliary_loss_clip": 0.01376156, + "auxiliary_loss_mlp": 0.01031053, + "balance_loss_clip": 1.22301793, + "balance_loss_mlp": 1.01301718, + "epoch": 0.7984367954306328, + "flos": 29358375563520.0, + "grad_norm": 2.321491824805067, + "language_loss": 0.71855992, + "learning_rate": 4.111520979802825e-07, + "loss": 0.74263203, + "num_input_tokens_seen": 286485720, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.18029785, + "step": 13280, + "time_per_iteration": 2.9326117038726807 + }, + { + "auxiliary_loss_clip": 0.01413956, + "auxiliary_loss_mlp": 0.01035733, + "balance_loss_clip": 1.25072145, + "balance_loss_mlp": 1.01613545, + "epoch": 0.7984969186833007, + "flos": 31370191931520.0, + "grad_norm": 1.7489052342858649, + "language_loss": 0.63653111, + "learning_rate": 4.1091558324355955e-07, + "loss": 0.66102791, + "num_input_tokens_seen": 286507465, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19592285, + "step": 13281, + "time_per_iteration": 2.957246780395508 + }, + { + "auxiliary_loss_clip": 0.01423877, + "auxiliary_loss_mlp": 0.01033817, + "balance_loss_clip": 1.25820422, + "balance_loss_mlp": 1.01479137, + "epoch": 0.7985570419359688, + "flos": 24322712964480.0, + "grad_norm": 1.7332388445531814, + "language_loss": 0.81166023, + "learning_rate": 4.1067912876507683e-07, + "loss": 0.83623719, + "num_input_tokens_seen": 286526345, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.19030762, + "step": 13282, + "time_per_iteration": 2.870013952255249 + }, + { + "auxiliary_loss_clip": 0.0141107, + "auxiliary_loss_mlp": 0.01030673, + "balance_loss_clip": 1.24618638, + "balance_loss_mlp": 1.01185, + "epoch": 0.7986171651886367, + "flos": 15750696574080.0, + "grad_norm": 1.8152278234385615, + "language_loss": 0.72231758, + "learning_rate": 4.10442734553802e-07, + "loss": 0.74673498, + "num_input_tokens_seen": 286544095, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.18835449, + "step": 13283, + "time_per_iteration": 2.823507785797119 + }, + { + "auxiliary_loss_clip": 0.01400339, + "auxiliary_loss_mlp": 0.01035929, + "balance_loss_clip": 1.24028969, + "balance_loss_mlp": 1.01715374, + "epoch": 0.7986772884413047, + "flos": 11626810064640.0, + "grad_norm": 3.1678750335664905, + "language_loss": 0.74192721, + "learning_rate": 4.102064006186967e-07, + "loss": 0.76628989, + "num_input_tokens_seen": 286560960, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18786621, + "step": 13284, + "time_per_iteration": 2.8379576206207275 + }, + { + "auxiliary_loss_clip": 0.01396599, + "auxiliary_loss_mlp": 0.01030873, + "balance_loss_clip": 1.23708129, + "balance_loss_mlp": 1.01319408, + "epoch": 0.7987374116939726, + "flos": 22101249600000.0, + "grad_norm": 1.7289609691828889, + "language_loss": 0.71047521, + "learning_rate": 4.0997012696872415e-07, + "loss": 0.73474991, + "num_input_tokens_seen": 286579865, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.17687988, + "step": 13285, + "time_per_iteration": 2.8509156703948975 + }, + { + "auxiliary_loss_clip": 0.01411738, + "auxiliary_loss_mlp": 0.01034139, + "balance_loss_clip": 1.24990129, + "balance_loss_mlp": 1.01467252, + "epoch": 0.7987975349466406, + "flos": 17898627859200.0, + "grad_norm": 2.086115844629727, + "language_loss": 0.74683487, + "learning_rate": 4.097339136128437e-07, + "loss": 0.77129364, + "num_input_tokens_seen": 286597295, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19470215, + "step": 13286, + "time_per_iteration": 2.8563501834869385 + }, + { + "auxiliary_loss_clip": 0.01396194, + "auxiliary_loss_mlp": 0.01036274, + "balance_loss_clip": 1.23619795, + "balance_loss_mlp": 1.01755834, + "epoch": 0.7988576581993085, + "flos": 19728740534400.0, + "grad_norm": 1.9104079453622531, + "language_loss": 0.75835037, + "learning_rate": 4.0949776056001296e-07, + "loss": 0.78267503, + "num_input_tokens_seen": 286616270, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.18725586, + "step": 13287, + "time_per_iteration": 2.850722312927246 + }, + { + "auxiliary_loss_clip": 0.01401793, + "auxiliary_loss_mlp": 0.01033046, + "balance_loss_clip": 1.24214661, + "balance_loss_mlp": 1.01520038, + "epoch": 0.7989177814519766, + "flos": 28047602413440.0, + "grad_norm": 1.666730307914582, + "language_loss": 0.62692153, + "learning_rate": 4.092616678191863e-07, + "loss": 0.65126991, + "num_input_tokens_seen": 286638315, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.17834473, + "step": 13288, + "time_per_iteration": 2.9073915481567383 + }, + { + "auxiliary_loss_clip": 0.01396084, + "auxiliary_loss_mlp": 0.01030855, + "balance_loss_clip": 1.23836446, + "balance_loss_mlp": 1.01268768, + "epoch": 0.7989779047046445, + "flos": 28881321943680.0, + "grad_norm": 1.87429497291728, + "language_loss": 0.71692508, + "learning_rate": 4.090256353993169e-07, + "loss": 0.74119449, + "num_input_tokens_seen": 286658630, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18151855, + "step": 13289, + "time_per_iteration": 2.9358832836151123 + }, + { + "auxiliary_loss_clip": 0.01387696, + "auxiliary_loss_mlp": 0.0103451, + "balance_loss_clip": 1.23375213, + "balance_loss_mlp": 1.0150311, + "epoch": 0.7990380279573125, + "flos": 18196013802240.0, + "grad_norm": 2.0150737019581975, + "language_loss": 0.63507438, + "learning_rate": 4.0878966330935506e-07, + "loss": 0.65929639, + "num_input_tokens_seen": 286676870, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.19470215, + "step": 13290, + "time_per_iteration": 2.835797071456909 + }, + { + "auxiliary_loss_clip": 0.01404, + "auxiliary_loss_mlp": 0.0103398, + "balance_loss_clip": 1.24208045, + "balance_loss_mlp": 1.01469159, + "epoch": 0.7990981512099805, + "flos": 20887970290560.0, + "grad_norm": 2.1633976861846715, + "language_loss": 0.71976525, + "learning_rate": 4.08553751558248e-07, + "loss": 0.74414504, + "num_input_tokens_seen": 286694300, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19287109, + "step": 13291, + "time_per_iteration": 2.8636746406555176 + }, + { + "auxiliary_loss_clip": 0.01385762, + "auxiliary_loss_mlp": 0.01033465, + "balance_loss_clip": 1.22900915, + "balance_loss_mlp": 1.01439178, + "epoch": 0.7991582744626484, + "flos": 26109951552000.0, + "grad_norm": 1.4615128070866787, + "language_loss": 0.64702326, + "learning_rate": 4.083179001549422e-07, + "loss": 0.67121553, + "num_input_tokens_seen": 286714545, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.1907959, + "step": 13292, + "time_per_iteration": 2.8756368160247803 + }, + { + "auxiliary_loss_clip": 0.01392818, + "auxiliary_loss_mlp": 0.01031882, + "balance_loss_clip": 1.23469555, + "balance_loss_mlp": 1.01342797, + "epoch": 0.7992183977153164, + "flos": 35308664674560.0, + "grad_norm": 1.678089109899059, + "language_loss": 0.56733555, + "learning_rate": 4.0808210910838105e-07, + "loss": 0.59158254, + "num_input_tokens_seen": 286734525, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.18457031, + "step": 13293, + "time_per_iteration": 2.971869707107544 + }, + { + "auxiliary_loss_clip": 0.01404835, + "auxiliary_loss_mlp": 0.01034284, + "balance_loss_clip": 1.24594545, + "balance_loss_mlp": 1.01574683, + "epoch": 0.7992785209679844, + "flos": 51868739076480.0, + "grad_norm": 4.444168723012482, + "language_loss": 0.72471726, + "learning_rate": 4.0784637842750704e-07, + "loss": 0.74910843, + "num_input_tokens_seen": 286753430, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18530273, + "step": 13294, + "time_per_iteration": 3.219226121902466 + }, + { + "auxiliary_loss_clip": 0.01400193, + "auxiliary_loss_mlp": 0.01034625, + "balance_loss_clip": 1.23936558, + "balance_loss_mlp": 1.01618338, + "epoch": 0.7993386442206524, + "flos": 22575362307840.0, + "grad_norm": 1.7500496273240513, + "language_loss": 0.73398054, + "learning_rate": 4.0761070812125675e-07, + "loss": 0.75832868, + "num_input_tokens_seen": 286771915, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.18432617, + "step": 13295, + "time_per_iteration": 2.8873276710510254 + }, + { + "auxiliary_loss_clip": 0.01396698, + "auxiliary_loss_mlp": 0.01035188, + "balance_loss_clip": 1.23978031, + "balance_loss_mlp": 1.01709211, + "epoch": 0.7993987674733203, + "flos": 18807779750400.0, + "grad_norm": 2.036066491082595, + "language_loss": 0.77066594, + "learning_rate": 4.0737509819856797e-07, + "loss": 0.79498482, + "num_input_tokens_seen": 286789835, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.1809082, + "step": 13296, + "time_per_iteration": 2.838067054748535 + }, + { + "auxiliary_loss_clip": 0.0118438, + "auxiliary_loss_mlp": 0.01032535, + "balance_loss_clip": 1.09535599, + "balance_loss_mlp": 1.01174533, + "epoch": 0.7994588907259883, + "flos": 69455900580480.0, + "grad_norm": 0.7017637639938828, + "language_loss": 0.60867286, + "learning_rate": 4.0713954866837573e-07, + "loss": 0.63084203, + "num_input_tokens_seen": 286855580, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.20800781, + "step": 13297, + "time_per_iteration": 3.4176766872406006 + }, + { + "auxiliary_loss_clip": 0.01394317, + "auxiliary_loss_mlp": 0.01030395, + "balance_loss_clip": 1.23679876, + "balance_loss_mlp": 1.01198936, + "epoch": 0.7995190139786562, + "flos": 13488259403520.0, + "grad_norm": 1.8885496590076198, + "language_loss": 0.71579051, + "learning_rate": 4.0690405953961073e-07, + "loss": 0.74003756, + "num_input_tokens_seen": 286874360, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.1842041, + "step": 13298, + "time_per_iteration": 2.8164517879486084 + }, + { + "auxiliary_loss_clip": 0.01421931, + "auxiliary_loss_mlp": 0.01030595, + "balance_loss_clip": 1.25796485, + "balance_loss_mlp": 1.01103246, + "epoch": 0.7995791372313242, + "flos": 21662817056640.0, + "grad_norm": 2.186709837120161, + "language_loss": 0.76327157, + "learning_rate": 4.066686308212037e-07, + "loss": 0.78779685, + "num_input_tokens_seen": 286891950, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19555664, + "step": 13299, + "time_per_iteration": 2.8223607540130615 + }, + { + "auxiliary_loss_clip": 0.01394258, + "auxiliary_loss_mlp": 0.01037483, + "balance_loss_clip": 1.23843789, + "balance_loss_mlp": 1.01889849, + "epoch": 0.7996392604839921, + "flos": 26079384049920.0, + "grad_norm": 1.6596092438204544, + "language_loss": 0.78017485, + "learning_rate": 4.064332625220828e-07, + "loss": 0.80449224, + "num_input_tokens_seen": 286911725, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.18579102, + "step": 13300, + "time_per_iteration": 2.876464605331421 + }, + { + "auxiliary_loss_clip": 0.01413821, + "auxiliary_loss_mlp": 0.0103368, + "balance_loss_clip": 1.25062513, + "balance_loss_mlp": 1.01514268, + "epoch": 0.7996993837366602, + "flos": 24617293729920.0, + "grad_norm": 1.7198083623228242, + "language_loss": 0.64811099, + "learning_rate": 4.0619795465117115e-07, + "loss": 0.67258602, + "num_input_tokens_seen": 286931400, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.1854248, + "step": 13301, + "time_per_iteration": 4.368839979171753 + }, + { + "auxiliary_loss_clip": 0.01402931, + "auxiliary_loss_mlp": 0.01032181, + "balance_loss_clip": 1.24505448, + "balance_loss_mlp": 1.01444244, + "epoch": 0.7997595069893281, + "flos": 21000802118400.0, + "grad_norm": 1.8998907969499954, + "language_loss": 0.72645801, + "learning_rate": 4.059627072173928e-07, + "loss": 0.75080919, + "num_input_tokens_seen": 286949795, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.17724609, + "step": 13302, + "time_per_iteration": 2.860013246536255 + }, + { + "auxiliary_loss_clip": 0.01417624, + "auxiliary_loss_mlp": 0.01033959, + "balance_loss_clip": 1.25416493, + "balance_loss_mlp": 1.01526725, + "epoch": 0.7998196302419961, + "flos": 24437399829120.0, + "grad_norm": 1.7562027035667913, + "language_loss": 0.84599954, + "learning_rate": 4.057275202296684e-07, + "loss": 0.87051535, + "num_input_tokens_seen": 286968805, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.18664551, + "step": 13303, + "time_per_iteration": 2.9080238342285156 + }, + { + "auxiliary_loss_clip": 0.0138479, + "auxiliary_loss_mlp": 0.01034693, + "balance_loss_clip": 1.22932541, + "balance_loss_mlp": 1.0161202, + "epoch": 0.7998797534946641, + "flos": 30277164597120.0, + "grad_norm": 2.041327873573778, + "language_loss": 0.59849191, + "learning_rate": 4.054923936969166e-07, + "loss": 0.62268674, + "num_input_tokens_seen": 286990235, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.18579102, + "step": 13304, + "time_per_iteration": 2.912205696105957 + }, + { + "auxiliary_loss_clip": 0.013987, + "auxiliary_loss_mlp": 0.01032559, + "balance_loss_clip": 1.23696947, + "balance_loss_mlp": 1.01341391, + "epoch": 0.799939876747332, + "flos": 23524537864320.0, + "grad_norm": 2.01576990073222, + "language_loss": 0.70049548, + "learning_rate": 4.0525732762805265e-07, + "loss": 0.7248081, + "num_input_tokens_seen": 287011060, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19140625, + "step": 13305, + "time_per_iteration": 2.86551570892334 + }, + { + "auxiliary_loss_clip": 0.01398361, + "auxiliary_loss_mlp": 0.01033327, + "balance_loss_clip": 1.24065924, + "balance_loss_mlp": 1.01468253, + "epoch": 0.8, + "flos": 19327571723520.0, + "grad_norm": 1.7454988102765079, + "language_loss": 0.69893146, + "learning_rate": 4.0502232203199107e-07, + "loss": 0.72324836, + "num_input_tokens_seen": 287029215, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18640137, + "step": 13306, + "time_per_iteration": 2.838120937347412 + }, + { + "auxiliary_loss_clip": 0.01408175, + "auxiliary_loss_mlp": 0.01033463, + "balance_loss_clip": 1.24775124, + "balance_loss_mlp": 1.01516438, + "epoch": 0.800060123252668, + "flos": 32424236231040.0, + "grad_norm": 1.4449921462211532, + "language_loss": 0.70533663, + "learning_rate": 4.0478737691764286e-07, + "loss": 0.72975302, + "num_input_tokens_seen": 287050855, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.1829834, + "step": 13307, + "time_per_iteration": 2.99957013130188 + }, + { + "auxiliary_loss_clip": 0.01407771, + "auxiliary_loss_mlp": 0.01036395, + "balance_loss_clip": 1.24719274, + "balance_loss_mlp": 1.01736903, + "epoch": 0.800120246505336, + "flos": 20020063674240.0, + "grad_norm": 3.784966767959506, + "language_loss": 0.77890271, + "learning_rate": 4.0455249229391677e-07, + "loss": 0.80334437, + "num_input_tokens_seen": 287069915, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19030762, + "step": 13308, + "time_per_iteration": 4.374410390853882 + }, + { + "auxiliary_loss_clip": 0.01407826, + "auxiliary_loss_mlp": 0.01029152, + "balance_loss_clip": 1.24431705, + "balance_loss_mlp": 1.01024508, + "epoch": 0.8001803697580039, + "flos": 31880753965440.0, + "grad_norm": 2.226683186507593, + "language_loss": 0.78958583, + "learning_rate": 4.0431766816972e-07, + "loss": 0.81395566, + "num_input_tokens_seen": 287091450, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.18920898, + "step": 13309, + "time_per_iteration": 2.928068161010742 + }, + { + "auxiliary_loss_clip": 0.01183171, + "auxiliary_loss_mlp": 0.01026653, + "balance_loss_clip": 1.0938518, + "balance_loss_mlp": 1.0046227, + "epoch": 0.8002404930106719, + "flos": 63422216023680.0, + "grad_norm": 0.8749271372195145, + "language_loss": 0.64609575, + "learning_rate": 4.040829045539571e-07, + "loss": 0.668194, + "num_input_tokens_seen": 287148365, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.22070312, + "step": 13310, + "time_per_iteration": 3.267462968826294 + }, + { + "auxiliary_loss_clip": 0.01403327, + "auxiliary_loss_mlp": 0.01035966, + "balance_loss_clip": 1.24301207, + "balance_loss_mlp": 1.01772642, + "epoch": 0.8003006162633398, + "flos": 27867075085440.0, + "grad_norm": 1.9751929962755699, + "language_loss": 0.83766818, + "learning_rate": 4.0384820145553156e-07, + "loss": 0.86206114, + "num_input_tokens_seen": 287168280, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18249512, + "step": 13311, + "time_per_iteration": 2.895260810852051 + }, + { + "auxiliary_loss_clip": 0.0140173, + "auxiliary_loss_mlp": 0.01030494, + "balance_loss_clip": 1.24203825, + "balance_loss_mlp": 1.01184988, + "epoch": 0.8003607395160078, + "flos": 18232463128320.0, + "grad_norm": 1.9382538817917305, + "language_loss": 0.67085201, + "learning_rate": 4.0361355888334116e-07, + "loss": 0.69517422, + "num_input_tokens_seen": 287185980, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18640137, + "step": 13312, + "time_per_iteration": 4.316673517227173 + }, + { + "auxiliary_loss_clip": 0.01407897, + "auxiliary_loss_mlp": 0.01030886, + "balance_loss_clip": 1.2458632, + "balance_loss_mlp": 1.01129985, + "epoch": 0.8004208627686757, + "flos": 20896838271360.0, + "grad_norm": 1.7393213547943387, + "language_loss": 0.75894678, + "learning_rate": 4.033789768462843e-07, + "loss": 0.78333461, + "num_input_tokens_seen": 287203875, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19580078, + "step": 13313, + "time_per_iteration": 2.9634180068969727 + }, + { + "auxiliary_loss_clip": 0.01405378, + "auxiliary_loss_mlp": 0.01036747, + "balance_loss_clip": 1.2447902, + "balance_loss_mlp": 1.01834083, + "epoch": 0.8004809860213438, + "flos": 26448311301120.0, + "grad_norm": 2.024263548702081, + "language_loss": 0.76427019, + "learning_rate": 4.031444553532575e-07, + "loss": 0.7886914, + "num_input_tokens_seen": 287226445, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18408203, + "step": 13314, + "time_per_iteration": 3.0608139038085938 + }, + { + "auxiliary_loss_clip": 0.01179191, + "auxiliary_loss_mlp": 0.01011983, + "balance_loss_clip": 1.09272552, + "balance_loss_mlp": 0.99472106, + "epoch": 0.8005411092740117, + "flos": 63679125853440.0, + "grad_norm": 0.8021283165920722, + "language_loss": 0.53799558, + "learning_rate": 4.029099944131522e-07, + "loss": 0.55990732, + "num_input_tokens_seen": 287286240, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.17285156, + "step": 13315, + "time_per_iteration": 3.290274143218994 + }, + { + "auxiliary_loss_clip": 0.01393643, + "auxiliary_loss_mlp": 0.0103687, + "balance_loss_clip": 1.23478246, + "balance_loss_mlp": 1.01782012, + "epoch": 0.8006012325266797, + "flos": 36151885612800.0, + "grad_norm": 2.344575574171086, + "language_loss": 0.72153997, + "learning_rate": 4.026755940348603e-07, + "loss": 0.74584508, + "num_input_tokens_seen": 287310265, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.1907959, + "step": 13316, + "time_per_iteration": 3.0587940216064453 + }, + { + "auxiliary_loss_clip": 0.0141602, + "auxiliary_loss_mlp": 0.01037403, + "balance_loss_clip": 1.25119805, + "balance_loss_mlp": 1.01713777, + "epoch": 0.8006613557793477, + "flos": 33852048975360.0, + "grad_norm": 1.8151006819756503, + "language_loss": 0.65632206, + "learning_rate": 4.024412542272706e-07, + "loss": 0.68085629, + "num_input_tokens_seen": 287331610, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.20275879, + "step": 13317, + "time_per_iteration": 2.9783949851989746 + }, + { + "auxiliary_loss_clip": 0.0118188, + "auxiliary_loss_mlp": 0.0103068, + "balance_loss_clip": 1.09422231, + "balance_loss_mlp": 1.01017618, + "epoch": 0.8007214790320156, + "flos": 67383582635520.0, + "grad_norm": 0.7905015223025839, + "language_loss": 0.59083366, + "learning_rate": 4.0220697499926783e-07, + "loss": 0.61295933, + "num_input_tokens_seen": 287394795, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.20507812, + "step": 13318, + "time_per_iteration": 3.3819329738616943 + }, + { + "auxiliary_loss_clip": 0.01396477, + "auxiliary_loss_mlp": 0.01030664, + "balance_loss_clip": 1.23671484, + "balance_loss_mlp": 1.01166189, + "epoch": 0.8007816022846836, + "flos": 23196131971200.0, + "grad_norm": 1.5968409563819024, + "language_loss": 0.66664571, + "learning_rate": 4.019727563597366e-07, + "loss": 0.69091713, + "num_input_tokens_seen": 287414595, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18994141, + "step": 13319, + "time_per_iteration": 2.8640754222869873 + }, + { + "auxiliary_loss_clip": 0.01410417, + "auxiliary_loss_mlp": 0.01035206, + "balance_loss_clip": 1.24702477, + "balance_loss_mlp": 1.01492834, + "epoch": 0.8008417255373516, + "flos": 21991494418560.0, + "grad_norm": 1.6950110563242653, + "language_loss": 0.74484235, + "learning_rate": 4.0173859831755873e-07, + "loss": 0.76929855, + "num_input_tokens_seen": 287434395, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.20263672, + "step": 13320, + "time_per_iteration": 2.8719913959503174 + }, + { + "auxiliary_loss_clip": 0.01401467, + "auxiliary_loss_mlp": 0.01032142, + "balance_loss_clip": 1.24016058, + "balance_loss_mlp": 1.01243651, + "epoch": 0.8009018487900196, + "flos": 16736049987840.0, + "grad_norm": 1.9969630608322348, + "language_loss": 0.8143028, + "learning_rate": 4.015045008816138e-07, + "loss": 0.83863884, + "num_input_tokens_seen": 287450590, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19702148, + "step": 13321, + "time_per_iteration": 2.8701283931732178 + }, + { + "auxiliary_loss_clip": 0.01387901, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.23184276, + "balance_loss_mlp": 1.0137893, + "epoch": 0.8009619720426875, + "flos": 20823351436800.0, + "grad_norm": 1.6626634863518122, + "language_loss": 0.66346359, + "learning_rate": 4.0127046406077825e-07, + "loss": 0.68766564, + "num_input_tokens_seen": 287468455, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.18505859, + "step": 13322, + "time_per_iteration": 2.8731205463409424 + }, + { + "auxiliary_loss_clip": 0.0140787, + "auxiliary_loss_mlp": 0.01031912, + "balance_loss_clip": 1.2469269, + "balance_loss_mlp": 1.01330316, + "epoch": 0.8010220952953555, + "flos": 17940054113280.0, + "grad_norm": 3.724672340431362, + "language_loss": 0.78425062, + "learning_rate": 4.010364878639265e-07, + "loss": 0.80864841, + "num_input_tokens_seen": 287486485, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.18591309, + "step": 13323, + "time_per_iteration": 2.8070011138916016 + }, + { + "auxiliary_loss_clip": 0.01410205, + "auxiliary_loss_mlp": 0.01031981, + "balance_loss_clip": 1.24658406, + "balance_loss_mlp": 1.01311004, + "epoch": 0.8010822185480234, + "flos": 24583106643840.0, + "grad_norm": 2.8363813073089803, + "language_loss": 0.72595429, + "learning_rate": 4.00802572299932e-07, + "loss": 0.75037616, + "num_input_tokens_seen": 287503940, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.1887207, + "step": 13324, + "time_per_iteration": 2.8942506313323975 + }, + { + "auxiliary_loss_clip": 0.01406229, + "auxiliary_loss_mlp": 0.0102984, + "balance_loss_clip": 1.24246788, + "balance_loss_mlp": 1.01033771, + "epoch": 0.8011423418006914, + "flos": 21838638925440.0, + "grad_norm": 1.9085114351444477, + "language_loss": 0.76782846, + "learning_rate": 4.005687173776635e-07, + "loss": 0.79218912, + "num_input_tokens_seen": 287521660, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19506836, + "step": 13325, + "time_per_iteration": 2.8620030879974365 + }, + { + "auxiliary_loss_clip": 0.01378213, + "auxiliary_loss_mlp": 0.01031473, + "balance_loss_clip": 1.22391212, + "balance_loss_mlp": 1.01297176, + "epoch": 0.8012024650533593, + "flos": 23925797164800.0, + "grad_norm": 1.5414783718703742, + "language_loss": 0.7995562, + "learning_rate": 4.003349231059898e-07, + "loss": 0.8236531, + "num_input_tokens_seen": 287541505, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.18505859, + "step": 13326, + "time_per_iteration": 2.8625690937042236 + }, + { + "auxiliary_loss_clip": 0.01388224, + "auxiliary_loss_mlp": 0.01032069, + "balance_loss_clip": 1.23289514, + "balance_loss_mlp": 1.01332974, + "epoch": 0.8012625883060274, + "flos": 23597662740480.0, + "grad_norm": 2.33854399127897, + "language_loss": 0.67225844, + "learning_rate": 4.001011894937765e-07, + "loss": 0.69646138, + "num_input_tokens_seen": 287560015, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.1875, + "step": 13327, + "time_per_iteration": 2.871870756149292 + }, + { + "auxiliary_loss_clip": 0.01381455, + "auxiliary_loss_mlp": 0.01033949, + "balance_loss_clip": 1.22755504, + "balance_loss_mlp": 1.01573324, + "epoch": 0.8013227115586953, + "flos": 20823758640000.0, + "grad_norm": 1.5921773694600445, + "language_loss": 0.74410141, + "learning_rate": 3.9986751654988636e-07, + "loss": 0.76825547, + "num_input_tokens_seen": 287579150, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.18212891, + "step": 13328, + "time_per_iteration": 2.8560938835144043 + }, + { + "auxiliary_loss_clip": 0.01408069, + "auxiliary_loss_mlp": 0.01035181, + "balance_loss_clip": 1.2446444, + "balance_loss_mlp": 1.01610708, + "epoch": 0.8013828348113633, + "flos": 15896855836800.0, + "grad_norm": 4.8184963041625, + "language_loss": 0.75032544, + "learning_rate": 3.996339042831798e-07, + "loss": 0.77475786, + "num_input_tokens_seen": 287597420, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19067383, + "step": 13329, + "time_per_iteration": 2.871370553970337 + }, + { + "auxiliary_loss_clip": 0.01181576, + "auxiliary_loss_mlp": 0.01018336, + "balance_loss_clip": 1.0924747, + "balance_loss_mlp": 0.99582928, + "epoch": 0.8014429580640313, + "flos": 71097432353280.0, + "grad_norm": 0.7153730252649813, + "language_loss": 0.53056759, + "learning_rate": 3.9940035270251605e-07, + "loss": 0.55256671, + "num_input_tokens_seen": 287667280, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.22460938, + "step": 13330, + "time_per_iteration": 3.4606757164001465 + }, + { + "auxiliary_loss_clip": 0.01412876, + "auxiliary_loss_mlp": 0.01034685, + "balance_loss_clip": 1.24903917, + "balance_loss_mlp": 1.01433611, + "epoch": 0.8015030813166992, + "flos": 23086829237760.0, + "grad_norm": 2.1635703215190034, + "language_loss": 0.73551559, + "learning_rate": 3.991668618167519e-07, + "loss": 0.75999117, + "num_input_tokens_seen": 287687375, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.20336914, + "step": 13331, + "time_per_iteration": 2.8754663467407227 + }, + { + "auxiliary_loss_clip": 0.01403448, + "auxiliary_loss_mlp": 0.01030694, + "balance_loss_clip": 1.24385619, + "balance_loss_mlp": 1.01195419, + "epoch": 0.8015632045693672, + "flos": 21882508398720.0, + "grad_norm": 2.0673101500122457, + "language_loss": 0.78023982, + "learning_rate": 3.989334316347401e-07, + "loss": 0.80458128, + "num_input_tokens_seen": 287707895, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.1875, + "step": 13332, + "time_per_iteration": 2.9269278049468994 + }, + { + "auxiliary_loss_clip": 0.01413133, + "auxiliary_loss_mlp": 0.01031908, + "balance_loss_clip": 1.24985361, + "balance_loss_mlp": 1.01309717, + "epoch": 0.8016233278220352, + "flos": 23666489360640.0, + "grad_norm": 2.1745263091696927, + "language_loss": 0.83880568, + "learning_rate": 3.987000621653338e-07, + "loss": 0.8632561, + "num_input_tokens_seen": 287723990, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.18811035, + "step": 13333, + "time_per_iteration": 2.872791290283203 + }, + { + "auxiliary_loss_clip": 0.01408549, + "auxiliary_loss_mlp": 0.01028267, + "balance_loss_clip": 1.2464354, + "balance_loss_mlp": 1.00881195, + "epoch": 0.8016834510747032, + "flos": 16261801545600.0, + "grad_norm": 1.5992027164871274, + "language_loss": 0.74431789, + "learning_rate": 3.9846675341738133e-07, + "loss": 0.76868606, + "num_input_tokens_seen": 287742380, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19458008, + "step": 13334, + "time_per_iteration": 2.933253765106201 + }, + { + "auxiliary_loss_clip": 0.01394653, + "auxiliary_loss_mlp": 0.01030852, + "balance_loss_clip": 1.23831701, + "balance_loss_mlp": 1.01210022, + "epoch": 0.8017435743273711, + "flos": 12283576606080.0, + "grad_norm": 2.0958048110053693, + "language_loss": 0.75370705, + "learning_rate": 3.9823350539972967e-07, + "loss": 0.77796209, + "num_input_tokens_seen": 287760130, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.1875, + "step": 13335, + "time_per_iteration": 2.997249126434326 + }, + { + "auxiliary_loss_clip": 0.01393926, + "auxiliary_loss_mlp": 0.01032674, + "balance_loss_clip": 1.2350527, + "balance_loss_mlp": 1.01329112, + "epoch": 0.8018036975800391, + "flos": 17203511710080.0, + "grad_norm": 1.7343200601894442, + "language_loss": 0.76203632, + "learning_rate": 3.9800031812122416e-07, + "loss": 0.78630233, + "num_input_tokens_seen": 287777565, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.19372559, + "step": 13336, + "time_per_iteration": 4.448650360107422 + }, + { + "auxiliary_loss_clip": 0.0142825, + "auxiliary_loss_mlp": 0.01035613, + "balance_loss_clip": 1.26044357, + "balance_loss_mlp": 1.01596701, + "epoch": 0.801863820832707, + "flos": 20641738233600.0, + "grad_norm": 2.348112691545349, + "language_loss": 0.75608218, + "learning_rate": 3.977671915907068e-07, + "loss": 0.78072083, + "num_input_tokens_seen": 287796310, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.19641113, + "step": 13337, + "time_per_iteration": 3.005729913711548 + }, + { + "auxiliary_loss_clip": 0.01419074, + "auxiliary_loss_mlp": 0.01036695, + "balance_loss_clip": 1.25424063, + "balance_loss_mlp": 1.01583362, + "epoch": 0.801923944085375, + "flos": 30457737169920.0, + "grad_norm": 1.6151915566018284, + "language_loss": 0.8077603, + "learning_rate": 3.9753412581701883e-07, + "loss": 0.83231795, + "num_input_tokens_seen": 287817330, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.20849609, + "step": 13338, + "time_per_iteration": 2.909785747528076 + }, + { + "auxiliary_loss_clip": 0.01411257, + "auxiliary_loss_mlp": 0.01036635, + "balance_loss_clip": 1.24632215, + "balance_loss_mlp": 1.01633406, + "epoch": 0.801984067338043, + "flos": 20020018429440.0, + "grad_norm": 2.440966384649834, + "language_loss": 0.74669373, + "learning_rate": 3.9730112080899733e-07, + "loss": 0.77117264, + "num_input_tokens_seen": 287835095, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.20300293, + "step": 13339, + "time_per_iteration": 2.831796169281006 + }, + { + "auxiliary_loss_clip": 0.01393431, + "auxiliary_loss_mlp": 0.01031211, + "balance_loss_clip": 1.23631573, + "balance_loss_mlp": 1.01236415, + "epoch": 0.802044190590711, + "flos": 22794329733120.0, + "grad_norm": 1.569885241709506, + "language_loss": 0.80079502, + "learning_rate": 3.970681765754775e-07, + "loss": 0.82504141, + "num_input_tokens_seen": 287854595, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.18859863, + "step": 13340, + "time_per_iteration": 2.8791189193725586 + }, + { + "auxiliary_loss_clip": 0.01409918, + "auxiliary_loss_mlp": 0.01033207, + "balance_loss_clip": 1.24893641, + "balance_loss_mlp": 1.01490879, + "epoch": 0.8021043138433789, + "flos": 27611658334080.0, + "grad_norm": 1.6570462058200652, + "language_loss": 0.68713456, + "learning_rate": 3.968352931252936e-07, + "loss": 0.71156579, + "num_input_tokens_seen": 287876960, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.1829834, + "step": 13341, + "time_per_iteration": 2.904867649078369 + }, + { + "auxiliary_loss_clip": 0.01180193, + "auxiliary_loss_mlp": 0.01025175, + "balance_loss_clip": 1.09204245, + "balance_loss_mlp": 1.00324047, + "epoch": 0.8021644370960469, + "flos": 62089680124800.0, + "grad_norm": 0.8059924242146608, + "language_loss": 0.61814427, + "learning_rate": 3.9660247046727547e-07, + "loss": 0.64019799, + "num_input_tokens_seen": 287936530, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.21972656, + "step": 13342, + "time_per_iteration": 4.70370888710022 + }, + { + "auxiliary_loss_clip": 0.01400198, + "auxiliary_loss_mlp": 0.01038023, + "balance_loss_clip": 1.23883688, + "balance_loss_mlp": 1.01791191, + "epoch": 0.8022245603487148, + "flos": 23370686985600.0, + "grad_norm": 1.7655239161903353, + "language_loss": 0.64092684, + "learning_rate": 3.963697086102522e-07, + "loss": 0.66530907, + "num_input_tokens_seen": 287954285, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.20129395, + "step": 13343, + "time_per_iteration": 2.8668668270111084 + }, + { + "auxiliary_loss_clip": 0.01378764, + "auxiliary_loss_mlp": 0.01031389, + "balance_loss_clip": 1.2258904, + "balance_loss_mlp": 1.01294732, + "epoch": 0.8022846836013828, + "flos": 10860605055360.0, + "grad_norm": 1.9682487662073866, + "language_loss": 0.69904774, + "learning_rate": 3.96137007563051e-07, + "loss": 0.72314924, + "num_input_tokens_seen": 287971595, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.18444824, + "step": 13344, + "time_per_iteration": 2.8238441944122314 + }, + { + "auxiliary_loss_clip": 0.01410126, + "auxiliary_loss_mlp": 0.01029293, + "balance_loss_clip": 1.24877131, + "balance_loss_mlp": 1.0107801, + "epoch": 0.8023448068540509, + "flos": 29252013742080.0, + "grad_norm": 1.4506718320760645, + "language_loss": 0.70702064, + "learning_rate": 3.9590436733449506e-07, + "loss": 0.73141491, + "num_input_tokens_seen": 287992540, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18518066, + "step": 13345, + "time_per_iteration": 2.9619622230529785 + }, + { + "auxiliary_loss_clip": 0.0118192, + "auxiliary_loss_mlp": 0.010339, + "balance_loss_clip": 1.09240556, + "balance_loss_mlp": 1.01091623, + "epoch": 0.8024049301067188, + "flos": 64182720188160.0, + "grad_norm": 0.8698532431680145, + "language_loss": 0.63026547, + "learning_rate": 3.956717879334059e-07, + "loss": 0.65242368, + "num_input_tokens_seen": 288052810, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.22949219, + "step": 13346, + "time_per_iteration": 4.80489182472229 + }, + { + "auxiliary_loss_clip": 0.01394854, + "auxiliary_loss_mlp": 0.01032865, + "balance_loss_clip": 1.23882937, + "balance_loss_mlp": 1.01391053, + "epoch": 0.8024650533593868, + "flos": 28597192727040.0, + "grad_norm": 1.452428863659513, + "language_loss": 0.73034203, + "learning_rate": 3.9543926936860327e-07, + "loss": 0.75461918, + "num_input_tokens_seen": 288073045, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.18969727, + "step": 13347, + "time_per_iteration": 4.368834972381592 + }, + { + "auxiliary_loss_clip": 0.01410289, + "auxiliary_loss_mlp": 0.01030418, + "balance_loss_clip": 1.24692655, + "balance_loss_mlp": 1.01192868, + "epoch": 0.8025251766120547, + "flos": 16990923801600.0, + "grad_norm": 2.0851838048172584, + "language_loss": 0.74169374, + "learning_rate": 3.9520681164890493e-07, + "loss": 0.76610082, + "num_input_tokens_seen": 288091165, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.18481445, + "step": 13348, + "time_per_iteration": 2.839907646179199 + }, + { + "auxiliary_loss_clip": 0.01409438, + "auxiliary_loss_mlp": 0.01030658, + "balance_loss_clip": 1.24862599, + "balance_loss_mlp": 1.01191854, + "epoch": 0.8025852998647227, + "flos": 22173786293760.0, + "grad_norm": 1.8729169133572126, + "language_loss": 0.77139854, + "learning_rate": 3.9497441478312444e-07, + "loss": 0.79579949, + "num_input_tokens_seen": 288110595, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18737793, + "step": 13349, + "time_per_iteration": 2.8709583282470703 + }, + { + "auxiliary_loss_clip": 0.01406627, + "auxiliary_loss_mlp": 0.01033297, + "balance_loss_clip": 1.2463845, + "balance_loss_mlp": 1.01511788, + "epoch": 0.8026454231173906, + "flos": 22026903114240.0, + "grad_norm": 1.9305067292656592, + "language_loss": 0.84559262, + "learning_rate": 3.947420787800755e-07, + "loss": 0.8699919, + "num_input_tokens_seen": 288128995, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18200684, + "step": 13350, + "time_per_iteration": 2.8675475120544434 + }, + { + "auxiliary_loss_clip": 0.01403036, + "auxiliary_loss_mlp": 0.01034412, + "balance_loss_clip": 1.24237621, + "balance_loss_mlp": 1.01594651, + "epoch": 0.8027055463700586, + "flos": 22501332535680.0, + "grad_norm": 1.8557291929072517, + "language_loss": 0.7174865, + "learning_rate": 3.945098036485679e-07, + "loss": 0.74186099, + "num_input_tokens_seen": 288149265, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18444824, + "step": 13351, + "time_per_iteration": 2.8649048805236816 + }, + { + "auxiliary_loss_clip": 0.01399525, + "auxiliary_loss_mlp": 0.01034185, + "balance_loss_clip": 1.24127555, + "balance_loss_mlp": 1.01439667, + "epoch": 0.8027656696227266, + "flos": 28924603234560.0, + "grad_norm": 1.84848062896164, + "language_loss": 0.62185168, + "learning_rate": 3.9427758939740885e-07, + "loss": 0.64618874, + "num_input_tokens_seen": 288170745, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.19787598, + "step": 13352, + "time_per_iteration": 2.981959581375122 + }, + { + "auxiliary_loss_clip": 0.01405343, + "auxiliary_loss_mlp": 0.01040151, + "balance_loss_clip": 1.24642456, + "balance_loss_mlp": 1.02150643, + "epoch": 0.8028257928753946, + "flos": 18598901915520.0, + "grad_norm": 2.5197190629107244, + "language_loss": 0.77759337, + "learning_rate": 3.940454360354046e-07, + "loss": 0.80204833, + "num_input_tokens_seen": 288189415, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18640137, + "step": 13353, + "time_per_iteration": 2.8468422889709473 + }, + { + "auxiliary_loss_clip": 0.01437039, + "auxiliary_loss_mlp": 0.01037205, + "balance_loss_clip": 1.26907778, + "balance_loss_mlp": 1.01776195, + "epoch": 0.8028859161280625, + "flos": 19137950190720.0, + "grad_norm": 2.1501713607611195, + "language_loss": 0.73810446, + "learning_rate": 3.938133435713582e-07, + "loss": 0.76284695, + "num_input_tokens_seen": 288206900, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.19445801, + "step": 13354, + "time_per_iteration": 2.892155885696411 + }, + { + "auxiliary_loss_clip": 0.01409714, + "auxiliary_loss_mlp": 0.01032895, + "balance_loss_clip": 1.24654627, + "balance_loss_mlp": 1.01441717, + "epoch": 0.8029460393807305, + "flos": 20239483547520.0, + "grad_norm": 1.9218560480353146, + "language_loss": 0.66441488, + "learning_rate": 3.935813120140714e-07, + "loss": 0.68884099, + "num_input_tokens_seen": 288224800, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.18481445, + "step": 13355, + "time_per_iteration": 2.866694211959839 + }, + { + "auxiliary_loss_clip": 0.0141864, + "auxiliary_loss_mlp": 0.01033247, + "balance_loss_clip": 1.25425148, + "balance_loss_mlp": 1.01343441, + "epoch": 0.8030061626333984, + "flos": 49800583653120.0, + "grad_norm": 2.1045918623817155, + "language_loss": 0.70108789, + "learning_rate": 3.9334934137234235e-07, + "loss": 0.72560674, + "num_input_tokens_seen": 288249400, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19824219, + "step": 13356, + "time_per_iteration": 3.144876480102539 + }, + { + "auxiliary_loss_clip": 0.01396133, + "auxiliary_loss_mlp": 0.01037566, + "balance_loss_clip": 1.23636615, + "balance_loss_mlp": 1.01814675, + "epoch": 0.8030662858860664, + "flos": 21624919896960.0, + "grad_norm": 1.5179408840296515, + "language_loss": 0.77886069, + "learning_rate": 3.931174316549666e-07, + "loss": 0.80319762, + "num_input_tokens_seen": 288268780, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19421387, + "step": 13357, + "time_per_iteration": 2.864150285720825 + }, + { + "auxiliary_loss_clip": 0.01413859, + "auxiliary_loss_mlp": 0.01033698, + "balance_loss_clip": 1.24884462, + "balance_loss_mlp": 1.01446986, + "epoch": 0.8031264091387345, + "flos": 25641132485760.0, + "grad_norm": 2.791562979941978, + "language_loss": 0.77858865, + "learning_rate": 3.9288558287073937e-07, + "loss": 0.80306423, + "num_input_tokens_seen": 288290830, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19213867, + "step": 13358, + "time_per_iteration": 2.904538154602051 + }, + { + "auxiliary_loss_clip": 0.01407522, + "auxiliary_loss_mlp": 0.01034472, + "balance_loss_clip": 1.24705625, + "balance_loss_mlp": 1.01476693, + "epoch": 0.8031865323914024, + "flos": 19655706147840.0, + "grad_norm": 6.785732844229658, + "language_loss": 0.84930623, + "learning_rate": 3.9265379502845143e-07, + "loss": 0.87372613, + "num_input_tokens_seen": 288308865, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19726562, + "step": 13359, + "time_per_iteration": 2.8707432746887207 + }, + { + "auxiliary_loss_clip": 0.01397784, + "auxiliary_loss_mlp": 0.01036128, + "balance_loss_clip": 1.23972654, + "balance_loss_mlp": 1.01734114, + "epoch": 0.8032466556440704, + "flos": 26178189989760.0, + "grad_norm": 2.097757955264157, + "language_loss": 0.74238986, + "learning_rate": 3.924220681368928e-07, + "loss": 0.766729, + "num_input_tokens_seen": 288327325, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.18786621, + "step": 13360, + "time_per_iteration": 2.868642568588257 + }, + { + "auxiliary_loss_clip": 0.01400121, + "auxiliary_loss_mlp": 0.01033981, + "balance_loss_clip": 1.23841929, + "balance_loss_mlp": 1.01592064, + "epoch": 0.8033067788967383, + "flos": 25530924856320.0, + "grad_norm": 1.8195244037567007, + "language_loss": 0.7042132, + "learning_rate": 3.921904022048512e-07, + "loss": 0.72855419, + "num_input_tokens_seen": 288347285, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18078613, + "step": 13361, + "time_per_iteration": 2.8701629638671875 + }, + { + "auxiliary_loss_clip": 0.0142019, + "auxiliary_loss_mlp": 0.01041346, + "balance_loss_clip": 1.25511992, + "balance_loss_mlp": 1.02109241, + "epoch": 0.8033669021494063, + "flos": 24034828429440.0, + "grad_norm": 1.9306990159389623, + "language_loss": 0.70559216, + "learning_rate": 3.919587972411098e-07, + "loss": 0.7302075, + "num_input_tokens_seen": 288367785, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.20251465, + "step": 13362, + "time_per_iteration": 2.8885715007781982 + }, + { + "auxiliary_loss_clip": 0.01435, + "auxiliary_loss_mlp": 0.01045021, + "balance_loss_clip": 1.26640141, + "balance_loss_mlp": 1.02402854, + "epoch": 0.8034270254020742, + "flos": 13595571365760.0, + "grad_norm": 2.273211962661656, + "language_loss": 0.80425048, + "learning_rate": 3.91727253254452e-07, + "loss": 0.82905066, + "num_input_tokens_seen": 288384135, + "router_z_loss_clip": 1.68359375, + "router_z_loss_mlp": 0.20983887, + "step": 13363, + "time_per_iteration": 2.8379573822021484 + }, + { + "auxiliary_loss_clip": 0.01407458, + "auxiliary_loss_mlp": 0.01031242, + "balance_loss_clip": 1.24526346, + "balance_loss_mlp": 1.01166821, + "epoch": 0.8034871486547422, + "flos": 27422851207680.0, + "grad_norm": 2.09731247627203, + "language_loss": 0.75518692, + "learning_rate": 3.9149577025365787e-07, + "loss": 0.77957392, + "num_input_tokens_seen": 288403805, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19567871, + "step": 13364, + "time_per_iteration": 2.9101834297180176 + }, + { + "auxiliary_loss_clip": 0.01402736, + "auxiliary_loss_mlp": 0.01033213, + "balance_loss_clip": 1.24424291, + "balance_loss_mlp": 1.01319742, + "epoch": 0.8035472719074102, + "flos": 32611414544640.0, + "grad_norm": 2.411511613457792, + "language_loss": 0.61450535, + "learning_rate": 3.9126434824750596e-07, + "loss": 0.63886482, + "num_input_tokens_seen": 288424895, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.20019531, + "step": 13365, + "time_per_iteration": 2.919126033782959 + }, + { + "auxiliary_loss_clip": 0.0141585, + "auxiliary_loss_mlp": 0.0103769, + "balance_loss_clip": 1.25221515, + "balance_loss_mlp": 1.01809168, + "epoch": 0.8036073951600782, + "flos": 21297871347840.0, + "grad_norm": 1.7043073744871837, + "language_loss": 0.66899055, + "learning_rate": 3.910329872447706e-07, + "loss": 0.69352603, + "num_input_tokens_seen": 288443865, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19592285, + "step": 13366, + "time_per_iteration": 2.895183801651001 + }, + { + "auxiliary_loss_clip": 0.01392792, + "auxiliary_loss_mlp": 0.01033621, + "balance_loss_clip": 1.23471987, + "balance_loss_mlp": 1.01560879, + "epoch": 0.8036675184127461, + "flos": 18122934170880.0, + "grad_norm": 2.0232066243531284, + "language_loss": 0.75347459, + "learning_rate": 3.908016872542259e-07, + "loss": 0.77773875, + "num_input_tokens_seen": 288461065, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18005371, + "step": 13367, + "time_per_iteration": 2.838040590286255 + }, + { + "auxiliary_loss_clip": 0.01396053, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.2357235, + "balance_loss_mlp": 1.01367247, + "epoch": 0.8037276416654141, + "flos": 26041170176640.0, + "grad_norm": 1.6785514424084573, + "language_loss": 0.74849415, + "learning_rate": 3.905704482846428e-07, + "loss": 0.77278501, + "num_input_tokens_seen": 288481865, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19348145, + "step": 13368, + "time_per_iteration": 2.9056050777435303 + }, + { + "auxiliary_loss_clip": 0.01420491, + "auxiliary_loss_mlp": 0.01030691, + "balance_loss_clip": 1.25623417, + "balance_loss_mlp": 1.01153433, + "epoch": 0.803787764918082, + "flos": 18810177724800.0, + "grad_norm": 1.84992754034051, + "language_loss": 0.71005833, + "learning_rate": 3.90339270344789e-07, + "loss": 0.73457015, + "num_input_tokens_seen": 288499345, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19152832, + "step": 13369, + "time_per_iteration": 2.870034694671631 + }, + { + "auxiliary_loss_clip": 0.0139655, + "auxiliary_loss_mlp": 0.010392, + "balance_loss_clip": 1.23738241, + "balance_loss_mlp": 1.01980495, + "epoch": 0.80384788817075, + "flos": 20234325640320.0, + "grad_norm": 2.6113379166807786, + "language_loss": 0.74184191, + "learning_rate": 3.901081534434312e-07, + "loss": 0.76619947, + "num_input_tokens_seen": 288517660, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19396973, + "step": 13370, + "time_per_iteration": 2.8375585079193115 + }, + { + "auxiliary_loss_clip": 0.01419329, + "auxiliary_loss_mlp": 0.01035162, + "balance_loss_clip": 1.2539084, + "balance_loss_mlp": 1.01672029, + "epoch": 0.8039080114234181, + "flos": 18524510184960.0, + "grad_norm": 4.004737498883886, + "language_loss": 0.88359576, + "learning_rate": 3.898770975893342e-07, + "loss": 0.90814072, + "num_input_tokens_seen": 288534180, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.18432617, + "step": 13371, + "time_per_iteration": 2.86247181892395 + }, + { + "auxiliary_loss_clip": 0.01410706, + "auxiliary_loss_mlp": 0.01032768, + "balance_loss_clip": 1.24493933, + "balance_loss_mlp": 1.01379001, + "epoch": 0.803968134676086, + "flos": 22392663229440.0, + "grad_norm": 2.561506122708692, + "language_loss": 0.75775659, + "learning_rate": 3.89646102791259e-07, + "loss": 0.7821914, + "num_input_tokens_seen": 288553350, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.18994141, + "step": 13372, + "time_per_iteration": 4.3639185428619385 + }, + { + "auxiliary_loss_clip": 0.01402667, + "auxiliary_loss_mlp": 0.01034056, + "balance_loss_clip": 1.24137187, + "balance_loss_mlp": 1.01461315, + "epoch": 0.804028257928754, + "flos": 23853260471040.0, + "grad_norm": 2.1775810626621945, + "language_loss": 0.80139089, + "learning_rate": 3.894151690579646e-07, + "loss": 0.82575816, + "num_input_tokens_seen": 288571325, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19445801, + "step": 13373, + "time_per_iteration": 2.8709256649017334 + }, + { + "auxiliary_loss_clip": 0.01396602, + "auxiliary_loss_mlp": 0.01035528, + "balance_loss_clip": 1.23903036, + "balance_loss_mlp": 1.01748013, + "epoch": 0.8040883811814219, + "flos": 23561349148800.0, + "grad_norm": 1.4792355393855843, + "language_loss": 0.75224125, + "learning_rate": 3.8918429639820815e-07, + "loss": 0.77656257, + "num_input_tokens_seen": 288592100, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.18054199, + "step": 13374, + "time_per_iteration": 2.895918846130371 + }, + { + "auxiliary_loss_clip": 0.01413674, + "auxiliary_loss_mlp": 0.01036571, + "balance_loss_clip": 1.248752, + "balance_loss_mlp": 1.01750886, + "epoch": 0.8041485044340899, + "flos": 19035660401280.0, + "grad_norm": 3.145790817163915, + "language_loss": 0.70417738, + "learning_rate": 3.889534848207452e-07, + "loss": 0.72867978, + "num_input_tokens_seen": 288612305, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19067383, + "step": 13375, + "time_per_iteration": 2.8317067623138428 + }, + { + "auxiliary_loss_clip": 0.01180618, + "auxiliary_loss_mlp": 0.01025227, + "balance_loss_clip": 1.09132934, + "balance_loss_mlp": 1.00205266, + "epoch": 0.8042086276867578, + "flos": 70040039938560.0, + "grad_norm": 0.7248596454600553, + "language_loss": 0.55704474, + "learning_rate": 3.887227343343271e-07, + "loss": 0.57910323, + "num_input_tokens_seen": 288676015, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.23144531, + "step": 13376, + "time_per_iteration": 3.401216506958008 + }, + { + "auxiliary_loss_clip": 0.01414418, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.25132394, + "balance_loss_mlp": 1.01126146, + "epoch": 0.8042687509394258, + "flos": 21882644133120.0, + "grad_norm": 1.6362383386345543, + "language_loss": 0.74345529, + "learning_rate": 3.8849204494770425e-07, + "loss": 0.76790154, + "num_input_tokens_seen": 288696455, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.18933105, + "step": 13377, + "time_per_iteration": 4.335128545761108 + }, + { + "auxiliary_loss_clip": 0.01403893, + "auxiliary_loss_mlp": 0.01033132, + "balance_loss_clip": 1.24230123, + "balance_loss_mlp": 1.01428533, + "epoch": 0.8043288741920938, + "flos": 26626123941120.0, + "grad_norm": 1.824744961102246, + "language_loss": 0.70923626, + "learning_rate": 3.8826141666962567e-07, + "loss": 0.73360646, + "num_input_tokens_seen": 288715560, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18847656, + "step": 13378, + "time_per_iteration": 2.9427356719970703 + }, + { + "auxiliary_loss_clip": 0.01412892, + "auxiliary_loss_mlp": 0.01034287, + "balance_loss_clip": 1.25102913, + "balance_loss_mlp": 1.01460564, + "epoch": 0.8043889974447618, + "flos": 33416964547200.0, + "grad_norm": 1.5260763012474514, + "language_loss": 0.69611555, + "learning_rate": 3.880308495088347e-07, + "loss": 0.72058737, + "num_input_tokens_seen": 288739485, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19689941, + "step": 13379, + "time_per_iteration": 2.9899532794952393 + }, + { + "auxiliary_loss_clip": 0.0141677, + "auxiliary_loss_mlp": 0.01036144, + "balance_loss_clip": 1.24986553, + "balance_loss_mlp": 1.01643848, + "epoch": 0.8044491206974297, + "flos": 20385642810240.0, + "grad_norm": 1.6541637886710243, + "language_loss": 0.76768619, + "learning_rate": 3.8780034347407533e-07, + "loss": 0.79221535, + "num_input_tokens_seen": 288757420, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 0.19702148, + "step": 13380, + "time_per_iteration": 2.8687806129455566 + }, + { + "auxiliary_loss_clip": 0.01401888, + "auxiliary_loss_mlp": 0.01032417, + "balance_loss_clip": 1.24181104, + "balance_loss_mlp": 1.01444054, + "epoch": 0.8045092439500977, + "flos": 23414149255680.0, + "grad_norm": 1.8580536363382052, + "language_loss": 0.70144057, + "learning_rate": 3.875698985740887e-07, + "loss": 0.72578353, + "num_input_tokens_seen": 288775535, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.17980957, + "step": 13381, + "time_per_iteration": 4.260072231292725 + }, + { + "auxiliary_loss_clip": 0.01399153, + "auxiliary_loss_mlp": 0.01034469, + "balance_loss_clip": 1.23931193, + "balance_loss_mlp": 1.0159682, + "epoch": 0.8045693672027656, + "flos": 24107817571200.0, + "grad_norm": 2.2076316758976007, + "language_loss": 0.64842212, + "learning_rate": 3.873395148176135e-07, + "loss": 0.67275834, + "num_input_tokens_seen": 288795035, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18493652, + "step": 13382, + "time_per_iteration": 4.318398714065552 + }, + { + "auxiliary_loss_clip": 0.01397266, + "auxiliary_loss_mlp": 0.01033406, + "balance_loss_clip": 1.23773038, + "balance_loss_mlp": 1.01495266, + "epoch": 0.8046294904554336, + "flos": 27718110645120.0, + "grad_norm": 2.1800360630137163, + "language_loss": 0.76733893, + "learning_rate": 3.8710919221338487e-07, + "loss": 0.79164571, + "num_input_tokens_seen": 288816270, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18444824, + "step": 13383, + "time_per_iteration": 2.9065141677856445 + }, + { + "auxiliary_loss_clip": 0.01403413, + "auxiliary_loss_mlp": 0.01035051, + "balance_loss_clip": 1.24282432, + "balance_loss_mlp": 1.01678848, + "epoch": 0.8046896137081017, + "flos": 24983823006720.0, + "grad_norm": 1.699060748201446, + "language_loss": 0.70425606, + "learning_rate": 3.868789307701381e-07, + "loss": 0.72864074, + "num_input_tokens_seen": 288836050, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18273926, + "step": 13384, + "time_per_iteration": 2.895906448364258 + }, + { + "auxiliary_loss_clip": 0.01413816, + "auxiliary_loss_mlp": 0.01032732, + "balance_loss_clip": 1.24940586, + "balance_loss_mlp": 1.01345611, + "epoch": 0.8047497369607696, + "flos": 17684320648320.0, + "grad_norm": 2.0454217516507676, + "language_loss": 0.80326629, + "learning_rate": 3.8664873049660375e-07, + "loss": 0.82773185, + "num_input_tokens_seen": 288852900, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19287109, + "step": 13385, + "time_per_iteration": 2.867671012878418 + }, + { + "auxiliary_loss_clip": 0.01413446, + "auxiliary_loss_mlp": 0.01038978, + "balance_loss_clip": 1.25112367, + "balance_loss_mlp": 1.01872492, + "epoch": 0.8048098602134376, + "flos": 22392120291840.0, + "grad_norm": 1.6124151516120915, + "language_loss": 0.72837007, + "learning_rate": 3.864185914015108e-07, + "loss": 0.75289428, + "num_input_tokens_seen": 288872625, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.20251465, + "step": 13386, + "time_per_iteration": 2.8528923988342285 + }, + { + "auxiliary_loss_clip": 0.01177859, + "auxiliary_loss_mlp": 0.01020416, + "balance_loss_clip": 1.09060335, + "balance_loss_mlp": 1.0009613, + "epoch": 0.8048699834661055, + "flos": 71233366291200.0, + "grad_norm": 0.660111408875946, + "language_loss": 0.51274914, + "learning_rate": 3.861885134935865e-07, + "loss": 0.53473186, + "num_input_tokens_seen": 288939180, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.19433594, + "step": 13387, + "time_per_iteration": 3.4094936847686768 + }, + { + "auxiliary_loss_clip": 0.01405939, + "auxiliary_loss_mlp": 0.01035011, + "balance_loss_clip": 1.24468327, + "balance_loss_mlp": 1.01525784, + "epoch": 0.8049301067187735, + "flos": 23671013840640.0, + "grad_norm": 1.7715273981272661, + "language_loss": 0.74534595, + "learning_rate": 3.859584967815559e-07, + "loss": 0.76975548, + "num_input_tokens_seen": 288958925, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19750977, + "step": 13388, + "time_per_iteration": 2.8850924968719482 + }, + { + "auxiliary_loss_clip": 0.01406912, + "auxiliary_loss_mlp": 0.01030998, + "balance_loss_clip": 1.24756408, + "balance_loss_mlp": 1.01223445, + "epoch": 0.8049902299714414, + "flos": 24437173605120.0, + "grad_norm": 1.5394425217330887, + "language_loss": 0.72088742, + "learning_rate": 3.857285412741411e-07, + "loss": 0.7452665, + "num_input_tokens_seen": 288980935, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18737793, + "step": 13389, + "time_per_iteration": 2.9136300086975098 + }, + { + "auxiliary_loss_clip": 0.01398889, + "auxiliary_loss_mlp": 0.01038611, + "balance_loss_clip": 1.24092698, + "balance_loss_mlp": 1.01997852, + "epoch": 0.8050503532241094, + "flos": 17501304856320.0, + "grad_norm": 2.742761592439407, + "language_loss": 0.83371258, + "learning_rate": 3.8549864698006097e-07, + "loss": 0.8580876, + "num_input_tokens_seen": 288996780, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.1862793, + "step": 13390, + "time_per_iteration": 2.8573920726776123 + }, + { + "auxiliary_loss_clip": 0.01177475, + "auxiliary_loss_mlp": 0.01022807, + "balance_loss_clip": 1.08930671, + "balance_loss_mlp": 0.99896479, + "epoch": 0.8051104764767774, + "flos": 57685193493120.0, + "grad_norm": 0.7870348976964566, + "language_loss": 0.55487627, + "learning_rate": 3.8526881390803424e-07, + "loss": 0.57687902, + "num_input_tokens_seen": 289057590, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.23828125, + "step": 13391, + "time_per_iteration": 3.2774362564086914 + }, + { + "auxiliary_loss_clip": 0.01391105, + "auxiliary_loss_mlp": 0.010337, + "balance_loss_clip": 1.23493397, + "balance_loss_mlp": 1.01465011, + "epoch": 0.8051705997294454, + "flos": 18012274093440.0, + "grad_norm": 1.4839752610738173, + "language_loss": 0.85194135, + "learning_rate": 3.850390420667762e-07, + "loss": 0.87618941, + "num_input_tokens_seen": 289076285, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.19042969, + "step": 13392, + "time_per_iteration": 2.8301596641540527 + }, + { + "auxiliary_loss_clip": 0.0139934, + "auxiliary_loss_mlp": 0.01034677, + "balance_loss_clip": 1.2377255, + "balance_loss_mlp": 1.01678348, + "epoch": 0.8052307229821133, + "flos": 26409418755840.0, + "grad_norm": 1.376581924699406, + "language_loss": 0.71049362, + "learning_rate": 3.8480933146499914e-07, + "loss": 0.73483384, + "num_input_tokens_seen": 289097585, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.17883301, + "step": 13393, + "time_per_iteration": 2.996227502822876 + }, + { + "auxiliary_loss_clip": 0.01413413, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.25075722, + "balance_loss_mlp": 1.01450419, + "epoch": 0.8052908462347813, + "flos": 21765649783680.0, + "grad_norm": 2.102848144448321, + "language_loss": 0.77329767, + "learning_rate": 3.84579682111414e-07, + "loss": 0.79777873, + "num_input_tokens_seen": 289116890, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.20202637, + "step": 13394, + "time_per_iteration": 2.8571460247039795 + }, + { + "auxiliary_loss_clip": 0.01399391, + "auxiliary_loss_mlp": 0.01035775, + "balance_loss_clip": 1.23942518, + "balance_loss_mlp": 1.01696396, + "epoch": 0.8053509694874492, + "flos": 25451918156160.0, + "grad_norm": 1.6990338924013488, + "language_loss": 0.65545225, + "learning_rate": 3.843500940147304e-07, + "loss": 0.67980391, + "num_input_tokens_seen": 289136670, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18811035, + "step": 13395, + "time_per_iteration": 3.0366392135620117 + }, + { + "auxiliary_loss_clip": 0.01178067, + "auxiliary_loss_mlp": 0.01016177, + "balance_loss_clip": 1.08974624, + "balance_loss_mlp": 0.99824774, + "epoch": 0.8054110927401172, + "flos": 57697907281920.0, + "grad_norm": 0.7573514763798864, + "language_loss": 0.57400811, + "learning_rate": 3.8412056718365206e-07, + "loss": 0.59595048, + "num_input_tokens_seen": 289200150, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.1796875, + "step": 13396, + "time_per_iteration": 3.5205607414245605 + }, + { + "auxiliary_loss_clip": 0.0140063, + "auxiliary_loss_mlp": 0.01037487, + "balance_loss_clip": 1.24034929, + "balance_loss_mlp": 1.01807928, + "epoch": 0.8054712159927853, + "flos": 19284245187840.0, + "grad_norm": 1.6523474499059487, + "language_loss": 0.78283644, + "learning_rate": 3.8389110162688353e-07, + "loss": 0.8072176, + "num_input_tokens_seen": 289218125, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19396973, + "step": 13397, + "time_per_iteration": 2.8544516563415527 + }, + { + "auxiliary_loss_clip": 0.01406363, + "auxiliary_loss_mlp": 0.01030724, + "balance_loss_clip": 1.24623597, + "balance_loss_mlp": 1.01268756, + "epoch": 0.8055313392454532, + "flos": 17976593928960.0, + "grad_norm": 1.5269162296436827, + "language_loss": 0.71035254, + "learning_rate": 3.836616973531266e-07, + "loss": 0.73472345, + "num_input_tokens_seen": 289237115, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18041992, + "step": 13398, + "time_per_iteration": 2.874073028564453 + }, + { + "auxiliary_loss_clip": 0.01402514, + "auxiliary_loss_mlp": 0.01031643, + "balance_loss_clip": 1.24234724, + "balance_loss_mlp": 1.01258111, + "epoch": 0.8055914624981212, + "flos": 13485770939520.0, + "grad_norm": 4.277179931618462, + "language_loss": 0.70165575, + "learning_rate": 3.834323543710805e-07, + "loss": 0.72599733, + "num_input_tokens_seen": 289253635, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19067383, + "step": 13399, + "time_per_iteration": 2.8311643600463867 + }, + { + "auxiliary_loss_clip": 0.01405009, + "auxiliary_loss_mlp": 0.01032616, + "balance_loss_clip": 1.24476457, + "balance_loss_mlp": 1.01409113, + "epoch": 0.8056515857507891, + "flos": 13232616428160.0, + "grad_norm": 2.388711289797942, + "language_loss": 0.73357832, + "learning_rate": 3.8320307268944153e-07, + "loss": 0.7579546, + "num_input_tokens_seen": 289270085, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.18518066, + "step": 13400, + "time_per_iteration": 2.8717217445373535 + }, + { + "auxiliary_loss_clip": 0.0139456, + "auxiliary_loss_mlp": 0.010313, + "balance_loss_clip": 1.2356416, + "balance_loss_mlp": 1.01272678, + "epoch": 0.8057117090034571, + "flos": 23888669166720.0, + "grad_norm": 1.6750622398553707, + "language_loss": 0.6437602, + "learning_rate": 3.829738523169037e-07, + "loss": 0.66801876, + "num_input_tokens_seen": 289289645, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18579102, + "step": 13401, + "time_per_iteration": 2.903275489807129 + }, + { + "auxiliary_loss_clip": 0.01401045, + "auxiliary_loss_mlp": 0.0103563, + "balance_loss_clip": 1.24025702, + "balance_loss_mlp": 1.01766479, + "epoch": 0.805771832256125, + "flos": 21223977310080.0, + "grad_norm": 2.0537003144785007, + "language_loss": 0.85153913, + "learning_rate": 3.8274469326215985e-07, + "loss": 0.87590593, + "num_input_tokens_seen": 289306630, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.1796875, + "step": 13402, + "time_per_iteration": 2.8236422538757324 + }, + { + "auxiliary_loss_clip": 0.01411687, + "auxiliary_loss_mlp": 0.01034031, + "balance_loss_clip": 1.25045991, + "balance_loss_mlp": 1.01458752, + "epoch": 0.805831955508793, + "flos": 17575832321280.0, + "grad_norm": 1.974165997438179, + "language_loss": 0.68876833, + "learning_rate": 3.8251559553389876e-07, + "loss": 0.71322548, + "num_input_tokens_seen": 289324960, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19433594, + "step": 13403, + "time_per_iteration": 2.813140869140625 + }, + { + "auxiliary_loss_clip": 0.01394621, + "auxiliary_loss_mlp": 0.01036229, + "balance_loss_clip": 1.23779941, + "balance_loss_mlp": 1.01739371, + "epoch": 0.805892078761461, + "flos": 26919075893760.0, + "grad_norm": 1.908368806174867, + "language_loss": 0.85583323, + "learning_rate": 3.822865591408084e-07, + "loss": 0.88014162, + "num_input_tokens_seen": 289344980, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.18823242, + "step": 13404, + "time_per_iteration": 2.8958840370178223 + }, + { + "auxiliary_loss_clip": 0.01386982, + "auxiliary_loss_mlp": 0.01031847, + "balance_loss_clip": 1.23119926, + "balance_loss_mlp": 1.01335764, + "epoch": 0.805952202014129, + "flos": 31519608819840.0, + "grad_norm": 3.766745832848424, + "language_loss": 0.71002877, + "learning_rate": 3.820575840915743e-07, + "loss": 0.73421705, + "num_input_tokens_seen": 289367500, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.18469238, + "step": 13405, + "time_per_iteration": 3.001819372177124 + }, + { + "auxiliary_loss_clip": 0.01395809, + "auxiliary_loss_mlp": 0.01034174, + "balance_loss_clip": 1.23836029, + "balance_loss_mlp": 1.01561356, + "epoch": 0.8060123252667969, + "flos": 24400814768640.0, + "grad_norm": 2.32728104341965, + "language_loss": 0.76381093, + "learning_rate": 3.818286703948788e-07, + "loss": 0.78811073, + "num_input_tokens_seen": 289385930, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.18566895, + "step": 13406, + "time_per_iteration": 2.8827598094940186 + }, + { + "auxiliary_loss_clip": 0.01404163, + "auxiliary_loss_mlp": 0.01031186, + "balance_loss_clip": 1.2430861, + "balance_loss_mlp": 1.01238704, + "epoch": 0.8060724485194649, + "flos": 23491119939840.0, + "grad_norm": 1.4433025573017388, + "language_loss": 0.76715004, + "learning_rate": 3.815998180594018e-07, + "loss": 0.79150343, + "num_input_tokens_seen": 289408025, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18798828, + "step": 13407, + "time_per_iteration": 4.370076894760132 + }, + { + "auxiliary_loss_clip": 0.01396391, + "auxiliary_loss_mlp": 0.01035529, + "balance_loss_clip": 1.23763514, + "balance_loss_mlp": 1.01608622, + "epoch": 0.8061325717721328, + "flos": 18633677184000.0, + "grad_norm": 2.0721268400864585, + "language_loss": 0.74708676, + "learning_rate": 3.81371027093822e-07, + "loss": 0.77140594, + "num_input_tokens_seen": 289426575, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19433594, + "step": 13408, + "time_per_iteration": 2.857653856277466 + }, + { + "auxiliary_loss_clip": 0.01391995, + "auxiliary_loss_mlp": 0.01031772, + "balance_loss_clip": 1.23388374, + "balance_loss_mlp": 1.01248431, + "epoch": 0.8061926950248008, + "flos": 23592233364480.0, + "grad_norm": 2.2115415629621915, + "language_loss": 0.71501279, + "learning_rate": 3.8114229750681523e-07, + "loss": 0.73925042, + "num_input_tokens_seen": 289447760, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.19299316, + "step": 13409, + "time_per_iteration": 2.8886425495147705 + }, + { + "auxiliary_loss_clip": 0.01403514, + "auxiliary_loss_mlp": 0.01031988, + "balance_loss_clip": 1.24171436, + "balance_loss_mlp": 1.01261663, + "epoch": 0.8062528182774689, + "flos": 11150163648000.0, + "grad_norm": 2.420248317528739, + "language_loss": 0.77676463, + "learning_rate": 3.809136293070545e-07, + "loss": 0.80111969, + "num_input_tokens_seen": 289463920, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19372559, + "step": 13410, + "time_per_iteration": 2.8205883502960205 + }, + { + "auxiliary_loss_clip": 0.0140103, + "auxiliary_loss_mlp": 0.01034071, + "balance_loss_clip": 1.24196076, + "balance_loss_mlp": 1.01374602, + "epoch": 0.8063129415301368, + "flos": 22357164044160.0, + "grad_norm": 1.7771576195938623, + "language_loss": 0.6957072, + "learning_rate": 3.806850225032117e-07, + "loss": 0.72005826, + "num_input_tokens_seen": 289482635, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.20324707, + "step": 13411, + "time_per_iteration": 2.909684181213379 + }, + { + "auxiliary_loss_clip": 0.01394575, + "auxiliary_loss_mlp": 0.01030172, + "balance_loss_clip": 1.23658526, + "balance_loss_mlp": 1.01101506, + "epoch": 0.8063730647828048, + "flos": 23999103020160.0, + "grad_norm": 1.8186997707670824, + "language_loss": 0.68837917, + "learning_rate": 3.804564771039551e-07, + "loss": 0.7126267, + "num_input_tokens_seen": 289502040, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.19140625, + "step": 13412, + "time_per_iteration": 4.444105386734009 + }, + { + "auxiliary_loss_clip": 0.01411578, + "auxiliary_loss_mlp": 0.01035383, + "balance_loss_clip": 1.24694204, + "balance_loss_mlp": 1.01557016, + "epoch": 0.8064331880354727, + "flos": 21330610600320.0, + "grad_norm": 1.6652889251444154, + "language_loss": 0.82359982, + "learning_rate": 3.8022799311795064e-07, + "loss": 0.84806943, + "num_input_tokens_seen": 289520740, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.19787598, + "step": 13413, + "time_per_iteration": 2.838923454284668 + }, + { + "auxiliary_loss_clip": 0.01413195, + "auxiliary_loss_mlp": 0.01030124, + "balance_loss_clip": 1.25296795, + "balance_loss_mlp": 1.01195621, + "epoch": 0.8064933112881407, + "flos": 19692426942720.0, + "grad_norm": 1.7864748797440038, + "language_loss": 0.85732174, + "learning_rate": 3.7999957055386303e-07, + "loss": 0.88175499, + "num_input_tokens_seen": 289535840, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.1817627, + "step": 13414, + "time_per_iteration": 2.835331678390503 + }, + { + "auxiliary_loss_clip": 0.01400054, + "auxiliary_loss_mlp": 0.01030397, + "balance_loss_clip": 1.24170423, + "balance_loss_mlp": 1.01214647, + "epoch": 0.8065534345408086, + "flos": 19288679178240.0, + "grad_norm": 1.856555948343962, + "language_loss": 0.68126941, + "learning_rate": 3.7977120942035467e-07, + "loss": 0.70557392, + "num_input_tokens_seen": 289555205, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18249512, + "step": 13415, + "time_per_iteration": 2.846843957901001 + }, + { + "auxiliary_loss_clip": 0.01387345, + "auxiliary_loss_mlp": 0.01028241, + "balance_loss_clip": 1.23092055, + "balance_loss_mlp": 1.01019323, + "epoch": 0.8066135577934767, + "flos": 19685911691520.0, + "grad_norm": 1.5312430613754078, + "language_loss": 0.7666108, + "learning_rate": 3.7954290972608383e-07, + "loss": 0.79076672, + "num_input_tokens_seen": 289573000, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.18041992, + "step": 13416, + "time_per_iteration": 4.20928430557251 + }, + { + "auxiliary_loss_clip": 0.01410127, + "auxiliary_loss_mlp": 0.0103046, + "balance_loss_clip": 1.24603999, + "balance_loss_mlp": 1.0117321, + "epoch": 0.8066736810461446, + "flos": 21153838590720.0, + "grad_norm": 1.4524632766069496, + "language_loss": 0.659881, + "learning_rate": 3.793146714797086e-07, + "loss": 0.68428689, + "num_input_tokens_seen": 289592625, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.18725586, + "step": 13417, + "time_per_iteration": 2.867288112640381 + }, + { + "auxiliary_loss_clip": 0.01405622, + "auxiliary_loss_mlp": 0.0102984, + "balance_loss_clip": 1.24253702, + "balance_loss_mlp": 1.01152968, + "epoch": 0.8067338042988126, + "flos": 22607649112320.0, + "grad_norm": 2.0606198133882385, + "language_loss": 0.81405467, + "learning_rate": 3.7908649468988306e-07, + "loss": 0.8384093, + "num_input_tokens_seen": 289610780, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.18322754, + "step": 13418, + "time_per_iteration": 4.231289386749268 + }, + { + "auxiliary_loss_clip": 0.01412803, + "auxiliary_loss_mlp": 0.01029916, + "balance_loss_clip": 1.25128007, + "balance_loss_mlp": 1.01063967, + "epoch": 0.8067939275514805, + "flos": 16517082562560.0, + "grad_norm": 1.4988214139260125, + "language_loss": 0.84761739, + "learning_rate": 3.7885837936526066e-07, + "loss": 0.87204462, + "num_input_tokens_seen": 289628890, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19262695, + "step": 13419, + "time_per_iteration": 2.8087000846862793 + }, + { + "auxiliary_loss_clip": 0.0140044, + "auxiliary_loss_mlp": 0.01030373, + "balance_loss_clip": 1.23788905, + "balance_loss_mlp": 1.01190722, + "epoch": 0.8068540508041485, + "flos": 28551830175360.0, + "grad_norm": 1.5717635802358858, + "language_loss": 0.76504129, + "learning_rate": 3.7863032551449047e-07, + "loss": 0.78934944, + "num_input_tokens_seen": 289647220, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.18444824, + "step": 13420, + "time_per_iteration": 2.917249917984009 + }, + { + "auxiliary_loss_clip": 0.01396025, + "auxiliary_loss_mlp": 0.01030779, + "balance_loss_clip": 1.23767257, + "balance_loss_mlp": 1.01245666, + "epoch": 0.8069141740568164, + "flos": 21662364608640.0, + "grad_norm": 1.7092244522952564, + "language_loss": 0.78796673, + "learning_rate": 3.784023331462207e-07, + "loss": 0.81223476, + "num_input_tokens_seen": 289665800, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18322754, + "step": 13421, + "time_per_iteration": 2.855011224746704 + }, + { + "auxiliary_loss_clip": 0.01412617, + "auxiliary_loss_mlp": 0.0103235, + "balance_loss_clip": 1.25177979, + "balance_loss_mlp": 1.01333582, + "epoch": 0.8069742973094844, + "flos": 17538070896000.0, + "grad_norm": 2.0495907132919964, + "language_loss": 0.8040309, + "learning_rate": 3.78174402269098e-07, + "loss": 0.8284806, + "num_input_tokens_seen": 289682705, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.19006348, + "step": 13422, + "time_per_iteration": 2.852229356765747 + }, + { + "auxiliary_loss_clip": 0.01395802, + "auxiliary_loss_mlp": 0.01032496, + "balance_loss_clip": 1.2371943, + "balance_loss_mlp": 1.01395929, + "epoch": 0.8070344205621525, + "flos": 23377292726400.0, + "grad_norm": 1.7468343822093018, + "language_loss": 0.6895225, + "learning_rate": 3.7794653289176347e-07, + "loss": 0.71380544, + "num_input_tokens_seen": 289702920, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.1854248, + "step": 13423, + "time_per_iteration": 2.840623140335083 + }, + { + "auxiliary_loss_clip": 0.01419077, + "auxiliary_loss_mlp": 0.01038839, + "balance_loss_clip": 1.25435531, + "balance_loss_mlp": 1.01881194, + "epoch": 0.8070945438148204, + "flos": 22940579485440.0, + "grad_norm": 1.762256392215455, + "language_loss": 0.8105579, + "learning_rate": 3.7771872502285904e-07, + "loss": 0.83513713, + "num_input_tokens_seen": 289723280, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.20043945, + "step": 13424, + "time_per_iteration": 2.8937370777130127 + }, + { + "auxiliary_loss_clip": 0.01412924, + "auxiliary_loss_mlp": 0.01028897, + "balance_loss_clip": 1.24881172, + "balance_loss_mlp": 1.00999045, + "epoch": 0.8071546670674884, + "flos": 25311414493440.0, + "grad_norm": 2.321315802714643, + "language_loss": 0.79427475, + "learning_rate": 3.774909786710232e-07, + "loss": 0.81869292, + "num_input_tokens_seen": 289743475, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.18908691, + "step": 13425, + "time_per_iteration": 2.9255423545837402 + }, + { + "auxiliary_loss_clip": 0.01403706, + "auxiliary_loss_mlp": 0.01034302, + "balance_loss_clip": 1.24339509, + "balance_loss_mlp": 1.0156343, + "epoch": 0.8072147903201563, + "flos": 18122753191680.0, + "grad_norm": 2.0971848216966538, + "language_loss": 0.76181436, + "learning_rate": 3.772632938448923e-07, + "loss": 0.78619444, + "num_input_tokens_seen": 289761400, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18676758, + "step": 13426, + "time_per_iteration": 2.800926446914673 + }, + { + "auxiliary_loss_clip": 0.01403854, + "auxiliary_loss_mlp": 0.01032693, + "balance_loss_clip": 1.24378252, + "balance_loss_mlp": 1.01422751, + "epoch": 0.8072749135728243, + "flos": 26699520286080.0, + "grad_norm": 1.7238513280618004, + "language_loss": 0.73577142, + "learning_rate": 3.770356705530997e-07, + "loss": 0.7601369, + "num_input_tokens_seen": 289781025, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18469238, + "step": 13427, + "time_per_iteration": 2.9195337295532227 + }, + { + "auxiliary_loss_clip": 0.01399103, + "auxiliary_loss_mlp": 0.01038449, + "balance_loss_clip": 1.23940539, + "balance_loss_mlp": 1.01887488, + "epoch": 0.8073350368254922, + "flos": 19248474533760.0, + "grad_norm": 2.5037620072777624, + "language_loss": 0.70854712, + "learning_rate": 3.768081088042774e-07, + "loss": 0.73292267, + "num_input_tokens_seen": 289798380, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.19580078, + "step": 13428, + "time_per_iteration": 2.827843427658081 + }, + { + "auxiliary_loss_clip": 0.01402142, + "auxiliary_loss_mlp": 0.01030152, + "balance_loss_clip": 1.24028349, + "balance_loss_mlp": 1.01230669, + "epoch": 0.8073951600781603, + "flos": 13342462099200.0, + "grad_norm": 2.063948696597186, + "language_loss": 0.75970745, + "learning_rate": 3.765806086070544e-07, + "loss": 0.78403044, + "num_input_tokens_seen": 289814515, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.17858887, + "step": 13429, + "time_per_iteration": 2.8270816802978516 + }, + { + "auxiliary_loss_clip": 0.01379577, + "auxiliary_loss_mlp": 0.01032636, + "balance_loss_clip": 1.22508872, + "balance_loss_mlp": 1.01476634, + "epoch": 0.8074552833308282, + "flos": 22862522926080.0, + "grad_norm": 3.0863117260565063, + "language_loss": 0.68205303, + "learning_rate": 3.763531699700568e-07, + "loss": 0.70617515, + "num_input_tokens_seen": 289834315, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.17883301, + "step": 13430, + "time_per_iteration": 2.861515998840332 + }, + { + "auxiliary_loss_clip": 0.01406064, + "auxiliary_loss_mlp": 0.0103287, + "balance_loss_clip": 1.24630642, + "balance_loss_mlp": 1.01395106, + "epoch": 0.8075154065834962, + "flos": 20349057749760.0, + "grad_norm": 1.8807955256633995, + "language_loss": 0.80627155, + "learning_rate": 3.7612579290190994e-07, + "loss": 0.83066094, + "num_input_tokens_seen": 289853770, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18920898, + "step": 13431, + "time_per_iteration": 2.8602702617645264 + }, + { + "auxiliary_loss_clip": 0.01390081, + "auxiliary_loss_mlp": 0.01029611, + "balance_loss_clip": 1.2339052, + "balance_loss_mlp": 1.01022744, + "epoch": 0.8075755298361641, + "flos": 21918052828800.0, + "grad_norm": 2.031051061640121, + "language_loss": 0.81108963, + "learning_rate": 3.7589847741123593e-07, + "loss": 0.83528656, + "num_input_tokens_seen": 289870480, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.19384766, + "step": 13432, + "time_per_iteration": 2.849001169204712 + }, + { + "auxiliary_loss_clip": 0.01422394, + "auxiliary_loss_mlp": 0.01034018, + "balance_loss_clip": 1.2576108, + "balance_loss_mlp": 1.01475358, + "epoch": 0.8076356530888321, + "flos": 15677752677120.0, + "grad_norm": 2.24558034939697, + "language_loss": 0.71517682, + "learning_rate": 3.7567122350665415e-07, + "loss": 0.73974097, + "num_input_tokens_seen": 289888275, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19287109, + "step": 13433, + "time_per_iteration": 2.852055788040161 + }, + { + "auxiliary_loss_clip": 0.01398316, + "auxiliary_loss_mlp": 0.0103271, + "balance_loss_clip": 1.23847163, + "balance_loss_mlp": 1.01363671, + "epoch": 0.8076957763415, + "flos": 37791019411200.0, + "grad_norm": 1.3435480779108975, + "language_loss": 0.72543967, + "learning_rate": 3.754440311967828e-07, + "loss": 0.7497499, + "num_input_tokens_seen": 289911495, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19055176, + "step": 13434, + "time_per_iteration": 2.9942433834075928 + }, + { + "auxiliary_loss_clip": 0.01410553, + "auxiliary_loss_mlp": 0.0103176, + "balance_loss_clip": 1.25133061, + "balance_loss_mlp": 1.0135448, + "epoch": 0.807755899594168, + "flos": 19620523676160.0, + "grad_norm": 2.184675900733435, + "language_loss": 0.68845904, + "learning_rate": 3.752169004902361e-07, + "loss": 0.71288216, + "num_input_tokens_seen": 289930045, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18225098, + "step": 13435, + "time_per_iteration": 2.8255913257598877 + }, + { + "auxiliary_loss_clip": 0.01416443, + "auxiliary_loss_mlp": 0.01034263, + "balance_loss_clip": 1.25368381, + "balance_loss_mlp": 1.01390171, + "epoch": 0.8078160228468361, + "flos": 23305344215040.0, + "grad_norm": 1.5233189888650691, + "language_loss": 0.7551465, + "learning_rate": 3.749898313956279e-07, + "loss": 0.77965355, + "num_input_tokens_seen": 289950815, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.20349121, + "step": 13436, + "time_per_iteration": 2.858335494995117 + }, + { + "auxiliary_loss_clip": 0.01382798, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.22650027, + "balance_loss_mlp": 1.0140698, + "epoch": 0.807876146099504, + "flos": 27174175931520.0, + "grad_norm": 1.6432180504901805, + "language_loss": 0.71060187, + "learning_rate": 3.747628239215674e-07, + "loss": 0.73475164, + "num_input_tokens_seen": 289971730, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.18103027, + "step": 13437, + "time_per_iteration": 2.88662052154541 + }, + { + "auxiliary_loss_clip": 0.01384997, + "auxiliary_loss_mlp": 0.01028084, + "balance_loss_clip": 1.22900522, + "balance_loss_mlp": 1.01005995, + "epoch": 0.807936269352172, + "flos": 27170873061120.0, + "grad_norm": 1.5604907904453087, + "language_loss": 0.73257667, + "learning_rate": 3.745358780766636e-07, + "loss": 0.75670755, + "num_input_tokens_seen": 289992995, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.18017578, + "step": 13438, + "time_per_iteration": 2.9677536487579346 + }, + { + "auxiliary_loss_clip": 0.01399877, + "auxiliary_loss_mlp": 0.0103076, + "balance_loss_clip": 1.24069858, + "balance_loss_mlp": 1.01134086, + "epoch": 0.8079963926048399, + "flos": 20750090826240.0, + "grad_norm": 1.8005601387861576, + "language_loss": 0.77489525, + "learning_rate": 3.7430899386952344e-07, + "loss": 0.79920167, + "num_input_tokens_seen": 290009405, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.1940918, + "step": 13439, + "time_per_iteration": 2.8827760219573975 + }, + { + "auxiliary_loss_clip": 0.01392876, + "auxiliary_loss_mlp": 0.01030652, + "balance_loss_clip": 1.23657906, + "balance_loss_mlp": 1.01210344, + "epoch": 0.8080565158575079, + "flos": 25020543801600.0, + "grad_norm": 1.7471501629102566, + "language_loss": 0.7868281, + "learning_rate": 3.7408217130874786e-07, + "loss": 0.81106341, + "num_input_tokens_seen": 290031085, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.1854248, + "step": 13440, + "time_per_iteration": 2.9174726009368896 + }, + { + "auxiliary_loss_clip": 0.0139757, + "auxiliary_loss_mlp": 0.01031148, + "balance_loss_clip": 1.23653221, + "balance_loss_mlp": 1.01216996, + "epoch": 0.8081166391101758, + "flos": 18707435487360.0, + "grad_norm": 1.6725201238059553, + "language_loss": 0.60161972, + "learning_rate": 3.7385541040293946e-07, + "loss": 0.62590694, + "num_input_tokens_seen": 290048670, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18981934, + "step": 13441, + "time_per_iteration": 2.845165491104126 + }, + { + "auxiliary_loss_clip": 0.01396581, + "auxiliary_loss_mlp": 0.01035666, + "balance_loss_clip": 1.23917246, + "balance_loss_mlp": 1.01580548, + "epoch": 0.8081767623628439, + "flos": 19838179002240.0, + "grad_norm": 2.4444683439216446, + "language_loss": 0.7675755, + "learning_rate": 3.7362871116069684e-07, + "loss": 0.79189801, + "num_input_tokens_seen": 290064085, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.19873047, + "step": 13442, + "time_per_iteration": 4.256820440292358 + }, + { + "auxiliary_loss_clip": 0.01391403, + "auxiliary_loss_mlp": 0.01031045, + "balance_loss_clip": 1.23285604, + "balance_loss_mlp": 1.01274657, + "epoch": 0.8082368856155118, + "flos": 35786623190400.0, + "grad_norm": 1.5338835421849233, + "language_loss": 0.71498811, + "learning_rate": 3.734020735906169e-07, + "loss": 0.73921257, + "num_input_tokens_seen": 290086255, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.1829834, + "step": 13443, + "time_per_iteration": 2.9580256938934326 + }, + { + "auxiliary_loss_clip": 0.01391541, + "auxiliary_loss_mlp": 0.01032395, + "balance_loss_clip": 1.23460186, + "balance_loss_mlp": 1.01431108, + "epoch": 0.8082970088681798, + "flos": 17205909684480.0, + "grad_norm": 1.8009008886486892, + "language_loss": 0.82978928, + "learning_rate": 3.7317549770129286e-07, + "loss": 0.85402864, + "num_input_tokens_seen": 290103995, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.18078613, + "step": 13444, + "time_per_iteration": 2.865795850753784 + }, + { + "auxiliary_loss_clip": 0.01174137, + "auxiliary_loss_mlp": 0.01019104, + "balance_loss_clip": 1.08760107, + "balance_loss_mlp": 0.99850464, + "epoch": 0.8083571321208477, + "flos": 63580392420480.0, + "grad_norm": 0.8272516304923463, + "language_loss": 0.53665751, + "learning_rate": 3.7294898350131754e-07, + "loss": 0.55858994, + "num_input_tokens_seen": 290157245, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.20605469, + "step": 13445, + "time_per_iteration": 3.2157814502716064 + }, + { + "auxiliary_loss_clip": 0.01391211, + "auxiliary_loss_mlp": 0.01034058, + "balance_loss_clip": 1.23282659, + "balance_loss_mlp": 1.01463938, + "epoch": 0.8084172553735157, + "flos": 17939556420480.0, + "grad_norm": 2.0178203616534214, + "language_loss": 0.72440219, + "learning_rate": 3.7272253099927964e-07, + "loss": 0.7486549, + "num_input_tokens_seen": 290174970, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.1940918, + "step": 13446, + "time_per_iteration": 4.304142713546753 + }, + { + "auxiliary_loss_clip": 0.01407064, + "auxiliary_loss_mlp": 0.01034476, + "balance_loss_clip": 1.24586511, + "balance_loss_mlp": 1.01547408, + "epoch": 0.8084773786261836, + "flos": 24108767712000.0, + "grad_norm": 1.7112118015712188, + "language_loss": 0.72015703, + "learning_rate": 3.7249614020376606e-07, + "loss": 0.74457252, + "num_input_tokens_seen": 290194395, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18994141, + "step": 13447, + "time_per_iteration": 2.8976590633392334 + }, + { + "auxiliary_loss_clip": 0.01410539, + "auxiliary_loss_mlp": 0.01035327, + "balance_loss_clip": 1.24756515, + "balance_loss_mlp": 1.01644468, + "epoch": 0.8085375018788516, + "flos": 15594538210560.0, + "grad_norm": 2.183405769692324, + "language_loss": 0.75872105, + "learning_rate": 3.7226981112336197e-07, + "loss": 0.78317976, + "num_input_tokens_seen": 290209200, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.1887207, + "step": 13448, + "time_per_iteration": 2.8139891624450684 + }, + { + "auxiliary_loss_clip": 0.0117512, + "auxiliary_loss_mlp": 0.01022667, + "balance_loss_clip": 1.08901834, + "balance_loss_mlp": 1.0035938, + "epoch": 0.8085976251315197, + "flos": 67595084668800.0, + "grad_norm": 0.7410067512851463, + "language_loss": 0.6386224, + "learning_rate": 3.7204354376665024e-07, + "loss": 0.6606003, + "num_input_tokens_seen": 290274565, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.19042969, + "step": 13449, + "time_per_iteration": 3.3320038318634033 + }, + { + "auxiliary_loss_clip": 0.01394795, + "auxiliary_loss_mlp": 0.01031452, + "balance_loss_clip": 1.23682225, + "balance_loss_mlp": 1.01229537, + "epoch": 0.8086577483841876, + "flos": 22570747338240.0, + "grad_norm": 1.6531788752029546, + "language_loss": 0.74663514, + "learning_rate": 3.718173381422105e-07, + "loss": 0.77089763, + "num_input_tokens_seen": 290293630, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.19152832, + "step": 13450, + "time_per_iteration": 2.9263105392456055 + }, + { + "auxiliary_loss_clip": 0.01403752, + "auxiliary_loss_mlp": 0.0103165, + "balance_loss_clip": 1.24338615, + "balance_loss_mlp": 1.01256454, + "epoch": 0.8087178716368556, + "flos": 17977182111360.0, + "grad_norm": 1.5437321048113068, + "language_loss": 0.74898309, + "learning_rate": 3.7159119425861986e-07, + "loss": 0.77333713, + "num_input_tokens_seen": 290311450, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19091797, + "step": 13451, + "time_per_iteration": 4.340465545654297 + }, + { + "auxiliary_loss_clip": 0.01406669, + "auxiliary_loss_mlp": 0.01033925, + "balance_loss_clip": 1.2430979, + "balance_loss_mlp": 1.01401758, + "epoch": 0.8087779948895235, + "flos": 21727888358400.0, + "grad_norm": 1.7114667337020155, + "language_loss": 0.80888969, + "learning_rate": 3.713651121244543e-07, + "loss": 0.83329558, + "num_input_tokens_seen": 290330165, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.19921875, + "step": 13452, + "time_per_iteration": 4.305746793746948 + }, + { + "auxiliary_loss_clip": 0.01407692, + "auxiliary_loss_mlp": 0.01034812, + "balance_loss_clip": 1.24642503, + "balance_loss_mlp": 1.01656127, + "epoch": 0.8088381181421915, + "flos": 29103999442560.0, + "grad_norm": 1.6044304996884662, + "language_loss": 0.78824681, + "learning_rate": 3.711390917482875e-07, + "loss": 0.8126719, + "num_input_tokens_seen": 290350815, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18249512, + "step": 13453, + "time_per_iteration": 2.9207146167755127 + }, + { + "auxiliary_loss_clip": 0.01402337, + "auxiliary_loss_mlp": 0.01033962, + "balance_loss_clip": 1.24172354, + "balance_loss_mlp": 1.01487637, + "epoch": 0.8088982413948594, + "flos": 22208109114240.0, + "grad_norm": 2.0147043983107427, + "language_loss": 0.77580142, + "learning_rate": 3.709131331386892e-07, + "loss": 0.8001644, + "num_input_tokens_seen": 290367380, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19091797, + "step": 13454, + "time_per_iteration": 2.8465993404388428 + }, + { + "auxiliary_loss_clip": 0.0139017, + "auxiliary_loss_mlp": 0.01032808, + "balance_loss_clip": 1.23253608, + "balance_loss_mlp": 1.01423478, + "epoch": 0.8089583646475275, + "flos": 28048054861440.0, + "grad_norm": 2.5628215284992946, + "language_loss": 0.77683169, + "learning_rate": 3.7068723630422795e-07, + "loss": 0.80106145, + "num_input_tokens_seen": 290387965, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.18591309, + "step": 13455, + "time_per_iteration": 2.983485460281372 + }, + { + "auxiliary_loss_clip": 0.01402375, + "auxiliary_loss_mlp": 0.01031682, + "balance_loss_clip": 1.24126875, + "balance_loss_mlp": 1.01271582, + "epoch": 0.8090184879001954, + "flos": 16626385296000.0, + "grad_norm": 1.7265328103719009, + "language_loss": 0.78946733, + "learning_rate": 3.70461401253471e-07, + "loss": 0.8138079, + "num_input_tokens_seen": 290404150, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.1895752, + "step": 13456, + "time_per_iteration": 2.936126470565796 + }, + { + "auxiliary_loss_clip": 0.01396775, + "auxiliary_loss_mlp": 0.01036171, + "balance_loss_clip": 1.2388258, + "balance_loss_mlp": 1.01703823, + "epoch": 0.8090786111528634, + "flos": 27351264654720.0, + "grad_norm": 2.6741286677832736, + "language_loss": 0.72651482, + "learning_rate": 3.702356279949801e-07, + "loss": 0.75084424, + "num_input_tokens_seen": 290422370, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.19152832, + "step": 13457, + "time_per_iteration": 2.9458792209625244 + }, + { + "auxiliary_loss_clip": 0.01396255, + "auxiliary_loss_mlp": 0.01033543, + "balance_loss_clip": 1.23786497, + "balance_loss_mlp": 1.0149343, + "epoch": 0.8091387344055313, + "flos": 21115624717440.0, + "grad_norm": 1.7721667203755127, + "language_loss": 0.73588699, + "learning_rate": 3.700099165373176e-07, + "loss": 0.76018488, + "num_input_tokens_seen": 290442645, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18603516, + "step": 13458, + "time_per_iteration": 2.8698205947875977 + }, + { + "auxiliary_loss_clip": 0.01397105, + "auxiliary_loss_mlp": 0.01031916, + "balance_loss_clip": 1.23778677, + "balance_loss_mlp": 1.0123415, + "epoch": 0.8091988576581993, + "flos": 11662264005120.0, + "grad_norm": 2.1287545182733334, + "language_loss": 0.80517602, + "learning_rate": 3.6978426688904275e-07, + "loss": 0.82946622, + "num_input_tokens_seen": 290458520, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.19567871, + "step": 13459, + "time_per_iteration": 2.8145265579223633 + }, + { + "auxiliary_loss_clip": 0.01409241, + "auxiliary_loss_mlp": 0.01033741, + "balance_loss_clip": 1.24548149, + "balance_loss_mlp": 1.01453614, + "epoch": 0.8092589809108672, + "flos": 22972866289920.0, + "grad_norm": 2.06712138115123, + "language_loss": 0.80803663, + "learning_rate": 3.695586790587113e-07, + "loss": 0.83246642, + "num_input_tokens_seen": 290474465, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.1920166, + "step": 13460, + "time_per_iteration": 2.843167781829834 + }, + { + "auxiliary_loss_clip": 0.01400691, + "auxiliary_loss_mlp": 0.01033471, + "balance_loss_clip": 1.23851919, + "balance_loss_mlp": 1.01429009, + "epoch": 0.8093191041635353, + "flos": 13268749040640.0, + "grad_norm": 1.630399147939918, + "language_loss": 0.84659678, + "learning_rate": 3.693331530548789e-07, + "loss": 0.87093842, + "num_input_tokens_seen": 290492060, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19152832, + "step": 13461, + "time_per_iteration": 2.8288347721099854 + }, + { + "auxiliary_loss_clip": 0.01427082, + "auxiliary_loss_mlp": 0.01033396, + "balance_loss_clip": 1.26469254, + "balance_loss_mlp": 1.01404786, + "epoch": 0.8093792274162032, + "flos": 25525857438720.0, + "grad_norm": 2.3343039637622183, + "language_loss": 0.76328546, + "learning_rate": 3.69107688886096e-07, + "loss": 0.7878902, + "num_input_tokens_seen": 290511510, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19348145, + "step": 13462, + "time_per_iteration": 2.8809814453125 + }, + { + "auxiliary_loss_clip": 0.01402101, + "auxiliary_loss_mlp": 0.01029801, + "balance_loss_clip": 1.23977041, + "balance_loss_mlp": 1.01029837, + "epoch": 0.8094393506688712, + "flos": 23556010262400.0, + "grad_norm": 1.8638709452131261, + "language_loss": 0.83789152, + "learning_rate": 3.6888228656091357e-07, + "loss": 0.86221051, + "num_input_tokens_seen": 290530035, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19494629, + "step": 13463, + "time_per_iteration": 2.9326932430267334 + }, + { + "auxiliary_loss_clip": 0.01399908, + "auxiliary_loss_mlp": 0.01034375, + "balance_loss_clip": 1.24146128, + "balance_loss_mlp": 1.01553977, + "epoch": 0.8094994739215392, + "flos": 17064772594560.0, + "grad_norm": 1.7739793667129924, + "language_loss": 0.63016355, + "learning_rate": 3.686569460878779e-07, + "loss": 0.65450633, + "num_input_tokens_seen": 290548245, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18859863, + "step": 13464, + "time_per_iteration": 2.907449245452881 + }, + { + "auxiliary_loss_clip": 0.01392434, + "auxiliary_loss_mlp": 0.01029867, + "balance_loss_clip": 1.23524594, + "balance_loss_mlp": 1.01154506, + "epoch": 0.8095595971742071, + "flos": 23561982576000.0, + "grad_norm": 1.44915529881324, + "language_loss": 0.62461627, + "learning_rate": 3.684316674755341e-07, + "loss": 0.64883929, + "num_input_tokens_seen": 290568625, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.18322754, + "step": 13465, + "time_per_iteration": 2.954393148422241 + }, + { + "auxiliary_loss_clip": 0.0140304, + "auxiliary_loss_mlp": 0.01036221, + "balance_loss_clip": 1.24598396, + "balance_loss_mlp": 1.01724279, + "epoch": 0.8096197204268751, + "flos": 20382158960640.0, + "grad_norm": 1.8480124554444806, + "language_loss": 0.83076537, + "learning_rate": 3.682064507324256e-07, + "loss": 0.85515791, + "num_input_tokens_seen": 290586575, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.18969727, + "step": 13466, + "time_per_iteration": 2.8842945098876953 + }, + { + "auxiliary_loss_clip": 0.01410914, + "auxiliary_loss_mlp": 0.01034638, + "balance_loss_clip": 1.24905515, + "balance_loss_mlp": 1.01602888, + "epoch": 0.809679843679543, + "flos": 27830309045760.0, + "grad_norm": 1.7689382894199852, + "language_loss": 0.76922786, + "learning_rate": 3.6798129586709204e-07, + "loss": 0.79368341, + "num_input_tokens_seen": 290606790, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18615723, + "step": 13467, + "time_per_iteration": 2.9189577102661133 + }, + { + "auxiliary_loss_clip": 0.01399078, + "auxiliary_loss_mlp": 0.01032671, + "balance_loss_clip": 1.2388072, + "balance_loss_mlp": 1.01372886, + "epoch": 0.8097399669322111, + "flos": 22023238285440.0, + "grad_norm": 2.443888589279125, + "language_loss": 0.80130643, + "learning_rate": 3.6775620288807073e-07, + "loss": 0.82562387, + "num_input_tokens_seen": 290625525, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18933105, + "step": 13468, + "time_per_iteration": 2.8364083766937256 + }, + { + "auxiliary_loss_clip": 0.01397534, + "auxiliary_loss_mlp": 0.01032415, + "balance_loss_clip": 1.24015021, + "balance_loss_mlp": 1.01430726, + "epoch": 0.809800090184879, + "flos": 18998170444800.0, + "grad_norm": 1.7580657093807892, + "language_loss": 0.68790483, + "learning_rate": 3.675311718038978e-07, + "loss": 0.71220434, + "num_input_tokens_seen": 290644935, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18115234, + "step": 13469, + "time_per_iteration": 2.8162729740142822 + }, + { + "auxiliary_loss_clip": 0.01178165, + "auxiliary_loss_mlp": 0.01023253, + "balance_loss_clip": 1.09203196, + "balance_loss_mlp": 1.00122309, + "epoch": 0.809860213437547, + "flos": 66132315676800.0, + "grad_norm": 0.6967584942257878, + "language_loss": 0.5469597, + "learning_rate": 3.6730620262310683e-07, + "loss": 0.56897384, + "num_input_tokens_seen": 290710735, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.22070312, + "step": 13470, + "time_per_iteration": 3.4479353427886963 + }, + { + "auxiliary_loss_clip": 0.01394841, + "auxiliary_loss_mlp": 0.01034684, + "balance_loss_clip": 1.23712695, + "balance_loss_mlp": 1.01683855, + "epoch": 0.8099203366902149, + "flos": 20891182671360.0, + "grad_norm": 1.7057923027401618, + "language_loss": 0.69693375, + "learning_rate": 3.670812953542279e-07, + "loss": 0.72122902, + "num_input_tokens_seen": 290729565, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.17871094, + "step": 13471, + "time_per_iteration": 2.892167806625366 + }, + { + "auxiliary_loss_clip": 0.01402354, + "auxiliary_loss_mlp": 0.01031147, + "balance_loss_clip": 1.24107051, + "balance_loss_mlp": 1.01231182, + "epoch": 0.8099804599428829, + "flos": 26041984583040.0, + "grad_norm": 2.390512875219101, + "language_loss": 0.80031836, + "learning_rate": 3.6685645000579003e-07, + "loss": 0.82465345, + "num_input_tokens_seen": 290749360, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18847656, + "step": 13472, + "time_per_iteration": 2.9150197505950928 + }, + { + "auxiliary_loss_clip": 0.011809, + "auxiliary_loss_mlp": 0.01025477, + "balance_loss_clip": 1.09441757, + "balance_loss_mlp": 1.00201643, + "epoch": 0.8100405831955508, + "flos": 69335467626240.0, + "grad_norm": 0.7484884530121954, + "language_loss": 0.57821077, + "learning_rate": 3.666316665863201e-07, + "loss": 0.6002745, + "num_input_tokens_seen": 290812145, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.234375, + "step": 13473, + "time_per_iteration": 3.2223007678985596 + }, + { + "auxiliary_loss_clip": 0.01410316, + "auxiliary_loss_mlp": 0.01028804, + "balance_loss_clip": 1.24806726, + "balance_loss_mlp": 1.0100646, + "epoch": 0.8101007064482189, + "flos": 15020759911680.0, + "grad_norm": 1.778275432865064, + "language_loss": 0.75583076, + "learning_rate": 3.664069451043399e-07, + "loss": 0.780222, + "num_input_tokens_seen": 290829845, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.1875, + "step": 13474, + "time_per_iteration": 2.8070240020751953 + }, + { + "auxiliary_loss_clip": 0.0141126, + "auxiliary_loss_mlp": 0.01037863, + "balance_loss_clip": 1.24860883, + "balance_loss_mlp": 1.01903987, + "epoch": 0.8101608297008868, + "flos": 21076551192960.0, + "grad_norm": 1.9408270906159197, + "language_loss": 0.79078841, + "learning_rate": 3.661822855683723e-07, + "loss": 0.8152796, + "num_input_tokens_seen": 290848815, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18823242, + "step": 13475, + "time_per_iteration": 2.8977417945861816 + }, + { + "auxiliary_loss_clip": 0.01389287, + "auxiliary_loss_mlp": 0.01032614, + "balance_loss_clip": 1.23308361, + "balance_loss_mlp": 1.01389766, + "epoch": 0.8102209529535548, + "flos": 23741469273600.0, + "grad_norm": 1.7942690540795605, + "language_loss": 0.75947475, + "learning_rate": 3.659576879869364e-07, + "loss": 0.78369373, + "num_input_tokens_seen": 290868580, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.18713379, + "step": 13476, + "time_per_iteration": 2.869100332260132 + }, + { + "auxiliary_loss_clip": 0.01414755, + "auxiliary_loss_mlp": 0.01039312, + "balance_loss_clip": 1.24997663, + "balance_loss_mlp": 1.01991653, + "epoch": 0.8102810762062228, + "flos": 10961356521600.0, + "grad_norm": 7.7721040218832265, + "language_loss": 0.75003046, + "learning_rate": 3.657331523685485e-07, + "loss": 0.77457112, + "num_input_tokens_seen": 290883540, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19384766, + "step": 13477, + "time_per_iteration": 4.328181266784668 + }, + { + "auxiliary_loss_clip": 0.01393006, + "auxiliary_loss_mlp": 0.01032412, + "balance_loss_clip": 1.23454666, + "balance_loss_mlp": 1.01401854, + "epoch": 0.8103411994588907, + "flos": 14657307281280.0, + "grad_norm": 2.11863210499417, + "language_loss": 0.70381033, + "learning_rate": 3.6550867872172365e-07, + "loss": 0.72806448, + "num_input_tokens_seen": 290901560, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18395996, + "step": 13478, + "time_per_iteration": 2.860466241836548 + }, + { + "auxiliary_loss_clip": 0.0118039, + "auxiliary_loss_mlp": 0.01022524, + "balance_loss_clip": 1.09372914, + "balance_loss_mlp": 1.00173366, + "epoch": 0.8104013227115587, + "flos": 59180701737600.0, + "grad_norm": 0.6836770516720002, + "language_loss": 0.52149808, + "learning_rate": 3.6528426705497293e-07, + "loss": 0.54352725, + "num_input_tokens_seen": 290959185, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.20800781, + "step": 13479, + "time_per_iteration": 3.291825294494629 + }, + { + "auxiliary_loss_clip": 0.01408573, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.24887002, + "balance_loss_mlp": 1.01417565, + "epoch": 0.8104614459642266, + "flos": 19838224247040.0, + "grad_norm": 1.5617914474173218, + "language_loss": 0.71860874, + "learning_rate": 3.650599173768072e-07, + "loss": 0.74302459, + "num_input_tokens_seen": 290979585, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18847656, + "step": 13480, + "time_per_iteration": 2.905086040496826 + }, + { + "auxiliary_loss_clip": 0.01411177, + "auxiliary_loss_mlp": 0.01034661, + "balance_loss_clip": 1.24888122, + "balance_loss_mlp": 1.01561129, + "epoch": 0.8105215692168947, + "flos": 25385172796800.0, + "grad_norm": 1.8330618593781554, + "language_loss": 0.80463707, + "learning_rate": 3.648356296957327e-07, + "loss": 0.82909548, + "num_input_tokens_seen": 291000865, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19067383, + "step": 13481, + "time_per_iteration": 4.388688325881958 + }, + { + "auxiliary_loss_clip": 0.01397843, + "auxiliary_loss_mlp": 0.01031003, + "balance_loss_clip": 1.23775244, + "balance_loss_mlp": 1.01163137, + "epoch": 0.8105816924695626, + "flos": 20490466308480.0, + "grad_norm": 2.163503746868634, + "language_loss": 0.74123478, + "learning_rate": 3.646114040202548e-07, + "loss": 0.76552331, + "num_input_tokens_seen": 291018285, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19360352, + "step": 13482, + "time_per_iteration": 2.875131368637085 + }, + { + "auxiliary_loss_clip": 0.01389642, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.22962558, + "balance_loss_mlp": 1.01135719, + "epoch": 0.8106418157222306, + "flos": 14546963917440.0, + "grad_norm": 2.1120403161941885, + "language_loss": 0.65736389, + "learning_rate": 3.6438724035887705e-07, + "loss": 0.68156523, + "num_input_tokens_seen": 291035745, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19128418, + "step": 13483, + "time_per_iteration": 2.8489186763763428 + }, + { + "auxiliary_loss_clip": 0.01387617, + "auxiliary_loss_mlp": 0.01032548, + "balance_loss_clip": 1.23010445, + "balance_loss_mlp": 1.01284313, + "epoch": 0.8107019389748985, + "flos": 22574547901440.0, + "grad_norm": 2.0622138686317077, + "language_loss": 0.77287006, + "learning_rate": 3.641631387200992e-07, + "loss": 0.79707164, + "num_input_tokens_seen": 291053280, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.19702148, + "step": 13484, + "time_per_iteration": 2.8588461875915527 + }, + { + "auxiliary_loss_clip": 0.01431082, + "auxiliary_loss_mlp": 0.01034092, + "balance_loss_clip": 1.2646066, + "balance_loss_mlp": 1.0146246, + "epoch": 0.8107620622275665, + "flos": 19618985352960.0, + "grad_norm": 1.6977798272517879, + "language_loss": 0.72568345, + "learning_rate": 3.639390991124183e-07, + "loss": 0.75033516, + "num_input_tokens_seen": 291072855, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.19470215, + "step": 13485, + "time_per_iteration": 2.927285671234131 + }, + { + "auxiliary_loss_clip": 0.01386486, + "auxiliary_loss_mlp": 0.01028801, + "balance_loss_clip": 1.23151326, + "balance_loss_mlp": 1.01040721, + "epoch": 0.8108221854802344, + "flos": 16152046364160.0, + "grad_norm": 2.33374652513425, + "language_loss": 0.76655138, + "learning_rate": 3.637151215443308e-07, + "loss": 0.79070425, + "num_input_tokens_seen": 291090285, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.18383789, + "step": 13486, + "time_per_iteration": 4.2570788860321045 + }, + { + "auxiliary_loss_clip": 0.01426312, + "auxiliary_loss_mlp": 0.01031071, + "balance_loss_clip": 1.26047564, + "balance_loss_mlp": 1.01218843, + "epoch": 0.8108823087329025, + "flos": 21116212899840.0, + "grad_norm": 2.8913080062013714, + "language_loss": 0.72489178, + "learning_rate": 3.6349120602433045e-07, + "loss": 0.74946564, + "num_input_tokens_seen": 291107675, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.18884277, + "step": 13487, + "time_per_iteration": 4.246339559555054 + }, + { + "auxiliary_loss_clip": 0.01396277, + "auxiliary_loss_mlp": 0.01033745, + "balance_loss_clip": 1.24049568, + "balance_loss_mlp": 1.01487374, + "epoch": 0.8109424319855704, + "flos": 29210089795200.0, + "grad_norm": 1.947805547147092, + "language_loss": 0.84972107, + "learning_rate": 3.6326735256090715e-07, + "loss": 0.87402129, + "num_input_tokens_seen": 291126900, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.18884277, + "step": 13488, + "time_per_iteration": 2.899254322052002 + }, + { + "auxiliary_loss_clip": 0.01406675, + "auxiliary_loss_mlp": 0.01030756, + "balance_loss_clip": 1.24568844, + "balance_loss_mlp": 1.01239824, + "epoch": 0.8110025552382384, + "flos": 23122102199040.0, + "grad_norm": 1.7468792126379293, + "language_loss": 0.75008273, + "learning_rate": 3.630435611625502e-07, + "loss": 0.77445704, + "num_input_tokens_seen": 291145285, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18359375, + "step": 13489, + "time_per_iteration": 2.847289562225342 + }, + { + "auxiliary_loss_clip": 0.01388591, + "auxiliary_loss_mlp": 0.01035654, + "balance_loss_clip": 1.23300743, + "balance_loss_mlp": 1.0167594, + "epoch": 0.8110626784909064, + "flos": 22389450848640.0, + "grad_norm": 1.8397913676858275, + "language_loss": 0.72508311, + "learning_rate": 3.628198318377453e-07, + "loss": 0.74932557, + "num_input_tokens_seen": 291163485, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.18896484, + "step": 13490, + "time_per_iteration": 2.897650957107544 + }, + { + "auxiliary_loss_clip": 0.01415898, + "auxiliary_loss_mlp": 0.01040963, + "balance_loss_clip": 1.25347757, + "balance_loss_mlp": 1.02074552, + "epoch": 0.8111228017435743, + "flos": 23378559580800.0, + "grad_norm": 2.192671840505119, + "language_loss": 0.72671551, + "learning_rate": 3.625961645949762e-07, + "loss": 0.75128412, + "num_input_tokens_seen": 291182215, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.20214844, + "step": 13491, + "time_per_iteration": 2.8672280311584473 + }, + { + "auxiliary_loss_clip": 0.01392519, + "auxiliary_loss_mlp": 0.01032714, + "balance_loss_clip": 1.23382807, + "balance_loss_mlp": 1.01360464, + "epoch": 0.8111829249962423, + "flos": 21296061555840.0, + "grad_norm": 1.3452202847585073, + "language_loss": 0.6821835, + "learning_rate": 3.623725594427245e-07, + "loss": 0.7064358, + "num_input_tokens_seen": 291203145, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.19104004, + "step": 13492, + "time_per_iteration": 2.9359896183013916 + }, + { + "auxiliary_loss_clip": 0.01397894, + "auxiliary_loss_mlp": 0.01029345, + "balance_loss_clip": 1.2372793, + "balance_loss_mlp": 1.01000905, + "epoch": 0.8112430482489102, + "flos": 22355580476160.0, + "grad_norm": 1.678052942179456, + "language_loss": 0.72720635, + "learning_rate": 3.6214901638947006e-07, + "loss": 0.75147867, + "num_input_tokens_seen": 291220600, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.1932373, + "step": 13493, + "time_per_iteration": 2.8839704990386963 + }, + { + "auxiliary_loss_clip": 0.01403098, + "auxiliary_loss_mlp": 0.01034695, + "balance_loss_clip": 1.24274445, + "balance_loss_mlp": 1.01539505, + "epoch": 0.8113031715015783, + "flos": 31150545834240.0, + "grad_norm": 1.7168049747418779, + "language_loss": 0.71684796, + "learning_rate": 3.619255354436885e-07, + "loss": 0.74122596, + "num_input_tokens_seen": 291241195, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19299316, + "step": 13494, + "time_per_iteration": 2.9632768630981445 + }, + { + "auxiliary_loss_clip": 0.01413005, + "auxiliary_loss_mlp": 0.01038539, + "balance_loss_clip": 1.24896169, + "balance_loss_mlp": 1.01828527, + "epoch": 0.8113632947542462, + "flos": 25345737313920.0, + "grad_norm": 1.9418700433829768, + "language_loss": 0.76929957, + "learning_rate": 3.6170211661385543e-07, + "loss": 0.79381502, + "num_input_tokens_seen": 291258715, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.20263672, + "step": 13495, + "time_per_iteration": 2.8555891513824463 + }, + { + "auxiliary_loss_clip": 0.01408341, + "auxiliary_loss_mlp": 0.01037582, + "balance_loss_clip": 1.24595952, + "balance_loss_mlp": 1.01816297, + "epoch": 0.8114234180069142, + "flos": 28450897729920.0, + "grad_norm": 2.2689920798579335, + "language_loss": 0.80666351, + "learning_rate": 3.614787599084417e-07, + "loss": 0.83112282, + "num_input_tokens_seen": 291278030, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19433594, + "step": 13496, + "time_per_iteration": 2.881586790084839 + }, + { + "auxiliary_loss_clip": 0.01405028, + "auxiliary_loss_mlp": 0.01029596, + "balance_loss_clip": 1.24503183, + "balance_loss_mlp": 1.01046252, + "epoch": 0.8114835412595821, + "flos": 20348379077760.0, + "grad_norm": 1.525812396407154, + "language_loss": 0.7189908, + "learning_rate": 3.6125546533591787e-07, + "loss": 0.74333709, + "num_input_tokens_seen": 291296740, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.19140625, + "step": 13497, + "time_per_iteration": 2.8347280025482178 + }, + { + "auxiliary_loss_clip": 0.01416153, + "auxiliary_loss_mlp": 0.01035095, + "balance_loss_clip": 1.25307035, + "balance_loss_mlp": 1.0164628, + "epoch": 0.8115436645122501, + "flos": 22500789598080.0, + "grad_norm": 1.5048499581551866, + "language_loss": 0.77281737, + "learning_rate": 3.610322329047508e-07, + "loss": 0.7973299, + "num_input_tokens_seen": 291318730, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.1862793, + "step": 13498, + "time_per_iteration": 2.9008307456970215 + }, + { + "auxiliary_loss_clip": 0.01393321, + "auxiliary_loss_mlp": 0.01035428, + "balance_loss_clip": 1.2333827, + "balance_loss_mlp": 1.01637816, + "epoch": 0.811603787764918, + "flos": 13853340846720.0, + "grad_norm": 1.811891277669646, + "language_loss": 0.84690154, + "learning_rate": 3.608090626234055e-07, + "loss": 0.87118906, + "num_input_tokens_seen": 291336755, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.19042969, + "step": 13499, + "time_per_iteration": 2.827768087387085 + }, + { + "auxiliary_loss_clip": 0.01399868, + "auxiliary_loss_mlp": 0.0103214, + "balance_loss_clip": 1.24043703, + "balance_loss_mlp": 1.01220846, + "epoch": 0.8116639110175861, + "flos": 21624150735360.0, + "grad_norm": 1.3912354954799457, + "language_loss": 0.76919818, + "learning_rate": 3.6058595450034603e-07, + "loss": 0.79351819, + "num_input_tokens_seen": 291356795, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19934082, + "step": 13500, + "time_per_iteration": 2.825660228729248 + }, + { + "auxiliary_loss_clip": 0.01178126, + "auxiliary_loss_mlp": 0.01019484, + "balance_loss_clip": 1.09238482, + "balance_loss_mlp": 0.99993384, + "epoch": 0.811724034270254, + "flos": 64492729430400.0, + "grad_norm": 0.8026169566292045, + "language_loss": 0.5999583, + "learning_rate": 3.603629085440303e-07, + "loss": 0.62193441, + "num_input_tokens_seen": 291416005, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.1953125, + "step": 13501, + "time_per_iteration": 3.4107296466827393 + }, + { + "auxiliary_loss_clip": 0.01382712, + "auxiliary_loss_mlp": 0.01029999, + "balance_loss_clip": 1.22846603, + "balance_loss_mlp": 1.01141465, + "epoch": 0.811784157522922, + "flos": 24764900826240.0, + "grad_norm": 1.6743180323355924, + "language_loss": 0.79802608, + "learning_rate": 3.6013992476291753e-07, + "loss": 0.82215315, + "num_input_tokens_seen": 291434870, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.18579102, + "step": 13502, + "time_per_iteration": 2.9191954135894775 + }, + { + "auxiliary_loss_clip": 0.01391643, + "auxiliary_loss_mlp": 0.01034578, + "balance_loss_clip": 1.23425317, + "balance_loss_mlp": 1.01605332, + "epoch": 0.81184428077559, + "flos": 12174816810240.0, + "grad_norm": 1.7881579827195597, + "language_loss": 0.72144854, + "learning_rate": 3.599170031654635e-07, + "loss": 0.74571073, + "num_input_tokens_seen": 291452230, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.18518066, + "step": 13503, + "time_per_iteration": 2.8148205280303955 + }, + { + "auxiliary_loss_clip": 0.01393295, + "auxiliary_loss_mlp": 0.01029824, + "balance_loss_clip": 1.23299289, + "balance_loss_mlp": 1.01054764, + "epoch": 0.8119044040282579, + "flos": 44438460687360.0, + "grad_norm": 1.4717057571244003, + "language_loss": 0.68397236, + "learning_rate": 3.5969414376012065e-07, + "loss": 0.70820349, + "num_input_tokens_seen": 291477425, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.19299316, + "step": 13504, + "time_per_iteration": 3.081800699234009 + }, + { + "auxiliary_loss_clip": 0.01399839, + "auxiliary_loss_mlp": 0.01032063, + "balance_loss_clip": 1.23771286, + "balance_loss_mlp": 1.01270294, + "epoch": 0.8119645272809259, + "flos": 52173545166720.0, + "grad_norm": 2.9173700377815783, + "language_loss": 0.74783909, + "learning_rate": 3.594713465553403e-07, + "loss": 0.77215803, + "num_input_tokens_seen": 291501070, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19360352, + "step": 13505, + "time_per_iteration": 3.146167755126953 + }, + { + "auxiliary_loss_clip": 0.01402245, + "auxiliary_loss_mlp": 0.01031461, + "balance_loss_clip": 1.24235678, + "balance_loss_mlp": 1.01225662, + "epoch": 0.8120246505335939, + "flos": 30246732829440.0, + "grad_norm": 2.7580390856095347, + "language_loss": 0.73336411, + "learning_rate": 3.5924861155957123e-07, + "loss": 0.75770116, + "num_input_tokens_seen": 291524945, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.19213867, + "step": 13506, + "time_per_iteration": 3.004589080810547 + }, + { + "auxiliary_loss_clip": 0.01417786, + "auxiliary_loss_mlp": 0.01036972, + "balance_loss_clip": 1.25182283, + "balance_loss_mlp": 1.01774395, + "epoch": 0.8120847737862619, + "flos": 22137879905280.0, + "grad_norm": 2.4767184757549647, + "language_loss": 0.77827179, + "learning_rate": 3.590259387812593e-07, + "loss": 0.80281949, + "num_input_tokens_seen": 291544605, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.19238281, + "step": 13507, + "time_per_iteration": 2.8584558963775635 + }, + { + "auxiliary_loss_clip": 0.01418503, + "auxiliary_loss_mlp": 0.01030852, + "balance_loss_clip": 1.25210333, + "balance_loss_mlp": 1.01113486, + "epoch": 0.8121448970389298, + "flos": 23305706173440.0, + "grad_norm": 5.014580703257244, + "language_loss": 0.71337032, + "learning_rate": 3.5880332822884783e-07, + "loss": 0.73786384, + "num_input_tokens_seen": 291563850, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.19714355, + "step": 13508, + "time_per_iteration": 2.8471953868865967 + }, + { + "auxiliary_loss_clip": 0.01393624, + "auxiliary_loss_mlp": 0.01032514, + "balance_loss_clip": 1.2349, + "balance_loss_mlp": 1.01354778, + "epoch": 0.8122050202915978, + "flos": 22174374476160.0, + "grad_norm": 19.045173736050675, + "language_loss": 0.76612306, + "learning_rate": 3.585807799107785e-07, + "loss": 0.79038441, + "num_input_tokens_seen": 291581730, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18981934, + "step": 13509, + "time_per_iteration": 2.8435637950897217 + }, + { + "auxiliary_loss_clip": 0.01410409, + "auxiliary_loss_mlp": 0.01033067, + "balance_loss_clip": 1.24751461, + "balance_loss_mlp": 1.01383829, + "epoch": 0.8122651435442657, + "flos": 23269528316160.0, + "grad_norm": 2.3691179832243416, + "language_loss": 0.77860075, + "learning_rate": 3.58358293835491e-07, + "loss": 0.8030355, + "num_input_tokens_seen": 291601225, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19213867, + "step": 13510, + "time_per_iteration": 2.91888689994812 + }, + { + "auxiliary_loss_clip": 0.01416841, + "auxiliary_loss_mlp": 0.01033886, + "balance_loss_clip": 1.25255942, + "balance_loss_mlp": 1.01439548, + "epoch": 0.8123252667969337, + "flos": 16147657618560.0, + "grad_norm": 2.056496193666657, + "language_loss": 0.70399904, + "learning_rate": 3.581358700114212e-07, + "loss": 0.72850633, + "num_input_tokens_seen": 291616995, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19494629, + "step": 13511, + "time_per_iteration": 2.8182337284088135 + }, + { + "auxiliary_loss_clip": 0.01408433, + "auxiliary_loss_mlp": 0.01033878, + "balance_loss_clip": 1.24589694, + "balance_loss_mlp": 1.01455426, + "epoch": 0.8123853900496016, + "flos": 21253685160960.0, + "grad_norm": 2.2762200634618153, + "language_loss": 0.79794562, + "learning_rate": 3.57913508447004e-07, + "loss": 0.82236874, + "num_input_tokens_seen": 291636145, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19311523, + "step": 13512, + "time_per_iteration": 4.245841026306152 + }, + { + "auxiliary_loss_clip": 0.0140241, + "auxiliary_loss_mlp": 0.01031691, + "balance_loss_clip": 1.24207425, + "balance_loss_mlp": 1.01339257, + "epoch": 0.8124455133022697, + "flos": 64398638229120.0, + "grad_norm": 1.690663887340754, + "language_loss": 0.64406532, + "learning_rate": 3.5769120915067076e-07, + "loss": 0.66840637, + "num_input_tokens_seen": 291662440, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18310547, + "step": 13513, + "time_per_iteration": 3.2920007705688477 + }, + { + "auxiliary_loss_clip": 0.01411044, + "auxiliary_loss_mlp": 0.01032914, + "balance_loss_clip": 1.24816287, + "balance_loss_mlp": 1.01359022, + "epoch": 0.8125056365549376, + "flos": 23852808023040.0, + "grad_norm": 1.676351556656475, + "language_loss": 0.72163451, + "learning_rate": 3.5746897213085194e-07, + "loss": 0.74607408, + "num_input_tokens_seen": 291680950, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19311523, + "step": 13514, + "time_per_iteration": 2.861069917678833 + }, + { + "auxiliary_loss_clip": 0.01397489, + "auxiliary_loss_mlp": 0.01032363, + "balance_loss_clip": 1.23965168, + "balance_loss_mlp": 1.01359916, + "epoch": 0.8125657598076056, + "flos": 23560851456000.0, + "grad_norm": 2.195099394732198, + "language_loss": 0.63877189, + "learning_rate": 3.5724679739597364e-07, + "loss": 0.66307044, + "num_input_tokens_seen": 291702395, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18762207, + "step": 13515, + "time_per_iteration": 2.917102336883545 + }, + { + "auxiliary_loss_clip": 0.01370875, + "auxiliary_loss_mlp": 0.01029785, + "balance_loss_clip": 1.21892405, + "balance_loss_mlp": 1.01120031, + "epoch": 0.8126258830602736, + "flos": 20713958213760.0, + "grad_norm": 1.4684457395372836, + "language_loss": 0.7569828, + "learning_rate": 3.570246849544616e-07, + "loss": 0.78098941, + "num_input_tokens_seen": 291721135, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.18591309, + "step": 13516, + "time_per_iteration": 2.8092081546783447 + }, + { + "auxiliary_loss_clip": 0.0141482, + "auxiliary_loss_mlp": 0.01036833, + "balance_loss_clip": 1.25246429, + "balance_loss_mlp": 1.01702082, + "epoch": 0.8126860063129415, + "flos": 23627370591360.0, + "grad_norm": 1.4521761860822036, + "language_loss": 0.92041659, + "learning_rate": 3.5680263481473907e-07, + "loss": 0.94493306, + "num_input_tokens_seen": 291741235, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19812012, + "step": 13517, + "time_per_iteration": 4.2871012687683105 + }, + { + "auxiliary_loss_clip": 0.01410051, + "auxiliary_loss_mlp": 0.01039691, + "balance_loss_clip": 1.25011039, + "balance_loss_mlp": 1.02027178, + "epoch": 0.8127461295656095, + "flos": 25017376665600.0, + "grad_norm": 1.3625454617075954, + "language_loss": 0.79225785, + "learning_rate": 3.565806469852244e-07, + "loss": 0.81675524, + "num_input_tokens_seen": 291761430, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.19421387, + "step": 13518, + "time_per_iteration": 2.888892889022827 + }, + { + "auxiliary_loss_clip": 0.0140407, + "auxiliary_loss_mlp": 0.01034838, + "balance_loss_clip": 1.24506402, + "balance_loss_mlp": 1.01682556, + "epoch": 0.8128062528182775, + "flos": 27352983957120.0, + "grad_norm": 1.8822185888809235, + "language_loss": 0.79822671, + "learning_rate": 3.56358721474336e-07, + "loss": 0.82261574, + "num_input_tokens_seen": 291781755, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18005371, + "step": 13519, + "time_per_iteration": 2.9129831790924072 + }, + { + "auxiliary_loss_clip": 0.01398401, + "auxiliary_loss_mlp": 0.01036891, + "balance_loss_clip": 1.23722494, + "balance_loss_mlp": 1.01844943, + "epoch": 0.8128663760709455, + "flos": 26517771348480.0, + "grad_norm": 1.554729094038506, + "language_loss": 0.70869911, + "learning_rate": 3.561368582904905e-07, + "loss": 0.73305202, + "num_input_tokens_seen": 291804410, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.18444824, + "step": 13520, + "time_per_iteration": 2.94035005569458 + }, + { + "auxiliary_loss_clip": 0.01405209, + "auxiliary_loss_mlp": 0.01030752, + "balance_loss_clip": 1.24294662, + "balance_loss_mlp": 1.01211953, + "epoch": 0.8129264993236134, + "flos": 17940235092480.0, + "grad_norm": 2.2056915212875836, + "language_loss": 0.73329836, + "learning_rate": 3.5591505744209925e-07, + "loss": 0.75765795, + "num_input_tokens_seen": 291823285, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18640137, + "step": 13521, + "time_per_iteration": 4.249757289886475 + }, + { + "auxiliary_loss_clip": 0.01405277, + "auxiliary_loss_mlp": 0.01032615, + "balance_loss_clip": 1.24350035, + "balance_loss_mlp": 1.01295781, + "epoch": 0.8129866225762814, + "flos": 26189003496960.0, + "grad_norm": 2.9470438122567457, + "language_loss": 0.71075451, + "learning_rate": 3.5569331893757394e-07, + "loss": 0.73513341, + "num_input_tokens_seen": 291845305, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19641113, + "step": 13522, + "time_per_iteration": 2.97939133644104 + }, + { + "auxiliary_loss_clip": 0.01391753, + "auxiliary_loss_mlp": 0.01032365, + "balance_loss_clip": 1.23535109, + "balance_loss_mlp": 1.01413798, + "epoch": 0.8130467458289493, + "flos": 21041911658880.0, + "grad_norm": 1.5984336671436858, + "language_loss": 0.71099138, + "learning_rate": 3.554716427853233e-07, + "loss": 0.73523259, + "num_input_tokens_seen": 291863715, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.18225098, + "step": 13523, + "time_per_iteration": 4.290147066116333 + }, + { + "auxiliary_loss_clip": 0.01395383, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.23711014, + "balance_loss_mlp": 1.01398015, + "epoch": 0.8131068690816173, + "flos": 15495506046720.0, + "grad_norm": 2.0694671035541887, + "language_loss": 0.72241986, + "learning_rate": 3.5525002899375256e-07, + "loss": 0.74671316, + "num_input_tokens_seen": 291880735, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.1998291, + "step": 13524, + "time_per_iteration": 2.8383266925811768 + }, + { + "auxiliary_loss_clip": 0.01389356, + "auxiliary_loss_mlp": 0.01031221, + "balance_loss_clip": 1.23142838, + "balance_loss_mlp": 1.01260018, + "epoch": 0.8131669923342852, + "flos": 29363171512320.0, + "grad_norm": 1.8033140219625157, + "language_loss": 0.63730812, + "learning_rate": 3.550284775712653e-07, + "loss": 0.66151381, + "num_input_tokens_seen": 291900535, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.1862793, + "step": 13525, + "time_per_iteration": 2.9527149200439453 + }, + { + "auxiliary_loss_clip": 0.01403223, + "auxiliary_loss_mlp": 0.01034239, + "balance_loss_clip": 1.24391043, + "balance_loss_mlp": 1.01474857, + "epoch": 0.8132271155869533, + "flos": 35268143316480.0, + "grad_norm": 1.5629172108576377, + "language_loss": 0.6585139, + "learning_rate": 3.548069885262628e-07, + "loss": 0.68288851, + "num_input_tokens_seen": 291919760, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19494629, + "step": 13526, + "time_per_iteration": 2.956932306289673 + }, + { + "auxiliary_loss_clip": 0.01390152, + "auxiliary_loss_mlp": 0.01031599, + "balance_loss_clip": 1.23187602, + "balance_loss_mlp": 1.01344371, + "epoch": 0.8132872388396212, + "flos": 27793316782080.0, + "grad_norm": 1.8155000453708177, + "language_loss": 0.75843775, + "learning_rate": 3.5458556186714473e-07, + "loss": 0.7826553, + "num_input_tokens_seen": 291938915, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.1817627, + "step": 13527, + "time_per_iteration": 2.9314746856689453 + }, + { + "auxiliary_loss_clip": 0.01400581, + "auxiliary_loss_mlp": 0.01030698, + "balance_loss_clip": 1.24044442, + "balance_loss_mlp": 1.01200581, + "epoch": 0.8133473620922892, + "flos": 27831349676160.0, + "grad_norm": 1.7040118385777634, + "language_loss": 0.7118206, + "learning_rate": 3.5436419760230706e-07, + "loss": 0.7361334, + "num_input_tokens_seen": 291958145, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18701172, + "step": 13528, + "time_per_iteration": 2.8801779747009277 + }, + { + "auxiliary_loss_clip": 0.01395936, + "auxiliary_loss_mlp": 0.01029895, + "balance_loss_clip": 1.23482645, + "balance_loss_mlp": 1.01141787, + "epoch": 0.8134074853449572, + "flos": 18998849116800.0, + "grad_norm": 1.9995951856702965, + "language_loss": 0.69646776, + "learning_rate": 3.5414289574014357e-07, + "loss": 0.72072613, + "num_input_tokens_seen": 291976860, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18481445, + "step": 13529, + "time_per_iteration": 2.851712703704834 + }, + { + "auxiliary_loss_clip": 0.01383788, + "auxiliary_loss_mlp": 0.01033319, + "balance_loss_clip": 1.22788942, + "balance_loss_mlp": 1.01473379, + "epoch": 0.8134676085976251, + "flos": 24253388651520.0, + "grad_norm": 1.368128870405857, + "language_loss": 0.78070927, + "learning_rate": 3.5392165628904635e-07, + "loss": 0.80488032, + "num_input_tokens_seen": 291998085, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.18579102, + "step": 13530, + "time_per_iteration": 2.9558002948760986 + }, + { + "auxiliary_loss_clip": 0.01393532, + "auxiliary_loss_mlp": 0.01030427, + "balance_loss_clip": 1.23566103, + "balance_loss_mlp": 1.01156819, + "epoch": 0.8135277318502931, + "flos": 19071702524160.0, + "grad_norm": 1.9190069865026615, + "language_loss": 0.8244375, + "learning_rate": 3.537004792574052e-07, + "loss": 0.8486771, + "num_input_tokens_seen": 292016585, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18847656, + "step": 13531, + "time_per_iteration": 2.864884853363037 + }, + { + "auxiliary_loss_clip": 0.01407921, + "auxiliary_loss_mlp": 0.0103352, + "balance_loss_clip": 1.24574888, + "balance_loss_mlp": 1.01368356, + "epoch": 0.813587855102961, + "flos": 17277315258240.0, + "grad_norm": 2.0285763293091947, + "language_loss": 0.72579634, + "learning_rate": 3.534793646536065e-07, + "loss": 0.75021076, + "num_input_tokens_seen": 292033255, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19836426, + "step": 13532, + "time_per_iteration": 2.801363945007324 + }, + { + "auxiliary_loss_clip": 0.01400062, + "auxiliary_loss_mlp": 0.01032554, + "balance_loss_clip": 1.24135172, + "balance_loss_mlp": 1.01355171, + "epoch": 0.8136479783556291, + "flos": 20167354056960.0, + "grad_norm": 2.216635256682109, + "language_loss": 0.76574361, + "learning_rate": 3.5325831248603533e-07, + "loss": 0.79006982, + "num_input_tokens_seen": 292051800, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18981934, + "step": 13533, + "time_per_iteration": 2.8682994842529297 + }, + { + "auxiliary_loss_clip": 0.01407498, + "auxiliary_loss_mlp": 0.01033217, + "balance_loss_clip": 1.24401295, + "balance_loss_mlp": 1.01390529, + "epoch": 0.813708101608297, + "flos": 22061949851520.0, + "grad_norm": 6.7888533142617264, + "language_loss": 0.77142185, + "learning_rate": 3.5303732276307495e-07, + "loss": 0.795829, + "num_input_tokens_seen": 292072215, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.19311523, + "step": 13534, + "time_per_iteration": 2.848888635635376 + }, + { + "auxiliary_loss_clip": 0.01407938, + "auxiliary_loss_mlp": 0.01028785, + "balance_loss_clip": 1.24814129, + "balance_loss_mlp": 1.0093658, + "epoch": 0.813768224860965, + "flos": 16180261136640.0, + "grad_norm": 2.044377705431547, + "language_loss": 0.94264638, + "learning_rate": 3.5281639549310336e-07, + "loss": 0.9670136, + "num_input_tokens_seen": 292088830, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.1940918, + "step": 13535, + "time_per_iteration": 2.7819979190826416 + }, + { + "auxiliary_loss_clip": 0.01397901, + "auxiliary_loss_mlp": 0.01029653, + "balance_loss_clip": 1.24181354, + "balance_loss_mlp": 1.01139009, + "epoch": 0.8138283481136329, + "flos": 24362872364160.0, + "grad_norm": 1.6683097287701654, + "language_loss": 0.70738614, + "learning_rate": 3.52595530684499e-07, + "loss": 0.73166162, + "num_input_tokens_seen": 292109225, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.18261719, + "step": 13536, + "time_per_iteration": 2.8698537349700928 + }, + { + "auxiliary_loss_clip": 0.01397007, + "auxiliary_loss_mlp": 0.01033763, + "balance_loss_clip": 1.23711205, + "balance_loss_mlp": 1.01397443, + "epoch": 0.8138884713663009, + "flos": 25526309886720.0, + "grad_norm": 2.4962809913961164, + "language_loss": 0.75701231, + "learning_rate": 3.5237472834563775e-07, + "loss": 0.78131998, + "num_input_tokens_seen": 292129660, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19787598, + "step": 13537, + "time_per_iteration": 2.850372791290283 + }, + { + "auxiliary_loss_clip": 0.01390951, + "auxiliary_loss_mlp": 0.01033501, + "balance_loss_clip": 1.23403311, + "balance_loss_mlp": 1.01418924, + "epoch": 0.8139485946189688, + "flos": 22464249782400.0, + "grad_norm": 1.513434597064544, + "language_loss": 0.76451468, + "learning_rate": 3.5215398848489163e-07, + "loss": 0.78875923, + "num_input_tokens_seen": 292149090, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.1932373, + "step": 13538, + "time_per_iteration": 2.8471248149871826 + }, + { + "auxiliary_loss_clip": 0.0140765, + "auxiliary_loss_mlp": 0.01033266, + "balance_loss_clip": 1.24579942, + "balance_loss_mlp": 1.01555192, + "epoch": 0.8140087178716369, + "flos": 21259974188160.0, + "grad_norm": 1.6569194589258451, + "language_loss": 0.78301644, + "learning_rate": 3.5193331111063176e-07, + "loss": 0.80742562, + "num_input_tokens_seen": 292169260, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.17700195, + "step": 13539, + "time_per_iteration": 2.828122138977051 + }, + { + "auxiliary_loss_clip": 0.01389705, + "auxiliary_loss_mlp": 0.01032487, + "balance_loss_clip": 1.23372817, + "balance_loss_mlp": 1.01440334, + "epoch": 0.8140688411243048, + "flos": 39428841110400.0, + "grad_norm": 2.858238899256375, + "language_loss": 0.66952294, + "learning_rate": 3.5171269623122533e-07, + "loss": 0.6937449, + "num_input_tokens_seen": 292188145, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.1809082, + "step": 13540, + "time_per_iteration": 2.967628002166748 + }, + { + "auxiliary_loss_clip": 0.01404755, + "auxiliary_loss_mlp": 0.01032372, + "balance_loss_clip": 1.2446692, + "balance_loss_mlp": 1.01346588, + "epoch": 0.8141289643769728, + "flos": 25428137374080.0, + "grad_norm": 1.7951704032067992, + "language_loss": 0.68229127, + "learning_rate": 3.5149214385503913e-07, + "loss": 0.70666254, + "num_input_tokens_seen": 292212135, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18896484, + "step": 13541, + "time_per_iteration": 2.942077875137329 + }, + { + "auxiliary_loss_clip": 0.01402197, + "auxiliary_loss_mlp": 0.01034798, + "balance_loss_clip": 1.24270988, + "balance_loss_mlp": 1.01548648, + "epoch": 0.8141890876296408, + "flos": 12575714152320.0, + "grad_norm": 1.8837731430623945, + "language_loss": 0.6962626, + "learning_rate": 3.512716539904355e-07, + "loss": 0.72063255, + "num_input_tokens_seen": 292230645, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19299316, + "step": 13542, + "time_per_iteration": 2.823521137237549 + }, + { + "auxiliary_loss_clip": 0.01423765, + "auxiliary_loss_mlp": 0.01028281, + "balance_loss_clip": 1.25870323, + "balance_loss_mlp": 1.00879073, + "epoch": 0.8142492108823087, + "flos": 14973135120000.0, + "grad_norm": 5.119279918072919, + "language_loss": 0.81214213, + "learning_rate": 3.5105122664577613e-07, + "loss": 0.83666253, + "num_input_tokens_seen": 292243540, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19494629, + "step": 13543, + "time_per_iteration": 2.791187047958374 + }, + { + "auxiliary_loss_clip": 0.01419746, + "auxiliary_loss_mlp": 0.01032699, + "balance_loss_clip": 1.25422668, + "balance_loss_mlp": 1.01318467, + "epoch": 0.8143093341349767, + "flos": 12429600134400.0, + "grad_norm": 3.3323498821398703, + "language_loss": 0.79722989, + "learning_rate": 3.5083086182942003e-07, + "loss": 0.82175434, + "num_input_tokens_seen": 292261715, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.1953125, + "step": 13544, + "time_per_iteration": 2.8285703659057617 + }, + { + "auxiliary_loss_clip": 0.0143176, + "auxiliary_loss_mlp": 0.01030138, + "balance_loss_clip": 1.26263475, + "balance_loss_mlp": 1.01208961, + "epoch": 0.8143694573876447, + "flos": 11917861735680.0, + "grad_norm": 3.047154412613177, + "language_loss": 0.74279177, + "learning_rate": 3.5061055954972264e-07, + "loss": 0.76741076, + "num_input_tokens_seen": 292275080, + "router_z_loss_clip": 1.69042969, + "router_z_loss_mlp": 0.18054199, + "step": 13545, + "time_per_iteration": 2.823859453201294 + }, + { + "auxiliary_loss_clip": 0.01392817, + "auxiliary_loss_mlp": 0.01030142, + "balance_loss_clip": 1.23609269, + "balance_loss_mlp": 1.01171207, + "epoch": 0.8144295806403127, + "flos": 21222212762880.0, + "grad_norm": 1.5678898567386188, + "language_loss": 0.77275741, + "learning_rate": 3.5039031981503776e-07, + "loss": 0.796987, + "num_input_tokens_seen": 292294635, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.18432617, + "step": 13546, + "time_per_iteration": 2.8518002033233643 + }, + { + "auxiliary_loss_clip": 0.01413351, + "auxiliary_loss_mlp": 0.01028889, + "balance_loss_clip": 1.25243139, + "balance_loss_mlp": 1.01019728, + "epoch": 0.8144897038929806, + "flos": 19874854552320.0, + "grad_norm": 2.7186540605652163, + "language_loss": 0.71753359, + "learning_rate": 3.501701426337178e-07, + "loss": 0.741956, + "num_input_tokens_seen": 292312695, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18701172, + "step": 13547, + "time_per_iteration": 4.267401695251465 + }, + { + "auxiliary_loss_clip": 0.01421429, + "auxiliary_loss_mlp": 0.01035122, + "balance_loss_clip": 1.25696838, + "balance_loss_mlp": 1.01546419, + "epoch": 0.8145498271456486, + "flos": 24582473216640.0, + "grad_norm": 2.211363391236721, + "language_loss": 0.71152198, + "learning_rate": 3.49950028014111e-07, + "loss": 0.7360875, + "num_input_tokens_seen": 292332005, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.1965332, + "step": 13548, + "time_per_iteration": 2.9667985439300537 + }, + { + "auxiliary_loss_clip": 0.01412259, + "auxiliary_loss_mlp": 0.01029501, + "balance_loss_clip": 1.25073051, + "balance_loss_mlp": 1.01035643, + "epoch": 0.8146099503983165, + "flos": 20202536528640.0, + "grad_norm": 2.497025247035961, + "language_loss": 0.77958041, + "learning_rate": 3.4972997596456444e-07, + "loss": 0.80399799, + "num_input_tokens_seen": 292348365, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19116211, + "step": 13549, + "time_per_iteration": 2.8495147228240967 + }, + { + "auxiliary_loss_clip": 0.01408734, + "auxiliary_loss_mlp": 0.01035835, + "balance_loss_clip": 1.24729347, + "balance_loss_mlp": 1.01713097, + "epoch": 0.8146700736509845, + "flos": 19546448659200.0, + "grad_norm": 1.9174457948763852, + "language_loss": 0.71618432, + "learning_rate": 3.4950998649342233e-07, + "loss": 0.74063003, + "num_input_tokens_seen": 292368050, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18701172, + "step": 13550, + "time_per_iteration": 2.8415443897247314 + }, + { + "auxiliary_loss_clip": 0.01392213, + "auxiliary_loss_mlp": 0.01033443, + "balance_loss_clip": 1.23547709, + "balance_loss_mlp": 1.01400006, + "epoch": 0.8147301969036524, + "flos": 18050533211520.0, + "grad_norm": 1.8368164783651235, + "language_loss": 0.72357595, + "learning_rate": 3.4929005960902826e-07, + "loss": 0.74783254, + "num_input_tokens_seen": 292385315, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.19458008, + "step": 13551, + "time_per_iteration": 2.8269121646881104 + }, + { + "auxiliary_loss_clip": 0.01422562, + "auxiliary_loss_mlp": 0.0103508, + "balance_loss_clip": 1.25694931, + "balance_loss_mlp": 1.01482654, + "epoch": 0.8147903201563205, + "flos": 18013857661440.0, + "grad_norm": 2.3669171851338904, + "language_loss": 0.69401944, + "learning_rate": 3.4907019531971926e-07, + "loss": 0.71859586, + "num_input_tokens_seen": 292403375, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20263672, + "step": 13552, + "time_per_iteration": 4.2146618366241455 + }, + { + "auxiliary_loss_clip": 0.01402661, + "auxiliary_loss_mlp": 0.01031345, + "balance_loss_clip": 1.24277711, + "balance_loss_mlp": 1.01283216, + "epoch": 0.8148504434089884, + "flos": 20267110137600.0, + "grad_norm": 2.152403379533042, + "language_loss": 0.83003736, + "learning_rate": 3.4885039363383407e-07, + "loss": 0.85437745, + "num_input_tokens_seen": 292419260, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18530273, + "step": 13553, + "time_per_iteration": 2.8352210521698 + }, + { + "auxiliary_loss_clip": 0.01407573, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.24663806, + "balance_loss_mlp": 1.01251936, + "epoch": 0.8149105666616564, + "flos": 12502272562560.0, + "grad_norm": 2.1078577879750053, + "language_loss": 0.68576002, + "learning_rate": 3.4863065455970795e-07, + "loss": 0.7101447, + "num_input_tokens_seen": 292436095, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18395996, + "step": 13554, + "time_per_iteration": 2.8004610538482666 + }, + { + "auxiliary_loss_clip": 0.01399359, + "auxiliary_loss_mlp": 0.01035543, + "balance_loss_clip": 1.23927426, + "balance_loss_mlp": 1.01618314, + "epoch": 0.8149706899143244, + "flos": 32535982183680.0, + "grad_norm": 1.8774545221828276, + "language_loss": 0.66914481, + "learning_rate": 3.484109781056723e-07, + "loss": 0.69349384, + "num_input_tokens_seen": 292457190, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.19348145, + "step": 13555, + "time_per_iteration": 2.92211651802063 + }, + { + "auxiliary_loss_clip": 0.01411673, + "auxiliary_loss_mlp": 0.01034318, + "balance_loss_clip": 1.24681532, + "balance_loss_mlp": 1.01519656, + "epoch": 0.8150308131669923, + "flos": 19394271838080.0, + "grad_norm": 2.1583631178587943, + "language_loss": 0.74342304, + "learning_rate": 3.4819136428005844e-07, + "loss": 0.76788294, + "num_input_tokens_seen": 292474300, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.19128418, + "step": 13556, + "time_per_iteration": 4.223504066467285 + }, + { + "auxiliary_loss_clip": 0.01405625, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.24709916, + "balance_loss_mlp": 1.01270866, + "epoch": 0.8150909364196604, + "flos": 17430487464960.0, + "grad_norm": 1.5634842674435958, + "language_loss": 0.81100249, + "learning_rate": 3.4797181309119307e-07, + "loss": 0.83535999, + "num_input_tokens_seen": 292492420, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.17419434, + "step": 13557, + "time_per_iteration": 2.8169989585876465 + }, + { + "auxiliary_loss_clip": 0.01416098, + "auxiliary_loss_mlp": 0.01033922, + "balance_loss_clip": 1.2524457, + "balance_loss_mlp": 1.0133822, + "epoch": 0.8151510596723283, + "flos": 27174356910720.0, + "grad_norm": 1.546706155200626, + "language_loss": 0.6626333, + "learning_rate": 3.4775232454740255e-07, + "loss": 0.68713355, + "num_input_tokens_seen": 292512895, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.20507812, + "step": 13558, + "time_per_iteration": 4.331890821456909 + }, + { + "auxiliary_loss_clip": 0.01183451, + "auxiliary_loss_mlp": 0.01021018, + "balance_loss_clip": 1.09469497, + "balance_loss_mlp": 1.00032318, + "epoch": 0.8152111829249963, + "flos": 64246253166720.0, + "grad_norm": 1.05260805802815, + "language_loss": 0.56966209, + "learning_rate": 3.4753289865700896e-07, + "loss": 0.59170675, + "num_input_tokens_seen": 292566580, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.20703125, + "step": 13559, + "time_per_iteration": 3.3095040321350098 + }, + { + "auxiliary_loss_clip": 0.01179237, + "auxiliary_loss_mlp": 0.01022681, + "balance_loss_clip": 1.09365773, + "balance_loss_mlp": 1.00303578, + "epoch": 0.8152713061776642, + "flos": 67101788165760.0, + "grad_norm": 0.6822284107158197, + "language_loss": 0.55229557, + "learning_rate": 3.473135354283334e-07, + "loss": 0.57431471, + "num_input_tokens_seen": 292621490, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.19628906, + "step": 13560, + "time_per_iteration": 3.148033380508423 + }, + { + "auxiliary_loss_clip": 0.01396496, + "auxiliary_loss_mlp": 0.01033175, + "balance_loss_clip": 1.23767126, + "balance_loss_mlp": 1.01430416, + "epoch": 0.8153314294303322, + "flos": 14398949617920.0, + "grad_norm": 1.6336265634506493, + "language_loss": 0.6763739, + "learning_rate": 3.470942348696948e-07, + "loss": 0.7006706, + "num_input_tokens_seen": 292638660, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18859863, + "step": 13561, + "time_per_iteration": 2.8311214447021484 + }, + { + "auxiliary_loss_clip": 0.0141314, + "auxiliary_loss_mlp": 0.01031834, + "balance_loss_clip": 1.24955237, + "balance_loss_mlp": 1.01324892, + "epoch": 0.8153915526830001, + "flos": 25632897932160.0, + "grad_norm": 1.6048239218587728, + "language_loss": 0.8229087, + "learning_rate": 3.468749969894085e-07, + "loss": 0.84735841, + "num_input_tokens_seen": 292658545, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.18566895, + "step": 13562, + "time_per_iteration": 2.880063533782959 + }, + { + "auxiliary_loss_clip": 0.01405866, + "auxiliary_loss_mlp": 0.01029784, + "balance_loss_clip": 1.24431086, + "balance_loss_mlp": 1.01079369, + "epoch": 0.8154516759356681, + "flos": 23379962169600.0, + "grad_norm": 1.5071510172553437, + "language_loss": 0.72423738, + "learning_rate": 3.4665582179578734e-07, + "loss": 0.74859393, + "num_input_tokens_seen": 292678460, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18994141, + "step": 13563, + "time_per_iteration": 2.8954052925109863 + }, + { + "auxiliary_loss_clip": 0.01395499, + "auxiliary_loss_mlp": 0.01033725, + "balance_loss_clip": 1.23416996, + "balance_loss_mlp": 1.0129106, + "epoch": 0.815511799188336, + "flos": 28161339137280.0, + "grad_norm": 1.6858600436701376, + "language_loss": 0.70792514, + "learning_rate": 3.4643670929714387e-07, + "loss": 0.73221743, + "num_input_tokens_seen": 292699815, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.20788574, + "step": 13564, + "time_per_iteration": 2.894212245941162 + }, + { + "auxiliary_loss_clip": 0.01406503, + "auxiliary_loss_mlp": 0.01032143, + "balance_loss_clip": 1.2446394, + "balance_loss_mlp": 1.01377249, + "epoch": 0.8155719224410041, + "flos": 16992371635200.0, + "grad_norm": 1.837469886573768, + "language_loss": 0.71225548, + "learning_rate": 3.462176595017854e-07, + "loss": 0.736642, + "num_input_tokens_seen": 292717370, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.18383789, + "step": 13565, + "time_per_iteration": 2.864867925643921 + }, + { + "auxiliary_loss_clip": 0.01394427, + "auxiliary_loss_mlp": 0.01031878, + "balance_loss_clip": 1.23589516, + "balance_loss_mlp": 1.01332939, + "epoch": 0.815632045693672, + "flos": 24692635601280.0, + "grad_norm": 1.9626181086217405, + "language_loss": 0.79486388, + "learning_rate": 3.459986724180188e-07, + "loss": 0.8191269, + "num_input_tokens_seen": 292737110, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18554688, + "step": 13566, + "time_per_iteration": 2.9344594478607178 + }, + { + "auxiliary_loss_clip": 0.01401934, + "auxiliary_loss_mlp": 0.0102942, + "balance_loss_clip": 1.24482656, + "balance_loss_mlp": 1.01156259, + "epoch": 0.81569216894634, + "flos": 19947934183680.0, + "grad_norm": 1.7532638678498138, + "language_loss": 0.82946861, + "learning_rate": 3.457797480541491e-07, + "loss": 0.85378218, + "num_input_tokens_seen": 292756510, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.17858887, + "step": 13567, + "time_per_iteration": 2.90010142326355 + }, + { + "auxiliary_loss_clip": 0.0139531, + "auxiliary_loss_mlp": 0.01027155, + "balance_loss_clip": 1.23741698, + "balance_loss_mlp": 1.00942898, + "epoch": 0.8157522921990079, + "flos": 21809564501760.0, + "grad_norm": 1.9504872357600622, + "language_loss": 0.80785686, + "learning_rate": 3.455608864184771e-07, + "loss": 0.8320815, + "num_input_tokens_seen": 292776710, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.17736816, + "step": 13568, + "time_per_iteration": 2.940246343612671 + }, + { + "auxiliary_loss_clip": 0.01389113, + "auxiliary_loss_mlp": 0.01034313, + "balance_loss_clip": 1.23271465, + "balance_loss_mlp": 1.01562154, + "epoch": 0.8158124154516759, + "flos": 18515732693760.0, + "grad_norm": 1.737845573299104, + "language_loss": 0.78044611, + "learning_rate": 3.453420875193016e-07, + "loss": 0.80468035, + "num_input_tokens_seen": 292794350, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.18688965, + "step": 13569, + "time_per_iteration": 2.876735210418701 + }, + { + "auxiliary_loss_clip": 0.01402434, + "auxiliary_loss_mlp": 0.01032321, + "balance_loss_clip": 1.24333107, + "balance_loss_mlp": 1.01407003, + "epoch": 0.815872538704344, + "flos": 26841336048000.0, + "grad_norm": 2.5153069587250654, + "language_loss": 0.60361445, + "learning_rate": 3.451233513649199e-07, + "loss": 0.62796199, + "num_input_tokens_seen": 292814005, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.18249512, + "step": 13570, + "time_per_iteration": 2.878286123275757 + }, + { + "auxiliary_loss_clip": 0.01420727, + "auxiliary_loss_mlp": 0.01041426, + "balance_loss_clip": 1.2552911, + "balance_loss_mlp": 1.02163768, + "epoch": 0.8159326619570119, + "flos": 21735670464000.0, + "grad_norm": 2.1566803612483456, + "language_loss": 0.82799274, + "learning_rate": 3.4490467796362687e-07, + "loss": 0.85261428, + "num_input_tokens_seen": 292833485, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19775391, + "step": 13571, + "time_per_iteration": 2.854039192199707 + }, + { + "auxiliary_loss_clip": 0.0140083, + "auxiliary_loss_mlp": 0.01036102, + "balance_loss_clip": 1.2400918, + "balance_loss_mlp": 1.01650381, + "epoch": 0.8159927852096799, + "flos": 13847594757120.0, + "grad_norm": 7.84937214120772, + "language_loss": 0.79818088, + "learning_rate": 3.446860673237142e-07, + "loss": 0.82255018, + "num_input_tokens_seen": 292848045, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19592285, + "step": 13572, + "time_per_iteration": 2.8260018825531006 + }, + { + "auxiliary_loss_clip": 0.01390885, + "auxiliary_loss_mlp": 0.01032422, + "balance_loss_clip": 1.23120713, + "balance_loss_mlp": 1.0136826, + "epoch": 0.8160529084623478, + "flos": 24510434215680.0, + "grad_norm": 1.6316371337733233, + "language_loss": 0.65803164, + "learning_rate": 3.4446751945347186e-07, + "loss": 0.68226469, + "num_input_tokens_seen": 292869965, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.1875, + "step": 13573, + "time_per_iteration": 2.8815722465515137 + }, + { + "auxiliary_loss_clip": 0.01386164, + "auxiliary_loss_mlp": 0.01035025, + "balance_loss_clip": 1.22865462, + "balance_loss_mlp": 1.0165832, + "epoch": 0.8161130317150158, + "flos": 24836939827200.0, + "grad_norm": 1.60587970426764, + "language_loss": 0.75788772, + "learning_rate": 3.442490343611868e-07, + "loss": 0.7820996, + "num_input_tokens_seen": 292889680, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18444824, + "step": 13574, + "time_per_iteration": 2.892906427383423 + }, + { + "auxiliary_loss_clip": 0.01414185, + "auxiliary_loss_mlp": 0.01034106, + "balance_loss_clip": 1.25107038, + "balance_loss_mlp": 1.01535404, + "epoch": 0.8161731549676837, + "flos": 30968887386240.0, + "grad_norm": 1.8672883125320314, + "language_loss": 0.60763943, + "learning_rate": 3.4403061205514485e-07, + "loss": 0.6321224, + "num_input_tokens_seen": 292912360, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.1875, + "step": 13575, + "time_per_iteration": 2.909097194671631 + }, + { + "auxiliary_loss_clip": 0.01401962, + "auxiliary_loss_mlp": 0.01032989, + "balance_loss_clip": 1.2421844, + "balance_loss_mlp": 1.0131762, + "epoch": 0.8162332782203517, + "flos": 18561276224640.0, + "grad_norm": 1.9168040804998403, + "language_loss": 0.74944532, + "learning_rate": 3.4381225254362736e-07, + "loss": 0.77379477, + "num_input_tokens_seen": 292928325, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.19787598, + "step": 13576, + "time_per_iteration": 2.8313894271850586 + }, + { + "auxiliary_loss_clip": 0.01183961, + "auxiliary_loss_mlp": 0.01020512, + "balance_loss_clip": 1.09604371, + "balance_loss_mlp": 0.99819583, + "epoch": 0.8162934014730197, + "flos": 70416097885440.0, + "grad_norm": 0.8489935748313457, + "language_loss": 0.58721066, + "learning_rate": 3.435939558349155e-07, + "loss": 0.60925531, + "num_input_tokens_seen": 292992795, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.22363281, + "step": 13577, + "time_per_iteration": 3.358959674835205 + }, + { + "auxiliary_loss_clip": 0.01387996, + "auxiliary_loss_mlp": 0.01031052, + "balance_loss_clip": 1.23156059, + "balance_loss_mlp": 1.01256263, + "epoch": 0.8163535247256877, + "flos": 21224655982080.0, + "grad_norm": 1.892239910634745, + "language_loss": 0.71718776, + "learning_rate": 3.4337572193728747e-07, + "loss": 0.74137819, + "num_input_tokens_seen": 293011950, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.18493652, + "step": 13578, + "time_per_iteration": 2.852933645248413 + }, + { + "auxiliary_loss_clip": 0.01405302, + "auxiliary_loss_mlp": 0.01033762, + "balance_loss_clip": 1.24514103, + "balance_loss_mlp": 1.0153327, + "epoch": 0.8164136479783556, + "flos": 21106937715840.0, + "grad_norm": 1.7037479012927401, + "language_loss": 0.74561465, + "learning_rate": 3.431575508590172e-07, + "loss": 0.77000529, + "num_input_tokens_seen": 293030175, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18432617, + "step": 13579, + "time_per_iteration": 2.894939661026001 + }, + { + "auxiliary_loss_clip": 0.01403014, + "auxiliary_loss_mlp": 0.01032014, + "balance_loss_clip": 1.24042416, + "balance_loss_mlp": 1.01326227, + "epoch": 0.8164737712310236, + "flos": 21729290947200.0, + "grad_norm": 1.9876260065042013, + "language_loss": 0.79443181, + "learning_rate": 3.4293944260837873e-07, + "loss": 0.81878209, + "num_input_tokens_seen": 293047980, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.18762207, + "step": 13580, + "time_per_iteration": 2.8581268787384033 + }, + { + "auxiliary_loss_clip": 0.01384843, + "auxiliary_loss_mlp": 0.01031269, + "balance_loss_clip": 1.22907853, + "balance_loss_mlp": 1.01246989, + "epoch": 0.8165338944836915, + "flos": 19546267680000.0, + "grad_norm": 1.813379842606365, + "language_loss": 0.70175481, + "learning_rate": 3.4272139719364314e-07, + "loss": 0.72591591, + "num_input_tokens_seen": 293067030, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.18786621, + "step": 13581, + "time_per_iteration": 4.268720626831055 + }, + { + "auxiliary_loss_clip": 0.0141223, + "auxiliary_loss_mlp": 0.0103002, + "balance_loss_clip": 1.25189352, + "balance_loss_mlp": 1.0112443, + "epoch": 0.8165940177363595, + "flos": 22938543469440.0, + "grad_norm": 2.535597630302302, + "language_loss": 0.60517716, + "learning_rate": 3.4250341462307786e-07, + "loss": 0.62959969, + "num_input_tokens_seen": 293085575, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18774414, + "step": 13582, + "time_per_iteration": 2.853557586669922 + }, + { + "auxiliary_loss_clip": 0.01383944, + "auxiliary_loss_mlp": 0.01030418, + "balance_loss_clip": 1.22956252, + "balance_loss_mlp": 1.01135635, + "epoch": 0.8166541409890276, + "flos": 23381500492800.0, + "grad_norm": 1.4219648572864834, + "language_loss": 0.8270306, + "learning_rate": 3.4228549490494897e-07, + "loss": 0.85117424, + "num_input_tokens_seen": 293108200, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.19067383, + "step": 13583, + "time_per_iteration": 2.8908979892730713 + }, + { + "auxiliary_loss_clip": 0.01406107, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.24483752, + "balance_loss_mlp": 1.011729, + "epoch": 0.8167142642416955, + "flos": 18451113840000.0, + "grad_norm": 1.7027146221963114, + "language_loss": 0.74572939, + "learning_rate": 3.4206763804752093e-07, + "loss": 0.77009469, + "num_input_tokens_seen": 293126020, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18688965, + "step": 13584, + "time_per_iteration": 2.842946767807007 + }, + { + "auxiliary_loss_clip": 0.01413256, + "auxiliary_loss_mlp": 0.0103151, + "balance_loss_clip": 1.25113368, + "balance_loss_mlp": 1.01237655, + "epoch": 0.8167743874943635, + "flos": 21224836961280.0, + "grad_norm": 1.5913049334934954, + "language_loss": 0.74825937, + "learning_rate": 3.4184984405905405e-07, + "loss": 0.77270699, + "num_input_tokens_seen": 293144620, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19128418, + "step": 13585, + "time_per_iteration": 2.8418047428131104 + }, + { + "auxiliary_loss_clip": 0.01399488, + "auxiliary_loss_mlp": 0.01031857, + "balance_loss_clip": 1.24019933, + "balance_loss_mlp": 1.01253331, + "epoch": 0.8168345107470314, + "flos": 18706937794560.0, + "grad_norm": 1.5264375937352501, + "language_loss": 0.69754159, + "learning_rate": 3.416321129478068e-07, + "loss": 0.72185504, + "num_input_tokens_seen": 293162850, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.19311523, + "step": 13586, + "time_per_iteration": 4.432991027832031 + }, + { + "auxiliary_loss_clip": 0.01398819, + "auxiliary_loss_mlp": 0.01034955, + "balance_loss_clip": 1.23985755, + "balance_loss_mlp": 1.01679909, + "epoch": 0.8168946339996994, + "flos": 16261711056000.0, + "grad_norm": 1.6887032516169675, + "language_loss": 0.6168133, + "learning_rate": 3.4141444472203594e-07, + "loss": 0.64115107, + "num_input_tokens_seen": 293181620, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18164062, + "step": 13587, + "time_per_iteration": 2.836973190307617 + }, + { + "auxiliary_loss_clip": 0.01424557, + "auxiliary_loss_mlp": 0.01034578, + "balance_loss_clip": 1.25949359, + "balance_loss_mlp": 1.01589823, + "epoch": 0.8169547572523673, + "flos": 26952493818240.0, + "grad_norm": 2.178159926810593, + "language_loss": 0.70101333, + "learning_rate": 3.4119683938999624e-07, + "loss": 0.72560465, + "num_input_tokens_seen": 293200270, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.18688965, + "step": 13588, + "time_per_iteration": 2.8740203380584717 + }, + { + "auxiliary_loss_clip": 0.01409328, + "auxiliary_loss_mlp": 0.01034508, + "balance_loss_clip": 1.24706411, + "balance_loss_mlp": 1.01474333, + "epoch": 0.8170148805050353, + "flos": 18961449649920.0, + "grad_norm": 1.4605860792696277, + "language_loss": 0.73127711, + "learning_rate": 3.4097929695993854e-07, + "loss": 0.75571549, + "num_input_tokens_seen": 293218960, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19750977, + "step": 13589, + "time_per_iteration": 2.834681749343872 + }, + { + "auxiliary_loss_clip": 0.01390468, + "auxiliary_loss_mlp": 0.0103619, + "balance_loss_clip": 1.23284101, + "balance_loss_mlp": 1.01668739, + "epoch": 0.8170750037577033, + "flos": 21844837463040.0, + "grad_norm": 1.751038553397567, + "language_loss": 0.74025905, + "learning_rate": 3.4076181744011166e-07, + "loss": 0.76452565, + "num_input_tokens_seen": 293236450, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.19506836, + "step": 13590, + "time_per_iteration": 2.861180067062378 + }, + { + "auxiliary_loss_clip": 0.0142486, + "auxiliary_loss_mlp": 0.01037573, + "balance_loss_clip": 1.25890338, + "balance_loss_mlp": 1.01776052, + "epoch": 0.8171351270103713, + "flos": 33519299581440.0, + "grad_norm": 1.8209665943625968, + "language_loss": 0.65270162, + "learning_rate": 3.4054440083876345e-07, + "loss": 0.67732596, + "num_input_tokens_seen": 293256480, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.19812012, + "step": 13591, + "time_per_iteration": 4.374577522277832 + }, + { + "auxiliary_loss_clip": 0.01416304, + "auxiliary_loss_mlp": 0.0103017, + "balance_loss_clip": 1.25123262, + "balance_loss_mlp": 1.01158547, + "epoch": 0.8171952502630392, + "flos": 22717721007360.0, + "grad_norm": 2.023447457249368, + "language_loss": 0.69270837, + "learning_rate": 3.403270471641373e-07, + "loss": 0.7171731, + "num_input_tokens_seen": 293274960, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.18579102, + "step": 13592, + "time_per_iteration": 4.260854005813599 + }, + { + "auxiliary_loss_clip": 0.01408693, + "auxiliary_loss_mlp": 0.01030594, + "balance_loss_clip": 1.24690926, + "balance_loss_mlp": 1.01252246, + "epoch": 0.8172553735157072, + "flos": 26734838492160.0, + "grad_norm": 2.230681192681461, + "language_loss": 0.67442727, + "learning_rate": 3.401097564244759e-07, + "loss": 0.69882017, + "num_input_tokens_seen": 293295945, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18078613, + "step": 13593, + "time_per_iteration": 2.8703136444091797 + }, + { + "auxiliary_loss_clip": 0.01403819, + "auxiliary_loss_mlp": 0.01033759, + "balance_loss_clip": 1.24388587, + "balance_loss_mlp": 1.01522195, + "epoch": 0.8173154967683751, + "flos": 15969573509760.0, + "grad_norm": 1.7685695493507874, + "language_loss": 0.69900858, + "learning_rate": 3.398925286280188e-07, + "loss": 0.72338438, + "num_input_tokens_seen": 293313300, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18530273, + "step": 13594, + "time_per_iteration": 2.8059043884277344 + }, + { + "auxiliary_loss_clip": 0.01417144, + "auxiliary_loss_mlp": 0.01037006, + "balance_loss_clip": 1.25364912, + "balance_loss_mlp": 1.01815915, + "epoch": 0.8173756200210431, + "flos": 25995762380160.0, + "grad_norm": 2.043034371449735, + "language_loss": 0.66788328, + "learning_rate": 3.3967536378300456e-07, + "loss": 0.69242477, + "num_input_tokens_seen": 293333085, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.18835449, + "step": 13595, + "time_per_iteration": 2.8718745708465576 + }, + { + "auxiliary_loss_clip": 0.01414343, + "auxiliary_loss_mlp": 0.01030141, + "balance_loss_clip": 1.24895597, + "balance_loss_mlp": 1.01178277, + "epoch": 0.8174357432737112, + "flos": 25674957613440.0, + "grad_norm": 1.691058253791697, + "language_loss": 0.79323345, + "learning_rate": 3.394582618976658e-07, + "loss": 0.81767833, + "num_input_tokens_seen": 293351895, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.18359375, + "step": 13596, + "time_per_iteration": 2.895853281021118 + }, + { + "auxiliary_loss_clip": 0.01389208, + "auxiliary_loss_mlp": 0.01029964, + "balance_loss_clip": 1.23142052, + "balance_loss_mlp": 1.011343, + "epoch": 0.8174958665263791, + "flos": 21845063687040.0, + "grad_norm": 10.435880081979729, + "language_loss": 0.58622366, + "learning_rate": 3.392412229802362e-07, + "loss": 0.6104154, + "num_input_tokens_seen": 293371165, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18615723, + "step": 13597, + "time_per_iteration": 2.862119436264038 + }, + { + "auxiliary_loss_clip": 0.0139786, + "auxiliary_loss_mlp": 0.01037136, + "balance_loss_clip": 1.24030149, + "balance_loss_mlp": 1.01853895, + "epoch": 0.8175559897790471, + "flos": 22465697616000.0, + "grad_norm": 1.5786245472374147, + "language_loss": 0.82988024, + "learning_rate": 3.390242470389462e-07, + "loss": 0.85423023, + "num_input_tokens_seen": 293391150, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.18603516, + "step": 13598, + "time_per_iteration": 2.9172825813293457 + }, + { + "auxiliary_loss_clip": 0.01417203, + "auxiliary_loss_mlp": 0.01033973, + "balance_loss_clip": 1.25421119, + "balance_loss_mlp": 1.01518559, + "epoch": 0.817616113031715, + "flos": 23624384434560.0, + "grad_norm": 1.8601046879346308, + "language_loss": 0.8353442, + "learning_rate": 3.3880733408202277e-07, + "loss": 0.85985589, + "num_input_tokens_seen": 293409440, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.18774414, + "step": 13599, + "time_per_iteration": 2.854243278503418 + }, + { + "auxiliary_loss_clip": 0.01399065, + "auxiliary_loss_mlp": 0.01036945, + "balance_loss_clip": 1.24137664, + "balance_loss_mlp": 1.01691794, + "epoch": 0.817676236284383, + "flos": 27683606845440.0, + "grad_norm": 1.9713277418723483, + "language_loss": 0.84412074, + "learning_rate": 3.3859048411769186e-07, + "loss": 0.8684808, + "num_input_tokens_seen": 293428995, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.20043945, + "step": 13600, + "time_per_iteration": 2.8763363361358643 + }, + { + "auxiliary_loss_clip": 0.01410486, + "auxiliary_loss_mlp": 0.01033917, + "balance_loss_clip": 1.24817443, + "balance_loss_mlp": 1.01456881, + "epoch": 0.8177363595370509, + "flos": 24691730705280.0, + "grad_norm": 1.8280905636982194, + "language_loss": 0.74341416, + "learning_rate": 3.383736971541766e-07, + "loss": 0.76785815, + "num_input_tokens_seen": 293449155, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19348145, + "step": 13601, + "time_per_iteration": 2.86928129196167 + }, + { + "auxiliary_loss_clip": 0.01425362, + "auxiliary_loss_mlp": 0.01031654, + "balance_loss_clip": 1.25818324, + "balance_loss_mlp": 1.01215112, + "epoch": 0.817796482789719, + "flos": 17354874124800.0, + "grad_norm": 2.1129279625100956, + "language_loss": 0.69408834, + "learning_rate": 3.3815697319969737e-07, + "loss": 0.71865851, + "num_input_tokens_seen": 293466125, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.19519043, + "step": 13602, + "time_per_iteration": 2.836453676223755 + }, + { + "auxiliary_loss_clip": 0.01398296, + "auxiliary_loss_mlp": 0.01032667, + "balance_loss_clip": 1.24033856, + "balance_loss_mlp": 1.01434481, + "epoch": 0.8178566060423869, + "flos": 17785660296960.0, + "grad_norm": 3.448251294289978, + "language_loss": 0.84687346, + "learning_rate": 3.379403122624718e-07, + "loss": 0.8711831, + "num_input_tokens_seen": 293481345, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18334961, + "step": 13603, + "time_per_iteration": 2.858989715576172 + }, + { + "auxiliary_loss_clip": 0.01400519, + "auxiliary_loss_mlp": 0.01031159, + "balance_loss_clip": 1.24019825, + "balance_loss_mlp": 1.01278913, + "epoch": 0.8179167292950549, + "flos": 24984139720320.0, + "grad_norm": 1.6662148799399195, + "language_loss": 0.7037878, + "learning_rate": 3.377237143507159e-07, + "loss": 0.72810459, + "num_input_tokens_seen": 293502330, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18371582, + "step": 13604, + "time_per_iteration": 2.9248199462890625 + }, + { + "auxiliary_loss_clip": 0.01399734, + "auxiliary_loss_mlp": 0.01034284, + "balance_loss_clip": 1.2418623, + "balance_loss_mlp": 1.01459026, + "epoch": 0.8179768525477228, + "flos": 22867047406080.0, + "grad_norm": 1.7264286835492468, + "language_loss": 0.74692047, + "learning_rate": 3.3750717947264406e-07, + "loss": 0.77126062, + "num_input_tokens_seen": 293521415, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.19689941, + "step": 13605, + "time_per_iteration": 2.868864059448242 + }, + { + "auxiliary_loss_clip": 0.01387982, + "auxiliary_loss_mlp": 0.01038318, + "balance_loss_clip": 1.23077941, + "balance_loss_mlp": 1.01979232, + "epoch": 0.8180369758003908, + "flos": 18524329205760.0, + "grad_norm": 3.6635283318678207, + "language_loss": 0.74789143, + "learning_rate": 3.372907076364666e-07, + "loss": 0.77215445, + "num_input_tokens_seen": 293539245, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18530273, + "step": 13606, + "time_per_iteration": 2.8313395977020264 + }, + { + "auxiliary_loss_clip": 0.01392122, + "auxiliary_loss_mlp": 0.01034123, + "balance_loss_clip": 1.23479259, + "balance_loss_mlp": 1.01663482, + "epoch": 0.8180970990530587, + "flos": 33195010965120.0, + "grad_norm": 1.8365911765911214, + "language_loss": 0.66687715, + "learning_rate": 3.370742988503916e-07, + "loss": 0.69113958, + "num_input_tokens_seen": 293560640, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.17492676, + "step": 13607, + "time_per_iteration": 2.9095962047576904 + }, + { + "auxiliary_loss_clip": 0.01402427, + "auxiliary_loss_mlp": 0.01031764, + "balance_loss_clip": 1.24139774, + "balance_loss_mlp": 1.01311922, + "epoch": 0.8181572223057267, + "flos": 25020996249600.0, + "grad_norm": 1.9821678733533423, + "language_loss": 0.71513951, + "learning_rate": 3.3685795312262634e-07, + "loss": 0.73948133, + "num_input_tokens_seen": 293579465, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.18640137, + "step": 13608, + "time_per_iteration": 2.873129367828369 + }, + { + "auxiliary_loss_clip": 0.01395743, + "auxiliary_loss_mlp": 0.01032909, + "balance_loss_clip": 1.23660767, + "balance_loss_mlp": 1.01468205, + "epoch": 0.8182173455583948, + "flos": 28560517176960.0, + "grad_norm": 1.7249551515619423, + "language_loss": 0.80563682, + "learning_rate": 3.366416704613735e-07, + "loss": 0.82992333, + "num_input_tokens_seen": 293600540, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.18237305, + "step": 13609, + "time_per_iteration": 2.97982120513916 + }, + { + "auxiliary_loss_clip": 0.01183958, + "auxiliary_loss_mlp": 0.01024972, + "balance_loss_clip": 1.09492874, + "balance_loss_mlp": 1.00341892, + "epoch": 0.8182774688110627, + "flos": 72057539168640.0, + "grad_norm": 0.7703080376404929, + "language_loss": 0.5591746, + "learning_rate": 3.3642545087483544e-07, + "loss": 0.58126396, + "num_input_tokens_seen": 293665160, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.21582031, + "step": 13610, + "time_per_iteration": 3.4243316650390625 + }, + { + "auxiliary_loss_clip": 0.01382246, + "auxiliary_loss_mlp": 0.01031857, + "balance_loss_clip": 1.22831011, + "balance_loss_mlp": 1.01374888, + "epoch": 0.8183375920637307, + "flos": 19764782657280.0, + "grad_norm": 2.4211628357598856, + "language_loss": 0.78486025, + "learning_rate": 3.362092943712107e-07, + "loss": 0.80900127, + "num_input_tokens_seen": 293683995, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.1809082, + "step": 13611, + "time_per_iteration": 2.8364012241363525 + }, + { + "auxiliary_loss_clip": 0.01426859, + "auxiliary_loss_mlp": 0.01033236, + "balance_loss_clip": 1.2589817, + "balance_loss_mlp": 1.0133158, + "epoch": 0.8183977153163986, + "flos": 22350965506560.0, + "grad_norm": 2.193083279597007, + "language_loss": 0.77722037, + "learning_rate": 3.3599320095869745e-07, + "loss": 0.80182135, + "num_input_tokens_seen": 293704115, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.19921875, + "step": 13612, + "time_per_iteration": 2.889310836791992 + }, + { + "auxiliary_loss_clip": 0.01397209, + "auxiliary_loss_mlp": 0.01030662, + "balance_loss_clip": 1.23885846, + "balance_loss_mlp": 1.01186252, + "epoch": 0.8184578385690666, + "flos": 17721448646400.0, + "grad_norm": 2.095936772390873, + "language_loss": 0.87192839, + "learning_rate": 3.3577717064548793e-07, + "loss": 0.89620709, + "num_input_tokens_seen": 293722225, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18798828, + "step": 13613, + "time_per_iteration": 2.864138603210449 + }, + { + "auxiliary_loss_clip": 0.01402594, + "auxiliary_loss_mlp": 0.01036441, + "balance_loss_clip": 1.24379086, + "balance_loss_mlp": 1.01772523, + "epoch": 0.8185179618217345, + "flos": 25711768897920.0, + "grad_norm": 1.4036608954610408, + "language_loss": 0.73503768, + "learning_rate": 3.355612034397746e-07, + "loss": 0.75942802, + "num_input_tokens_seen": 293743995, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18713379, + "step": 13614, + "time_per_iteration": 2.873224973678589 + }, + { + "auxiliary_loss_clip": 0.01415296, + "auxiliary_loss_mlp": 0.01035283, + "balance_loss_clip": 1.25299859, + "balance_loss_mlp": 1.01620936, + "epoch": 0.8185780850744026, + "flos": 25971981598080.0, + "grad_norm": 5.186502430899813, + "language_loss": 0.81854641, + "learning_rate": 3.353452993497479e-07, + "loss": 0.84305227, + "num_input_tokens_seen": 293764935, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19067383, + "step": 13615, + "time_per_iteration": 2.9031550884246826 + }, + { + "auxiliary_loss_clip": 0.01402397, + "auxiliary_loss_mlp": 0.01034772, + "balance_loss_clip": 1.24237633, + "balance_loss_mlp": 1.01529276, + "epoch": 0.8186382083270705, + "flos": 25239330247680.0, + "grad_norm": 2.1206491673360897, + "language_loss": 0.76197088, + "learning_rate": 3.3512945838359375e-07, + "loss": 0.78634256, + "num_input_tokens_seen": 293784035, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19482422, + "step": 13616, + "time_per_iteration": 4.312572240829468 + }, + { + "auxiliary_loss_clip": 0.01393095, + "auxiliary_loss_mlp": 0.01034126, + "balance_loss_clip": 1.23569465, + "balance_loss_mlp": 1.01511216, + "epoch": 0.8186983315797385, + "flos": 22424135627520.0, + "grad_norm": 1.8628166690582386, + "language_loss": 0.7567479, + "learning_rate": 3.349136805494979e-07, + "loss": 0.7810201, + "num_input_tokens_seen": 293803360, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.19006348, + "step": 13617, + "time_per_iteration": 2.901494026184082 + }, + { + "auxiliary_loss_clip": 0.01389898, + "auxiliary_loss_mlp": 0.01033116, + "balance_loss_clip": 1.23222899, + "balance_loss_mlp": 1.01463902, + "epoch": 0.8187584548324064, + "flos": 22028169968640.0, + "grad_norm": 1.9961347428385463, + "language_loss": 0.68744707, + "learning_rate": 3.346979658556415e-07, + "loss": 0.71167719, + "num_input_tokens_seen": 293821325, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18469238, + "step": 13618, + "time_per_iteration": 2.9176394939422607 + }, + { + "auxiliary_loss_clip": 0.01423412, + "auxiliary_loss_mlp": 0.01035049, + "balance_loss_clip": 1.25669456, + "balance_loss_mlp": 1.01536798, + "epoch": 0.8188185780850744, + "flos": 29253552065280.0, + "grad_norm": 2.062410412721167, + "language_loss": 0.70580137, + "learning_rate": 3.344823143102058e-07, + "loss": 0.73038602, + "num_input_tokens_seen": 293840315, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.19665527, + "step": 13619, + "time_per_iteration": 2.938044548034668 + }, + { + "auxiliary_loss_clip": 0.01404368, + "auxiliary_loss_mlp": 0.01034884, + "balance_loss_clip": 1.24291658, + "balance_loss_mlp": 1.01553607, + "epoch": 0.8188787013377423, + "flos": 20704230581760.0, + "grad_norm": 1.9534950082902396, + "language_loss": 0.7440114, + "learning_rate": 3.3426672592136694e-07, + "loss": 0.76840389, + "num_input_tokens_seen": 293855685, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19335938, + "step": 13620, + "time_per_iteration": 2.8428852558135986 + }, + { + "auxiliary_loss_clip": 0.0139245, + "auxiliary_loss_mlp": 0.0103093, + "balance_loss_clip": 1.23495793, + "balance_loss_mlp": 1.01205897, + "epoch": 0.8189388245904103, + "flos": 23743595779200.0, + "grad_norm": 1.8107132326994495, + "language_loss": 0.76842308, + "learning_rate": 3.340512006973011e-07, + "loss": 0.7926569, + "num_input_tokens_seen": 293875540, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18859863, + "step": 13621, + "time_per_iteration": 4.277806520462036 + }, + { + "auxiliary_loss_clip": 0.01402575, + "auxiliary_loss_mlp": 0.01033274, + "balance_loss_clip": 1.2445159, + "balance_loss_mlp": 1.01480889, + "epoch": 0.8189989478430784, + "flos": 28266343614720.0, + "grad_norm": 2.2115070196176814, + "language_loss": 0.6653049, + "learning_rate": 3.3383573864618076e-07, + "loss": 0.68966341, + "num_input_tokens_seen": 293896570, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.18469238, + "step": 13622, + "time_per_iteration": 2.956760883331299 + }, + { + "auxiliary_loss_clip": 0.01408058, + "auxiliary_loss_mlp": 0.01032038, + "balance_loss_clip": 1.24783945, + "balance_loss_mlp": 1.01221371, + "epoch": 0.8190590710957463, + "flos": 21407671774080.0, + "grad_norm": 1.7694429607167805, + "language_loss": 0.7554701, + "learning_rate": 3.3362033977617653e-07, + "loss": 0.77987111, + "num_input_tokens_seen": 293914680, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19812012, + "step": 13623, + "time_per_iteration": 2.9342575073242188 + }, + { + "auxiliary_loss_clip": 0.01407345, + "auxiliary_loss_mlp": 0.01036159, + "balance_loss_clip": 1.24617624, + "balance_loss_mlp": 1.01653743, + "epoch": 0.8191191943484143, + "flos": 38809655015040.0, + "grad_norm": 1.8929206703065888, + "language_loss": 0.63991237, + "learning_rate": 3.3340500409545527e-07, + "loss": 0.66434741, + "num_input_tokens_seen": 293936480, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19628906, + "step": 13624, + "time_per_iteration": 3.0209832191467285 + }, + { + "auxiliary_loss_clip": 0.01389911, + "auxiliary_loss_mlp": 0.01032183, + "balance_loss_clip": 1.23346376, + "balance_loss_mlp": 1.01312184, + "epoch": 0.8191793176010822, + "flos": 25457257042560.0, + "grad_norm": 2.4609487857970276, + "language_loss": 0.79125261, + "learning_rate": 3.3318973161218386e-07, + "loss": 0.81547356, + "num_input_tokens_seen": 293957815, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.19055176, + "step": 13625, + "time_per_iteration": 2.9937992095947266 + }, + { + "auxiliary_loss_clip": 0.01429656, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.25998366, + "balance_loss_mlp": 1.01622057, + "epoch": 0.8192394408537502, + "flos": 25094030636160.0, + "grad_norm": 1.9500094621619282, + "language_loss": 0.76437545, + "learning_rate": 3.329745223345244e-07, + "loss": 0.78901905, + "num_input_tokens_seen": 293975440, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.18469238, + "step": 13626, + "time_per_iteration": 2.8793458938598633 + }, + { + "auxiliary_loss_clip": 0.01397251, + "auxiliary_loss_mlp": 0.01036425, + "balance_loss_clip": 1.23877645, + "balance_loss_mlp": 1.01713729, + "epoch": 0.8192995641064181, + "flos": 27685461882240.0, + "grad_norm": 2.0789645144914783, + "language_loss": 0.74311423, + "learning_rate": 3.3275937627063823e-07, + "loss": 0.76745099, + "num_input_tokens_seen": 293997540, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.19262695, + "step": 13627, + "time_per_iteration": 4.376024961471558 + }, + { + "auxiliary_loss_clip": 0.01414715, + "auxiliary_loss_mlp": 0.01032839, + "balance_loss_clip": 1.25084043, + "balance_loss_mlp": 1.01477861, + "epoch": 0.8193596873590862, + "flos": 21298504775040.0, + "grad_norm": 1.9376342572017424, + "language_loss": 0.69242108, + "learning_rate": 3.3254429342868353e-07, + "loss": 0.71689665, + "num_input_tokens_seen": 294017030, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.18041992, + "step": 13628, + "time_per_iteration": 2.8965373039245605 + }, + { + "auxiliary_loss_clip": 0.01421562, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.25629222, + "balance_loss_mlp": 1.0165813, + "epoch": 0.8194198106117541, + "flos": 17501304856320.0, + "grad_norm": 1.6044243598610655, + "language_loss": 0.85823864, + "learning_rate": 3.323292738168171e-07, + "loss": 0.88282204, + "num_input_tokens_seen": 294035700, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.20178223, + "step": 13629, + "time_per_iteration": 2.8026208877563477 + }, + { + "auxiliary_loss_clip": 0.0140338, + "auxiliary_loss_mlp": 0.01033143, + "balance_loss_clip": 1.24225378, + "balance_loss_mlp": 1.01440322, + "epoch": 0.8194799338644221, + "flos": 15276267152640.0, + "grad_norm": 1.9979581057163118, + "language_loss": 0.74643719, + "learning_rate": 3.3211431744319084e-07, + "loss": 0.77080244, + "num_input_tokens_seen": 294049730, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.1875, + "step": 13630, + "time_per_iteration": 2.781601667404175 + }, + { + "auxiliary_loss_clip": 0.01406999, + "auxiliary_loss_mlp": 0.01031804, + "balance_loss_clip": 1.24410391, + "balance_loss_mlp": 1.01312423, + "epoch": 0.81954005711709, + "flos": 14726541104640.0, + "grad_norm": 6.1177006827861495, + "language_loss": 0.72865582, + "learning_rate": 3.31899424315957e-07, + "loss": 0.75304383, + "num_input_tokens_seen": 294066545, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.18676758, + "step": 13631, + "time_per_iteration": 2.8208000659942627 + }, + { + "auxiliary_loss_clip": 0.01409072, + "auxiliary_loss_mlp": 0.01035169, + "balance_loss_clip": 1.24673104, + "balance_loss_mlp": 1.01669145, + "epoch": 0.819600180369758, + "flos": 23083933570560.0, + "grad_norm": 1.6783451019322302, + "language_loss": 0.76942509, + "learning_rate": 3.3168459444326447e-07, + "loss": 0.79386747, + "num_input_tokens_seen": 294087455, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.18493652, + "step": 13632, + "time_per_iteration": 2.8577749729156494 + }, + { + "auxiliary_loss_clip": 0.01406405, + "auxiliary_loss_mlp": 0.01029406, + "balance_loss_clip": 1.24502659, + "balance_loss_mlp": 1.01138186, + "epoch": 0.8196603036224259, + "flos": 27611251130880.0, + "grad_norm": 1.8870648514627582, + "language_loss": 0.66182685, + "learning_rate": 3.314698278332588e-07, + "loss": 0.686185, + "num_input_tokens_seen": 294107480, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.18029785, + "step": 13633, + "time_per_iteration": 2.881197929382324 + }, + { + "auxiliary_loss_clip": 0.01390403, + "auxiliary_loss_mlp": 0.01028773, + "balance_loss_clip": 1.23377073, + "balance_loss_mlp": 1.01073694, + "epoch": 0.8197204268750939, + "flos": 28592984960640.0, + "grad_norm": 1.483150896501977, + "language_loss": 0.75866783, + "learning_rate": 3.3125512449408513e-07, + "loss": 0.78285962, + "num_input_tokens_seen": 294130115, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.18054199, + "step": 13634, + "time_per_iteration": 2.9018876552581787 + }, + { + "auxiliary_loss_clip": 0.01396342, + "auxiliary_loss_mlp": 0.0103551, + "balance_loss_clip": 1.23928833, + "balance_loss_mlp": 1.0172708, + "epoch": 0.819780550127762, + "flos": 23268713909760.0, + "grad_norm": 1.9968649437151198, + "language_loss": 0.82090646, + "learning_rate": 3.310404844338841e-07, + "loss": 0.84522498, + "num_input_tokens_seen": 294148495, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.18261719, + "step": 13635, + "time_per_iteration": 2.8636741638183594 + }, + { + "auxiliary_loss_clip": 0.01399439, + "auxiliary_loss_mlp": 0.01033165, + "balance_loss_clip": 1.23808312, + "balance_loss_mlp": 1.01343608, + "epoch": 0.8198406733804299, + "flos": 26695855457280.0, + "grad_norm": 2.200115374195653, + "language_loss": 0.76791167, + "learning_rate": 3.308259076607949e-07, + "loss": 0.7922377, + "num_input_tokens_seen": 294169595, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19714355, + "step": 13636, + "time_per_iteration": 2.864133358001709 + }, + { + "auxiliary_loss_clip": 0.01405985, + "auxiliary_loss_mlp": 0.0103272, + "balance_loss_clip": 1.24704552, + "balance_loss_mlp": 1.01466012, + "epoch": 0.8199007966330979, + "flos": 20093686243200.0, + "grad_norm": 1.9848408531013424, + "language_loss": 0.81718493, + "learning_rate": 3.3061139418295445e-07, + "loss": 0.84157205, + "num_input_tokens_seen": 294183885, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18066406, + "step": 13637, + "time_per_iteration": 2.825047016143799 + }, + { + "auxiliary_loss_clip": 0.0139592, + "auxiliary_loss_mlp": 0.01034587, + "balance_loss_clip": 1.23747778, + "balance_loss_mlp": 1.01566863, + "epoch": 0.8199609198857658, + "flos": 31914760072320.0, + "grad_norm": 2.4306871214085923, + "language_loss": 0.72029632, + "learning_rate": 3.3039694400849725e-07, + "loss": 0.74460137, + "num_input_tokens_seen": 294200150, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18933105, + "step": 13638, + "time_per_iteration": 2.930588483810425 + }, + { + "auxiliary_loss_clip": 0.01409199, + "auxiliary_loss_mlp": 0.01032568, + "balance_loss_clip": 1.24464035, + "balance_loss_mlp": 1.01237416, + "epoch": 0.8200210431384338, + "flos": 26481683980800.0, + "grad_norm": 3.385886401367235, + "language_loss": 0.80704951, + "learning_rate": 3.3018255714555564e-07, + "loss": 0.83146721, + "num_input_tokens_seen": 294220385, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.2019043, + "step": 13639, + "time_per_iteration": 2.9667327404022217 + }, + { + "auxiliary_loss_clip": 0.01389367, + "auxiliary_loss_mlp": 0.01032176, + "balance_loss_clip": 1.23132193, + "balance_loss_mlp": 1.01337707, + "epoch": 0.8200811663911017, + "flos": 22101747292800.0, + "grad_norm": 1.7304522804709102, + "language_loss": 0.7956779, + "learning_rate": 3.299682336022589e-07, + "loss": 0.81989324, + "num_input_tokens_seen": 294239355, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18811035, + "step": 13640, + "time_per_iteration": 2.8516106605529785 + }, + { + "auxiliary_loss_clip": 0.01423125, + "auxiliary_loss_mlp": 0.01035261, + "balance_loss_clip": 1.25537777, + "balance_loss_mlp": 1.01555538, + "epoch": 0.8201412896437698, + "flos": 37606872499200.0, + "grad_norm": 2.3505325070472436, + "language_loss": 0.64389557, + "learning_rate": 3.297539733867336e-07, + "loss": 0.66847944, + "num_input_tokens_seen": 294259395, + "router_z_loss_clip": 1.67871094, + "router_z_loss_mlp": 0.19714355, + "step": 13641, + "time_per_iteration": 3.010220766067505 + }, + { + "auxiliary_loss_clip": 0.01396585, + "auxiliary_loss_mlp": 0.01032113, + "balance_loss_clip": 1.23675311, + "balance_loss_mlp": 1.01350474, + "epoch": 0.8202014128964377, + "flos": 19655841882240.0, + "grad_norm": 1.721750478810888, + "language_loss": 0.74162078, + "learning_rate": 3.295397765071055e-07, + "loss": 0.76590776, + "num_input_tokens_seen": 294277365, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18603516, + "step": 13642, + "time_per_iteration": 2.8540937900543213 + }, + { + "auxiliary_loss_clip": 0.01397167, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.23837531, + "balance_loss_mlp": 1.01525009, + "epoch": 0.8202615361491057, + "flos": 31479811378560.0, + "grad_norm": 1.5632308343215724, + "language_loss": 0.71268678, + "learning_rate": 3.2932564297149615e-07, + "loss": 0.73701227, + "num_input_tokens_seen": 294297555, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.20129395, + "step": 13643, + "time_per_iteration": 2.9355967044830322 + }, + { + "auxiliary_loss_clip": 0.01400363, + "auxiliary_loss_mlp": 0.01033961, + "balance_loss_clip": 1.24161983, + "balance_loss_mlp": 1.0152688, + "epoch": 0.8203216594017736, + "flos": 24725782056960.0, + "grad_norm": 1.7839900812620881, + "language_loss": 0.66279197, + "learning_rate": 3.291115727880256e-07, + "loss": 0.68713522, + "num_input_tokens_seen": 294317600, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18701172, + "step": 13644, + "time_per_iteration": 2.872098684310913 + }, + { + "auxiliary_loss_clip": 0.01408262, + "auxiliary_loss_mlp": 0.01034783, + "balance_loss_clip": 1.24662268, + "balance_loss_mlp": 1.01611459, + "epoch": 0.8203817826544416, + "flos": 26043115703040.0, + "grad_norm": 1.4066573696012568, + "language_loss": 0.71743143, + "learning_rate": 3.2889756596481234e-07, + "loss": 0.74186182, + "num_input_tokens_seen": 294340215, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18664551, + "step": 13645, + "time_per_iteration": 2.879178524017334 + }, + { + "auxiliary_loss_clip": 0.01393253, + "auxiliary_loss_mlp": 0.01035278, + "balance_loss_clip": 1.23606634, + "balance_loss_mlp": 1.01731277, + "epoch": 0.8204419059071095, + "flos": 25964697185280.0, + "grad_norm": 1.9392072925540567, + "language_loss": 0.71655375, + "learning_rate": 3.286836225099707e-07, + "loss": 0.740839, + "num_input_tokens_seen": 294358590, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.17980957, + "step": 13646, + "time_per_iteration": 2.9040184020996094 + }, + { + "auxiliary_loss_clip": 0.01418689, + "auxiliary_loss_mlp": 0.01033709, + "balance_loss_clip": 1.25494301, + "balance_loss_mlp": 1.01476657, + "epoch": 0.8205020291597775, + "flos": 23588840004480.0, + "grad_norm": 2.3642757223903823, + "language_loss": 0.79674739, + "learning_rate": 3.284697424316132e-07, + "loss": 0.82127136, + "num_input_tokens_seen": 294375825, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.18933105, + "step": 13647, + "time_per_iteration": 2.849364757537842 + }, + { + "auxiliary_loss_clip": 0.01395457, + "auxiliary_loss_mlp": 0.0103005, + "balance_loss_clip": 1.23937583, + "balance_loss_mlp": 1.01184702, + "epoch": 0.8205621524124456, + "flos": 26810949525120.0, + "grad_norm": 1.421317001165268, + "language_loss": 0.68978417, + "learning_rate": 3.2825592573785034e-07, + "loss": 0.71403921, + "num_input_tokens_seen": 294398500, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.18212891, + "step": 13648, + "time_per_iteration": 2.898265838623047 + }, + { + "auxiliary_loss_clip": 0.01408909, + "auxiliary_loss_mlp": 0.01031659, + "balance_loss_clip": 1.24726343, + "balance_loss_mlp": 1.01338434, + "epoch": 0.8206222756651135, + "flos": 27539574088320.0, + "grad_norm": 3.9286076544463806, + "language_loss": 0.80893523, + "learning_rate": 3.28042172436791e-07, + "loss": 0.83334088, + "num_input_tokens_seen": 294418840, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18261719, + "step": 13649, + "time_per_iteration": 2.8910861015319824 + }, + { + "auxiliary_loss_clip": 0.01409428, + "auxiliary_loss_mlp": 0.01032427, + "balance_loss_clip": 1.24831748, + "balance_loss_mlp": 1.01210141, + "epoch": 0.8206823989177815, + "flos": 21188478124800.0, + "grad_norm": 1.6626201439531358, + "language_loss": 0.69518292, + "learning_rate": 3.278284825365396e-07, + "loss": 0.71960139, + "num_input_tokens_seen": 294438215, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.20349121, + "step": 13650, + "time_per_iteration": 2.978311777114868 + }, + { + "auxiliary_loss_clip": 0.01407334, + "auxiliary_loss_mlp": 0.01031575, + "balance_loss_clip": 1.24645329, + "balance_loss_mlp": 1.01235819, + "epoch": 0.8207425221704494, + "flos": 11516919148800.0, + "grad_norm": 2.0958116947717733, + "language_loss": 0.62388122, + "learning_rate": 3.276148560452001e-07, + "loss": 0.64827031, + "num_input_tokens_seen": 294455260, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19226074, + "step": 13651, + "time_per_iteration": 4.307434558868408 + }, + { + "auxiliary_loss_clip": 0.01421374, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.25898623, + "balance_loss_mlp": 1.0136677, + "epoch": 0.8208026454231174, + "flos": 19801458207360.0, + "grad_norm": 2.0262157922183386, + "language_loss": 0.73311204, + "learning_rate": 3.2740129297087293e-07, + "loss": 0.75765198, + "num_input_tokens_seen": 294473205, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.1895752, + "step": 13652, + "time_per_iteration": 2.967442512512207 + }, + { + "auxiliary_loss_clip": 0.01382652, + "auxiliary_loss_mlp": 0.01031066, + "balance_loss_clip": 1.22748733, + "balance_loss_mlp": 1.01302922, + "epoch": 0.8208627686757853, + "flos": 15675535681920.0, + "grad_norm": 2.1597454015734967, + "language_loss": 0.73173773, + "learning_rate": 3.271877933216558e-07, + "loss": 0.75587487, + "num_input_tokens_seen": 294490645, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.18041992, + "step": 13653, + "time_per_iteration": 2.8257453441619873 + }, + { + "auxiliary_loss_clip": 0.01428657, + "auxiliary_loss_mlp": 0.01036636, + "balance_loss_clip": 1.26314211, + "balance_loss_mlp": 1.01614428, + "epoch": 0.8209228919284534, + "flos": 37495443260160.0, + "grad_norm": 1.8268479763457002, + "language_loss": 0.63936377, + "learning_rate": 3.269743571056451e-07, + "loss": 0.66401672, + "num_input_tokens_seen": 294513500, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.20495605, + "step": 13654, + "time_per_iteration": 2.971712589263916 + }, + { + "auxiliary_loss_clip": 0.01406561, + "auxiliary_loss_mlp": 0.0103101, + "balance_loss_clip": 1.2442807, + "balance_loss_mlp": 1.01284254, + "epoch": 0.8209830151811213, + "flos": 23123504787840.0, + "grad_norm": 1.8026333341910348, + "language_loss": 0.7085861, + "learning_rate": 3.2676098433093447e-07, + "loss": 0.73296189, + "num_input_tokens_seen": 294535710, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.1817627, + "step": 13655, + "time_per_iteration": 2.8667402267456055 + }, + { + "auxiliary_loss_clip": 0.01396301, + "auxiliary_loss_mlp": 0.01034956, + "balance_loss_clip": 1.23802543, + "balance_loss_mlp": 1.01624, + "epoch": 0.8210431384337893, + "flos": 21298097571840.0, + "grad_norm": 1.9666750434396565, + "language_loss": 0.82932436, + "learning_rate": 3.265476750056162e-07, + "loss": 0.85363698, + "num_input_tokens_seen": 294554055, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18713379, + "step": 13656, + "time_per_iteration": 4.252135992050171 + }, + { + "auxiliary_loss_clip": 0.01387595, + "auxiliary_loss_mlp": 0.01036351, + "balance_loss_clip": 1.23169196, + "balance_loss_mlp": 1.01626468, + "epoch": 0.8211032616864572, + "flos": 11507824944000.0, + "grad_norm": 2.1447991137201807, + "language_loss": 0.74644238, + "learning_rate": 3.2633442913777654e-07, + "loss": 0.7706818, + "num_input_tokens_seen": 294570390, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.20080566, + "step": 13657, + "time_per_iteration": 2.8247885704040527 + }, + { + "auxiliary_loss_clip": 0.01397819, + "auxiliary_loss_mlp": 0.01033211, + "balance_loss_clip": 1.23885417, + "balance_loss_mlp": 1.01470935, + "epoch": 0.8211633849391252, + "flos": 29832488271360.0, + "grad_norm": 1.7331919185024978, + "language_loss": 0.56518334, + "learning_rate": 3.2612124673550325e-07, + "loss": 0.58949363, + "num_input_tokens_seen": 294593050, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18493652, + "step": 13658, + "time_per_iteration": 2.9081149101257324 + }, + { + "auxiliary_loss_clip": 0.01400373, + "auxiliary_loss_mlp": 0.01037369, + "balance_loss_clip": 1.23804402, + "balance_loss_mlp": 1.01723444, + "epoch": 0.8212235081917931, + "flos": 13123268449920.0, + "grad_norm": 2.2489389178019703, + "language_loss": 0.80101293, + "learning_rate": 3.259081278068805e-07, + "loss": 0.82539034, + "num_input_tokens_seen": 294608550, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.20129395, + "step": 13659, + "time_per_iteration": 2.8728461265563965 + }, + { + "auxiliary_loss_clip": 0.01389215, + "auxiliary_loss_mlp": 0.01030212, + "balance_loss_clip": 1.23394406, + "balance_loss_mlp": 1.01249719, + "epoch": 0.8212836314444611, + "flos": 40530057753600.0, + "grad_norm": 1.5825785860118864, + "language_loss": 0.60191619, + "learning_rate": 3.256950723599887e-07, + "loss": 0.62611043, + "num_input_tokens_seen": 294630380, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.17724609, + "step": 13660, + "time_per_iteration": 2.982999801635742 + }, + { + "auxiliary_loss_clip": 0.01413157, + "auxiliary_loss_mlp": 0.0103533, + "balance_loss_clip": 1.25083923, + "balance_loss_mlp": 1.0163635, + "epoch": 0.8213437546971292, + "flos": 18779791201920.0, + "grad_norm": 1.9095221264378233, + "language_loss": 0.7402159, + "learning_rate": 3.254820804029075e-07, + "loss": 0.76470077, + "num_input_tokens_seen": 294648655, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18969727, + "step": 13661, + "time_per_iteration": 2.850397825241089 + }, + { + "auxiliary_loss_clip": 0.01415593, + "auxiliary_loss_mlp": 0.01036276, + "balance_loss_clip": 1.25210583, + "balance_loss_mlp": 1.01694059, + "epoch": 0.8214038779497971, + "flos": 19691657781120.0, + "grad_norm": 3.0480544883713634, + "language_loss": 0.76007998, + "learning_rate": 3.252691519437143e-07, + "loss": 0.78459865, + "num_input_tokens_seen": 294666915, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19335938, + "step": 13662, + "time_per_iteration": 4.318909406661987 + }, + { + "auxiliary_loss_clip": 0.0117783, + "auxiliary_loss_mlp": 0.01024624, + "balance_loss_clip": 1.09023976, + "balance_loss_mlp": 1.0016408, + "epoch": 0.8214640012024651, + "flos": 71635666222080.0, + "grad_norm": 0.744445010058128, + "language_loss": 0.54041034, + "learning_rate": 3.250562869904825e-07, + "loss": 0.56243491, + "num_input_tokens_seen": 294731545, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.22949219, + "step": 13663, + "time_per_iteration": 3.4888930320739746 + }, + { + "auxiliary_loss_clip": 0.01400466, + "auxiliary_loss_mlp": 0.01035292, + "balance_loss_clip": 1.23901188, + "balance_loss_mlp": 1.01611114, + "epoch": 0.821524124455133, + "flos": 14765433649920.0, + "grad_norm": 2.920265910623727, + "language_loss": 0.66848946, + "learning_rate": 3.248434855512838e-07, + "loss": 0.69284701, + "num_input_tokens_seen": 294748745, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19177246, + "step": 13664, + "time_per_iteration": 2.852261543273926 + }, + { + "auxiliary_loss_clip": 0.01398735, + "auxiliary_loss_mlp": 0.01032637, + "balance_loss_clip": 1.24135566, + "balance_loss_mlp": 1.01462483, + "epoch": 0.821584247707801, + "flos": 25093080495360.0, + "grad_norm": 1.9052408941484495, + "language_loss": 0.75660419, + "learning_rate": 3.246307476341881e-07, + "loss": 0.78091788, + "num_input_tokens_seen": 294768955, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.17993164, + "step": 13665, + "time_per_iteration": 2.8690290451049805 + }, + { + "auxiliary_loss_clip": 0.01402423, + "auxiliary_loss_mlp": 0.01034234, + "balance_loss_clip": 1.24179065, + "balance_loss_mlp": 1.01588786, + "epoch": 0.8216443709604689, + "flos": 36844558542720.0, + "grad_norm": 2.0451820159758856, + "language_loss": 0.66091228, + "learning_rate": 3.2441807324726256e-07, + "loss": 0.68527883, + "num_input_tokens_seen": 294789250, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18347168, + "step": 13666, + "time_per_iteration": 2.970503807067871 + }, + { + "auxiliary_loss_clip": 0.01412781, + "auxiliary_loss_mlp": 0.01037928, + "balance_loss_clip": 1.25355363, + "balance_loss_mlp": 1.01930714, + "epoch": 0.821704494213137, + "flos": 25092492312960.0, + "grad_norm": 1.6336781317191988, + "language_loss": 0.7740593, + "learning_rate": 3.2420546239857174e-07, + "loss": 0.7985664, + "num_input_tokens_seen": 294809760, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18603516, + "step": 13667, + "time_per_iteration": 2.862119674682617 + }, + { + "auxiliary_loss_clip": 0.01409724, + "auxiliary_loss_mlp": 0.01034341, + "balance_loss_clip": 1.2481811, + "balance_loss_mlp": 1.01493382, + "epoch": 0.8217646174658049, + "flos": 14364310083840.0, + "grad_norm": 1.778425629772752, + "language_loss": 0.77607071, + "learning_rate": 3.239929150961773e-07, + "loss": 0.80051142, + "num_input_tokens_seen": 294826495, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19384766, + "step": 13668, + "time_per_iteration": 2.7893452644348145 + }, + { + "auxiliary_loss_clip": 0.01400497, + "auxiliary_loss_mlp": 0.01032297, + "balance_loss_clip": 1.24206471, + "balance_loss_mlp": 1.01368856, + "epoch": 0.8218247407184729, + "flos": 22100616172800.0, + "grad_norm": 1.942083401316505, + "language_loss": 0.74853319, + "learning_rate": 3.2378043134813984e-07, + "loss": 0.77286112, + "num_input_tokens_seen": 294845370, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18603516, + "step": 13669, + "time_per_iteration": 2.8443350791931152 + }, + { + "auxiliary_loss_clip": 0.0140475, + "auxiliary_loss_mlp": 0.01031045, + "balance_loss_clip": 1.2454809, + "balance_loss_mlp": 1.01223409, + "epoch": 0.8218848639711408, + "flos": 16772816027520.0, + "grad_norm": 1.605876823669812, + "language_loss": 0.79401231, + "learning_rate": 3.235680111625161e-07, + "loss": 0.81837034, + "num_input_tokens_seen": 294863740, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18811035, + "step": 13670, + "time_per_iteration": 2.8055572509765625 + }, + { + "auxiliary_loss_clip": 0.01421952, + "auxiliary_loss_mlp": 0.0103486, + "balance_loss_clip": 1.25859547, + "balance_loss_mlp": 1.01625097, + "epoch": 0.8219449872238088, + "flos": 26005942460160.0, + "grad_norm": 2.293031743267957, + "language_loss": 0.75486302, + "learning_rate": 3.2335565454736123e-07, + "loss": 0.77943116, + "num_input_tokens_seen": 294882815, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.18591309, + "step": 13671, + "time_per_iteration": 2.8439908027648926 + }, + { + "auxiliary_loss_clip": 0.0142661, + "auxiliary_loss_mlp": 0.01034629, + "balance_loss_clip": 1.26085401, + "balance_loss_mlp": 1.01600814, + "epoch": 0.8220051104764767, + "flos": 20788078475520.0, + "grad_norm": 1.6870705656880354, + "language_loss": 0.77398109, + "learning_rate": 3.23143361510728e-07, + "loss": 0.79859346, + "num_input_tokens_seen": 294901985, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.1862793, + "step": 13672, + "time_per_iteration": 2.8243961334228516 + }, + { + "auxiliary_loss_clip": 0.0141131, + "auxiliary_loss_mlp": 0.01033727, + "balance_loss_clip": 1.25120795, + "balance_loss_mlp": 1.01473665, + "epoch": 0.8220652337291448, + "flos": 14583051285120.0, + "grad_norm": 2.7563548972293423, + "language_loss": 0.75327229, + "learning_rate": 3.2293113206066733e-07, + "loss": 0.77772266, + "num_input_tokens_seen": 294919705, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18994141, + "step": 13673, + "time_per_iteration": 2.8409719467163086 + }, + { + "auxiliary_loss_clip": 0.0141031, + "auxiliary_loss_mlp": 0.01033716, + "balance_loss_clip": 1.24593389, + "balance_loss_mlp": 1.01389098, + "epoch": 0.8221253569818128, + "flos": 23816494431360.0, + "grad_norm": 1.6840168862864786, + "language_loss": 0.80428112, + "learning_rate": 3.227189662052254e-07, + "loss": 0.82872128, + "num_input_tokens_seen": 294939900, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19824219, + "step": 13674, + "time_per_iteration": 2.9772467613220215 + }, + { + "auxiliary_loss_clip": 0.01407106, + "auxiliary_loss_mlp": 0.01032204, + "balance_loss_clip": 1.24663913, + "balance_loss_mlp": 1.01314282, + "epoch": 0.8221854802344807, + "flos": 21298188061440.0, + "grad_norm": 1.802651664678103, + "language_loss": 0.71545148, + "learning_rate": 3.225068639524484e-07, + "loss": 0.73984456, + "num_input_tokens_seen": 294959110, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19055176, + "step": 13675, + "time_per_iteration": 2.8192877769470215 + }, + { + "auxiliary_loss_clip": 0.01395033, + "auxiliary_loss_mlp": 0.0103244, + "balance_loss_clip": 1.23851407, + "balance_loss_mlp": 1.014189, + "epoch": 0.8222456034871487, + "flos": 20965845870720.0, + "grad_norm": 2.660701636763627, + "language_loss": 0.74564385, + "learning_rate": 3.2229482531037965e-07, + "loss": 0.76991862, + "num_input_tokens_seen": 294978660, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.18261719, + "step": 13676, + "time_per_iteration": 2.835282564163208 + }, + { + "auxiliary_loss_clip": 0.01406398, + "auxiliary_loss_mlp": 0.01033644, + "balance_loss_clip": 1.24763227, + "balance_loss_mlp": 1.01501179, + "epoch": 0.8223057267398166, + "flos": 21407445550080.0, + "grad_norm": 1.8142350739896407, + "language_loss": 0.81142485, + "learning_rate": 3.2208285028705893e-07, + "loss": 0.83582526, + "num_input_tokens_seen": 294998075, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.1862793, + "step": 13677, + "time_per_iteration": 2.8345768451690674 + }, + { + "auxiliary_loss_clip": 0.01409676, + "auxiliary_loss_mlp": 0.01036061, + "balance_loss_clip": 1.24912667, + "balance_loss_mlp": 1.01701176, + "epoch": 0.8223658499924846, + "flos": 15276629111040.0, + "grad_norm": 3.22012263315414, + "language_loss": 0.70468175, + "learning_rate": 3.218709388905245e-07, + "loss": 0.72913909, + "num_input_tokens_seen": 295015950, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19055176, + "step": 13678, + "time_per_iteration": 2.941174268722534 + }, + { + "auxiliary_loss_clip": 0.0139823, + "auxiliary_loss_mlp": 0.01035197, + "balance_loss_clip": 1.23882735, + "balance_loss_mlp": 1.01617157, + "epoch": 0.8224259732451525, + "flos": 31262246542080.0, + "grad_norm": 1.7349581133261578, + "language_loss": 0.7197749, + "learning_rate": 3.216590911288133e-07, + "loss": 0.74410921, + "num_input_tokens_seen": 295036800, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.19018555, + "step": 13679, + "time_per_iteration": 2.924630880355835 + }, + { + "auxiliary_loss_clip": 0.01393328, + "auxiliary_loss_mlp": 0.01030124, + "balance_loss_clip": 1.23559165, + "balance_loss_mlp": 1.01192057, + "epoch": 0.8224860964978206, + "flos": 21583222174080.0, + "grad_norm": 2.030442085679249, + "language_loss": 0.70538437, + "learning_rate": 3.214473070099564e-07, + "loss": 0.72961891, + "num_input_tokens_seen": 295055300, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18200684, + "step": 13680, + "time_per_iteration": 2.9089460372924805 + }, + { + "auxiliary_loss_clip": 0.01405769, + "auxiliary_loss_mlp": 0.01030524, + "balance_loss_clip": 1.24729967, + "balance_loss_mlp": 1.01154637, + "epoch": 0.8225462197504885, + "flos": 25494023082240.0, + "grad_norm": 2.3329270094302, + "language_loss": 0.60459745, + "learning_rate": 3.21235586541986e-07, + "loss": 0.62896037, + "num_input_tokens_seen": 295076420, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18981934, + "step": 13681, + "time_per_iteration": 2.9251773357391357 + }, + { + "auxiliary_loss_clip": 0.01419393, + "auxiliary_loss_mlp": 0.01037249, + "balance_loss_clip": 1.2548039, + "balance_loss_mlp": 1.01849747, + "epoch": 0.8226063430031565, + "flos": 39400173889920.0, + "grad_norm": 2.497758816655288, + "language_loss": 0.69925141, + "learning_rate": 3.2102392973293047e-07, + "loss": 0.72381788, + "num_input_tokens_seen": 295100540, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.1875, + "step": 13682, + "time_per_iteration": 3.005311965942383 + }, + { + "auxiliary_loss_clip": 0.01411411, + "auxiliary_loss_mlp": 0.01034084, + "balance_loss_clip": 1.25019026, + "balance_loss_mlp": 1.01433063, + "epoch": 0.8226664662558244, + "flos": 22824218563200.0, + "grad_norm": 2.618459019011458, + "language_loss": 0.80074084, + "learning_rate": 3.20812336590816e-07, + "loss": 0.82519579, + "num_input_tokens_seen": 295120180, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19750977, + "step": 13683, + "time_per_iteration": 2.8630001544952393 + }, + { + "auxiliary_loss_clip": 0.01392193, + "auxiliary_loss_mlp": 0.01034845, + "balance_loss_clip": 1.23580647, + "balance_loss_mlp": 1.01682103, + "epoch": 0.8227265895084924, + "flos": 25676450691840.0, + "grad_norm": 4.048485467748755, + "language_loss": 0.86977589, + "learning_rate": 3.206008071236661e-07, + "loss": 0.89404625, + "num_input_tokens_seen": 295138530, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.18017578, + "step": 13684, + "time_per_iteration": 2.877929925918579 + }, + { + "auxiliary_loss_clip": 0.01382083, + "auxiliary_loss_mlp": 0.01035294, + "balance_loss_clip": 1.22768927, + "balance_loss_mlp": 1.01716256, + "epoch": 0.8227867127611603, + "flos": 26190994268160.0, + "grad_norm": 3.303770459842274, + "language_loss": 0.79905272, + "learning_rate": 3.2038934133950157e-07, + "loss": 0.82322645, + "num_input_tokens_seen": 295160260, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.18151855, + "step": 13685, + "time_per_iteration": 2.908799171447754 + }, + { + "auxiliary_loss_clip": 0.01407954, + "auxiliary_loss_mlp": 0.01032571, + "balance_loss_clip": 1.24854147, + "balance_loss_mlp": 1.01346123, + "epoch": 0.8228468360138284, + "flos": 22028215213440.0, + "grad_norm": 1.5224651959201947, + "language_loss": 0.68844295, + "learning_rate": 3.2017793924634194e-07, + "loss": 0.71284819, + "num_input_tokens_seen": 295177055, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19116211, + "step": 13686, + "time_per_iteration": 4.27991795539856 + }, + { + "auxiliary_loss_clip": 0.01402572, + "auxiliary_loss_mlp": 0.0103471, + "balance_loss_clip": 1.24099517, + "balance_loss_mlp": 1.01536214, + "epoch": 0.8229069592664963, + "flos": 14911683402240.0, + "grad_norm": 2.7821443233419707, + "language_loss": 0.79162288, + "learning_rate": 3.1996660085220263e-07, + "loss": 0.81599569, + "num_input_tokens_seen": 295193870, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19348145, + "step": 13687, + "time_per_iteration": 2.7884604930877686 + }, + { + "auxiliary_loss_clip": 0.01396562, + "auxiliary_loss_mlp": 0.01029316, + "balance_loss_clip": 1.23727822, + "balance_loss_mlp": 1.00990915, + "epoch": 0.8229670825191643, + "flos": 15677978901120.0, + "grad_norm": 2.4046791614399488, + "language_loss": 0.72995955, + "learning_rate": 3.1975532616509825e-07, + "loss": 0.75421834, + "num_input_tokens_seen": 295211040, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.1940918, + "step": 13688, + "time_per_iteration": 2.8004329204559326 + }, + { + "auxiliary_loss_clip": 0.01406315, + "auxiliary_loss_mlp": 0.01036474, + "balance_loss_clip": 1.24626565, + "balance_loss_mlp": 1.01852095, + "epoch": 0.8230272057718323, + "flos": 23193417283200.0, + "grad_norm": 1.6211196503663652, + "language_loss": 0.73996115, + "learning_rate": 3.1954411519304025e-07, + "loss": 0.76438904, + "num_input_tokens_seen": 295231300, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.17944336, + "step": 13689, + "time_per_iteration": 2.8848063945770264 + }, + { + "auxiliary_loss_clip": 0.01405685, + "auxiliary_loss_mlp": 0.01036669, + "balance_loss_clip": 1.24406278, + "balance_loss_mlp": 1.01854944, + "epoch": 0.8230873290245002, + "flos": 21042183127680.0, + "grad_norm": 1.925028251133866, + "language_loss": 0.70363772, + "learning_rate": 3.1933296794403887e-07, + "loss": 0.7280612, + "num_input_tokens_seen": 295251045, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18139648, + "step": 13690, + "time_per_iteration": 2.8648109436035156 + }, + { + "auxiliary_loss_clip": 0.01419054, + "auxiliary_loss_mlp": 0.01033014, + "balance_loss_clip": 1.25673807, + "balance_loss_mlp": 1.01467919, + "epoch": 0.8231474522771682, + "flos": 21259657474560.0, + "grad_norm": 1.8609847938000594, + "language_loss": 0.86105227, + "learning_rate": 3.191218844260988e-07, + "loss": 0.88557291, + "num_input_tokens_seen": 295270225, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18334961, + "step": 13691, + "time_per_iteration": 4.24226713180542 + }, + { + "auxiliary_loss_clip": 0.01414804, + "auxiliary_loss_mlp": 0.01033762, + "balance_loss_clip": 1.25297797, + "balance_loss_mlp": 1.01446187, + "epoch": 0.8232075755298361, + "flos": 23852581799040.0, + "grad_norm": 1.6825036275418859, + "language_loss": 0.77793479, + "learning_rate": 3.189108646472252e-07, + "loss": 0.80242044, + "num_input_tokens_seen": 295288950, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19299316, + "step": 13692, + "time_per_iteration": 2.8536882400512695 + }, + { + "auxiliary_loss_clip": 0.01404193, + "auxiliary_loss_mlp": 0.01033147, + "balance_loss_clip": 1.24601579, + "balance_loss_mlp": 1.01470542, + "epoch": 0.8232676987825042, + "flos": 21664219645440.0, + "grad_norm": 1.9760270965883517, + "language_loss": 0.72481072, + "learning_rate": 3.186999086154205e-07, + "loss": 0.74918413, + "num_input_tokens_seen": 295309405, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.18444824, + "step": 13693, + "time_per_iteration": 2.844651222229004 + }, + { + "auxiliary_loss_clip": 0.01394428, + "auxiliary_loss_mlp": 0.01032594, + "balance_loss_clip": 1.23865128, + "balance_loss_mlp": 1.01501119, + "epoch": 0.8233278220351721, + "flos": 26333805415680.0, + "grad_norm": 1.912882273387837, + "language_loss": 0.84446073, + "learning_rate": 3.1848901633868355e-07, + "loss": 0.8687309, + "num_input_tokens_seen": 295331115, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.17602539, + "step": 13694, + "time_per_iteration": 2.892763376235962 + }, + { + "auxiliary_loss_clip": 0.0142096, + "auxiliary_loss_mlp": 0.01034745, + "balance_loss_clip": 1.25699389, + "balance_loss_mlp": 1.01656556, + "epoch": 0.8233879452878401, + "flos": 21735806198400.0, + "grad_norm": 2.0077266461899637, + "language_loss": 0.77817816, + "learning_rate": 3.182781878250118e-07, + "loss": 0.80273515, + "num_input_tokens_seen": 295350495, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.1817627, + "step": 13695, + "time_per_iteration": 2.9473884105682373 + }, + { + "auxiliary_loss_clip": 0.0140359, + "auxiliary_loss_mlp": 0.01030898, + "balance_loss_clip": 1.24374652, + "balance_loss_mlp": 1.01244473, + "epoch": 0.823448068540508, + "flos": 20567301258240.0, + "grad_norm": 4.891700751175468, + "language_loss": 0.82089019, + "learning_rate": 3.1806742308239985e-07, + "loss": 0.84523511, + "num_input_tokens_seen": 295368225, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18457031, + "step": 13696, + "time_per_iteration": 2.898198366165161 + }, + { + "auxiliary_loss_clip": 0.01179682, + "auxiliary_loss_mlp": 0.0102575, + "balance_loss_clip": 1.09335947, + "balance_loss_mlp": 1.00190818, + "epoch": 0.823508191793176, + "flos": 67308512232960.0, + "grad_norm": 0.7302280209889646, + "language_loss": 0.63897443, + "learning_rate": 3.178567221188393e-07, + "loss": 0.66102874, + "num_input_tokens_seen": 295430035, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.23828125, + "step": 13697, + "time_per_iteration": 6.162862777709961 + }, + { + "auxiliary_loss_clip": 0.01393302, + "auxiliary_loss_mlp": 0.01029466, + "balance_loss_clip": 1.23816562, + "balance_loss_mlp": 1.01176369, + "epoch": 0.8235683150458439, + "flos": 17936660753280.0, + "grad_norm": 1.506967598769438, + "language_loss": 0.73304987, + "learning_rate": 3.1764608494232037e-07, + "loss": 0.75727755, + "num_input_tokens_seen": 295447765, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.17700195, + "step": 13698, + "time_per_iteration": 2.8528225421905518 + }, + { + "auxiliary_loss_clip": 0.01400788, + "auxiliary_loss_mlp": 0.01030761, + "balance_loss_clip": 1.23958838, + "balance_loss_mlp": 1.01199734, + "epoch": 0.823628438298512, + "flos": 18925271792640.0, + "grad_norm": 2.273147455085372, + "language_loss": 0.72456121, + "learning_rate": 3.174355115608305e-07, + "loss": 0.74887681, + "num_input_tokens_seen": 295464810, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18762207, + "step": 13699, + "time_per_iteration": 2.8470635414123535 + }, + { + "auxiliary_loss_clip": 0.0139836, + "auxiliary_loss_mlp": 0.0103434, + "balance_loss_clip": 1.24066019, + "balance_loss_mlp": 1.01605356, + "epoch": 0.8236885615511799, + "flos": 18705399471360.0, + "grad_norm": 1.908514163666442, + "language_loss": 0.82600856, + "learning_rate": 3.1722500198235526e-07, + "loss": 0.85033554, + "num_input_tokens_seen": 295482605, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18273926, + "step": 13700, + "time_per_iteration": 2.822153329849243 + }, + { + "auxiliary_loss_clip": 0.01411716, + "auxiliary_loss_mlp": 0.01034141, + "balance_loss_clip": 1.24961019, + "balance_loss_mlp": 1.01596141, + "epoch": 0.8237486848038479, + "flos": 23705246171520.0, + "grad_norm": 2.026187317590143, + "language_loss": 0.73557115, + "learning_rate": 3.170145562148763e-07, + "loss": 0.76002967, + "num_input_tokens_seen": 295503780, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18188477, + "step": 13701, + "time_per_iteration": 2.8757972717285156 + }, + { + "auxiliary_loss_clip": 0.01408122, + "auxiliary_loss_mlp": 0.01035332, + "balance_loss_clip": 1.24568987, + "balance_loss_mlp": 1.01664042, + "epoch": 0.8238088080565159, + "flos": 23451910680960.0, + "grad_norm": 1.6268885944412874, + "language_loss": 0.6972304, + "learning_rate": 3.1680417426637384e-07, + "loss": 0.72166497, + "num_input_tokens_seen": 295522035, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18676758, + "step": 13702, + "time_per_iteration": 2.8842036724090576 + }, + { + "auxiliary_loss_clip": 0.01401525, + "auxiliary_loss_mlp": 0.01032194, + "balance_loss_clip": 1.24229956, + "balance_loss_mlp": 1.01328778, + "epoch": 0.8238689313091838, + "flos": 22756568307840.0, + "grad_norm": 1.7441570479672652, + "language_loss": 0.7555939, + "learning_rate": 3.1659385614482603e-07, + "loss": 0.77993107, + "num_input_tokens_seen": 295541190, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18908691, + "step": 13703, + "time_per_iteration": 2.9083786010742188 + }, + { + "auxiliary_loss_clip": 0.01435508, + "auxiliary_loss_mlp": 0.01036322, + "balance_loss_clip": 1.26770413, + "balance_loss_mlp": 1.01753426, + "epoch": 0.8239290545618518, + "flos": 25641177730560.0, + "grad_norm": 2.005217590859395, + "language_loss": 0.71516812, + "learning_rate": 3.1638360185820755e-07, + "loss": 0.73988634, + "num_input_tokens_seen": 295558860, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.18798828, + "step": 13704, + "time_per_iteration": 2.8713295459747314 + }, + { + "auxiliary_loss_clip": 0.01406379, + "auxiliary_loss_mlp": 0.01033452, + "balance_loss_clip": 1.24645352, + "balance_loss_mlp": 1.01523674, + "epoch": 0.8239891778145197, + "flos": 26036374227840.0, + "grad_norm": 1.6738311814079343, + "language_loss": 0.6477505, + "learning_rate": 3.161734114144916e-07, + "loss": 0.67214882, + "num_input_tokens_seen": 295578155, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18212891, + "step": 13705, + "time_per_iteration": 2.881086826324463 + }, + { + "auxiliary_loss_clip": 0.01411423, + "auxiliary_loss_mlp": 0.01034453, + "balance_loss_clip": 1.24900341, + "balance_loss_mlp": 1.01580906, + "epoch": 0.8240493010671878, + "flos": 21842756202240.0, + "grad_norm": 2.306487423343465, + "language_loss": 0.70771974, + "learning_rate": 3.1596328482164915e-07, + "loss": 0.73217851, + "num_input_tokens_seen": 295599170, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.18652344, + "step": 13706, + "time_per_iteration": 2.872955322265625 + }, + { + "auxiliary_loss_clip": 0.01416295, + "auxiliary_loss_mlp": 0.0103405, + "balance_loss_clip": 1.25522041, + "balance_loss_mlp": 1.01521468, + "epoch": 0.8241094243198557, + "flos": 18561185735040.0, + "grad_norm": 1.7095139559548214, + "language_loss": 0.70243692, + "learning_rate": 3.157532220876475e-07, + "loss": 0.72694033, + "num_input_tokens_seen": 295617465, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18835449, + "step": 13707, + "time_per_iteration": 2.8490333557128906 + }, + { + "auxiliary_loss_clip": 0.014074, + "auxiliary_loss_mlp": 0.01035204, + "balance_loss_clip": 1.24694884, + "balance_loss_mlp": 1.01576138, + "epoch": 0.8241695475725237, + "flos": 25458071448960.0, + "grad_norm": 1.977391220442281, + "language_loss": 0.79873747, + "learning_rate": 3.1554322322045226e-07, + "loss": 0.82316351, + "num_input_tokens_seen": 295634960, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.19433594, + "step": 13708, + "time_per_iteration": 2.852030038833618 + }, + { + "auxiliary_loss_clip": 0.01409765, + "auxiliary_loss_mlp": 0.01034893, + "balance_loss_clip": 1.24812174, + "balance_loss_mlp": 1.01587915, + "epoch": 0.8242296708251916, + "flos": 18999030096000.0, + "grad_norm": 2.205801656874968, + "language_loss": 0.6917944, + "learning_rate": 3.1533328822802664e-07, + "loss": 0.716241, + "num_input_tokens_seen": 295652725, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19018555, + "step": 13709, + "time_per_iteration": 2.8380861282348633 + }, + { + "auxiliary_loss_clip": 0.0141284, + "auxiliary_loss_mlp": 0.0103167, + "balance_loss_clip": 1.25091231, + "balance_loss_mlp": 1.01333523, + "epoch": 0.8242897940778596, + "flos": 22610951982720.0, + "grad_norm": 1.9614632224739392, + "language_loss": 0.8326726, + "learning_rate": 3.151234171183319e-07, + "loss": 0.85711771, + "num_input_tokens_seen": 295671195, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18347168, + "step": 13710, + "time_per_iteration": 2.957716703414917 + }, + { + "auxiliary_loss_clip": 0.01394667, + "auxiliary_loss_mlp": 0.01030428, + "balance_loss_clip": 1.23600721, + "balance_loss_mlp": 1.01109254, + "epoch": 0.8243499173305275, + "flos": 21477493779840.0, + "grad_norm": 2.0642391910129256, + "language_loss": 0.79153091, + "learning_rate": 3.149136098993257e-07, + "loss": 0.81578183, + "num_input_tokens_seen": 295689130, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.19335938, + "step": 13711, + "time_per_iteration": 2.830284833908081 + }, + { + "auxiliary_loss_clip": 0.01405929, + "auxiliary_loss_mlp": 0.01033983, + "balance_loss_clip": 1.24577034, + "balance_loss_mlp": 1.01544619, + "epoch": 0.8244100405831956, + "flos": 20019746960640.0, + "grad_norm": 2.199617135723057, + "language_loss": 0.66096169, + "learning_rate": 3.1470386657896473e-07, + "loss": 0.68536079, + "num_input_tokens_seen": 295706385, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.1854248, + "step": 13712, + "time_per_iteration": 2.814237117767334 + }, + { + "auxiliary_loss_clip": 0.01411346, + "auxiliary_loss_mlp": 0.01033783, + "balance_loss_clip": 1.25011635, + "balance_loss_mlp": 1.01473343, + "epoch": 0.8244701638358635, + "flos": 26441434091520.0, + "grad_norm": 1.7541744669652763, + "language_loss": 0.75490308, + "learning_rate": 3.14494187165202e-07, + "loss": 0.77935433, + "num_input_tokens_seen": 295727925, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19055176, + "step": 13713, + "time_per_iteration": 2.8558037281036377 + }, + { + "auxiliary_loss_clip": 0.01418171, + "auxiliary_loss_mlp": 0.01032224, + "balance_loss_clip": 1.25479245, + "balance_loss_mlp": 1.01435435, + "epoch": 0.8245302870885315, + "flos": 17649092931840.0, + "grad_norm": 2.0685678440554325, + "language_loss": 0.81550086, + "learning_rate": 3.1428457166598833e-07, + "loss": 0.84000486, + "num_input_tokens_seen": 295744420, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.17858887, + "step": 13714, + "time_per_iteration": 2.8215112686157227 + }, + { + "auxiliary_loss_clip": 0.01402569, + "auxiliary_loss_mlp": 0.0103688, + "balance_loss_clip": 1.24466348, + "balance_loss_mlp": 1.01755655, + "epoch": 0.8245904103411995, + "flos": 26219616243840.0, + "grad_norm": 1.762720717520167, + "language_loss": 0.66916823, + "learning_rate": 3.1407502008927235e-07, + "loss": 0.69356269, + "num_input_tokens_seen": 295765105, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.1932373, + "step": 13715, + "time_per_iteration": 2.9206020832061768 + }, + { + "auxiliary_loss_clip": 0.01418591, + "auxiliary_loss_mlp": 0.01033908, + "balance_loss_clip": 1.25570917, + "balance_loss_mlp": 1.0155139, + "epoch": 0.8246505335938674, + "flos": 24215265267840.0, + "grad_norm": 1.6756661431840223, + "language_loss": 0.7576068, + "learning_rate": 3.1386553244300086e-07, + "loss": 0.78213179, + "num_input_tokens_seen": 295784200, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.18395996, + "step": 13716, + "time_per_iteration": 2.868501663208008 + }, + { + "auxiliary_loss_clip": 0.01184444, + "auxiliary_loss_mlp": 0.01030267, + "balance_loss_clip": 1.09770584, + "balance_loss_mlp": 1.00528073, + "epoch": 0.8247106568465354, + "flos": 67127215743360.0, + "grad_norm": 0.7165589340588783, + "language_loss": 0.59094143, + "learning_rate": 3.136561087351175e-07, + "loss": 0.61308855, + "num_input_tokens_seen": 295846555, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.25, + "step": 13717, + "time_per_iteration": 3.4550724029541016 + }, + { + "auxiliary_loss_clip": 0.01410229, + "auxiliary_loss_mlp": 0.01032494, + "balance_loss_clip": 1.25120437, + "balance_loss_mlp": 1.01468456, + "epoch": 0.8247707800992033, + "flos": 12575940376320.0, + "grad_norm": 8.853193984922592, + "language_loss": 0.80690682, + "learning_rate": 3.1344674897356373e-07, + "loss": 0.83133405, + "num_input_tokens_seen": 295863425, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.17810059, + "step": 13718, + "time_per_iteration": 2.8253233432769775 + }, + { + "auxiliary_loss_clip": 0.0139575, + "auxiliary_loss_mlp": 0.01037755, + "balance_loss_clip": 1.23833227, + "balance_loss_mlp": 1.01957536, + "epoch": 0.8248309033518714, + "flos": 15931857329280.0, + "grad_norm": 1.6259874705982873, + "language_loss": 0.69496262, + "learning_rate": 3.132374531662778e-07, + "loss": 0.71929765, + "num_input_tokens_seen": 295880925, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.1817627, + "step": 13719, + "time_per_iteration": 2.822535276412964 + }, + { + "auxiliary_loss_clip": 0.01406593, + "auxiliary_loss_mlp": 0.01039271, + "balance_loss_clip": 1.24706852, + "balance_loss_mlp": 1.02012634, + "epoch": 0.8248910266045393, + "flos": 17573208122880.0, + "grad_norm": 2.8811594534899254, + "language_loss": 0.70929086, + "learning_rate": 3.13028221321197e-07, + "loss": 0.73374951, + "num_input_tokens_seen": 295898205, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19140625, + "step": 13720, + "time_per_iteration": 2.8427984714508057 + }, + { + "auxiliary_loss_clip": 0.01422769, + "auxiliary_loss_mlp": 0.01037122, + "balance_loss_clip": 1.26087046, + "balance_loss_mlp": 1.01801312, + "epoch": 0.8249511498572073, + "flos": 28630927365120.0, + "grad_norm": 1.5901112056385813, + "language_loss": 0.76075691, + "learning_rate": 3.1281905344625467e-07, + "loss": 0.78535581, + "num_input_tokens_seen": 295918130, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19104004, + "step": 13721, + "time_per_iteration": 4.453307867050171 + }, + { + "auxiliary_loss_clip": 0.01407087, + "auxiliary_loss_mlp": 0.01031483, + "balance_loss_clip": 1.24813998, + "balance_loss_mlp": 1.01254022, + "epoch": 0.8250112731098752, + "flos": 25567509916800.0, + "grad_norm": 10.6541046040222, + "language_loss": 0.78538036, + "learning_rate": 3.1260994954938305e-07, + "loss": 0.80976605, + "num_input_tokens_seen": 295937760, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18945312, + "step": 13722, + "time_per_iteration": 2.8728177547454834 + }, + { + "auxiliary_loss_clip": 0.01404226, + "auxiliary_loss_mlp": 0.01036415, + "balance_loss_clip": 1.24560559, + "balance_loss_mlp": 1.01790166, + "epoch": 0.8250713963625432, + "flos": 27757772352000.0, + "grad_norm": 1.607029304056864, + "language_loss": 0.63471437, + "learning_rate": 3.1240090963851205e-07, + "loss": 0.6591208, + "num_input_tokens_seen": 295957585, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18530273, + "step": 13723, + "time_per_iteration": 2.8797552585601807 + }, + { + "auxiliary_loss_clip": 0.01407451, + "auxiliary_loss_mlp": 0.01034082, + "balance_loss_clip": 1.24724233, + "balance_loss_mlp": 1.01518714, + "epoch": 0.8251315196152111, + "flos": 21619083317760.0, + "grad_norm": 1.8058379640118993, + "language_loss": 0.7544533, + "learning_rate": 3.121919337215666e-07, + "loss": 0.77886862, + "num_input_tokens_seen": 295977135, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.18908691, + "step": 13724, + "time_per_iteration": 2.8725147247314453 + }, + { + "auxiliary_loss_clip": 0.0140938, + "auxiliary_loss_mlp": 0.01034106, + "balance_loss_clip": 1.2487911, + "balance_loss_mlp": 1.01510406, + "epoch": 0.8251916428678792, + "flos": 28589636845440.0, + "grad_norm": 1.7843508208277452, + "language_loss": 0.64586318, + "learning_rate": 3.1198302180647253e-07, + "loss": 0.6702981, + "num_input_tokens_seen": 295996265, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18994141, + "step": 13725, + "time_per_iteration": 2.8608803749084473 + }, + { + "auxiliary_loss_clip": 0.01394327, + "auxiliary_loss_mlp": 0.01030039, + "balance_loss_clip": 1.2355535, + "balance_loss_mlp": 1.01196718, + "epoch": 0.8252517661205471, + "flos": 23085381404160.0, + "grad_norm": 1.6374268147076583, + "language_loss": 0.82614505, + "learning_rate": 3.1177417390115125e-07, + "loss": 0.85038877, + "num_input_tokens_seen": 296014745, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18078613, + "step": 13726, + "time_per_iteration": 2.832667112350464 + }, + { + "auxiliary_loss_clip": 0.01389294, + "auxiliary_loss_mlp": 0.01032972, + "balance_loss_clip": 1.23469532, + "balance_loss_mlp": 1.01511455, + "epoch": 0.8253118893732151, + "flos": 31772220393600.0, + "grad_norm": 2.527850209555812, + "language_loss": 0.7128911, + "learning_rate": 3.1156539001352286e-07, + "loss": 0.73711377, + "num_input_tokens_seen": 296036960, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.17858887, + "step": 13727, + "time_per_iteration": 4.378351926803589 + }, + { + "auxiliary_loss_clip": 0.01420961, + "auxiliary_loss_mlp": 0.01035284, + "balance_loss_clip": 1.25768042, + "balance_loss_mlp": 1.01698506, + "epoch": 0.8253720126258831, + "flos": 18305588004480.0, + "grad_norm": 2.6586294009771736, + "language_loss": 0.64198291, + "learning_rate": 3.113566701515036e-07, + "loss": 0.66654539, + "num_input_tokens_seen": 296056540, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.18310547, + "step": 13728, + "time_per_iteration": 2.8443081378936768 + }, + { + "auxiliary_loss_clip": 0.01432568, + "auxiliary_loss_mlp": 0.01037344, + "balance_loss_clip": 1.26677144, + "balance_loss_mlp": 1.01881886, + "epoch": 0.825432135878551, + "flos": 26808642040320.0, + "grad_norm": 1.6376045926463716, + "language_loss": 0.72119468, + "learning_rate": 3.111480143230092e-07, + "loss": 0.74589384, + "num_input_tokens_seen": 296077950, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.18518066, + "step": 13729, + "time_per_iteration": 2.8461825847625732 + }, + { + "auxiliary_loss_clip": 0.01183886, + "auxiliary_loss_mlp": 0.0102706, + "balance_loss_clip": 1.09555674, + "balance_loss_mlp": 1.00522089, + "epoch": 0.825492259131219, + "flos": 54242985427200.0, + "grad_norm": 0.848553240234136, + "language_loss": 0.6271975, + "learning_rate": 3.109394225359514e-07, + "loss": 0.64930701, + "num_input_tokens_seen": 296127060, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.21875, + "step": 13730, + "time_per_iteration": 3.1700479984283447 + }, + { + "auxiliary_loss_clip": 0.01405783, + "auxiliary_loss_mlp": 0.01035656, + "balance_loss_clip": 1.24627304, + "balance_loss_mlp": 1.01709485, + "epoch": 0.825552382383887, + "flos": 43770020987520.0, + "grad_norm": 2.004350647662021, + "language_loss": 0.64086956, + "learning_rate": 3.1073089479823945e-07, + "loss": 0.66528392, + "num_input_tokens_seen": 296147775, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18554688, + "step": 13731, + "time_per_iteration": 3.0116758346557617 + }, + { + "auxiliary_loss_clip": 0.01429, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.26259851, + "balance_loss_mlp": 1.01554847, + "epoch": 0.825612505636555, + "flos": 12610036972800.0, + "grad_norm": 2.1265347450304963, + "language_loss": 0.69903213, + "learning_rate": 3.105224311177812e-07, + "loss": 0.72366595, + "num_input_tokens_seen": 296163560, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.18835449, + "step": 13732, + "time_per_iteration": 5.640879154205322 + }, + { + "auxiliary_loss_clip": 0.01424737, + "auxiliary_loss_mlp": 0.01033927, + "balance_loss_clip": 1.26018846, + "balance_loss_mlp": 1.01487708, + "epoch": 0.8256726288892229, + "flos": 17602644504960.0, + "grad_norm": 5.171241984067599, + "language_loss": 0.72795928, + "learning_rate": 3.103140315024817e-07, + "loss": 0.75254595, + "num_input_tokens_seen": 296178730, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19055176, + "step": 13733, + "time_per_iteration": 2.8274881839752197 + }, + { + "auxiliary_loss_clip": 0.0139783, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.23984754, + "balance_loss_mlp": 1.0134778, + "epoch": 0.8257327521418909, + "flos": 23816403941760.0, + "grad_norm": 1.487371191873629, + "language_loss": 0.83078879, + "learning_rate": 3.1010569596024437e-07, + "loss": 0.85509032, + "num_input_tokens_seen": 296200175, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.18859863, + "step": 13734, + "time_per_iteration": 2.8727948665618896 + }, + { + "auxiliary_loss_clip": 0.01400769, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.2424202, + "balance_loss_mlp": 1.01267505, + "epoch": 0.8257928753945588, + "flos": 19290443725440.0, + "grad_norm": 2.1979895668679825, + "language_loss": 0.83594298, + "learning_rate": 3.098974244989676e-07, + "loss": 0.8602702, + "num_input_tokens_seen": 296219305, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.19262695, + "step": 13735, + "time_per_iteration": 2.815056800842285 + }, + { + "auxiliary_loss_clip": 0.01401586, + "auxiliary_loss_mlp": 0.01035, + "balance_loss_clip": 1.24135518, + "balance_loss_mlp": 1.01661766, + "epoch": 0.8258529986472268, + "flos": 18488060858880.0, + "grad_norm": 2.1174813257934297, + "language_loss": 0.71546495, + "learning_rate": 3.096892171265497e-07, + "loss": 0.73983085, + "num_input_tokens_seen": 296236945, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18371582, + "step": 13736, + "time_per_iteration": 2.8014867305755615 + }, + { + "auxiliary_loss_clip": 0.01184382, + "auxiliary_loss_mlp": 0.01023942, + "balance_loss_clip": 1.09723377, + "balance_loss_mlp": 1.003438, + "epoch": 0.8259131218998947, + "flos": 62164252834560.0, + "grad_norm": 1.5117590762938031, + "language_loss": 0.68055987, + "learning_rate": 3.0948107385088665e-07, + "loss": 0.70264304, + "num_input_tokens_seen": 296294685, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.20507812, + "step": 13737, + "time_per_iteration": 3.361929416656494 + }, + { + "auxiliary_loss_clip": 0.01414392, + "auxiliary_loss_mlp": 0.01034177, + "balance_loss_clip": 1.2537328, + "balance_loss_mlp": 1.01609325, + "epoch": 0.8259732451525628, + "flos": 22167949714560.0, + "grad_norm": 1.808236915294887, + "language_loss": 0.70490253, + "learning_rate": 3.0927299467987e-07, + "loss": 0.72938824, + "num_input_tokens_seen": 296314790, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.18078613, + "step": 13738, + "time_per_iteration": 2.9188954830169678 + }, + { + "auxiliary_loss_clip": 0.0141481, + "auxiliary_loss_mlp": 0.01034078, + "balance_loss_clip": 1.25250483, + "balance_loss_mlp": 1.01312065, + "epoch": 0.8260333684052307, + "flos": 38375520727680.0, + "grad_norm": 1.9203237173667616, + "language_loss": 0.64009142, + "learning_rate": 3.090649796213911e-07, + "loss": 0.66458035, + "num_input_tokens_seen": 296335355, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.20959473, + "step": 13739, + "time_per_iteration": 3.0387895107269287 + }, + { + "auxiliary_loss_clip": 0.01184607, + "auxiliary_loss_mlp": 0.01025515, + "balance_loss_clip": 1.0962646, + "balance_loss_mlp": 1.0036757, + "epoch": 0.8260934916578987, + "flos": 62214818538240.0, + "grad_norm": 0.8421983917953507, + "language_loss": 0.59414899, + "learning_rate": 3.0885702868333853e-07, + "loss": 0.61625016, + "num_input_tokens_seen": 296399885, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.21875, + "step": 13740, + "time_per_iteration": 3.336470365524292 + }, + { + "auxiliary_loss_clip": 0.01421263, + "auxiliary_loss_mlp": 0.01032476, + "balance_loss_clip": 1.25591254, + "balance_loss_mlp": 1.01296139, + "epoch": 0.8261536149105667, + "flos": 22575814755840.0, + "grad_norm": 1.9579697471801056, + "language_loss": 0.76098162, + "learning_rate": 3.086491418735959e-07, + "loss": 0.785519, + "num_input_tokens_seen": 296417660, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19506836, + "step": 13741, + "time_per_iteration": 2.8691625595092773 + }, + { + "auxiliary_loss_clip": 0.01397803, + "auxiliary_loss_mlp": 0.01031505, + "balance_loss_clip": 1.23838413, + "balance_loss_mlp": 1.01293242, + "epoch": 0.8262137381632346, + "flos": 32538244423680.0, + "grad_norm": 1.9757907431425563, + "language_loss": 0.63234794, + "learning_rate": 3.0844131920004726e-07, + "loss": 0.65664101, + "num_input_tokens_seen": 296438255, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18591309, + "step": 13742, + "time_per_iteration": 2.899017095565796 + }, + { + "auxiliary_loss_clip": 0.01432013, + "auxiliary_loss_mlp": 0.01035657, + "balance_loss_clip": 1.26349616, + "balance_loss_mlp": 1.01511717, + "epoch": 0.8262738614159026, + "flos": 14144392517760.0, + "grad_norm": 2.38240794557015, + "language_loss": 0.66505694, + "learning_rate": 3.0823356067057327e-07, + "loss": 0.68973362, + "num_input_tokens_seen": 296454485, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.20544434, + "step": 13743, + "time_per_iteration": 2.8157413005828857 + }, + { + "auxiliary_loss_clip": 0.01416534, + "auxiliary_loss_mlp": 0.01037557, + "balance_loss_clip": 1.25632882, + "balance_loss_mlp": 1.01814961, + "epoch": 0.8263339846685706, + "flos": 19834016480640.0, + "grad_norm": 1.6686668017188593, + "language_loss": 0.66942811, + "learning_rate": 3.0802586629305283e-07, + "loss": 0.69396907, + "num_input_tokens_seen": 296473740, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.1940918, + "step": 13744, + "time_per_iteration": 2.8694705963134766 + }, + { + "auxiliary_loss_clip": 0.01405575, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.24598157, + "balance_loss_mlp": 1.01461864, + "epoch": 0.8263941079212386, + "flos": 22755844391040.0, + "grad_norm": 1.9069398187531013, + "language_loss": 0.7604087, + "learning_rate": 3.078182360753612e-07, + "loss": 0.78478813, + "num_input_tokens_seen": 296493355, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.17773438, + "step": 13745, + "time_per_iteration": 2.843214988708496 + }, + { + "auxiliary_loss_clip": 0.01380555, + "auxiliary_loss_mlp": 0.01031002, + "balance_loss_clip": 1.22637522, + "balance_loss_mlp": 1.01275074, + "epoch": 0.8264542311739065, + "flos": 20130271303680.0, + "grad_norm": 1.784709928131952, + "language_loss": 0.80036473, + "learning_rate": 3.076106700253709e-07, + "loss": 0.8244803, + "num_input_tokens_seen": 296510520, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.18249512, + "step": 13746, + "time_per_iteration": 2.8357508182525635 + }, + { + "auxiliary_loss_clip": 0.01429658, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.26442313, + "balance_loss_mlp": 1.01436925, + "epoch": 0.8265143544265745, + "flos": 16845986148480.0, + "grad_norm": 1.9569171501791687, + "language_loss": 0.69092274, + "learning_rate": 3.0740316815095415e-07, + "loss": 0.71555543, + "num_input_tokens_seen": 296528265, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19238281, + "step": 13747, + "time_per_iteration": 2.8320279121398926 + }, + { + "auxiliary_loss_clip": 0.01400327, + "auxiliary_loss_mlp": 0.01031985, + "balance_loss_clip": 1.23976874, + "balance_loss_mlp": 1.01347184, + "epoch": 0.8265744776792424, + "flos": 22028893885440.0, + "grad_norm": 2.5548706761618774, + "language_loss": 0.76131427, + "learning_rate": 3.0719573045997835e-07, + "loss": 0.78563738, + "num_input_tokens_seen": 296547810, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18518066, + "step": 13748, + "time_per_iteration": 2.884262800216675 + }, + { + "auxiliary_loss_clip": 0.0139296, + "auxiliary_loss_mlp": 0.01036565, + "balance_loss_clip": 1.23738074, + "balance_loss_mlp": 1.01925552, + "epoch": 0.8266346009319104, + "flos": 19254220623360.0, + "grad_norm": 1.6629917769944962, + "language_loss": 0.63709313, + "learning_rate": 3.069883569603102e-07, + "loss": 0.66138834, + "num_input_tokens_seen": 296565940, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.17333984, + "step": 13749, + "time_per_iteration": 2.8175060749053955 + }, + { + "auxiliary_loss_clip": 0.01399633, + "auxiliary_loss_mlp": 0.01031164, + "balance_loss_clip": 1.23947668, + "balance_loss_mlp": 1.01251972, + "epoch": 0.8266947241845783, + "flos": 24176463212160.0, + "grad_norm": 1.652893679768725, + "language_loss": 0.74252528, + "learning_rate": 3.067810476598132e-07, + "loss": 0.76683331, + "num_input_tokens_seen": 296585090, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18640137, + "step": 13750, + "time_per_iteration": 2.9163479804992676 + }, + { + "auxiliary_loss_clip": 0.01422627, + "auxiliary_loss_mlp": 0.01037175, + "balance_loss_clip": 1.26032436, + "balance_loss_mlp": 1.01760137, + "epoch": 0.8267548474372464, + "flos": 21115715207040.0, + "grad_norm": 1.8012734655977065, + "language_loss": 0.66227949, + "learning_rate": 3.065738025663496e-07, + "loss": 0.68687749, + "num_input_tokens_seen": 296604950, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19567871, + "step": 13751, + "time_per_iteration": 2.8824455738067627 + }, + { + "auxiliary_loss_clip": 0.01397305, + "auxiliary_loss_mlp": 0.01032747, + "balance_loss_clip": 1.23848987, + "balance_loss_mlp": 1.01509237, + "epoch": 0.8268149706899143, + "flos": 39982910659200.0, + "grad_norm": 1.4780643242392275, + "language_loss": 0.61412036, + "learning_rate": 3.0636662168777607e-07, + "loss": 0.63842094, + "num_input_tokens_seen": 296627780, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.17651367, + "step": 13752, + "time_per_iteration": 3.0032575130462646 + }, + { + "auxiliary_loss_clip": 0.01183814, + "auxiliary_loss_mlp": 0.01019234, + "balance_loss_clip": 1.0966686, + "balance_loss_mlp": 1.00073326, + "epoch": 0.8268750939425823, + "flos": 65808597260160.0, + "grad_norm": 1.0084175287825399, + "language_loss": 0.57485861, + "learning_rate": 3.0615950503194986e-07, + "loss": 0.59688914, + "num_input_tokens_seen": 296683850, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.18457031, + "step": 13753, + "time_per_iteration": 3.4065821170806885 + }, + { + "auxiliary_loss_clip": 0.01184998, + "auxiliary_loss_mlp": 0.01019948, + "balance_loss_clip": 1.09876657, + "balance_loss_mlp": 0.99801332, + "epoch": 0.8269352171952503, + "flos": 53005744356480.0, + "grad_norm": 0.7030518731018391, + "language_loss": 0.54940444, + "learning_rate": 3.0595245260672563e-07, + "loss": 0.57145393, + "num_input_tokens_seen": 296741420, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.21972656, + "step": 13754, + "time_per_iteration": 3.374541997909546 + }, + { + "auxiliary_loss_clip": 0.01401427, + "auxiliary_loss_mlp": 0.01030449, + "balance_loss_clip": 1.24297345, + "balance_loss_mlp": 1.01296091, + "epoch": 0.8269953404479182, + "flos": 23086557768960.0, + "grad_norm": 2.313020444720935, + "language_loss": 0.70086014, + "learning_rate": 3.0574546441995354e-07, + "loss": 0.7251789, + "num_input_tokens_seen": 296759620, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.17480469, + "step": 13755, + "time_per_iteration": 2.8698792457580566 + }, + { + "auxiliary_loss_clip": 0.01400072, + "auxiliary_loss_mlp": 0.01036496, + "balance_loss_clip": 1.24168301, + "balance_loss_mlp": 1.01910353, + "epoch": 0.8270554637005862, + "flos": 14218105576320.0, + "grad_norm": 2.0617768681647894, + "language_loss": 0.70856071, + "learning_rate": 3.0553854047948324e-07, + "loss": 0.73292637, + "num_input_tokens_seen": 296777275, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.17382812, + "step": 13756, + "time_per_iteration": 2.8703560829162598 + }, + { + "auxiliary_loss_clip": 0.01404792, + "auxiliary_loss_mlp": 0.01031849, + "balance_loss_clip": 1.24608791, + "balance_loss_mlp": 1.0137651, + "epoch": 0.8271155869532542, + "flos": 21771984055680.0, + "grad_norm": 2.028972456462824, + "language_loss": 0.73579741, + "learning_rate": 3.053316807931623e-07, + "loss": 0.76016378, + "num_input_tokens_seen": 296796655, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18078613, + "step": 13757, + "time_per_iteration": 4.338456630706787 + }, + { + "auxiliary_loss_clip": 0.01424635, + "auxiliary_loss_mlp": 0.0103572, + "balance_loss_clip": 1.26043785, + "balance_loss_mlp": 1.01479912, + "epoch": 0.8271757102059222, + "flos": 15128388587520.0, + "grad_norm": 2.6137350218753745, + "language_loss": 0.69496179, + "learning_rate": 3.0512488536883283e-07, + "loss": 0.71956527, + "num_input_tokens_seen": 296813705, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.20922852, + "step": 13758, + "time_per_iteration": 2.8267078399658203 + }, + { + "auxiliary_loss_clip": 0.01393105, + "auxiliary_loss_mlp": 0.01029935, + "balance_loss_clip": 1.23657393, + "balance_loss_mlp": 1.01167202, + "epoch": 0.8272358334585901, + "flos": 24144312142080.0, + "grad_norm": 1.53940253404656, + "language_loss": 0.69925618, + "learning_rate": 3.0491815421433775e-07, + "loss": 0.7234866, + "num_input_tokens_seen": 296833985, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.18273926, + "step": 13759, + "time_per_iteration": 2.898010015487671 + }, + { + "auxiliary_loss_clip": 0.01395213, + "auxiliary_loss_mlp": 0.01032926, + "balance_loss_clip": 1.23701644, + "balance_loss_mlp": 1.01426995, + "epoch": 0.8272959567112581, + "flos": 19000342195200.0, + "grad_norm": 1.8999404795689314, + "language_loss": 0.71319187, + "learning_rate": 3.047114873375161e-07, + "loss": 0.73747325, + "num_input_tokens_seen": 296850150, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18664551, + "step": 13760, + "time_per_iteration": 2.887220621109009 + }, + { + "auxiliary_loss_clip": 0.01398198, + "auxiliary_loss_mlp": 0.01028826, + "balance_loss_clip": 1.24182701, + "balance_loss_mlp": 1.01115918, + "epoch": 0.827356079963926, + "flos": 20641421520000.0, + "grad_norm": 1.9251799038220658, + "language_loss": 0.77895284, + "learning_rate": 3.0450488474620505e-07, + "loss": 0.80322313, + "num_input_tokens_seen": 296869585, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.17687988, + "step": 13761, + "time_per_iteration": 4.296947717666626 + }, + { + "auxiliary_loss_clip": 0.01399116, + "auxiliary_loss_mlp": 0.01033325, + "balance_loss_clip": 1.24273634, + "balance_loss_mlp": 1.01568246, + "epoch": 0.827416203216594, + "flos": 22426171643520.0, + "grad_norm": 1.646053291900062, + "language_loss": 0.71341634, + "learning_rate": 3.042983464482387e-07, + "loss": 0.73774076, + "num_input_tokens_seen": 296887710, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.17651367, + "step": 13762, + "time_per_iteration": 2.8422939777374268 + }, + { + "auxiliary_loss_clip": 0.01393132, + "auxiliary_loss_mlp": 0.01028754, + "balance_loss_clip": 1.2357074, + "balance_loss_mlp": 1.01120651, + "epoch": 0.827476326469262, + "flos": 19035569911680.0, + "grad_norm": 1.9245112795267285, + "language_loss": 0.71041214, + "learning_rate": 3.0409187245144853e-07, + "loss": 0.734631, + "num_input_tokens_seen": 296906265, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.17541504, + "step": 13763, + "time_per_iteration": 2.8434808254241943 + }, + { + "auxiliary_loss_clip": 0.01186579, + "auxiliary_loss_mlp": 0.01020491, + "balance_loss_clip": 1.09872591, + "balance_loss_mlp": 0.99769861, + "epoch": 0.82753644972193, + "flos": 68532179863680.0, + "grad_norm": 0.8411010192207616, + "language_loss": 0.65161258, + "learning_rate": 3.038854627636651e-07, + "loss": 0.67368329, + "num_input_tokens_seen": 296971290, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.22753906, + "step": 13764, + "time_per_iteration": 3.435088634490967 + }, + { + "auxiliary_loss_clip": 0.01407477, + "auxiliary_loss_mlp": 0.01031656, + "balance_loss_clip": 1.24837232, + "balance_loss_mlp": 1.01280892, + "epoch": 0.8275965729745979, + "flos": 18414483534720.0, + "grad_norm": 1.9768870839153099, + "language_loss": 0.78711909, + "learning_rate": 3.0367911739271423e-07, + "loss": 0.81151038, + "num_input_tokens_seen": 296989060, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18847656, + "step": 13765, + "time_per_iteration": 2.8451600074768066 + }, + { + "auxiliary_loss_clip": 0.01414489, + "auxiliary_loss_mlp": 0.0103496, + "balance_loss_clip": 1.25092626, + "balance_loss_mlp": 1.01636314, + "epoch": 0.8276566962272659, + "flos": 28523796382080.0, + "grad_norm": 1.5531220176619522, + "language_loss": 0.62977827, + "learning_rate": 3.034728363464214e-07, + "loss": 0.65427274, + "num_input_tokens_seen": 297011300, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.18603516, + "step": 13766, + "time_per_iteration": 2.9132919311523438 + }, + { + "auxiliary_loss_clip": 0.01406527, + "auxiliary_loss_mlp": 0.01033998, + "balance_loss_clip": 1.24676228, + "balance_loss_mlp": 1.01491249, + "epoch": 0.8277168194799339, + "flos": 20239935995520.0, + "grad_norm": 2.690903767771868, + "language_loss": 0.83442956, + "learning_rate": 3.03266619632609e-07, + "loss": 0.8588348, + "num_input_tokens_seen": 297030350, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.1907959, + "step": 13767, + "time_per_iteration": 5.788644075393677 + }, + { + "auxiliary_loss_clip": 0.01418465, + "auxiliary_loss_mlp": 0.01035504, + "balance_loss_clip": 1.2567004, + "balance_loss_mlp": 1.01695538, + "epoch": 0.8277769427326018, + "flos": 28488613910400.0, + "grad_norm": 1.6231991649956503, + "language_loss": 0.69332874, + "learning_rate": 3.030604672590964e-07, + "loss": 0.71786845, + "num_input_tokens_seen": 297049710, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.1854248, + "step": 13768, + "time_per_iteration": 2.932056188583374 + }, + { + "auxiliary_loss_clip": 0.0138929, + "auxiliary_loss_mlp": 0.01029592, + "balance_loss_clip": 1.23221564, + "balance_loss_mlp": 1.01119816, + "epoch": 0.8278370659852698, + "flos": 27208770220800.0, + "grad_norm": 3.0577814055711485, + "language_loss": 0.75034124, + "learning_rate": 3.028543792337006e-07, + "loss": 0.77453005, + "num_input_tokens_seen": 297070510, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18395996, + "step": 13769, + "time_per_iteration": 2.926689386367798 + }, + { + "auxiliary_loss_clip": 0.01403393, + "auxiliary_loss_mlp": 0.01032123, + "balance_loss_clip": 1.24299479, + "balance_loss_mlp": 1.01335931, + "epoch": 0.8278971892379378, + "flos": 37830319159680.0, + "grad_norm": 1.7583898899526764, + "language_loss": 0.74998093, + "learning_rate": 3.0264835556423675e-07, + "loss": 0.77433598, + "num_input_tokens_seen": 297092585, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.1875, + "step": 13770, + "time_per_iteration": 3.0069572925567627 + }, + { + "auxiliary_loss_clip": 0.01406937, + "auxiliary_loss_mlp": 0.01030456, + "balance_loss_clip": 1.2455107, + "balance_loss_mlp": 1.01177561, + "epoch": 0.8279573124906058, + "flos": 22569118525440.0, + "grad_norm": 1.6382520926663422, + "language_loss": 0.76394677, + "learning_rate": 3.0244239625851785e-07, + "loss": 0.78832066, + "num_input_tokens_seen": 297110055, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.18688965, + "step": 13771, + "time_per_iteration": 2.866610527038574 + }, + { + "auxiliary_loss_clip": 0.01404195, + "auxiliary_loss_mlp": 0.01031771, + "balance_loss_clip": 1.2438271, + "balance_loss_mlp": 1.01375866, + "epoch": 0.8280174357432737, + "flos": 36078308288640.0, + "grad_norm": 2.6325633123140726, + "language_loss": 0.73431253, + "learning_rate": 3.0223650132435284e-07, + "loss": 0.75867218, + "num_input_tokens_seen": 297132170, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18029785, + "step": 13772, + "time_per_iteration": 2.960813283920288 + }, + { + "auxiliary_loss_clip": 0.01392083, + "auxiliary_loss_mlp": 0.01039428, + "balance_loss_clip": 1.23453987, + "balance_loss_mlp": 1.02060461, + "epoch": 0.8280775589959417, + "flos": 22970015867520.0, + "grad_norm": 2.257871008968038, + "language_loss": 0.75212884, + "learning_rate": 3.0203067076955035e-07, + "loss": 0.77644396, + "num_input_tokens_seen": 297149515, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.18835449, + "step": 13773, + "time_per_iteration": 2.851172685623169 + }, + { + "auxiliary_loss_clip": 0.01393666, + "auxiliary_loss_mlp": 0.01031644, + "balance_loss_clip": 1.23665917, + "balance_loss_mlp": 1.01315498, + "epoch": 0.8281376822486096, + "flos": 26073276001920.0, + "grad_norm": 2.031947889309481, + "language_loss": 0.76354223, + "learning_rate": 3.01824904601915e-07, + "loss": 0.78779531, + "num_input_tokens_seen": 297170320, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.18481445, + "step": 13774, + "time_per_iteration": 2.8817636966705322 + }, + { + "auxiliary_loss_clip": 0.01425315, + "auxiliary_loss_mlp": 0.01040148, + "balance_loss_clip": 1.25986528, + "balance_loss_mlp": 1.02037084, + "epoch": 0.8281978055012776, + "flos": 20677508887680.0, + "grad_norm": 1.726487148782234, + "language_loss": 0.75166571, + "learning_rate": 3.01619202829249e-07, + "loss": 0.77632034, + "num_input_tokens_seen": 297189935, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19763184, + "step": 13775, + "time_per_iteration": 2.852985382080078 + }, + { + "auxiliary_loss_clip": 0.0142076, + "auxiliary_loss_mlp": 0.01031295, + "balance_loss_clip": 1.25567436, + "balance_loss_mlp": 1.01245975, + "epoch": 0.8282579287539455, + "flos": 29327355613440.0, + "grad_norm": 2.041481646389865, + "language_loss": 0.73855948, + "learning_rate": 3.01413565459353e-07, + "loss": 0.76308012, + "num_input_tokens_seen": 297210885, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.18823242, + "step": 13776, + "time_per_iteration": 2.9063913822174072 + }, + { + "auxiliary_loss_clip": 0.01407323, + "auxiliary_loss_mlp": 0.01033847, + "balance_loss_clip": 1.24539042, + "balance_loss_mlp": 1.01420093, + "epoch": 0.8283180520066136, + "flos": 15714744940800.0, + "grad_norm": 2.023957322666098, + "language_loss": 0.77834904, + "learning_rate": 3.0120799250002483e-07, + "loss": 0.80276078, + "num_input_tokens_seen": 297228500, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19641113, + "step": 13777, + "time_per_iteration": 2.799372673034668 + }, + { + "auxiliary_loss_clip": 0.01407927, + "auxiliary_loss_mlp": 0.01034612, + "balance_loss_clip": 1.24979091, + "balance_loss_mlp": 1.0169456, + "epoch": 0.8283781752592815, + "flos": 24802481272320.0, + "grad_norm": 1.4531827329469804, + "language_loss": 0.82534993, + "learning_rate": 3.010024839590604e-07, + "loss": 0.84977531, + "num_input_tokens_seen": 297249470, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.17663574, + "step": 13778, + "time_per_iteration": 2.8737404346466064 + }, + { + "auxiliary_loss_clip": 0.01385053, + "auxiliary_loss_mlp": 0.01029546, + "balance_loss_clip": 1.2297852, + "balance_loss_mlp": 1.01123571, + "epoch": 0.8284382985119495, + "flos": 18990388339200.0, + "grad_norm": 3.358588190796583, + "language_loss": 0.748963, + "learning_rate": 3.0079703984425187e-07, + "loss": 0.77310896, + "num_input_tokens_seen": 297265970, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.18310547, + "step": 13779, + "time_per_iteration": 2.8725521564483643 + }, + { + "auxiliary_loss_clip": 0.01185545, + "auxiliary_loss_mlp": 0.01027109, + "balance_loss_clip": 1.09712458, + "balance_loss_mlp": 1.00422049, + "epoch": 0.8284984217646175, + "flos": 61067623898880.0, + "grad_norm": 0.7986145217143393, + "language_loss": 0.56741965, + "learning_rate": 3.0059166016338954e-07, + "loss": 0.58954608, + "num_input_tokens_seen": 297325525, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.22851562, + "step": 13780, + "time_per_iteration": 3.4343576431274414 + }, + { + "auxiliary_loss_clip": 0.01394534, + "auxiliary_loss_mlp": 0.01029032, + "balance_loss_clip": 1.23563695, + "balance_loss_mlp": 1.01003027, + "epoch": 0.8285585450172854, + "flos": 19722949200000.0, + "grad_norm": 1.7458301962813136, + "language_loss": 0.80901825, + "learning_rate": 3.0038634492426205e-07, + "loss": 0.83325398, + "num_input_tokens_seen": 297345025, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18994141, + "step": 13781, + "time_per_iteration": 2.842149496078491 + }, + { + "auxiliary_loss_clip": 0.01396853, + "auxiliary_loss_mlp": 0.01036566, + "balance_loss_clip": 1.2374444, + "balance_loss_mlp": 1.01814842, + "epoch": 0.8286186682699535, + "flos": 21698768689920.0, + "grad_norm": 5.99504751780044, + "language_loss": 0.76583409, + "learning_rate": 3.001810941346543e-07, + "loss": 0.79016829, + "num_input_tokens_seen": 297363570, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18408203, + "step": 13782, + "time_per_iteration": 2.8272643089294434 + }, + { + "auxiliary_loss_clip": 0.01411777, + "auxiliary_loss_mlp": 0.01034595, + "balance_loss_clip": 1.24981093, + "balance_loss_mlp": 1.01518762, + "epoch": 0.8286787915226214, + "flos": 25786567831680.0, + "grad_norm": 1.5634398522893869, + "language_loss": 0.76641321, + "learning_rate": 2.9997590780234983e-07, + "loss": 0.79087698, + "num_input_tokens_seen": 297385385, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.1940918, + "step": 13783, + "time_per_iteration": 2.911400079727173 + }, + { + "auxiliary_loss_clip": 0.01416907, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.25528622, + "balance_loss_mlp": 1.01416206, + "epoch": 0.8287389147752894, + "flos": 21298323795840.0, + "grad_norm": 1.9597940307982569, + "language_loss": 0.74358737, + "learning_rate": 2.997707859351304e-07, + "loss": 0.7680887, + "num_input_tokens_seen": 297403950, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19055176, + "step": 13784, + "time_per_iteration": 2.8289148807525635 + }, + { + "auxiliary_loss_clip": 0.01414669, + "auxiliary_loss_mlp": 0.01035643, + "balance_loss_clip": 1.25117874, + "balance_loss_mlp": 1.01630747, + "epoch": 0.8287990380279573, + "flos": 33557332475520.0, + "grad_norm": 1.4692543817935058, + "language_loss": 0.70422202, + "learning_rate": 2.99565728540772e-07, + "loss": 0.72872519, + "num_input_tokens_seen": 297424565, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19335938, + "step": 13785, + "time_per_iteration": 2.9389724731445312 + }, + { + "auxiliary_loss_clip": 0.01405161, + "auxiliary_loss_mlp": 0.01033573, + "balance_loss_clip": 1.24431038, + "balance_loss_mlp": 1.01434493, + "epoch": 0.8288591612806253, + "flos": 22976666853120.0, + "grad_norm": 1.4488878342717473, + "language_loss": 0.69297773, + "learning_rate": 2.993607356270516e-07, + "loss": 0.71736509, + "num_input_tokens_seen": 297445180, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19238281, + "step": 13786, + "time_per_iteration": 2.867943286895752 + }, + { + "auxiliary_loss_clip": 0.01430088, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.26393116, + "balance_loss_mlp": 1.01873147, + "epoch": 0.8289192845332932, + "flos": 18598404222720.0, + "grad_norm": 1.9811070332490766, + "language_loss": 0.7779994, + "learning_rate": 2.991558072017426e-07, + "loss": 0.80267662, + "num_input_tokens_seen": 297463790, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.18908691, + "step": 13787, + "time_per_iteration": 2.8462703227996826 + }, + { + "auxiliary_loss_clip": 0.01400159, + "auxiliary_loss_mlp": 0.01033838, + "balance_loss_clip": 1.24199784, + "balance_loss_mlp": 1.0148958, + "epoch": 0.8289794077859612, + "flos": 15458830496640.0, + "grad_norm": 2.689517287414441, + "language_loss": 0.80811387, + "learning_rate": 2.989509432726163e-07, + "loss": 0.83245379, + "num_input_tokens_seen": 297480100, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.18933105, + "step": 13788, + "time_per_iteration": 2.8292412757873535 + }, + { + "auxiliary_loss_clip": 0.01398528, + "auxiliary_loss_mlp": 0.01034313, + "balance_loss_clip": 1.239923, + "balance_loss_mlp": 1.01593149, + "epoch": 0.8290395310386292, + "flos": 28889918455680.0, + "grad_norm": 1.4929914655813683, + "language_loss": 0.71942478, + "learning_rate": 2.9874614384744014e-07, + "loss": 0.74375319, + "num_input_tokens_seen": 297499890, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18383789, + "step": 13789, + "time_per_iteration": 2.8737614154815674 + }, + { + "auxiliary_loss_clip": 0.01403328, + "auxiliary_loss_mlp": 0.01032042, + "balance_loss_clip": 1.24151278, + "balance_loss_mlp": 1.01267004, + "epoch": 0.8290996542912972, + "flos": 36590318156160.0, + "grad_norm": 1.8192005061594356, + "language_loss": 0.68488634, + "learning_rate": 2.985414089339813e-07, + "loss": 0.70923996, + "num_input_tokens_seen": 297521440, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19372559, + "step": 13790, + "time_per_iteration": 2.9968698024749756 + }, + { + "auxiliary_loss_clip": 0.01413354, + "auxiliary_loss_mlp": 0.01037914, + "balance_loss_clip": 1.25065303, + "balance_loss_mlp": 1.01750541, + "epoch": 0.8291597775439651, + "flos": 23633478639360.0, + "grad_norm": 1.5845937965421082, + "language_loss": 0.77940857, + "learning_rate": 2.9833673854000265e-07, + "loss": 0.80392122, + "num_input_tokens_seen": 297539920, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.20397949, + "step": 13791, + "time_per_iteration": 2.896021842956543 + }, + { + "auxiliary_loss_clip": 0.01403966, + "auxiliary_loss_mlp": 0.01031275, + "balance_loss_clip": 1.24692225, + "balance_loss_mlp": 1.0136801, + "epoch": 0.8292199007966331, + "flos": 21407400305280.0, + "grad_norm": 4.68108229797551, + "language_loss": 0.7064569, + "learning_rate": 2.981321326732651e-07, + "loss": 0.73080927, + "num_input_tokens_seen": 297560000, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.17602539, + "step": 13792, + "time_per_iteration": 4.34114408493042 + }, + { + "auxiliary_loss_clip": 0.01403476, + "auxiliary_loss_mlp": 0.0103045, + "balance_loss_clip": 1.24260378, + "balance_loss_mlp": 1.01141191, + "epoch": 0.829280024049301, + "flos": 28779439357440.0, + "grad_norm": 1.7909958016959286, + "language_loss": 0.65817082, + "learning_rate": 2.9792759134152736e-07, + "loss": 0.68251002, + "num_input_tokens_seen": 297579300, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19042969, + "step": 13793, + "time_per_iteration": 2.9298954010009766 + }, + { + "auxiliary_loss_clip": 0.01404548, + "auxiliary_loss_mlp": 0.01031597, + "balance_loss_clip": 1.24254477, + "balance_loss_mlp": 1.01257157, + "epoch": 0.829340147301969, + "flos": 19947617470080.0, + "grad_norm": 2.2841251902486563, + "language_loss": 0.66851056, + "learning_rate": 2.977231145525461e-07, + "loss": 0.69287205, + "num_input_tokens_seen": 297598095, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19030762, + "step": 13794, + "time_per_iteration": 2.8325254917144775 + }, + { + "auxiliary_loss_clip": 0.01409985, + "auxiliary_loss_mlp": 0.01034971, + "balance_loss_clip": 1.24799776, + "balance_loss_mlp": 1.0160408, + "epoch": 0.829400270554637, + "flos": 25239511226880.0, + "grad_norm": 1.8604143880332655, + "language_loss": 0.66532934, + "learning_rate": 2.975187023140757e-07, + "loss": 0.68977886, + "num_input_tokens_seen": 297615955, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18933105, + "step": 13795, + "time_per_iteration": 2.885525703430176 + }, + { + "auxiliary_loss_clip": 0.01387172, + "auxiliary_loss_mlp": 0.01031446, + "balance_loss_clip": 1.23237062, + "balance_loss_mlp": 1.01362419, + "epoch": 0.829460393807305, + "flos": 24474618316800.0, + "grad_norm": 1.837352494805835, + "language_loss": 0.67568767, + "learning_rate": 2.973143546338661e-07, + "loss": 0.69987386, + "num_input_tokens_seen": 297636285, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.17822266, + "step": 13796, + "time_per_iteration": 4.278230428695679 + }, + { + "auxiliary_loss_clip": 0.0139088, + "auxiliary_loss_mlp": 0.01032184, + "balance_loss_clip": 1.23388004, + "balance_loss_mlp": 1.01352751, + "epoch": 0.829520517059973, + "flos": 15130922296320.0, + "grad_norm": 1.5742848315900135, + "language_loss": 0.71950781, + "learning_rate": 2.971100715196666e-07, + "loss": 0.74373847, + "num_input_tokens_seen": 297653315, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.18664551, + "step": 13797, + "time_per_iteration": 2.7943508625030518 + }, + { + "auxiliary_loss_clip": 0.01414937, + "auxiliary_loss_mlp": 0.01030239, + "balance_loss_clip": 1.25459433, + "balance_loss_mlp": 1.0121069, + "epoch": 0.8295806403126409, + "flos": 21589963649280.0, + "grad_norm": 2.144577036198014, + "language_loss": 0.73782277, + "learning_rate": 2.969058529792243e-07, + "loss": 0.7622745, + "num_input_tokens_seen": 297673480, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18127441, + "step": 13798, + "time_per_iteration": 2.90126371383667 + }, + { + "auxiliary_loss_clip": 0.01390248, + "auxiliary_loss_mlp": 0.01031129, + "balance_loss_clip": 1.23448038, + "balance_loss_mlp": 1.01290131, + "epoch": 0.8296407635653089, + "flos": 21736756339200.0, + "grad_norm": 1.6750068483197647, + "language_loss": 0.77115399, + "learning_rate": 2.967016990202822e-07, + "loss": 0.79536778, + "num_input_tokens_seen": 297693250, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.18237305, + "step": 13799, + "time_per_iteration": 2.871572256088257 + }, + { + "auxiliary_loss_clip": 0.01404864, + "auxiliary_loss_mlp": 0.01032421, + "balance_loss_clip": 1.2465831, + "balance_loss_mlp": 1.01352596, + "epoch": 0.8297008868179768, + "flos": 11188332276480.0, + "grad_norm": 1.9647387688228162, + "language_loss": 0.67741919, + "learning_rate": 2.9649760965058245e-07, + "loss": 0.70179206, + "num_input_tokens_seen": 297710975, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18908691, + "step": 13800, + "time_per_iteration": 2.9485418796539307 + }, + { + "auxiliary_loss_clip": 0.0141793, + "auxiliary_loss_mlp": 0.01037387, + "balance_loss_clip": 1.25439525, + "balance_loss_mlp": 1.01735997, + "epoch": 0.8297610100706448, + "flos": 20672803428480.0, + "grad_norm": 1.7255768260841884, + "language_loss": 0.74872392, + "learning_rate": 2.9629358487786515e-07, + "loss": 0.7732771, + "num_input_tokens_seen": 297730860, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.20043945, + "step": 13801, + "time_per_iteration": 2.8644356727600098 + }, + { + "auxiliary_loss_clip": 0.01411458, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.24915183, + "balance_loss_mlp": 1.01486802, + "epoch": 0.8298211333233128, + "flos": 20386095258240.0, + "grad_norm": 2.081547259914401, + "language_loss": 0.73949367, + "learning_rate": 2.9608962470986476e-07, + "loss": 0.76393861, + "num_input_tokens_seen": 297749765, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.1817627, + "step": 13802, + "time_per_iteration": 5.637048721313477 + }, + { + "auxiliary_loss_clip": 0.01405094, + "auxiliary_loss_mlp": 0.01035149, + "balance_loss_clip": 1.24443662, + "balance_loss_mlp": 1.01638603, + "epoch": 0.8298812565759808, + "flos": 21519101013120.0, + "grad_norm": 2.5048370222987604, + "language_loss": 0.75396919, + "learning_rate": 2.9588572915431644e-07, + "loss": 0.77837163, + "num_input_tokens_seen": 297770380, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18762207, + "step": 13803, + "time_per_iteration": 2.857107162475586 + }, + { + "auxiliary_loss_clip": 0.01402334, + "auxiliary_loss_mlp": 0.01034471, + "balance_loss_clip": 1.24280059, + "balance_loss_mlp": 1.01563549, + "epoch": 0.8299413798286487, + "flos": 22828516819200.0, + "grad_norm": 1.7502463301415667, + "language_loss": 0.79656792, + "learning_rate": 2.9568189821895215e-07, + "loss": 0.82093596, + "num_input_tokens_seen": 297789440, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18823242, + "step": 13804, + "time_per_iteration": 2.820383310317993 + }, + { + "auxiliary_loss_clip": 0.01401974, + "auxiliary_loss_mlp": 0.01029287, + "balance_loss_clip": 1.24303126, + "balance_loss_mlp": 1.01113129, + "epoch": 0.8300015030813167, + "flos": 29691124957440.0, + "grad_norm": 1.4957993418318138, + "language_loss": 0.73753601, + "learning_rate": 2.954781319115016e-07, + "loss": 0.76184869, + "num_input_tokens_seen": 297810425, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18164062, + "step": 13805, + "time_per_iteration": 2.88493013381958 + }, + { + "auxiliary_loss_clip": 0.01404881, + "auxiliary_loss_mlp": 0.01030593, + "balance_loss_clip": 1.24175, + "balance_loss_mlp": 1.01228237, + "epoch": 0.8300616263339846, + "flos": 19729057248000.0, + "grad_norm": 2.4477701978023902, + "language_loss": 0.77657557, + "learning_rate": 2.952744302396906e-07, + "loss": 0.80093026, + "num_input_tokens_seen": 297827680, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.18322754, + "step": 13806, + "time_per_iteration": 2.9343559741973877 + }, + { + "auxiliary_loss_clip": 0.01414687, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.25068212, + "balance_loss_mlp": 1.01465893, + "epoch": 0.8301217495866526, + "flos": 19911575347200.0, + "grad_norm": 1.6372776010493362, + "language_loss": 0.64422947, + "learning_rate": 2.950707932112444e-07, + "loss": 0.66871607, + "num_input_tokens_seen": 297848005, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19287109, + "step": 13807, + "time_per_iteration": 2.8344202041625977 + }, + { + "auxiliary_loss_clip": 0.01406424, + "auxiliary_loss_mlp": 0.01033759, + "balance_loss_clip": 1.24675608, + "balance_loss_mlp": 1.01516271, + "epoch": 0.8301818728393207, + "flos": 19724397033600.0, + "grad_norm": 1.9124147141527659, + "language_loss": 0.74296045, + "learning_rate": 2.948672208338847e-07, + "loss": 0.76736224, + "num_input_tokens_seen": 297866730, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18591309, + "step": 13808, + "time_per_iteration": 2.8325653076171875 + }, + { + "auxiliary_loss_clip": 0.014114, + "auxiliary_loss_mlp": 0.01040943, + "balance_loss_clip": 1.2478472, + "balance_loss_mlp": 1.02105856, + "epoch": 0.8302419960919886, + "flos": 28305462384000.0, + "grad_norm": 1.8950540626210772, + "language_loss": 0.6786449, + "learning_rate": 2.9466371311533046e-07, + "loss": 0.70316833, + "num_input_tokens_seen": 297886390, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19885254, + "step": 13809, + "time_per_iteration": 3.017317771911621 + }, + { + "auxiliary_loss_clip": 0.01407156, + "auxiliary_loss_mlp": 0.01034964, + "balance_loss_clip": 1.24646986, + "balance_loss_mlp": 1.01459098, + "epoch": 0.8303021193446566, + "flos": 18232598862720.0, + "grad_norm": 1.9176571859516467, + "language_loss": 0.74504006, + "learning_rate": 2.9446027006329896e-07, + "loss": 0.76946127, + "num_input_tokens_seen": 297905110, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.20373535, + "step": 13810, + "time_per_iteration": 2.8718161582946777 + }, + { + "auxiliary_loss_clip": 0.0139346, + "auxiliary_loss_mlp": 0.01032841, + "balance_loss_clip": 1.23827004, + "balance_loss_mlp": 1.01515007, + "epoch": 0.8303622425973245, + "flos": 23121559261440.0, + "grad_norm": 1.627521370914086, + "language_loss": 0.81933415, + "learning_rate": 2.94256891685505e-07, + "loss": 0.84359717, + "num_input_tokens_seen": 297925460, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.17675781, + "step": 13811, + "time_per_iteration": 2.856236219406128 + }, + { + "auxiliary_loss_clip": 0.01414896, + "auxiliary_loss_mlp": 0.01034763, + "balance_loss_clip": 1.25242352, + "balance_loss_mlp": 1.0160954, + "epoch": 0.8304223658499925, + "flos": 19582219313280.0, + "grad_norm": 2.82902023080237, + "language_loss": 0.73637748, + "learning_rate": 2.9405357798966156e-07, + "loss": 0.76087409, + "num_input_tokens_seen": 297941760, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.18664551, + "step": 13812, + "time_per_iteration": 2.827937602996826 + }, + { + "auxiliary_loss_clip": 0.01387719, + "auxiliary_loss_mlp": 0.01032199, + "balance_loss_clip": 1.23249006, + "balance_loss_mlp": 1.01384091, + "epoch": 0.8304824891026604, + "flos": 24436902136320.0, + "grad_norm": 1.6528263411759951, + "language_loss": 0.78852808, + "learning_rate": 2.9385032898347664e-07, + "loss": 0.81272727, + "num_input_tokens_seen": 297959745, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.18371582, + "step": 13813, + "time_per_iteration": 2.938697338104248 + }, + { + "auxiliary_loss_clip": 0.01406632, + "auxiliary_loss_mlp": 0.01031482, + "balance_loss_clip": 1.24387729, + "balance_loss_mlp": 1.0124439, + "epoch": 0.8305426123553284, + "flos": 22391758333440.0, + "grad_norm": 2.116929789843223, + "language_loss": 0.71769214, + "learning_rate": 2.93647144674658e-07, + "loss": 0.7420733, + "num_input_tokens_seen": 297977665, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19030762, + "step": 13814, + "time_per_iteration": 2.9068620204925537 + }, + { + "auxiliary_loss_clip": 0.01430822, + "auxiliary_loss_mlp": 0.01035611, + "balance_loss_clip": 1.26092482, + "balance_loss_mlp": 1.01615644, + "epoch": 0.8306027356079964, + "flos": 14911954871040.0, + "grad_norm": 2.0604127175040228, + "language_loss": 0.6897819, + "learning_rate": 2.9344402507091116e-07, + "loss": 0.71444631, + "num_input_tokens_seen": 297993525, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.19458008, + "step": 13815, + "time_per_iteration": 2.8263626098632812 + }, + { + "auxiliary_loss_clip": 0.01399868, + "auxiliary_loss_mlp": 0.01031662, + "balance_loss_clip": 1.2405417, + "balance_loss_mlp": 1.01205194, + "epoch": 0.8306628588606644, + "flos": 19653624887040.0, + "grad_norm": 1.9877277782258944, + "language_loss": 0.77261865, + "learning_rate": 2.9324097017993745e-07, + "loss": 0.79693401, + "num_input_tokens_seen": 298012920, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19604492, + "step": 13816, + "time_per_iteration": 2.861412763595581 + }, + { + "auxiliary_loss_clip": 0.01400284, + "auxiliary_loss_mlp": 0.01031708, + "balance_loss_clip": 1.24258304, + "balance_loss_mlp": 1.01340902, + "epoch": 0.8307229821133323, + "flos": 24400543299840.0, + "grad_norm": 1.808198606047099, + "language_loss": 0.82005745, + "learning_rate": 2.930379800094371e-07, + "loss": 0.8443774, + "num_input_tokens_seen": 298033310, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18286133, + "step": 13817, + "time_per_iteration": 2.8481698036193848 + }, + { + "auxiliary_loss_clip": 0.01410262, + "auxiliary_loss_mlp": 0.01031154, + "balance_loss_clip": 1.24862266, + "balance_loss_mlp": 1.01115096, + "epoch": 0.8307831053660003, + "flos": 21006955411200.0, + "grad_norm": 1.501406288564367, + "language_loss": 0.7887916, + "learning_rate": 2.9283505456710875e-07, + "loss": 0.81320578, + "num_input_tokens_seen": 298053530, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19995117, + "step": 13818, + "time_per_iteration": 2.841456651687622 + }, + { + "auxiliary_loss_clip": 0.01413703, + "auxiliary_loss_mlp": 0.01036932, + "balance_loss_clip": 1.25309372, + "balance_loss_mlp": 1.01779938, + "epoch": 0.8308432286186682, + "flos": 21407174081280.0, + "grad_norm": 1.9512801902869126, + "language_loss": 0.82265401, + "learning_rate": 2.926321938606453e-07, + "loss": 0.84716034, + "num_input_tokens_seen": 298069305, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19128418, + "step": 13819, + "time_per_iteration": 2.8304800987243652 + }, + { + "auxiliary_loss_clip": 0.01182486, + "auxiliary_loss_mlp": 0.01027722, + "balance_loss_clip": 1.09132421, + "balance_loss_mlp": 1.00159132, + "epoch": 0.8309033518713362, + "flos": 62558589680640.0, + "grad_norm": 1.1820763034884851, + "language_loss": 0.5624426, + "learning_rate": 2.924293978977399e-07, + "loss": 0.58454466, + "num_input_tokens_seen": 298125830, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.26171875, + "step": 13820, + "time_per_iteration": 3.3950560092926025 + }, + { + "auxiliary_loss_clip": 0.01396942, + "auxiliary_loss_mlp": 0.01030229, + "balance_loss_clip": 1.23900807, + "balance_loss_mlp": 1.01132274, + "epoch": 0.8309634751240043, + "flos": 16986942259200.0, + "grad_norm": 1.8473455458226038, + "language_loss": 0.69060791, + "learning_rate": 2.922266666860831e-07, + "loss": 0.71487957, + "num_input_tokens_seen": 298142320, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.18920898, + "step": 13821, + "time_per_iteration": 2.8396365642547607 + }, + { + "auxiliary_loss_clip": 0.01414288, + "auxiliary_loss_mlp": 0.0103569, + "balance_loss_clip": 1.24927878, + "balance_loss_mlp": 1.01652122, + "epoch": 0.8310235983766722, + "flos": 22684710286080.0, + "grad_norm": 2.3926540006880757, + "language_loss": 0.69749916, + "learning_rate": 2.920240002333625e-07, + "loss": 0.72199899, + "num_input_tokens_seen": 298161845, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19152832, + "step": 13822, + "time_per_iteration": 2.8595006465911865 + }, + { + "auxiliary_loss_clip": 0.01396995, + "auxiliary_loss_mlp": 0.01033344, + "balance_loss_clip": 1.23948371, + "balance_loss_mlp": 1.01536727, + "epoch": 0.8310837216293402, + "flos": 30823994977920.0, + "grad_norm": 1.8626167078370575, + "language_loss": 0.62709379, + "learning_rate": 2.918213985472631e-07, + "loss": 0.65139717, + "num_input_tokens_seen": 298184165, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.17993164, + "step": 13823, + "time_per_iteration": 3.0041024684906006 + }, + { + "auxiliary_loss_clip": 0.01180135, + "auxiliary_loss_mlp": 0.01026304, + "balance_loss_clip": 1.09215927, + "balance_loss_mlp": 1.00322461, + "epoch": 0.8311438448820081, + "flos": 71309070120960.0, + "grad_norm": 0.993166684443333, + "language_loss": 0.62023139, + "learning_rate": 2.916188616354669e-07, + "loss": 0.64229584, + "num_input_tokens_seen": 298251720, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.23046875, + "step": 13824, + "time_per_iteration": 3.3900563716888428 + }, + { + "auxiliary_loss_clip": 0.01409252, + "auxiliary_loss_mlp": 0.010335, + "balance_loss_clip": 1.24897337, + "balance_loss_mlp": 1.01510561, + "epoch": 0.8312039681346761, + "flos": 20896974005760.0, + "grad_norm": 1.6203451717894926, + "language_loss": 0.7467314, + "learning_rate": 2.914163895056552e-07, + "loss": 0.77115893, + "num_input_tokens_seen": 298271910, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18395996, + "step": 13825, + "time_per_iteration": 2.855875253677368 + }, + { + "auxiliary_loss_clip": 0.01414659, + "auxiliary_loss_mlp": 0.01034634, + "balance_loss_clip": 1.25130475, + "balance_loss_mlp": 1.01547718, + "epoch": 0.831264091387344, + "flos": 17025880049280.0, + "grad_norm": 1.8346408364245799, + "language_loss": 0.8073535, + "learning_rate": 2.9121398216550486e-07, + "loss": 0.83184648, + "num_input_tokens_seen": 298288105, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19152832, + "step": 13826, + "time_per_iteration": 2.8469276428222656 + }, + { + "auxiliary_loss_clip": 0.0141465, + "auxiliary_loss_mlp": 0.01034175, + "balance_loss_clip": 1.25293374, + "balance_loss_mlp": 1.01494694, + "epoch": 0.831324214640012, + "flos": 24428667582720.0, + "grad_norm": 1.640102811947179, + "language_loss": 0.68756795, + "learning_rate": 2.910116396226914e-07, + "loss": 0.71205622, + "num_input_tokens_seen": 298307600, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19226074, + "step": 13827, + "time_per_iteration": 4.3899242877960205 + }, + { + "auxiliary_loss_clip": 0.0140934, + "auxiliary_loss_mlp": 0.01030192, + "balance_loss_clip": 1.24890327, + "balance_loss_mlp": 1.01253724, + "epoch": 0.83138433789268, + "flos": 13551520913280.0, + "grad_norm": 5.323310824007471, + "language_loss": 0.74745518, + "learning_rate": 2.9080936188488834e-07, + "loss": 0.77185047, + "num_input_tokens_seen": 298323055, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.17651367, + "step": 13828, + "time_per_iteration": 2.795210123062134 + }, + { + "auxiliary_loss_clip": 0.01403796, + "auxiliary_loss_mlp": 0.01035862, + "balance_loss_clip": 1.24223542, + "balance_loss_mlp": 1.01716995, + "epoch": 0.831444461145348, + "flos": 44508146958720.0, + "grad_norm": 1.8575086513734091, + "language_loss": 0.68263209, + "learning_rate": 2.906071489597657e-07, + "loss": 0.70702869, + "num_input_tokens_seen": 298346950, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18688965, + "step": 13829, + "time_per_iteration": 3.040065050125122 + }, + { + "auxiliary_loss_clip": 0.01418224, + "auxiliary_loss_mlp": 0.01034599, + "balance_loss_clip": 1.25390506, + "balance_loss_mlp": 1.01621652, + "epoch": 0.8315045843980159, + "flos": 22713739464960.0, + "grad_norm": 1.5897188303259708, + "language_loss": 0.83306175, + "learning_rate": 2.9040500085499054e-07, + "loss": 0.85758996, + "num_input_tokens_seen": 298366315, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.18383789, + "step": 13830, + "time_per_iteration": 2.8714101314544678 + }, + { + "auxiliary_loss_clip": 0.01407068, + "auxiliary_loss_mlp": 0.01034225, + "balance_loss_clip": 1.24756122, + "balance_loss_mlp": 1.01555705, + "epoch": 0.8315647076506839, + "flos": 16882299740160.0, + "grad_norm": 2.0155672007913883, + "language_loss": 0.75130963, + "learning_rate": 2.9020291757822925e-07, + "loss": 0.77572262, + "num_input_tokens_seen": 298385185, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18676758, + "step": 13831, + "time_per_iteration": 4.271459579467773 + }, + { + "auxiliary_loss_clip": 0.01412166, + "auxiliary_loss_mlp": 0.01037677, + "balance_loss_clip": 1.25067103, + "balance_loss_mlp": 1.01791167, + "epoch": 0.8316248309033518, + "flos": 13816755786240.0, + "grad_norm": 2.2134743491473974, + "language_loss": 0.72110856, + "learning_rate": 2.9000089913714523e-07, + "loss": 0.74560696, + "num_input_tokens_seen": 298402335, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.19750977, + "step": 13832, + "time_per_iteration": 2.842864513397217 + }, + { + "auxiliary_loss_clip": 0.01410522, + "auxiliary_loss_mlp": 0.01034686, + "balance_loss_clip": 1.25029778, + "balance_loss_mlp": 1.01617241, + "epoch": 0.8316849541560198, + "flos": 23522682827520.0, + "grad_norm": 3.29177582688134, + "language_loss": 0.85176039, + "learning_rate": 2.897989455393979e-07, + "loss": 0.87621248, + "num_input_tokens_seen": 298423370, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18493652, + "step": 13833, + "time_per_iteration": 2.8523359298706055 + }, + { + "auxiliary_loss_clip": 0.0142119, + "auxiliary_loss_mlp": 0.01033891, + "balance_loss_clip": 1.25745964, + "balance_loss_mlp": 1.01523435, + "epoch": 0.8317450774086879, + "flos": 23782488324480.0, + "grad_norm": 1.6525794275635919, + "language_loss": 0.77042842, + "learning_rate": 2.8959705679264625e-07, + "loss": 0.79497921, + "num_input_tokens_seen": 298444835, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.18664551, + "step": 13834, + "time_per_iteration": 2.8781626224517822 + }, + { + "auxiliary_loss_clip": 0.01396276, + "auxiliary_loss_mlp": 0.01031772, + "balance_loss_clip": 1.23860919, + "balance_loss_mlp": 1.01398587, + "epoch": 0.8318052006613558, + "flos": 16223859141120.0, + "grad_norm": 2.375229026882135, + "language_loss": 0.80512953, + "learning_rate": 2.893952329045459e-07, + "loss": 0.82940996, + "num_input_tokens_seen": 298461845, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.17797852, + "step": 13835, + "time_per_iteration": 2.805330753326416 + }, + { + "auxiliary_loss_clip": 0.01421107, + "auxiliary_loss_mlp": 0.01036984, + "balance_loss_clip": 1.25842953, + "balance_loss_mlp": 1.01760066, + "epoch": 0.8318653239140238, + "flos": 19984112040960.0, + "grad_norm": 1.9400805507704162, + "language_loss": 0.81673443, + "learning_rate": 2.8919347388274905e-07, + "loss": 0.84131539, + "num_input_tokens_seen": 298479095, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19396973, + "step": 13836, + "time_per_iteration": 4.276665449142456 + }, + { + "auxiliary_loss_clip": 0.01401442, + "auxiliary_loss_mlp": 0.01035365, + "balance_loss_clip": 1.24356163, + "balance_loss_mlp": 1.01668453, + "epoch": 0.8319254471666917, + "flos": 17711675769600.0, + "grad_norm": 1.9503868915883795, + "language_loss": 0.77871293, + "learning_rate": 2.8899177973490727e-07, + "loss": 0.80308104, + "num_input_tokens_seen": 298494475, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18676758, + "step": 13837, + "time_per_iteration": 4.270092010498047 + }, + { + "auxiliary_loss_clip": 0.01421125, + "auxiliary_loss_mlp": 0.01033139, + "balance_loss_clip": 1.25518894, + "balance_loss_mlp": 1.01369619, + "epoch": 0.8319855704193597, + "flos": 19545950966400.0, + "grad_norm": 1.6261930101606357, + "language_loss": 0.84394324, + "learning_rate": 2.887901504686685e-07, + "loss": 0.86848587, + "num_input_tokens_seen": 298513185, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.19433594, + "step": 13838, + "time_per_iteration": 2.895066261291504 + }, + { + "auxiliary_loss_clip": 0.01411802, + "auxiliary_loss_mlp": 0.01037818, + "balance_loss_clip": 1.25292051, + "balance_loss_mlp": 1.01844656, + "epoch": 0.8320456936720276, + "flos": 21187799452800.0, + "grad_norm": 1.9240680628079518, + "language_loss": 0.75392807, + "learning_rate": 2.885885860916795e-07, + "loss": 0.77842426, + "num_input_tokens_seen": 298531885, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19384766, + "step": 13839, + "time_per_iteration": 2.8934481143951416 + }, + { + "auxiliary_loss_clip": 0.0141155, + "auxiliary_loss_mlp": 0.01033314, + "balance_loss_clip": 1.25145805, + "balance_loss_mlp": 1.01493168, + "epoch": 0.8321058169246957, + "flos": 33262706465280.0, + "grad_norm": 1.5663870059101805, + "language_loss": 0.68276143, + "learning_rate": 2.8838708661158253e-07, + "loss": 0.70721006, + "num_input_tokens_seen": 298554905, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.18395996, + "step": 13840, + "time_per_iteration": 2.919400691986084 + }, + { + "auxiliary_loss_clip": 0.01405822, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.24404609, + "balance_loss_mlp": 1.01609886, + "epoch": 0.8321659401773636, + "flos": 14215752846720.0, + "grad_norm": 2.373222359171838, + "language_loss": 0.79920954, + "learning_rate": 2.8818565203601843e-07, + "loss": 0.82361561, + "num_input_tokens_seen": 298571185, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18676758, + "step": 13841, + "time_per_iteration": 2.7953550815582275 + }, + { + "auxiliary_loss_clip": 0.01406656, + "auxiliary_loss_mlp": 0.01034468, + "balance_loss_clip": 1.24699688, + "balance_loss_mlp": 1.01588345, + "epoch": 0.8322260634300316, + "flos": 15166511971200.0, + "grad_norm": 1.867156740371621, + "language_loss": 0.69497192, + "learning_rate": 2.879842823726262e-07, + "loss": 0.71938312, + "num_input_tokens_seen": 298588505, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18591309, + "step": 13842, + "time_per_iteration": 2.806941032409668 + }, + { + "auxiliary_loss_clip": 0.0139779, + "auxiliary_loss_mlp": 0.01034494, + "balance_loss_clip": 1.23988509, + "balance_loss_mlp": 1.01521826, + "epoch": 0.8322861866826995, + "flos": 25311595472640.0, + "grad_norm": 1.5229698001585692, + "language_loss": 0.73444366, + "learning_rate": 2.8778297762904124e-07, + "loss": 0.75876647, + "num_input_tokens_seen": 298609295, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.19287109, + "step": 13843, + "time_per_iteration": 2.869706869125366 + }, + { + "auxiliary_loss_clip": 0.01405347, + "auxiliary_loss_mlp": 0.01032823, + "balance_loss_clip": 1.24696445, + "balance_loss_mlp": 1.0134871, + "epoch": 0.8323463099353675, + "flos": 17028187534080.0, + "grad_norm": 2.5784479513297853, + "language_loss": 0.78823864, + "learning_rate": 2.875817378128975e-07, + "loss": 0.81262028, + "num_input_tokens_seen": 298625765, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.1932373, + "step": 13844, + "time_per_iteration": 2.8084371089935303 + }, + { + "auxiliary_loss_clip": 0.01183684, + "auxiliary_loss_mlp": 0.01018315, + "balance_loss_clip": 1.09411907, + "balance_loss_mlp": 0.99914646, + "epoch": 0.8324064331880354, + "flos": 55632720032640.0, + "grad_norm": 0.7832195667968963, + "language_loss": 0.55308032, + "learning_rate": 2.8738056293182624e-07, + "loss": 0.5751003, + "num_input_tokens_seen": 298683005, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.19140625, + "step": 13845, + "time_per_iteration": 3.24932861328125 + }, + { + "auxiliary_loss_clip": 0.01416313, + "auxiliary_loss_mlp": 0.01037735, + "balance_loss_clip": 1.25295496, + "balance_loss_mlp": 1.01818442, + "epoch": 0.8324665564407034, + "flos": 26149251300480.0, + "grad_norm": 2.5912098060551925, + "language_loss": 0.76222706, + "learning_rate": 2.871794529934555e-07, + "loss": 0.78676748, + "num_input_tokens_seen": 298703060, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19543457, + "step": 13846, + "time_per_iteration": 2.862269401550293 + }, + { + "auxiliary_loss_clip": 0.01422389, + "auxiliary_loss_mlp": 0.0103577, + "balance_loss_clip": 1.25567651, + "balance_loss_mlp": 1.01574314, + "epoch": 0.8325266796933715, + "flos": 22057968309120.0, + "grad_norm": 1.7820662776425409, + "language_loss": 0.79836935, + "learning_rate": 2.8697840800541115e-07, + "loss": 0.82295096, + "num_input_tokens_seen": 298721765, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.20031738, + "step": 13847, + "time_per_iteration": 2.8809046745300293 + }, + { + "auxiliary_loss_clip": 0.01404049, + "auxiliary_loss_mlp": 0.01030413, + "balance_loss_clip": 1.24629164, + "balance_loss_mlp": 1.01317585, + "epoch": 0.8325868029460394, + "flos": 22825937865600.0, + "grad_norm": 1.7267223839842822, + "language_loss": 0.74535555, + "learning_rate": 2.867774279753175e-07, + "loss": 0.76970017, + "num_input_tokens_seen": 298740825, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.17224121, + "step": 13848, + "time_per_iteration": 2.8612771034240723 + }, + { + "auxiliary_loss_clip": 0.01405491, + "auxiliary_loss_mlp": 0.01032706, + "balance_loss_clip": 1.24647772, + "balance_loss_mlp": 1.0127027, + "epoch": 0.8326469261987074, + "flos": 14765750363520.0, + "grad_norm": 2.0460058860945267, + "language_loss": 0.65023941, + "learning_rate": 2.8657651291079554e-07, + "loss": 0.6746214, + "num_input_tokens_seen": 298758515, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19995117, + "step": 13849, + "time_per_iteration": 2.9180591106414795 + }, + { + "auxiliary_loss_clip": 0.01411809, + "auxiliary_loss_mlp": 0.01040127, + "balance_loss_clip": 1.24982762, + "balance_loss_mlp": 1.02105308, + "epoch": 0.8327070494513753, + "flos": 22935466823040.0, + "grad_norm": 2.23010299603168, + "language_loss": 0.80088645, + "learning_rate": 2.863756628194638e-07, + "loss": 0.82540584, + "num_input_tokens_seen": 298776375, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19067383, + "step": 13850, + "time_per_iteration": 2.867211103439331 + }, + { + "auxiliary_loss_clip": 0.01389462, + "auxiliary_loss_mlp": 0.01039317, + "balance_loss_clip": 1.23393154, + "balance_loss_mlp": 1.02092266, + "epoch": 0.8327671727040433, + "flos": 20674432241280.0, + "grad_norm": 1.515443416147512, + "language_loss": 0.79073322, + "learning_rate": 2.8617487770893877e-07, + "loss": 0.81502104, + "num_input_tokens_seen": 298795135, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.18395996, + "step": 13851, + "time_per_iteration": 2.847804069519043 + }, + { + "auxiliary_loss_clip": 0.01182526, + "auxiliary_loss_mlp": 0.01023815, + "balance_loss_clip": 1.09322929, + "balance_loss_mlp": 1.00149882, + "epoch": 0.8328272959567112, + "flos": 56089612454400.0, + "grad_norm": 0.7699257045296197, + "language_loss": 0.55915427, + "learning_rate": 2.859741575868344e-07, + "loss": 0.58121771, + "num_input_tokens_seen": 298855475, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.22363281, + "step": 13852, + "time_per_iteration": 3.3332736492156982 + }, + { + "auxiliary_loss_clip": 0.01398173, + "auxiliary_loss_mlp": 0.01030884, + "balance_loss_clip": 1.24042654, + "balance_loss_mlp": 1.01225173, + "epoch": 0.8328874192093793, + "flos": 32314888252800.0, + "grad_norm": 1.6827397607539019, + "language_loss": 0.68080699, + "learning_rate": 2.8577350246076125e-07, + "loss": 0.70509756, + "num_input_tokens_seen": 298875875, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18640137, + "step": 13853, + "time_per_iteration": 2.966142177581787 + }, + { + "auxiliary_loss_clip": 0.01402035, + "auxiliary_loss_mlp": 0.01034505, + "balance_loss_clip": 1.24217057, + "balance_loss_mlp": 1.01620638, + "epoch": 0.8329475424620472, + "flos": 23522909051520.0, + "grad_norm": 1.674863727058401, + "language_loss": 0.79106653, + "learning_rate": 2.855729123383286e-07, + "loss": 0.81543189, + "num_input_tokens_seen": 298895950, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18286133, + "step": 13854, + "time_per_iteration": 2.945713520050049 + }, + { + "auxiliary_loss_clip": 0.01182348, + "auxiliary_loss_mlp": 0.01022158, + "balance_loss_clip": 1.0912801, + "balance_loss_mlp": 1.00270331, + "epoch": 0.8330076657147152, + "flos": 67871296045440.0, + "grad_norm": 0.780173540828259, + "language_loss": 0.58697218, + "learning_rate": 2.8537238722714295e-07, + "loss": 0.60901725, + "num_input_tokens_seen": 298955770, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.19433594, + "step": 13855, + "time_per_iteration": 3.1872167587280273 + }, + { + "auxiliary_loss_clip": 0.01410541, + "auxiliary_loss_mlp": 0.0103211, + "balance_loss_clip": 1.2496922, + "balance_loss_mlp": 1.01296473, + "epoch": 0.8330677889673831, + "flos": 22902953794560.0, + "grad_norm": 1.8743124848458799, + "language_loss": 0.72788632, + "learning_rate": 2.8517192713480853e-07, + "loss": 0.7523129, + "num_input_tokens_seen": 298976545, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.19152832, + "step": 13856, + "time_per_iteration": 2.8364670276641846 + }, + { + "auxiliary_loss_clip": 0.01406986, + "auxiliary_loss_mlp": 0.01032781, + "balance_loss_clip": 1.2457366, + "balance_loss_mlp": 1.01345706, + "epoch": 0.8331279122200511, + "flos": 27356603541120.0, + "grad_norm": 1.7386905376675375, + "language_loss": 0.76379251, + "learning_rate": 2.8497153206892677e-07, + "loss": 0.78819013, + "num_input_tokens_seen": 298996750, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.1932373, + "step": 13857, + "time_per_iteration": 2.898599624633789 + }, + { + "auxiliary_loss_clip": 0.01396479, + "auxiliary_loss_mlp": 0.01035209, + "balance_loss_clip": 1.24187636, + "balance_loss_mlp": 1.01776838, + "epoch": 0.833188035472719, + "flos": 19947753204480.0, + "grad_norm": 1.511043509409384, + "language_loss": 0.74041522, + "learning_rate": 2.847712020370958e-07, + "loss": 0.76473212, + "num_input_tokens_seen": 299014895, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.17443848, + "step": 13858, + "time_per_iteration": 2.8795907497406006 + }, + { + "auxiliary_loss_clip": 0.01426098, + "auxiliary_loss_mlp": 0.01033762, + "balance_loss_clip": 1.26010764, + "balance_loss_mlp": 1.01478398, + "epoch": 0.833248158725387, + "flos": 15241265660160.0, + "grad_norm": 1.7936178209373406, + "language_loss": 0.74157113, + "learning_rate": 2.8457093704691316e-07, + "loss": 0.76616973, + "num_input_tokens_seen": 299032855, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.18969727, + "step": 13859, + "time_per_iteration": 2.8969931602478027 + }, + { + "auxiliary_loss_clip": 0.01404101, + "auxiliary_loss_mlp": 0.01033662, + "balance_loss_clip": 1.24699986, + "balance_loss_mlp": 1.01563799, + "epoch": 0.8333082819780551, + "flos": 24546340604160.0, + "grad_norm": 1.7744853638417033, + "language_loss": 0.80002677, + "learning_rate": 2.8437073710597205e-07, + "loss": 0.82440448, + "num_input_tokens_seen": 299052055, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.18029785, + "step": 13860, + "time_per_iteration": 2.9113199710845947 + }, + { + "auxiliary_loss_clip": 0.0139481, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.23757982, + "balance_loss_mlp": 1.01338887, + "epoch": 0.833368405230723, + "flos": 31479901868160.0, + "grad_norm": 1.456239034158224, + "language_loss": 0.82910335, + "learning_rate": 2.841706022218644e-07, + "loss": 0.85336977, + "num_input_tokens_seen": 299075285, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18444824, + "step": 13861, + "time_per_iteration": 2.925389289855957 + }, + { + "auxiliary_loss_clip": 0.01408991, + "auxiliary_loss_mlp": 0.01034096, + "balance_loss_clip": 1.24823523, + "balance_loss_mlp": 1.01522493, + "epoch": 0.833428528483391, + "flos": 14910868995840.0, + "grad_norm": 1.922780933051717, + "language_loss": 0.79869366, + "learning_rate": 2.839705324021806e-07, + "loss": 0.82312453, + "num_input_tokens_seen": 299092520, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18859863, + "step": 13862, + "time_per_iteration": 4.251455068588257 + }, + { + "auxiliary_loss_clip": 0.01413804, + "auxiliary_loss_mlp": 0.01033761, + "balance_loss_clip": 1.25046587, + "balance_loss_mlp": 1.01465154, + "epoch": 0.8334886517360589, + "flos": 22209873661440.0, + "grad_norm": 4.156508603609199, + "language_loss": 0.7596935, + "learning_rate": 2.83770527654505e-07, + "loss": 0.78416914, + "num_input_tokens_seen": 299109450, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19116211, + "step": 13863, + "time_per_iteration": 2.832001209259033 + }, + { + "auxiliary_loss_clip": 0.01399361, + "auxiliary_loss_mlp": 0.01034025, + "balance_loss_clip": 1.2438972, + "balance_loss_mlp": 1.01610756, + "epoch": 0.8335487749887269, + "flos": 30384702783360.0, + "grad_norm": 3.043690085029642, + "language_loss": 0.76050162, + "learning_rate": 2.835705879864232e-07, + "loss": 0.78483546, + "num_input_tokens_seen": 299129540, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.17919922, + "step": 13864, + "time_per_iteration": 2.9099371433258057 + }, + { + "auxiliary_loss_clip": 0.01403872, + "auxiliary_loss_mlp": 0.01038387, + "balance_loss_clip": 1.24377835, + "balance_loss_mlp": 1.01826429, + "epoch": 0.8336088982413948, + "flos": 24691911684480.0, + "grad_norm": 1.8396182014522282, + "language_loss": 0.70099396, + "learning_rate": 2.833707134055168e-07, + "loss": 0.72541654, + "num_input_tokens_seen": 299148670, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.20129395, + "step": 13865, + "time_per_iteration": 2.8608877658843994 + }, + { + "auxiliary_loss_clip": 0.01410709, + "auxiliary_loss_mlp": 0.01033835, + "balance_loss_clip": 1.25093472, + "balance_loss_mlp": 1.01538157, + "epoch": 0.8336690214940629, + "flos": 38190514164480.0, + "grad_norm": 1.8325559590988134, + "language_loss": 0.75967425, + "learning_rate": 2.831709039193653e-07, + "loss": 0.78411973, + "num_input_tokens_seen": 299169330, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18457031, + "step": 13866, + "time_per_iteration": 4.4581992626190186 + }, + { + "auxiliary_loss_clip": 0.01183511, + "auxiliary_loss_mlp": 0.01018019, + "balance_loss_clip": 1.09323156, + "balance_loss_mlp": 0.99865896, + "epoch": 0.8337291447467308, + "flos": 55588307621760.0, + "grad_norm": 0.8792470021246632, + "language_loss": 0.6320833, + "learning_rate": 2.8297115953554465e-07, + "loss": 0.65409851, + "num_input_tokens_seen": 299220980, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.19335938, + "step": 13867, + "time_per_iteration": 3.2464215755462646 + }, + { + "auxiliary_loss_clip": 0.01406209, + "auxiliary_loss_mlp": 0.0102887, + "balance_loss_clip": 1.24758303, + "balance_loss_mlp": 1.01109576, + "epoch": 0.8337892679993988, + "flos": 24144085918080.0, + "grad_norm": 1.9120264471280546, + "language_loss": 0.72810918, + "learning_rate": 2.827714802616301e-07, + "loss": 0.75246, + "num_input_tokens_seen": 299240130, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.1776123, + "step": 13868, + "time_per_iteration": 2.8798344135284424 + }, + { + "auxiliary_loss_clip": 0.01411995, + "auxiliary_loss_mlp": 0.01033687, + "balance_loss_clip": 1.25134325, + "balance_loss_mlp": 1.01413703, + "epoch": 0.8338493912520667, + "flos": 28195616712960.0, + "grad_norm": 1.4009839750666562, + "language_loss": 0.81275666, + "learning_rate": 2.8257186610519325e-07, + "loss": 0.83721346, + "num_input_tokens_seen": 299260705, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.1953125, + "step": 13869, + "time_per_iteration": 2.957946538925171 + }, + { + "auxiliary_loss_clip": 0.01406974, + "auxiliary_loss_mlp": 0.01036981, + "balance_loss_clip": 1.24724853, + "balance_loss_mlp": 1.0181694, + "epoch": 0.8339095145047347, + "flos": 22167090063360.0, + "grad_norm": 1.5754887996260096, + "language_loss": 0.82944882, + "learning_rate": 2.823723170738028e-07, + "loss": 0.85388833, + "num_input_tokens_seen": 299278925, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18798828, + "step": 13870, + "time_per_iteration": 2.9224157333374023 + }, + { + "auxiliary_loss_clip": 0.01404178, + "auxiliary_loss_mlp": 0.01032656, + "balance_loss_clip": 1.24225235, + "balance_loss_mlp": 1.01382089, + "epoch": 0.8339696377574026, + "flos": 17314940949120.0, + "grad_norm": 2.7426964735687553, + "language_loss": 0.71123314, + "learning_rate": 2.821728331750264e-07, + "loss": 0.73560148, + "num_input_tokens_seen": 299291580, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18835449, + "step": 13871, + "time_per_iteration": 4.29019832611084 + }, + { + "auxiliary_loss_clip": 0.01385377, + "auxiliary_loss_mlp": 0.01035984, + "balance_loss_clip": 1.22964585, + "balance_loss_mlp": 1.01772094, + "epoch": 0.8340297610100706, + "flos": 20678232804480.0, + "grad_norm": 1.7478522332125257, + "language_loss": 0.69703054, + "learning_rate": 2.8197341441642853e-07, + "loss": 0.72124416, + "num_input_tokens_seen": 299310385, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.18273926, + "step": 13872, + "time_per_iteration": 4.409483909606934 + }, + { + "auxiliary_loss_clip": 0.01406006, + "auxiliary_loss_mlp": 0.010297, + "balance_loss_clip": 1.2455343, + "balance_loss_mlp": 1.01136613, + "epoch": 0.8340898842627387, + "flos": 20523477029760.0, + "grad_norm": 2.0064878526365875, + "language_loss": 0.74492508, + "learning_rate": 2.817740608055712e-07, + "loss": 0.7692821, + "num_input_tokens_seen": 299327660, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18334961, + "step": 13873, + "time_per_iteration": 2.9339370727539062 + }, + { + "auxiliary_loss_clip": 0.01420818, + "auxiliary_loss_mlp": 0.01032214, + "balance_loss_clip": 1.25542164, + "balance_loss_mlp": 1.01221073, + "epoch": 0.8341500075154066, + "flos": 21433669551360.0, + "grad_norm": 2.2333903213977306, + "language_loss": 0.76456505, + "learning_rate": 2.81574772350013e-07, + "loss": 0.7890954, + "num_input_tokens_seen": 299343685, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.1998291, + "step": 13874, + "time_per_iteration": 2.825222969055176 + }, + { + "auxiliary_loss_clip": 0.0140753, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.24841034, + "balance_loss_mlp": 1.01482654, + "epoch": 0.8342101307680746, + "flos": 22101159110400.0, + "grad_norm": 2.356400384328307, + "language_loss": 0.67237532, + "learning_rate": 2.813755490573118e-07, + "loss": 0.69677567, + "num_input_tokens_seen": 299363305, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.17687988, + "step": 13875, + "time_per_iteration": 2.8709003925323486 + }, + { + "auxiliary_loss_clip": 0.01398479, + "auxiliary_loss_mlp": 0.01032774, + "balance_loss_clip": 1.23875451, + "balance_loss_mlp": 1.01436794, + "epoch": 0.8342702540207425, + "flos": 21881422523520.0, + "grad_norm": 2.5329766377699197, + "language_loss": 0.80543971, + "learning_rate": 2.8117639093502243e-07, + "loss": 0.82975221, + "num_input_tokens_seen": 299382630, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18395996, + "step": 13876, + "time_per_iteration": 2.849001169204712 + }, + { + "auxiliary_loss_clip": 0.01408988, + "auxiliary_loss_mlp": 0.01033833, + "balance_loss_clip": 1.25067973, + "balance_loss_mlp": 1.01497388, + "epoch": 0.8343303772734105, + "flos": 22538686757760.0, + "grad_norm": 1.8498019839442936, + "language_loss": 0.88504064, + "learning_rate": 2.8097729799069615e-07, + "loss": 0.90946889, + "num_input_tokens_seen": 299402385, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.18859863, + "step": 13877, + "time_per_iteration": 2.869783878326416 + }, + { + "auxiliary_loss_clip": 0.0140066, + "auxiliary_loss_mlp": 0.01032982, + "balance_loss_clip": 1.24160123, + "balance_loss_mlp": 1.01425469, + "epoch": 0.8343905005260784, + "flos": 14948087483520.0, + "grad_norm": 1.7762660143664695, + "language_loss": 0.70258051, + "learning_rate": 2.807782702318828e-07, + "loss": 0.72691691, + "num_input_tokens_seen": 299419820, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.1875, + "step": 13878, + "time_per_iteration": 2.825199604034424 + }, + { + "auxiliary_loss_clip": 0.01403395, + "auxiliary_loss_mlp": 0.01029395, + "balance_loss_clip": 1.24283004, + "balance_loss_mlp": 1.01175213, + "epoch": 0.8344506237787465, + "flos": 15020714666880.0, + "grad_norm": 1.812823793934417, + "language_loss": 0.80112398, + "learning_rate": 2.805793076661309e-07, + "loss": 0.82545185, + "num_input_tokens_seen": 299436265, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.17626953, + "step": 13879, + "time_per_iteration": 2.8953380584716797 + }, + { + "auxiliary_loss_clip": 0.01391049, + "auxiliary_loss_mlp": 0.01029635, + "balance_loss_clip": 1.23366296, + "balance_loss_mlp": 1.01131296, + "epoch": 0.8345107470314144, + "flos": 17567733502080.0, + "grad_norm": 2.2159556496720993, + "language_loss": 0.83977717, + "learning_rate": 2.803804103009828e-07, + "loss": 0.86398405, + "num_input_tokens_seen": 299451660, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18322754, + "step": 13880, + "time_per_iteration": 2.795341968536377 + }, + { + "auxiliary_loss_clip": 0.01417016, + "auxiliary_loss_mlp": 0.0102645, + "balance_loss_clip": 1.25530303, + "balance_loss_mlp": 1.00874734, + "epoch": 0.8345708702840824, + "flos": 25196953852800.0, + "grad_norm": 1.5396875610739729, + "language_loss": 0.78907806, + "learning_rate": 2.80181578143982e-07, + "loss": 0.81351274, + "num_input_tokens_seen": 299472070, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.17712402, + "step": 13881, + "time_per_iteration": 2.9095630645751953 + }, + { + "auxiliary_loss_clip": 0.01389299, + "auxiliary_loss_mlp": 0.01032223, + "balance_loss_clip": 1.23502111, + "balance_loss_mlp": 1.01473522, + "epoch": 0.8346309935367503, + "flos": 15091351079040.0, + "grad_norm": 2.5731712831638625, + "language_loss": 0.78991967, + "learning_rate": 2.7998281120266807e-07, + "loss": 0.8141349, + "num_input_tokens_seen": 299486725, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.17504883, + "step": 13882, + "time_per_iteration": 2.8088419437408447 + }, + { + "auxiliary_loss_clip": 0.01399286, + "auxiliary_loss_mlp": 0.01035799, + "balance_loss_clip": 1.24027646, + "balance_loss_mlp": 1.01630855, + "epoch": 0.8346911167894183, + "flos": 22941122423040.0, + "grad_norm": 2.5224582422761883, + "language_loss": 0.80948901, + "learning_rate": 2.79784109484579e-07, + "loss": 0.83383989, + "num_input_tokens_seen": 299505435, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19482422, + "step": 13883, + "time_per_iteration": 2.8681557178497314 + }, + { + "auxiliary_loss_clip": 0.01404332, + "auxiliary_loss_mlp": 0.01031076, + "balance_loss_clip": 1.2428863, + "balance_loss_mlp": 1.01194263, + "epoch": 0.8347512400420862, + "flos": 20202536528640.0, + "grad_norm": 1.937858705413825, + "language_loss": 0.74800622, + "learning_rate": 2.795854729972482e-07, + "loss": 0.77236032, + "num_input_tokens_seen": 299523555, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19128418, + "step": 13884, + "time_per_iteration": 2.872476100921631 + }, + { + "auxiliary_loss_clip": 0.01439107, + "auxiliary_loss_mlp": 0.01036802, + "balance_loss_clip": 1.26878726, + "balance_loss_mlp": 1.01753736, + "epoch": 0.8348113632947542, + "flos": 25965059143680.0, + "grad_norm": 1.8229467995668094, + "language_loss": 0.71083599, + "learning_rate": 2.7938690174820913e-07, + "loss": 0.73559511, + "num_input_tokens_seen": 299541660, + "router_z_loss_clip": 1.703125, + "router_z_loss_mlp": 0.19287109, + "step": 13885, + "time_per_iteration": 2.876168727874756 + }, + { + "auxiliary_loss_clip": 0.01410315, + "auxiliary_loss_mlp": 0.01032195, + "balance_loss_clip": 1.24966145, + "balance_loss_mlp": 1.01319313, + "epoch": 0.8348714865474223, + "flos": 34217944824960.0, + "grad_norm": 1.723062030652736, + "language_loss": 0.7094149, + "learning_rate": 2.791883957449912e-07, + "loss": 0.73383999, + "num_input_tokens_seen": 299562465, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19006348, + "step": 13886, + "time_per_iteration": 3.0203053951263428 + }, + { + "auxiliary_loss_clip": 0.01402096, + "auxiliary_loss_mlp": 0.01030209, + "balance_loss_clip": 1.24332082, + "balance_loss_mlp": 1.01070642, + "epoch": 0.8349316098000902, + "flos": 24401040992640.0, + "grad_norm": 1.685783299382979, + "language_loss": 0.79473382, + "learning_rate": 2.7898995499512134e-07, + "loss": 0.81905687, + "num_input_tokens_seen": 299582700, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.19506836, + "step": 13887, + "time_per_iteration": 2.9629976749420166 + }, + { + "auxiliary_loss_clip": 0.0142936, + "auxiliary_loss_mlp": 0.01030389, + "balance_loss_clip": 1.26398802, + "balance_loss_mlp": 1.01070738, + "epoch": 0.8349917330527582, + "flos": 23041376196480.0, + "grad_norm": 2.320539404251607, + "language_loss": 0.64402163, + "learning_rate": 2.7879157950612467e-07, + "loss": 0.6686191, + "num_input_tokens_seen": 299600310, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19677734, + "step": 13888, + "time_per_iteration": 2.8860671520233154 + }, + { + "auxiliary_loss_clip": 0.01416429, + "auxiliary_loss_mlp": 0.01028968, + "balance_loss_clip": 1.25200987, + "balance_loss_mlp": 1.01013279, + "epoch": 0.8350518563054261, + "flos": 13633739994240.0, + "grad_norm": 2.046323794697516, + "language_loss": 0.68476474, + "learning_rate": 2.785932692855244e-07, + "loss": 0.70921874, + "num_input_tokens_seen": 299617025, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.18811035, + "step": 13889, + "time_per_iteration": 2.866093635559082 + }, + { + "auxiliary_loss_clip": 0.01400907, + "auxiliary_loss_mlp": 0.01028416, + "balance_loss_clip": 1.24149394, + "balance_loss_mlp": 1.0108211, + "epoch": 0.8351119795580941, + "flos": 21589737425280.0, + "grad_norm": 1.96736348238121, + "language_loss": 0.69393754, + "learning_rate": 2.783950243408399e-07, + "loss": 0.71823084, + "num_input_tokens_seen": 299633050, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.17602539, + "step": 13890, + "time_per_iteration": 2.8298912048339844 + }, + { + "auxiliary_loss_clip": 0.01418993, + "auxiliary_loss_mlp": 0.01035508, + "balance_loss_clip": 1.25724316, + "balance_loss_mlp": 1.0178411, + "epoch": 0.835172102810762, + "flos": 20045473269120.0, + "grad_norm": 2.0535049419437734, + "language_loss": 0.59969527, + "learning_rate": 2.7819684467958817e-07, + "loss": 0.62424028, + "num_input_tokens_seen": 299646445, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.17663574, + "step": 13891, + "time_per_iteration": 2.809206485748291 + }, + { + "auxiliary_loss_clip": 0.01403179, + "auxiliary_loss_mlp": 0.01031291, + "balance_loss_clip": 1.24313045, + "balance_loss_mlp": 1.0122056, + "epoch": 0.8352322260634301, + "flos": 25120526106240.0, + "grad_norm": 1.6719112050326395, + "language_loss": 0.72193933, + "learning_rate": 2.779987303092846e-07, + "loss": 0.74628401, + "num_input_tokens_seen": 299662665, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19091797, + "step": 13892, + "time_per_iteration": 2.8929800987243652 + }, + { + "auxiliary_loss_clip": 0.01389302, + "auxiliary_loss_mlp": 0.01031594, + "balance_loss_clip": 1.23267126, + "balance_loss_mlp": 1.01292539, + "epoch": 0.835292349316098, + "flos": 24874610762880.0, + "grad_norm": 1.4996873557244974, + "language_loss": 0.66177332, + "learning_rate": 2.7780068123744207e-07, + "loss": 0.68598223, + "num_input_tokens_seen": 299683585, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.18676758, + "step": 13893, + "time_per_iteration": 2.8998019695281982 + }, + { + "auxiliary_loss_clip": 0.01406109, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.24430752, + "balance_loss_mlp": 1.01234674, + "epoch": 0.835352472568766, + "flos": 19875216510720.0, + "grad_norm": 1.924401388351334, + "language_loss": 0.78811485, + "learning_rate": 2.7760269747156996e-07, + "loss": 0.81248289, + "num_input_tokens_seen": 299702680, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18347168, + "step": 13894, + "time_per_iteration": 2.853407859802246 + }, + { + "auxiliary_loss_clip": 0.0138606, + "auxiliary_loss_mlp": 0.01033555, + "balance_loss_clip": 1.23235607, + "balance_loss_mlp": 1.01507807, + "epoch": 0.8354125958214339, + "flos": 22064981253120.0, + "grad_norm": 1.8328373977370997, + "language_loss": 0.73549378, + "learning_rate": 2.7740477901917625e-07, + "loss": 0.75968993, + "num_input_tokens_seen": 299721050, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.18481445, + "step": 13895, + "time_per_iteration": 2.9471235275268555 + }, + { + "auxiliary_loss_clip": 0.0142176, + "auxiliary_loss_mlp": 0.01036358, + "balance_loss_clip": 1.25667024, + "balance_loss_mlp": 1.01650977, + "epoch": 0.8354727190741019, + "flos": 21407807508480.0, + "grad_norm": 2.2336434018089806, + "language_loss": 0.72949511, + "learning_rate": 2.772069258877667e-07, + "loss": 0.7540763, + "num_input_tokens_seen": 299738255, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.19848633, + "step": 13896, + "time_per_iteration": 2.891248941421509 + }, + { + "auxiliary_loss_clip": 0.01393676, + "auxiliary_loss_mlp": 0.01033734, + "balance_loss_clip": 1.23731411, + "balance_loss_mlp": 1.01524425, + "epoch": 0.8355328423267698, + "flos": 50858428515840.0, + "grad_norm": 2.358987912744662, + "language_loss": 0.59107828, + "learning_rate": 2.770091380848423e-07, + "loss": 0.61535239, + "num_input_tokens_seen": 299761315, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.18481445, + "step": 13897, + "time_per_iteration": 4.5042359828948975 + }, + { + "auxiliary_loss_clip": 0.01181191, + "auxiliary_loss_mlp": 0.01030712, + "balance_loss_clip": 1.09054875, + "balance_loss_mlp": 1.00944543, + "epoch": 0.8355929655794379, + "flos": 65583404035200.0, + "grad_norm": 0.7202605376597025, + "language_loss": 0.57729656, + "learning_rate": 2.7681141561790423e-07, + "loss": 0.5994156, + "num_input_tokens_seen": 299828735, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.21289062, + "step": 13898, + "time_per_iteration": 3.381085157394409 + }, + { + "auxiliary_loss_clip": 0.01414738, + "auxiliary_loss_mlp": 0.010348, + "balance_loss_clip": 1.2513994, + "balance_loss_mlp": 1.01505864, + "epoch": 0.8356530888321058, + "flos": 19179512179200.0, + "grad_norm": 1.7703441690426864, + "language_loss": 0.80844539, + "learning_rate": 2.7661375849444967e-07, + "loss": 0.83294082, + "num_input_tokens_seen": 299848395, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.1973877, + "step": 13899, + "time_per_iteration": 2.8491294384002686 + }, + { + "auxiliary_loss_clip": 0.01422181, + "auxiliary_loss_mlp": 0.01031956, + "balance_loss_clip": 1.2603476, + "balance_loss_mlp": 1.0137881, + "epoch": 0.8357132120847738, + "flos": 44142115374720.0, + "grad_norm": 1.845950851198546, + "language_loss": 0.69623697, + "learning_rate": 2.764161667219749e-07, + "loss": 0.72077835, + "num_input_tokens_seen": 299871665, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18139648, + "step": 13900, + "time_per_iteration": 3.036602258682251 + }, + { + "auxiliary_loss_clip": 0.0139744, + "auxiliary_loss_mlp": 0.010325, + "balance_loss_clip": 1.23973227, + "balance_loss_mlp": 1.01383209, + "epoch": 0.8357733353374418, + "flos": 24400498055040.0, + "grad_norm": 1.41890178141524, + "language_loss": 0.71882915, + "learning_rate": 2.762186403079716e-07, + "loss": 0.7431286, + "num_input_tokens_seen": 299891960, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18664551, + "step": 13901, + "time_per_iteration": 4.311877250671387 + }, + { + "auxiliary_loss_clip": 0.01421756, + "auxiliary_loss_mlp": 0.01033905, + "balance_loss_clip": 1.2566458, + "balance_loss_mlp": 1.01429462, + "epoch": 0.8358334585901097, + "flos": 20924329127040.0, + "grad_norm": 2.2262175937414463, + "language_loss": 0.80769479, + "learning_rate": 2.7602117925992963e-07, + "loss": 0.83225131, + "num_input_tokens_seen": 299905070, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19628906, + "step": 13902, + "time_per_iteration": 2.820218086242676 + }, + { + "auxiliary_loss_clip": 0.01398711, + "auxiliary_loss_mlp": 0.01037801, + "balance_loss_clip": 1.24114561, + "balance_loss_mlp": 1.01842999, + "epoch": 0.8358935818427777, + "flos": 19253360972160.0, + "grad_norm": 1.6098319652112272, + "language_loss": 0.63061959, + "learning_rate": 2.758237835853379e-07, + "loss": 0.65498471, + "num_input_tokens_seen": 299925130, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.19372559, + "step": 13903, + "time_per_iteration": 2.8082101345062256 + }, + { + "auxiliary_loss_clip": 0.01403705, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.24524307, + "balance_loss_mlp": 1.01445699, + "epoch": 0.8359537050954456, + "flos": 24144719345280.0, + "grad_norm": 1.6548987736716696, + "language_loss": 0.75131696, + "learning_rate": 2.7562645329168054e-07, + "loss": 0.77568287, + "num_input_tokens_seen": 299943845, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.1842041, + "step": 13904, + "time_per_iteration": 2.862440824508667 + }, + { + "auxiliary_loss_clip": 0.01387522, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.23108494, + "balance_loss_mlp": 1.01414943, + "epoch": 0.8360138283481137, + "flos": 16189083872640.0, + "grad_norm": 5.564296144797552, + "language_loss": 0.73314369, + "learning_rate": 2.7542918838644104e-07, + "loss": 0.75734925, + "num_input_tokens_seen": 299961620, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.18884277, + "step": 13905, + "time_per_iteration": 2.861351728439331 + }, + { + "auxiliary_loss_clip": 0.01382405, + "auxiliary_loss_mlp": 0.01035052, + "balance_loss_clip": 1.2277894, + "balance_loss_mlp": 1.01740885, + "epoch": 0.8360739516007816, + "flos": 22208697296640.0, + "grad_norm": 2.5498966064721778, + "language_loss": 0.67182642, + "learning_rate": 2.752319888771e-07, + "loss": 0.69600099, + "num_input_tokens_seen": 299982170, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.17651367, + "step": 13906, + "time_per_iteration": 4.272448539733887 + }, + { + "auxiliary_loss_clip": 0.01398935, + "auxiliary_loss_mlp": 0.01033652, + "balance_loss_clip": 1.24042249, + "balance_loss_mlp": 1.0141139, + "epoch": 0.8361340748534496, + "flos": 20932925639040.0, + "grad_norm": 1.435446814284113, + "language_loss": 0.74621993, + "learning_rate": 2.7503485477113475e-07, + "loss": 0.77054578, + "num_input_tokens_seen": 300001330, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.1953125, + "step": 13907, + "time_per_iteration": 4.302859544754028 + }, + { + "auxiliary_loss_clip": 0.01410768, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.24638724, + "balance_loss_mlp": 1.01424766, + "epoch": 0.8361941981061175, + "flos": 26183845589760.0, + "grad_norm": 1.8116171619853463, + "language_loss": 0.7621184, + "learning_rate": 2.7483778607602005e-07, + "loss": 0.78656268, + "num_input_tokens_seen": 300020645, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.1940918, + "step": 13908, + "time_per_iteration": 2.940707206726074 + }, + { + "auxiliary_loss_clip": 0.01407671, + "auxiliary_loss_mlp": 0.01033506, + "balance_loss_clip": 1.24823308, + "balance_loss_mlp": 1.01406324, + "epoch": 0.8362543213587855, + "flos": 24428396113920.0, + "grad_norm": 4.132367260813974, + "language_loss": 0.72335607, + "learning_rate": 2.7464078279922964e-07, + "loss": 0.74776787, + "num_input_tokens_seen": 300039945, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19445801, + "step": 13909, + "time_per_iteration": 2.9598684310913086 + }, + { + "auxiliary_loss_clip": 0.01417396, + "auxiliary_loss_mlp": 0.01031416, + "balance_loss_clip": 1.25337243, + "balance_loss_mlp": 1.01248538, + "epoch": 0.8363144446114534, + "flos": 17211429550080.0, + "grad_norm": 1.9186421147274122, + "language_loss": 0.74084496, + "learning_rate": 2.744438449482338e-07, + "loss": 0.76533312, + "num_input_tokens_seen": 300058260, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.18920898, + "step": 13910, + "time_per_iteration": 2.835066556930542 + }, + { + "auxiliary_loss_clip": 0.01409778, + "auxiliary_loss_mlp": 0.01033219, + "balance_loss_clip": 1.24920583, + "balance_loss_mlp": 1.01415718, + "epoch": 0.8363745678641215, + "flos": 19288271975040.0, + "grad_norm": 1.8034510714344314, + "language_loss": 0.73758256, + "learning_rate": 2.742469725305001e-07, + "loss": 0.76201248, + "num_input_tokens_seen": 300076720, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19055176, + "step": 13911, + "time_per_iteration": 3.0157644748687744 + }, + { + "auxiliary_loss_clip": 0.01419699, + "auxiliary_loss_mlp": 0.01033596, + "balance_loss_clip": 1.25566244, + "balance_loss_mlp": 1.01495147, + "epoch": 0.8364346911167894, + "flos": 11882362550400.0, + "grad_norm": 1.934727731985863, + "language_loss": 0.79838371, + "learning_rate": 2.740501655534946e-07, + "loss": 0.82291663, + "num_input_tokens_seen": 300092950, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.18640137, + "step": 13912, + "time_per_iteration": 2.8285460472106934 + }, + { + "auxiliary_loss_clip": 0.01397455, + "auxiliary_loss_mlp": 0.01031456, + "balance_loss_clip": 1.23935044, + "balance_loss_mlp": 1.01288307, + "epoch": 0.8364948143694574, + "flos": 20233601723520.0, + "grad_norm": 1.7014679610261707, + "language_loss": 0.79630959, + "learning_rate": 2.738534240246797e-07, + "loss": 0.82059866, + "num_input_tokens_seen": 300110950, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.18566895, + "step": 13913, + "time_per_iteration": 2.8319194316864014 + }, + { + "auxiliary_loss_clip": 0.0141164, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.24889946, + "balance_loss_mlp": 1.01460958, + "epoch": 0.8365549376221254, + "flos": 21621978984960.0, + "grad_norm": 1.707706399209918, + "language_loss": 0.73967183, + "learning_rate": 2.736567479515153e-07, + "loss": 0.76413178, + "num_input_tokens_seen": 300128705, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19750977, + "step": 13914, + "time_per_iteration": 2.8211283683776855 + }, + { + "auxiliary_loss_clip": 0.01404172, + "auxiliary_loss_mlp": 0.01036131, + "balance_loss_clip": 1.24490404, + "balance_loss_mlp": 1.01703405, + "epoch": 0.8366150608747933, + "flos": 23304484563840.0, + "grad_norm": 1.685362826316315, + "language_loss": 0.72123945, + "learning_rate": 2.7346013734146025e-07, + "loss": 0.74564248, + "num_input_tokens_seen": 300148635, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.19091797, + "step": 13915, + "time_per_iteration": 2.894049882888794 + }, + { + "auxiliary_loss_clip": 0.014043, + "auxiliary_loss_mlp": 0.01032484, + "balance_loss_clip": 1.24264073, + "balance_loss_mlp": 1.01413751, + "epoch": 0.8366751841274613, + "flos": 15275950439040.0, + "grad_norm": 1.9358469630159034, + "language_loss": 0.73370576, + "learning_rate": 2.7326359220197035e-07, + "loss": 0.75807363, + "num_input_tokens_seen": 300165490, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18347168, + "step": 13916, + "time_per_iteration": 2.8390722274780273 + }, + { + "auxiliary_loss_clip": 0.01398663, + "auxiliary_loss_mlp": 0.01031214, + "balance_loss_clip": 1.23779154, + "balance_loss_mlp": 1.01154482, + "epoch": 0.8367353073801292, + "flos": 13232797407360.0, + "grad_norm": 2.6208900988817985, + "language_loss": 0.75303942, + "learning_rate": 2.7306711254049755e-07, + "loss": 0.77733827, + "num_input_tokens_seen": 300182130, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19677734, + "step": 13917, + "time_per_iteration": 2.799624443054199 + }, + { + "auxiliary_loss_clip": 0.0138751, + "auxiliary_loss_mlp": 0.0103403, + "balance_loss_clip": 1.23336649, + "balance_loss_mlp": 1.01587439, + "epoch": 0.8367954306327973, + "flos": 24215265267840.0, + "grad_norm": 1.5881097109904536, + "language_loss": 0.79972744, + "learning_rate": 2.728706983644933e-07, + "loss": 0.8239429, + "num_input_tokens_seen": 300203050, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.1817627, + "step": 13918, + "time_per_iteration": 2.866183280944824 + }, + { + "auxiliary_loss_clip": 0.01407569, + "auxiliary_loss_mlp": 0.01038126, + "balance_loss_clip": 1.246737, + "balance_loss_mlp": 1.01916027, + "epoch": 0.8368555538854652, + "flos": 24545345218560.0, + "grad_norm": 1.9121510634785384, + "language_loss": 0.68366396, + "learning_rate": 2.7267434968140457e-07, + "loss": 0.70812094, + "num_input_tokens_seen": 300224380, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.18969727, + "step": 13919, + "time_per_iteration": 2.892137289047241 + }, + { + "auxiliary_loss_clip": 0.01393937, + "auxiliary_loss_mlp": 0.0103038, + "balance_loss_clip": 1.23504996, + "balance_loss_mlp": 1.01147377, + "epoch": 0.8369156771381332, + "flos": 20267155382400.0, + "grad_norm": 1.919930955129626, + "language_loss": 0.74116743, + "learning_rate": 2.7247806649867835e-07, + "loss": 0.76541054, + "num_input_tokens_seen": 300242915, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18908691, + "step": 13920, + "time_per_iteration": 2.8685429096221924 + }, + { + "auxiliary_loss_clip": 0.01396682, + "auxiliary_loss_mlp": 0.01032117, + "balance_loss_clip": 1.23681164, + "balance_loss_mlp": 1.01288879, + "epoch": 0.8369758003908011, + "flos": 21845651869440.0, + "grad_norm": 1.6977306123200961, + "language_loss": 0.69989985, + "learning_rate": 2.722818488237566e-07, + "loss": 0.72418785, + "num_input_tokens_seen": 300261905, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.1920166, + "step": 13921, + "time_per_iteration": 2.842175245285034 + }, + { + "auxiliary_loss_clip": 0.01423824, + "auxiliary_loss_mlp": 0.0103638, + "balance_loss_clip": 1.25927579, + "balance_loss_mlp": 1.01770008, + "epoch": 0.8370359236434691, + "flos": 21727616889600.0, + "grad_norm": 2.3822067493979704, + "language_loss": 0.85812199, + "learning_rate": 2.720856966640801e-07, + "loss": 0.88272405, + "num_input_tokens_seen": 300281145, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.18688965, + "step": 13922, + "time_per_iteration": 2.8453729152679443 + }, + { + "auxiliary_loss_clip": 0.01398367, + "auxiliary_loss_mlp": 0.01033186, + "balance_loss_clip": 1.24185669, + "balance_loss_mlp": 1.01444674, + "epoch": 0.837096046896137, + "flos": 23159275441920.0, + "grad_norm": 1.597410488278112, + "language_loss": 0.72758615, + "learning_rate": 2.71889610027088e-07, + "loss": 0.75190175, + "num_input_tokens_seen": 300301610, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.1875, + "step": 13923, + "time_per_iteration": 2.834040641784668 + }, + { + "auxiliary_loss_clip": 0.01389593, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.23248267, + "balance_loss_mlp": 1.01287663, + "epoch": 0.8371561701488051, + "flos": 24502063927680.0, + "grad_norm": 2.0362684770968493, + "language_loss": 0.76862746, + "learning_rate": 2.7169358892021433e-07, + "loss": 0.79285371, + "num_input_tokens_seen": 300319420, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.20166016, + "step": 13924, + "time_per_iteration": 2.8537347316741943 + }, + { + "auxiliary_loss_clip": 0.01396021, + "auxiliary_loss_mlp": 0.0102927, + "balance_loss_clip": 1.23732257, + "balance_loss_mlp": 1.01033926, + "epoch": 0.837216293401473, + "flos": 29219003020800.0, + "grad_norm": 1.6091623228886887, + "language_loss": 0.65069127, + "learning_rate": 2.7149763335089293e-07, + "loss": 0.67494416, + "num_input_tokens_seen": 300341325, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18945312, + "step": 13925, + "time_per_iteration": 2.902864694595337 + }, + { + "auxiliary_loss_clip": 0.0142105, + "auxiliary_loss_mlp": 0.01037125, + "balance_loss_clip": 1.25720608, + "balance_loss_mlp": 1.01720476, + "epoch": 0.837276416654141, + "flos": 25276277266560.0, + "grad_norm": 1.6443519821394001, + "language_loss": 0.74940026, + "learning_rate": 2.713017433265543e-07, + "loss": 0.77398205, + "num_input_tokens_seen": 300361620, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19921875, + "step": 13926, + "time_per_iteration": 2.866297960281372 + }, + { + "auxiliary_loss_clip": 0.01407973, + "auxiliary_loss_mlp": 0.01037225, + "balance_loss_clip": 1.24649465, + "balance_loss_mlp": 1.01916456, + "epoch": 0.837336539906809, + "flos": 13890152131200.0, + "grad_norm": 2.06925591618755, + "language_loss": 0.71878636, + "learning_rate": 2.711059188546274e-07, + "loss": 0.74323827, + "num_input_tokens_seen": 300378675, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18066406, + "step": 13927, + "time_per_iteration": 2.8178000450134277 + }, + { + "auxiliary_loss_clip": 0.011821, + "auxiliary_loss_mlp": 0.01026685, + "balance_loss_clip": 1.09118104, + "balance_loss_mlp": 1.00503659, + "epoch": 0.8373966631594769, + "flos": 68903640823680.0, + "grad_norm": 0.7140512127585137, + "language_loss": 0.58863282, + "learning_rate": 2.7091015994253695e-07, + "loss": 0.61072063, + "num_input_tokens_seen": 300449740, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.21679688, + "step": 13928, + "time_per_iteration": 3.6446895599365234 + }, + { + "auxiliary_loss_clip": 0.01411269, + "auxiliary_loss_mlp": 0.01032787, + "balance_loss_clip": 1.25055277, + "balance_loss_mlp": 1.01328397, + "epoch": 0.8374567864121449, + "flos": 20458677196800.0, + "grad_norm": 1.6795796961336529, + "language_loss": 0.70324284, + "learning_rate": 2.707144665977068e-07, + "loss": 0.72768337, + "num_input_tokens_seen": 300470000, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19506836, + "step": 13929, + "time_per_iteration": 2.8510799407958984 + }, + { + "auxiliary_loss_clip": 0.01426436, + "auxiliary_loss_mlp": 0.01030993, + "balance_loss_clip": 1.26130795, + "balance_loss_mlp": 1.01126373, + "epoch": 0.8375169096648128, + "flos": 41919882848640.0, + "grad_norm": 1.6135138752538007, + "language_loss": 0.67665017, + "learning_rate": 2.705188388275574e-07, + "loss": 0.70122451, + "num_input_tokens_seen": 300494975, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19726562, + "step": 13930, + "time_per_iteration": 3.0626227855682373 + }, + { + "auxiliary_loss_clip": 0.01397686, + "auxiliary_loss_mlp": 0.01030415, + "balance_loss_clip": 1.23971331, + "balance_loss_mlp": 1.01240301, + "epoch": 0.8375770329174809, + "flos": 20018480106240.0, + "grad_norm": 1.700432023570513, + "language_loss": 0.72284532, + "learning_rate": 2.703232766395067e-07, + "loss": 0.74712634, + "num_input_tokens_seen": 300513175, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18017578, + "step": 13931, + "time_per_iteration": 2.8467986583709717 + }, + { + "auxiliary_loss_clip": 0.01397653, + "auxiliary_loss_mlp": 0.01032903, + "balance_loss_clip": 1.23996973, + "balance_loss_mlp": 1.01430631, + "epoch": 0.8376371561701488, + "flos": 22793786795520.0, + "grad_norm": 1.6809303652101704, + "language_loss": 0.72276425, + "learning_rate": 2.701277800409705e-07, + "loss": 0.74706978, + "num_input_tokens_seen": 300533770, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18591309, + "step": 13932, + "time_per_iteration": 4.403635501861572 + }, + { + "auxiliary_loss_clip": 0.01403149, + "auxiliary_loss_mlp": 0.01032676, + "balance_loss_clip": 1.24481201, + "balance_loss_mlp": 1.01458025, + "epoch": 0.8376972794228168, + "flos": 23924892268800.0, + "grad_norm": 2.1940090842668263, + "language_loss": 0.66682041, + "learning_rate": 2.699323490393628e-07, + "loss": 0.69117868, + "num_input_tokens_seen": 300552995, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18115234, + "step": 13933, + "time_per_iteration": 2.9162707328796387 + }, + { + "auxiliary_loss_clip": 0.01397593, + "auxiliary_loss_mlp": 0.01035543, + "balance_loss_clip": 1.24016929, + "balance_loss_mlp": 1.01660061, + "epoch": 0.8377574026754847, + "flos": 13742454545280.0, + "grad_norm": 2.1538959315831194, + "language_loss": 0.77074635, + "learning_rate": 2.697369836420933e-07, + "loss": 0.79507768, + "num_input_tokens_seen": 300570275, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18933105, + "step": 13934, + "time_per_iteration": 2.9695489406585693 + }, + { + "auxiliary_loss_clip": 0.01401692, + "auxiliary_loss_mlp": 0.01030814, + "balance_loss_clip": 1.24361217, + "balance_loss_mlp": 1.01212239, + "epoch": 0.8378175259281527, + "flos": 21660916775040.0, + "grad_norm": 1.5030183818590426, + "language_loss": 0.77686167, + "learning_rate": 2.6954168385657115e-07, + "loss": 0.80118668, + "num_input_tokens_seen": 300590875, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18688965, + "step": 13935, + "time_per_iteration": 4.333244562149048 + }, + { + "auxiliary_loss_clip": 0.01401566, + "auxiliary_loss_mlp": 0.01031032, + "balance_loss_clip": 1.24135554, + "balance_loss_mlp": 1.01211405, + "epoch": 0.8378776491808206, + "flos": 15456794480640.0, + "grad_norm": 2.3947132158919477, + "language_loss": 0.57104003, + "learning_rate": 2.6934644969020135e-07, + "loss": 0.59536606, + "num_input_tokens_seen": 300607490, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18896484, + "step": 13936, + "time_per_iteration": 2.8110313415527344 + }, + { + "auxiliary_loss_clip": 0.01402835, + "auxiliary_loss_mlp": 0.01033401, + "balance_loss_clip": 1.24228811, + "balance_loss_mlp": 1.01435089, + "epoch": 0.8379377724334887, + "flos": 14728531875840.0, + "grad_norm": 1.7597549099069183, + "language_loss": 0.90355003, + "learning_rate": 2.691512811503882e-07, + "loss": 0.92791235, + "num_input_tokens_seen": 300623635, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19042969, + "step": 13937, + "time_per_iteration": 2.830812931060791 + }, + { + "auxiliary_loss_clip": 0.01400456, + "auxiliary_loss_mlp": 0.01031201, + "balance_loss_clip": 1.24126828, + "balance_loss_mlp": 1.0119009, + "epoch": 0.8379978956861566, + "flos": 24545978645760.0, + "grad_norm": 1.9717649476116534, + "language_loss": 0.82537419, + "learning_rate": 2.689561782445313e-07, + "loss": 0.84969074, + "num_input_tokens_seen": 300643835, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19299316, + "step": 13938, + "time_per_iteration": 2.870368480682373 + }, + { + "auxiliary_loss_clip": 0.01409974, + "auxiliary_loss_mlp": 0.01030713, + "balance_loss_clip": 1.24779606, + "balance_loss_mlp": 1.01199675, + "epoch": 0.8380580189388246, + "flos": 18961811608320.0, + "grad_norm": 1.622112413648304, + "language_loss": 0.7161777, + "learning_rate": 2.6876114098002965e-07, + "loss": 0.74058461, + "num_input_tokens_seen": 300662500, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.18701172, + "step": 13939, + "time_per_iteration": 2.8313231468200684 + }, + { + "auxiliary_loss_clip": 0.01407593, + "auxiliary_loss_mlp": 0.01039718, + "balance_loss_clip": 1.24468732, + "balance_loss_mlp": 1.01995265, + "epoch": 0.8381181421914926, + "flos": 26551098783360.0, + "grad_norm": 1.6406526021446022, + "language_loss": 0.76902896, + "learning_rate": 2.6856616936428e-07, + "loss": 0.79350209, + "num_input_tokens_seen": 300681480, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19750977, + "step": 13940, + "time_per_iteration": 2.914124011993408 + }, + { + "auxiliary_loss_clip": 0.01398742, + "auxiliary_loss_mlp": 0.01032682, + "balance_loss_clip": 1.23980784, + "balance_loss_mlp": 1.01366794, + "epoch": 0.8381782654441605, + "flos": 23301045959040.0, + "grad_norm": 3.331534505850874, + "language_loss": 0.77226108, + "learning_rate": 2.6837126340467374e-07, + "loss": 0.79657531, + "num_input_tokens_seen": 300699165, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19006348, + "step": 13941, + "time_per_iteration": 5.6159162521362305 + }, + { + "auxiliary_loss_clip": 0.01414839, + "auxiliary_loss_mlp": 0.01033437, + "balance_loss_clip": 1.24938965, + "balance_loss_mlp": 1.01336169, + "epoch": 0.8382383886968285, + "flos": 26769568515840.0, + "grad_norm": 2.186201781476688, + "language_loss": 0.73839438, + "learning_rate": 2.6817642310860276e-07, + "loss": 0.76287711, + "num_input_tokens_seen": 300714615, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.20068359, + "step": 13942, + "time_per_iteration": 2.87819766998291 + }, + { + "auxiliary_loss_clip": 0.01414789, + "auxiliary_loss_mlp": 0.01035813, + "balance_loss_clip": 1.24739051, + "balance_loss_mlp": 1.01626241, + "epoch": 0.8382985119494964, + "flos": 26115788131200.0, + "grad_norm": 1.4377789064850224, + "language_loss": 0.79705822, + "learning_rate": 2.679816484834554e-07, + "loss": 0.82156432, + "num_input_tokens_seen": 300734860, + "router_z_loss_clip": 1.67382812, + "router_z_loss_mlp": 0.19555664, + "step": 13943, + "time_per_iteration": 2.883617877960205 + }, + { + "auxiliary_loss_clip": 0.0140912, + "auxiliary_loss_mlp": 0.01033838, + "balance_loss_clip": 1.24918377, + "balance_loss_mlp": 1.0157541, + "epoch": 0.8383586352021645, + "flos": 16443686217600.0, + "grad_norm": 2.033514331247748, + "language_loss": 0.85496539, + "learning_rate": 2.6778693953661766e-07, + "loss": 0.87939501, + "num_input_tokens_seen": 300752735, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18078613, + "step": 13944, + "time_per_iteration": 2.8480868339538574 + }, + { + "auxiliary_loss_clip": 0.01177323, + "auxiliary_loss_mlp": 0.01015789, + "balance_loss_clip": 1.08766484, + "balance_loss_mlp": 0.9990043, + "epoch": 0.8384187584548324, + "flos": 64226091968640.0, + "grad_norm": 0.6664612577384531, + "language_loss": 0.50230801, + "learning_rate": 2.6759229627547263e-07, + "loss": 0.52423906, + "num_input_tokens_seen": 300820760, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.16796875, + "step": 13945, + "time_per_iteration": 3.4434239864349365 + }, + { + "auxiliary_loss_clip": 0.01389066, + "auxiliary_loss_mlp": 0.01032355, + "balance_loss_clip": 1.23236561, + "balance_loss_mlp": 1.01280499, + "epoch": 0.8384788817075004, + "flos": 22393160922240.0, + "grad_norm": 1.6895483874491655, + "language_loss": 0.65392959, + "learning_rate": 2.673977187074017e-07, + "loss": 0.6781438, + "num_input_tokens_seen": 300840025, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.19543457, + "step": 13946, + "time_per_iteration": 2.8558883666992188 + }, + { + "auxiliary_loss_clip": 0.01404297, + "auxiliary_loss_mlp": 0.0103249, + "balance_loss_clip": 1.2424835, + "balance_loss_mlp": 1.01260591, + "epoch": 0.8385390049601683, + "flos": 29508742592640.0, + "grad_norm": 1.6714679024137806, + "language_loss": 0.67977899, + "learning_rate": 2.672032068397829e-07, + "loss": 0.70414686, + "num_input_tokens_seen": 300860380, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19897461, + "step": 13947, + "time_per_iteration": 2.9351742267608643 + }, + { + "auxiliary_loss_clip": 0.01399894, + "auxiliary_loss_mlp": 0.01034817, + "balance_loss_clip": 1.23885536, + "balance_loss_mlp": 1.01557636, + "epoch": 0.8385991282128363, + "flos": 32720581543680.0, + "grad_norm": 1.381900975320931, + "language_loss": 0.70254302, + "learning_rate": 2.6700876067999176e-07, + "loss": 0.72689015, + "num_input_tokens_seen": 300881895, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19238281, + "step": 13948, + "time_per_iteration": 2.9222140312194824 + }, + { + "auxiliary_loss_clip": 0.01386266, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.23184764, + "balance_loss_mlp": 1.01402211, + "epoch": 0.8386592514655042, + "flos": 25450334588160.0, + "grad_norm": 1.931124358195655, + "language_loss": 0.85809433, + "learning_rate": 2.6681438023540194e-07, + "loss": 0.88227606, + "num_input_tokens_seen": 300901575, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.17883301, + "step": 13949, + "time_per_iteration": 2.905339241027832 + }, + { + "auxiliary_loss_clip": 0.01382171, + "auxiliary_loss_mlp": 0.01034264, + "balance_loss_clip": 1.22687054, + "balance_loss_mlp": 1.01477289, + "epoch": 0.8387193747181723, + "flos": 22025500525440.0, + "grad_norm": 2.2664890783047134, + "language_loss": 0.70996475, + "learning_rate": 2.66620065513385e-07, + "loss": 0.73412913, + "num_input_tokens_seen": 300919735, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.19482422, + "step": 13950, + "time_per_iteration": 2.8980000019073486 + }, + { + "auxiliary_loss_clip": 0.01391087, + "auxiliary_loss_mlp": 0.01032117, + "balance_loss_clip": 1.23431444, + "balance_loss_mlp": 1.01342475, + "epoch": 0.8387794979708402, + "flos": 18159293007360.0, + "grad_norm": 1.5937641291998477, + "language_loss": 0.65783012, + "learning_rate": 2.6642581652130913e-07, + "loss": 0.68206215, + "num_input_tokens_seen": 300939150, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.18688965, + "step": 13951, + "time_per_iteration": 2.842414617538452 + }, + { + "auxiliary_loss_clip": 0.01413551, + "auxiliary_loss_mlp": 0.01034513, + "balance_loss_clip": 1.25234103, + "balance_loss_mlp": 1.01553524, + "epoch": 0.8388396212235082, + "flos": 25422255550080.0, + "grad_norm": 1.4551412076159658, + "language_loss": 0.71319884, + "learning_rate": 2.662316332665393e-07, + "loss": 0.73767954, + "num_input_tokens_seen": 300959730, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18981934, + "step": 13952, + "time_per_iteration": 2.8691084384918213 + }, + { + "auxiliary_loss_clip": 0.01396625, + "auxiliary_loss_mlp": 0.01033844, + "balance_loss_clip": 1.23789978, + "balance_loss_mlp": 1.0154978, + "epoch": 0.8388997444761762, + "flos": 22283179516800.0, + "grad_norm": 1.8493916855473893, + "language_loss": 0.73882568, + "learning_rate": 2.6603751575643987e-07, + "loss": 0.76313037, + "num_input_tokens_seen": 300976120, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18334961, + "step": 13953, + "time_per_iteration": 2.826960325241089 + }, + { + "auxiliary_loss_clip": 0.01395356, + "auxiliary_loss_mlp": 0.0103243, + "balance_loss_clip": 1.23661709, + "balance_loss_mlp": 1.01416731, + "epoch": 0.8389598677288441, + "flos": 19582671761280.0, + "grad_norm": 3.509980169931398, + "language_loss": 0.68843085, + "learning_rate": 2.6584346399837176e-07, + "loss": 0.71270871, + "num_input_tokens_seen": 300995080, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18249512, + "step": 13954, + "time_per_iteration": 2.8485045433044434 + }, + { + "auxiliary_loss_clip": 0.01397298, + "auxiliary_loss_mlp": 0.0103746, + "balance_loss_clip": 1.23684478, + "balance_loss_mlp": 1.01883984, + "epoch": 0.8390199909815121, + "flos": 17393766670080.0, + "grad_norm": 1.8539922198264231, + "language_loss": 0.73991084, + "learning_rate": 2.656494779996932e-07, + "loss": 0.76425844, + "num_input_tokens_seen": 301012920, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18615723, + "step": 13955, + "time_per_iteration": 2.840822458267212 + }, + { + "auxiliary_loss_clip": 0.01400164, + "auxiliary_loss_mlp": 0.0103319, + "balance_loss_clip": 1.24026179, + "balance_loss_mlp": 1.01361585, + "epoch": 0.83908011423418, + "flos": 24649082841600.0, + "grad_norm": 3.3916334914917434, + "language_loss": 0.67094153, + "learning_rate": 2.6545555776775995e-07, + "loss": 0.69527507, + "num_input_tokens_seen": 301028875, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19567871, + "step": 13956, + "time_per_iteration": 2.8404014110565186 + }, + { + "auxiliary_loss_clip": 0.01411533, + "auxiliary_loss_mlp": 0.01034427, + "balance_loss_clip": 1.24856496, + "balance_loss_mlp": 1.01495969, + "epoch": 0.8391402374868481, + "flos": 24729311151360.0, + "grad_norm": 1.7482755051271377, + "language_loss": 0.81351781, + "learning_rate": 2.6526170330992667e-07, + "loss": 0.83797735, + "num_input_tokens_seen": 301050115, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19470215, + "step": 13957, + "time_per_iteration": 2.8553578853607178 + }, + { + "auxiliary_loss_clip": 0.01183998, + "auxiliary_loss_mlp": 0.01019921, + "balance_loss_clip": 1.09144902, + "balance_loss_mlp": 0.99588889, + "epoch": 0.839200360739516, + "flos": 56902899317760.0, + "grad_norm": 0.7493366274113707, + "language_loss": 0.53286922, + "learning_rate": 2.6506791463354283e-07, + "loss": 0.55490845, + "num_input_tokens_seen": 301114155, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.24023438, + "step": 13958, + "time_per_iteration": 3.4444868564605713 + }, + { + "auxiliary_loss_clip": 0.01394268, + "auxiliary_loss_mlp": 0.01035454, + "balance_loss_clip": 1.23611057, + "balance_loss_mlp": 1.01584458, + "epoch": 0.839260483992184, + "flos": 18341946840960.0, + "grad_norm": 2.116569525507219, + "language_loss": 0.74750829, + "learning_rate": 2.648741917459574e-07, + "loss": 0.77180552, + "num_input_tokens_seen": 301133150, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.19616699, + "step": 13959, + "time_per_iteration": 2.8812437057495117 + }, + { + "auxiliary_loss_clip": 0.01398504, + "auxiliary_loss_mlp": 0.0103101, + "balance_loss_clip": 1.24117589, + "balance_loss_mlp": 1.01278293, + "epoch": 0.8393206072448519, + "flos": 27098969794560.0, + "grad_norm": 1.9993627601207402, + "language_loss": 0.56261194, + "learning_rate": 2.646805346545169e-07, + "loss": 0.58690709, + "num_input_tokens_seen": 301153600, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18237305, + "step": 13960, + "time_per_iteration": 2.8889992237091064 + }, + { + "auxiliary_loss_clip": 0.01183309, + "auxiliary_loss_mlp": 0.0102798, + "balance_loss_clip": 1.09079707, + "balance_loss_mlp": 1.00203979, + "epoch": 0.8393807304975199, + "flos": 61548233875200.0, + "grad_norm": 0.7820621385232547, + "language_loss": 0.60725647, + "learning_rate": 2.6448694336656397e-07, + "loss": 0.62936932, + "num_input_tokens_seen": 301214335, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.25976562, + "step": 13961, + "time_per_iteration": 3.3391737937927246 + }, + { + "auxiliary_loss_clip": 0.01400971, + "auxiliary_loss_mlp": 0.01032725, + "balance_loss_clip": 1.23952496, + "balance_loss_mlp": 1.01348519, + "epoch": 0.8394408537501878, + "flos": 14900960384640.0, + "grad_norm": 3.0016163955632167, + "language_loss": 0.69013262, + "learning_rate": 2.642934178894405e-07, + "loss": 0.71446961, + "num_input_tokens_seen": 301228960, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19238281, + "step": 13962, + "time_per_iteration": 2.8231360912323 + }, + { + "auxiliary_loss_clip": 0.01404651, + "auxiliary_loss_mlp": 0.0103517, + "balance_loss_clip": 1.24234581, + "balance_loss_mlp": 1.01483285, + "epoch": 0.8395009770028559, + "flos": 17418904796160.0, + "grad_norm": 1.8932022499011594, + "language_loss": 0.74412638, + "learning_rate": 2.640999582304841e-07, + "loss": 0.76852453, + "num_input_tokens_seen": 301245875, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.20349121, + "step": 13963, + "time_per_iteration": 2.7899603843688965 + }, + { + "auxiliary_loss_clip": 0.01400741, + "auxiliary_loss_mlp": 0.01035664, + "balance_loss_clip": 1.24133825, + "balance_loss_mlp": 1.01746094, + "epoch": 0.8395611002555238, + "flos": 27935404012800.0, + "grad_norm": 1.5303858358361293, + "language_loss": 0.76473188, + "learning_rate": 2.6390656439703173e-07, + "loss": 0.78909594, + "num_input_tokens_seen": 301265550, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18200684, + "step": 13964, + "time_per_iteration": 2.905437707901001 + }, + { + "auxiliary_loss_clip": 0.01420285, + "auxiliary_loss_mlp": 0.01039946, + "balance_loss_clip": 1.25443769, + "balance_loss_mlp": 1.02009773, + "epoch": 0.8396212235081918, + "flos": 11106203685120.0, + "grad_norm": 1.899126811316887, + "language_loss": 0.78586125, + "learning_rate": 2.637132363964161e-07, + "loss": 0.81046355, + "num_input_tokens_seen": 301282035, + "router_z_loss_clip": 1.66113281, + "router_z_loss_mlp": 0.19848633, + "step": 13965, + "time_per_iteration": 2.81062388420105 + }, + { + "auxiliary_loss_clip": 0.01390577, + "auxiliary_loss_mlp": 0.01032719, + "balance_loss_clip": 1.23255324, + "balance_loss_mlp": 1.01383674, + "epoch": 0.8396813467608598, + "flos": 35749087989120.0, + "grad_norm": 1.4859677547340322, + "language_loss": 0.66136396, + "learning_rate": 2.635199742359684e-07, + "loss": 0.68559694, + "num_input_tokens_seen": 301305210, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.18908691, + "step": 13966, + "time_per_iteration": 2.9869091510772705 + }, + { + "auxiliary_loss_clip": 0.01392111, + "auxiliary_loss_mlp": 0.01033184, + "balance_loss_clip": 1.23448873, + "balance_loss_mlp": 1.01376534, + "epoch": 0.8397414700135277, + "flos": 26187781887360.0, + "grad_norm": 1.9362657072499365, + "language_loss": 0.75405693, + "learning_rate": 2.633267779230177e-07, + "loss": 0.77830988, + "num_input_tokens_seen": 301324885, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.1940918, + "step": 13967, + "time_per_iteration": 4.403401136398315 + }, + { + "auxiliary_loss_clip": 0.01399362, + "auxiliary_loss_mlp": 0.01029186, + "balance_loss_clip": 1.24148583, + "balance_loss_mlp": 1.01134026, + "epoch": 0.8398015932661957, + "flos": 18342354044160.0, + "grad_norm": 2.0272156144757765, + "language_loss": 0.83148253, + "learning_rate": 2.6313364746488974e-07, + "loss": 0.85576808, + "num_input_tokens_seen": 301343070, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.1784668, + "step": 13968, + "time_per_iteration": 2.8812739849090576 + }, + { + "auxiliary_loss_clip": 0.0140976, + "auxiliary_loss_mlp": 0.01035467, + "balance_loss_clip": 1.24734378, + "balance_loss_mlp": 1.01710916, + "epoch": 0.8398617165188637, + "flos": 17387206174080.0, + "grad_norm": 1.9894369146000601, + "language_loss": 0.77910006, + "learning_rate": 2.629405828689075e-07, + "loss": 0.80355227, + "num_input_tokens_seen": 301359280, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.18371582, + "step": 13969, + "time_per_iteration": 2.807709217071533 + }, + { + "auxiliary_loss_clip": 0.01415368, + "auxiliary_loss_mlp": 0.01032723, + "balance_loss_clip": 1.25070477, + "balance_loss_mlp": 1.01267242, + "epoch": 0.8399218397715317, + "flos": 22939946058240.0, + "grad_norm": 2.0052688715291915, + "language_loss": 0.77811712, + "learning_rate": 2.627475841423923e-07, + "loss": 0.80259806, + "num_input_tokens_seen": 301376465, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.20056152, + "step": 13970, + "time_per_iteration": 4.2758636474609375 + }, + { + "auxiliary_loss_clip": 0.01416709, + "auxiliary_loss_mlp": 0.01039422, + "balance_loss_clip": 1.25458407, + "balance_loss_mlp": 1.02034855, + "epoch": 0.8399819630241996, + "flos": 23160135093120.0, + "grad_norm": 1.7957056013792068, + "language_loss": 0.72356093, + "learning_rate": 2.625546512926633e-07, + "loss": 0.74812222, + "num_input_tokens_seen": 301396000, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19055176, + "step": 13971, + "time_per_iteration": 2.8453540802001953 + }, + { + "auxiliary_loss_clip": 0.01402431, + "auxiliary_loss_mlp": 0.01033358, + "balance_loss_clip": 1.24173141, + "balance_loss_mlp": 1.01418948, + "epoch": 0.8400420862768676, + "flos": 16405653323520.0, + "grad_norm": 1.6414766583973475, + "language_loss": 0.78135359, + "learning_rate": 2.623617843270358e-07, + "loss": 0.80571151, + "num_input_tokens_seen": 301413160, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19165039, + "step": 13972, + "time_per_iteration": 2.835982322692871 + }, + { + "auxiliary_loss_clip": 0.01395907, + "auxiliary_loss_mlp": 0.01031456, + "balance_loss_clip": 1.23892415, + "balance_loss_mlp": 1.01245379, + "epoch": 0.8401022095295355, + "flos": 21297237920640.0, + "grad_norm": 1.2748980920187014, + "language_loss": 0.68940878, + "learning_rate": 2.6216898325282333e-07, + "loss": 0.71368235, + "num_input_tokens_seen": 301433325, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.19006348, + "step": 13973, + "time_per_iteration": 2.8782694339752197 + }, + { + "auxiliary_loss_clip": 0.01399249, + "auxiliary_loss_mlp": 0.01035835, + "balance_loss_clip": 1.23939323, + "balance_loss_mlp": 1.01621294, + "epoch": 0.8401623327822035, + "flos": 17320234590720.0, + "grad_norm": 2.3767853057992423, + "language_loss": 0.78276551, + "learning_rate": 2.619762480773382e-07, + "loss": 0.80711639, + "num_input_tokens_seen": 301450265, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19616699, + "step": 13974, + "time_per_iteration": 2.8108198642730713 + }, + { + "auxiliary_loss_clip": 0.01398378, + "auxiliary_loss_mlp": 0.01031778, + "balance_loss_clip": 1.23662531, + "balance_loss_mlp": 1.01322937, + "epoch": 0.8402224560348714, + "flos": 22246820680320.0, + "grad_norm": 1.4365673778628576, + "language_loss": 0.73441088, + "learning_rate": 2.617835788078868e-07, + "loss": 0.75871253, + "num_input_tokens_seen": 301470760, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.1854248, + "step": 13975, + "time_per_iteration": 2.914034128189087 + }, + { + "auxiliary_loss_clip": 0.01396527, + "auxiliary_loss_mlp": 0.01032416, + "balance_loss_clip": 1.23693585, + "balance_loss_mlp": 1.01348531, + "epoch": 0.8402825792875395, + "flos": 20239574037120.0, + "grad_norm": 1.7109512656736308, + "language_loss": 0.72944504, + "learning_rate": 2.6159097545177645e-07, + "loss": 0.75373447, + "num_input_tokens_seen": 301489425, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18933105, + "step": 13976, + "time_per_iteration": 5.592138051986694 + }, + { + "auxiliary_loss_clip": 0.0141134, + "auxiliary_loss_mlp": 0.01032487, + "balance_loss_clip": 1.2504189, + "balance_loss_mlp": 1.01392603, + "epoch": 0.8403427025402074, + "flos": 23299145677440.0, + "grad_norm": 2.8546557005760547, + "language_loss": 0.72964489, + "learning_rate": 2.61398438016311e-07, + "loss": 0.75408316, + "num_input_tokens_seen": 301508885, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18554688, + "step": 13977, + "time_per_iteration": 2.9244132041931152 + }, + { + "auxiliary_loss_clip": 0.01405731, + "auxiliary_loss_mlp": 0.01031817, + "balance_loss_clip": 1.24366415, + "balance_loss_mlp": 1.0128386, + "epoch": 0.8404028257928754, + "flos": 32689471104000.0, + "grad_norm": 7.473532177222819, + "language_loss": 0.69331431, + "learning_rate": 2.6120596650879043e-07, + "loss": 0.71768975, + "num_input_tokens_seen": 301533780, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18969727, + "step": 13978, + "time_per_iteration": 2.945669651031494 + }, + { + "auxiliary_loss_clip": 0.01379781, + "auxiliary_loss_mlp": 0.01033175, + "balance_loss_clip": 1.22434354, + "balance_loss_mlp": 1.0145669, + "epoch": 0.8404629490455434, + "flos": 16189219607040.0, + "grad_norm": 2.2243155366717526, + "language_loss": 0.78293043, + "learning_rate": 2.610135609365145e-07, + "loss": 0.80706, + "num_input_tokens_seen": 301551775, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.18603516, + "step": 13979, + "time_per_iteration": 2.8013103008270264 + }, + { + "auxiliary_loss_clip": 0.01406392, + "auxiliary_loss_mlp": 0.0103533, + "balance_loss_clip": 1.24607885, + "balance_loss_mlp": 1.01642382, + "epoch": 0.8405230722982113, + "flos": 15202644583680.0, + "grad_norm": 1.820207708901894, + "language_loss": 0.78726298, + "learning_rate": 2.60821221306778e-07, + "loss": 0.8116802, + "num_input_tokens_seen": 301570495, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18908691, + "step": 13980, + "time_per_iteration": 2.895350933074951 + }, + { + "auxiliary_loss_clip": 0.01394193, + "auxiliary_loss_mlp": 0.01032281, + "balance_loss_clip": 1.23811471, + "balance_loss_mlp": 1.01327896, + "epoch": 0.8405831955508793, + "flos": 27822300716160.0, + "grad_norm": 1.9496483916650684, + "language_loss": 0.86814475, + "learning_rate": 2.606289476268757e-07, + "loss": 0.8924095, + "num_input_tokens_seen": 301591705, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.19006348, + "step": 13981, + "time_per_iteration": 2.8705804347991943 + }, + { + "auxiliary_loss_clip": 0.01397973, + "auxiliary_loss_mlp": 0.010341, + "balance_loss_clip": 1.23919606, + "balance_loss_mlp": 1.01484728, + "epoch": 0.8406433188035473, + "flos": 23780090350080.0, + "grad_norm": 10.83214169378516, + "language_loss": 0.68579948, + "learning_rate": 2.6043673990409745e-07, + "loss": 0.71012026, + "num_input_tokens_seen": 301611670, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.19250488, + "step": 13982, + "time_per_iteration": 2.8297414779663086 + }, + { + "auxiliary_loss_clip": 0.01418621, + "auxiliary_loss_mlp": 0.01036396, + "balance_loss_clip": 1.25765657, + "balance_loss_mlp": 1.01721478, + "epoch": 0.8407034420562153, + "flos": 29217690921600.0, + "grad_norm": 1.6043755784400509, + "language_loss": 0.68812549, + "learning_rate": 2.602445981457324e-07, + "loss": 0.71267569, + "num_input_tokens_seen": 301632540, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19189453, + "step": 13983, + "time_per_iteration": 2.9001975059509277 + }, + { + "auxiliary_loss_clip": 0.01408723, + "auxiliary_loss_mlp": 0.01034072, + "balance_loss_clip": 1.24647093, + "balance_loss_mlp": 1.01487958, + "epoch": 0.8407635653088832, + "flos": 26371431106560.0, + "grad_norm": 4.0773522768000126, + "language_loss": 0.79893064, + "learning_rate": 2.6005252235906684e-07, + "loss": 0.82335865, + "num_input_tokens_seen": 301651480, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.1920166, + "step": 13984, + "time_per_iteration": 2.863291025161743 + }, + { + "auxiliary_loss_clip": 0.01397152, + "auxiliary_loss_mlp": 0.01035379, + "balance_loss_clip": 1.238464, + "balance_loss_mlp": 1.01634109, + "epoch": 0.8408236885615512, + "flos": 21478308186240.0, + "grad_norm": 1.9310216823021238, + "language_loss": 0.61477697, + "learning_rate": 2.598605125513842e-07, + "loss": 0.63910234, + "num_input_tokens_seen": 301670010, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.19042969, + "step": 13985, + "time_per_iteration": 2.8415231704711914 + }, + { + "auxiliary_loss_clip": 0.01395694, + "auxiliary_loss_mlp": 0.01034711, + "balance_loss_clip": 1.23365879, + "balance_loss_mlp": 1.0151484, + "epoch": 0.8408838118142191, + "flos": 22973363982720.0, + "grad_norm": 1.6570744939726703, + "language_loss": 0.82325935, + "learning_rate": 2.5966856872996467e-07, + "loss": 0.84756339, + "num_input_tokens_seen": 301689785, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19567871, + "step": 13986, + "time_per_iteration": 2.8335986137390137 + }, + { + "auxiliary_loss_clip": 0.01402578, + "auxiliary_loss_mlp": 0.01037419, + "balance_loss_clip": 1.2440877, + "balance_loss_mlp": 1.01797652, + "epoch": 0.8409439350668871, + "flos": 26812035400320.0, + "grad_norm": 1.4798586919145345, + "language_loss": 0.66696197, + "learning_rate": 2.5947669090208755e-07, + "loss": 0.6913619, + "num_input_tokens_seen": 301712225, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.19433594, + "step": 13987, + "time_per_iteration": 2.942720413208008 + }, + { + "auxiliary_loss_clip": 0.01407417, + "auxiliary_loss_mlp": 0.0103569, + "balance_loss_clip": 1.24867523, + "balance_loss_mlp": 1.01625896, + "epoch": 0.841004058319555, + "flos": 26589991328640.0, + "grad_norm": 1.9381844313545067, + "language_loss": 0.67870718, + "learning_rate": 2.5928487907502906e-07, + "loss": 0.70313823, + "num_input_tokens_seen": 301730955, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.19433594, + "step": 13988, + "time_per_iteration": 2.9030158519744873 + }, + { + "auxiliary_loss_clip": 0.0142058, + "auxiliary_loss_mlp": 0.01036487, + "balance_loss_clip": 1.25630975, + "balance_loss_mlp": 1.01620984, + "epoch": 0.8410641815722231, + "flos": 14510062143360.0, + "grad_norm": 3.161528759633698, + "language_loss": 0.82151449, + "learning_rate": 2.590931332560622e-07, + "loss": 0.84608519, + "num_input_tokens_seen": 301746930, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.20263672, + "step": 13989, + "time_per_iteration": 2.8576128482818604 + }, + { + "auxiliary_loss_clip": 0.01417172, + "auxiliary_loss_mlp": 0.0103407, + "balance_loss_clip": 1.25343299, + "balance_loss_mlp": 1.01565242, + "epoch": 0.841124304824891, + "flos": 29178029214720.0, + "grad_norm": 1.9749868923677618, + "language_loss": 0.76186585, + "learning_rate": 2.5890145345245826e-07, + "loss": 0.78637826, + "num_input_tokens_seen": 301766945, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.18395996, + "step": 13990, + "time_per_iteration": 2.9193875789642334 + }, + { + "auxiliary_loss_clip": 0.0138799, + "auxiliary_loss_mlp": 0.01037539, + "balance_loss_clip": 1.23160708, + "balance_loss_mlp": 1.0180006, + "epoch": 0.841184428077559, + "flos": 22420380309120.0, + "grad_norm": 1.8169122782514824, + "language_loss": 0.81004679, + "learning_rate": 2.5870983967148597e-07, + "loss": 0.83430207, + "num_input_tokens_seen": 301785460, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.19543457, + "step": 13991, + "time_per_iteration": 2.857771635055542 + }, + { + "auxiliary_loss_clip": 0.01394114, + "auxiliary_loss_mlp": 0.01028446, + "balance_loss_clip": 1.23481679, + "balance_loss_mlp": 1.01007605, + "epoch": 0.841244551330227, + "flos": 22972187617920.0, + "grad_norm": 2.1073268630272635, + "language_loss": 0.713925, + "learning_rate": 2.585182919204105e-07, + "loss": 0.7381506, + "num_input_tokens_seen": 301804180, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18371582, + "step": 13992, + "time_per_iteration": 2.9334957599639893 + }, + { + "auxiliary_loss_clip": 0.01404595, + "auxiliary_loss_mlp": 0.01028829, + "balance_loss_clip": 1.24346304, + "balance_loss_mlp": 1.01078129, + "epoch": 0.8413046745828949, + "flos": 21042590330880.0, + "grad_norm": 1.5894082270112864, + "language_loss": 0.77407259, + "learning_rate": 2.583268102064959e-07, + "loss": 0.79840684, + "num_input_tokens_seen": 301823670, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.18066406, + "step": 13993, + "time_per_iteration": 2.8471293449401855 + }, + { + "auxiliary_loss_clip": 0.01426009, + "auxiliary_loss_mlp": 0.01035406, + "balance_loss_clip": 1.25944042, + "balance_loss_mlp": 1.01536655, + "epoch": 0.841364797835563, + "flos": 27063289630080.0, + "grad_norm": 2.032309038095702, + "language_loss": 0.74780703, + "learning_rate": 2.5813539453700393e-07, + "loss": 0.77242124, + "num_input_tokens_seen": 301845890, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.20043945, + "step": 13994, + "time_per_iteration": 3.020331859588623 + }, + { + "auxiliary_loss_clip": 0.01389336, + "auxiliary_loss_mlp": 0.01032617, + "balance_loss_clip": 1.23436773, + "balance_loss_mlp": 1.01412725, + "epoch": 0.8414249210882309, + "flos": 17904916886400.0, + "grad_norm": 1.795483429548079, + "language_loss": 0.60257363, + "learning_rate": 2.5794404491919163e-07, + "loss": 0.62679315, + "num_input_tokens_seen": 301863985, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.18505859, + "step": 13995, + "time_per_iteration": 2.839932680130005 + }, + { + "auxiliary_loss_clip": 0.01403419, + "auxiliary_loss_mlp": 0.01031347, + "balance_loss_clip": 1.24374342, + "balance_loss_mlp": 1.01245248, + "epoch": 0.8414850443408989, + "flos": 25450560812160.0, + "grad_norm": 1.585408860258319, + "language_loss": 0.7236557, + "learning_rate": 2.577527613603163e-07, + "loss": 0.7480033, + "num_input_tokens_seen": 301882765, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18908691, + "step": 13996, + "time_per_iteration": 2.8335020542144775 + }, + { + "auxiliary_loss_clip": 0.01391194, + "auxiliary_loss_mlp": 0.01035056, + "balance_loss_clip": 1.23235703, + "balance_loss_mlp": 1.01525521, + "epoch": 0.8415451675935668, + "flos": 23230002343680.0, + "grad_norm": 1.839808101686761, + "language_loss": 0.65414762, + "learning_rate": 2.5756154386763017e-07, + "loss": 0.67841017, + "num_input_tokens_seen": 301902720, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.19812012, + "step": 13997, + "time_per_iteration": 2.8677492141723633 + }, + { + "auxiliary_loss_clip": 0.01419395, + "auxiliary_loss_mlp": 0.01034492, + "balance_loss_clip": 1.25338411, + "balance_loss_mlp": 1.01454806, + "epoch": 0.8416052908462348, + "flos": 18554353770240.0, + "grad_norm": 1.9356833364701034, + "language_loss": 0.82644135, + "learning_rate": 2.5737039244838565e-07, + "loss": 0.85098028, + "num_input_tokens_seen": 301921245, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.19958496, + "step": 13998, + "time_per_iteration": 2.8125858306884766 + }, + { + "auxiliary_loss_clip": 0.0139991, + "auxiliary_loss_mlp": 0.01032095, + "balance_loss_clip": 1.23861814, + "balance_loss_mlp": 1.01228273, + "epoch": 0.8416654140989027, + "flos": 26116602537600.0, + "grad_norm": 1.4281029134850054, + "language_loss": 0.80963486, + "learning_rate": 2.5717930710982984e-07, + "loss": 0.83395493, + "num_input_tokens_seen": 301942320, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19812012, + "step": 13999, + "time_per_iteration": 2.904327869415283 + }, + { + "auxiliary_loss_clip": 0.01417267, + "auxiliary_loss_mlp": 0.0103735, + "balance_loss_clip": 1.25314426, + "balance_loss_mlp": 1.01709592, + "epoch": 0.8417255373515707, + "flos": 26444691717120.0, + "grad_norm": 2.614780787618831, + "language_loss": 0.67592084, + "learning_rate": 2.569882878592096e-07, + "loss": 0.70046699, + "num_input_tokens_seen": 301963110, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.20251465, + "step": 14000, + "time_per_iteration": 2.9527430534362793 + }, + { + "auxiliary_loss_clip": 0.01413144, + "auxiliary_loss_mlp": 0.01031231, + "balance_loss_clip": 1.25063562, + "balance_loss_mlp": 1.01199055, + "epoch": 0.8417856606042387, + "flos": 24728541989760.0, + "grad_norm": 2.0688942998555673, + "language_loss": 0.80078554, + "learning_rate": 2.5679733470376885e-07, + "loss": 0.82522935, + "num_input_tokens_seen": 301984915, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19238281, + "step": 14001, + "time_per_iteration": 2.8715648651123047 + }, + { + "auxiliary_loss_clip": 0.01388933, + "auxiliary_loss_mlp": 0.01028929, + "balance_loss_clip": 1.22924399, + "balance_loss_mlp": 1.01111901, + "epoch": 0.8418457838569067, + "flos": 20860796148480.0, + "grad_norm": 1.633157585357178, + "language_loss": 0.79269814, + "learning_rate": 2.5660644765074703e-07, + "loss": 0.81687671, + "num_input_tokens_seen": 302004095, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.17810059, + "step": 14002, + "time_per_iteration": 4.279980659484863 + }, + { + "auxiliary_loss_clip": 0.01387431, + "auxiliary_loss_mlp": 0.01033042, + "balance_loss_clip": 1.22944629, + "balance_loss_mlp": 1.01344419, + "epoch": 0.8419059071095746, + "flos": 28673077536000.0, + "grad_norm": 1.464944761651403, + "language_loss": 0.78592384, + "learning_rate": 2.5641562670738334e-07, + "loss": 0.81012857, + "num_input_tokens_seen": 302027250, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.19604492, + "step": 14003, + "time_per_iteration": 2.9127037525177 + }, + { + "auxiliary_loss_clip": 0.01399814, + "auxiliary_loss_mlp": 0.0103747, + "balance_loss_clip": 1.23929751, + "balance_loss_mlp": 1.01732373, + "epoch": 0.8419660303622426, + "flos": 21663767197440.0, + "grad_norm": 1.611493621218885, + "language_loss": 0.66163713, + "learning_rate": 2.5622487188091436e-07, + "loss": 0.68601, + "num_input_tokens_seen": 302046950, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.20153809, + "step": 14004, + "time_per_iteration": 2.8231866359710693 + }, + { + "auxiliary_loss_clip": 0.01414874, + "auxiliary_loss_mlp": 0.01034245, + "balance_loss_clip": 1.25103974, + "balance_loss_mlp": 1.01439667, + "epoch": 0.8420261536149106, + "flos": 25311957431040.0, + "grad_norm": 2.248147721712523, + "language_loss": 0.76975018, + "learning_rate": 2.560341831785724e-07, + "loss": 0.79424137, + "num_input_tokens_seen": 302065470, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19848633, + "step": 14005, + "time_per_iteration": 4.3720128536224365 + }, + { + "auxiliary_loss_clip": 0.01410089, + "auxiliary_loss_mlp": 0.01033318, + "balance_loss_clip": 1.24692369, + "balance_loss_mlp": 1.01382709, + "epoch": 0.8420862768675785, + "flos": 18770787486720.0, + "grad_norm": 1.6737763087526984, + "language_loss": 0.78879356, + "learning_rate": 2.5584356060758906e-07, + "loss": 0.81322759, + "num_input_tokens_seen": 302083190, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19494629, + "step": 14006, + "time_per_iteration": 2.8771321773529053 + }, + { + "auxiliary_loss_clip": 0.01394934, + "auxiliary_loss_mlp": 0.01032776, + "balance_loss_clip": 1.23615634, + "balance_loss_mlp": 1.01414371, + "epoch": 0.8421464001202466, + "flos": 18335974527360.0, + "grad_norm": 1.7427925446765211, + "language_loss": 0.77876103, + "learning_rate": 2.556530041751932e-07, + "loss": 0.80303812, + "num_input_tokens_seen": 302098820, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18640137, + "step": 14007, + "time_per_iteration": 2.8132781982421875 + }, + { + "auxiliary_loss_clip": 0.01410241, + "auxiliary_loss_mlp": 0.01033572, + "balance_loss_clip": 1.24856305, + "balance_loss_mlp": 1.01416469, + "epoch": 0.8422065233729145, + "flos": 31548864222720.0, + "grad_norm": 1.7885499402004554, + "language_loss": 0.66601896, + "learning_rate": 2.554625138886102e-07, + "loss": 0.69045711, + "num_input_tokens_seen": 302117075, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.1940918, + "step": 14008, + "time_per_iteration": 2.9286139011383057 + }, + { + "auxiliary_loss_clip": 0.01186723, + "auxiliary_loss_mlp": 0.01028146, + "balance_loss_clip": 1.09479237, + "balance_loss_mlp": 1.00602078, + "epoch": 0.8422666466255825, + "flos": 64326933924480.0, + "grad_norm": 0.7185765739731961, + "language_loss": 0.57020092, + "learning_rate": 2.552720897550631e-07, + "loss": 0.59234965, + "num_input_tokens_seen": 302179735, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.22167969, + "step": 14009, + "time_per_iteration": 3.4123823642730713 + }, + { + "auxiliary_loss_clip": 0.01394996, + "auxiliary_loss_mlp": 0.01033248, + "balance_loss_clip": 1.23763061, + "balance_loss_mlp": 1.01550937, + "epoch": 0.8423267698782504, + "flos": 24327508913280.0, + "grad_norm": 1.193180299352751, + "language_loss": 0.78302956, + "learning_rate": 2.5508173178177304e-07, + "loss": 0.80731201, + "num_input_tokens_seen": 302202055, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.17724609, + "step": 14010, + "time_per_iteration": 4.339692115783691 + }, + { + "auxiliary_loss_clip": 0.01412843, + "auxiliary_loss_mlp": 0.0104226, + "balance_loss_clip": 1.25071514, + "balance_loss_mlp": 1.02169657, + "epoch": 0.8423868931309184, + "flos": 18305180801280.0, + "grad_norm": 1.8978786799752194, + "language_loss": 0.7359935, + "learning_rate": 2.548914399759592e-07, + "loss": 0.76054454, + "num_input_tokens_seen": 302221360, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.20556641, + "step": 14011, + "time_per_iteration": 4.234750986099243 + }, + { + "auxiliary_loss_clip": 0.01407691, + "auxiliary_loss_mlp": 0.01033721, + "balance_loss_clip": 1.24724996, + "balance_loss_mlp": 1.01488614, + "epoch": 0.8424470163835863, + "flos": 23560715721600.0, + "grad_norm": 5.899233809465174, + "language_loss": 0.85457611, + "learning_rate": 2.5470121434483636e-07, + "loss": 0.87899023, + "num_input_tokens_seen": 302240715, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18835449, + "step": 14012, + "time_per_iteration": 2.871147871017456 + }, + { + "auxiliary_loss_clip": 0.0137625, + "auxiliary_loss_mlp": 0.01031858, + "balance_loss_clip": 1.22407401, + "balance_loss_mlp": 1.0122602, + "epoch": 0.8425071396362543, + "flos": 23780135594880.0, + "grad_norm": 1.7313952169243632, + "language_loss": 0.68472558, + "learning_rate": 2.5451105489561884e-07, + "loss": 0.70880663, + "num_input_tokens_seen": 302260950, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.19604492, + "step": 14013, + "time_per_iteration": 2.9019460678100586 + }, + { + "auxiliary_loss_clip": 0.01402437, + "auxiliary_loss_mlp": 0.01034485, + "balance_loss_clip": 1.23883629, + "balance_loss_mlp": 1.01500654, + "epoch": 0.8425672628889223, + "flos": 16187183591040.0, + "grad_norm": 2.184885725926289, + "language_loss": 0.79905903, + "learning_rate": 2.5432096163551644e-07, + "loss": 0.82342827, + "num_input_tokens_seen": 302277500, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19494629, + "step": 14014, + "time_per_iteration": 2.8369927406311035 + }, + { + "auxiliary_loss_clip": 0.01398523, + "auxiliary_loss_mlp": 0.01036122, + "balance_loss_clip": 1.2377454, + "balance_loss_mlp": 1.01653612, + "epoch": 0.8426273861415903, + "flos": 23159592155520.0, + "grad_norm": 2.6997474836999675, + "language_loss": 0.68389481, + "learning_rate": 2.5413093457173884e-07, + "loss": 0.70824122, + "num_input_tokens_seen": 302297930, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19580078, + "step": 14015, + "time_per_iteration": 2.821166753768921 + }, + { + "auxiliary_loss_clip": 0.01405811, + "auxiliary_loss_mlp": 0.0103282, + "balance_loss_clip": 1.24511862, + "balance_loss_mlp": 1.01359177, + "epoch": 0.8426875093942582, + "flos": 17466846301440.0, + "grad_norm": 3.8025805908371404, + "language_loss": 0.76868343, + "learning_rate": 2.5394097371149036e-07, + "loss": 0.79306972, + "num_input_tokens_seen": 302315735, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19226074, + "step": 14016, + "time_per_iteration": 2.8158862590789795 + }, + { + "auxiliary_loss_clip": 0.01401445, + "auxiliary_loss_mlp": 0.01037529, + "balance_loss_clip": 1.24006104, + "balance_loss_mlp": 1.01813316, + "epoch": 0.8427476326469262, + "flos": 19648557469440.0, + "grad_norm": 1.9541873987553064, + "language_loss": 0.80024534, + "learning_rate": 2.5375107906197544e-07, + "loss": 0.82463515, + "num_input_tokens_seen": 302332790, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19384766, + "step": 14017, + "time_per_iteration": 2.81432843208313 + }, + { + "auxiliary_loss_clip": 0.01397291, + "auxiliary_loss_mlp": 0.01033763, + "balance_loss_clip": 1.23745728, + "balance_loss_mlp": 1.01515448, + "epoch": 0.8428077558995941, + "flos": 11946936159360.0, + "grad_norm": 2.6194199379643544, + "language_loss": 0.63767236, + "learning_rate": 2.5356125063039525e-07, + "loss": 0.66198289, + "num_input_tokens_seen": 302346490, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18603516, + "step": 14018, + "time_per_iteration": 2.7852909564971924 + }, + { + "auxiliary_loss_clip": 0.0139755, + "auxiliary_loss_mlp": 0.01035351, + "balance_loss_clip": 1.23768008, + "balance_loss_mlp": 1.01673102, + "epoch": 0.8428678791522621, + "flos": 10458802817280.0, + "grad_norm": 1.843133297419182, + "language_loss": 0.7968778, + "learning_rate": 2.5337148842394687e-07, + "loss": 0.82120681, + "num_input_tokens_seen": 302363235, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18615723, + "step": 14019, + "time_per_iteration": 2.961827516555786 + }, + { + "auxiliary_loss_clip": 0.01408013, + "auxiliary_loss_mlp": 0.01032675, + "balance_loss_clip": 1.24648499, + "balance_loss_mlp": 1.01389933, + "epoch": 0.8429280024049302, + "flos": 28778986909440.0, + "grad_norm": 2.0026458643645126, + "language_loss": 0.79005104, + "learning_rate": 2.531817924498265e-07, + "loss": 0.81445789, + "num_input_tokens_seen": 302383270, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18774414, + "step": 14020, + "time_per_iteration": 2.9138615131378174 + }, + { + "auxiliary_loss_clip": 0.01402679, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.24252677, + "balance_loss_mlp": 1.01335919, + "epoch": 0.8429881256575981, + "flos": 19546629638400.0, + "grad_norm": 1.7089695759067411, + "language_loss": 0.72037727, + "learning_rate": 2.5299216271522805e-07, + "loss": 0.74472541, + "num_input_tokens_seen": 302401355, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18762207, + "step": 14021, + "time_per_iteration": 2.829550266265869 + }, + { + "auxiliary_loss_clip": 0.01420801, + "auxiliary_loss_mlp": 0.01036676, + "balance_loss_clip": 1.25734258, + "balance_loss_mlp": 1.01669693, + "epoch": 0.8430482489102661, + "flos": 24801802600320.0, + "grad_norm": 2.2519606427381214, + "language_loss": 0.70545435, + "learning_rate": 2.5280259922734125e-07, + "loss": 0.73002911, + "num_input_tokens_seen": 302419515, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.19995117, + "step": 14022, + "time_per_iteration": 2.8840301036834717 + }, + { + "auxiliary_loss_clip": 0.01417491, + "auxiliary_loss_mlp": 0.01035859, + "balance_loss_clip": 1.25436699, + "balance_loss_mlp": 1.01648736, + "epoch": 0.843108372162934, + "flos": 21554419219200.0, + "grad_norm": 3.157009418094002, + "language_loss": 0.73144579, + "learning_rate": 2.526131019933553e-07, + "loss": 0.75597924, + "num_input_tokens_seen": 302438280, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19360352, + "step": 14023, + "time_per_iteration": 2.893638849258423 + }, + { + "auxiliary_loss_clip": 0.01401371, + "auxiliary_loss_mlp": 0.01037044, + "balance_loss_clip": 1.24172664, + "balance_loss_mlp": 1.01648021, + "epoch": 0.843168495415602, + "flos": 24619691704320.0, + "grad_norm": 1.4938412093004372, + "language_loss": 0.67298329, + "learning_rate": 2.524236710204559e-07, + "loss": 0.69736743, + "num_input_tokens_seen": 302460860, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.20568848, + "step": 14024, + "time_per_iteration": 2.92244815826416 + }, + { + "auxiliary_loss_clip": 0.01388652, + "auxiliary_loss_mlp": 0.01035314, + "balance_loss_clip": 1.23159051, + "balance_loss_mlp": 1.0160737, + "epoch": 0.8432286186682699, + "flos": 15131239009920.0, + "grad_norm": 1.667957607068702, + "language_loss": 0.81677973, + "learning_rate": 2.522343063158261e-07, + "loss": 0.84101939, + "num_input_tokens_seen": 302476980, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.19250488, + "step": 14025, + "time_per_iteration": 2.8040194511413574 + }, + { + "auxiliary_loss_clip": 0.01396167, + "auxiliary_loss_mlp": 0.01033969, + "balance_loss_clip": 1.23940563, + "balance_loss_mlp": 1.01594448, + "epoch": 0.843288741920938, + "flos": 20311251079680.0, + "grad_norm": 1.9945489373733447, + "language_loss": 0.77704805, + "learning_rate": 2.5204500788664606e-07, + "loss": 0.8013494, + "num_input_tokens_seen": 302496380, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.18029785, + "step": 14026, + "time_per_iteration": 2.876418113708496 + }, + { + "auxiliary_loss_clip": 0.01388701, + "auxiliary_loss_mlp": 0.01031817, + "balance_loss_clip": 1.23132992, + "balance_loss_mlp": 1.01251709, + "epoch": 0.8433488651736059, + "flos": 23342743681920.0, + "grad_norm": 1.4648095541988384, + "language_loss": 0.83399206, + "learning_rate": 2.518557757400945e-07, + "loss": 0.85819727, + "num_input_tokens_seen": 302516845, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.19299316, + "step": 14027, + "time_per_iteration": 2.8891735076904297 + }, + { + "auxiliary_loss_clip": 0.01393836, + "auxiliary_loss_mlp": 0.01031892, + "balance_loss_clip": 1.23467183, + "balance_loss_mlp": 1.01253247, + "epoch": 0.8434089884262739, + "flos": 39472755828480.0, + "grad_norm": 1.378020030534551, + "language_loss": 0.56909472, + "learning_rate": 2.5166660988334754e-07, + "loss": 0.59335202, + "num_input_tokens_seen": 302538865, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.19360352, + "step": 14028, + "time_per_iteration": 3.07761812210083 + }, + { + "auxiliary_loss_clip": 0.01400661, + "auxiliary_loss_mlp": 0.01030434, + "balance_loss_clip": 1.24057674, + "balance_loss_mlp": 1.01138484, + "epoch": 0.8434691116789418, + "flos": 23779321188480.0, + "grad_norm": 1.69082563606802, + "language_loss": 0.6446172, + "learning_rate": 2.51477510323578e-07, + "loss": 0.66892815, + "num_input_tokens_seen": 302557970, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19042969, + "step": 14029, + "time_per_iteration": 2.9855735301971436 + }, + { + "auxiliary_loss_clip": 0.01392125, + "auxiliary_loss_mlp": 0.01028553, + "balance_loss_clip": 1.23664927, + "balance_loss_mlp": 1.00891948, + "epoch": 0.8435292349316098, + "flos": 22680864478080.0, + "grad_norm": 1.6694303451597923, + "language_loss": 0.76098561, + "learning_rate": 2.51288477067956e-07, + "loss": 0.78519237, + "num_input_tokens_seen": 302578915, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.19628906, + "step": 14030, + "time_per_iteration": 2.9074184894561768 + }, + { + "auxiliary_loss_clip": 0.01386752, + "auxiliary_loss_mlp": 0.01035388, + "balance_loss_clip": 1.23079336, + "balance_loss_mlp": 1.0164454, + "epoch": 0.8435893581842777, + "flos": 18852916078080.0, + "grad_norm": 1.801988249929396, + "language_loss": 0.84223819, + "learning_rate": 2.510995101236502e-07, + "loss": 0.86645961, + "num_input_tokens_seen": 302596300, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.18933105, + "step": 14031, + "time_per_iteration": 2.899329423904419 + }, + { + "auxiliary_loss_clip": 0.01393593, + "auxiliary_loss_mlp": 0.01033777, + "balance_loss_clip": 1.23636222, + "balance_loss_mlp": 1.01484632, + "epoch": 0.8436494814369457, + "flos": 20713958213760.0, + "grad_norm": 3.6896473173667053, + "language_loss": 0.81289172, + "learning_rate": 2.509106094978266e-07, + "loss": 0.83716547, + "num_input_tokens_seen": 302614975, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18920898, + "step": 14032, + "time_per_iteration": 2.9662675857543945 + }, + { + "auxiliary_loss_clip": 0.01400196, + "auxiliary_loss_mlp": 0.01035543, + "balance_loss_clip": 1.24037409, + "balance_loss_mlp": 1.0157423, + "epoch": 0.8437096046896138, + "flos": 22684348327680.0, + "grad_norm": 1.525456629786665, + "language_loss": 0.76273894, + "learning_rate": 2.507217751976478e-07, + "loss": 0.78709632, + "num_input_tokens_seen": 302636415, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.19812012, + "step": 14033, + "time_per_iteration": 3.0021893978118896 + }, + { + "auxiliary_loss_clip": 0.01400641, + "auxiliary_loss_mlp": 0.01033509, + "balance_loss_clip": 1.24146628, + "balance_loss_mlp": 1.01513863, + "epoch": 0.8437697279422817, + "flos": 16188721914240.0, + "grad_norm": 1.7652070908045971, + "language_loss": 0.83980298, + "learning_rate": 2.505330072302743e-07, + "loss": 0.86414444, + "num_input_tokens_seen": 302653605, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18371582, + "step": 14034, + "time_per_iteration": 2.9431095123291016 + }, + { + "auxiliary_loss_clip": 0.01392578, + "auxiliary_loss_mlp": 0.01032036, + "balance_loss_clip": 1.23262691, + "balance_loss_mlp": 1.01235414, + "epoch": 0.8438298511949497, + "flos": 28777222362240.0, + "grad_norm": 1.5164761923902352, + "language_loss": 0.78895748, + "learning_rate": 2.503443056028656e-07, + "loss": 0.81320357, + "num_input_tokens_seen": 302673965, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19665527, + "step": 14035, + "time_per_iteration": 2.912457227706909 + }, + { + "auxiliary_loss_clip": 0.01400127, + "auxiliary_loss_mlp": 0.01033985, + "balance_loss_clip": 1.24066854, + "balance_loss_mlp": 1.01473308, + "epoch": 0.8438899744476176, + "flos": 33736004766720.0, + "grad_norm": 1.5953412901726294, + "language_loss": 0.72733366, + "learning_rate": 2.501556703225751e-07, + "loss": 0.75167477, + "num_input_tokens_seen": 302695560, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.19250488, + "step": 14036, + "time_per_iteration": 2.972230911254883 + }, + { + "auxiliary_loss_clip": 0.01390745, + "auxiliary_loss_mlp": 0.01029852, + "balance_loss_clip": 1.23520398, + "balance_loss_mlp": 1.01175594, + "epoch": 0.8439500977002856, + "flos": 25119530720640.0, + "grad_norm": 1.679280939655633, + "language_loss": 0.69888151, + "learning_rate": 2.49967101396557e-07, + "loss": 0.72308743, + "num_input_tokens_seen": 302713480, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.18103027, + "step": 14037, + "time_per_iteration": 4.297449827194214 + }, + { + "auxiliary_loss_clip": 0.01385519, + "auxiliary_loss_mlp": 0.01034077, + "balance_loss_clip": 1.22826636, + "balance_loss_mlp": 1.01522994, + "epoch": 0.8440102209529535, + "flos": 32862306816000.0, + "grad_norm": 1.5125375961035357, + "language_loss": 0.69863766, + "learning_rate": 2.4977859883196227e-07, + "loss": 0.72283363, + "num_input_tokens_seen": 302736860, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18859863, + "step": 14038, + "time_per_iteration": 2.9468817710876465 + }, + { + "auxiliary_loss_clip": 0.01395971, + "auxiliary_loss_mlp": 0.01032455, + "balance_loss_clip": 1.23572791, + "balance_loss_mlp": 1.01277399, + "epoch": 0.8440703442056215, + "flos": 23740111929600.0, + "grad_norm": 1.6153001544148324, + "language_loss": 0.7682991, + "learning_rate": 2.49590162635938e-07, + "loss": 0.79258335, + "num_input_tokens_seen": 302757745, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19689941, + "step": 14039, + "time_per_iteration": 2.8628385066986084 + }, + { + "auxiliary_loss_clip": 0.0141767, + "auxiliary_loss_mlp": 0.01030585, + "balance_loss_clip": 1.25212717, + "balance_loss_mlp": 1.01234567, + "epoch": 0.8441304674582895, + "flos": 20203396179840.0, + "grad_norm": 1.9768555654996691, + "language_loss": 0.80054134, + "learning_rate": 2.4940179281563046e-07, + "loss": 0.82502389, + "num_input_tokens_seen": 302774885, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.18237305, + "step": 14040, + "time_per_iteration": 4.302589654922485 + }, + { + "auxiliary_loss_clip": 0.01397663, + "auxiliary_loss_mlp": 0.01034575, + "balance_loss_clip": 1.2381866, + "balance_loss_mlp": 1.01541829, + "epoch": 0.8441905907109575, + "flos": 20226905493120.0, + "grad_norm": 1.9506797322979943, + "language_loss": 0.70340782, + "learning_rate": 2.492134893781821e-07, + "loss": 0.72773015, + "num_input_tokens_seen": 302791035, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19165039, + "step": 14041, + "time_per_iteration": 2.8281705379486084 + }, + { + "auxiliary_loss_clip": 0.01398986, + "auxiliary_loss_mlp": 0.01036388, + "balance_loss_clip": 1.236763, + "balance_loss_mlp": 1.01600313, + "epoch": 0.8442507139636254, + "flos": 13524301526400.0, + "grad_norm": 4.700954723765955, + "language_loss": 0.70168012, + "learning_rate": 2.490252523307341e-07, + "loss": 0.72603381, + "num_input_tokens_seen": 302808650, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.20397949, + "step": 14042, + "time_per_iteration": 2.8498895168304443 + }, + { + "auxiliary_loss_clip": 0.01389365, + "auxiliary_loss_mlp": 0.0103246, + "balance_loss_clip": 1.23159695, + "balance_loss_mlp": 1.01344562, + "epoch": 0.8443108372162934, + "flos": 18228526830720.0, + "grad_norm": 1.9204816030127811, + "language_loss": 0.75885171, + "learning_rate": 2.4883708168042373e-07, + "loss": 0.78306997, + "num_input_tokens_seen": 302824605, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.19018555, + "step": 14043, + "time_per_iteration": 2.810023069381714 + }, + { + "auxiliary_loss_clip": 0.01394173, + "auxiliary_loss_mlp": 0.01031216, + "balance_loss_clip": 1.23521185, + "balance_loss_mlp": 1.01288188, + "epoch": 0.8443709604689613, + "flos": 16113153818880.0, + "grad_norm": 2.2241752936274186, + "language_loss": 0.72868967, + "learning_rate": 2.486489774343865e-07, + "loss": 0.75294352, + "num_input_tokens_seen": 302840170, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18334961, + "step": 14044, + "time_per_iteration": 2.8344521522521973 + }, + { + "auxiliary_loss_clip": 0.01382622, + "auxiliary_loss_mlp": 0.010322, + "balance_loss_clip": 1.22641492, + "balance_loss_mlp": 1.01350784, + "epoch": 0.8444310837216293, + "flos": 18520528642560.0, + "grad_norm": 1.60083202108416, + "language_loss": 0.75549293, + "learning_rate": 2.484609395997559e-07, + "loss": 0.77964115, + "num_input_tokens_seen": 302858320, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.18688965, + "step": 14045, + "time_per_iteration": 4.23340630531311 + }, + { + "auxiliary_loss_clip": 0.01390044, + "auxiliary_loss_mlp": 0.01033152, + "balance_loss_clip": 1.2318424, + "balance_loss_mlp": 1.01423311, + "epoch": 0.8444912069742974, + "flos": 14948177973120.0, + "grad_norm": 7.105024923158671, + "language_loss": 0.79303914, + "learning_rate": 2.4827296818366216e-07, + "loss": 0.81727111, + "num_input_tokens_seen": 302875255, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18908691, + "step": 14046, + "time_per_iteration": 4.300089120864868 + }, + { + "auxiliary_loss_clip": 0.0139512, + "auxiliary_loss_mlp": 0.01031608, + "balance_loss_clip": 1.23520589, + "balance_loss_mlp": 1.01203382, + "epoch": 0.8445513302269653, + "flos": 20129864100480.0, + "grad_norm": 2.0641921791428137, + "language_loss": 0.78594232, + "learning_rate": 2.4808506319323255e-07, + "loss": 0.81020957, + "num_input_tokens_seen": 302894690, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19580078, + "step": 14047, + "time_per_iteration": 2.8362908363342285 + }, + { + "auxiliary_loss_clip": 0.01393087, + "auxiliary_loss_mlp": 0.0103231, + "balance_loss_clip": 1.23529971, + "balance_loss_mlp": 1.01289129, + "epoch": 0.8446114534796333, + "flos": 31182153966720.0, + "grad_norm": 1.7229578097202274, + "language_loss": 0.72707033, + "learning_rate": 2.478972246355935e-07, + "loss": 0.7513243, + "num_input_tokens_seen": 302912405, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.19421387, + "step": 14048, + "time_per_iteration": 2.9037089347839355 + }, + { + "auxiliary_loss_clip": 0.01397783, + "auxiliary_loss_mlp": 0.01035901, + "balance_loss_clip": 1.23913991, + "balance_loss_mlp": 1.01649404, + "epoch": 0.8446715767323012, + "flos": 23958083969280.0, + "grad_norm": 1.5303384440972754, + "language_loss": 0.73866558, + "learning_rate": 2.477094525178667e-07, + "loss": 0.7630024, + "num_input_tokens_seen": 302932525, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.19421387, + "step": 14049, + "time_per_iteration": 3.0763416290283203 + }, + { + "auxiliary_loss_clip": 0.01183768, + "auxiliary_loss_mlp": 0.01023691, + "balance_loss_clip": 1.09254694, + "balance_loss_mlp": 1.00023019, + "epoch": 0.8447316999849692, + "flos": 68015102578560.0, + "grad_norm": 0.8098737491884259, + "language_loss": 0.60723925, + "learning_rate": 2.475217468471729e-07, + "loss": 0.62931383, + "num_input_tokens_seen": 302991285, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.234375, + "step": 14050, + "time_per_iteration": 3.2833986282348633 + }, + { + "auxiliary_loss_clip": 0.01390255, + "auxiliary_loss_mlp": 0.01031518, + "balance_loss_clip": 1.23118401, + "balance_loss_mlp": 1.01234961, + "epoch": 0.8447918232376371, + "flos": 22429067310720.0, + "grad_norm": 2.2687726556183576, + "language_loss": 0.73104244, + "learning_rate": 2.473341076306303e-07, + "loss": 0.75526011, + "num_input_tokens_seen": 303009515, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.19165039, + "step": 14051, + "time_per_iteration": 2.8351612091064453 + }, + { + "auxiliary_loss_clip": 0.01390097, + "auxiliary_loss_mlp": 0.01034769, + "balance_loss_clip": 1.23277211, + "balance_loss_mlp": 1.01567125, + "epoch": 0.8448519464903052, + "flos": 23704160296320.0, + "grad_norm": 1.8473062092196968, + "language_loss": 0.75881815, + "learning_rate": 2.471465348753547e-07, + "loss": 0.78306687, + "num_input_tokens_seen": 303026905, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.19116211, + "step": 14052, + "time_per_iteration": 2.8575470447540283 + }, + { + "auxiliary_loss_clip": 0.01377605, + "auxiliary_loss_mlp": 0.01033596, + "balance_loss_clip": 1.2247299, + "balance_loss_mlp": 1.01596463, + "epoch": 0.8449120697429731, + "flos": 13743947623680.0, + "grad_norm": 1.8851632553705528, + "language_loss": 0.74547708, + "learning_rate": 2.469590285884575e-07, + "loss": 0.76958907, + "num_input_tokens_seen": 303045245, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.17626953, + "step": 14053, + "time_per_iteration": 2.8978829383850098 + }, + { + "auxiliary_loss_clip": 0.01400379, + "auxiliary_loss_mlp": 0.01031482, + "balance_loss_clip": 1.2424866, + "balance_loss_mlp": 1.01244485, + "epoch": 0.8449721929956411, + "flos": 20896612047360.0, + "grad_norm": 1.7014553033474542, + "language_loss": 0.75862479, + "learning_rate": 2.467715887770494e-07, + "loss": 0.78294337, + "num_input_tokens_seen": 303065205, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.19030762, + "step": 14054, + "time_per_iteration": 2.894753932952881 + }, + { + "auxiliary_loss_clip": 0.01417086, + "auxiliary_loss_mlp": 0.01030016, + "balance_loss_clip": 1.25263476, + "balance_loss_mlp": 1.01093078, + "epoch": 0.845032316248309, + "flos": 33229424275200.0, + "grad_norm": 3.671333187467097, + "language_loss": 0.78786051, + "learning_rate": 2.4658421544823895e-07, + "loss": 0.81233156, + "num_input_tokens_seen": 303088250, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19091797, + "step": 14055, + "time_per_iteration": 2.957921266555786 + }, + { + "auxiliary_loss_clip": 0.01391817, + "auxiliary_loss_mlp": 0.01032245, + "balance_loss_clip": 1.23378897, + "balance_loss_mlp": 1.01295674, + "epoch": 0.845092439500977, + "flos": 23595355255680.0, + "grad_norm": 1.7689835384333112, + "language_loss": 0.73786122, + "learning_rate": 2.463969086091302e-07, + "loss": 0.76210189, + "num_input_tokens_seen": 303109280, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.19287109, + "step": 14056, + "time_per_iteration": 2.851419687271118 + }, + { + "auxiliary_loss_clip": 0.01406166, + "auxiliary_loss_mlp": 0.01037132, + "balance_loss_clip": 1.24323201, + "balance_loss_mlp": 1.01754582, + "epoch": 0.8451525627536449, + "flos": 13341692937600.0, + "grad_norm": 2.6701080559504726, + "language_loss": 0.69171166, + "learning_rate": 2.4620966826682686e-07, + "loss": 0.71614468, + "num_input_tokens_seen": 303126075, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19604492, + "step": 14057, + "time_per_iteration": 2.814854621887207 + }, + { + "auxiliary_loss_clip": 0.01397435, + "auxiliary_loss_mlp": 0.01031392, + "balance_loss_clip": 1.23817134, + "balance_loss_mlp": 1.01174629, + "epoch": 0.8452126860063129, + "flos": 27829313660160.0, + "grad_norm": 1.7300387686375354, + "language_loss": 0.78243804, + "learning_rate": 2.460224944284284e-07, + "loss": 0.80672628, + "num_input_tokens_seen": 303146920, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.19665527, + "step": 14058, + "time_per_iteration": 2.8979766368865967 + }, + { + "auxiliary_loss_clip": 0.01403679, + "auxiliary_loss_mlp": 0.01036125, + "balance_loss_clip": 1.24287021, + "balance_loss_mlp": 1.01696837, + "epoch": 0.845272809258981, + "flos": 27135826323840.0, + "grad_norm": 1.8755313915809648, + "language_loss": 0.70109606, + "learning_rate": 2.45835387101033e-07, + "loss": 0.72549415, + "num_input_tokens_seen": 303167885, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.19177246, + "step": 14059, + "time_per_iteration": 2.8769338130950928 + }, + { + "auxiliary_loss_clip": 0.01417166, + "auxiliary_loss_mlp": 0.01035292, + "balance_loss_clip": 1.25158858, + "balance_loss_mlp": 1.01561022, + "epoch": 0.8453329325116489, + "flos": 18341675372160.0, + "grad_norm": 2.372029097125862, + "language_loss": 0.59000075, + "learning_rate": 2.4564834629173516e-07, + "loss": 0.61452538, + "num_input_tokens_seen": 303185000, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19689941, + "step": 14060, + "time_per_iteration": 2.881788492202759 + }, + { + "auxiliary_loss_clip": 0.01416742, + "auxiliary_loss_mlp": 0.01034933, + "balance_loss_clip": 1.2520566, + "balance_loss_mlp": 1.01490605, + "epoch": 0.8453930557643169, + "flos": 22685705671680.0, + "grad_norm": 1.7081899711250015, + "language_loss": 0.76331317, + "learning_rate": 2.454613720076277e-07, + "loss": 0.78782994, + "num_input_tokens_seen": 303205210, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.20019531, + "step": 14061, + "time_per_iteration": 2.9991226196289062 + }, + { + "auxiliary_loss_clip": 0.01403486, + "auxiliary_loss_mlp": 0.0103316, + "balance_loss_clip": 1.24060106, + "balance_loss_mlp": 1.01238239, + "epoch": 0.8454531790169848, + "flos": 22496219873280.0, + "grad_norm": 2.2299601130342364, + "language_loss": 0.71365166, + "learning_rate": 2.452744642558013e-07, + "loss": 0.73801816, + "num_input_tokens_seen": 303224655, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.2076416, + "step": 14062, + "time_per_iteration": 3.018763542175293 + }, + { + "auxiliary_loss_clip": 0.01183974, + "auxiliary_loss_mlp": 0.01024561, + "balance_loss_clip": 1.09306479, + "balance_loss_mlp": 1.00310373, + "epoch": 0.8455133022696528, + "flos": 58305058260480.0, + "grad_norm": 0.6321745036081599, + "language_loss": 0.52633214, + "learning_rate": 2.450876230433432e-07, + "loss": 0.54841751, + "num_input_tokens_seen": 303289645, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.21484375, + "step": 14063, + "time_per_iteration": 3.4655587673187256 + }, + { + "auxiliary_loss_clip": 0.01390443, + "auxiliary_loss_mlp": 0.01032613, + "balance_loss_clip": 1.23465967, + "balance_loss_mlp": 1.01455307, + "epoch": 0.8455734255223207, + "flos": 21371267692800.0, + "grad_norm": 1.9764796128059385, + "language_loss": 0.82882196, + "learning_rate": 2.449008483773378e-07, + "loss": 0.8530525, + "num_input_tokens_seen": 303308350, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.18054199, + "step": 14064, + "time_per_iteration": 2.8386967182159424 + }, + { + "auxiliary_loss_clip": 0.01409268, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.24672747, + "balance_loss_mlp": 1.01328182, + "epoch": 0.8456335487749888, + "flos": 20459039155200.0, + "grad_norm": 2.0067298271418528, + "language_loss": 0.73294067, + "learning_rate": 2.447141402648685e-07, + "loss": 0.75737, + "num_input_tokens_seen": 303325230, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.20385742, + "step": 14065, + "time_per_iteration": 2.8496978282928467 + }, + { + "auxiliary_loss_clip": 0.01379985, + "auxiliary_loss_mlp": 0.01031397, + "balance_loss_clip": 1.22485352, + "balance_loss_mlp": 1.01337278, + "epoch": 0.8456936720276567, + "flos": 28852835702400.0, + "grad_norm": 1.4510701780055577, + "language_loss": 0.78220761, + "learning_rate": 2.445274987130146e-07, + "loss": 0.80632144, + "num_input_tokens_seen": 303345810, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.18017578, + "step": 14066, + "time_per_iteration": 2.9248321056365967 + }, + { + "auxiliary_loss_clip": 0.01398685, + "auxiliary_loss_mlp": 0.01036135, + "balance_loss_clip": 1.23871207, + "balance_loss_mlp": 1.01718104, + "epoch": 0.8457537952803247, + "flos": 22682719514880.0, + "grad_norm": 1.6260914943367188, + "language_loss": 0.70481652, + "learning_rate": 2.4434092372885363e-07, + "loss": 0.72916472, + "num_input_tokens_seen": 303365140, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.1895752, + "step": 14067, + "time_per_iteration": 2.880302667617798 + }, + { + "auxiliary_loss_clip": 0.01396593, + "auxiliary_loss_mlp": 0.0103276, + "balance_loss_clip": 1.23845983, + "balance_loss_mlp": 1.0132935, + "epoch": 0.8458139185329926, + "flos": 33816142586880.0, + "grad_norm": 1.8111612529228476, + "language_loss": 0.72121322, + "learning_rate": 2.4415441531946144e-07, + "loss": 0.74550676, + "num_input_tokens_seen": 303386150, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.19458008, + "step": 14068, + "time_per_iteration": 2.9755825996398926 + }, + { + "auxiliary_loss_clip": 0.01185967, + "auxiliary_loss_mlp": 0.01044685, + "balance_loss_clip": 1.09431696, + "balance_loss_mlp": 1.01702833, + "epoch": 0.8458740417856606, + "flos": 70329960489600.0, + "grad_norm": 0.6937517197851208, + "language_loss": 0.60526264, + "learning_rate": 2.4396797349190976e-07, + "loss": 0.6275692, + "num_input_tokens_seen": 303453770, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.27734375, + "step": 14069, + "time_per_iteration": 3.4512226581573486 + }, + { + "auxiliary_loss_clip": 0.01397515, + "auxiliary_loss_mlp": 0.01030405, + "balance_loss_clip": 1.23582983, + "balance_loss_mlp": 1.01248813, + "epoch": 0.8459341650383285, + "flos": 24181621119360.0, + "grad_norm": 1.4284551395211194, + "language_loss": 0.74987113, + "learning_rate": 2.4378159825326804e-07, + "loss": 0.77415031, + "num_input_tokens_seen": 303474520, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.17932129, + "step": 14070, + "time_per_iteration": 2.9037904739379883 + }, + { + "auxiliary_loss_clip": 0.01394924, + "auxiliary_loss_mlp": 0.01031365, + "balance_loss_clip": 1.23614168, + "balance_loss_mlp": 1.01279235, + "epoch": 0.8459942882909965, + "flos": 38195491092480.0, + "grad_norm": 1.5570534143166308, + "language_loss": 0.67560756, + "learning_rate": 2.435952896106039e-07, + "loss": 0.69987047, + "num_input_tokens_seen": 303497345, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18579102, + "step": 14071, + "time_per_iteration": 2.975773572921753 + }, + { + "auxiliary_loss_clip": 0.01183041, + "auxiliary_loss_mlp": 0.01033356, + "balance_loss_clip": 1.09303582, + "balance_loss_mlp": 1.01227999, + "epoch": 0.8460544115436646, + "flos": 64147673450880.0, + "grad_norm": 0.738063260810199, + "language_loss": 0.61062402, + "learning_rate": 2.4340904757098313e-07, + "loss": 0.63278794, + "num_input_tokens_seen": 303554890, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.2109375, + "step": 14072, + "time_per_iteration": 4.529989957809448 + }, + { + "auxiliary_loss_clip": 0.01403547, + "auxiliary_loss_mlp": 0.01034101, + "balance_loss_clip": 1.24147129, + "balance_loss_mlp": 1.01363242, + "epoch": 0.8461145347963325, + "flos": 24181575874560.0, + "grad_norm": 1.8996454302059484, + "language_loss": 0.73456872, + "learning_rate": 2.4322287214146664e-07, + "loss": 0.75894523, + "num_input_tokens_seen": 303574380, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.20471191, + "step": 14073, + "time_per_iteration": 2.8788063526153564 + }, + { + "auxiliary_loss_clip": 0.01430995, + "auxiliary_loss_mlp": 0.0103184, + "balance_loss_clip": 1.26275611, + "balance_loss_mlp": 1.01112151, + "epoch": 0.8461746580490005, + "flos": 34906319498880.0, + "grad_norm": 1.5968170621135314, + "language_loss": 0.78562951, + "learning_rate": 2.430367633291155e-07, + "loss": 0.81025785, + "num_input_tokens_seen": 303594910, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.20703125, + "step": 14074, + "time_per_iteration": 2.974012613296509 + }, + { + "auxiliary_loss_clip": 0.01394466, + "auxiliary_loss_mlp": 0.01035326, + "balance_loss_clip": 1.23484445, + "balance_loss_mlp": 1.01577556, + "epoch": 0.8462347813016684, + "flos": 25568052854400.0, + "grad_norm": 2.2338889269943665, + "language_loss": 0.75849229, + "learning_rate": 2.4285072114098583e-07, + "loss": 0.78279018, + "num_input_tokens_seen": 303613520, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.19555664, + "step": 14075, + "time_per_iteration": 4.294775009155273 + }, + { + "auxiliary_loss_clip": 0.01384875, + "auxiliary_loss_mlp": 0.01034282, + "balance_loss_clip": 1.22780919, + "balance_loss_mlp": 1.01534009, + "epoch": 0.8462949045543364, + "flos": 21335451793920.0, + "grad_norm": 2.3466413463148883, + "language_loss": 0.74136019, + "learning_rate": 2.4266474558413355e-07, + "loss": 0.76555169, + "num_input_tokens_seen": 303631225, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18945312, + "step": 14076, + "time_per_iteration": 2.828092336654663 + }, + { + "auxiliary_loss_clip": 0.01408441, + "auxiliary_loss_mlp": 0.01034249, + "balance_loss_clip": 1.24548423, + "balance_loss_mlp": 1.01504481, + "epoch": 0.8463550278070043, + "flos": 22647537043200.0, + "grad_norm": 3.0719584013184726, + "language_loss": 0.78589386, + "learning_rate": 2.4247883666560945e-07, + "loss": 0.81032073, + "num_input_tokens_seen": 303649175, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.1920166, + "step": 14077, + "time_per_iteration": 2.826051712036133 + }, + { + "auxiliary_loss_clip": 0.01412899, + "auxiliary_loss_mlp": 0.01036898, + "balance_loss_clip": 1.24944556, + "balance_loss_mlp": 1.01662064, + "epoch": 0.8464151510596724, + "flos": 13013287044480.0, + "grad_norm": 1.937292384427296, + "language_loss": 0.76162827, + "learning_rate": 2.422929943924643e-07, + "loss": 0.78612626, + "num_input_tokens_seen": 303665915, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.20275879, + "step": 14078, + "time_per_iteration": 2.8548712730407715 + }, + { + "auxiliary_loss_clip": 0.01393524, + "auxiliary_loss_mlp": 0.01029148, + "balance_loss_clip": 1.2356565, + "balance_loss_mlp": 1.00968111, + "epoch": 0.8464752743123403, + "flos": 15713161372800.0, + "grad_norm": 2.1216094155452128, + "language_loss": 0.85598409, + "learning_rate": 2.4210721877174565e-07, + "loss": 0.88021088, + "num_input_tokens_seen": 303679985, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.19470215, + "step": 14079, + "time_per_iteration": 2.956251621246338 + }, + { + "auxiliary_loss_clip": 0.01442036, + "auxiliary_loss_mlp": 0.01033802, + "balance_loss_clip": 1.27260256, + "balance_loss_mlp": 1.01303577, + "epoch": 0.8465353975650083, + "flos": 21664219645440.0, + "grad_norm": 2.317006158762273, + "language_loss": 0.59383851, + "learning_rate": 2.419215098104965e-07, + "loss": 0.61859691, + "num_input_tokens_seen": 303698470, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.20751953, + "step": 14080, + "time_per_iteration": 4.298716068267822 + }, + { + "auxiliary_loss_clip": 0.01419281, + "auxiliary_loss_mlp": 0.01034224, + "balance_loss_clip": 1.25228941, + "balance_loss_mlp": 1.01475763, + "epoch": 0.8465955208176762, + "flos": 18524962632960.0, + "grad_norm": 13.785412436295507, + "language_loss": 0.67266643, + "learning_rate": 2.4173586751576014e-07, + "loss": 0.69720149, + "num_input_tokens_seen": 303716415, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.19470215, + "step": 14081, + "time_per_iteration": 4.385536193847656 + }, + { + "auxiliary_loss_clip": 0.01407131, + "auxiliary_loss_mlp": 0.01033886, + "balance_loss_clip": 1.24498224, + "balance_loss_mlp": 1.01532483, + "epoch": 0.8466556440703442, + "flos": 24209564423040.0, + "grad_norm": 2.266453586959632, + "language_loss": 0.73319328, + "learning_rate": 2.41550291894576e-07, + "loss": 0.75760347, + "num_input_tokens_seen": 303734490, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18579102, + "step": 14082, + "time_per_iteration": 2.8854384422302246 + }, + { + "auxiliary_loss_clip": 0.01401426, + "auxiliary_loss_mlp": 0.01032087, + "balance_loss_clip": 1.23974466, + "balance_loss_mlp": 1.01260829, + "epoch": 0.8467157673230121, + "flos": 20385552320640.0, + "grad_norm": 1.8504139442598664, + "language_loss": 0.76567352, + "learning_rate": 2.413647829539809e-07, + "loss": 0.79000866, + "num_input_tokens_seen": 303752310, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19482422, + "step": 14083, + "time_per_iteration": 2.8527870178222656 + }, + { + "auxiliary_loss_clip": 0.01407858, + "auxiliary_loss_mlp": 0.01031242, + "balance_loss_clip": 1.24415302, + "balance_loss_mlp": 1.01070178, + "epoch": 0.8467758905756801, + "flos": 28484858592000.0, + "grad_norm": 2.000356250133995, + "language_loss": 0.66680944, + "learning_rate": 2.411793407010092e-07, + "loss": 0.69120049, + "num_input_tokens_seen": 303776065, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.2052002, + "step": 14084, + "time_per_iteration": 2.9038772583007812 + }, + { + "auxiliary_loss_clip": 0.01398665, + "auxiliary_loss_mlp": 0.01033642, + "balance_loss_clip": 1.23997426, + "balance_loss_mlp": 1.01390123, + "epoch": 0.8468360138283482, + "flos": 11700025430400.0, + "grad_norm": 2.182498675441338, + "language_loss": 0.71277976, + "learning_rate": 2.409939651426938e-07, + "loss": 0.73710287, + "num_input_tokens_seen": 303793500, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.1973877, + "step": 14085, + "time_per_iteration": 2.844148874282837 + }, + { + "auxiliary_loss_clip": 0.0139605, + "auxiliary_loss_mlp": 0.01031603, + "balance_loss_clip": 1.23620784, + "balance_loss_mlp": 1.01287484, + "epoch": 0.8468961370810161, + "flos": 24618515339520.0, + "grad_norm": 1.5123008147622037, + "language_loss": 0.71697128, + "learning_rate": 2.408086562860634e-07, + "loss": 0.74124777, + "num_input_tokens_seen": 303814835, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18725586, + "step": 14086, + "time_per_iteration": 2.9022231101989746 + }, + { + "auxiliary_loss_clip": 0.01402756, + "auxiliary_loss_mlp": 0.01034711, + "balance_loss_clip": 1.24327636, + "balance_loss_mlp": 1.01529145, + "epoch": 0.8469562603336841, + "flos": 19619302066560.0, + "grad_norm": 1.765669431282174, + "language_loss": 0.7574113, + "learning_rate": 2.4062341413814445e-07, + "loss": 0.78178596, + "num_input_tokens_seen": 303834505, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.19433594, + "step": 14087, + "time_per_iteration": 2.9288957118988037 + }, + { + "auxiliary_loss_clip": 0.01399685, + "auxiliary_loss_mlp": 0.01029546, + "balance_loss_clip": 1.23958039, + "balance_loss_mlp": 1.01140189, + "epoch": 0.847016383586352, + "flos": 22649437324800.0, + "grad_norm": 1.5933509585098458, + "language_loss": 0.74588549, + "learning_rate": 2.4043823870596227e-07, + "loss": 0.77017784, + "num_input_tokens_seen": 303855050, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18139648, + "step": 14088, + "time_per_iteration": 2.8403165340423584 + }, + { + "auxiliary_loss_clip": 0.01404396, + "auxiliary_loss_mlp": 0.01032357, + "balance_loss_clip": 1.24326825, + "balance_loss_mlp": 1.0124495, + "epoch": 0.84707650683902, + "flos": 20970098881920.0, + "grad_norm": 2.0095607393170787, + "language_loss": 0.73267198, + "learning_rate": 2.402531299965387e-07, + "loss": 0.75703955, + "num_input_tokens_seen": 303875635, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19909668, + "step": 14089, + "time_per_iteration": 2.934007167816162 + }, + { + "auxiliary_loss_clip": 0.01396761, + "auxiliary_loss_mlp": 0.01033848, + "balance_loss_clip": 1.24021411, + "balance_loss_mlp": 1.01497698, + "epoch": 0.8471366300916879, + "flos": 24102478684800.0, + "grad_norm": 1.9950341709236414, + "language_loss": 0.79707968, + "learning_rate": 2.400680880168928e-07, + "loss": 0.82138574, + "num_input_tokens_seen": 303896750, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.1887207, + "step": 14090, + "time_per_iteration": 2.8600449562072754 + }, + { + "auxiliary_loss_clip": 0.01410367, + "auxiliary_loss_mlp": 0.01036223, + "balance_loss_clip": 1.24829602, + "balance_loss_mlp": 1.01682734, + "epoch": 0.847196753344356, + "flos": 18342082575360.0, + "grad_norm": 5.1436643837460725, + "language_loss": 0.78105164, + "learning_rate": 2.3988311277404085e-07, + "loss": 0.80551755, + "num_input_tokens_seen": 303915435, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19396973, + "step": 14091, + "time_per_iteration": 2.8164889812469482 + }, + { + "auxiliary_loss_clip": 0.01182556, + "auxiliary_loss_mlp": 0.01033066, + "balance_loss_clip": 1.09144866, + "balance_loss_mlp": 1.0090332, + "epoch": 0.8472568765970239, + "flos": 49595026671360.0, + "grad_norm": 0.8188321023788444, + "language_loss": 0.59393477, + "learning_rate": 2.396982042749982e-07, + "loss": 0.61609101, + "num_input_tokens_seen": 303977245, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.24023438, + "step": 14092, + "time_per_iteration": 3.4411823749542236 + }, + { + "auxiliary_loss_clip": 0.01401124, + "auxiliary_loss_mlp": 0.01037481, + "balance_loss_clip": 1.24010205, + "balance_loss_mlp": 1.0173347, + "epoch": 0.8473169998496919, + "flos": 19287864771840.0, + "grad_norm": 1.804308359681766, + "language_loss": 0.71115941, + "learning_rate": 2.395133625267756e-07, + "loss": 0.73554552, + "num_input_tokens_seen": 303996055, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.20166016, + "step": 14093, + "time_per_iteration": 2.8487629890441895 + }, + { + "auxiliary_loss_clip": 0.01381803, + "auxiliary_loss_mlp": 0.01028688, + "balance_loss_clip": 1.22579002, + "balance_loss_mlp": 1.01003218, + "epoch": 0.8473771231023598, + "flos": 17684411137920.0, + "grad_norm": 1.905838569111715, + "language_loss": 0.84005809, + "learning_rate": 2.3932858753638263e-07, + "loss": 0.86416304, + "num_input_tokens_seen": 304012205, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.18652344, + "step": 14094, + "time_per_iteration": 2.8359169960021973 + }, + { + "auxiliary_loss_clip": 0.01385691, + "auxiliary_loss_mlp": 0.01036919, + "balance_loss_clip": 1.23019695, + "balance_loss_mlp": 1.01758337, + "epoch": 0.8474372463550278, + "flos": 26371023903360.0, + "grad_norm": 1.7589072013503861, + "language_loss": 0.71892744, + "learning_rate": 2.3914387931082626e-07, + "loss": 0.74315357, + "num_input_tokens_seen": 304033475, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.1932373, + "step": 14095, + "time_per_iteration": 2.8978829383850098 + }, + { + "auxiliary_loss_clip": 0.01398435, + "auxiliary_loss_mlp": 0.0103556, + "balance_loss_clip": 1.24070072, + "balance_loss_mlp": 1.0158186, + "epoch": 0.8474973696076957, + "flos": 23411932260480.0, + "grad_norm": 3.6227634462899116, + "language_loss": 0.81938475, + "learning_rate": 2.3895923785711105e-07, + "loss": 0.84372473, + "num_input_tokens_seen": 304051845, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.1973877, + "step": 14096, + "time_per_iteration": 2.829822063446045 + }, + { + "auxiliary_loss_clip": 0.0141713, + "auxiliary_loss_mlp": 0.01036268, + "balance_loss_clip": 1.25204182, + "balance_loss_mlp": 1.01683712, + "epoch": 0.8475574928603637, + "flos": 25084755452160.0, + "grad_norm": 1.9476072205535784, + "language_loss": 0.78154504, + "learning_rate": 2.387746631822374e-07, + "loss": 0.80607903, + "num_input_tokens_seen": 304069965, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19433594, + "step": 14097, + "time_per_iteration": 2.863884687423706 + }, + { + "auxiliary_loss_clip": 0.01395354, + "auxiliary_loss_mlp": 0.01032088, + "balance_loss_clip": 1.23678565, + "balance_loss_mlp": 1.01350296, + "epoch": 0.8476176161130318, + "flos": 19974203429760.0, + "grad_norm": 1.909235513062762, + "language_loss": 0.81206727, + "learning_rate": 2.385901552932048e-07, + "loss": 0.83634162, + "num_input_tokens_seen": 304086805, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18579102, + "step": 14098, + "time_per_iteration": 2.8245351314544678 + }, + { + "auxiliary_loss_clip": 0.01391733, + "auxiliary_loss_mlp": 0.01037217, + "balance_loss_clip": 1.23385739, + "balance_loss_mlp": 1.01707101, + "epoch": 0.8476777393656997, + "flos": 21295156659840.0, + "grad_norm": 4.528267847629165, + "language_loss": 0.72699815, + "learning_rate": 2.3840571419701062e-07, + "loss": 0.7512877, + "num_input_tokens_seen": 304105865, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.20141602, + "step": 14099, + "time_per_iteration": 2.8637490272521973 + }, + { + "auxiliary_loss_clip": 0.01398262, + "auxiliary_loss_mlp": 0.01037553, + "balance_loss_clip": 1.23841059, + "balance_loss_mlp": 1.01667929, + "epoch": 0.8477378626183677, + "flos": 29983579217280.0, + "grad_norm": 3.0645556312214657, + "language_loss": 0.64311749, + "learning_rate": 2.3822133990064787e-07, + "loss": 0.6674757, + "num_input_tokens_seen": 304128300, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.20849609, + "step": 14100, + "time_per_iteration": 2.9130966663360596 + }, + { + "auxiliary_loss_clip": 0.01412719, + "auxiliary_loss_mlp": 0.01030886, + "balance_loss_clip": 1.24945569, + "balance_loss_mlp": 1.01140749, + "epoch": 0.8477979858710356, + "flos": 24246918645120.0, + "grad_norm": 2.0197228320848426, + "language_loss": 0.74640363, + "learning_rate": 2.380370324111085e-07, + "loss": 0.77083969, + "num_input_tokens_seen": 304143695, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19482422, + "step": 14101, + "time_per_iteration": 2.839693307876587 + }, + { + "auxiliary_loss_clip": 0.01404163, + "auxiliary_loss_mlp": 0.01030217, + "balance_loss_clip": 1.24379134, + "balance_loss_mlp": 1.01145363, + "epoch": 0.8478581091237036, + "flos": 25604773649280.0, + "grad_norm": 1.7983492302700896, + "language_loss": 0.72435027, + "learning_rate": 2.3785279173538163e-07, + "loss": 0.74869412, + "num_input_tokens_seen": 304165800, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18786621, + "step": 14102, + "time_per_iteration": 2.920447587966919 + }, + { + "auxiliary_loss_clip": 0.01409953, + "auxiliary_loss_mlp": 0.0103339, + "balance_loss_clip": 1.24599469, + "balance_loss_mlp": 1.01369679, + "epoch": 0.8479182323763715, + "flos": 12064609180800.0, + "grad_norm": 1.8631524375377486, + "language_loss": 0.82619357, + "learning_rate": 2.3766861788045366e-07, + "loss": 0.85062695, + "num_input_tokens_seen": 304182910, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19714355, + "step": 14103, + "time_per_iteration": 2.8113646507263184 + }, + { + "auxiliary_loss_clip": 0.01403205, + "auxiliary_loss_mlp": 0.01032945, + "balance_loss_clip": 1.24548137, + "balance_loss_mlp": 1.01388347, + "epoch": 0.8479783556290396, + "flos": 21443170959360.0, + "grad_norm": 2.7642406611944854, + "language_loss": 0.78930169, + "learning_rate": 2.374845108533079e-07, + "loss": 0.81366318, + "num_input_tokens_seen": 304200175, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.19055176, + "step": 14104, + "time_per_iteration": 2.847508192062378 + }, + { + "auxiliary_loss_clip": 0.01406072, + "auxiliary_loss_mlp": 0.01037004, + "balance_loss_clip": 1.2447778, + "balance_loss_mlp": 1.01694083, + "epoch": 0.8480384788817075, + "flos": 19651181667840.0, + "grad_norm": 2.113667057088085, + "language_loss": 0.79626828, + "learning_rate": 2.3730047066092607e-07, + "loss": 0.82069904, + "num_input_tokens_seen": 304217775, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.20056152, + "step": 14105, + "time_per_iteration": 2.8985023498535156 + }, + { + "auxiliary_loss_clip": 0.01424055, + "auxiliary_loss_mlp": 0.01037151, + "balance_loss_clip": 1.25854635, + "balance_loss_mlp": 1.01640892, + "epoch": 0.8480986021343755, + "flos": 22498663092480.0, + "grad_norm": 1.7366685444946626, + "language_loss": 0.50716525, + "learning_rate": 2.3711649731028749e-07, + "loss": 0.53177732, + "num_input_tokens_seen": 304235760, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.20739746, + "step": 14106, + "time_per_iteration": 2.8652591705322266 + }, + { + "auxiliary_loss_clip": 0.01393129, + "auxiliary_loss_mlp": 0.01033792, + "balance_loss_clip": 1.23404026, + "balance_loss_mlp": 1.01419449, + "epoch": 0.8481587253870434, + "flos": 22100616172800.0, + "grad_norm": 4.266370736721215, + "language_loss": 0.75738758, + "learning_rate": 2.3693259080836792e-07, + "loss": 0.7816568, + "num_input_tokens_seen": 304253985, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.19604492, + "step": 14107, + "time_per_iteration": 4.2944653034210205 + }, + { + "auxiliary_loss_clip": 0.01395806, + "auxiliary_loss_mlp": 0.01034415, + "balance_loss_clip": 1.23668408, + "balance_loss_mlp": 1.01479352, + "epoch": 0.8482188486397114, + "flos": 33595501104000.0, + "grad_norm": 3.0255009279742633, + "language_loss": 0.73872405, + "learning_rate": 2.3674875116214087e-07, + "loss": 0.76302624, + "num_input_tokens_seen": 304276785, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19616699, + "step": 14108, + "time_per_iteration": 2.9484353065490723 + }, + { + "auxiliary_loss_clip": 0.01392536, + "auxiliary_loss_mlp": 0.01030429, + "balance_loss_clip": 1.23629057, + "balance_loss_mlp": 1.01080728, + "epoch": 0.8482789718923793, + "flos": 20928220179840.0, + "grad_norm": 1.5748626884519283, + "language_loss": 0.73647749, + "learning_rate": 2.3656497837857836e-07, + "loss": 0.76070714, + "num_input_tokens_seen": 304296310, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.19641113, + "step": 14109, + "time_per_iteration": 4.302866458892822 + }, + { + "auxiliary_loss_clip": 0.01398518, + "auxiliary_loss_mlp": 0.01031964, + "balance_loss_clip": 1.23959875, + "balance_loss_mlp": 1.01300967, + "epoch": 0.8483390951450474, + "flos": 12903893821440.0, + "grad_norm": 2.028738728930584, + "language_loss": 0.74783224, + "learning_rate": 2.3638127246464811e-07, + "loss": 0.77213705, + "num_input_tokens_seen": 304311715, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18933105, + "step": 14110, + "time_per_iteration": 2.8216323852539062 + }, + { + "auxiliary_loss_clip": 0.01402088, + "auxiliary_loss_mlp": 0.01035152, + "balance_loss_clip": 1.2418648, + "balance_loss_mlp": 1.01578045, + "epoch": 0.8483992183977154, + "flos": 25092266088960.0, + "grad_norm": 1.6082793889656823, + "language_loss": 0.76520395, + "learning_rate": 2.3619763342731658e-07, + "loss": 0.78957641, + "num_input_tokens_seen": 304331910, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19372559, + "step": 14111, + "time_per_iteration": 2.8686282634735107 + }, + { + "auxiliary_loss_clip": 0.01394171, + "auxiliary_loss_mlp": 0.01033498, + "balance_loss_clip": 1.23629832, + "balance_loss_mlp": 1.01447201, + "epoch": 0.8484593416503833, + "flos": 25568595792000.0, + "grad_norm": 2.5235035333807203, + "language_loss": 0.68098783, + "learning_rate": 2.3601406127354772e-07, + "loss": 0.70526451, + "num_input_tokens_seen": 304351405, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.19018555, + "step": 14112, + "time_per_iteration": 2.883678436279297 + }, + { + "auxiliary_loss_clip": 0.01392829, + "auxiliary_loss_mlp": 0.01028168, + "balance_loss_clip": 1.23190093, + "balance_loss_mlp": 1.00992942, + "epoch": 0.8485194649030513, + "flos": 27209810851200.0, + "grad_norm": 1.726562405186961, + "language_loss": 0.74196613, + "learning_rate": 2.3583055601030312e-07, + "loss": 0.76617616, + "num_input_tokens_seen": 304372935, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18237305, + "step": 14113, + "time_per_iteration": 2.8724875450134277 + }, + { + "auxiliary_loss_clip": 0.0140044, + "auxiliary_loss_mlp": 0.01035058, + "balance_loss_clip": 1.24115705, + "balance_loss_mlp": 1.01523316, + "epoch": 0.8485795881557192, + "flos": 24216305898240.0, + "grad_norm": 2.447760845087055, + "language_loss": 0.66718143, + "learning_rate": 2.3564711764454003e-07, + "loss": 0.69153643, + "num_input_tokens_seen": 304393070, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19824219, + "step": 14114, + "time_per_iteration": 2.9304282665252686 + }, + { + "auxiliary_loss_clip": 0.01410288, + "auxiliary_loss_mlp": 0.01034216, + "balance_loss_clip": 1.24701476, + "balance_loss_mlp": 1.01451111, + "epoch": 0.8486397114083872, + "flos": 21151395371520.0, + "grad_norm": 1.6742487043588419, + "language_loss": 0.7976765, + "learning_rate": 2.3546374618321495e-07, + "loss": 0.8221215, + "num_input_tokens_seen": 304411195, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19714355, + "step": 14115, + "time_per_iteration": 4.239477872848511 + }, + { + "auxiliary_loss_clip": 0.01411613, + "auxiliary_loss_mlp": 0.01034255, + "balance_loss_clip": 1.25007474, + "balance_loss_mlp": 1.01600409, + "epoch": 0.8486998346610551, + "flos": 19984383509760.0, + "grad_norm": 2.030838320883373, + "language_loss": 0.79555321, + "learning_rate": 2.3528044163328187e-07, + "loss": 0.82001191, + "num_input_tokens_seen": 304429425, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18261719, + "step": 14116, + "time_per_iteration": 4.330217361450195 + }, + { + "auxiliary_loss_clip": 0.01407349, + "auxiliary_loss_mlp": 0.01032579, + "balance_loss_clip": 1.24410033, + "balance_loss_mlp": 1.01329064, + "epoch": 0.8487599579137232, + "flos": 19801865410560.0, + "grad_norm": 1.976631536553084, + "language_loss": 0.69808924, + "learning_rate": 2.3509720400169076e-07, + "loss": 0.72248852, + "num_input_tokens_seen": 304447460, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19299316, + "step": 14117, + "time_per_iteration": 2.844951629638672 + }, + { + "auxiliary_loss_clip": 0.01404394, + "auxiliary_loss_mlp": 0.01032977, + "balance_loss_clip": 1.24158561, + "balance_loss_mlp": 1.01361728, + "epoch": 0.8488200811663911, + "flos": 26407247005440.0, + "grad_norm": 2.473468579795717, + "language_loss": 0.65884209, + "learning_rate": 2.3491403329539096e-07, + "loss": 0.6832158, + "num_input_tokens_seen": 304468230, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19360352, + "step": 14118, + "time_per_iteration": 2.8633108139038086 + }, + { + "auxiliary_loss_clip": 0.01400867, + "auxiliary_loss_mlp": 0.01032607, + "balance_loss_clip": 1.24244463, + "balance_loss_mlp": 1.01411712, + "epoch": 0.8488802044190591, + "flos": 16367077491840.0, + "grad_norm": 1.5294007173838253, + "language_loss": 0.73563516, + "learning_rate": 2.3473092952132757e-07, + "loss": 0.75996983, + "num_input_tokens_seen": 304484860, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18493652, + "step": 14119, + "time_per_iteration": 2.8399314880371094 + }, + { + "auxiliary_loss_clip": 0.01407574, + "auxiliary_loss_mlp": 0.01035443, + "balance_loss_clip": 1.24678636, + "balance_loss_mlp": 1.01521337, + "epoch": 0.848940327671727, + "flos": 19218359479680.0, + "grad_norm": 1.9660120257505227, + "language_loss": 0.79001498, + "learning_rate": 2.345478926864446e-07, + "loss": 0.81444514, + "num_input_tokens_seen": 304503575, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.20227051, + "step": 14120, + "time_per_iteration": 2.8365790843963623 + }, + { + "auxiliary_loss_clip": 0.01420202, + "auxiliary_loss_mlp": 0.01034721, + "balance_loss_clip": 1.25684118, + "balance_loss_mlp": 1.01548052, + "epoch": 0.849000450924395, + "flos": 21881060565120.0, + "grad_norm": 1.9413138475671718, + "language_loss": 0.76493841, + "learning_rate": 2.3436492279768227e-07, + "loss": 0.78948766, + "num_input_tokens_seen": 304525005, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19250488, + "step": 14121, + "time_per_iteration": 3.0033068656921387 + }, + { + "auxiliary_loss_clip": 0.01184682, + "auxiliary_loss_mlp": 0.01020501, + "balance_loss_clip": 1.09399056, + "balance_loss_mlp": 0.99704039, + "epoch": 0.8490605741770629, + "flos": 71199405429120.0, + "grad_norm": 0.8148754852756624, + "language_loss": 0.60228276, + "learning_rate": 2.3418201986197883e-07, + "loss": 0.62433463, + "num_input_tokens_seen": 304585220, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.234375, + "step": 14122, + "time_per_iteration": 3.335278034210205 + }, + { + "auxiliary_loss_clip": 0.01395297, + "auxiliary_loss_mlp": 0.01032683, + "balance_loss_clip": 1.23435557, + "balance_loss_mlp": 1.01399076, + "epoch": 0.849120697429731, + "flos": 24984275454720.0, + "grad_norm": 1.8501848679385762, + "language_loss": 0.80730653, + "learning_rate": 2.3399918388627048e-07, + "loss": 0.83158636, + "num_input_tokens_seen": 304604665, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18688965, + "step": 14123, + "time_per_iteration": 2.8891613483428955 + }, + { + "auxiliary_loss_clip": 0.01386924, + "auxiliary_loss_mlp": 0.01030583, + "balance_loss_clip": 1.23187041, + "balance_loss_mlp": 1.01241612, + "epoch": 0.8491808206823989, + "flos": 23040788014080.0, + "grad_norm": 2.6008236687248907, + "language_loss": 0.83899271, + "learning_rate": 2.3381641487749016e-07, + "loss": 0.86316782, + "num_input_tokens_seen": 304620600, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.18188477, + "step": 14124, + "time_per_iteration": 2.849990129470825 + }, + { + "auxiliary_loss_clip": 0.01397466, + "auxiliary_loss_mlp": 0.01031166, + "balance_loss_clip": 1.23889709, + "balance_loss_mlp": 1.01172292, + "epoch": 0.8492409439350669, + "flos": 23889121614720.0, + "grad_norm": 1.8047793083671353, + "language_loss": 0.73435718, + "learning_rate": 2.3363371284256805e-07, + "loss": 0.75864351, + "num_input_tokens_seen": 304639540, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.19421387, + "step": 14125, + "time_per_iteration": 2.8531100749969482 + }, + { + "auxiliary_loss_clip": 0.01420295, + "auxiliary_loss_mlp": 0.01035811, + "balance_loss_clip": 1.25469029, + "balance_loss_mlp": 1.01560545, + "epoch": 0.8493010671877349, + "flos": 22430288920320.0, + "grad_norm": 1.6789393192214688, + "language_loss": 0.74039727, + "learning_rate": 2.3345107778843288e-07, + "loss": 0.76495832, + "num_input_tokens_seen": 304660595, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.20202637, + "step": 14126, + "time_per_iteration": 2.8507165908813477 + }, + { + "auxiliary_loss_clip": 0.01388725, + "auxiliary_loss_mlp": 0.01034363, + "balance_loss_clip": 1.23217988, + "balance_loss_mlp": 1.01589751, + "epoch": 0.8493611904404028, + "flos": 17538206630400.0, + "grad_norm": 1.4526971388102072, + "language_loss": 0.68510675, + "learning_rate": 2.3326850972200928e-07, + "loss": 0.70933765, + "num_input_tokens_seen": 304679580, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.18469238, + "step": 14127, + "time_per_iteration": 2.8638088703155518 + }, + { + "auxiliary_loss_clip": 0.01407233, + "auxiliary_loss_mlp": 0.01032596, + "balance_loss_clip": 1.24383402, + "balance_loss_mlp": 1.01255679, + "epoch": 0.8494213136930708, + "flos": 19472147418240.0, + "grad_norm": 2.0612697359388448, + "language_loss": 0.69704425, + "learning_rate": 2.330860086502211e-07, + "loss": 0.72144258, + "num_input_tokens_seen": 304698385, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.20068359, + "step": 14128, + "time_per_iteration": 2.834747552871704 + }, + { + "auxiliary_loss_clip": 0.01396352, + "auxiliary_loss_mlp": 0.01036341, + "balance_loss_clip": 1.23812091, + "balance_loss_mlp": 1.01695716, + "epoch": 0.8494814369457387, + "flos": 18779474488320.0, + "grad_norm": 1.7379983379315502, + "language_loss": 0.78846264, + "learning_rate": 2.3290357457998855e-07, + "loss": 0.81278956, + "num_input_tokens_seen": 304715430, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.19396973, + "step": 14129, + "time_per_iteration": 2.8770384788513184 + }, + { + "auxiliary_loss_clip": 0.01409567, + "auxiliary_loss_mlp": 0.01038257, + "balance_loss_clip": 1.24975395, + "balance_loss_mlp": 1.01957726, + "epoch": 0.8495415601984068, + "flos": 23341974520320.0, + "grad_norm": 1.5940866403287715, + "language_loss": 0.68727148, + "learning_rate": 2.3272120751823031e-07, + "loss": 0.71174973, + "num_input_tokens_seen": 304734345, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18701172, + "step": 14130, + "time_per_iteration": 2.984592914581299 + }, + { + "auxiliary_loss_clip": 0.01399714, + "auxiliary_loss_mlp": 0.0103341, + "balance_loss_clip": 1.23860443, + "balance_loss_mlp": 1.01354957, + "epoch": 0.8496016834510747, + "flos": 26623590232320.0, + "grad_norm": 1.7148812664605608, + "language_loss": 0.72175026, + "learning_rate": 2.3253890747186e-07, + "loss": 0.74608147, + "num_input_tokens_seen": 304755030, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.1986084, + "step": 14131, + "time_per_iteration": 2.9303297996520996 + }, + { + "auxiliary_loss_clip": 0.0141008, + "auxiliary_loss_mlp": 0.01031044, + "balance_loss_clip": 1.24696612, + "balance_loss_mlp": 1.01336551, + "epoch": 0.8496618067037427, + "flos": 25490448743040.0, + "grad_norm": 2.1804339083841615, + "language_loss": 0.69269729, + "learning_rate": 2.3235667444779162e-07, + "loss": 0.71710849, + "num_input_tokens_seen": 304774320, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.17675781, + "step": 14132, + "time_per_iteration": 2.871819496154785 + }, + { + "auxiliary_loss_clip": 0.01398671, + "auxiliary_loss_mlp": 0.01035323, + "balance_loss_clip": 1.23976099, + "balance_loss_mlp": 1.01765656, + "epoch": 0.8497219299564106, + "flos": 25385580000000.0, + "grad_norm": 2.157538710565242, + "language_loss": 0.70689487, + "learning_rate": 2.3217450845293564e-07, + "loss": 0.73123479, + "num_input_tokens_seen": 304795355, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.17675781, + "step": 14133, + "time_per_iteration": 2.8966238498687744 + }, + { + "auxiliary_loss_clip": 0.01182718, + "auxiliary_loss_mlp": 0.01038991, + "balance_loss_clip": 1.09558046, + "balance_loss_mlp": 1.01286054, + "epoch": 0.8497820532090786, + "flos": 67814142583680.0, + "grad_norm": 0.7315035848374647, + "language_loss": 0.57616919, + "learning_rate": 2.3199240949419918e-07, + "loss": 0.59838629, + "num_input_tokens_seen": 304863915, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.26171875, + "step": 14134, + "time_per_iteration": 3.4342432022094727 + }, + { + "auxiliary_loss_clip": 0.01419069, + "auxiliary_loss_mlp": 0.01030413, + "balance_loss_clip": 1.25469947, + "balance_loss_mlp": 1.01172054, + "epoch": 0.8498421764617465, + "flos": 23450960540160.0, + "grad_norm": 1.891978008068613, + "language_loss": 0.79184234, + "learning_rate": 2.3181037757848787e-07, + "loss": 0.81633711, + "num_input_tokens_seen": 304881555, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.18688965, + "step": 14135, + "time_per_iteration": 2.8613173961639404 + }, + { + "auxiliary_loss_clip": 0.01412241, + "auxiliary_loss_mlp": 0.01033358, + "balance_loss_clip": 1.24871552, + "balance_loss_mlp": 1.01348555, + "epoch": 0.8499022997144146, + "flos": 17721629625600.0, + "grad_norm": 1.9572193858628646, + "language_loss": 0.64947826, + "learning_rate": 2.316284127127044e-07, + "loss": 0.67393422, + "num_input_tokens_seen": 304898760, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19873047, + "step": 14136, + "time_per_iteration": 2.811774253845215 + }, + { + "auxiliary_loss_clip": 0.01420207, + "auxiliary_loss_mlp": 0.01034384, + "balance_loss_clip": 1.25582004, + "balance_loss_mlp": 1.01492906, + "epoch": 0.8499624229670825, + "flos": 18597816040320.0, + "grad_norm": 1.9588103129650019, + "language_loss": 0.84644175, + "learning_rate": 2.3144651490374835e-07, + "loss": 0.87098765, + "num_input_tokens_seen": 304915465, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.19445801, + "step": 14137, + "time_per_iteration": 2.846207857131958 + }, + { + "auxiliary_loss_clip": 0.01392016, + "auxiliary_loss_mlp": 0.01032459, + "balance_loss_clip": 1.23514378, + "balance_loss_mlp": 1.01321888, + "epoch": 0.8500225462197505, + "flos": 24354728300160.0, + "grad_norm": 2.0130237336770174, + "language_loss": 0.79566038, + "learning_rate": 2.3126468415851773e-07, + "loss": 0.8199051, + "num_input_tokens_seen": 304933190, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.19250488, + "step": 14138, + "time_per_iteration": 2.8721206188201904 + }, + { + "auxiliary_loss_clip": 0.01402405, + "auxiliary_loss_mlp": 0.01034238, + "balance_loss_clip": 1.24232364, + "balance_loss_mlp": 1.01503301, + "epoch": 0.8500826694724185, + "flos": 16554346295040.0, + "grad_norm": 3.211481603544995, + "language_loss": 0.64732546, + "learning_rate": 2.310829204839073e-07, + "loss": 0.67169189, + "num_input_tokens_seen": 304951110, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.19189453, + "step": 14139, + "time_per_iteration": 2.8846235275268555 + }, + { + "auxiliary_loss_clip": 0.01400708, + "auxiliary_loss_mlp": 0.01029054, + "balance_loss_clip": 1.24133086, + "balance_loss_mlp": 1.01020706, + "epoch": 0.8501427927250864, + "flos": 16297979402880.0, + "grad_norm": 2.4421866563182033, + "language_loss": 0.71434915, + "learning_rate": 2.3090122388681043e-07, + "loss": 0.73864669, + "num_input_tokens_seen": 304969095, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18847656, + "step": 14140, + "time_per_iteration": 2.8724172115325928 + }, + { + "auxiliary_loss_clip": 0.01421937, + "auxiliary_loss_mlp": 0.01038634, + "balance_loss_clip": 1.25683999, + "balance_loss_mlp": 1.019418, + "epoch": 0.8502029159777544, + "flos": 26699203572480.0, + "grad_norm": 1.976687355959325, + "language_loss": 0.64821959, + "learning_rate": 2.3071959437411648e-07, + "loss": 0.67282534, + "num_input_tokens_seen": 304989315, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19213867, + "step": 14141, + "time_per_iteration": 2.8797781467437744 + }, + { + "auxiliary_loss_clip": 0.01400999, + "auxiliary_loss_mlp": 0.01038656, + "balance_loss_clip": 1.24127352, + "balance_loss_mlp": 1.01959419, + "epoch": 0.8502630392304223, + "flos": 35604467049600.0, + "grad_norm": 1.6551513780304723, + "language_loss": 0.71471483, + "learning_rate": 2.3053803195271214e-07, + "loss": 0.73911142, + "num_input_tokens_seen": 305011020, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.19055176, + "step": 14142, + "time_per_iteration": 4.364487886428833 + }, + { + "auxiliary_loss_clip": 0.014036, + "auxiliary_loss_mlp": 0.0103162, + "balance_loss_clip": 1.24191439, + "balance_loss_mlp": 1.01308298, + "epoch": 0.8503231624830904, + "flos": 21658564045440.0, + "grad_norm": 1.448282235217926, + "language_loss": 0.66237742, + "learning_rate": 2.3035653662948375e-07, + "loss": 0.68672961, + "num_input_tokens_seen": 305033550, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18530273, + "step": 14143, + "time_per_iteration": 2.9855539798736572 + }, + { + "auxiliary_loss_clip": 0.01413523, + "auxiliary_loss_mlp": 0.01034094, + "balance_loss_clip": 1.2485013, + "balance_loss_mlp": 1.01500821, + "epoch": 0.8503832857357583, + "flos": 22427212273920.0, + "grad_norm": 1.7499364289532935, + "language_loss": 0.68614095, + "learning_rate": 2.3017510841131216e-07, + "loss": 0.71061713, + "num_input_tokens_seen": 305052885, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.1907959, + "step": 14144, + "time_per_iteration": 4.286494016647339 + }, + { + "auxiliary_loss_clip": 0.01393676, + "auxiliary_loss_mlp": 0.01033367, + "balance_loss_clip": 1.23666573, + "balance_loss_mlp": 1.01385283, + "epoch": 0.8504434089884263, + "flos": 18707299752960.0, + "grad_norm": 2.29548064829877, + "language_loss": 0.65724361, + "learning_rate": 2.299937473050777e-07, + "loss": 0.68151402, + "num_input_tokens_seen": 305071995, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.1953125, + "step": 14145, + "time_per_iteration": 2.8191776275634766 + }, + { + "auxiliary_loss_clip": 0.01399718, + "auxiliary_loss_mlp": 0.01036212, + "balance_loss_clip": 1.24013782, + "balance_loss_mlp": 1.01724565, + "epoch": 0.8505035322410942, + "flos": 20016760803840.0, + "grad_norm": 1.655892941909803, + "language_loss": 0.85966623, + "learning_rate": 2.2981245331765842e-07, + "loss": 0.88402551, + "num_input_tokens_seen": 305090190, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18969727, + "step": 14146, + "time_per_iteration": 2.8484158515930176 + }, + { + "auxiliary_loss_clip": 0.01396308, + "auxiliary_loss_mlp": 0.01030884, + "balance_loss_clip": 1.23722708, + "balance_loss_mlp": 1.01244187, + "epoch": 0.8505636554937622, + "flos": 20821541644800.0, + "grad_norm": 2.3178007780700365, + "language_loss": 0.84702408, + "learning_rate": 2.2963122645592814e-07, + "loss": 0.87129605, + "num_input_tokens_seen": 305109355, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18432617, + "step": 14147, + "time_per_iteration": 2.886324167251587 + }, + { + "auxiliary_loss_clip": 0.01410415, + "auxiliary_loss_mlp": 0.01032408, + "balance_loss_clip": 1.2460494, + "balance_loss_mlp": 1.01323915, + "epoch": 0.8506237787464301, + "flos": 14182787370240.0, + "grad_norm": 2.2339386913747403, + "language_loss": 0.87367642, + "learning_rate": 2.2945006672675894e-07, + "loss": 0.89810467, + "num_input_tokens_seen": 305124165, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19152832, + "step": 14148, + "time_per_iteration": 2.9008705615997314 + }, + { + "auxiliary_loss_clip": 0.01391977, + "auxiliary_loss_mlp": 0.01036214, + "balance_loss_clip": 1.23448992, + "balance_loss_mlp": 1.01710486, + "epoch": 0.8506839019990982, + "flos": 23268804399360.0, + "grad_norm": 1.648611962043415, + "language_loss": 0.72754019, + "learning_rate": 2.292689741370204e-07, + "loss": 0.75182211, + "num_input_tokens_seen": 305143940, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.19104004, + "step": 14149, + "time_per_iteration": 2.857525587081909 + }, + { + "auxiliary_loss_clip": 0.01412721, + "auxiliary_loss_mlp": 0.01033561, + "balance_loss_clip": 1.25291753, + "balance_loss_mlp": 1.014678, + "epoch": 0.8507440252517661, + "flos": 23669611251840.0, + "grad_norm": 1.591690103621371, + "language_loss": 0.76911056, + "learning_rate": 2.290879486935804e-07, + "loss": 0.79357338, + "num_input_tokens_seen": 305163505, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18884277, + "step": 14150, + "time_per_iteration": 4.298710346221924 + }, + { + "auxiliary_loss_clip": 0.01397345, + "auxiliary_loss_mlp": 0.01036244, + "balance_loss_clip": 1.24065685, + "balance_loss_mlp": 1.01707506, + "epoch": 0.8508041485044341, + "flos": 18670624202880.0, + "grad_norm": 1.5005285808726683, + "language_loss": 0.73296428, + "learning_rate": 2.2890699040330231e-07, + "loss": 0.7573002, + "num_input_tokens_seen": 305182325, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.19165039, + "step": 14151, + "time_per_iteration": 4.236381530761719 + }, + { + "auxiliary_loss_clip": 0.01178621, + "auxiliary_loss_mlp": 0.0103451, + "balance_loss_clip": 1.09057689, + "balance_loss_mlp": 1.01219392, + "epoch": 0.8508642717571021, + "flos": 52536065639040.0, + "grad_norm": 0.8871600074636261, + "language_loss": 0.59602201, + "learning_rate": 2.2872609927304909e-07, + "loss": 0.61815333, + "num_input_tokens_seen": 305230775, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.22363281, + "step": 14152, + "time_per_iteration": 3.104869842529297 + }, + { + "auxiliary_loss_clip": 0.01181994, + "auxiliary_loss_mlp": 0.01017763, + "balance_loss_clip": 1.09126091, + "balance_loss_mlp": 0.9989754, + "epoch": 0.85092439500977, + "flos": 69327368807040.0, + "grad_norm": 0.6910319944795257, + "language_loss": 0.61306989, + "learning_rate": 2.285452753096797e-07, + "loss": 0.63506746, + "num_input_tokens_seen": 305296000, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.1875, + "step": 14153, + "time_per_iteration": 3.3026161193847656 + }, + { + "auxiliary_loss_clip": 0.01392445, + "auxiliary_loss_mlp": 0.01033523, + "balance_loss_clip": 1.23398352, + "balance_loss_mlp": 1.01388884, + "epoch": 0.850984518262438, + "flos": 24400543299840.0, + "grad_norm": 2.743755963588363, + "language_loss": 0.81340617, + "learning_rate": 2.2836451852005067e-07, + "loss": 0.83766586, + "num_input_tokens_seen": 305314705, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.19641113, + "step": 14154, + "time_per_iteration": 2.8746299743652344 + }, + { + "auxiliary_loss_clip": 0.01379446, + "auxiliary_loss_mlp": 0.01032882, + "balance_loss_clip": 1.22415996, + "balance_loss_mlp": 1.01515591, + "epoch": 0.851044641515106, + "flos": 23305253725440.0, + "grad_norm": 1.760898297395744, + "language_loss": 0.80215758, + "learning_rate": 2.281838289110165e-07, + "loss": 0.82628089, + "num_input_tokens_seen": 305333870, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.17724609, + "step": 14155, + "time_per_iteration": 2.886051893234253 + }, + { + "auxiliary_loss_clip": 0.01413865, + "auxiliary_loss_mlp": 0.01030028, + "balance_loss_clip": 1.24990821, + "balance_loss_mlp": 1.01118124, + "epoch": 0.851104764767774, + "flos": 22059099429120.0, + "grad_norm": 1.631006370013621, + "language_loss": 0.71377325, + "learning_rate": 2.2800320648942904e-07, + "loss": 0.73821223, + "num_input_tokens_seen": 305352780, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.18847656, + "step": 14156, + "time_per_iteration": 2.8652024269104004 + }, + { + "auxiliary_loss_clip": 0.01388549, + "auxiliary_loss_mlp": 0.01032014, + "balance_loss_clip": 1.23292685, + "balance_loss_mlp": 1.01341712, + "epoch": 0.8511648880204419, + "flos": 20714591640960.0, + "grad_norm": 1.8513618707799206, + "language_loss": 0.74280536, + "learning_rate": 2.278226512621386e-07, + "loss": 0.76701093, + "num_input_tokens_seen": 305371370, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.18603516, + "step": 14157, + "time_per_iteration": 2.830643653869629 + }, + { + "auxiliary_loss_clip": 0.01391239, + "auxiliary_loss_mlp": 0.01031957, + "balance_loss_clip": 1.23496199, + "balance_loss_mlp": 1.01473117, + "epoch": 0.8512250112731099, + "flos": 24035326122240.0, + "grad_norm": 1.9539724572250803, + "language_loss": 0.80189592, + "learning_rate": 2.2764216323598995e-07, + "loss": 0.82612795, + "num_input_tokens_seen": 305387955, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.17224121, + "step": 14158, + "time_per_iteration": 2.84194016456604 + }, + { + "auxiliary_loss_clip": 0.01402387, + "auxiliary_loss_mlp": 0.01034741, + "balance_loss_clip": 1.24187255, + "balance_loss_mlp": 1.01486897, + "epoch": 0.8512851345257778, + "flos": 22024957587840.0, + "grad_norm": 2.1219378123955073, + "language_loss": 0.7993542, + "learning_rate": 2.27461742417828e-07, + "loss": 0.82372546, + "num_input_tokens_seen": 305406285, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19873047, + "step": 14159, + "time_per_iteration": 2.9250481128692627 + }, + { + "auxiliary_loss_clip": 0.01394435, + "auxiliary_loss_mlp": 0.01033782, + "balance_loss_clip": 1.23534286, + "balance_loss_mlp": 1.01441061, + "epoch": 0.8513452577784458, + "flos": 14838468036480.0, + "grad_norm": 3.5911648205145785, + "language_loss": 0.71854603, + "learning_rate": 2.2728138881449488e-07, + "loss": 0.74282819, + "num_input_tokens_seen": 305424500, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.19372559, + "step": 14160, + "time_per_iteration": 2.84149169921875 + }, + { + "auxiliary_loss_clip": 0.01422353, + "auxiliary_loss_mlp": 0.01035204, + "balance_loss_clip": 1.25623453, + "balance_loss_mlp": 1.01413941, + "epoch": 0.8514053810311137, + "flos": 33048399254400.0, + "grad_norm": 2.6315464233661348, + "language_loss": 0.71143401, + "learning_rate": 2.2710110243282866e-07, + "loss": 0.7360096, + "num_input_tokens_seen": 305442990, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.21069336, + "step": 14161, + "time_per_iteration": 2.9425337314605713 + }, + { + "auxiliary_loss_clip": 0.01406697, + "auxiliary_loss_mlp": 0.01033464, + "balance_loss_clip": 1.2425822, + "balance_loss_mlp": 1.01446211, + "epoch": 0.8514655042837818, + "flos": 27576204393600.0, + "grad_norm": 2.618115184030919, + "language_loss": 0.78748643, + "learning_rate": 2.2692088327966653e-07, + "loss": 0.81188798, + "num_input_tokens_seen": 305463065, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.18994141, + "step": 14162, + "time_per_iteration": 2.928743839263916 + }, + { + "auxiliary_loss_clip": 0.014036, + "auxiliary_loss_mlp": 0.01034735, + "balance_loss_clip": 1.2430464, + "balance_loss_mlp": 1.01566148, + "epoch": 0.8515256275364497, + "flos": 35570189473920.0, + "grad_norm": 2.3302240226559414, + "language_loss": 0.78060234, + "learning_rate": 2.2674073136184235e-07, + "loss": 0.80498564, + "num_input_tokens_seen": 305489070, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19067383, + "step": 14163, + "time_per_iteration": 2.9942266941070557 + }, + { + "auxiliary_loss_clip": 0.01182202, + "auxiliary_loss_mlp": 0.01018431, + "balance_loss_clip": 1.09218347, + "balance_loss_mlp": 0.99878544, + "epoch": 0.8515857507891177, + "flos": 70237335104640.0, + "grad_norm": 0.7161782784210835, + "language_loss": 0.55095571, + "learning_rate": 2.2656064668618735e-07, + "loss": 0.57296211, + "num_input_tokens_seen": 305551490, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.19628906, + "step": 14164, + "time_per_iteration": 3.4478697776794434 + }, + { + "auxiliary_loss_clip": 0.01396057, + "auxiliary_loss_mlp": 0.01037359, + "balance_loss_clip": 1.23672438, + "balance_loss_mlp": 1.01847625, + "epoch": 0.8516458740417857, + "flos": 22685750916480.0, + "grad_norm": 2.055495676738981, + "language_loss": 0.73847765, + "learning_rate": 2.2638062925953005e-07, + "loss": 0.76281184, + "num_input_tokens_seen": 305570535, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18884277, + "step": 14165, + "time_per_iteration": 2.8972549438476562 + }, + { + "auxiliary_loss_clip": 0.01387868, + "auxiliary_loss_mlp": 0.010345, + "balance_loss_clip": 1.23081446, + "balance_loss_mlp": 1.01527166, + "epoch": 0.8517059972944536, + "flos": 22757699427840.0, + "grad_norm": 1.771718443121813, + "language_loss": 0.68606102, + "learning_rate": 2.26200679088697e-07, + "loss": 0.71028471, + "num_input_tokens_seen": 305590800, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.19238281, + "step": 14166, + "time_per_iteration": 2.8817315101623535 + }, + { + "auxiliary_loss_clip": 0.01404358, + "auxiliary_loss_mlp": 0.01032341, + "balance_loss_clip": 1.24373245, + "balance_loss_mlp": 1.01375639, + "epoch": 0.8517661205471216, + "flos": 21699085403520.0, + "grad_norm": 1.8107393039580624, + "language_loss": 0.73966074, + "learning_rate": 2.260207961805125e-07, + "loss": 0.76402777, + "num_input_tokens_seen": 305609495, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18603516, + "step": 14167, + "time_per_iteration": 2.907773494720459 + }, + { + "auxiliary_loss_clip": 0.01404621, + "auxiliary_loss_mlp": 0.01036301, + "balance_loss_clip": 1.24517107, + "balance_loss_mlp": 1.01729906, + "epoch": 0.8518262437997896, + "flos": 25385896713600.0, + "grad_norm": 1.6365171973575836, + "language_loss": 0.81274581, + "learning_rate": 2.258409805417969e-07, + "loss": 0.83715504, + "num_input_tokens_seen": 305629420, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18994141, + "step": 14168, + "time_per_iteration": 2.8829052448272705 + }, + { + "auxiliary_loss_clip": 0.01390564, + "auxiliary_loss_mlp": 0.01028975, + "balance_loss_clip": 1.23222911, + "balance_loss_mlp": 1.01022351, + "epoch": 0.8518863670524576, + "flos": 27246893604480.0, + "grad_norm": 1.9860201871148038, + "language_loss": 0.76820838, + "learning_rate": 2.2566123217936893e-07, + "loss": 0.79240382, + "num_input_tokens_seen": 305649835, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18737793, + "step": 14169, + "time_per_iteration": 2.9208052158355713 + }, + { + "auxiliary_loss_clip": 0.01411146, + "auxiliary_loss_mlp": 0.01032969, + "balance_loss_clip": 1.24834347, + "balance_loss_mlp": 1.01321554, + "epoch": 0.8519464903051255, + "flos": 20969284475520.0, + "grad_norm": 1.83677748534767, + "language_loss": 0.64964944, + "learning_rate": 2.254815511000452e-07, + "loss": 0.67409062, + "num_input_tokens_seen": 305668840, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19750977, + "step": 14170, + "time_per_iteration": 2.858258008956909 + }, + { + "auxiliary_loss_clip": 0.01392848, + "auxiliary_loss_mlp": 0.01031962, + "balance_loss_clip": 1.23367047, + "balance_loss_mlp": 1.01234007, + "epoch": 0.8520066135577935, + "flos": 18450797126400.0, + "grad_norm": 2.4016194266621453, + "language_loss": 0.87219036, + "learning_rate": 2.253019373106384e-07, + "loss": 0.89643848, + "num_input_tokens_seen": 305686955, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.19628906, + "step": 14171, + "time_per_iteration": 2.8158669471740723 + }, + { + "auxiliary_loss_clip": 0.01420363, + "auxiliary_loss_mlp": 0.01031062, + "balance_loss_clip": 1.25976515, + "balance_loss_mlp": 1.01191747, + "epoch": 0.8520667368104614, + "flos": 29141579888640.0, + "grad_norm": 2.0874027608566132, + "language_loss": 0.55684102, + "learning_rate": 2.2512239081796003e-07, + "loss": 0.58135521, + "num_input_tokens_seen": 305706290, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19140625, + "step": 14172, + "time_per_iteration": 2.906567096710205 + }, + { + "auxiliary_loss_clip": 0.01385901, + "auxiliary_loss_mlp": 0.01028848, + "balance_loss_clip": 1.23006976, + "balance_loss_mlp": 1.01183677, + "epoch": 0.8521268600631294, + "flos": 16042969854720.0, + "grad_norm": 2.0681218714823904, + "language_loss": 0.69905043, + "learning_rate": 2.2494291162881862e-07, + "loss": 0.72319794, + "num_input_tokens_seen": 305723835, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.17016602, + "step": 14173, + "time_per_iteration": 2.8348536491394043 + }, + { + "auxiliary_loss_clip": 0.01401563, + "auxiliary_loss_mlp": 0.01035194, + "balance_loss_clip": 1.24124718, + "balance_loss_mlp": 1.01635885, + "epoch": 0.8521869833157973, + "flos": 22464883209600.0, + "grad_norm": 20.945802773652677, + "language_loss": 0.77497113, + "learning_rate": 2.247634997500205e-07, + "loss": 0.79933876, + "num_input_tokens_seen": 305741655, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18835449, + "step": 14174, + "time_per_iteration": 2.874356985092163 + }, + { + "auxiliary_loss_clip": 0.01419927, + "auxiliary_loss_mlp": 0.0103435, + "balance_loss_clip": 1.25699687, + "balance_loss_mlp": 1.01537156, + "epoch": 0.8522471065684654, + "flos": 24982601397120.0, + "grad_norm": 1.705523164391855, + "language_loss": 0.82445121, + "learning_rate": 2.245841551883676e-07, + "loss": 0.84899402, + "num_input_tokens_seen": 305761890, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.18969727, + "step": 14175, + "time_per_iteration": 2.8754940032958984 + }, + { + "auxiliary_loss_clip": 0.01419609, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.25528872, + "balance_loss_mlp": 1.01493025, + "epoch": 0.8523072298211333, + "flos": 17719095916800.0, + "grad_norm": 2.1905907678945833, + "language_loss": 0.66042018, + "learning_rate": 2.2440487795066153e-07, + "loss": 0.68495369, + "num_input_tokens_seen": 305779190, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.18811035, + "step": 14176, + "time_per_iteration": 2.8800606727600098 + }, + { + "auxiliary_loss_clip": 0.01386417, + "auxiliary_loss_mlp": 0.01031697, + "balance_loss_clip": 1.23039746, + "balance_loss_mlp": 1.01271844, + "epoch": 0.8523673530738013, + "flos": 25456849839360.0, + "grad_norm": 2.699769779170139, + "language_loss": 0.79131091, + "learning_rate": 2.2422566804370068e-07, + "loss": 0.81549203, + "num_input_tokens_seen": 305799870, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.1895752, + "step": 14177, + "time_per_iteration": 4.40116810798645 + }, + { + "auxiliary_loss_clip": 0.01409921, + "auxiliary_loss_mlp": 0.01031987, + "balance_loss_clip": 1.25025392, + "balance_loss_mlp": 1.01265085, + "epoch": 0.8524274763264693, + "flos": 31441733239680.0, + "grad_norm": 1.616747128994564, + "language_loss": 0.74163902, + "learning_rate": 2.2404652547428026e-07, + "loss": 0.76605809, + "num_input_tokens_seen": 305819695, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19348145, + "step": 14178, + "time_per_iteration": 2.917104482650757 + }, + { + "auxiliary_loss_clip": 0.01405891, + "auxiliary_loss_mlp": 0.0103673, + "balance_loss_clip": 1.24370909, + "balance_loss_mlp": 1.01757264, + "epoch": 0.8524875995791372, + "flos": 17721674870400.0, + "grad_norm": 1.780891627058962, + "language_loss": 0.76216567, + "learning_rate": 2.238674502491935e-07, + "loss": 0.78659189, + "num_input_tokens_seen": 305837270, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19152832, + "step": 14179, + "time_per_iteration": 4.3328235149383545 + }, + { + "auxiliary_loss_clip": 0.0139382, + "auxiliary_loss_mlp": 0.01034139, + "balance_loss_clip": 1.23694611, + "balance_loss_mlp": 1.01491106, + "epoch": 0.8525477228318052, + "flos": 21696777918720.0, + "grad_norm": 1.9590392443048363, + "language_loss": 0.83242726, + "learning_rate": 2.2368844237523165e-07, + "loss": 0.85670686, + "num_input_tokens_seen": 305855250, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.19226074, + "step": 14180, + "time_per_iteration": 2.871492624282837 + }, + { + "auxiliary_loss_clip": 0.01397657, + "auxiliary_loss_mlp": 0.01034801, + "balance_loss_clip": 1.2385323, + "balance_loss_mlp": 1.01566839, + "epoch": 0.8526078460844732, + "flos": 24837844723200.0, + "grad_norm": 4.009947651833669, + "language_loss": 0.61770582, + "learning_rate": 2.235095018591815e-07, + "loss": 0.64203042, + "num_input_tokens_seen": 305875660, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19128418, + "step": 14181, + "time_per_iteration": 2.864065647125244 + }, + { + "auxiliary_loss_clip": 0.0138786, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.23110163, + "balance_loss_mlp": 1.0136447, + "epoch": 0.8526679693371412, + "flos": 13524256281600.0, + "grad_norm": 2.2034171179376734, + "language_loss": 0.72416836, + "learning_rate": 2.2333062870782894e-07, + "loss": 0.74837118, + "num_input_tokens_seen": 305892415, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.18786621, + "step": 14182, + "time_per_iteration": 2.8313300609588623 + }, + { + "auxiliary_loss_clip": 0.0139637, + "auxiliary_loss_mlp": 0.01031485, + "balance_loss_clip": 1.23841739, + "balance_loss_mlp": 1.01276875, + "epoch": 0.8527280925898091, + "flos": 23524673598720.0, + "grad_norm": 1.740743842319948, + "language_loss": 0.70624185, + "learning_rate": 2.2315182292795697e-07, + "loss": 0.73052037, + "num_input_tokens_seen": 305912665, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18725586, + "step": 14183, + "time_per_iteration": 2.9570257663726807 + }, + { + "auxiliary_loss_clip": 0.01396254, + "auxiliary_loss_mlp": 0.01034273, + "balance_loss_clip": 1.23823142, + "balance_loss_mlp": 1.0161773, + "epoch": 0.8527882158424771, + "flos": 20312698913280.0, + "grad_norm": 1.663037909323201, + "language_loss": 0.73018652, + "learning_rate": 2.2297308452634644e-07, + "loss": 0.75449181, + "num_input_tokens_seen": 305931515, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18103027, + "step": 14184, + "time_per_iteration": 2.8784327507019043 + }, + { + "auxiliary_loss_clip": 0.01395941, + "auxiliary_loss_mlp": 0.01033279, + "balance_loss_clip": 1.23759747, + "balance_loss_mlp": 1.01455092, + "epoch": 0.852848339095145, + "flos": 17211565284480.0, + "grad_norm": 1.633183824657198, + "language_loss": 0.77420485, + "learning_rate": 2.2279441350977457e-07, + "loss": 0.79849696, + "num_input_tokens_seen": 305949965, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18725586, + "step": 14185, + "time_per_iteration": 4.240323781967163 + }, + { + "auxiliary_loss_clip": 0.01410148, + "auxiliary_loss_mlp": 0.01031289, + "balance_loss_clip": 1.24776864, + "balance_loss_mlp": 1.01295471, + "epoch": 0.852908462347813, + "flos": 18378079453440.0, + "grad_norm": 1.847150033770184, + "language_loss": 0.803491, + "learning_rate": 2.2261580988501637e-07, + "loss": 0.82790536, + "num_input_tokens_seen": 305967820, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18322754, + "step": 14186, + "time_per_iteration": 2.856426477432251 + }, + { + "auxiliary_loss_clip": 0.0141554, + "auxiliary_loss_mlp": 0.01034738, + "balance_loss_clip": 1.25197124, + "balance_loss_mlp": 1.01565254, + "epoch": 0.8529685856004809, + "flos": 18634174876800.0, + "grad_norm": 1.6660427133494256, + "language_loss": 0.63118321, + "learning_rate": 2.224372736588449e-07, + "loss": 0.65568596, + "num_input_tokens_seen": 305985505, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19067383, + "step": 14187, + "time_per_iteration": 4.305727958679199 + }, + { + "auxiliary_loss_clip": 0.01416335, + "auxiliary_loss_mlp": 0.0103241, + "balance_loss_clip": 1.25206399, + "balance_loss_mlp": 1.01258564, + "epoch": 0.853028708853149, + "flos": 29619945607680.0, + "grad_norm": 1.6056515081977225, + "language_loss": 0.76696408, + "learning_rate": 2.2225880483803005e-07, + "loss": 0.79145157, + "num_input_tokens_seen": 306005220, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.19836426, + "step": 14188, + "time_per_iteration": 2.9282584190368652 + }, + { + "auxiliary_loss_clip": 0.01407798, + "auxiliary_loss_mlp": 0.0103373, + "balance_loss_clip": 1.24495983, + "balance_loss_mlp": 1.01332188, + "epoch": 0.8530888321058169, + "flos": 26362698860160.0, + "grad_norm": 1.4966076422687684, + "language_loss": 0.78703231, + "learning_rate": 2.2208040342933932e-07, + "loss": 0.81144762, + "num_input_tokens_seen": 306023785, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.20410156, + "step": 14189, + "time_per_iteration": 3.0081801414489746 + }, + { + "auxiliary_loss_clip": 0.0140437, + "auxiliary_loss_mlp": 0.01034865, + "balance_loss_clip": 1.24352574, + "balance_loss_mlp": 1.01560044, + "epoch": 0.8531489553584849, + "flos": 20531711583360.0, + "grad_norm": 2.292702199813911, + "language_loss": 0.80397189, + "learning_rate": 2.2190206943953793e-07, + "loss": 0.82836425, + "num_input_tokens_seen": 306041600, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19262695, + "step": 14190, + "time_per_iteration": 2.8562631607055664 + }, + { + "auxiliary_loss_clip": 0.01402336, + "auxiliary_loss_mlp": 0.01030893, + "balance_loss_clip": 1.24222374, + "balance_loss_mlp": 1.0120337, + "epoch": 0.8532090786111529, + "flos": 20714048703360.0, + "grad_norm": 2.083166879948958, + "language_loss": 0.76991451, + "learning_rate": 2.2172380287538894e-07, + "loss": 0.79424679, + "num_input_tokens_seen": 306060345, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18859863, + "step": 14191, + "time_per_iteration": 2.854310989379883 + }, + { + "auxiliary_loss_clip": 0.01391033, + "auxiliary_loss_mlp": 0.01031338, + "balance_loss_clip": 1.23397851, + "balance_loss_mlp": 1.01256275, + "epoch": 0.8532692018638208, + "flos": 19838495715840.0, + "grad_norm": 1.8641160502799248, + "language_loss": 0.69696367, + "learning_rate": 2.2154560374365073e-07, + "loss": 0.72118735, + "num_input_tokens_seen": 306078285, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.18786621, + "step": 14192, + "time_per_iteration": 2.8516666889190674 + }, + { + "auxiliary_loss_clip": 0.01426204, + "auxiliary_loss_mlp": 0.01043213, + "balance_loss_clip": 1.25924766, + "balance_loss_mlp": 1.02255368, + "epoch": 0.8533293251164888, + "flos": 21006774432000.0, + "grad_norm": 2.1725221233310603, + "language_loss": 0.63801205, + "learning_rate": 2.2136747205108164e-07, + "loss": 0.66270626, + "num_input_tokens_seen": 306093760, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.20666504, + "step": 14193, + "time_per_iteration": 2.8607845306396484 + }, + { + "auxiliary_loss_clip": 0.01397327, + "auxiliary_loss_mlp": 0.01028927, + "balance_loss_clip": 1.23812222, + "balance_loss_mlp": 1.01052117, + "epoch": 0.8533894483691568, + "flos": 22429926961920.0, + "grad_norm": 1.8082839002133828, + "language_loss": 0.77658343, + "learning_rate": 2.211894078044365e-07, + "loss": 0.80084604, + "num_input_tokens_seen": 306112595, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18408203, + "step": 14194, + "time_per_iteration": 2.8508384227752686 + }, + { + "auxiliary_loss_clip": 0.01396938, + "auxiliary_loss_mlp": 0.01028719, + "balance_loss_clip": 1.23746741, + "balance_loss_mlp": 1.01012254, + "epoch": 0.8534495716218248, + "flos": 21626548709760.0, + "grad_norm": 1.7868816415381785, + "language_loss": 0.70495379, + "learning_rate": 2.2101141101046705e-07, + "loss": 0.72921038, + "num_input_tokens_seen": 306131800, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18579102, + "step": 14195, + "time_per_iteration": 2.8987576961517334 + }, + { + "auxiliary_loss_clip": 0.01392576, + "auxiliary_loss_mlp": 0.01032965, + "balance_loss_clip": 1.23337245, + "balance_loss_mlp": 1.01473844, + "epoch": 0.8535096948744927, + "flos": 22356485372160.0, + "grad_norm": 1.9905930904684503, + "language_loss": 0.86767435, + "learning_rate": 2.2083348167592343e-07, + "loss": 0.89192981, + "num_input_tokens_seen": 306150590, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18237305, + "step": 14196, + "time_per_iteration": 2.958568811416626 + }, + { + "auxiliary_loss_clip": 0.01182802, + "auxiliary_loss_mlp": 0.01021888, + "balance_loss_clip": 1.09420598, + "balance_loss_mlp": 0.99919063, + "epoch": 0.8535698181271607, + "flos": 52786731686400.0, + "grad_norm": 0.7715488349513239, + "language_loss": 0.55137146, + "learning_rate": 2.2065561980755243e-07, + "loss": 0.57341838, + "num_input_tokens_seen": 306205850, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.2265625, + "step": 14197, + "time_per_iteration": 3.296600580215454 + }, + { + "auxiliary_loss_clip": 0.01380319, + "auxiliary_loss_mlp": 0.01033783, + "balance_loss_clip": 1.22444487, + "balance_loss_mlp": 1.01475716, + "epoch": 0.8536299413798286, + "flos": 19072335951360.0, + "grad_norm": 1.5948230222836395, + "language_loss": 0.81960535, + "learning_rate": 2.2047782541209826e-07, + "loss": 0.84374636, + "num_input_tokens_seen": 306225220, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.19030762, + "step": 14198, + "time_per_iteration": 2.8763599395751953 + }, + { + "auxiliary_loss_clip": 0.01400749, + "auxiliary_loss_mlp": 0.01035617, + "balance_loss_clip": 1.24150538, + "balance_loss_mlp": 1.01812947, + "epoch": 0.8536900646324966, + "flos": 49361698661760.0, + "grad_norm": 1.4708557202917214, + "language_loss": 0.69403934, + "learning_rate": 2.203000984963035e-07, + "loss": 0.71840304, + "num_input_tokens_seen": 306249865, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.17504883, + "step": 14199, + "time_per_iteration": 3.1298577785491943 + }, + { + "auxiliary_loss_clip": 0.01385378, + "auxiliary_loss_mlp": 0.01030443, + "balance_loss_clip": 1.23029327, + "balance_loss_mlp": 1.01170301, + "epoch": 0.8537501878851645, + "flos": 21772346014080.0, + "grad_norm": 1.480048957909808, + "language_loss": 0.86894953, + "learning_rate": 2.201224390669072e-07, + "loss": 0.89310777, + "num_input_tokens_seen": 306270215, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.18737793, + "step": 14200, + "time_per_iteration": 2.9250011444091797 + }, + { + "auxiliary_loss_clip": 0.01398377, + "auxiliary_loss_mlp": 0.01031098, + "balance_loss_clip": 1.23874331, + "balance_loss_mlp": 1.01281118, + "epoch": 0.8538103111378326, + "flos": 22278293078400.0, + "grad_norm": 2.325620477315977, + "language_loss": 0.78648847, + "learning_rate": 2.1994484713064666e-07, + "loss": 0.81078327, + "num_input_tokens_seen": 306288960, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.1829834, + "step": 14201, + "time_per_iteration": 2.9130282402038574 + }, + { + "auxiliary_loss_clip": 0.01394331, + "auxiliary_loss_mlp": 0.01033319, + "balance_loss_clip": 1.23796988, + "balance_loss_mlp": 1.01450777, + "epoch": 0.8538704343905005, + "flos": 20313830033280.0, + "grad_norm": 1.7161690835959797, + "language_loss": 0.69690162, + "learning_rate": 2.19767322694256e-07, + "loss": 0.72117817, + "num_input_tokens_seen": 306308735, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.18811035, + "step": 14202, + "time_per_iteration": 2.8443245887756348 + }, + { + "auxiliary_loss_clip": 0.01399196, + "auxiliary_loss_mlp": 0.01033162, + "balance_loss_clip": 1.23914194, + "balance_loss_mlp": 1.01470876, + "epoch": 0.8539305576431685, + "flos": 24766212925440.0, + "grad_norm": 1.717924980013312, + "language_loss": 0.80813205, + "learning_rate": 2.195898657644666e-07, + "loss": 0.83245564, + "num_input_tokens_seen": 306329015, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.18457031, + "step": 14203, + "time_per_iteration": 2.9611542224884033 + }, + { + "auxiliary_loss_clip": 0.01403298, + "auxiliary_loss_mlp": 0.01034359, + "balance_loss_clip": 1.24232447, + "balance_loss_mlp": 1.01534486, + "epoch": 0.8539906808958365, + "flos": 26698841614080.0, + "grad_norm": 2.0905254135405005, + "language_loss": 0.67079747, + "learning_rate": 2.1941247634800808e-07, + "loss": 0.6951741, + "num_input_tokens_seen": 306349085, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18994141, + "step": 14204, + "time_per_iteration": 2.9992730617523193 + }, + { + "auxiliary_loss_clip": 0.01407063, + "auxiliary_loss_mlp": 0.01029885, + "balance_loss_clip": 1.24451983, + "balance_loss_mlp": 1.01177716, + "epoch": 0.8540508041485044, + "flos": 13372034215680.0, + "grad_norm": 3.8520622741422685, + "language_loss": 0.6138829, + "learning_rate": 2.1923515445160667e-07, + "loss": 0.63825238, + "num_input_tokens_seen": 306365385, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.18103027, + "step": 14205, + "time_per_iteration": 2.8239667415618896 + }, + { + "auxiliary_loss_clip": 0.0139638, + "auxiliary_loss_mlp": 0.01031708, + "balance_loss_clip": 1.23740757, + "balance_loss_mlp": 1.0124681, + "epoch": 0.8541109274011724, + "flos": 32793706419840.0, + "grad_norm": 2.3752266785771394, + "language_loss": 0.72852588, + "learning_rate": 2.1905790008198655e-07, + "loss": 0.75280678, + "num_input_tokens_seen": 306384585, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.19238281, + "step": 14206, + "time_per_iteration": 2.9304983615875244 + }, + { + "auxiliary_loss_clip": 0.01400257, + "auxiliary_loss_mlp": 0.01030687, + "balance_loss_clip": 1.23835897, + "balance_loss_mlp": 1.01145887, + "epoch": 0.8541710506538404, + "flos": 17648188035840.0, + "grad_norm": 2.8053383979218385, + "language_loss": 0.7714147, + "learning_rate": 2.1888071324586987e-07, + "loss": 0.79572415, + "num_input_tokens_seen": 306401565, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19226074, + "step": 14207, + "time_per_iteration": 2.9093291759490967 + }, + { + "auxiliary_loss_clip": 0.01403922, + "auxiliary_loss_mlp": 0.01035942, + "balance_loss_clip": 1.2442292, + "balance_loss_mlp": 1.01669002, + "epoch": 0.8542311739065084, + "flos": 20271815596800.0, + "grad_norm": 2.0042638633316385, + "language_loss": 0.85283071, + "learning_rate": 2.1870359394997485e-07, + "loss": 0.87722933, + "num_input_tokens_seen": 306419995, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19250488, + "step": 14208, + "time_per_iteration": 2.8831071853637695 + }, + { + "auxiliary_loss_clip": 0.01400451, + "auxiliary_loss_mlp": 0.01035688, + "balance_loss_clip": 1.24015319, + "balance_loss_mlp": 1.01730597, + "epoch": 0.8542912971591763, + "flos": 17794709256960.0, + "grad_norm": 3.446050582546343, + "language_loss": 0.67124462, + "learning_rate": 2.1852654220101785e-07, + "loss": 0.69560599, + "num_input_tokens_seen": 306439240, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18395996, + "step": 14209, + "time_per_iteration": 2.843174934387207 + }, + { + "auxiliary_loss_clip": 0.01401903, + "auxiliary_loss_mlp": 0.01031867, + "balance_loss_clip": 1.24334431, + "balance_loss_mlp": 1.0133779, + "epoch": 0.8543514204118443, + "flos": 26990798181120.0, + "grad_norm": 2.040388143902975, + "language_loss": 0.70775181, + "learning_rate": 2.1834955800571287e-07, + "loss": 0.73208952, + "num_input_tokens_seen": 306458425, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18481445, + "step": 14210, + "time_per_iteration": 2.9351930618286133 + }, + { + "auxiliary_loss_clip": 0.01401816, + "auxiliary_loss_mlp": 0.01033503, + "balance_loss_clip": 1.24130297, + "balance_loss_mlp": 1.01497746, + "epoch": 0.8544115436645122, + "flos": 24035235632640.0, + "grad_norm": 1.3605782122619219, + "language_loss": 0.70757532, + "learning_rate": 2.1817264137077141e-07, + "loss": 0.73192847, + "num_input_tokens_seen": 306477210, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18530273, + "step": 14211, + "time_per_iteration": 2.910698413848877 + }, + { + "auxiliary_loss_clip": 0.01410278, + "auxiliary_loss_mlp": 0.01034495, + "balance_loss_clip": 1.24915409, + "balance_loss_mlp": 1.01557612, + "epoch": 0.8544716669171802, + "flos": 16626702009600.0, + "grad_norm": 29.200085560982096, + "language_loss": 0.82451737, + "learning_rate": 2.1799579230290166e-07, + "loss": 0.84896511, + "num_input_tokens_seen": 306495820, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.18920898, + "step": 14212, + "time_per_iteration": 4.22799825668335 + }, + { + "auxiliary_loss_clip": 0.01402903, + "auxiliary_loss_mlp": 0.01032336, + "balance_loss_clip": 1.24241662, + "balance_loss_mlp": 1.01291645, + "epoch": 0.8545317901698481, + "flos": 40020672084480.0, + "grad_norm": 9.932814825348464, + "language_loss": 0.67067146, + "learning_rate": 2.178190108088105e-07, + "loss": 0.69502383, + "num_input_tokens_seen": 306516420, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19421387, + "step": 14213, + "time_per_iteration": 3.026350498199463 + }, + { + "auxiliary_loss_clip": 0.01399288, + "auxiliary_loss_mlp": 0.01029571, + "balance_loss_clip": 1.24120855, + "balance_loss_mlp": 1.01170194, + "epoch": 0.8545919134225162, + "flos": 19912299264000.0, + "grad_norm": 1.6945239643505698, + "language_loss": 0.79177696, + "learning_rate": 2.1764229689520098e-07, + "loss": 0.81606555, + "num_input_tokens_seen": 306534785, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.17871094, + "step": 14214, + "time_per_iteration": 4.356052875518799 + }, + { + "auxiliary_loss_clip": 0.01414766, + "auxiliary_loss_mlp": 0.01034001, + "balance_loss_clip": 1.24951959, + "balance_loss_mlp": 1.01411664, + "epoch": 0.8546520366751841, + "flos": 18962264056320.0, + "grad_norm": 2.2367747968214715, + "language_loss": 0.67809623, + "learning_rate": 2.1746565056877397e-07, + "loss": 0.70258391, + "num_input_tokens_seen": 306552440, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19885254, + "step": 14215, + "time_per_iteration": 2.8245315551757812 + }, + { + "auxiliary_loss_clip": 0.01394713, + "auxiliary_loss_mlp": 0.01031376, + "balance_loss_clip": 1.23574233, + "balance_loss_mlp": 1.01277936, + "epoch": 0.8547121599278521, + "flos": 35633631962880.0, + "grad_norm": 1.6602010047135498, + "language_loss": 0.63873345, + "learning_rate": 2.172890718362279e-07, + "loss": 0.66299433, + "num_input_tokens_seen": 306573600, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18615723, + "step": 14216, + "time_per_iteration": 2.9630746841430664 + }, + { + "auxiliary_loss_clip": 0.0140902, + "auxiliary_loss_mlp": 0.01034284, + "balance_loss_clip": 1.24760127, + "balance_loss_mlp": 1.01578259, + "epoch": 0.8547722831805201, + "flos": 16918884800640.0, + "grad_norm": 1.8934271586189317, + "language_loss": 0.66162258, + "learning_rate": 2.17112560704259e-07, + "loss": 0.6860556, + "num_input_tokens_seen": 306592840, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.18505859, + "step": 14217, + "time_per_iteration": 2.878323793411255 + }, + { + "auxiliary_loss_clip": 0.01405289, + "auxiliary_loss_mlp": 0.01033874, + "balance_loss_clip": 1.2461164, + "balance_loss_mlp": 1.01543212, + "epoch": 0.854832406433188, + "flos": 23012889955200.0, + "grad_norm": 1.3642839177199304, + "language_loss": 0.65739846, + "learning_rate": 2.1693611717956072e-07, + "loss": 0.68179011, + "num_input_tokens_seen": 306613210, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18432617, + "step": 14218, + "time_per_iteration": 2.8977439403533936 + }, + { + "auxiliary_loss_clip": 0.01415375, + "auxiliary_loss_mlp": 0.01031201, + "balance_loss_clip": 1.25110352, + "balance_loss_mlp": 1.01237833, + "epoch": 0.854892529685856, + "flos": 20422318360320.0, + "grad_norm": 1.8259067022781588, + "language_loss": 0.7085588, + "learning_rate": 2.167597412688238e-07, + "loss": 0.7330246, + "num_input_tokens_seen": 306631620, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.18811035, + "step": 14219, + "time_per_iteration": 2.8991353511810303 + }, + { + "auxiliary_loss_clip": 0.01410979, + "auxiliary_loss_mlp": 0.01033175, + "balance_loss_clip": 1.24712908, + "balance_loss_mlp": 1.014364, + "epoch": 0.854952652938524, + "flos": 16407282136320.0, + "grad_norm": 2.2622923875129946, + "language_loss": 0.68905157, + "learning_rate": 2.1658343297873549e-07, + "loss": 0.71349311, + "num_input_tokens_seen": 306646695, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.18823242, + "step": 14220, + "time_per_iteration": 4.270198345184326 + }, + { + "auxiliary_loss_clip": 0.01389476, + "auxiliary_loss_mlp": 0.01031037, + "balance_loss_clip": 1.23267436, + "balance_loss_mlp": 1.01157033, + "epoch": 0.855012776191192, + "flos": 21188568614400.0, + "grad_norm": 1.956647764711893, + "language_loss": 0.72750676, + "learning_rate": 2.164071923159827e-07, + "loss": 0.75171191, + "num_input_tokens_seen": 306665465, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.19470215, + "step": 14221, + "time_per_iteration": 2.8656015396118164 + }, + { + "auxiliary_loss_clip": 0.01412316, + "auxiliary_loss_mlp": 0.01034981, + "balance_loss_clip": 1.2506696, + "balance_loss_mlp": 1.01495433, + "epoch": 0.8550728994438599, + "flos": 26152237457280.0, + "grad_norm": 2.033574941804207, + "language_loss": 0.60873044, + "learning_rate": 2.1623101928724763e-07, + "loss": 0.63320345, + "num_input_tokens_seen": 306685950, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.20031738, + "step": 14222, + "time_per_iteration": 4.238384962081909 + }, + { + "auxiliary_loss_clip": 0.01386735, + "auxiliary_loss_mlp": 0.01030159, + "balance_loss_clip": 1.23032391, + "balance_loss_mlp": 1.0120039, + "epoch": 0.8551330226965279, + "flos": 22797813582720.0, + "grad_norm": 1.6014204254420639, + "language_loss": 0.84770155, + "learning_rate": 2.1605491389921093e-07, + "loss": 0.87187052, + "num_input_tokens_seen": 306705740, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.18151855, + "step": 14223, + "time_per_iteration": 2.8818936347961426 + }, + { + "auxiliary_loss_clip": 0.01402556, + "auxiliary_loss_mlp": 0.01030933, + "balance_loss_clip": 1.24324226, + "balance_loss_mlp": 1.01196694, + "epoch": 0.8551931459491958, + "flos": 22429203045120.0, + "grad_norm": 1.4482261049951097, + "language_loss": 0.74511635, + "learning_rate": 2.158788761585515e-07, + "loss": 0.76945126, + "num_input_tokens_seen": 306725065, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18981934, + "step": 14224, + "time_per_iteration": 2.835235118865967 + }, + { + "auxiliary_loss_clip": 0.01399166, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.2405746, + "balance_loss_mlp": 1.01370645, + "epoch": 0.8552532692018638, + "flos": 19582626516480.0, + "grad_norm": 1.7535323636436766, + "language_loss": 0.76165074, + "learning_rate": 2.1570290607194307e-07, + "loss": 0.78597116, + "num_input_tokens_seen": 306743630, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.19165039, + "step": 14225, + "time_per_iteration": 2.8344624042510986 + }, + { + "auxiliary_loss_clip": 0.01397222, + "auxiliary_loss_mlp": 0.01034749, + "balance_loss_clip": 1.23900783, + "balance_loss_mlp": 1.01678419, + "epoch": 0.8553133924545318, + "flos": 26444058289920.0, + "grad_norm": 1.9523988930239955, + "language_loss": 0.78024346, + "learning_rate": 2.1552700364605925e-07, + "loss": 0.80456322, + "num_input_tokens_seen": 306763105, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.1796875, + "step": 14226, + "time_per_iteration": 2.864877462387085 + }, + { + "auxiliary_loss_clip": 0.01412488, + "auxiliary_loss_mlp": 0.01030912, + "balance_loss_clip": 1.24887812, + "balance_loss_mlp": 1.01164818, + "epoch": 0.8553735157071998, + "flos": 16371059034240.0, + "grad_norm": 1.991450303471412, + "language_loss": 0.55131447, + "learning_rate": 2.153511688875702e-07, + "loss": 0.57574844, + "num_input_tokens_seen": 306779875, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19262695, + "step": 14227, + "time_per_iteration": 2.843902111053467 + }, + { + "auxiliary_loss_clip": 0.01392066, + "auxiliary_loss_mlp": 0.01031525, + "balance_loss_clip": 1.23417628, + "balance_loss_mlp": 1.01304722, + "epoch": 0.8554336389598677, + "flos": 20897290719360.0, + "grad_norm": 2.0178742720915506, + "language_loss": 0.66681743, + "learning_rate": 2.151754018031442e-07, + "loss": 0.69105327, + "num_input_tokens_seen": 306800015, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18481445, + "step": 14228, + "time_per_iteration": 2.859248399734497 + }, + { + "auxiliary_loss_clip": 0.01410859, + "auxiliary_loss_mlp": 0.01038865, + "balance_loss_clip": 1.24759996, + "balance_loss_mlp": 1.01909983, + "epoch": 0.8554937622125357, + "flos": 21293754071040.0, + "grad_norm": 2.564422376240963, + "language_loss": 0.74624473, + "learning_rate": 2.1499970239944542e-07, + "loss": 0.77074194, + "num_input_tokens_seen": 306814160, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19763184, + "step": 14229, + "time_per_iteration": 2.9162745475769043 + }, + { + "auxiliary_loss_clip": 0.01391067, + "auxiliary_loss_mlp": 0.01032441, + "balance_loss_clip": 1.23334908, + "balance_loss_mlp": 1.01442838, + "epoch": 0.8555538854652037, + "flos": 22421828142720.0, + "grad_norm": 1.7513367405548033, + "language_loss": 0.73745441, + "learning_rate": 2.1482407068313724e-07, + "loss": 0.76168954, + "num_input_tokens_seen": 306833310, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18017578, + "step": 14230, + "time_per_iteration": 2.8326966762542725 + }, + { + "auxiliary_loss_clip": 0.01406913, + "auxiliary_loss_mlp": 0.01031798, + "balance_loss_clip": 1.24778104, + "balance_loss_mlp": 1.01310635, + "epoch": 0.8556140087178716, + "flos": 20203396179840.0, + "grad_norm": 1.8776566713441984, + "language_loss": 0.83858025, + "learning_rate": 2.1464850666087897e-07, + "loss": 0.86296737, + "num_input_tokens_seen": 306851345, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18688965, + "step": 14231, + "time_per_iteration": 2.8224170207977295 + }, + { + "auxiliary_loss_clip": 0.01411439, + "auxiliary_loss_mlp": 0.01030909, + "balance_loss_clip": 1.24923635, + "balance_loss_mlp": 1.01202583, + "epoch": 0.8556741319705397, + "flos": 22648396694400.0, + "grad_norm": 2.5694770406992244, + "language_loss": 0.68577302, + "learning_rate": 2.1447301033932796e-07, + "loss": 0.7101965, + "num_input_tokens_seen": 306871040, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.18884277, + "step": 14232, + "time_per_iteration": 2.935227394104004 + }, + { + "auxiliary_loss_clip": 0.01415389, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.25276351, + "balance_loss_mlp": 1.01578128, + "epoch": 0.8557342552232076, + "flos": 23559584601600.0, + "grad_norm": 1.656013210203904, + "language_loss": 0.67168397, + "learning_rate": 2.1429758172513955e-07, + "loss": 0.69618559, + "num_input_tokens_seen": 306891625, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18994141, + "step": 14233, + "time_per_iteration": 2.8890607357025146 + }, + { + "auxiliary_loss_clip": 0.01395343, + "auxiliary_loss_mlp": 0.01032973, + "balance_loss_clip": 1.23667932, + "balance_loss_mlp": 1.01416135, + "epoch": 0.8557943784758756, + "flos": 19619392556160.0, + "grad_norm": 1.6133727773475648, + "language_loss": 0.77639788, + "learning_rate": 2.1412222082496556e-07, + "loss": 0.80068111, + "num_input_tokens_seen": 306910020, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18823242, + "step": 14234, + "time_per_iteration": 2.8772165775299072 + }, + { + "auxiliary_loss_clip": 0.01179895, + "auxiliary_loss_mlp": 0.01023509, + "balance_loss_clip": 1.08997691, + "balance_loss_mlp": 1.00262392, + "epoch": 0.8558545017285435, + "flos": 70671107433600.0, + "grad_norm": 0.7585330507130328, + "language_loss": 0.58052206, + "learning_rate": 2.1394692764545684e-07, + "loss": 0.60255611, + "num_input_tokens_seen": 306969505, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.20898438, + "step": 14235, + "time_per_iteration": 3.321000337600708 + }, + { + "auxiliary_loss_clip": 0.01181566, + "auxiliary_loss_mlp": 0.01016967, + "balance_loss_clip": 1.09155083, + "balance_loss_mlp": 0.9977026, + "epoch": 0.8559146249812115, + "flos": 56680402798080.0, + "grad_norm": 0.7796074000725324, + "language_loss": 0.56648135, + "learning_rate": 2.1377170219325858e-07, + "loss": 0.58846664, + "num_input_tokens_seen": 307027710, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.19238281, + "step": 14236, + "time_per_iteration": 3.1856298446655273 + }, + { + "auxiliary_loss_clip": 0.01400776, + "auxiliary_loss_mlp": 0.01036345, + "balance_loss_clip": 1.24037373, + "balance_loss_mlp": 1.01740313, + "epoch": 0.8559747482338794, + "flos": 22897207704960.0, + "grad_norm": 2.729575602379337, + "language_loss": 0.70763588, + "learning_rate": 2.1359654447501673e-07, + "loss": 0.73200715, + "num_input_tokens_seen": 307045515, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18945312, + "step": 14237, + "time_per_iteration": 2.9335579872131348 + }, + { + "auxiliary_loss_clip": 0.01394659, + "auxiliary_loss_mlp": 0.01030239, + "balance_loss_clip": 1.23477089, + "balance_loss_mlp": 1.01219118, + "epoch": 0.8560348714865474, + "flos": 22612264081920.0, + "grad_norm": 3.7936461117371305, + "language_loss": 0.64581215, + "learning_rate": 2.1342145449737314e-07, + "loss": 0.67006117, + "num_input_tokens_seen": 307064470, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18054199, + "step": 14238, + "time_per_iteration": 2.8774185180664062 + }, + { + "auxiliary_loss_clip": 0.01390692, + "auxiliary_loss_mlp": 0.0102895, + "balance_loss_clip": 1.23502779, + "balance_loss_mlp": 1.01215351, + "epoch": 0.8560949947392154, + "flos": 17940506561280.0, + "grad_norm": 1.557581613063183, + "language_loss": 0.69777369, + "learning_rate": 2.1324643226696648e-07, + "loss": 0.72197014, + "num_input_tokens_seen": 307083900, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.16796875, + "step": 14239, + "time_per_iteration": 2.954288959503174 + }, + { + "auxiliary_loss_clip": 0.01416025, + "auxiliary_loss_mlp": 0.01035074, + "balance_loss_clip": 1.25157309, + "balance_loss_mlp": 1.01521373, + "epoch": 0.8561551179918834, + "flos": 31037714006400.0, + "grad_norm": 2.7345147270848225, + "language_loss": 0.67623019, + "learning_rate": 2.1307147779043455e-07, + "loss": 0.70074117, + "num_input_tokens_seen": 307104590, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19848633, + "step": 14240, + "time_per_iteration": 3.0271949768066406 + }, + { + "auxiliary_loss_clip": 0.01405952, + "auxiliary_loss_mlp": 0.01033353, + "balance_loss_clip": 1.24352801, + "balance_loss_mlp": 1.01332545, + "epoch": 0.8562152412445513, + "flos": 30677609491200.0, + "grad_norm": 1.762147566644047, + "language_loss": 0.62244076, + "learning_rate": 2.1289659107441182e-07, + "loss": 0.64683378, + "num_input_tokens_seen": 307125580, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.20019531, + "step": 14241, + "time_per_iteration": 3.0377180576324463 + }, + { + "auxiliary_loss_clip": 0.01423792, + "auxiliary_loss_mlp": 0.01042623, + "balance_loss_clip": 1.25679374, + "balance_loss_mlp": 1.02253604, + "epoch": 0.8562753644972193, + "flos": 31588480684800.0, + "grad_norm": 1.7995489849670472, + "language_loss": 0.75162268, + "learning_rate": 2.1272177212552855e-07, + "loss": 0.77628684, + "num_input_tokens_seen": 307147625, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.20080566, + "step": 14242, + "time_per_iteration": 2.977404832839966 + }, + { + "auxiliary_loss_clip": 0.01404081, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.2422415, + "balance_loss_mlp": 1.01439977, + "epoch": 0.8563354877498872, + "flos": 26224819395840.0, + "grad_norm": 1.9949948377397884, + "language_loss": 0.77345669, + "learning_rate": 2.1254702095041498e-07, + "loss": 0.79783773, + "num_input_tokens_seen": 307164665, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19604492, + "step": 14243, + "time_per_iteration": 2.8753256797790527 + }, + { + "auxiliary_loss_clip": 0.01400515, + "auxiliary_loss_mlp": 0.01033534, + "balance_loss_clip": 1.24053049, + "balance_loss_mlp": 1.01534319, + "epoch": 0.8563956110025552, + "flos": 24145081303680.0, + "grad_norm": 1.7731391617831433, + "language_loss": 0.68434417, + "learning_rate": 2.123723375556974e-07, + "loss": 0.70868468, + "num_input_tokens_seen": 307182530, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18188477, + "step": 14244, + "time_per_iteration": 2.886712074279785 + }, + { + "auxiliary_loss_clip": 0.01178433, + "auxiliary_loss_mlp": 0.01026405, + "balance_loss_clip": 1.0887301, + "balance_loss_mlp": 1.0059967, + "epoch": 0.8564557342552233, + "flos": 56298897492480.0, + "grad_norm": 0.7544603609226492, + "language_loss": 0.584894, + "learning_rate": 2.1219772194800046e-07, + "loss": 0.60694236, + "num_input_tokens_seen": 307241240, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.20410156, + "step": 14245, + "time_per_iteration": 3.2561848163604736 + }, + { + "auxiliary_loss_clip": 0.01420805, + "auxiliary_loss_mlp": 0.01033138, + "balance_loss_clip": 1.25593817, + "balance_loss_mlp": 1.01455283, + "epoch": 0.8565158575078912, + "flos": 23451051029760.0, + "grad_norm": 1.7255242016901762, + "language_loss": 0.78265405, + "learning_rate": 2.1202317413394488e-07, + "loss": 0.80719352, + "num_input_tokens_seen": 307261485, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.18579102, + "step": 14246, + "time_per_iteration": 2.910412549972534 + }, + { + "auxiliary_loss_clip": 0.01399481, + "auxiliary_loss_mlp": 0.0103079, + "balance_loss_clip": 1.23969698, + "balance_loss_mlp": 1.01250291, + "epoch": 0.8565759807605592, + "flos": 20385597565440.0, + "grad_norm": 1.9272381886628795, + "language_loss": 0.82153398, + "learning_rate": 2.1184869412014938e-07, + "loss": 0.8458367, + "num_input_tokens_seen": 307279160, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18286133, + "step": 14247, + "time_per_iteration": 4.245622634887695 + }, + { + "auxiliary_loss_clip": 0.01403034, + "auxiliary_loss_mlp": 0.01030209, + "balance_loss_clip": 1.24226904, + "balance_loss_mlp": 1.01148164, + "epoch": 0.8566361040132271, + "flos": 18816376262400.0, + "grad_norm": 1.9044480713077034, + "language_loss": 0.7815429, + "learning_rate": 2.1167428191323112e-07, + "loss": 0.8058753, + "num_input_tokens_seen": 307297920, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18725586, + "step": 14248, + "time_per_iteration": 2.873216390609741 + }, + { + "auxiliary_loss_clip": 0.01413481, + "auxiliary_loss_mlp": 0.0103394, + "balance_loss_clip": 1.25093675, + "balance_loss_mlp": 1.01553392, + "epoch": 0.8566962272658951, + "flos": 24546114380160.0, + "grad_norm": 2.0913318142225754, + "language_loss": 0.78383422, + "learning_rate": 2.1149993751980278e-07, + "loss": 0.80830842, + "num_input_tokens_seen": 307318320, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.18408203, + "step": 14249, + "time_per_iteration": 3.0165634155273438 + }, + { + "auxiliary_loss_clip": 0.01396901, + "auxiliary_loss_mlp": 0.01034402, + "balance_loss_clip": 1.23935294, + "balance_loss_mlp": 1.0157578, + "epoch": 0.856756350518563, + "flos": 23187128256000.0, + "grad_norm": 1.8536531800362324, + "language_loss": 0.79246897, + "learning_rate": 2.1132566094647597e-07, + "loss": 0.816782, + "num_input_tokens_seen": 307336720, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.1862793, + "step": 14250, + "time_per_iteration": 4.30298638343811 + }, + { + "auxiliary_loss_clip": 0.01396442, + "auxiliary_loss_mlp": 0.01029355, + "balance_loss_clip": 1.24013603, + "balance_loss_mlp": 1.01078176, + "epoch": 0.856816473771231, + "flos": 20817017164800.0, + "grad_norm": 3.429242215545121, + "language_loss": 0.80701792, + "learning_rate": 2.1115145219985942e-07, + "loss": 0.83127588, + "num_input_tokens_seen": 307354120, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.18566895, + "step": 14251, + "time_per_iteration": 2.871861457824707 + }, + { + "auxiliary_loss_clip": 0.01390889, + "auxiliary_loss_mlp": 0.01031786, + "balance_loss_clip": 1.23257399, + "balance_loss_mlp": 1.01380932, + "epoch": 0.856876597023899, + "flos": 20237311797120.0, + "grad_norm": 2.0893167875767533, + "language_loss": 0.61717236, + "learning_rate": 2.1097731128656005e-07, + "loss": 0.64139915, + "num_input_tokens_seen": 307373165, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.17956543, + "step": 14252, + "time_per_iteration": 2.8249783515930176 + }, + { + "auxiliary_loss_clip": 0.01402853, + "auxiliary_loss_mlp": 0.01037955, + "balance_loss_clip": 1.24136674, + "balance_loss_mlp": 1.01954961, + "epoch": 0.856936720276567, + "flos": 18305226046080.0, + "grad_norm": 2.0846849947492405, + "language_loss": 0.71281713, + "learning_rate": 2.1080323821317924e-07, + "loss": 0.73722517, + "num_input_tokens_seen": 307391000, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18408203, + "step": 14253, + "time_per_iteration": 2.8363852500915527 + }, + { + "auxiliary_loss_clip": 0.01182631, + "auxiliary_loss_mlp": 0.01026268, + "balance_loss_clip": 1.09365296, + "balance_loss_mlp": 1.00528705, + "epoch": 0.8569968435292349, + "flos": 69908114805120.0, + "grad_norm": 0.7866517407875013, + "language_loss": 0.59296846, + "learning_rate": 2.1062923298631907e-07, + "loss": 0.61505741, + "num_input_tokens_seen": 307452865, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.20996094, + "step": 14254, + "time_per_iteration": 3.4057068824768066 + }, + { + "auxiliary_loss_clip": 0.01394874, + "auxiliary_loss_mlp": 0.01036931, + "balance_loss_clip": 1.23749399, + "balance_loss_mlp": 1.01667714, + "epoch": 0.8570569667819029, + "flos": 25859602218240.0, + "grad_norm": 1.8973810647033642, + "language_loss": 0.81481338, + "learning_rate": 2.1045529561257825e-07, + "loss": 0.83913136, + "num_input_tokens_seen": 307471940, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.20263672, + "step": 14255, + "time_per_iteration": 4.345525503158569 + }, + { + "auxiliary_loss_clip": 0.01387904, + "auxiliary_loss_mlp": 0.01030944, + "balance_loss_clip": 1.23279452, + "balance_loss_mlp": 1.01311016, + "epoch": 0.8571170900345708, + "flos": 23267220831360.0, + "grad_norm": 2.1810010174447205, + "language_loss": 0.68552101, + "learning_rate": 2.1028142609855126e-07, + "loss": 0.70970947, + "num_input_tokens_seen": 307488745, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.1784668, + "step": 14256, + "time_per_iteration": 2.877020835876465 + }, + { + "auxiliary_loss_clip": 0.0141489, + "auxiliary_loss_mlp": 0.01035664, + "balance_loss_clip": 1.25313473, + "balance_loss_mlp": 1.01679325, + "epoch": 0.8571772132872388, + "flos": 18927262563840.0, + "grad_norm": 1.5958178218768961, + "language_loss": 0.7107054, + "learning_rate": 2.1010762445083218e-07, + "loss": 0.73521101, + "num_input_tokens_seen": 307506855, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18859863, + "step": 14257, + "time_per_iteration": 4.25572395324707 + }, + { + "auxiliary_loss_clip": 0.01393215, + "auxiliary_loss_mlp": 0.01031424, + "balance_loss_clip": 1.23618722, + "balance_loss_mlp": 1.01262426, + "epoch": 0.8572373365399069, + "flos": 33261756324480.0, + "grad_norm": 3.4478289516647243, + "language_loss": 0.77296913, + "learning_rate": 2.0993389067601197e-07, + "loss": 0.79721552, + "num_input_tokens_seen": 307526115, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.18798828, + "step": 14258, + "time_per_iteration": 3.0008463859558105 + }, + { + "auxiliary_loss_clip": 0.01401293, + "auxiliary_loss_mlp": 0.01035914, + "balance_loss_clip": 1.24391568, + "balance_loss_mlp": 1.01676893, + "epoch": 0.8572974597925748, + "flos": 23336997592320.0, + "grad_norm": 1.5536164234470484, + "language_loss": 0.6895681, + "learning_rate": 2.0976022478067735e-07, + "loss": 0.7139402, + "num_input_tokens_seen": 307545230, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.19128418, + "step": 14259, + "time_per_iteration": 2.9589686393737793 + }, + { + "auxiliary_loss_clip": 0.01396319, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.23609328, + "balance_loss_mlp": 1.01250052, + "epoch": 0.8573575830452428, + "flos": 24546747807360.0, + "grad_norm": 1.6724940992835782, + "language_loss": 0.77909714, + "learning_rate": 2.0958662677141437e-07, + "loss": 0.80337393, + "num_input_tokens_seen": 307564900, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18859863, + "step": 14260, + "time_per_iteration": 2.84981632232666 + }, + { + "auxiliary_loss_clip": 0.01399184, + "auxiliary_loss_mlp": 0.01035357, + "balance_loss_clip": 1.23828697, + "balance_loss_mlp": 1.0160569, + "epoch": 0.8574177062979107, + "flos": 24174743909760.0, + "grad_norm": 1.8517400339097498, + "language_loss": 0.75048506, + "learning_rate": 2.09413096654806e-07, + "loss": 0.77483046, + "num_input_tokens_seen": 307583500, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19287109, + "step": 14261, + "time_per_iteration": 2.83603572845459 + }, + { + "auxiliary_loss_clip": 0.01400419, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.23850727, + "balance_loss_mlp": 1.01633036, + "epoch": 0.8574778295505787, + "flos": 17939601665280.0, + "grad_norm": 1.8437899328278136, + "language_loss": 0.79460979, + "learning_rate": 2.0923963443743276e-07, + "loss": 0.8189832, + "num_input_tokens_seen": 307601430, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.20593262, + "step": 14262, + "time_per_iteration": 2.81742262840271 + }, + { + "auxiliary_loss_clip": 0.01392974, + "auxiliary_loss_mlp": 0.01032665, + "balance_loss_clip": 1.23685622, + "balance_loss_mlp": 1.01412773, + "epoch": 0.8575379528032466, + "flos": 21590868545280.0, + "grad_norm": 3.7574752809143828, + "language_loss": 0.68785763, + "learning_rate": 2.0906624012587203e-07, + "loss": 0.71211398, + "num_input_tokens_seen": 307621495, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.1854248, + "step": 14263, + "time_per_iteration": 2.796776533126831 + }, + { + "auxiliary_loss_clip": 0.01411654, + "auxiliary_loss_mlp": 0.01035659, + "balance_loss_clip": 1.24929082, + "balance_loss_mlp": 1.01656199, + "epoch": 0.8575980760559146, + "flos": 21771395873280.0, + "grad_norm": 2.948337468359391, + "language_loss": 0.80088222, + "learning_rate": 2.088929137266986e-07, + "loss": 0.82535535, + "num_input_tokens_seen": 307640840, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19091797, + "step": 14264, + "time_per_iteration": 2.816323757171631 + }, + { + "auxiliary_loss_clip": 0.01411288, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.25164998, + "balance_loss_mlp": 1.01627886, + "epoch": 0.8576581993085826, + "flos": 34399739007360.0, + "grad_norm": 1.2793005532971435, + "language_loss": 0.70050043, + "learning_rate": 2.0871965524648582e-07, + "loss": 0.724967, + "num_input_tokens_seen": 307663820, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19091797, + "step": 14265, + "time_per_iteration": 2.9305152893066406 + }, + { + "auxiliary_loss_clip": 0.01390994, + "auxiliary_loss_mlp": 0.01028697, + "balance_loss_clip": 1.23475385, + "balance_loss_mlp": 1.01062512, + "epoch": 0.8577183225612506, + "flos": 23233078990080.0, + "grad_norm": 1.9524002903112594, + "language_loss": 0.66894913, + "learning_rate": 2.085464646918027e-07, + "loss": 0.69314599, + "num_input_tokens_seen": 307682385, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.18066406, + "step": 14266, + "time_per_iteration": 2.8515117168426514 + }, + { + "auxiliary_loss_clip": 0.01400802, + "auxiliary_loss_mlp": 0.01034766, + "balance_loss_clip": 1.24275041, + "balance_loss_mlp": 1.01541853, + "epoch": 0.8577784458139185, + "flos": 28816024417920.0, + "grad_norm": 2.3047107091933143, + "language_loss": 0.7637881, + "learning_rate": 2.0837334206921731e-07, + "loss": 0.78814375, + "num_input_tokens_seen": 307704680, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.19348145, + "step": 14267, + "time_per_iteration": 2.8905978202819824 + }, + { + "auxiliary_loss_clip": 0.01392501, + "auxiliary_loss_mlp": 0.01036291, + "balance_loss_clip": 1.23501849, + "balance_loss_mlp": 1.01804006, + "epoch": 0.8578385690665865, + "flos": 19765054126080.0, + "grad_norm": 1.6953041399624071, + "language_loss": 0.87907243, + "learning_rate": 2.082002873852946e-07, + "loss": 0.90336037, + "num_input_tokens_seen": 307723245, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18261719, + "step": 14268, + "time_per_iteration": 2.8315558433532715 + }, + { + "auxiliary_loss_clip": 0.01419055, + "auxiliary_loss_mlp": 0.01034662, + "balance_loss_clip": 1.25583029, + "balance_loss_mlp": 1.01568425, + "epoch": 0.8578986923192544, + "flos": 20713777234560.0, + "grad_norm": 1.7741067857405215, + "language_loss": 0.74005139, + "learning_rate": 2.0802730064659667e-07, + "loss": 0.76458859, + "num_input_tokens_seen": 307742510, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.18981934, + "step": 14269, + "time_per_iteration": 2.8940114974975586 + }, + { + "auxiliary_loss_clip": 0.01403299, + "auxiliary_loss_mlp": 0.01032814, + "balance_loss_clip": 1.24214673, + "balance_loss_mlp": 1.01385999, + "epoch": 0.8579588155719224, + "flos": 36115572021120.0, + "grad_norm": 1.519727222069618, + "language_loss": 0.66777313, + "learning_rate": 2.0785438185968252e-07, + "loss": 0.69213426, + "num_input_tokens_seen": 307766030, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.18945312, + "step": 14270, + "time_per_iteration": 2.9452457427978516 + }, + { + "auxiliary_loss_clip": 0.01393927, + "auxiliary_loss_mlp": 0.01029589, + "balance_loss_clip": 1.23671508, + "balance_loss_mlp": 1.0108732, + "epoch": 0.8580189388245905, + "flos": 22862884884480.0, + "grad_norm": 1.600574567535804, + "language_loss": 0.74308056, + "learning_rate": 2.0768153103110997e-07, + "loss": 0.76731575, + "num_input_tokens_seen": 307785800, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18713379, + "step": 14271, + "time_per_iteration": 2.8707776069641113 + }, + { + "auxiliary_loss_clip": 0.01180227, + "auxiliary_loss_mlp": 0.01021706, + "balance_loss_clip": 1.09032571, + "balance_loss_mlp": 1.00196517, + "epoch": 0.8580790620772584, + "flos": 69676723042560.0, + "grad_norm": 0.8014230951458096, + "language_loss": 0.59676313, + "learning_rate": 2.0750874816743358e-07, + "loss": 0.6187824, + "num_input_tokens_seen": 307850995, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.19726562, + "step": 14272, + "time_per_iteration": 3.385039806365967 + }, + { + "auxiliary_loss_clip": 0.01418307, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.25379729, + "balance_loss_mlp": 1.01391387, + "epoch": 0.8581391853299264, + "flos": 13342100140800.0, + "grad_norm": 2.228200143606108, + "language_loss": 0.76161075, + "learning_rate": 2.0733603327520499e-07, + "loss": 0.78612506, + "num_input_tokens_seen": 307868585, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.1920166, + "step": 14273, + "time_per_iteration": 2.8465416431427 + }, + { + "auxiliary_loss_clip": 0.01391108, + "auxiliary_loss_mlp": 0.01031227, + "balance_loss_clip": 1.23172092, + "balance_loss_mlp": 1.01267815, + "epoch": 0.8581993085825943, + "flos": 19654982231040.0, + "grad_norm": 1.740841862463271, + "language_loss": 0.82619756, + "learning_rate": 2.0716338636097385e-07, + "loss": 0.85042095, + "num_input_tokens_seen": 307886820, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.1854248, + "step": 14274, + "time_per_iteration": 2.834275245666504 + }, + { + "auxiliary_loss_clip": 0.01179138, + "auxiliary_loss_mlp": 0.01019761, + "balance_loss_clip": 1.08986735, + "balance_loss_mlp": 1.00097394, + "epoch": 0.8582594318352623, + "flos": 55849515707520.0, + "grad_norm": 0.7932810194096229, + "language_loss": 0.60917389, + "learning_rate": 2.0699080743128672e-07, + "loss": 0.63116288, + "num_input_tokens_seen": 307944020, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.1875, + "step": 14275, + "time_per_iteration": 3.3716845512390137 + }, + { + "auxiliary_loss_clip": 0.0140476, + "auxiliary_loss_mlp": 0.01029427, + "balance_loss_clip": 1.24260485, + "balance_loss_mlp": 1.0107708, + "epoch": 0.8583195550879302, + "flos": 24290154691200.0, + "grad_norm": 3.7938678535016113, + "language_loss": 0.60165733, + "learning_rate": 2.0681829649268768e-07, + "loss": 0.62599921, + "num_input_tokens_seen": 307961055, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18640137, + "step": 14276, + "time_per_iteration": 2.881760597229004 + }, + { + "auxiliary_loss_clip": 0.01399758, + "auxiliary_loss_mlp": 0.010326, + "balance_loss_clip": 1.23957705, + "balance_loss_mlp": 1.01318073, + "epoch": 0.8583796783405983, + "flos": 13452126791040.0, + "grad_norm": 2.4594527959746406, + "language_loss": 0.77357149, + "learning_rate": 2.0664585355171838e-07, + "loss": 0.79789507, + "num_input_tokens_seen": 307978690, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19421387, + "step": 14277, + "time_per_iteration": 2.8129806518554688 + }, + { + "auxiliary_loss_clip": 0.01404939, + "auxiliary_loss_mlp": 0.0103278, + "balance_loss_clip": 1.24497902, + "balance_loss_mlp": 1.01344419, + "epoch": 0.8584398015932662, + "flos": 16188902893440.0, + "grad_norm": 1.6858748316256966, + "language_loss": 0.83873165, + "learning_rate": 2.0647347861491803e-07, + "loss": 0.86310887, + "num_input_tokens_seen": 307995870, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.19335938, + "step": 14278, + "time_per_iteration": 2.8463923931121826 + }, + { + "auxiliary_loss_clip": 0.01418818, + "auxiliary_loss_mlp": 0.01033456, + "balance_loss_clip": 1.25525999, + "balance_loss_mlp": 1.01376295, + "epoch": 0.8584999248459342, + "flos": 17457616362240.0, + "grad_norm": 2.1239204478919, + "language_loss": 0.75895715, + "learning_rate": 2.0630117168882366e-07, + "loss": 0.78347993, + "num_input_tokens_seen": 308013645, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19702148, + "step": 14279, + "time_per_iteration": 2.875763177871704 + }, + { + "auxiliary_loss_clip": 0.01395815, + "auxiliary_loss_mlp": 0.01032924, + "balance_loss_clip": 1.23820233, + "balance_loss_mlp": 1.01395786, + "epoch": 0.8585600480986021, + "flos": 23451548722560.0, + "grad_norm": 2.215698549422693, + "language_loss": 0.67116308, + "learning_rate": 2.0612893277996845e-07, + "loss": 0.69545043, + "num_input_tokens_seen": 308032490, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18969727, + "step": 14280, + "time_per_iteration": 3.0334174633026123 + }, + { + "auxiliary_loss_clip": 0.01394493, + "auxiliary_loss_mlp": 0.01030809, + "balance_loss_clip": 1.23659587, + "balance_loss_mlp": 1.01266527, + "epoch": 0.8586201713512701, + "flos": 19947436490880.0, + "grad_norm": 1.8181279265563723, + "language_loss": 0.63502955, + "learning_rate": 2.0595676189488343e-07, + "loss": 0.65928257, + "num_input_tokens_seen": 308052110, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18139648, + "step": 14281, + "time_per_iteration": 2.8748443126678467 + }, + { + "auxiliary_loss_clip": 0.01400817, + "auxiliary_loss_mlp": 0.01031119, + "balance_loss_clip": 1.24132097, + "balance_loss_mlp": 1.01208091, + "epoch": 0.858680294603938, + "flos": 15313214171520.0, + "grad_norm": 3.2058264117172603, + "language_loss": 0.74432278, + "learning_rate": 2.0578465904009845e-07, + "loss": 0.76864219, + "num_input_tokens_seen": 308070660, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.19042969, + "step": 14282, + "time_per_iteration": 4.23938512802124 + }, + { + "auxiliary_loss_clip": 0.01394283, + "auxiliary_loss_mlp": 0.01031568, + "balance_loss_clip": 1.23503411, + "balance_loss_mlp": 1.01332879, + "epoch": 0.858740417856606, + "flos": 22721566815360.0, + "grad_norm": 1.7562597010016865, + "language_loss": 0.76572728, + "learning_rate": 2.0561262422213832e-07, + "loss": 0.78998584, + "num_input_tokens_seen": 308089520, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18237305, + "step": 14283, + "time_per_iteration": 2.868083953857422 + }, + { + "auxiliary_loss_clip": 0.01402825, + "auxiliary_loss_mlp": 0.01034665, + "balance_loss_clip": 1.24332237, + "balance_loss_mlp": 1.01575816, + "epoch": 0.8588005411092741, + "flos": 34066537165440.0, + "grad_norm": 1.779748131504728, + "language_loss": 0.60635078, + "learning_rate": 2.0544065744752736e-07, + "loss": 0.63072574, + "num_input_tokens_seen": 308111545, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18884277, + "step": 14284, + "time_per_iteration": 2.9781763553619385 + }, + { + "auxiliary_loss_clip": 0.01383493, + "auxiliary_loss_mlp": 0.01028905, + "balance_loss_clip": 1.22809386, + "balance_loss_mlp": 1.01115477, + "epoch": 0.858860664361942, + "flos": 28925462885760.0, + "grad_norm": 1.6394972357545472, + "language_loss": 0.76454687, + "learning_rate": 2.0526875872278749e-07, + "loss": 0.7886709, + "num_input_tokens_seen": 308129690, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.17749023, + "step": 14285, + "time_per_iteration": 4.360584020614624 + }, + { + "auxiliary_loss_clip": 0.01406454, + "auxiliary_loss_mlp": 0.01036528, + "balance_loss_clip": 1.24523652, + "balance_loss_mlp": 1.01698959, + "epoch": 0.85892078761461, + "flos": 19802136879360.0, + "grad_norm": 2.0727942827938883, + "language_loss": 0.75029778, + "learning_rate": 2.0509692805443524e-07, + "loss": 0.77472758, + "num_input_tokens_seen": 308147410, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19543457, + "step": 14286, + "time_per_iteration": 2.8486523628234863 + }, + { + "auxiliary_loss_clip": 0.01178792, + "auxiliary_loss_mlp": 0.0101559, + "balance_loss_clip": 1.08978379, + "balance_loss_mlp": 0.99756527, + "epoch": 0.8589809108672779, + "flos": 67135930007040.0, + "grad_norm": 0.7611850822519846, + "language_loss": 0.49459809, + "learning_rate": 2.0492516544898718e-07, + "loss": 0.51654196, + "num_input_tokens_seen": 308204875, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.18066406, + "step": 14287, + "time_per_iteration": 3.316173791885376 + }, + { + "auxiliary_loss_clip": 0.01402757, + "auxiliary_loss_mlp": 0.0103305, + "balance_loss_clip": 1.24327934, + "balance_loss_mlp": 1.01405978, + "epoch": 0.8590410341199459, + "flos": 29728343445120.0, + "grad_norm": 1.7418932216879512, + "language_loss": 0.7948184, + "learning_rate": 2.0475347091295704e-07, + "loss": 0.81917644, + "num_input_tokens_seen": 308225690, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18994141, + "step": 14288, + "time_per_iteration": 2.924221992492676 + }, + { + "auxiliary_loss_clip": 0.01412035, + "auxiliary_loss_mlp": 0.01033983, + "balance_loss_clip": 1.24963856, + "balance_loss_mlp": 1.01364625, + "epoch": 0.8591011573726138, + "flos": 23997881410560.0, + "grad_norm": 2.2172759602227097, + "language_loss": 0.81090349, + "learning_rate": 2.045818444528553e-07, + "loss": 0.83536369, + "num_input_tokens_seen": 308245255, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.20336914, + "step": 14289, + "time_per_iteration": 3.026127815246582 + }, + { + "auxiliary_loss_clip": 0.01409287, + "auxiliary_loss_mlp": 0.01030048, + "balance_loss_clip": 1.24926555, + "balance_loss_mlp": 1.01196361, + "epoch": 0.8591612806252819, + "flos": 14436756288000.0, + "grad_norm": 1.7828910826089455, + "language_loss": 0.66073084, + "learning_rate": 2.0441028607518973e-07, + "loss": 0.68512422, + "num_input_tokens_seen": 308261755, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.1809082, + "step": 14290, + "time_per_iteration": 4.217029094696045 + }, + { + "auxiliary_loss_clip": 0.01417874, + "auxiliary_loss_mlp": 0.01034121, + "balance_loss_clip": 1.25375485, + "balance_loss_mlp": 1.01528633, + "epoch": 0.8592214038779498, + "flos": 31589747539200.0, + "grad_norm": 2.0183768821284156, + "language_loss": 0.56388158, + "learning_rate": 2.0423879578646642e-07, + "loss": 0.58840156, + "num_input_tokens_seen": 308285145, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.18847656, + "step": 14291, + "time_per_iteration": 3.0566208362579346 + }, + { + "auxiliary_loss_clip": 0.01400958, + "auxiliary_loss_mlp": 0.01030489, + "balance_loss_clip": 1.23976088, + "balance_loss_mlp": 1.01197553, + "epoch": 0.8592815271306178, + "flos": 17466258119040.0, + "grad_norm": 4.364510761322985, + "language_loss": 0.72562099, + "learning_rate": 2.0406737359318792e-07, + "loss": 0.74993545, + "num_input_tokens_seen": 308304130, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.18505859, + "step": 14292, + "time_per_iteration": 4.281145811080933 + }, + { + "auxiliary_loss_clip": 0.01402922, + "auxiliary_loss_mlp": 0.01030915, + "balance_loss_clip": 1.24216497, + "balance_loss_mlp": 1.01191282, + "epoch": 0.8593416503832857, + "flos": 25422436529280.0, + "grad_norm": 1.7915578039424898, + "language_loss": 0.71511233, + "learning_rate": 2.038960195018542e-07, + "loss": 0.73945069, + "num_input_tokens_seen": 308324670, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19018555, + "step": 14293, + "time_per_iteration": 2.8909337520599365 + }, + { + "auxiliary_loss_clip": 0.01392125, + "auxiliary_loss_mlp": 0.01030476, + "balance_loss_clip": 1.23481822, + "balance_loss_mlp": 1.01187944, + "epoch": 0.8594017736359537, + "flos": 21006593452800.0, + "grad_norm": 2.837678306701247, + "language_loss": 0.69896412, + "learning_rate": 2.0372473351896358e-07, + "loss": 0.72319007, + "num_input_tokens_seen": 308344215, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18603516, + "step": 14294, + "time_per_iteration": 2.843010425567627 + }, + { + "auxiliary_loss_clip": 0.01384835, + "auxiliary_loss_mlp": 0.01029538, + "balance_loss_clip": 1.22878718, + "balance_loss_mlp": 1.01138282, + "epoch": 0.8594618968886216, + "flos": 22101249600000.0, + "grad_norm": 2.2177795598676826, + "language_loss": 0.78144336, + "learning_rate": 2.0355351565101087e-07, + "loss": 0.80558705, + "num_input_tokens_seen": 308360520, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.18151855, + "step": 14295, + "time_per_iteration": 2.8818867206573486 + }, + { + "auxiliary_loss_clip": 0.014188, + "auxiliary_loss_mlp": 0.01036197, + "balance_loss_clip": 1.2545625, + "balance_loss_mlp": 1.01624107, + "epoch": 0.8595220201412896, + "flos": 11663757083520.0, + "grad_norm": 2.902115513057223, + "language_loss": 0.7040236, + "learning_rate": 2.0338236590448975e-07, + "loss": 0.72857356, + "num_input_tokens_seen": 308376865, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19958496, + "step": 14296, + "time_per_iteration": 2.826753616333008 + }, + { + "auxiliary_loss_clip": 0.01400291, + "auxiliary_loss_mlp": 0.01031682, + "balance_loss_clip": 1.24019694, + "balance_loss_mlp": 1.01197636, + "epoch": 0.8595821433939577, + "flos": 25049753959680.0, + "grad_norm": 2.3653577850120517, + "language_loss": 0.79906833, + "learning_rate": 2.0321128428588842e-07, + "loss": 0.82338804, + "num_input_tokens_seen": 308395870, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19689941, + "step": 14297, + "time_per_iteration": 2.881380796432495 + }, + { + "auxiliary_loss_clip": 0.0138889, + "auxiliary_loss_mlp": 0.01027608, + "balance_loss_clip": 1.23186505, + "balance_loss_mlp": 1.00907135, + "epoch": 0.8596422666466256, + "flos": 28523117710080.0, + "grad_norm": 2.112773337721862, + "language_loss": 0.68921822, + "learning_rate": 2.030402708016954e-07, + "loss": 0.7133832, + "num_input_tokens_seen": 308417250, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.18530273, + "step": 14298, + "time_per_iteration": 2.941148281097412 + }, + { + "auxiliary_loss_clip": 0.0139802, + "auxiliary_loss_mlp": 0.01034867, + "balance_loss_clip": 1.24102485, + "balance_loss_mlp": 1.01662767, + "epoch": 0.8597023898992936, + "flos": 13596928709760.0, + "grad_norm": 2.133678940546827, + "language_loss": 0.69229031, + "learning_rate": 2.0286932545839576e-07, + "loss": 0.71661913, + "num_input_tokens_seen": 308434565, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18225098, + "step": 14299, + "time_per_iteration": 2.833583354949951 + }, + { + "auxiliary_loss_clip": 0.01398442, + "auxiliary_loss_mlp": 0.01035307, + "balance_loss_clip": 1.23746264, + "balance_loss_mlp": 1.01587558, + "epoch": 0.8597625131519615, + "flos": 32312037830400.0, + "grad_norm": 2.420628480155815, + "language_loss": 0.71733624, + "learning_rate": 2.0269844826247096e-07, + "loss": 0.74167371, + "num_input_tokens_seen": 308450040, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19421387, + "step": 14300, + "time_per_iteration": 2.90362286567688 + }, + { + "auxiliary_loss_clip": 0.01392077, + "auxiliary_loss_mlp": 0.01034411, + "balance_loss_clip": 1.23451662, + "balance_loss_mlp": 1.01533794, + "epoch": 0.8598226364046295, + "flos": 28741994645760.0, + "grad_norm": 1.80383236597738, + "language_loss": 0.70341808, + "learning_rate": 2.0252763922040116e-07, + "loss": 0.72768301, + "num_input_tokens_seen": 308470545, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.19067383, + "step": 14301, + "time_per_iteration": 2.895787477493286 + }, + { + "auxiliary_loss_clip": 0.01402297, + "auxiliary_loss_mlp": 0.01032354, + "balance_loss_clip": 1.24225235, + "balance_loss_mlp": 1.01338816, + "epoch": 0.8598827596572974, + "flos": 21881874971520.0, + "grad_norm": 1.7100516132598813, + "language_loss": 0.75470454, + "learning_rate": 2.023568983386641e-07, + "loss": 0.77905101, + "num_input_tokens_seen": 308490020, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.1895752, + "step": 14302, + "time_per_iteration": 2.8710737228393555 + }, + { + "auxiliary_loss_clip": 0.01386607, + "auxiliary_loss_mlp": 0.01028521, + "balance_loss_clip": 1.22995377, + "balance_loss_mlp": 1.01060343, + "epoch": 0.8599428829099655, + "flos": 23777375662080.0, + "grad_norm": 1.6880420334577724, + "language_loss": 0.8446005, + "learning_rate": 2.02186225623733e-07, + "loss": 0.86875176, + "num_input_tokens_seen": 308509065, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.17919922, + "step": 14303, + "time_per_iteration": 2.87199068069458 + }, + { + "auxiliary_loss_clip": 0.01413239, + "auxiliary_loss_mlp": 0.01033842, + "balance_loss_clip": 1.25070405, + "balance_loss_mlp": 1.01529348, + "epoch": 0.8600030061626334, + "flos": 16220375291520.0, + "grad_norm": 2.1852933972665722, + "language_loss": 0.77856988, + "learning_rate": 2.0201562108208025e-07, + "loss": 0.80304068, + "num_input_tokens_seen": 308524725, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18530273, + "step": 14304, + "time_per_iteration": 2.802100896835327 + }, + { + "auxiliary_loss_clip": 0.01400741, + "auxiliary_loss_mlp": 0.01030872, + "balance_loss_clip": 1.23985493, + "balance_loss_mlp": 1.01121449, + "epoch": 0.8600631294153014, + "flos": 15677843166720.0, + "grad_norm": 2.0471899827067648, + "language_loss": 0.54987496, + "learning_rate": 2.0184508472017537e-07, + "loss": 0.57419109, + "num_input_tokens_seen": 308543525, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.1965332, + "step": 14305, + "time_per_iteration": 2.820127248764038 + }, + { + "auxiliary_loss_clip": 0.01394792, + "auxiliary_loss_mlp": 0.0102995, + "balance_loss_clip": 1.23589492, + "balance_loss_mlp": 1.0115205, + "epoch": 0.8601232526679693, + "flos": 17501893038720.0, + "grad_norm": 1.814234228639476, + "language_loss": 0.84157789, + "learning_rate": 2.0167461654448558e-07, + "loss": 0.86582536, + "num_input_tokens_seen": 308557995, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18432617, + "step": 14306, + "time_per_iteration": 2.8018598556518555 + }, + { + "auxiliary_loss_clip": 0.01387214, + "auxiliary_loss_mlp": 0.01028894, + "balance_loss_clip": 1.23098254, + "balance_loss_mlp": 1.01070237, + "epoch": 0.8601833759206373, + "flos": 26998897000320.0, + "grad_norm": 1.3140799746086815, + "language_loss": 0.72150755, + "learning_rate": 2.01504216561474e-07, + "loss": 0.74566865, + "num_input_tokens_seen": 308582750, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.18200684, + "step": 14307, + "time_per_iteration": 2.9462833404541016 + }, + { + "auxiliary_loss_clip": 0.01408242, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.24514282, + "balance_loss_mlp": 1.01369083, + "epoch": 0.8602434991733052, + "flos": 25241004305280.0, + "grad_norm": 1.5543447258844592, + "language_loss": 0.64511454, + "learning_rate": 2.0133388477760316e-07, + "loss": 0.66953349, + "num_input_tokens_seen": 308603770, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19970703, + "step": 14308, + "time_per_iteration": 2.8774564266204834 + }, + { + "auxiliary_loss_clip": 0.01179868, + "auxiliary_loss_mlp": 0.01022533, + "balance_loss_clip": 1.08921337, + "balance_loss_mlp": 1.00107503, + "epoch": 0.8603036224259732, + "flos": 71046911894400.0, + "grad_norm": 0.6268461084852628, + "language_loss": 0.48541331, + "learning_rate": 2.0116362119933172e-07, + "loss": 0.50743735, + "num_input_tokens_seen": 308667735, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.21484375, + "step": 14309, + "time_per_iteration": 3.424689292907715 + }, + { + "auxiliary_loss_clip": 0.01410791, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.24784875, + "balance_loss_mlp": 1.01385856, + "epoch": 0.8603637456786413, + "flos": 20309667511680.0, + "grad_norm": 2.2869293043614047, + "language_loss": 0.6732862, + "learning_rate": 2.0099342583311563e-07, + "loss": 0.69771552, + "num_input_tokens_seen": 308686300, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.18286133, + "step": 14310, + "time_per_iteration": 2.898127794265747 + }, + { + "auxiliary_loss_clip": 0.01399207, + "auxiliary_loss_mlp": 0.0103008, + "balance_loss_clip": 1.23920631, + "balance_loss_mlp": 1.01210332, + "epoch": 0.8604238689313092, + "flos": 21846013827840.0, + "grad_norm": 1.6342848116136621, + "language_loss": 0.78629059, + "learning_rate": 2.0082329868540905e-07, + "loss": 0.81058347, + "num_input_tokens_seen": 308705825, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.17980957, + "step": 14311, + "time_per_iteration": 2.8879222869873047 + }, + { + "auxiliary_loss_clip": 0.0139767, + "auxiliary_loss_mlp": 0.01030215, + "balance_loss_clip": 1.2394557, + "balance_loss_mlp": 1.01184487, + "epoch": 0.8604839921839772, + "flos": 18013224234240.0, + "grad_norm": 1.8397629791121286, + "language_loss": 0.72065175, + "learning_rate": 2.006532397626639e-07, + "loss": 0.74493062, + "num_input_tokens_seen": 308723340, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18371582, + "step": 14312, + "time_per_iteration": 2.87563157081604 + }, + { + "auxiliary_loss_clip": 0.01397798, + "auxiliary_loss_mlp": 0.01033381, + "balance_loss_clip": 1.23809409, + "balance_loss_mlp": 1.01491535, + "epoch": 0.8605441154366451, + "flos": 16260534691200.0, + "grad_norm": 2.8628229202226696, + "language_loss": 0.78489214, + "learning_rate": 2.0048324907132797e-07, + "loss": 0.80920392, + "num_input_tokens_seen": 308741280, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18444824, + "step": 14313, + "time_per_iteration": 2.837592363357544 + }, + { + "auxiliary_loss_clip": 0.01390824, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_clip": 1.23442793, + "balance_loss_mlp": 1.01355267, + "epoch": 0.8606042386893131, + "flos": 32278981864320.0, + "grad_norm": 1.5122201915452007, + "language_loss": 0.73433268, + "learning_rate": 2.003133266178474e-07, + "loss": 0.7585746, + "num_input_tokens_seen": 308762875, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.19799805, + "step": 14314, + "time_per_iteration": 2.9720890522003174 + }, + { + "auxiliary_loss_clip": 0.01390237, + "auxiliary_loss_mlp": 0.01031816, + "balance_loss_clip": 1.23144484, + "balance_loss_mlp": 1.01324284, + "epoch": 0.860664361941981, + "flos": 20239347813120.0, + "grad_norm": 2.2487109886891314, + "language_loss": 0.69745255, + "learning_rate": 2.001434724086657e-07, + "loss": 0.72167301, + "num_input_tokens_seen": 308780315, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18579102, + "step": 14315, + "time_per_iteration": 2.859649658203125 + }, + { + "auxiliary_loss_clip": 0.01406865, + "auxiliary_loss_mlp": 0.01031728, + "balance_loss_clip": 1.247172, + "balance_loss_mlp": 1.01356006, + "epoch": 0.8607244851946491, + "flos": 25202428473600.0, + "grad_norm": 1.5815990301923042, + "language_loss": 0.7270028, + "learning_rate": 1.9997368645022418e-07, + "loss": 0.75138867, + "num_input_tokens_seen": 308799435, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18164062, + "step": 14316, + "time_per_iteration": 2.936397075653076 + }, + { + "auxiliary_loss_clip": 0.01410958, + "auxiliary_loss_mlp": 0.01033302, + "balance_loss_clip": 1.24909651, + "balance_loss_mlp": 1.01435971, + "epoch": 0.860784608447317, + "flos": 20490918756480.0, + "grad_norm": 1.7010269583597377, + "language_loss": 0.83747292, + "learning_rate": 1.9980396874896056e-07, + "loss": 0.86191559, + "num_input_tokens_seen": 308817730, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18933105, + "step": 14317, + "time_per_iteration": 4.301350831985474 + }, + { + "auxiliary_loss_clip": 0.01388748, + "auxiliary_loss_mlp": 0.01030557, + "balance_loss_clip": 1.23236263, + "balance_loss_mlp": 1.01153111, + "epoch": 0.860844731699985, + "flos": 50493347072640.0, + "grad_norm": 2.289593956792784, + "language_loss": 0.67217344, + "learning_rate": 1.996343193113108e-07, + "loss": 0.69636649, + "num_input_tokens_seen": 308841735, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.19030762, + "step": 14318, + "time_per_iteration": 3.188929319381714 + }, + { + "auxiliary_loss_clip": 0.01386647, + "auxiliary_loss_mlp": 0.01032492, + "balance_loss_clip": 1.23109794, + "balance_loss_mlp": 1.01446795, + "epoch": 0.8609048549526529, + "flos": 41187819680640.0, + "grad_norm": 1.5262240236159628, + "language_loss": 0.72127414, + "learning_rate": 1.9946473814370911e-07, + "loss": 0.74546558, + "num_input_tokens_seen": 308865050, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.18029785, + "step": 14319, + "time_per_iteration": 3.112804651260376 + }, + { + "auxiliary_loss_clip": 0.01407884, + "auxiliary_loss_mlp": 0.01032741, + "balance_loss_clip": 1.24632382, + "balance_loss_mlp": 1.01384664, + "epoch": 0.8609649782053209, + "flos": 23961703553280.0, + "grad_norm": 1.8725884325476636, + "language_loss": 0.68218064, + "learning_rate": 1.992952252525839e-07, + "loss": 0.70658684, + "num_input_tokens_seen": 308885375, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18896484, + "step": 14320, + "time_per_iteration": 4.301659822463989 + }, + { + "auxiliary_loss_clip": 0.01406816, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.24425292, + "balance_loss_mlp": 1.01412487, + "epoch": 0.8610251014579888, + "flos": 23123007095040.0, + "grad_norm": 2.1930362057304627, + "language_loss": 0.80476248, + "learning_rate": 1.9912578064436446e-07, + "loss": 0.82916635, + "num_input_tokens_seen": 308904700, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19433594, + "step": 14321, + "time_per_iteration": 2.880366802215576 + }, + { + "auxiliary_loss_clip": 0.01377017, + "auxiliary_loss_mlp": 0.01033738, + "balance_loss_clip": 1.22339416, + "balance_loss_mlp": 1.01412809, + "epoch": 0.8610852247106568, + "flos": 19436105295360.0, + "grad_norm": 1.8939193009911237, + "language_loss": 0.71709406, + "learning_rate": 1.9895640432547567e-07, + "loss": 0.74120164, + "num_input_tokens_seen": 308922985, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.19616699, + "step": 14322, + "time_per_iteration": 2.8419291973114014 + }, + { + "auxiliary_loss_clip": 0.01423373, + "auxiliary_loss_mlp": 0.01037605, + "balance_loss_clip": 1.25896311, + "balance_loss_mlp": 1.01871085, + "epoch": 0.8611453479633249, + "flos": 19319970597120.0, + "grad_norm": 1.9777990072852365, + "language_loss": 0.57049304, + "learning_rate": 1.9878709630234102e-07, + "loss": 0.59510279, + "num_input_tokens_seen": 308940765, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.18908691, + "step": 14323, + "time_per_iteration": 2.8401448726654053 + }, + { + "auxiliary_loss_clip": 0.01397038, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.23940861, + "balance_loss_mlp": 1.01341486, + "epoch": 0.8612054712159928, + "flos": 23262877330560.0, + "grad_norm": 3.0182283050245347, + "language_loss": 0.76412952, + "learning_rate": 1.986178565813801e-07, + "loss": 0.78842366, + "num_input_tokens_seen": 308960110, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.1895752, + "step": 14324, + "time_per_iteration": 2.865325689315796 + }, + { + "auxiliary_loss_clip": 0.01395902, + "auxiliary_loss_mlp": 0.01033638, + "balance_loss_clip": 1.23620892, + "balance_loss_mlp": 1.01307487, + "epoch": 0.8612655944686608, + "flos": 16035504462720.0, + "grad_norm": 2.122615404936511, + "language_loss": 0.67089617, + "learning_rate": 1.9844868516901036e-07, + "loss": 0.69519162, + "num_input_tokens_seen": 308976665, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.20568848, + "step": 14325, + "time_per_iteration": 4.218362808227539 + }, + { + "auxiliary_loss_clip": 0.01402373, + "auxiliary_loss_mlp": 0.01035648, + "balance_loss_clip": 1.24116218, + "balance_loss_mlp": 1.01603794, + "epoch": 0.8613257177213287, + "flos": 22503097082880.0, + "grad_norm": 1.6769150826950707, + "language_loss": 0.65896475, + "learning_rate": 1.982795820716472e-07, + "loss": 0.68334496, + "num_input_tokens_seen": 308997015, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19604492, + "step": 14326, + "time_per_iteration": 2.866687297821045 + }, + { + "auxiliary_loss_clip": 0.01415598, + "auxiliary_loss_mlp": 0.0103206, + "balance_loss_clip": 1.2536217, + "balance_loss_mlp": 1.01372612, + "epoch": 0.8613858409739967, + "flos": 17246883490560.0, + "grad_norm": 1.9744949956016458, + "language_loss": 0.8559382, + "learning_rate": 1.9811054729570253e-07, + "loss": 0.88041472, + "num_input_tokens_seen": 309015250, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.18322754, + "step": 14327, + "time_per_iteration": 4.171824932098389 + }, + { + "auxiliary_loss_clip": 0.01397072, + "auxiliary_loss_mlp": 0.01033174, + "balance_loss_clip": 1.23874259, + "balance_loss_mlp": 1.01412451, + "epoch": 0.8614459642266646, + "flos": 22831593465600.0, + "grad_norm": 2.7403495534756823, + "language_loss": 0.75486982, + "learning_rate": 1.9794158084758661e-07, + "loss": 0.77917224, + "num_input_tokens_seen": 309034140, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.19055176, + "step": 14328, + "time_per_iteration": 2.946436643600464 + }, + { + "auxiliary_loss_clip": 0.01394667, + "auxiliary_loss_mlp": 0.01029795, + "balance_loss_clip": 1.23704207, + "balance_loss_mlp": 1.01240277, + "epoch": 0.8615060874793327, + "flos": 26515237639680.0, + "grad_norm": 1.6536907637625073, + "language_loss": 0.80182695, + "learning_rate": 1.9777268273370673e-07, + "loss": 0.82607162, + "num_input_tokens_seen": 309055075, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.17407227, + "step": 14329, + "time_per_iteration": 2.89574933052063 + }, + { + "auxiliary_loss_clip": 0.01404794, + "auxiliary_loss_mlp": 0.0103537, + "balance_loss_clip": 1.24494195, + "balance_loss_mlp": 1.01646376, + "epoch": 0.8615662107320006, + "flos": 24071639713920.0, + "grad_norm": 2.196772898633525, + "language_loss": 0.78323293, + "learning_rate": 1.9760385296046757e-07, + "loss": 0.80763453, + "num_input_tokens_seen": 309074650, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18908691, + "step": 14330, + "time_per_iteration": 2.873300313949585 + }, + { + "auxiliary_loss_clip": 0.01383499, + "auxiliary_loss_mlp": 0.01033786, + "balance_loss_clip": 1.22617674, + "balance_loss_mlp": 1.01466441, + "epoch": 0.8616263339846686, + "flos": 24174291461760.0, + "grad_norm": 1.883580267607281, + "language_loss": 0.65503335, + "learning_rate": 1.974350915342702e-07, + "loss": 0.67920619, + "num_input_tokens_seen": 309094385, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.19116211, + "step": 14331, + "time_per_iteration": 2.893131971359253 + }, + { + "auxiliary_loss_clip": 0.0138774, + "auxiliary_loss_mlp": 0.01031624, + "balance_loss_clip": 1.23204482, + "balance_loss_mlp": 1.01442206, + "epoch": 0.8616864572373365, + "flos": 21733951161600.0, + "grad_norm": 1.6160889310737399, + "language_loss": 0.76592016, + "learning_rate": 1.9726639846151506e-07, + "loss": 0.79011381, + "num_input_tokens_seen": 309111815, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.17199707, + "step": 14332, + "time_per_iteration": 2.861217737197876 + }, + { + "auxiliary_loss_clip": 0.01401555, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.24008632, + "balance_loss_mlp": 1.01374435, + "epoch": 0.8617465804900045, + "flos": 23776561255680.0, + "grad_norm": 1.857838282623686, + "language_loss": 0.67591214, + "learning_rate": 1.9709777374859904e-07, + "loss": 0.70025802, + "num_input_tokens_seen": 309131385, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19287109, + "step": 14333, + "time_per_iteration": 2.9158599376678467 + }, + { + "auxiliary_loss_clip": 0.01424598, + "auxiliary_loss_mlp": 0.0103595, + "balance_loss_clip": 1.25860572, + "balance_loss_mlp": 1.01569617, + "epoch": 0.8618067037426724, + "flos": 37717396842240.0, + "grad_norm": 2.217732669756166, + "language_loss": 0.63312435, + "learning_rate": 1.969292174019157e-07, + "loss": 0.6577298, + "num_input_tokens_seen": 309155020, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.20263672, + "step": 14334, + "time_per_iteration": 3.045092821121216 + }, + { + "auxiliary_loss_clip": 0.01417372, + "auxiliary_loss_mlp": 0.01033799, + "balance_loss_clip": 1.25387144, + "balance_loss_mlp": 1.01474977, + "epoch": 0.8618668269953405, + "flos": 21481113363840.0, + "grad_norm": 1.831423798351321, + "language_loss": 0.69961691, + "learning_rate": 1.967607294278577e-07, + "loss": 0.72412872, + "num_input_tokens_seen": 309172865, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19067383, + "step": 14335, + "time_per_iteration": 2.8941683769226074 + }, + { + "auxiliary_loss_clip": 0.0140698, + "auxiliary_loss_mlp": 0.01034315, + "balance_loss_clip": 1.24749529, + "balance_loss_mlp": 1.01595664, + "epoch": 0.8619269502480085, + "flos": 22241798507520.0, + "grad_norm": 1.6141494851472251, + "language_loss": 0.83350885, + "learning_rate": 1.965923098328135e-07, + "loss": 0.85792184, + "num_input_tokens_seen": 309193575, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18359375, + "step": 14336, + "time_per_iteration": 2.9764790534973145 + }, + { + "auxiliary_loss_clip": 0.01417694, + "auxiliary_loss_mlp": 0.01032689, + "balance_loss_clip": 1.25277948, + "balance_loss_mlp": 1.01331782, + "epoch": 0.8619870735006764, + "flos": 22720797653760.0, + "grad_norm": 1.8888517797854774, + "language_loss": 0.6839658, + "learning_rate": 1.9642395862316907e-07, + "loss": 0.70846963, + "num_input_tokens_seen": 309212680, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19372559, + "step": 14337, + "time_per_iteration": 2.8869025707244873 + }, + { + "auxiliary_loss_clip": 0.013966, + "auxiliary_loss_mlp": 0.01028889, + "balance_loss_clip": 1.23788679, + "balance_loss_mlp": 1.0098871, + "epoch": 0.8620471967533444, + "flos": 37533385664640.0, + "grad_norm": 1.6914932000710234, + "language_loss": 0.6771695, + "learning_rate": 1.962556758053089e-07, + "loss": 0.70142442, + "num_input_tokens_seen": 309234485, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.19006348, + "step": 14338, + "time_per_iteration": 3.0290868282318115 + }, + { + "auxiliary_loss_clip": 0.01404187, + "auxiliary_loss_mlp": 0.0103452, + "balance_loss_clip": 1.24422264, + "balance_loss_mlp": 1.01682973, + "epoch": 0.8621073200060123, + "flos": 19691884005120.0, + "grad_norm": 4.610401707538982, + "language_loss": 0.62758565, + "learning_rate": 1.9608746138561448e-07, + "loss": 0.65197277, + "num_input_tokens_seen": 309253630, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.17687988, + "step": 14339, + "time_per_iteration": 2.9000966548919678 + }, + { + "auxiliary_loss_clip": 0.01388594, + "auxiliary_loss_mlp": 0.01031795, + "balance_loss_clip": 1.23136997, + "balance_loss_mlp": 1.01324558, + "epoch": 0.8621674432586803, + "flos": 14544656432640.0, + "grad_norm": 1.9494598019263136, + "language_loss": 0.63468206, + "learning_rate": 1.9591931537046458e-07, + "loss": 0.6588859, + "num_input_tokens_seen": 309270950, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.18554688, + "step": 14340, + "time_per_iteration": 2.7810592651367188 + }, + { + "auxiliary_loss_clip": 0.01379404, + "auxiliary_loss_mlp": 0.01028717, + "balance_loss_clip": 1.22673643, + "balance_loss_mlp": 1.01032305, + "epoch": 0.8622275665113482, + "flos": 20749683623040.0, + "grad_norm": 1.5873708709124237, + "language_loss": 0.80581897, + "learning_rate": 1.9575123776623493e-07, + "loss": 0.82990021, + "num_input_tokens_seen": 309288780, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.18395996, + "step": 14341, + "time_per_iteration": 2.8484578132629395 + }, + { + "auxiliary_loss_clip": 0.01388815, + "auxiliary_loss_mlp": 0.01033655, + "balance_loss_clip": 1.23102522, + "balance_loss_mlp": 1.01532054, + "epoch": 0.8622876897640163, + "flos": 24726234504960.0, + "grad_norm": 1.6942298347708242, + "language_loss": 0.75036353, + "learning_rate": 1.9558322857929887e-07, + "loss": 0.77458823, + "num_input_tokens_seen": 309310875, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18347168, + "step": 14342, + "time_per_iteration": 2.90468168258667 + }, + { + "auxiliary_loss_clip": 0.0141023, + "auxiliary_loss_mlp": 0.01034052, + "balance_loss_clip": 1.24785018, + "balance_loss_mlp": 1.01445365, + "epoch": 0.8623478130166842, + "flos": 17466348608640.0, + "grad_norm": 1.8533929931688453, + "language_loss": 0.69994974, + "learning_rate": 1.95415287816028e-07, + "loss": 0.72439259, + "num_input_tokens_seen": 309329900, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19616699, + "step": 14343, + "time_per_iteration": 2.829237222671509 + }, + { + "auxiliary_loss_clip": 0.01390521, + "auxiliary_loss_mlp": 0.01038507, + "balance_loss_clip": 1.23103118, + "balance_loss_mlp": 1.0192306, + "epoch": 0.8624079362693522, + "flos": 18117052346880.0, + "grad_norm": 1.723900139750099, + "language_loss": 0.68714195, + "learning_rate": 1.9524741548278967e-07, + "loss": 0.71143222, + "num_input_tokens_seen": 309347870, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.19274902, + "step": 14344, + "time_per_iteration": 2.816903829574585 + }, + { + "auxiliary_loss_clip": 0.01410619, + "auxiliary_loss_mlp": 0.01033217, + "balance_loss_clip": 1.24810743, + "balance_loss_mlp": 1.0142746, + "epoch": 0.8624680595220201, + "flos": 30679193059200.0, + "grad_norm": 1.433944646967897, + "language_loss": 0.81736368, + "learning_rate": 1.9507961158595054e-07, + "loss": 0.841802, + "num_input_tokens_seen": 309371695, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.18945312, + "step": 14345, + "time_per_iteration": 2.9425466060638428 + }, + { + "auxiliary_loss_clip": 0.01406344, + "auxiliary_loss_mlp": 0.01032813, + "balance_loss_clip": 1.24605656, + "balance_loss_mlp": 1.01326251, + "epoch": 0.8625281827746881, + "flos": 38012837258880.0, + "grad_norm": 2.0726354337847956, + "language_loss": 0.523498, + "learning_rate": 1.9491187613187355e-07, + "loss": 0.54788953, + "num_input_tokens_seen": 309394645, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.19555664, + "step": 14346, + "time_per_iteration": 3.0011844635009766 + }, + { + "auxiliary_loss_clip": 0.01385266, + "auxiliary_loss_mlp": 0.01030876, + "balance_loss_clip": 1.2264843, + "balance_loss_mlp": 1.01218367, + "epoch": 0.862588306027356, + "flos": 26260816273920.0, + "grad_norm": 1.4890589733900672, + "language_loss": 0.75587356, + "learning_rate": 1.9474420912691913e-07, + "loss": 0.78003496, + "num_input_tokens_seen": 309413170, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18701172, + "step": 14347, + "time_per_iteration": 2.8641397953033447 + }, + { + "auxiliary_loss_clip": 0.01408765, + "auxiliary_loss_mlp": 0.01034425, + "balance_loss_clip": 1.24811864, + "balance_loss_mlp": 1.01545858, + "epoch": 0.862648429280024, + "flos": 25888224193920.0, + "grad_norm": 1.8465364940068945, + "language_loss": 0.81481373, + "learning_rate": 1.945766105774449e-07, + "loss": 0.83924568, + "num_input_tokens_seen": 309431315, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18969727, + "step": 14348, + "time_per_iteration": 2.900980234146118 + }, + { + "auxiliary_loss_clip": 0.01382168, + "auxiliary_loss_mlp": 0.01031417, + "balance_loss_clip": 1.22804785, + "balance_loss_mlp": 1.0129869, + "epoch": 0.862708552532692, + "flos": 37830681118080.0, + "grad_norm": 1.5781910721507775, + "language_loss": 0.66392905, + "learning_rate": 1.9440908048980665e-07, + "loss": 0.68806493, + "num_input_tokens_seen": 309453020, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.18408203, + "step": 14349, + "time_per_iteration": 3.0167076587677 + }, + { + "auxiliary_loss_clip": 0.0138368, + "auxiliary_loss_mlp": 0.01032562, + "balance_loss_clip": 1.22586977, + "balance_loss_mlp": 1.01359582, + "epoch": 0.86276867578536, + "flos": 19099283869440.0, + "grad_norm": 3.8162422637771556, + "language_loss": 0.71093345, + "learning_rate": 1.942416188703573e-07, + "loss": 0.73509586, + "num_input_tokens_seen": 309469780, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18981934, + "step": 14350, + "time_per_iteration": 2.887859582901001 + }, + { + "auxiliary_loss_clip": 0.01399649, + "auxiliary_loss_mlp": 0.01031478, + "balance_loss_clip": 1.23842621, + "balance_loss_mlp": 1.01214278, + "epoch": 0.862828799038028, + "flos": 22174555455360.0, + "grad_norm": 1.776895640338648, + "language_loss": 0.78173733, + "learning_rate": 1.9407422572544618e-07, + "loss": 0.80604863, + "num_input_tokens_seen": 309489610, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.19348145, + "step": 14351, + "time_per_iteration": 2.8710227012634277 + }, + { + "auxiliary_loss_clip": 0.01403082, + "auxiliary_loss_mlp": 0.01033512, + "balance_loss_clip": 1.24430192, + "balance_loss_mlp": 1.01484406, + "epoch": 0.8628889222906959, + "flos": 23155112920320.0, + "grad_norm": 1.8814113944041224, + "language_loss": 0.85484183, + "learning_rate": 1.9390690106142204e-07, + "loss": 0.87920773, + "num_input_tokens_seen": 309508295, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18676758, + "step": 14352, + "time_per_iteration": 4.2580389976501465 + }, + { + "auxiliary_loss_clip": 0.01181988, + "auxiliary_loss_mlp": 0.01023731, + "balance_loss_clip": 1.09131169, + "balance_loss_mlp": 1.00360858, + "epoch": 0.8629490455433639, + "flos": 57848545779840.0, + "grad_norm": 0.7974411927820524, + "language_loss": 0.62002707, + "learning_rate": 1.9373964488462913e-07, + "loss": 0.64208424, + "num_input_tokens_seen": 309567960, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.20117188, + "step": 14353, + "time_per_iteration": 3.3525543212890625 + }, + { + "auxiliary_loss_clip": 0.01390304, + "auxiliary_loss_mlp": 0.01031958, + "balance_loss_clip": 1.23319912, + "balance_loss_mlp": 1.01450586, + "epoch": 0.8630091687960318, + "flos": 15926880401280.0, + "grad_norm": 5.904721604398567, + "language_loss": 0.8244732, + "learning_rate": 1.9357245720140948e-07, + "loss": 0.84869587, + "num_input_tokens_seen": 309586050, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.17431641, + "step": 14354, + "time_per_iteration": 2.8387889862060547 + }, + { + "auxiliary_loss_clip": 0.01399892, + "auxiliary_loss_mlp": 0.01033619, + "balance_loss_clip": 1.24082971, + "balance_loss_mlp": 1.01410484, + "epoch": 0.8630692920486999, + "flos": 17969128536960.0, + "grad_norm": 1.801015706354795, + "language_loss": 0.86698776, + "learning_rate": 1.934053380181031e-07, + "loss": 0.89132285, + "num_input_tokens_seen": 309602910, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.19494629, + "step": 14355, + "time_per_iteration": 4.28113865852356 + }, + { + "auxiliary_loss_clip": 0.01406151, + "auxiliary_loss_mlp": 0.01034503, + "balance_loss_clip": 1.24563837, + "balance_loss_mlp": 1.01484561, + "epoch": 0.8631294153013678, + "flos": 22465245168000.0, + "grad_norm": 1.9478915540883277, + "language_loss": 0.59255135, + "learning_rate": 1.9323828734104763e-07, + "loss": 0.61695784, + "num_input_tokens_seen": 309621175, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.1965332, + "step": 14356, + "time_per_iteration": 2.840611696243286 + }, + { + "auxiliary_loss_clip": 0.01417951, + "auxiliary_loss_mlp": 0.01036328, + "balance_loss_clip": 1.25342536, + "balance_loss_mlp": 1.01718259, + "epoch": 0.8631895385540358, + "flos": 16845805169280.0, + "grad_norm": 1.5661686944474218, + "language_loss": 0.77797973, + "learning_rate": 1.9307130517657756e-07, + "loss": 0.80252254, + "num_input_tokens_seen": 309639395, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.19140625, + "step": 14357, + "time_per_iteration": 2.8416881561279297 + }, + { + "auxiliary_loss_clip": 0.01407243, + "auxiliary_loss_mlp": 0.01036185, + "balance_loss_clip": 1.24659455, + "balance_loss_mlp": 1.01714706, + "epoch": 0.8632496618067037, + "flos": 18706349612160.0, + "grad_norm": 2.612709166259756, + "language_loss": 0.78403986, + "learning_rate": 1.9290439153102468e-07, + "loss": 0.80847412, + "num_input_tokens_seen": 309657265, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19042969, + "step": 14358, + "time_per_iteration": 2.850579261779785 + }, + { + "auxiliary_loss_clip": 0.01403254, + "auxiliary_loss_mlp": 0.01032583, + "balance_loss_clip": 1.24207616, + "balance_loss_mlp": 1.0141654, + "epoch": 0.8633097850593717, + "flos": 24290607139200.0, + "grad_norm": 1.280606414428361, + "language_loss": 0.75257456, + "learning_rate": 1.9273754641071816e-07, + "loss": 0.7769329, + "num_input_tokens_seen": 309678610, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18432617, + "step": 14359, + "time_per_iteration": 2.9248881340026855 + }, + { + "auxiliary_loss_clip": 0.01396679, + "auxiliary_loss_mlp": 0.01031776, + "balance_loss_clip": 1.24051929, + "balance_loss_mlp": 1.0121665, + "epoch": 0.8633699083120396, + "flos": 21188432880000.0, + "grad_norm": 2.0214851925395694, + "language_loss": 0.71400326, + "learning_rate": 1.9257076982198517e-07, + "loss": 0.73828781, + "num_input_tokens_seen": 309697710, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.19592285, + "step": 14360, + "time_per_iteration": 4.295778512954712 + }, + { + "auxiliary_loss_clip": 0.01415162, + "auxiliary_loss_mlp": 0.01033744, + "balance_loss_clip": 1.25220621, + "balance_loss_mlp": 1.01456308, + "epoch": 0.8634300315647077, + "flos": 19254265868160.0, + "grad_norm": 2.3498925005756894, + "language_loss": 0.76511878, + "learning_rate": 1.9240406177114953e-07, + "loss": 0.78960782, + "num_input_tokens_seen": 309715985, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19189453, + "step": 14361, + "time_per_iteration": 2.8387696743011475 + }, + { + "auxiliary_loss_clip": 0.01181804, + "auxiliary_loss_mlp": 0.01020199, + "balance_loss_clip": 1.09248543, + "balance_loss_mlp": 0.99969542, + "epoch": 0.8634901548173756, + "flos": 66225013568640.0, + "grad_norm": 0.9525451811913462, + "language_loss": 0.5887835, + "learning_rate": 1.922374222645329e-07, + "loss": 0.61080354, + "num_input_tokens_seen": 309779930, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.20507812, + "step": 14362, + "time_per_iteration": 4.858181476593018 + }, + { + "auxiliary_loss_clip": 0.01413422, + "auxiliary_loss_mlp": 0.01032457, + "balance_loss_clip": 1.24947751, + "balance_loss_mlp": 1.01293027, + "epoch": 0.8635502780700436, + "flos": 24800038053120.0, + "grad_norm": 1.6546876119293759, + "language_loss": 0.8133502, + "learning_rate": 1.9207085130845524e-07, + "loss": 0.83780903, + "num_input_tokens_seen": 309800580, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.1953125, + "step": 14363, + "time_per_iteration": 2.8830459117889404 + }, + { + "auxiliary_loss_clip": 0.01397963, + "auxiliary_loss_mlp": 0.01033356, + "balance_loss_clip": 1.23694873, + "balance_loss_mlp": 1.01398468, + "epoch": 0.8636104013227116, + "flos": 25200347212800.0, + "grad_norm": 2.3580334518710684, + "language_loss": 0.73702121, + "learning_rate": 1.9190434890923112e-07, + "loss": 0.76133442, + "num_input_tokens_seen": 309821725, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19360352, + "step": 14364, + "time_per_iteration": 2.8994593620300293 + }, + { + "auxiliary_loss_clip": 0.01398697, + "auxiliary_loss_mlp": 0.01030239, + "balance_loss_clip": 1.23731744, + "balance_loss_mlp": 1.01225019, + "epoch": 0.8636705245753795, + "flos": 23888985880320.0, + "grad_norm": 1.6612991342669692, + "language_loss": 0.72517765, + "learning_rate": 1.917379150731755e-07, + "loss": 0.74946696, + "num_input_tokens_seen": 309841565, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18005371, + "step": 14365, + "time_per_iteration": 2.906771659851074 + }, + { + "auxiliary_loss_clip": 0.01408044, + "auxiliary_loss_mlp": 0.01033292, + "balance_loss_clip": 1.24490643, + "balance_loss_mlp": 1.01407528, + "epoch": 0.8637306478280475, + "flos": 23120337651840.0, + "grad_norm": 2.000355342987979, + "language_loss": 0.71363658, + "learning_rate": 1.915715498065993e-07, + "loss": 0.73804992, + "num_input_tokens_seen": 309858635, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19213867, + "step": 14366, + "time_per_iteration": 2.867645025253296 + }, + { + "auxiliary_loss_clip": 0.01391412, + "auxiliary_loss_mlp": 0.01031372, + "balance_loss_clip": 1.23472631, + "balance_loss_mlp": 1.01320481, + "epoch": 0.8637907710807154, + "flos": 21916469260800.0, + "grad_norm": 1.638527109448551, + "language_loss": 0.82330406, + "learning_rate": 1.9140525311581146e-07, + "loss": 0.84753191, + "num_input_tokens_seen": 309877885, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.1817627, + "step": 14367, + "time_per_iteration": 2.8661606311798096 + }, + { + "auxiliary_loss_clip": 0.01403686, + "auxiliary_loss_mlp": 0.01030895, + "balance_loss_clip": 1.24268639, + "balance_loss_mlp": 1.0111897, + "epoch": 0.8638508943333835, + "flos": 23589428186880.0, + "grad_norm": 2.013782625565588, + "language_loss": 0.62249374, + "learning_rate": 1.9123902500711743e-07, + "loss": 0.64683956, + "num_input_tokens_seen": 309893140, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.19714355, + "step": 14368, + "time_per_iteration": 2.913501739501953 + }, + { + "auxiliary_loss_clip": 0.01410032, + "auxiliary_loss_mlp": 0.01030469, + "balance_loss_clip": 1.25097609, + "balance_loss_mlp": 1.01238489, + "epoch": 0.8639110175860514, + "flos": 25786929790080.0, + "grad_norm": 1.9450057806077643, + "language_loss": 0.76938939, + "learning_rate": 1.91072865486821e-07, + "loss": 0.79379445, + "num_input_tokens_seen": 309914175, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18078613, + "step": 14369, + "time_per_iteration": 2.900883436203003 + }, + { + "auxiliary_loss_clip": 0.01403544, + "auxiliary_loss_mlp": 0.0103379, + "balance_loss_clip": 1.24090004, + "balance_loss_mlp": 1.01515722, + "epoch": 0.8639711408387194, + "flos": 23380324128000.0, + "grad_norm": 2.21470983329804, + "language_loss": 0.64885759, + "learning_rate": 1.9090677456122294e-07, + "loss": 0.67323101, + "num_input_tokens_seen": 309932395, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.18640137, + "step": 14370, + "time_per_iteration": 2.9256601333618164 + }, + { + "auxiliary_loss_clip": 0.01401723, + "auxiliary_loss_mlp": 0.01033119, + "balance_loss_clip": 1.24236, + "balance_loss_mlp": 1.01355696, + "epoch": 0.8640312640913873, + "flos": 22137336967680.0, + "grad_norm": 1.5645991047199757, + "language_loss": 0.66317391, + "learning_rate": 1.907407522366209e-07, + "loss": 0.68752229, + "num_input_tokens_seen": 309951720, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.19567871, + "step": 14371, + "time_per_iteration": 2.9041545391082764 + }, + { + "auxiliary_loss_clip": 0.01181821, + "auxiliary_loss_mlp": 0.01033519, + "balance_loss_clip": 1.09142971, + "balance_loss_mlp": 1.00948656, + "epoch": 0.8640913873440553, + "flos": 57595418530560.0, + "grad_norm": 0.8640116497098699, + "language_loss": 0.57004941, + "learning_rate": 1.905747985193107e-07, + "loss": 0.59220278, + "num_input_tokens_seen": 310006120, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.24023438, + "step": 14372, + "time_per_iteration": 3.235107898712158 + }, + { + "auxiliary_loss_clip": 0.01381683, + "auxiliary_loss_mlp": 0.01033111, + "balance_loss_clip": 1.22593367, + "balance_loss_mlp": 1.01458597, + "epoch": 0.8641515105967232, + "flos": 23997881410560.0, + "grad_norm": 1.707327895498149, + "language_loss": 0.8025471, + "learning_rate": 1.9040891341558597e-07, + "loss": 0.82669497, + "num_input_tokens_seen": 310026740, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.18518066, + "step": 14373, + "time_per_iteration": 2.9178075790405273 + }, + { + "auxiliary_loss_clip": 0.01400496, + "auxiliary_loss_mlp": 0.01030494, + "balance_loss_clip": 1.24038315, + "balance_loss_mlp": 1.01131344, + "epoch": 0.8642116338493913, + "flos": 19072154972160.0, + "grad_norm": 1.7912564845921626, + "language_loss": 0.64736068, + "learning_rate": 1.9024309693173656e-07, + "loss": 0.67167056, + "num_input_tokens_seen": 310044135, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19177246, + "step": 14374, + "time_per_iteration": 2.8689777851104736 + }, + { + "auxiliary_loss_clip": 0.01393563, + "auxiliary_loss_mlp": 0.010343, + "balance_loss_clip": 1.23794413, + "balance_loss_mlp": 1.0165379, + "epoch": 0.8642717571020592, + "flos": 18261809020800.0, + "grad_norm": 7.959977227411173, + "language_loss": 0.7802937, + "learning_rate": 1.9007734907404993e-07, + "loss": 0.80457228, + "num_input_tokens_seen": 310061560, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.17749023, + "step": 14375, + "time_per_iteration": 2.8444693088531494 + }, + { + "auxiliary_loss_clip": 0.01398342, + "auxiliary_loss_mlp": 0.01035289, + "balance_loss_clip": 1.23834991, + "balance_loss_mlp": 1.01619172, + "epoch": 0.8643318803547272, + "flos": 57682325088000.0, + "grad_norm": 1.7322662486247813, + "language_loss": 0.61515141, + "learning_rate": 1.899116698488117e-07, + "loss": 0.63948768, + "num_input_tokens_seen": 310087310, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.19091797, + "step": 14376, + "time_per_iteration": 3.2649261951446533 + }, + { + "auxiliary_loss_clip": 0.01387658, + "auxiliary_loss_mlp": 0.01026306, + "balance_loss_clip": 1.22875834, + "balance_loss_mlp": 1.00823414, + "epoch": 0.8643920036073952, + "flos": 19618940108160.0, + "grad_norm": 1.4741071322288308, + "language_loss": 0.67018843, + "learning_rate": 1.8974605926230457e-07, + "loss": 0.69432807, + "num_input_tokens_seen": 310106260, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.1809082, + "step": 14377, + "time_per_iteration": 2.915039539337158 + }, + { + "auxiliary_loss_clip": 0.01402713, + "auxiliary_loss_mlp": 0.01034715, + "balance_loss_clip": 1.24157476, + "balance_loss_mlp": 1.01528442, + "epoch": 0.8644521268600631, + "flos": 20860162721280.0, + "grad_norm": 1.5526342341251569, + "language_loss": 0.71328473, + "learning_rate": 1.8958051732080804e-07, + "loss": 0.73765898, + "num_input_tokens_seen": 310125305, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19433594, + "step": 14378, + "time_per_iteration": 2.8625831604003906 + }, + { + "auxiliary_loss_clip": 0.01178444, + "auxiliary_loss_mlp": 0.01025505, + "balance_loss_clip": 1.08971655, + "balance_loss_mlp": 1.00452447, + "epoch": 0.8645122501127311, + "flos": 66752451912960.0, + "grad_norm": 0.8019795247649073, + "language_loss": 0.60355073, + "learning_rate": 1.894150440305995e-07, + "loss": 0.62559021, + "num_input_tokens_seen": 310189270, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 0.20996094, + "step": 14379, + "time_per_iteration": 3.3770532608032227 + }, + { + "auxiliary_loss_clip": 0.01392652, + "auxiliary_loss_mlp": 0.01033845, + "balance_loss_clip": 1.23537827, + "balance_loss_mlp": 1.01515269, + "epoch": 0.864572373365399, + "flos": 21700171278720.0, + "grad_norm": 1.6923207498825894, + "language_loss": 0.74888408, + "learning_rate": 1.8924963939795478e-07, + "loss": 0.77314901, + "num_input_tokens_seen": 310208395, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18688965, + "step": 14380, + "time_per_iteration": 2.914415121078491 + }, + { + "auxiliary_loss_clip": 0.01431563, + "auxiliary_loss_mlp": 0.01032288, + "balance_loss_clip": 1.26603889, + "balance_loss_mlp": 1.01366758, + "epoch": 0.8646324966180671, + "flos": 20276023363200.0, + "grad_norm": 2.7929577454848706, + "language_loss": 0.76381123, + "learning_rate": 1.8908430342914473e-07, + "loss": 0.78844976, + "num_input_tokens_seen": 310227415, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.1862793, + "step": 14381, + "time_per_iteration": 2.8508009910583496 + }, + { + "auxiliary_loss_clip": 0.01388292, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.2307725, + "balance_loss_mlp": 1.01300335, + "epoch": 0.864692619870735, + "flos": 11953270431360.0, + "grad_norm": 2.35542324904602, + "language_loss": 0.85640895, + "learning_rate": 1.8891903613043892e-07, + "loss": 0.88060319, + "num_input_tokens_seen": 310242625, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.18139648, + "step": 14382, + "time_per_iteration": 2.7891809940338135 + }, + { + "auxiliary_loss_clip": 0.01412146, + "auxiliary_loss_mlp": 0.01035635, + "balance_loss_clip": 1.25161242, + "balance_loss_mlp": 1.01552439, + "epoch": 0.864752743123403, + "flos": 21480706160640.0, + "grad_norm": 1.7703924167265435, + "language_loss": 0.76567537, + "learning_rate": 1.8875383750810504e-07, + "loss": 0.79015315, + "num_input_tokens_seen": 310260585, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.2010498, + "step": 14383, + "time_per_iteration": 2.855681896209717 + }, + { + "auxiliary_loss_clip": 0.01387903, + "auxiliary_loss_mlp": 0.01034375, + "balance_loss_clip": 1.23252463, + "balance_loss_mlp": 1.0149914, + "epoch": 0.8648128663760709, + "flos": 19537716412800.0, + "grad_norm": 1.768550400359185, + "language_loss": 0.85747874, + "learning_rate": 1.8858870756840738e-07, + "loss": 0.88170159, + "num_input_tokens_seen": 310277210, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.19384766, + "step": 14384, + "time_per_iteration": 2.8080103397369385 + }, + { + "auxiliary_loss_clip": 0.01390193, + "auxiliary_loss_mlp": 0.01033885, + "balance_loss_clip": 1.23206687, + "balance_loss_mlp": 1.01488304, + "epoch": 0.8648729896287389, + "flos": 21297826103040.0, + "grad_norm": 1.8145274736956454, + "language_loss": 0.81217933, + "learning_rate": 1.884236463176072e-07, + "loss": 0.83642012, + "num_input_tokens_seen": 310296610, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.19006348, + "step": 14385, + "time_per_iteration": 2.851769208908081 + }, + { + "auxiliary_loss_clip": 0.01415657, + "auxiliary_loss_mlp": 0.01031794, + "balance_loss_clip": 1.25393796, + "balance_loss_mlp": 1.0126133, + "epoch": 0.8649331128814068, + "flos": 24614262328320.0, + "grad_norm": 2.1828517755027184, + "language_loss": 0.7359556, + "learning_rate": 1.8825865376196437e-07, + "loss": 0.76043016, + "num_input_tokens_seen": 310316830, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19165039, + "step": 14386, + "time_per_iteration": 2.860720634460449 + }, + { + "auxiliary_loss_clip": 0.01393689, + "auxiliary_loss_mlp": 0.01035293, + "balance_loss_clip": 1.23607087, + "balance_loss_mlp": 1.0164938, + "epoch": 0.8649932361340749, + "flos": 15386293802880.0, + "grad_norm": 2.324843588159943, + "language_loss": 0.82365382, + "learning_rate": 1.8809372990773476e-07, + "loss": 0.84794366, + "num_input_tokens_seen": 310334355, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18811035, + "step": 14387, + "time_per_iteration": 4.263425588607788 + }, + { + "auxiliary_loss_clip": 0.01390181, + "auxiliary_loss_mlp": 0.01031133, + "balance_loss_clip": 1.23327112, + "balance_loss_mlp": 1.01239312, + "epoch": 0.8650533593867428, + "flos": 19910670451200.0, + "grad_norm": 1.986227268850098, + "language_loss": 0.70099217, + "learning_rate": 1.8792887476117224e-07, + "loss": 0.72520536, + "num_input_tokens_seen": 310352900, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.1875, + "step": 14388, + "time_per_iteration": 2.8305718898773193 + }, + { + "auxiliary_loss_clip": 0.01387526, + "auxiliary_loss_mlp": 0.01033867, + "balance_loss_clip": 1.23365402, + "balance_loss_mlp": 1.01578307, + "epoch": 0.8651134826394108, + "flos": 25637196188160.0, + "grad_norm": 1.49137727930038, + "language_loss": 0.90914035, + "learning_rate": 1.877640883285283e-07, + "loss": 0.93335426, + "num_input_tokens_seen": 310372855, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.18078613, + "step": 14389, + "time_per_iteration": 2.894045829772949 + }, + { + "auxiliary_loss_clip": 0.01399767, + "auxiliary_loss_mlp": 0.01030993, + "balance_loss_clip": 1.24284232, + "balance_loss_mlp": 1.01258731, + "epoch": 0.8651736058920788, + "flos": 18743884813440.0, + "grad_norm": 1.7517939855764342, + "language_loss": 0.71514195, + "learning_rate": 1.8759937061605212e-07, + "loss": 0.73944956, + "num_input_tokens_seen": 310391595, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.18395996, + "step": 14390, + "time_per_iteration": 4.370844841003418 + }, + { + "auxiliary_loss_clip": 0.01400786, + "auxiliary_loss_mlp": 0.01034733, + "balance_loss_clip": 1.24058676, + "balance_loss_mlp": 1.01544487, + "epoch": 0.8652337291447467, + "flos": 20786404417920.0, + "grad_norm": 1.5228622622994485, + "language_loss": 0.82837296, + "learning_rate": 1.8743472162998941e-07, + "loss": 0.85272819, + "num_input_tokens_seen": 310410090, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19274902, + "step": 14391, + "time_per_iteration": 2.863205671310425 + }, + { + "auxiliary_loss_clip": 0.01178884, + "auxiliary_loss_mlp": 0.01020258, + "balance_loss_clip": 1.08936572, + "balance_loss_mlp": 0.99841911, + "epoch": 0.8652938523974147, + "flos": 64257383387520.0, + "grad_norm": 0.8085762278377054, + "language_loss": 0.68123573, + "learning_rate": 1.8727014137658337e-07, + "loss": 0.70322716, + "num_input_tokens_seen": 310470055, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.21875, + "step": 14392, + "time_per_iteration": 3.2456107139587402 + }, + { + "auxiliary_loss_clip": 0.01414545, + "auxiliary_loss_mlp": 0.01033402, + "balance_loss_clip": 1.24968219, + "balance_loss_mlp": 1.01435256, + "epoch": 0.8653539756500827, + "flos": 18049583070720.0, + "grad_norm": 1.9893432824221735, + "language_loss": 0.76697272, + "learning_rate": 1.8710562986207523e-07, + "loss": 0.79145223, + "num_input_tokens_seen": 310487665, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19042969, + "step": 14393, + "time_per_iteration": 2.8293118476867676 + }, + { + "auxiliary_loss_clip": 0.01396065, + "auxiliary_loss_mlp": 0.0103392, + "balance_loss_clip": 1.23489523, + "balance_loss_mlp": 1.01504874, + "epoch": 0.8654140989027507, + "flos": 17390871002880.0, + "grad_norm": 1.9748748430717693, + "language_loss": 0.74580634, + "learning_rate": 1.8694118709270357e-07, + "loss": 0.7701062, + "num_input_tokens_seen": 310506130, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.1887207, + "step": 14394, + "time_per_iteration": 2.8415701389312744 + }, + { + "auxiliary_loss_clip": 0.01407461, + "auxiliary_loss_mlp": 0.01034211, + "balance_loss_clip": 1.2466433, + "balance_loss_mlp": 1.01461315, + "epoch": 0.8654742221554186, + "flos": 53302750358400.0, + "grad_norm": 2.1401434188140502, + "language_loss": 0.66349268, + "learning_rate": 1.867768130747036e-07, + "loss": 0.68790942, + "num_input_tokens_seen": 310532445, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19592285, + "step": 14395, + "time_per_iteration": 4.64944314956665 + }, + { + "auxiliary_loss_clip": 0.01406016, + "auxiliary_loss_mlp": 0.0103073, + "balance_loss_clip": 1.24752069, + "balance_loss_mlp": 1.01210976, + "epoch": 0.8655343454080866, + "flos": 23925208982400.0, + "grad_norm": 1.7503631542903044, + "language_loss": 0.69221914, + "learning_rate": 1.8661250781430838e-07, + "loss": 0.71658659, + "num_input_tokens_seen": 310552300, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18615723, + "step": 14396, + "time_per_iteration": 2.862457513809204 + }, + { + "auxiliary_loss_clip": 0.0140608, + "auxiliary_loss_mlp": 0.01034251, + "balance_loss_clip": 1.24392056, + "balance_loss_mlp": 1.01467705, + "epoch": 0.8655944686607545, + "flos": 24107772326400.0, + "grad_norm": 2.150152491382438, + "language_loss": 0.70560205, + "learning_rate": 1.8644827131774954e-07, + "loss": 0.73000538, + "num_input_tokens_seen": 310572710, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19580078, + "step": 14397, + "time_per_iteration": 4.218133449554443 + }, + { + "auxiliary_loss_clip": 0.0139384, + "auxiliary_loss_mlp": 0.0103356, + "balance_loss_clip": 1.23419929, + "balance_loss_mlp": 1.01430798, + "epoch": 0.8656545919134225, + "flos": 23123188074240.0, + "grad_norm": 1.968344585016124, + "language_loss": 0.6456027, + "learning_rate": 1.86284103591253e-07, + "loss": 0.66987675, + "num_input_tokens_seen": 310592460, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.19250488, + "step": 14398, + "time_per_iteration": 2.9097113609313965 + }, + { + "auxiliary_loss_clip": 0.01394995, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.23529828, + "balance_loss_mlp": 1.01305711, + "epoch": 0.8657147151660904, + "flos": 21151531105920.0, + "grad_norm": 2.0095503066737437, + "language_loss": 0.76900971, + "learning_rate": 1.8612000464104517e-07, + "loss": 0.79327792, + "num_input_tokens_seen": 310609375, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18762207, + "step": 14399, + "time_per_iteration": 2.8482415676116943 + }, + { + "auxiliary_loss_clip": 0.01387682, + "auxiliary_loss_mlp": 0.01030348, + "balance_loss_clip": 1.23133922, + "balance_loss_mlp": 1.01146483, + "epoch": 0.8657748384187585, + "flos": 16298024647680.0, + "grad_norm": 1.8954014935638521, + "language_loss": 0.93681526, + "learning_rate": 1.8595597447334855e-07, + "loss": 0.96099555, + "num_input_tokens_seen": 310627405, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.18884277, + "step": 14400, + "time_per_iteration": 2.838705539703369 + }, + { + "auxiliary_loss_clip": 0.01399872, + "auxiliary_loss_mlp": 0.01033213, + "balance_loss_clip": 1.24082279, + "balance_loss_mlp": 1.01416326, + "epoch": 0.8658349616714264, + "flos": 30855376886400.0, + "grad_norm": 1.797135061581166, + "language_loss": 0.6786207, + "learning_rate": 1.8579201309438353e-07, + "loss": 0.70295155, + "num_input_tokens_seen": 310649945, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.19042969, + "step": 14401, + "time_per_iteration": 2.931502342224121 + }, + { + "auxiliary_loss_clip": 0.01416414, + "auxiliary_loss_mlp": 0.01035722, + "balance_loss_clip": 1.25315464, + "balance_loss_mlp": 1.01643395, + "epoch": 0.8658950849240944, + "flos": 18962128321920.0, + "grad_norm": 2.314460958632759, + "language_loss": 0.74977833, + "learning_rate": 1.8562812051036714e-07, + "loss": 0.77429968, + "num_input_tokens_seen": 310668285, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19287109, + "step": 14402, + "time_per_iteration": 2.8819520473480225 + }, + { + "auxiliary_loss_clip": 0.01386096, + "auxiliary_loss_mlp": 0.01030282, + "balance_loss_clip": 1.2304213, + "balance_loss_mlp": 1.01185226, + "epoch": 0.8659552081767624, + "flos": 23373899366400.0, + "grad_norm": 1.6544455849696553, + "language_loss": 0.75542474, + "learning_rate": 1.8546429672751397e-07, + "loss": 0.77958852, + "num_input_tokens_seen": 310687015, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.18432617, + "step": 14403, + "time_per_iteration": 2.841637134552002 + }, + { + "auxiliary_loss_clip": 0.01415572, + "auxiliary_loss_mlp": 0.01037394, + "balance_loss_clip": 1.25438535, + "balance_loss_mlp": 1.01760483, + "epoch": 0.8660153314294303, + "flos": 23852536554240.0, + "grad_norm": 2.0954078133765415, + "language_loss": 0.73843741, + "learning_rate": 1.853005417520368e-07, + "loss": 0.76296711, + "num_input_tokens_seen": 310707580, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19787598, + "step": 14404, + "time_per_iteration": 2.8844516277313232 + }, + { + "auxiliary_loss_clip": 0.01386505, + "auxiliary_loss_mlp": 0.01033601, + "balance_loss_clip": 1.23017418, + "balance_loss_mlp": 1.01337135, + "epoch": 0.8660754546820983, + "flos": 23122871360640.0, + "grad_norm": 1.87285827112881, + "language_loss": 0.7186361, + "learning_rate": 1.851368555901447e-07, + "loss": 0.74283713, + "num_input_tokens_seen": 310727300, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.20239258, + "step": 14405, + "time_per_iteration": 2.8448288440704346 + }, + { + "auxiliary_loss_clip": 0.01408242, + "auxiliary_loss_mlp": 0.01033001, + "balance_loss_clip": 1.24521661, + "balance_loss_mlp": 1.01377296, + "epoch": 0.8661355779347663, + "flos": 14400035493120.0, + "grad_norm": 1.9237517988577033, + "language_loss": 0.67279804, + "learning_rate": 1.8497323824804467e-07, + "loss": 0.69721049, + "num_input_tokens_seen": 310744935, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19226074, + "step": 14406, + "time_per_iteration": 2.8092682361602783 + }, + { + "auxiliary_loss_clip": 0.01401148, + "auxiliary_loss_mlp": 0.0103141, + "balance_loss_clip": 1.24189794, + "balance_loss_mlp": 1.01319456, + "epoch": 0.8661957011874343, + "flos": 21879748465920.0, + "grad_norm": 1.6718396764205201, + "language_loss": 0.83796453, + "learning_rate": 1.8480968973194177e-07, + "loss": 0.86229014, + "num_input_tokens_seen": 310765085, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18237305, + "step": 14407, + "time_per_iteration": 2.8849070072174072 + }, + { + "auxiliary_loss_clip": 0.01396653, + "auxiliary_loss_mlp": 0.0103456, + "balance_loss_clip": 1.2384727, + "balance_loss_mlp": 1.01505733, + "epoch": 0.8662558244401022, + "flos": 21845154176640.0, + "grad_norm": 1.7367859356854936, + "language_loss": 0.70409256, + "learning_rate": 1.8464621004803748e-07, + "loss": 0.7284047, + "num_input_tokens_seen": 310783260, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.19506836, + "step": 14408, + "time_per_iteration": 2.8573081493377686 + }, + { + "auxiliary_loss_clip": 0.01382281, + "auxiliary_loss_mlp": 0.01029316, + "balance_loss_clip": 1.22862482, + "balance_loss_mlp": 1.01206613, + "epoch": 0.8663159476927702, + "flos": 17392997508480.0, + "grad_norm": 1.9810934208554565, + "language_loss": 0.78277981, + "learning_rate": 1.844827992025304e-07, + "loss": 0.80689573, + "num_input_tokens_seen": 310801970, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.17272949, + "step": 14409, + "time_per_iteration": 2.912222385406494 + }, + { + "auxiliary_loss_clip": 0.01415008, + "auxiliary_loss_mlp": 0.01033261, + "balance_loss_clip": 1.25249982, + "balance_loss_mlp": 1.01384163, + "epoch": 0.8663760709454381, + "flos": 22758061386240.0, + "grad_norm": 1.9015993737721228, + "language_loss": 0.77620447, + "learning_rate": 1.8431945720161757e-07, + "loss": 0.80068719, + "num_input_tokens_seen": 310822070, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19433594, + "step": 14410, + "time_per_iteration": 2.843724012374878 + }, + { + "auxiliary_loss_clip": 0.01400142, + "auxiliary_loss_mlp": 0.01033476, + "balance_loss_clip": 1.24024129, + "balance_loss_mlp": 1.0145936, + "epoch": 0.8664361941981061, + "flos": 17383858058880.0, + "grad_norm": 2.6900145735173715, + "language_loss": 0.78060615, + "learning_rate": 1.8415618405149315e-07, + "loss": 0.80494237, + "num_input_tokens_seen": 310838355, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18884277, + "step": 14411, + "time_per_iteration": 2.8351473808288574 + }, + { + "auxiliary_loss_clip": 0.01388642, + "auxiliary_loss_mlp": 0.01033181, + "balance_loss_clip": 1.23035812, + "balance_loss_mlp": 1.01500201, + "epoch": 0.866496317450774, + "flos": 16043060344320.0, + "grad_norm": 1.95070639956755, + "language_loss": 0.74536204, + "learning_rate": 1.8399297975834794e-07, + "loss": 0.76958025, + "num_input_tokens_seen": 310856055, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18188477, + "step": 14412, + "time_per_iteration": 2.8567521572113037 + }, + { + "auxiliary_loss_clip": 0.01397502, + "auxiliary_loss_mlp": 0.01030016, + "balance_loss_clip": 1.24059081, + "balance_loss_mlp": 1.01219416, + "epoch": 0.8665564407034421, + "flos": 20824889760000.0, + "grad_norm": 1.7054089966208763, + "language_loss": 0.70375097, + "learning_rate": 1.83829844328371e-07, + "loss": 0.72802615, + "num_input_tokens_seen": 310876695, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.17834473, + "step": 14413, + "time_per_iteration": 2.859772205352783 + }, + { + "auxiliary_loss_clip": 0.01404978, + "auxiliary_loss_mlp": 0.01031866, + "balance_loss_clip": 1.24474514, + "balance_loss_mlp": 1.01259005, + "epoch": 0.86661656395611, + "flos": 15823640471040.0, + "grad_norm": 2.390497911409765, + "language_loss": 0.63685083, + "learning_rate": 1.8366677776774874e-07, + "loss": 0.6612193, + "num_input_tokens_seen": 310893880, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.19274902, + "step": 14414, + "time_per_iteration": 2.8281304836273193 + }, + { + "auxiliary_loss_clip": 0.01395806, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.23698163, + "balance_loss_mlp": 1.0127188, + "epoch": 0.866676687208778, + "flos": 23046624593280.0, + "grad_norm": 1.7212569201313803, + "language_loss": 0.64219999, + "learning_rate": 1.8350378008266377e-07, + "loss": 0.66647243, + "num_input_tokens_seen": 310914145, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18713379, + "step": 14415, + "time_per_iteration": 2.856370687484741 + }, + { + "auxiliary_loss_clip": 0.01181908, + "auxiliary_loss_mlp": 0.01022455, + "balance_loss_clip": 1.09084606, + "balance_loss_mlp": 0.99909037, + "epoch": 0.866736810461446, + "flos": 63834831768960.0, + "grad_norm": 0.7976802636238252, + "language_loss": 0.60462558, + "learning_rate": 1.8334085127929754e-07, + "loss": 0.62666929, + "num_input_tokens_seen": 310972825, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.23339844, + "step": 14416, + "time_per_iteration": 3.3933756351470947 + }, + { + "auxiliary_loss_clip": 0.01407553, + "auxiliary_loss_mlp": 0.01034586, + "balance_loss_clip": 1.24423993, + "balance_loss_mlp": 1.01453507, + "epoch": 0.8667969337141139, + "flos": 20459174889600.0, + "grad_norm": 1.9128185503044737, + "language_loss": 0.75799274, + "learning_rate": 1.831779913638285e-07, + "loss": 0.78241414, + "num_input_tokens_seen": 310992050, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.20068359, + "step": 14417, + "time_per_iteration": 2.8508450984954834 + }, + { + "auxiliary_loss_clip": 0.01395083, + "auxiliary_loss_mlp": 0.01034836, + "balance_loss_clip": 1.23581266, + "balance_loss_mlp": 1.0159297, + "epoch": 0.866857056966782, + "flos": 21663902931840.0, + "grad_norm": 1.4743111951565562, + "language_loss": 0.75161713, + "learning_rate": 1.830152003424319e-07, + "loss": 0.77591634, + "num_input_tokens_seen": 311011105, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.18896484, + "step": 14418, + "time_per_iteration": 2.8395800590515137 + }, + { + "auxiliary_loss_clip": 0.01385657, + "auxiliary_loss_mlp": 0.01034736, + "balance_loss_clip": 1.22782433, + "balance_loss_mlp": 1.0163542, + "epoch": 0.8669171802194499, + "flos": 22862070478080.0, + "grad_norm": 1.4886076272645223, + "language_loss": 0.68316185, + "learning_rate": 1.8285247822128126e-07, + "loss": 0.70736575, + "num_input_tokens_seen": 311032080, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18395996, + "step": 14419, + "time_per_iteration": 2.9261937141418457 + }, + { + "auxiliary_loss_clip": 0.01406893, + "auxiliary_loss_mlp": 0.01029039, + "balance_loss_clip": 1.24580073, + "balance_loss_mlp": 1.01050234, + "epoch": 0.8669773034721179, + "flos": 18743432365440.0, + "grad_norm": 2.1948712456173234, + "language_loss": 0.79663378, + "learning_rate": 1.826898250065465e-07, + "loss": 0.82099307, + "num_input_tokens_seen": 311049735, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.1854248, + "step": 14420, + "time_per_iteration": 2.8792357444763184 + }, + { + "auxiliary_loss_clip": 0.0140836, + "auxiliary_loss_mlp": 0.01034035, + "balance_loss_clip": 1.24904871, + "balance_loss_mlp": 1.01512814, + "epoch": 0.8670374267247858, + "flos": 18925181303040.0, + "grad_norm": 1.4585353328520247, + "language_loss": 0.83790982, + "learning_rate": 1.8252724070439586e-07, + "loss": 0.86233383, + "num_input_tokens_seen": 311067675, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18908691, + "step": 14421, + "time_per_iteration": 2.850329637527466 + }, + { + "auxiliary_loss_clip": 0.01180499, + "auxiliary_loss_mlp": 0.01033025, + "balance_loss_clip": 1.09021389, + "balance_loss_mlp": 1.01147175, + "epoch": 0.8670975499774538, + "flos": 48845245524480.0, + "grad_norm": 0.7064659136912858, + "language_loss": 0.4923172, + "learning_rate": 1.823647253209941e-07, + "loss": 0.51445246, + "num_input_tokens_seen": 311126605, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.21582031, + "step": 14422, + "time_per_iteration": 4.720146179199219 + }, + { + "auxiliary_loss_clip": 0.01390493, + "auxiliary_loss_mlp": 0.01033949, + "balance_loss_clip": 1.23179603, + "balance_loss_mlp": 1.01505446, + "epoch": 0.8671576732301217, + "flos": 26146129409280.0, + "grad_norm": 2.1534619669700534, + "language_loss": 0.74442464, + "learning_rate": 1.8220227886250417e-07, + "loss": 0.76866907, + "num_input_tokens_seen": 311147325, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18908691, + "step": 14423, + "time_per_iteration": 2.89670991897583 + }, + { + "auxiliary_loss_clip": 0.01377287, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.22352839, + "balance_loss_mlp": 1.01128483, + "epoch": 0.8672177964827897, + "flos": 18376133927040.0, + "grad_norm": 2.1172135901875637, + "language_loss": 0.76909268, + "learning_rate": 1.8203990133508684e-07, + "loss": 0.79316705, + "num_input_tokens_seen": 311165385, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.18847656, + "step": 14424, + "time_per_iteration": 2.8538525104522705 + }, + { + "auxiliary_loss_clip": 0.01384577, + "auxiliary_loss_mlp": 0.01029073, + "balance_loss_clip": 1.23085487, + "balance_loss_mlp": 1.01090574, + "epoch": 0.8672779197354576, + "flos": 28556264165760.0, + "grad_norm": 1.7940730205544564, + "language_loss": 0.72162575, + "learning_rate": 1.8187759274489767e-07, + "loss": 0.74576223, + "num_input_tokens_seen": 311185860, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.1817627, + "step": 14425, + "time_per_iteration": 4.299913644790649 + }, + { + "auxiliary_loss_clip": 0.01411077, + "auxiliary_loss_mlp": 0.01032042, + "balance_loss_clip": 1.24760675, + "balance_loss_mlp": 1.01239681, + "epoch": 0.8673380429881257, + "flos": 22392617984640.0, + "grad_norm": 1.7042950753624233, + "language_loss": 0.69048333, + "learning_rate": 1.817153530980926e-07, + "loss": 0.71491444, + "num_input_tokens_seen": 311205810, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19641113, + "step": 14426, + "time_per_iteration": 2.8697080612182617 + }, + { + "auxiliary_loss_clip": 0.01400717, + "auxiliary_loss_mlp": 0.01029616, + "balance_loss_clip": 1.23855793, + "balance_loss_mlp": 1.01115012, + "epoch": 0.8673981662407936, + "flos": 21006321984000.0, + "grad_norm": 1.850816031490293, + "language_loss": 0.7160123, + "learning_rate": 1.815531824008234e-07, + "loss": 0.74031556, + "num_input_tokens_seen": 311226080, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18469238, + "step": 14427, + "time_per_iteration": 2.918419361114502 + }, + { + "auxiliary_loss_clip": 0.01398066, + "auxiliary_loss_mlp": 0.01031744, + "balance_loss_clip": 1.24016857, + "balance_loss_mlp": 1.01226544, + "epoch": 0.8674582894934616, + "flos": 24437761787520.0, + "grad_norm": 3.590567722558237, + "language_loss": 0.68896997, + "learning_rate": 1.8139108065924004e-07, + "loss": 0.71326804, + "num_input_tokens_seen": 311246380, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.19470215, + "step": 14428, + "time_per_iteration": 2.9238839149475098 + }, + { + "auxiliary_loss_clip": 0.01392009, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.23377228, + "balance_loss_mlp": 1.01354504, + "epoch": 0.8675184127461296, + "flos": 20746425997440.0, + "grad_norm": 1.8993200281884253, + "language_loss": 0.71833885, + "learning_rate": 1.812290478794889e-07, + "loss": 0.74257886, + "num_input_tokens_seen": 311266465, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18444824, + "step": 14429, + "time_per_iteration": 2.82319712638855 + }, + { + "auxiliary_loss_clip": 0.01388565, + "auxiliary_loss_mlp": 0.0103275, + "balance_loss_clip": 1.23071492, + "balance_loss_mlp": 1.01360464, + "epoch": 0.8675785359987975, + "flos": 19145053624320.0, + "grad_norm": 2.1073944452532314, + "language_loss": 0.67976874, + "learning_rate": 1.810670840677151e-07, + "loss": 0.70398188, + "num_input_tokens_seen": 311285075, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.19165039, + "step": 14430, + "time_per_iteration": 2.84468674659729 + }, + { + "auxiliary_loss_clip": 0.01394984, + "auxiliary_loss_mlp": 0.01036627, + "balance_loss_clip": 1.23428392, + "balance_loss_mlp": 1.01700497, + "epoch": 0.8676386592514655, + "flos": 22720842898560.0, + "grad_norm": 1.8365852097187898, + "language_loss": 0.69308484, + "learning_rate": 1.8090518923005948e-07, + "loss": 0.71740091, + "num_input_tokens_seen": 311303230, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19628906, + "step": 14431, + "time_per_iteration": 5.680430173873901 + }, + { + "auxiliary_loss_clip": 0.01405866, + "auxiliary_loss_mlp": 0.01036561, + "balance_loss_clip": 1.24579763, + "balance_loss_mlp": 1.01777375, + "epoch": 0.8676987825041335, + "flos": 14217924597120.0, + "grad_norm": 2.031742437316324, + "language_loss": 0.64877111, + "learning_rate": 1.8074336337266116e-07, + "loss": 0.67319542, + "num_input_tokens_seen": 311318070, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18798828, + "step": 14432, + "time_per_iteration": 2.796380043029785 + }, + { + "auxiliary_loss_clip": 0.0140398, + "auxiliary_loss_mlp": 0.01032768, + "balance_loss_clip": 1.24498439, + "balance_loss_mlp": 1.01491034, + "epoch": 0.8677589057568015, + "flos": 13597924095360.0, + "grad_norm": 2.1360658810045576, + "language_loss": 0.79724222, + "learning_rate": 1.8058160650165656e-07, + "loss": 0.82160974, + "num_input_tokens_seen": 311334885, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.17871094, + "step": 14433, + "time_per_iteration": 2.807097911834717 + }, + { + "auxiliary_loss_clip": 0.01184901, + "auxiliary_loss_mlp": 0.01029615, + "balance_loss_clip": 1.09439826, + "balance_loss_mlp": 1.00481915, + "epoch": 0.8678190290094694, + "flos": 68964142400640.0, + "grad_norm": 0.7073135467266112, + "language_loss": 0.58531809, + "learning_rate": 1.804199186231805e-07, + "loss": 0.60746324, + "num_input_tokens_seen": 311399780, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.24707031, + "step": 14434, + "time_per_iteration": 3.444748640060425 + }, + { + "auxiliary_loss_clip": 0.0138382, + "auxiliary_loss_mlp": 0.01036, + "balance_loss_clip": 1.22765136, + "balance_loss_mlp": 1.01708174, + "epoch": 0.8678791522621374, + "flos": 32569400108160.0, + "grad_norm": 1.6699221356110487, + "language_loss": 0.80723429, + "learning_rate": 1.802582997433628e-07, + "loss": 0.83143246, + "num_input_tokens_seen": 311419610, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.18896484, + "step": 14435, + "time_per_iteration": 2.964468240737915 + }, + { + "auxiliary_loss_clip": 0.01405935, + "auxiliary_loss_mlp": 0.01034064, + "balance_loss_clip": 1.24306798, + "balance_loss_mlp": 1.01520538, + "epoch": 0.8679392755148053, + "flos": 35055962611200.0, + "grad_norm": 3.2097445115886054, + "language_loss": 0.63326466, + "learning_rate": 1.8009674986833322e-07, + "loss": 0.65766466, + "num_input_tokens_seen": 311440045, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.18847656, + "step": 14436, + "time_per_iteration": 3.036362648010254 + }, + { + "auxiliary_loss_clip": 0.01398623, + "auxiliary_loss_mlp": 0.01032012, + "balance_loss_clip": 1.23854136, + "balance_loss_mlp": 1.01283145, + "epoch": 0.8679993987674733, + "flos": 18561909651840.0, + "grad_norm": 2.2877030935637626, + "language_loss": 0.70669997, + "learning_rate": 1.7993526900421706e-07, + "loss": 0.73100632, + "num_input_tokens_seen": 311456660, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19165039, + "step": 14437, + "time_per_iteration": 2.8597140312194824 + }, + { + "auxiliary_loss_clip": 0.01398194, + "auxiliary_loss_mlp": 0.01029286, + "balance_loss_clip": 1.23941255, + "balance_loss_mlp": 1.01055872, + "epoch": 0.8680595220201412, + "flos": 27465951519360.0, + "grad_norm": 1.9714316225315272, + "language_loss": 0.80594003, + "learning_rate": 1.797738571571381e-07, + "loss": 0.83021486, + "num_input_tokens_seen": 311475460, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18737793, + "step": 14438, + "time_per_iteration": 2.940873146057129 + }, + { + "auxiliary_loss_clip": 0.01389381, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.23266065, + "balance_loss_mlp": 1.01385188, + "epoch": 0.8681196452728093, + "flos": 19218268990080.0, + "grad_norm": 1.8888794704080136, + "language_loss": 0.68116248, + "learning_rate": 1.7961251433321656e-07, + "loss": 0.70539039, + "num_input_tokens_seen": 311494575, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.19580078, + "step": 14439, + "time_per_iteration": 2.9713308811187744 + }, + { + "auxiliary_loss_clip": 0.01394719, + "auxiliary_loss_mlp": 0.01036452, + "balance_loss_clip": 1.2358067, + "balance_loss_mlp": 1.01774812, + "epoch": 0.8681797685254772, + "flos": 37574495205120.0, + "grad_norm": 1.9303618816215, + "language_loss": 0.64751339, + "learning_rate": 1.7945124053857085e-07, + "loss": 0.67182505, + "num_input_tokens_seen": 311515805, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18701172, + "step": 14440, + "time_per_iteration": 2.987187147140503 + }, + { + "auxiliary_loss_clip": 0.01387421, + "auxiliary_loss_mlp": 0.01032483, + "balance_loss_clip": 1.23181403, + "balance_loss_mlp": 1.01377869, + "epoch": 0.8682398917781452, + "flos": 23299100432640.0, + "grad_norm": 1.854477245241771, + "language_loss": 0.65904927, + "learning_rate": 1.7929003577931722e-07, + "loss": 0.68324834, + "num_input_tokens_seen": 311536000, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.18713379, + "step": 14441, + "time_per_iteration": 2.8658487796783447 + }, + { + "auxiliary_loss_clip": 0.01386658, + "auxiliary_loss_mlp": 0.01031814, + "balance_loss_clip": 1.2310667, + "balance_loss_mlp": 1.01290774, + "epoch": 0.8683000150308132, + "flos": 21883096581120.0, + "grad_norm": 1.7033733713127164, + "language_loss": 0.66539323, + "learning_rate": 1.7912890006156722e-07, + "loss": 0.68957794, + "num_input_tokens_seen": 311556220, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.18908691, + "step": 14442, + "time_per_iteration": 2.8447670936584473 + }, + { + "auxiliary_loss_clip": 0.014125, + "auxiliary_loss_mlp": 0.01031671, + "balance_loss_clip": 1.24891376, + "balance_loss_mlp": 1.01196599, + "epoch": 0.8683601382834811, + "flos": 14655271265280.0, + "grad_norm": 1.7510821687258555, + "language_loss": 0.72968501, + "learning_rate": 1.7896783339143195e-07, + "loss": 0.75412667, + "num_input_tokens_seen": 311572530, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19714355, + "step": 14443, + "time_per_iteration": 2.8528592586517334 + }, + { + "auxiliary_loss_clip": 0.01401799, + "auxiliary_loss_mlp": 0.01032291, + "balance_loss_clip": 1.24087954, + "balance_loss_mlp": 1.01251376, + "epoch": 0.8684202615361492, + "flos": 26371204882560.0, + "grad_norm": 1.8396493179734938, + "language_loss": 0.8379482, + "learning_rate": 1.7880683577501877e-07, + "loss": 0.86228907, + "num_input_tokens_seen": 311591105, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19775391, + "step": 14444, + "time_per_iteration": 2.8710856437683105 + }, + { + "auxiliary_loss_clip": 0.0138848, + "auxiliary_loss_mlp": 0.0103379, + "balance_loss_clip": 1.2295481, + "balance_loss_mlp": 1.0143348, + "epoch": 0.8684803847888171, + "flos": 20713686744960.0, + "grad_norm": 4.080260037468577, + "language_loss": 0.78015745, + "learning_rate": 1.7864590721843342e-07, + "loss": 0.80438006, + "num_input_tokens_seen": 311608350, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19458008, + "step": 14445, + "time_per_iteration": 2.883321523666382 + }, + { + "auxiliary_loss_clip": 0.01401437, + "auxiliary_loss_mlp": 0.01035957, + "balance_loss_clip": 1.24250054, + "balance_loss_mlp": 1.01583409, + "epoch": 0.8685405080414851, + "flos": 22648396694400.0, + "grad_norm": 2.266008523187761, + "language_loss": 0.68250954, + "learning_rate": 1.7848504772777728e-07, + "loss": 0.70688349, + "num_input_tokens_seen": 311626380, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.20117188, + "step": 14446, + "time_per_iteration": 2.842822313308716 + }, + { + "auxiliary_loss_clip": 0.01393668, + "auxiliary_loss_mlp": 0.01031524, + "balance_loss_clip": 1.23594213, + "balance_loss_mlp": 1.01181865, + "epoch": 0.868600631294153, + "flos": 24831555696000.0, + "grad_norm": 1.6489880375243862, + "language_loss": 0.83118415, + "learning_rate": 1.7832425730915102e-07, + "loss": 0.85543603, + "num_input_tokens_seen": 311644345, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.19702148, + "step": 14447, + "time_per_iteration": 2.8304293155670166 + }, + { + "auxiliary_loss_clip": 0.01393203, + "auxiliary_loss_mlp": 0.01030664, + "balance_loss_clip": 1.23480952, + "balance_loss_mlp": 1.01201999, + "epoch": 0.868660754546821, + "flos": 25123557507840.0, + "grad_norm": 1.72116615292025, + "language_loss": 0.74474394, + "learning_rate": 1.781635359686515e-07, + "loss": 0.76898265, + "num_input_tokens_seen": 311663340, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18640137, + "step": 14448, + "time_per_iteration": 2.8736226558685303 + }, + { + "auxiliary_loss_clip": 0.01398781, + "auxiliary_loss_mlp": 0.0103348, + "balance_loss_clip": 1.23919547, + "balance_loss_mlp": 1.01410818, + "epoch": 0.8687208777994889, + "flos": 12685605068160.0, + "grad_norm": 4.725529767618878, + "language_loss": 0.80926627, + "learning_rate": 1.7800288371237303e-07, + "loss": 0.8335889, + "num_input_tokens_seen": 311679860, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19360352, + "step": 14449, + "time_per_iteration": 2.7929675579071045 + }, + { + "auxiliary_loss_clip": 0.0118306, + "auxiliary_loss_mlp": 0.01021667, + "balance_loss_clip": 1.09317994, + "balance_loss_mlp": 0.99896908, + "epoch": 0.8687810010521569, + "flos": 65647841909760.0, + "grad_norm": 0.8054968302385487, + "language_loss": 0.60713381, + "learning_rate": 1.7784230054640758e-07, + "loss": 0.62918103, + "num_input_tokens_seen": 311738135, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.2265625, + "step": 14450, + "time_per_iteration": 3.25038743019104 + }, + { + "auxiliary_loss_clip": 0.01405653, + "auxiliary_loss_mlp": 0.01028553, + "balance_loss_clip": 1.24496269, + "balance_loss_mlp": 1.01049256, + "epoch": 0.8688411243048249, + "flos": 24254836485120.0, + "grad_norm": 1.6861252503758863, + "language_loss": 0.77151489, + "learning_rate": 1.7768178647684517e-07, + "loss": 0.79585695, + "num_input_tokens_seen": 311756975, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18054199, + "step": 14451, + "time_per_iteration": 2.8633556365966797 + }, + { + "auxiliary_loss_clip": 0.01399083, + "auxiliary_loss_mlp": 0.01031998, + "balance_loss_clip": 1.24057388, + "balance_loss_mlp": 1.01368713, + "epoch": 0.8689012475574929, + "flos": 18230562846720.0, + "grad_norm": 2.6694233118463098, + "language_loss": 0.72438502, + "learning_rate": 1.7752134150977205e-07, + "loss": 0.74869585, + "num_input_tokens_seen": 311771830, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18310547, + "step": 14452, + "time_per_iteration": 2.8013010025024414 + }, + { + "auxiliary_loss_clip": 0.01406068, + "auxiliary_loss_mlp": 0.01031592, + "balance_loss_clip": 1.24467766, + "balance_loss_mlp": 1.01204169, + "epoch": 0.8689613708101608, + "flos": 19656339575040.0, + "grad_norm": 1.4343282241206077, + "language_loss": 0.72962618, + "learning_rate": 1.7736096565127201e-07, + "loss": 0.75400281, + "num_input_tokens_seen": 311790130, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19543457, + "step": 14453, + "time_per_iteration": 2.873689651489258 + }, + { + "auxiliary_loss_clip": 0.01395066, + "auxiliary_loss_mlp": 0.01036388, + "balance_loss_clip": 1.2389946, + "balance_loss_mlp": 1.0174222, + "epoch": 0.8690214940628288, + "flos": 11736158042880.0, + "grad_norm": 1.986112319122435, + "language_loss": 0.74327773, + "learning_rate": 1.7720065890742664e-07, + "loss": 0.76759231, + "num_input_tokens_seen": 311808360, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.1895752, + "step": 14454, + "time_per_iteration": 2.853628396987915 + }, + { + "auxiliary_loss_clip": 0.01406376, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.24797738, + "balance_loss_mlp": 1.013515, + "epoch": 0.8690816173154968, + "flos": 34950596175360.0, + "grad_norm": 2.503322793672512, + "language_loss": 0.60001934, + "learning_rate": 1.7704042128431552e-07, + "loss": 0.62440741, + "num_input_tokens_seen": 311831325, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18920898, + "step": 14455, + "time_per_iteration": 2.9873037338256836 + }, + { + "auxiliary_loss_clip": 0.01396965, + "auxiliary_loss_mlp": 0.0103275, + "balance_loss_clip": 1.2367444, + "balance_loss_mlp": 1.01254439, + "epoch": 0.8691417405681647, + "flos": 11621018730240.0, + "grad_norm": 2.1090960629262394, + "language_loss": 0.81179339, + "learning_rate": 1.7688025278801378e-07, + "loss": 0.83609056, + "num_input_tokens_seen": 311848090, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.20202637, + "step": 14456, + "time_per_iteration": 2.8134841918945312 + }, + { + "auxiliary_loss_clip": 0.01407718, + "auxiliary_loss_mlp": 0.010362, + "balance_loss_clip": 1.24503672, + "balance_loss_mlp": 1.01619649, + "epoch": 0.8692018638208328, + "flos": 24618741563520.0, + "grad_norm": 2.3690728456987875, + "language_loss": 0.75566232, + "learning_rate": 1.7672015342459568e-07, + "loss": 0.78010154, + "num_input_tokens_seen": 311867855, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.20007324, + "step": 14457, + "time_per_iteration": 2.8665478229522705 + }, + { + "auxiliary_loss_clip": 0.01395114, + "auxiliary_loss_mlp": 0.01031044, + "balance_loss_clip": 1.23811436, + "balance_loss_mlp": 1.01236439, + "epoch": 0.8692619870735007, + "flos": 26006349663360.0, + "grad_norm": 1.5228700779669977, + "language_loss": 0.79031479, + "learning_rate": 1.765601232001328e-07, + "loss": 0.81457633, + "num_input_tokens_seen": 311888675, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.18676758, + "step": 14458, + "time_per_iteration": 4.248074531555176 + }, + { + "auxiliary_loss_clip": 0.0139958, + "auxiliary_loss_mlp": 0.01036135, + "balance_loss_clip": 1.24099803, + "balance_loss_mlp": 1.0158937, + "epoch": 0.8693221103261687, + "flos": 18051121393920.0, + "grad_norm": 1.5622180623771789, + "language_loss": 0.71875501, + "learning_rate": 1.7640016212069187e-07, + "loss": 0.74311221, + "num_input_tokens_seen": 311907310, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.20251465, + "step": 14459, + "time_per_iteration": 2.8200881481170654 + }, + { + "auxiliary_loss_clip": 0.01374506, + "auxiliary_loss_mlp": 0.010291, + "balance_loss_clip": 1.2212311, + "balance_loss_mlp": 1.01114655, + "epoch": 0.8693822335788366, + "flos": 27504301127040.0, + "grad_norm": 1.3105475782301086, + "language_loss": 0.74421406, + "learning_rate": 1.762402701923398e-07, + "loss": 0.76825017, + "num_input_tokens_seen": 311929635, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.17956543, + "step": 14460, + "time_per_iteration": 2.8865227699279785 + }, + { + "auxiliary_loss_clip": 0.0140285, + "auxiliary_loss_mlp": 0.01032308, + "balance_loss_clip": 1.23936594, + "balance_loss_mlp": 1.01373529, + "epoch": 0.8694423568315046, + "flos": 24108405753600.0, + "grad_norm": 2.2275915394936994, + "language_loss": 0.65699315, + "learning_rate": 1.7608044742113947e-07, + "loss": 0.68134469, + "num_input_tokens_seen": 311948800, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.18579102, + "step": 14461, + "time_per_iteration": 4.285569190979004 + }, + { + "auxiliary_loss_clip": 0.01395934, + "auxiliary_loss_mlp": 0.01037386, + "balance_loss_clip": 1.23623323, + "balance_loss_mlp": 1.01726353, + "epoch": 0.8695024800841725, + "flos": 18369437696640.0, + "grad_norm": 2.2718477663883103, + "language_loss": 0.8335799, + "learning_rate": 1.7592069381315123e-07, + "loss": 0.85791314, + "num_input_tokens_seen": 311964090, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.20129395, + "step": 14462, + "time_per_iteration": 2.8672564029693604 + }, + { + "auxiliary_loss_clip": 0.014054, + "auxiliary_loss_mlp": 0.01034174, + "balance_loss_clip": 1.24488997, + "balance_loss_mlp": 1.01417089, + "epoch": 0.8695626033368405, + "flos": 14035858945920.0, + "grad_norm": 2.0055405937059323, + "language_loss": 0.6574741, + "learning_rate": 1.757610093744335e-07, + "loss": 0.68186986, + "num_input_tokens_seen": 311981460, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.20007324, + "step": 14463, + "time_per_iteration": 2.8214285373687744 + }, + { + "auxiliary_loss_clip": 0.01410964, + "auxiliary_loss_mlp": 0.01037462, + "balance_loss_clip": 1.2474122, + "balance_loss_mlp": 1.01646948, + "epoch": 0.8696227265895085, + "flos": 16845624190080.0, + "grad_norm": 1.7681321598393898, + "language_loss": 0.67191786, + "learning_rate": 1.7560139411104058e-07, + "loss": 0.69640213, + "num_input_tokens_seen": 312000115, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.20983887, + "step": 14464, + "time_per_iteration": 2.8059613704681396 + }, + { + "auxiliary_loss_clip": 0.01410058, + "auxiliary_loss_mlp": 0.01033155, + "balance_loss_clip": 1.2459867, + "balance_loss_mlp": 1.01356912, + "epoch": 0.8696828498421765, + "flos": 21809157298560.0, + "grad_norm": 2.513591311232102, + "language_loss": 0.63837385, + "learning_rate": 1.7544184802902607e-07, + "loss": 0.66280597, + "num_input_tokens_seen": 312020770, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19592285, + "step": 14465, + "time_per_iteration": 2.873141288757324 + }, + { + "auxiliary_loss_clip": 0.01379699, + "auxiliary_loss_mlp": 0.01034144, + "balance_loss_clip": 1.22531772, + "balance_loss_mlp": 1.0156424, + "epoch": 0.8697429730948444, + "flos": 22905487503360.0, + "grad_norm": 1.7539122087683716, + "language_loss": 0.84882736, + "learning_rate": 1.7528237113443934e-07, + "loss": 0.87296581, + "num_input_tokens_seen": 312041870, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.18518066, + "step": 14466, + "time_per_iteration": 4.3376171588897705 + }, + { + "auxiliary_loss_clip": 0.01412079, + "auxiliary_loss_mlp": 0.01035659, + "balance_loss_clip": 1.24822211, + "balance_loss_mlp": 1.01589406, + "epoch": 0.8698030963475124, + "flos": 24728180031360.0, + "grad_norm": 2.6200720319862016, + "language_loss": 0.62446034, + "learning_rate": 1.7512296343332779e-07, + "loss": 0.6489377, + "num_input_tokens_seen": 312058210, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19763184, + "step": 14467, + "time_per_iteration": 2.8598620891571045 + }, + { + "auxiliary_loss_clip": 0.01374683, + "auxiliary_loss_mlp": 0.01033496, + "balance_loss_clip": 1.22033608, + "balance_loss_mlp": 1.01568592, + "epoch": 0.8698632196001803, + "flos": 28454743537920.0, + "grad_norm": 1.4039560967162013, + "language_loss": 0.69472814, + "learning_rate": 1.7496362493173655e-07, + "loss": 0.7188099, + "num_input_tokens_seen": 312082665, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.17810059, + "step": 14468, + "time_per_iteration": 2.9781503677368164 + }, + { + "auxiliary_loss_clip": 0.01389726, + "auxiliary_loss_mlp": 0.01037127, + "balance_loss_clip": 1.23240709, + "balance_loss_mlp": 1.0168258, + "epoch": 0.8699233428528483, + "flos": 27647971925760.0, + "grad_norm": 1.7153526335948723, + "language_loss": 0.71701348, + "learning_rate": 1.7480435563570773e-07, + "loss": 0.74128199, + "num_input_tokens_seen": 312101960, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.20275879, + "step": 14469, + "time_per_iteration": 2.9022605419158936 + }, + { + "auxiliary_loss_clip": 0.01378816, + "auxiliary_loss_mlp": 0.01030526, + "balance_loss_clip": 1.22534037, + "balance_loss_mlp": 1.01247764, + "epoch": 0.8699834661055164, + "flos": 20054748453120.0, + "grad_norm": 2.090341990675311, + "language_loss": 0.84679079, + "learning_rate": 1.7464515555128024e-07, + "loss": 0.87088418, + "num_input_tokens_seen": 312117125, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.18054199, + "step": 14470, + "time_per_iteration": 2.792091131210327 + }, + { + "auxiliary_loss_clip": 0.01388534, + "auxiliary_loss_mlp": 0.0103499, + "balance_loss_clip": 1.23079896, + "balance_loss_mlp": 1.01573753, + "epoch": 0.8700435893581843, + "flos": 23743369555200.0, + "grad_norm": 1.693350516268456, + "language_loss": 0.73578066, + "learning_rate": 1.7448602468449148e-07, + "loss": 0.76001585, + "num_input_tokens_seen": 312135775, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.19250488, + "step": 14471, + "time_per_iteration": 2.919616460800171 + }, + { + "auxiliary_loss_clip": 0.01393918, + "auxiliary_loss_mlp": 0.01029952, + "balance_loss_clip": 1.23686886, + "balance_loss_mlp": 1.01166582, + "epoch": 0.8701037126108523, + "flos": 23558453481600.0, + "grad_norm": 1.3290236387304863, + "language_loss": 0.79605037, + "learning_rate": 1.7432696304137573e-07, + "loss": 0.82028908, + "num_input_tokens_seen": 312156070, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.1829834, + "step": 14472, + "time_per_iteration": 2.8884940147399902 + }, + { + "auxiliary_loss_clip": 0.01388129, + "auxiliary_loss_mlp": 0.01031342, + "balance_loss_clip": 1.22993183, + "balance_loss_mlp": 1.01174414, + "epoch": 0.8701638358635202, + "flos": 18852689854080.0, + "grad_norm": 1.929711190664412, + "language_loss": 0.73701346, + "learning_rate": 1.741679706279644e-07, + "loss": 0.76120818, + "num_input_tokens_seen": 312174380, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.19592285, + "step": 14473, + "time_per_iteration": 2.849877119064331 + }, + { + "auxiliary_loss_clip": 0.0139945, + "auxiliary_loss_mlp": 0.01033027, + "balance_loss_clip": 1.2385006, + "balance_loss_mlp": 1.01391745, + "epoch": 0.8702239591161882, + "flos": 27939928492800.0, + "grad_norm": 1.61462971675426, + "language_loss": 0.7284565, + "learning_rate": 1.7400904745028644e-07, + "loss": 0.75278127, + "num_input_tokens_seen": 312195130, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.19104004, + "step": 14474, + "time_per_iteration": 2.8686022758483887 + }, + { + "auxiliary_loss_clip": 0.01394067, + "auxiliary_loss_mlp": 0.01030955, + "balance_loss_clip": 1.23431873, + "balance_loss_mlp": 1.01186943, + "epoch": 0.8702840823688561, + "flos": 17241770828160.0, + "grad_norm": 1.7815105904447601, + "language_loss": 0.68201566, + "learning_rate": 1.7385019351436925e-07, + "loss": 0.70626587, + "num_input_tokens_seen": 312212300, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.1907959, + "step": 14475, + "time_per_iteration": 2.7979276180267334 + }, + { + "auxiliary_loss_clip": 0.01394235, + "auxiliary_loss_mlp": 0.01031922, + "balance_loss_clip": 1.23341429, + "balance_loss_mlp": 1.01312232, + "epoch": 0.8703442056215241, + "flos": 19437055436160.0, + "grad_norm": 1.5042676646798343, + "language_loss": 0.77594185, + "learning_rate": 1.736914088262349e-07, + "loss": 0.80020344, + "num_input_tokens_seen": 312231735, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18811035, + "step": 14476, + "time_per_iteration": 2.808225393295288 + }, + { + "auxiliary_loss_clip": 0.01395767, + "auxiliary_loss_mlp": 0.01029134, + "balance_loss_clip": 1.23862243, + "balance_loss_mlp": 1.01035881, + "epoch": 0.8704043288741921, + "flos": 22284174902400.0, + "grad_norm": 1.7894323974053035, + "language_loss": 0.72878724, + "learning_rate": 1.7353269339190525e-07, + "loss": 0.75303626, + "num_input_tokens_seen": 312253060, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18786621, + "step": 14477, + "time_per_iteration": 2.870584011077881 + }, + { + "auxiliary_loss_clip": 0.0138757, + "auxiliary_loss_mlp": 0.01031971, + "balance_loss_clip": 1.22870731, + "balance_loss_mlp": 1.01311159, + "epoch": 0.8704644521268601, + "flos": 16655957412480.0, + "grad_norm": 2.2258998188408827, + "language_loss": 0.60105139, + "learning_rate": 1.7337404721739946e-07, + "loss": 0.62524676, + "num_input_tokens_seen": 312269460, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18847656, + "step": 14478, + "time_per_iteration": 2.8058855533599854 + }, + { + "auxiliary_loss_clip": 0.0139429, + "auxiliary_loss_mlp": 0.01028758, + "balance_loss_clip": 1.23786926, + "balance_loss_mlp": 1.0109005, + "epoch": 0.870524575379528, + "flos": 24290969097600.0, + "grad_norm": 1.4895008568085413, + "language_loss": 0.72434747, + "learning_rate": 1.732154703087323e-07, + "loss": 0.74857795, + "num_input_tokens_seen": 312289830, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.17871094, + "step": 14479, + "time_per_iteration": 2.8648648262023926 + }, + { + "auxiliary_loss_clip": 0.01392962, + "auxiliary_loss_mlp": 0.01033728, + "balance_loss_clip": 1.23393726, + "balance_loss_mlp": 1.01372492, + "epoch": 0.870584698632196, + "flos": 28780796701440.0, + "grad_norm": 1.4481402979370852, + "language_loss": 0.71887904, + "learning_rate": 1.7305696267191805e-07, + "loss": 0.743146, + "num_input_tokens_seen": 312311320, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.20007324, + "step": 14480, + "time_per_iteration": 2.9199917316436768 + }, + { + "auxiliary_loss_clip": 0.01409102, + "auxiliary_loss_mlp": 0.01032977, + "balance_loss_clip": 1.24659133, + "balance_loss_mlp": 1.01392734, + "epoch": 0.8706448218848639, + "flos": 32461635697920.0, + "grad_norm": 1.675170825715028, + "language_loss": 0.70224488, + "learning_rate": 1.728985243129666e-07, + "loss": 0.72666568, + "num_input_tokens_seen": 312332095, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19055176, + "step": 14481, + "time_per_iteration": 2.9263155460357666 + }, + { + "auxiliary_loss_clip": 0.01388111, + "auxiliary_loss_mlp": 0.01030539, + "balance_loss_clip": 1.23043883, + "balance_loss_mlp": 1.01227593, + "epoch": 0.8707049451375319, + "flos": 22758423344640.0, + "grad_norm": 1.6613590737162072, + "language_loss": 0.77758181, + "learning_rate": 1.7274015523788643e-07, + "loss": 0.8017683, + "num_input_tokens_seen": 312351225, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18261719, + "step": 14482, + "time_per_iteration": 2.8454833030700684 + }, + { + "auxiliary_loss_clip": 0.01393786, + "auxiliary_loss_mlp": 0.01034905, + "balance_loss_clip": 1.23572993, + "balance_loss_mlp": 1.01577163, + "epoch": 0.8707650683902, + "flos": 15860542245120.0, + "grad_norm": 1.6826209426664924, + "language_loss": 0.77559191, + "learning_rate": 1.7258185545268234e-07, + "loss": 0.79987884, + "num_input_tokens_seen": 312369730, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.19128418, + "step": 14483, + "time_per_iteration": 2.847640037536621 + }, + { + "auxiliary_loss_clip": 0.01418703, + "auxiliary_loss_mlp": 0.01038962, + "balance_loss_clip": 1.25488508, + "balance_loss_mlp": 1.01895905, + "epoch": 0.8708251916428679, + "flos": 16475792042880.0, + "grad_norm": 2.249064849524526, + "language_loss": 0.62889218, + "learning_rate": 1.7242362496335749e-07, + "loss": 0.65346885, + "num_input_tokens_seen": 312386780, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.20007324, + "step": 14484, + "time_per_iteration": 2.829704523086548 + }, + { + "auxiliary_loss_clip": 0.01396821, + "auxiliary_loss_mlp": 0.01031467, + "balance_loss_clip": 1.23850346, + "balance_loss_mlp": 1.01221478, + "epoch": 0.8708853148955359, + "flos": 15386339047680.0, + "grad_norm": 1.861346753143034, + "language_loss": 0.68876284, + "learning_rate": 1.7226546377591222e-07, + "loss": 0.71304572, + "num_input_tokens_seen": 312404875, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.19262695, + "step": 14485, + "time_per_iteration": 2.8546547889709473 + }, + { + "auxiliary_loss_clip": 0.01393558, + "auxiliary_loss_mlp": 0.01033312, + "balance_loss_clip": 1.2347157, + "balance_loss_mlp": 1.0132494, + "epoch": 0.8709454381482038, + "flos": 30562560668160.0, + "grad_norm": 1.9679381951053145, + "language_loss": 0.63983279, + "learning_rate": 1.7210737189634373e-07, + "loss": 0.66410154, + "num_input_tokens_seen": 312425280, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.20068359, + "step": 14486, + "time_per_iteration": 2.8983259201049805 + }, + { + "auxiliary_loss_clip": 0.0140618, + "auxiliary_loss_mlp": 0.0103675, + "balance_loss_clip": 1.24365604, + "balance_loss_mlp": 1.01729488, + "epoch": 0.8710055614008718, + "flos": 22611630654720.0, + "grad_norm": 1.889378265993119, + "language_loss": 0.62403345, + "learning_rate": 1.7194934933064653e-07, + "loss": 0.64846277, + "num_input_tokens_seen": 312443835, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19445801, + "step": 14487, + "time_per_iteration": 2.8537940979003906 + }, + { + "auxiliary_loss_clip": 0.01392477, + "auxiliary_loss_mlp": 0.01032881, + "balance_loss_clip": 1.23432207, + "balance_loss_mlp": 1.01372457, + "epoch": 0.8710656846535397, + "flos": 18452561673600.0, + "grad_norm": 1.9138746580842707, + "language_loss": 0.6834873, + "learning_rate": 1.7179139608481318e-07, + "loss": 0.7077409, + "num_input_tokens_seen": 312460830, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.19177246, + "step": 14488, + "time_per_iteration": 2.832899570465088 + }, + { + "auxiliary_loss_clip": 0.01400447, + "auxiliary_loss_mlp": 0.01030571, + "balance_loss_clip": 1.24024916, + "balance_loss_mlp": 1.01149762, + "epoch": 0.8711258079062077, + "flos": 16511336472960.0, + "grad_norm": 1.7589201605360285, + "language_loss": 0.85979527, + "learning_rate": 1.716335121648338e-07, + "loss": 0.88410544, + "num_input_tokens_seen": 312477575, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.1907959, + "step": 14489, + "time_per_iteration": 2.877612352371216 + }, + { + "auxiliary_loss_clip": 0.01415212, + "auxiliary_loss_mlp": 0.01034711, + "balance_loss_clip": 1.24970102, + "balance_loss_mlp": 1.01557755, + "epoch": 0.8711859311588757, + "flos": 15670242040320.0, + "grad_norm": 2.121258253828598, + "language_loss": 0.76955205, + "learning_rate": 1.7147569757669445e-07, + "loss": 0.79405129, + "num_input_tokens_seen": 312492140, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.19128418, + "step": 14490, + "time_per_iteration": 2.812361240386963 + }, + { + "auxiliary_loss_clip": 0.01395348, + "auxiliary_loss_mlp": 0.01034193, + "balance_loss_clip": 1.23321521, + "balance_loss_mlp": 1.01357007, + "epoch": 0.8712460544115437, + "flos": 15565373297280.0, + "grad_norm": 2.257525942588463, + "language_loss": 0.77423251, + "learning_rate": 1.7131795232638012e-07, + "loss": 0.79852796, + "num_input_tokens_seen": 312508400, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.20593262, + "step": 14491, + "time_per_iteration": 2.819559097290039 + }, + { + "auxiliary_loss_clip": 0.01395673, + "auxiliary_loss_mlp": 0.01029988, + "balance_loss_clip": 1.23789608, + "balance_loss_mlp": 1.01081896, + "epoch": 0.8713061776642116, + "flos": 16772227845120.0, + "grad_norm": 2.027780221765637, + "language_loss": 0.67562294, + "learning_rate": 1.711602764198723e-07, + "loss": 0.69987959, + "num_input_tokens_seen": 312525915, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.19177246, + "step": 14492, + "time_per_iteration": 4.270235538482666 + }, + { + "auxiliary_loss_clip": 0.01382203, + "auxiliary_loss_mlp": 0.01029934, + "balance_loss_clip": 1.22587609, + "balance_loss_mlp": 1.01143241, + "epoch": 0.8713663009168796, + "flos": 24290561894400.0, + "grad_norm": 1.855502056993065, + "language_loss": 0.70015925, + "learning_rate": 1.7100266986314992e-07, + "loss": 0.7242806, + "num_input_tokens_seen": 312544735, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.18481445, + "step": 14493, + "time_per_iteration": 2.8644697666168213 + }, + { + "auxiliary_loss_clip": 0.01399911, + "auxiliary_loss_mlp": 0.01040458, + "balance_loss_clip": 1.24001122, + "balance_loss_mlp": 1.02093148, + "epoch": 0.8714264241695475, + "flos": 23803554418560.0, + "grad_norm": 2.258292149789749, + "language_loss": 0.9006319, + "learning_rate": 1.7084513266218936e-07, + "loss": 0.92503566, + "num_input_tokens_seen": 312557910, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.1953125, + "step": 14494, + "time_per_iteration": 2.8377974033355713 + }, + { + "auxiliary_loss_clip": 0.01391579, + "auxiliary_loss_mlp": 0.01034365, + "balance_loss_clip": 1.23531556, + "balance_loss_mlp": 1.01611423, + "epoch": 0.8714865474222155, + "flos": 38012022852480.0, + "grad_norm": 1.7521242887682553, + "language_loss": 0.59987915, + "learning_rate": 1.7068766482296514e-07, + "loss": 0.62413859, + "num_input_tokens_seen": 312580360, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.18237305, + "step": 14495, + "time_per_iteration": 2.9833669662475586 + }, + { + "auxiliary_loss_clip": 0.01399854, + "auxiliary_loss_mlp": 0.01033519, + "balance_loss_clip": 1.23957515, + "balance_loss_mlp": 1.01402879, + "epoch": 0.8715466706748836, + "flos": 22465923840000.0, + "grad_norm": 1.8832710080933037, + "language_loss": 0.81558323, + "learning_rate": 1.7053026635144762e-07, + "loss": 0.83991694, + "num_input_tokens_seen": 312597550, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19482422, + "step": 14496, + "time_per_iteration": 4.241694211959839 + }, + { + "auxiliary_loss_clip": 0.01394137, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.23487568, + "balance_loss_mlp": 1.01427269, + "epoch": 0.8716067939275515, + "flos": 21225017940480.0, + "grad_norm": 2.1897571942012592, + "language_loss": 0.79523218, + "learning_rate": 1.7037293725360624e-07, + "loss": 0.81951511, + "num_input_tokens_seen": 312616435, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.19885254, + "step": 14497, + "time_per_iteration": 2.8140978813171387 + }, + { + "auxiliary_loss_clip": 0.01399963, + "auxiliary_loss_mlp": 0.01031755, + "balance_loss_clip": 1.23762989, + "balance_loss_mlp": 1.01144147, + "epoch": 0.8716669171802195, + "flos": 23006736662400.0, + "grad_norm": 2.11917070523064, + "language_loss": 0.67965698, + "learning_rate": 1.70215677535406e-07, + "loss": 0.70397419, + "num_input_tokens_seen": 312632770, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.203125, + "step": 14498, + "time_per_iteration": 2.856161117553711 + }, + { + "auxiliary_loss_clip": 0.01398027, + "auxiliary_loss_mlp": 0.01029246, + "balance_loss_clip": 1.23799264, + "balance_loss_mlp": 1.01036334, + "epoch": 0.8717270404328874, + "flos": 29794817335680.0, + "grad_norm": 1.5228502102404373, + "language_loss": 0.57347214, + "learning_rate": 1.700584872028108e-07, + "loss": 0.59774482, + "num_input_tokens_seen": 312651900, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18884277, + "step": 14499, + "time_per_iteration": 2.9195942878723145 + }, + { + "auxiliary_loss_clip": 0.01399016, + "auxiliary_loss_mlp": 0.01034459, + "balance_loss_clip": 1.23794472, + "balance_loss_mlp": 1.01456332, + "epoch": 0.8717871636855554, + "flos": 22028305703040.0, + "grad_norm": 1.8921162921051295, + "language_loss": 0.806647, + "learning_rate": 1.6990136626178097e-07, + "loss": 0.83098173, + "num_input_tokens_seen": 312671380, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.19885254, + "step": 14500, + "time_per_iteration": 2.8478164672851562 + }, + { + "auxiliary_loss_clip": 0.01393393, + "auxiliary_loss_mlp": 0.0103155, + "balance_loss_clip": 1.2342627, + "balance_loss_mlp": 1.01263189, + "epoch": 0.8718472869382233, + "flos": 16662925111680.0, + "grad_norm": 1.9239426401124038, + "language_loss": 0.73399538, + "learning_rate": 1.6974431471827466e-07, + "loss": 0.75824475, + "num_input_tokens_seen": 312689215, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18908691, + "step": 14501, + "time_per_iteration": 4.219645977020264 + }, + { + "auxiliary_loss_clip": 0.0139837, + "auxiliary_loss_mlp": 0.01034573, + "balance_loss_clip": 1.2355845, + "balance_loss_mlp": 1.01497531, + "epoch": 0.8719074101908914, + "flos": 19503484081920.0, + "grad_norm": 1.6566606128263373, + "language_loss": 0.65280735, + "learning_rate": 1.695873325782482e-07, + "loss": 0.67713678, + "num_input_tokens_seen": 312706400, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19604492, + "step": 14502, + "time_per_iteration": 4.269784688949585 + }, + { + "auxiliary_loss_clip": 0.01394865, + "auxiliary_loss_mlp": 0.0103649, + "balance_loss_clip": 1.23562539, + "balance_loss_mlp": 1.01630759, + "epoch": 0.8719675334435593, + "flos": 33083310257280.0, + "grad_norm": 1.5840109104823612, + "language_loss": 0.69407964, + "learning_rate": 1.6943041984765262e-07, + "loss": 0.71839315, + "num_input_tokens_seen": 312727985, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.20178223, + "step": 14503, + "time_per_iteration": 2.9754064083099365 + }, + { + "auxiliary_loss_clip": 0.01398741, + "auxiliary_loss_mlp": 0.01031901, + "balance_loss_clip": 1.23938107, + "balance_loss_mlp": 1.01330388, + "epoch": 0.8720276566962273, + "flos": 13634237687040.0, + "grad_norm": 2.1469288547328853, + "language_loss": 0.70863909, + "learning_rate": 1.6927357653243912e-07, + "loss": 0.7329455, + "num_input_tokens_seen": 312745025, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18591309, + "step": 14504, + "time_per_iteration": 2.850471019744873 + }, + { + "auxiliary_loss_clip": 0.01393721, + "auxiliary_loss_mlp": 0.01033828, + "balance_loss_clip": 1.2336576, + "balance_loss_mlp": 1.01430118, + "epoch": 0.8720877799488952, + "flos": 23524764088320.0, + "grad_norm": 1.7617838773121666, + "language_loss": 0.70760155, + "learning_rate": 1.691168026385552e-07, + "loss": 0.73187709, + "num_input_tokens_seen": 312764170, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.1953125, + "step": 14505, + "time_per_iteration": 2.954744577407837 + }, + { + "auxiliary_loss_clip": 0.01396049, + "auxiliary_loss_mlp": 0.01027516, + "balance_loss_clip": 1.23810768, + "balance_loss_mlp": 1.00915825, + "epoch": 0.8721479032015632, + "flos": 20824030108800.0, + "grad_norm": 1.7287702405324836, + "language_loss": 0.7912941, + "learning_rate": 1.6896009817194545e-07, + "loss": 0.8155297, + "num_input_tokens_seen": 312783830, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18347168, + "step": 14506, + "time_per_iteration": 2.835397720336914 + }, + { + "auxiliary_loss_clip": 0.01414365, + "auxiliary_loss_mlp": 0.01029696, + "balance_loss_clip": 1.25013852, + "balance_loss_mlp": 1.01061034, + "epoch": 0.8722080264542311, + "flos": 19473052314240.0, + "grad_norm": 2.8651532762943335, + "language_loss": 0.7467159, + "learning_rate": 1.6880346313855221e-07, + "loss": 0.77115643, + "num_input_tokens_seen": 312802015, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.1907959, + "step": 14507, + "time_per_iteration": 2.850752592086792 + }, + { + "auxiliary_loss_clip": 0.01410346, + "auxiliary_loss_mlp": 0.01035125, + "balance_loss_clip": 1.24646139, + "balance_loss_mlp": 1.01530111, + "epoch": 0.8722681497068991, + "flos": 21771757831680.0, + "grad_norm": 2.409411190495622, + "language_loss": 0.73404622, + "learning_rate": 1.686468975443156e-07, + "loss": 0.75850093, + "num_input_tokens_seen": 312820650, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19812012, + "step": 14508, + "time_per_iteration": 2.8691678047180176 + }, + { + "auxiliary_loss_clip": 0.01405364, + "auxiliary_loss_mlp": 0.01032176, + "balance_loss_clip": 1.24332738, + "balance_loss_mlp": 1.01247096, + "epoch": 0.8723282729595672, + "flos": 28889013559680.0, + "grad_norm": 1.6802051288749196, + "language_loss": 0.69389772, + "learning_rate": 1.6849040139517202e-07, + "loss": 0.71827304, + "num_input_tokens_seen": 312841310, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19714355, + "step": 14509, + "time_per_iteration": 2.9333951473236084 + }, + { + "auxiliary_loss_clip": 0.01396986, + "auxiliary_loss_mlp": 0.01034382, + "balance_loss_clip": 1.23791218, + "balance_loss_mlp": 1.01464057, + "epoch": 0.8723883962122351, + "flos": 26480417126400.0, + "grad_norm": 1.650314309297747, + "language_loss": 0.59249383, + "learning_rate": 1.683339746970558e-07, + "loss": 0.61680746, + "num_input_tokens_seen": 312862100, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.1973877, + "step": 14510, + "time_per_iteration": 2.84717059135437 + }, + { + "auxiliary_loss_clip": 0.01430416, + "auxiliary_loss_mlp": 0.01035416, + "balance_loss_clip": 1.26207161, + "balance_loss_mlp": 1.0156157, + "epoch": 0.8724485194649031, + "flos": 20531078156160.0, + "grad_norm": 2.0612499261695114, + "language_loss": 0.68606877, + "learning_rate": 1.6817761745589865e-07, + "loss": 0.71072704, + "num_input_tokens_seen": 312880220, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.19799805, + "step": 14511, + "time_per_iteration": 2.851417064666748 + }, + { + "auxiliary_loss_clip": 0.01395702, + "auxiliary_loss_mlp": 0.01033701, + "balance_loss_clip": 1.23470294, + "balance_loss_mlp": 1.0141865, + "epoch": 0.872508642717571, + "flos": 24364184463360.0, + "grad_norm": 1.6713380153841249, + "language_loss": 0.82528794, + "learning_rate": 1.6802132967763027e-07, + "loss": 0.84958196, + "num_input_tokens_seen": 312900765, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.19506836, + "step": 14512, + "time_per_iteration": 2.861938238143921 + }, + { + "auxiliary_loss_clip": 0.01175715, + "auxiliary_loss_mlp": 0.01016015, + "balance_loss_clip": 1.09061146, + "balance_loss_mlp": 0.9992308, + "epoch": 0.872568765970239, + "flos": 61436578412160.0, + "grad_norm": 0.7838666426177744, + "language_loss": 0.58684838, + "learning_rate": 1.6786511136817617e-07, + "loss": 0.60876572, + "num_input_tokens_seen": 312955840, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.16796875, + "step": 14513, + "time_per_iteration": 3.2189137935638428 + }, + { + "auxiliary_loss_clip": 0.01399984, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.24096608, + "balance_loss_mlp": 1.01283872, + "epoch": 0.8726288892229069, + "flos": 22607377643520.0, + "grad_norm": 1.7994884710298653, + "language_loss": 0.77474624, + "learning_rate": 1.6770896253346112e-07, + "loss": 0.79906869, + "num_input_tokens_seen": 312973565, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19433594, + "step": 14514, + "time_per_iteration": 2.861544609069824 + }, + { + "auxiliary_loss_clip": 0.01416959, + "auxiliary_loss_mlp": 0.01035541, + "balance_loss_clip": 1.25260353, + "balance_loss_mlp": 1.0161339, + "epoch": 0.872689012475575, + "flos": 25896277768320.0, + "grad_norm": 2.113841468721925, + "language_loss": 0.6572271, + "learning_rate": 1.675528831794055e-07, + "loss": 0.68175209, + "num_input_tokens_seen": 312994660, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.1940918, + "step": 14515, + "time_per_iteration": 2.9077186584472656 + }, + { + "auxiliary_loss_clip": 0.01404025, + "auxiliary_loss_mlp": 0.01034708, + "balance_loss_clip": 1.24275291, + "balance_loss_mlp": 1.01505089, + "epoch": 0.8727491357282429, + "flos": 21516612549120.0, + "grad_norm": 2.047438846377193, + "language_loss": 0.7902782, + "learning_rate": 1.6739687331192842e-07, + "loss": 0.81466556, + "num_input_tokens_seen": 313009860, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19677734, + "step": 14516, + "time_per_iteration": 2.839146852493286 + }, + { + "auxiliary_loss_clip": 0.01404782, + "auxiliary_loss_mlp": 0.01034495, + "balance_loss_clip": 1.24231148, + "balance_loss_mlp": 1.01440811, + "epoch": 0.8728092589809109, + "flos": 19216640177280.0, + "grad_norm": 2.868521648496266, + "language_loss": 0.72927523, + "learning_rate": 1.672409329369453e-07, + "loss": 0.75366801, + "num_input_tokens_seen": 313027025, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.20092773, + "step": 14517, + "time_per_iteration": 2.8146183490753174 + }, + { + "auxiliary_loss_clip": 0.01384079, + "auxiliary_loss_mlp": 0.01031855, + "balance_loss_clip": 1.22673607, + "balance_loss_mlp": 1.01346159, + "epoch": 0.8728693822335788, + "flos": 20605153173120.0, + "grad_norm": 2.7998354443366757, + "language_loss": 0.7340976, + "learning_rate": 1.6708506206036966e-07, + "loss": 0.75825691, + "num_input_tokens_seen": 313046830, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.18395996, + "step": 14518, + "time_per_iteration": 2.8527588844299316 + }, + { + "auxiliary_loss_clip": 0.01386099, + "auxiliary_loss_mlp": 0.01034271, + "balance_loss_clip": 1.22984862, + "balance_loss_mlp": 1.01531696, + "epoch": 0.8729295054862468, + "flos": 21739335292800.0, + "grad_norm": 3.03799241586724, + "language_loss": 0.74364018, + "learning_rate": 1.6692926068811275e-07, + "loss": 0.76784396, + "num_input_tokens_seen": 313067715, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.18969727, + "step": 14519, + "time_per_iteration": 2.8608503341674805 + }, + { + "auxiliary_loss_clip": 0.01405556, + "auxiliary_loss_mlp": 0.01033116, + "balance_loss_clip": 1.24326122, + "balance_loss_mlp": 1.01285088, + "epoch": 0.8729896287389147, + "flos": 17681741694720.0, + "grad_norm": 6.477863747667627, + "language_loss": 0.77695763, + "learning_rate": 1.6677352882608142e-07, + "loss": 0.80134439, + "num_input_tokens_seen": 313082305, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.20263672, + "step": 14520, + "time_per_iteration": 2.808223247528076 + }, + { + "auxiliary_loss_clip": 0.01415503, + "auxiliary_loss_mlp": 0.01040586, + "balance_loss_clip": 1.25166368, + "balance_loss_mlp": 1.0193311, + "epoch": 0.8730497519915827, + "flos": 24582563706240.0, + "grad_norm": 2.0124447130800553, + "language_loss": 0.83009541, + "learning_rate": 1.666178664801816e-07, + "loss": 0.85465628, + "num_input_tokens_seen": 313101190, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.21240234, + "step": 14521, + "time_per_iteration": 2.884317636489868 + }, + { + "auxiliary_loss_clip": 0.01406393, + "auxiliary_loss_mlp": 0.01033693, + "balance_loss_clip": 1.24460363, + "balance_loss_mlp": 1.01428533, + "epoch": 0.8731098752442508, + "flos": 13451629098240.0, + "grad_norm": 2.0153919648532455, + "language_loss": 0.77352178, + "learning_rate": 1.6646227365631616e-07, + "loss": 0.79792267, + "num_input_tokens_seen": 313118965, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19396973, + "step": 14522, + "time_per_iteration": 2.8259174823760986 + }, + { + "auxiliary_loss_clip": 0.01390466, + "auxiliary_loss_mlp": 0.01028389, + "balance_loss_clip": 1.23269963, + "balance_loss_mlp": 1.01056755, + "epoch": 0.8731699984969187, + "flos": 23484197485440.0, + "grad_norm": 1.753240857490492, + "language_loss": 0.75801015, + "learning_rate": 1.66306750360385e-07, + "loss": 0.78219879, + "num_input_tokens_seen": 313139280, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.17834473, + "step": 14523, + "time_per_iteration": 2.865238666534424 + }, + { + "auxiliary_loss_clip": 0.0138578, + "auxiliary_loss_mlp": 0.01034133, + "balance_loss_clip": 1.22820711, + "balance_loss_mlp": 1.01559556, + "epoch": 0.8732301217495867, + "flos": 17721674870400.0, + "grad_norm": 2.510104436979704, + "language_loss": 0.80166829, + "learning_rate": 1.6615129659828542e-07, + "loss": 0.82586735, + "num_input_tokens_seen": 313156655, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.1854248, + "step": 14524, + "time_per_iteration": 2.796372890472412 + }, + { + "auxiliary_loss_clip": 0.01380259, + "auxiliary_loss_mlp": 0.01030664, + "balance_loss_clip": 1.22547042, + "balance_loss_mlp": 1.01247287, + "epoch": 0.8732902450022546, + "flos": 22064483560320.0, + "grad_norm": 1.998008075763111, + "language_loss": 0.78809273, + "learning_rate": 1.6599591237591272e-07, + "loss": 0.81220198, + "num_input_tokens_seen": 313174050, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.18200684, + "step": 14525, + "time_per_iteration": 2.9252774715423584 + }, + { + "auxiliary_loss_clip": 0.01408923, + "auxiliary_loss_mlp": 0.010348, + "balance_loss_clip": 1.24617076, + "balance_loss_mlp": 1.01521409, + "epoch": 0.8733503682549226, + "flos": 22283089027200.0, + "grad_norm": 1.9518667072202396, + "language_loss": 0.69945467, + "learning_rate": 1.6584059769915902e-07, + "loss": 0.72389185, + "num_input_tokens_seen": 313192765, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19580078, + "step": 14526, + "time_per_iteration": 2.9935317039489746 + }, + { + "auxiliary_loss_clip": 0.0141776, + "auxiliary_loss_mlp": 0.01037959, + "balance_loss_clip": 1.25160527, + "balance_loss_mlp": 1.0169785, + "epoch": 0.8734104915075905, + "flos": 23373899366400.0, + "grad_norm": 2.7999471478207174, + "language_loss": 0.61747861, + "learning_rate": 1.6568535257391326e-07, + "loss": 0.64203578, + "num_input_tokens_seen": 313210925, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.20959473, + "step": 14527, + "time_per_iteration": 4.316193342208862 + }, + { + "auxiliary_loss_clip": 0.01413173, + "auxiliary_loss_mlp": 0.01036621, + "balance_loss_clip": 1.24397826, + "balance_loss_mlp": 1.01525891, + "epoch": 0.8734706147602586, + "flos": 17721222422400.0, + "grad_norm": 2.0380285058876524, + "language_loss": 0.67031258, + "learning_rate": 1.6553017700606265e-07, + "loss": 0.69481051, + "num_input_tokens_seen": 313228250, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.21374512, + "step": 14528, + "time_per_iteration": 2.8054394721984863 + }, + { + "auxiliary_loss_clip": 0.01387037, + "auxiliary_loss_mlp": 0.01031866, + "balance_loss_clip": 1.23108196, + "balance_loss_mlp": 1.01318645, + "epoch": 0.8735307380129265, + "flos": 22058194533120.0, + "grad_norm": 2.827082939554064, + "language_loss": 0.91239512, + "learning_rate": 1.6537507100149205e-07, + "loss": 0.93658423, + "num_input_tokens_seen": 313247880, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.18676758, + "step": 14529, + "time_per_iteration": 2.8768117427825928 + }, + { + "auxiliary_loss_clip": 0.01383533, + "auxiliary_loss_mlp": 0.01029845, + "balance_loss_clip": 1.22843122, + "balance_loss_mlp": 1.01122439, + "epoch": 0.8735908612655945, + "flos": 25349492632320.0, + "grad_norm": 1.7121965263513501, + "language_loss": 0.85498393, + "learning_rate": 1.6522003456608258e-07, + "loss": 0.87911773, + "num_input_tokens_seen": 313266790, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.18615723, + "step": 14530, + "time_per_iteration": 4.35499906539917 + }, + { + "auxiliary_loss_clip": 0.01395043, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_clip": 1.23383904, + "balance_loss_mlp": 1.01574516, + "epoch": 0.8736509845182624, + "flos": 21550211452800.0, + "grad_norm": 2.7827777118734196, + "language_loss": 0.75430954, + "learning_rate": 1.650650677057128e-07, + "loss": 0.77859831, + "num_input_tokens_seen": 313286805, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18103027, + "step": 14531, + "time_per_iteration": 2.9153575897216797 + }, + { + "auxiliary_loss_clip": 0.01388199, + "auxiliary_loss_mlp": 0.01032164, + "balance_loss_clip": 1.23183131, + "balance_loss_mlp": 1.01386523, + "epoch": 0.8737111077709304, + "flos": 22027174583040.0, + "grad_norm": 1.9844539594981296, + "language_loss": 0.62101412, + "learning_rate": 1.6491017042625966e-07, + "loss": 0.64521772, + "num_input_tokens_seen": 313305415, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.18310547, + "step": 14532, + "time_per_iteration": 2.87565016746521 + }, + { + "auxiliary_loss_clip": 0.01175407, + "auxiliary_loss_mlp": 0.01026554, + "balance_loss_clip": 1.08805537, + "balance_loss_mlp": 1.0066222, + "epoch": 0.8737712310235983, + "flos": 70097510113920.0, + "grad_norm": 0.8218434768953291, + "language_loss": 0.58738869, + "learning_rate": 1.6475534273359704e-07, + "loss": 0.60940832, + "num_input_tokens_seen": 313369940, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.19921875, + "step": 14533, + "time_per_iteration": 3.4332005977630615 + }, + { + "auxiliary_loss_clip": 0.0138631, + "auxiliary_loss_mlp": 0.01029398, + "balance_loss_clip": 1.23056281, + "balance_loss_mlp": 1.01161182, + "epoch": 0.8738313542762663, + "flos": 28670181868800.0, + "grad_norm": 1.552895539278509, + "language_loss": 0.77490318, + "learning_rate": 1.646005846335954e-07, + "loss": 0.79906023, + "num_input_tokens_seen": 313390965, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.17785645, + "step": 14534, + "time_per_iteration": 2.903783082962036 + }, + { + "auxiliary_loss_clip": 0.0139697, + "auxiliary_loss_mlp": 0.01030558, + "balance_loss_clip": 1.23647594, + "balance_loss_mlp": 1.01279545, + "epoch": 0.8738914775289344, + "flos": 22356621106560.0, + "grad_norm": 6.729236427207113, + "language_loss": 0.7545082, + "learning_rate": 1.6444589613212357e-07, + "loss": 0.77878344, + "num_input_tokens_seen": 313409680, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.17773438, + "step": 14535, + "time_per_iteration": 2.861818552017212 + }, + { + "auxiliary_loss_clip": 0.01394883, + "auxiliary_loss_mlp": 0.01033959, + "balance_loss_clip": 1.23517179, + "balance_loss_mlp": 1.01478982, + "epoch": 0.8739516007816023, + "flos": 31772310883200.0, + "grad_norm": 1.8213681825937396, + "language_loss": 0.75026655, + "learning_rate": 1.64291277235048e-07, + "loss": 0.77455491, + "num_input_tokens_seen": 313431335, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19165039, + "step": 14536, + "time_per_iteration": 5.877175569534302 + }, + { + "auxiliary_loss_clip": 0.01402876, + "auxiliary_loss_mlp": 0.01035127, + "balance_loss_clip": 1.24078453, + "balance_loss_mlp": 1.01655412, + "epoch": 0.8740117240342703, + "flos": 21220719684480.0, + "grad_norm": 2.6658033817075, + "language_loss": 0.64779741, + "learning_rate": 1.641367279482304e-07, + "loss": 0.67217743, + "num_input_tokens_seen": 313449225, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18566895, + "step": 14537, + "time_per_iteration": 2.8556387424468994 + }, + { + "auxiliary_loss_clip": 0.01391422, + "auxiliary_loss_mlp": 0.01031554, + "balance_loss_clip": 1.23202801, + "balance_loss_mlp": 1.01248074, + "epoch": 0.8740718472869382, + "flos": 25196184691200.0, + "grad_norm": 1.8986344905485446, + "language_loss": 0.58654159, + "learning_rate": 1.6398224827753216e-07, + "loss": 0.6107713, + "num_input_tokens_seen": 313467715, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.1907959, + "step": 14538, + "time_per_iteration": 2.8689358234405518 + }, + { + "auxiliary_loss_clip": 0.01379434, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.22621775, + "balance_loss_mlp": 1.01220679, + "epoch": 0.8741319705396062, + "flos": 19510542270720.0, + "grad_norm": 1.8130766628113275, + "language_loss": 0.69230205, + "learning_rate": 1.6382783822881142e-07, + "loss": 0.71640515, + "num_input_tokens_seen": 313486805, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.18652344, + "step": 14539, + "time_per_iteration": 2.8235397338867188 + }, + { + "auxiliary_loss_clip": 0.01415761, + "auxiliary_loss_mlp": 0.01036276, + "balance_loss_clip": 1.25080454, + "balance_loss_mlp": 1.01735711, + "epoch": 0.8741920937922741, + "flos": 14109074311680.0, + "grad_norm": 1.723264057671699, + "language_loss": 0.7458154, + "learning_rate": 1.6367349780792262e-07, + "loss": 0.77033579, + "num_input_tokens_seen": 313504880, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.18908691, + "step": 14540, + "time_per_iteration": 2.834864377975464 + }, + { + "auxiliary_loss_clip": 0.01392742, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.23385859, + "balance_loss_mlp": 1.01513267, + "epoch": 0.8742522170449422, + "flos": 27721504005120.0, + "grad_norm": 1.6711001694633634, + "language_loss": 0.79411769, + "learning_rate": 1.635192270207193e-07, + "loss": 0.81838167, + "num_input_tokens_seen": 313524995, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18530273, + "step": 14541, + "time_per_iteration": 2.90102481842041 + }, + { + "auxiliary_loss_clip": 0.0140872, + "auxiliary_loss_mlp": 0.01033317, + "balance_loss_clip": 1.24356198, + "balance_loss_mlp": 1.01305151, + "epoch": 0.8743123402976101, + "flos": 21152571736320.0, + "grad_norm": 2.5411470394636213, + "language_loss": 0.67308843, + "learning_rate": 1.6336502587305035e-07, + "loss": 0.69750875, + "num_input_tokens_seen": 313541740, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.20263672, + "step": 14542, + "time_per_iteration": 2.826150417327881 + }, + { + "auxiliary_loss_clip": 0.0117945, + "auxiliary_loss_mlp": 0.01031509, + "balance_loss_clip": 1.09071875, + "balance_loss_mlp": 1.00995588, + "epoch": 0.8743724635502781, + "flos": 60898344543360.0, + "grad_norm": 0.7832091864413139, + "language_loss": 0.54505205, + "learning_rate": 1.632108943707642e-07, + "loss": 0.56716162, + "num_input_tokens_seen": 313593445, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 0.21582031, + "step": 14543, + "time_per_iteration": 3.1768436431884766 + }, + { + "auxiliary_loss_clip": 0.01402706, + "auxiliary_loss_mlp": 0.01032923, + "balance_loss_clip": 1.24083781, + "balance_loss_mlp": 1.01430202, + "epoch": 0.874432586802946, + "flos": 28120546310400.0, + "grad_norm": 2.246527702620527, + "language_loss": 0.71007776, + "learning_rate": 1.6305683251970458e-07, + "loss": 0.73443413, + "num_input_tokens_seen": 313615640, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.1862793, + "step": 14544, + "time_per_iteration": 2.899484395980835 + }, + { + "auxiliary_loss_clip": 0.0138124, + "auxiliary_loss_mlp": 0.01031136, + "balance_loss_clip": 1.22722459, + "balance_loss_mlp": 1.01345754, + "epoch": 0.874492710055614, + "flos": 23560806211200.0, + "grad_norm": 1.4405795940735293, + "language_loss": 0.76682305, + "learning_rate": 1.62902840325714e-07, + "loss": 0.79094684, + "num_input_tokens_seen": 313635550, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.17675781, + "step": 14545, + "time_per_iteration": 2.933885335922241 + }, + { + "auxiliary_loss_clip": 0.01395377, + "auxiliary_loss_mlp": 0.0103762, + "balance_loss_clip": 1.23621106, + "balance_loss_mlp": 1.01777148, + "epoch": 0.8745528333082819, + "flos": 40930185934080.0, + "grad_norm": 1.5568163303388918, + "language_loss": 0.66541523, + "learning_rate": 1.6274891779463217e-07, + "loss": 0.68974519, + "num_input_tokens_seen": 313659275, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.1986084, + "step": 14546, + "time_per_iteration": 3.012847423553467 + }, + { + "auxiliary_loss_clip": 0.01400234, + "auxiliary_loss_mlp": 0.01031541, + "balance_loss_clip": 1.23953891, + "balance_loss_mlp": 1.01318264, + "epoch": 0.87461295656095, + "flos": 23633342904960.0, + "grad_norm": 1.5785269501033563, + "language_loss": 0.73047793, + "learning_rate": 1.6259506493229536e-07, + "loss": 0.75479567, + "num_input_tokens_seen": 313680595, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18359375, + "step": 14547, + "time_per_iteration": 2.8574347496032715 + }, + { + "auxiliary_loss_clip": 0.01425768, + "auxiliary_loss_mlp": 0.01036381, + "balance_loss_clip": 1.25858486, + "balance_loss_mlp": 1.01699781, + "epoch": 0.874673079813618, + "flos": 38806714103040.0, + "grad_norm": 3.191415671815469, + "language_loss": 0.70813894, + "learning_rate": 1.6244128174453752e-07, + "loss": 0.73276043, + "num_input_tokens_seen": 313699730, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.19384766, + "step": 14548, + "time_per_iteration": 3.0851807594299316 + }, + { + "auxiliary_loss_clip": 0.01405881, + "auxiliary_loss_mlp": 0.01035467, + "balance_loss_clip": 1.24234152, + "balance_loss_mlp": 1.01565433, + "epoch": 0.8747332030662859, + "flos": 23706694005120.0, + "grad_norm": 1.8748242458868416, + "language_loss": 0.71372175, + "learning_rate": 1.6228756823719093e-07, + "loss": 0.73813522, + "num_input_tokens_seen": 313720090, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19812012, + "step": 14549, + "time_per_iteration": 2.9585132598876953 + }, + { + "auxiliary_loss_clip": 0.01412341, + "auxiliary_loss_mlp": 0.0103062, + "balance_loss_clip": 1.24648058, + "balance_loss_mlp": 1.01187992, + "epoch": 0.8747933263189539, + "flos": 24473215728000.0, + "grad_norm": 2.486587281565092, + "language_loss": 0.83685005, + "learning_rate": 1.6213392441608352e-07, + "loss": 0.86127973, + "num_input_tokens_seen": 313736795, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.18725586, + "step": 14550, + "time_per_iteration": 2.985731840133667 + }, + { + "auxiliary_loss_clip": 0.0140023, + "auxiliary_loss_mlp": 0.01036971, + "balance_loss_clip": 1.23707414, + "balance_loss_mlp": 1.01738465, + "epoch": 0.8748534495716218, + "flos": 13817298723840.0, + "grad_norm": 1.5725863367140747, + "language_loss": 0.7315315, + "learning_rate": 1.6198035028704183e-07, + "loss": 0.75590354, + "num_input_tokens_seen": 313754820, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19616699, + "step": 14551, + "time_per_iteration": 2.814605474472046 + }, + { + "auxiliary_loss_clip": 0.01392083, + "auxiliary_loss_mlp": 0.01035863, + "balance_loss_clip": 1.23508894, + "balance_loss_mlp": 1.01588321, + "epoch": 0.8749135728242898, + "flos": 29874185994240.0, + "grad_norm": 3.4662093391090045, + "language_loss": 0.6547662, + "learning_rate": 1.6182684585588934e-07, + "loss": 0.67904568, + "num_input_tokens_seen": 313775830, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.19970703, + "step": 14552, + "time_per_iteration": 2.916895627975464 + }, + { + "auxiliary_loss_clip": 0.01402615, + "auxiliary_loss_mlp": 0.01034863, + "balance_loss_clip": 1.24040937, + "balance_loss_mlp": 1.01452589, + "epoch": 0.8749736960769577, + "flos": 24143995428480.0, + "grad_norm": 1.7458883283222435, + "language_loss": 0.8027088, + "learning_rate": 1.616734111284479e-07, + "loss": 0.82708359, + "num_input_tokens_seen": 313795745, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.20336914, + "step": 14553, + "time_per_iteration": 2.8357186317443848 + }, + { + "auxiliary_loss_clip": 0.01408618, + "auxiliary_loss_mlp": 0.0103219, + "balance_loss_clip": 1.24515009, + "balance_loss_mlp": 1.01367712, + "epoch": 0.8750338193296258, + "flos": 17211610529280.0, + "grad_norm": 1.9389192577948122, + "language_loss": 0.71086687, + "learning_rate": 1.6152004611053416e-07, + "loss": 0.73527491, + "num_input_tokens_seen": 313813895, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.18530273, + "step": 14554, + "time_per_iteration": 2.7916648387908936 + }, + { + "auxiliary_loss_clip": 0.01389832, + "auxiliary_loss_mlp": 0.0102958, + "balance_loss_clip": 1.23172784, + "balance_loss_mlp": 1.01130557, + "epoch": 0.8750939425822937, + "flos": 23743912492800.0, + "grad_norm": 3.5453999436056907, + "language_loss": 0.84358829, + "learning_rate": 1.6136675080796457e-07, + "loss": 0.86778241, + "num_input_tokens_seen": 313834225, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18273926, + "step": 14555, + "time_per_iteration": 2.8653101921081543 + }, + { + "auxiliary_loss_clip": 0.01400613, + "auxiliary_loss_mlp": 0.01031362, + "balance_loss_clip": 1.24214292, + "balance_loss_mlp": 1.01202679, + "epoch": 0.8751540658349617, + "flos": 26552863330560.0, + "grad_norm": 2.4195983332969693, + "language_loss": 0.71671999, + "learning_rate": 1.6121352522655252e-07, + "loss": 0.74103969, + "num_input_tokens_seen": 313854430, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.19348145, + "step": 14556, + "time_per_iteration": 2.8892199993133545 + }, + { + "auxiliary_loss_clip": 0.01406864, + "auxiliary_loss_mlp": 0.01031736, + "balance_loss_clip": 1.24334764, + "balance_loss_mlp": 1.01240087, + "epoch": 0.8752141890876296, + "flos": 19395176734080.0, + "grad_norm": 1.8417421632178812, + "language_loss": 0.77412069, + "learning_rate": 1.6106036937210732e-07, + "loss": 0.79850668, + "num_input_tokens_seen": 313871600, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.19335938, + "step": 14557, + "time_per_iteration": 2.81040358543396 + }, + { + "auxiliary_loss_clip": 0.01403418, + "auxiliary_loss_mlp": 0.01036865, + "balance_loss_clip": 1.24356651, + "balance_loss_mlp": 1.01767278, + "epoch": 0.8752743123402976, + "flos": 25385353776000.0, + "grad_norm": 2.1886773985659587, + "language_loss": 0.83521748, + "learning_rate": 1.6090728325043767e-07, + "loss": 0.85962033, + "num_input_tokens_seen": 313891570, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.1920166, + "step": 14558, + "time_per_iteration": 2.8760411739349365 + }, + { + "auxiliary_loss_clip": 0.01183461, + "auxiliary_loss_mlp": 0.01028009, + "balance_loss_clip": 1.09518564, + "balance_loss_mlp": 1.00884044, + "epoch": 0.8753344355929655, + "flos": 59979057816960.0, + "grad_norm": 0.8086867445110079, + "language_loss": 0.56088138, + "learning_rate": 1.6075426686734784e-07, + "loss": 0.58299601, + "num_input_tokens_seen": 313951290, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.19140625, + "step": 14559, + "time_per_iteration": 3.3502023220062256 + }, + { + "auxiliary_loss_clip": 0.01389778, + "auxiliary_loss_mlp": 0.01035663, + "balance_loss_clip": 1.23103571, + "balance_loss_mlp": 1.01747167, + "epoch": 0.8753945588456336, + "flos": 17903604787200.0, + "grad_norm": 1.6879200994850063, + "language_loss": 0.66894281, + "learning_rate": 1.606013202286407e-07, + "loss": 0.69319725, + "num_input_tokens_seen": 313968645, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18188477, + "step": 14560, + "time_per_iteration": 2.832836866378784 + }, + { + "auxiliary_loss_clip": 0.01389965, + "auxiliary_loss_mlp": 0.01029879, + "balance_loss_clip": 1.23142171, + "balance_loss_mlp": 1.01106799, + "epoch": 0.8754546820983016, + "flos": 30925877564160.0, + "grad_norm": 1.7609542568341663, + "language_loss": 0.79805505, + "learning_rate": 1.6044844334011541e-07, + "loss": 0.82225358, + "num_input_tokens_seen": 313987580, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18823242, + "step": 14561, + "time_per_iteration": 2.9340782165527344 + }, + { + "auxiliary_loss_clip": 0.01407691, + "auxiliary_loss_mlp": 0.0103417, + "balance_loss_clip": 1.24428749, + "balance_loss_mlp": 1.01478648, + "epoch": 0.8755148053509695, + "flos": 20640742848000.0, + "grad_norm": 1.9696508094098846, + "language_loss": 0.78450501, + "learning_rate": 1.6029563620756982e-07, + "loss": 0.8089236, + "num_input_tokens_seen": 314004460, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19384766, + "step": 14562, + "time_per_iteration": 4.361043930053711 + }, + { + "auxiliary_loss_clip": 0.01366914, + "auxiliary_loss_mlp": 0.01031029, + "balance_loss_clip": 1.21368408, + "balance_loss_mlp": 1.01122844, + "epoch": 0.8755749286036375, + "flos": 34983063959040.0, + "grad_norm": 1.354483903502839, + "language_loss": 0.72335875, + "learning_rate": 1.601428988367981e-07, + "loss": 0.74733818, + "num_input_tokens_seen": 314026855, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.19787598, + "step": 14563, + "time_per_iteration": 3.018146514892578 + }, + { + "auxiliary_loss_clip": 0.01410474, + "auxiliary_loss_mlp": 0.01033835, + "balance_loss_clip": 1.24683428, + "balance_loss_mlp": 1.01440406, + "epoch": 0.8756350518563054, + "flos": 18195470864640.0, + "grad_norm": 4.622220576002593, + "language_loss": 0.66727519, + "learning_rate": 1.5999023123359235e-07, + "loss": 0.69171834, + "num_input_tokens_seen": 314042830, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19421387, + "step": 14564, + "time_per_iteration": 2.838650941848755 + }, + { + "auxiliary_loss_clip": 0.01391181, + "auxiliary_loss_mlp": 0.01037051, + "balance_loss_clip": 1.22988224, + "balance_loss_mlp": 1.01819253, + "epoch": 0.8756951751089734, + "flos": 20093595753600.0, + "grad_norm": 1.6263175282692692, + "language_loss": 0.71499807, + "learning_rate": 1.598376334037408e-07, + "loss": 0.7392804, + "num_input_tokens_seen": 314062225, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18859863, + "step": 14565, + "time_per_iteration": 2.835254669189453 + }, + { + "auxiliary_loss_clip": 0.01420417, + "auxiliary_loss_mlp": 0.01036082, + "balance_loss_clip": 1.25334954, + "balance_loss_mlp": 1.01593542, + "epoch": 0.8757552983616413, + "flos": 27536271217920.0, + "grad_norm": 1.4425460454413448, + "language_loss": 0.78255939, + "learning_rate": 1.5968510535303102e-07, + "loss": 0.80712438, + "num_input_tokens_seen": 314082325, + "router_z_loss_clip": 1.66992188, + "router_z_loss_mlp": 0.20141602, + "step": 14566, + "time_per_iteration": 4.29935884475708 + }, + { + "auxiliary_loss_clip": 0.0139438, + "auxiliary_loss_mlp": 0.01031299, + "balance_loss_clip": 1.23565209, + "balance_loss_mlp": 1.01244068, + "epoch": 0.8758154216143094, + "flos": 18080964979200.0, + "grad_norm": 1.6815790117725005, + "language_loss": 0.71809208, + "learning_rate": 1.5953264708724624e-07, + "loss": 0.74234891, + "num_input_tokens_seen": 314100310, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18859863, + "step": 14567, + "time_per_iteration": 2.838391065597534 + }, + { + "auxiliary_loss_clip": 0.01387547, + "auxiliary_loss_mlp": 0.01033181, + "balance_loss_clip": 1.22979569, + "balance_loss_mlp": 1.0140475, + "epoch": 0.8758755448669773, + "flos": 25056450190080.0, + "grad_norm": 1.6454278888543241, + "language_loss": 0.74706745, + "learning_rate": 1.5938025861216776e-07, + "loss": 0.77127481, + "num_input_tokens_seen": 314121330, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.19116211, + "step": 14568, + "time_per_iteration": 2.868021249771118 + }, + { + "auxiliary_loss_clip": 0.01398832, + "auxiliary_loss_mlp": 0.01034433, + "balance_loss_clip": 1.24015117, + "balance_loss_mlp": 1.01560974, + "epoch": 0.8759356681196453, + "flos": 22867092650880.0, + "grad_norm": 2.0044067074477043, + "language_loss": 0.87692624, + "learning_rate": 1.5922793993357475e-07, + "loss": 0.90125889, + "num_input_tokens_seen": 314139875, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18835449, + "step": 14569, + "time_per_iteration": 2.8624300956726074 + }, + { + "auxiliary_loss_clip": 0.0139688, + "auxiliary_loss_mlp": 0.01032755, + "balance_loss_clip": 1.23741734, + "balance_loss_mlp": 1.01334774, + "epoch": 0.8759957913723132, + "flos": 21042318862080.0, + "grad_norm": 1.6172541537692908, + "language_loss": 0.7441141, + "learning_rate": 1.5907569105724284e-07, + "loss": 0.76841044, + "num_input_tokens_seen": 314157850, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.1940918, + "step": 14570, + "time_per_iteration": 2.8491714000701904 + }, + { + "auxiliary_loss_clip": 0.01406136, + "auxiliary_loss_mlp": 0.01033535, + "balance_loss_clip": 1.24298334, + "balance_loss_mlp": 1.0144136, + "epoch": 0.8760559146249812, + "flos": 20019882695040.0, + "grad_norm": 1.5786310937617332, + "language_loss": 0.68108803, + "learning_rate": 1.5892351198894472e-07, + "loss": 0.70548469, + "num_input_tokens_seen": 314176720, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19116211, + "step": 14571, + "time_per_iteration": 4.36399245262146 + }, + { + "auxiliary_loss_clip": 0.01389846, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.23225379, + "balance_loss_mlp": 1.01394784, + "epoch": 0.8761160378776491, + "flos": 19983614348160.0, + "grad_norm": 1.7881219684113248, + "language_loss": 0.63723749, + "learning_rate": 1.5877140273445156e-07, + "loss": 0.66146815, + "num_input_tokens_seen": 314196645, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.19287109, + "step": 14572, + "time_per_iteration": 2.842728853225708 + }, + { + "auxiliary_loss_clip": 0.01389908, + "auxiliary_loss_mlp": 0.01030996, + "balance_loss_clip": 1.23241735, + "balance_loss_mlp": 1.01269722, + "epoch": 0.8761761611303172, + "flos": 28816341131520.0, + "grad_norm": 1.6659248974297587, + "language_loss": 0.74431419, + "learning_rate": 1.5861936329953162e-07, + "loss": 0.76852322, + "num_input_tokens_seen": 314217430, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.1829834, + "step": 14573, + "time_per_iteration": 2.8893232345581055 + }, + { + "auxiliary_loss_clip": 0.01379174, + "auxiliary_loss_mlp": 0.01030418, + "balance_loss_clip": 1.22454512, + "balance_loss_mlp": 1.01234591, + "epoch": 0.8762362843829851, + "flos": 18341584882560.0, + "grad_norm": 2.179745025027988, + "language_loss": 0.74026251, + "learning_rate": 1.5846739368994966e-07, + "loss": 0.76435846, + "num_input_tokens_seen": 314235310, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.18054199, + "step": 14574, + "time_per_iteration": 2.9263434410095215 + }, + { + "auxiliary_loss_clip": 0.01397753, + "auxiliary_loss_mlp": 0.01033168, + "balance_loss_clip": 1.23825634, + "balance_loss_mlp": 1.01347494, + "epoch": 0.8762964076356531, + "flos": 15787734082560.0, + "grad_norm": 2.015437702081427, + "language_loss": 0.7637105, + "learning_rate": 1.5831549391146903e-07, + "loss": 0.78801978, + "num_input_tokens_seen": 314252355, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.19677734, + "step": 14575, + "time_per_iteration": 2.8207294940948486 + }, + { + "auxiliary_loss_clip": 0.013777, + "auxiliary_loss_mlp": 0.0103355, + "balance_loss_clip": 1.22214651, + "balance_loss_mlp": 1.01482248, + "epoch": 0.8763565308883211, + "flos": 33188269489920.0, + "grad_norm": 1.7746192610879317, + "language_loss": 0.67600274, + "learning_rate": 1.5816366396984916e-07, + "loss": 0.70011526, + "num_input_tokens_seen": 314272755, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.18737793, + "step": 14576, + "time_per_iteration": 2.954775094985962 + }, + { + "auxiliary_loss_clip": 0.01390121, + "auxiliary_loss_mlp": 0.01029403, + "balance_loss_clip": 1.23260164, + "balance_loss_mlp": 1.0113194, + "epoch": 0.876416654140989, + "flos": 15896584368000.0, + "grad_norm": 1.9809475219832893, + "language_loss": 0.67364711, + "learning_rate": 1.5801190387084806e-07, + "loss": 0.69784236, + "num_input_tokens_seen": 314291365, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.18078613, + "step": 14577, + "time_per_iteration": 2.841046094894409 + }, + { + "auxiliary_loss_clip": 0.01409063, + "auxiliary_loss_mlp": 0.01033738, + "balance_loss_clip": 1.24731505, + "balance_loss_mlp": 1.0138545, + "epoch": 0.876476777393657, + "flos": 25896323013120.0, + "grad_norm": 2.7868600487990354, + "language_loss": 0.71836185, + "learning_rate": 1.5786021362021962e-07, + "loss": 0.74278986, + "num_input_tokens_seen": 314310075, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19897461, + "step": 14578, + "time_per_iteration": 2.867885112762451 + }, + { + "auxiliary_loss_clip": 0.01409704, + "auxiliary_loss_mlp": 0.01035924, + "balance_loss_clip": 1.24710107, + "balance_loss_mlp": 1.01706553, + "epoch": 0.876536900646325, + "flos": 13597607381760.0, + "grad_norm": 2.5019893296024174, + "language_loss": 0.71905124, + "learning_rate": 1.5770859322371676e-07, + "loss": 0.7435075, + "num_input_tokens_seen": 314325695, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.18847656, + "step": 14579, + "time_per_iteration": 2.86909556388855 + }, + { + "auxiliary_loss_clip": 0.01383515, + "auxiliary_loss_mlp": 0.01034091, + "balance_loss_clip": 1.2280854, + "balance_loss_mlp": 1.01516104, + "epoch": 0.876597023898993, + "flos": 12210949422720.0, + "grad_norm": 1.7978800231177932, + "language_loss": 0.71449172, + "learning_rate": 1.5755704268708912e-07, + "loss": 0.73866779, + "num_input_tokens_seen": 314343605, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.18933105, + "step": 14580, + "time_per_iteration": 2.8579955101013184 + }, + { + "auxiliary_loss_clip": 0.01390377, + "auxiliary_loss_mlp": 0.01030381, + "balance_loss_clip": 1.23419499, + "balance_loss_mlp": 1.01202297, + "epoch": 0.8766571471516609, + "flos": 25347411371520.0, + "grad_norm": 2.7907756847945406, + "language_loss": 0.66408157, + "learning_rate": 1.5740556201608256e-07, + "loss": 0.68828917, + "num_input_tokens_seen": 314364275, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.18359375, + "step": 14581, + "time_per_iteration": 2.8741085529327393 + }, + { + "auxiliary_loss_clip": 0.01385503, + "auxiliary_loss_mlp": 0.01030366, + "balance_loss_clip": 1.22862625, + "balance_loss_mlp": 1.0117451, + "epoch": 0.8767172704043289, + "flos": 30124354348800.0, + "grad_norm": 1.4335121941418636, + "language_loss": 0.73789227, + "learning_rate": 1.572541512164416e-07, + "loss": 0.76205099, + "num_input_tokens_seen": 314385140, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.18615723, + "step": 14582, + "time_per_iteration": 2.9132890701293945 + }, + { + "auxiliary_loss_clip": 0.01385493, + "auxiliary_loss_mlp": 0.01035897, + "balance_loss_clip": 1.22710896, + "balance_loss_mlp": 1.01638246, + "epoch": 0.8767773936569968, + "flos": 19290488970240.0, + "grad_norm": 2.21964689269826, + "language_loss": 0.68084991, + "learning_rate": 1.5710281029390826e-07, + "loss": 0.70506388, + "num_input_tokens_seen": 314403715, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.19519043, + "step": 14583, + "time_per_iteration": 2.8329358100891113 + }, + { + "auxiliary_loss_clip": 0.014168, + "auxiliary_loss_mlp": 0.01031873, + "balance_loss_clip": 1.25446367, + "balance_loss_mlp": 1.013098, + "epoch": 0.8768375169096648, + "flos": 21255992645760.0, + "grad_norm": 1.9964576809012997, + "language_loss": 0.79557854, + "learning_rate": 1.5695153925422067e-07, + "loss": 0.82006526, + "num_input_tokens_seen": 314421880, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18786621, + "step": 14584, + "time_per_iteration": 2.8180973529815674 + }, + { + "auxiliary_loss_clip": 0.01410485, + "auxiliary_loss_mlp": 0.01029965, + "balance_loss_clip": 1.2480123, + "balance_loss_mlp": 1.01074874, + "epoch": 0.8768976401623327, + "flos": 23306158621440.0, + "grad_norm": 1.5984353444332926, + "language_loss": 0.72999781, + "learning_rate": 1.5680033810311555e-07, + "loss": 0.75440234, + "num_input_tokens_seen": 314441585, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.1920166, + "step": 14585, + "time_per_iteration": 2.9110145568847656 + }, + { + "auxiliary_loss_clip": 0.01380807, + "auxiliary_loss_mlp": 0.01032752, + "balance_loss_clip": 1.22487926, + "balance_loss_mlp": 1.01309419, + "epoch": 0.8769577634150008, + "flos": 21371222448000.0, + "grad_norm": 1.7515902263188996, + "language_loss": 0.74703622, + "learning_rate": 1.5664920684632654e-07, + "loss": 0.77117181, + "num_input_tokens_seen": 314459020, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.19665527, + "step": 14586, + "time_per_iteration": 2.8584306240081787 + }, + { + "auxiliary_loss_clip": 0.01403366, + "auxiliary_loss_mlp": 0.01034804, + "balance_loss_clip": 1.24390531, + "balance_loss_mlp": 1.01604033, + "epoch": 0.8770178866676687, + "flos": 23524628353920.0, + "grad_norm": 1.7428649098725366, + "language_loss": 0.79058915, + "learning_rate": 1.564981454895844e-07, + "loss": 0.81497085, + "num_input_tokens_seen": 314478935, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18786621, + "step": 14587, + "time_per_iteration": 2.868985414505005 + }, + { + "auxiliary_loss_clip": 0.01396109, + "auxiliary_loss_mlp": 0.01033142, + "balance_loss_clip": 1.23637271, + "balance_loss_mlp": 1.01357996, + "epoch": 0.8770780099203367, + "flos": 19728514310400.0, + "grad_norm": 1.6268359934164138, + "language_loss": 0.74307525, + "learning_rate": 1.5634715403861697e-07, + "loss": 0.76736778, + "num_input_tokens_seen": 314497635, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19543457, + "step": 14588, + "time_per_iteration": 2.8455989360809326 + }, + { + "auxiliary_loss_clip": 0.01381422, + "auxiliary_loss_mlp": 0.01032047, + "balance_loss_clip": 1.22421694, + "balance_loss_mlp": 1.01381946, + "epoch": 0.8771381331730047, + "flos": 21405409534080.0, + "grad_norm": 1.9364790967196666, + "language_loss": 0.67029893, + "learning_rate": 1.5619623249915016e-07, + "loss": 0.69443369, + "num_input_tokens_seen": 314515445, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.18225098, + "step": 14589, + "time_per_iteration": 2.8480799198150635 + }, + { + "auxiliary_loss_clip": 0.01383104, + "auxiliary_loss_mlp": 0.01033536, + "balance_loss_clip": 1.22532821, + "balance_loss_mlp": 1.01448607, + "epoch": 0.8771982564256726, + "flos": 20270186784000.0, + "grad_norm": 2.5196384827780807, + "language_loss": 0.71562874, + "learning_rate": 1.5604538087690732e-07, + "loss": 0.73979521, + "num_input_tokens_seen": 314533040, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.19055176, + "step": 14590, + "time_per_iteration": 2.952967882156372 + }, + { + "auxiliary_loss_clip": 0.01408915, + "auxiliary_loss_mlp": 0.01036082, + "balance_loss_clip": 1.24258351, + "balance_loss_mlp": 1.01563811, + "epoch": 0.8772583796783406, + "flos": 12495531087360.0, + "grad_norm": 1.9319206192710896, + "language_loss": 0.74816173, + "learning_rate": 1.558945991776086e-07, + "loss": 0.77261162, + "num_input_tokens_seen": 314548280, + "router_z_loss_clip": 1.6640625, + "router_z_loss_mlp": 0.20446777, + "step": 14591, + "time_per_iteration": 2.776198148727417 + }, + { + "auxiliary_loss_clip": 0.01381547, + "auxiliary_loss_mlp": 0.01033162, + "balance_loss_clip": 1.22747695, + "balance_loss_mlp": 1.01458931, + "epoch": 0.8773185029310085, + "flos": 15928690193280.0, + "grad_norm": 1.9151856068137083, + "language_loss": 0.81317222, + "learning_rate": 1.5574388740697096e-07, + "loss": 0.83731937, + "num_input_tokens_seen": 314565345, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.18579102, + "step": 14592, + "time_per_iteration": 2.9132721424102783 + }, + { + "auxiliary_loss_clip": 0.01381714, + "auxiliary_loss_mlp": 0.01032072, + "balance_loss_clip": 1.22614098, + "balance_loss_mlp": 1.01411915, + "epoch": 0.8773786261836766, + "flos": 21513852616320.0, + "grad_norm": 1.690862932115398, + "language_loss": 0.83161139, + "learning_rate": 1.5559324557071052e-07, + "loss": 0.85574925, + "num_input_tokens_seen": 314584190, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.17956543, + "step": 14593, + "time_per_iteration": 2.8362715244293213 + }, + { + "auxiliary_loss_clip": 0.01388449, + "auxiliary_loss_mlp": 0.01028872, + "balance_loss_clip": 1.23128688, + "balance_loss_mlp": 1.01069212, + "epoch": 0.8774387494363445, + "flos": 26772961875840.0, + "grad_norm": 1.4670691109880925, + "language_loss": 0.7688396, + "learning_rate": 1.5544267367453845e-07, + "loss": 0.7930128, + "num_input_tokens_seen": 314605625, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18188477, + "step": 14594, + "time_per_iteration": 2.852783441543579 + }, + { + "auxiliary_loss_clip": 0.01394728, + "auxiliary_loss_mlp": 0.01031624, + "balance_loss_clip": 1.23414993, + "balance_loss_mlp": 1.01289654, + "epoch": 0.8774988726890125, + "flos": 18488060858880.0, + "grad_norm": 2.1349559370271036, + "language_loss": 0.78303945, + "learning_rate": 1.552921717241651e-07, + "loss": 0.80730295, + "num_input_tokens_seen": 314622630, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18713379, + "step": 14595, + "time_per_iteration": 2.8018088340759277 + }, + { + "auxiliary_loss_clip": 0.01397398, + "auxiliary_loss_mlp": 0.01035302, + "balance_loss_clip": 1.23877704, + "balance_loss_mlp": 1.01507223, + "epoch": 0.8775589959416804, + "flos": 24437218849920.0, + "grad_norm": 1.6583380496231788, + "language_loss": 0.71212029, + "learning_rate": 1.5514173972529743e-07, + "loss": 0.73644727, + "num_input_tokens_seen": 314642460, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.20227051, + "step": 14596, + "time_per_iteration": 2.8449621200561523 + }, + { + "auxiliary_loss_clip": 0.01388405, + "auxiliary_loss_mlp": 0.01030643, + "balance_loss_clip": 1.23257017, + "balance_loss_mlp": 1.01227331, + "epoch": 0.8776191191943484, + "flos": 23450462847360.0, + "grad_norm": 2.1011887746210602, + "language_loss": 0.86358106, + "learning_rate": 1.5499137768364067e-07, + "loss": 0.88777155, + "num_input_tokens_seen": 314659875, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.18383789, + "step": 14597, + "time_per_iteration": 4.327143430709839 + }, + { + "auxiliary_loss_clip": 0.01396384, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.2381351, + "balance_loss_mlp": 1.01315987, + "epoch": 0.8776792424470163, + "flos": 26841426537600.0, + "grad_norm": 1.6166757779020804, + "language_loss": 0.73368144, + "learning_rate": 1.5484108560489494e-07, + "loss": 0.75796449, + "num_input_tokens_seen": 314680260, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18786621, + "step": 14598, + "time_per_iteration": 2.8756613731384277 + }, + { + "auxiliary_loss_clip": 0.01396831, + "auxiliary_loss_mlp": 0.01035327, + "balance_loss_clip": 1.23661029, + "balance_loss_mlp": 1.01586008, + "epoch": 0.8777393656996844, + "flos": 15632887818240.0, + "grad_norm": 2.6212599022039225, + "language_loss": 0.78098023, + "learning_rate": 1.5469086349476036e-07, + "loss": 0.80530179, + "num_input_tokens_seen": 314696260, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19470215, + "step": 14599, + "time_per_iteration": 2.7501723766326904 + }, + { + "auxiliary_loss_clip": 0.0139872, + "auxiliary_loss_mlp": 0.01030656, + "balance_loss_clip": 1.23832238, + "balance_loss_mlp": 1.01174939, + "epoch": 0.8777994889523523, + "flos": 18889274914560.0, + "grad_norm": 6.159801646665903, + "language_loss": 0.68946266, + "learning_rate": 1.545407113589332e-07, + "loss": 0.71375644, + "num_input_tokens_seen": 314714215, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18908691, + "step": 14600, + "time_per_iteration": 4.201138734817505 + }, + { + "auxiliary_loss_clip": 0.01397392, + "auxiliary_loss_mlp": 0.01034453, + "balance_loss_clip": 1.23707318, + "balance_loss_mlp": 1.01516521, + "epoch": 0.8778596122050203, + "flos": 48841408995840.0, + "grad_norm": 2.2589993412931935, + "language_loss": 0.70093369, + "learning_rate": 1.543906292031072e-07, + "loss": 0.72525215, + "num_input_tokens_seen": 314735700, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19287109, + "step": 14601, + "time_per_iteration": 3.061053514480591 + }, + { + "auxiliary_loss_clip": 0.01420357, + "auxiliary_loss_mlp": 0.01035132, + "balance_loss_clip": 1.25425267, + "balance_loss_mlp": 1.01561701, + "epoch": 0.8779197354576883, + "flos": 25669618727040.0, + "grad_norm": 1.7615481831984088, + "language_loss": 0.73564279, + "learning_rate": 1.542406170329733e-07, + "loss": 0.7601977, + "num_input_tokens_seen": 314753335, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.19506836, + "step": 14602, + "time_per_iteration": 2.858706474304199 + }, + { + "auxiliary_loss_clip": 0.01387751, + "auxiliary_loss_mlp": 0.01031819, + "balance_loss_clip": 1.23018765, + "balance_loss_mlp": 1.01332974, + "epoch": 0.8779798587103562, + "flos": 18852418385280.0, + "grad_norm": 1.7066607159214915, + "language_loss": 0.71157008, + "learning_rate": 1.5409067485422056e-07, + "loss": 0.73576576, + "num_input_tokens_seen": 314770800, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.18481445, + "step": 14603, + "time_per_iteration": 2.8407068252563477 + }, + { + "auxiliary_loss_clip": 0.0117706, + "auxiliary_loss_mlp": 0.01012994, + "balance_loss_clip": 1.08931136, + "balance_loss_mlp": 0.99659103, + "epoch": 0.8780399819630242, + "flos": 68645554629120.0, + "grad_norm": 0.7469514709765466, + "language_loss": 0.54172802, + "learning_rate": 1.539408026725344e-07, + "loss": 0.56362855, + "num_input_tokens_seen": 314837275, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.1640625, + "step": 14604, + "time_per_iteration": 3.3702590465545654 + }, + { + "auxiliary_loss_clip": 0.01178228, + "auxiliary_loss_mlp": 0.01031862, + "balance_loss_clip": 1.08956671, + "balance_loss_mlp": 1.00668514, + "epoch": 0.8781001052156922, + "flos": 65767550947200.0, + "grad_norm": 0.7239806653138193, + "language_loss": 0.59339118, + "learning_rate": 1.537910004935976e-07, + "loss": 0.61549211, + "num_input_tokens_seen": 314902220, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 0.25195312, + "step": 14605, + "time_per_iteration": 3.2586140632629395 + }, + { + "auxiliary_loss_clip": 0.0140417, + "auxiliary_loss_mlp": 0.01035298, + "balance_loss_clip": 1.24304247, + "balance_loss_mlp": 1.01575947, + "epoch": 0.8781602284683602, + "flos": 22058556491520.0, + "grad_norm": 8.75219506488415, + "language_loss": 0.85549718, + "learning_rate": 1.536412683230912e-07, + "loss": 0.87989187, + "num_input_tokens_seen": 314921645, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.19543457, + "step": 14606, + "time_per_iteration": 5.681826591491699 + }, + { + "auxiliary_loss_clip": 0.01416572, + "auxiliary_loss_mlp": 0.01035605, + "balance_loss_clip": 1.25366354, + "balance_loss_mlp": 1.01592314, + "epoch": 0.8782203517210281, + "flos": 17570945882880.0, + "grad_norm": 1.978409700222824, + "language_loss": 0.71401924, + "learning_rate": 1.534916061666931e-07, + "loss": 0.73854101, + "num_input_tokens_seen": 314939390, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19689941, + "step": 14607, + "time_per_iteration": 2.8328754901885986 + }, + { + "auxiliary_loss_clip": 0.01392499, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.2358644, + "balance_loss_mlp": 1.01321173, + "epoch": 0.8782804749736961, + "flos": 25531422549120.0, + "grad_norm": 3.6225218746011856, + "language_loss": 0.73087358, + "learning_rate": 1.533420140300785e-07, + "loss": 0.75512034, + "num_input_tokens_seen": 314959205, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.18969727, + "step": 14608, + "time_per_iteration": 2.881551742553711 + }, + { + "auxiliary_loss_clip": 0.01415448, + "auxiliary_loss_mlp": 0.01035255, + "balance_loss_clip": 1.25058544, + "balance_loss_mlp": 1.01525187, + "epoch": 0.878340598226364, + "flos": 21808750095360.0, + "grad_norm": 2.069137994892772, + "language_loss": 0.88645566, + "learning_rate": 1.5319249191891936e-07, + "loss": 0.9109627, + "num_input_tokens_seen": 314977485, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.20031738, + "step": 14609, + "time_per_iteration": 2.8457159996032715 + }, + { + "auxiliary_loss_clip": 0.01395443, + "auxiliary_loss_mlp": 0.01032743, + "balance_loss_clip": 1.23567319, + "balance_loss_mlp": 1.01203609, + "epoch": 0.878400721479032, + "flos": 21111416951040.0, + "grad_norm": 1.5599117450577145, + "language_loss": 0.7061249, + "learning_rate": 1.5304303983888643e-07, + "loss": 0.73040676, + "num_input_tokens_seen": 314997830, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.20703125, + "step": 14610, + "time_per_iteration": 2.903596878051758 + }, + { + "auxiliary_loss_clip": 0.01386773, + "auxiliary_loss_mlp": 0.01035438, + "balance_loss_clip": 1.2306838, + "balance_loss_mlp": 1.01599479, + "epoch": 0.8784608447316999, + "flos": 20933423331840.0, + "grad_norm": 1.8702494056574528, + "language_loss": 0.81922328, + "learning_rate": 1.5289365779564612e-07, + "loss": 0.84344536, + "num_input_tokens_seen": 315016480, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.19433594, + "step": 14611, + "time_per_iteration": 2.8051180839538574 + }, + { + "auxiliary_loss_clip": 0.01394445, + "auxiliary_loss_mlp": 0.01030941, + "balance_loss_clip": 1.2344799, + "balance_loss_mlp": 1.012321, + "epoch": 0.878520967984368, + "flos": 23340933889920.0, + "grad_norm": 1.6036091875098124, + "language_loss": 0.77053881, + "learning_rate": 1.5274434579486338e-07, + "loss": 0.79479271, + "num_input_tokens_seen": 315036135, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18615723, + "step": 14612, + "time_per_iteration": 2.832141876220703 + }, + { + "auxiliary_loss_clip": 0.01389753, + "auxiliary_loss_mlp": 0.01035448, + "balance_loss_clip": 1.23250699, + "balance_loss_mlp": 1.01617205, + "epoch": 0.8785810912370359, + "flos": 25529612757120.0, + "grad_norm": 1.6176320926207248, + "language_loss": 0.72766954, + "learning_rate": 1.525951038422002e-07, + "loss": 0.75192153, + "num_input_tokens_seen": 315057995, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.19274902, + "step": 14613, + "time_per_iteration": 2.848081111907959 + }, + { + "auxiliary_loss_clip": 0.01179751, + "auxiliary_loss_mlp": 0.01028732, + "balance_loss_clip": 1.09296107, + "balance_loss_mlp": 1.00841904, + "epoch": 0.8786412144897039, + "flos": 61865961995520.0, + "grad_norm": 1.0427451514799915, + "language_loss": 0.64621568, + "learning_rate": 1.5244593194331667e-07, + "loss": 0.66830051, + "num_input_tokens_seen": 315104010, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.203125, + "step": 14614, + "time_per_iteration": 3.0853798389434814 + }, + { + "auxiliary_loss_clip": 0.01178879, + "auxiliary_loss_mlp": 0.01036877, + "balance_loss_clip": 1.09280634, + "balance_loss_mlp": 1.01532364, + "epoch": 0.8787013377423719, + "flos": 71023628805120.0, + "grad_norm": 0.6636522128484623, + "language_loss": 0.58651423, + "learning_rate": 1.5229683010386762e-07, + "loss": 0.60867178, + "num_input_tokens_seen": 315174550, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.21582031, + "step": 14615, + "time_per_iteration": 3.3477370738983154 + }, + { + "auxiliary_loss_clip": 0.01389036, + "auxiliary_loss_mlp": 0.01034402, + "balance_loss_clip": 1.22949767, + "balance_loss_mlp": 1.0158534, + "epoch": 0.8787614609950398, + "flos": 17356231468800.0, + "grad_norm": 2.617062150205355, + "language_loss": 0.73336446, + "learning_rate": 1.5214779832950807e-07, + "loss": 0.75759882, + "num_input_tokens_seen": 315191825, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18579102, + "step": 14616, + "time_per_iteration": 2.7977709770202637 + }, + { + "auxiliary_loss_clip": 0.01175215, + "auxiliary_loss_mlp": 0.01019611, + "balance_loss_clip": 1.08943594, + "balance_loss_mlp": 1.0046382, + "epoch": 0.8788215842477078, + "flos": 72543370279680.0, + "grad_norm": 0.8560705821834971, + "language_loss": 0.57990479, + "learning_rate": 1.5199883662588953e-07, + "loss": 0.60185301, + "num_input_tokens_seen": 315255075, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.14941406, + "step": 14617, + "time_per_iteration": 3.402327299118042 + }, + { + "auxiliary_loss_clip": 0.01382969, + "auxiliary_loss_mlp": 0.01032791, + "balance_loss_clip": 1.22727919, + "balance_loss_mlp": 1.01370549, + "epoch": 0.8788817075003758, + "flos": 24838342416000.0, + "grad_norm": 1.763252977387195, + "language_loss": 0.84048969, + "learning_rate": 1.5184994499865987e-07, + "loss": 0.86464727, + "num_input_tokens_seen": 315273995, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.19091797, + "step": 14618, + "time_per_iteration": 2.876068353652954 + }, + { + "auxiliary_loss_clip": 0.01376423, + "auxiliary_loss_mlp": 0.01027606, + "balance_loss_clip": 1.22369206, + "balance_loss_mlp": 1.00909317, + "epoch": 0.8789418307530438, + "flos": 22649211100800.0, + "grad_norm": 1.5968210255990656, + "language_loss": 0.69723642, + "learning_rate": 1.5170112345346598e-07, + "loss": 0.7212767, + "num_input_tokens_seen": 315294485, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.18505859, + "step": 14619, + "time_per_iteration": 2.859360456466675 + }, + { + "auxiliary_loss_clip": 0.01403226, + "auxiliary_loss_mlp": 0.01036355, + "balance_loss_clip": 1.2418592, + "balance_loss_mlp": 1.01772261, + "epoch": 0.8790019540057117, + "flos": 19793585612160.0, + "grad_norm": 1.878970990706346, + "language_loss": 0.776941, + "learning_rate": 1.5155237199595016e-07, + "loss": 0.80133682, + "num_input_tokens_seen": 315310420, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18615723, + "step": 14620, + "time_per_iteration": 2.92250919342041 + }, + { + "auxiliary_loss_clip": 0.01401596, + "auxiliary_loss_mlp": 0.01029394, + "balance_loss_clip": 1.24087989, + "balance_loss_mlp": 1.00953364, + "epoch": 0.8790620772583797, + "flos": 20239347813120.0, + "grad_norm": 1.707788577652126, + "language_loss": 0.80085862, + "learning_rate": 1.514036906317542e-07, + "loss": 0.82516849, + "num_input_tokens_seen": 315330110, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19873047, + "step": 14621, + "time_per_iteration": 2.8714306354522705 + }, + { + "auxiliary_loss_clip": 0.01413481, + "auxiliary_loss_mlp": 0.01035595, + "balance_loss_clip": 1.24906647, + "balance_loss_mlp": 1.01621151, + "epoch": 0.8791222005110476, + "flos": 24140602068480.0, + "grad_norm": 1.56990054084633, + "language_loss": 0.67528582, + "learning_rate": 1.5125507936651506e-07, + "loss": 0.69977659, + "num_input_tokens_seen": 315350080, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19396973, + "step": 14622, + "time_per_iteration": 2.8703458309173584 + }, + { + "auxiliary_loss_clip": 0.01385225, + "auxiliary_loss_mlp": 0.01033557, + "balance_loss_clip": 1.22909999, + "balance_loss_mlp": 1.01482975, + "epoch": 0.8791823237637156, + "flos": 21623607797760.0, + "grad_norm": 2.116895632549563, + "language_loss": 0.73523831, + "learning_rate": 1.511065382058687e-07, + "loss": 0.75942612, + "num_input_tokens_seen": 315366360, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.18725586, + "step": 14623, + "time_per_iteration": 2.802783250808716 + }, + { + "auxiliary_loss_clip": 0.01384386, + "auxiliary_loss_mlp": 0.01033567, + "balance_loss_clip": 1.22617042, + "balance_loss_mlp": 1.01446986, + "epoch": 0.8792424470163835, + "flos": 24254067323520.0, + "grad_norm": 3.6767223591398803, + "language_loss": 0.79262757, + "learning_rate": 1.5095806715544801e-07, + "loss": 0.81680715, + "num_input_tokens_seen": 315385890, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.19104004, + "step": 14624, + "time_per_iteration": 2.868772268295288 + }, + { + "auxiliary_loss_clip": 0.01394809, + "auxiliary_loss_mlp": 0.01036265, + "balance_loss_clip": 1.23492205, + "balance_loss_mlp": 1.01493812, + "epoch": 0.8793025702690516, + "flos": 24902237352960.0, + "grad_norm": 1.9675887066562934, + "language_loss": 0.8028627, + "learning_rate": 1.5080966622088265e-07, + "loss": 0.82717347, + "num_input_tokens_seen": 315403400, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.21325684, + "step": 14625, + "time_per_iteration": 2.8570480346679688 + }, + { + "auxiliary_loss_clip": 0.01382149, + "auxiliary_loss_mlp": 0.01031595, + "balance_loss_clip": 1.2262882, + "balance_loss_mlp": 1.01242638, + "epoch": 0.8793626935217195, + "flos": 25383724963200.0, + "grad_norm": 1.4131729968783362, + "language_loss": 0.7427336, + "learning_rate": 1.5066133540779967e-07, + "loss": 0.76687109, + "num_input_tokens_seen": 315423670, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.19165039, + "step": 14626, + "time_per_iteration": 2.8758046627044678 + }, + { + "auxiliary_loss_clip": 0.01401841, + "auxiliary_loss_mlp": 0.01031025, + "balance_loss_clip": 1.23828506, + "balance_loss_mlp": 1.01195121, + "epoch": 0.8794228167743875, + "flos": 34691288371200.0, + "grad_norm": 1.441605071358997, + "language_loss": 0.7142241, + "learning_rate": 1.505130747218246e-07, + "loss": 0.73855281, + "num_input_tokens_seen": 315446265, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19091797, + "step": 14627, + "time_per_iteration": 2.9727444648742676 + }, + { + "auxiliary_loss_clip": 0.01392651, + "auxiliary_loss_mlp": 0.01031863, + "balance_loss_clip": 1.23369765, + "balance_loss_mlp": 1.01172817, + "epoch": 0.8794829400270555, + "flos": 19473504762240.0, + "grad_norm": 1.6657758457116463, + "language_loss": 0.72853303, + "learning_rate": 1.5036488416857873e-07, + "loss": 0.75277817, + "num_input_tokens_seen": 315464655, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.20141602, + "step": 14628, + "time_per_iteration": 2.8266279697418213 + }, + { + "auxiliary_loss_clip": 0.01398766, + "auxiliary_loss_mlp": 0.01033049, + "balance_loss_clip": 1.23792553, + "balance_loss_mlp": 1.01256931, + "epoch": 0.8795430632797234, + "flos": 15239727336960.0, + "grad_norm": 2.9938388218570546, + "language_loss": 0.70213294, + "learning_rate": 1.5021676375368175e-07, + "loss": 0.72645104, + "num_input_tokens_seen": 315481090, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.20483398, + "step": 14629, + "time_per_iteration": 2.801964282989502 + }, + { + "auxiliary_loss_clip": 0.01384133, + "auxiliary_loss_mlp": 0.01029745, + "balance_loss_clip": 1.22763729, + "balance_loss_mlp": 1.01100588, + "epoch": 0.8796031865323914, + "flos": 27755962560000.0, + "grad_norm": 1.5641589233544713, + "language_loss": 0.69605541, + "learning_rate": 1.5006871348275053e-07, + "loss": 0.72019416, + "num_input_tokens_seen": 315502010, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.18737793, + "step": 14630, + "time_per_iteration": 2.88619327545166 + }, + { + "auxiliary_loss_clip": 0.01376618, + "auxiliary_loss_mlp": 0.01036245, + "balance_loss_clip": 1.2226311, + "balance_loss_mlp": 1.01686215, + "epoch": 0.8796633097850594, + "flos": 31297971951360.0, + "grad_norm": 1.8233736347772387, + "language_loss": 0.75051999, + "learning_rate": 1.499207333613999e-07, + "loss": 0.77464867, + "num_input_tokens_seen": 315523040, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.1940918, + "step": 14631, + "time_per_iteration": 2.918863534927368 + }, + { + "auxiliary_loss_clip": 0.01374494, + "auxiliary_loss_mlp": 0.01032852, + "balance_loss_clip": 1.22157502, + "balance_loss_mlp": 1.01323009, + "epoch": 0.8797234330377274, + "flos": 24253750609920.0, + "grad_norm": 6.619806536576093, + "language_loss": 0.69976431, + "learning_rate": 1.4977282339523954e-07, + "loss": 0.72383773, + "num_input_tokens_seen": 315541865, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.19616699, + "step": 14632, + "time_per_iteration": 2.844898223876953 + }, + { + "auxiliary_loss_clip": 0.01395035, + "auxiliary_loss_mlp": 0.01034815, + "balance_loss_clip": 1.23633075, + "balance_loss_mlp": 1.01645672, + "epoch": 0.8797835562903953, + "flos": 24177277618560.0, + "grad_norm": 1.9382144419997884, + "language_loss": 0.65660071, + "learning_rate": 1.4962498358987929e-07, + "loss": 0.68089926, + "num_input_tokens_seen": 315561470, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18334961, + "step": 14633, + "time_per_iteration": 4.290616273880005 + }, + { + "auxiliary_loss_clip": 0.0138691, + "auxiliary_loss_mlp": 0.01034406, + "balance_loss_clip": 1.22890973, + "balance_loss_mlp": 1.01557112, + "epoch": 0.8798436795430633, + "flos": 19294922960640.0, + "grad_norm": 1.4185437058910058, + "language_loss": 0.8464939, + "learning_rate": 1.4947721395092528e-07, + "loss": 0.87070704, + "num_input_tokens_seen": 315583140, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18859863, + "step": 14634, + "time_per_iteration": 2.9137935638427734 + }, + { + "auxiliary_loss_clip": 0.01393343, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_clip": 1.23377872, + "balance_loss_mlp": 1.01364899, + "epoch": 0.8799038027957312, + "flos": 28189915868160.0, + "grad_norm": 1.7244594685217904, + "language_loss": 0.80747795, + "learning_rate": 1.4932951448398056e-07, + "loss": 0.83173692, + "num_input_tokens_seen": 315601935, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18920898, + "step": 14635, + "time_per_iteration": 4.290014266967773 + }, + { + "auxiliary_loss_clip": 0.01392361, + "auxiliary_loss_mlp": 0.01033898, + "balance_loss_clip": 1.23282146, + "balance_loss_mlp": 1.01489651, + "epoch": 0.8799639260483992, + "flos": 24655462358400.0, + "grad_norm": 2.0788539888289694, + "language_loss": 0.65596986, + "learning_rate": 1.4918188519464648e-07, + "loss": 0.68023247, + "num_input_tokens_seen": 315619995, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18994141, + "step": 14636, + "time_per_iteration": 2.8439388275146484 + }, + { + "auxiliary_loss_clip": 0.01390131, + "auxiliary_loss_mlp": 0.01034733, + "balance_loss_clip": 1.23079228, + "balance_loss_mlp": 1.01470566, + "epoch": 0.8800240493010671, + "flos": 22210688067840.0, + "grad_norm": 1.5280392005664172, + "language_loss": 0.70879126, + "learning_rate": 1.4903432608852074e-07, + "loss": 0.73303986, + "num_input_tokens_seen": 315637895, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.20019531, + "step": 14637, + "time_per_iteration": 2.8422062397003174 + }, + { + "auxiliary_loss_clip": 0.01398891, + "auxiliary_loss_mlp": 0.01031915, + "balance_loss_clip": 1.24013162, + "balance_loss_mlp": 1.01210284, + "epoch": 0.8800841725537352, + "flos": 14254102454400.0, + "grad_norm": 2.6093583225675876, + "language_loss": 0.67923868, + "learning_rate": 1.4888683717119843e-07, + "loss": 0.70354676, + "num_input_tokens_seen": 315655520, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.19812012, + "step": 14638, + "time_per_iteration": 2.810321807861328 + }, + { + "auxiliary_loss_clip": 0.01406538, + "auxiliary_loss_mlp": 0.01031902, + "balance_loss_clip": 1.24612379, + "balance_loss_mlp": 1.01255453, + "epoch": 0.8801442958064031, + "flos": 37430145734400.0, + "grad_norm": 2.2266169242758878, + "language_loss": 0.59313339, + "learning_rate": 1.4873941844827286e-07, + "loss": 0.61751777, + "num_input_tokens_seen": 315678955, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19348145, + "step": 14639, + "time_per_iteration": 3.013446569442749 + }, + { + "auxiliary_loss_clip": 0.01390442, + "auxiliary_loss_mlp": 0.01033019, + "balance_loss_clip": 1.23135364, + "balance_loss_mlp": 1.01315844, + "epoch": 0.8802044190590711, + "flos": 25058531450880.0, + "grad_norm": 1.45009205721012, + "language_loss": 0.74961412, + "learning_rate": 1.4859206992533402e-07, + "loss": 0.77384871, + "num_input_tokens_seen": 315700360, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.19848633, + "step": 14640, + "time_per_iteration": 2.9164555072784424 + }, + { + "auxiliary_loss_clip": 0.01390264, + "auxiliary_loss_mlp": 0.01040996, + "balance_loss_clip": 1.23073173, + "balance_loss_mlp": 1.01965714, + "epoch": 0.8802645423117391, + "flos": 24144493121280.0, + "grad_norm": 2.0177159684772863, + "language_loss": 0.70870179, + "learning_rate": 1.4844479160796985e-07, + "loss": 0.73301435, + "num_input_tokens_seen": 315719270, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.21337891, + "step": 14641, + "time_per_iteration": 5.67152214050293 + }, + { + "auxiliary_loss_clip": 0.01402593, + "auxiliary_loss_mlp": 0.01030614, + "balance_loss_clip": 1.23952246, + "balance_loss_mlp": 1.01070607, + "epoch": 0.880324665564407, + "flos": 17940189847680.0, + "grad_norm": 4.169805865458593, + "language_loss": 0.86125231, + "learning_rate": 1.4829758350176457e-07, + "loss": 0.88558441, + "num_input_tokens_seen": 315737425, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.19897461, + "step": 14642, + "time_per_iteration": 2.8256890773773193 + }, + { + "auxiliary_loss_clip": 0.01390349, + "auxiliary_loss_mlp": 0.01038069, + "balance_loss_clip": 1.23245859, + "balance_loss_mlp": 1.01826787, + "epoch": 0.880384788817075, + "flos": 21297418899840.0, + "grad_norm": 1.6647989203135247, + "language_loss": 0.79552573, + "learning_rate": 1.4815044561230038e-07, + "loss": 0.81980979, + "num_input_tokens_seen": 315755725, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.19787598, + "step": 14643, + "time_per_iteration": 2.895221471786499 + }, + { + "auxiliary_loss_clip": 0.01378656, + "auxiliary_loss_mlp": 0.01028412, + "balance_loss_clip": 1.22411537, + "balance_loss_mlp": 1.01012516, + "epoch": 0.880444912069743, + "flos": 12466094705280.0, + "grad_norm": 1.6173723647366962, + "language_loss": 0.73938197, + "learning_rate": 1.4800337794515705e-07, + "loss": 0.76345265, + "num_input_tokens_seen": 315773835, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.1829834, + "step": 14644, + "time_per_iteration": 2.820234775543213 + }, + { + "auxiliary_loss_clip": 0.01406461, + "auxiliary_loss_mlp": 0.01029211, + "balance_loss_clip": 1.24347115, + "balance_loss_mlp": 1.01000643, + "epoch": 0.880505035322411, + "flos": 13633830483840.0, + "grad_norm": 2.0447695726905732, + "language_loss": 0.80270636, + "learning_rate": 1.47856380505911e-07, + "loss": 0.82706308, + "num_input_tokens_seen": 315790615, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19213867, + "step": 14645, + "time_per_iteration": 2.8141956329345703 + }, + { + "auxiliary_loss_clip": 0.01380749, + "auxiliary_loss_mlp": 0.01028676, + "balance_loss_clip": 1.22723866, + "balance_loss_mlp": 1.00905371, + "epoch": 0.8805651585750789, + "flos": 23192874345600.0, + "grad_norm": 2.33224410654509, + "language_loss": 0.64830041, + "learning_rate": 1.477094533001364e-07, + "loss": 0.67239463, + "num_input_tokens_seen": 315811010, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.19616699, + "step": 14646, + "time_per_iteration": 2.875166177749634 + }, + { + "auxiliary_loss_clip": 0.01407559, + "auxiliary_loss_mlp": 0.01034025, + "balance_loss_clip": 1.24293959, + "balance_loss_mlp": 1.01440287, + "epoch": 0.8806252818277469, + "flos": 14911230954240.0, + "grad_norm": 2.548596802725856, + "language_loss": 0.78838444, + "learning_rate": 1.475625963334055e-07, + "loss": 0.81280029, + "num_input_tokens_seen": 315828130, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19641113, + "step": 14647, + "time_per_iteration": 2.814800500869751 + }, + { + "auxiliary_loss_clip": 0.01395829, + "auxiliary_loss_mlp": 0.01034057, + "balance_loss_clip": 1.2377516, + "balance_loss_mlp": 1.01571107, + "epoch": 0.8806854050804148, + "flos": 17648052301440.0, + "grad_norm": 2.09306913546854, + "language_loss": 0.75911963, + "learning_rate": 1.4741580961128652e-07, + "loss": 0.78341854, + "num_input_tokens_seen": 315844900, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.18347168, + "step": 14648, + "time_per_iteration": 2.834747314453125 + }, + { + "auxiliary_loss_clip": 0.01394679, + "auxiliary_loss_mlp": 0.01030899, + "balance_loss_clip": 1.23420823, + "balance_loss_mlp": 1.01276755, + "epoch": 0.8807455283330828, + "flos": 25341891505920.0, + "grad_norm": 1.8865277113238375, + "language_loss": 0.65804571, + "learning_rate": 1.4726909313934522e-07, + "loss": 0.68230152, + "num_input_tokens_seen": 315863745, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18139648, + "step": 14649, + "time_per_iteration": 2.8540308475494385 + }, + { + "auxiliary_loss_clip": 0.01393243, + "auxiliary_loss_mlp": 0.01031231, + "balance_loss_clip": 1.2354275, + "balance_loss_mlp": 1.01209795, + "epoch": 0.8808056515857507, + "flos": 25276141532160.0, + "grad_norm": 1.2582015675526097, + "language_loss": 0.62888777, + "learning_rate": 1.4712244692314578e-07, + "loss": 0.65313256, + "num_input_tokens_seen": 315885765, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.19140625, + "step": 14650, + "time_per_iteration": 2.936044216156006 + }, + { + "auxiliary_loss_clip": 0.01389803, + "auxiliary_loss_mlp": 0.01035898, + "balance_loss_clip": 1.23302698, + "balance_loss_mlp": 1.016729, + "epoch": 0.8808657748384188, + "flos": 26590217552640.0, + "grad_norm": 1.4040303004930477, + "language_loss": 0.72979677, + "learning_rate": 1.4697587096824914e-07, + "loss": 0.75405377, + "num_input_tokens_seen": 315907340, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.19177246, + "step": 14651, + "time_per_iteration": 2.8755311965942383 + }, + { + "auxiliary_loss_clip": 0.01403738, + "auxiliary_loss_mlp": 0.01035016, + "balance_loss_clip": 1.2420454, + "balance_loss_mlp": 1.01591921, + "epoch": 0.8809258980910867, + "flos": 18670488468480.0, + "grad_norm": 1.8098815385336255, + "language_loss": 0.72145784, + "learning_rate": 1.4682936528021284e-07, + "loss": 0.74584538, + "num_input_tokens_seen": 315924935, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19091797, + "step": 14652, + "time_per_iteration": 2.803046226501465 + }, + { + "auxiliary_loss_clip": 0.01385529, + "auxiliary_loss_mlp": 0.01030742, + "balance_loss_clip": 1.22723889, + "balance_loss_mlp": 1.01193106, + "epoch": 0.8809860213437547, + "flos": 19801820165760.0, + "grad_norm": 2.966308396807985, + "language_loss": 0.75780642, + "learning_rate": 1.4668292986459286e-07, + "loss": 0.78196913, + "num_input_tokens_seen": 315943165, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18823242, + "step": 14653, + "time_per_iteration": 2.8150997161865234 + }, + { + "auxiliary_loss_clip": 0.01408321, + "auxiliary_loss_mlp": 0.01030672, + "balance_loss_clip": 1.2457304, + "balance_loss_mlp": 1.01180089, + "epoch": 0.8810461445964227, + "flos": 17903378563200.0, + "grad_norm": 1.6879760842927332, + "language_loss": 0.7205438, + "learning_rate": 1.465365647269421e-07, + "loss": 0.74493372, + "num_input_tokens_seen": 315961340, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.1887207, + "step": 14654, + "time_per_iteration": 2.81980037689209 + }, + { + "auxiliary_loss_clip": 0.01388394, + "auxiliary_loss_mlp": 0.01032869, + "balance_loss_clip": 1.23092985, + "balance_loss_mlp": 1.01381946, + "epoch": 0.8811062678490906, + "flos": 29174545365120.0, + "grad_norm": 1.4654721071532706, + "language_loss": 0.72857606, + "learning_rate": 1.4639026987281012e-07, + "loss": 0.75278872, + "num_input_tokens_seen": 315981335, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.19067383, + "step": 14655, + "time_per_iteration": 2.8943212032318115 + }, + { + "auxiliary_loss_clip": 0.01395126, + "auxiliary_loss_mlp": 0.01034246, + "balance_loss_clip": 1.2370944, + "balance_loss_mlp": 1.0148747, + "epoch": 0.8811663911017587, + "flos": 20348333832960.0, + "grad_norm": 1.6513214945740973, + "language_loss": 0.82102609, + "learning_rate": 1.462440453077449e-07, + "loss": 0.84531981, + "num_input_tokens_seen": 316001325, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.19384766, + "step": 14656, + "time_per_iteration": 2.877187490463257 + }, + { + "auxiliary_loss_clip": 0.01406719, + "auxiliary_loss_mlp": 0.01036337, + "balance_loss_clip": 1.24514055, + "balance_loss_mlp": 1.01728749, + "epoch": 0.8812265143544266, + "flos": 25896911195520.0, + "grad_norm": 1.654069535204542, + "language_loss": 0.69418991, + "learning_rate": 1.460978910372914e-07, + "loss": 0.71862048, + "num_input_tokens_seen": 316022540, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19055176, + "step": 14657, + "time_per_iteration": 2.87732195854187 + }, + { + "auxiliary_loss_clip": 0.01401016, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.2410965, + "balance_loss_mlp": 1.01268744, + "epoch": 0.8812866376070946, + "flos": 27206100777600.0, + "grad_norm": 3.537876153128997, + "language_loss": 0.851008, + "learning_rate": 1.4595180706699207e-07, + "loss": 0.87533998, + "num_input_tokens_seen": 316037735, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19482422, + "step": 14658, + "time_per_iteration": 2.827751874923706 + }, + { + "auxiliary_loss_clip": 0.01407404, + "auxiliary_loss_mlp": 0.01030814, + "balance_loss_clip": 1.24286866, + "balance_loss_mlp": 1.01164556, + "epoch": 0.8813467608597625, + "flos": 23818168488960.0, + "grad_norm": 1.9417624458216536, + "language_loss": 0.78033793, + "learning_rate": 1.4580579340238554e-07, + "loss": 0.8047201, + "num_input_tokens_seen": 316058105, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19189453, + "step": 14659, + "time_per_iteration": 2.881575584411621 + }, + { + "auxiliary_loss_clip": 0.01402218, + "auxiliary_loss_mlp": 0.01033848, + "balance_loss_clip": 1.24309742, + "balance_loss_mlp": 1.01531124, + "epoch": 0.8814068841124305, + "flos": 21115262759040.0, + "grad_norm": 2.4163419735577487, + "language_loss": 0.61581939, + "learning_rate": 1.4565985004900894e-07, + "loss": 0.64018011, + "num_input_tokens_seen": 316074415, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.1854248, + "step": 14660, + "time_per_iteration": 2.834075450897217 + }, + { + "auxiliary_loss_clip": 0.01386986, + "auxiliary_loss_mlp": 0.01031562, + "balance_loss_clip": 1.2282517, + "balance_loss_mlp": 1.01232195, + "epoch": 0.8814670073650984, + "flos": 24727591848960.0, + "grad_norm": 1.8888203124881329, + "language_loss": 0.78693771, + "learning_rate": 1.455139770123972e-07, + "loss": 0.81112319, + "num_input_tokens_seen": 316094405, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.19226074, + "step": 14661, + "time_per_iteration": 2.9403293132781982 + }, + { + "auxiliary_loss_clip": 0.0139561, + "auxiliary_loss_mlp": 0.0103896, + "balance_loss_clip": 1.2351526, + "balance_loss_mlp": 1.01960027, + "epoch": 0.8815271306177664, + "flos": 22976576363520.0, + "grad_norm": 1.7448500772317281, + "language_loss": 0.77149117, + "learning_rate": 1.45368174298081e-07, + "loss": 0.79583687, + "num_input_tokens_seen": 316113390, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19360352, + "step": 14662, + "time_per_iteration": 2.8587751388549805 + }, + { + "auxiliary_loss_clip": 0.01373136, + "auxiliary_loss_mlp": 0.01028756, + "balance_loss_clip": 1.2190733, + "balance_loss_mlp": 1.01042187, + "epoch": 0.8815872538704344, + "flos": 19468527834240.0, + "grad_norm": 2.082861270564137, + "language_loss": 0.74088538, + "learning_rate": 1.4522244191158929e-07, + "loss": 0.7649042, + "num_input_tokens_seen": 316131085, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.18322754, + "step": 14663, + "time_per_iteration": 2.7927353382110596 + }, + { + "auxiliary_loss_clip": 0.01382514, + "auxiliary_loss_mlp": 0.01034125, + "balance_loss_clip": 1.22565079, + "balance_loss_mlp": 1.01584983, + "epoch": 0.8816473771231024, + "flos": 32168231297280.0, + "grad_norm": 2.1107612138307905, + "language_loss": 0.70891547, + "learning_rate": 1.450767798584489e-07, + "loss": 0.73308182, + "num_input_tokens_seen": 316151440, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.18286133, + "step": 14664, + "time_per_iteration": 2.9138131141662598 + }, + { + "auxiliary_loss_clip": 0.01385274, + "auxiliary_loss_mlp": 0.01034477, + "balance_loss_clip": 1.22913396, + "balance_loss_mlp": 1.01588047, + "epoch": 0.8817075003757703, + "flos": 19691657781120.0, + "grad_norm": 1.4826056831422372, + "language_loss": 0.82104129, + "learning_rate": 1.449311881441828e-07, + "loss": 0.8452388, + "num_input_tokens_seen": 316170750, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.18603516, + "step": 14665, + "time_per_iteration": 2.8089053630828857 + }, + { + "auxiliary_loss_clip": 0.01399327, + "auxiliary_loss_mlp": 0.0103592, + "balance_loss_clip": 1.24033082, + "balance_loss_mlp": 1.01634574, + "epoch": 0.8817676236284383, + "flos": 15676893025920.0, + "grad_norm": 2.4574365174258923, + "language_loss": 0.59452116, + "learning_rate": 1.447856667743117e-07, + "loss": 0.61887372, + "num_input_tokens_seen": 316187265, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.19567871, + "step": 14666, + "time_per_iteration": 2.829272508621216 + }, + { + "auxiliary_loss_clip": 0.01398033, + "auxiliary_loss_mlp": 0.01032377, + "balance_loss_clip": 1.23801064, + "balance_loss_mlp": 1.01149118, + "epoch": 0.8818277468811063, + "flos": 17904283459200.0, + "grad_norm": 1.8884215161452804, + "language_loss": 0.85036731, + "learning_rate": 1.4464021575435403e-07, + "loss": 0.8746714, + "num_input_tokens_seen": 316206555, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.20861816, + "step": 14667, + "time_per_iteration": 4.244984865188599 + }, + { + "auxiliary_loss_clip": 0.01398987, + "auxiliary_loss_mlp": 0.01036316, + "balance_loss_clip": 1.2397809, + "balance_loss_mlp": 1.01668262, + "epoch": 0.8818878701337742, + "flos": 18779745957120.0, + "grad_norm": 1.7942799746604732, + "language_loss": 0.62765533, + "learning_rate": 1.4449483508982563e-07, + "loss": 0.6520083, + "num_input_tokens_seen": 316225210, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19641113, + "step": 14668, + "time_per_iteration": 2.7874624729156494 + }, + { + "auxiliary_loss_clip": 0.01393028, + "auxiliary_loss_mlp": 0.01032105, + "balance_loss_clip": 1.23679614, + "balance_loss_mlp": 1.01409233, + "epoch": 0.8819479933864423, + "flos": 17721222422400.0, + "grad_norm": 2.2509816766220903, + "language_loss": 0.5805071, + "learning_rate": 1.4434952478623918e-07, + "loss": 0.60475844, + "num_input_tokens_seen": 316242685, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.18017578, + "step": 14669, + "time_per_iteration": 2.788069009780884 + }, + { + "auxiliary_loss_clip": 0.01402622, + "auxiliary_loss_mlp": 0.01030695, + "balance_loss_clip": 1.24343491, + "balance_loss_mlp": 1.01191974, + "epoch": 0.8820081166391102, + "flos": 11736158042880.0, + "grad_norm": 1.7452395347996843, + "language_loss": 0.72886097, + "learning_rate": 1.442042848491043e-07, + "loss": 0.75319421, + "num_input_tokens_seen": 316260935, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18774414, + "step": 14670, + "time_per_iteration": 4.259251832962036 + }, + { + "auxiliary_loss_clip": 0.01391087, + "auxiliary_loss_mlp": 0.01034144, + "balance_loss_clip": 1.23167396, + "balance_loss_mlp": 1.01406932, + "epoch": 0.8820682398917782, + "flos": 27501812663040.0, + "grad_norm": 1.9147881197651986, + "language_loss": 0.74496883, + "learning_rate": 1.44059115283929e-07, + "loss": 0.76922119, + "num_input_tokens_seen": 316281190, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.20056152, + "step": 14671, + "time_per_iteration": 2.9013023376464844 + }, + { + "auxiliary_loss_clip": 0.01387325, + "auxiliary_loss_mlp": 0.01035247, + "balance_loss_clip": 1.22688222, + "balance_loss_mlp": 1.01506484, + "epoch": 0.8821283631444461, + "flos": 16882752188160.0, + "grad_norm": 2.162455596590144, + "language_loss": 0.8577444, + "learning_rate": 1.43914016096218e-07, + "loss": 0.88197011, + "num_input_tokens_seen": 316297115, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.2019043, + "step": 14672, + "time_per_iteration": 2.7974395751953125 + }, + { + "auxiliary_loss_clip": 0.01386569, + "auxiliary_loss_mlp": 0.0103136, + "balance_loss_clip": 1.23099816, + "balance_loss_mlp": 1.01214349, + "epoch": 0.8821884863971141, + "flos": 24291919238400.0, + "grad_norm": 1.513010406770979, + "language_loss": 0.7350657, + "learning_rate": 1.4376898729147336e-07, + "loss": 0.75924492, + "num_input_tokens_seen": 316318235, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.19226074, + "step": 14673, + "time_per_iteration": 2.862677812576294 + }, + { + "auxiliary_loss_clip": 0.01181732, + "auxiliary_loss_mlp": 0.01037803, + "balance_loss_clip": 1.09359145, + "balance_loss_mlp": 1.01644027, + "epoch": 0.882248609649782, + "flos": 59465934812160.0, + "grad_norm": 0.8069653660355437, + "language_loss": 0.4945522, + "learning_rate": 1.4362402887519487e-07, + "loss": 0.51674753, + "num_input_tokens_seen": 316384705, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.21386719, + "step": 14674, + "time_per_iteration": 3.4424238204956055 + }, + { + "auxiliary_loss_clip": 0.01390401, + "auxiliary_loss_mlp": 0.01032513, + "balance_loss_clip": 1.23011541, + "balance_loss_mlp": 1.01284337, + "epoch": 0.88230873290245, + "flos": 19947255511680.0, + "grad_norm": 13.531308756841925, + "language_loss": 0.77637005, + "learning_rate": 1.4347914085287971e-07, + "loss": 0.80059922, + "num_input_tokens_seen": 316401165, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.1965332, + "step": 14675, + "time_per_iteration": 2.839144468307495 + }, + { + "auxiliary_loss_clip": 0.01382578, + "auxiliary_loss_mlp": 0.01030714, + "balance_loss_clip": 1.22632384, + "balance_loss_mlp": 1.0109849, + "epoch": 0.882368856155118, + "flos": 16371466237440.0, + "grad_norm": 1.8854906958969841, + "language_loss": 0.79784405, + "learning_rate": 1.4333432323002105e-07, + "loss": 0.82197702, + "num_input_tokens_seen": 316418780, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.1973877, + "step": 14676, + "time_per_iteration": 5.695163249969482 + }, + { + "auxiliary_loss_clip": 0.01178367, + "auxiliary_loss_mlp": 0.01028293, + "balance_loss_clip": 1.09150064, + "balance_loss_mlp": 1.00797987, + "epoch": 0.882428979407786, + "flos": 70630423079040.0, + "grad_norm": 0.6984338893827733, + "language_loss": 0.54819661, + "learning_rate": 1.431895760121109e-07, + "loss": 0.57026321, + "num_input_tokens_seen": 316482030, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.203125, + "step": 14677, + "time_per_iteration": 3.3942410945892334 + }, + { + "auxiliary_loss_clip": 0.01390343, + "auxiliary_loss_mlp": 0.01028849, + "balance_loss_clip": 1.23149753, + "balance_loss_mlp": 1.01045465, + "epoch": 0.8824891026604539, + "flos": 18159383496960.0, + "grad_norm": 2.4151939500506665, + "language_loss": 0.65733933, + "learning_rate": 1.4304489920463847e-07, + "loss": 0.68153125, + "num_input_tokens_seen": 316499175, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18408203, + "step": 14678, + "time_per_iteration": 2.855454683303833 + }, + { + "auxiliary_loss_clip": 0.01396317, + "auxiliary_loss_mlp": 0.01031312, + "balance_loss_clip": 1.23514509, + "balance_loss_mlp": 1.01201165, + "epoch": 0.8825492259131219, + "flos": 27243500244480.0, + "grad_norm": 2.1290948752392884, + "language_loss": 0.71840727, + "learning_rate": 1.4290029281308936e-07, + "loss": 0.74268353, + "num_input_tokens_seen": 316519495, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.19299316, + "step": 14679, + "time_per_iteration": 2.9078238010406494 + }, + { + "auxiliary_loss_clip": 0.0139075, + "auxiliary_loss_mlp": 0.01028505, + "balance_loss_clip": 1.23302436, + "balance_loss_mlp": 1.01003945, + "epoch": 0.8826093491657898, + "flos": 22284898819200.0, + "grad_norm": 1.7754005530504624, + "language_loss": 0.64456415, + "learning_rate": 1.4275575684294694e-07, + "loss": 0.66875666, + "num_input_tokens_seen": 316538180, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18469238, + "step": 14680, + "time_per_iteration": 2.9168174266815186 + }, + { + "auxiliary_loss_clip": 0.01389194, + "auxiliary_loss_mlp": 0.01034805, + "balance_loss_clip": 1.23299527, + "balance_loss_mlp": 1.01508772, + "epoch": 0.8826694724184578, + "flos": 14211499835520.0, + "grad_norm": 2.52099955218005, + "language_loss": 0.78151923, + "learning_rate": 1.4261129129969328e-07, + "loss": 0.80575919, + "num_input_tokens_seen": 316551750, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.19726562, + "step": 14681, + "time_per_iteration": 2.8062493801116943 + }, + { + "auxiliary_loss_clip": 0.01405075, + "auxiliary_loss_mlp": 0.01033604, + "balance_loss_clip": 1.24423003, + "balance_loss_mlp": 1.01411343, + "epoch": 0.8827295956711259, + "flos": 20641512009600.0, + "grad_norm": 1.711714890215758, + "language_loss": 0.73553276, + "learning_rate": 1.424668961888047e-07, + "loss": 0.75991952, + "num_input_tokens_seen": 316570680, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.19494629, + "step": 14682, + "time_per_iteration": 2.85417103767395 + }, + { + "auxiliary_loss_clip": 0.01403662, + "auxiliary_loss_mlp": 0.01035301, + "balance_loss_clip": 1.24091315, + "balance_loss_mlp": 1.01560807, + "epoch": 0.8827897189237938, + "flos": 18521705007360.0, + "grad_norm": 1.9611074930448564, + "language_loss": 0.74972653, + "learning_rate": 1.4232257151575765e-07, + "loss": 0.77411616, + "num_input_tokens_seen": 316588635, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19726562, + "step": 14683, + "time_per_iteration": 2.8227174282073975 + }, + { + "auxiliary_loss_clip": 0.01397287, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.23739791, + "balance_loss_mlp": 1.01340175, + "epoch": 0.8828498421764618, + "flos": 22757654183040.0, + "grad_norm": 3.328002884025606, + "language_loss": 0.65913546, + "learning_rate": 1.4217831728602492e-07, + "loss": 0.68343461, + "num_input_tokens_seen": 316607550, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.19226074, + "step": 14684, + "time_per_iteration": 2.847733974456787 + }, + { + "auxiliary_loss_clip": 0.01394035, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.2353121, + "balance_loss_mlp": 1.00962543, + "epoch": 0.8829099654291297, + "flos": 15020624177280.0, + "grad_norm": 1.7341622441579412, + "language_loss": 0.69846249, + "learning_rate": 1.4203413350507677e-07, + "loss": 0.72268581, + "num_input_tokens_seen": 316624460, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18676758, + "step": 14685, + "time_per_iteration": 2.822073221206665 + }, + { + "auxiliary_loss_clip": 0.01405412, + "auxiliary_loss_mlp": 0.01034092, + "balance_loss_clip": 1.24390268, + "balance_loss_mlp": 1.01458955, + "epoch": 0.8829700886817977, + "flos": 16727498720640.0, + "grad_norm": 2.1176045835643893, + "language_loss": 0.75260484, + "learning_rate": 1.418900201783806e-07, + "loss": 0.77699989, + "num_input_tokens_seen": 316640765, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19506836, + "step": 14686, + "time_per_iteration": 2.7869625091552734 + }, + { + "auxiliary_loss_clip": 0.01390912, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.23420882, + "balance_loss_mlp": 1.01167417, + "epoch": 0.8830302119344656, + "flos": 15270385328640.0, + "grad_norm": 1.8934848280686465, + "language_loss": 0.64098388, + "learning_rate": 1.417459773114007e-07, + "loss": 0.66520846, + "num_input_tokens_seen": 316656120, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.19873047, + "step": 14687, + "time_per_iteration": 2.781559467315674 + }, + { + "auxiliary_loss_clip": 0.01400421, + "auxiliary_loss_mlp": 0.01035885, + "balance_loss_clip": 1.23841047, + "balance_loss_mlp": 1.01612043, + "epoch": 0.8830903351871336, + "flos": 28628529390720.0, + "grad_norm": 2.334643278213978, + "language_loss": 0.69770777, + "learning_rate": 1.4160200490959984e-07, + "loss": 0.72207087, + "num_input_tokens_seen": 316676095, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19763184, + "step": 14688, + "time_per_iteration": 2.8685600757598877 + }, + { + "auxiliary_loss_clip": 0.01391175, + "auxiliary_loss_mlp": 0.01028756, + "balance_loss_clip": 1.23665822, + "balance_loss_mlp": 1.01068377, + "epoch": 0.8831504584398016, + "flos": 28013324837760.0, + "grad_norm": 1.7447590976521856, + "language_loss": 0.68046916, + "learning_rate": 1.4145810297843697e-07, + "loss": 0.70466852, + "num_input_tokens_seen": 316696235, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.18066406, + "step": 14689, + "time_per_iteration": 2.8704299926757812 + }, + { + "auxiliary_loss_clip": 0.01391727, + "auxiliary_loss_mlp": 0.01037967, + "balance_loss_clip": 1.23535657, + "balance_loss_mlp": 1.01903665, + "epoch": 0.8832105816924696, + "flos": 26591212938240.0, + "grad_norm": 1.3095291994454632, + "language_loss": 0.74950457, + "learning_rate": 1.4131427152336905e-07, + "loss": 0.77380145, + "num_input_tokens_seen": 316719680, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.18920898, + "step": 14690, + "time_per_iteration": 2.9246413707733154 + }, + { + "auxiliary_loss_clip": 0.01399265, + "auxiliary_loss_mlp": 0.01034796, + "balance_loss_clip": 1.23945773, + "balance_loss_mlp": 1.01513886, + "epoch": 0.8832707049451375, + "flos": 24909250296960.0, + "grad_norm": 1.4347491246826043, + "language_loss": 0.73835397, + "learning_rate": 1.4117051054985018e-07, + "loss": 0.76269454, + "num_input_tokens_seen": 316739830, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.19665527, + "step": 14691, + "time_per_iteration": 2.9272353649139404 + }, + { + "auxiliary_loss_clip": 0.01420713, + "auxiliary_loss_mlp": 0.01034397, + "balance_loss_clip": 1.25456905, + "balance_loss_mlp": 1.01457286, + "epoch": 0.8833308281978055, + "flos": 15459735392640.0, + "grad_norm": 1.6236386308368105, + "language_loss": 0.52558005, + "learning_rate": 1.4102682006333243e-07, + "loss": 0.5501312, + "num_input_tokens_seen": 316758105, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.19824219, + "step": 14692, + "time_per_iteration": 2.8005425930023193 + }, + { + "auxiliary_loss_clip": 0.01405618, + "auxiliary_loss_mlp": 0.01034822, + "balance_loss_clip": 1.2443924, + "balance_loss_mlp": 1.01610625, + "epoch": 0.8833909514504734, + "flos": 20310798631680.0, + "grad_norm": 2.238487788074904, + "language_loss": 0.61443245, + "learning_rate": 1.4088320006926346e-07, + "loss": 0.63883686, + "num_input_tokens_seen": 316777455, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18713379, + "step": 14693, + "time_per_iteration": 2.836475133895874 + }, + { + "auxiliary_loss_clip": 0.01393492, + "auxiliary_loss_mlp": 0.010286, + "balance_loss_clip": 1.23804116, + "balance_loss_mlp": 1.01050413, + "epoch": 0.8834510747031414, + "flos": 20383199591040.0, + "grad_norm": 1.5096283413884424, + "language_loss": 0.7585212, + "learning_rate": 1.407396505730898e-07, + "loss": 0.78274208, + "num_input_tokens_seen": 316796300, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.18103027, + "step": 14694, + "time_per_iteration": 2.8115313053131104 + }, + { + "auxiliary_loss_clip": 0.01403612, + "auxiliary_loss_mlp": 0.01029965, + "balance_loss_clip": 1.24080968, + "balance_loss_mlp": 1.01198816, + "epoch": 0.8835111979558095, + "flos": 29763933120000.0, + "grad_norm": 1.8851781011123867, + "language_loss": 0.73455971, + "learning_rate": 1.4059617158025527e-07, + "loss": 0.75889552, + "num_input_tokens_seen": 316819090, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.17980957, + "step": 14695, + "time_per_iteration": 2.8939731121063232 + }, + { + "auxiliary_loss_clip": 0.01385956, + "auxiliary_loss_mlp": 0.01030463, + "balance_loss_clip": 1.23157632, + "balance_loss_mlp": 1.01193786, + "epoch": 0.8835713212084774, + "flos": 24145352772480.0, + "grad_norm": 1.5898887494571001, + "language_loss": 0.80644858, + "learning_rate": 1.404527630961998e-07, + "loss": 0.83061278, + "num_input_tokens_seen": 316839250, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.18530273, + "step": 14696, + "time_per_iteration": 2.8476998805999756 + }, + { + "auxiliary_loss_clip": 0.01402485, + "auxiliary_loss_mlp": 0.01031278, + "balance_loss_clip": 1.24264336, + "balance_loss_mlp": 1.0126214, + "epoch": 0.8836314444611454, + "flos": 27683697335040.0, + "grad_norm": 1.6128189820190961, + "language_loss": 0.7562505, + "learning_rate": 1.4030942512636236e-07, + "loss": 0.78058815, + "num_input_tokens_seen": 316861315, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18664551, + "step": 14697, + "time_per_iteration": 2.888587474822998 + }, + { + "auxiliary_loss_clip": 0.01387015, + "auxiliary_loss_mlp": 0.01033572, + "balance_loss_clip": 1.22941446, + "balance_loss_mlp": 1.01410532, + "epoch": 0.8836915677138133, + "flos": 16845759924480.0, + "grad_norm": 3.021325482852787, + "language_loss": 0.72681469, + "learning_rate": 1.401661576761779e-07, + "loss": 0.75102055, + "num_input_tokens_seen": 316879325, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.19458008, + "step": 14698, + "time_per_iteration": 2.823514938354492 + }, + { + "auxiliary_loss_clip": 0.01174659, + "auxiliary_loss_mlp": 0.01037899, + "balance_loss_clip": 1.0892632, + "balance_loss_mlp": 1.02082825, + "epoch": 0.8837516909664813, + "flos": 69344245117440.0, + "grad_norm": 0.8058994451819829, + "language_loss": 0.53791553, + "learning_rate": 1.4002296075107856e-07, + "loss": 0.56004113, + "num_input_tokens_seen": 316936425, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.17089844, + "step": 14699, + "time_per_iteration": 3.3409698009490967 + }, + { + "auxiliary_loss_clip": 0.01400593, + "auxiliary_loss_mlp": 0.01034869, + "balance_loss_clip": 1.2381072, + "balance_loss_mlp": 1.01497293, + "epoch": 0.8838118142191492, + "flos": 21334682632320.0, + "grad_norm": 1.6449291572744014, + "language_loss": 0.77377844, + "learning_rate": 1.3987983435649508e-07, + "loss": 0.79813302, + "num_input_tokens_seen": 316956360, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19885254, + "step": 14700, + "time_per_iteration": 2.8452768325805664 + }, + { + "auxiliary_loss_clip": 0.01391106, + "auxiliary_loss_mlp": 0.01033211, + "balance_loss_clip": 1.23427463, + "balance_loss_mlp": 1.01424527, + "epoch": 0.8838719374718172, + "flos": 21480525181440.0, + "grad_norm": 1.833565775177554, + "language_loss": 0.73935735, + "learning_rate": 1.3973677849785494e-07, + "loss": 0.76360053, + "num_input_tokens_seen": 316975295, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.18969727, + "step": 14701, + "time_per_iteration": 2.850094795227051 + }, + { + "auxiliary_loss_clip": 0.01410386, + "auxiliary_loss_mlp": 0.01032008, + "balance_loss_clip": 1.24657977, + "balance_loss_mlp": 1.01311338, + "epoch": 0.8839320607244852, + "flos": 26480145657600.0, + "grad_norm": 1.8152130786431204, + "language_loss": 0.71832752, + "learning_rate": 1.3959379318058262e-07, + "loss": 0.74275148, + "num_input_tokens_seen": 316994520, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.18896484, + "step": 14702, + "time_per_iteration": 4.294740915298462 + }, + { + "auxiliary_loss_clip": 0.01412593, + "auxiliary_loss_mlp": 0.01038954, + "balance_loss_clip": 1.24998045, + "balance_loss_mlp": 1.0198921, + "epoch": 0.8839921839771532, + "flos": 45238264600320.0, + "grad_norm": 1.4925866168409894, + "language_loss": 0.72295594, + "learning_rate": 1.3945087841010006e-07, + "loss": 0.74747145, + "num_input_tokens_seen": 317018095, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19055176, + "step": 14703, + "time_per_iteration": 3.074967861175537 + }, + { + "auxiliary_loss_clip": 0.01382735, + "auxiliary_loss_mlp": 0.01034558, + "balance_loss_clip": 1.22744465, + "balance_loss_mlp": 1.0161407, + "epoch": 0.8840523072298211, + "flos": 20015358215040.0, + "grad_norm": 2.031003357288446, + "language_loss": 0.6734938, + "learning_rate": 1.3930803419182645e-07, + "loss": 0.69766676, + "num_input_tokens_seen": 317035755, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.18432617, + "step": 14704, + "time_per_iteration": 2.878918409347534 + }, + { + "auxiliary_loss_clip": 0.01383474, + "auxiliary_loss_mlp": 0.01032716, + "balance_loss_clip": 1.22827375, + "balance_loss_mlp": 1.01416695, + "epoch": 0.8841124304824891, + "flos": 24436766401920.0, + "grad_norm": 1.680686572695812, + "language_loss": 0.71047419, + "learning_rate": 1.3916526053117905e-07, + "loss": 0.73463613, + "num_input_tokens_seen": 317055765, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.18554688, + "step": 14705, + "time_per_iteration": 2.913623332977295 + }, + { + "auxiliary_loss_clip": 0.01399053, + "auxiliary_loss_mlp": 0.01032359, + "balance_loss_clip": 1.2416544, + "balance_loss_mlp": 1.01392901, + "epoch": 0.884172553735157, + "flos": 31296976565760.0, + "grad_norm": 1.391201441425496, + "language_loss": 0.7112639, + "learning_rate": 1.3902255743357104e-07, + "loss": 0.735578, + "num_input_tokens_seen": 317077955, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.18432617, + "step": 14706, + "time_per_iteration": 4.303986072540283 + }, + { + "auxiliary_loss_clip": 0.01388855, + "auxiliary_loss_mlp": 0.0102951, + "balance_loss_clip": 1.23080683, + "balance_loss_mlp": 1.01069915, + "epoch": 0.884232676987825, + "flos": 21399437220480.0, + "grad_norm": 1.6425181189454023, + "language_loss": 0.7497623, + "learning_rate": 1.3887992490441413e-07, + "loss": 0.77394593, + "num_input_tokens_seen": 317095825, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.18811035, + "step": 14707, + "time_per_iteration": 2.849536180496216 + }, + { + "auxiliary_loss_clip": 0.01175375, + "auxiliary_loss_mlp": 0.01025283, + "balance_loss_clip": 1.08975935, + "balance_loss_mlp": 1.00229979, + "epoch": 0.8842928002404931, + "flos": 57938167025280.0, + "grad_norm": 0.7952078902702773, + "language_loss": 0.60462129, + "learning_rate": 1.387373629491173e-07, + "loss": 0.62662786, + "num_input_tokens_seen": 317152875, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.22949219, + "step": 14708, + "time_per_iteration": 3.182222843170166 + }, + { + "auxiliary_loss_clip": 0.0137189, + "auxiliary_loss_mlp": 0.01030589, + "balance_loss_clip": 1.21907914, + "balance_loss_mlp": 1.01226652, + "epoch": 0.884352923493161, + "flos": 41478826106880.0, + "grad_norm": 2.443848941452672, + "language_loss": 0.68073821, + "learning_rate": 1.3859487157308625e-07, + "loss": 0.70476294, + "num_input_tokens_seen": 317176725, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.18322754, + "step": 14709, + "time_per_iteration": 3.027759552001953 + }, + { + "auxiliary_loss_clip": 0.01412462, + "auxiliary_loss_mlp": 0.01038519, + "balance_loss_clip": 1.2493279, + "balance_loss_mlp": 1.01751423, + "epoch": 0.884413046745829, + "flos": 46558991606400.0, + "grad_norm": 1.5555773606010121, + "language_loss": 0.63250017, + "learning_rate": 1.3845245078172373e-07, + "loss": 0.65700996, + "num_input_tokens_seen": 317206880, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.21008301, + "step": 14710, + "time_per_iteration": 3.100559949874878 + }, + { + "auxiliary_loss_clip": 0.01382358, + "auxiliary_loss_mlp": 0.01029013, + "balance_loss_clip": 1.2277844, + "balance_loss_mlp": 1.01138234, + "epoch": 0.8844731699984969, + "flos": 19144646421120.0, + "grad_norm": 2.589008412065707, + "language_loss": 0.65017414, + "learning_rate": 1.38310100580431e-07, + "loss": 0.6742878, + "num_input_tokens_seen": 317224135, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.1763916, + "step": 14711, + "time_per_iteration": 5.764604806900024 + }, + { + "auxiliary_loss_clip": 0.01407502, + "auxiliary_loss_mlp": 0.0103163, + "balance_loss_clip": 1.24325776, + "balance_loss_mlp": 1.01218736, + "epoch": 0.8845332932511649, + "flos": 23271654821760.0, + "grad_norm": 3.190897292869493, + "language_loss": 0.76333845, + "learning_rate": 1.38167820974606e-07, + "loss": 0.78772974, + "num_input_tokens_seen": 317244505, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19421387, + "step": 14712, + "time_per_iteration": 3.002335548400879 + }, + { + "auxiliary_loss_clip": 0.01396775, + "auxiliary_loss_mlp": 0.01033928, + "balance_loss_clip": 1.23690283, + "balance_loss_mlp": 1.01371038, + "epoch": 0.8845934165038328, + "flos": 17573027143680.0, + "grad_norm": 2.2936405426944577, + "language_loss": 0.81858867, + "learning_rate": 1.3802561196964368e-07, + "loss": 0.84289575, + "num_input_tokens_seen": 317257830, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.20214844, + "step": 14713, + "time_per_iteration": 2.7990305423736572 + }, + { + "auxiliary_loss_clip": 0.01379601, + "auxiliary_loss_mlp": 0.01029651, + "balance_loss_clip": 1.22339904, + "balance_loss_mlp": 1.01109028, + "epoch": 0.8846535397565009, + "flos": 27495976083840.0, + "grad_norm": 1.6001586576420759, + "language_loss": 0.56792188, + "learning_rate": 1.3788347357093688e-07, + "loss": 0.59201443, + "num_input_tokens_seen": 317278430, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.1854248, + "step": 14714, + "time_per_iteration": 2.8852453231811523 + }, + { + "auxiliary_loss_clip": 0.01393842, + "auxiliary_loss_mlp": 0.01032732, + "balance_loss_clip": 1.23570848, + "balance_loss_mlp": 1.01318121, + "epoch": 0.8847136630091688, + "flos": 28771657251840.0, + "grad_norm": 1.964297818113568, + "language_loss": 0.744798, + "learning_rate": 1.377414057838755e-07, + "loss": 0.76906377, + "num_input_tokens_seen": 317295970, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.1953125, + "step": 14715, + "time_per_iteration": 2.8961360454559326 + }, + { + "auxiliary_loss_clip": 0.01394494, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.23563361, + "balance_loss_mlp": 1.01549816, + "epoch": 0.8847737862618368, + "flos": 23487319376640.0, + "grad_norm": 3.082521939036507, + "language_loss": 0.75456047, + "learning_rate": 1.375994086138461e-07, + "loss": 0.77884293, + "num_input_tokens_seen": 317316185, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18261719, + "step": 14716, + "time_per_iteration": 2.8309836387634277 + }, + { + "auxiliary_loss_clip": 0.01392281, + "auxiliary_loss_mlp": 0.01034027, + "balance_loss_clip": 1.23395956, + "balance_loss_mlp": 1.0145247, + "epoch": 0.8848339095145047, + "flos": 18670036020480.0, + "grad_norm": 6.745210672580485, + "language_loss": 0.71922469, + "learning_rate": 1.3745748206623397e-07, + "loss": 0.74348778, + "num_input_tokens_seen": 317333275, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.19494629, + "step": 14717, + "time_per_iteration": 2.844132900238037 + }, + { + "auxiliary_loss_clip": 0.01386195, + "auxiliary_loss_mlp": 0.01030905, + "balance_loss_clip": 1.23239923, + "balance_loss_mlp": 1.01296449, + "epoch": 0.8848940327671727, + "flos": 32283053896320.0, + "grad_norm": 2.2908160396383277, + "language_loss": 0.74443233, + "learning_rate": 1.373156261464208e-07, + "loss": 0.76860332, + "num_input_tokens_seen": 317351245, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.1796875, + "step": 14718, + "time_per_iteration": 2.9203860759735107 + }, + { + "auxiliary_loss_clip": 0.0140786, + "auxiliary_loss_mlp": 0.01032542, + "balance_loss_clip": 1.24453545, + "balance_loss_mlp": 1.01376653, + "epoch": 0.8849541560198406, + "flos": 24031751783040.0, + "grad_norm": 2.1357624346538544, + "language_loss": 0.79098046, + "learning_rate": 1.3717384085978602e-07, + "loss": 0.81538451, + "num_input_tokens_seen": 317370740, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.18762207, + "step": 14719, + "time_per_iteration": 2.903578996658325 + }, + { + "auxiliary_loss_clip": 0.01398275, + "auxiliary_loss_mlp": 0.01031116, + "balance_loss_clip": 1.23850799, + "balance_loss_mlp": 1.01296031, + "epoch": 0.8850142792725086, + "flos": 16881756802560.0, + "grad_norm": 3.7429451997242262, + "language_loss": 0.72620153, + "learning_rate": 1.3703212621170579e-07, + "loss": 0.75049543, + "num_input_tokens_seen": 317388370, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18151855, + "step": 14720, + "time_per_iteration": 2.9609827995300293 + }, + { + "auxiliary_loss_clip": 0.01403246, + "auxiliary_loss_mlp": 0.01032708, + "balance_loss_clip": 1.24060464, + "balance_loss_mlp": 1.01313436, + "epoch": 0.8850744025251767, + "flos": 24034556960640.0, + "grad_norm": 2.5205157507236438, + "language_loss": 0.83135426, + "learning_rate": 1.3689048220755383e-07, + "loss": 0.85571378, + "num_input_tokens_seen": 317407390, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19567871, + "step": 14721, + "time_per_iteration": 2.862091541290283 + }, + { + "auxiliary_loss_clip": 0.01402295, + "auxiliary_loss_mlp": 0.01032775, + "balance_loss_clip": 1.24073899, + "balance_loss_mlp": 1.01354623, + "epoch": 0.8851345257778446, + "flos": 47971466363520.0, + "grad_norm": 2.2047839429568064, + "language_loss": 0.63444453, + "learning_rate": 1.3674890885270186e-07, + "loss": 0.65879524, + "num_input_tokens_seen": 317430825, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19213867, + "step": 14722, + "time_per_iteration": 3.0744693279266357 + }, + { + "auxiliary_loss_clip": 0.01401511, + "auxiliary_loss_mlp": 0.01032192, + "balance_loss_clip": 1.24097586, + "balance_loss_mlp": 1.0135839, + "epoch": 0.8851946490305126, + "flos": 36624595731840.0, + "grad_norm": 1.9042372505390657, + "language_loss": 0.69264102, + "learning_rate": 1.3660740615251754e-07, + "loss": 0.71697807, + "num_input_tokens_seen": 317451905, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18603516, + "step": 14723, + "time_per_iteration": 3.0083343982696533 + }, + { + "auxiliary_loss_clip": 0.01392066, + "auxiliary_loss_mlp": 0.01033596, + "balance_loss_clip": 1.23378015, + "balance_loss_mlp": 1.0138669, + "epoch": 0.8852547722831805, + "flos": 21554419219200.0, + "grad_norm": 1.8217480645293103, + "language_loss": 0.78490889, + "learning_rate": 1.3646597411236703e-07, + "loss": 0.80916548, + "num_input_tokens_seen": 317470030, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.19726562, + "step": 14724, + "time_per_iteration": 2.8453662395477295 + }, + { + "auxiliary_loss_clip": 0.01175574, + "auxiliary_loss_mlp": 0.01021299, + "balance_loss_clip": 1.09005284, + "balance_loss_mlp": 1.00079525, + "epoch": 0.8853148955358485, + "flos": 63088154530560.0, + "grad_norm": 0.7979720175269285, + "language_loss": 0.58971143, + "learning_rate": 1.363246127376143e-07, + "loss": 0.61168015, + "num_input_tokens_seen": 317527460, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.20507812, + "step": 14725, + "time_per_iteration": 3.2374956607818604 + }, + { + "auxiliary_loss_clip": 0.01413285, + "auxiliary_loss_mlp": 0.01036297, + "balance_loss_clip": 1.24602234, + "balance_loss_mlp": 1.01674652, + "epoch": 0.8853750187885164, + "flos": 18158795314560.0, + "grad_norm": 3.6566098158840115, + "language_loss": 0.70079643, + "learning_rate": 1.3618332203361837e-07, + "loss": 0.72529221, + "num_input_tokens_seen": 317544070, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.19555664, + "step": 14726, + "time_per_iteration": 2.8083276748657227 + }, + { + "auxiliary_loss_clip": 0.01398678, + "auxiliary_loss_mlp": 0.01027803, + "balance_loss_clip": 1.24100804, + "balance_loss_mlp": 1.00965929, + "epoch": 0.8854351420411845, + "flos": 39585994859520.0, + "grad_norm": 1.2500599109928099, + "language_loss": 0.70328671, + "learning_rate": 1.3604210200573785e-07, + "loss": 0.72755158, + "num_input_tokens_seen": 317570275, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18151855, + "step": 14727, + "time_per_iteration": 3.031891345977783 + }, + { + "auxiliary_loss_clip": 0.01403099, + "auxiliary_loss_mlp": 0.01032428, + "balance_loss_clip": 1.24419451, + "balance_loss_mlp": 1.01318741, + "epoch": 0.8854952652938524, + "flos": 23779909370880.0, + "grad_norm": 1.7965082292852292, + "language_loss": 0.71211576, + "learning_rate": 1.3590095265932733e-07, + "loss": 0.73647106, + "num_input_tokens_seen": 317590160, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.19250488, + "step": 14728, + "time_per_iteration": 2.8584160804748535 + }, + { + "auxiliary_loss_clip": 0.014007, + "auxiliary_loss_mlp": 0.01035327, + "balance_loss_clip": 1.23950934, + "balance_loss_mlp": 1.01632547, + "epoch": 0.8855553885465204, + "flos": 18297896388480.0, + "grad_norm": 2.2549921725414093, + "language_loss": 0.67313385, + "learning_rate": 1.3575987399973987e-07, + "loss": 0.69749415, + "num_input_tokens_seen": 317608340, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.19006348, + "step": 14729, + "time_per_iteration": 2.800593852996826 + }, + { + "auxiliary_loss_clip": 0.01391078, + "auxiliary_loss_mlp": 0.01029889, + "balance_loss_clip": 1.23403764, + "balance_loss_mlp": 1.01219857, + "epoch": 0.8856155117991883, + "flos": 36881324582400.0, + "grad_norm": 1.5134567813420785, + "language_loss": 0.63181049, + "learning_rate": 1.3561886603232453e-07, + "loss": 0.65602016, + "num_input_tokens_seen": 317629910, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.17675781, + "step": 14730, + "time_per_iteration": 2.97753643989563 + }, + { + "auxiliary_loss_clip": 0.01394305, + "auxiliary_loss_mlp": 0.01032329, + "balance_loss_clip": 1.23713779, + "balance_loss_mlp": 1.01294565, + "epoch": 0.8856756350518563, + "flos": 22174057762560.0, + "grad_norm": 1.3800644391154413, + "language_loss": 0.80016607, + "learning_rate": 1.3547792876242904e-07, + "loss": 0.82443237, + "num_input_tokens_seen": 317650265, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.19372559, + "step": 14731, + "time_per_iteration": 2.8784258365631104 + }, + { + "auxiliary_loss_clip": 0.01398301, + "auxiliary_loss_mlp": 0.01036075, + "balance_loss_clip": 1.23774624, + "balance_loss_mlp": 1.01677477, + "epoch": 0.8857357583045242, + "flos": 20750769498240.0, + "grad_norm": 1.6417657044478902, + "language_loss": 0.840707, + "learning_rate": 1.3533706219539708e-07, + "loss": 0.86505079, + "num_input_tokens_seen": 317669045, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19311523, + "step": 14732, + "time_per_iteration": 2.887178421020508 + }, + { + "auxiliary_loss_clip": 0.0118, + "auxiliary_loss_mlp": 0.01023608, + "balance_loss_clip": 1.09200585, + "balance_loss_mlp": 1.00148296, + "epoch": 0.8857958815571922, + "flos": 69925669787520.0, + "grad_norm": 0.9343370876685249, + "language_loss": 0.60044718, + "learning_rate": 1.3519626633657045e-07, + "loss": 0.62248325, + "num_input_tokens_seen": 317728065, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.22167969, + "step": 14733, + "time_per_iteration": 3.3171210289001465 + }, + { + "auxiliary_loss_clip": 0.01418277, + "auxiliary_loss_mlp": 0.01036297, + "balance_loss_clip": 1.25712895, + "balance_loss_mlp": 1.01685405, + "epoch": 0.8858560048098603, + "flos": 15130243624320.0, + "grad_norm": 2.4033379728614728, + "language_loss": 0.67487001, + "learning_rate": 1.3505554119128838e-07, + "loss": 0.6994158, + "num_input_tokens_seen": 317746120, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19433594, + "step": 14734, + "time_per_iteration": 2.86643123626709 + }, + { + "auxiliary_loss_clip": 0.0139518, + "auxiliary_loss_mlp": 0.01031637, + "balance_loss_clip": 1.23959517, + "balance_loss_mlp": 1.01292121, + "epoch": 0.8859161280625282, + "flos": 16617924518400.0, + "grad_norm": 1.869077586948502, + "language_loss": 0.76457965, + "learning_rate": 1.3491488676488682e-07, + "loss": 0.78884786, + "num_input_tokens_seen": 317762280, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.18713379, + "step": 14735, + "time_per_iteration": 2.8493192195892334 + }, + { + "auxiliary_loss_clip": 0.01405323, + "auxiliary_loss_mlp": 0.01034934, + "balance_loss_clip": 1.24402165, + "balance_loss_mlp": 1.01707602, + "epoch": 0.8859762513151962, + "flos": 18702911007360.0, + "grad_norm": 2.009181288660724, + "language_loss": 0.71178502, + "learning_rate": 1.3477430306270066e-07, + "loss": 0.73618758, + "num_input_tokens_seen": 317780615, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.1784668, + "step": 14736, + "time_per_iteration": 2.8475871086120605 + }, + { + "auxiliary_loss_clip": 0.01399308, + "auxiliary_loss_mlp": 0.01029675, + "balance_loss_clip": 1.23922539, + "balance_loss_mlp": 1.01080418, + "epoch": 0.8860363745678641, + "flos": 19546358169600.0, + "grad_norm": 2.257005972185825, + "language_loss": 0.85498571, + "learning_rate": 1.3463379009005892e-07, + "loss": 0.87927556, + "num_input_tokens_seen": 317798830, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18884277, + "step": 14737, + "time_per_iteration": 4.309977769851685 + }, + { + "auxiliary_loss_clip": 0.01429677, + "auxiliary_loss_mlp": 0.01038341, + "balance_loss_clip": 1.2638917, + "balance_loss_mlp": 1.01753926, + "epoch": 0.8860964978205321, + "flos": 35968507862400.0, + "grad_norm": 2.394788163522259, + "language_loss": 0.68944013, + "learning_rate": 1.3449334785229093e-07, + "loss": 0.71412027, + "num_input_tokens_seen": 317819235, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.20812988, + "step": 14738, + "time_per_iteration": 2.993105888366699 + }, + { + "auxiliary_loss_clip": 0.01413825, + "auxiliary_loss_mlp": 0.0103044, + "balance_loss_clip": 1.24856234, + "balance_loss_mlp": 1.01069927, + "epoch": 0.8861566210732, + "flos": 21221805559680.0, + "grad_norm": 2.011439887344891, + "language_loss": 0.75287449, + "learning_rate": 1.343529763547222e-07, + "loss": 0.77731711, + "num_input_tokens_seen": 317836785, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.1973877, + "step": 14739, + "time_per_iteration": 2.8484365940093994 + }, + { + "auxiliary_loss_clip": 0.01387807, + "auxiliary_loss_mlp": 0.01031863, + "balance_loss_clip": 1.23127067, + "balance_loss_mlp": 1.01342154, + "epoch": 0.886216744325868, + "flos": 14616695433600.0, + "grad_norm": 1.8224282368091416, + "language_loss": 0.87457108, + "learning_rate": 1.3421267560267559e-07, + "loss": 0.89876777, + "num_input_tokens_seen": 317854225, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.18432617, + "step": 14740, + "time_per_iteration": 4.256930828094482 + }, + { + "auxiliary_loss_clip": 0.01389268, + "auxiliary_loss_mlp": 0.01032882, + "balance_loss_clip": 1.23192954, + "balance_loss_mlp": 1.01361775, + "epoch": 0.886276867578536, + "flos": 26662980470400.0, + "grad_norm": 2.9989338409695305, + "language_loss": 0.64191556, + "learning_rate": 1.34072445601471e-07, + "loss": 0.6661371, + "num_input_tokens_seen": 317874865, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.19262695, + "step": 14741, + "time_per_iteration": 2.8840200901031494 + }, + { + "auxiliary_loss_clip": 0.01397362, + "auxiliary_loss_mlp": 0.01031079, + "balance_loss_clip": 1.23864698, + "balance_loss_mlp": 1.01255393, + "epoch": 0.886336990831204, + "flos": 16772861272320.0, + "grad_norm": 2.0326417940863726, + "language_loss": 0.73292148, + "learning_rate": 1.3393228635642717e-07, + "loss": 0.7572059, + "num_input_tokens_seen": 317892830, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18530273, + "step": 14742, + "time_per_iteration": 2.8549904823303223 + }, + { + "auxiliary_loss_clip": 0.01392822, + "auxiliary_loss_mlp": 0.01030428, + "balance_loss_clip": 1.23505402, + "balance_loss_mlp": 1.01116407, + "epoch": 0.8863971140838719, + "flos": 25276277266560.0, + "grad_norm": 2.6730731877364504, + "language_loss": 0.60012996, + "learning_rate": 1.3379219787285733e-07, + "loss": 0.62436247, + "num_input_tokens_seen": 317911780, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.19274902, + "step": 14743, + "time_per_iteration": 2.8797836303710938 + }, + { + "auxiliary_loss_clip": 0.0139625, + "auxiliary_loss_mlp": 0.01036012, + "balance_loss_clip": 1.23596621, + "balance_loss_mlp": 1.0158776, + "epoch": 0.8864572373365399, + "flos": 23414782682880.0, + "grad_norm": 2.0242777445327307, + "language_loss": 0.60597587, + "learning_rate": 1.3365218015607437e-07, + "loss": 0.63029844, + "num_input_tokens_seen": 317932855, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.20117188, + "step": 14744, + "time_per_iteration": 2.8451130390167236 + }, + { + "auxiliary_loss_clip": 0.01397011, + "auxiliary_loss_mlp": 0.01035028, + "balance_loss_clip": 1.23735809, + "balance_loss_mlp": 1.01590657, + "epoch": 0.8865173605892078, + "flos": 18557204192640.0, + "grad_norm": 1.5646710540414905, + "language_loss": 0.77315676, + "learning_rate": 1.3351223321138762e-07, + "loss": 0.79747719, + "num_input_tokens_seen": 317952090, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.19116211, + "step": 14745, + "time_per_iteration": 2.835024118423462 + }, + { + "auxiliary_loss_clip": 0.01399607, + "auxiliary_loss_mlp": 0.01030965, + "balance_loss_clip": 1.24061835, + "balance_loss_mlp": 1.01178467, + "epoch": 0.8865774838418758, + "flos": 19035162708480.0, + "grad_norm": 1.6599917988235127, + "language_loss": 0.78474176, + "learning_rate": 1.3337235704410454e-07, + "loss": 0.80904746, + "num_input_tokens_seen": 317970370, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.19177246, + "step": 14746, + "time_per_iteration": 5.56698751449585 + }, + { + "auxiliary_loss_clip": 0.01396469, + "auxiliary_loss_mlp": 0.01032185, + "balance_loss_clip": 1.23647368, + "balance_loss_mlp": 1.01354074, + "epoch": 0.8866376070945439, + "flos": 22173152866560.0, + "grad_norm": 3.9770684939385914, + "language_loss": 0.76914901, + "learning_rate": 1.3323255165952873e-07, + "loss": 0.79343551, + "num_input_tokens_seen": 317989125, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.18640137, + "step": 14747, + "time_per_iteration": 2.8566722869873047 + }, + { + "auxiliary_loss_clip": 0.01382217, + "auxiliary_loss_mlp": 0.01029899, + "balance_loss_clip": 1.22644138, + "balance_loss_mlp": 1.0111835, + "epoch": 0.8866977303472118, + "flos": 20714003458560.0, + "grad_norm": 1.7302410660980432, + "language_loss": 0.83657503, + "learning_rate": 1.3309281706296127e-07, + "loss": 0.8606962, + "num_input_tokens_seen": 318007820, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.18713379, + "step": 14748, + "time_per_iteration": 2.831908702850342 + }, + { + "auxiliary_loss_clip": 0.01398855, + "auxiliary_loss_mlp": 0.01033514, + "balance_loss_clip": 1.2391504, + "balance_loss_mlp": 1.01476264, + "epoch": 0.8867578535998798, + "flos": 48810479535360.0, + "grad_norm": 1.7501662849102422, + "language_loss": 0.78128815, + "learning_rate": 1.3295315325970148e-07, + "loss": 0.80561185, + "num_input_tokens_seen": 318030435, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.1875, + "step": 14749, + "time_per_iteration": 3.09627103805542 + }, + { + "auxiliary_loss_clip": 0.01416376, + "auxiliary_loss_mlp": 0.01034813, + "balance_loss_clip": 1.25207222, + "balance_loss_mlp": 1.01570368, + "epoch": 0.8868179768525477, + "flos": 21115081779840.0, + "grad_norm": 2.1556390418573237, + "language_loss": 0.7079975, + "learning_rate": 1.328135602550451e-07, + "loss": 0.73250937, + "num_input_tokens_seen": 318049465, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19116211, + "step": 14750, + "time_per_iteration": 2.8390960693359375 + }, + { + "auxiliary_loss_clip": 0.01388912, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.23131645, + "balance_loss_mlp": 1.01307917, + "epoch": 0.8868781001052157, + "flos": 21839815290240.0, + "grad_norm": 1.8616904726454888, + "language_loss": 0.59805256, + "learning_rate": 1.3267403805428546e-07, + "loss": 0.62225997, + "num_input_tokens_seen": 318067760, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.1875, + "step": 14751, + "time_per_iteration": 2.85882568359375 + }, + { + "auxiliary_loss_clip": 0.01396454, + "auxiliary_loss_mlp": 0.01030936, + "balance_loss_clip": 1.2380898, + "balance_loss_mlp": 1.01236296, + "epoch": 0.8869382233578836, + "flos": 13524256281600.0, + "grad_norm": 2.0473336505144237, + "language_loss": 0.81663215, + "learning_rate": 1.3253458666271344e-07, + "loss": 0.84090602, + "num_input_tokens_seen": 318082785, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18591309, + "step": 14752, + "time_per_iteration": 2.835252285003662 + }, + { + "auxiliary_loss_clip": 0.01416396, + "auxiliary_loss_mlp": 0.01036256, + "balance_loss_clip": 1.25077176, + "balance_loss_mlp": 1.01702714, + "epoch": 0.8869983466105517, + "flos": 22713694220160.0, + "grad_norm": 1.8336089423351132, + "language_loss": 0.81159008, + "learning_rate": 1.3239520608561793e-07, + "loss": 0.83611655, + "num_input_tokens_seen": 318101925, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.19226074, + "step": 14753, + "time_per_iteration": 2.833667516708374 + }, + { + "auxiliary_loss_clip": 0.01391226, + "auxiliary_loss_mlp": 0.01032403, + "balance_loss_clip": 1.23404181, + "balance_loss_mlp": 1.01373529, + "epoch": 0.8870584698632196, + "flos": 15349075315200.0, + "grad_norm": 1.6171843117629625, + "language_loss": 0.65899289, + "learning_rate": 1.3225589632828248e-07, + "loss": 0.68322915, + "num_input_tokens_seen": 318119945, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18688965, + "step": 14754, + "time_per_iteration": 2.808683395385742 + }, + { + "auxiliary_loss_clip": 0.01407497, + "auxiliary_loss_mlp": 0.01038408, + "balance_loss_clip": 1.24748194, + "balance_loss_mlp": 1.01976335, + "epoch": 0.8871185931158876, + "flos": 26627074081920.0, + "grad_norm": 1.8575969421827914, + "language_loss": 0.75640953, + "learning_rate": 1.3211665739599065e-07, + "loss": 0.78086853, + "num_input_tokens_seen": 318139685, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18652344, + "step": 14755, + "time_per_iteration": 2.982423782348633 + }, + { + "auxiliary_loss_clip": 0.01384922, + "auxiliary_loss_mlp": 0.01034517, + "balance_loss_clip": 1.22759891, + "balance_loss_mlp": 1.01572943, + "epoch": 0.8871787163685555, + "flos": 21809112053760.0, + "grad_norm": 1.4480773271636003, + "language_loss": 0.78683627, + "learning_rate": 1.3197748929402262e-07, + "loss": 0.81103063, + "num_input_tokens_seen": 318160375, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18786621, + "step": 14756, + "time_per_iteration": 2.8380849361419678 + }, + { + "auxiliary_loss_clip": 0.01398894, + "auxiliary_loss_mlp": 0.01031145, + "balance_loss_clip": 1.23850942, + "balance_loss_mlp": 1.0115943, + "epoch": 0.8872388396212235, + "flos": 14911185709440.0, + "grad_norm": 2.0971922420638154, + "language_loss": 0.77797061, + "learning_rate": 1.3183839202765535e-07, + "loss": 0.80227101, + "num_input_tokens_seen": 318177995, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19543457, + "step": 14757, + "time_per_iteration": 2.82704758644104 + }, + { + "auxiliary_loss_clip": 0.01380222, + "auxiliary_loss_mlp": 0.0103295, + "balance_loss_clip": 1.2253449, + "balance_loss_mlp": 1.01478219, + "epoch": 0.8872989628738914, + "flos": 26443289128320.0, + "grad_norm": 1.9124951787432027, + "language_loss": 0.68626922, + "learning_rate": 1.316993656021632e-07, + "loss": 0.71040094, + "num_input_tokens_seen": 318197030, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.1817627, + "step": 14758, + "time_per_iteration": 2.892263650894165 + }, + { + "auxiliary_loss_clip": 0.01408181, + "auxiliary_loss_mlp": 0.01033196, + "balance_loss_clip": 1.24843788, + "balance_loss_mlp": 1.01507616, + "epoch": 0.8873590861265594, + "flos": 48159685307520.0, + "grad_norm": 2.1323864599403763, + "language_loss": 0.69545496, + "learning_rate": 1.3156041002281915e-07, + "loss": 0.71986872, + "num_input_tokens_seen": 318221780, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18127441, + "step": 14759, + "time_per_iteration": 3.111626625061035 + }, + { + "auxiliary_loss_clip": 0.01390182, + "auxiliary_loss_mlp": 0.01032054, + "balance_loss_clip": 1.23118567, + "balance_loss_mlp": 1.01298058, + "epoch": 0.8874192093792275, + "flos": 18342173064960.0, + "grad_norm": 1.7036508040764797, + "language_loss": 0.75124896, + "learning_rate": 1.3142152529489092e-07, + "loss": 0.77547133, + "num_input_tokens_seen": 318239710, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.1907959, + "step": 14760, + "time_per_iteration": 2.8150553703308105 + }, + { + "auxiliary_loss_clip": 0.01404193, + "auxiliary_loss_mlp": 0.01035147, + "balance_loss_clip": 1.24202919, + "balance_loss_mlp": 1.01646698, + "epoch": 0.8874793326318954, + "flos": 17903061849600.0, + "grad_norm": 2.2391403563615846, + "language_loss": 0.77485275, + "learning_rate": 1.3128271142364565e-07, + "loss": 0.79924619, + "num_input_tokens_seen": 318257425, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18688965, + "step": 14761, + "time_per_iteration": 2.9017693996429443 + }, + { + "auxiliary_loss_clip": 0.01395181, + "auxiliary_loss_mlp": 0.01037256, + "balance_loss_clip": 1.23601437, + "balance_loss_mlp": 1.01806331, + "epoch": 0.8875394558845634, + "flos": 31114548956160.0, + "grad_norm": 4.228236028862086, + "language_loss": 0.62364644, + "learning_rate": 1.3114396841434717e-07, + "loss": 0.6479708, + "num_input_tokens_seen": 318278485, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.1920166, + "step": 14762, + "time_per_iteration": 2.9655661582946777 + }, + { + "auxiliary_loss_clip": 0.01388727, + "auxiliary_loss_mlp": 0.01036809, + "balance_loss_clip": 1.23025036, + "balance_loss_mlp": 1.01607895, + "epoch": 0.8875995791372313, + "flos": 21151893064320.0, + "grad_norm": 1.9301268997996266, + "language_loss": 0.65314281, + "learning_rate": 1.3100529627225697e-07, + "loss": 0.6773982, + "num_input_tokens_seen": 318297560, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.20727539, + "step": 14763, + "time_per_iteration": 2.9769885540008545 + }, + { + "auxiliary_loss_clip": 0.014032, + "auxiliary_loss_mlp": 0.01032096, + "balance_loss_clip": 1.24286842, + "balance_loss_mlp": 1.01330829, + "epoch": 0.8876597023898993, + "flos": 17463543431040.0, + "grad_norm": 2.067845645802123, + "language_loss": 0.72158712, + "learning_rate": 1.3086669500263335e-07, + "loss": 0.74594009, + "num_input_tokens_seen": 318313060, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18786621, + "step": 14764, + "time_per_iteration": 2.863757610321045 + }, + { + "auxiliary_loss_clip": 0.01406121, + "auxiliary_loss_mlp": 0.01033801, + "balance_loss_clip": 1.2417382, + "balance_loss_mlp": 1.01521599, + "epoch": 0.8877198256425672, + "flos": 22717404293760.0, + "grad_norm": 2.1287099665450264, + "language_loss": 0.66678023, + "learning_rate": 1.3072816461073166e-07, + "loss": 0.69117951, + "num_input_tokens_seen": 318332030, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.18579102, + "step": 14765, + "time_per_iteration": 2.882183313369751 + }, + { + "auxiliary_loss_clip": 0.01385477, + "auxiliary_loss_mlp": 0.01033298, + "balance_loss_clip": 1.22919953, + "balance_loss_mlp": 1.01527405, + "epoch": 0.8877799488952353, + "flos": 24545797666560.0, + "grad_norm": 1.559400922641486, + "language_loss": 0.77103519, + "learning_rate": 1.3058970510180568e-07, + "loss": 0.795223, + "num_input_tokens_seen": 318351090, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.18017578, + "step": 14766, + "time_per_iteration": 2.858431339263916 + }, + { + "auxiliary_loss_clip": 0.01384887, + "auxiliary_loss_mlp": 0.01037352, + "balance_loss_clip": 1.22903991, + "balance_loss_mlp": 1.01781368, + "epoch": 0.8878400721479032, + "flos": 20968877272320.0, + "grad_norm": 1.6403579674035713, + "language_loss": 0.74072301, + "learning_rate": 1.3045131648110496e-07, + "loss": 0.76494539, + "num_input_tokens_seen": 318372000, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.19543457, + "step": 14767, + "time_per_iteration": 2.8486826419830322 + }, + { + "auxiliary_loss_clip": 0.01379583, + "auxiliary_loss_mlp": 0.01030701, + "balance_loss_clip": 1.22541106, + "balance_loss_mlp": 1.0122478, + "epoch": 0.8879001954005712, + "flos": 25304582528640.0, + "grad_norm": 1.8676047330731924, + "language_loss": 0.71290523, + "learning_rate": 1.303129987538778e-07, + "loss": 0.73700809, + "num_input_tokens_seen": 318391530, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.18457031, + "step": 14768, + "time_per_iteration": 2.880800247192383 + }, + { + "auxiliary_loss_clip": 0.01397547, + "auxiliary_loss_mlp": 0.01033438, + "balance_loss_clip": 1.24034798, + "balance_loss_mlp": 1.01388788, + "epoch": 0.8879603186532391, + "flos": 23195724768000.0, + "grad_norm": 3.027365545877491, + "language_loss": 0.71235049, + "learning_rate": 1.3017475192536932e-07, + "loss": 0.73666036, + "num_input_tokens_seen": 318410690, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.19555664, + "step": 14769, + "time_per_iteration": 2.8483471870422363 + }, + { + "auxiliary_loss_clip": 0.0139006, + "auxiliary_loss_mlp": 0.01030219, + "balance_loss_clip": 1.23176146, + "balance_loss_mlp": 1.01151538, + "epoch": 0.8880204419059071, + "flos": 13661502318720.0, + "grad_norm": 2.4022083809383274, + "language_loss": 0.68323982, + "learning_rate": 1.3003657600082174e-07, + "loss": 0.70744258, + "num_input_tokens_seen": 318427380, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.18713379, + "step": 14770, + "time_per_iteration": 2.8085718154907227 + }, + { + "auxiliary_loss_clip": 0.01381794, + "auxiliary_loss_mlp": 0.01032889, + "balance_loss_clip": 1.22788846, + "balance_loss_mlp": 1.01441121, + "epoch": 0.888080565158575, + "flos": 20641738233600.0, + "grad_norm": 1.9563058970175538, + "language_loss": 0.66244644, + "learning_rate": 1.2989847098547424e-07, + "loss": 0.68659329, + "num_input_tokens_seen": 318448530, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.18457031, + "step": 14771, + "time_per_iteration": 2.8890223503112793 + }, + { + "auxiliary_loss_clip": 0.01388639, + "auxiliary_loss_mlp": 0.0103098, + "balance_loss_clip": 1.23052835, + "balance_loss_mlp": 1.01177585, + "epoch": 0.888140688411243, + "flos": 28631379813120.0, + "grad_norm": 1.7648679946304118, + "language_loss": 0.83096796, + "learning_rate": 1.2976043688456396e-07, + "loss": 0.85516417, + "num_input_tokens_seen": 318468655, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.19189453, + "step": 14772, + "time_per_iteration": 4.3276686668396 + }, + { + "auxiliary_loss_clip": 0.01375768, + "auxiliary_loss_mlp": 0.01029843, + "balance_loss_clip": 1.22227693, + "balance_loss_mlp": 1.01125789, + "epoch": 0.8882008116639111, + "flos": 25531196325120.0, + "grad_norm": 1.4807914832940732, + "language_loss": 0.7705121, + "learning_rate": 1.296224737033258e-07, + "loss": 0.79456818, + "num_input_tokens_seen": 318488740, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.18579102, + "step": 14773, + "time_per_iteration": 2.9128007888793945 + }, + { + "auxiliary_loss_clip": 0.01401347, + "auxiliary_loss_mlp": 0.01030541, + "balance_loss_clip": 1.24520481, + "balance_loss_mlp": 1.01186132, + "epoch": 0.888260934916579, + "flos": 27685416637440.0, + "grad_norm": 1.841575234898447, + "language_loss": 0.75690907, + "learning_rate": 1.294845814469907e-07, + "loss": 0.78122795, + "num_input_tokens_seen": 318508810, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.18676758, + "step": 14774, + "time_per_iteration": 2.9575769901275635 + }, + { + "auxiliary_loss_clip": 0.0140129, + "auxiliary_loss_mlp": 0.01034238, + "balance_loss_clip": 1.23972726, + "balance_loss_mlp": 1.01431823, + "epoch": 0.888321058169247, + "flos": 21619671500160.0, + "grad_norm": 3.156793921351749, + "language_loss": 0.73423243, + "learning_rate": 1.2934676012078783e-07, + "loss": 0.75858772, + "num_input_tokens_seen": 318526860, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19909668, + "step": 14775, + "time_per_iteration": 4.286986351013184 + }, + { + "auxiliary_loss_clip": 0.01389913, + "auxiliary_loss_mlp": 0.01035989, + "balance_loss_clip": 1.23276925, + "balance_loss_mlp": 1.01723695, + "epoch": 0.8883811814219149, + "flos": 18157890418560.0, + "grad_norm": 1.5832308900704664, + "language_loss": 0.80931258, + "learning_rate": 1.292090097299432e-07, + "loss": 0.83357155, + "num_input_tokens_seen": 318545180, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18762207, + "step": 14776, + "time_per_iteration": 2.8827366828918457 + }, + { + "auxiliary_loss_clip": 0.01412406, + "auxiliary_loss_mlp": 0.01029888, + "balance_loss_clip": 1.24737287, + "balance_loss_mlp": 1.01101673, + "epoch": 0.8884413046745829, + "flos": 28335034500480.0, + "grad_norm": 2.5697752150512296, + "language_loss": 0.70094281, + "learning_rate": 1.290713302796802e-07, + "loss": 0.72536576, + "num_input_tokens_seen": 318564350, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.18847656, + "step": 14777, + "time_per_iteration": 2.9598469734191895 + }, + { + "auxiliary_loss_clip": 0.01388359, + "auxiliary_loss_mlp": 0.01036555, + "balance_loss_clip": 1.22998631, + "balance_loss_mlp": 1.01694465, + "epoch": 0.8885014279272508, + "flos": 15167145398400.0, + "grad_norm": 1.8235069131314823, + "language_loss": 0.71518111, + "learning_rate": 1.2893372177522e-07, + "loss": 0.73943031, + "num_input_tokens_seen": 318582275, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.19616699, + "step": 14778, + "time_per_iteration": 2.8434598445892334 + }, + { + "auxiliary_loss_clip": 0.01397858, + "auxiliary_loss_mlp": 0.01034142, + "balance_loss_clip": 1.23937774, + "balance_loss_mlp": 1.01519966, + "epoch": 0.8885615511799189, + "flos": 19109101991040.0, + "grad_norm": 1.6033118456894666, + "language_loss": 0.78073394, + "learning_rate": 1.287961842217804e-07, + "loss": 0.80505395, + "num_input_tokens_seen": 318601230, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18933105, + "step": 14779, + "time_per_iteration": 2.882605791091919 + }, + { + "auxiliary_loss_clip": 0.01182831, + "auxiliary_loss_mlp": 0.01021395, + "balance_loss_clip": 1.09220636, + "balance_loss_mlp": 1.00089097, + "epoch": 0.8886216744325868, + "flos": 51208551912960.0, + "grad_norm": 0.8731558610077517, + "language_loss": 0.56878889, + "learning_rate": 1.2865871762457747e-07, + "loss": 0.59083116, + "num_input_tokens_seen": 318645595, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.20507812, + "step": 14780, + "time_per_iteration": 3.141737699508667 + }, + { + "auxiliary_loss_clip": 0.01179823, + "auxiliary_loss_mlp": 0.01028313, + "balance_loss_clip": 1.08989549, + "balance_loss_mlp": 1.0088582, + "epoch": 0.8886817976852548, + "flos": 61644931292160.0, + "grad_norm": 0.7921524195841102, + "language_loss": 0.6240533, + "learning_rate": 1.2852132198882326e-07, + "loss": 0.64613467, + "num_input_tokens_seen": 318707850, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.19433594, + "step": 14781, + "time_per_iteration": 6.080317735671997 + }, + { + "auxiliary_loss_clip": 0.01177349, + "auxiliary_loss_mlp": 0.01016626, + "balance_loss_clip": 1.09162927, + "balance_loss_mlp": 0.99802935, + "epoch": 0.8887419209379227, + "flos": 60674128721280.0, + "grad_norm": 0.7954218612379419, + "language_loss": 0.58138037, + "learning_rate": 1.2838399731972805e-07, + "loss": 0.60332012, + "num_input_tokens_seen": 318764915, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.18554688, + "step": 14782, + "time_per_iteration": 3.1010873317718506 + }, + { + "auxiliary_loss_clip": 0.01389399, + "auxiliary_loss_mlp": 0.01032976, + "balance_loss_clip": 1.23336935, + "balance_loss_mlp": 1.01490426, + "epoch": 0.8888020441905907, + "flos": 29217917145600.0, + "grad_norm": 1.7644295952441622, + "language_loss": 0.66631997, + "learning_rate": 1.2824674362249922e-07, + "loss": 0.69054377, + "num_input_tokens_seen": 318785660, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.18078613, + "step": 14783, + "time_per_iteration": 2.905606985092163 + }, + { + "auxiliary_loss_clip": 0.01404761, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.24298644, + "balance_loss_mlp": 1.01172864, + "epoch": 0.8888621674432586, + "flos": 22172564684160.0, + "grad_norm": 1.5644940305233204, + "language_loss": 0.78151822, + "learning_rate": 1.281095609023415e-07, + "loss": 0.80587429, + "num_input_tokens_seen": 318806080, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19116211, + "step": 14784, + "time_per_iteration": 2.8618788719177246 + }, + { + "auxiliary_loss_clip": 0.01405267, + "auxiliary_loss_mlp": 0.0103187, + "balance_loss_clip": 1.2438705, + "balance_loss_mlp": 1.01339281, + "epoch": 0.8889222906959267, + "flos": 27684330762240.0, + "grad_norm": 2.6230170539145496, + "language_loss": 0.61253166, + "learning_rate": 1.279724491644565e-07, + "loss": 0.63690305, + "num_input_tokens_seen": 318826445, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.18493652, + "step": 14785, + "time_per_iteration": 2.8985042572021484 + }, + { + "auxiliary_loss_clip": 0.01395967, + "auxiliary_loss_mlp": 0.01033691, + "balance_loss_clip": 1.23704755, + "balance_loss_mlp": 1.01417673, + "epoch": 0.8889824139485947, + "flos": 14175457712640.0, + "grad_norm": 1.6682415632806102, + "language_loss": 0.65927911, + "learning_rate": 1.278354084140445e-07, + "loss": 0.68357575, + "num_input_tokens_seen": 318843915, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.19506836, + "step": 14786, + "time_per_iteration": 2.8045055866241455 + }, + { + "auxiliary_loss_clip": 0.01403216, + "auxiliary_loss_mlp": 0.01032183, + "balance_loss_clip": 1.23878217, + "balance_loss_mlp": 1.01234627, + "epoch": 0.8890425372012626, + "flos": 12858938472960.0, + "grad_norm": 2.4405306350724008, + "language_loss": 0.86405641, + "learning_rate": 1.276984386563009e-07, + "loss": 0.88841033, + "num_input_tokens_seen": 318859670, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19824219, + "step": 14787, + "time_per_iteration": 2.810202121734619 + }, + { + "auxiliary_loss_clip": 0.01397254, + "auxiliary_loss_mlp": 0.01029733, + "balance_loss_clip": 1.23822618, + "balance_loss_mlp": 1.01205468, + "epoch": 0.8891026604539306, + "flos": 21699266382720.0, + "grad_norm": 3.995504524885821, + "language_loss": 0.71071142, + "learning_rate": 1.2756153989642027e-07, + "loss": 0.7349813, + "num_input_tokens_seen": 318877855, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.17687988, + "step": 14788, + "time_per_iteration": 2.916994571685791 + }, + { + "auxiliary_loss_clip": 0.01375531, + "auxiliary_loss_mlp": 0.01033651, + "balance_loss_clip": 1.22177505, + "balance_loss_mlp": 1.0150907, + "epoch": 0.8891627837065985, + "flos": 21881377278720.0, + "grad_norm": 1.608644428927032, + "language_loss": 0.70506036, + "learning_rate": 1.274247121395935e-07, + "loss": 0.7291522, + "num_input_tokens_seen": 318896045, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.18554688, + "step": 14789, + "time_per_iteration": 2.841963052749634 + }, + { + "auxiliary_loss_clip": 0.01386888, + "auxiliary_loss_mlp": 0.01034979, + "balance_loss_clip": 1.23075235, + "balance_loss_mlp": 1.01575065, + "epoch": 0.8892229069592665, + "flos": 21590280362880.0, + "grad_norm": 1.7583989328613445, + "language_loss": 0.71147251, + "learning_rate": 1.2728795539100956e-07, + "loss": 0.73569125, + "num_input_tokens_seen": 318915515, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.19238281, + "step": 14790, + "time_per_iteration": 2.915573835372925 + }, + { + "auxiliary_loss_clip": 0.01395647, + "auxiliary_loss_mlp": 0.01030248, + "balance_loss_clip": 1.23772943, + "balance_loss_mlp": 1.01177025, + "epoch": 0.8892830302119344, + "flos": 23086376789760.0, + "grad_norm": 1.6208983139777713, + "language_loss": 0.73104858, + "learning_rate": 1.2715126965585387e-07, + "loss": 0.7553075, + "num_input_tokens_seen": 318934305, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.18481445, + "step": 14791, + "time_per_iteration": 2.873307704925537 + }, + { + "auxiliary_loss_clip": 0.01384043, + "auxiliary_loss_mlp": 0.01032206, + "balance_loss_clip": 1.22915375, + "balance_loss_mlp": 1.01352549, + "epoch": 0.8893431534646025, + "flos": 23081535596160.0, + "grad_norm": 1.6801908262064733, + "language_loss": 0.74423254, + "learning_rate": 1.2701465493931008e-07, + "loss": 0.76839495, + "num_input_tokens_seen": 318953880, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.18701172, + "step": 14792, + "time_per_iteration": 2.911698818206787 + }, + { + "auxiliary_loss_clip": 0.01405753, + "auxiliary_loss_mlp": 0.0103995, + "balance_loss_clip": 1.24300504, + "balance_loss_mlp": 1.0199703, + "epoch": 0.8894032767172704, + "flos": 22465018944000.0, + "grad_norm": 1.9836397656014977, + "language_loss": 0.6697346, + "learning_rate": 1.2687811124655801e-07, + "loss": 0.69419169, + "num_input_tokens_seen": 318971395, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.1998291, + "step": 14793, + "time_per_iteration": 2.8469762802124023 + }, + { + "auxiliary_loss_clip": 0.01395506, + "auxiliary_loss_mlp": 0.01034714, + "balance_loss_clip": 1.23352695, + "balance_loss_mlp": 1.01438856, + "epoch": 0.8894633999699384, + "flos": 25349266408320.0, + "grad_norm": 1.8538369998312316, + "language_loss": 0.72388238, + "learning_rate": 1.2674163858277552e-07, + "loss": 0.74818456, + "num_input_tokens_seen": 318990580, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.20324707, + "step": 14794, + "time_per_iteration": 2.8777291774749756 + }, + { + "auxiliary_loss_clip": 0.014224, + "auxiliary_loss_mlp": 0.01036949, + "balance_loss_clip": 1.25789189, + "balance_loss_mlp": 1.01696968, + "epoch": 0.8895235232226063, + "flos": 21003381072000.0, + "grad_norm": 1.4627923299260905, + "language_loss": 0.75880158, + "learning_rate": 1.2660523695313785e-07, + "loss": 0.78339505, + "num_input_tokens_seen": 319010040, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.19995117, + "step": 14795, + "time_per_iteration": 2.873260259628296 + }, + { + "auxiliary_loss_clip": 0.01176423, + "auxiliary_loss_mlp": 0.01018873, + "balance_loss_clip": 1.09122229, + "balance_loss_mlp": 0.99903661, + "epoch": 0.8895836464752743, + "flos": 69762769948800.0, + "grad_norm": 0.7712750999078115, + "language_loss": 0.56105828, + "learning_rate": 1.2646890636281727e-07, + "loss": 0.58301127, + "num_input_tokens_seen": 319063860, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.19824219, + "step": 14796, + "time_per_iteration": 3.26432466506958 + }, + { + "auxiliary_loss_clip": 0.01403311, + "auxiliary_loss_mlp": 0.01034445, + "balance_loss_clip": 1.24266398, + "balance_loss_mlp": 1.01557422, + "epoch": 0.8896437697279422, + "flos": 23232219338880.0, + "grad_norm": 2.0145341322049046, + "language_loss": 0.70703828, + "learning_rate": 1.263326468169843e-07, + "loss": 0.73141587, + "num_input_tokens_seen": 319082335, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18884277, + "step": 14797, + "time_per_iteration": 2.896883964538574 + }, + { + "auxiliary_loss_clip": 0.01176924, + "auxiliary_loss_mlp": 0.01012246, + "balance_loss_clip": 1.09045601, + "balance_loss_mlp": 0.99603349, + "epoch": 0.8897038929806103, + "flos": 70782310448640.0, + "grad_norm": 0.7491099033150069, + "language_loss": 0.58029723, + "learning_rate": 1.2619645832080417e-07, + "loss": 0.60218894, + "num_input_tokens_seen": 319147075, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.16210938, + "step": 14798, + "time_per_iteration": 3.341984272003174 + }, + { + "auxiliary_loss_clip": 0.0140103, + "auxiliary_loss_mlp": 0.01032403, + "balance_loss_clip": 1.24087, + "balance_loss_mlp": 1.01415253, + "epoch": 0.8897640162332782, + "flos": 19254537336960.0, + "grad_norm": 1.6303523690411268, + "language_loss": 0.79812479, + "learning_rate": 1.2606034087944251e-07, + "loss": 0.8224591, + "num_input_tokens_seen": 319166630, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.18249512, + "step": 14799, + "time_per_iteration": 2.850330114364624 + }, + { + "auxiliary_loss_clip": 0.01180152, + "auxiliary_loss_mlp": 0.01023585, + "balance_loss_clip": 1.09204912, + "balance_loss_mlp": 1.00269914, + "epoch": 0.8898241394859462, + "flos": 41381124024960.0, + "grad_norm": 0.9072990851573018, + "language_loss": 0.58165514, + "learning_rate": 1.2592429449806053e-07, + "loss": 0.60369253, + "num_input_tokens_seen": 319221865, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.20898438, + "step": 14800, + "time_per_iteration": 3.2400875091552734 + }, + { + "auxiliary_loss_clip": 0.0139649, + "auxiliary_loss_mlp": 0.01029095, + "balance_loss_clip": 1.23773909, + "balance_loss_mlp": 1.01041532, + "epoch": 0.8898842627386142, + "flos": 18994777084800.0, + "grad_norm": 1.4694763429949533, + "language_loss": 0.66807085, + "learning_rate": 1.2578831918181698e-07, + "loss": 0.69232678, + "num_input_tokens_seen": 319240710, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18676758, + "step": 14801, + "time_per_iteration": 2.8980751037597656 + }, + { + "auxiliary_loss_clip": 0.01408539, + "auxiliary_loss_mlp": 0.01038782, + "balance_loss_clip": 1.24557757, + "balance_loss_mlp": 1.01852798, + "epoch": 0.8899443859912821, + "flos": 13223024530560.0, + "grad_norm": 6.185669304730591, + "language_loss": 0.76347947, + "learning_rate": 1.256524149358682e-07, + "loss": 0.78795272, + "num_input_tokens_seen": 319256495, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.20239258, + "step": 14802, + "time_per_iteration": 2.960324287414551 + }, + { + "auxiliary_loss_clip": 0.01392012, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.23604739, + "balance_loss_mlp": 1.01343846, + "epoch": 0.8900045092439501, + "flos": 22684981754880.0, + "grad_norm": 1.8360796849122258, + "language_loss": 0.73884594, + "learning_rate": 1.2551658176536805e-07, + "loss": 0.76308882, + "num_input_tokens_seen": 319273620, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.18823242, + "step": 14803, + "time_per_iteration": 2.9188499450683594 + }, + { + "auxiliary_loss_clip": 0.01387632, + "auxiliary_loss_mlp": 0.01035149, + "balance_loss_clip": 1.22885096, + "balance_loss_mlp": 1.01654041, + "epoch": 0.890064632496618, + "flos": 21151350126720.0, + "grad_norm": 2.0221248151939255, + "language_loss": 0.73132563, + "learning_rate": 1.2538081967546664e-07, + "loss": 0.75555348, + "num_input_tokens_seen": 319291720, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18615723, + "step": 14804, + "time_per_iteration": 2.824906587600708 + }, + { + "auxiliary_loss_clip": 0.013932, + "auxiliary_loss_mlp": 0.01031218, + "balance_loss_clip": 1.23411703, + "balance_loss_mlp": 1.0125742, + "epoch": 0.8901247557492861, + "flos": 23405643233280.0, + "grad_norm": 1.6888449310644211, + "language_loss": 0.81887591, + "learning_rate": 1.252451286713123e-07, + "loss": 0.8431201, + "num_input_tokens_seen": 319310380, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18652344, + "step": 14805, + "time_per_iteration": 2.9111576080322266 + }, + { + "auxiliary_loss_clip": 0.01406203, + "auxiliary_loss_mlp": 0.01033471, + "balance_loss_clip": 1.2440958, + "balance_loss_mlp": 1.01375425, + "epoch": 0.890184879001954, + "flos": 29181694043520.0, + "grad_norm": 3.172825397120138, + "language_loss": 0.67584288, + "learning_rate": 1.251095087580505e-07, + "loss": 0.7002396, + "num_input_tokens_seen": 319331765, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19726562, + "step": 14806, + "time_per_iteration": 2.9290060997009277 + }, + { + "auxiliary_loss_clip": 0.01389677, + "auxiliary_loss_mlp": 0.01031296, + "balance_loss_clip": 1.23000801, + "balance_loss_mlp": 1.01284266, + "epoch": 0.890245002254622, + "flos": 14435715657600.0, + "grad_norm": 1.8676538135398753, + "language_loss": 0.68394488, + "learning_rate": 1.2497395994082438e-07, + "loss": 0.70815462, + "num_input_tokens_seen": 319349135, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18432617, + "step": 14807, + "time_per_iteration": 4.3083178997039795 + }, + { + "auxiliary_loss_clip": 0.01390943, + "auxiliary_loss_mlp": 0.01032793, + "balance_loss_clip": 1.23406935, + "balance_loss_mlp": 1.01422048, + "epoch": 0.8903051255072899, + "flos": 22392256026240.0, + "grad_norm": 2.9599850613438687, + "language_loss": 0.76290345, + "learning_rate": 1.248384822247732e-07, + "loss": 0.78714073, + "num_input_tokens_seen": 319368410, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.18591309, + "step": 14808, + "time_per_iteration": 2.856003522872925 + }, + { + "auxiliary_loss_clip": 0.01397009, + "auxiliary_loss_mlp": 0.010324, + "balance_loss_clip": 1.23775423, + "balance_loss_mlp": 1.01422095, + "epoch": 0.8903652487599579, + "flos": 20787173579520.0, + "grad_norm": 1.8116827401831268, + "language_loss": 0.82204032, + "learning_rate": 1.2470307561503513e-07, + "loss": 0.84633446, + "num_input_tokens_seen": 319387535, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18188477, + "step": 14809, + "time_per_iteration": 2.834502935409546 + }, + { + "auxiliary_loss_clip": 0.01384261, + "auxiliary_loss_mlp": 0.0103464, + "balance_loss_clip": 1.2269119, + "balance_loss_mlp": 1.01543581, + "epoch": 0.8904253720126258, + "flos": 24434866120320.0, + "grad_norm": 1.6932647114692645, + "language_loss": 0.69410086, + "learning_rate": 1.2456774011674442e-07, + "loss": 0.71828985, + "num_input_tokens_seen": 319407210, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.1920166, + "step": 14810, + "time_per_iteration": 2.856969118118286 + }, + { + "auxiliary_loss_clip": 0.01396825, + "auxiliary_loss_mlp": 0.0103429, + "balance_loss_clip": 1.23564255, + "balance_loss_mlp": 1.01527643, + "epoch": 0.8904854952652939, + "flos": 19473323783040.0, + "grad_norm": 1.9177185888195598, + "language_loss": 0.71414155, + "learning_rate": 1.2443247573503257e-07, + "loss": 0.73845267, + "num_input_tokens_seen": 319425340, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.19006348, + "step": 14811, + "time_per_iteration": 4.304624319076538 + }, + { + "auxiliary_loss_clip": 0.01408603, + "auxiliary_loss_mlp": 0.01036948, + "balance_loss_clip": 1.24660301, + "balance_loss_mlp": 1.01779091, + "epoch": 0.8905456185179618, + "flos": 50817183240960.0, + "grad_norm": 1.7975341973492116, + "language_loss": 0.66308492, + "learning_rate": 1.2429728247502924e-07, + "loss": 0.68754041, + "num_input_tokens_seen": 319448150, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19152832, + "step": 14812, + "time_per_iteration": 3.1073338985443115 + }, + { + "auxiliary_loss_clip": 0.01392149, + "auxiliary_loss_mlp": 0.01033316, + "balance_loss_clip": 1.23488355, + "balance_loss_mlp": 1.01427817, + "epoch": 0.8906057417706298, + "flos": 17793713871360.0, + "grad_norm": 1.8467808716969305, + "language_loss": 0.69177675, + "learning_rate": 1.24162160341861e-07, + "loss": 0.71603143, + "num_input_tokens_seen": 319466115, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.19042969, + "step": 14813, + "time_per_iteration": 2.945223093032837 + }, + { + "auxiliary_loss_clip": 0.01425092, + "auxiliary_loss_mlp": 0.01035247, + "balance_loss_clip": 1.25787687, + "balance_loss_mlp": 1.01471925, + "epoch": 0.8906658650232978, + "flos": 21954954602880.0, + "grad_norm": 1.8228046334976127, + "language_loss": 0.76899123, + "learning_rate": 1.2402710934065198e-07, + "loss": 0.79359466, + "num_input_tokens_seen": 319485255, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.20532227, + "step": 14814, + "time_per_iteration": 2.9189345836639404 + }, + { + "auxiliary_loss_clip": 0.01407117, + "auxiliary_loss_mlp": 0.0103281, + "balance_loss_clip": 1.24440122, + "balance_loss_mlp": 1.01315284, + "epoch": 0.8907259882759657, + "flos": 21297645123840.0, + "grad_norm": 2.113040481547012, + "language_loss": 0.74949306, + "learning_rate": 1.2389212947652229e-07, + "loss": 0.77389234, + "num_input_tokens_seen": 319501800, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19641113, + "step": 14815, + "time_per_iteration": 2.8409223556518555 + }, + { + "auxiliary_loss_clip": 0.01383206, + "auxiliary_loss_mlp": 0.01031969, + "balance_loss_clip": 1.22779918, + "balance_loss_mlp": 1.01339614, + "epoch": 0.8907861115286337, + "flos": 20129999834880.0, + "grad_norm": 1.8298547515222485, + "language_loss": 0.75660414, + "learning_rate": 1.237572207545914e-07, + "loss": 0.78075594, + "num_input_tokens_seen": 319520415, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.18566895, + "step": 14816, + "time_per_iteration": 4.346631288528442 + }, + { + "auxiliary_loss_clip": 0.01394187, + "auxiliary_loss_mlp": 0.01032736, + "balance_loss_clip": 1.23510504, + "balance_loss_mlp": 1.01403165, + "epoch": 0.8908462347813016, + "flos": 20093776732800.0, + "grad_norm": 1.7796978623017419, + "language_loss": 0.77754253, + "learning_rate": 1.2362238317997476e-07, + "loss": 0.80181175, + "num_input_tokens_seen": 319538410, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18701172, + "step": 14817, + "time_per_iteration": 2.829658269882202 + }, + { + "auxiliary_loss_clip": 0.0117558, + "auxiliary_loss_mlp": 0.01022643, + "balance_loss_clip": 1.09049988, + "balance_loss_mlp": 1.0037607, + "epoch": 0.8909063580339697, + "flos": 65533155045120.0, + "grad_norm": 0.7640084695616988, + "language_loss": 0.5652765, + "learning_rate": 1.2348761675778517e-07, + "loss": 0.58725876, + "num_input_tokens_seen": 319602565, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.18847656, + "step": 14818, + "time_per_iteration": 3.3866817951202393 + }, + { + "auxiliary_loss_clip": 0.01395061, + "auxiliary_loss_mlp": 0.01031509, + "balance_loss_clip": 1.23560476, + "balance_loss_mlp": 1.01284087, + "epoch": 0.8909664812866376, + "flos": 29875407603840.0, + "grad_norm": 2.0138189572472944, + "language_loss": 0.65218329, + "learning_rate": 1.2335292149313325e-07, + "loss": 0.67644894, + "num_input_tokens_seen": 319624645, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18664551, + "step": 14819, + "time_per_iteration": 2.9420526027679443 + }, + { + "auxiliary_loss_clip": 0.01401755, + "auxiliary_loss_mlp": 0.01033145, + "balance_loss_clip": 1.24109101, + "balance_loss_mlp": 1.01360667, + "epoch": 0.8910266045393056, + "flos": 25458026204160.0, + "grad_norm": 2.533814927096091, + "language_loss": 0.79633743, + "learning_rate": 1.2321829739112731e-07, + "loss": 0.8206864, + "num_input_tokens_seen": 319644040, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19543457, + "step": 14820, + "time_per_iteration": 2.8889687061309814 + }, + { + "auxiliary_loss_clip": 0.01393399, + "auxiliary_loss_mlp": 0.01035528, + "balance_loss_clip": 1.23460603, + "balance_loss_mlp": 1.01725364, + "epoch": 0.8910867277919735, + "flos": 24509936522880.0, + "grad_norm": 3.072288970763731, + "language_loss": 0.7705667, + "learning_rate": 1.2308374445687087e-07, + "loss": 0.79485607, + "num_input_tokens_seen": 319663930, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18286133, + "step": 14821, + "time_per_iteration": 2.920757532119751 + }, + { + "auxiliary_loss_clip": 0.01173304, + "auxiliary_loss_mlp": 0.01036458, + "balance_loss_clip": 1.08808899, + "balance_loss_mlp": 1.01280713, + "epoch": 0.8911468510446415, + "flos": 60716550360960.0, + "grad_norm": 0.7880436118365125, + "language_loss": 0.59328085, + "learning_rate": 1.2294926269546712e-07, + "loss": 0.6153785, + "num_input_tokens_seen": 319721245, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.23632812, + "step": 14822, + "time_per_iteration": 3.1713337898254395 + }, + { + "auxiliary_loss_clip": 0.01405093, + "auxiliary_loss_mlp": 0.01036725, + "balance_loss_clip": 1.2440989, + "balance_loss_mlp": 1.01774693, + "epoch": 0.8912069742973094, + "flos": 25348316267520.0, + "grad_norm": 1.8805742384550568, + "language_loss": 0.70029575, + "learning_rate": 1.2281485211201515e-07, + "loss": 0.72471392, + "num_input_tokens_seen": 319741200, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.18969727, + "step": 14823, + "time_per_iteration": 2.86421799659729 + }, + { + "auxiliary_loss_clip": 0.01390193, + "auxiliary_loss_mlp": 0.01029147, + "balance_loss_clip": 1.23375094, + "balance_loss_mlp": 1.01024044, + "epoch": 0.8912670975499775, + "flos": 18232689352320.0, + "grad_norm": 7.08825110661804, + "language_loss": 0.69672608, + "learning_rate": 1.2268051271161262e-07, + "loss": 0.72091949, + "num_input_tokens_seen": 319759265, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.18896484, + "step": 14824, + "time_per_iteration": 2.8212883472442627 + }, + { + "auxiliary_loss_clip": 0.01400586, + "auxiliary_loss_mlp": 0.01031827, + "balance_loss_clip": 1.23808384, + "balance_loss_mlp": 1.01294398, + "epoch": 0.8913272208026454, + "flos": 26515463863680.0, + "grad_norm": 2.76357497615334, + "language_loss": 0.7189275, + "learning_rate": 1.2254624449935303e-07, + "loss": 0.74325168, + "num_input_tokens_seen": 319777560, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.18884277, + "step": 14825, + "time_per_iteration": 2.8829452991485596 + }, + { + "auxiliary_loss_clip": 0.01391751, + "auxiliary_loss_mlp": 0.01029516, + "balance_loss_clip": 1.23482478, + "balance_loss_mlp": 1.01027596, + "epoch": 0.8913873440553134, + "flos": 18810358704000.0, + "grad_norm": 1.8414918098706514, + "language_loss": 0.72567934, + "learning_rate": 1.2241204748032786e-07, + "loss": 0.749892, + "num_input_tokens_seen": 319794125, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.19226074, + "step": 14826, + "time_per_iteration": 2.8330910205841064 + }, + { + "auxiliary_loss_clip": 0.01403275, + "auxiliary_loss_mlp": 0.01030732, + "balance_loss_clip": 1.24501777, + "balance_loss_mlp": 1.01289809, + "epoch": 0.8914474673079814, + "flos": 20894214072960.0, + "grad_norm": 2.6761845018215964, + "language_loss": 0.7619822, + "learning_rate": 1.2227792165962615e-07, + "loss": 0.78632236, + "num_input_tokens_seen": 319810310, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.1784668, + "step": 14827, + "time_per_iteration": 2.8203821182250977 + }, + { + "auxiliary_loss_clip": 0.01392095, + "auxiliary_loss_mlp": 0.01031604, + "balance_loss_clip": 1.23277152, + "balance_loss_mlp": 1.01212525, + "epoch": 0.8915075905606493, + "flos": 20960371249920.0, + "grad_norm": 1.7064370199183105, + "language_loss": 0.78750956, + "learning_rate": 1.221438670423336e-07, + "loss": 0.81174654, + "num_input_tokens_seen": 319828505, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19482422, + "step": 14828, + "time_per_iteration": 2.8341658115386963 + }, + { + "auxiliary_loss_clip": 0.01386507, + "auxiliary_loss_mlp": 0.01031449, + "balance_loss_clip": 1.22876406, + "balance_loss_mlp": 1.01197016, + "epoch": 0.8915677138133173, + "flos": 23086783992960.0, + "grad_norm": 12.4846334874325, + "language_loss": 0.75954032, + "learning_rate": 1.2200988363353392e-07, + "loss": 0.7837199, + "num_input_tokens_seen": 319848680, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.19494629, + "step": 14829, + "time_per_iteration": 2.8687570095062256 + }, + { + "auxiliary_loss_clip": 0.01400695, + "auxiliary_loss_mlp": 0.01030455, + "balance_loss_clip": 1.24040544, + "balance_loss_mlp": 1.01181018, + "epoch": 0.8916278370659853, + "flos": 23450598581760.0, + "grad_norm": 1.6385935845973019, + "language_loss": 0.85233957, + "learning_rate": 1.2187597143830773e-07, + "loss": 0.87665105, + "num_input_tokens_seen": 319868835, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18652344, + "step": 14830, + "time_per_iteration": 2.868070125579834 + }, + { + "auxiliary_loss_clip": 0.01388996, + "auxiliary_loss_mlp": 0.0103504, + "balance_loss_clip": 1.23259199, + "balance_loss_mlp": 1.01650262, + "epoch": 0.8916879603186533, + "flos": 25172675377920.0, + "grad_norm": 1.6545170191282255, + "language_loss": 0.7571516, + "learning_rate": 1.2174213046173299e-07, + "loss": 0.78139192, + "num_input_tokens_seen": 319891585, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.1854248, + "step": 14831, + "time_per_iteration": 3.0424954891204834 + }, + { + "auxiliary_loss_clip": 0.01406614, + "auxiliary_loss_mlp": 0.01029733, + "balance_loss_clip": 1.24262369, + "balance_loss_mlp": 1.00976551, + "epoch": 0.8917480835713212, + "flos": 20239438302720.0, + "grad_norm": 1.8811377268405864, + "language_loss": 0.7345736, + "learning_rate": 1.216083607088847e-07, + "loss": 0.75893706, + "num_input_tokens_seen": 319910315, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19970703, + "step": 14832, + "time_per_iteration": 2.856053590774536 + }, + { + "auxiliary_loss_clip": 0.0139391, + "auxiliary_loss_mlp": 0.01034384, + "balance_loss_clip": 1.23216641, + "balance_loss_mlp": 1.01591861, + "epoch": 0.8918082068239892, + "flos": 26112485260800.0, + "grad_norm": 1.8934386084725479, + "language_loss": 0.67622912, + "learning_rate": 1.214746621848355e-07, + "loss": 0.70051205, + "num_input_tokens_seen": 319932275, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18469238, + "step": 14833, + "time_per_iteration": 2.9080209732055664 + }, + { + "auxiliary_loss_clip": 0.01398909, + "auxiliary_loss_mlp": 0.0103254, + "balance_loss_clip": 1.23787999, + "balance_loss_mlp": 1.01301312, + "epoch": 0.8918683300766571, + "flos": 24843364588800.0, + "grad_norm": 1.6215955693471722, + "language_loss": 0.74810916, + "learning_rate": 1.2134103489465575e-07, + "loss": 0.77242362, + "num_input_tokens_seen": 319955335, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.19519043, + "step": 14834, + "time_per_iteration": 2.9243602752685547 + }, + { + "auxiliary_loss_clip": 0.01390673, + "auxiliary_loss_mlp": 0.01032866, + "balance_loss_clip": 1.23152995, + "balance_loss_mlp": 1.0138166, + "epoch": 0.8919284533293251, + "flos": 22314154222080.0, + "grad_norm": 2.688130993998457, + "language_loss": 0.79655862, + "learning_rate": 1.2120747884341188e-07, + "loss": 0.82079399, + "num_input_tokens_seen": 319973990, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.19042969, + "step": 14835, + "time_per_iteration": 2.8676562309265137 + }, + { + "auxiliary_loss_clip": 0.01389736, + "auxiliary_loss_mlp": 0.0103304, + "balance_loss_clip": 1.23362076, + "balance_loss_mlp": 1.01419294, + "epoch": 0.891988576581993, + "flos": 30385924392960.0, + "grad_norm": 1.3822514019868815, + "language_loss": 0.74631977, + "learning_rate": 1.210739940361689e-07, + "loss": 0.77054757, + "num_input_tokens_seen": 319995555, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.18835449, + "step": 14836, + "time_per_iteration": 2.93780517578125 + }, + { + "auxiliary_loss_clip": 0.0139326, + "auxiliary_loss_mlp": 0.01031952, + "balance_loss_clip": 1.23418975, + "balance_loss_mlp": 1.0137006, + "epoch": 0.8920486998346611, + "flos": 15559581962880.0, + "grad_norm": 2.0820406802794174, + "language_loss": 0.69284129, + "learning_rate": 1.2094058047798838e-07, + "loss": 0.71709341, + "num_input_tokens_seen": 320012385, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.18261719, + "step": 14837, + "time_per_iteration": 2.831418037414551 + }, + { + "auxiliary_loss_clip": 0.0140774, + "auxiliary_loss_mlp": 0.01031843, + "balance_loss_clip": 1.24408484, + "balance_loss_mlp": 1.01243615, + "epoch": 0.892108823087329, + "flos": 21224836961280.0, + "grad_norm": 1.8039551479214098, + "language_loss": 0.68428504, + "learning_rate": 1.2080723817392913e-07, + "loss": 0.70868087, + "num_input_tokens_seen": 320032390, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19396973, + "step": 14838, + "time_per_iteration": 2.8639755249023438 + }, + { + "auxiliary_loss_clip": 0.01391234, + "auxiliary_loss_mlp": 0.01030312, + "balance_loss_clip": 1.23187566, + "balance_loss_mlp": 1.01140571, + "epoch": 0.892168946339997, + "flos": 21988463016960.0, + "grad_norm": 2.569774000061412, + "language_loss": 0.77339095, + "learning_rate": 1.2067396712904777e-07, + "loss": 0.79760647, + "num_input_tokens_seen": 320052885, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18920898, + "step": 14839, + "time_per_iteration": 2.846349000930786 + }, + { + "auxiliary_loss_clip": 0.01178631, + "auxiliary_loss_mlp": 0.01024264, + "balance_loss_clip": 1.08917511, + "balance_loss_mlp": 1.00547695, + "epoch": 0.892229069592665, + "flos": 67505671664640.0, + "grad_norm": 0.6806142078209703, + "language_loss": 0.49524796, + "learning_rate": 1.205407673483978e-07, + "loss": 0.51727688, + "num_input_tokens_seen": 320113685, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.1875, + "step": 14840, + "time_per_iteration": 3.3247530460357666 + }, + { + "auxiliary_loss_clip": 0.01416377, + "auxiliary_loss_mlp": 0.01038946, + "balance_loss_clip": 1.25108886, + "balance_loss_mlp": 1.0184536, + "epoch": 0.8922891928453329, + "flos": 19467894407040.0, + "grad_norm": 2.6831511479963863, + "language_loss": 0.65251988, + "learning_rate": 1.2040763883703074e-07, + "loss": 0.67707306, + "num_input_tokens_seen": 320130810, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.20495605, + "step": 14841, + "time_per_iteration": 2.8285574913024902 + }, + { + "auxiliary_loss_clip": 0.01389293, + "auxiliary_loss_mlp": 0.01031942, + "balance_loss_clip": 1.23491764, + "balance_loss_mlp": 1.01327348, + "epoch": 0.8923493160980009, + "flos": 23377745174400.0, + "grad_norm": 1.3952010303814268, + "language_loss": 0.68883896, + "learning_rate": 1.2027458159999438e-07, + "loss": 0.71305138, + "num_input_tokens_seen": 320152170, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.18676758, + "step": 14842, + "time_per_iteration": 4.272842645645142 + }, + { + "auxiliary_loss_clip": 0.01385223, + "auxiliary_loss_mlp": 0.01030925, + "balance_loss_clip": 1.22847462, + "balance_loss_mlp": 1.01253057, + "epoch": 0.8924094393506689, + "flos": 26188189090560.0, + "grad_norm": 2.802043831752032, + "language_loss": 0.81484532, + "learning_rate": 1.2014159564233373e-07, + "loss": 0.83900684, + "num_input_tokens_seen": 320172360, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.18395996, + "step": 14843, + "time_per_iteration": 2.901437997817993 + }, + { + "auxiliary_loss_clip": 0.01418276, + "auxiliary_loss_mlp": 0.01031978, + "balance_loss_clip": 1.25330985, + "balance_loss_mlp": 1.012321, + "epoch": 0.8924695626033369, + "flos": 22028803395840.0, + "grad_norm": 2.054040075220133, + "language_loss": 0.6916967, + "learning_rate": 1.2000868096909257e-07, + "loss": 0.71619928, + "num_input_tokens_seen": 320192130, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.1965332, + "step": 14844, + "time_per_iteration": 2.8695871829986572 + }, + { + "auxiliary_loss_clip": 0.01397871, + "auxiliary_loss_mlp": 0.01037002, + "balance_loss_clip": 1.23710918, + "balance_loss_mlp": 1.01664138, + "epoch": 0.8925296858560048, + "flos": 14802244934400.0, + "grad_norm": 2.103000500900352, + "language_loss": 0.92530751, + "learning_rate": 1.1987583758531038e-07, + "loss": 0.94965625, + "num_input_tokens_seen": 320207760, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.20373535, + "step": 14845, + "time_per_iteration": 4.2432942390441895 + }, + { + "auxiliary_loss_clip": 0.01381041, + "auxiliary_loss_mlp": 0.01031224, + "balance_loss_clip": 1.22497439, + "balance_loss_mlp": 1.01219797, + "epoch": 0.8925898091086728, + "flos": 22356937820160.0, + "grad_norm": 1.678559172231663, + "language_loss": 0.72783518, + "learning_rate": 1.1974306549602476e-07, + "loss": 0.75195777, + "num_input_tokens_seen": 320225325, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.19042969, + "step": 14846, + "time_per_iteration": 2.9055559635162354 + }, + { + "auxiliary_loss_clip": 0.0140012, + "auxiliary_loss_mlp": 0.0103401, + "balance_loss_clip": 1.23879659, + "balance_loss_mlp": 1.01549709, + "epoch": 0.8926499323613407, + "flos": 45822539692800.0, + "grad_norm": 1.9845605453228268, + "language_loss": 0.57834542, + "learning_rate": 1.1961036470627094e-07, + "loss": 0.60268664, + "num_input_tokens_seen": 320247645, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.18518066, + "step": 14847, + "time_per_iteration": 3.0493314266204834 + }, + { + "auxiliary_loss_clip": 0.01395171, + "auxiliary_loss_mlp": 0.01032806, + "balance_loss_clip": 1.23536921, + "balance_loss_mlp": 1.01451898, + "epoch": 0.8927100556140087, + "flos": 22137110743680.0, + "grad_norm": 2.514105990303816, + "language_loss": 0.77806652, + "learning_rate": 1.1947773522108052e-07, + "loss": 0.80234629, + "num_input_tokens_seen": 320266005, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.1829834, + "step": 14848, + "time_per_iteration": 2.9532394409179688 + }, + { + "auxiliary_loss_clip": 0.01380983, + "auxiliary_loss_mlp": 0.01033615, + "balance_loss_clip": 1.22452021, + "balance_loss_mlp": 1.01457727, + "epoch": 0.8927701788666766, + "flos": 28341956954880.0, + "grad_norm": 1.78540595940651, + "language_loss": 0.69638795, + "learning_rate": 1.1934517704548251e-07, + "loss": 0.72053397, + "num_input_tokens_seen": 320285555, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.19030762, + "step": 14849, + "time_per_iteration": 2.9224915504455566 + }, + { + "auxiliary_loss_clip": 0.0140742, + "auxiliary_loss_mlp": 0.01031942, + "balance_loss_clip": 1.24652529, + "balance_loss_mlp": 1.01316619, + "epoch": 0.8928303021193447, + "flos": 25304220570240.0, + "grad_norm": 2.078379576998031, + "language_loss": 0.81310105, + "learning_rate": 1.1921269018450364e-07, + "loss": 0.83749467, + "num_input_tokens_seen": 320305395, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18786621, + "step": 14850, + "time_per_iteration": 2.863088607788086 + }, + { + "auxiliary_loss_clip": 0.01385009, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.22836554, + "balance_loss_mlp": 1.01522088, + "epoch": 0.8928904253720126, + "flos": 22246865925120.0, + "grad_norm": 1.814181351356151, + "language_loss": 0.75947249, + "learning_rate": 1.1908027464316872e-07, + "loss": 0.78367031, + "num_input_tokens_seen": 320324220, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.1953125, + "step": 14851, + "time_per_iteration": 5.6998679637908936 + }, + { + "auxiliary_loss_clip": 0.01396273, + "auxiliary_loss_mlp": 0.01036602, + "balance_loss_clip": 1.23888135, + "balance_loss_mlp": 1.01798153, + "epoch": 0.8929505486246806, + "flos": 27104580149760.0, + "grad_norm": 1.7485078838153287, + "language_loss": 0.79107887, + "learning_rate": 1.1894793042649775e-07, + "loss": 0.81540763, + "num_input_tokens_seen": 320347195, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.1862793, + "step": 14852, + "time_per_iteration": 2.9078919887542725 + }, + { + "auxiliary_loss_clip": 0.01388538, + "auxiliary_loss_mlp": 0.01028476, + "balance_loss_clip": 1.23195601, + "balance_loss_mlp": 1.00979567, + "epoch": 0.8930106718773486, + "flos": 23049701239680.0, + "grad_norm": 1.3571857237596423, + "language_loss": 0.69384098, + "learning_rate": 1.1881565753951006e-07, + "loss": 0.71801114, + "num_input_tokens_seen": 320366850, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.18676758, + "step": 14853, + "time_per_iteration": 2.8777825832366943 + }, + { + "auxiliary_loss_clip": 0.01395031, + "auxiliary_loss_mlp": 0.01032093, + "balance_loss_clip": 1.23683512, + "balance_loss_mlp": 1.01326954, + "epoch": 0.8930707951300165, + "flos": 35640192458880.0, + "grad_norm": 1.7704576110261916, + "language_loss": 0.68032426, + "learning_rate": 1.1868345598722118e-07, + "loss": 0.70459545, + "num_input_tokens_seen": 320388895, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.18811035, + "step": 14854, + "time_per_iteration": 2.9884860515594482 + }, + { + "auxiliary_loss_clip": 0.01377086, + "auxiliary_loss_mlp": 0.01028592, + "balance_loss_clip": 1.22222674, + "balance_loss_mlp": 1.00975692, + "epoch": 0.8931309183826845, + "flos": 23050379911680.0, + "grad_norm": 1.4714071736324421, + "language_loss": 0.75428641, + "learning_rate": 1.1855132577464399e-07, + "loss": 0.7783432, + "num_input_tokens_seen": 320408520, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.18847656, + "step": 14855, + "time_per_iteration": 2.8647921085357666 + }, + { + "auxiliary_loss_clip": 0.01386218, + "auxiliary_loss_mlp": 0.01031158, + "balance_loss_clip": 1.22955704, + "balance_loss_mlp": 1.01263249, + "epoch": 0.8931910416353525, + "flos": 26515644842880.0, + "grad_norm": 1.903024653105993, + "language_loss": 0.65057755, + "learning_rate": 1.1841926690678893e-07, + "loss": 0.67475128, + "num_input_tokens_seen": 320427400, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.18530273, + "step": 14856, + "time_per_iteration": 2.878382921218872 + }, + { + "auxiliary_loss_clip": 0.01392068, + "auxiliary_loss_mlp": 0.01033856, + "balance_loss_clip": 1.23322415, + "balance_loss_mlp": 1.01381671, + "epoch": 0.8932511648880205, + "flos": 24984592168320.0, + "grad_norm": 8.747086676662974, + "language_loss": 0.66767627, + "learning_rate": 1.1828727938866378e-07, + "loss": 0.69193554, + "num_input_tokens_seen": 320447570, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.20056152, + "step": 14857, + "time_per_iteration": 2.9058046340942383 + }, + { + "auxiliary_loss_clip": 0.01396988, + "auxiliary_loss_mlp": 0.01034512, + "balance_loss_clip": 1.23725879, + "balance_loss_mlp": 1.01539111, + "epoch": 0.8933112881406884, + "flos": 24471043977600.0, + "grad_norm": 2.681321419401168, + "language_loss": 0.76150012, + "learning_rate": 1.1815536322527408e-07, + "loss": 0.78581518, + "num_input_tokens_seen": 320464405, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.19128418, + "step": 14858, + "time_per_iteration": 2.8919506072998047 + }, + { + "auxiliary_loss_clip": 0.01399053, + "auxiliary_loss_mlp": 0.0103344, + "balance_loss_clip": 1.23913836, + "balance_loss_mlp": 1.014045, + "epoch": 0.8933714113933564, + "flos": 28304738467200.0, + "grad_norm": 1.7223950654973423, + "language_loss": 0.70326257, + "learning_rate": 1.1802351842162139e-07, + "loss": 0.72758752, + "num_input_tokens_seen": 320485525, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.1940918, + "step": 14859, + "time_per_iteration": 2.927227258682251 + }, + { + "auxiliary_loss_clip": 0.01384002, + "auxiliary_loss_mlp": 0.01033548, + "balance_loss_clip": 1.23086548, + "balance_loss_mlp": 1.01497531, + "epoch": 0.8934315346460243, + "flos": 21445025996160.0, + "grad_norm": 1.6323809530731974, + "language_loss": 0.75563419, + "learning_rate": 1.1789174498270526e-07, + "loss": 0.77980971, + "num_input_tokens_seen": 320506725, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.18554688, + "step": 14860, + "time_per_iteration": 2.893035650253296 + }, + { + "auxiliary_loss_clip": 0.01401798, + "auxiliary_loss_mlp": 0.01032095, + "balance_loss_clip": 1.24118662, + "balance_loss_mlp": 1.01389194, + "epoch": 0.8934916578986923, + "flos": 23780271329280.0, + "grad_norm": 1.7366407249101758, + "language_loss": 0.57906199, + "learning_rate": 1.1776004291352303e-07, + "loss": 0.60340095, + "num_input_tokens_seen": 320525425, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18212891, + "step": 14861, + "time_per_iteration": 2.856999158859253 + }, + { + "auxiliary_loss_clip": 0.01390484, + "auxiliary_loss_mlp": 0.01032848, + "balance_loss_clip": 1.23350263, + "balance_loss_mlp": 1.01351213, + "epoch": 0.8935517811513602, + "flos": 18925362282240.0, + "grad_norm": 3.1284777741882093, + "language_loss": 0.64587677, + "learning_rate": 1.176284122190685e-07, + "loss": 0.67011011, + "num_input_tokens_seen": 320543010, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.1932373, + "step": 14862, + "time_per_iteration": 2.8163230419158936 + }, + { + "auxiliary_loss_clip": 0.01391738, + "auxiliary_loss_mlp": 0.01034032, + "balance_loss_clip": 1.23396957, + "balance_loss_mlp": 1.0149827, + "epoch": 0.8936119044040283, + "flos": 24072092161920.0, + "grad_norm": 2.1122035363941274, + "language_loss": 0.78733897, + "learning_rate": 1.1749685290433298e-07, + "loss": 0.81159669, + "num_input_tokens_seen": 320562180, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.19055176, + "step": 14863, + "time_per_iteration": 2.849299907684326 + }, + { + "auxiliary_loss_clip": 0.01385636, + "auxiliary_loss_mlp": 0.01033221, + "balance_loss_clip": 1.23035169, + "balance_loss_mlp": 1.01494598, + "epoch": 0.8936720276566962, + "flos": 21333868225920.0, + "grad_norm": 2.7722192401017165, + "language_loss": 0.71556503, + "learning_rate": 1.1736536497430627e-07, + "loss": 0.7397536, + "num_input_tokens_seen": 320580395, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.18286133, + "step": 14864, + "time_per_iteration": 2.85001277923584 + }, + { + "auxiliary_loss_clip": 0.01427558, + "auxiliary_loss_mlp": 0.01035317, + "balance_loss_clip": 1.26085496, + "balance_loss_mlp": 1.01680446, + "epoch": 0.8937321509093642, + "flos": 18415388430720.0, + "grad_norm": 2.184772256643288, + "language_loss": 0.76592433, + "learning_rate": 1.1723394843397283e-07, + "loss": 0.79055309, + "num_input_tokens_seen": 320599505, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.18505859, + "step": 14865, + "time_per_iteration": 2.8527615070343018 + }, + { + "auxiliary_loss_clip": 0.01377465, + "auxiliary_loss_mlp": 0.01032592, + "balance_loss_clip": 1.2233634, + "balance_loss_mlp": 1.01432931, + "epoch": 0.8937922741620322, + "flos": 22064981253120.0, + "grad_norm": 1.5288440267674976, + "language_loss": 0.72575808, + "learning_rate": 1.1710260328831668e-07, + "loss": 0.74985862, + "num_input_tokens_seen": 320619825, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.18249512, + "step": 14866, + "time_per_iteration": 2.884913444519043 + }, + { + "auxiliary_loss_clip": 0.01413111, + "auxiliary_loss_mlp": 0.01033716, + "balance_loss_clip": 1.2498517, + "balance_loss_mlp": 1.01354551, + "epoch": 0.8938523974147001, + "flos": 25674912368640.0, + "grad_norm": 1.5848300162795328, + "language_loss": 0.84611136, + "learning_rate": 1.1697132954231869e-07, + "loss": 0.87057966, + "num_input_tokens_seen": 320638515, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.20166016, + "step": 14867, + "time_per_iteration": 2.8877944946289062 + }, + { + "auxiliary_loss_clip": 0.0139831, + "auxiliary_loss_mlp": 0.01029843, + "balance_loss_clip": 1.23841572, + "balance_loss_mlp": 1.01199758, + "epoch": 0.8939125206673681, + "flos": 25754823964800.0, + "grad_norm": 1.5082726576980523, + "language_loss": 0.81069756, + "learning_rate": 1.168401272009567e-07, + "loss": 0.83497918, + "num_input_tokens_seen": 320659430, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.17834473, + "step": 14868, + "time_per_iteration": 2.888139009475708 + }, + { + "auxiliary_loss_clip": 0.01400645, + "auxiliary_loss_mlp": 0.0103243, + "balance_loss_clip": 1.24023628, + "balance_loss_mlp": 1.01309443, + "epoch": 0.8939726439200361, + "flos": 27355924869120.0, + "grad_norm": 1.6247200843733187, + "language_loss": 0.77889121, + "learning_rate": 1.167089962692056e-07, + "loss": 0.80322194, + "num_input_tokens_seen": 320679295, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19335938, + "step": 14869, + "time_per_iteration": 2.886737823486328 + }, + { + "auxiliary_loss_clip": 0.01385818, + "auxiliary_loss_mlp": 0.01028959, + "balance_loss_clip": 1.22866666, + "balance_loss_mlp": 1.01030231, + "epoch": 0.8940327671727041, + "flos": 20348333832960.0, + "grad_norm": 2.5939794596570893, + "language_loss": 0.66254503, + "learning_rate": 1.1657793675203853e-07, + "loss": 0.68669277, + "num_input_tokens_seen": 320697535, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.18664551, + "step": 14870, + "time_per_iteration": 2.8719136714935303 + }, + { + "auxiliary_loss_clip": 0.01177752, + "auxiliary_loss_mlp": 0.01031843, + "balance_loss_clip": 1.08925259, + "balance_loss_mlp": 1.01048076, + "epoch": 0.894092890425372, + "flos": 58434386457600.0, + "grad_norm": 0.8020235241826532, + "language_loss": 0.55976874, + "learning_rate": 1.1644694865442461e-07, + "loss": 0.58186471, + "num_input_tokens_seen": 320758635, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.21386719, + "step": 14871, + "time_per_iteration": 3.45035457611084 + }, + { + "auxiliary_loss_clip": 0.01389001, + "auxiliary_loss_mlp": 0.01035226, + "balance_loss_clip": 1.23283637, + "balance_loss_mlp": 1.01683235, + "epoch": 0.89415301367804, + "flos": 19839445856640.0, + "grad_norm": 1.8690289736410954, + "language_loss": 0.77589691, + "learning_rate": 1.16316031981331e-07, + "loss": 0.80013913, + "num_input_tokens_seen": 320777175, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.18408203, + "step": 14872, + "time_per_iteration": 2.867182731628418 + }, + { + "auxiliary_loss_clip": 0.01383923, + "auxiliary_loss_mlp": 0.01033421, + "balance_loss_clip": 1.22867846, + "balance_loss_mlp": 1.01528931, + "epoch": 0.8942131369307079, + "flos": 25786839300480.0, + "grad_norm": 1.5844905572421342, + "language_loss": 0.67811817, + "learning_rate": 1.1618518673772215e-07, + "loss": 0.70229155, + "num_input_tokens_seen": 320797670, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.18127441, + "step": 14873, + "time_per_iteration": 2.8889942169189453 + }, + { + "auxiliary_loss_clip": 0.01388599, + "auxiliary_loss_mlp": 0.01036194, + "balance_loss_clip": 1.23164511, + "balance_loss_mlp": 1.01596439, + "epoch": 0.8942732601833759, + "flos": 23159456421120.0, + "grad_norm": 1.905727877526288, + "language_loss": 0.6068821, + "learning_rate": 1.1605441292856033e-07, + "loss": 0.63112998, + "num_input_tokens_seen": 320817410, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.20214844, + "step": 14874, + "time_per_iteration": 3.0060155391693115 + }, + { + "auxiliary_loss_clip": 0.01399978, + "auxiliary_loss_mlp": 0.01034632, + "balance_loss_clip": 1.23917818, + "balance_loss_mlp": 1.01516473, + "epoch": 0.8943333834360438, + "flos": 27867165575040.0, + "grad_norm": 1.94570299844284, + "language_loss": 0.76564515, + "learning_rate": 1.1592371055880356e-07, + "loss": 0.78999126, + "num_input_tokens_seen": 320836745, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19470215, + "step": 14875, + "time_per_iteration": 2.951315402984619 + }, + { + "auxiliary_loss_clip": 0.01422887, + "auxiliary_loss_mlp": 0.01034812, + "balance_loss_clip": 1.25708175, + "balance_loss_mlp": 1.01466513, + "epoch": 0.8943935066887119, + "flos": 22174148252160.0, + "grad_norm": 1.9138042995977387, + "language_loss": 0.78199863, + "learning_rate": 1.1579307963340857e-07, + "loss": 0.80657566, + "num_input_tokens_seen": 320853305, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.20141602, + "step": 14876, + "time_per_iteration": 2.853595018386841 + }, + { + "auxiliary_loss_clip": 0.01390564, + "auxiliary_loss_mlp": 0.01029354, + "balance_loss_clip": 1.23253131, + "balance_loss_mlp": 1.01074517, + "epoch": 0.8944536299413798, + "flos": 21479891754240.0, + "grad_norm": 1.6721103734528484, + "language_loss": 0.7917977, + "learning_rate": 1.156625201573287e-07, + "loss": 0.81599689, + "num_input_tokens_seen": 320872885, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18615723, + "step": 14877, + "time_per_iteration": 4.2834084033966064 + }, + { + "auxiliary_loss_clip": 0.01378151, + "auxiliary_loss_mlp": 0.01029834, + "balance_loss_clip": 1.22041988, + "balance_loss_mlp": 1.01089132, + "epoch": 0.8945137531940478, + "flos": 17757716993280.0, + "grad_norm": 2.0179480973376336, + "language_loss": 0.76138616, + "learning_rate": 1.155320321355151e-07, + "loss": 0.78546607, + "num_input_tokens_seen": 320889755, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18920898, + "step": 14878, + "time_per_iteration": 2.8930141925811768 + }, + { + "auxiliary_loss_clip": 0.01405575, + "auxiliary_loss_mlp": 0.01029618, + "balance_loss_clip": 1.24342918, + "balance_loss_mlp": 1.00988841, + "epoch": 0.8945738764467158, + "flos": 21152164533120.0, + "grad_norm": 1.5493044936778437, + "language_loss": 0.768888, + "learning_rate": 1.1540161557291539e-07, + "loss": 0.79323989, + "num_input_tokens_seen": 320907860, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19750977, + "step": 14879, + "time_per_iteration": 2.841632604598999 + }, + { + "auxiliary_loss_clip": 0.0139597, + "auxiliary_loss_mlp": 0.01034652, + "balance_loss_clip": 1.23679388, + "balance_loss_mlp": 1.01564968, + "epoch": 0.8946339996993837, + "flos": 14911230954240.0, + "grad_norm": 1.7249583936240358, + "language_loss": 0.75621605, + "learning_rate": 1.1527127047447538e-07, + "loss": 0.78052223, + "num_input_tokens_seen": 320925825, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19006348, + "step": 14880, + "time_per_iteration": 4.230382442474365 + }, + { + "auxiliary_loss_clip": 0.01393199, + "auxiliary_loss_mlp": 0.01037483, + "balance_loss_clip": 1.23558426, + "balance_loss_mlp": 1.01756334, + "epoch": 0.8946941229520518, + "flos": 27393912518400.0, + "grad_norm": 1.7978872943104527, + "language_loss": 0.83764017, + "learning_rate": 1.1514099684513822e-07, + "loss": 0.861947, + "num_input_tokens_seen": 320946165, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.19921875, + "step": 14881, + "time_per_iteration": 2.9411323070526123 + }, + { + "auxiliary_loss_clip": 0.01383421, + "auxiliary_loss_mlp": 0.01031524, + "balance_loss_clip": 1.22706556, + "balance_loss_mlp": 1.01150858, + "epoch": 0.8947542462047197, + "flos": 31808986433280.0, + "grad_norm": 1.5609375632349982, + "language_loss": 0.67684889, + "learning_rate": 1.1501079468984287e-07, + "loss": 0.70099843, + "num_input_tokens_seen": 320969330, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.20019531, + "step": 14882, + "time_per_iteration": 2.9044177532196045 + }, + { + "auxiliary_loss_clip": 0.01420941, + "auxiliary_loss_mlp": 0.01032206, + "balance_loss_clip": 1.25680602, + "balance_loss_mlp": 1.01251304, + "epoch": 0.8948143694573877, + "flos": 20892585260160.0, + "grad_norm": 2.037925081755209, + "language_loss": 0.76603061, + "learning_rate": 1.1488066401352691e-07, + "loss": 0.79056209, + "num_input_tokens_seen": 320985055, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19702148, + "step": 14883, + "time_per_iteration": 2.8201096057891846 + }, + { + "auxiliary_loss_clip": 0.01382266, + "auxiliary_loss_mlp": 0.0103081, + "balance_loss_clip": 1.22731042, + "balance_loss_mlp": 1.0131191, + "epoch": 0.8948744927100556, + "flos": 28226003235840.0, + "grad_norm": 1.55723128940774, + "language_loss": 0.73238635, + "learning_rate": 1.147506048211253e-07, + "loss": 0.75651705, + "num_input_tokens_seen": 321004720, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.17700195, + "step": 14884, + "time_per_iteration": 2.8887739181518555 + }, + { + "auxiliary_loss_clip": 0.01382147, + "auxiliary_loss_mlp": 0.01028794, + "balance_loss_clip": 1.22615659, + "balance_loss_mlp": 1.01098442, + "epoch": 0.8949346159627236, + "flos": 21911266108800.0, + "grad_norm": 1.823817247996896, + "language_loss": 0.76435524, + "learning_rate": 1.1462061711756987e-07, + "loss": 0.78846467, + "num_input_tokens_seen": 321022350, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.17810059, + "step": 14885, + "time_per_iteration": 2.850036144256592 + }, + { + "auxiliary_loss_clip": 0.01408502, + "auxiliary_loss_mlp": 0.01034936, + "balance_loss_clip": 1.2452724, + "balance_loss_mlp": 1.01573133, + "epoch": 0.8949947392153915, + "flos": 21368553004800.0, + "grad_norm": 1.7946325939684513, + "language_loss": 0.81993598, + "learning_rate": 1.1449070090778911e-07, + "loss": 0.84437037, + "num_input_tokens_seen": 321040450, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19213867, + "step": 14886, + "time_per_iteration": 4.310011386871338 + }, + { + "auxiliary_loss_clip": 0.01397064, + "auxiliary_loss_mlp": 0.01031396, + "balance_loss_clip": 1.23751688, + "balance_loss_mlp": 1.01303792, + "epoch": 0.8950548624680595, + "flos": 52462922780160.0, + "grad_norm": 1.5954864656282428, + "language_loss": 0.64617127, + "learning_rate": 1.1436085619671043e-07, + "loss": 0.67045587, + "num_input_tokens_seen": 321063970, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18359375, + "step": 14887, + "time_per_iteration": 3.098759889602661 + }, + { + "auxiliary_loss_clip": 0.014104, + "auxiliary_loss_mlp": 0.01034993, + "balance_loss_clip": 1.24755502, + "balance_loss_mlp": 1.01555037, + "epoch": 0.8951149857207275, + "flos": 20131130954880.0, + "grad_norm": 1.6616490969100226, + "language_loss": 0.61832553, + "learning_rate": 1.1423108298925698e-07, + "loss": 0.64277947, + "num_input_tokens_seen": 321083840, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.19433594, + "step": 14888, + "time_per_iteration": 2.838829278945923 + }, + { + "auxiliary_loss_clip": 0.01403103, + "auxiliary_loss_mlp": 0.01031312, + "balance_loss_clip": 1.2410233, + "balance_loss_mlp": 1.01221442, + "epoch": 0.8951751089733955, + "flos": 29874864666240.0, + "grad_norm": 1.9296884324803931, + "language_loss": 0.708332, + "learning_rate": 1.1410138129034952e-07, + "loss": 0.73267615, + "num_input_tokens_seen": 321104165, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19116211, + "step": 14889, + "time_per_iteration": 2.9052276611328125 + }, + { + "auxiliary_loss_clip": 0.01393827, + "auxiliary_loss_mlp": 0.01031812, + "balance_loss_clip": 1.23375285, + "balance_loss_mlp": 1.01234508, + "epoch": 0.8952352322260634, + "flos": 15269797146240.0, + "grad_norm": 2.301357924661621, + "language_loss": 0.71986437, + "learning_rate": 1.1397175110490676e-07, + "loss": 0.74412078, + "num_input_tokens_seen": 321117290, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.19458008, + "step": 14890, + "time_per_iteration": 2.869342088699341 + }, + { + "auxiliary_loss_clip": 0.01399781, + "auxiliary_loss_mlp": 0.01028278, + "balance_loss_clip": 1.23883581, + "balance_loss_mlp": 1.01006246, + "epoch": 0.8952953554787314, + "flos": 26809908894720.0, + "grad_norm": 1.9066392773876915, + "language_loss": 0.76414704, + "learning_rate": 1.1384219243784454e-07, + "loss": 0.78842759, + "num_input_tokens_seen": 321137115, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.18212891, + "step": 14891, + "time_per_iteration": 2.931612730026245 + }, + { + "auxiliary_loss_clip": 0.01404365, + "auxiliary_loss_mlp": 0.01031491, + "balance_loss_clip": 1.23990583, + "balance_loss_mlp": 1.01275182, + "epoch": 0.8953554787313994, + "flos": 14145342658560.0, + "grad_norm": 1.6721689731768619, + "language_loss": 0.77319825, + "learning_rate": 1.1371270529407517e-07, + "loss": 0.79755676, + "num_input_tokens_seen": 321154490, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.1875, + "step": 14892, + "time_per_iteration": 2.806199789047241 + }, + { + "auxiliary_loss_clip": 0.01386187, + "auxiliary_loss_mlp": 0.01031098, + "balance_loss_clip": 1.22880614, + "balance_loss_mlp": 1.01259685, + "epoch": 0.8954156019840673, + "flos": 25714483585920.0, + "grad_norm": 1.325315830610486, + "language_loss": 0.82040316, + "learning_rate": 1.1358328967850895e-07, + "loss": 0.844576, + "num_input_tokens_seen": 321175625, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18518066, + "step": 14893, + "time_per_iteration": 2.890418529510498 + }, + { + "auxiliary_loss_clip": 0.01382994, + "auxiliary_loss_mlp": 0.01032777, + "balance_loss_clip": 1.22702777, + "balance_loss_mlp": 1.01511049, + "epoch": 0.8954757252367354, + "flos": 21917962339200.0, + "grad_norm": 3.7545082245550487, + "language_loss": 0.75899899, + "learning_rate": 1.1345394559605348e-07, + "loss": 0.78315675, + "num_input_tokens_seen": 321193895, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.17663574, + "step": 14894, + "time_per_iteration": 2.8413522243499756 + }, + { + "auxiliary_loss_clip": 0.01404904, + "auxiliary_loss_mlp": 0.01037408, + "balance_loss_clip": 1.24261653, + "balance_loss_mlp": 1.01716614, + "epoch": 0.8955358484894033, + "flos": 12978421286400.0, + "grad_norm": 1.8408517318934141, + "language_loss": 0.67812973, + "learning_rate": 1.1332467305161352e-07, + "loss": 0.70255291, + "num_input_tokens_seen": 321211610, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.20227051, + "step": 14895, + "time_per_iteration": 2.8283369541168213 + }, + { + "auxiliary_loss_clip": 0.01411843, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.24851239, + "balance_loss_mlp": 1.01287985, + "epoch": 0.8955959717420713, + "flos": 17282654144640.0, + "grad_norm": 1.5196775360065886, + "language_loss": 0.67725515, + "learning_rate": 1.1319547205009094e-07, + "loss": 0.701702, + "num_input_tokens_seen": 321229805, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19958496, + "step": 14896, + "time_per_iteration": 2.7953009605407715 + }, + { + "auxiliary_loss_clip": 0.0139378, + "auxiliary_loss_mlp": 0.0103106, + "balance_loss_clip": 1.2352699, + "balance_loss_mlp": 1.01327348, + "epoch": 0.8956560949947392, + "flos": 14802154444800.0, + "grad_norm": 1.8080017313862775, + "language_loss": 0.76605976, + "learning_rate": 1.1306634259638492e-07, + "loss": 0.79030818, + "num_input_tokens_seen": 321247165, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.17797852, + "step": 14897, + "time_per_iteration": 2.8564233779907227 + }, + { + "auxiliary_loss_clip": 0.01177968, + "auxiliary_loss_mlp": 0.01028305, + "balance_loss_clip": 1.09113955, + "balance_loss_mlp": 1.00789595, + "epoch": 0.8957162182474072, + "flos": 63637201906560.0, + "grad_norm": 0.7385006455685597, + "language_loss": 0.55388683, + "learning_rate": 1.129372846953931e-07, + "loss": 0.57594949, + "num_input_tokens_seen": 321308425, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.20410156, + "step": 14898, + "time_per_iteration": 3.386202573776245 + }, + { + "auxiliary_loss_clip": 0.01395199, + "auxiliary_loss_mlp": 0.0103222, + "balance_loss_clip": 1.23552907, + "balance_loss_mlp": 1.01252604, + "epoch": 0.8957763415000751, + "flos": 25020770025600.0, + "grad_norm": 1.4362180586618565, + "language_loss": 0.7107861, + "learning_rate": 1.12808298352008e-07, + "loss": 0.73506033, + "num_input_tokens_seen": 321329295, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19702148, + "step": 14899, + "time_per_iteration": 2.8619983196258545 + }, + { + "auxiliary_loss_clip": 0.01406671, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.24450707, + "balance_loss_mlp": 1.01298249, + "epoch": 0.8958364647527431, + "flos": 19838133757440.0, + "grad_norm": 1.636626987025454, + "language_loss": 0.7455405, + "learning_rate": 1.1267938357112106e-07, + "loss": 0.7699306, + "num_input_tokens_seen": 321347580, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19360352, + "step": 14900, + "time_per_iteration": 2.856999635696411 + }, + { + "auxiliary_loss_clip": 0.01179076, + "auxiliary_loss_mlp": 0.01032499, + "balance_loss_clip": 1.09018886, + "balance_loss_mlp": 1.01333046, + "epoch": 0.895896588005411, + "flos": 65565939542400.0, + "grad_norm": 0.7746626803559181, + "language_loss": 0.61817622, + "learning_rate": 1.1255054035762124e-07, + "loss": 0.64029199, + "num_input_tokens_seen": 321407820, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.19140625, + "step": 14901, + "time_per_iteration": 3.3061769008636475 + }, + { + "auxiliary_loss_clip": 0.01394149, + "auxiliary_loss_mlp": 0.01032308, + "balance_loss_clip": 1.23260975, + "balance_loss_mlp": 1.01305592, + "epoch": 0.8959567112580791, + "flos": 25601425534080.0, + "grad_norm": 2.3287152942435516, + "language_loss": 0.71372098, + "learning_rate": 1.1242176871639441e-07, + "loss": 0.73798561, + "num_input_tokens_seen": 321426745, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19250488, + "step": 14902, + "time_per_iteration": 2.903937339782715 + }, + { + "auxiliary_loss_clip": 0.01388267, + "auxiliary_loss_mlp": 0.01034436, + "balance_loss_clip": 1.23088074, + "balance_loss_mlp": 1.01551723, + "epoch": 0.896016834510747, + "flos": 24211102746240.0, + "grad_norm": 1.7676821768179656, + "language_loss": 0.78718889, + "learning_rate": 1.1229306865232313e-07, + "loss": 0.81141597, + "num_input_tokens_seen": 321446165, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.18908691, + "step": 14903, + "time_per_iteration": 2.867157220840454 + }, + { + "auxiliary_loss_clip": 0.01398191, + "auxiliary_loss_mlp": 0.01033493, + "balance_loss_clip": 1.2350235, + "balance_loss_mlp": 1.01296568, + "epoch": 0.896076957763415, + "flos": 23086331544960.0, + "grad_norm": 1.6341616573086277, + "language_loss": 0.73379493, + "learning_rate": 1.121644401702877e-07, + "loss": 0.75811183, + "num_input_tokens_seen": 321465285, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.20532227, + "step": 14904, + "time_per_iteration": 2.8379688262939453 + }, + { + "auxiliary_loss_clip": 0.0139645, + "auxiliary_loss_mlp": 0.01030689, + "balance_loss_clip": 1.2359159, + "balance_loss_mlp": 1.0106734, + "epoch": 0.8961370810160829, + "flos": 22246730190720.0, + "grad_norm": 1.7949229245403966, + "language_loss": 0.75535077, + "learning_rate": 1.12035883275166e-07, + "loss": 0.77962214, + "num_input_tokens_seen": 321483670, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.20019531, + "step": 14905, + "time_per_iteration": 2.858520746231079 + }, + { + "auxiliary_loss_clip": 0.01385885, + "auxiliary_loss_mlp": 0.01030921, + "balance_loss_clip": 1.22833455, + "balance_loss_mlp": 1.01102519, + "epoch": 0.8961972042687509, + "flos": 23081761820160.0, + "grad_norm": 5.3755787293201545, + "language_loss": 0.77256852, + "learning_rate": 1.1190739797183279e-07, + "loss": 0.7967366, + "num_input_tokens_seen": 321501190, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.19897461, + "step": 14906, + "time_per_iteration": 2.8435401916503906 + }, + { + "auxiliary_loss_clip": 0.01404407, + "auxiliary_loss_mlp": 0.01035923, + "balance_loss_clip": 1.24331141, + "balance_loss_mlp": 1.01742136, + "epoch": 0.896257327521419, + "flos": 18194746947840.0, + "grad_norm": 2.3232191294148876, + "language_loss": 0.75303197, + "learning_rate": 1.1177898426515996e-07, + "loss": 0.77743518, + "num_input_tokens_seen": 321518540, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18518066, + "step": 14907, + "time_per_iteration": 2.820803165435791 + }, + { + "auxiliary_loss_clip": 0.01394845, + "auxiliary_loss_mlp": 0.01032156, + "balance_loss_clip": 1.23784971, + "balance_loss_mlp": 1.0137738, + "epoch": 0.8963174507740869, + "flos": 17904645417600.0, + "grad_norm": 1.8198733721957683, + "language_loss": 0.83378613, + "learning_rate": 1.1165064216001785e-07, + "loss": 0.85805619, + "num_input_tokens_seen": 321536555, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.18383789, + "step": 14908, + "time_per_iteration": 2.804722785949707 + }, + { + "auxiliary_loss_clip": 0.01397811, + "auxiliary_loss_mlp": 0.01031115, + "balance_loss_clip": 1.23571122, + "balance_loss_mlp": 1.01179123, + "epoch": 0.8963775740267549, + "flos": 21042002148480.0, + "grad_norm": 2.309145626475353, + "language_loss": 0.71202546, + "learning_rate": 1.1152237166127232e-07, + "loss": 0.73631477, + "num_input_tokens_seen": 321557655, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.1932373, + "step": 14909, + "time_per_iteration": 2.8672733306884766 + }, + { + "auxiliary_loss_clip": 0.01413637, + "auxiliary_loss_mlp": 0.01034579, + "balance_loss_clip": 1.25255084, + "balance_loss_mlp": 1.01498091, + "epoch": 0.8964376972794228, + "flos": 23188530844800.0, + "grad_norm": 1.7280629106692729, + "language_loss": 0.73180878, + "learning_rate": 1.113941727737877e-07, + "loss": 0.75629091, + "num_input_tokens_seen": 321576160, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19604492, + "step": 14910, + "time_per_iteration": 2.849710702896118 + }, + { + "auxiliary_loss_clip": 0.01397528, + "auxiliary_loss_mlp": 0.01027787, + "balance_loss_clip": 1.23850727, + "balance_loss_mlp": 1.00973916, + "epoch": 0.8964978205320908, + "flos": 24983823006720.0, + "grad_norm": 2.2212249552120635, + "language_loss": 0.630817, + "learning_rate": 1.1126604550242502e-07, + "loss": 0.65507019, + "num_input_tokens_seen": 321596205, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18029785, + "step": 14911, + "time_per_iteration": 2.87783145904541 + }, + { + "auxiliary_loss_clip": 0.01408324, + "auxiliary_loss_mlp": 0.01034373, + "balance_loss_clip": 1.24801481, + "balance_loss_mlp": 1.01502514, + "epoch": 0.8965579437847587, + "flos": 19181412460800.0, + "grad_norm": 1.7527952485621654, + "language_loss": 0.7599721, + "learning_rate": 1.111379898520437e-07, + "loss": 0.78439909, + "num_input_tokens_seen": 321614800, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19335938, + "step": 14912, + "time_per_iteration": 4.268129587173462 + }, + { + "auxiliary_loss_clip": 0.01397739, + "auxiliary_loss_mlp": 0.0103574, + "balance_loss_clip": 1.23691404, + "balance_loss_mlp": 1.01580858, + "epoch": 0.8966180670374267, + "flos": 24286535107200.0, + "grad_norm": 1.8766779004103074, + "language_loss": 0.82718575, + "learning_rate": 1.1101000582749876e-07, + "loss": 0.85152054, + "num_input_tokens_seen": 321633445, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.19909668, + "step": 14913, + "time_per_iteration": 2.903698682785034 + }, + { + "auxiliary_loss_clip": 0.0140104, + "auxiliary_loss_mlp": 0.01035646, + "balance_loss_clip": 1.24047494, + "balance_loss_mlp": 1.01586962, + "epoch": 0.8966781902900947, + "flos": 13561112810880.0, + "grad_norm": 56.69349409892734, + "language_loss": 0.62459493, + "learning_rate": 1.1088209343364407e-07, + "loss": 0.64896184, + "num_input_tokens_seen": 321650890, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19787598, + "step": 14914, + "time_per_iteration": 2.8519060611724854 + }, + { + "auxiliary_loss_clip": 0.01179211, + "auxiliary_loss_mlp": 0.01030217, + "balance_loss_clip": 1.09173584, + "balance_loss_mlp": 1.01085782, + "epoch": 0.8967383135427627, + "flos": 65095582152960.0, + "grad_norm": 0.7242255757736406, + "language_loss": 0.55092877, + "learning_rate": 1.1075425267532956e-07, + "loss": 0.57302308, + "num_input_tokens_seen": 321710960, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.19335938, + "step": 14915, + "time_per_iteration": 4.791531085968018 + }, + { + "auxiliary_loss_clip": 0.01395277, + "auxiliary_loss_mlp": 0.01031981, + "balance_loss_clip": 1.23725939, + "balance_loss_mlp": 1.01314592, + "epoch": 0.8967984367954306, + "flos": 29724271413120.0, + "grad_norm": 1.45635249999403, + "language_loss": 0.71809006, + "learning_rate": 1.1062648355740289e-07, + "loss": 0.74236262, + "num_input_tokens_seen": 321733290, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18847656, + "step": 14916, + "time_per_iteration": 2.960327625274658 + }, + { + "auxiliary_loss_clip": 0.0138529, + "auxiliary_loss_mlp": 0.01031735, + "balance_loss_clip": 1.22759879, + "balance_loss_mlp": 1.01311469, + "epoch": 0.8968585600480986, + "flos": 25713442955520.0, + "grad_norm": 2.0352971314126433, + "language_loss": 0.78555697, + "learning_rate": 1.1049878608470931e-07, + "loss": 0.80972725, + "num_input_tokens_seen": 321753120, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.1862793, + "step": 14917, + "time_per_iteration": 2.939617395401001 + }, + { + "auxiliary_loss_clip": 0.01411089, + "auxiliary_loss_mlp": 0.0103955, + "balance_loss_clip": 1.24739909, + "balance_loss_mlp": 1.01977277, + "epoch": 0.8969186833007665, + "flos": 30056885072640.0, + "grad_norm": 1.9108909284943532, + "language_loss": 0.69075787, + "learning_rate": 1.1037116026209137e-07, + "loss": 0.71526426, + "num_input_tokens_seen": 321772840, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19787598, + "step": 14918, + "time_per_iteration": 2.910747528076172 + }, + { + "auxiliary_loss_clip": 0.01397265, + "auxiliary_loss_mlp": 0.01032395, + "balance_loss_clip": 1.23615313, + "balance_loss_mlp": 1.01392961, + "epoch": 0.8969788065534345, + "flos": 22827838147200.0, + "grad_norm": 1.676263001795645, + "language_loss": 0.84351969, + "learning_rate": 1.102436060943881e-07, + "loss": 0.86781627, + "num_input_tokens_seen": 321791020, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18457031, + "step": 14919, + "time_per_iteration": 2.8515143394470215 + }, + { + "auxiliary_loss_clip": 0.01405145, + "auxiliary_loss_mlp": 0.01032999, + "balance_loss_clip": 1.24297309, + "balance_loss_mlp": 1.01317418, + "epoch": 0.8970389298061026, + "flos": 13269382467840.0, + "grad_norm": 3.1196640515329004, + "language_loss": 0.73933429, + "learning_rate": 1.1011612358643696e-07, + "loss": 0.76371574, + "num_input_tokens_seen": 321810075, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19824219, + "step": 14920, + "time_per_iteration": 2.7997632026672363 + }, + { + "auxiliary_loss_clip": 0.01405262, + "auxiliary_loss_mlp": 0.01034104, + "balance_loss_clip": 1.24382746, + "balance_loss_mlp": 1.01464856, + "epoch": 0.8970990530587705, + "flos": 10272257930880.0, + "grad_norm": 3.1000559566748174, + "language_loss": 0.91980624, + "learning_rate": 1.0998871274307164e-07, + "loss": 0.94419986, + "num_input_tokens_seen": 321822635, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19433594, + "step": 14921, + "time_per_iteration": 4.2373206615448 + }, + { + "auxiliary_loss_clip": 0.01408037, + "auxiliary_loss_mlp": 0.01037504, + "balance_loss_clip": 1.24475861, + "balance_loss_mlp": 1.01705956, + "epoch": 0.8971591763114385, + "flos": 20312291710080.0, + "grad_norm": 1.914666733288133, + "language_loss": 0.74038523, + "learning_rate": 1.0986137356912384e-07, + "loss": 0.7648406, + "num_input_tokens_seen": 321841130, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.2043457, + "step": 14922, + "time_per_iteration": 4.234920263290405 + }, + { + "auxiliary_loss_clip": 0.0139145, + "auxiliary_loss_mlp": 0.01033505, + "balance_loss_clip": 1.23246312, + "balance_loss_mlp": 1.01384759, + "epoch": 0.8972192995641064, + "flos": 23267130341760.0, + "grad_norm": 1.8180971742544967, + "language_loss": 0.71134472, + "learning_rate": 1.097341060694219e-07, + "loss": 0.73559427, + "num_input_tokens_seen": 321859855, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19665527, + "step": 14923, + "time_per_iteration": 2.834177017211914 + }, + { + "auxiliary_loss_clip": 0.01398751, + "auxiliary_loss_mlp": 0.01033585, + "balance_loss_clip": 1.23615098, + "balance_loss_mlp": 1.01395094, + "epoch": 0.8972794228167744, + "flos": 18378848615040.0, + "grad_norm": 1.878736373944294, + "language_loss": 0.71606135, + "learning_rate": 1.0960691024879221e-07, + "loss": 0.74038476, + "num_input_tokens_seen": 321877990, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19628906, + "step": 14924, + "time_per_iteration": 2.8700478076934814 + }, + { + "auxiliary_loss_clip": 0.01403662, + "auxiliary_loss_mlp": 0.01032986, + "balance_loss_clip": 1.24332809, + "balance_loss_mlp": 1.01479495, + "epoch": 0.8973395460694423, + "flos": 23962789428480.0, + "grad_norm": 1.4565493086420698, + "language_loss": 0.73168337, + "learning_rate": 1.0947978611205844e-07, + "loss": 0.75604987, + "num_input_tokens_seen": 321898120, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18188477, + "step": 14925, + "time_per_iteration": 2.8877785205841064 + }, + { + "auxiliary_loss_clip": 0.01410945, + "auxiliary_loss_mlp": 0.01029181, + "balance_loss_clip": 1.25035071, + "balance_loss_mlp": 1.00963116, + "epoch": 0.8973996693221103, + "flos": 24981108318720.0, + "grad_norm": 2.3559347196033196, + "language_loss": 0.83450174, + "learning_rate": 1.0935273366404008e-07, + "loss": 0.85890305, + "num_input_tokens_seen": 321918140, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19555664, + "step": 14926, + "time_per_iteration": 2.870877981185913 + }, + { + "auxiliary_loss_clip": 0.0140658, + "auxiliary_loss_mlp": 0.01037959, + "balance_loss_clip": 1.24570847, + "balance_loss_mlp": 1.01918387, + "epoch": 0.8974597925747783, + "flos": 25749892281600.0, + "grad_norm": 2.375419315999326, + "language_loss": 0.79392922, + "learning_rate": 1.092257529095555e-07, + "loss": 0.81837463, + "num_input_tokens_seen": 321938580, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18774414, + "step": 14927, + "time_per_iteration": 2.883622884750366 + }, + { + "auxiliary_loss_clip": 0.01398464, + "auxiliary_loss_mlp": 0.01031775, + "balance_loss_clip": 1.23910952, + "balance_loss_mlp": 1.01229584, + "epoch": 0.8975199158274463, + "flos": 38086278848640.0, + "grad_norm": 1.5226393584668971, + "language_loss": 0.67530394, + "learning_rate": 1.0909884385341994e-07, + "loss": 0.6996063, + "num_input_tokens_seen": 321961135, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19494629, + "step": 14928, + "time_per_iteration": 3.0160059928894043 + }, + { + "auxiliary_loss_clip": 0.01400528, + "auxiliary_loss_mlp": 0.0103766, + "balance_loss_clip": 1.23836458, + "balance_loss_mlp": 1.01715565, + "epoch": 0.8975800390801142, + "flos": 25422843732480.0, + "grad_norm": 1.939274561686016, + "language_loss": 0.72040343, + "learning_rate": 1.0897200650044602e-07, + "loss": 0.74478531, + "num_input_tokens_seen": 321980945, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.20507812, + "step": 14929, + "time_per_iteration": 2.9110107421875 + }, + { + "auxiliary_loss_clip": 0.01407425, + "auxiliary_loss_mlp": 0.01030804, + "balance_loss_clip": 1.24767363, + "balance_loss_mlp": 1.01254141, + "epoch": 0.8976401623327822, + "flos": 21768907409280.0, + "grad_norm": 1.6546910748837127, + "language_loss": 0.6904453, + "learning_rate": 1.0884524085544256e-07, + "loss": 0.71482766, + "num_input_tokens_seen": 322000350, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18273926, + "step": 14930, + "time_per_iteration": 2.868788480758667 + }, + { + "auxiliary_loss_clip": 0.01395043, + "auxiliary_loss_mlp": 0.01030697, + "balance_loss_clip": 1.23618078, + "balance_loss_mlp": 1.01218343, + "epoch": 0.8977002855854501, + "flos": 13853386091520.0, + "grad_norm": 1.9916619313856292, + "language_loss": 0.75960696, + "learning_rate": 1.0871854692321769e-07, + "loss": 0.78386438, + "num_input_tokens_seen": 322018980, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18518066, + "step": 14931, + "time_per_iteration": 2.860804796218872 + }, + { + "auxiliary_loss_clip": 0.01393655, + "auxiliary_loss_mlp": 0.01034716, + "balance_loss_clip": 1.23637474, + "balance_loss_mlp": 1.01558268, + "epoch": 0.8977604088381181, + "flos": 19437010191360.0, + "grad_norm": 1.7952407939825439, + "language_loss": 0.63758951, + "learning_rate": 1.0859192470857492e-07, + "loss": 0.66187322, + "num_input_tokens_seen": 322037675, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.19116211, + "step": 14932, + "time_per_iteration": 2.861637830734253 + }, + { + "auxiliary_loss_clip": 0.01376937, + "auxiliary_loss_mlp": 0.01032922, + "balance_loss_clip": 1.22258329, + "balance_loss_mlp": 1.01490927, + "epoch": 0.8978205320907862, + "flos": 22750957952640.0, + "grad_norm": 1.8246003909372621, + "language_loss": 0.72853547, + "learning_rate": 1.0846537421631552e-07, + "loss": 0.75263411, + "num_input_tokens_seen": 322055130, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.18017578, + "step": 14933, + "time_per_iteration": 2.8492212295532227 + }, + { + "auxiliary_loss_clip": 0.01411469, + "auxiliary_loss_mlp": 0.01032082, + "balance_loss_clip": 1.24728, + "balance_loss_mlp": 1.01334262, + "epoch": 0.8978806553434541, + "flos": 21370317552000.0, + "grad_norm": 1.5478708347058663, + "language_loss": 0.75207663, + "learning_rate": 1.0833889545123898e-07, + "loss": 0.77651215, + "num_input_tokens_seen": 322074850, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.18725586, + "step": 14934, + "time_per_iteration": 2.954237222671509 + }, + { + "auxiliary_loss_clip": 0.01399934, + "auxiliary_loss_mlp": 0.01037131, + "balance_loss_clip": 1.24106884, + "balance_loss_mlp": 1.01795006, + "epoch": 0.8979407785961221, + "flos": 20934237738240.0, + "grad_norm": 1.6427031859079393, + "language_loss": 0.61309105, + "learning_rate": 1.0821248841814123e-07, + "loss": 0.63746166, + "num_input_tokens_seen": 322093315, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.19177246, + "step": 14935, + "time_per_iteration": 2.842212200164795 + }, + { + "auxiliary_loss_clip": 0.01387426, + "auxiliary_loss_mlp": 0.01031799, + "balance_loss_clip": 1.23154426, + "balance_loss_mlp": 1.0125227, + "epoch": 0.89800090184879, + "flos": 25239963674880.0, + "grad_norm": 1.9109760084499046, + "language_loss": 0.77227038, + "learning_rate": 1.0808615312181512e-07, + "loss": 0.79646266, + "num_input_tokens_seen": 322112555, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.19299316, + "step": 14936, + "time_per_iteration": 2.951972484588623 + }, + { + "auxiliary_loss_clip": 0.01396198, + "auxiliary_loss_mlp": 0.01030493, + "balance_loss_clip": 1.23737168, + "balance_loss_mlp": 1.01189661, + "epoch": 0.898061025101458, + "flos": 22572285661440.0, + "grad_norm": 1.5271420033959466, + "language_loss": 0.74475533, + "learning_rate": 1.0795988956705193e-07, + "loss": 0.76902223, + "num_input_tokens_seen": 322130440, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18591309, + "step": 14937, + "time_per_iteration": 2.901740312576294 + }, + { + "auxiliary_loss_clip": 0.01176301, + "auxiliary_loss_mlp": 0.01016654, + "balance_loss_clip": 1.09088075, + "balance_loss_mlp": 0.99643594, + "epoch": 0.8981211483541259, + "flos": 56217764286720.0, + "grad_norm": 0.8502263214230118, + "language_loss": 0.63510394, + "learning_rate": 1.0783369775863915e-07, + "loss": 0.65703356, + "num_input_tokens_seen": 322187295, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.20214844, + "step": 14938, + "time_per_iteration": 3.258723497390747 + }, + { + "auxiliary_loss_clip": 0.0137847, + "auxiliary_loss_mlp": 0.01031701, + "balance_loss_clip": 1.22421932, + "balance_loss_mlp": 1.01256835, + "epoch": 0.898181271606794, + "flos": 16399726254720.0, + "grad_norm": 5.4935586022674245, + "language_loss": 0.81055635, + "learning_rate": 1.0770757770136251e-07, + "loss": 0.83465803, + "num_input_tokens_seen": 322202965, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.19128418, + "step": 14939, + "time_per_iteration": 2.825197219848633 + }, + { + "auxiliary_loss_clip": 0.01180306, + "auxiliary_loss_mlp": 0.01030069, + "balance_loss_clip": 1.09201455, + "balance_loss_mlp": 1.00508273, + "epoch": 0.8982413948594619, + "flos": 63473460399360.0, + "grad_norm": 0.7213614532845988, + "language_loss": 0.52839673, + "learning_rate": 1.0758152940000375e-07, + "loss": 0.55050051, + "num_input_tokens_seen": 322269490, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.25, + "step": 14940, + "time_per_iteration": 3.402911901473999 + }, + { + "auxiliary_loss_clip": 0.01396773, + "auxiliary_loss_mlp": 0.01030462, + "balance_loss_clip": 1.23574281, + "balance_loss_mlp": 1.01152003, + "epoch": 0.8983015181121299, + "flos": 21845380400640.0, + "grad_norm": 1.9574564760962954, + "language_loss": 0.785375, + "learning_rate": 1.0745555285934327e-07, + "loss": 0.80964744, + "num_input_tokens_seen": 322288060, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18969727, + "step": 14941, + "time_per_iteration": 2.8725717067718506 + }, + { + "auxiliary_loss_clip": 0.01410722, + "auxiliary_loss_mlp": 0.01034074, + "balance_loss_clip": 1.25037003, + "balance_loss_mlp": 1.01503611, + "epoch": 0.8983616413647978, + "flos": 28961821722240.0, + "grad_norm": 2.7604447514136816, + "language_loss": 0.74139184, + "learning_rate": 1.0732964808415834e-07, + "loss": 0.76583982, + "num_input_tokens_seen": 322307930, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19042969, + "step": 14942, + "time_per_iteration": 2.9091529846191406 + }, + { + "auxiliary_loss_clip": 0.01410894, + "auxiliary_loss_mlp": 0.01035685, + "balance_loss_clip": 1.24947667, + "balance_loss_mlp": 1.01615882, + "epoch": 0.8984217646174658, + "flos": 17793985340160.0, + "grad_norm": 2.0460097342123773, + "language_loss": 0.80984998, + "learning_rate": 1.0720381507922205e-07, + "loss": 0.83431578, + "num_input_tokens_seen": 322326155, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.1953125, + "step": 14943, + "time_per_iteration": 2.801833391189575 + }, + { + "auxiliary_loss_clip": 0.01403687, + "auxiliary_loss_mlp": 0.01032584, + "balance_loss_clip": 1.24107623, + "balance_loss_mlp": 1.01172197, + "epoch": 0.8984818878701337, + "flos": 23415054151680.0, + "grad_norm": 1.668510459666449, + "language_loss": 0.72063822, + "learning_rate": 1.0707805384930701e-07, + "loss": 0.74500096, + "num_input_tokens_seen": 322345850, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.20874023, + "step": 14944, + "time_per_iteration": 2.843008041381836 + }, + { + "auxiliary_loss_clip": 0.01412357, + "auxiliary_loss_mlp": 0.01032033, + "balance_loss_clip": 1.24839973, + "balance_loss_mlp": 1.01244724, + "epoch": 0.8985420111228017, + "flos": 22356213903360.0, + "grad_norm": 2.1135863761534726, + "language_loss": 0.77556419, + "learning_rate": 1.0695236439918187e-07, + "loss": 0.80000818, + "num_input_tokens_seen": 322364715, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19592285, + "step": 14945, + "time_per_iteration": 2.9376699924468994 + }, + { + "auxiliary_loss_clip": 0.01421656, + "auxiliary_loss_mlp": 0.01033873, + "balance_loss_clip": 1.25363564, + "balance_loss_mlp": 1.01388144, + "epoch": 0.8986021343754698, + "flos": 21401473236480.0, + "grad_norm": 2.1223793660880603, + "language_loss": 0.74150407, + "learning_rate": 1.0682674673361302e-07, + "loss": 0.76605928, + "num_input_tokens_seen": 322383570, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.20007324, + "step": 14946, + "time_per_iteration": 2.8550164699554443 + }, + { + "auxiliary_loss_clip": 0.01389827, + "auxiliary_loss_mlp": 0.01031711, + "balance_loss_clip": 1.2307601, + "balance_loss_mlp": 1.01132607, + "epoch": 0.8986622576281377, + "flos": 21335497038720.0, + "grad_norm": 1.9311676189088844, + "language_loss": 0.64494491, + "learning_rate": 1.0670120085736334e-07, + "loss": 0.66916025, + "num_input_tokens_seen": 322401375, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.20385742, + "step": 14947, + "time_per_iteration": 4.283062934875488 + }, + { + "auxiliary_loss_clip": 0.01396836, + "auxiliary_loss_mlp": 0.01032945, + "balance_loss_clip": 1.23928738, + "balance_loss_mlp": 1.01359701, + "epoch": 0.8987223808808057, + "flos": 23998967285760.0, + "grad_norm": 1.870229896600675, + "language_loss": 0.70222372, + "learning_rate": 1.0657572677519411e-07, + "loss": 0.72652155, + "num_input_tokens_seen": 322421890, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.19360352, + "step": 14948, + "time_per_iteration": 2.8663530349731445 + }, + { + "auxiliary_loss_clip": 0.01397467, + "auxiliary_loss_mlp": 0.0103308, + "balance_loss_clip": 1.23760247, + "balance_loss_mlp": 1.0137682, + "epoch": 0.8987825041334736, + "flos": 41516044594560.0, + "grad_norm": 2.1654189303297593, + "language_loss": 0.75112796, + "learning_rate": 1.0645032449186309e-07, + "loss": 0.77543342, + "num_input_tokens_seen": 322445730, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.19299316, + "step": 14949, + "time_per_iteration": 3.1202852725982666 + }, + { + "auxiliary_loss_clip": 0.01404648, + "auxiliary_loss_mlp": 0.01034576, + "balance_loss_clip": 1.2420485, + "balance_loss_mlp": 1.01451337, + "epoch": 0.8988426273861416, + "flos": 27575978169600.0, + "grad_norm": 1.6265615082958633, + "language_loss": 0.76284313, + "learning_rate": 1.0632499401212513e-07, + "loss": 0.78723538, + "num_input_tokens_seen": 322464595, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.20068359, + "step": 14950, + "time_per_iteration": 4.395023345947266 + }, + { + "auxiliary_loss_clip": 0.01400448, + "auxiliary_loss_mlp": 0.01030983, + "balance_loss_clip": 1.2419064, + "balance_loss_mlp": 1.01240993, + "epoch": 0.8989027506388095, + "flos": 17101583879040.0, + "grad_norm": 1.763255679302362, + "language_loss": 0.67292702, + "learning_rate": 1.0619973534073334e-07, + "loss": 0.69724131, + "num_input_tokens_seen": 322483305, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18579102, + "step": 14951, + "time_per_iteration": 2.859564781188965 + }, + { + "auxiliary_loss_clip": 0.01424372, + "auxiliary_loss_mlp": 0.01032046, + "balance_loss_clip": 1.25763297, + "balance_loss_mlp": 1.01361656, + "epoch": 0.8989628738914776, + "flos": 20563998387840.0, + "grad_norm": 3.495001125493466, + "language_loss": 0.7413038, + "learning_rate": 1.0607454848243769e-07, + "loss": 0.76586795, + "num_input_tokens_seen": 322501905, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.18432617, + "step": 14952, + "time_per_iteration": 2.8744077682495117 + }, + { + "auxiliary_loss_clip": 0.01393941, + "auxiliary_loss_mlp": 0.01033181, + "balance_loss_clip": 1.23456264, + "balance_loss_mlp": 1.01392865, + "epoch": 0.8990229971441455, + "flos": 16259177347200.0, + "grad_norm": 2.1920551993502637, + "language_loss": 0.57856786, + "learning_rate": 1.0594943344198481e-07, + "loss": 0.60283905, + "num_input_tokens_seen": 322518135, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19250488, + "step": 14953, + "time_per_iteration": 2.8160533905029297 + }, + { + "auxiliary_loss_clip": 0.01391016, + "auxiliary_loss_mlp": 0.01036641, + "balance_loss_clip": 1.23242378, + "balance_loss_mlp": 1.01706636, + "epoch": 0.8990831203968135, + "flos": 21991403928960.0, + "grad_norm": 2.1179209743516805, + "language_loss": 0.82806343, + "learning_rate": 1.0582439022411915e-07, + "loss": 0.85233998, + "num_input_tokens_seen": 322537905, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.19555664, + "step": 14954, + "time_per_iteration": 2.861647367477417 + }, + { + "auxiliary_loss_clip": 0.01390124, + "auxiliary_loss_mlp": 0.01036581, + "balance_loss_clip": 1.23368645, + "balance_loss_mlp": 1.01778173, + "epoch": 0.8991432436494814, + "flos": 27456857314560.0, + "grad_norm": 1.958462507660104, + "language_loss": 0.61656201, + "learning_rate": 1.0569941883358224e-07, + "loss": 0.64082903, + "num_input_tokens_seen": 322557945, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.18811035, + "step": 14955, + "time_per_iteration": 2.9080724716186523 + }, + { + "auxiliary_loss_clip": 0.01391442, + "auxiliary_loss_mlp": 0.01031482, + "balance_loss_clip": 1.23484182, + "balance_loss_mlp": 1.01257515, + "epoch": 0.8992033669021494, + "flos": 21589782670080.0, + "grad_norm": 2.221927097287631, + "language_loss": 0.55849826, + "learning_rate": 1.0557451927511341e-07, + "loss": 0.58272755, + "num_input_tokens_seen": 322575765, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.18908691, + "step": 14956, + "time_per_iteration": 4.264169216156006 + }, + { + "auxiliary_loss_clip": 0.01408367, + "auxiliary_loss_mlp": 0.0103485, + "balance_loss_clip": 1.24796593, + "balance_loss_mlp": 1.01582468, + "epoch": 0.8992634901548173, + "flos": 28595744893440.0, + "grad_norm": 1.5844502074665592, + "language_loss": 0.80813986, + "learning_rate": 1.0544969155344863e-07, + "loss": 0.83257198, + "num_input_tokens_seen": 322595665, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19018555, + "step": 14957, + "time_per_iteration": 4.424845933914185 + }, + { + "auxiliary_loss_clip": 0.01404273, + "auxiliary_loss_mlp": 0.01033782, + "balance_loss_clip": 1.24193096, + "balance_loss_mlp": 1.01446986, + "epoch": 0.8993236134074853, + "flos": 19877569240320.0, + "grad_norm": 1.9226351639369275, + "language_loss": 0.79816294, + "learning_rate": 1.0532493567332123e-07, + "loss": 0.8225435, + "num_input_tokens_seen": 322614755, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19299316, + "step": 14958, + "time_per_iteration": 2.8851354122161865 + }, + { + "auxiliary_loss_clip": 0.01407136, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.24886763, + "balance_loss_mlp": 1.01455057, + "epoch": 0.8993837366601534, + "flos": 19400017927680.0, + "grad_norm": 7.553448122320239, + "language_loss": 0.75325584, + "learning_rate": 1.0520025163946277e-07, + "loss": 0.77765387, + "num_input_tokens_seen": 322633425, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18115234, + "step": 14959, + "time_per_iteration": 2.999762535095215 + }, + { + "auxiliary_loss_clip": 0.01384201, + "auxiliary_loss_mlp": 0.01031291, + "balance_loss_clip": 1.2286706, + "balance_loss_mlp": 1.01242018, + "epoch": 0.8994438599128213, + "flos": 18560959511040.0, + "grad_norm": 1.924872951006241, + "language_loss": 0.69754761, + "learning_rate": 1.0507563945660015e-07, + "loss": 0.72170252, + "num_input_tokens_seen": 322652065, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.18859863, + "step": 14960, + "time_per_iteration": 2.8315882682800293 + }, + { + "auxiliary_loss_clip": 0.01396522, + "auxiliary_loss_mlp": 0.01035585, + "balance_loss_clip": 1.23788929, + "balance_loss_mlp": 1.01659513, + "epoch": 0.8995039831654893, + "flos": 24438757173120.0, + "grad_norm": 1.7415309111215422, + "language_loss": 0.66681945, + "learning_rate": 1.049510991294591e-07, + "loss": 0.69114053, + "num_input_tokens_seen": 322673275, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18994141, + "step": 14961, + "time_per_iteration": 2.891075372695923 + }, + { + "auxiliary_loss_clip": 0.01380369, + "auxiliary_loss_mlp": 0.01031788, + "balance_loss_clip": 1.22446966, + "balance_loss_mlp": 1.01319146, + "epoch": 0.8995641064181572, + "flos": 21261150552960.0, + "grad_norm": 1.4488278201650604, + "language_loss": 0.8323943, + "learning_rate": 1.0482663066276254e-07, + "loss": 0.85651588, + "num_input_tokens_seen": 322693375, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.18615723, + "step": 14962, + "time_per_iteration": 2.8461315631866455 + }, + { + "auxiliary_loss_clip": 0.01411645, + "auxiliary_loss_mlp": 0.01033164, + "balance_loss_clip": 1.24708915, + "balance_loss_mlp": 1.01379275, + "epoch": 0.8996242296708252, + "flos": 23524130661120.0, + "grad_norm": 1.8756818355817162, + "language_loss": 0.76876247, + "learning_rate": 1.047022340612298e-07, + "loss": 0.79321051, + "num_input_tokens_seen": 322712615, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19384766, + "step": 14963, + "time_per_iteration": 2.877379894256592 + }, + { + "auxiliary_loss_clip": 0.01177213, + "auxiliary_loss_mlp": 0.01025403, + "balance_loss_clip": 1.08902001, + "balance_loss_mlp": 1.00203788, + "epoch": 0.8996843529234931, + "flos": 62432157150720.0, + "grad_norm": 0.780399617482122, + "language_loss": 0.57576913, + "learning_rate": 1.0457790932957867e-07, + "loss": 0.59779531, + "num_input_tokens_seen": 322766855, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.23339844, + "step": 14964, + "time_per_iteration": 3.186002016067505 + }, + { + "auxiliary_loss_clip": 0.01426476, + "auxiliary_loss_mlp": 0.01038118, + "balance_loss_clip": 1.25964308, + "balance_loss_mlp": 1.01736379, + "epoch": 0.8997444761761612, + "flos": 24245787525120.0, + "grad_norm": 3.2292521408412562, + "language_loss": 0.68731278, + "learning_rate": 1.0445365647252269e-07, + "loss": 0.71195865, + "num_input_tokens_seen": 322781130, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.2076416, + "step": 14965, + "time_per_iteration": 2.857436180114746 + }, + { + "auxiliary_loss_clip": 0.01400555, + "auxiliary_loss_mlp": 0.01031966, + "balance_loss_clip": 1.23933947, + "balance_loss_mlp": 1.01310706, + "epoch": 0.8998045994288291, + "flos": 21371041468800.0, + "grad_norm": 1.9060932553691323, + "language_loss": 0.73060673, + "learning_rate": 1.0432947549477433e-07, + "loss": 0.75493193, + "num_input_tokens_seen": 322800310, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.18859863, + "step": 14966, + "time_per_iteration": 2.8818886280059814 + }, + { + "auxiliary_loss_clip": 0.01390427, + "auxiliary_loss_mlp": 0.01035178, + "balance_loss_clip": 1.23192787, + "balance_loss_mlp": 1.0158186, + "epoch": 0.8998647226814971, + "flos": 28997954334720.0, + "grad_norm": 1.7734973693838822, + "language_loss": 0.7404778, + "learning_rate": 1.0420536640104205e-07, + "loss": 0.76473391, + "num_input_tokens_seen": 322820955, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.19360352, + "step": 14967, + "time_per_iteration": 2.899163007736206 + }, + { + "auxiliary_loss_clip": 0.01392466, + "auxiliary_loss_mlp": 0.01029625, + "balance_loss_clip": 1.23332751, + "balance_loss_mlp": 1.01056385, + "epoch": 0.899924845934165, + "flos": 13633785239040.0, + "grad_norm": 2.304703582166835, + "language_loss": 0.73390651, + "learning_rate": 1.040813291960323e-07, + "loss": 0.75812751, + "num_input_tokens_seen": 322838780, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19055176, + "step": 14968, + "time_per_iteration": 2.8371238708496094 + }, + { + "auxiliary_loss_clip": 0.0140617, + "auxiliary_loss_mlp": 0.01031067, + "balance_loss_clip": 1.24568987, + "balance_loss_mlp": 1.01206529, + "epoch": 0.899984969186833, + "flos": 20891363650560.0, + "grad_norm": 1.7352290231196592, + "language_loss": 0.70982742, + "learning_rate": 1.0395736388444864e-07, + "loss": 0.73419976, + "num_input_tokens_seen": 322856710, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19006348, + "step": 14969, + "time_per_iteration": 2.8017430305480957 + }, + { + "auxiliary_loss_clip": 0.01404104, + "auxiliary_loss_mlp": 0.01029339, + "balance_loss_clip": 1.24130201, + "balance_loss_mlp": 1.01040864, + "epoch": 0.9000450924395009, + "flos": 20931251581440.0, + "grad_norm": 1.9178957988034635, + "language_loss": 0.76958382, + "learning_rate": 1.0383347047099201e-07, + "loss": 0.79391825, + "num_input_tokens_seen": 322876070, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.18933105, + "step": 14970, + "time_per_iteration": 2.861865520477295 + }, + { + "auxiliary_loss_clip": 0.01401644, + "auxiliary_loss_mlp": 0.01031266, + "balance_loss_clip": 1.24046826, + "balance_loss_mlp": 1.0123713, + "epoch": 0.900105215692169, + "flos": 17173849104000.0, + "grad_norm": 1.9662780210821396, + "language_loss": 0.73737633, + "learning_rate": 1.0370964896035972e-07, + "loss": 0.7617054, + "num_input_tokens_seen": 322895095, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.18896484, + "step": 14971, + "time_per_iteration": 2.88590669631958 + }, + { + "auxiliary_loss_clip": 0.01394231, + "auxiliary_loss_mlp": 0.01031075, + "balance_loss_clip": 1.23384798, + "balance_loss_mlp": 1.01137018, + "epoch": 0.900165338944837, + "flos": 19939880609280.0, + "grad_norm": 2.296111038577219, + "language_loss": 0.82553566, + "learning_rate": 1.035858993572476e-07, + "loss": 0.84978873, + "num_input_tokens_seen": 322911845, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19714355, + "step": 14972, + "time_per_iteration": 2.8654143810272217 + }, + { + "auxiliary_loss_clip": 0.01418121, + "auxiliary_loss_mlp": 0.01033379, + "balance_loss_clip": 1.25204074, + "balance_loss_mlp": 1.015378, + "epoch": 0.9002254621975049, + "flos": 16115642282880.0, + "grad_norm": 1.980231841510622, + "language_loss": 0.82392263, + "learning_rate": 1.0346222166634855e-07, + "loss": 0.84843761, + "num_input_tokens_seen": 322928170, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.18017578, + "step": 14973, + "time_per_iteration": 2.8257462978363037 + }, + { + "auxiliary_loss_clip": 0.0139526, + "auxiliary_loss_mlp": 0.01036015, + "balance_loss_clip": 1.23574245, + "balance_loss_mlp": 1.0160954, + "epoch": 0.9002855854501729, + "flos": 28487980483200.0, + "grad_norm": 1.6491419786867012, + "language_loss": 0.59016705, + "learning_rate": 1.0333861589235193e-07, + "loss": 0.61447978, + "num_input_tokens_seen": 322948165, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19921875, + "step": 14974, + "time_per_iteration": 2.875247001647949 + }, + { + "auxiliary_loss_clip": 0.0141473, + "auxiliary_loss_mlp": 0.0103708, + "balance_loss_clip": 1.25316072, + "balance_loss_mlp": 1.01798272, + "epoch": 0.9003457087028408, + "flos": 25641449199360.0, + "grad_norm": 1.6670736863557445, + "language_loss": 0.63985741, + "learning_rate": 1.0321508203994489e-07, + "loss": 0.66437542, + "num_input_tokens_seen": 322968880, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19104004, + "step": 14975, + "time_per_iteration": 2.9198639392852783 + }, + { + "auxiliary_loss_clip": 0.01411055, + "auxiliary_loss_mlp": 0.01034033, + "balance_loss_clip": 1.24937809, + "balance_loss_mlp": 1.01517367, + "epoch": 0.9004058319555088, + "flos": 24400136096640.0, + "grad_norm": 1.6829378313851888, + "language_loss": 0.73473549, + "learning_rate": 1.0309162011381257e-07, + "loss": 0.75918633, + "num_input_tokens_seen": 322989395, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.1887207, + "step": 14976, + "time_per_iteration": 2.8396847248077393 + }, + { + "auxiliary_loss_clip": 0.01407502, + "auxiliary_loss_mlp": 0.01036986, + "balance_loss_clip": 1.2477088, + "balance_loss_mlp": 1.01722121, + "epoch": 0.9004659552081767, + "flos": 29071214945280.0, + "grad_norm": 1.7887874369945571, + "language_loss": 0.70540631, + "learning_rate": 1.0296823011863565e-07, + "loss": 0.72985125, + "num_input_tokens_seen": 323009060, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.19763184, + "step": 14977, + "time_per_iteration": 2.9585163593292236 + }, + { + "auxiliary_loss_clip": 0.01403214, + "auxiliary_loss_mlp": 0.01032833, + "balance_loss_clip": 1.24240136, + "balance_loss_mlp": 1.01292455, + "epoch": 0.9005260784608448, + "flos": 16772589803520.0, + "grad_norm": 2.6954913168255126, + "language_loss": 0.67419869, + "learning_rate": 1.0284491205909351e-07, + "loss": 0.69855917, + "num_input_tokens_seen": 323027530, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.19909668, + "step": 14978, + "time_per_iteration": 2.803304672241211 + }, + { + "auxiliary_loss_clip": 0.01415929, + "auxiliary_loss_mlp": 0.01036279, + "balance_loss_clip": 1.25214171, + "balance_loss_mlp": 1.01665759, + "epoch": 0.9005862017135127, + "flos": 20385688055040.0, + "grad_norm": 1.7523069480390823, + "language_loss": 0.79761994, + "learning_rate": 1.0272166593986286e-07, + "loss": 0.822142, + "num_input_tokens_seen": 323045370, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19616699, + "step": 14979, + "time_per_iteration": 2.8385813236236572 + }, + { + "auxiliary_loss_clip": 0.01178208, + "auxiliary_loss_mlp": 0.01028488, + "balance_loss_clip": 1.09115016, + "balance_loss_mlp": 1.00826979, + "epoch": 0.9006463249661807, + "flos": 67609499777280.0, + "grad_norm": 0.7275611645191706, + "language_loss": 0.53740919, + "learning_rate": 1.0259849176561642e-07, + "loss": 0.55947614, + "num_input_tokens_seen": 323105660, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.20214844, + "step": 14980, + "time_per_iteration": 3.343348264694214 + }, + { + "auxiliary_loss_clip": 0.01414813, + "auxiliary_loss_mlp": 0.01038471, + "balance_loss_clip": 1.25033975, + "balance_loss_mlp": 1.01902795, + "epoch": 0.9007064482188486, + "flos": 28305055180800.0, + "grad_norm": 1.9670491609016096, + "language_loss": 0.82888305, + "learning_rate": 1.0247538954102553e-07, + "loss": 0.85341585, + "num_input_tokens_seen": 323126365, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.19445801, + "step": 14981, + "time_per_iteration": 2.8850014209747314 + }, + { + "auxiliary_loss_clip": 0.01383415, + "auxiliary_loss_mlp": 0.01032891, + "balance_loss_clip": 1.22708631, + "balance_loss_mlp": 1.01303101, + "epoch": 0.9007665714715166, + "flos": 21626277240960.0, + "grad_norm": 2.9168919393766983, + "language_loss": 0.82213718, + "learning_rate": 1.0235235927075758e-07, + "loss": 0.84630024, + "num_input_tokens_seen": 323145655, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.1986084, + "step": 14982, + "time_per_iteration": 4.35743522644043 + }, + { + "auxiliary_loss_clip": 0.01390504, + "auxiliary_loss_mlp": 0.01029836, + "balance_loss_clip": 1.23480701, + "balance_loss_mlp": 1.01153731, + "epoch": 0.9008266947241845, + "flos": 26553089554560.0, + "grad_norm": 1.803294039112789, + "language_loss": 0.72382247, + "learning_rate": 1.0222940095947885e-07, + "loss": 0.74802577, + "num_input_tokens_seen": 323164540, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.18273926, + "step": 14983, + "time_per_iteration": 2.918055534362793 + }, + { + "auxiliary_loss_clip": 0.01389704, + "auxiliary_loss_mlp": 0.01031889, + "balance_loss_clip": 1.231969, + "balance_loss_mlp": 1.01356685, + "epoch": 0.9008868179768525, + "flos": 23120428141440.0, + "grad_norm": 2.627502837861305, + "language_loss": 0.75351715, + "learning_rate": 1.0210651461185115e-07, + "loss": 0.77773303, + "num_input_tokens_seen": 323186960, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18334961, + "step": 14984, + "time_per_iteration": 2.8451180458068848 + }, + { + "auxiliary_loss_clip": 0.01383384, + "auxiliary_loss_mlp": 0.01034394, + "balance_loss_clip": 1.22763896, + "balance_loss_mlp": 1.01441503, + "epoch": 0.9009469412295206, + "flos": 19069711752960.0, + "grad_norm": 1.7382208170488438, + "language_loss": 0.70725578, + "learning_rate": 1.0198370023253456e-07, + "loss": 0.73143357, + "num_input_tokens_seen": 323206135, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.1998291, + "step": 14985, + "time_per_iteration": 4.348379135131836 + }, + { + "auxiliary_loss_clip": 0.01397534, + "auxiliary_loss_mlp": 0.01035934, + "balance_loss_clip": 1.23454261, + "balance_loss_mlp": 1.01693201, + "epoch": 0.9010070644821885, + "flos": 23232807521280.0, + "grad_norm": 1.9889175541751458, + "language_loss": 0.71050274, + "learning_rate": 1.0186095782618643e-07, + "loss": 0.73483741, + "num_input_tokens_seen": 323225980, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19018555, + "step": 14986, + "time_per_iteration": 2.8853344917297363 + }, + { + "auxiliary_loss_clip": 0.01397664, + "auxiliary_loss_mlp": 0.01033461, + "balance_loss_clip": 1.2365191, + "balance_loss_mlp": 1.01424479, + "epoch": 0.9010671877348565, + "flos": 17393902404480.0, + "grad_norm": 1.6212834883149296, + "language_loss": 0.7752974, + "learning_rate": 1.0173828739746104e-07, + "loss": 0.79960859, + "num_input_tokens_seen": 323243700, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19213867, + "step": 14987, + "time_per_iteration": 2.8055732250213623 + }, + { + "auxiliary_loss_clip": 0.01387167, + "auxiliary_loss_mlp": 0.0103223, + "balance_loss_clip": 1.22918677, + "balance_loss_mlp": 1.01316893, + "epoch": 0.9011273109875244, + "flos": 21918188563200.0, + "grad_norm": 1.8418701423583734, + "language_loss": 0.74332774, + "learning_rate": 1.0161568895100981e-07, + "loss": 0.76752174, + "num_input_tokens_seen": 323261535, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.19042969, + "step": 14988, + "time_per_iteration": 2.8713104724884033 + }, + { + "auxiliary_loss_clip": 0.01411988, + "auxiliary_loss_mlp": 0.0103409, + "balance_loss_clip": 1.24758935, + "balance_loss_mlp": 1.0127871, + "epoch": 0.9011874342401924, + "flos": 24071413489920.0, + "grad_norm": 1.7673051977403775, + "language_loss": 0.70089817, + "learning_rate": 1.0149316249148188e-07, + "loss": 0.72535896, + "num_input_tokens_seen": 323281855, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.2130127, + "step": 14989, + "time_per_iteration": 2.8861498832702637 + }, + { + "auxiliary_loss_clip": 0.01403765, + "auxiliary_loss_mlp": 0.01032641, + "balance_loss_clip": 1.24244189, + "balance_loss_mlp": 1.01346016, + "epoch": 0.9012475574928603, + "flos": 16766210286720.0, + "grad_norm": 2.2272823174433634, + "language_loss": 0.80886698, + "learning_rate": 1.0137070802352376e-07, + "loss": 0.83323109, + "num_input_tokens_seen": 323299505, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19177246, + "step": 14990, + "time_per_iteration": 2.819833278656006 + }, + { + "auxiliary_loss_clip": 0.01417772, + "auxiliary_loss_mlp": 0.01033847, + "balance_loss_clip": 1.25280166, + "balance_loss_mlp": 1.01519072, + "epoch": 0.9013076807455284, + "flos": 19979768540160.0, + "grad_norm": 1.951679508232669, + "language_loss": 0.78490353, + "learning_rate": 1.0124832555177842e-07, + "loss": 0.80941981, + "num_input_tokens_seen": 323318365, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.18676758, + "step": 14991, + "time_per_iteration": 2.9604649543762207 + }, + { + "auxiliary_loss_clip": 0.01176999, + "auxiliary_loss_mlp": 0.0102655, + "balance_loss_clip": 1.08870077, + "balance_loss_mlp": 1.001755, + "epoch": 0.9013678039981963, + "flos": 65210992934400.0, + "grad_norm": 0.784085663637888, + "language_loss": 0.60251749, + "learning_rate": 1.0112601508088726e-07, + "loss": 0.62455308, + "num_input_tokens_seen": 323371835, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.24804688, + "step": 14992, + "time_per_iteration": 6.019509315490723 + }, + { + "auxiliary_loss_clip": 0.01384536, + "auxiliary_loss_mlp": 0.01029922, + "balance_loss_clip": 1.22718239, + "balance_loss_mlp": 1.01084828, + "epoch": 0.9014279272508643, + "flos": 20530761442560.0, + "grad_norm": 1.8612018048384171, + "language_loss": 0.8357805, + "learning_rate": 1.0100377661548764e-07, + "loss": 0.85992509, + "num_input_tokens_seen": 323388495, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.19067383, + "step": 14993, + "time_per_iteration": 2.873408555984497 + }, + { + "auxiliary_loss_clip": 0.01403276, + "auxiliary_loss_mlp": 0.01036455, + "balance_loss_clip": 1.24285293, + "balance_loss_mlp": 1.01663029, + "epoch": 0.9014880505035322, + "flos": 17317384168320.0, + "grad_norm": 1.8971110779131388, + "language_loss": 0.74283946, + "learning_rate": 1.0088161016021502e-07, + "loss": 0.76723671, + "num_input_tokens_seen": 323405280, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19812012, + "step": 14994, + "time_per_iteration": 2.855670928955078 + }, + { + "auxiliary_loss_clip": 0.01387416, + "auxiliary_loss_mlp": 0.01034496, + "balance_loss_clip": 1.22985959, + "balance_loss_mlp": 1.01570857, + "epoch": 0.9015481737562002, + "flos": 28414629383040.0, + "grad_norm": 1.6628790613384987, + "language_loss": 0.65608901, + "learning_rate": 1.0075951571970187e-07, + "loss": 0.68030816, + "num_input_tokens_seen": 323425310, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18786621, + "step": 14995, + "time_per_iteration": 2.9220998287200928 + }, + { + "auxiliary_loss_clip": 0.01403245, + "auxiliary_loss_mlp": 0.0103272, + "balance_loss_clip": 1.24146116, + "balance_loss_mlp": 1.01359892, + "epoch": 0.9016082970088681, + "flos": 29764747526400.0, + "grad_norm": 1.582819939730781, + "language_loss": 0.6725412, + "learning_rate": 1.0063749329857873e-07, + "loss": 0.69690084, + "num_input_tokens_seen": 323447805, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19116211, + "step": 14996, + "time_per_iteration": 2.981539249420166 + }, + { + "auxiliary_loss_clip": 0.01397168, + "auxiliary_loss_mlp": 0.01030301, + "balance_loss_clip": 1.23869622, + "balance_loss_mlp": 1.01157355, + "epoch": 0.9016684202615362, + "flos": 23523587723520.0, + "grad_norm": 1.6129012922050416, + "language_loss": 0.66521561, + "learning_rate": 1.0051554290147168e-07, + "loss": 0.68949032, + "num_input_tokens_seen": 323467150, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18725586, + "step": 14997, + "time_per_iteration": 2.8603146076202393 + }, + { + "auxiliary_loss_clip": 0.01392977, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.23483324, + "balance_loss_mlp": 1.01325345, + "epoch": 0.9017285435142042, + "flos": 16987123238400.0, + "grad_norm": 1.8555927297354655, + "language_loss": 0.78992313, + "learning_rate": 1.0039366453300613e-07, + "loss": 0.81417596, + "num_input_tokens_seen": 323484250, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.19055176, + "step": 14998, + "time_per_iteration": 2.8485498428344727 + }, + { + "auxiliary_loss_clip": 0.01401228, + "auxiliary_loss_mlp": 0.01035307, + "balance_loss_clip": 1.23945761, + "balance_loss_mlp": 1.01603103, + "epoch": 0.9017886667668721, + "flos": 21402197153280.0, + "grad_norm": 2.3749138137584995, + "language_loss": 0.75646943, + "learning_rate": 1.0027185819780281e-07, + "loss": 0.78083479, + "num_input_tokens_seen": 323502910, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19274902, + "step": 14999, + "time_per_iteration": 2.823133945465088 + }, + { + "auxiliary_loss_clip": 0.01396407, + "auxiliary_loss_mlp": 0.01032741, + "balance_loss_clip": 1.23626876, + "balance_loss_mlp": 1.01335788, + "epoch": 0.9018487900195401, + "flos": 21006502963200.0, + "grad_norm": 2.3510407211616724, + "language_loss": 0.76840413, + "learning_rate": 1.0015012390048117e-07, + "loss": 0.79269564, + "num_input_tokens_seen": 323521820, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19384766, + "step": 15000, + "time_per_iteration": 2.8289384841918945 + }, + { + "auxiliary_loss_clip": 0.01396415, + "auxiliary_loss_mlp": 0.01028593, + "balance_loss_clip": 1.23801541, + "balance_loss_mlp": 1.01091409, + "epoch": 0.901908913272208, + "flos": 53376508661760.0, + "grad_norm": 2.209401656370959, + "language_loss": 0.81900781, + "learning_rate": 1.0002846164565704e-07, + "loss": 0.8432579, + "num_input_tokens_seen": 323543200, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.17675781, + "step": 15001, + "time_per_iteration": 3.1479477882385254 + }, + { + "auxiliary_loss_clip": 0.01396529, + "auxiliary_loss_mlp": 0.01033265, + "balance_loss_clip": 1.2383225, + "balance_loss_mlp": 1.01415622, + "epoch": 0.901969036524876, + "flos": 22099711276800.0, + "grad_norm": 1.401678750203868, + "language_loss": 0.78901935, + "learning_rate": 9.990687143794407e-08, + "loss": 0.8133173, + "num_input_tokens_seen": 323563075, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.19116211, + "step": 15002, + "time_per_iteration": 2.895836353302002 + }, + { + "auxiliary_loss_clip": 0.01391644, + "auxiliary_loss_mlp": 0.010329, + "balance_loss_clip": 1.23180294, + "balance_loss_mlp": 1.01293254, + "epoch": 0.9020291597775439, + "flos": 23843759063040.0, + "grad_norm": 1.8251259579086383, + "language_loss": 0.69458812, + "learning_rate": 9.978535328195347e-08, + "loss": 0.71883351, + "num_input_tokens_seen": 323579065, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.19970703, + "step": 15003, + "time_per_iteration": 2.8359463214874268 + }, + { + "auxiliary_loss_clip": 0.0140026, + "auxiliary_loss_mlp": 0.01032604, + "balance_loss_clip": 1.23867273, + "balance_loss_mlp": 1.01329255, + "epoch": 0.902089283030212, + "flos": 18333395573760.0, + "grad_norm": 2.132241042124498, + "language_loss": 0.86749351, + "learning_rate": 9.9663907182292e-08, + "loss": 0.89182216, + "num_input_tokens_seen": 323594835, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19311523, + "step": 15004, + "time_per_iteration": 2.847820997238159 + }, + { + "auxiliary_loss_clip": 0.01404881, + "auxiliary_loss_mlp": 0.01032855, + "balance_loss_clip": 1.24324656, + "balance_loss_mlp": 1.0139606, + "epoch": 0.9021494062828799, + "flos": 24180987692160.0, + "grad_norm": 1.9964171982654428, + "language_loss": 0.72663689, + "learning_rate": 9.954253314356575e-08, + "loss": 0.75101423, + "num_input_tokens_seen": 323611475, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18896484, + "step": 15005, + "time_per_iteration": 2.922121047973633 + }, + { + "auxiliary_loss_clip": 0.01407934, + "auxiliary_loss_mlp": 0.01034004, + "balance_loss_clip": 1.24422526, + "balance_loss_mlp": 1.01483488, + "epoch": 0.9022095295355479, + "flos": 21626955912960.0, + "grad_norm": 1.8552956679601875, + "language_loss": 0.72223759, + "learning_rate": 9.942123117037748e-08, + "loss": 0.74665689, + "num_input_tokens_seen": 323629730, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19165039, + "step": 15006, + "time_per_iteration": 2.895777940750122 + }, + { + "auxiliary_loss_clip": 0.01410275, + "auxiliary_loss_mlp": 0.01033465, + "balance_loss_clip": 1.24738348, + "balance_loss_mlp": 1.0144037, + "epoch": 0.9022696527882158, + "flos": 18733433264640.0, + "grad_norm": 1.8689200073244538, + "language_loss": 0.85644281, + "learning_rate": 9.930000126732618e-08, + "loss": 0.88088018, + "num_input_tokens_seen": 323646000, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.1907959, + "step": 15007, + "time_per_iteration": 2.792048454284668 + }, + { + "auxiliary_loss_clip": 0.0139065, + "auxiliary_loss_mlp": 0.01035319, + "balance_loss_clip": 1.2337203, + "balance_loss_mlp": 1.01597166, + "epoch": 0.9023297760408838, + "flos": 26772599917440.0, + "grad_norm": 1.6037615669652006, + "language_loss": 0.79029715, + "learning_rate": 9.917884343900928e-08, + "loss": 0.81455684, + "num_input_tokens_seen": 323667250, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.19348145, + "step": 15008, + "time_per_iteration": 2.91572904586792 + }, + { + "auxiliary_loss_clip": 0.01382603, + "auxiliary_loss_mlp": 0.01034257, + "balance_loss_clip": 1.22876072, + "balance_loss_mlp": 1.01499295, + "epoch": 0.9023898992935517, + "flos": 20532435500160.0, + "grad_norm": 1.7147044535236158, + "language_loss": 0.74146473, + "learning_rate": 9.905775769002156e-08, + "loss": 0.76563334, + "num_input_tokens_seen": 323687150, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.19274902, + "step": 15009, + "time_per_iteration": 2.831244707107544 + }, + { + "auxiliary_loss_clip": 0.01398648, + "auxiliary_loss_mlp": 0.01032458, + "balance_loss_clip": 1.23994482, + "balance_loss_mlp": 1.01293182, + "epoch": 0.9024500225462198, + "flos": 17465443712640.0, + "grad_norm": 2.7128486828665785, + "language_loss": 0.73880959, + "learning_rate": 9.893674402495399e-08, + "loss": 0.76312065, + "num_input_tokens_seen": 323703660, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.1953125, + "step": 15010, + "time_per_iteration": 2.8113389015197754 + }, + { + "auxiliary_loss_clip": 0.01407043, + "auxiliary_loss_mlp": 0.01034861, + "balance_loss_clip": 1.24693799, + "balance_loss_mlp": 1.01532257, + "epoch": 0.9025101457988878, + "flos": 20823351436800.0, + "grad_norm": 1.99455696976596, + "language_loss": 0.75501668, + "learning_rate": 9.881580244839538e-08, + "loss": 0.77943575, + "num_input_tokens_seen": 323722060, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19543457, + "step": 15011, + "time_per_iteration": 2.889805793762207 + }, + { + "auxiliary_loss_clip": 0.01411411, + "auxiliary_loss_mlp": 0.01031276, + "balance_loss_clip": 1.24800968, + "balance_loss_mlp": 1.01198792, + "epoch": 0.9025702690515557, + "flos": 19035796135680.0, + "grad_norm": 1.8677627298355568, + "language_loss": 0.74148774, + "learning_rate": 9.869493296493204e-08, + "loss": 0.76591462, + "num_input_tokens_seen": 323740645, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19299316, + "step": 15012, + "time_per_iteration": 2.8074276447296143 + }, + { + "auxiliary_loss_clip": 0.01387589, + "auxiliary_loss_mlp": 0.01035279, + "balance_loss_clip": 1.23061287, + "balance_loss_mlp": 1.01671767, + "epoch": 0.9026303923042237, + "flos": 19692336453120.0, + "grad_norm": 1.7717504150444503, + "language_loss": 0.69703615, + "learning_rate": 9.857413557914763e-08, + "loss": 0.72126484, + "num_input_tokens_seen": 323758905, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.18566895, + "step": 15013, + "time_per_iteration": 2.889007091522217 + }, + { + "auxiliary_loss_clip": 0.01383092, + "auxiliary_loss_mlp": 0.01032092, + "balance_loss_clip": 1.22859371, + "balance_loss_mlp": 1.01306581, + "epoch": 0.9026905155568916, + "flos": 24618832053120.0, + "grad_norm": 1.3489306351923243, + "language_loss": 0.73534811, + "learning_rate": 9.845341029562249e-08, + "loss": 0.75949991, + "num_input_tokens_seen": 323780595, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.19030762, + "step": 15014, + "time_per_iteration": 2.874812602996826 + }, + { + "auxiliary_loss_clip": 0.01394211, + "auxiliary_loss_mlp": 0.01028932, + "balance_loss_clip": 1.23454189, + "balance_loss_mlp": 1.01038325, + "epoch": 0.9027506388095596, + "flos": 20531485359360.0, + "grad_norm": 1.8420466350413023, + "language_loss": 0.72740126, + "learning_rate": 9.833275711893474e-08, + "loss": 0.75163269, + "num_input_tokens_seen": 323798160, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.1854248, + "step": 15015, + "time_per_iteration": 2.8220744132995605 + }, + { + "auxiliary_loss_clip": 0.01398633, + "auxiliary_loss_mlp": 0.010318, + "balance_loss_clip": 1.23852837, + "balance_loss_mlp": 1.01309645, + "epoch": 0.9028107620622275, + "flos": 22794917915520.0, + "grad_norm": 1.9065097691585, + "language_loss": 0.69909894, + "learning_rate": 9.821217605365895e-08, + "loss": 0.72340333, + "num_input_tokens_seen": 323816810, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18688965, + "step": 15016, + "time_per_iteration": 2.858342170715332 + }, + { + "auxiliary_loss_clip": 0.01393332, + "auxiliary_loss_mlp": 0.01030998, + "balance_loss_clip": 1.23552549, + "balance_loss_mlp": 1.01336646, + "epoch": 0.9028708853148956, + "flos": 25421395898880.0, + "grad_norm": 1.6732392843849997, + "language_loss": 0.71488458, + "learning_rate": 9.809166710436855e-08, + "loss": 0.73912787, + "num_input_tokens_seen": 323836900, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.1763916, + "step": 15017, + "time_per_iteration": 4.3809874057769775 + }, + { + "auxiliary_loss_clip": 0.014092, + "auxiliary_loss_mlp": 0.0103515, + "balance_loss_clip": 1.2496686, + "balance_loss_mlp": 1.01642215, + "epoch": 0.9029310085675635, + "flos": 21881241544320.0, + "grad_norm": 1.5442089545404236, + "language_loss": 0.69876087, + "learning_rate": 9.797123027563237e-08, + "loss": 0.72320431, + "num_input_tokens_seen": 323855325, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18725586, + "step": 15018, + "time_per_iteration": 2.911013126373291 + }, + { + "auxiliary_loss_clip": 0.01399269, + "auxiliary_loss_mlp": 0.01032365, + "balance_loss_clip": 1.23852718, + "balance_loss_mlp": 1.01305342, + "epoch": 0.9029911318202315, + "flos": 26225769536640.0, + "grad_norm": 1.6767336079138262, + "language_loss": 0.70076513, + "learning_rate": 9.785086557201782e-08, + "loss": 0.7250815, + "num_input_tokens_seen": 323875650, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19299316, + "step": 15019, + "time_per_iteration": 2.960486650466919 + }, + { + "auxiliary_loss_clip": 0.013854, + "auxiliary_loss_mlp": 0.01033873, + "balance_loss_clip": 1.22866464, + "balance_loss_mlp": 1.01456106, + "epoch": 0.9030512550728994, + "flos": 15969935468160.0, + "grad_norm": 2.762514776339677, + "language_loss": 0.72668833, + "learning_rate": 9.773057299808951e-08, + "loss": 0.75088108, + "num_input_tokens_seen": 323892920, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.19299316, + "step": 15020, + "time_per_iteration": 4.405425548553467 + }, + { + "auxiliary_loss_clip": 0.01403441, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.24171424, + "balance_loss_mlp": 1.01263404, + "epoch": 0.9031113783255674, + "flos": 23998107634560.0, + "grad_norm": 2.0677393131178476, + "language_loss": 0.74754089, + "learning_rate": 9.7610352558408e-08, + "loss": 0.77189445, + "num_input_tokens_seen": 323913835, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19287109, + "step": 15021, + "time_per_iteration": 2.8666322231292725 + }, + { + "auxiliary_loss_clip": 0.01400379, + "auxiliary_loss_mlp": 0.01034421, + "balance_loss_clip": 1.23742533, + "balance_loss_mlp": 1.01512122, + "epoch": 0.9031715015782353, + "flos": 22247499352320.0, + "grad_norm": 2.2259277771936787, + "language_loss": 0.73236609, + "learning_rate": 9.749020425753251e-08, + "loss": 0.75671399, + "num_input_tokens_seen": 323933440, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19299316, + "step": 15022, + "time_per_iteration": 2.8310277462005615 + }, + { + "auxiliary_loss_clip": 0.0137265, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.22125196, + "balance_loss_mlp": 1.01390314, + "epoch": 0.9032316248309034, + "flos": 26334393598080.0, + "grad_norm": 1.9181883458393991, + "language_loss": 0.73353249, + "learning_rate": 9.737012810001943e-08, + "loss": 0.75759339, + "num_input_tokens_seen": 323954090, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.1953125, + "step": 15023, + "time_per_iteration": 2.8623573780059814 + }, + { + "auxiliary_loss_clip": 0.01402009, + "auxiliary_loss_mlp": 0.01035288, + "balance_loss_clip": 1.2421937, + "balance_loss_mlp": 1.01582074, + "epoch": 0.9032917480835713, + "flos": 22646586902400.0, + "grad_norm": 1.7447037284930753, + "language_loss": 0.83346581, + "learning_rate": 9.725012409042155e-08, + "loss": 0.85783887, + "num_input_tokens_seen": 323974040, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19470215, + "step": 15024, + "time_per_iteration": 2.889345407485962 + }, + { + "auxiliary_loss_clip": 0.0140653, + "auxiliary_loss_mlp": 0.01030511, + "balance_loss_clip": 1.24455285, + "balance_loss_mlp": 1.01218808, + "epoch": 0.9033518713362393, + "flos": 23889393083520.0, + "grad_norm": 1.4974614148044234, + "language_loss": 0.69978881, + "learning_rate": 9.713019223328966e-08, + "loss": 0.72415918, + "num_input_tokens_seen": 323996125, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18334961, + "step": 15025, + "time_per_iteration": 2.89776611328125 + }, + { + "auxiliary_loss_clip": 0.01395027, + "auxiliary_loss_mlp": 0.01033179, + "balance_loss_clip": 1.23648298, + "balance_loss_mlp": 1.0136888, + "epoch": 0.9034119945889073, + "flos": 26916225471360.0, + "grad_norm": 1.6217589118594695, + "language_loss": 0.7773329, + "learning_rate": 9.70103325331717e-08, + "loss": 0.80161488, + "num_input_tokens_seen": 324017645, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.19494629, + "step": 15026, + "time_per_iteration": 2.9505929946899414 + }, + { + "auxiliary_loss_clip": 0.01395809, + "auxiliary_loss_mlp": 0.01032041, + "balance_loss_clip": 1.23795485, + "balance_loss_mlp": 1.0141362, + "epoch": 0.9034721178415752, + "flos": 20859936497280.0, + "grad_norm": 2.531609208181517, + "language_loss": 0.69276571, + "learning_rate": 9.68905449946129e-08, + "loss": 0.71704423, + "num_input_tokens_seen": 324036875, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.17883301, + "step": 15027, + "time_per_iteration": 4.281956195831299 + }, + { + "auxiliary_loss_clip": 0.01372874, + "auxiliary_loss_mlp": 0.01031842, + "balance_loss_clip": 1.22014272, + "balance_loss_mlp": 1.01320994, + "epoch": 0.9035322410942432, + "flos": 22243834523520.0, + "grad_norm": 1.570357919657266, + "language_loss": 0.76448053, + "learning_rate": 9.677082962215477e-08, + "loss": 0.78852773, + "num_input_tokens_seen": 324057045, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.18640137, + "step": 15028, + "time_per_iteration": 2.8731536865234375 + }, + { + "auxiliary_loss_clip": 0.01385492, + "auxiliary_loss_mlp": 0.0103195, + "balance_loss_clip": 1.22818744, + "balance_loss_mlp": 1.013556, + "epoch": 0.9035923643469111, + "flos": 25934944089600.0, + "grad_norm": 1.7318960744603689, + "language_loss": 0.70051277, + "learning_rate": 9.665118642033765e-08, + "loss": 0.72468722, + "num_input_tokens_seen": 324079735, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18395996, + "step": 15029, + "time_per_iteration": 2.921647071838379 + }, + { + "auxiliary_loss_clip": 0.01405163, + "auxiliary_loss_mlp": 0.01034939, + "balance_loss_clip": 1.24270773, + "balance_loss_mlp": 1.01536453, + "epoch": 0.9036524875995792, + "flos": 20349555442560.0, + "grad_norm": 2.1276346148857277, + "language_loss": 0.750489, + "learning_rate": 9.653161539369858e-08, + "loss": 0.77489007, + "num_input_tokens_seen": 324097785, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19555664, + "step": 15030, + "time_per_iteration": 2.863332509994507 + }, + { + "auxiliary_loss_clip": 0.01412097, + "auxiliary_loss_mlp": 0.01037933, + "balance_loss_clip": 1.24922347, + "balance_loss_mlp": 1.01859736, + "epoch": 0.9037126108522471, + "flos": 40129386635520.0, + "grad_norm": 1.7423074441692434, + "language_loss": 0.68818277, + "learning_rate": 9.641211654677151e-08, + "loss": 0.71268308, + "num_input_tokens_seen": 324121625, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.1932373, + "step": 15031, + "time_per_iteration": 3.0670485496520996 + }, + { + "auxiliary_loss_clip": 0.01386736, + "auxiliary_loss_mlp": 0.01034984, + "balance_loss_clip": 1.22867918, + "balance_loss_mlp": 1.01563644, + "epoch": 0.9037727341049151, + "flos": 23342653192320.0, + "grad_norm": 1.572226001459117, + "language_loss": 0.7745223, + "learning_rate": 9.629268988408723e-08, + "loss": 0.79873955, + "num_input_tokens_seen": 324142535, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.19348145, + "step": 15032, + "time_per_iteration": 2.960700273513794 + }, + { + "auxiliary_loss_clip": 0.01409025, + "auxiliary_loss_mlp": 0.0103355, + "balance_loss_clip": 1.24824607, + "balance_loss_mlp": 1.01495326, + "epoch": 0.903832857357583, + "flos": 12830180762880.0, + "grad_norm": 1.7770680348802017, + "language_loss": 0.76693094, + "learning_rate": 9.617333541017502e-08, + "loss": 0.79135674, + "num_input_tokens_seen": 324159610, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18603516, + "step": 15033, + "time_per_iteration": 2.801617383956909 + }, + { + "auxiliary_loss_clip": 0.01402147, + "auxiliary_loss_mlp": 0.0103579, + "balance_loss_clip": 1.24232554, + "balance_loss_mlp": 1.01713419, + "epoch": 0.903892980610251, + "flos": 25714121627520.0, + "grad_norm": 1.7009051357532672, + "language_loss": 0.74038076, + "learning_rate": 9.605405312956105e-08, + "loss": 0.76476014, + "num_input_tokens_seen": 324182510, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18640137, + "step": 15034, + "time_per_iteration": 2.9046735763549805 + }, + { + "auxiliary_loss_clip": 0.01388247, + "auxiliary_loss_mlp": 0.01032655, + "balance_loss_clip": 1.23117232, + "balance_loss_mlp": 1.01427317, + "epoch": 0.9039531038629189, + "flos": 14692263528960.0, + "grad_norm": 1.6553627193769427, + "language_loss": 0.64903688, + "learning_rate": 9.593484304676791e-08, + "loss": 0.67324585, + "num_input_tokens_seen": 324200555, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.18395996, + "step": 15035, + "time_per_iteration": 2.8165464401245117 + }, + { + "auxiliary_loss_clip": 0.0139976, + "auxiliary_loss_mlp": 0.01033556, + "balance_loss_clip": 1.24017739, + "balance_loss_mlp": 1.01385117, + "epoch": 0.904013227115587, + "flos": 24035642835840.0, + "grad_norm": 2.041605968614431, + "language_loss": 0.63463056, + "learning_rate": 9.581570516631643e-08, + "loss": 0.65896368, + "num_input_tokens_seen": 324220255, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.19714355, + "step": 15036, + "time_per_iteration": 2.854534387588501 + }, + { + "auxiliary_loss_clip": 0.01379152, + "auxiliary_loss_mlp": 0.0103123, + "balance_loss_clip": 1.22464466, + "balance_loss_mlp": 1.01233518, + "epoch": 0.9040733503682549, + "flos": 22866640202880.0, + "grad_norm": 1.5981831636041934, + "language_loss": 0.82825786, + "learning_rate": 9.569663949272455e-08, + "loss": 0.85236168, + "num_input_tokens_seen": 324237855, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.18896484, + "step": 15037, + "time_per_iteration": 2.838592052459717 + }, + { + "auxiliary_loss_clip": 0.01406732, + "auxiliary_loss_mlp": 0.0102902, + "balance_loss_clip": 1.24263358, + "balance_loss_mlp": 1.01029253, + "epoch": 0.9041334736209229, + "flos": 19984609733760.0, + "grad_norm": 1.603441645381469, + "language_loss": 0.68429041, + "learning_rate": 9.557764603050667e-08, + "loss": 0.70864785, + "num_input_tokens_seen": 324257050, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.18725586, + "step": 15038, + "time_per_iteration": 2.813791275024414 + }, + { + "auxiliary_loss_clip": 0.01387429, + "auxiliary_loss_mlp": 0.01033244, + "balance_loss_clip": 1.22876024, + "balance_loss_mlp": 1.0138731, + "epoch": 0.9041935968735909, + "flos": 17539609219200.0, + "grad_norm": 1.9047258911435485, + "language_loss": 0.77112275, + "learning_rate": 9.545872478417494e-08, + "loss": 0.79532945, + "num_input_tokens_seen": 324275510, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.19372559, + "step": 15039, + "time_per_iteration": 2.7967288494110107 + }, + { + "auxiliary_loss_clip": 0.01389396, + "auxiliary_loss_mlp": 0.01027638, + "balance_loss_clip": 1.23224092, + "balance_loss_mlp": 1.00848103, + "epoch": 0.9042537201262588, + "flos": 22789443294720.0, + "grad_norm": 1.482906053111994, + "language_loss": 0.70704746, + "learning_rate": 9.533987575823977e-08, + "loss": 0.7312178, + "num_input_tokens_seen": 324295150, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.19152832, + "step": 15040, + "time_per_iteration": 2.834010124206543 + }, + { + "auxiliary_loss_clip": 0.01402737, + "auxiliary_loss_mlp": 0.01031226, + "balance_loss_clip": 1.24480021, + "balance_loss_mlp": 1.01233089, + "epoch": 0.9043138433789268, + "flos": 20605288907520.0, + "grad_norm": 1.7462628048577862, + "language_loss": 0.68475431, + "learning_rate": 9.522109895720709e-08, + "loss": 0.70909393, + "num_input_tokens_seen": 324313855, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18896484, + "step": 15041, + "time_per_iteration": 2.8524210453033447 + }, + { + "auxiliary_loss_clip": 0.01387465, + "auxiliary_loss_mlp": 0.01030806, + "balance_loss_clip": 1.22862077, + "balance_loss_mlp": 1.01122034, + "epoch": 0.9043739666315948, + "flos": 32975772071040.0, + "grad_norm": 1.682550221538491, + "language_loss": 0.58080959, + "learning_rate": 9.510239438558155e-08, + "loss": 0.60499239, + "num_input_tokens_seen": 324338465, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19580078, + "step": 15042, + "time_per_iteration": 2.9535481929779053 + }, + { + "auxiliary_loss_clip": 0.01175694, + "auxiliary_loss_mlp": 0.01019903, + "balance_loss_clip": 1.08997345, + "balance_loss_mlp": 1.0003531, + "epoch": 0.9044340898842628, + "flos": 67327678045440.0, + "grad_norm": 0.7786547474183138, + "language_loss": 0.56979036, + "learning_rate": 9.498376204786351e-08, + "loss": 0.59174633, + "num_input_tokens_seen": 324398740, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.1953125, + "step": 15043, + "time_per_iteration": 3.299680233001709 + }, + { + "auxiliary_loss_clip": 0.01395811, + "auxiliary_loss_mlp": 0.01033283, + "balance_loss_clip": 1.23578882, + "balance_loss_mlp": 1.0136137, + "epoch": 0.9044942131369307, + "flos": 17722308297600.0, + "grad_norm": 1.8068257662490432, + "language_loss": 0.70780671, + "learning_rate": 9.486520194855274e-08, + "loss": 0.73209769, + "num_input_tokens_seen": 324417335, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.19665527, + "step": 15044, + "time_per_iteration": 2.8622124195098877 + }, + { + "auxiliary_loss_clip": 0.01397602, + "auxiliary_loss_mlp": 0.0103162, + "balance_loss_clip": 1.23629081, + "balance_loss_mlp": 1.01262999, + "epoch": 0.9045543363895987, + "flos": 17829213056640.0, + "grad_norm": 2.228365855559933, + "language_loss": 0.7045598, + "learning_rate": 9.474671409214407e-08, + "loss": 0.72885203, + "num_input_tokens_seen": 324433240, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.18981934, + "step": 15045, + "time_per_iteration": 2.805387258529663 + }, + { + "auxiliary_loss_clip": 0.01407631, + "auxiliary_loss_mlp": 0.01033579, + "balance_loss_clip": 1.24655294, + "balance_loss_mlp": 1.01356387, + "epoch": 0.9046144596422666, + "flos": 21883006091520.0, + "grad_norm": 1.7345568012434123, + "language_loss": 0.66614592, + "learning_rate": 9.462829848313081e-08, + "loss": 0.69055796, + "num_input_tokens_seen": 324452675, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.20019531, + "step": 15046, + "time_per_iteration": 2.9846582412719727 + }, + { + "auxiliary_loss_clip": 0.01406296, + "auxiliary_loss_mlp": 0.01034649, + "balance_loss_clip": 1.24309409, + "balance_loss_mlp": 1.01543212, + "epoch": 0.9046745828949346, + "flos": 17680746309120.0, + "grad_norm": 1.9364562854649534, + "language_loss": 0.62371111, + "learning_rate": 9.450995512600379e-08, + "loss": 0.64812052, + "num_input_tokens_seen": 324467865, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19226074, + "step": 15047, + "time_per_iteration": 2.7966630458831787 + }, + { + "auxiliary_loss_clip": 0.01404319, + "auxiliary_loss_mlp": 0.01033935, + "balance_loss_clip": 1.24677336, + "balance_loss_mlp": 1.0148735, + "epoch": 0.9047347061476025, + "flos": 25713533445120.0, + "grad_norm": 1.5395964758557645, + "language_loss": 0.71326667, + "learning_rate": 9.439168402525032e-08, + "loss": 0.73764914, + "num_input_tokens_seen": 324490430, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.19067383, + "step": 15048, + "time_per_iteration": 2.901310682296753 + }, + { + "auxiliary_loss_clip": 0.01394735, + "auxiliary_loss_mlp": 0.01035577, + "balance_loss_clip": 1.23394406, + "balance_loss_mlp": 1.01534748, + "epoch": 0.9047948294002706, + "flos": 15166376236800.0, + "grad_norm": 2.037991755486922, + "language_loss": 0.75628757, + "learning_rate": 9.427348518535483e-08, + "loss": 0.78059071, + "num_input_tokens_seen": 324506620, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.20239258, + "step": 15049, + "time_per_iteration": 2.8241183757781982 + }, + { + "auxiliary_loss_clip": 0.01393467, + "auxiliary_loss_mlp": 0.01032874, + "balance_loss_clip": 1.2358216, + "balance_loss_mlp": 1.01390767, + "epoch": 0.9048549526529385, + "flos": 21882463153920.0, + "grad_norm": 1.7496697335475913, + "language_loss": 0.76085734, + "learning_rate": 9.415535861079993e-08, + "loss": 0.78512073, + "num_input_tokens_seen": 324525505, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.1895752, + "step": 15050, + "time_per_iteration": 2.8752262592315674 + }, + { + "auxiliary_loss_clip": 0.01405042, + "auxiliary_loss_mlp": 0.01036258, + "balance_loss_clip": 1.24264717, + "balance_loss_mlp": 1.0171845, + "epoch": 0.9049150759056065, + "flos": 23556191241600.0, + "grad_norm": 1.829112515292373, + "language_loss": 0.82857478, + "learning_rate": 9.403730430606472e-08, + "loss": 0.85298789, + "num_input_tokens_seen": 324544415, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19055176, + "step": 15051, + "time_per_iteration": 2.855921745300293 + }, + { + "auxiliary_loss_clip": 0.01399292, + "auxiliary_loss_mlp": 0.01030601, + "balance_loss_clip": 1.23952818, + "balance_loss_mlp": 1.01314831, + "epoch": 0.9049751991582745, + "flos": 19655298944640.0, + "grad_norm": 1.979868117487741, + "language_loss": 0.90088952, + "learning_rate": 9.391932227562582e-08, + "loss": 0.92518842, + "num_input_tokens_seen": 324562555, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.17456055, + "step": 15052, + "time_per_iteration": 4.25718355178833 + }, + { + "auxiliary_loss_clip": 0.01409107, + "auxiliary_loss_mlp": 0.01033736, + "balance_loss_clip": 1.24558401, + "balance_loss_mlp": 1.01448393, + "epoch": 0.9050353224109424, + "flos": 15604627800960.0, + "grad_norm": 2.2019581099372734, + "language_loss": 0.774939, + "learning_rate": 9.380141252395724e-08, + "loss": 0.79936743, + "num_input_tokens_seen": 324580865, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19262695, + "step": 15053, + "time_per_iteration": 2.8298351764678955 + }, + { + "auxiliary_loss_clip": 0.01392167, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.23349404, + "balance_loss_mlp": 1.01451135, + "epoch": 0.9050954456636104, + "flos": 28195209509760.0, + "grad_norm": 2.318823445917886, + "language_loss": 0.73244053, + "learning_rate": 9.368357505553049e-08, + "loss": 0.75669479, + "num_input_tokens_seen": 324600665, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18762207, + "step": 15054, + "time_per_iteration": 2.888201951980591 + }, + { + "auxiliary_loss_clip": 0.01394248, + "auxiliary_loss_mlp": 0.01029343, + "balance_loss_clip": 1.23517895, + "balance_loss_mlp": 1.01110435, + "epoch": 0.9051555689162784, + "flos": 25740662342400.0, + "grad_norm": 1.5772345556631429, + "language_loss": 0.83562714, + "learning_rate": 9.356580987481333e-08, + "loss": 0.85986304, + "num_input_tokens_seen": 324618145, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18237305, + "step": 15055, + "time_per_iteration": 4.282470941543579 + }, + { + "auxiliary_loss_clip": 0.01390027, + "auxiliary_loss_mlp": 0.0103459, + "balance_loss_clip": 1.23247635, + "balance_loss_mlp": 1.01638675, + "epoch": 0.9052156921689464, + "flos": 23267401810560.0, + "grad_norm": 1.7522026478643404, + "language_loss": 0.85495543, + "learning_rate": 9.344811698627176e-08, + "loss": 0.87920159, + "num_input_tokens_seen": 324638165, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18212891, + "step": 15056, + "time_per_iteration": 2.854918956756592 + }, + { + "auxiliary_loss_clip": 0.01403617, + "auxiliary_loss_mlp": 0.01031374, + "balance_loss_clip": 1.24281573, + "balance_loss_mlp": 1.01303959, + "epoch": 0.9052758154216143, + "flos": 29574764035200.0, + "grad_norm": 1.84902053353987, + "language_loss": 0.73212969, + "learning_rate": 9.333049639436863e-08, + "loss": 0.75647956, + "num_input_tokens_seen": 324658560, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18347168, + "step": 15057, + "time_per_iteration": 2.897400379180908 + }, + { + "auxiliary_loss_clip": 0.0138427, + "auxiliary_loss_mlp": 0.01030484, + "balance_loss_clip": 1.22955894, + "balance_loss_mlp": 1.01083887, + "epoch": 0.9053359386742823, + "flos": 22137925150080.0, + "grad_norm": 1.4569565934955013, + "language_loss": 0.81466377, + "learning_rate": 9.321294810356418e-08, + "loss": 0.83881128, + "num_input_tokens_seen": 324679185, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.19641113, + "step": 15058, + "time_per_iteration": 2.8499104976654053 + }, + { + "auxiliary_loss_clip": 0.01177044, + "auxiliary_loss_mlp": 0.01032188, + "balance_loss_clip": 1.08957326, + "balance_loss_mlp": 1.01054001, + "epoch": 0.9053960619269502, + "flos": 67124591544960.0, + "grad_norm": 0.6699087946403369, + "language_loss": 0.51354086, + "learning_rate": 9.309547211831592e-08, + "loss": 0.53563321, + "num_input_tokens_seen": 324744830, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.21679688, + "step": 15059, + "time_per_iteration": 3.4356815814971924 + }, + { + "auxiliary_loss_clip": 0.01398423, + "auxiliary_loss_mlp": 0.0102988, + "balance_loss_clip": 1.23697972, + "balance_loss_mlp": 1.01167655, + "epoch": 0.9054561851796182, + "flos": 15823957184640.0, + "grad_norm": 1.7220357325320055, + "language_loss": 0.67530572, + "learning_rate": 9.297806844307831e-08, + "loss": 0.69958878, + "num_input_tokens_seen": 324762905, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18225098, + "step": 15060, + "time_per_iteration": 2.826357841491699 + }, + { + "auxiliary_loss_clip": 0.01410777, + "auxiliary_loss_mlp": 0.01033603, + "balance_loss_clip": 1.24877429, + "balance_loss_mlp": 1.0141716, + "epoch": 0.9055163084322861, + "flos": 17576058545280.0, + "grad_norm": 2.5026388616703183, + "language_loss": 0.64950371, + "learning_rate": 9.286073708230357e-08, + "loss": 0.67394751, + "num_input_tokens_seen": 324781905, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19445801, + "step": 15061, + "time_per_iteration": 2.811018943786621 + }, + { + "auxiliary_loss_clip": 0.01394047, + "auxiliary_loss_mlp": 0.01032586, + "balance_loss_clip": 1.23421013, + "balance_loss_mlp": 1.01280963, + "epoch": 0.9055764316849542, + "flos": 17648549994240.0, + "grad_norm": 1.6993397126738374, + "language_loss": 0.72265226, + "learning_rate": 9.274347804044058e-08, + "loss": 0.74691856, + "num_input_tokens_seen": 324799260, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19775391, + "step": 15062, + "time_per_iteration": 4.286927938461304 + }, + { + "auxiliary_loss_clip": 0.01395843, + "auxiliary_loss_mlp": 0.01032831, + "balance_loss_clip": 1.23727214, + "balance_loss_mlp": 1.0138526, + "epoch": 0.9056365549376221, + "flos": 20130542772480.0, + "grad_norm": 1.6449551996514218, + "language_loss": 0.7124967, + "learning_rate": 9.2626291321936e-08, + "loss": 0.7367835, + "num_input_tokens_seen": 324817800, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18981934, + "step": 15063, + "time_per_iteration": 2.8913259506225586 + }, + { + "auxiliary_loss_clip": 0.01381542, + "auxiliary_loss_mlp": 0.01030154, + "balance_loss_clip": 1.22672284, + "balance_loss_mlp": 1.01193929, + "epoch": 0.9056966781902901, + "flos": 27609984276480.0, + "grad_norm": 1.6383164382867228, + "language_loss": 0.72884262, + "learning_rate": 9.250917693123406e-08, + "loss": 0.75295961, + "num_input_tokens_seen": 324838445, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.18225098, + "step": 15064, + "time_per_iteration": 2.920121431350708 + }, + { + "auxiliary_loss_clip": 0.0139818, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.23630393, + "balance_loss_mlp": 1.01197267, + "epoch": 0.9057568014429581, + "flos": 25930781568000.0, + "grad_norm": 4.6766154397594715, + "language_loss": 0.71370071, + "learning_rate": 9.23921348727752e-08, + "loss": 0.73798859, + "num_input_tokens_seen": 324859895, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.1862793, + "step": 15065, + "time_per_iteration": 2.8761324882507324 + }, + { + "auxiliary_loss_clip": 0.01401478, + "auxiliary_loss_mlp": 0.0103569, + "balance_loss_clip": 1.24203408, + "balance_loss_mlp": 1.01666379, + "epoch": 0.905816924695626, + "flos": 22940805709440.0, + "grad_norm": 1.663514413521259, + "language_loss": 0.64134359, + "learning_rate": 9.227516515099743e-08, + "loss": 0.66571534, + "num_input_tokens_seen": 324879580, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19030762, + "step": 15066, + "time_per_iteration": 2.9378576278686523 + }, + { + "auxiliary_loss_clip": 0.01417144, + "auxiliary_loss_mlp": 0.01033395, + "balance_loss_clip": 1.2494204, + "balance_loss_mlp": 1.01347542, + "epoch": 0.905877047948294, + "flos": 22165913698560.0, + "grad_norm": 2.052818970766034, + "language_loss": 0.80539751, + "learning_rate": 9.215826777033675e-08, + "loss": 0.82990289, + "num_input_tokens_seen": 324898950, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.19909668, + "step": 15067, + "time_per_iteration": 2.875018358230591 + }, + { + "auxiliary_loss_clip": 0.01403489, + "auxiliary_loss_mlp": 0.01033198, + "balance_loss_clip": 1.24167418, + "balance_loss_mlp": 1.01251531, + "epoch": 0.905937171200962, + "flos": 15313259416320.0, + "grad_norm": 1.700427815820377, + "language_loss": 0.70498991, + "learning_rate": 9.204144273522563e-08, + "loss": 0.72935677, + "num_input_tokens_seen": 324917455, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.20690918, + "step": 15068, + "time_per_iteration": 2.9475185871124268 + }, + { + "auxiliary_loss_clip": 0.01382269, + "auxiliary_loss_mlp": 0.01034986, + "balance_loss_clip": 1.22716057, + "balance_loss_mlp": 1.01652062, + "epoch": 0.90599729445363, + "flos": 19472192663040.0, + "grad_norm": 2.593862243588848, + "language_loss": 0.8656615, + "learning_rate": 9.19246900500943e-08, + "loss": 0.88983411, + "num_input_tokens_seen": 324934495, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.18481445, + "step": 15069, + "time_per_iteration": 2.801565647125244 + }, + { + "auxiliary_loss_clip": 0.01410455, + "auxiliary_loss_mlp": 0.01030487, + "balance_loss_clip": 1.24716294, + "balance_loss_mlp": 1.01164007, + "epoch": 0.9060574177062979, + "flos": 23743595779200.0, + "grad_norm": 2.915105026426814, + "language_loss": 0.60137868, + "learning_rate": 9.180800971936987e-08, + "loss": 0.62578809, + "num_input_tokens_seen": 324953230, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.18835449, + "step": 15070, + "time_per_iteration": 2.8373055458068848 + }, + { + "auxiliary_loss_clip": 0.0140741, + "auxiliary_loss_mlp": 0.0102903, + "balance_loss_clip": 1.2433883, + "balance_loss_mlp": 1.00959897, + "epoch": 0.9061175409589659, + "flos": 17320008366720.0, + "grad_norm": 5.5684998499968295, + "language_loss": 0.82136333, + "learning_rate": 9.169140174747724e-08, + "loss": 0.84572768, + "num_input_tokens_seen": 324969880, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19433594, + "step": 15071, + "time_per_iteration": 2.8442718982696533 + }, + { + "auxiliary_loss_clip": 0.01404383, + "auxiliary_loss_mlp": 0.01034043, + "balance_loss_clip": 1.241907, + "balance_loss_mlp": 1.01454008, + "epoch": 0.9061776642116338, + "flos": 17785615052160.0, + "grad_norm": 1.8525092867849862, + "language_loss": 0.62575907, + "learning_rate": 9.157486613883758e-08, + "loss": 0.65014327, + "num_input_tokens_seen": 324987005, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19506836, + "step": 15072, + "time_per_iteration": 2.8864874839782715 + }, + { + "auxiliary_loss_clip": 0.01397335, + "auxiliary_loss_mlp": 0.01034426, + "balance_loss_clip": 1.23783302, + "balance_loss_mlp": 1.01488745, + "epoch": 0.9062377874643018, + "flos": 42793037861760.0, + "grad_norm": 2.745093344131427, + "language_loss": 0.73930812, + "learning_rate": 9.145840289787021e-08, + "loss": 0.76362568, + "num_input_tokens_seen": 325010700, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19555664, + "step": 15073, + "time_per_iteration": 3.0293798446655273 + }, + { + "auxiliary_loss_clip": 0.01386535, + "auxiliary_loss_mlp": 0.0102997, + "balance_loss_clip": 1.23024511, + "balance_loss_mlp": 1.01117074, + "epoch": 0.9062979107169697, + "flos": 16370018403840.0, + "grad_norm": 1.8117981518229371, + "language_loss": 0.817756, + "learning_rate": 9.134201202899161e-08, + "loss": 0.84192103, + "num_input_tokens_seen": 325028760, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.18811035, + "step": 15074, + "time_per_iteration": 2.852825880050659 + }, + { + "auxiliary_loss_clip": 0.01177497, + "auxiliary_loss_mlp": 0.01021244, + "balance_loss_clip": 1.08916712, + "balance_loss_mlp": 0.99873775, + "epoch": 0.9063580339696378, + "flos": 69348588618240.0, + "grad_norm": 0.7445175106688425, + "language_loss": 0.52425981, + "learning_rate": 9.122569353661513e-08, + "loss": 0.54624718, + "num_input_tokens_seen": 325093545, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.22460938, + "step": 15075, + "time_per_iteration": 3.3948209285736084 + }, + { + "auxiliary_loss_clip": 0.01174469, + "auxiliary_loss_mlp": 0.01017174, + "balance_loss_clip": 1.08849669, + "balance_loss_mlp": 0.99791002, + "epoch": 0.9064181572223057, + "flos": 58823673868800.0, + "grad_norm": 0.7244433172132912, + "language_loss": 0.6220957, + "learning_rate": 9.11094474251517e-08, + "loss": 0.64401209, + "num_input_tokens_seen": 325152295, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.19238281, + "step": 15076, + "time_per_iteration": 3.2045018672943115 + }, + { + "auxiliary_loss_clip": 0.01387277, + "auxiliary_loss_mlp": 0.01033273, + "balance_loss_clip": 1.23003697, + "balance_loss_mlp": 1.01505756, + "epoch": 0.9064782804749737, + "flos": 21772798462080.0, + "grad_norm": 2.165120878380871, + "language_loss": 0.82767075, + "learning_rate": 9.09932736990091e-08, + "loss": 0.85187626, + "num_input_tokens_seen": 325169705, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18225098, + "step": 15077, + "time_per_iteration": 2.8462929725646973 + }, + { + "auxiliary_loss_clip": 0.01392999, + "auxiliary_loss_mlp": 0.01029907, + "balance_loss_clip": 1.23667955, + "balance_loss_mlp": 1.01190591, + "epoch": 0.9065384037276417, + "flos": 21407264570880.0, + "grad_norm": 2.0473128233237476, + "language_loss": 0.84924424, + "learning_rate": 9.08771723625934e-08, + "loss": 0.87347329, + "num_input_tokens_seen": 325189175, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.18005371, + "step": 15078, + "time_per_iteration": 2.8452298641204834 + }, + { + "auxiliary_loss_clip": 0.01372864, + "auxiliary_loss_mlp": 0.01030636, + "balance_loss_clip": 1.21967745, + "balance_loss_mlp": 1.01161027, + "epoch": 0.9065985269803096, + "flos": 38296197313920.0, + "grad_norm": 1.6681643765382916, + "language_loss": 0.66358525, + "learning_rate": 9.076114342030617e-08, + "loss": 0.68762028, + "num_input_tokens_seen": 325211020, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.19030762, + "step": 15079, + "time_per_iteration": 3.0046846866607666 + }, + { + "auxiliary_loss_clip": 0.01387634, + "auxiliary_loss_mlp": 0.01031962, + "balance_loss_clip": 1.22904491, + "balance_loss_mlp": 1.01357985, + "epoch": 0.9066586502329776, + "flos": 44833838163840.0, + "grad_norm": 2.4334130044853435, + "language_loss": 0.71264195, + "learning_rate": 9.064518687654765e-08, + "loss": 0.73683798, + "num_input_tokens_seen": 325236970, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18359375, + "step": 15080, + "time_per_iteration": 3.053250551223755 + }, + { + "auxiliary_loss_clip": 0.01423978, + "auxiliary_loss_mlp": 0.01032029, + "balance_loss_clip": 1.25902438, + "balance_loss_mlp": 1.01321769, + "epoch": 0.9067187734856456, + "flos": 18633043756800.0, + "grad_norm": 2.823165874955985, + "language_loss": 0.71764028, + "learning_rate": 9.052930273571547e-08, + "loss": 0.74220026, + "num_input_tokens_seen": 325252670, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.18823242, + "step": 15081, + "time_per_iteration": 2.8097054958343506 + }, + { + "auxiliary_loss_clip": 0.01387142, + "auxiliary_loss_mlp": 0.01034207, + "balance_loss_clip": 1.23059654, + "balance_loss_mlp": 1.01631391, + "epoch": 0.9067788967383136, + "flos": 22757880407040.0, + "grad_norm": 20.849941928366338, + "language_loss": 0.74312508, + "learning_rate": 9.04134910022032e-08, + "loss": 0.76733863, + "num_input_tokens_seen": 325273860, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.17895508, + "step": 15082, + "time_per_iteration": 2.8673455715179443 + }, + { + "auxiliary_loss_clip": 0.01390831, + "auxiliary_loss_mlp": 0.01036307, + "balance_loss_clip": 1.23307157, + "balance_loss_mlp": 1.01703095, + "epoch": 0.9068390199909815, + "flos": 27681751808640.0, + "grad_norm": 1.7421827828429575, + "language_loss": 0.78724396, + "learning_rate": 9.029775168040266e-08, + "loss": 0.81151533, + "num_input_tokens_seen": 325294140, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.19274902, + "step": 15083, + "time_per_iteration": 2.874119997024536 + }, + { + "auxiliary_loss_clip": 0.01384554, + "auxiliary_loss_mlp": 0.01035195, + "balance_loss_clip": 1.22937846, + "balance_loss_mlp": 1.01641941, + "epoch": 0.9068991432436495, + "flos": 24254836485120.0, + "grad_norm": 1.5302329239129722, + "language_loss": 0.69440472, + "learning_rate": 9.01820847747028e-08, + "loss": 0.7186023, + "num_input_tokens_seen": 325313130, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.18762207, + "step": 15084, + "time_per_iteration": 2.8608689308166504 + }, + { + "auxiliary_loss_clip": 0.01398334, + "auxiliary_loss_mlp": 0.01032842, + "balance_loss_clip": 1.23905897, + "balance_loss_mlp": 1.01442409, + "epoch": 0.9069592664963174, + "flos": 28043756605440.0, + "grad_norm": 2.066108753794254, + "language_loss": 0.6734342, + "learning_rate": 9.006649028948965e-08, + "loss": 0.69774598, + "num_input_tokens_seen": 325334880, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18408203, + "step": 15085, + "time_per_iteration": 2.913005828857422 + }, + { + "auxiliary_loss_clip": 0.01176187, + "auxiliary_loss_mlp": 0.01023287, + "balance_loss_clip": 1.09106326, + "balance_loss_mlp": 1.00516725, + "epoch": 0.9070193897489854, + "flos": 68806445713920.0, + "grad_norm": 0.7958609430588538, + "language_loss": 0.61353821, + "learning_rate": 8.995096822914638e-08, + "loss": 0.63553292, + "num_input_tokens_seen": 325394175, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.18164062, + "step": 15086, + "time_per_iteration": 3.350461721420288 + }, + { + "auxiliary_loss_clip": 0.01387005, + "auxiliary_loss_mlp": 0.01031422, + "balance_loss_clip": 1.2295773, + "balance_loss_mlp": 1.01246798, + "epoch": 0.9070795130016533, + "flos": 23452317884160.0, + "grad_norm": 1.50717260446102, + "language_loss": 0.73630655, + "learning_rate": 8.983551859805416e-08, + "loss": 0.76049083, + "num_input_tokens_seen": 325415020, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.18945312, + "step": 15087, + "time_per_iteration": 4.326995134353638 + }, + { + "auxiliary_loss_clip": 0.01391989, + "auxiliary_loss_mlp": 0.01029926, + "balance_loss_clip": 1.23322868, + "balance_loss_mlp": 1.0115205, + "epoch": 0.9071396362543214, + "flos": 18925678995840.0, + "grad_norm": 4.160169684458599, + "language_loss": 0.77507055, + "learning_rate": 8.972014140059058e-08, + "loss": 0.7992897, + "num_input_tokens_seen": 325433595, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18408203, + "step": 15088, + "time_per_iteration": 2.8666350841522217 + }, + { + "auxiliary_loss_clip": 0.01383067, + "auxiliary_loss_mlp": 0.01036836, + "balance_loss_clip": 1.22883642, + "balance_loss_mlp": 1.01840675, + "epoch": 0.9071997595069893, + "flos": 25239601716480.0, + "grad_norm": 2.2778523795424976, + "language_loss": 0.74015784, + "learning_rate": 8.960483664113038e-08, + "loss": 0.76435691, + "num_input_tokens_seen": 325451605, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.18444824, + "step": 15089, + "time_per_iteration": 3.049724578857422 + }, + { + "auxiliary_loss_clip": 0.01382966, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.22910237, + "balance_loss_mlp": 1.01454902, + "epoch": 0.9072598827596573, + "flos": 24355678440960.0, + "grad_norm": 2.067233158013782, + "language_loss": 0.76130581, + "learning_rate": 8.948960432404628e-08, + "loss": 0.78546667, + "num_input_tokens_seen": 325470645, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.18579102, + "step": 15090, + "time_per_iteration": 2.869291067123413 + }, + { + "auxiliary_loss_clip": 0.01408701, + "auxiliary_loss_mlp": 0.01032758, + "balance_loss_clip": 1.24615693, + "balance_loss_mlp": 1.01349354, + "epoch": 0.9073200060123253, + "flos": 22685343713280.0, + "grad_norm": 2.021321107565697, + "language_loss": 0.78149235, + "learning_rate": 8.93744444537079e-08, + "loss": 0.80590695, + "num_input_tokens_seen": 325488070, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19250488, + "step": 15091, + "time_per_iteration": 4.293613910675049 + }, + { + "auxiliary_loss_clip": 0.01380755, + "auxiliary_loss_mlp": 0.01030082, + "balance_loss_clip": 1.22708213, + "balance_loss_mlp": 1.01262999, + "epoch": 0.9073801292649932, + "flos": 23705834353920.0, + "grad_norm": 1.6038848613057661, + "language_loss": 0.85968322, + "learning_rate": 8.925935703448217e-08, + "loss": 0.88379157, + "num_input_tokens_seen": 325509285, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.17456055, + "step": 15092, + "time_per_iteration": 2.9290308952331543 + }, + { + "auxiliary_loss_clip": 0.01406769, + "auxiliary_loss_mlp": 0.01035748, + "balance_loss_clip": 1.24774182, + "balance_loss_mlp": 1.01789045, + "epoch": 0.9074402525176612, + "flos": 25386620630400.0, + "grad_norm": 1.6298829534969845, + "language_loss": 0.79457545, + "learning_rate": 8.914434207073296e-08, + "loss": 0.8190006, + "num_input_tokens_seen": 325529360, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.1784668, + "step": 15093, + "time_per_iteration": 2.884214162826538 + }, + { + "auxiliary_loss_clip": 0.01175474, + "auxiliary_loss_mlp": 0.01019767, + "balance_loss_clip": 1.08871436, + "balance_loss_mlp": 1.00002599, + "epoch": 0.9075003757703292, + "flos": 67677195277440.0, + "grad_norm": 0.7414614047336038, + "language_loss": 0.57024771, + "learning_rate": 8.902939956682188e-08, + "loss": 0.59220016, + "num_input_tokens_seen": 325583565, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.19726562, + "step": 15094, + "time_per_iteration": 3.2655086517333984 + }, + { + "auxiliary_loss_clip": 0.01410521, + "auxiliary_loss_mlp": 0.01034946, + "balance_loss_clip": 1.24817979, + "balance_loss_mlp": 1.01525295, + "epoch": 0.9075604990229972, + "flos": 22463344886400.0, + "grad_norm": 1.8834012573745251, + "language_loss": 0.72232997, + "learning_rate": 8.891452952710742e-08, + "loss": 0.74678469, + "num_input_tokens_seen": 325603690, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19689941, + "step": 15095, + "time_per_iteration": 2.8991854190826416 + }, + { + "auxiliary_loss_clip": 0.01394956, + "auxiliary_loss_mlp": 0.01034362, + "balance_loss_clip": 1.2368381, + "balance_loss_mlp": 1.01545596, + "epoch": 0.9076206222756651, + "flos": 19546086700800.0, + "grad_norm": 1.8249169883387788, + "language_loss": 0.74896806, + "learning_rate": 8.879973195594526e-08, + "loss": 0.77326131, + "num_input_tokens_seen": 325622255, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18920898, + "step": 15096, + "time_per_iteration": 4.249268054962158 + }, + { + "auxiliary_loss_clip": 0.01406057, + "auxiliary_loss_mlp": 0.01035525, + "balance_loss_clip": 1.24374008, + "balance_loss_mlp": 1.01579547, + "epoch": 0.9076807455283331, + "flos": 30128924073600.0, + "grad_norm": 1.988444705193047, + "language_loss": 0.57539654, + "learning_rate": 8.868500685768898e-08, + "loss": 0.59981239, + "num_input_tokens_seen": 325640165, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19750977, + "step": 15097, + "time_per_iteration": 4.275999069213867 + }, + { + "auxiliary_loss_clip": 0.01384565, + "auxiliary_loss_mlp": 0.01026814, + "balance_loss_clip": 1.22740507, + "balance_loss_mlp": 1.00913501, + "epoch": 0.907740868781001, + "flos": 18706756815360.0, + "grad_norm": 1.5827266778627698, + "language_loss": 0.80199611, + "learning_rate": 8.857035423668935e-08, + "loss": 0.82610989, + "num_input_tokens_seen": 325659455, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.17687988, + "step": 15098, + "time_per_iteration": 2.8163399696350098 + }, + { + "auxiliary_loss_clip": 0.01401728, + "auxiliary_loss_mlp": 0.01029618, + "balance_loss_clip": 1.2391082, + "balance_loss_mlp": 1.01111627, + "epoch": 0.907800992033669, + "flos": 22649256345600.0, + "grad_norm": 1.6406146409155598, + "language_loss": 0.6653415, + "learning_rate": 8.845577409729266e-08, + "loss": 0.68965495, + "num_input_tokens_seen": 325678095, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.18493652, + "step": 15099, + "time_per_iteration": 2.8763582706451416 + }, + { + "auxiliary_loss_clip": 0.01409596, + "auxiliary_loss_mlp": 0.01037471, + "balance_loss_clip": 1.24734926, + "balance_loss_mlp": 1.01792073, + "epoch": 0.907861115286337, + "flos": 21297464144640.0, + "grad_norm": 1.918406221125474, + "language_loss": 0.71257156, + "learning_rate": 8.834126644384477e-08, + "loss": 0.73704219, + "num_input_tokens_seen": 325695825, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19543457, + "step": 15100, + "time_per_iteration": 2.8390393257141113 + }, + { + "auxiliary_loss_clip": 0.01177678, + "auxiliary_loss_mlp": 0.01014051, + "balance_loss_clip": 1.09142375, + "balance_loss_mlp": 0.9963131, + "epoch": 0.907921238539005, + "flos": 69771004502400.0, + "grad_norm": 0.6265732015261626, + "language_loss": 0.53442717, + "learning_rate": 8.822683128068775e-08, + "loss": 0.55634451, + "num_input_tokens_seen": 325764515, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.17773438, + "step": 15101, + "time_per_iteration": 3.4344167709350586 + }, + { + "auxiliary_loss_clip": 0.01398785, + "auxiliary_loss_mlp": 0.01034941, + "balance_loss_clip": 1.23967838, + "balance_loss_mlp": 1.01541424, + "epoch": 0.9079813617916729, + "flos": 23487907559040.0, + "grad_norm": 1.8665340191141773, + "language_loss": 0.69185251, + "learning_rate": 8.811246861216081e-08, + "loss": 0.71618974, + "num_input_tokens_seen": 325783235, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.1953125, + "step": 15102, + "time_per_iteration": 2.9113478660583496 + }, + { + "auxiliary_loss_clip": 0.01388476, + "auxiliary_loss_mlp": 0.01031796, + "balance_loss_clip": 1.23101532, + "balance_loss_mlp": 1.01331854, + "epoch": 0.9080414850443409, + "flos": 22940262771840.0, + "grad_norm": 1.8624228031326542, + "language_loss": 0.79834825, + "learning_rate": 8.799817844260049e-08, + "loss": 0.82255095, + "num_input_tokens_seen": 325800195, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18469238, + "step": 15103, + "time_per_iteration": 2.8403337001800537 + }, + { + "auxiliary_loss_clip": 0.01392341, + "auxiliary_loss_mlp": 0.01036026, + "balance_loss_clip": 1.2320838, + "balance_loss_mlp": 1.01711953, + "epoch": 0.9081016082970089, + "flos": 26188234335360.0, + "grad_norm": 1.7893076026054535, + "language_loss": 0.7281158, + "learning_rate": 8.78839607763413e-08, + "loss": 0.75239944, + "num_input_tokens_seen": 325820215, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18896484, + "step": 15104, + "time_per_iteration": 2.905101776123047 + }, + { + "auxiliary_loss_clip": 0.01386617, + "auxiliary_loss_mlp": 0.01030739, + "balance_loss_clip": 1.22999489, + "balance_loss_mlp": 1.01217794, + "epoch": 0.9081617315496768, + "flos": 24472853769600.0, + "grad_norm": 1.7277364466417617, + "language_loss": 0.77955115, + "learning_rate": 8.77698156177138e-08, + "loss": 0.80372471, + "num_input_tokens_seen": 325838415, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.18554688, + "step": 15105, + "time_per_iteration": 2.936025619506836 + }, + { + "auxiliary_loss_clip": 0.01397173, + "auxiliary_loss_mlp": 0.01035027, + "balance_loss_clip": 1.23772943, + "balance_loss_mlp": 1.0164063, + "epoch": 0.9082218548023449, + "flos": 24755761376640.0, + "grad_norm": 1.9599480610510869, + "language_loss": 0.74068284, + "learning_rate": 8.765574297104628e-08, + "loss": 0.76500481, + "num_input_tokens_seen": 325855580, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18603516, + "step": 15106, + "time_per_iteration": 2.9089343547821045 + }, + { + "auxiliary_loss_clip": 0.01403436, + "auxiliary_loss_mlp": 0.01036096, + "balance_loss_clip": 1.24262667, + "balance_loss_mlp": 1.01609266, + "epoch": 0.9082819780550128, + "flos": 24430839333120.0, + "grad_norm": 1.777705639420387, + "language_loss": 0.81251121, + "learning_rate": 8.754174284066462e-08, + "loss": 0.83690655, + "num_input_tokens_seen": 325874890, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19995117, + "step": 15107, + "time_per_iteration": 2.8594207763671875 + }, + { + "auxiliary_loss_clip": 0.01176287, + "auxiliary_loss_mlp": 0.010159, + "balance_loss_clip": 1.09005594, + "balance_loss_mlp": 0.99511021, + "epoch": 0.9083421013076808, + "flos": 59642028149760.0, + "grad_norm": 0.8099126663880823, + "language_loss": 0.59637177, + "learning_rate": 8.742781523089205e-08, + "loss": 0.61829364, + "num_input_tokens_seen": 325935835, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.20800781, + "step": 15108, + "time_per_iteration": 3.324327230453491 + }, + { + "auxiliary_loss_clip": 0.01397526, + "auxiliary_loss_mlp": 0.01031056, + "balance_loss_clip": 1.23574018, + "balance_loss_mlp": 1.01235175, + "epoch": 0.9084022245603487, + "flos": 33633307774080.0, + "grad_norm": 1.64750093921466, + "language_loss": 0.74567276, + "learning_rate": 8.73139601460482e-08, + "loss": 0.76995862, + "num_input_tokens_seen": 325958035, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18725586, + "step": 15109, + "time_per_iteration": 2.9706268310546875 + }, + { + "auxiliary_loss_clip": 0.01385573, + "auxiliary_loss_mlp": 0.01030672, + "balance_loss_clip": 1.2296524, + "balance_loss_mlp": 1.01152742, + "epoch": 0.9084623478130167, + "flos": 24982194193920.0, + "grad_norm": 1.558963460634957, + "language_loss": 0.72656667, + "learning_rate": 8.720017759045073e-08, + "loss": 0.75072908, + "num_input_tokens_seen": 325979870, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.19152832, + "step": 15110, + "time_per_iteration": 2.893172264099121 + }, + { + "auxiliary_loss_clip": 0.01392143, + "auxiliary_loss_mlp": 0.01034837, + "balance_loss_clip": 1.23514962, + "balance_loss_mlp": 1.01585901, + "epoch": 0.9085224710656846, + "flos": 31472843679360.0, + "grad_norm": 1.6951084814385649, + "language_loss": 0.69459844, + "learning_rate": 8.708646756841421e-08, + "loss": 0.71886832, + "num_input_tokens_seen": 325998245, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.18969727, + "step": 15111, + "time_per_iteration": 2.9517035484313965 + }, + { + "auxiliary_loss_clip": 0.01176566, + "auxiliary_loss_mlp": 0.01034733, + "balance_loss_clip": 1.09072483, + "balance_loss_mlp": 1.01012802, + "epoch": 0.9085825943183526, + "flos": 64945622327040.0, + "grad_norm": 0.6965448566653105, + "language_loss": 0.51862007, + "learning_rate": 8.697283008425026e-08, + "loss": 0.54073298, + "num_input_tokens_seen": 326061770, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.24609375, + "step": 15112, + "time_per_iteration": 3.357377290725708 + }, + { + "auxiliary_loss_clip": 0.0139478, + "auxiliary_loss_mlp": 0.0103239, + "balance_loss_clip": 1.23540187, + "balance_loss_mlp": 1.01397192, + "epoch": 0.9086427175710206, + "flos": 18962580769920.0, + "grad_norm": 3.051126959636042, + "language_loss": 0.71048641, + "learning_rate": 8.685926514226837e-08, + "loss": 0.73475814, + "num_input_tokens_seen": 326080945, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.1842041, + "step": 15113, + "time_per_iteration": 2.8245859146118164 + }, + { + "auxiliary_loss_clip": 0.01395313, + "auxiliary_loss_mlp": 0.01030766, + "balance_loss_clip": 1.23686934, + "balance_loss_mlp": 1.01203823, + "epoch": 0.9087028408236886, + "flos": 34026332520960.0, + "grad_norm": 1.9315613141334236, + "language_loss": 0.79869199, + "learning_rate": 8.674577274677508e-08, + "loss": 0.82295281, + "num_input_tokens_seen": 326100630, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18725586, + "step": 15114, + "time_per_iteration": 2.942697048187256 + }, + { + "auxiliary_loss_clip": 0.01416822, + "auxiliary_loss_mlp": 0.01035371, + "balance_loss_clip": 1.25215387, + "balance_loss_mlp": 1.01543975, + "epoch": 0.9087629640763565, + "flos": 21954592644480.0, + "grad_norm": 31.462972631070038, + "language_loss": 0.71674025, + "learning_rate": 8.663235290207405e-08, + "loss": 0.7412622, + "num_input_tokens_seen": 326120145, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.19934082, + "step": 15115, + "time_per_iteration": 2.837405204772949 + }, + { + "auxiliary_loss_clip": 0.01417133, + "auxiliary_loss_mlp": 0.01037053, + "balance_loss_clip": 1.25197446, + "balance_loss_mlp": 1.0172286, + "epoch": 0.9088230873290245, + "flos": 21773069930880.0, + "grad_norm": 1.5427004004490101, + "language_loss": 0.66014564, + "learning_rate": 8.651900561246561e-08, + "loss": 0.68468755, + "num_input_tokens_seen": 326140715, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19824219, + "step": 15116, + "time_per_iteration": 2.853137969970703 + }, + { + "auxiliary_loss_clip": 0.01381039, + "auxiliary_loss_mlp": 0.01034012, + "balance_loss_clip": 1.22590947, + "balance_loss_mlp": 1.01414025, + "epoch": 0.9088832105816925, + "flos": 21550663900800.0, + "grad_norm": 1.9310747254237142, + "language_loss": 0.69912398, + "learning_rate": 8.640573088224812e-08, + "loss": 0.72327447, + "num_input_tokens_seen": 326159130, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.19885254, + "step": 15117, + "time_per_iteration": 2.8283915519714355 + }, + { + "auxiliary_loss_clip": 0.01392907, + "auxiliary_loss_mlp": 0.01031511, + "balance_loss_clip": 1.23455811, + "balance_loss_mlp": 1.01271152, + "epoch": 0.9089433338343604, + "flos": 26008340434560.0, + "grad_norm": 1.4728157815752887, + "language_loss": 0.75456429, + "learning_rate": 8.629252871571745e-08, + "loss": 0.77880853, + "num_input_tokens_seen": 326181375, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18798828, + "step": 15118, + "time_per_iteration": 2.8804609775543213 + }, + { + "auxiliary_loss_clip": 0.01418688, + "auxiliary_loss_mlp": 0.01036089, + "balance_loss_clip": 1.25196087, + "balance_loss_mlp": 1.01614499, + "epoch": 0.9090034570870285, + "flos": 21188251900800.0, + "grad_norm": 2.074008027988038, + "language_loss": 0.73328334, + "learning_rate": 8.617939911716554e-08, + "loss": 0.7578311, + "num_input_tokens_seen": 326199740, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.19946289, + "step": 15119, + "time_per_iteration": 2.870600700378418 + }, + { + "auxiliary_loss_clip": 0.01421046, + "auxiliary_loss_mlp": 0.01031731, + "balance_loss_clip": 1.25583291, + "balance_loss_mlp": 1.01238382, + "epoch": 0.9090635803396964, + "flos": 16149919858560.0, + "grad_norm": 2.3041320153256444, + "language_loss": 0.71708417, + "learning_rate": 8.60663420908827e-08, + "loss": 0.74161196, + "num_input_tokens_seen": 326214350, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19335938, + "step": 15120, + "time_per_iteration": 2.779383659362793 + }, + { + "auxiliary_loss_clip": 0.01407924, + "auxiliary_loss_mlp": 0.01032174, + "balance_loss_clip": 1.24680614, + "balance_loss_mlp": 1.01200402, + "epoch": 0.9091237035923644, + "flos": 20600402469120.0, + "grad_norm": 2.0967677456509426, + "language_loss": 0.66720223, + "learning_rate": 8.595335764115596e-08, + "loss": 0.6916033, + "num_input_tokens_seen": 326234580, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.20166016, + "step": 15121, + "time_per_iteration": 2.8634984493255615 + }, + { + "auxiliary_loss_clip": 0.01395472, + "auxiliary_loss_mlp": 0.01036362, + "balance_loss_clip": 1.23638558, + "balance_loss_mlp": 1.01720548, + "epoch": 0.9091838268450323, + "flos": 52245629412480.0, + "grad_norm": 2.0652921110401614, + "language_loss": 0.71405101, + "learning_rate": 8.58404457722699e-08, + "loss": 0.73836935, + "num_input_tokens_seen": 326259080, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.19140625, + "step": 15122, + "time_per_iteration": 4.5781755447387695 + }, + { + "auxiliary_loss_clip": 0.01395746, + "auxiliary_loss_mlp": 0.01032166, + "balance_loss_clip": 1.23803425, + "balance_loss_mlp": 1.01360548, + "epoch": 0.9092439500977003, + "flos": 20569653987840.0, + "grad_norm": 1.479356186498681, + "language_loss": 0.7507956, + "learning_rate": 8.572760648850575e-08, + "loss": 0.77507472, + "num_input_tokens_seen": 326280175, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18566895, + "step": 15123, + "time_per_iteration": 2.9025211334228516 + }, + { + "auxiliary_loss_clip": 0.01385655, + "auxiliary_loss_mlp": 0.01035503, + "balance_loss_clip": 1.22939682, + "balance_loss_mlp": 1.01595259, + "epoch": 0.9093040733503682, + "flos": 28628665125120.0, + "grad_norm": 1.9464265572336512, + "language_loss": 0.76597512, + "learning_rate": 8.561483979414253e-08, + "loss": 0.79018664, + "num_input_tokens_seen": 326297990, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.1953125, + "step": 15124, + "time_per_iteration": 2.932560443878174 + }, + { + "auxiliary_loss_clip": 0.01394165, + "auxiliary_loss_mlp": 0.01036032, + "balance_loss_clip": 1.23685873, + "balance_loss_mlp": 1.01339412, + "epoch": 0.9093641966030362, + "flos": 23450508092160.0, + "grad_norm": 4.975815988473684, + "language_loss": 0.73851418, + "learning_rate": 8.55021456934566e-08, + "loss": 0.76281607, + "num_input_tokens_seen": 326316735, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.22631836, + "step": 15125, + "time_per_iteration": 4.310215711593628 + }, + { + "auxiliary_loss_clip": 0.01385232, + "auxiliary_loss_mlp": 0.01033989, + "balance_loss_clip": 1.22997487, + "balance_loss_mlp": 1.01498687, + "epoch": 0.9094243198557042, + "flos": 16808903395200.0, + "grad_norm": 1.6262684825920743, + "language_loss": 0.79840219, + "learning_rate": 8.538952419072143e-08, + "loss": 0.8225944, + "num_input_tokens_seen": 326334370, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.19006348, + "step": 15126, + "time_per_iteration": 2.810014009475708 + }, + { + "auxiliary_loss_clip": 0.01397978, + "auxiliary_loss_mlp": 0.01034534, + "balance_loss_clip": 1.24061596, + "balance_loss_mlp": 1.01584148, + "epoch": 0.9094844431083722, + "flos": 24282236851200.0, + "grad_norm": 1.677925969579953, + "language_loss": 0.76517117, + "learning_rate": 8.527697529020694e-08, + "loss": 0.7894963, + "num_input_tokens_seen": 326353435, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.18676758, + "step": 15127, + "time_per_iteration": 2.9155213832855225 + }, + { + "auxiliary_loss_clip": 0.01397082, + "auxiliary_loss_mlp": 0.0103708, + "balance_loss_clip": 1.2361747, + "balance_loss_mlp": 1.01727974, + "epoch": 0.9095445663610401, + "flos": 21954683134080.0, + "grad_norm": 2.039281886024452, + "language_loss": 0.63612115, + "learning_rate": 8.516449899618173e-08, + "loss": 0.6604628, + "num_input_tokens_seen": 326371810, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19799805, + "step": 15128, + "time_per_iteration": 2.83559250831604 + }, + { + "auxiliary_loss_clip": 0.01386187, + "auxiliary_loss_mlp": 0.01032908, + "balance_loss_clip": 1.22916317, + "balance_loss_mlp": 1.01371527, + "epoch": 0.9096046896137081, + "flos": 19802544082560.0, + "grad_norm": 4.8008102315739904, + "language_loss": 0.77268654, + "learning_rate": 8.505209531291013e-08, + "loss": 0.7968775, + "num_input_tokens_seen": 326391380, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.1920166, + "step": 15129, + "time_per_iteration": 2.8571431636810303 + }, + { + "auxiliary_loss_clip": 0.01393431, + "auxiliary_loss_mlp": 0.0103271, + "balance_loss_clip": 1.23405313, + "balance_loss_mlp": 1.01379204, + "epoch": 0.909664812866376, + "flos": 22648351449600.0, + "grad_norm": 1.866536889574983, + "language_loss": 0.83965182, + "learning_rate": 8.49397642446552e-08, + "loss": 0.86391318, + "num_input_tokens_seen": 326408800, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18908691, + "step": 15130, + "time_per_iteration": 2.8612613677978516 + }, + { + "auxiliary_loss_clip": 0.01402034, + "auxiliary_loss_mlp": 0.01031436, + "balance_loss_clip": 1.2415942, + "balance_loss_mlp": 1.01275563, + "epoch": 0.909724936119044, + "flos": 39865011413760.0, + "grad_norm": 8.068537886758703, + "language_loss": 0.75529552, + "learning_rate": 8.482750579567644e-08, + "loss": 0.77963024, + "num_input_tokens_seen": 326431565, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18676758, + "step": 15131, + "time_per_iteration": 3.097151041030884 + }, + { + "auxiliary_loss_clip": 0.014019, + "auxiliary_loss_mlp": 0.01032685, + "balance_loss_clip": 1.24246025, + "balance_loss_mlp": 1.01333714, + "epoch": 0.9097850593717121, + "flos": 35083001018880.0, + "grad_norm": 2.7318293049800393, + "language_loss": 0.60349196, + "learning_rate": 8.471531997023085e-08, + "loss": 0.62783784, + "num_input_tokens_seen": 326451715, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19348145, + "step": 15132, + "time_per_iteration": 5.710118055343628 + }, + { + "auxiliary_loss_clip": 0.01397718, + "auxiliary_loss_mlp": 0.01034218, + "balance_loss_clip": 1.23977458, + "balance_loss_mlp": 1.01582408, + "epoch": 0.90984518262438, + "flos": 23377835664000.0, + "grad_norm": 1.3472164983443906, + "language_loss": 0.8289814, + "learning_rate": 8.460320677257193e-08, + "loss": 0.85330081, + "num_input_tokens_seen": 326470855, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18395996, + "step": 15133, + "time_per_iteration": 2.8587417602539062 + }, + { + "auxiliary_loss_clip": 0.01398015, + "auxiliary_loss_mlp": 0.01033114, + "balance_loss_clip": 1.23719072, + "balance_loss_mlp": 1.0148747, + "epoch": 0.909905305877048, + "flos": 27533827998720.0, + "grad_norm": 1.6954698397567738, + "language_loss": 0.74595344, + "learning_rate": 8.449116620695118e-08, + "loss": 0.77026474, + "num_input_tokens_seen": 326490480, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18237305, + "step": 15134, + "time_per_iteration": 2.8762855529785156 + }, + { + "auxiliary_loss_clip": 0.01423892, + "auxiliary_loss_mlp": 0.01036209, + "balance_loss_clip": 1.25779057, + "balance_loss_mlp": 1.01690912, + "epoch": 0.9099654291297159, + "flos": 24357262008960.0, + "grad_norm": 1.5809503843085893, + "language_loss": 0.73850298, + "learning_rate": 8.437919827761786e-08, + "loss": 0.76310396, + "num_input_tokens_seen": 326509445, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.19287109, + "step": 15135, + "time_per_iteration": 2.854510545730591 + }, + { + "auxiliary_loss_clip": 0.01389789, + "auxiliary_loss_mlp": 0.0103059, + "balance_loss_clip": 1.23289144, + "balance_loss_mlp": 1.01253009, + "epoch": 0.9100255523823839, + "flos": 21225153674880.0, + "grad_norm": 1.8907806755836871, + "language_loss": 0.70380354, + "learning_rate": 8.426730298881702e-08, + "loss": 0.72800732, + "num_input_tokens_seen": 326528380, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.18066406, + "step": 15136, + "time_per_iteration": 2.8556480407714844 + }, + { + "auxiliary_loss_clip": 0.01175971, + "auxiliary_loss_mlp": 0.01025173, + "balance_loss_clip": 1.09009862, + "balance_loss_mlp": 1.00514627, + "epoch": 0.9100856756350518, + "flos": 46075576452480.0, + "grad_norm": 0.8152814190759695, + "language_loss": 0.59272122, + "learning_rate": 8.415548034479214e-08, + "loss": 0.61473268, + "num_input_tokens_seen": 326576940, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.20019531, + "step": 15137, + "time_per_iteration": 3.085944414138794 + }, + { + "auxiliary_loss_clip": 0.01399798, + "auxiliary_loss_mlp": 0.01030259, + "balance_loss_clip": 1.23988986, + "balance_loss_mlp": 1.01210332, + "epoch": 0.9101457988877198, + "flos": 20239528792320.0, + "grad_norm": 1.5432992828819696, + "language_loss": 0.83177704, + "learning_rate": 8.40437303497834e-08, + "loss": 0.85607755, + "num_input_tokens_seen": 326596100, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18151855, + "step": 15138, + "time_per_iteration": 2.8535947799682617 + }, + { + "auxiliary_loss_clip": 0.01382462, + "auxiliary_loss_mlp": 0.01032879, + "balance_loss_clip": 1.22933483, + "balance_loss_mlp": 1.01381731, + "epoch": 0.9102059221403878, + "flos": 26626485899520.0, + "grad_norm": 5.220587164965373, + "language_loss": 0.81661916, + "learning_rate": 8.39320530080283e-08, + "loss": 0.84077257, + "num_input_tokens_seen": 326615700, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.19042969, + "step": 15139, + "time_per_iteration": 2.9006919860839844 + }, + { + "auxiliary_loss_clip": 0.01386382, + "auxiliary_loss_mlp": 0.01031958, + "balance_loss_clip": 1.22920978, + "balance_loss_mlp": 1.01357603, + "epoch": 0.9102660453930558, + "flos": 21918686256000.0, + "grad_norm": 2.3996005578753414, + "language_loss": 0.78302234, + "learning_rate": 8.382044832376167e-08, + "loss": 0.80720574, + "num_input_tokens_seen": 326635905, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18383789, + "step": 15140, + "time_per_iteration": 2.8628859519958496 + }, + { + "auxiliary_loss_clip": 0.01385574, + "auxiliary_loss_mlp": 0.01032906, + "balance_loss_clip": 1.22716081, + "balance_loss_mlp": 1.01413107, + "epoch": 0.9103261686457237, + "flos": 36191049626880.0, + "grad_norm": 1.6815110399493267, + "language_loss": 0.6689955, + "learning_rate": 8.370891630121569e-08, + "loss": 0.6931802, + "num_input_tokens_seen": 326661855, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18786621, + "step": 15141, + "time_per_iteration": 3.0448150634765625 + }, + { + "auxiliary_loss_clip": 0.01403432, + "auxiliary_loss_mlp": 0.01035341, + "balance_loss_clip": 1.24162507, + "balance_loss_mlp": 1.01567185, + "epoch": 0.9103862918983917, + "flos": 23889121614720.0, + "grad_norm": 1.841721287826934, + "language_loss": 0.75770211, + "learning_rate": 8.359745694462005e-08, + "loss": 0.78208983, + "num_input_tokens_seen": 326679320, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19677734, + "step": 15142, + "time_per_iteration": 2.919987201690674 + }, + { + "auxiliary_loss_clip": 0.01394847, + "auxiliary_loss_mlp": 0.0103771, + "balance_loss_clip": 1.23665881, + "balance_loss_mlp": 1.01918483, + "epoch": 0.9104464151510596, + "flos": 14947770769920.0, + "grad_norm": 1.9127252840448863, + "language_loss": 0.65597969, + "learning_rate": 8.348607025820076e-08, + "loss": 0.6803053, + "num_input_tokens_seen": 326698110, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.18518066, + "step": 15143, + "time_per_iteration": 2.9090960025787354 + }, + { + "auxiliary_loss_clip": 0.01396903, + "auxiliary_loss_mlp": 0.0103519, + "balance_loss_clip": 1.23589921, + "balance_loss_mlp": 1.01580715, + "epoch": 0.9105065384037276, + "flos": 33668671224960.0, + "grad_norm": 2.333842782363234, + "language_loss": 0.61863863, + "learning_rate": 8.337475624618152e-08, + "loss": 0.64295959, + "num_input_tokens_seen": 326718370, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19384766, + "step": 15144, + "time_per_iteration": 2.9157614707946777 + }, + { + "auxiliary_loss_clip": 0.01374397, + "auxiliary_loss_mlp": 0.01030472, + "balance_loss_clip": 1.22300613, + "balance_loss_mlp": 1.01234078, + "epoch": 0.9105666616563957, + "flos": 24327463668480.0, + "grad_norm": 2.3736720224355716, + "language_loss": 0.71624994, + "learning_rate": 8.326351491278382e-08, + "loss": 0.74029869, + "num_input_tokens_seen": 326738445, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.18115234, + "step": 15145, + "time_per_iteration": 2.9644694328308105 + }, + { + "auxiliary_loss_clip": 0.01381331, + "auxiliary_loss_mlp": 0.01030124, + "balance_loss_clip": 1.22749805, + "balance_loss_mlp": 1.01145601, + "epoch": 0.9106267849090636, + "flos": 29983850686080.0, + "grad_norm": 2.1503242749873257, + "language_loss": 0.71506405, + "learning_rate": 8.315234626222545e-08, + "loss": 0.7391786, + "num_input_tokens_seen": 326758855, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.18664551, + "step": 15146, + "time_per_iteration": 2.9375815391540527 + }, + { + "auxiliary_loss_clip": 0.01391466, + "auxiliary_loss_mlp": 0.01035795, + "balance_loss_clip": 1.2329644, + "balance_loss_mlp": 1.01624441, + "epoch": 0.9106869081617316, + "flos": 25348632981120.0, + "grad_norm": 2.296726091513083, + "language_loss": 0.73746806, + "learning_rate": 8.304125029872233e-08, + "loss": 0.76174062, + "num_input_tokens_seen": 326777140, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.19543457, + "step": 15147, + "time_per_iteration": 2.9135732650756836 + }, + { + "auxiliary_loss_clip": 0.01404971, + "auxiliary_loss_mlp": 0.01031837, + "balance_loss_clip": 1.24228036, + "balance_loss_mlp": 1.01295424, + "epoch": 0.9107470314143995, + "flos": 18196194781440.0, + "grad_norm": 1.8365945791704876, + "language_loss": 0.80950427, + "learning_rate": 8.293022702648711e-08, + "loss": 0.83387238, + "num_input_tokens_seen": 326794070, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.18884277, + "step": 15148, + "time_per_iteration": 2.9536545276641846 + }, + { + "auxiliary_loss_clip": 0.01403783, + "auxiliary_loss_mlp": 0.01031758, + "balance_loss_clip": 1.24185193, + "balance_loss_mlp": 1.01324534, + "epoch": 0.9108071546670675, + "flos": 23561484883200.0, + "grad_norm": 2.3329974114949983, + "language_loss": 0.68966937, + "learning_rate": 8.281927644972996e-08, + "loss": 0.71402478, + "num_input_tokens_seen": 326814695, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18518066, + "step": 15149, + "time_per_iteration": 2.8825302124023438 + }, + { + "auxiliary_loss_clip": 0.01408717, + "auxiliary_loss_mlp": 0.01034032, + "balance_loss_clip": 1.24721813, + "balance_loss_mlp": 1.01513696, + "epoch": 0.9108672779197354, + "flos": 25641449199360.0, + "grad_norm": 1.567015150447077, + "language_loss": 0.64215642, + "learning_rate": 8.270839857265776e-08, + "loss": 0.6665839, + "num_input_tokens_seen": 326835295, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18896484, + "step": 15150, + "time_per_iteration": 2.907957077026367 + }, + { + "auxiliary_loss_clip": 0.01411348, + "auxiliary_loss_mlp": 0.01032909, + "balance_loss_clip": 1.25093961, + "balance_loss_mlp": 1.01408613, + "epoch": 0.9109274011724035, + "flos": 22347436412160.0, + "grad_norm": 1.8308869809480104, + "language_loss": 0.739815, + "learning_rate": 8.259759339947514e-08, + "loss": 0.76425761, + "num_input_tokens_seen": 326853350, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18823242, + "step": 15151, + "time_per_iteration": 2.861001491546631 + }, + { + "auxiliary_loss_clip": 0.01386227, + "auxiliary_loss_mlp": 0.01030238, + "balance_loss_clip": 1.22897613, + "balance_loss_mlp": 1.01171267, + "epoch": 0.9109875244250714, + "flos": 26699565530880.0, + "grad_norm": 1.6347548072396723, + "language_loss": 0.64770621, + "learning_rate": 8.248686093438429e-08, + "loss": 0.67187083, + "num_input_tokens_seen": 326873425, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.18518066, + "step": 15152, + "time_per_iteration": 2.9067063331604004 + }, + { + "auxiliary_loss_clip": 0.01390709, + "auxiliary_loss_mlp": 0.01029883, + "balance_loss_clip": 1.23134112, + "balance_loss_mlp": 1.01028538, + "epoch": 0.9110476476777394, + "flos": 22940488995840.0, + "grad_norm": 1.9150378056173416, + "language_loss": 0.7373144, + "learning_rate": 8.23762011815834e-08, + "loss": 0.76152027, + "num_input_tokens_seen": 326893455, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19604492, + "step": 15153, + "time_per_iteration": 2.887770414352417 + }, + { + "auxiliary_loss_clip": 0.01409233, + "auxiliary_loss_mlp": 0.01032065, + "balance_loss_clip": 1.24707496, + "balance_loss_mlp": 1.01368284, + "epoch": 0.9111077709304073, + "flos": 13478848485120.0, + "grad_norm": 3.549504024638522, + "language_loss": 0.72767591, + "learning_rate": 8.226561414526956e-08, + "loss": 0.7520889, + "num_input_tokens_seen": 326910210, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18395996, + "step": 15154, + "time_per_iteration": 2.813575267791748 + }, + { + "auxiliary_loss_clip": 0.01386149, + "auxiliary_loss_mlp": 0.0103321, + "balance_loss_clip": 1.23030245, + "balance_loss_mlp": 1.01468539, + "epoch": 0.9111678941830753, + "flos": 20860434190080.0, + "grad_norm": 1.7555069346544643, + "language_loss": 0.82767272, + "learning_rate": 8.215509982963564e-08, + "loss": 0.85186636, + "num_input_tokens_seen": 326929350, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.18530273, + "step": 15155, + "time_per_iteration": 2.891860008239746 + }, + { + "auxiliary_loss_clip": 0.01392459, + "auxiliary_loss_mlp": 0.01032148, + "balance_loss_clip": 1.23602104, + "balance_loss_mlp": 1.01327693, + "epoch": 0.9112280174357432, + "flos": 19691612536320.0, + "grad_norm": 1.417939110633223, + "language_loss": 0.60092556, + "learning_rate": 8.204465823887252e-08, + "loss": 0.6251716, + "num_input_tokens_seen": 326949060, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.18884277, + "step": 15156, + "time_per_iteration": 2.8353707790374756 + }, + { + "auxiliary_loss_clip": 0.01404613, + "auxiliary_loss_mlp": 0.01031763, + "balance_loss_clip": 1.24142075, + "balance_loss_mlp": 1.0123558, + "epoch": 0.9112881406884112, + "flos": 25458071448960.0, + "grad_norm": 2.1670765474565643, + "language_loss": 0.74869698, + "learning_rate": 8.193428937716796e-08, + "loss": 0.77306068, + "num_input_tokens_seen": 326968950, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.1940918, + "step": 15157, + "time_per_iteration": 4.3113555908203125 + }, + { + "auxiliary_loss_clip": 0.01406344, + "auxiliary_loss_mlp": 0.01033215, + "balance_loss_clip": 1.24540687, + "balance_loss_mlp": 1.01504791, + "epoch": 0.9113482639410793, + "flos": 33078378574080.0, + "grad_norm": 2.471589830470554, + "language_loss": 0.59722912, + "learning_rate": 8.182399324870747e-08, + "loss": 0.62162471, + "num_input_tokens_seen": 326989455, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.1817627, + "step": 15158, + "time_per_iteration": 2.9585824012756348 + }, + { + "auxiliary_loss_clip": 0.01386313, + "auxiliary_loss_mlp": 0.01032712, + "balance_loss_clip": 1.22926068, + "balance_loss_mlp": 1.01463985, + "epoch": 0.9114083871937472, + "flos": 21845697114240.0, + "grad_norm": 1.6050822550618948, + "language_loss": 0.68107283, + "learning_rate": 8.171376985767375e-08, + "loss": 0.70526302, + "num_input_tokens_seen": 327009640, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.18078613, + "step": 15159, + "time_per_iteration": 2.849174737930298 + }, + { + "auxiliary_loss_clip": 0.01385599, + "auxiliary_loss_mlp": 0.01030498, + "balance_loss_clip": 1.22679043, + "balance_loss_mlp": 1.01132894, + "epoch": 0.9114685104464152, + "flos": 27100327138560.0, + "grad_norm": 2.6333685880415967, + "language_loss": 0.78950489, + "learning_rate": 8.160361920824588e-08, + "loss": 0.81366587, + "num_input_tokens_seen": 327027690, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.19177246, + "step": 15160, + "time_per_iteration": 4.325668811798096 + }, + { + "auxiliary_loss_clip": 0.01410433, + "auxiliary_loss_mlp": 0.01028743, + "balance_loss_clip": 1.24930358, + "balance_loss_mlp": 1.00940752, + "epoch": 0.9115286336990831, + "flos": 17975870012160.0, + "grad_norm": 1.8310780616975546, + "language_loss": 0.69265532, + "learning_rate": 8.149354130460073e-08, + "loss": 0.7170471, + "num_input_tokens_seen": 327045915, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.1932373, + "step": 15161, + "time_per_iteration": 2.8188445568084717 + }, + { + "auxiliary_loss_clip": 0.01397175, + "auxiliary_loss_mlp": 0.01031228, + "balance_loss_clip": 1.23775721, + "balance_loss_mlp": 1.01191568, + "epoch": 0.9115887569517511, + "flos": 22940172282240.0, + "grad_norm": 1.5670943958760184, + "language_loss": 0.7690407, + "learning_rate": 8.138353615091321e-08, + "loss": 0.79332477, + "num_input_tokens_seen": 327066355, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.1932373, + "step": 15162, + "time_per_iteration": 2.8609020709991455 + }, + { + "auxiliary_loss_clip": 0.01400975, + "auxiliary_loss_mlp": 0.01035341, + "balance_loss_clip": 1.24126077, + "balance_loss_mlp": 1.01593351, + "epoch": 0.911648880204419, + "flos": 23999148264960.0, + "grad_norm": 1.789910518853387, + "language_loss": 0.67601269, + "learning_rate": 8.127360375135395e-08, + "loss": 0.7003758, + "num_input_tokens_seen": 327086735, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19396973, + "step": 15163, + "time_per_iteration": 2.8994369506835938 + }, + { + "auxiliary_loss_clip": 0.01421176, + "auxiliary_loss_mlp": 0.01035259, + "balance_loss_clip": 1.25671768, + "balance_loss_mlp": 1.01460004, + "epoch": 0.911709003457087, + "flos": 17064003432960.0, + "grad_norm": 1.9224805025395915, + "language_loss": 0.71383464, + "learning_rate": 8.116374411009186e-08, + "loss": 0.73839897, + "num_input_tokens_seen": 327104035, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20629883, + "step": 15164, + "time_per_iteration": 2.813257932662964 + }, + { + "auxiliary_loss_clip": 0.01389321, + "auxiliary_loss_mlp": 0.01033094, + "balance_loss_clip": 1.23467577, + "balance_loss_mlp": 1.01511741, + "epoch": 0.911769126709755, + "flos": 21663450483840.0, + "grad_norm": 1.5317197331876806, + "language_loss": 0.76726186, + "learning_rate": 8.105395723129315e-08, + "loss": 0.79148602, + "num_input_tokens_seen": 327124370, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.17980957, + "step": 15165, + "time_per_iteration": 2.843998908996582 + }, + { + "auxiliary_loss_clip": 0.01406443, + "auxiliary_loss_mlp": 0.01034112, + "balance_loss_clip": 1.24507833, + "balance_loss_mlp": 1.01524138, + "epoch": 0.911829249962423, + "flos": 24801033438720.0, + "grad_norm": 3.3941007081692574, + "language_loss": 0.73081261, + "learning_rate": 8.094424311912074e-08, + "loss": 0.75521815, + "num_input_tokens_seen": 327140915, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.1887207, + "step": 15166, + "time_per_iteration": 2.8748795986175537 + }, + { + "auxiliary_loss_clip": 0.01409055, + "auxiliary_loss_mlp": 0.01033782, + "balance_loss_clip": 1.24630487, + "balance_loss_mlp": 1.01396942, + "epoch": 0.9118893732150909, + "flos": 20969148741120.0, + "grad_norm": 2.382963108698475, + "language_loss": 0.73546171, + "learning_rate": 8.083460177773482e-08, + "loss": 0.75989002, + "num_input_tokens_seen": 327158940, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19836426, + "step": 15167, + "time_per_iteration": 5.661333322525024 + }, + { + "auxiliary_loss_clip": 0.01173809, + "auxiliary_loss_mlp": 0.0103086, + "balance_loss_clip": 1.087286, + "balance_loss_mlp": 1.00339413, + "epoch": 0.9119494964677589, + "flos": 67948990646400.0, + "grad_norm": 0.7705465213616485, + "language_loss": 0.65559494, + "learning_rate": 8.072503321129298e-08, + "loss": 0.67764163, + "num_input_tokens_seen": 327217450, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.27539062, + "step": 15168, + "time_per_iteration": 3.3370957374572754 + }, + { + "auxiliary_loss_clip": 0.01405128, + "auxiliary_loss_mlp": 0.01033659, + "balance_loss_clip": 1.24625087, + "balance_loss_mlp": 1.01477623, + "epoch": 0.9120096197204268, + "flos": 18560688042240.0, + "grad_norm": 1.9629559383217052, + "language_loss": 0.7862072, + "learning_rate": 8.061553742395033e-08, + "loss": 0.81059504, + "num_input_tokens_seen": 327233905, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18884277, + "step": 15169, + "time_per_iteration": 2.8416216373443604 + }, + { + "auxiliary_loss_clip": 0.01399793, + "auxiliary_loss_mlp": 0.01034089, + "balance_loss_clip": 1.2400074, + "balance_loss_mlp": 1.0142293, + "epoch": 0.9120697429730948, + "flos": 19034981729280.0, + "grad_norm": 1.501198985738292, + "language_loss": 0.83178473, + "learning_rate": 8.05061144198591e-08, + "loss": 0.85612357, + "num_input_tokens_seen": 327252430, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.1986084, + "step": 15170, + "time_per_iteration": 2.838719367980957 + }, + { + "auxiliary_loss_clip": 0.01401273, + "auxiliary_loss_mlp": 0.01035096, + "balance_loss_clip": 1.24100888, + "balance_loss_mlp": 1.01634407, + "epoch": 0.9121298662257629, + "flos": 17172129801600.0, + "grad_norm": 2.223382196724005, + "language_loss": 0.78149211, + "learning_rate": 8.039676420316799e-08, + "loss": 0.80585581, + "num_input_tokens_seen": 327269215, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.1875, + "step": 15171, + "time_per_iteration": 2.8506052494049072 + }, + { + "auxiliary_loss_clip": 0.01384867, + "auxiliary_loss_mlp": 0.01032046, + "balance_loss_clip": 1.22798908, + "balance_loss_mlp": 1.01292527, + "epoch": 0.9121899894784308, + "flos": 19692200718720.0, + "grad_norm": 1.552821584746152, + "language_loss": 0.67570341, + "learning_rate": 8.02874867780241e-08, + "loss": 0.69987249, + "num_input_tokens_seen": 327290320, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.19128418, + "step": 15172, + "time_per_iteration": 2.862438201904297 + }, + { + "auxiliary_loss_clip": 0.01402261, + "auxiliary_loss_mlp": 0.01035222, + "balance_loss_clip": 1.2418654, + "balance_loss_mlp": 1.01670897, + "epoch": 0.9122501127310988, + "flos": 22245644315520.0, + "grad_norm": 2.3172268348310356, + "language_loss": 0.76063752, + "learning_rate": 8.017828214857103e-08, + "loss": 0.7850123, + "num_input_tokens_seen": 327310150, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18530273, + "step": 15173, + "time_per_iteration": 2.936774730682373 + }, + { + "auxiliary_loss_clip": 0.01416556, + "auxiliary_loss_mlp": 0.01034042, + "balance_loss_clip": 1.25175059, + "balance_loss_mlp": 1.01360953, + "epoch": 0.9123102359837667, + "flos": 15964596581760.0, + "grad_norm": 2.5893646585268892, + "language_loss": 0.66191173, + "learning_rate": 8.00691503189499e-08, + "loss": 0.6864177, + "num_input_tokens_seen": 327326660, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.20410156, + "step": 15174, + "time_per_iteration": 2.821871280670166 + }, + { + "auxiliary_loss_clip": 0.01396526, + "auxiliary_loss_mlp": 0.01032384, + "balance_loss_clip": 1.23594117, + "balance_loss_mlp": 1.01278603, + "epoch": 0.9123703592364347, + "flos": 25166748309120.0, + "grad_norm": 2.001926730789998, + "language_loss": 0.76128608, + "learning_rate": 7.996009129329894e-08, + "loss": 0.78557515, + "num_input_tokens_seen": 327346700, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19604492, + "step": 15175, + "time_per_iteration": 2.881073236465454 + }, + { + "auxiliary_loss_clip": 0.01175796, + "auxiliary_loss_mlp": 0.01024652, + "balance_loss_clip": 1.08746028, + "balance_loss_mlp": 1.00233638, + "epoch": 0.9124304824891026, + "flos": 60831146736000.0, + "grad_norm": 0.9652431988079677, + "language_loss": 0.58437115, + "learning_rate": 7.985110507575421e-08, + "loss": 0.60637563, + "num_input_tokens_seen": 327403050, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.22363281, + "step": 15176, + "time_per_iteration": 3.404252290725708 + }, + { + "auxiliary_loss_clip": 0.01404241, + "auxiliary_loss_mlp": 0.01034213, + "balance_loss_clip": 1.24325156, + "balance_loss_mlp": 1.01579547, + "epoch": 0.9124906057417707, + "flos": 18160197903360.0, + "grad_norm": 1.7180963967766834, + "language_loss": 0.6671868, + "learning_rate": 7.97421916704475e-08, + "loss": 0.69157135, + "num_input_tokens_seen": 327422225, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.1842041, + "step": 15177, + "time_per_iteration": 2.881483316421509 + }, + { + "auxiliary_loss_clip": 0.01399017, + "auxiliary_loss_mlp": 0.01036533, + "balance_loss_clip": 1.24157751, + "balance_loss_mlp": 1.01840079, + "epoch": 0.9125507289944386, + "flos": 11692741017600.0, + "grad_norm": 1.8861226040422145, + "language_loss": 0.8153131, + "learning_rate": 7.963335108150926e-08, + "loss": 0.83966863, + "num_input_tokens_seen": 327437025, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18127441, + "step": 15178, + "time_per_iteration": 2.802271842956543 + }, + { + "auxiliary_loss_clip": 0.01387151, + "auxiliary_loss_mlp": 0.01030765, + "balance_loss_clip": 1.22932374, + "balance_loss_mlp": 1.01239443, + "epoch": 0.9126108522471066, + "flos": 17757762238080.0, + "grad_norm": 2.7087797619569987, + "language_loss": 0.79802561, + "learning_rate": 7.952458331306711e-08, + "loss": 0.82220477, + "num_input_tokens_seen": 327453915, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18395996, + "step": 15179, + "time_per_iteration": 2.8539934158325195 + }, + { + "auxiliary_loss_clip": 0.01392624, + "auxiliary_loss_mlp": 0.01032537, + "balance_loss_clip": 1.23493791, + "balance_loss_mlp": 1.0145247, + "epoch": 0.9126709754997745, + "flos": 27647067029760.0, + "grad_norm": 1.538645331325776, + "language_loss": 0.68882871, + "learning_rate": 7.941588836924507e-08, + "loss": 0.71308029, + "num_input_tokens_seen": 327474415, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.18005371, + "step": 15180, + "time_per_iteration": 2.9671337604522705 + }, + { + "auxiliary_loss_clip": 0.01381142, + "auxiliary_loss_mlp": 0.0103056, + "balance_loss_clip": 1.22555375, + "balance_loss_mlp": 1.01124823, + "epoch": 0.9127310987524425, + "flos": 15933576631680.0, + "grad_norm": 1.6616686520377464, + "language_loss": 0.75612593, + "learning_rate": 7.930726625416495e-08, + "loss": 0.78024292, + "num_input_tokens_seen": 327492750, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.1932373, + "step": 15181, + "time_per_iteration": 2.871734619140625 + }, + { + "auxiliary_loss_clip": 0.01415282, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.25111008, + "balance_loss_mlp": 1.01238608, + "epoch": 0.9127912220051104, + "flos": 21545144035200.0, + "grad_norm": 1.7482004793750665, + "language_loss": 0.75594628, + "learning_rate": 7.919871697194614e-08, + "loss": 0.78040707, + "num_input_tokens_seen": 327509470, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.18408203, + "step": 15182, + "time_per_iteration": 2.9049365520477295 + }, + { + "auxiliary_loss_clip": 0.01406156, + "auxiliary_loss_mlp": 0.0103027, + "balance_loss_clip": 1.24415922, + "balance_loss_mlp": 1.01130414, + "epoch": 0.9128513452577784, + "flos": 24074852094720.0, + "grad_norm": 1.8798402500355733, + "language_loss": 0.77034587, + "learning_rate": 7.909024052670421e-08, + "loss": 0.79471016, + "num_input_tokens_seen": 327530520, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18981934, + "step": 15183, + "time_per_iteration": 2.8924717903137207 + }, + { + "auxiliary_loss_clip": 0.01411397, + "auxiliary_loss_mlp": 0.01030837, + "balance_loss_clip": 1.24871826, + "balance_loss_mlp": 1.01197791, + "epoch": 0.9129114685104465, + "flos": 16225035505920.0, + "grad_norm": 2.3613370613423568, + "language_loss": 0.7745406, + "learning_rate": 7.898183692255256e-08, + "loss": 0.79896289, + "num_input_tokens_seen": 327546960, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.18847656, + "step": 15184, + "time_per_iteration": 2.8206751346588135 + }, + { + "auxiliary_loss_clip": 0.01404175, + "auxiliary_loss_mlp": 0.01035222, + "balance_loss_clip": 1.24404955, + "balance_loss_mlp": 1.01574326, + "epoch": 0.9129715917631144, + "flos": 19392100087680.0, + "grad_norm": 6.599140649747385, + "language_loss": 0.74980605, + "learning_rate": 7.887350616360233e-08, + "loss": 0.77420002, + "num_input_tokens_seen": 327564830, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19494629, + "step": 15185, + "time_per_iteration": 2.813835382461548 + }, + { + "auxiliary_loss_clip": 0.01389562, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.23069167, + "balance_loss_mlp": 1.01018286, + "epoch": 0.9130317150157824, + "flos": 20599045125120.0, + "grad_norm": 2.045955429393509, + "language_loss": 0.68743527, + "learning_rate": 7.876524825396158e-08, + "loss": 0.7116195, + "num_input_tokens_seen": 327583675, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18664551, + "step": 15186, + "time_per_iteration": 2.8619987964630127 + }, + { + "auxiliary_loss_clip": 0.01413719, + "auxiliary_loss_mlp": 0.01034161, + "balance_loss_clip": 1.24776852, + "balance_loss_mlp": 1.01455128, + "epoch": 0.9130918382684503, + "flos": 20197740579840.0, + "grad_norm": 1.755688224127138, + "language_loss": 0.78468049, + "learning_rate": 7.865706319773502e-08, + "loss": 0.80915928, + "num_input_tokens_seen": 327602280, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.19616699, + "step": 15187, + "time_per_iteration": 2.830235004425049 + }, + { + "auxiliary_loss_clip": 0.01397573, + "auxiliary_loss_mlp": 0.01032268, + "balance_loss_clip": 1.2382679, + "balance_loss_mlp": 1.01344538, + "epoch": 0.9131519615211183, + "flos": 25567871875200.0, + "grad_norm": 5.173013383387578, + "language_loss": 0.66434205, + "learning_rate": 7.854895099902515e-08, + "loss": 0.68864048, + "num_input_tokens_seen": 327623515, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18835449, + "step": 15188, + "time_per_iteration": 2.8934803009033203 + }, + { + "auxiliary_loss_clip": 0.01385533, + "auxiliary_loss_mlp": 0.01033234, + "balance_loss_clip": 1.22760355, + "balance_loss_mlp": 1.01442301, + "epoch": 0.9132120847737862, + "flos": 17940642295680.0, + "grad_norm": 3.661693145025564, + "language_loss": 0.77271092, + "learning_rate": 7.844091166193157e-08, + "loss": 0.7968986, + "num_input_tokens_seen": 327642875, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18823242, + "step": 15189, + "time_per_iteration": 2.821176528930664 + }, + { + "auxiliary_loss_clip": 0.01391716, + "auxiliary_loss_mlp": 0.01031183, + "balance_loss_clip": 1.23452771, + "balance_loss_mlp": 1.01342058, + "epoch": 0.9132722080264543, + "flos": 20057236917120.0, + "grad_norm": 2.0012024000488067, + "language_loss": 0.76359093, + "learning_rate": 7.8332945190551e-08, + "loss": 0.78781998, + "num_input_tokens_seen": 327662450, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.1776123, + "step": 15190, + "time_per_iteration": 2.832014799118042 + }, + { + "auxiliary_loss_clip": 0.01179102, + "auxiliary_loss_mlp": 0.01030117, + "balance_loss_clip": 1.09332514, + "balance_loss_mlp": 1.00675189, + "epoch": 0.9133323312791222, + "flos": 70473540798720.0, + "grad_norm": 0.7185280936978781, + "language_loss": 0.57436472, + "learning_rate": 7.822505158897797e-08, + "loss": 0.59645694, + "num_input_tokens_seen": 327723845, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.23339844, + "step": 15191, + "time_per_iteration": 3.42051100730896 + }, + { + "auxiliary_loss_clip": 0.01403975, + "auxiliary_loss_mlp": 0.01033392, + "balance_loss_clip": 1.2422204, + "balance_loss_mlp": 1.01436639, + "epoch": 0.9133924545317902, + "flos": 25494746999040.0, + "grad_norm": 1.704292704954527, + "language_loss": 0.74842942, + "learning_rate": 7.81172308613034e-08, + "loss": 0.77280313, + "num_input_tokens_seen": 327742590, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19030762, + "step": 15192, + "time_per_iteration": 2.924443244934082 + }, + { + "auxiliary_loss_clip": 0.0139415, + "auxiliary_loss_mlp": 0.01030921, + "balance_loss_clip": 1.23739409, + "balance_loss_mlp": 1.01215792, + "epoch": 0.9134525777844581, + "flos": 39945013499520.0, + "grad_norm": 3.249399597383762, + "language_loss": 0.7015965, + "learning_rate": 7.800948301161647e-08, + "loss": 0.72584718, + "num_input_tokens_seen": 327764350, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.18762207, + "step": 15193, + "time_per_iteration": 4.403618574142456 + }, + { + "auxiliary_loss_clip": 0.01397006, + "auxiliary_loss_mlp": 0.01034621, + "balance_loss_clip": 1.2395668, + "balance_loss_mlp": 1.01596451, + "epoch": 0.9135127010371261, + "flos": 20896521557760.0, + "grad_norm": 1.5579720582158325, + "language_loss": 0.742311, + "learning_rate": 7.790180804400215e-08, + "loss": 0.76662719, + "num_input_tokens_seen": 327783120, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.18652344, + "step": 15194, + "time_per_iteration": 2.8864023685455322 + }, + { + "auxiliary_loss_clip": 0.0141438, + "auxiliary_loss_mlp": 0.01034959, + "balance_loss_clip": 1.25025582, + "balance_loss_mlp": 1.01508737, + "epoch": 0.913572824289794, + "flos": 20822898988800.0, + "grad_norm": 1.9459057696976252, + "language_loss": 0.62887526, + "learning_rate": 7.779420596254383e-08, + "loss": 0.65336871, + "num_input_tokens_seen": 327801960, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.1986084, + "step": 15195, + "time_per_iteration": 2.862072229385376 + }, + { + "auxiliary_loss_clip": 0.0139755, + "auxiliary_loss_mlp": 0.01033092, + "balance_loss_clip": 1.23750782, + "balance_loss_mlp": 1.01479292, + "epoch": 0.913632947542462, + "flos": 25714438341120.0, + "grad_norm": 1.465303757649483, + "language_loss": 0.71928024, + "learning_rate": 7.768667677132201e-08, + "loss": 0.74358666, + "num_input_tokens_seen": 327823795, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.1829834, + "step": 15196, + "time_per_iteration": 4.385135173797607 + }, + { + "auxiliary_loss_clip": 0.01388187, + "auxiliary_loss_mlp": 0.01032816, + "balance_loss_clip": 1.23030758, + "balance_loss_mlp": 1.01448202, + "epoch": 0.9136930707951301, + "flos": 26297310844800.0, + "grad_norm": 1.681796734765295, + "language_loss": 0.7229411, + "learning_rate": 7.757922047441411e-08, + "loss": 0.7471512, + "num_input_tokens_seen": 327845175, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18334961, + "step": 15197, + "time_per_iteration": 2.910677433013916 + }, + { + "auxiliary_loss_clip": 0.01413108, + "auxiliary_loss_mlp": 0.01029284, + "balance_loss_clip": 1.25041628, + "balance_loss_mlp": 1.01019883, + "epoch": 0.913753194047798, + "flos": 22102290230400.0, + "grad_norm": 3.2164674566511113, + "language_loss": 0.79425061, + "learning_rate": 7.747183707589489e-08, + "loss": 0.81867456, + "num_input_tokens_seen": 327863150, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.1907959, + "step": 15198, + "time_per_iteration": 2.845381736755371 + }, + { + "auxiliary_loss_clip": 0.01386203, + "auxiliary_loss_mlp": 0.01033093, + "balance_loss_clip": 1.22945333, + "balance_loss_mlp": 1.01360273, + "epoch": 0.913813317300466, + "flos": 23597843719680.0, + "grad_norm": 2.024838788742723, + "language_loss": 0.68087566, + "learning_rate": 7.736452657983616e-08, + "loss": 0.70506859, + "num_input_tokens_seen": 327883445, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.19494629, + "step": 15199, + "time_per_iteration": 2.8685526847839355 + }, + { + "auxiliary_loss_clip": 0.01407694, + "auxiliary_loss_mlp": 0.01035279, + "balance_loss_clip": 1.24567771, + "balance_loss_mlp": 1.01702762, + "epoch": 0.9138734405531339, + "flos": 28888153908480.0, + "grad_norm": 1.5648012623925622, + "language_loss": 0.67937279, + "learning_rate": 7.725728899030714e-08, + "loss": 0.70380253, + "num_input_tokens_seen": 327905745, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18261719, + "step": 15200, + "time_per_iteration": 2.9092416763305664 + }, + { + "auxiliary_loss_clip": 0.01384637, + "auxiliary_loss_mlp": 0.01032532, + "balance_loss_clip": 1.22874331, + "balance_loss_mlp": 1.01395917, + "epoch": 0.9139335638058019, + "flos": 22831548220800.0, + "grad_norm": 2.15282289461593, + "language_loss": 0.7196762, + "learning_rate": 7.715012431137435e-08, + "loss": 0.74384785, + "num_input_tokens_seen": 327925435, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.18579102, + "step": 15201, + "time_per_iteration": 4.336301565170288 + }, + { + "auxiliary_loss_clip": 0.01389724, + "auxiliary_loss_mlp": 0.01028818, + "balance_loss_clip": 1.23130918, + "balance_loss_mlp": 1.01153278, + "epoch": 0.9139936870584698, + "flos": 18013178989440.0, + "grad_norm": 1.961614724687617, + "language_loss": 0.71774006, + "learning_rate": 7.704303254710165e-08, + "loss": 0.74192542, + "num_input_tokens_seen": 327944145, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.17297363, + "step": 15202, + "time_per_iteration": 2.8878252506256104 + }, + { + "auxiliary_loss_clip": 0.01400279, + "auxiliary_loss_mlp": 0.01034469, + "balance_loss_clip": 1.24008441, + "balance_loss_mlp": 1.01524067, + "epoch": 0.9140538103111379, + "flos": 15821875923840.0, + "grad_norm": 1.9310505856269153, + "language_loss": 0.67265856, + "learning_rate": 7.693601370155001e-08, + "loss": 0.69700611, + "num_input_tokens_seen": 327960565, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19226074, + "step": 15203, + "time_per_iteration": 4.28705096244812 + }, + { + "auxiliary_loss_clip": 0.01405572, + "auxiliary_loss_mlp": 0.01030533, + "balance_loss_clip": 1.24549651, + "balance_loss_mlp": 1.01175797, + "epoch": 0.9141139335638058, + "flos": 23997383717760.0, + "grad_norm": 1.8004550808078363, + "language_loss": 0.69929183, + "learning_rate": 7.682906777877751e-08, + "loss": 0.72365284, + "num_input_tokens_seen": 327981180, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18774414, + "step": 15204, + "time_per_iteration": 2.9007856845855713 + }, + { + "auxiliary_loss_clip": 0.01394906, + "auxiliary_loss_mlp": 0.01031406, + "balance_loss_clip": 1.23418355, + "balance_loss_mlp": 1.01216555, + "epoch": 0.9141740568164738, + "flos": 24035009408640.0, + "grad_norm": 2.0465285358987915, + "language_loss": 0.60554314, + "learning_rate": 7.672219478283915e-08, + "loss": 0.62980628, + "num_input_tokens_seen": 328001500, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19238281, + "step": 15205, + "time_per_iteration": 2.8596856594085693 + }, + { + "auxiliary_loss_clip": 0.01383698, + "auxiliary_loss_mlp": 0.01031123, + "balance_loss_clip": 1.22798657, + "balance_loss_mlp": 1.01250243, + "epoch": 0.9142341800691417, + "flos": 27030188419200.0, + "grad_norm": 1.6672996821432178, + "language_loss": 0.81890011, + "learning_rate": 7.661539471778811e-08, + "loss": 0.84304833, + "num_input_tokens_seen": 328023025, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.18603516, + "step": 15206, + "time_per_iteration": 2.9018874168395996 + }, + { + "auxiliary_loss_clip": 0.01409391, + "auxiliary_loss_mlp": 0.01028915, + "balance_loss_clip": 1.24775708, + "balance_loss_mlp": 1.01058102, + "epoch": 0.9142943033218097, + "flos": 20422182625920.0, + "grad_norm": 4.466811883069632, + "language_loss": 0.74835294, + "learning_rate": 7.650866758767382e-08, + "loss": 0.77273601, + "num_input_tokens_seen": 328041410, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18310547, + "step": 15207, + "time_per_iteration": 2.8425028324127197 + }, + { + "auxiliary_loss_clip": 0.01393001, + "auxiliary_loss_mlp": 0.01030116, + "balance_loss_clip": 1.23381126, + "balance_loss_mlp": 1.01072097, + "epoch": 0.9143544265744776, + "flos": 19764737412480.0, + "grad_norm": 1.5785509198621657, + "language_loss": 0.73545134, + "learning_rate": 7.640201339654373e-08, + "loss": 0.75968248, + "num_input_tokens_seen": 328060495, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.1940918, + "step": 15208, + "time_per_iteration": 2.8482542037963867 + }, + { + "auxiliary_loss_clip": 0.01399239, + "auxiliary_loss_mlp": 0.010309, + "balance_loss_clip": 1.24197757, + "balance_loss_mlp": 1.01297104, + "epoch": 0.9144145498271457, + "flos": 17174346796800.0, + "grad_norm": 2.181262536085906, + "language_loss": 0.86402571, + "learning_rate": 7.629543214844237e-08, + "loss": 0.88832712, + "num_input_tokens_seen": 328076905, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.17932129, + "step": 15209, + "time_per_iteration": 2.804022789001465 + }, + { + "auxiliary_loss_clip": 0.01390396, + "auxiliary_loss_mlp": 0.0103602, + "balance_loss_clip": 1.23244059, + "balance_loss_mlp": 1.01762617, + "epoch": 0.9144746730798137, + "flos": 23734908777600.0, + "grad_norm": 1.6504452523624553, + "language_loss": 0.75939059, + "learning_rate": 7.618892384741093e-08, + "loss": 0.78365469, + "num_input_tokens_seen": 328096960, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.18395996, + "step": 15210, + "time_per_iteration": 2.891416072845459 + }, + { + "auxiliary_loss_clip": 0.01403878, + "auxiliary_loss_mlp": 0.01033362, + "balance_loss_clip": 1.24251139, + "balance_loss_mlp": 1.01444364, + "epoch": 0.9145347963324816, + "flos": 25858697322240.0, + "grad_norm": 1.7462231235447103, + "language_loss": 0.78504723, + "learning_rate": 7.6082488497488e-08, + "loss": 0.80941963, + "num_input_tokens_seen": 328115445, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18908691, + "step": 15211, + "time_per_iteration": 2.8979239463806152 + }, + { + "auxiliary_loss_clip": 0.01396266, + "auxiliary_loss_mlp": 0.01029393, + "balance_loss_clip": 1.23714495, + "balance_loss_mlp": 1.0106771, + "epoch": 0.9145949195851496, + "flos": 19251596424960.0, + "grad_norm": 1.6649963950058662, + "language_loss": 0.83566153, + "learning_rate": 7.597612610270986e-08, + "loss": 0.85991818, + "num_input_tokens_seen": 328133965, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18701172, + "step": 15212, + "time_per_iteration": 2.846406936645508 + }, + { + "auxiliary_loss_clip": 0.01388855, + "auxiliary_loss_mlp": 0.01033691, + "balance_loss_clip": 1.2316103, + "balance_loss_mlp": 1.0148797, + "epoch": 0.9146550428378175, + "flos": 18305542759680.0, + "grad_norm": 1.678227531559026, + "language_loss": 0.8443743, + "learning_rate": 7.586983666711022e-08, + "loss": 0.86859977, + "num_input_tokens_seen": 328151520, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18798828, + "step": 15213, + "time_per_iteration": 2.8153235912323 + }, + { + "auxiliary_loss_clip": 0.01395739, + "auxiliary_loss_mlp": 0.01034624, + "balance_loss_clip": 1.23609579, + "balance_loss_mlp": 1.01638484, + "epoch": 0.9147151660904855, + "flos": 20093912467200.0, + "grad_norm": 1.5738265332531631, + "language_loss": 0.72393548, + "learning_rate": 7.576362019471894e-08, + "loss": 0.74823916, + "num_input_tokens_seen": 328171275, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18249512, + "step": 15214, + "time_per_iteration": 2.808718681335449 + }, + { + "auxiliary_loss_clip": 0.01413067, + "auxiliary_loss_mlp": 0.01035322, + "balance_loss_clip": 1.24981546, + "balance_loss_mlp": 1.01636755, + "epoch": 0.9147752893431534, + "flos": 24399774138240.0, + "grad_norm": 1.5540838822279412, + "language_loss": 0.63605982, + "learning_rate": 7.565747668956413e-08, + "loss": 0.66054374, + "num_input_tokens_seen": 328192115, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.18945312, + "step": 15215, + "time_per_iteration": 2.910959243774414 + }, + { + "auxiliary_loss_clip": 0.01418907, + "auxiliary_loss_mlp": 0.01034367, + "balance_loss_clip": 1.25476289, + "balance_loss_mlp": 1.01488781, + "epoch": 0.9148354125958215, + "flos": 18159519231360.0, + "grad_norm": 2.5270075078727876, + "language_loss": 0.77034914, + "learning_rate": 7.555140615567058e-08, + "loss": 0.79488188, + "num_input_tokens_seen": 328208990, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19458008, + "step": 15216, + "time_per_iteration": 2.847985029220581 + }, + { + "auxiliary_loss_clip": 0.01396715, + "auxiliary_loss_mlp": 0.01032613, + "balance_loss_clip": 1.23734522, + "balance_loss_mlp": 1.01318204, + "epoch": 0.9148955358484894, + "flos": 23377926153600.0, + "grad_norm": 2.1406500351979334, + "language_loss": 0.68845689, + "learning_rate": 7.544540859706062e-08, + "loss": 0.71275014, + "num_input_tokens_seen": 328227840, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19433594, + "step": 15217, + "time_per_iteration": 2.901492118835449 + }, + { + "auxiliary_loss_clip": 0.01396954, + "auxiliary_loss_mlp": 0.01032427, + "balance_loss_clip": 1.23865116, + "balance_loss_mlp": 1.01375914, + "epoch": 0.9149556591011574, + "flos": 18085353724800.0, + "grad_norm": 1.8723388391471645, + "language_loss": 0.80856645, + "learning_rate": 7.533948401775347e-08, + "loss": 0.83286029, + "num_input_tokens_seen": 328246250, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18664551, + "step": 15218, + "time_per_iteration": 2.8249731063842773 + }, + { + "auxiliary_loss_clip": 0.01179169, + "auxiliary_loss_mlp": 0.01029541, + "balance_loss_clip": 1.09206176, + "balance_loss_mlp": 1.00398302, + "epoch": 0.9150157823538253, + "flos": 54610962537600.0, + "grad_norm": 0.8461120594568002, + "language_loss": 0.59383631, + "learning_rate": 7.523363242176595e-08, + "loss": 0.6159234, + "num_input_tokens_seen": 328303625, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.25585938, + "step": 15219, + "time_per_iteration": 3.2968616485595703 + }, + { + "auxiliary_loss_clip": 0.01387366, + "auxiliary_loss_mlp": 0.01033626, + "balance_loss_clip": 1.23049188, + "balance_loss_mlp": 1.01516032, + "epoch": 0.9150759056064933, + "flos": 17901659260800.0, + "grad_norm": 1.84685194628434, + "language_loss": 0.79247308, + "learning_rate": 7.512785381311216e-08, + "loss": 0.81668305, + "num_input_tokens_seen": 328322135, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.18469238, + "step": 15220, + "time_per_iteration": 2.846248149871826 + }, + { + "auxiliary_loss_clip": 0.01402801, + "auxiliary_loss_mlp": 0.01033855, + "balance_loss_clip": 1.24057138, + "balance_loss_mlp": 1.01438856, + "epoch": 0.9151360288591612, + "flos": 18081598406400.0, + "grad_norm": 1.777110540378662, + "language_loss": 0.66326487, + "learning_rate": 7.50221481958031e-08, + "loss": 0.68763137, + "num_input_tokens_seen": 328340750, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19482422, + "step": 15221, + "time_per_iteration": 2.8657407760620117 + }, + { + "auxiliary_loss_clip": 0.01394602, + "auxiliary_loss_mlp": 0.01035117, + "balance_loss_clip": 1.23468399, + "balance_loss_mlp": 1.01638961, + "epoch": 0.9151961521118293, + "flos": 19363975804800.0, + "grad_norm": 1.7890905096433132, + "language_loss": 0.85015649, + "learning_rate": 7.491651557384692e-08, + "loss": 0.8744536, + "num_input_tokens_seen": 328359995, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18737793, + "step": 15222, + "time_per_iteration": 2.9546730518341064 + }, + { + "auxiliary_loss_clip": 0.01180534, + "auxiliary_loss_mlp": 0.01038183, + "balance_loss_clip": 1.09153342, + "balance_loss_mlp": 1.01357794, + "epoch": 0.9152562753644973, + "flos": 72178515060480.0, + "grad_norm": 0.7255830156947594, + "language_loss": 0.49695674, + "learning_rate": 7.481095595124953e-08, + "loss": 0.51914394, + "num_input_tokens_seen": 328426865, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.24609375, + "step": 15223, + "time_per_iteration": 3.3283181190490723 + }, + { + "auxiliary_loss_clip": 0.01400039, + "auxiliary_loss_mlp": 0.0103764, + "balance_loss_clip": 1.23942089, + "balance_loss_mlp": 1.0181973, + "epoch": 0.9153163986171652, + "flos": 20786721131520.0, + "grad_norm": 1.7284340160350067, + "language_loss": 0.73071706, + "learning_rate": 7.470546933201349e-08, + "loss": 0.75509393, + "num_input_tokens_seen": 328445970, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19445801, + "step": 15224, + "time_per_iteration": 2.932164430618286 + }, + { + "auxiliary_loss_clip": 0.01391663, + "auxiliary_loss_mlp": 0.01029575, + "balance_loss_clip": 1.23393464, + "balance_loss_mlp": 1.01065624, + "epoch": 0.9153765218698332, + "flos": 23050651380480.0, + "grad_norm": 1.7290092547202913, + "language_loss": 0.82962382, + "learning_rate": 7.460005572013895e-08, + "loss": 0.85383618, + "num_input_tokens_seen": 328464585, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18920898, + "step": 15225, + "time_per_iteration": 2.8880815505981445 + }, + { + "auxiliary_loss_clip": 0.01395322, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.23492122, + "balance_loss_mlp": 1.01021469, + "epoch": 0.9154366451225011, + "flos": 29003926648320.0, + "grad_norm": 1.3458988061239618, + "language_loss": 0.71296322, + "learning_rate": 7.44947151196238e-08, + "loss": 0.7372086, + "num_input_tokens_seen": 328490155, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18994141, + "step": 15226, + "time_per_iteration": 2.9552230834960938 + }, + { + "auxiliary_loss_clip": 0.01395599, + "auxiliary_loss_mlp": 0.0103546, + "balance_loss_clip": 1.23564053, + "balance_loss_mlp": 1.01658916, + "epoch": 0.9154967683751691, + "flos": 22319628842880.0, + "grad_norm": 1.836220480059665, + "language_loss": 0.75689638, + "learning_rate": 7.43894475344613e-08, + "loss": 0.78120697, + "num_input_tokens_seen": 328508275, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.18859863, + "step": 15227, + "time_per_iteration": 2.901482105255127 + }, + { + "auxiliary_loss_clip": 0.0138992, + "auxiliary_loss_mlp": 0.01032009, + "balance_loss_clip": 1.23240542, + "balance_loss_mlp": 1.01363873, + "epoch": 0.915556891627837, + "flos": 24582156503040.0, + "grad_norm": 1.5241746761710058, + "language_loss": 0.74552655, + "learning_rate": 7.428425296864404e-08, + "loss": 0.76974583, + "num_input_tokens_seen": 328529425, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18359375, + "step": 15228, + "time_per_iteration": 4.347513198852539 + }, + { + "auxiliary_loss_clip": 0.01387475, + "auxiliary_loss_mlp": 0.01035088, + "balance_loss_clip": 1.2290889, + "balance_loss_mlp": 1.01723015, + "epoch": 0.9156170148805051, + "flos": 22175053148160.0, + "grad_norm": 1.6539613488754643, + "language_loss": 0.73024607, + "learning_rate": 7.417913142616106e-08, + "loss": 0.75447166, + "num_input_tokens_seen": 328550200, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.1784668, + "step": 15229, + "time_per_iteration": 2.8836474418640137 + }, + { + "auxiliary_loss_clip": 0.0140134, + "auxiliary_loss_mlp": 0.01034795, + "balance_loss_clip": 1.24165702, + "balance_loss_mlp": 1.01469648, + "epoch": 0.915677138133173, + "flos": 20929803747840.0, + "grad_norm": 2.6500273046771956, + "language_loss": 0.83494943, + "learning_rate": 7.407408291099848e-08, + "loss": 0.85931075, + "num_input_tokens_seen": 328568540, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.2010498, + "step": 15230, + "time_per_iteration": 2.8780415058135986 + }, + { + "auxiliary_loss_clip": 0.01391572, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.23446465, + "balance_loss_mlp": 1.01293075, + "epoch": 0.915737261385841, + "flos": 24353823404160.0, + "grad_norm": 5.177584408598246, + "language_loss": 0.84522569, + "learning_rate": 7.396910742713957e-08, + "loss": 0.86945271, + "num_input_tokens_seen": 328587300, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18212891, + "step": 15231, + "time_per_iteration": 4.3860368728637695 + }, + { + "auxiliary_loss_clip": 0.01384939, + "auxiliary_loss_mlp": 0.01030314, + "balance_loss_clip": 1.22785962, + "balance_loss_mlp": 1.011729, + "epoch": 0.9157973846385089, + "flos": 26772826141440.0, + "grad_norm": 1.5161218934317031, + "language_loss": 0.73122156, + "learning_rate": 7.386420497856516e-08, + "loss": 0.75537407, + "num_input_tokens_seen": 328610055, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.18579102, + "step": 15232, + "time_per_iteration": 2.914599657058716 + }, + { + "auxiliary_loss_clip": 0.01404088, + "auxiliary_loss_mlp": 0.01031844, + "balance_loss_clip": 1.24310207, + "balance_loss_mlp": 1.0130806, + "epoch": 0.9158575078911769, + "flos": 18487925124480.0, + "grad_norm": 2.83464728369176, + "language_loss": 0.69468558, + "learning_rate": 7.375937556925338e-08, + "loss": 0.71904486, + "num_input_tokens_seen": 328626815, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.1875, + "step": 15233, + "time_per_iteration": 2.96437406539917 + }, + { + "auxiliary_loss_clip": 0.01404909, + "auxiliary_loss_mlp": 0.01034554, + "balance_loss_clip": 1.24230838, + "balance_loss_mlp": 1.01574326, + "epoch": 0.9159176311438448, + "flos": 21808976319360.0, + "grad_norm": 2.838109132012694, + "language_loss": 0.70397818, + "learning_rate": 7.365461920317861e-08, + "loss": 0.72837281, + "num_input_tokens_seen": 328643995, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.18811035, + "step": 15234, + "time_per_iteration": 2.8652570247650146 + }, + { + "auxiliary_loss_clip": 0.01418599, + "auxiliary_loss_mlp": 0.01035971, + "balance_loss_clip": 1.25548458, + "balance_loss_mlp": 1.01634955, + "epoch": 0.9159777543965129, + "flos": 24792753640320.0, + "grad_norm": 1.6169670913879444, + "language_loss": 0.88993752, + "learning_rate": 7.354993588431391e-08, + "loss": 0.91448325, + "num_input_tokens_seen": 328659565, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19628906, + "step": 15235, + "time_per_iteration": 2.9045631885528564 + }, + { + "auxiliary_loss_clip": 0.01402425, + "auxiliary_loss_mlp": 0.01032806, + "balance_loss_clip": 1.24099731, + "balance_loss_mlp": 1.01343489, + "epoch": 0.9160378776491809, + "flos": 26879685655680.0, + "grad_norm": 1.5793573189527998, + "language_loss": 0.77800471, + "learning_rate": 7.344532561662853e-08, + "loss": 0.80235702, + "num_input_tokens_seen": 328679045, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19372559, + "step": 15236, + "time_per_iteration": 4.329482316970825 + }, + { + "auxiliary_loss_clip": 0.01179041, + "auxiliary_loss_mlp": 0.01044327, + "balance_loss_clip": 1.09066677, + "balance_loss_mlp": 1.02115238, + "epoch": 0.9160980009018488, + "flos": 70609836695040.0, + "grad_norm": 0.6798137799792019, + "language_loss": 0.6226697, + "learning_rate": 7.334078840409019e-08, + "loss": 0.64490342, + "num_input_tokens_seen": 328744565, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.23144531, + "step": 15237, + "time_per_iteration": 3.379786968231201 + }, + { + "auxiliary_loss_clip": 0.01402409, + "auxiliary_loss_mlp": 0.01033431, + "balance_loss_clip": 1.24151325, + "balance_loss_mlp": 1.01383257, + "epoch": 0.9161581241545168, + "flos": 16297888913280.0, + "grad_norm": 2.320840338126538, + "language_loss": 0.75394815, + "learning_rate": 7.323632425066151e-08, + "loss": 0.77830648, + "num_input_tokens_seen": 328762455, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19604492, + "step": 15238, + "time_per_iteration": 4.271705627441406 + }, + { + "auxiliary_loss_clip": 0.01399089, + "auxiliary_loss_mlp": 0.01030642, + "balance_loss_clip": 1.23807669, + "balance_loss_mlp": 1.01193786, + "epoch": 0.9162182474071847, + "flos": 18446272646400.0, + "grad_norm": 1.9513793748587411, + "language_loss": 0.75684857, + "learning_rate": 7.313193316030464e-08, + "loss": 0.78114593, + "num_input_tokens_seen": 328780320, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18688965, + "step": 15239, + "time_per_iteration": 2.8524742126464844 + }, + { + "auxiliary_loss_clip": 0.01404081, + "auxiliary_loss_mlp": 0.01030128, + "balance_loss_clip": 1.24317431, + "balance_loss_mlp": 1.01044679, + "epoch": 0.9162783706598527, + "flos": 19175394902400.0, + "grad_norm": 2.173450935259786, + "language_loss": 0.64508617, + "learning_rate": 7.302761513697819e-08, + "loss": 0.66942823, + "num_input_tokens_seen": 328797570, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.19665527, + "step": 15240, + "time_per_iteration": 2.8394808769226074 + }, + { + "auxiliary_loss_clip": 0.01397683, + "auxiliary_loss_mlp": 0.01029767, + "balance_loss_clip": 1.2392664, + "balance_loss_mlp": 1.01118183, + "epoch": 0.9163384939125206, + "flos": 20422816053120.0, + "grad_norm": 1.8914261036838191, + "language_loss": 0.76891494, + "learning_rate": 7.292337018463746e-08, + "loss": 0.79318941, + "num_input_tokens_seen": 328814075, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18591309, + "step": 15241, + "time_per_iteration": 2.879179000854492 + }, + { + "auxiliary_loss_clip": 0.01446968, + "auxiliary_loss_mlp": 0.01034713, + "balance_loss_clip": 1.27562284, + "balance_loss_mlp": 1.01524651, + "epoch": 0.9163986171651887, + "flos": 19655298944640.0, + "grad_norm": 2.1484523815354515, + "language_loss": 0.68929458, + "learning_rate": 7.281919830723549e-08, + "loss": 0.71411133, + "num_input_tokens_seen": 328831990, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.19458008, + "step": 15242, + "time_per_iteration": 2.828090190887451 + }, + { + "auxiliary_loss_clip": 0.01394308, + "auxiliary_loss_mlp": 0.01036143, + "balance_loss_clip": 1.23405612, + "balance_loss_mlp": 1.01721263, + "epoch": 0.9164587404178566, + "flos": 12830678455680.0, + "grad_norm": 1.9119668292084162, + "language_loss": 0.8164196, + "learning_rate": 7.271509950872334e-08, + "loss": 0.84072411, + "num_input_tokens_seen": 328849105, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18933105, + "step": 15243, + "time_per_iteration": 2.8173904418945312 + }, + { + "auxiliary_loss_clip": 0.01416003, + "auxiliary_loss_mlp": 0.01037731, + "balance_loss_clip": 1.2515794, + "balance_loss_mlp": 1.01832378, + "epoch": 0.9165188636705246, + "flos": 22319493108480.0, + "grad_norm": 1.749825684113621, + "language_loss": 0.82132745, + "learning_rate": 7.261107379304721e-08, + "loss": 0.84586477, + "num_input_tokens_seen": 328866810, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.19396973, + "step": 15244, + "time_per_iteration": 2.8434932231903076 + }, + { + "auxiliary_loss_clip": 0.01405627, + "auxiliary_loss_mlp": 0.01035123, + "balance_loss_clip": 1.2409997, + "balance_loss_mlp": 1.01556039, + "epoch": 0.9165789869231925, + "flos": 18232463128320.0, + "grad_norm": 2.3362215547208622, + "language_loss": 0.74076176, + "learning_rate": 7.250712116415214e-08, + "loss": 0.76516926, + "num_input_tokens_seen": 328885325, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19555664, + "step": 15245, + "time_per_iteration": 2.8322644233703613 + }, + { + "auxiliary_loss_clip": 0.01399145, + "auxiliary_loss_mlp": 0.01037729, + "balance_loss_clip": 1.24039054, + "balance_loss_mlp": 1.01858425, + "epoch": 0.9166391101758605, + "flos": 13697544441600.0, + "grad_norm": 1.6092439558897738, + "language_loss": 0.75119019, + "learning_rate": 7.240324162598033e-08, + "loss": 0.77555889, + "num_input_tokens_seen": 328902655, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.19152832, + "step": 15246, + "time_per_iteration": 2.8385536670684814 + }, + { + "auxiliary_loss_clip": 0.01409548, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.24962354, + "balance_loss_mlp": 1.01371169, + "epoch": 0.9166992334285284, + "flos": 17355462307200.0, + "grad_norm": 1.9507248774964434, + "language_loss": 0.76316291, + "learning_rate": 7.229943518247106e-08, + "loss": 0.78759181, + "num_input_tokens_seen": 328918440, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.19616699, + "step": 15247, + "time_per_iteration": 2.8454172611236572 + }, + { + "auxiliary_loss_clip": 0.01410561, + "auxiliary_loss_mlp": 0.01032381, + "balance_loss_clip": 1.24789476, + "balance_loss_mlp": 1.01376009, + "epoch": 0.9167593566811965, + "flos": 23741288294400.0, + "grad_norm": 2.183755052310745, + "language_loss": 0.77202791, + "learning_rate": 7.219570183756052e-08, + "loss": 0.79645729, + "num_input_tokens_seen": 328938055, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.1862793, + "step": 15248, + "time_per_iteration": 2.894878625869751 + }, + { + "auxiliary_loss_clip": 0.01397584, + "auxiliary_loss_mlp": 0.01031569, + "balance_loss_clip": 1.23762751, + "balance_loss_mlp": 1.0124594, + "epoch": 0.9168194799338644, + "flos": 27829585128960.0, + "grad_norm": 4.531161110362229, + "language_loss": 0.75153553, + "learning_rate": 7.209204159518178e-08, + "loss": 0.77582705, + "num_input_tokens_seen": 328957895, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.19104004, + "step": 15249, + "time_per_iteration": 2.919281244277954 + }, + { + "auxiliary_loss_clip": 0.01393674, + "auxiliary_loss_mlp": 0.01033824, + "balance_loss_clip": 1.23410094, + "balance_loss_mlp": 1.01492965, + "epoch": 0.9168796031865324, + "flos": 21725399894400.0, + "grad_norm": 2.3967380342444784, + "language_loss": 0.7710495, + "learning_rate": 7.198845445926616e-08, + "loss": 0.79532444, + "num_input_tokens_seen": 328971365, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18896484, + "step": 15250, + "time_per_iteration": 2.834374189376831 + }, + { + "auxiliary_loss_clip": 0.01401145, + "auxiliary_loss_mlp": 0.01035296, + "balance_loss_clip": 1.24319077, + "balance_loss_mlp": 1.01553082, + "epoch": 0.9169397264392004, + "flos": 23414918417280.0, + "grad_norm": 1.7774202356039672, + "language_loss": 0.76405716, + "learning_rate": 7.188494043374138e-08, + "loss": 0.78842157, + "num_input_tokens_seen": 328990830, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.19763184, + "step": 15251, + "time_per_iteration": 2.8456451892852783 + }, + { + "auxiliary_loss_clip": 0.01397698, + "auxiliary_loss_mlp": 0.01032788, + "balance_loss_clip": 1.23895752, + "balance_loss_mlp": 1.01355958, + "epoch": 0.9169998496918683, + "flos": 23961432084480.0, + "grad_norm": 3.009396563248502, + "language_loss": 0.81657326, + "learning_rate": 7.178149952253298e-08, + "loss": 0.84087813, + "num_input_tokens_seen": 329008345, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.19238281, + "step": 15252, + "time_per_iteration": 2.892542839050293 + }, + { + "auxiliary_loss_clip": 0.01402147, + "auxiliary_loss_mlp": 0.0103078, + "balance_loss_clip": 1.24323201, + "balance_loss_mlp": 1.0128274, + "epoch": 0.9170599729445363, + "flos": 18341539637760.0, + "grad_norm": 1.5961268986143755, + "language_loss": 0.77709126, + "learning_rate": 7.167813172956316e-08, + "loss": 0.80142057, + "num_input_tokens_seen": 329027440, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.17956543, + "step": 15253, + "time_per_iteration": 2.8378188610076904 + }, + { + "auxiliary_loss_clip": 0.01398627, + "auxiliary_loss_mlp": 0.01030417, + "balance_loss_clip": 1.23789823, + "balance_loss_mlp": 1.01203513, + "epoch": 0.9171200961972042, + "flos": 22685207978880.0, + "grad_norm": 1.6807375618102127, + "language_loss": 0.73455888, + "learning_rate": 7.157483705875256e-08, + "loss": 0.75884932, + "num_input_tokens_seen": 329046445, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.18383789, + "step": 15254, + "time_per_iteration": 2.8554773330688477 + }, + { + "auxiliary_loss_clip": 0.01383032, + "auxiliary_loss_mlp": 0.01029784, + "balance_loss_clip": 1.22743821, + "balance_loss_mlp": 1.01166439, + "epoch": 0.9171802194498723, + "flos": 26729273381760.0, + "grad_norm": 1.6414741254078207, + "language_loss": 0.79448116, + "learning_rate": 7.14716155140167e-08, + "loss": 0.81860936, + "num_input_tokens_seen": 329065555, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.18115234, + "step": 15255, + "time_per_iteration": 2.8666257858276367 + }, + { + "auxiliary_loss_clip": 0.01398523, + "auxiliary_loss_mlp": 0.01035135, + "balance_loss_clip": 1.23806071, + "balance_loss_mlp": 1.0162518, + "epoch": 0.9172403427025402, + "flos": 37903489280640.0, + "grad_norm": 2.085737333673275, + "language_loss": 0.68951499, + "learning_rate": 7.136846709927047e-08, + "loss": 0.71385157, + "num_input_tokens_seen": 329087515, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18884277, + "step": 15256, + "time_per_iteration": 2.983759641647339 + }, + { + "auxiliary_loss_clip": 0.01398414, + "auxiliary_loss_mlp": 0.01033907, + "balance_loss_clip": 1.24101782, + "balance_loss_mlp": 1.01552498, + "epoch": 0.9173004659552082, + "flos": 17063822453760.0, + "grad_norm": 1.6006284203809102, + "language_loss": 0.84563029, + "learning_rate": 7.126539181842561e-08, + "loss": 0.86995345, + "num_input_tokens_seen": 329106820, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.18383789, + "step": 15257, + "time_per_iteration": 2.8749446868896484 + }, + { + "auxiliary_loss_clip": 0.01385674, + "auxiliary_loss_mlp": 0.01033805, + "balance_loss_clip": 1.22854161, + "balance_loss_mlp": 1.0164721, + "epoch": 0.9173605892078761, + "flos": 22212090656640.0, + "grad_norm": 1.6319944138628661, + "language_loss": 0.77818882, + "learning_rate": 7.116238967539012e-08, + "loss": 0.80238366, + "num_input_tokens_seen": 329126515, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.17333984, + "step": 15258, + "time_per_iteration": 2.9538776874542236 + }, + { + "auxiliary_loss_clip": 0.01389882, + "auxiliary_loss_mlp": 0.01034768, + "balance_loss_clip": 1.23217082, + "balance_loss_mlp": 1.0153724, + "epoch": 0.9174207124605441, + "flos": 16516268156160.0, + "grad_norm": 1.891269149287408, + "language_loss": 0.79738224, + "learning_rate": 7.105946067406999e-08, + "loss": 0.82162869, + "num_input_tokens_seen": 329142660, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.19372559, + "step": 15259, + "time_per_iteration": 2.874805212020874 + }, + { + "auxiliary_loss_clip": 0.01396367, + "auxiliary_loss_mlp": 0.01035396, + "balance_loss_clip": 1.23864317, + "balance_loss_mlp": 1.01658475, + "epoch": 0.917480835713212, + "flos": 24546431093760.0, + "grad_norm": 1.5500452088547994, + "language_loss": 0.77056825, + "learning_rate": 7.095660481836895e-08, + "loss": 0.79488587, + "num_input_tokens_seen": 329162575, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18811035, + "step": 15260, + "time_per_iteration": 2.907242774963379 + }, + { + "auxiliary_loss_clip": 0.01396218, + "auxiliary_loss_mlp": 0.01032196, + "balance_loss_clip": 1.23721576, + "balance_loss_mlp": 1.01313448, + "epoch": 0.9175409589658801, + "flos": 20888965676160.0, + "grad_norm": 1.5740382774989974, + "language_loss": 0.61427039, + "learning_rate": 7.085382211218637e-08, + "loss": 0.63855457, + "num_input_tokens_seen": 329182090, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19067383, + "step": 15261, + "time_per_iteration": 2.9197559356689453 + }, + { + "auxiliary_loss_clip": 0.01389561, + "auxiliary_loss_mlp": 0.01030852, + "balance_loss_clip": 1.23164701, + "balance_loss_mlp": 1.01252961, + "epoch": 0.917601082218548, + "flos": 14282362471680.0, + "grad_norm": 2.235580347357797, + "language_loss": 0.74440044, + "learning_rate": 7.075111255942002e-08, + "loss": 0.76860458, + "num_input_tokens_seen": 329196535, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18322754, + "step": 15262, + "time_per_iteration": 2.881556987762451 + }, + { + "auxiliary_loss_clip": 0.01408114, + "auxiliary_loss_mlp": 0.0103572, + "balance_loss_clip": 1.24440193, + "balance_loss_mlp": 1.01651502, + "epoch": 0.917661205471216, + "flos": 19108649543040.0, + "grad_norm": 1.7819490562048943, + "language_loss": 0.78428102, + "learning_rate": 7.064847616396496e-08, + "loss": 0.8087194, + "num_input_tokens_seen": 329215135, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19213867, + "step": 15263, + "time_per_iteration": 4.351459741592407 + }, + { + "auxiliary_loss_clip": 0.0140837, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.24366415, + "balance_loss_mlp": 1.01452374, + "epoch": 0.917721328723884, + "flos": 21116665347840.0, + "grad_norm": 1.9015875950070904, + "language_loss": 0.7616564, + "learning_rate": 7.054591292971324e-08, + "loss": 0.78608, + "num_input_tokens_seen": 329235150, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19458008, + "step": 15264, + "time_per_iteration": 2.833146333694458 + }, + { + "auxiliary_loss_clip": 0.01411487, + "auxiliary_loss_mlp": 0.0103555, + "balance_loss_clip": 1.25110173, + "balance_loss_mlp": 1.01673853, + "epoch": 0.9177814519765519, + "flos": 21952873342080.0, + "grad_norm": 2.214711130837956, + "language_loss": 0.837551, + "learning_rate": 7.044342286055394e-08, + "loss": 0.86202139, + "num_input_tokens_seen": 329254365, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18798828, + "step": 15265, + "time_per_iteration": 2.8300843238830566 + }, + { + "auxiliary_loss_clip": 0.01406411, + "auxiliary_loss_mlp": 0.01040348, + "balance_loss_clip": 1.2418642, + "balance_loss_mlp": 1.02022529, + "epoch": 0.9178415752292199, + "flos": 24216396387840.0, + "grad_norm": 1.6972012248836394, + "language_loss": 0.73735523, + "learning_rate": 7.034100596037306e-08, + "loss": 0.76182282, + "num_input_tokens_seen": 329274385, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.20117188, + "step": 15266, + "time_per_iteration": 4.385447025299072 + }, + { + "auxiliary_loss_clip": 0.01400246, + "auxiliary_loss_mlp": 0.01031282, + "balance_loss_clip": 1.2395885, + "balance_loss_mlp": 1.01304257, + "epoch": 0.9179016984818879, + "flos": 20050223973120.0, + "grad_norm": 1.942922273062093, + "language_loss": 0.78596312, + "learning_rate": 7.023866223305486e-08, + "loss": 0.81027836, + "num_input_tokens_seen": 329292160, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18249512, + "step": 15267, + "time_per_iteration": 2.8551549911499023 + }, + { + "auxiliary_loss_clip": 0.01180928, + "auxiliary_loss_mlp": 0.01035854, + "balance_loss_clip": 1.09148073, + "balance_loss_mlp": 1.01353765, + "epoch": 0.9179618217345559, + "flos": 65589512083200.0, + "grad_norm": 0.7379390295632065, + "language_loss": 0.5631333, + "learning_rate": 7.013639168247975e-08, + "loss": 0.5853011, + "num_input_tokens_seen": 329351870, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.22363281, + "step": 15268, + "time_per_iteration": 3.3699913024902344 + }, + { + "auxiliary_loss_clip": 0.01406443, + "auxiliary_loss_mlp": 0.01029969, + "balance_loss_clip": 1.24629223, + "balance_loss_mlp": 1.01076424, + "epoch": 0.9180219449872238, + "flos": 21334546897920.0, + "grad_norm": 2.0181412730517976, + "language_loss": 0.77724314, + "learning_rate": 7.0034194312526e-08, + "loss": 0.80160725, + "num_input_tokens_seen": 329370930, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19189453, + "step": 15269, + "time_per_iteration": 2.85247540473938 + }, + { + "auxiliary_loss_clip": 0.01392252, + "auxiliary_loss_mlp": 0.01036137, + "balance_loss_clip": 1.23407793, + "balance_loss_mlp": 1.01676559, + "epoch": 0.9180820682398918, + "flos": 41074173446400.0, + "grad_norm": 2.0372050232013366, + "language_loss": 0.73161149, + "learning_rate": 6.993207012706936e-08, + "loss": 0.75589538, + "num_input_tokens_seen": 329391275, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.19384766, + "step": 15270, + "time_per_iteration": 3.012798309326172 + }, + { + "auxiliary_loss_clip": 0.01387745, + "auxiliary_loss_mlp": 0.01031595, + "balance_loss_clip": 1.23022485, + "balance_loss_mlp": 1.01138937, + "epoch": 0.9181421914925597, + "flos": 28084187473920.0, + "grad_norm": 1.6114798640713504, + "language_loss": 0.80779159, + "learning_rate": 6.98300191299821e-08, + "loss": 0.831985, + "num_input_tokens_seen": 329412775, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.20214844, + "step": 15271, + "time_per_iteration": 2.9160687923431396 + }, + { + "auxiliary_loss_clip": 0.01394313, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.23366344, + "balance_loss_mlp": 1.0128262, + "epoch": 0.9182023147452277, + "flos": 29181467819520.0, + "grad_norm": 1.84591993919902, + "language_loss": 0.73084104, + "learning_rate": 6.972804132513355e-08, + "loss": 0.75509596, + "num_input_tokens_seen": 329432440, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18359375, + "step": 15272, + "time_per_iteration": 4.304319381713867 + }, + { + "auxiliary_loss_clip": 0.01391826, + "auxiliary_loss_mlp": 0.01035912, + "balance_loss_clip": 1.23236036, + "balance_loss_mlp": 1.0179708, + "epoch": 0.9182624379978956, + "flos": 24071413489920.0, + "grad_norm": 2.1113965447273326, + "language_loss": 0.7304498, + "learning_rate": 6.962613671639105e-08, + "loss": 0.75472713, + "num_input_tokens_seen": 329450605, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.17944336, + "step": 15273, + "time_per_iteration": 4.430824279785156 + }, + { + "auxiliary_loss_clip": 0.01376931, + "auxiliary_loss_mlp": 0.01027007, + "balance_loss_clip": 1.22235179, + "balance_loss_mlp": 1.00937581, + "epoch": 0.9183225612505637, + "flos": 23303624912640.0, + "grad_norm": 1.4521223189260974, + "language_loss": 0.748649, + "learning_rate": 6.952430530761933e-08, + "loss": 0.77268839, + "num_input_tokens_seen": 329470550, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.17626953, + "step": 15274, + "time_per_iteration": 2.88554048538208 + }, + { + "auxiliary_loss_clip": 0.01402697, + "auxiliary_loss_mlp": 0.01032161, + "balance_loss_clip": 1.24138105, + "balance_loss_mlp": 1.01367164, + "epoch": 0.9183826845032316, + "flos": 19618578149760.0, + "grad_norm": 2.0244418396424284, + "language_loss": 0.69935906, + "learning_rate": 6.942254710267902e-08, + "loss": 0.72370768, + "num_input_tokens_seen": 329489765, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18493652, + "step": 15275, + "time_per_iteration": 2.8554248809814453 + }, + { + "auxiliary_loss_clip": 0.0139319, + "auxiliary_loss_mlp": 0.01026372, + "balance_loss_clip": 1.23456454, + "balance_loss_mlp": 1.00782311, + "epoch": 0.9184428077558996, + "flos": 18488332327680.0, + "grad_norm": 2.005094974271042, + "language_loss": 0.73402941, + "learning_rate": 6.932086210542953e-08, + "loss": 0.75822502, + "num_input_tokens_seen": 329507040, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18554688, + "step": 15276, + "time_per_iteration": 2.8442280292510986 + }, + { + "auxiliary_loss_clip": 0.01402173, + "auxiliary_loss_mlp": 0.01034297, + "balance_loss_clip": 1.24266255, + "balance_loss_mlp": 1.01602221, + "epoch": 0.9185029310085676, + "flos": 20750859987840.0, + "grad_norm": 1.6455978030338418, + "language_loss": 0.74247682, + "learning_rate": 6.921925031972642e-08, + "loss": 0.76684153, + "num_input_tokens_seen": 329525540, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18273926, + "step": 15277, + "time_per_iteration": 2.844045877456665 + }, + { + "auxiliary_loss_clip": 0.01179418, + "auxiliary_loss_mlp": 0.01045979, + "balance_loss_clip": 1.09206343, + "balance_loss_mlp": 1.02499866, + "epoch": 0.9185630542612355, + "flos": 68240901968640.0, + "grad_norm": 0.7199013596637794, + "language_loss": 0.59216583, + "learning_rate": 6.91177117494226e-08, + "loss": 0.61441976, + "num_input_tokens_seen": 329592905, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.20996094, + "step": 15278, + "time_per_iteration": 3.428173065185547 + }, + { + "auxiliary_loss_clip": 0.01378799, + "auxiliary_loss_mlp": 0.01033158, + "balance_loss_clip": 1.22299218, + "balance_loss_mlp": 1.01515722, + "epoch": 0.9186231775139035, + "flos": 12246991545600.0, + "grad_norm": 1.7742288904641985, + "language_loss": 0.64697689, + "learning_rate": 6.901624639836879e-08, + "loss": 0.67109644, + "num_input_tokens_seen": 329610150, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.18005371, + "step": 15279, + "time_per_iteration": 2.8651609420776367 + }, + { + "auxiliary_loss_clip": 0.01174464, + "auxiliary_loss_mlp": 0.01032188, + "balance_loss_clip": 1.0893631, + "balance_loss_mlp": 1.0081557, + "epoch": 0.9186833007665715, + "flos": 63969498852480.0, + "grad_norm": 0.8634388105783173, + "language_loss": 0.60188365, + "learning_rate": 6.891485427041211e-08, + "loss": 0.62395012, + "num_input_tokens_seen": 329673650, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.24023438, + "step": 15280, + "time_per_iteration": 3.250854969024658 + }, + { + "auxiliary_loss_clip": 0.01404718, + "auxiliary_loss_mlp": 0.01034431, + "balance_loss_clip": 1.24344516, + "balance_loss_mlp": 1.01442814, + "epoch": 0.9187434240192395, + "flos": 19984293020160.0, + "grad_norm": 1.6837797886922972, + "language_loss": 0.7071318, + "learning_rate": 6.881353536939815e-08, + "loss": 0.73152328, + "num_input_tokens_seen": 329692520, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.20007324, + "step": 15281, + "time_per_iteration": 2.8851330280303955 + }, + { + "auxiliary_loss_clip": 0.01395424, + "auxiliary_loss_mlp": 0.01034253, + "balance_loss_clip": 1.23548627, + "balance_loss_mlp": 1.01482201, + "epoch": 0.9188035472719074, + "flos": 25238561086080.0, + "grad_norm": 1.927065472783901, + "language_loss": 0.8534739, + "learning_rate": 6.871228969916831e-08, + "loss": 0.87777066, + "num_input_tokens_seen": 329713750, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19433594, + "step": 15282, + "time_per_iteration": 2.8666088581085205 + }, + { + "auxiliary_loss_clip": 0.01400906, + "auxiliary_loss_mlp": 0.01034024, + "balance_loss_clip": 1.24289918, + "balance_loss_mlp": 1.01485467, + "epoch": 0.9188636705245754, + "flos": 18414664513920.0, + "grad_norm": 1.8234078124676185, + "language_loss": 0.6075381, + "learning_rate": 6.861111726356194e-08, + "loss": 0.63188744, + "num_input_tokens_seen": 329730960, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.19165039, + "step": 15283, + "time_per_iteration": 2.823317766189575 + }, + { + "auxiliary_loss_clip": 0.01414616, + "auxiliary_loss_mlp": 0.01031252, + "balance_loss_clip": 1.2499404, + "balance_loss_mlp": 1.01256013, + "epoch": 0.9189237937772433, + "flos": 23779683146880.0, + "grad_norm": 1.4933247987533482, + "language_loss": 0.6619274, + "learning_rate": 6.851001806641554e-08, + "loss": 0.68638611, + "num_input_tokens_seen": 329750975, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.18688965, + "step": 15284, + "time_per_iteration": 2.8839762210845947 + }, + { + "auxiliary_loss_clip": 0.01396213, + "auxiliary_loss_mlp": 0.01033153, + "balance_loss_clip": 1.23690844, + "balance_loss_mlp": 1.01359105, + "epoch": 0.9189839170299113, + "flos": 21224384513280.0, + "grad_norm": 1.8281059050128534, + "language_loss": 0.74020463, + "learning_rate": 6.840899211156292e-08, + "loss": 0.76449829, + "num_input_tokens_seen": 329769645, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19567871, + "step": 15285, + "time_per_iteration": 2.85089373588562 + }, + { + "auxiliary_loss_clip": 0.01384522, + "auxiliary_loss_mlp": 0.01034428, + "balance_loss_clip": 1.22667122, + "balance_loss_mlp": 1.01533043, + "epoch": 0.9190440402825792, + "flos": 16736095232640.0, + "grad_norm": 2.54085811877475, + "language_loss": 0.72536325, + "learning_rate": 6.830803940283458e-08, + "loss": 0.74955273, + "num_input_tokens_seen": 329788185, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.19091797, + "step": 15286, + "time_per_iteration": 2.834083080291748 + }, + { + "auxiliary_loss_clip": 0.01397118, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.23796725, + "balance_loss_mlp": 1.01683378, + "epoch": 0.9191041635352473, + "flos": 23451774946560.0, + "grad_norm": 1.6281771963018994, + "language_loss": 0.73922282, + "learning_rate": 6.820715994405945e-08, + "loss": 0.7635538, + "num_input_tokens_seen": 329806780, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.19152832, + "step": 15287, + "time_per_iteration": 2.8573148250579834 + }, + { + "auxiliary_loss_clip": 0.01402624, + "auxiliary_loss_mlp": 0.01034099, + "balance_loss_clip": 1.24133515, + "balance_loss_mlp": 1.01315379, + "epoch": 0.9191642867879152, + "flos": 18816919200000.0, + "grad_norm": 2.3162578518735604, + "language_loss": 0.65993357, + "learning_rate": 6.810635373906226e-08, + "loss": 0.68430078, + "num_input_tokens_seen": 329826350, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.20935059, + "step": 15288, + "time_per_iteration": 2.84517240524292 + }, + { + "auxiliary_loss_clip": 0.01399247, + "auxiliary_loss_mlp": 0.010354, + "balance_loss_clip": 1.23905325, + "balance_loss_mlp": 1.01584935, + "epoch": 0.9192244100405832, + "flos": 32173705918080.0, + "grad_norm": 1.8014855117846167, + "language_loss": 0.71313477, + "learning_rate": 6.800562079166549e-08, + "loss": 0.73748124, + "num_input_tokens_seen": 329846160, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19555664, + "step": 15289, + "time_per_iteration": 2.9161102771759033 + }, + { + "auxiliary_loss_clip": 0.01402855, + "auxiliary_loss_mlp": 0.01035657, + "balance_loss_clip": 1.24335933, + "balance_loss_mlp": 1.0163573, + "epoch": 0.9192845332932512, + "flos": 16365041475840.0, + "grad_norm": 2.0209914166034193, + "language_loss": 0.75622904, + "learning_rate": 6.790496110568921e-08, + "loss": 0.78061414, + "num_input_tokens_seen": 329862020, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19287109, + "step": 15290, + "time_per_iteration": 2.813507556915283 + }, + { + "auxiliary_loss_clip": 0.01389215, + "auxiliary_loss_mlp": 0.01029615, + "balance_loss_clip": 1.23207998, + "balance_loss_mlp": 1.01093459, + "epoch": 0.9193446565459191, + "flos": 26625626248320.0, + "grad_norm": 2.3585322575054963, + "language_loss": 0.7278735, + "learning_rate": 6.78043746849506e-08, + "loss": 0.75206184, + "num_input_tokens_seen": 329880185, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.18664551, + "step": 15291, + "time_per_iteration": 2.8639259338378906 + }, + { + "auxiliary_loss_clip": 0.01390961, + "auxiliary_loss_mlp": 0.01031087, + "balance_loss_clip": 1.23389137, + "balance_loss_mlp": 1.0130384, + "epoch": 0.9194047797985871, + "flos": 22502418410880.0, + "grad_norm": 1.7731809954041826, + "language_loss": 0.71079147, + "learning_rate": 6.770386153326346e-08, + "loss": 0.73501194, + "num_input_tokens_seen": 329900255, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.18066406, + "step": 15292, + "time_per_iteration": 2.9565138816833496 + }, + { + "auxiliary_loss_clip": 0.0140137, + "auxiliary_loss_mlp": 0.01033831, + "balance_loss_clip": 1.23963523, + "balance_loss_mlp": 1.01401806, + "epoch": 0.9194649030512551, + "flos": 25088691749760.0, + "grad_norm": 1.8251744884155028, + "language_loss": 0.7383492, + "learning_rate": 6.760342165443988e-08, + "loss": 0.76270121, + "num_input_tokens_seen": 329919095, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19812012, + "step": 15293, + "time_per_iteration": 2.857969045639038 + }, + { + "auxiliary_loss_clip": 0.01400829, + "auxiliary_loss_mlp": 0.01033505, + "balance_loss_clip": 1.24204659, + "balance_loss_mlp": 1.01503956, + "epoch": 0.9195250263039231, + "flos": 11918585652480.0, + "grad_norm": 1.9706265175302111, + "language_loss": 0.78872132, + "learning_rate": 6.750305505228837e-08, + "loss": 0.81306463, + "num_input_tokens_seen": 329936505, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18481445, + "step": 15294, + "time_per_iteration": 2.8027405738830566 + }, + { + "auxiliary_loss_clip": 0.01406238, + "auxiliary_loss_mlp": 0.01037874, + "balance_loss_clip": 1.2435472, + "balance_loss_mlp": 1.01747787, + "epoch": 0.919585149556591, + "flos": 21843887322240.0, + "grad_norm": 1.5268534553284165, + "language_loss": 0.77810574, + "learning_rate": 6.74027617306141e-08, + "loss": 0.80254686, + "num_input_tokens_seen": 329956795, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.20397949, + "step": 15295, + "time_per_iteration": 2.8774046897888184 + }, + { + "auxiliary_loss_clip": 0.01390831, + "auxiliary_loss_mlp": 0.01029461, + "balance_loss_clip": 1.23718131, + "balance_loss_mlp": 1.0119493, + "epoch": 0.919645272809259, + "flos": 28195797692160.0, + "grad_norm": 2.023914655681007, + "language_loss": 0.72710109, + "learning_rate": 6.730254169322114e-08, + "loss": 0.75130397, + "num_input_tokens_seen": 329977195, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.17504883, + "step": 15296, + "time_per_iteration": 2.907451629638672 + }, + { + "auxiliary_loss_clip": 0.01383508, + "auxiliary_loss_mlp": 0.01035889, + "balance_loss_clip": 1.22660422, + "balance_loss_mlp": 1.01641011, + "epoch": 0.9197053960619269, + "flos": 18341992085760.0, + "grad_norm": 4.000792272020894, + "language_loss": 0.7533083, + "learning_rate": 6.720239494390912e-08, + "loss": 0.77750224, + "num_input_tokens_seen": 329992095, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.19482422, + "step": 15297, + "time_per_iteration": 2.8440535068511963 + }, + { + "auxiliary_loss_clip": 0.01388015, + "auxiliary_loss_mlp": 0.01033963, + "balance_loss_clip": 1.23008966, + "balance_loss_mlp": 1.01478195, + "epoch": 0.9197655193145949, + "flos": 28195028530560.0, + "grad_norm": 5.777005437320414, + "language_loss": 0.74556583, + "learning_rate": 6.710232148647676e-08, + "loss": 0.76978558, + "num_input_tokens_seen": 330011490, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.19189453, + "step": 15298, + "time_per_iteration": 4.380703926086426 + }, + { + "auxiliary_loss_clip": 0.01407376, + "auxiliary_loss_mlp": 0.01034792, + "balance_loss_clip": 1.24527979, + "balance_loss_mlp": 1.0152657, + "epoch": 0.9198256425672628, + "flos": 17313945563520.0, + "grad_norm": 1.9270295545195613, + "language_loss": 0.80304468, + "learning_rate": 6.70023213247175e-08, + "loss": 0.82746637, + "num_input_tokens_seen": 330027885, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.1953125, + "step": 15299, + "time_per_iteration": 2.795534133911133 + }, + { + "auxiliary_loss_clip": 0.0139699, + "auxiliary_loss_mlp": 0.01033383, + "balance_loss_clip": 1.23807442, + "balance_loss_mlp": 1.01423764, + "epoch": 0.9198857658199309, + "flos": 17867562664320.0, + "grad_norm": 2.1494109768106493, + "language_loss": 0.64817882, + "learning_rate": 6.690239446242385e-08, + "loss": 0.67248249, + "num_input_tokens_seen": 330046230, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19152832, + "step": 15300, + "time_per_iteration": 2.828676223754883 + }, + { + "auxiliary_loss_clip": 0.01373378, + "auxiliary_loss_mlp": 0.01028738, + "balance_loss_clip": 1.22077, + "balance_loss_mlp": 1.01170313, + "epoch": 0.9199458890725988, + "flos": 22137744170880.0, + "grad_norm": 1.7115546422966972, + "language_loss": 0.70440352, + "learning_rate": 6.680254090338545e-08, + "loss": 0.72842467, + "num_input_tokens_seen": 330065535, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.17041016, + "step": 15301, + "time_per_iteration": 4.321869850158691 + }, + { + "auxiliary_loss_clip": 0.01406558, + "auxiliary_loss_mlp": 0.01034371, + "balance_loss_clip": 1.24580359, + "balance_loss_mlp": 1.01464176, + "epoch": 0.9200060123252668, + "flos": 16042562651520.0, + "grad_norm": 1.6865449254505391, + "language_loss": 0.71326482, + "learning_rate": 6.670276065138814e-08, + "loss": 0.73767412, + "num_input_tokens_seen": 330082920, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19726562, + "step": 15302, + "time_per_iteration": 2.9079232215881348 + }, + { + "auxiliary_loss_clip": 0.01404836, + "auxiliary_loss_mlp": 0.01032434, + "balance_loss_clip": 1.24285793, + "balance_loss_mlp": 1.01481497, + "epoch": 0.9200661355779348, + "flos": 26874346769280.0, + "grad_norm": 1.634503676530838, + "language_loss": 0.77603257, + "learning_rate": 6.660305371021579e-08, + "loss": 0.80040526, + "num_input_tokens_seen": 330101165, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.17614746, + "step": 15303, + "time_per_iteration": 2.8875041007995605 + }, + { + "auxiliary_loss_clip": 0.01406949, + "auxiliary_loss_mlp": 0.01034356, + "balance_loss_clip": 1.24882007, + "balance_loss_mlp": 1.01572311, + "epoch": 0.9201262588306027, + "flos": 12794410108800.0, + "grad_norm": 1.978585718842766, + "language_loss": 0.88868046, + "learning_rate": 6.650342008365006e-08, + "loss": 0.91309345, + "num_input_tokens_seen": 330118775, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.18640137, + "step": 15304, + "time_per_iteration": 2.7883200645446777 + }, + { + "auxiliary_loss_clip": 0.01413675, + "auxiliary_loss_mlp": 0.0103522, + "balance_loss_clip": 1.25063491, + "balance_loss_mlp": 1.01493096, + "epoch": 0.9201863820832707, + "flos": 20641421520000.0, + "grad_norm": 1.9700693556251907, + "language_loss": 0.78336072, + "learning_rate": 6.64038597754677e-08, + "loss": 0.80784965, + "num_input_tokens_seen": 330135570, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.20288086, + "step": 15305, + "time_per_iteration": 2.8257133960723877 + }, + { + "auxiliary_loss_clip": 0.0139016, + "auxiliary_loss_mlp": 0.01032648, + "balance_loss_clip": 1.23075187, + "balance_loss_mlp": 1.01411128, + "epoch": 0.9202465053359387, + "flos": 26406975536640.0, + "grad_norm": 2.0912529366472334, + "language_loss": 0.82770836, + "learning_rate": 6.630437278944501e-08, + "loss": 0.85193646, + "num_input_tokens_seen": 330152840, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.1854248, + "step": 15306, + "time_per_iteration": 2.8687961101531982 + }, + { + "auxiliary_loss_clip": 0.01393717, + "auxiliary_loss_mlp": 0.0102913, + "balance_loss_clip": 1.23629308, + "balance_loss_mlp": 1.01165378, + "epoch": 0.9203066285886067, + "flos": 10495252143360.0, + "grad_norm": 2.187629368673581, + "language_loss": 0.72916347, + "learning_rate": 6.62049591293541e-08, + "loss": 0.75339198, + "num_input_tokens_seen": 330168605, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.17492676, + "step": 15307, + "time_per_iteration": 4.2179882526397705 + }, + { + "auxiliary_loss_clip": 0.01410148, + "auxiliary_loss_mlp": 0.0103414, + "balance_loss_clip": 1.2479744, + "balance_loss_mlp": 1.01193106, + "epoch": 0.9203667518412746, + "flos": 19400108417280.0, + "grad_norm": 1.8215912320509062, + "language_loss": 0.79377413, + "learning_rate": 6.610561879896526e-08, + "loss": 0.81821704, + "num_input_tokens_seen": 330186160, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.2220459, + "step": 15308, + "time_per_iteration": 4.203775405883789 + }, + { + "auxiliary_loss_clip": 0.01393303, + "auxiliary_loss_mlp": 0.01032377, + "balance_loss_clip": 1.23462212, + "balance_loss_mlp": 1.0133512, + "epoch": 0.9204268750939426, + "flos": 15933576631680.0, + "grad_norm": 1.809850349557736, + "language_loss": 0.79196882, + "learning_rate": 6.600635180204484e-08, + "loss": 0.81622571, + "num_input_tokens_seen": 330201780, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.19018555, + "step": 15309, + "time_per_iteration": 2.7997732162475586 + }, + { + "auxiliary_loss_clip": 0.01396466, + "auxiliary_loss_mlp": 0.01030772, + "balance_loss_clip": 1.23608994, + "balance_loss_mlp": 1.01203251, + "epoch": 0.9204869983466105, + "flos": 16480361767680.0, + "grad_norm": 2.9117239600965066, + "language_loss": 0.66949254, + "learning_rate": 6.590715814235781e-08, + "loss": 0.69376493, + "num_input_tokens_seen": 330219165, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18762207, + "step": 15310, + "time_per_iteration": 2.8129992485046387 + }, + { + "auxiliary_loss_clip": 0.01395815, + "auxiliary_loss_mlp": 0.01030835, + "balance_loss_clip": 1.23536754, + "balance_loss_mlp": 1.01155865, + "epoch": 0.9205471215992785, + "flos": 21548084947200.0, + "grad_norm": 2.532308133573014, + "language_loss": 0.66495693, + "learning_rate": 6.580803782366495e-08, + "loss": 0.68922341, + "num_input_tokens_seen": 330238975, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19274902, + "step": 15311, + "time_per_iteration": 2.886121988296509 + }, + { + "auxiliary_loss_clip": 0.01394537, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.23428428, + "balance_loss_mlp": 1.01600623, + "epoch": 0.9206072448519464, + "flos": 25015793097600.0, + "grad_norm": 1.9256586188559217, + "language_loss": 0.7695421, + "learning_rate": 6.570899084972503e-08, + "loss": 0.79383349, + "num_input_tokens_seen": 330259755, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18603516, + "step": 15312, + "time_per_iteration": 2.8597910404205322 + }, + { + "auxiliary_loss_clip": 0.01389649, + "auxiliary_loss_mlp": 0.01034969, + "balance_loss_clip": 1.23451793, + "balance_loss_mlp": 1.01632452, + "epoch": 0.9206673681046145, + "flos": 20532571234560.0, + "grad_norm": 1.669277366692225, + "language_loss": 0.79150736, + "learning_rate": 6.561001722429394e-08, + "loss": 0.81575358, + "num_input_tokens_seen": 330277660, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.18640137, + "step": 15313, + "time_per_iteration": 2.8319849967956543 + }, + { + "auxiliary_loss_clip": 0.01399445, + "auxiliary_loss_mlp": 0.01032914, + "balance_loss_clip": 1.23893785, + "balance_loss_mlp": 1.01318479, + "epoch": 0.9207274913572824, + "flos": 20892404280960.0, + "grad_norm": 1.8155330192538022, + "language_loss": 0.79307544, + "learning_rate": 6.55111169511251e-08, + "loss": 0.81739897, + "num_input_tokens_seen": 330295455, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19726562, + "step": 15314, + "time_per_iteration": 2.8369131088256836 + }, + { + "auxiliary_loss_clip": 0.01421863, + "auxiliary_loss_mlp": 0.01034369, + "balance_loss_clip": 1.25551295, + "balance_loss_mlp": 1.01462817, + "epoch": 0.9207876146099504, + "flos": 22717811496960.0, + "grad_norm": 1.67903664927146, + "language_loss": 0.79631811, + "learning_rate": 6.541229003396864e-08, + "loss": 0.82088041, + "num_input_tokens_seen": 330315310, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.1973877, + "step": 15315, + "time_per_iteration": 2.8352620601654053 + }, + { + "auxiliary_loss_clip": 0.01419116, + "auxiliary_loss_mlp": 0.01036209, + "balance_loss_clip": 1.25421119, + "balance_loss_mlp": 1.01667094, + "epoch": 0.9208477378626184, + "flos": 18514511084160.0, + "grad_norm": 1.8000234294543411, + "language_loss": 0.77446598, + "learning_rate": 6.531353647657156e-08, + "loss": 0.79901928, + "num_input_tokens_seen": 330333260, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.19543457, + "step": 15316, + "time_per_iteration": 2.8204267024993896 + }, + { + "auxiliary_loss_clip": 0.01406089, + "auxiliary_loss_mlp": 0.01037294, + "balance_loss_clip": 1.24477434, + "balance_loss_mlp": 1.01688528, + "epoch": 0.9209078611152863, + "flos": 23009134636800.0, + "grad_norm": 1.6798810912931423, + "language_loss": 0.69579732, + "learning_rate": 6.521485628267931e-08, + "loss": 0.72023118, + "num_input_tokens_seen": 330352465, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.20397949, + "step": 15317, + "time_per_iteration": 2.955392360687256 + }, + { + "auxiliary_loss_clip": 0.01395646, + "auxiliary_loss_mlp": 0.01033292, + "balance_loss_clip": 1.2366749, + "balance_loss_mlp": 1.01387322, + "epoch": 0.9209679843679544, + "flos": 24072273141120.0, + "grad_norm": 1.6508630409571223, + "language_loss": 0.84230673, + "learning_rate": 6.511624945603378e-08, + "loss": 0.8665961, + "num_input_tokens_seen": 330372685, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.19421387, + "step": 15318, + "time_per_iteration": 2.843653678894043 + }, + { + "auxiliary_loss_clip": 0.01394256, + "auxiliary_loss_mlp": 0.01034419, + "balance_loss_clip": 1.23512042, + "balance_loss_mlp": 1.01495218, + "epoch": 0.9210281076206223, + "flos": 13561339034880.0, + "grad_norm": 2.4305603902380173, + "language_loss": 0.86611187, + "learning_rate": 6.501771600037354e-08, + "loss": 0.89039862, + "num_input_tokens_seen": 330388860, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.19470215, + "step": 15319, + "time_per_iteration": 2.7987968921661377 + }, + { + "auxiliary_loss_clip": 0.01174701, + "auxiliary_loss_mlp": 0.01020339, + "balance_loss_clip": 1.08829403, + "balance_loss_mlp": 1.00260031, + "epoch": 0.9210882308732903, + "flos": 71460432535680.0, + "grad_norm": 0.769243309131967, + "language_loss": 0.5623067, + "learning_rate": 6.491925591943559e-08, + "loss": 0.58425707, + "num_input_tokens_seen": 330448735, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.17773438, + "step": 15320, + "time_per_iteration": 3.373079299926758 + }, + { + "auxiliary_loss_clip": 0.01403359, + "auxiliary_loss_mlp": 0.01034174, + "balance_loss_clip": 1.23866343, + "balance_loss_mlp": 1.0136106, + "epoch": 0.9211483541259582, + "flos": 18516728079360.0, + "grad_norm": 2.1756345412580926, + "language_loss": 0.64986277, + "learning_rate": 6.482086921695384e-08, + "loss": 0.67423815, + "num_input_tokens_seen": 330465600, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.20568848, + "step": 15321, + "time_per_iteration": 2.8464059829711914 + }, + { + "auxiliary_loss_clip": 0.01379786, + "auxiliary_loss_mlp": 0.01029262, + "balance_loss_clip": 1.2266494, + "balance_loss_mlp": 1.01078475, + "epoch": 0.9212084773786262, + "flos": 23268623420160.0, + "grad_norm": 1.4266917066165652, + "language_loss": 0.71874309, + "learning_rate": 6.47225558966582e-08, + "loss": 0.74283361, + "num_input_tokens_seen": 330485770, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.18469238, + "step": 15322, + "time_per_iteration": 2.895540952682495 + }, + { + "auxiliary_loss_clip": 0.01390329, + "auxiliary_loss_mlp": 0.01034702, + "balance_loss_clip": 1.23228836, + "balance_loss_mlp": 1.01605761, + "epoch": 0.9212686006312941, + "flos": 16298115137280.0, + "grad_norm": 3.0229059573195167, + "language_loss": 0.70716405, + "learning_rate": 6.462431596227725e-08, + "loss": 0.73141432, + "num_input_tokens_seen": 330504255, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18640137, + "step": 15323, + "time_per_iteration": 2.805449962615967 + }, + { + "auxiliary_loss_clip": 0.01404946, + "auxiliary_loss_mlp": 0.01037036, + "balance_loss_clip": 1.24183798, + "balance_loss_mlp": 1.01647198, + "epoch": 0.9213287238839621, + "flos": 19793630856960.0, + "grad_norm": 1.8895014902475942, + "language_loss": 0.75549746, + "learning_rate": 6.452614941753597e-08, + "loss": 0.77991736, + "num_input_tokens_seen": 330520705, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.20544434, + "step": 15324, + "time_per_iteration": 2.821956157684326 + }, + { + "auxiliary_loss_clip": 0.01389038, + "auxiliary_loss_mlp": 0.0103661, + "balance_loss_clip": 1.23073983, + "balance_loss_mlp": 1.01694012, + "epoch": 0.92138884713663, + "flos": 21039649418880.0, + "grad_norm": 1.7855358775431383, + "language_loss": 0.71364719, + "learning_rate": 6.442805626615744e-08, + "loss": 0.73790371, + "num_input_tokens_seen": 330539245, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.19677734, + "step": 15325, + "time_per_iteration": 2.852393627166748 + }, + { + "auxiliary_loss_clip": 0.01394928, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.23585725, + "balance_loss_mlp": 1.01371229, + "epoch": 0.9214489703892981, + "flos": 28599590701440.0, + "grad_norm": 1.7234948251457534, + "language_loss": 0.78594351, + "learning_rate": 6.433003651186109e-08, + "loss": 0.81022549, + "num_input_tokens_seen": 330561815, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.19543457, + "step": 15326, + "time_per_iteration": 2.8858280181884766 + }, + { + "auxiliary_loss_clip": 0.01399774, + "auxiliary_loss_mlp": 0.01034657, + "balance_loss_clip": 1.23838758, + "balance_loss_mlp": 1.01545215, + "epoch": 0.921509093641966, + "flos": 16369837424640.0, + "grad_norm": 2.551779633885365, + "language_loss": 0.72104686, + "learning_rate": 6.42320901583635e-08, + "loss": 0.74539119, + "num_input_tokens_seen": 330579760, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.1920166, + "step": 15327, + "time_per_iteration": 2.782902240753174 + }, + { + "auxiliary_loss_clip": 0.01414103, + "auxiliary_loss_mlp": 0.01038617, + "balance_loss_clip": 1.25116479, + "balance_loss_mlp": 1.01945949, + "epoch": 0.921569216894634, + "flos": 26841381292800.0, + "grad_norm": 2.1425972750356688, + "language_loss": 0.78149194, + "learning_rate": 6.413421720937906e-08, + "loss": 0.80601919, + "num_input_tokens_seen": 330598545, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19152832, + "step": 15328, + "time_per_iteration": 2.849195957183838 + }, + { + "auxiliary_loss_clip": 0.01385225, + "auxiliary_loss_mlp": 0.01030379, + "balance_loss_clip": 1.22811997, + "balance_loss_mlp": 1.01204443, + "epoch": 0.921629340147302, + "flos": 24656005296000.0, + "grad_norm": 2.927058594993416, + "language_loss": 0.72058135, + "learning_rate": 6.4036417668619e-08, + "loss": 0.74473733, + "num_input_tokens_seen": 330616700, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18322754, + "step": 15329, + "time_per_iteration": 2.8509557247161865 + }, + { + "auxiliary_loss_clip": 0.01389993, + "auxiliary_loss_mlp": 0.01030335, + "balance_loss_clip": 1.23239136, + "balance_loss_mlp": 1.01232266, + "epoch": 0.9216894633999699, + "flos": 15095061152640.0, + "grad_norm": 3.0027311975654514, + "language_loss": 0.87399179, + "learning_rate": 6.393869153979192e-08, + "loss": 0.89819503, + "num_input_tokens_seen": 330633355, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18005371, + "step": 15330, + "time_per_iteration": 2.8531572818756104 + }, + { + "auxiliary_loss_clip": 0.01399177, + "auxiliary_loss_mlp": 0.01030673, + "balance_loss_clip": 1.23838329, + "balance_loss_mlp": 1.01155186, + "epoch": 0.921749586652638, + "flos": 19212884858880.0, + "grad_norm": 1.9176839548145632, + "language_loss": 0.7672748, + "learning_rate": 6.384103882660397e-08, + "loss": 0.79157329, + "num_input_tokens_seen": 330651470, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19116211, + "step": 15331, + "time_per_iteration": 2.857767343521118 + }, + { + "auxiliary_loss_clip": 0.01393544, + "auxiliary_loss_mlp": 0.01032726, + "balance_loss_clip": 1.23400116, + "balance_loss_mlp": 1.01386738, + "epoch": 0.9218097099053059, + "flos": 20532299765760.0, + "grad_norm": 1.8946470075362702, + "language_loss": 0.76976365, + "learning_rate": 6.374345953275794e-08, + "loss": 0.79402637, + "num_input_tokens_seen": 330669170, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18859863, + "step": 15332, + "time_per_iteration": 2.8147265911102295 + }, + { + "auxiliary_loss_clip": 0.01387851, + "auxiliary_loss_mlp": 0.01032223, + "balance_loss_clip": 1.2297287, + "balance_loss_mlp": 1.01410294, + "epoch": 0.9218698331579739, + "flos": 17357769792000.0, + "grad_norm": 2.1905504084626277, + "language_loss": 0.74964464, + "learning_rate": 6.364595366195358e-08, + "loss": 0.77384537, + "num_input_tokens_seen": 330686635, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18139648, + "step": 15333, + "time_per_iteration": 4.2445502281188965 + }, + { + "auxiliary_loss_clip": 0.01178279, + "auxiliary_loss_mlp": 0.01031154, + "balance_loss_clip": 1.09072447, + "balance_loss_mlp": 1.00654948, + "epoch": 0.9219299564106418, + "flos": 61985309074560.0, + "grad_norm": 0.8047373640313156, + "language_loss": 0.52925825, + "learning_rate": 6.354852121788879e-08, + "loss": 0.55135256, + "num_input_tokens_seen": 330749160, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.24609375, + "step": 15334, + "time_per_iteration": 3.318010091781616 + }, + { + "auxiliary_loss_clip": 0.01390178, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.23383939, + "balance_loss_mlp": 1.01170254, + "epoch": 0.9219900796633098, + "flos": 15709179830400.0, + "grad_norm": 2.190086271923082, + "language_loss": 0.63060045, + "learning_rate": 6.345116220425839e-08, + "loss": 0.65481359, + "num_input_tokens_seen": 330766840, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.19433594, + "step": 15335, + "time_per_iteration": 2.829707145690918 + }, + { + "auxiliary_loss_clip": 0.0140276, + "auxiliary_loss_mlp": 0.01033866, + "balance_loss_clip": 1.2431531, + "balance_loss_mlp": 1.01494801, + "epoch": 0.9220502029159777, + "flos": 24942487242240.0, + "grad_norm": 1.6502912404919745, + "language_loss": 0.72544599, + "learning_rate": 6.335387662475366e-08, + "loss": 0.74981225, + "num_input_tokens_seen": 330785585, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18920898, + "step": 15336, + "time_per_iteration": 4.393793106079102 + }, + { + "auxiliary_loss_clip": 0.01391626, + "auxiliary_loss_mlp": 0.01030974, + "balance_loss_clip": 1.23512197, + "balance_loss_mlp": 1.01397443, + "epoch": 0.9221103261686457, + "flos": 15675128478720.0, + "grad_norm": 1.803001884291993, + "language_loss": 0.72593945, + "learning_rate": 6.325666448306433e-08, + "loss": 0.75016546, + "num_input_tokens_seen": 330800750, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.16992188, + "step": 15337, + "time_per_iteration": 2.7989416122436523 + }, + { + "auxiliary_loss_clip": 0.01181731, + "auxiliary_loss_mlp": 0.0104344, + "balance_loss_clip": 1.0913707, + "balance_loss_mlp": 1.01692772, + "epoch": 0.9221704494213137, + "flos": 67547867080320.0, + "grad_norm": 0.8836295383695084, + "language_loss": 0.65383911, + "learning_rate": 6.31595257828763e-08, + "loss": 0.67609084, + "num_input_tokens_seen": 330863640, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.265625, + "step": 15338, + "time_per_iteration": 3.235442876815796 + }, + { + "auxiliary_loss_clip": 0.01404147, + "auxiliary_loss_mlp": 0.01034896, + "balance_loss_clip": 1.24412668, + "balance_loss_mlp": 1.01554835, + "epoch": 0.9222305726739817, + "flos": 30239131703040.0, + "grad_norm": 1.6808689760019158, + "language_loss": 0.67595446, + "learning_rate": 6.306246052787289e-08, + "loss": 0.70034492, + "num_input_tokens_seen": 330884675, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19348145, + "step": 15339, + "time_per_iteration": 2.941366195678711 + }, + { + "auxiliary_loss_clip": 0.01392424, + "auxiliary_loss_mlp": 0.01031235, + "balance_loss_clip": 1.233042, + "balance_loss_mlp": 1.01210237, + "epoch": 0.9222906959266496, + "flos": 25347954309120.0, + "grad_norm": 2.067827232934132, + "language_loss": 0.7289077, + "learning_rate": 6.296546872173513e-08, + "loss": 0.75314426, + "num_input_tokens_seen": 330904125, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19140625, + "step": 15340, + "time_per_iteration": 2.862283945083618 + }, + { + "auxiliary_loss_clip": 0.01397845, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.2403276, + "balance_loss_mlp": 1.01573932, + "epoch": 0.9223508191793176, + "flos": 27611341620480.0, + "grad_norm": 1.5390161180509006, + "language_loss": 0.70785582, + "learning_rate": 6.286855036814098e-08, + "loss": 0.7321763, + "num_input_tokens_seen": 330925140, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.18469238, + "step": 15341, + "time_per_iteration": 2.9116036891937256 + }, + { + "auxiliary_loss_clip": 0.01381889, + "auxiliary_loss_mlp": 0.01034195, + "balance_loss_clip": 1.2283082, + "balance_loss_mlp": 1.0161469, + "epoch": 0.9224109424319856, + "flos": 27318887360640.0, + "grad_norm": 1.7400149760509922, + "language_loss": 0.67988372, + "learning_rate": 6.277170547076571e-08, + "loss": 0.70404458, + "num_input_tokens_seen": 330946625, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.18041992, + "step": 15342, + "time_per_iteration": 4.282375335693359 + }, + { + "auxiliary_loss_clip": 0.01401001, + "auxiliary_loss_mlp": 0.010321, + "balance_loss_clip": 1.24077296, + "balance_loss_mlp": 1.01421857, + "epoch": 0.9224710656846535, + "flos": 48223580244480.0, + "grad_norm": 1.9717361764336299, + "language_loss": 0.70310253, + "learning_rate": 6.26749340332815e-08, + "loss": 0.7274335, + "num_input_tokens_seen": 330967795, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.17895508, + "step": 15343, + "time_per_iteration": 4.432230472564697 + }, + { + "auxiliary_loss_clip": 0.01179557, + "auxiliary_loss_mlp": 0.01026447, + "balance_loss_clip": 1.09212303, + "balance_loss_mlp": 1.00126958, + "epoch": 0.9225311889373216, + "flos": 66755754783360.0, + "grad_norm": 0.9877778365909519, + "language_loss": 0.52096355, + "learning_rate": 6.257823605935786e-08, + "loss": 0.54302359, + "num_input_tokens_seen": 331040850, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.25195312, + "step": 15344, + "time_per_iteration": 3.537489414215088 + }, + { + "auxiliary_loss_clip": 0.01387051, + "auxiliary_loss_mlp": 0.01032527, + "balance_loss_clip": 1.23305869, + "balance_loss_mlp": 1.01393008, + "epoch": 0.9225913121899895, + "flos": 22280962521600.0, + "grad_norm": 1.5879415604495823, + "language_loss": 0.70942545, + "learning_rate": 6.248161155266162e-08, + "loss": 0.73362118, + "num_input_tokens_seen": 331060595, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.18579102, + "step": 15345, + "time_per_iteration": 2.8364100456237793 + }, + { + "auxiliary_loss_clip": 0.01402134, + "auxiliary_loss_mlp": 0.01039971, + "balance_loss_clip": 1.24299097, + "balance_loss_mlp": 1.02088547, + "epoch": 0.9226514354426575, + "flos": 20091966940800.0, + "grad_norm": 1.700504777171097, + "language_loss": 0.7808609, + "learning_rate": 6.238506051685677e-08, + "loss": 0.80528188, + "num_input_tokens_seen": 331080195, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.1907959, + "step": 15346, + "time_per_iteration": 2.857239246368408 + }, + { + "auxiliary_loss_clip": 0.01417677, + "auxiliary_loss_mlp": 0.01035157, + "balance_loss_clip": 1.25290322, + "balance_loss_mlp": 1.01565409, + "epoch": 0.9227115586953254, + "flos": 16079464425600.0, + "grad_norm": 1.760423133367011, + "language_loss": 0.77786767, + "learning_rate": 6.228858295560457e-08, + "loss": 0.80239606, + "num_input_tokens_seen": 331097645, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19519043, + "step": 15347, + "time_per_iteration": 2.806594133377075 + }, + { + "auxiliary_loss_clip": 0.01380892, + "auxiliary_loss_mlp": 0.01028505, + "balance_loss_clip": 1.22654819, + "balance_loss_mlp": 1.01039696, + "epoch": 0.9227716819479934, + "flos": 20454967123200.0, + "grad_norm": 1.4695805987386088, + "language_loss": 0.76987785, + "learning_rate": 6.219217887256367e-08, + "loss": 0.79397184, + "num_input_tokens_seen": 331116830, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.18127441, + "step": 15348, + "time_per_iteration": 2.9550981521606445 + }, + { + "auxiliary_loss_clip": 0.01397979, + "auxiliary_loss_mlp": 0.0103465, + "balance_loss_clip": 1.23580396, + "balance_loss_mlp": 1.0149802, + "epoch": 0.9228318052006613, + "flos": 25017693379200.0, + "grad_norm": 2.3074366504644064, + "language_loss": 0.68477339, + "learning_rate": 6.209584827138959e-08, + "loss": 0.70909965, + "num_input_tokens_seen": 331137235, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19677734, + "step": 15349, + "time_per_iteration": 2.8561244010925293 + }, + { + "auxiliary_loss_clip": 0.01403818, + "auxiliary_loss_mlp": 0.01034719, + "balance_loss_clip": 1.24198067, + "balance_loss_mlp": 1.01519227, + "epoch": 0.9228919284533293, + "flos": 12684881151360.0, + "grad_norm": 2.8995115457269853, + "language_loss": 0.87869346, + "learning_rate": 6.199959115573495e-08, + "loss": 0.90307885, + "num_input_tokens_seen": 331153155, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.1953125, + "step": 15350, + "time_per_iteration": 2.836699962615967 + }, + { + "auxiliary_loss_clip": 0.0117973, + "auxiliary_loss_mlp": 0.01046402, + "balance_loss_clip": 1.09257865, + "balance_loss_mlp": 1.01912713, + "epoch": 0.9229520517059973, + "flos": 70015472012160.0, + "grad_norm": 0.7710484731428903, + "language_loss": 0.60393149, + "learning_rate": 6.190340752924994e-08, + "loss": 0.62619287, + "num_input_tokens_seen": 331214895, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.2734375, + "step": 15351, + "time_per_iteration": 3.322584629058838 + }, + { + "auxiliary_loss_clip": 0.01410127, + "auxiliary_loss_mlp": 0.01030014, + "balance_loss_clip": 1.24642801, + "balance_loss_mlp": 1.01195347, + "epoch": 0.9230121749586653, + "flos": 14802109200000.0, + "grad_norm": 1.9129608629482966, + "language_loss": 0.78831851, + "learning_rate": 6.180729739558233e-08, + "loss": 0.81271994, + "num_input_tokens_seen": 331232185, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.18054199, + "step": 15352, + "time_per_iteration": 2.8323557376861572 + }, + { + "auxiliary_loss_clip": 0.01416466, + "auxiliary_loss_mlp": 0.01037185, + "balance_loss_clip": 1.25172102, + "balance_loss_mlp": 1.01700258, + "epoch": 0.9230722982113332, + "flos": 22977752728320.0, + "grad_norm": 2.137089888184517, + "language_loss": 0.60047692, + "learning_rate": 6.171126075837585e-08, + "loss": 0.62501341, + "num_input_tokens_seen": 331251065, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.20166016, + "step": 15353, + "time_per_iteration": 2.9186341762542725 + }, + { + "auxiliary_loss_clip": 0.01393868, + "auxiliary_loss_mlp": 0.01030728, + "balance_loss_clip": 1.23641336, + "balance_loss_mlp": 1.01250064, + "epoch": 0.9231324214640012, + "flos": 18560597552640.0, + "grad_norm": 12.782841877349565, + "language_loss": 0.74940014, + "learning_rate": 6.161529762127293e-08, + "loss": 0.77364612, + "num_input_tokens_seen": 331269110, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18237305, + "step": 15354, + "time_per_iteration": 2.829019546508789 + }, + { + "auxiliary_loss_clip": 0.01405862, + "auxiliary_loss_mlp": 0.01032666, + "balance_loss_clip": 1.24056721, + "balance_loss_mlp": 1.01219749, + "epoch": 0.9231925447166691, + "flos": 22090752806400.0, + "grad_norm": 2.2222156322561766, + "language_loss": 0.6616869, + "learning_rate": 6.1519407987912e-08, + "loss": 0.68607217, + "num_input_tokens_seen": 331286555, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.20446777, + "step": 15355, + "time_per_iteration": 2.803534507751465 + }, + { + "auxiliary_loss_clip": 0.01388796, + "auxiliary_loss_mlp": 0.01031781, + "balance_loss_clip": 1.23241222, + "balance_loss_mlp": 1.01368499, + "epoch": 0.9232526679693371, + "flos": 26552184658560.0, + "grad_norm": 1.6780826912140967, + "language_loss": 0.7495774, + "learning_rate": 6.142359186192947e-08, + "loss": 0.77378321, + "num_input_tokens_seen": 331307660, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.18115234, + "step": 15356, + "time_per_iteration": 2.8956570625305176 + }, + { + "auxiliary_loss_clip": 0.01396218, + "auxiliary_loss_mlp": 0.01030966, + "balance_loss_clip": 1.23669982, + "balance_loss_mlp": 1.01118898, + "epoch": 0.9233127912220052, + "flos": 14764754977920.0, + "grad_norm": 1.8475532748650503, + "language_loss": 0.61681724, + "learning_rate": 6.132784924695844e-08, + "loss": 0.64108908, + "num_input_tokens_seen": 331324885, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.19763184, + "step": 15357, + "time_per_iteration": 2.804544687271118 + }, + { + "auxiliary_loss_clip": 0.01404158, + "auxiliary_loss_mlp": 0.01033373, + "balance_loss_clip": 1.24044204, + "balance_loss_mlp": 1.01341736, + "epoch": 0.9233729144746731, + "flos": 25271888520960.0, + "grad_norm": 1.3859654387999516, + "language_loss": 0.7018708, + "learning_rate": 6.123218014662956e-08, + "loss": 0.72624612, + "num_input_tokens_seen": 331345885, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19934082, + "step": 15358, + "time_per_iteration": 2.8624107837677 + }, + { + "auxiliary_loss_clip": 0.01389561, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_clip": 1.23140526, + "balance_loss_mlp": 1.01405907, + "epoch": 0.9234330377273411, + "flos": 27859971651840.0, + "grad_norm": 4.671304278896834, + "language_loss": 0.74422187, + "learning_rate": 6.113658456457104e-08, + "loss": 0.76844835, + "num_input_tokens_seen": 331364320, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.19030762, + "step": 15359, + "time_per_iteration": 2.8641719818115234 + }, + { + "auxiliary_loss_clip": 0.01403131, + "auxiliary_loss_mlp": 0.01032576, + "balance_loss_clip": 1.24300313, + "balance_loss_mlp": 1.01335955, + "epoch": 0.923493160980009, + "flos": 24619148766720.0, + "grad_norm": 2.4298393484471905, + "language_loss": 0.6570648, + "learning_rate": 6.104106250440732e-08, + "loss": 0.68142188, + "num_input_tokens_seen": 331384135, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19213867, + "step": 15360, + "time_per_iteration": 2.8536829948425293 + }, + { + "auxiliary_loss_clip": 0.01179808, + "auxiliary_loss_mlp": 0.01027177, + "balance_loss_clip": 1.09236121, + "balance_loss_mlp": 1.00505161, + "epoch": 0.923553284232677, + "flos": 67733099867520.0, + "grad_norm": 0.7680831528932573, + "language_loss": 0.5523603, + "learning_rate": 6.094561396976083e-08, + "loss": 0.57443011, + "num_input_tokens_seen": 331440645, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.22167969, + "step": 15361, + "time_per_iteration": 3.255507707595825 + }, + { + "auxiliary_loss_clip": 0.01412602, + "auxiliary_loss_mlp": 0.01033071, + "balance_loss_clip": 1.24795961, + "balance_loss_mlp": 1.01467705, + "epoch": 0.9236134074853449, + "flos": 18816557241600.0, + "grad_norm": 2.0299892907349535, + "language_loss": 0.70792627, + "learning_rate": 6.085023896425112e-08, + "loss": 0.73238301, + "num_input_tokens_seen": 331459580, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.18395996, + "step": 15362, + "time_per_iteration": 2.9089877605438232 + }, + { + "auxiliary_loss_clip": 0.01413214, + "auxiliary_loss_mlp": 0.01034522, + "balance_loss_clip": 1.24894297, + "balance_loss_mlp": 1.01408947, + "epoch": 0.923673530738013, + "flos": 27793904964480.0, + "grad_norm": 1.5710097991348202, + "language_loss": 0.76482123, + "learning_rate": 6.075493749149463e-08, + "loss": 0.78929859, + "num_input_tokens_seen": 331481560, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.2043457, + "step": 15363, + "time_per_iteration": 2.9085824489593506 + }, + { + "auxiliary_loss_clip": 0.01404188, + "auxiliary_loss_mlp": 0.01031539, + "balance_loss_clip": 1.24460483, + "balance_loss_mlp": 1.01225066, + "epoch": 0.9237336539906809, + "flos": 26808370571520.0, + "grad_norm": 2.030296818179137, + "language_loss": 0.84107423, + "learning_rate": 6.065970955510514e-08, + "loss": 0.86543143, + "num_input_tokens_seen": 331499090, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19287109, + "step": 15364, + "time_per_iteration": 2.8963844776153564 + }, + { + "auxiliary_loss_clip": 0.01398808, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.24005318, + "balance_loss_mlp": 1.01430535, + "epoch": 0.9237937772433489, + "flos": 23598477146880.0, + "grad_norm": 1.7178121904351211, + "language_loss": 0.68330765, + "learning_rate": 6.056455515869419e-08, + "loss": 0.70761698, + "num_input_tokens_seen": 331519420, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.17822266, + "step": 15365, + "time_per_iteration": 2.896214485168457 + }, + { + "auxiliary_loss_clip": 0.01398532, + "auxiliary_loss_mlp": 0.01034791, + "balance_loss_clip": 1.23932719, + "balance_loss_mlp": 1.01569414, + "epoch": 0.9238539004960168, + "flos": 26151694519680.0, + "grad_norm": 2.161479666263676, + "language_loss": 0.63483047, + "learning_rate": 6.046947430586913e-08, + "loss": 0.65916371, + "num_input_tokens_seen": 331538720, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.19091797, + "step": 15366, + "time_per_iteration": 2.8952066898345947 + }, + { + "auxiliary_loss_clip": 0.01389805, + "auxiliary_loss_mlp": 0.01031204, + "balance_loss_clip": 1.23292136, + "balance_loss_mlp": 1.0118444, + "epoch": 0.9239140237486848, + "flos": 21077546578560.0, + "grad_norm": 1.389385536408186, + "language_loss": 0.75401133, + "learning_rate": 6.037446700023619e-08, + "loss": 0.77822137, + "num_input_tokens_seen": 331558505, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.19360352, + "step": 15367, + "time_per_iteration": 2.8543620109558105 + }, + { + "auxiliary_loss_clip": 0.0137744, + "auxiliary_loss_mlp": 0.01033182, + "balance_loss_clip": 1.22554314, + "balance_loss_mlp": 1.01435876, + "epoch": 0.9239741470013527, + "flos": 24618605829120.0, + "grad_norm": 1.8308677880189417, + "language_loss": 0.6531288, + "learning_rate": 6.027953324539759e-08, + "loss": 0.67723501, + "num_input_tokens_seen": 331578440, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.18823242, + "step": 15368, + "time_per_iteration": 4.290413856506348 + }, + { + "auxiliary_loss_clip": 0.01414714, + "auxiliary_loss_mlp": 0.01034479, + "balance_loss_clip": 1.2517395, + "balance_loss_mlp": 1.01533377, + "epoch": 0.9240342702540207, + "flos": 24729175416960.0, + "grad_norm": 1.712310540463667, + "language_loss": 0.75642776, + "learning_rate": 6.018467304495401e-08, + "loss": 0.78091967, + "num_input_tokens_seen": 331598945, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19152832, + "step": 15369, + "time_per_iteration": 2.853825569152832 + }, + { + "auxiliary_loss_clip": 0.01417391, + "auxiliary_loss_mlp": 0.01040751, + "balance_loss_clip": 1.25198853, + "balance_loss_mlp": 1.02089119, + "epoch": 0.9240943935066888, + "flos": 20859710273280.0, + "grad_norm": 1.8400534618230138, + "language_loss": 0.77378118, + "learning_rate": 6.008988640250145e-08, + "loss": 0.79836261, + "num_input_tokens_seen": 331616700, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.1986084, + "step": 15370, + "time_per_iteration": 2.8508455753326416 + }, + { + "auxiliary_loss_clip": 0.01399374, + "auxiliary_loss_mlp": 0.01034404, + "balance_loss_clip": 1.24011981, + "balance_loss_mlp": 1.01617682, + "epoch": 0.9241545167593567, + "flos": 24473034748800.0, + "grad_norm": 1.9666054559392503, + "language_loss": 0.67093456, + "learning_rate": 5.999517332163528e-08, + "loss": 0.69527233, + "num_input_tokens_seen": 331635625, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18212891, + "step": 15371, + "time_per_iteration": 4.3189496994018555 + }, + { + "auxiliary_loss_clip": 0.01177843, + "auxiliary_loss_mlp": 0.01033193, + "balance_loss_clip": 1.09150779, + "balance_loss_mlp": 1.00839698, + "epoch": 0.9242146400120247, + "flos": 61858062138240.0, + "grad_norm": 0.7392553713229377, + "language_loss": 0.57740307, + "learning_rate": 5.99005338059464e-08, + "loss": 0.59951341, + "num_input_tokens_seen": 331698595, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.24804688, + "step": 15372, + "time_per_iteration": 3.298266649246216 + }, + { + "auxiliary_loss_clip": 0.01384841, + "auxiliary_loss_mlp": 0.01030237, + "balance_loss_clip": 1.23016834, + "balance_loss_mlp": 1.01222444, + "epoch": 0.9242747632646926, + "flos": 22056972923520.0, + "grad_norm": 1.722602784454251, + "language_loss": 0.70518386, + "learning_rate": 5.98059678590237e-08, + "loss": 0.72933459, + "num_input_tokens_seen": 331717975, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.18017578, + "step": 15373, + "time_per_iteration": 2.8402249813079834 + }, + { + "auxiliary_loss_clip": 0.01399603, + "auxiliary_loss_mlp": 0.01034558, + "balance_loss_clip": 1.23994279, + "balance_loss_mlp": 1.01598489, + "epoch": 0.9243348865173606, + "flos": 18487427431680.0, + "grad_norm": 3.302380146821489, + "language_loss": 0.76091635, + "learning_rate": 5.971147548445299e-08, + "loss": 0.78525805, + "num_input_tokens_seen": 331737220, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18579102, + "step": 15374, + "time_per_iteration": 2.833189010620117 + }, + { + "auxiliary_loss_clip": 0.0139622, + "auxiliary_loss_mlp": 0.01032484, + "balance_loss_clip": 1.2372402, + "balance_loss_mlp": 1.01367283, + "epoch": 0.9243950097700285, + "flos": 23269302092160.0, + "grad_norm": 1.6104188438905056, + "language_loss": 0.65554804, + "learning_rate": 5.961705668581784e-08, + "loss": 0.67983508, + "num_input_tokens_seen": 331757300, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18811035, + "step": 15375, + "time_per_iteration": 2.865429639816284 + }, + { + "auxiliary_loss_clip": 0.01395713, + "auxiliary_loss_mlp": 0.0103337, + "balance_loss_clip": 1.23878407, + "balance_loss_mlp": 1.01472604, + "epoch": 0.9244551330226966, + "flos": 29760223046400.0, + "grad_norm": 2.1023233929341485, + "language_loss": 0.67654598, + "learning_rate": 5.952271146669829e-08, + "loss": 0.70083678, + "num_input_tokens_seen": 331776995, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.18652344, + "step": 15376, + "time_per_iteration": 2.9252164363861084 + }, + { + "auxiliary_loss_clip": 0.01180713, + "auxiliary_loss_mlp": 0.01037186, + "balance_loss_clip": 1.09322047, + "balance_loss_mlp": 1.01363015, + "epoch": 0.9245152562753645, + "flos": 68896537390080.0, + "grad_norm": 0.6510989781460519, + "language_loss": 0.61173582, + "learning_rate": 5.94284398306717e-08, + "loss": 0.63391483, + "num_input_tokens_seen": 331845015, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.23535156, + "step": 15377, + "time_per_iteration": 3.395092248916626 + }, + { + "auxiliary_loss_clip": 0.01396702, + "auxiliary_loss_mlp": 0.01031982, + "balance_loss_clip": 1.23782086, + "balance_loss_mlp": 1.01353991, + "epoch": 0.9245753795280325, + "flos": 21589058753280.0, + "grad_norm": 1.6047680216222764, + "language_loss": 0.74676275, + "learning_rate": 5.933424178131341e-08, + "loss": 0.77104962, + "num_input_tokens_seen": 331862795, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18457031, + "step": 15378, + "time_per_iteration": 5.628398180007935 + }, + { + "auxiliary_loss_clip": 0.01401724, + "auxiliary_loss_mlp": 0.01032638, + "balance_loss_clip": 1.24133039, + "balance_loss_mlp": 1.01364839, + "epoch": 0.9246355027807004, + "flos": 34509448944000.0, + "grad_norm": 4.366674835744232, + "language_loss": 0.63546932, + "learning_rate": 5.924011732219503e-08, + "loss": 0.65981293, + "num_input_tokens_seen": 331882535, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18994141, + "step": 15379, + "time_per_iteration": 2.9438424110412598 + }, + { + "auxiliary_loss_clip": 0.01390628, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.23439431, + "balance_loss_mlp": 1.01625443, + "epoch": 0.9246956260333684, + "flos": 15960479304960.0, + "grad_norm": 1.9345295429167837, + "language_loss": 0.85116267, + "learning_rate": 5.914606645688591e-08, + "loss": 0.87542272, + "num_input_tokens_seen": 331899335, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.19128418, + "step": 15380, + "time_per_iteration": 2.8681411743164062 + }, + { + "auxiliary_loss_clip": 0.01401206, + "auxiliary_loss_mlp": 0.01032493, + "balance_loss_clip": 1.2385242, + "balance_loss_mlp": 1.01310956, + "epoch": 0.9247557492860363, + "flos": 23378966784000.0, + "grad_norm": 1.906807415582918, + "language_loss": 0.74125904, + "learning_rate": 5.905208918895233e-08, + "loss": 0.76559603, + "num_input_tokens_seen": 331919030, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19384766, + "step": 15381, + "time_per_iteration": 2.972817897796631 + }, + { + "auxiliary_loss_clip": 0.01399532, + "auxiliary_loss_mlp": 0.01034922, + "balance_loss_clip": 1.24068189, + "balance_loss_mlp": 1.01630187, + "epoch": 0.9248158725387043, + "flos": 23050334666880.0, + "grad_norm": 1.979077482593437, + "language_loss": 0.79152942, + "learning_rate": 5.8958185521958524e-08, + "loss": 0.81587398, + "num_input_tokens_seen": 331936465, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.1862793, + "step": 15382, + "time_per_iteration": 2.8791143894195557 + }, + { + "auxiliary_loss_clip": 0.01401857, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.24026549, + "balance_loss_mlp": 1.01340175, + "epoch": 0.9248759957913724, + "flos": 22531040386560.0, + "grad_norm": 1.524587271053467, + "language_loss": 0.75127017, + "learning_rate": 5.886435545946455e-08, + "loss": 0.77561873, + "num_input_tokens_seen": 331954625, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19592285, + "step": 15383, + "time_per_iteration": 2.8699631690979004 + }, + { + "auxiliary_loss_clip": 0.01394371, + "auxiliary_loss_mlp": 0.01033001, + "balance_loss_clip": 1.23630071, + "balance_loss_mlp": 1.01470256, + "epoch": 0.9249361190440403, + "flos": 25458207183360.0, + "grad_norm": 1.7354771387759966, + "language_loss": 0.76046753, + "learning_rate": 5.8770599005028456e-08, + "loss": 0.78474122, + "num_input_tokens_seen": 331975865, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.18310547, + "step": 15384, + "time_per_iteration": 2.9247310161590576 + }, + { + "auxiliary_loss_clip": 0.01384829, + "auxiliary_loss_mlp": 0.01034867, + "balance_loss_clip": 1.22895908, + "balance_loss_mlp": 1.01565051, + "epoch": 0.9249962422967083, + "flos": 12384418561920.0, + "grad_norm": 3.7755490660835034, + "language_loss": 0.67348057, + "learning_rate": 5.8676916162206045e-08, + "loss": 0.69767749, + "num_input_tokens_seen": 331992760, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.19226074, + "step": 15385, + "time_per_iteration": 2.8124303817749023 + }, + { + "auxiliary_loss_clip": 0.01392957, + "auxiliary_loss_mlp": 0.01030623, + "balance_loss_clip": 1.23551738, + "balance_loss_mlp": 1.01202619, + "epoch": 0.9250563655493762, + "flos": 22939629344640.0, + "grad_norm": 1.8316392536887356, + "language_loss": 0.81366527, + "learning_rate": 5.85833069345496e-08, + "loss": 0.83790112, + "num_input_tokens_seen": 332011890, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18579102, + "step": 15386, + "time_per_iteration": 2.8352081775665283 + }, + { + "auxiliary_loss_clip": 0.01394347, + "auxiliary_loss_mlp": 0.01032975, + "balance_loss_clip": 1.23812008, + "balance_loss_mlp": 1.01436663, + "epoch": 0.9251164888020442, + "flos": 18487834634880.0, + "grad_norm": 4.2238303899838705, + "language_loss": 0.76131105, + "learning_rate": 5.8489771325608504e-08, + "loss": 0.78558433, + "num_input_tokens_seen": 332029485, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.18603516, + "step": 15387, + "time_per_iteration": 2.9227821826934814 + }, + { + "auxiliary_loss_clip": 0.01387014, + "auxiliary_loss_mlp": 0.01035167, + "balance_loss_clip": 1.23070669, + "balance_loss_mlp": 1.01633191, + "epoch": 0.9251766120547121, + "flos": 33050163801600.0, + "grad_norm": 1.2563008386865662, + "language_loss": 0.70667022, + "learning_rate": 5.839630933893014e-08, + "loss": 0.73089206, + "num_input_tokens_seen": 332052970, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.18847656, + "step": 15388, + "time_per_iteration": 2.952864170074463 + }, + { + "auxiliary_loss_clip": 0.01403363, + "auxiliary_loss_mlp": 0.0103298, + "balance_loss_clip": 1.24157178, + "balance_loss_mlp": 1.01408494, + "epoch": 0.9252367353073802, + "flos": 24398235815040.0, + "grad_norm": 2.161792832276638, + "language_loss": 0.8250975, + "learning_rate": 5.8302920978058115e-08, + "loss": 0.84946096, + "num_input_tokens_seen": 332070395, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18884277, + "step": 15389, + "time_per_iteration": 2.868638753890991 + }, + { + "auxiliary_loss_clip": 0.01430691, + "auxiliary_loss_mlp": 0.01036786, + "balance_loss_clip": 1.26235938, + "balance_loss_mlp": 1.01655591, + "epoch": 0.9252968585600481, + "flos": 18926221933440.0, + "grad_norm": 1.7331879159378432, + "language_loss": 0.80058289, + "learning_rate": 5.820960624653381e-08, + "loss": 0.82525766, + "num_input_tokens_seen": 332090185, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.20227051, + "step": 15390, + "time_per_iteration": 2.8247954845428467 + }, + { + "auxiliary_loss_clip": 0.01406622, + "auxiliary_loss_mlp": 0.01032607, + "balance_loss_clip": 1.24420142, + "balance_loss_mlp": 1.0131644, + "epoch": 0.9253569818127161, + "flos": 21735218016000.0, + "grad_norm": 1.8592709098460587, + "language_loss": 0.76446152, + "learning_rate": 5.811636514789597e-08, + "loss": 0.78885382, + "num_input_tokens_seen": 332109050, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19445801, + "step": 15391, + "time_per_iteration": 2.8217084407806396 + }, + { + "auxiliary_loss_clip": 0.0139972, + "auxiliary_loss_mlp": 0.01032889, + "balance_loss_clip": 1.23878908, + "balance_loss_mlp": 1.01349366, + "epoch": 0.925417105065384, + "flos": 34253579744640.0, + "grad_norm": 2.2966719320860816, + "language_loss": 0.53147322, + "learning_rate": 5.80231976856802e-08, + "loss": 0.55579937, + "num_input_tokens_seen": 332131180, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19396973, + "step": 15392, + "time_per_iteration": 3.0000407695770264 + }, + { + "auxiliary_loss_clip": 0.01396226, + "auxiliary_loss_mlp": 0.01027897, + "balance_loss_clip": 1.23571956, + "balance_loss_mlp": 1.00946689, + "epoch": 0.925477228318052, + "flos": 25970986212480.0, + "grad_norm": 1.7952875798868602, + "language_loss": 0.77764893, + "learning_rate": 5.7930103863419454e-08, + "loss": 0.80189013, + "num_input_tokens_seen": 332149555, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18432617, + "step": 15393, + "time_per_iteration": 2.8743226528167725 + }, + { + "auxiliary_loss_clip": 0.01398006, + "auxiliary_loss_mlp": 0.01029404, + "balance_loss_clip": 1.23876154, + "balance_loss_mlp": 1.01060486, + "epoch": 0.9255373515707199, + "flos": 11845687000320.0, + "grad_norm": 1.852032016558179, + "language_loss": 0.69995618, + "learning_rate": 5.783708368464357e-08, + "loss": 0.72423029, + "num_input_tokens_seen": 332165830, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.18811035, + "step": 15394, + "time_per_iteration": 2.8122100830078125 + }, + { + "auxiliary_loss_clip": 0.01412287, + "auxiliary_loss_mlp": 0.01031812, + "balance_loss_clip": 1.25180006, + "balance_loss_mlp": 1.01279759, + "epoch": 0.925597474823388, + "flos": 21444347324160.0, + "grad_norm": 2.281987195007043, + "language_loss": 0.73343527, + "learning_rate": 5.7744137152879956e-08, + "loss": 0.75787622, + "num_input_tokens_seen": 332185130, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.19018555, + "step": 15395, + "time_per_iteration": 2.8177051544189453 + }, + { + "auxiliary_loss_clip": 0.01384436, + "auxiliary_loss_mlp": 0.01028246, + "balance_loss_clip": 1.22671437, + "balance_loss_mlp": 1.01031637, + "epoch": 0.925657598076056, + "flos": 22868042791680.0, + "grad_norm": 2.2387384160435375, + "language_loss": 0.73078996, + "learning_rate": 5.7651264271653785e-08, + "loss": 0.75491679, + "num_input_tokens_seen": 332203695, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.17932129, + "step": 15396, + "time_per_iteration": 2.8439695835113525 + }, + { + "auxiliary_loss_clip": 0.01395191, + "auxiliary_loss_mlp": 0.01030753, + "balance_loss_clip": 1.23715258, + "balance_loss_mlp": 1.01214457, + "epoch": 0.9257177213287239, + "flos": 25715388481920.0, + "grad_norm": 1.6606145415107847, + "language_loss": 0.87630427, + "learning_rate": 5.755846504448603e-08, + "loss": 0.90056372, + "num_input_tokens_seen": 332224850, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18615723, + "step": 15397, + "time_per_iteration": 2.875885009765625 + }, + { + "auxiliary_loss_clip": 0.01180878, + "auxiliary_loss_mlp": 0.01021929, + "balance_loss_clip": 1.09284472, + "balance_loss_mlp": 1.00199747, + "epoch": 0.9257778445813919, + "flos": 59620898828160.0, + "grad_norm": 0.8010465012553011, + "language_loss": 0.55136049, + "learning_rate": 5.746573947489586e-08, + "loss": 0.57338858, + "num_input_tokens_seen": 332278085, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.19921875, + "step": 15398, + "time_per_iteration": 3.2037580013275146 + }, + { + "auxiliary_loss_clip": 0.0141119, + "auxiliary_loss_mlp": 0.01033403, + "balance_loss_clip": 1.24687755, + "balance_loss_mlp": 1.01366234, + "epoch": 0.9258379678340598, + "flos": 27720961067520.0, + "grad_norm": 1.9486914966108844, + "language_loss": 0.77663314, + "learning_rate": 5.7373087566400025e-08, + "loss": 0.80107915, + "num_input_tokens_seen": 332297875, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.1973877, + "step": 15399, + "time_per_iteration": 2.9123294353485107 + }, + { + "auxiliary_loss_clip": 0.01369299, + "auxiliary_loss_mlp": 0.01031055, + "balance_loss_clip": 1.2152307, + "balance_loss_mlp": 1.01342392, + "epoch": 0.9258980910867278, + "flos": 24874339294080.0, + "grad_norm": 1.4684713352904821, + "language_loss": 0.79023874, + "learning_rate": 5.7280509322510826e-08, + "loss": 0.81424224, + "num_input_tokens_seen": 332318500, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.1763916, + "step": 15400, + "time_per_iteration": 2.859886646270752 + }, + { + "auxiliary_loss_clip": 0.01179594, + "auxiliary_loss_mlp": 0.01018499, + "balance_loss_clip": 1.09091794, + "balance_loss_mlp": 0.99789965, + "epoch": 0.9259582143393957, + "flos": 63164129829120.0, + "grad_norm": 0.7488689176818936, + "language_loss": 0.51346791, + "learning_rate": 5.718800474673946e-08, + "loss": 0.53544885, + "num_input_tokens_seen": 332381980, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 0.20605469, + "step": 15401, + "time_per_iteration": 3.243189811706543 + }, + { + "auxiliary_loss_clip": 0.01376231, + "auxiliary_loss_mlp": 0.01035269, + "balance_loss_clip": 1.22252917, + "balance_loss_mlp": 1.01701832, + "epoch": 0.9260183375920638, + "flos": 24135987098880.0, + "grad_norm": 1.981483072038302, + "language_loss": 0.82771575, + "learning_rate": 5.709557384259378e-08, + "loss": 0.85183072, + "num_input_tokens_seen": 332399510, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.18249512, + "step": 15402, + "time_per_iteration": 2.8693394660949707 + }, + { + "auxiliary_loss_clip": 0.01177714, + "auxiliary_loss_mlp": 0.01016763, + "balance_loss_clip": 1.09106874, + "balance_loss_mlp": 0.9975937, + "epoch": 0.9260784608447317, + "flos": 63076997047680.0, + "grad_norm": 6.6063425167505905, + "language_loss": 0.51145327, + "learning_rate": 5.700321661357876e-08, + "loss": 0.53339803, + "num_input_tokens_seen": 332459130, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.19140625, + "step": 15403, + "time_per_iteration": 4.727404356002808 + }, + { + "auxiliary_loss_clip": 0.0117753, + "auxiliary_loss_mlp": 0.01019956, + "balance_loss_clip": 1.09217381, + "balance_loss_mlp": 1.00002432, + "epoch": 0.9261385840973997, + "flos": 70619609571840.0, + "grad_norm": 0.6843885729580603, + "language_loss": 0.58756483, + "learning_rate": 5.69109330631965e-08, + "loss": 0.60953963, + "num_input_tokens_seen": 332526555, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.19921875, + "step": 15404, + "time_per_iteration": 3.2892258167266846 + }, + { + "auxiliary_loss_clip": 0.01400371, + "auxiliary_loss_mlp": 0.01031149, + "balance_loss_clip": 1.23958814, + "balance_loss_mlp": 1.0114553, + "epoch": 0.9261987073500676, + "flos": 20239528792320.0, + "grad_norm": 1.9524668823766087, + "language_loss": 0.727072, + "learning_rate": 5.681872319494596e-08, + "loss": 0.75138718, + "num_input_tokens_seen": 332544005, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19714355, + "step": 15405, + "time_per_iteration": 2.858870029449463 + }, + { + "auxiliary_loss_clip": 0.01412797, + "auxiliary_loss_mlp": 0.01035756, + "balance_loss_clip": 1.25079918, + "balance_loss_mlp": 1.01618171, + "epoch": 0.9262588306027356, + "flos": 20962543000320.0, + "grad_norm": 1.667722031048722, + "language_loss": 0.69416606, + "learning_rate": 5.672658701232458e-08, + "loss": 0.71865159, + "num_input_tokens_seen": 332563070, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19555664, + "step": 15406, + "time_per_iteration": 2.831692695617676 + }, + { + "auxiliary_loss_clip": 0.01407221, + "auxiliary_loss_mlp": 0.01035149, + "balance_loss_clip": 1.24546778, + "balance_loss_mlp": 1.01614749, + "epoch": 0.9263189538554035, + "flos": 22167497266560.0, + "grad_norm": 2.8088653825929493, + "language_loss": 0.77165115, + "learning_rate": 5.663452451882555e-08, + "loss": 0.79607487, + "num_input_tokens_seen": 332579620, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18994141, + "step": 15407, + "time_per_iteration": 4.302434682846069 + }, + { + "auxiliary_loss_clip": 0.0142139, + "auxiliary_loss_mlp": 0.01036475, + "balance_loss_clip": 1.25338101, + "balance_loss_mlp": 1.01619792, + "epoch": 0.9263790771080715, + "flos": 18196421005440.0, + "grad_norm": 1.8497815815524838, + "language_loss": 0.73178673, + "learning_rate": 5.6542535717940096e-08, + "loss": 0.75636542, + "num_input_tokens_seen": 332597795, + "router_z_loss_clip": 1.68164062, + "router_z_loss_mlp": 0.20275879, + "step": 15408, + "time_per_iteration": 2.8828623294830322 + }, + { + "auxiliary_loss_clip": 0.01383832, + "auxiliary_loss_mlp": 0.01031894, + "balance_loss_clip": 1.2276988, + "balance_loss_mlp": 1.01425076, + "epoch": 0.9264392003607396, + "flos": 48195546451200.0, + "grad_norm": 1.7536748269212328, + "language_loss": 0.69251943, + "learning_rate": 5.645062061315675e-08, + "loss": 0.71667671, + "num_input_tokens_seen": 332620375, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.17651367, + "step": 15409, + "time_per_iteration": 3.1143994331359863 + }, + { + "auxiliary_loss_clip": 0.01410377, + "auxiliary_loss_mlp": 0.01036507, + "balance_loss_clip": 1.24958932, + "balance_loss_mlp": 1.01724279, + "epoch": 0.9264993236134075, + "flos": 26398967207040.0, + "grad_norm": 2.1462337988597575, + "language_loss": 0.76771098, + "learning_rate": 5.6358779207960506e-08, + "loss": 0.79217982, + "num_input_tokens_seen": 332639510, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19262695, + "step": 15410, + "time_per_iteration": 2.905673027038574 + }, + { + "auxiliary_loss_clip": 0.01405048, + "auxiliary_loss_mlp": 0.01031807, + "balance_loss_clip": 1.24362683, + "balance_loss_mlp": 1.01316285, + "epoch": 0.9265594468660755, + "flos": 20929079831040.0, + "grad_norm": 1.6157170114984925, + "language_loss": 0.83035028, + "learning_rate": 5.6267011505833905e-08, + "loss": 0.85471886, + "num_input_tokens_seen": 332658350, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.1862793, + "step": 15411, + "time_per_iteration": 2.8602707386016846 + }, + { + "auxiliary_loss_clip": 0.01414327, + "auxiliary_loss_mlp": 0.01034257, + "balance_loss_clip": 1.25288904, + "balance_loss_mlp": 1.01618505, + "epoch": 0.9266195701187434, + "flos": 17532641520000.0, + "grad_norm": 1.6500069655131637, + "language_loss": 0.7599147, + "learning_rate": 5.617531751025728e-08, + "loss": 0.78440058, + "num_input_tokens_seen": 332676715, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18066406, + "step": 15412, + "time_per_iteration": 2.8743443489074707 + }, + { + "auxiliary_loss_clip": 0.01396235, + "auxiliary_loss_mlp": 0.01030384, + "balance_loss_clip": 1.23672462, + "balance_loss_mlp": 1.01225257, + "epoch": 0.9266796933714114, + "flos": 33700686560640.0, + "grad_norm": 1.651224767801993, + "language_loss": 0.67921132, + "learning_rate": 5.6083697224707406e-08, + "loss": 0.7034775, + "num_input_tokens_seen": 332701470, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18139648, + "step": 15413, + "time_per_iteration": 5.8645100593566895 + }, + { + "auxiliary_loss_clip": 0.01404187, + "auxiliary_loss_mlp": 0.01034063, + "balance_loss_clip": 1.24304605, + "balance_loss_mlp": 1.01435769, + "epoch": 0.9267398166240793, + "flos": 18925814730240.0, + "grad_norm": 1.7297126711430377, + "language_loss": 0.76199102, + "learning_rate": 5.5992150652658167e-08, + "loss": 0.78637356, + "num_input_tokens_seen": 332719060, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19702148, + "step": 15414, + "time_per_iteration": 2.8258206844329834 + }, + { + "auxiliary_loss_clip": 0.01390734, + "auxiliary_loss_mlp": 0.01029801, + "balance_loss_clip": 1.23341632, + "balance_loss_mlp": 1.01077545, + "epoch": 0.9267999398767474, + "flos": 20486982458880.0, + "grad_norm": 1.9630219058933642, + "language_loss": 0.82516348, + "learning_rate": 5.59006777975819e-08, + "loss": 0.84936881, + "num_input_tokens_seen": 332736345, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.19042969, + "step": 15415, + "time_per_iteration": 2.8239364624023438 + }, + { + "auxiliary_loss_clip": 0.01403255, + "auxiliary_loss_mlp": 0.01032982, + "balance_loss_clip": 1.24084115, + "balance_loss_mlp": 1.01362252, + "epoch": 0.9268600631294153, + "flos": 24799811829120.0, + "grad_norm": 1.6768369428768497, + "language_loss": 0.54799104, + "learning_rate": 5.580927866294671e-08, + "loss": 0.57235342, + "num_input_tokens_seen": 332756270, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.19372559, + "step": 15416, + "time_per_iteration": 2.875593662261963 + }, + { + "auxiliary_loss_clip": 0.01392709, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.23636413, + "balance_loss_mlp": 1.01205611, + "epoch": 0.9269201863820833, + "flos": 18706304367360.0, + "grad_norm": 1.5013577471267507, + "language_loss": 0.7324543, + "learning_rate": 5.571795325221807e-08, + "loss": 0.75668538, + "num_input_tokens_seen": 332775185, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.18359375, + "step": 15417, + "time_per_iteration": 2.8136303424835205 + }, + { + "auxiliary_loss_clip": 0.01396632, + "auxiliary_loss_mlp": 0.01031933, + "balance_loss_clip": 1.23713303, + "balance_loss_mlp": 1.01238227, + "epoch": 0.9269803096347512, + "flos": 20934011514240.0, + "grad_norm": 2.5063021282623987, + "language_loss": 0.76214749, + "learning_rate": 5.5626701568859624e-08, + "loss": 0.78643316, + "num_input_tokens_seen": 332794320, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.19555664, + "step": 15418, + "time_per_iteration": 2.821510076522827 + }, + { + "auxiliary_loss_clip": 0.0139513, + "auxiliary_loss_mlp": 0.01034015, + "balance_loss_clip": 1.23709917, + "balance_loss_mlp": 1.01541805, + "epoch": 0.9270404328874192, + "flos": 28014682181760.0, + "grad_norm": 1.4366616919612427, + "language_loss": 0.77286446, + "learning_rate": 5.553552361633174e-08, + "loss": 0.79715586, + "num_input_tokens_seen": 332818095, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18579102, + "step": 15419, + "time_per_iteration": 2.9277560710906982 + }, + { + "auxiliary_loss_clip": 0.0137459, + "auxiliary_loss_mlp": 0.01029478, + "balance_loss_clip": 1.22054279, + "balance_loss_mlp": 1.01215696, + "epoch": 0.9271005561400871, + "flos": 25902159592320.0, + "grad_norm": 1.605783427351983, + "language_loss": 0.75880808, + "learning_rate": 5.5444419398091636e-08, + "loss": 0.78284872, + "num_input_tokens_seen": 332839860, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.17321777, + "step": 15420, + "time_per_iteration": 2.8773722648620605 + }, + { + "auxiliary_loss_clip": 0.0140362, + "auxiliary_loss_mlp": 0.01035403, + "balance_loss_clip": 1.24185359, + "balance_loss_mlp": 1.0160557, + "epoch": 0.9271606793927551, + "flos": 27065597114880.0, + "grad_norm": 4.034143324091733, + "language_loss": 0.77128834, + "learning_rate": 5.535338891759389e-08, + "loss": 0.79567862, + "num_input_tokens_seen": 332861155, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19348145, + "step": 15421, + "time_per_iteration": 2.9718427658081055 + }, + { + "auxiliary_loss_clip": 0.01392301, + "auxiliary_loss_mlp": 0.01032547, + "balance_loss_clip": 1.23290122, + "balance_loss_mlp": 1.01368856, + "epoch": 0.9272208026454232, + "flos": 26220068691840.0, + "grad_norm": 2.018817800138314, + "language_loss": 0.73626482, + "learning_rate": 5.526243217829041e-08, + "loss": 0.76051331, + "num_input_tokens_seen": 332881110, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.1887207, + "step": 15422, + "time_per_iteration": 3.018231153488159 + }, + { + "auxiliary_loss_clip": 0.01395099, + "auxiliary_loss_mlp": 0.01041515, + "balance_loss_clip": 1.23365188, + "balance_loss_mlp": 1.02095151, + "epoch": 0.9272809258980911, + "flos": 12466049460480.0, + "grad_norm": 5.825333033894933, + "language_loss": 0.78618526, + "learning_rate": 5.517154918363065e-08, + "loss": 0.8105514, + "num_input_tokens_seen": 332899350, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.20556641, + "step": 15423, + "time_per_iteration": 3.0010883808135986 + }, + { + "auxiliary_loss_clip": 0.01409359, + "auxiliary_loss_mlp": 0.01030689, + "balance_loss_clip": 1.24657357, + "balance_loss_mlp": 1.01119852, + "epoch": 0.9273410491507591, + "flos": 22867183140480.0, + "grad_norm": 8.889624381136501, + "language_loss": 0.76023126, + "learning_rate": 5.508073993706053e-08, + "loss": 0.78463179, + "num_input_tokens_seen": 332918105, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.19494629, + "step": 15424, + "time_per_iteration": 2.9217419624328613 + }, + { + "auxiliary_loss_clip": 0.01178065, + "auxiliary_loss_mlp": 0.01033524, + "balance_loss_clip": 1.0904603, + "balance_loss_mlp": 1.01244795, + "epoch": 0.927401172403427, + "flos": 47691110448000.0, + "grad_norm": 0.7801516692567665, + "language_loss": 0.60751653, + "learning_rate": 5.499000444202351e-08, + "loss": 0.62963247, + "num_input_tokens_seen": 332969490, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.2109375, + "step": 15425, + "time_per_iteration": 3.1258013248443604 + }, + { + "auxiliary_loss_clip": 0.01394204, + "auxiliary_loss_mlp": 0.01031493, + "balance_loss_clip": 1.23586595, + "balance_loss_mlp": 1.01255095, + "epoch": 0.927461295656095, + "flos": 29984710337280.0, + "grad_norm": 1.703763798715788, + "language_loss": 0.71538353, + "learning_rate": 5.489934270196106e-08, + "loss": 0.73964047, + "num_input_tokens_seen": 332988805, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.18933105, + "step": 15426, + "time_per_iteration": 2.9189705848693848 + }, + { + "auxiliary_loss_clip": 0.01406467, + "auxiliary_loss_mlp": 0.0103096, + "balance_loss_clip": 1.24772787, + "balance_loss_mlp": 1.01269686, + "epoch": 0.9275214189087629, + "flos": 20384964138240.0, + "grad_norm": 1.7307038911390453, + "language_loss": 0.83313549, + "learning_rate": 5.480875472030977e-08, + "loss": 0.85750973, + "num_input_tokens_seen": 333007960, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18261719, + "step": 15427, + "time_per_iteration": 2.8471221923828125 + }, + { + "auxiliary_loss_clip": 0.0140167, + "auxiliary_loss_mlp": 0.01030577, + "balance_loss_clip": 1.24243498, + "balance_loss_mlp": 1.01233828, + "epoch": 0.927581542161431, + "flos": 22393522880640.0, + "grad_norm": 1.5718315870045299, + "language_loss": 0.77391124, + "learning_rate": 5.471824050050555e-08, + "loss": 0.79823369, + "num_input_tokens_seen": 333026035, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18237305, + "step": 15428, + "time_per_iteration": 2.868229389190674 + }, + { + "auxiliary_loss_clip": 0.01390253, + "auxiliary_loss_mlp": 0.0103442, + "balance_loss_clip": 1.23199916, + "balance_loss_mlp": 1.01465547, + "epoch": 0.9276416654140989, + "flos": 23963287121280.0, + "grad_norm": 1.7220557332613533, + "language_loss": 0.74346983, + "learning_rate": 5.4627800045980555e-08, + "loss": 0.76771653, + "num_input_tokens_seen": 333045590, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.19763184, + "step": 15429, + "time_per_iteration": 2.8907630443573 + }, + { + "auxiliary_loss_clip": 0.0138078, + "auxiliary_loss_mlp": 0.01030195, + "balance_loss_clip": 1.22474575, + "balance_loss_mlp": 1.01125252, + "epoch": 0.9277017886667669, + "flos": 13925651316480.0, + "grad_norm": 1.8609064816622367, + "language_loss": 0.75755256, + "learning_rate": 5.45374333601647e-08, + "loss": 0.78166234, + "num_input_tokens_seen": 333063355, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.18945312, + "step": 15430, + "time_per_iteration": 2.9311399459838867 + }, + { + "auxiliary_loss_clip": 0.01407898, + "auxiliary_loss_mlp": 0.01031108, + "balance_loss_clip": 1.2467382, + "balance_loss_mlp": 1.01097369, + "epoch": 0.9277619119194348, + "flos": 35680894796160.0, + "grad_norm": 1.4726563937669495, + "language_loss": 0.77292871, + "learning_rate": 5.444714044648391e-08, + "loss": 0.7973187, + "num_input_tokens_seen": 333088045, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.20129395, + "step": 15431, + "time_per_iteration": 3.0472373962402344 + }, + { + "auxiliary_loss_clip": 0.01388868, + "auxiliary_loss_mlp": 0.01032391, + "balance_loss_clip": 1.23188937, + "balance_loss_mlp": 1.01390159, + "epoch": 0.9278220351721028, + "flos": 23851586413440.0, + "grad_norm": 1.9745414072805318, + "language_loss": 0.71642464, + "learning_rate": 5.4356921308363e-08, + "loss": 0.74063724, + "num_input_tokens_seen": 333108005, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.18493652, + "step": 15432, + "time_per_iteration": 2.9828691482543945 + }, + { + "auxiliary_loss_clip": 0.01404617, + "auxiliary_loss_mlp": 0.01031697, + "balance_loss_clip": 1.24287105, + "balance_loss_mlp": 1.01352966, + "epoch": 0.9278821584247707, + "flos": 15235610060160.0, + "grad_norm": 2.205155149380228, + "language_loss": 0.83941466, + "learning_rate": 5.4266775949222354e-08, + "loss": 0.86377782, + "num_input_tokens_seen": 333124335, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.18151855, + "step": 15433, + "time_per_iteration": 2.8203232288360596 + }, + { + "auxiliary_loss_clip": 0.01377371, + "auxiliary_loss_mlp": 0.01030272, + "balance_loss_clip": 1.2235781, + "balance_loss_mlp": 1.01193714, + "epoch": 0.9279422816774388, + "flos": 24692228398080.0, + "grad_norm": 4.274568609144984, + "language_loss": 0.67971778, + "learning_rate": 5.417670437248056e-08, + "loss": 0.70379418, + "num_input_tokens_seen": 333143995, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.18334961, + "step": 15434, + "time_per_iteration": 2.860714912414551 + }, + { + "auxiliary_loss_clip": 0.01375274, + "auxiliary_loss_mlp": 0.01035407, + "balance_loss_clip": 1.22335672, + "balance_loss_mlp": 1.01654792, + "epoch": 0.9280024049301068, + "flos": 19177838121600.0, + "grad_norm": 2.677407243313559, + "language_loss": 0.69528139, + "learning_rate": 5.40867065815529e-08, + "loss": 0.71938825, + "num_input_tokens_seen": 333162805, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.18859863, + "step": 15435, + "time_per_iteration": 2.8710451126098633 + }, + { + "auxiliary_loss_clip": 0.01395099, + "auxiliary_loss_mlp": 0.01032398, + "balance_loss_clip": 1.2360518, + "balance_loss_mlp": 1.01316929, + "epoch": 0.9280625281827747, + "flos": 11399291372160.0, + "grad_norm": 1.9992657638309568, + "language_loss": 0.72712463, + "learning_rate": 5.399678257985263e-08, + "loss": 0.75139958, + "num_input_tokens_seen": 333175770, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.19238281, + "step": 15436, + "time_per_iteration": 2.8025665283203125 + }, + { + "auxiliary_loss_clip": 0.01394964, + "auxiliary_loss_mlp": 0.01031965, + "balance_loss_clip": 1.23707891, + "balance_loss_mlp": 1.01345205, + "epoch": 0.9281226514354427, + "flos": 24795106369920.0, + "grad_norm": 1.926569264431376, + "language_loss": 0.67787218, + "learning_rate": 5.390693237078925e-08, + "loss": 0.7021414, + "num_input_tokens_seen": 333194775, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18493652, + "step": 15437, + "time_per_iteration": 2.861618995666504 + }, + { + "auxiliary_loss_clip": 0.01402198, + "auxiliary_loss_mlp": 0.01038277, + "balance_loss_clip": 1.23909152, + "balance_loss_mlp": 1.01677132, + "epoch": 0.9281827746881106, + "flos": 15090808141440.0, + "grad_norm": 2.1577432218025563, + "language_loss": 0.71880764, + "learning_rate": 5.3817155957770254e-08, + "loss": 0.7432124, + "num_input_tokens_seen": 333208920, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.21520996, + "step": 15438, + "time_per_iteration": 4.19848895072937 + }, + { + "auxiliary_loss_clip": 0.01407486, + "auxiliary_loss_mlp": 0.01031902, + "balance_loss_clip": 1.24594975, + "balance_loss_mlp": 1.01385355, + "epoch": 0.9282428979407786, + "flos": 24145986199680.0, + "grad_norm": 1.7651335255279244, + "language_loss": 0.65561461, + "learning_rate": 5.3727453344199366e-08, + "loss": 0.68000847, + "num_input_tokens_seen": 333229350, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18054199, + "step": 15439, + "time_per_iteration": 2.859358549118042 + }, + { + "auxiliary_loss_clip": 0.01399769, + "auxiliary_loss_mlp": 0.01033962, + "balance_loss_clip": 1.24003053, + "balance_loss_mlp": 1.01501989, + "epoch": 0.9283030211934465, + "flos": 24833003529600.0, + "grad_norm": 1.6950998679595688, + "language_loss": 0.70792651, + "learning_rate": 5.363782453347876e-08, + "loss": 0.7322638, + "num_input_tokens_seen": 333246125, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18933105, + "step": 15440, + "time_per_iteration": 2.8510501384735107 + }, + { + "auxiliary_loss_clip": 0.01406499, + "auxiliary_loss_mlp": 0.01035525, + "balance_loss_clip": 1.24241102, + "balance_loss_mlp": 1.01644003, + "epoch": 0.9283631444461146, + "flos": 23989827836160.0, + "grad_norm": 1.669068908184784, + "language_loss": 0.77068555, + "learning_rate": 5.354826952900682e-08, + "loss": 0.79510581, + "num_input_tokens_seen": 333263685, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19104004, + "step": 15441, + "time_per_iteration": 4.413456678390503 + }, + { + "auxiliary_loss_clip": 0.01383355, + "auxiliary_loss_mlp": 0.01027803, + "balance_loss_clip": 1.22852802, + "balance_loss_mlp": 1.01039863, + "epoch": 0.9284232676987825, + "flos": 22794963160320.0, + "grad_norm": 1.69040443915942, + "language_loss": 0.64514256, + "learning_rate": 5.345878833417949e-08, + "loss": 0.66925406, + "num_input_tokens_seen": 333282435, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.17407227, + "step": 15442, + "time_per_iteration": 2.843022346496582 + }, + { + "auxiliary_loss_clip": 0.01407439, + "auxiliary_loss_mlp": 0.01032534, + "balance_loss_clip": 1.24361372, + "balance_loss_mlp": 1.01379395, + "epoch": 0.9284833909514505, + "flos": 19509999333120.0, + "grad_norm": 2.438023358481136, + "language_loss": 0.81480992, + "learning_rate": 5.3369380952390295e-08, + "loss": 0.83920968, + "num_input_tokens_seen": 333300400, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.18762207, + "step": 15443, + "time_per_iteration": 2.8598039150238037 + }, + { + "auxiliary_loss_clip": 0.01408403, + "auxiliary_loss_mlp": 0.01032077, + "balance_loss_clip": 1.24766147, + "balance_loss_mlp": 1.01305151, + "epoch": 0.9285435142041184, + "flos": 23196539174400.0, + "grad_norm": 1.8353384305732534, + "language_loss": 0.65761244, + "learning_rate": 5.328004738702896e-08, + "loss": 0.68201721, + "num_input_tokens_seen": 333318980, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19006348, + "step": 15444, + "time_per_iteration": 2.873081922531128 + }, + { + "auxiliary_loss_clip": 0.01395371, + "auxiliary_loss_mlp": 0.0103227, + "balance_loss_clip": 1.23503506, + "balance_loss_mlp": 1.01329184, + "epoch": 0.9286036374567864, + "flos": 17684365893120.0, + "grad_norm": 2.042333716497454, + "language_loss": 0.74084258, + "learning_rate": 5.3190787641483215e-08, + "loss": 0.76511896, + "num_input_tokens_seen": 333334135, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18969727, + "step": 15445, + "time_per_iteration": 2.801954746246338 + }, + { + "auxiliary_loss_clip": 0.01395901, + "auxiliary_loss_mlp": 0.01032545, + "balance_loss_clip": 1.23682487, + "balance_loss_mlp": 1.01328063, + "epoch": 0.9286637607094543, + "flos": 20896476312960.0, + "grad_norm": 1.7982940718524525, + "language_loss": 0.71815288, + "learning_rate": 5.3101601719138135e-08, + "loss": 0.74243736, + "num_input_tokens_seen": 333353325, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19262695, + "step": 15446, + "time_per_iteration": 2.886275053024292 + }, + { + "auxiliary_loss_clip": 0.01427791, + "auxiliary_loss_mlp": 0.01029968, + "balance_loss_clip": 1.26107788, + "balance_loss_mlp": 1.01178885, + "epoch": 0.9287238839621224, + "flos": 19035343687680.0, + "grad_norm": 1.848944680981843, + "language_loss": 0.69696689, + "learning_rate": 5.301248962337523e-08, + "loss": 0.72154444, + "num_input_tokens_seen": 333371110, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.1817627, + "step": 15447, + "time_per_iteration": 2.8518166542053223 + }, + { + "auxiliary_loss_clip": 0.01376706, + "auxiliary_loss_mlp": 0.01032167, + "balance_loss_clip": 1.22422266, + "balance_loss_mlp": 1.01439309, + "epoch": 0.9287840072147904, + "flos": 20566396362240.0, + "grad_norm": 2.3289081071173747, + "language_loss": 0.72867674, + "learning_rate": 5.292345135757403e-08, + "loss": 0.75276542, + "num_input_tokens_seen": 333391420, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.17785645, + "step": 15448, + "time_per_iteration": 5.764143228530884 + }, + { + "auxiliary_loss_clip": 0.01391461, + "auxiliary_loss_mlp": 0.01030891, + "balance_loss_clip": 1.23326683, + "balance_loss_mlp": 1.011603, + "epoch": 0.9288441304674583, + "flos": 21260517125760.0, + "grad_norm": 1.7473995488421628, + "language_loss": 0.74996889, + "learning_rate": 5.283448692511072e-08, + "loss": 0.77419239, + "num_input_tokens_seen": 333410365, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.19274902, + "step": 15449, + "time_per_iteration": 2.811849355697632 + }, + { + "auxiliary_loss_clip": 0.01398114, + "auxiliary_loss_mlp": 0.01030634, + "balance_loss_clip": 1.23903275, + "balance_loss_mlp": 1.01120317, + "epoch": 0.9289042537201263, + "flos": 27680575443840.0, + "grad_norm": 1.961665688874903, + "language_loss": 0.68557924, + "learning_rate": 5.27455963293586e-08, + "loss": 0.70986676, + "num_input_tokens_seen": 333430000, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.19445801, + "step": 15450, + "time_per_iteration": 2.865870714187622 + }, + { + "auxiliary_loss_clip": 0.01395814, + "auxiliary_loss_mlp": 0.01029026, + "balance_loss_clip": 1.23522043, + "balance_loss_mlp": 1.00986874, + "epoch": 0.9289643769727942, + "flos": 19327119275520.0, + "grad_norm": 5.120934890009868, + "language_loss": 0.72858953, + "learning_rate": 5.265677957368875e-08, + "loss": 0.7528379, + "num_input_tokens_seen": 333445800, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19165039, + "step": 15451, + "time_per_iteration": 2.8286430835723877 + }, + { + "auxiliary_loss_clip": 0.01400193, + "auxiliary_loss_mlp": 0.01035774, + "balance_loss_clip": 1.2401464, + "balance_loss_mlp": 1.01724863, + "epoch": 0.9290245002254622, + "flos": 14064571411200.0, + "grad_norm": 2.074166530762542, + "language_loss": 0.74226093, + "learning_rate": 5.25680366614687e-08, + "loss": 0.76662058, + "num_input_tokens_seen": 333461550, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18530273, + "step": 15452, + "time_per_iteration": 2.8131699562072754 + }, + { + "auxiliary_loss_clip": 0.01398242, + "auxiliary_loss_mlp": 0.01036588, + "balance_loss_clip": 1.24115133, + "balance_loss_mlp": 1.01724029, + "epoch": 0.9290846234781301, + "flos": 20056603489920.0, + "grad_norm": 1.8531168929229556, + "language_loss": 0.74603724, + "learning_rate": 5.2479367596064196e-08, + "loss": 0.7703855, + "num_input_tokens_seen": 333478835, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.19360352, + "step": 15453, + "time_per_iteration": 2.8120665550231934 + }, + { + "auxiliary_loss_clip": 0.01177317, + "auxiliary_loss_mlp": 0.01036021, + "balance_loss_clip": 1.09123957, + "balance_loss_mlp": 1.01284683, + "epoch": 0.9291447467307982, + "flos": 61254015068160.0, + "grad_norm": 0.8343949688529111, + "language_loss": 0.6066829, + "learning_rate": 5.2390772380837226e-08, + "loss": 0.62881631, + "num_input_tokens_seen": 333535250, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.23144531, + "step": 15454, + "time_per_iteration": 3.249325752258301 + }, + { + "auxiliary_loss_clip": 0.01411259, + "auxiliary_loss_mlp": 0.0103659, + "balance_loss_clip": 1.25009418, + "balance_loss_mlp": 1.01750422, + "epoch": 0.9292048699834661, + "flos": 20561736147840.0, + "grad_norm": 1.5635289844004099, + "language_loss": 0.70058358, + "learning_rate": 5.230225101914709e-08, + "loss": 0.72506201, + "num_input_tokens_seen": 333553805, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.1907959, + "step": 15455, + "time_per_iteration": 2.8329641819000244 + }, + { + "auxiliary_loss_clip": 0.01405066, + "auxiliary_loss_mlp": 0.01033129, + "balance_loss_clip": 1.24497938, + "balance_loss_mlp": 1.01347125, + "epoch": 0.9292649932361341, + "flos": 23634293045760.0, + "grad_norm": 1.6249263218817902, + "language_loss": 0.65462375, + "learning_rate": 5.22138035143509e-08, + "loss": 0.67900574, + "num_input_tokens_seen": 333572800, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19641113, + "step": 15456, + "time_per_iteration": 2.846841335296631 + }, + { + "auxiliary_loss_clip": 0.01396224, + "auxiliary_loss_mlp": 0.01032379, + "balance_loss_clip": 1.23922014, + "balance_loss_mlp": 1.01247144, + "epoch": 0.929325116488802, + "flos": 15017547530880.0, + "grad_norm": 1.6880542702097234, + "language_loss": 0.6881209, + "learning_rate": 5.2125429869802615e-08, + "loss": 0.71240699, + "num_input_tokens_seen": 333588520, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.19909668, + "step": 15457, + "time_per_iteration": 3.008776903152466 + }, + { + "auxiliary_loss_clip": 0.01404239, + "auxiliary_loss_mlp": 0.01031741, + "balance_loss_clip": 1.24268103, + "balance_loss_mlp": 1.01310897, + "epoch": 0.92938523974147, + "flos": 17976277215360.0, + "grad_norm": 8.031203657693178, + "language_loss": 0.81766844, + "learning_rate": 5.203713008885291e-08, + "loss": 0.84202832, + "num_input_tokens_seen": 333603435, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.1862793, + "step": 15458, + "time_per_iteration": 2.8365681171417236 + }, + { + "auxiliary_loss_clip": 0.01404733, + "auxiliary_loss_mlp": 0.01036277, + "balance_loss_clip": 1.24479854, + "balance_loss_mlp": 1.0172987, + "epoch": 0.9294453629941379, + "flos": 23013432892800.0, + "grad_norm": 1.720808273458055, + "language_loss": 0.72796905, + "learning_rate": 5.194890417485065e-08, + "loss": 0.75237918, + "num_input_tokens_seen": 333623305, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18969727, + "step": 15459, + "time_per_iteration": 2.9145281314849854 + }, + { + "auxiliary_loss_clip": 0.01402706, + "auxiliary_loss_mlp": 0.01031293, + "balance_loss_clip": 1.24308395, + "balance_loss_mlp": 1.01206422, + "epoch": 0.929505486246806, + "flos": 17063912943360.0, + "grad_norm": 2.5833687990375878, + "language_loss": 0.59963191, + "learning_rate": 5.1860752131141384e-08, + "loss": 0.62397194, + "num_input_tokens_seen": 333641205, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19238281, + "step": 15460, + "time_per_iteration": 2.80780029296875 + }, + { + "auxiliary_loss_clip": 0.01408734, + "auxiliary_loss_mlp": 0.01033069, + "balance_loss_clip": 1.24647164, + "balance_loss_mlp": 1.01400805, + "epoch": 0.9295656094994739, + "flos": 27350766961920.0, + "grad_norm": 2.0615734432473336, + "language_loss": 0.81050146, + "learning_rate": 5.177267396106733e-08, + "loss": 0.83491945, + "num_input_tokens_seen": 333659615, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19067383, + "step": 15461, + "time_per_iteration": 2.963458776473999 + }, + { + "auxiliary_loss_clip": 0.01397822, + "auxiliary_loss_mlp": 0.01032148, + "balance_loss_clip": 1.23940372, + "balance_loss_mlp": 1.01364708, + "epoch": 0.9296257327521419, + "flos": 21481022874240.0, + "grad_norm": 2.5567888995192996, + "language_loss": 0.7872315, + "learning_rate": 5.168466966796869e-08, + "loss": 0.81153125, + "num_input_tokens_seen": 333678985, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.18505859, + "step": 15462, + "time_per_iteration": 2.899139165878296 + }, + { + "auxiliary_loss_clip": 0.0139372, + "auxiliary_loss_mlp": 0.01030305, + "balance_loss_clip": 1.23453164, + "balance_loss_mlp": 1.0124352, + "epoch": 0.9296858560048099, + "flos": 16371330503040.0, + "grad_norm": 3.3537462363934614, + "language_loss": 0.63932645, + "learning_rate": 5.159673925518282e-08, + "loss": 0.66356665, + "num_input_tokens_seen": 333696410, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.17858887, + "step": 15463, + "time_per_iteration": 2.8409464359283447 + }, + { + "auxiliary_loss_clip": 0.01398692, + "auxiliary_loss_mlp": 0.01029284, + "balance_loss_clip": 1.23957491, + "balance_loss_mlp": 1.01127172, + "epoch": 0.9297459792574778, + "flos": 29869797248640.0, + "grad_norm": 1.5026908600929316, + "language_loss": 0.71338594, + "learning_rate": 5.15088827260437e-08, + "loss": 0.73766565, + "num_input_tokens_seen": 333716615, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18029785, + "step": 15464, + "time_per_iteration": 2.9600491523742676 + }, + { + "auxiliary_loss_clip": 0.01413519, + "auxiliary_loss_mlp": 0.01033187, + "balance_loss_clip": 1.2517333, + "balance_loss_mlp": 1.0143162, + "epoch": 0.9298061025101458, + "flos": 15933395652480.0, + "grad_norm": 1.7867679505395258, + "language_loss": 0.78061175, + "learning_rate": 5.1421100083883115e-08, + "loss": 0.8050788, + "num_input_tokens_seen": 333732800, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.1887207, + "step": 15465, + "time_per_iteration": 2.810683250427246 + }, + { + "auxiliary_loss_clip": 0.01180507, + "auxiliary_loss_mlp": 0.01026328, + "balance_loss_clip": 1.09312689, + "balance_loss_mlp": 1.00344014, + "epoch": 0.9298662257628137, + "flos": 64130362675200.0, + "grad_norm": 1.0041184608424722, + "language_loss": 0.56475514, + "learning_rate": 5.133339133202952e-08, + "loss": 0.58682346, + "num_input_tokens_seen": 333799300, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.22851562, + "step": 15466, + "time_per_iteration": 3.4860854148864746 + }, + { + "auxiliary_loss_clip": 0.01412578, + "auxiliary_loss_mlp": 0.01035214, + "balance_loss_clip": 1.24971497, + "balance_loss_mlp": 1.01503229, + "epoch": 0.9299263490154818, + "flos": 24290969097600.0, + "grad_norm": 1.4595564802218013, + "language_loss": 0.73509037, + "learning_rate": 5.1245756473809355e-08, + "loss": 0.75956821, + "num_input_tokens_seen": 333820360, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.2019043, + "step": 15467, + "time_per_iteration": 2.9434194564819336 + }, + { + "auxiliary_loss_clip": 0.01405987, + "auxiliary_loss_mlp": 0.01032659, + "balance_loss_clip": 1.24556065, + "balance_loss_mlp": 1.01416969, + "epoch": 0.9299864722681497, + "flos": 23304665543040.0, + "grad_norm": 1.7138567186543188, + "language_loss": 0.72202504, + "learning_rate": 5.1158195512545076e-08, + "loss": 0.7464115, + "num_input_tokens_seen": 333840415, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18493652, + "step": 15468, + "time_per_iteration": 2.869332790374756 + }, + { + "auxiliary_loss_clip": 0.01410022, + "auxiliary_loss_mlp": 0.01031098, + "balance_loss_clip": 1.24655104, + "balance_loss_mlp": 1.01196527, + "epoch": 0.9300465955208177, + "flos": 21405500023680.0, + "grad_norm": 2.2263819183914255, + "language_loss": 0.75439489, + "learning_rate": 5.107070845155737e-08, + "loss": 0.77880603, + "num_input_tokens_seen": 333859910, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19140625, + "step": 15469, + "time_per_iteration": 2.8619065284729004 + }, + { + "auxiliary_loss_clip": 0.01409384, + "auxiliary_loss_mlp": 0.01035358, + "balance_loss_clip": 1.24725449, + "balance_loss_mlp": 1.01635551, + "epoch": 0.9301067187734856, + "flos": 24582111258240.0, + "grad_norm": 1.9070902894506503, + "language_loss": 0.76470065, + "learning_rate": 5.098329529416379e-08, + "loss": 0.78914803, + "num_input_tokens_seen": 333880495, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19006348, + "step": 15470, + "time_per_iteration": 2.882906913757324 + }, + { + "auxiliary_loss_clip": 0.01395548, + "auxiliary_loss_mlp": 0.01030477, + "balance_loss_clip": 1.23822999, + "balance_loss_mlp": 1.01289332, + "epoch": 0.9301668420261536, + "flos": 22206254077440.0, + "grad_norm": 1.7248033246758643, + "language_loss": 0.75034904, + "learning_rate": 5.089595604367902e-08, + "loss": 0.77460933, + "num_input_tokens_seen": 333897640, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.17590332, + "step": 15471, + "time_per_iteration": 2.863020420074463 + }, + { + "auxiliary_loss_clip": 0.01391468, + "auxiliary_loss_mlp": 0.01035354, + "balance_loss_clip": 1.23336983, + "balance_loss_mlp": 1.01685238, + "epoch": 0.9302269652788215, + "flos": 17756269159680.0, + "grad_norm": 3.0694294444238888, + "language_loss": 0.69885314, + "learning_rate": 5.080869070341487e-08, + "loss": 0.72312135, + "num_input_tokens_seen": 333913670, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.18505859, + "step": 15472, + "time_per_iteration": 2.848362922668457 + }, + { + "auxiliary_loss_clip": 0.0137463, + "auxiliary_loss_mlp": 0.01030854, + "balance_loss_clip": 1.22090125, + "balance_loss_mlp": 1.01211452, + "epoch": 0.9302870885314896, + "flos": 19400198906880.0, + "grad_norm": 1.6682812068819237, + "language_loss": 0.89358014, + "learning_rate": 5.0721499276680233e-08, + "loss": 0.91763496, + "num_input_tokens_seen": 333934105, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.18737793, + "step": 15473, + "time_per_iteration": 4.369073390960693 + }, + { + "auxiliary_loss_clip": 0.01405179, + "auxiliary_loss_mlp": 0.01034397, + "balance_loss_clip": 1.24310911, + "balance_loss_mlp": 1.0145843, + "epoch": 0.9303472117841575, + "flos": 21769767060480.0, + "grad_norm": 1.843197473469873, + "language_loss": 0.65752774, + "learning_rate": 5.063438176678203e-08, + "loss": 0.68192351, + "num_input_tokens_seen": 333953635, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19799805, + "step": 15474, + "time_per_iteration": 2.8484296798706055 + }, + { + "auxiliary_loss_clip": 0.01401723, + "auxiliary_loss_mlp": 0.01035949, + "balance_loss_clip": 1.24253392, + "balance_loss_mlp": 1.0170548, + "epoch": 0.9304073350368255, + "flos": 19619211576960.0, + "grad_norm": 1.835652518903361, + "language_loss": 0.75263071, + "learning_rate": 5.054733817702339e-08, + "loss": 0.7770074, + "num_input_tokens_seen": 333971825, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18896484, + "step": 15475, + "time_per_iteration": 2.8758738040924072 + }, + { + "auxiliary_loss_clip": 0.01392948, + "auxiliary_loss_mlp": 0.01027234, + "balance_loss_clip": 1.23388088, + "balance_loss_mlp": 1.00956702, + "epoch": 0.9304674582894935, + "flos": 30452714997120.0, + "grad_norm": 1.9980120870498477, + "language_loss": 0.6694622, + "learning_rate": 5.0460368510704786e-08, + "loss": 0.69366401, + "num_input_tokens_seen": 333990120, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.17663574, + "step": 15476, + "time_per_iteration": 4.457249879837036 + }, + { + "auxiliary_loss_clip": 0.01404283, + "auxiliary_loss_mlp": 0.01034367, + "balance_loss_clip": 1.24361491, + "balance_loss_mlp": 1.01470923, + "epoch": 0.9305275815421614, + "flos": 17794754501760.0, + "grad_norm": 2.3515241115700327, + "language_loss": 0.69696951, + "learning_rate": 5.0373472771124914e-08, + "loss": 0.72135597, + "num_input_tokens_seen": 334007970, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19665527, + "step": 15477, + "time_per_iteration": 2.8219234943389893 + }, + { + "auxiliary_loss_clip": 0.01387924, + "auxiliary_loss_mlp": 0.01030722, + "balance_loss_clip": 1.2319169, + "balance_loss_mlp": 1.01338959, + "epoch": 0.9305877047948294, + "flos": 25308609315840.0, + "grad_norm": 1.6345233383513778, + "language_loss": 0.58830214, + "learning_rate": 5.0286650961578027e-08, + "loss": 0.61248857, + "num_input_tokens_seen": 334027120, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.17333984, + "step": 15478, + "time_per_iteration": 2.899062395095825 + }, + { + "auxiliary_loss_clip": 0.01412842, + "auxiliary_loss_mlp": 0.01032005, + "balance_loss_clip": 1.2475971, + "balance_loss_mlp": 1.01293182, + "epoch": 0.9306478280474973, + "flos": 16984589529600.0, + "grad_norm": 2.03581271570634, + "language_loss": 0.7995832, + "learning_rate": 5.01999030853566e-08, + "loss": 0.82403165, + "num_input_tokens_seen": 334042785, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19067383, + "step": 15479, + "time_per_iteration": 2.8235297203063965 + }, + { + "auxiliary_loss_clip": 0.0139886, + "auxiliary_loss_mlp": 0.01031372, + "balance_loss_clip": 1.2388916, + "balance_loss_mlp": 1.01337147, + "epoch": 0.9307079513001654, + "flos": 35676958498560.0, + "grad_norm": 24.500826176437645, + "language_loss": 0.69193089, + "learning_rate": 5.0113229145750445e-08, + "loss": 0.71623319, + "num_input_tokens_seen": 334063480, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18005371, + "step": 15480, + "time_per_iteration": 2.9760823249816895 + }, + { + "auxiliary_loss_clip": 0.01406072, + "auxiliary_loss_mlp": 0.01031827, + "balance_loss_clip": 1.24577832, + "balance_loss_mlp": 1.01406527, + "epoch": 0.9307680745528333, + "flos": 19217454583680.0, + "grad_norm": 1.9198016412630379, + "language_loss": 0.68475646, + "learning_rate": 5.002662914604583e-08, + "loss": 0.70913547, + "num_input_tokens_seen": 334082005, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.1776123, + "step": 15481, + "time_per_iteration": 2.8418664932250977 + }, + { + "auxiliary_loss_clip": 0.01399473, + "auxiliary_loss_mlp": 0.01031776, + "balance_loss_clip": 1.24092627, + "balance_loss_mlp": 1.01337051, + "epoch": 0.9308281978055013, + "flos": 19072019237760.0, + "grad_norm": 3.3141222292463564, + "language_loss": 0.75843906, + "learning_rate": 4.994010308952701e-08, + "loss": 0.78275156, + "num_input_tokens_seen": 334101375, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18408203, + "step": 15482, + "time_per_iteration": 2.899949312210083 + }, + { + "auxiliary_loss_clip": 0.01388164, + "auxiliary_loss_mlp": 0.01030333, + "balance_loss_clip": 1.23194551, + "balance_loss_mlp": 1.01184404, + "epoch": 0.9308883210581692, + "flos": 20531168645760.0, + "grad_norm": 1.6905379955966857, + "language_loss": 0.80030274, + "learning_rate": 4.985365097947469e-08, + "loss": 0.82448775, + "num_input_tokens_seen": 334119460, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.18481445, + "step": 15483, + "time_per_iteration": 5.622990369796753 + }, + { + "auxiliary_loss_clip": 0.01394523, + "auxiliary_loss_mlp": 0.01030201, + "balance_loss_clip": 1.23564303, + "balance_loss_mlp": 1.01128232, + "epoch": 0.9309484443108372, + "flos": 13008038647680.0, + "grad_norm": 2.015901487020223, + "language_loss": 0.75239956, + "learning_rate": 4.976727281916782e-08, + "loss": 0.77664673, + "num_input_tokens_seen": 334136065, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18920898, + "step": 15484, + "time_per_iteration": 2.8016207218170166 + }, + { + "auxiliary_loss_clip": 0.01404834, + "auxiliary_loss_mlp": 0.01036553, + "balance_loss_clip": 1.24339843, + "balance_loss_mlp": 1.01752734, + "epoch": 0.9310085675635051, + "flos": 12575352193920.0, + "grad_norm": 2.3175244887908253, + "language_loss": 0.77351135, + "learning_rate": 4.968096861188087e-08, + "loss": 0.79792523, + "num_input_tokens_seen": 334153690, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19018555, + "step": 15485, + "time_per_iteration": 2.8489465713500977 + }, + { + "auxiliary_loss_clip": 0.01408545, + "auxiliary_loss_mlp": 0.01035077, + "balance_loss_clip": 1.24544239, + "balance_loss_mlp": 1.01541924, + "epoch": 0.9310686908161732, + "flos": 23488133783040.0, + "grad_norm": 2.1994976877598202, + "language_loss": 0.79084432, + "learning_rate": 4.959473836088723e-08, + "loss": 0.8152805, + "num_input_tokens_seen": 334171880, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19665527, + "step": 15486, + "time_per_iteration": 2.876664638519287 + }, + { + "auxiliary_loss_clip": 0.01405718, + "auxiliary_loss_mlp": 0.01030199, + "balance_loss_clip": 1.24368465, + "balance_loss_mlp": 1.01126838, + "epoch": 0.9311288140688411, + "flos": 24180897202560.0, + "grad_norm": 2.1062959573629776, + "language_loss": 0.78004003, + "learning_rate": 4.950858206945674e-08, + "loss": 0.80439913, + "num_input_tokens_seen": 334190005, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18920898, + "step": 15487, + "time_per_iteration": 2.881012201309204 + }, + { + "auxiliary_loss_clip": 0.01394743, + "auxiliary_loss_mlp": 0.0103215, + "balance_loss_clip": 1.23548198, + "balance_loss_mlp": 1.01295733, + "epoch": 0.9311889373215091, + "flos": 35603743132800.0, + "grad_norm": 2.413693513029208, + "language_loss": 0.68221402, + "learning_rate": 4.942249974085633e-08, + "loss": 0.70648295, + "num_input_tokens_seen": 334209545, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.19189453, + "step": 15488, + "time_per_iteration": 2.964520215988159 + }, + { + "auxiliary_loss_clip": 0.01382354, + "auxiliary_loss_mlp": 0.01027945, + "balance_loss_clip": 1.22689104, + "balance_loss_mlp": 1.01039732, + "epoch": 0.9312490605741771, + "flos": 20240297953920.0, + "grad_norm": 2.0224055311726845, + "language_loss": 0.75728869, + "learning_rate": 4.933649137834983e-08, + "loss": 0.78139162, + "num_input_tokens_seen": 334228900, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.17553711, + "step": 15489, + "time_per_iteration": 2.8474299907684326 + }, + { + "auxiliary_loss_clip": 0.0141314, + "auxiliary_loss_mlp": 0.01033874, + "balance_loss_clip": 1.24912345, + "balance_loss_mlp": 1.01348901, + "epoch": 0.931309183826845, + "flos": 13957576162560.0, + "grad_norm": 2.105668953530822, + "language_loss": 0.81723392, + "learning_rate": 4.925055698519931e-08, + "loss": 0.84170401, + "num_input_tokens_seen": 334245500, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.20385742, + "step": 15490, + "time_per_iteration": 2.940305471420288 + }, + { + "auxiliary_loss_clip": 0.01400414, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.23912501, + "balance_loss_mlp": 1.01153755, + "epoch": 0.931369307079513, + "flos": 20166268181760.0, + "grad_norm": 1.6912054180218208, + "language_loss": 0.72774243, + "learning_rate": 4.9164696564663264e-08, + "loss": 0.7520541, + "num_input_tokens_seen": 334264370, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.1920166, + "step": 15491, + "time_per_iteration": 2.9710636138916016 + }, + { + "auxiliary_loss_clip": 0.01384309, + "auxiliary_loss_mlp": 0.01030631, + "balance_loss_clip": 1.22814667, + "balance_loss_mlp": 1.01217699, + "epoch": 0.931429430332181, + "flos": 25349673611520.0, + "grad_norm": 1.7583584433213046, + "language_loss": 0.75425017, + "learning_rate": 4.9078910119997096e-08, + "loss": 0.77839959, + "num_input_tokens_seen": 334283905, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.18444824, + "step": 15492, + "time_per_iteration": 2.865389108657837 + }, + { + "auxiliary_loss_clip": 0.01186909, + "auxiliary_loss_mlp": 0.0103972, + "balance_loss_clip": 1.09701061, + "balance_loss_mlp": 1.01635504, + "epoch": 0.931489553584849, + "flos": 71255699239680.0, + "grad_norm": 0.7131006879813684, + "language_loss": 0.53454566, + "learning_rate": 4.899319765445442e-08, + "loss": 0.55681193, + "num_input_tokens_seen": 334339925, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.23339844, + "step": 15493, + "time_per_iteration": 3.229370594024658 + }, + { + "auxiliary_loss_clip": 0.01404392, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.2448647, + "balance_loss_mlp": 1.0136683, + "epoch": 0.9315496768375169, + "flos": 14650792030080.0, + "grad_norm": 2.989405385540274, + "language_loss": 0.71822214, + "learning_rate": 4.890755917128531e-08, + "loss": 0.74259031, + "num_input_tokens_seen": 334357225, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18774414, + "step": 15494, + "time_per_iteration": 2.830939531326294 + }, + { + "auxiliary_loss_clip": 0.01404025, + "auxiliary_loss_mlp": 0.01031058, + "balance_loss_clip": 1.24137282, + "balance_loss_mlp": 1.01159143, + "epoch": 0.9316098000901849, + "flos": 28341685486080.0, + "grad_norm": 1.524988114132579, + "language_loss": 0.68876576, + "learning_rate": 4.882199467373671e-08, + "loss": 0.71311665, + "num_input_tokens_seen": 334375945, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19458008, + "step": 15495, + "time_per_iteration": 2.951896905899048 + }, + { + "auxiliary_loss_clip": 0.01387917, + "auxiliary_loss_mlp": 0.01032996, + "balance_loss_clip": 1.23134255, + "balance_loss_mlp": 1.01559114, + "epoch": 0.9316699233428528, + "flos": 28524565543680.0, + "grad_norm": 1.796601263248502, + "language_loss": 0.62562013, + "learning_rate": 4.8736504165053815e-08, + "loss": 0.64982927, + "num_input_tokens_seen": 334395310, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.17407227, + "step": 15496, + "time_per_iteration": 2.97820782661438 + }, + { + "auxiliary_loss_clip": 0.01393479, + "auxiliary_loss_mlp": 0.01034517, + "balance_loss_clip": 1.23363328, + "balance_loss_mlp": 1.01581323, + "epoch": 0.9317300465955208, + "flos": 33706568384640.0, + "grad_norm": 1.5969162434435715, + "language_loss": 0.77744097, + "learning_rate": 4.865108764847825e-08, + "loss": 0.80172098, + "num_input_tokens_seen": 334416965, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18713379, + "step": 15497, + "time_per_iteration": 2.9990978240966797 + }, + { + "auxiliary_loss_clip": 0.0141234, + "auxiliary_loss_mlp": 0.01037254, + "balance_loss_clip": 1.24879622, + "balance_loss_mlp": 1.01775169, + "epoch": 0.9317901698481887, + "flos": 23668389642240.0, + "grad_norm": 1.610980878514427, + "language_loss": 0.66951978, + "learning_rate": 4.856574512724898e-08, + "loss": 0.6940158, + "num_input_tokens_seen": 334435620, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19494629, + "step": 15498, + "time_per_iteration": 2.872574806213379 + }, + { + "auxiliary_loss_clip": 0.01400455, + "auxiliary_loss_mlp": 0.01034435, + "balance_loss_clip": 1.24047172, + "balance_loss_mlp": 1.01540947, + "epoch": 0.9318502931008568, + "flos": 20969917902720.0, + "grad_norm": 1.7433947083530605, + "language_loss": 0.80240792, + "learning_rate": 4.8480476604602305e-08, + "loss": 0.82675683, + "num_input_tokens_seen": 334456210, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19030762, + "step": 15499, + "time_per_iteration": 2.904355764389038 + }, + { + "auxiliary_loss_clip": 0.01382558, + "auxiliary_loss_mlp": 0.01032452, + "balance_loss_clip": 1.2283076, + "balance_loss_mlp": 1.01305652, + "epoch": 0.9319104163535247, + "flos": 23451593967360.0, + "grad_norm": 1.6866448625865487, + "language_loss": 0.77328813, + "learning_rate": 4.8395282083771196e-08, + "loss": 0.79743814, + "num_input_tokens_seen": 334475485, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.1940918, + "step": 15500, + "time_per_iteration": 2.8918533325195312 + }, + { + "auxiliary_loss_clip": 0.01388127, + "auxiliary_loss_mlp": 0.01029162, + "balance_loss_clip": 1.23203373, + "balance_loss_mlp": 1.01052928, + "epoch": 0.9319705396061927, + "flos": 22357933205760.0, + "grad_norm": 1.5980520414634642, + "language_loss": 0.72939521, + "learning_rate": 4.8310161567987064e-08, + "loss": 0.75356811, + "num_input_tokens_seen": 334494740, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.18652344, + "step": 15501, + "time_per_iteration": 2.9042775630950928 + }, + { + "auxiliary_loss_clip": 0.01398313, + "auxiliary_loss_mlp": 0.01028509, + "balance_loss_clip": 1.2368443, + "balance_loss_mlp": 1.01029384, + "epoch": 0.9320306628588607, + "flos": 21002702400000.0, + "grad_norm": 2.5065456392803522, + "language_loss": 0.672454, + "learning_rate": 4.822511506047666e-08, + "loss": 0.69672227, + "num_input_tokens_seen": 334511910, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18212891, + "step": 15502, + "time_per_iteration": 2.9093074798583984 + }, + { + "auxiliary_loss_clip": 0.0140846, + "auxiliary_loss_mlp": 0.01036124, + "balance_loss_clip": 1.24635577, + "balance_loss_mlp": 1.01705098, + "epoch": 0.9320907861115286, + "flos": 24549824453760.0, + "grad_norm": 1.4922832727814672, + "language_loss": 0.6618911, + "learning_rate": 4.814014256446586e-08, + "loss": 0.68633687, + "num_input_tokens_seen": 334533150, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19067383, + "step": 15503, + "time_per_iteration": 2.9178342819213867 + }, + { + "auxiliary_loss_clip": 0.01403777, + "auxiliary_loss_mlp": 0.01029631, + "balance_loss_clip": 1.24126768, + "balance_loss_mlp": 1.01058125, + "epoch": 0.9321509093641966, + "flos": 19793178408960.0, + "grad_norm": 1.5282282575169222, + "language_loss": 0.75753772, + "learning_rate": 4.805524408317652e-08, + "loss": 0.78187186, + "num_input_tokens_seen": 334550940, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19055176, + "step": 15504, + "time_per_iteration": 2.8795325756073 + }, + { + "auxiliary_loss_clip": 0.01412172, + "auxiliary_loss_mlp": 0.01034781, + "balance_loss_clip": 1.25141978, + "balance_loss_mlp": 1.01471817, + "epoch": 0.9322110326168646, + "flos": 24983415803520.0, + "grad_norm": 4.794244815377157, + "language_loss": 0.72093689, + "learning_rate": 4.797041961982762e-08, + "loss": 0.74540639, + "num_input_tokens_seen": 334570935, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.20043945, + "step": 15505, + "time_per_iteration": 2.923491954803467 + }, + { + "auxiliary_loss_clip": 0.0139794, + "auxiliary_loss_mlp": 0.01035732, + "balance_loss_clip": 1.23620033, + "balance_loss_mlp": 1.01606286, + "epoch": 0.9322711558695326, + "flos": 16152227343360.0, + "grad_norm": 1.8128670908932207, + "language_loss": 0.76193595, + "learning_rate": 4.788566917763614e-08, + "loss": 0.78627264, + "num_input_tokens_seen": 334589315, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19665527, + "step": 15506, + "time_per_iteration": 2.8324222564697266 + }, + { + "auxiliary_loss_clip": 0.01384288, + "auxiliary_loss_mlp": 0.0102873, + "balance_loss_clip": 1.22940707, + "balance_loss_mlp": 1.01032424, + "epoch": 0.9323312791222005, + "flos": 23742917107200.0, + "grad_norm": 2.449768793748549, + "language_loss": 0.83756793, + "learning_rate": 4.780099275981597e-08, + "loss": 0.86169809, + "num_input_tokens_seen": 334608990, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.18395996, + "step": 15507, + "time_per_iteration": 2.8786239624023438 + }, + { + "auxiliary_loss_clip": 0.0139821, + "auxiliary_loss_mlp": 0.0103563, + "balance_loss_clip": 1.23676038, + "balance_loss_mlp": 1.01591265, + "epoch": 0.9323914023748685, + "flos": 20787852251520.0, + "grad_norm": 1.602206790039189, + "language_loss": 0.68255544, + "learning_rate": 4.771639036957742e-08, + "loss": 0.70689386, + "num_input_tokens_seen": 334628655, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19726562, + "step": 15508, + "time_per_iteration": 4.287375211715698 + }, + { + "auxiliary_loss_clip": 0.01393071, + "auxiliary_loss_mlp": 0.01033633, + "balance_loss_clip": 1.23574543, + "balance_loss_mlp": 1.01398778, + "epoch": 0.9324515256275364, + "flos": 23925797164800.0, + "grad_norm": 1.721947269409349, + "language_loss": 0.73215258, + "learning_rate": 4.7631862010129033e-08, + "loss": 0.7564196, + "num_input_tokens_seen": 334648295, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.19641113, + "step": 15509, + "time_per_iteration": 2.905116319656372 + }, + { + "auxiliary_loss_clip": 0.01395568, + "auxiliary_loss_mlp": 0.01030984, + "balance_loss_clip": 1.23584318, + "balance_loss_mlp": 1.01206589, + "epoch": 0.9325116488802044, + "flos": 18014491088640.0, + "grad_norm": 1.8185875024737304, + "language_loss": 0.74653953, + "learning_rate": 4.754740768467624e-08, + "loss": 0.770805, + "num_input_tokens_seen": 334666280, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18920898, + "step": 15510, + "time_per_iteration": 2.816983461380005 + }, + { + "auxiliary_loss_clip": 0.01415647, + "auxiliary_loss_mlp": 0.0103556, + "balance_loss_clip": 1.25136852, + "balance_loss_mlp": 1.01647425, + "epoch": 0.9325717721328723, + "flos": 29033136806400.0, + "grad_norm": 2.1543788971977813, + "language_loss": 0.70921445, + "learning_rate": 4.746302739642161e-08, + "loss": 0.7337265, + "num_input_tokens_seen": 334688830, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19067383, + "step": 15511, + "time_per_iteration": 4.370168685913086 + }, + { + "auxiliary_loss_clip": 0.01398515, + "auxiliary_loss_mlp": 0.01036663, + "balance_loss_clip": 1.23927069, + "balance_loss_mlp": 1.01792336, + "epoch": 0.9326318953855404, + "flos": 21654627747840.0, + "grad_norm": 2.1571795446524322, + "language_loss": 0.78686076, + "learning_rate": 4.737872114856412e-08, + "loss": 0.81121254, + "num_input_tokens_seen": 334705205, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18725586, + "step": 15512, + "time_per_iteration": 2.8278722763061523 + }, + { + "auxiliary_loss_clip": 0.01395483, + "auxiliary_loss_mlp": 0.01030829, + "balance_loss_clip": 1.23745549, + "balance_loss_mlp": 1.01179171, + "epoch": 0.9326920186382083, + "flos": 26076352648320.0, + "grad_norm": 1.4466495654822484, + "language_loss": 0.80880153, + "learning_rate": 4.7294488944301436e-08, + "loss": 0.83306468, + "num_input_tokens_seen": 334723830, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.19042969, + "step": 15513, + "time_per_iteration": 2.852989435195923 + }, + { + "auxiliary_loss_clip": 0.01417324, + "auxiliary_loss_mlp": 0.01037697, + "balance_loss_clip": 1.25308442, + "balance_loss_mlp": 1.01629889, + "epoch": 0.9327521418908763, + "flos": 12064609180800.0, + "grad_norm": 2.1499748871037165, + "language_loss": 0.81602192, + "learning_rate": 4.721033078682768e-08, + "loss": 0.84057212, + "num_input_tokens_seen": 334740825, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.21411133, + "step": 15514, + "time_per_iteration": 2.82910418510437 + }, + { + "auxiliary_loss_clip": 0.01391015, + "auxiliary_loss_mlp": 0.01036498, + "balance_loss_clip": 1.23455834, + "balance_loss_mlp": 1.01779413, + "epoch": 0.9328122651435443, + "flos": 43849661114880.0, + "grad_norm": 2.2540383642343644, + "language_loss": 0.72434092, + "learning_rate": 4.7126246679333626e-08, + "loss": 0.74861604, + "num_input_tokens_seen": 334765825, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.18701172, + "step": 15515, + "time_per_iteration": 3.1750433444976807 + }, + { + "auxiliary_loss_clip": 0.01420662, + "auxiliary_loss_mlp": 0.01040275, + "balance_loss_clip": 1.2554121, + "balance_loss_mlp": 1.02089155, + "epoch": 0.9328723883962122, + "flos": 15203278010880.0, + "grad_norm": 2.474715698899694, + "language_loss": 0.81592816, + "learning_rate": 4.704223662500806e-08, + "loss": 0.84053755, + "num_input_tokens_seen": 334782680, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19384766, + "step": 15516, + "time_per_iteration": 2.86899733543396 + }, + { + "auxiliary_loss_clip": 0.01404159, + "auxiliary_loss_mlp": 0.01034805, + "balance_loss_clip": 1.24157965, + "balance_loss_mlp": 1.01601791, + "epoch": 0.9329325116488802, + "flos": 20270186784000.0, + "grad_norm": 1.809479818349552, + "language_loss": 0.81659985, + "learning_rate": 4.695830062703643e-08, + "loss": 0.84098947, + "num_input_tokens_seen": 334800160, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18798828, + "step": 15517, + "time_per_iteration": 2.8763108253479004 + }, + { + "auxiliary_loss_clip": 0.01406029, + "auxiliary_loss_mlp": 0.0103218, + "balance_loss_clip": 1.24486351, + "balance_loss_mlp": 1.0124507, + "epoch": 0.9329926349015482, + "flos": 13122725512320.0, + "grad_norm": 1.9559774054271344, + "language_loss": 0.75743866, + "learning_rate": 4.687443868860219e-08, + "loss": 0.78182077, + "num_input_tokens_seen": 334815840, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19702148, + "step": 15518, + "time_per_iteration": 5.769965171813965 + }, + { + "auxiliary_loss_clip": 0.01402235, + "auxiliary_loss_mlp": 0.01037044, + "balance_loss_clip": 1.24348509, + "balance_loss_mlp": 1.01762438, + "epoch": 0.9330527581542162, + "flos": 23050832359680.0, + "grad_norm": 2.074152970682481, + "language_loss": 0.76739913, + "learning_rate": 4.679065081288458e-08, + "loss": 0.79179192, + "num_input_tokens_seen": 334834735, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.19421387, + "step": 15519, + "time_per_iteration": 2.8666892051696777 + }, + { + "auxiliary_loss_clip": 0.01392346, + "auxiliary_loss_mlp": 0.01033331, + "balance_loss_clip": 1.23390603, + "balance_loss_mlp": 1.0141865, + "epoch": 0.9331128814068841, + "flos": 15567997495680.0, + "grad_norm": 1.8846013303014395, + "language_loss": 0.83949375, + "learning_rate": 4.6706937003061275e-08, + "loss": 0.86375052, + "num_input_tokens_seen": 334853490, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.19152832, + "step": 15520, + "time_per_iteration": 2.820233106613159 + }, + { + "auxiliary_loss_clip": 0.01384564, + "auxiliary_loss_mlp": 0.01030717, + "balance_loss_clip": 1.22702193, + "balance_loss_mlp": 1.01225114, + "epoch": 0.9331730046595521, + "flos": 22281550704000.0, + "grad_norm": 1.6922956985972841, + "language_loss": 0.76697099, + "learning_rate": 4.6623297262306846e-08, + "loss": 0.79112381, + "num_input_tokens_seen": 334873675, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18457031, + "step": 15521, + "time_per_iteration": 2.92683482170105 + }, + { + "auxiliary_loss_clip": 0.01407752, + "auxiliary_loss_mlp": 0.01031979, + "balance_loss_clip": 1.24924374, + "balance_loss_mlp": 1.01341844, + "epoch": 0.93323312791222, + "flos": 15785833800960.0, + "grad_norm": 1.5737427967463962, + "language_loss": 0.78213996, + "learning_rate": 4.6539731593792545e-08, + "loss": 0.80653727, + "num_input_tokens_seen": 334890970, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18554688, + "step": 15522, + "time_per_iteration": 2.85965633392334 + }, + { + "auxiliary_loss_clip": 0.01396464, + "auxiliary_loss_mlp": 0.01032294, + "balance_loss_clip": 1.23708272, + "balance_loss_mlp": 1.01304138, + "epoch": 0.933293251164888, + "flos": 22019301987840.0, + "grad_norm": 1.9525539800648035, + "language_loss": 0.63748342, + "learning_rate": 4.6456240000687373e-08, + "loss": 0.66177094, + "num_input_tokens_seen": 334906635, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.19250488, + "step": 15523, + "time_per_iteration": 2.824198007583618 + }, + { + "auxiliary_loss_clip": 0.01400613, + "auxiliary_loss_mlp": 0.01035531, + "balance_loss_clip": 1.24215794, + "balance_loss_mlp": 1.01689839, + "epoch": 0.933353374417556, + "flos": 26042527520640.0, + "grad_norm": 1.6019940144560039, + "language_loss": 0.69019413, + "learning_rate": 4.63728224861577e-08, + "loss": 0.71455556, + "num_input_tokens_seen": 334926230, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.1862793, + "step": 15524, + "time_per_iteration": 2.883239269256592 + }, + { + "auxiliary_loss_clip": 0.01403912, + "auxiliary_loss_mlp": 0.01035577, + "balance_loss_clip": 1.24254, + "balance_loss_mlp": 1.01589549, + "epoch": 0.933413497670224, + "flos": 24910924354560.0, + "grad_norm": 1.6545781087448683, + "language_loss": 0.74585158, + "learning_rate": 4.628947905336589e-08, + "loss": 0.77024651, + "num_input_tokens_seen": 334946680, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19677734, + "step": 15525, + "time_per_iteration": 2.8922815322875977 + }, + { + "auxiliary_loss_clip": 0.01388445, + "auxiliary_loss_mlp": 0.01033891, + "balance_loss_clip": 1.23200583, + "balance_loss_mlp": 1.01473391, + "epoch": 0.9334736209228919, + "flos": 23697283086720.0, + "grad_norm": 1.9667043249449998, + "language_loss": 0.84389007, + "learning_rate": 4.6206209705473175e-08, + "loss": 0.8681134, + "num_input_tokens_seen": 334964785, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.19152832, + "step": 15526, + "time_per_iteration": 2.8421835899353027 + }, + { + "auxiliary_loss_clip": 0.01402018, + "auxiliary_loss_mlp": 0.01029652, + "balance_loss_clip": 1.24012947, + "balance_loss_mlp": 1.0106504, + "epoch": 0.9335337441755599, + "flos": 15385705620480.0, + "grad_norm": 1.72922197481918, + "language_loss": 0.69753528, + "learning_rate": 4.61230144456366e-08, + "loss": 0.72185194, + "num_input_tokens_seen": 334982400, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18994141, + "step": 15527, + "time_per_iteration": 2.81846284866333 + }, + { + "auxiliary_loss_clip": 0.01404569, + "auxiliary_loss_mlp": 0.01032879, + "balance_loss_clip": 1.24148107, + "balance_loss_mlp": 1.0129354, + "epoch": 0.9335938674282279, + "flos": 16115189834880.0, + "grad_norm": 2.316801211349661, + "language_loss": 0.66278559, + "learning_rate": 4.603989327701141e-08, + "loss": 0.68716007, + "num_input_tokens_seen": 334999685, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19946289, + "step": 15528, + "time_per_iteration": 2.8125429153442383 + }, + { + "auxiliary_loss_clip": 0.01402696, + "auxiliary_loss_mlp": 0.01033901, + "balance_loss_clip": 1.24080801, + "balance_loss_mlp": 1.01446962, + "epoch": 0.9336539906808958, + "flos": 18962173566720.0, + "grad_norm": 2.022948114664363, + "language_loss": 0.7602731, + "learning_rate": 4.5956846202748867e-08, + "loss": 0.78463912, + "num_input_tokens_seen": 335019160, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19421387, + "step": 15529, + "time_per_iteration": 2.8415586948394775 + }, + { + "auxiliary_loss_clip": 0.01396939, + "auxiliary_loss_mlp": 0.01032509, + "balance_loss_clip": 1.23782504, + "balance_loss_mlp": 1.01392412, + "epoch": 0.9337141139335638, + "flos": 18116961857280.0, + "grad_norm": 3.281671061384607, + "language_loss": 0.6344955, + "learning_rate": 4.5873873225998674e-08, + "loss": 0.65878993, + "num_input_tokens_seen": 335037350, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18579102, + "step": 15530, + "time_per_iteration": 2.8474481105804443 + }, + { + "auxiliary_loss_clip": 0.01377255, + "auxiliary_loss_mlp": 0.01033073, + "balance_loss_clip": 1.22257853, + "balance_loss_mlp": 1.01401091, + "epoch": 0.9337742371862318, + "flos": 17354104963200.0, + "grad_norm": 1.908940206300086, + "language_loss": 0.72863519, + "learning_rate": 4.5790974349907194e-08, + "loss": 0.75273836, + "num_input_tokens_seen": 335056060, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.19067383, + "step": 15531, + "time_per_iteration": 2.8562252521514893 + }, + { + "auxiliary_loss_clip": 0.01397147, + "auxiliary_loss_mlp": 0.01031559, + "balance_loss_clip": 1.23799646, + "balance_loss_mlp": 1.0119009, + "epoch": 0.9338343604388998, + "flos": 29070762497280.0, + "grad_norm": 1.6091420699236476, + "language_loss": 0.71731067, + "learning_rate": 4.5708149577617925e-08, + "loss": 0.74159777, + "num_input_tokens_seen": 335075410, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.19665527, + "step": 15532, + "time_per_iteration": 2.900383472442627 + }, + { + "auxiliary_loss_clip": 0.01400032, + "auxiliary_loss_mlp": 0.01033013, + "balance_loss_clip": 1.23891091, + "balance_loss_mlp": 1.01355839, + "epoch": 0.9338944836915677, + "flos": 18670081265280.0, + "grad_norm": 1.4947229987298272, + "language_loss": 0.73796171, + "learning_rate": 4.5625398912271016e-08, + "loss": 0.76229215, + "num_input_tokens_seen": 335095190, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.19445801, + "step": 15533, + "time_per_iteration": 2.832350015640259 + }, + { + "auxiliary_loss_clip": 0.01393664, + "auxiliary_loss_mlp": 0.01033209, + "balance_loss_clip": 1.23646927, + "balance_loss_mlp": 1.01481533, + "epoch": 0.9339546069442357, + "flos": 16626385296000.0, + "grad_norm": 1.7924302237219938, + "language_loss": 0.80884051, + "learning_rate": 4.554272235700507e-08, + "loss": 0.83310926, + "num_input_tokens_seen": 335113825, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.18408203, + "step": 15534, + "time_per_iteration": 2.8324015140533447 + }, + { + "auxiliary_loss_clip": 0.01374645, + "auxiliary_loss_mlp": 0.01031761, + "balance_loss_clip": 1.22365284, + "balance_loss_mlp": 1.01290238, + "epoch": 0.9340147301969036, + "flos": 23702848197120.0, + "grad_norm": 1.7718300959999087, + "language_loss": 0.75039494, + "learning_rate": 4.546011991495513e-08, + "loss": 0.774459, + "num_input_tokens_seen": 335136425, + "router_z_loss_clip": 1.51074219, + "router_z_loss_mlp": 0.18847656, + "step": 15535, + "time_per_iteration": 2.86625599861145 + }, + { + "auxiliary_loss_clip": 0.01403515, + "auxiliary_loss_mlp": 0.01030464, + "balance_loss_clip": 1.24338746, + "balance_loss_mlp": 1.0114857, + "epoch": 0.9340748534495716, + "flos": 28665295430400.0, + "grad_norm": 1.9528418821346103, + "language_loss": 0.78338492, + "learning_rate": 4.537759158925292e-08, + "loss": 0.80772471, + "num_input_tokens_seen": 335157925, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18969727, + "step": 15536, + "time_per_iteration": 2.896688461303711 + }, + { + "auxiliary_loss_clip": 0.01398586, + "auxiliary_loss_mlp": 0.01031706, + "balance_loss_clip": 1.23802876, + "balance_loss_mlp": 1.01244175, + "epoch": 0.9341349767022396, + "flos": 24910336172160.0, + "grad_norm": 1.467517994044706, + "language_loss": 0.81143749, + "learning_rate": 4.5295137383028593e-08, + "loss": 0.83574045, + "num_input_tokens_seen": 335177840, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19262695, + "step": 15537, + "time_per_iteration": 2.9201860427856445 + }, + { + "auxiliary_loss_clip": 0.01413089, + "auxiliary_loss_mlp": 0.01032167, + "balance_loss_clip": 1.25073779, + "balance_loss_mlp": 1.0136894, + "epoch": 0.9341950999549076, + "flos": 29071441169280.0, + "grad_norm": 1.7331553638721642, + "language_loss": 0.78790957, + "learning_rate": 4.5212757299408764e-08, + "loss": 0.81236213, + "num_input_tokens_seen": 335199470, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.18469238, + "step": 15538, + "time_per_iteration": 2.913827657699585 + }, + { + "auxiliary_loss_clip": 0.01385924, + "auxiliary_loss_mlp": 0.01032112, + "balance_loss_clip": 1.22829616, + "balance_loss_mlp": 1.01326537, + "epoch": 0.9342552232075755, + "flos": 23597662740480.0, + "grad_norm": 1.6398652049345188, + "language_loss": 0.73962355, + "learning_rate": 4.513045134151672e-08, + "loss": 0.76380396, + "num_input_tokens_seen": 335218885, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18859863, + "step": 15539, + "time_per_iteration": 2.8836214542388916 + }, + { + "auxiliary_loss_clip": 0.0138744, + "auxiliary_loss_mlp": 0.01033688, + "balance_loss_clip": 1.2312969, + "balance_loss_mlp": 1.0158422, + "epoch": 0.9343153464602435, + "flos": 36735255809280.0, + "grad_norm": 1.4767676506708085, + "language_loss": 0.6545285, + "learning_rate": 4.504821951247373e-08, + "loss": 0.67873973, + "num_input_tokens_seen": 335239485, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.17858887, + "step": 15540, + "time_per_iteration": 3.0191965103149414 + }, + { + "auxiliary_loss_clip": 0.01394607, + "auxiliary_loss_mlp": 0.01030988, + "balance_loss_clip": 1.2355547, + "balance_loss_mlp": 1.01211715, + "epoch": 0.9343754697129115, + "flos": 22246594456320.0, + "grad_norm": 2.881702445961392, + "language_loss": 0.76760346, + "learning_rate": 4.496606181539864e-08, + "loss": 0.79185945, + "num_input_tokens_seen": 335258355, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18859863, + "step": 15541, + "time_per_iteration": 2.894556760787964 + }, + { + "auxiliary_loss_clip": 0.01395304, + "auxiliary_loss_mlp": 0.01031871, + "balance_loss_clip": 1.237396, + "balance_loss_mlp": 1.01278591, + "epoch": 0.9344355929655794, + "flos": 29720878053120.0, + "grad_norm": 1.9214964556906093, + "language_loss": 0.6778962, + "learning_rate": 4.4883978253406066e-08, + "loss": 0.70216799, + "num_input_tokens_seen": 335276835, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.19067383, + "step": 15542, + "time_per_iteration": 2.9487991333007812 + }, + { + "auxiliary_loss_clip": 0.01396121, + "auxiliary_loss_mlp": 0.01028792, + "balance_loss_clip": 1.23825598, + "balance_loss_mlp": 1.00900304, + "epoch": 0.9344957162182475, + "flos": 18889772607360.0, + "grad_norm": 1.7890967742813073, + "language_loss": 0.70534372, + "learning_rate": 4.480196882960907e-08, + "loss": 0.7295928, + "num_input_tokens_seen": 335296220, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.19787598, + "step": 15543, + "time_per_iteration": 4.296624422073364 + }, + { + "auxiliary_loss_clip": 0.01414063, + "auxiliary_loss_mlp": 0.01036815, + "balance_loss_clip": 1.25006652, + "balance_loss_mlp": 1.01483226, + "epoch": 0.9345558394709154, + "flos": 27429275969280.0, + "grad_norm": 1.9226675916395377, + "language_loss": 0.70552075, + "learning_rate": 4.4720033547117394e-08, + "loss": 0.73002946, + "num_input_tokens_seen": 335316335, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.21984863, + "step": 15544, + "time_per_iteration": 2.909769296646118 + }, + { + "auxiliary_loss_clip": 0.0140513, + "auxiliary_loss_mlp": 0.01029466, + "balance_loss_clip": 1.24372578, + "balance_loss_mlp": 1.01096475, + "epoch": 0.9346159627235834, + "flos": 20751176701440.0, + "grad_norm": 2.365760803643456, + "language_loss": 0.78244817, + "learning_rate": 4.463817240903789e-08, + "loss": 0.80679417, + "num_input_tokens_seen": 335335545, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18493652, + "step": 15545, + "time_per_iteration": 2.8744499683380127 + }, + { + "auxiliary_loss_clip": 0.01405817, + "auxiliary_loss_mlp": 0.01029778, + "balance_loss_clip": 1.24287701, + "balance_loss_mlp": 1.01087189, + "epoch": 0.9346760859762513, + "flos": 21079084901760.0, + "grad_norm": 1.6009421487637157, + "language_loss": 0.69753408, + "learning_rate": 4.455638541847495e-08, + "loss": 0.72189009, + "num_input_tokens_seen": 335355350, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.18920898, + "step": 15546, + "time_per_iteration": 2.880429267883301 + }, + { + "auxiliary_loss_clip": 0.0139604, + "auxiliary_loss_mlp": 0.01031814, + "balance_loss_clip": 1.24097347, + "balance_loss_mlp": 1.0135392, + "epoch": 0.9347362092289193, + "flos": 29216740780800.0, + "grad_norm": 1.7222280192593653, + "language_loss": 0.83387172, + "learning_rate": 4.447467257852966e-08, + "loss": 0.8581503, + "num_input_tokens_seen": 335375160, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.18273926, + "step": 15547, + "time_per_iteration": 4.318403244018555 + }, + { + "auxiliary_loss_clip": 0.01387244, + "auxiliary_loss_mlp": 0.01030103, + "balance_loss_clip": 1.23179746, + "balance_loss_mlp": 1.01206684, + "epoch": 0.9347963324815872, + "flos": 19436783967360.0, + "grad_norm": 1.7404132689911553, + "language_loss": 0.84262055, + "learning_rate": 4.439303389230087e-08, + "loss": 0.86679405, + "num_input_tokens_seen": 335394080, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.18029785, + "step": 15548, + "time_per_iteration": 2.807861804962158 + }, + { + "auxiliary_loss_clip": 0.01411563, + "auxiliary_loss_mlp": 0.01034714, + "balance_loss_clip": 1.24684095, + "balance_loss_mlp": 1.01492524, + "epoch": 0.9348564557342552, + "flos": 36915240199680.0, + "grad_norm": 1.7678605164075185, + "language_loss": 0.66120124, + "learning_rate": 4.4311469362884326e-08, + "loss": 0.68566394, + "num_input_tokens_seen": 335414230, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.19787598, + "step": 15549, + "time_per_iteration": 2.9554319381713867 + }, + { + "auxiliary_loss_clip": 0.01408665, + "auxiliary_loss_mlp": 0.01035771, + "balance_loss_clip": 1.24840546, + "balance_loss_mlp": 1.01572013, + "epoch": 0.9349165789869232, + "flos": 21700307013120.0, + "grad_norm": 1.7107477847247619, + "language_loss": 0.80494797, + "learning_rate": 4.4229978993372665e-08, + "loss": 0.82939231, + "num_input_tokens_seen": 335432890, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.20056152, + "step": 15550, + "time_per_iteration": 2.8479788303375244 + }, + { + "auxiliary_loss_clip": 0.01396412, + "auxiliary_loss_mlp": 0.01034784, + "balance_loss_clip": 1.23805225, + "balance_loss_mlp": 1.01544809, + "epoch": 0.9349767022395912, + "flos": 18853459015680.0, + "grad_norm": 1.7715649438077197, + "language_loss": 0.76539409, + "learning_rate": 4.4148562786856524e-08, + "loss": 0.78970599, + "num_input_tokens_seen": 335452085, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.19335938, + "step": 15551, + "time_per_iteration": 2.863986015319824 + }, + { + "auxiliary_loss_clip": 0.01389296, + "auxiliary_loss_mlp": 0.01031168, + "balance_loss_clip": 1.23419404, + "balance_loss_mlp": 1.01334655, + "epoch": 0.9350368254922591, + "flos": 24984365944320.0, + "grad_norm": 1.6626048545321366, + "language_loss": 0.73791707, + "learning_rate": 4.406722074642255e-08, + "loss": 0.76212174, + "num_input_tokens_seen": 335472130, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.17834473, + "step": 15552, + "time_per_iteration": 2.9901864528656006 + }, + { + "auxiliary_loss_clip": 0.01400187, + "auxiliary_loss_mlp": 0.01030382, + "balance_loss_clip": 1.24076581, + "balance_loss_mlp": 1.01251292, + "epoch": 0.9350969487449271, + "flos": 23079590069760.0, + "grad_norm": 1.712687664754558, + "language_loss": 0.78124905, + "learning_rate": 4.3985952875155386e-08, + "loss": 0.80555475, + "num_input_tokens_seen": 335489970, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.17883301, + "step": 15553, + "time_per_iteration": 5.580382585525513 + }, + { + "auxiliary_loss_clip": 0.01401781, + "auxiliary_loss_mlp": 0.01037307, + "balance_loss_clip": 1.24107766, + "balance_loss_mlp": 1.01741099, + "epoch": 0.9351570719975951, + "flos": 18634536835200.0, + "grad_norm": 2.4448154641405146, + "language_loss": 0.79192197, + "learning_rate": 4.390475917613723e-08, + "loss": 0.81631291, + "num_input_tokens_seen": 335509125, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19897461, + "step": 15554, + "time_per_iteration": 2.82419753074646 + }, + { + "auxiliary_loss_clip": 0.01381645, + "auxiliary_loss_mlp": 0.01030245, + "balance_loss_clip": 1.22655249, + "balance_loss_mlp": 1.01245904, + "epoch": 0.935217195250263, + "flos": 15896901081600.0, + "grad_norm": 1.8328274831235163, + "language_loss": 0.69627661, + "learning_rate": 4.382363965244695e-08, + "loss": 0.72039545, + "num_input_tokens_seen": 335525620, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.17797852, + "step": 15555, + "time_per_iteration": 2.856189250946045 + }, + { + "auxiliary_loss_clip": 0.01388933, + "auxiliary_loss_mlp": 0.0103172, + "balance_loss_clip": 1.23144579, + "balance_loss_mlp": 1.01292109, + "epoch": 0.935277318502931, + "flos": 24401267216640.0, + "grad_norm": 1.5878715864064672, + "language_loss": 0.75880569, + "learning_rate": 4.374259430715965e-08, + "loss": 0.78301227, + "num_input_tokens_seen": 335547565, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18798828, + "step": 15556, + "time_per_iteration": 2.9026272296905518 + }, + { + "auxiliary_loss_clip": 0.01395014, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.23645318, + "balance_loss_mlp": 1.01289582, + "epoch": 0.935337441755599, + "flos": 27611839313280.0, + "grad_norm": 1.4763172747621789, + "language_loss": 0.7357884, + "learning_rate": 4.366162314334953e-08, + "loss": 0.76005977, + "num_input_tokens_seen": 335570285, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.19238281, + "step": 15557, + "time_per_iteration": 2.940505266189575 + }, + { + "auxiliary_loss_clip": 0.01400401, + "auxiliary_loss_mlp": 0.01032003, + "balance_loss_clip": 1.2405858, + "balance_loss_mlp": 1.01273847, + "epoch": 0.935397565008267, + "flos": 20491914142080.0, + "grad_norm": 1.5638769152602652, + "language_loss": 0.63703585, + "learning_rate": 4.358072616408681e-08, + "loss": 0.66135979, + "num_input_tokens_seen": 335588600, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.19274902, + "step": 15558, + "time_per_iteration": 2.8757224082946777 + }, + { + "auxiliary_loss_clip": 0.01403513, + "auxiliary_loss_mlp": 0.01034998, + "balance_loss_clip": 1.244241, + "balance_loss_mlp": 1.01498246, + "epoch": 0.9354576882609349, + "flos": 23663548448640.0, + "grad_norm": 1.8951485304941278, + "language_loss": 0.74455488, + "learning_rate": 4.34999033724388e-08, + "loss": 0.76893997, + "num_input_tokens_seen": 335606235, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.20007324, + "step": 15559, + "time_per_iteration": 2.8501529693603516 + }, + { + "auxiliary_loss_clip": 0.01398426, + "auxiliary_loss_mlp": 0.01028088, + "balance_loss_clip": 1.24033451, + "balance_loss_mlp": 1.01088643, + "epoch": 0.9355178115136029, + "flos": 36698761238400.0, + "grad_norm": 1.5636549476693904, + "language_loss": 0.64529705, + "learning_rate": 4.341915477147062e-08, + "loss": 0.66956216, + "num_input_tokens_seen": 335628240, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.17199707, + "step": 15560, + "time_per_iteration": 2.9709084033966064 + }, + { + "auxiliary_loss_clip": 0.01427793, + "auxiliary_loss_mlp": 0.01037494, + "balance_loss_clip": 1.25798368, + "balance_loss_mlp": 1.01622653, + "epoch": 0.9355779347662708, + "flos": 14467278545280.0, + "grad_norm": 1.9927432104178116, + "language_loss": 0.64983219, + "learning_rate": 4.3338480364244034e-08, + "loss": 0.67448509, + "num_input_tokens_seen": 335643755, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.21264648, + "step": 15561, + "time_per_iteration": 2.800750732421875 + }, + { + "auxiliary_loss_clip": 0.01389961, + "auxiliary_loss_mlp": 0.01034088, + "balance_loss_clip": 1.23259854, + "balance_loss_mlp": 1.01452529, + "epoch": 0.9356380580189388, + "flos": 23196539174400.0, + "grad_norm": 1.8351018270596962, + "language_loss": 0.75941736, + "learning_rate": 4.325788015381859e-08, + "loss": 0.78365779, + "num_input_tokens_seen": 335665160, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.19580078, + "step": 15562, + "time_per_iteration": 2.9363183975219727 + }, + { + "auxiliary_loss_clip": 0.01179786, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.09280348, + "balance_loss_mlp": 1.01183403, + "epoch": 0.9356981812716068, + "flos": 67501056695040.0, + "grad_norm": 0.9412912110840377, + "language_loss": 0.62397099, + "learning_rate": 4.31773541432503e-08, + "loss": 0.64610279, + "num_input_tokens_seen": 335715240, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.21582031, + "step": 15563, + "time_per_iteration": 3.206998109817505 + }, + { + "auxiliary_loss_clip": 0.01385723, + "auxiliary_loss_mlp": 0.01033544, + "balance_loss_clip": 1.2302227, + "balance_loss_mlp": 1.01363564, + "epoch": 0.9357583045242748, + "flos": 24692499866880.0, + "grad_norm": 1.5438989481067655, + "language_loss": 0.78656834, + "learning_rate": 4.3096902335592714e-08, + "loss": 0.81076103, + "num_input_tokens_seen": 335734970, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.19897461, + "step": 15564, + "time_per_iteration": 2.8773157596588135 + }, + { + "auxiliary_loss_clip": 0.01407308, + "auxiliary_loss_mlp": 0.01033462, + "balance_loss_clip": 1.24400699, + "balance_loss_mlp": 1.0141623, + "epoch": 0.9358184277769427, + "flos": 19473142803840.0, + "grad_norm": 2.353952734640925, + "language_loss": 0.78959119, + "learning_rate": 4.301652473389694e-08, + "loss": 0.81399894, + "num_input_tokens_seen": 335753435, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19311523, + "step": 15565, + "time_per_iteration": 2.8308422565460205 + }, + { + "auxiliary_loss_clip": 0.01386867, + "auxiliary_loss_mlp": 0.0103021, + "balance_loss_clip": 1.23035729, + "balance_loss_mlp": 1.01266289, + "epoch": 0.9358785510296107, + "flos": 18926402912640.0, + "grad_norm": 1.8237209961151926, + "language_loss": 0.72280133, + "learning_rate": 4.2936221341210774e-08, + "loss": 0.74697208, + "num_input_tokens_seen": 335772105, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.17553711, + "step": 15566, + "time_per_iteration": 2.815019130706787 + }, + { + "auxiliary_loss_clip": 0.01399829, + "auxiliary_loss_mlp": 0.01031854, + "balance_loss_clip": 1.23797882, + "balance_loss_mlp": 1.01304305, + "epoch": 0.9359386742822787, + "flos": 23451865436160.0, + "grad_norm": 5.1628753450131795, + "language_loss": 0.6839478, + "learning_rate": 4.285599216057889e-08, + "loss": 0.70826465, + "num_input_tokens_seen": 335789125, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18823242, + "step": 15567, + "time_per_iteration": 2.8701229095458984 + }, + { + "auxiliary_loss_clip": 0.01401898, + "auxiliary_loss_mlp": 0.01030969, + "balance_loss_clip": 1.24266505, + "balance_loss_mlp": 1.01281357, + "epoch": 0.9359987975349466, + "flos": 32756804645760.0, + "grad_norm": 2.0529335103127466, + "language_loss": 0.63119531, + "learning_rate": 4.277583719504418e-08, + "loss": 0.65552402, + "num_input_tokens_seen": 335810995, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18164062, + "step": 15568, + "time_per_iteration": 2.9120895862579346 + }, + { + "auxiliary_loss_clip": 0.01390623, + "auxiliary_loss_mlp": 0.01034696, + "balance_loss_clip": 1.23237491, + "balance_loss_mlp": 1.01656461, + "epoch": 0.9360589207876147, + "flos": 22829738428800.0, + "grad_norm": 2.0309980266612238, + "language_loss": 0.79501373, + "learning_rate": 4.269575644764556e-08, + "loss": 0.81926692, + "num_input_tokens_seen": 335830580, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18139648, + "step": 15569, + "time_per_iteration": 2.837771415710449 + }, + { + "auxiliary_loss_clip": 0.01397753, + "auxiliary_loss_mlp": 0.01031871, + "balance_loss_clip": 1.23608685, + "balance_loss_mlp": 1.01260638, + "epoch": 0.9361190440402826, + "flos": 20894666520960.0, + "grad_norm": 5.150432230626565, + "language_loss": 0.70384824, + "learning_rate": 4.261574992142014e-08, + "loss": 0.72814441, + "num_input_tokens_seen": 335846515, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19262695, + "step": 15570, + "time_per_iteration": 2.849252223968506 + }, + { + "auxiliary_loss_clip": 0.01398438, + "auxiliary_loss_mlp": 0.01028546, + "balance_loss_clip": 1.23839617, + "balance_loss_mlp": 1.00974715, + "epoch": 0.9361791672929506, + "flos": 19327209765120.0, + "grad_norm": 1.6947087464207835, + "language_loss": 0.79659462, + "learning_rate": 4.2535817619401726e-08, + "loss": 0.82086438, + "num_input_tokens_seen": 335863350, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18798828, + "step": 15571, + "time_per_iteration": 2.82601261138916 + }, + { + "auxiliary_loss_clip": 0.01399652, + "auxiliary_loss_mlp": 0.01029568, + "balance_loss_clip": 1.23990917, + "balance_loss_mlp": 1.01067388, + "epoch": 0.9362392905456185, + "flos": 15166421481600.0, + "grad_norm": 2.201512508415923, + "language_loss": 0.78491724, + "learning_rate": 4.2455959544621224e-08, + "loss": 0.80920947, + "num_input_tokens_seen": 335880510, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18896484, + "step": 15572, + "time_per_iteration": 2.844223737716675 + }, + { + "auxiliary_loss_clip": 0.01381833, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.22641253, + "balance_loss_mlp": 1.01298594, + "epoch": 0.9362994137982865, + "flos": 22094734348800.0, + "grad_norm": 1.6279461400042996, + "language_loss": 0.78571594, + "learning_rate": 4.237617570010688e-08, + "loss": 0.80984658, + "num_input_tokens_seen": 335899440, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.18261719, + "step": 15573, + "time_per_iteration": 2.823782205581665 + }, + { + "auxiliary_loss_clip": 0.01385026, + "auxiliary_loss_mlp": 0.01030229, + "balance_loss_clip": 1.22994423, + "balance_loss_mlp": 1.01168036, + "epoch": 0.9363595370509544, + "flos": 23522366113920.0, + "grad_norm": 1.5516864680127629, + "language_loss": 0.74739647, + "learning_rate": 4.2296466088884044e-08, + "loss": 0.77154899, + "num_input_tokens_seen": 335919540, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.18554688, + "step": 15574, + "time_per_iteration": 2.8705625534057617 + }, + { + "auxiliary_loss_clip": 0.01393301, + "auxiliary_loss_mlp": 0.01030732, + "balance_loss_clip": 1.23709929, + "balance_loss_mlp": 1.01168263, + "epoch": 0.9364196603036224, + "flos": 27134288000640.0, + "grad_norm": 1.9655918587353216, + "language_loss": 0.69340789, + "learning_rate": 4.221683071397564e-08, + "loss": 0.71764821, + "num_input_tokens_seen": 335939665, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.19067383, + "step": 15575, + "time_per_iteration": 2.909931182861328 + }, + { + "auxiliary_loss_clip": 0.01380113, + "auxiliary_loss_mlp": 0.01032663, + "balance_loss_clip": 1.22650242, + "balance_loss_mlp": 1.01362491, + "epoch": 0.9364797835562904, + "flos": 18488332327680.0, + "grad_norm": 1.5671891680311858, + "language_loss": 0.66393751, + "learning_rate": 4.2137269578401026e-08, + "loss": 0.68806529, + "num_input_tokens_seen": 335958580, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.19042969, + "step": 15576, + "time_per_iteration": 2.8272457122802734 + }, + { + "auxiliary_loss_clip": 0.01399948, + "auxiliary_loss_mlp": 0.01029198, + "balance_loss_clip": 1.23837256, + "balance_loss_mlp": 1.01066113, + "epoch": 0.9365399068089584, + "flos": 13013241799680.0, + "grad_norm": 2.3751027552936175, + "language_loss": 0.77323395, + "learning_rate": 4.2057782685177566e-08, + "loss": 0.79752541, + "num_input_tokens_seen": 335974965, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.1854248, + "step": 15577, + "time_per_iteration": 2.814401388168335 + }, + { + "auxiliary_loss_clip": 0.01402131, + "auxiliary_loss_mlp": 0.01035713, + "balance_loss_clip": 1.24126959, + "balance_loss_mlp": 1.01632965, + "epoch": 0.9366000300616263, + "flos": 25677491322240.0, + "grad_norm": 1.9432758964432237, + "language_loss": 0.53434741, + "learning_rate": 4.1978370037318855e-08, + "loss": 0.55872583, + "num_input_tokens_seen": 335996575, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19384766, + "step": 15578, + "time_per_iteration": 4.387479543685913 + }, + { + "auxiliary_loss_clip": 0.01371328, + "auxiliary_loss_mlp": 0.01030246, + "balance_loss_clip": 1.21730804, + "balance_loss_mlp": 1.01174426, + "epoch": 0.9366601533142943, + "flos": 21443125714560.0, + "grad_norm": 1.6697283682582675, + "language_loss": 0.70965576, + "learning_rate": 4.189903163783692e-08, + "loss": 0.73367155, + "num_input_tokens_seen": 336017265, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.18493652, + "step": 15579, + "time_per_iteration": 2.866657257080078 + }, + { + "auxiliary_loss_clip": 0.01395307, + "auxiliary_loss_mlp": 0.01030011, + "balance_loss_clip": 1.23685074, + "balance_loss_mlp": 1.01206994, + "epoch": 0.9367202765669622, + "flos": 24102750153600.0, + "grad_norm": 2.142095399152322, + "language_loss": 0.77339637, + "learning_rate": 4.181976748973959e-08, + "loss": 0.79764956, + "num_input_tokens_seen": 336035905, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.17932129, + "step": 15580, + "time_per_iteration": 2.8751673698425293 + }, + { + "auxiliary_loss_clip": 0.01408916, + "auxiliary_loss_mlp": 0.0103475, + "balance_loss_clip": 1.24648321, + "balance_loss_mlp": 1.01452065, + "epoch": 0.9367803998196302, + "flos": 20899191000960.0, + "grad_norm": 1.8743423567282866, + "language_loss": 0.67383599, + "learning_rate": 4.1740577596033114e-08, + "loss": 0.69827271, + "num_input_tokens_seen": 336055585, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.20202637, + "step": 15581, + "time_per_iteration": 2.8453528881073 + }, + { + "auxiliary_loss_clip": 0.01390997, + "auxiliary_loss_mlp": 0.01029042, + "balance_loss_clip": 1.2338469, + "balance_loss_mlp": 1.01076734, + "epoch": 0.9368405230722983, + "flos": 22574412167040.0, + "grad_norm": 2.0485340247921973, + "language_loss": 0.76538444, + "learning_rate": 4.166146195972042e-08, + "loss": 0.78958488, + "num_input_tokens_seen": 336076695, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.18273926, + "step": 15582, + "time_per_iteration": 4.385905027389526 + }, + { + "auxiliary_loss_clip": 0.01394612, + "auxiliary_loss_mlp": 0.01035232, + "balance_loss_clip": 1.23648143, + "balance_loss_mlp": 1.01614618, + "epoch": 0.9369006463249662, + "flos": 18889546383360.0, + "grad_norm": 1.6218225136245776, + "language_loss": 0.74711621, + "learning_rate": 4.1582420583800905e-08, + "loss": 0.77141464, + "num_input_tokens_seen": 336094740, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.1907959, + "step": 15583, + "time_per_iteration": 2.8466501235961914 + }, + { + "auxiliary_loss_clip": 0.01411898, + "auxiliary_loss_mlp": 0.01033353, + "balance_loss_clip": 1.248559, + "balance_loss_mlp": 1.01394582, + "epoch": 0.9369607695776342, + "flos": 26443741576320.0, + "grad_norm": 2.165010940520297, + "language_loss": 0.853773, + "learning_rate": 4.1503453471272376e-08, + "loss": 0.87822551, + "num_input_tokens_seen": 336113985, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.1940918, + "step": 15584, + "time_per_iteration": 2.8633909225463867 + }, + { + "auxiliary_loss_clip": 0.01420834, + "auxiliary_loss_mlp": 0.01033865, + "balance_loss_clip": 1.25544286, + "balance_loss_mlp": 1.01402819, + "epoch": 0.9370208928303021, + "flos": 39581968072320.0, + "grad_norm": 1.5572466371479246, + "language_loss": 0.72744983, + "learning_rate": 4.1424560625129334e-08, + "loss": 0.75199687, + "num_input_tokens_seen": 336136395, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19824219, + "step": 15585, + "time_per_iteration": 3.0002529621124268 + }, + { + "auxiliary_loss_clip": 0.01383211, + "auxiliary_loss_mlp": 0.01027694, + "balance_loss_clip": 1.22786403, + "balance_loss_mlp": 1.009431, + "epoch": 0.9370810160829701, + "flos": 22972594821120.0, + "grad_norm": 1.7734803051365042, + "language_loss": 0.81087565, + "learning_rate": 4.134574204836316e-08, + "loss": 0.83498466, + "num_input_tokens_seen": 336156345, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.18249512, + "step": 15586, + "time_per_iteration": 2.8816869258880615 + }, + { + "auxiliary_loss_clip": 0.01391025, + "auxiliary_loss_mlp": 0.01034076, + "balance_loss_clip": 1.23328447, + "balance_loss_mlp": 1.01513338, + "epoch": 0.937141139335638, + "flos": 23085200424960.0, + "grad_norm": 1.6082296425172466, + "language_loss": 0.77166939, + "learning_rate": 4.126699774396258e-08, + "loss": 0.79592037, + "num_input_tokens_seen": 336176760, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.1895752, + "step": 15587, + "time_per_iteration": 2.910634756088257 + }, + { + "auxiliary_loss_clip": 0.01408804, + "auxiliary_loss_mlp": 0.01035254, + "balance_loss_clip": 1.24547136, + "balance_loss_mlp": 1.01556039, + "epoch": 0.937201262588306, + "flos": 16363774621440.0, + "grad_norm": 2.1532447252120632, + "language_loss": 0.8801465, + "learning_rate": 4.118832771491387e-08, + "loss": 0.90458715, + "num_input_tokens_seen": 336193285, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.19702148, + "step": 15588, + "time_per_iteration": 5.5915868282318115 + }, + { + "auxiliary_loss_clip": 0.01386727, + "auxiliary_loss_mlp": 0.01031052, + "balance_loss_clip": 1.2321527, + "balance_loss_mlp": 1.01284885, + "epoch": 0.937261385840974, + "flos": 20203848627840.0, + "grad_norm": 1.7799875177970732, + "language_loss": 0.79017729, + "learning_rate": 4.11097319642002e-08, + "loss": 0.81435508, + "num_input_tokens_seen": 336211425, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.18188477, + "step": 15589, + "time_per_iteration": 2.833195924758911 + }, + { + "auxiliary_loss_clip": 0.01384671, + "auxiliary_loss_mlp": 0.01035467, + "balance_loss_clip": 1.22881746, + "balance_loss_mlp": 1.01571417, + "epoch": 0.937321509093642, + "flos": 18304999822080.0, + "grad_norm": 2.122253628165616, + "language_loss": 0.78005683, + "learning_rate": 4.103121049480163e-08, + "loss": 0.80425823, + "num_input_tokens_seen": 336230205, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.1973877, + "step": 15590, + "time_per_iteration": 2.787128210067749 + }, + { + "auxiliary_loss_clip": 0.01406664, + "auxiliary_loss_mlp": 0.01032937, + "balance_loss_clip": 1.24367535, + "balance_loss_mlp": 1.01323128, + "epoch": 0.9373816323463099, + "flos": 25895599096320.0, + "grad_norm": 1.8073955535254873, + "language_loss": 0.71997482, + "learning_rate": 4.095276330969577e-08, + "loss": 0.74437082, + "num_input_tokens_seen": 336252440, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19702148, + "step": 15591, + "time_per_iteration": 2.8492865562438965 + }, + { + "auxiliary_loss_clip": 0.01416041, + "auxiliary_loss_mlp": 0.01033327, + "balance_loss_clip": 1.25156665, + "balance_loss_mlp": 1.01360965, + "epoch": 0.9374417555989779, + "flos": 27210308544000.0, + "grad_norm": 1.9018842955238968, + "language_loss": 0.54752171, + "learning_rate": 4.0874390411857804e-08, + "loss": 0.5720154, + "num_input_tokens_seen": 336273845, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19714355, + "step": 15592, + "time_per_iteration": 2.8730247020721436 + }, + { + "auxiliary_loss_clip": 0.01395289, + "auxiliary_loss_mlp": 0.01029686, + "balance_loss_clip": 1.23677504, + "balance_loss_mlp": 1.01170909, + "epoch": 0.9375018788516458, + "flos": 23631261644160.0, + "grad_norm": 1.4993081523272904, + "language_loss": 0.67606604, + "learning_rate": 4.0796091804259136e-08, + "loss": 0.70031571, + "num_input_tokens_seen": 336292790, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.1796875, + "step": 15593, + "time_per_iteration": 2.850207567214966 + }, + { + "auxiliary_loss_clip": 0.01398386, + "auxiliary_loss_mlp": 0.01031369, + "balance_loss_clip": 1.23785949, + "balance_loss_mlp": 1.01346338, + "epoch": 0.9375620021043138, + "flos": 22690139662080.0, + "grad_norm": 1.628406830187153, + "language_loss": 0.74392271, + "learning_rate": 4.0717867489868715e-08, + "loss": 0.76822031, + "num_input_tokens_seen": 336312600, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.17907715, + "step": 15594, + "time_per_iteration": 2.8662147521972656 + }, + { + "auxiliary_loss_clip": 0.01399768, + "auxiliary_loss_mlp": 0.01033602, + "balance_loss_clip": 1.24237823, + "balance_loss_mlp": 1.01434934, + "epoch": 0.9376221253569819, + "flos": 27570910752000.0, + "grad_norm": 1.5773255543802072, + "language_loss": 0.74056441, + "learning_rate": 4.063971747165351e-08, + "loss": 0.76489806, + "num_input_tokens_seen": 336332770, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.19250488, + "step": 15595, + "time_per_iteration": 2.992496967315674 + }, + { + "auxiliary_loss_clip": 0.01396744, + "auxiliary_loss_mlp": 0.01033071, + "balance_loss_clip": 1.23622572, + "balance_loss_mlp": 1.01343703, + "epoch": 0.9376822486096498, + "flos": 24139470948480.0, + "grad_norm": 1.7234356207336765, + "language_loss": 0.7669487, + "learning_rate": 4.056164175257626e-08, + "loss": 0.79124683, + "num_input_tokens_seen": 336351445, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19641113, + "step": 15596, + "time_per_iteration": 2.927608013153076 + }, + { + "auxiliary_loss_clip": 0.01398221, + "auxiliary_loss_mlp": 0.01034837, + "balance_loss_clip": 1.23861122, + "balance_loss_mlp": 1.01483417, + "epoch": 0.9377423718623178, + "flos": 22794329733120.0, + "grad_norm": 1.7765609404557516, + "language_loss": 0.79270959, + "learning_rate": 4.0483640335597926e-08, + "loss": 0.81704015, + "num_input_tokens_seen": 336368690, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.20007324, + "step": 15597, + "time_per_iteration": 2.8876144886016846 + }, + { + "auxiliary_loss_clip": 0.01410863, + "auxiliary_loss_mlp": 0.01034295, + "balance_loss_clip": 1.24665976, + "balance_loss_mlp": 1.01500654, + "epoch": 0.9378024951149857, + "flos": 19177476163200.0, + "grad_norm": 1.4583549335933, + "language_loss": 0.81330681, + "learning_rate": 4.0405713223676363e-08, + "loss": 0.83775836, + "num_input_tokens_seen": 336388165, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19287109, + "step": 15598, + "time_per_iteration": 2.9666144847869873 + }, + { + "auxiliary_loss_clip": 0.01422599, + "auxiliary_loss_mlp": 0.01030682, + "balance_loss_clip": 1.25583243, + "balance_loss_mlp": 1.01120341, + "epoch": 0.9378626183676537, + "flos": 23513860091520.0, + "grad_norm": 2.173806659457754, + "language_loss": 0.63841927, + "learning_rate": 4.0327860419766994e-08, + "loss": 0.66295207, + "num_input_tokens_seen": 336406475, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.19482422, + "step": 15599, + "time_per_iteration": 2.8800790309906006 + }, + { + "auxiliary_loss_clip": 0.01404347, + "auxiliary_loss_mlp": 0.01032589, + "balance_loss_clip": 1.24272728, + "balance_loss_mlp": 1.0138011, + "epoch": 0.9379227416203216, + "flos": 18414800248320.0, + "grad_norm": 1.9382906118775722, + "language_loss": 0.74069482, + "learning_rate": 4.0250081926821e-08, + "loss": 0.76506412, + "num_input_tokens_seen": 336424690, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18786621, + "step": 15600, + "time_per_iteration": 2.812058448791504 + }, + { + "auxiliary_loss_clip": 0.0139765, + "auxiliary_loss_mlp": 0.01032993, + "balance_loss_clip": 1.23951221, + "balance_loss_mlp": 1.01493287, + "epoch": 0.9379828648729897, + "flos": 17830841869440.0, + "grad_norm": 1.8943926470971277, + "language_loss": 0.70528078, + "learning_rate": 4.0172377747788474e-08, + "loss": 0.72958726, + "num_input_tokens_seen": 336443055, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18078613, + "step": 15601, + "time_per_iteration": 2.930206298828125 + }, + { + "auxiliary_loss_clip": 0.01183182, + "auxiliary_loss_mlp": 0.01028726, + "balance_loss_clip": 1.09479594, + "balance_loss_mlp": 1.00164175, + "epoch": 0.9380429881256576, + "flos": 68055397712640.0, + "grad_norm": 0.7435471256833117, + "language_loss": 0.58099306, + "learning_rate": 4.009474788561573e-08, + "loss": 0.60311222, + "num_input_tokens_seen": 336510190, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.27148438, + "step": 15602, + "time_per_iteration": 3.5120227336883545 + }, + { + "auxiliary_loss_clip": 0.01409143, + "auxiliary_loss_mlp": 0.01034681, + "balance_loss_clip": 1.24698389, + "balance_loss_mlp": 1.01652527, + "epoch": 0.9381031113783256, + "flos": 20786675886720.0, + "grad_norm": 2.156156286309213, + "language_loss": 0.72703141, + "learning_rate": 4.001719234324663e-08, + "loss": 0.75146961, + "num_input_tokens_seen": 336529250, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18127441, + "step": 15603, + "time_per_iteration": 2.900451183319092 + }, + { + "auxiliary_loss_clip": 0.01373327, + "auxiliary_loss_mlp": 0.01031913, + "balance_loss_clip": 1.22108054, + "balance_loss_mlp": 1.01281524, + "epoch": 0.9381632346309935, + "flos": 19034167322880.0, + "grad_norm": 1.6892087973838246, + "language_loss": 0.76429832, + "learning_rate": 3.993971112362171e-08, + "loss": 0.7883507, + "num_input_tokens_seen": 336548530, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.19091797, + "step": 15604, + "time_per_iteration": 2.845463275909424 + }, + { + "auxiliary_loss_clip": 0.01401526, + "auxiliary_loss_mlp": 0.01033597, + "balance_loss_clip": 1.24092126, + "balance_loss_mlp": 1.0139395, + "epoch": 0.9382233578836615, + "flos": 23524356885120.0, + "grad_norm": 2.2975359464392078, + "language_loss": 0.65822339, + "learning_rate": 3.9862304229679734e-08, + "loss": 0.68257463, + "num_input_tokens_seen": 336568510, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.1965332, + "step": 15605, + "time_per_iteration": 2.836155414581299 + }, + { + "auxiliary_loss_clip": 0.01416978, + "auxiliary_loss_mlp": 0.01034596, + "balance_loss_clip": 1.25161815, + "balance_loss_mlp": 1.01467609, + "epoch": 0.9382834811363294, + "flos": 43082867923200.0, + "grad_norm": 1.83811450221809, + "language_loss": 0.68095326, + "learning_rate": 3.9784971664355683e-08, + "loss": 0.70546901, + "num_input_tokens_seen": 336592020, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19934082, + "step": 15606, + "time_per_iteration": 3.0473427772521973 + }, + { + "auxiliary_loss_clip": 0.01382238, + "auxiliary_loss_mlp": 0.01032112, + "balance_loss_clip": 1.22800553, + "balance_loss_mlp": 1.0145762, + "epoch": 0.9383436043889974, + "flos": 16445088806400.0, + "grad_norm": 1.812233648348569, + "language_loss": 0.78327239, + "learning_rate": 3.970771343058166e-08, + "loss": 0.80741584, + "num_input_tokens_seen": 336610010, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.17553711, + "step": 15607, + "time_per_iteration": 2.819307327270508 + }, + { + "auxiliary_loss_clip": 0.01397044, + "auxiliary_loss_mlp": 0.01035785, + "balance_loss_clip": 1.23770046, + "balance_loss_mlp": 1.01737928, + "epoch": 0.9384037276416655, + "flos": 20750317050240.0, + "grad_norm": 2.1298856359174265, + "language_loss": 0.83714253, + "learning_rate": 3.963052953128776e-08, + "loss": 0.86147082, + "num_input_tokens_seen": 336628520, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18408203, + "step": 15608, + "time_per_iteration": 2.8339216709136963 + }, + { + "auxiliary_loss_clip": 0.0139937, + "auxiliary_loss_mlp": 0.01034702, + "balance_loss_clip": 1.24119759, + "balance_loss_mlp": 1.01587868, + "epoch": 0.9384638508943334, + "flos": 19072064482560.0, + "grad_norm": 3.800067413404078, + "language_loss": 0.69783109, + "learning_rate": 3.9553419969400536e-08, + "loss": 0.72217184, + "num_input_tokens_seen": 336647365, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18823242, + "step": 15609, + "time_per_iteration": 2.8217291831970215 + }, + { + "auxiliary_loss_clip": 0.01404482, + "auxiliary_loss_mlp": 0.01030585, + "balance_loss_clip": 1.24203777, + "balance_loss_mlp": 1.01132059, + "epoch": 0.9385239741470014, + "flos": 23415506599680.0, + "grad_norm": 1.8340449223792288, + "language_loss": 0.76562989, + "learning_rate": 3.9476384747844316e-08, + "loss": 0.78998059, + "num_input_tokens_seen": 336667165, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19274902, + "step": 15610, + "time_per_iteration": 2.8732151985168457 + }, + { + "auxiliary_loss_clip": 0.01399723, + "auxiliary_loss_mlp": 0.01031994, + "balance_loss_clip": 1.24039912, + "balance_loss_mlp": 1.01395762, + "epoch": 0.9385840973996693, + "flos": 12832804961280.0, + "grad_norm": 1.9251622983496737, + "language_loss": 0.76033556, + "learning_rate": 3.939942386953987e-08, + "loss": 0.78465271, + "num_input_tokens_seen": 336684130, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18029785, + "step": 15611, + "time_per_iteration": 2.827209949493408 + }, + { + "auxiliary_loss_clip": 0.01398508, + "auxiliary_loss_mlp": 0.01035789, + "balance_loss_clip": 1.23954165, + "balance_loss_mlp": 1.016608, + "epoch": 0.9386442206523373, + "flos": 15495415557120.0, + "grad_norm": 1.731999646637399, + "language_loss": 0.66431999, + "learning_rate": 3.9322537337405756e-08, + "loss": 0.68866295, + "num_input_tokens_seen": 336701520, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19165039, + "step": 15612, + "time_per_iteration": 2.8208634853363037 + }, + { + "auxiliary_loss_clip": 0.01386004, + "auxiliary_loss_mlp": 0.01031992, + "balance_loss_clip": 1.22979021, + "balance_loss_mlp": 1.01384878, + "epoch": 0.9387043439050052, + "flos": 21188749593600.0, + "grad_norm": 1.8240787888625594, + "language_loss": 0.58332765, + "learning_rate": 3.924572515435742e-08, + "loss": 0.60750771, + "num_input_tokens_seen": 336720675, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.18164062, + "step": 15613, + "time_per_iteration": 4.299671173095703 + }, + { + "auxiliary_loss_clip": 0.01404279, + "auxiliary_loss_mlp": 0.01034796, + "balance_loss_clip": 1.24308097, + "balance_loss_mlp": 1.01700985, + "epoch": 0.9387644671576733, + "flos": 27678584672640.0, + "grad_norm": 2.129621601567192, + "language_loss": 0.71297008, + "learning_rate": 3.916898732330764e-08, + "loss": 0.73736084, + "num_input_tokens_seen": 336741005, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.17773438, + "step": 15614, + "time_per_iteration": 2.907902956008911 + }, + { + "auxiliary_loss_clip": 0.01402033, + "auxiliary_loss_mlp": 0.01035069, + "balance_loss_clip": 1.23948169, + "balance_loss_mlp": 1.01537561, + "epoch": 0.9388245904103412, + "flos": 18843731383680.0, + "grad_norm": 1.853990515311715, + "language_loss": 0.81988335, + "learning_rate": 3.9092323847166544e-08, + "loss": 0.84425437, + "num_input_tokens_seen": 336757990, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19702148, + "step": 15615, + "time_per_iteration": 2.828937292098999 + }, + { + "auxiliary_loss_clip": 0.01382299, + "auxiliary_loss_mlp": 0.01032443, + "balance_loss_clip": 1.22568011, + "balance_loss_mlp": 1.01333416, + "epoch": 0.9388847136630092, + "flos": 25494792243840.0, + "grad_norm": 1.6098860080050859, + "language_loss": 0.72336292, + "learning_rate": 3.901573472884134e-08, + "loss": 0.74751043, + "num_input_tokens_seen": 336777705, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.19128418, + "step": 15616, + "time_per_iteration": 4.311228275299072 + }, + { + "auxiliary_loss_clip": 0.0140057, + "auxiliary_loss_mlp": 0.01034339, + "balance_loss_clip": 1.24193752, + "balance_loss_mlp": 1.01453829, + "epoch": 0.9389448369156771, + "flos": 18744472995840.0, + "grad_norm": 1.7229232777041865, + "language_loss": 0.66641665, + "learning_rate": 3.89392199712355e-08, + "loss": 0.69076568, + "num_input_tokens_seen": 336798275, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.19787598, + "step": 15617, + "time_per_iteration": 2.8747506141662598 + }, + { + "auxiliary_loss_clip": 0.01404723, + "auxiliary_loss_mlp": 0.01031603, + "balance_loss_clip": 1.24226916, + "balance_loss_mlp": 1.01285148, + "epoch": 0.9390049601683451, + "flos": 21725761852800.0, + "grad_norm": 1.964378448222992, + "language_loss": 0.74064517, + "learning_rate": 3.886277957725092e-08, + "loss": 0.76500845, + "num_input_tokens_seen": 336813835, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.1875, + "step": 15618, + "time_per_iteration": 2.839963436126709 + }, + { + "auxiliary_loss_clip": 0.01414563, + "auxiliary_loss_mlp": 0.01035324, + "balance_loss_clip": 1.24930835, + "balance_loss_mlp": 1.01642942, + "epoch": 0.939065083421013, + "flos": 19400741844480.0, + "grad_norm": 2.1919855485047135, + "language_loss": 0.70717055, + "learning_rate": 3.878641354978662e-08, + "loss": 0.73166943, + "num_input_tokens_seen": 336832210, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.18896484, + "step": 15619, + "time_per_iteration": 2.8600919246673584 + }, + { + "auxiliary_loss_clip": 0.01398979, + "auxiliary_loss_mlp": 0.01031543, + "balance_loss_clip": 1.23910499, + "balance_loss_mlp": 1.01246965, + "epoch": 0.939125206673681, + "flos": 24692364132480.0, + "grad_norm": 3.0792791732144176, + "language_loss": 0.7837162, + "learning_rate": 3.8710121891737834e-08, + "loss": 0.80802143, + "num_input_tokens_seen": 336851380, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.1907959, + "step": 15620, + "time_per_iteration": 2.8636281490325928 + }, + { + "auxiliary_loss_clip": 0.01390739, + "auxiliary_loss_mlp": 0.01030872, + "balance_loss_clip": 1.23367715, + "balance_loss_mlp": 1.01271617, + "epoch": 0.9391853299263491, + "flos": 16334564463360.0, + "grad_norm": 1.9330543528435413, + "language_loss": 0.7491163, + "learning_rate": 3.8633904605998025e-08, + "loss": 0.77333242, + "num_input_tokens_seen": 336868525, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.18151855, + "step": 15621, + "time_per_iteration": 2.926732301712036 + }, + { + "auxiliary_loss_clip": 0.01411128, + "auxiliary_loss_mlp": 0.01033094, + "balance_loss_clip": 1.24695778, + "balance_loss_mlp": 1.01398432, + "epoch": 0.939245453179017, + "flos": 11663168901120.0, + "grad_norm": 2.432880745956629, + "language_loss": 0.67719626, + "learning_rate": 3.855776169545688e-08, + "loss": 0.70163846, + "num_input_tokens_seen": 336886200, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19116211, + "step": 15622, + "time_per_iteration": 2.863950729370117 + }, + { + "auxiliary_loss_clip": 0.01389278, + "auxiliary_loss_mlp": 0.01034961, + "balance_loss_clip": 1.23245227, + "balance_loss_mlp": 1.01635277, + "epoch": 0.939305576431685, + "flos": 23159049217920.0, + "grad_norm": 1.6302037902840594, + "language_loss": 0.72275114, + "learning_rate": 3.848169316300209e-08, + "loss": 0.74699348, + "num_input_tokens_seen": 336905815, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.1862793, + "step": 15623, + "time_per_iteration": 5.823200225830078 + }, + { + "auxiliary_loss_clip": 0.01398018, + "auxiliary_loss_mlp": 0.01032469, + "balance_loss_clip": 1.23972344, + "balance_loss_mlp": 1.01315665, + "epoch": 0.9393656996843529, + "flos": 33299743973760.0, + "grad_norm": 1.75319437753113, + "language_loss": 0.73419297, + "learning_rate": 3.84056990115178e-08, + "loss": 0.75849783, + "num_input_tokens_seen": 336928460, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.19287109, + "step": 15624, + "time_per_iteration": 2.960040807723999 + }, + { + "auxiliary_loss_clip": 0.01389909, + "auxiliary_loss_mlp": 0.01031788, + "balance_loss_clip": 1.23233283, + "balance_loss_mlp": 1.01366806, + "epoch": 0.9394258229370209, + "flos": 21699447361920.0, + "grad_norm": 1.840002443587124, + "language_loss": 0.90440512, + "learning_rate": 3.832977924388614e-08, + "loss": 0.92862201, + "num_input_tokens_seen": 336948320, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18127441, + "step": 15625, + "time_per_iteration": 2.8558897972106934 + }, + { + "auxiliary_loss_clip": 0.01391311, + "auxiliary_loss_mlp": 0.01033599, + "balance_loss_clip": 1.23302913, + "balance_loss_mlp": 1.01426375, + "epoch": 0.9394859461896888, + "flos": 23883646993920.0, + "grad_norm": 1.9318991858878543, + "language_loss": 0.84593487, + "learning_rate": 3.825393386298592e-08, + "loss": 0.87018394, + "num_input_tokens_seen": 336967670, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.1932373, + "step": 15626, + "time_per_iteration": 2.8637161254882812 + }, + { + "auxiliary_loss_clip": 0.0117793, + "auxiliary_loss_mlp": 0.0101976, + "balance_loss_clip": 1.09065938, + "balance_loss_mlp": 1.0018307, + "epoch": 0.9395460694423569, + "flos": 61595360974080.0, + "grad_norm": 0.7781650295314795, + "language_loss": 0.56128883, + "learning_rate": 3.8178162871693284e-08, + "loss": 0.58326578, + "num_input_tokens_seen": 337028395, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.1796875, + "step": 15627, + "time_per_iteration": 3.3093039989471436 + }, + { + "auxiliary_loss_clip": 0.01401089, + "auxiliary_loss_mlp": 0.01032095, + "balance_loss_clip": 1.24187481, + "balance_loss_mlp": 1.01315308, + "epoch": 0.9396061926950248, + "flos": 21005417088000.0, + "grad_norm": 3.0438516745996425, + "language_loss": 0.70700079, + "learning_rate": 3.810246627288105e-08, + "loss": 0.7313326, + "num_input_tokens_seen": 337048150, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.18933105, + "step": 15628, + "time_per_iteration": 2.8497703075408936 + }, + { + "auxiliary_loss_clip": 0.01398838, + "auxiliary_loss_mlp": 0.01029978, + "balance_loss_clip": 1.24116158, + "balance_loss_mlp": 1.01186967, + "epoch": 0.9396663159476928, + "flos": 27498555037440.0, + "grad_norm": 6.44455212782039, + "language_loss": 0.75787491, + "learning_rate": 3.8026844069420025e-08, + "loss": 0.78216308, + "num_input_tokens_seen": 337069315, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18115234, + "step": 15629, + "time_per_iteration": 2.894747495651245 + }, + { + "auxiliary_loss_clip": 0.01386645, + "auxiliary_loss_mlp": 0.01030742, + "balance_loss_clip": 1.23233294, + "balance_loss_mlp": 1.01243186, + "epoch": 0.9397264392003607, + "flos": 19436919701760.0, + "grad_norm": 2.5054209357138992, + "language_loss": 0.74588817, + "learning_rate": 3.795129626417748e-08, + "loss": 0.77006209, + "num_input_tokens_seen": 337087765, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.18322754, + "step": 15630, + "time_per_iteration": 2.8149120807647705 + }, + { + "auxiliary_loss_clip": 0.01384811, + "auxiliary_loss_mlp": 0.01029645, + "balance_loss_clip": 1.23023987, + "balance_loss_mlp": 1.01178789, + "epoch": 0.9397865624530287, + "flos": 18013993395840.0, + "grad_norm": 1.8472200364025315, + "language_loss": 0.70088738, + "learning_rate": 3.787582286001845e-08, + "loss": 0.72503191, + "num_input_tokens_seen": 337106265, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.1784668, + "step": 15631, + "time_per_iteration": 2.806218147277832 + }, + { + "auxiliary_loss_clip": 0.01394212, + "auxiliary_loss_mlp": 0.01031419, + "balance_loss_clip": 1.23845875, + "balance_loss_mlp": 1.01441956, + "epoch": 0.9398466857056966, + "flos": 22574728880640.0, + "grad_norm": 1.4924255988296533, + "language_loss": 0.75239599, + "learning_rate": 3.7800423859805086e-08, + "loss": 0.77665228, + "num_input_tokens_seen": 337126090, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.17004395, + "step": 15632, + "time_per_iteration": 2.9303033351898193 + }, + { + "auxiliary_loss_clip": 0.01420706, + "auxiliary_loss_mlp": 0.01036892, + "balance_loss_clip": 1.2562387, + "balance_loss_mlp": 1.01585197, + "epoch": 0.9399068089583646, + "flos": 24546521583360.0, + "grad_norm": 1.887505414941001, + "language_loss": 0.75313091, + "learning_rate": 3.772509926639622e-08, + "loss": 0.77770692, + "num_input_tokens_seen": 337145655, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.21020508, + "step": 15633, + "time_per_iteration": 2.8765904903411865 + }, + { + "auxiliary_loss_clip": 0.01404576, + "auxiliary_loss_mlp": 0.01034304, + "balance_loss_clip": 1.24222708, + "balance_loss_mlp": 1.01364458, + "epoch": 0.9399669322110327, + "flos": 25641494444160.0, + "grad_norm": 1.7776589159109235, + "language_loss": 0.73327899, + "learning_rate": 3.764984908264823e-08, + "loss": 0.75766778, + "num_input_tokens_seen": 337164805, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.20666504, + "step": 15634, + "time_per_iteration": 2.9478559494018555 + }, + { + "auxiliary_loss_clip": 0.01408704, + "auxiliary_loss_mlp": 0.0103323, + "balance_loss_clip": 1.24496925, + "balance_loss_mlp": 1.01451433, + "epoch": 0.9400270554637006, + "flos": 17097466602240.0, + "grad_norm": 2.0833059418836615, + "language_loss": 0.69888747, + "learning_rate": 3.75746733114144e-08, + "loss": 0.72330678, + "num_input_tokens_seen": 337182280, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.18713379, + "step": 15635, + "time_per_iteration": 2.831812858581543 + }, + { + "auxiliary_loss_clip": 0.01377371, + "auxiliary_loss_mlp": 0.01033161, + "balance_loss_clip": 1.22224855, + "balance_loss_mlp": 1.01386058, + "epoch": 0.9400871787163686, + "flos": 22065433701120.0, + "grad_norm": 1.5477413525728871, + "language_loss": 0.74697316, + "learning_rate": 3.7499571955545985e-08, + "loss": 0.77107847, + "num_input_tokens_seen": 337203495, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.19287109, + "step": 15636, + "time_per_iteration": 2.8760297298431396 + }, + { + "auxiliary_loss_clip": 0.01402024, + "auxiliary_loss_mlp": 0.01034674, + "balance_loss_clip": 1.24283266, + "balance_loss_mlp": 1.01483762, + "epoch": 0.9401473019690365, + "flos": 16991557228800.0, + "grad_norm": 2.1873011835981577, + "language_loss": 0.84086919, + "learning_rate": 3.7424545017890054e-08, + "loss": 0.86523616, + "num_input_tokens_seen": 337220435, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.19836426, + "step": 15637, + "time_per_iteration": 2.8801279067993164 + }, + { + "auxiliary_loss_clip": 0.01403015, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.24274337, + "balance_loss_mlp": 1.01574516, + "epoch": 0.9402074252217045, + "flos": 19691250577920.0, + "grad_norm": 2.1664106426837018, + "language_loss": 0.69813925, + "learning_rate": 3.7349592501292325e-08, + "loss": 0.72251946, + "num_input_tokens_seen": 337238095, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.19262695, + "step": 15638, + "time_per_iteration": 2.8621532917022705 + }, + { + "auxiliary_loss_clip": 0.01382582, + "auxiliary_loss_mlp": 0.01030698, + "balance_loss_clip": 1.22807598, + "balance_loss_mlp": 1.01290011, + "epoch": 0.9402675484743724, + "flos": 24765127050240.0, + "grad_norm": 1.7432558631564368, + "language_loss": 0.8538326, + "learning_rate": 3.727471440859498e-08, + "loss": 0.87796545, + "num_input_tokens_seen": 337256645, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.17822266, + "step": 15639, + "time_per_iteration": 2.910263776779175 + }, + { + "auxiliary_loss_clip": 0.01397382, + "auxiliary_loss_mlp": 0.01032496, + "balance_loss_clip": 1.2368983, + "balance_loss_mlp": 1.01322031, + "epoch": 0.9403276717270405, + "flos": 25570088870400.0, + "grad_norm": 1.4719015132262792, + "language_loss": 0.78809094, + "learning_rate": 3.719991074263662e-08, + "loss": 0.81238967, + "num_input_tokens_seen": 337278360, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19274902, + "step": 15640, + "time_per_iteration": 3.04253888130188 + }, + { + "auxiliary_loss_clip": 0.01410496, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.24843431, + "balance_loss_mlp": 1.01556158, + "epoch": 0.9403877949797084, + "flos": 26701646791680.0, + "grad_norm": 1.5257969979657229, + "language_loss": 0.74373502, + "learning_rate": 3.7125181506254544e-08, + "loss": 0.76817763, + "num_input_tokens_seen": 337302480, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18188477, + "step": 15641, + "time_per_iteration": 2.9261746406555176 + }, + { + "auxiliary_loss_clip": 0.01410181, + "auxiliary_loss_mlp": 0.01036166, + "balance_loss_clip": 1.2450043, + "balance_loss_mlp": 1.01638961, + "epoch": 0.9404479182323764, + "flos": 15019719281280.0, + "grad_norm": 2.1759405770681957, + "language_loss": 0.83426774, + "learning_rate": 3.7050526702282256e-08, + "loss": 0.85873115, + "num_input_tokens_seen": 337316600, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.19787598, + "step": 15642, + "time_per_iteration": 2.82086181640625 + }, + { + "auxiliary_loss_clip": 0.01391201, + "auxiliary_loss_mlp": 0.0102965, + "balance_loss_clip": 1.23433769, + "balance_loss_mlp": 1.01083922, + "epoch": 0.9405080414850443, + "flos": 24984546923520.0, + "grad_norm": 2.1521182416071163, + "language_loss": 0.69332576, + "learning_rate": 3.697594633355084e-08, + "loss": 0.7175343, + "num_input_tokens_seen": 337336895, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.18798828, + "step": 15643, + "time_per_iteration": 2.9383535385131836 + }, + { + "auxiliary_loss_clip": 0.0140477, + "auxiliary_loss_mlp": 0.01037862, + "balance_loss_clip": 1.24272835, + "balance_loss_mlp": 1.0185864, + "epoch": 0.9405681647377123, + "flos": 20853104532480.0, + "grad_norm": 1.8672346765696008, + "language_loss": 0.77261263, + "learning_rate": 3.6901440402888226e-08, + "loss": 0.79703897, + "num_input_tokens_seen": 337355105, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19262695, + "step": 15644, + "time_per_iteration": 3.0102365016937256 + }, + { + "auxiliary_loss_clip": 0.01386114, + "auxiliary_loss_mlp": 0.01031273, + "balance_loss_clip": 1.23099399, + "balance_loss_mlp": 1.01296258, + "epoch": 0.9406282879903802, + "flos": 23816132472960.0, + "grad_norm": 2.267101137421202, + "language_loss": 0.68722391, + "learning_rate": 3.682700891311974e-08, + "loss": 0.71139777, + "num_input_tokens_seen": 337374905, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.18322754, + "step": 15645, + "time_per_iteration": 2.8829092979431152 + }, + { + "auxiliary_loss_clip": 0.01381856, + "auxiliary_loss_mlp": 0.01033361, + "balance_loss_clip": 1.22720623, + "balance_loss_mlp": 1.01514626, + "epoch": 0.9406884112430483, + "flos": 27687000205440.0, + "grad_norm": 1.6319333045375188, + "language_loss": 0.70790982, + "learning_rate": 3.6752651867067774e-08, + "loss": 0.73206198, + "num_input_tokens_seen": 337397130, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.18212891, + "step": 15646, + "time_per_iteration": 2.9393808841705322 + }, + { + "auxiliary_loss_clip": 0.01393666, + "auxiliary_loss_mlp": 0.01030912, + "balance_loss_clip": 1.235098, + "balance_loss_mlp": 1.01252973, + "epoch": 0.9407485344957163, + "flos": 23085200424960.0, + "grad_norm": 1.5958653959384281, + "language_loss": 0.74568939, + "learning_rate": 3.667836926755208e-08, + "loss": 0.76993513, + "num_input_tokens_seen": 337418660, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18395996, + "step": 15647, + "time_per_iteration": 4.29063868522644 + }, + { + "auxiliary_loss_clip": 0.01181409, + "auxiliary_loss_mlp": 0.01025792, + "balance_loss_clip": 1.0924902, + "balance_loss_mlp": 1.00767183, + "epoch": 0.9408086577483842, + "flos": 71045237836800.0, + "grad_norm": 0.8850170833601647, + "language_loss": 0.63559508, + "learning_rate": 3.660416111738907e-08, + "loss": 0.65766704, + "num_input_tokens_seen": 337478055, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.18164062, + "step": 15648, + "time_per_iteration": 3.433637857437134 + }, + { + "auxiliary_loss_clip": 0.01387461, + "auxiliary_loss_mlp": 0.01034374, + "balance_loss_clip": 1.23282933, + "balance_loss_mlp": 1.01633763, + "epoch": 0.9408687810010522, + "flos": 23741062070400.0, + "grad_norm": 1.8803048284562656, + "language_loss": 0.66863072, + "learning_rate": 3.653002741939337e-08, + "loss": 0.6928491, + "num_input_tokens_seen": 337499405, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.18041992, + "step": 15649, + "time_per_iteration": 2.854145050048828 + }, + { + "auxiliary_loss_clip": 0.01402228, + "auxiliary_loss_mlp": 0.01034587, + "balance_loss_clip": 1.24091148, + "balance_loss_mlp": 1.0162642, + "epoch": 0.9409289042537201, + "flos": 18378486656640.0, + "grad_norm": 3.2587737465198594, + "language_loss": 0.7856971, + "learning_rate": 3.645596817637586e-08, + "loss": 0.81006521, + "num_input_tokens_seen": 337517195, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.18322754, + "step": 15650, + "time_per_iteration": 2.8250887393951416 + }, + { + "auxiliary_loss_clip": 0.01401474, + "auxiliary_loss_mlp": 0.01033412, + "balance_loss_clip": 1.24329507, + "balance_loss_mlp": 1.01334858, + "epoch": 0.9409890275063881, + "flos": 23888850145920.0, + "grad_norm": 1.6468161856774448, + "language_loss": 0.74777329, + "learning_rate": 3.638198339114451e-08, + "loss": 0.77212209, + "num_input_tokens_seen": 337535245, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.20056152, + "step": 15651, + "time_per_iteration": 2.9264655113220215 + }, + { + "auxiliary_loss_clip": 0.0139639, + "auxiliary_loss_mlp": 0.01033021, + "balance_loss_clip": 1.23928547, + "balance_loss_mlp": 1.0136497, + "epoch": 0.941049150759056, + "flos": 16553803357440.0, + "grad_norm": 1.8292360599221005, + "language_loss": 0.73133796, + "learning_rate": 3.630807306650507e-08, + "loss": 0.75563204, + "num_input_tokens_seen": 337553040, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.19372559, + "step": 15652, + "time_per_iteration": 4.279458522796631 + }, + { + "auxiliary_loss_clip": 0.01418277, + "auxiliary_loss_mlp": 0.01037846, + "balance_loss_clip": 1.25129843, + "balance_loss_mlp": 1.01757991, + "epoch": 0.9411092740117241, + "flos": 25129122618240.0, + "grad_norm": 2.028144023178358, + "language_loss": 0.67136669, + "learning_rate": 3.6234237205260645e-08, + "loss": 0.69592798, + "num_input_tokens_seen": 337574580, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.20251465, + "step": 15653, + "time_per_iteration": 2.882615566253662 + }, + { + "auxiliary_loss_clip": 0.01414326, + "auxiliary_loss_mlp": 0.01036688, + "balance_loss_clip": 1.25192261, + "balance_loss_mlp": 1.01645803, + "epoch": 0.941169397264392, + "flos": 21152164533120.0, + "grad_norm": 1.8915747359642483, + "language_loss": 0.78373563, + "learning_rate": 3.6160475810210536e-08, + "loss": 0.80824578, + "num_input_tokens_seen": 337593010, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.20214844, + "step": 15654, + "time_per_iteration": 2.8326432704925537 + }, + { + "auxiliary_loss_clip": 0.01423362, + "auxiliary_loss_mlp": 0.01034477, + "balance_loss_clip": 1.2564683, + "balance_loss_mlp": 1.01671433, + "epoch": 0.94122952051706, + "flos": 38523173068800.0, + "grad_norm": 1.930367415843534, + "language_loss": 0.71114516, + "learning_rate": 3.6086788884152065e-08, + "loss": 0.73572356, + "num_input_tokens_seen": 337616170, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.1776123, + "step": 15655, + "time_per_iteration": 3.0381829738616943 + }, + { + "auxiliary_loss_clip": 0.01389759, + "auxiliary_loss_mlp": 0.01032913, + "balance_loss_clip": 1.23138237, + "balance_loss_mlp": 1.01308823, + "epoch": 0.9412896437697279, + "flos": 18378350922240.0, + "grad_norm": 1.888385973393771, + "language_loss": 0.73097765, + "learning_rate": 3.601317642987944e-08, + "loss": 0.75520444, + "num_input_tokens_seen": 337635215, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.19836426, + "step": 15656, + "time_per_iteration": 2.8174080848693848 + }, + { + "auxiliary_loss_clip": 0.01394756, + "auxiliary_loss_mlp": 0.0103212, + "balance_loss_clip": 1.23617065, + "balance_loss_mlp": 1.0135473, + "epoch": 0.9413497670223959, + "flos": 25895961054720.0, + "grad_norm": 2.0256985428616403, + "language_loss": 0.79247963, + "learning_rate": 3.593963845018377e-08, + "loss": 0.81674838, + "num_input_tokens_seen": 337654195, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18579102, + "step": 15657, + "time_per_iteration": 2.891327381134033 + }, + { + "auxiliary_loss_clip": 0.01395761, + "auxiliary_loss_mlp": 0.01034595, + "balance_loss_clip": 1.23573923, + "balance_loss_mlp": 1.01465154, + "epoch": 0.9414098902750638, + "flos": 16627109212800.0, + "grad_norm": 2.2185269401537573, + "language_loss": 0.85291553, + "learning_rate": 3.586617494785371e-08, + "loss": 0.87721908, + "num_input_tokens_seen": 337671810, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19934082, + "step": 15658, + "time_per_iteration": 5.752011299133301 + }, + { + "auxiliary_loss_clip": 0.01406412, + "auxiliary_loss_mlp": 0.01033599, + "balance_loss_clip": 1.24256873, + "balance_loss_mlp": 1.01352453, + "epoch": 0.9414700135277319, + "flos": 18634355856000.0, + "grad_norm": 2.1529502468778245, + "language_loss": 0.71575183, + "learning_rate": 3.5792785925675254e-08, + "loss": 0.740152, + "num_input_tokens_seen": 337689410, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.20080566, + "step": 15659, + "time_per_iteration": 2.8107247352600098 + }, + { + "auxiliary_loss_clip": 0.01397477, + "auxiliary_loss_mlp": 0.01036036, + "balance_loss_clip": 1.24000561, + "balance_loss_mlp": 1.01733255, + "epoch": 0.9415301367803999, + "flos": 26289664473600.0, + "grad_norm": 1.654388232949391, + "language_loss": 0.80294919, + "learning_rate": 3.571947138643172e-08, + "loss": 0.82728434, + "num_input_tokens_seen": 337709950, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18701172, + "step": 15660, + "time_per_iteration": 2.850080728530884 + }, + { + "auxiliary_loss_clip": 0.01373679, + "auxiliary_loss_mlp": 0.01031751, + "balance_loss_clip": 1.21888018, + "balance_loss_mlp": 1.01319027, + "epoch": 0.9415902600330678, + "flos": 23272604962560.0, + "grad_norm": 1.4585268165919931, + "language_loss": 0.68497109, + "learning_rate": 3.564623133290201e-08, + "loss": 0.70902538, + "num_input_tokens_seen": 337731320, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.18566895, + "step": 15661, + "time_per_iteration": 2.875058650970459 + }, + { + "auxiliary_loss_clip": 0.01380636, + "auxiliary_loss_mlp": 0.01032213, + "balance_loss_clip": 1.22215676, + "balance_loss_mlp": 1.01445079, + "epoch": 0.9416503832857358, + "flos": 14726948307840.0, + "grad_norm": 1.9585791643017243, + "language_loss": 0.67133683, + "learning_rate": 3.557306576786434e-08, + "loss": 0.69546533, + "num_input_tokens_seen": 337747720, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.17773438, + "step": 15662, + "time_per_iteration": 2.806626081466675 + }, + { + "auxiliary_loss_clip": 0.01178134, + "auxiliary_loss_mlp": 0.01034674, + "balance_loss_clip": 1.09101915, + "balance_loss_mlp": 1.01254833, + "epoch": 0.9417105065384037, + "flos": 70341615665280.0, + "grad_norm": 0.7610550651232125, + "language_loss": 0.59319687, + "learning_rate": 3.5499974694092935e-08, + "loss": 0.61532497, + "num_input_tokens_seen": 337806930, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.22167969, + "step": 15663, + "time_per_iteration": 3.401543378829956 + }, + { + "auxiliary_loss_clip": 0.01409041, + "auxiliary_loss_mlp": 0.01033596, + "balance_loss_clip": 1.24496055, + "balance_loss_mlp": 1.01392615, + "epoch": 0.9417706297910717, + "flos": 34071061645440.0, + "grad_norm": 2.1015670800763386, + "language_loss": 0.67819905, + "learning_rate": 3.542695811435914e-08, + "loss": 0.70262539, + "num_input_tokens_seen": 337828100, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19677734, + "step": 15664, + "time_per_iteration": 2.9625699520111084 + }, + { + "auxiliary_loss_clip": 0.01403861, + "auxiliary_loss_mlp": 0.01034231, + "balance_loss_clip": 1.24705625, + "balance_loss_mlp": 1.01549184, + "epoch": 0.9418307530437396, + "flos": 16480180788480.0, + "grad_norm": 1.989977390935584, + "language_loss": 0.74042165, + "learning_rate": 3.535401603143207e-08, + "loss": 0.76480258, + "num_input_tokens_seen": 337844805, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.18725586, + "step": 15665, + "time_per_iteration": 2.8203487396240234 + }, + { + "auxiliary_loss_clip": 0.01391301, + "auxiliary_loss_mlp": 0.01031182, + "balance_loss_clip": 1.23446298, + "balance_loss_mlp": 1.01294327, + "epoch": 0.9418908762964077, + "flos": 11260597501440.0, + "grad_norm": 1.8591854408247217, + "language_loss": 0.64290643, + "learning_rate": 3.528114844807773e-08, + "loss": 0.6671313, + "num_input_tokens_seen": 337860490, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.18225098, + "step": 15666, + "time_per_iteration": 2.851552963256836 + }, + { + "auxiliary_loss_clip": 0.01394325, + "auxiliary_loss_mlp": 0.01031583, + "balance_loss_clip": 1.2356112, + "balance_loss_mlp": 1.01237869, + "epoch": 0.9419509995490756, + "flos": 18447132297600.0, + "grad_norm": 1.8132323595045707, + "language_loss": 0.79213738, + "learning_rate": 3.520835536705902e-08, + "loss": 0.81639647, + "num_input_tokens_seen": 337878360, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.1920166, + "step": 15667, + "time_per_iteration": 2.834063768386841 + }, + { + "auxiliary_loss_clip": 0.01394402, + "auxiliary_loss_mlp": 0.01034803, + "balance_loss_clip": 1.23568225, + "balance_loss_mlp": 1.01655221, + "epoch": 0.9420111228017436, + "flos": 20746968935040.0, + "grad_norm": 2.896256304967013, + "language_loss": 0.75989211, + "learning_rate": 3.5135636791136404e-08, + "loss": 0.78418422, + "num_input_tokens_seen": 337895635, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18261719, + "step": 15668, + "time_per_iteration": 2.848926305770874 + }, + { + "auxiliary_loss_clip": 0.01397688, + "auxiliary_loss_mlp": 0.01033697, + "balance_loss_clip": 1.23690629, + "balance_loss_mlp": 1.01504076, + "epoch": 0.9420712460544115, + "flos": 21151757329920.0, + "grad_norm": 3.8388476949595995, + "language_loss": 0.6019758, + "learning_rate": 3.506299272306723e-08, + "loss": 0.62628961, + "num_input_tokens_seen": 337913940, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18664551, + "step": 15669, + "time_per_iteration": 2.824575662612915 + }, + { + "auxiliary_loss_clip": 0.01385775, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.23051012, + "balance_loss_mlp": 1.01163411, + "epoch": 0.9421313693070795, + "flos": 15860270776320.0, + "grad_norm": 1.866353376994672, + "language_loss": 0.77454704, + "learning_rate": 3.4990423165606406e-08, + "loss": 0.79870725, + "num_input_tokens_seen": 337932015, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.18615723, + "step": 15670, + "time_per_iteration": 2.8121819496154785 + }, + { + "auxiliary_loss_clip": 0.01399901, + "auxiliary_loss_mlp": 0.01029392, + "balance_loss_clip": 1.24163067, + "balance_loss_mlp": 1.01006794, + "epoch": 0.9421914925597474, + "flos": 32428624976640.0, + "grad_norm": 2.5716449882354344, + "language_loss": 0.65721297, + "learning_rate": 3.491792812150574e-08, + "loss": 0.68150592, + "num_input_tokens_seen": 337953345, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.1932373, + "step": 15671, + "time_per_iteration": 2.935765504837036 + }, + { + "auxiliary_loss_clip": 0.01404062, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.24340868, + "balance_loss_mlp": 1.01228237, + "epoch": 0.9422516158124155, + "flos": 19727835638400.0, + "grad_norm": 1.5209562211660483, + "language_loss": 0.80288988, + "learning_rate": 3.48455075935139e-08, + "loss": 0.82723659, + "num_input_tokens_seen": 337973685, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.18334961, + "step": 15672, + "time_per_iteration": 2.852769136428833 + }, + { + "auxiliary_loss_clip": 0.01410415, + "auxiliary_loss_mlp": 0.01038565, + "balance_loss_clip": 1.24599266, + "balance_loss_mlp": 1.01937294, + "epoch": 0.9423117390650835, + "flos": 16261937280000.0, + "grad_norm": 2.015830313043843, + "language_loss": 0.74565917, + "learning_rate": 3.47731615843776e-08, + "loss": 0.77014893, + "num_input_tokens_seen": 337989175, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.1920166, + "step": 15673, + "time_per_iteration": 2.8180108070373535 + }, + { + "auxiliary_loss_clip": 0.01397001, + "auxiliary_loss_mlp": 0.01032061, + "balance_loss_clip": 1.23789537, + "balance_loss_mlp": 1.01214123, + "epoch": 0.9423718623177514, + "flos": 31809348391680.0, + "grad_norm": 1.9662117483941726, + "language_loss": 0.70814025, + "learning_rate": 3.470089009683974e-08, + "loss": 0.73243093, + "num_input_tokens_seen": 338011800, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.19921875, + "step": 15674, + "time_per_iteration": 2.9343671798706055 + }, + { + "auxiliary_loss_clip": 0.01394829, + "auxiliary_loss_mlp": 0.0103075, + "balance_loss_clip": 1.23599052, + "balance_loss_mlp": 1.01267803, + "epoch": 0.9424319855704194, + "flos": 23342381723520.0, + "grad_norm": 2.276967829846908, + "language_loss": 0.81787306, + "learning_rate": 3.462869313364125e-08, + "loss": 0.84212887, + "num_input_tokens_seen": 338032120, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18078613, + "step": 15675, + "time_per_iteration": 2.8766908645629883 + }, + { + "auxiliary_loss_clip": 0.01393324, + "auxiliary_loss_mlp": 0.01030177, + "balance_loss_clip": 1.2351687, + "balance_loss_mlp": 1.01184225, + "epoch": 0.9424921088230873, + "flos": 20787490293120.0, + "grad_norm": 1.6666372517385821, + "language_loss": 0.63230336, + "learning_rate": 3.4556570697519494e-08, + "loss": 0.65653837, + "num_input_tokens_seen": 338051880, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18334961, + "step": 15676, + "time_per_iteration": 2.8637049198150635 + }, + { + "auxiliary_loss_clip": 0.0139981, + "auxiliary_loss_mlp": 0.01033782, + "balance_loss_clip": 1.23923016, + "balance_loss_mlp": 1.0140413, + "epoch": 0.9425522320757553, + "flos": 19036158094080.0, + "grad_norm": 2.051223507624855, + "language_loss": 0.67392898, + "learning_rate": 3.448452279120984e-08, + "loss": 0.6982649, + "num_input_tokens_seen": 338069665, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19726562, + "step": 15677, + "time_per_iteration": 2.8687174320220947 + }, + { + "auxiliary_loss_clip": 0.0140356, + "auxiliary_loss_mlp": 0.01033126, + "balance_loss_clip": 1.24094272, + "balance_loss_mlp": 1.01373029, + "epoch": 0.9426123553284232, + "flos": 25166205371520.0, + "grad_norm": 2.226929498074642, + "language_loss": 0.65127647, + "learning_rate": 3.441254941744387e-08, + "loss": 0.67564327, + "num_input_tokens_seen": 338090490, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.1940918, + "step": 15678, + "time_per_iteration": 2.9398910999298096 + }, + { + "auxiliary_loss_clip": 0.01393196, + "auxiliary_loss_mlp": 0.0103176, + "balance_loss_clip": 1.2354995, + "balance_loss_mlp": 1.01327085, + "epoch": 0.9426724785810913, + "flos": 21189428265600.0, + "grad_norm": 1.9097580584196483, + "language_loss": 0.75229084, + "learning_rate": 3.434065057895097e-08, + "loss": 0.77654046, + "num_input_tokens_seen": 338109825, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18481445, + "step": 15679, + "time_per_iteration": 2.8485448360443115 + }, + { + "auxiliary_loss_clip": 0.01409023, + "auxiliary_loss_mlp": 0.01036547, + "balance_loss_clip": 1.2478261, + "balance_loss_mlp": 1.0169487, + "epoch": 0.9427326018337592, + "flos": 14765071691520.0, + "grad_norm": 2.17612102971205, + "language_loss": 0.77999723, + "learning_rate": 3.426882627845762e-08, + "loss": 0.8044529, + "num_input_tokens_seen": 338125790, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19592285, + "step": 15680, + "time_per_iteration": 2.813602924346924 + }, + { + "auxiliary_loss_clip": 0.01398016, + "auxiliary_loss_mlp": 0.01032148, + "balance_loss_clip": 1.24068785, + "balance_loss_mlp": 1.0136106, + "epoch": 0.9427927250864272, + "flos": 20933559066240.0, + "grad_norm": 1.761321680522249, + "language_loss": 0.76607269, + "learning_rate": 3.419707651868742e-08, + "loss": 0.7903744, + "num_input_tokens_seen": 338145610, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18530273, + "step": 15681, + "time_per_iteration": 2.8398356437683105 + }, + { + "auxiliary_loss_clip": 0.01410226, + "auxiliary_loss_mlp": 0.01035521, + "balance_loss_clip": 1.24851203, + "balance_loss_mlp": 1.01653075, + "epoch": 0.9428528483390951, + "flos": 19761208318080.0, + "grad_norm": 2.282523016609095, + "language_loss": 0.67048436, + "learning_rate": 3.412540130236086e-08, + "loss": 0.69494182, + "num_input_tokens_seen": 338165960, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18994141, + "step": 15682, + "time_per_iteration": 4.263850688934326 + }, + { + "auxiliary_loss_clip": 0.01387845, + "auxiliary_loss_mlp": 0.01032621, + "balance_loss_clip": 1.22918463, + "balance_loss_mlp": 1.01376224, + "epoch": 0.9429129715917631, + "flos": 24545661932160.0, + "grad_norm": 1.7205219610216753, + "language_loss": 0.77031171, + "learning_rate": 3.405380063219665e-08, + "loss": 0.79451632, + "num_input_tokens_seen": 338187215, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18859863, + "step": 15683, + "time_per_iteration": 2.922215223312378 + }, + { + "auxiliary_loss_clip": 0.01409008, + "auxiliary_loss_mlp": 0.01038821, + "balance_loss_clip": 1.24776077, + "balance_loss_mlp": 1.01903224, + "epoch": 0.942973094844431, + "flos": 17966775807360.0, + "grad_norm": 3.117109281623423, + "language_loss": 0.76917231, + "learning_rate": 3.398227451090885e-08, + "loss": 0.79365057, + "num_input_tokens_seen": 338201825, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19775391, + "step": 15684, + "time_per_iteration": 2.780897855758667 + }, + { + "auxiliary_loss_clip": 0.01390507, + "auxiliary_loss_mlp": 0.01029483, + "balance_loss_clip": 1.23384154, + "balance_loss_mlp": 1.0117445, + "epoch": 0.9430332180970991, + "flos": 26148074935680.0, + "grad_norm": 1.6416565686685076, + "language_loss": 0.77855551, + "learning_rate": 3.391082294121017e-08, + "loss": 0.80275536, + "num_input_tokens_seen": 338220865, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.17736816, + "step": 15685, + "time_per_iteration": 2.887619972229004 + }, + { + "auxiliary_loss_clip": 0.01394507, + "auxiliary_loss_mlp": 0.01029965, + "balance_loss_clip": 1.23777008, + "balance_loss_mlp": 1.0118928, + "epoch": 0.943093341349767, + "flos": 23961929777280.0, + "grad_norm": 1.8303506579814757, + "language_loss": 0.76471257, + "learning_rate": 3.383944592581023e-08, + "loss": 0.78895736, + "num_input_tokens_seen": 338240160, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.18078613, + "step": 15686, + "time_per_iteration": 2.852933645248413 + }, + { + "auxiliary_loss_clip": 0.01405125, + "auxiliary_loss_mlp": 0.010321, + "balance_loss_clip": 1.24275017, + "balance_loss_mlp": 1.01222825, + "epoch": 0.943153464602435, + "flos": 17977589314560.0, + "grad_norm": 3.1028940971994756, + "language_loss": 0.81629348, + "learning_rate": 3.376814346741575e-08, + "loss": 0.8406657, + "num_input_tokens_seen": 338259305, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.1986084, + "step": 15687, + "time_per_iteration": 4.356295108795166 + }, + { + "auxiliary_loss_clip": 0.01412451, + "auxiliary_loss_mlp": 0.01035864, + "balance_loss_clip": 1.24996376, + "balance_loss_mlp": 1.01600373, + "epoch": 0.943213587855103, + "flos": 14509609695360.0, + "grad_norm": 2.0658099255524602, + "language_loss": 0.77124178, + "learning_rate": 3.369691556873011e-08, + "loss": 0.79572493, + "num_input_tokens_seen": 338274950, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19873047, + "step": 15688, + "time_per_iteration": 2.7963361740112305 + }, + { + "auxiliary_loss_clip": 0.01383864, + "auxiliary_loss_mlp": 0.01030123, + "balance_loss_clip": 1.2302587, + "balance_loss_mlp": 1.01126456, + "epoch": 0.9432737111077709, + "flos": 28998271048320.0, + "grad_norm": 1.7259325209897614, + "language_loss": 0.69119036, + "learning_rate": 3.3625762232454504e-08, + "loss": 0.71533024, + "num_input_tokens_seen": 338295585, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.18847656, + "step": 15689, + "time_per_iteration": 2.890425682067871 + }, + { + "auxiliary_loss_clip": 0.01382395, + "auxiliary_loss_mlp": 0.0103576, + "balance_loss_clip": 1.22734535, + "balance_loss_mlp": 1.0184505, + "epoch": 0.9433338343604389, + "flos": 21617137791360.0, + "grad_norm": 1.9122396029522821, + "language_loss": 0.80722857, + "learning_rate": 3.35546834612872e-08, + "loss": 0.83141017, + "num_input_tokens_seen": 338314555, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.17321777, + "step": 15690, + "time_per_iteration": 2.8511645793914795 + }, + { + "auxiliary_loss_clip": 0.01389523, + "auxiliary_loss_mlp": 0.01029696, + "balance_loss_clip": 1.23264551, + "balance_loss_mlp": 1.01205337, + "epoch": 0.9433939576131068, + "flos": 33195237189120.0, + "grad_norm": 1.8928355971207866, + "language_loss": 0.61323082, + "learning_rate": 3.348367925792317e-08, + "loss": 0.63742304, + "num_input_tokens_seen": 338336260, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.17663574, + "step": 15691, + "time_per_iteration": 2.997518301010132 + }, + { + "auxiliary_loss_clip": 0.01403486, + "auxiliary_loss_mlp": 0.01030965, + "balance_loss_clip": 1.2420373, + "balance_loss_mlp": 1.01232076, + "epoch": 0.9434540808657749, + "flos": 20496348132480.0, + "grad_norm": 1.93696047475537, + "language_loss": 0.66764754, + "learning_rate": 3.341274962505514e-08, + "loss": 0.69199204, + "num_input_tokens_seen": 338354680, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18652344, + "step": 15692, + "time_per_iteration": 2.8448970317840576 + }, + { + "auxiliary_loss_clip": 0.01396209, + "auxiliary_loss_mlp": 0.01030793, + "balance_loss_clip": 1.23590207, + "balance_loss_mlp": 1.01252997, + "epoch": 0.9435142041184428, + "flos": 21552790406400.0, + "grad_norm": 2.8440184193732154, + "language_loss": 0.75747979, + "learning_rate": 3.334189456537251e-08, + "loss": 0.78174978, + "num_input_tokens_seen": 338372490, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18261719, + "step": 15693, + "time_per_iteration": 5.736707925796509 + }, + { + "auxiliary_loss_clip": 0.01392666, + "auxiliary_loss_mlp": 0.01033226, + "balance_loss_clip": 1.2348485, + "balance_loss_mlp": 1.01452184, + "epoch": 0.9435743273711108, + "flos": 25019819884800.0, + "grad_norm": 1.609723174764035, + "language_loss": 0.73240507, + "learning_rate": 3.327111408156291e-08, + "loss": 0.75666404, + "num_input_tokens_seen": 338390870, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18701172, + "step": 15694, + "time_per_iteration": 2.8831520080566406 + }, + { + "auxiliary_loss_clip": 0.0118015, + "auxiliary_loss_mlp": 0.01024723, + "balance_loss_clip": 1.09209788, + "balance_loss_mlp": 1.00336099, + "epoch": 0.9436344506237787, + "flos": 60191618463360.0, + "grad_norm": 0.7390174750703838, + "language_loss": 0.50644445, + "learning_rate": 3.3200408176309316e-08, + "loss": 0.52849317, + "num_input_tokens_seen": 338453075, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.21386719, + "step": 15695, + "time_per_iteration": 3.4074082374572754 + }, + { + "auxiliary_loss_clip": 0.01382747, + "auxiliary_loss_mlp": 0.01036074, + "balance_loss_clip": 1.22845197, + "balance_loss_mlp": 1.01757276, + "epoch": 0.9436945738764467, + "flos": 22247544597120.0, + "grad_norm": 1.640641200779351, + "language_loss": 0.66058731, + "learning_rate": 3.312977685229335e-08, + "loss": 0.68477547, + "num_input_tokens_seen": 338471770, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.18493652, + "step": 15696, + "time_per_iteration": 2.8858556747436523 + }, + { + "auxiliary_loss_clip": 0.01412953, + "auxiliary_loss_mlp": 0.01031457, + "balance_loss_clip": 1.25249708, + "balance_loss_mlp": 1.0128479, + "epoch": 0.9437546971291146, + "flos": 25055816762880.0, + "grad_norm": 1.663483412313059, + "language_loss": 0.66924, + "learning_rate": 3.305922011219353e-08, + "loss": 0.6936841, + "num_input_tokens_seen": 338492190, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18603516, + "step": 15697, + "time_per_iteration": 2.908749580383301 + }, + { + "auxiliary_loss_clip": 0.01181108, + "auxiliary_loss_mlp": 0.01037695, + "balance_loss_clip": 1.09339428, + "balance_loss_mlp": 1.01328075, + "epoch": 0.9438148203817827, + "flos": 56819911075200.0, + "grad_norm": 3.9270824662890447, + "language_loss": 0.63358736, + "learning_rate": 3.298873795868506e-08, + "loss": 0.65577537, + "num_input_tokens_seen": 338552560, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.24414062, + "step": 15698, + "time_per_iteration": 3.2012455463409424 + }, + { + "auxiliary_loss_clip": 0.01398918, + "auxiliary_loss_mlp": 0.0103467, + "balance_loss_clip": 1.23669171, + "balance_loss_mlp": 1.01538157, + "epoch": 0.9438749436344506, + "flos": 22356621106560.0, + "grad_norm": 1.6403660359318228, + "language_loss": 0.70271236, + "learning_rate": 3.291833039444092e-08, + "loss": 0.72704822, + "num_input_tokens_seen": 338571770, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19299316, + "step": 15699, + "time_per_iteration": 2.839911699295044 + }, + { + "auxiliary_loss_clip": 0.01386789, + "auxiliary_loss_mlp": 0.01032404, + "balance_loss_clip": 1.23131239, + "balance_loss_mlp": 1.01439142, + "epoch": 0.9439350668871186, + "flos": 13378685201280.0, + "grad_norm": 2.029029581551905, + "language_loss": 0.7507298, + "learning_rate": 3.2847997422130734e-08, + "loss": 0.77492177, + "num_input_tokens_seen": 338587310, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.18041992, + "step": 15700, + "time_per_iteration": 2.88627290725708 + }, + { + "auxiliary_loss_clip": 0.01402855, + "auxiliary_loss_mlp": 0.01031748, + "balance_loss_clip": 1.24605834, + "balance_loss_mlp": 1.01327074, + "epoch": 0.9439951901397866, + "flos": 17794302053760.0, + "grad_norm": 4.222971801620476, + "language_loss": 0.71432209, + "learning_rate": 3.2777739044421495e-08, + "loss": 0.73866814, + "num_input_tokens_seen": 338606235, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.18481445, + "step": 15701, + "time_per_iteration": 2.880390167236328 + }, + { + "auxiliary_loss_clip": 0.01423092, + "auxiliary_loss_mlp": 0.01030207, + "balance_loss_clip": 1.25665998, + "balance_loss_mlp": 1.01177764, + "epoch": 0.9440553133924545, + "flos": 18888731976960.0, + "grad_norm": 2.118810548087412, + "language_loss": 0.78468156, + "learning_rate": 3.2707555263977505e-08, + "loss": 0.80921447, + "num_input_tokens_seen": 338624090, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.1842041, + "step": 15702, + "time_per_iteration": 2.8772659301757812 + }, + { + "auxiliary_loss_clip": 0.01410587, + "auxiliary_loss_mlp": 0.01030258, + "balance_loss_clip": 1.24755454, + "balance_loss_mlp": 1.01245999, + "epoch": 0.9441154366451225, + "flos": 19582355047680.0, + "grad_norm": 1.6336649476605354, + "language_loss": 0.67217714, + "learning_rate": 3.2637446083460194e-08, + "loss": 0.6965856, + "num_input_tokens_seen": 338643695, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.17797852, + "step": 15703, + "time_per_iteration": 3.078662395477295 + }, + { + "auxiliary_loss_clip": 0.0140589, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.24384809, + "balance_loss_mlp": 1.01204729, + "epoch": 0.9441755598977905, + "flos": 30305922307200.0, + "grad_norm": 2.2827924117850573, + "language_loss": 0.73942876, + "learning_rate": 3.256741150552833e-08, + "loss": 0.76380128, + "num_input_tokens_seen": 338664725, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19311523, + "step": 15704, + "time_per_iteration": 2.9371068477630615 + }, + { + "auxiliary_loss_clip": 0.013827, + "auxiliary_loss_mlp": 0.0103109, + "balance_loss_clip": 1.22580183, + "balance_loss_mlp": 1.01192093, + "epoch": 0.9442356831504585, + "flos": 20677644622080.0, + "grad_norm": 2.7830588472081166, + "language_loss": 0.75640488, + "learning_rate": 3.2497451532837336e-08, + "loss": 0.78054285, + "num_input_tokens_seen": 338683990, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.19177246, + "step": 15705, + "time_per_iteration": 2.852957010269165 + }, + { + "auxiliary_loss_clip": 0.01392859, + "auxiliary_loss_mlp": 0.0103256, + "balance_loss_clip": 1.23633361, + "balance_loss_mlp": 1.01446366, + "epoch": 0.9442958064031264, + "flos": 16115868506880.0, + "grad_norm": 1.906471639502226, + "language_loss": 0.7752434, + "learning_rate": 3.2427566168039986e-08, + "loss": 0.7994976, + "num_input_tokens_seen": 338702025, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.1809082, + "step": 15706, + "time_per_iteration": 2.8056094646453857 + }, + { + "auxiliary_loss_clip": 0.01373538, + "auxiliary_loss_mlp": 0.01028555, + "balance_loss_clip": 1.22021186, + "balance_loss_mlp": 1.00926709, + "epoch": 0.9443559296557944, + "flos": 20456686425600.0, + "grad_norm": 1.4372574297989535, + "language_loss": 0.69647396, + "learning_rate": 3.23577554137866e-08, + "loss": 0.72049487, + "num_input_tokens_seen": 338720920, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.19274902, + "step": 15707, + "time_per_iteration": 2.8777594566345215 + }, + { + "auxiliary_loss_clip": 0.01389661, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_clip": 1.2347374, + "balance_loss_mlp": 1.01333332, + "epoch": 0.9444160529084623, + "flos": 21619445276160.0, + "grad_norm": 2.055947421919738, + "language_loss": 0.70416296, + "learning_rate": 3.22880192727244e-08, + "loss": 0.72837222, + "num_input_tokens_seen": 338739590, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.17907715, + "step": 15708, + "time_per_iteration": 2.845287561416626 + }, + { + "auxiliary_loss_clip": 0.01398786, + "auxiliary_loss_mlp": 0.01031456, + "balance_loss_clip": 1.24171233, + "balance_loss_mlp": 1.01365781, + "epoch": 0.9444761761611303, + "flos": 18450842371200.0, + "grad_norm": 2.5613461554254875, + "language_loss": 0.72442973, + "learning_rate": 3.221835774749748e-08, + "loss": 0.74873209, + "num_input_tokens_seen": 338757240, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.17810059, + "step": 15709, + "time_per_iteration": 2.809617519378662 + }, + { + "auxiliary_loss_clip": 0.01385728, + "auxiliary_loss_mlp": 0.0103047, + "balance_loss_clip": 1.23043692, + "balance_loss_mlp": 1.01206422, + "epoch": 0.9445362994137982, + "flos": 20966298318720.0, + "grad_norm": 2.0200578332026757, + "language_loss": 0.85598689, + "learning_rate": 3.214877084074774e-08, + "loss": 0.88014889, + "num_input_tokens_seen": 338773750, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.18395996, + "step": 15710, + "time_per_iteration": 2.832777261734009 + }, + { + "auxiliary_loss_clip": 0.0140543, + "auxiliary_loss_mlp": 0.01035244, + "balance_loss_clip": 1.2423923, + "balance_loss_mlp": 1.01524115, + "epoch": 0.9445964226664663, + "flos": 20313015626880.0, + "grad_norm": 1.5952675174206652, + "language_loss": 0.72137749, + "learning_rate": 3.2079258555113956e-08, + "loss": 0.74578416, + "num_input_tokens_seen": 338792115, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19995117, + "step": 15711, + "time_per_iteration": 2.8389997482299805 + }, + { + "auxiliary_loss_clip": 0.01402936, + "auxiliary_loss_mlp": 0.0103249, + "balance_loss_clip": 1.24380982, + "balance_loss_mlp": 1.01341665, + "epoch": 0.9446565459191342, + "flos": 26407247005440.0, + "grad_norm": 2.714388225478444, + "language_loss": 0.70557606, + "learning_rate": 3.200982089323179e-08, + "loss": 0.7299304, + "num_input_tokens_seen": 338812480, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.1907959, + "step": 15712, + "time_per_iteration": 2.9374778270721436 + }, + { + "auxiliary_loss_clip": 0.01417587, + "auxiliary_loss_mlp": 0.01035416, + "balance_loss_clip": 1.25301051, + "balance_loss_mlp": 1.01578212, + "epoch": 0.9447166691718022, + "flos": 16553396154240.0, + "grad_norm": 2.2342025939172876, + "language_loss": 0.7187472, + "learning_rate": 3.1940457857734246e-08, + "loss": 0.74327725, + "num_input_tokens_seen": 338829105, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19628906, + "step": 15713, + "time_per_iteration": 2.8547704219818115 + }, + { + "auxiliary_loss_clip": 0.01385509, + "auxiliary_loss_mlp": 0.01036756, + "balance_loss_clip": 1.23062098, + "balance_loss_mlp": 1.01641846, + "epoch": 0.9447767924244702, + "flos": 29175450261120.0, + "grad_norm": 1.6293927671270205, + "language_loss": 0.77385557, + "learning_rate": 3.187116945125212e-08, + "loss": 0.79807818, + "num_input_tokens_seen": 338850670, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.20336914, + "step": 15714, + "time_per_iteration": 2.9928553104400635 + }, + { + "auxiliary_loss_clip": 0.01403835, + "auxiliary_loss_mlp": 0.01033025, + "balance_loss_clip": 1.24248755, + "balance_loss_mlp": 1.0146668, + "epoch": 0.9448369156771381, + "flos": 19282616375040.0, + "grad_norm": 2.2234870920061747, + "language_loss": 0.68210506, + "learning_rate": 3.1801955676412194e-08, + "loss": 0.70647365, + "num_input_tokens_seen": 338867795, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.18359375, + "step": 15715, + "time_per_iteration": 2.812911033630371 + }, + { + "auxiliary_loss_clip": 0.01402762, + "auxiliary_loss_mlp": 0.01034384, + "balance_loss_clip": 1.24192929, + "balance_loss_mlp": 1.01444006, + "epoch": 0.9448970389298061, + "flos": 23851495923840.0, + "grad_norm": 1.7595507723091628, + "language_loss": 0.7526654, + "learning_rate": 3.173281653583948e-08, + "loss": 0.77703691, + "num_input_tokens_seen": 338887205, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19946289, + "step": 15716, + "time_per_iteration": 2.8818581104278564 + }, + { + "auxiliary_loss_clip": 0.01400217, + "auxiliary_loss_mlp": 0.01027397, + "balance_loss_clip": 1.24024391, + "balance_loss_mlp": 1.0085144, + "epoch": 0.944957162182474, + "flos": 22392346515840.0, + "grad_norm": 1.7600417277335711, + "language_loss": 0.62511015, + "learning_rate": 3.166375203215565e-08, + "loss": 0.64938617, + "num_input_tokens_seen": 338906130, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.1887207, + "step": 15717, + "time_per_iteration": 4.313234567642212 + }, + { + "auxiliary_loss_clip": 0.0139746, + "auxiliary_loss_mlp": 0.01032593, + "balance_loss_clip": 1.23932445, + "balance_loss_mlp": 1.0144732, + "epoch": 0.9450172854351421, + "flos": 17392726039680.0, + "grad_norm": 1.6325025820002794, + "language_loss": 0.79917562, + "learning_rate": 3.1594762167979514e-08, + "loss": 0.82347614, + "num_input_tokens_seen": 338923045, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18139648, + "step": 15718, + "time_per_iteration": 2.8022913932800293 + }, + { + "auxiliary_loss_clip": 0.01180515, + "auxiliary_loss_mlp": 0.01022943, + "balance_loss_clip": 1.09249794, + "balance_loss_mlp": 1.00482368, + "epoch": 0.94507740868781, + "flos": 68498535715200.0, + "grad_norm": 0.7095110987787475, + "language_loss": 0.57864815, + "learning_rate": 3.152584694592719e-08, + "loss": 0.60068274, + "num_input_tokens_seen": 338987545, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.18164062, + "step": 15719, + "time_per_iteration": 3.37575101852417 + }, + { + "auxiliary_loss_clip": 0.01398945, + "auxiliary_loss_mlp": 0.01032916, + "balance_loss_clip": 1.23801208, + "balance_loss_mlp": 1.01342559, + "epoch": 0.945137531940478, + "flos": 21152436001920.0, + "grad_norm": 2.303150938332756, + "language_loss": 0.7651211, + "learning_rate": 3.145700636861193e-08, + "loss": 0.78943968, + "num_input_tokens_seen": 339007830, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.19494629, + "step": 15720, + "time_per_iteration": 2.875047206878662 + }, + { + "auxiliary_loss_clip": 0.01380539, + "auxiliary_loss_mlp": 0.01032611, + "balance_loss_clip": 1.22344351, + "balance_loss_mlp": 1.01400256, + "epoch": 0.9451976551931459, + "flos": 24544304588160.0, + "grad_norm": 2.1343738470573608, + "language_loss": 0.72979808, + "learning_rate": 3.138824043864452e-08, + "loss": 0.75392956, + "num_input_tokens_seen": 339028980, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18603516, + "step": 15721, + "time_per_iteration": 2.872805595397949 + }, + { + "auxiliary_loss_clip": 0.01400052, + "auxiliary_loss_mlp": 0.01035884, + "balance_loss_clip": 1.24019706, + "balance_loss_mlp": 1.01638091, + "epoch": 0.9452577784458139, + "flos": 23451005784960.0, + "grad_norm": 1.9823838324766136, + "language_loss": 0.86084712, + "learning_rate": 3.131954915863244e-08, + "loss": 0.88520646, + "num_input_tokens_seen": 339047950, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.19482422, + "step": 15722, + "time_per_iteration": 4.3119072914123535 + }, + { + "auxiliary_loss_clip": 0.01181207, + "auxiliary_loss_mlp": 0.01022012, + "balance_loss_clip": 1.09352493, + "balance_loss_mlp": 1.00570381, + "epoch": 0.9453179016984818, + "flos": 52047266353920.0, + "grad_norm": 0.9031642058713492, + "language_loss": 0.64490509, + "learning_rate": 3.125093253118005e-08, + "loss": 0.66693735, + "num_input_tokens_seen": 339104535, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.16308594, + "step": 15723, + "time_per_iteration": 3.242908477783203 + }, + { + "auxiliary_loss_clip": 0.01405667, + "auxiliary_loss_mlp": 0.0103385, + "balance_loss_clip": 1.24458873, + "balance_loss_mlp": 1.01478875, + "epoch": 0.9453780249511499, + "flos": 13479029464320.0, + "grad_norm": 2.048261267733471, + "language_loss": 0.73708874, + "learning_rate": 3.1182390558889715e-08, + "loss": 0.76148391, + "num_input_tokens_seen": 339122050, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.19067383, + "step": 15724, + "time_per_iteration": 2.920581579208374 + }, + { + "auxiliary_loss_clip": 0.01395383, + "auxiliary_loss_mlp": 0.01031027, + "balance_loss_clip": 1.23713136, + "balance_loss_mlp": 1.01170278, + "epoch": 0.9454381482038178, + "flos": 23268985378560.0, + "grad_norm": 2.184050879488074, + "language_loss": 0.856359, + "learning_rate": 3.111392324436024e-08, + "loss": 0.88062304, + "num_input_tokens_seen": 339138940, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.19335938, + "step": 15725, + "time_per_iteration": 2.865981340408325 + }, + { + "auxiliary_loss_clip": 0.01401324, + "auxiliary_loss_mlp": 0.01029187, + "balance_loss_clip": 1.24236071, + "balance_loss_mlp": 1.01085222, + "epoch": 0.9454982714564858, + "flos": 19505112894720.0, + "grad_norm": 1.8904404301700122, + "language_loss": 0.71599001, + "learning_rate": 3.104553059018822e-08, + "loss": 0.74029511, + "num_input_tokens_seen": 339158245, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18322754, + "step": 15726, + "time_per_iteration": 2.859386920928955 + }, + { + "auxiliary_loss_clip": 0.01401521, + "auxiliary_loss_mlp": 0.01035503, + "balance_loss_clip": 1.24231267, + "balance_loss_mlp": 1.01603603, + "epoch": 0.9455583947091538, + "flos": 23268487685760.0, + "grad_norm": 1.808752986305152, + "language_loss": 0.61536562, + "learning_rate": 3.097721259896735e-08, + "loss": 0.63973582, + "num_input_tokens_seen": 339178200, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.19470215, + "step": 15727, + "time_per_iteration": 2.880540370941162 + }, + { + "auxiliary_loss_clip": 0.01387511, + "auxiliary_loss_mlp": 0.01031734, + "balance_loss_clip": 1.23147154, + "balance_loss_mlp": 1.01261246, + "epoch": 0.9456185179618217, + "flos": 17681244001920.0, + "grad_norm": 1.6941768114150166, + "language_loss": 0.82491529, + "learning_rate": 3.0908969273287566e-08, + "loss": 0.84910774, + "num_input_tokens_seen": 339193950, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.19104004, + "step": 15728, + "time_per_iteration": 4.22817587852478 + }, + { + "auxiliary_loss_clip": 0.01180012, + "auxiliary_loss_mlp": 0.01028561, + "balance_loss_clip": 1.09334874, + "balance_loss_mlp": 1.0119673, + "epoch": 0.9456786412144897, + "flos": 61442912684160.0, + "grad_norm": 0.737347826883757, + "language_loss": 0.59101403, + "learning_rate": 3.08408006157368e-08, + "loss": 0.61309969, + "num_input_tokens_seen": 339252330, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.16601562, + "step": 15729, + "time_per_iteration": 4.684915065765381 + }, + { + "auxiliary_loss_clip": 0.01382959, + "auxiliary_loss_mlp": 0.01031731, + "balance_loss_clip": 1.22584105, + "balance_loss_mlp": 1.01210868, + "epoch": 0.9457387644671577, + "flos": 18597635061120.0, + "grad_norm": 1.9449291941082627, + "language_loss": 0.77846646, + "learning_rate": 3.077270662890052e-08, + "loss": 0.80261338, + "num_input_tokens_seen": 339270325, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.19616699, + "step": 15730, + "time_per_iteration": 2.836890459060669 + }, + { + "auxiliary_loss_clip": 0.01402468, + "auxiliary_loss_mlp": 0.01034298, + "balance_loss_clip": 1.24227846, + "balance_loss_mlp": 1.01498652, + "epoch": 0.9457988877198257, + "flos": 21119153811840.0, + "grad_norm": 1.3770720710316837, + "language_loss": 0.63489658, + "learning_rate": 3.070468731536047e-08, + "loss": 0.65926421, + "num_input_tokens_seen": 339291980, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.19311523, + "step": 15731, + "time_per_iteration": 2.9405975341796875 + }, + { + "auxiliary_loss_clip": 0.01395459, + "auxiliary_loss_mlp": 0.01031565, + "balance_loss_clip": 1.23498976, + "balance_loss_mlp": 1.01123953, + "epoch": 0.9458590109724936, + "flos": 26699791754880.0, + "grad_norm": 3.0028800225445256, + "language_loss": 0.64643884, + "learning_rate": 3.063674267769589e-08, + "loss": 0.67070913, + "num_input_tokens_seen": 339311795, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.20336914, + "step": 15732, + "time_per_iteration": 2.925813674926758 + }, + { + "auxiliary_loss_clip": 0.01413664, + "auxiliary_loss_mlp": 0.01030186, + "balance_loss_clip": 1.2491678, + "balance_loss_mlp": 1.01117253, + "epoch": 0.9459191342251616, + "flos": 18670940916480.0, + "grad_norm": 1.9657147377884634, + "language_loss": 0.8516835, + "learning_rate": 3.056887271848363e-08, + "loss": 0.876122, + "num_input_tokens_seen": 339327745, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.19030762, + "step": 15733, + "time_per_iteration": 2.8026206493377686 + }, + { + "auxiliary_loss_clip": 0.01386864, + "auxiliary_loss_mlp": 0.01029783, + "balance_loss_clip": 1.23283935, + "balance_loss_mlp": 1.01212835, + "epoch": 0.9459792574778295, + "flos": 23407407780480.0, + "grad_norm": 1.4485004388529126, + "language_loss": 0.7290619, + "learning_rate": 3.0501077440297173e-08, + "loss": 0.75322831, + "num_input_tokens_seen": 339346445, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.17663574, + "step": 15734, + "time_per_iteration": 2.877760171890259 + }, + { + "auxiliary_loss_clip": 0.01379308, + "auxiliary_loss_mlp": 0.01028002, + "balance_loss_clip": 1.2245084, + "balance_loss_mlp": 1.01037097, + "epoch": 0.9460393807304975, + "flos": 24404615331840.0, + "grad_norm": 13.520151814134735, + "language_loss": 0.87095284, + "learning_rate": 3.043335684570692e-08, + "loss": 0.89502597, + "num_input_tokens_seen": 339367945, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.1763916, + "step": 15735, + "time_per_iteration": 2.914188861846924 + }, + { + "auxiliary_loss_clip": 0.01403861, + "auxiliary_loss_mlp": 0.01031593, + "balance_loss_clip": 1.24413681, + "balance_loss_mlp": 1.01354504, + "epoch": 0.9460995039831654, + "flos": 21948077393280.0, + "grad_norm": 1.9999020475359695, + "language_loss": 0.68061072, + "learning_rate": 3.036571093728102e-08, + "loss": 0.70496523, + "num_input_tokens_seen": 339386060, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18054199, + "step": 15736, + "time_per_iteration": 2.855769157409668 + }, + { + "auxiliary_loss_clip": 0.0117594, + "auxiliary_loss_mlp": 0.01017784, + "balance_loss_clip": 1.09002066, + "balance_loss_mlp": 0.99918747, + "epoch": 0.9461596272358335, + "flos": 70353922250880.0, + "grad_norm": 0.8693790193733918, + "language_loss": 0.65304625, + "learning_rate": 3.029813971758499e-08, + "loss": 0.6749835, + "num_input_tokens_seen": 339446695, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.18554688, + "step": 15737, + "time_per_iteration": 3.348862648010254 + }, + { + "auxiliary_loss_clip": 0.01174321, + "auxiliary_loss_mlp": 0.0103061, + "balance_loss_clip": 1.08957088, + "balance_loss_mlp": 1.00657701, + "epoch": 0.9462197504885014, + "flos": 58624116462720.0, + "grad_norm": 0.8099701742835257, + "language_loss": 0.5894472, + "learning_rate": 3.0230643189181225e-08, + "loss": 0.61149651, + "num_input_tokens_seen": 339510080, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.24023438, + "step": 15738, + "time_per_iteration": 3.302438259124756 + }, + { + "auxiliary_loss_clip": 0.01377995, + "auxiliary_loss_mlp": 0.01032624, + "balance_loss_clip": 1.22340894, + "balance_loss_mlp": 1.01358652, + "epoch": 0.9462798737411694, + "flos": 23442409272960.0, + "grad_norm": 1.878194724246135, + "language_loss": 0.71959782, + "learning_rate": 3.016322135462834e-08, + "loss": 0.74370402, + "num_input_tokens_seen": 339529335, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.19055176, + "step": 15739, + "time_per_iteration": 2.85922908782959 + }, + { + "auxiliary_loss_clip": 0.01406721, + "auxiliary_loss_mlp": 0.01032838, + "balance_loss_clip": 1.24559093, + "balance_loss_mlp": 1.01383638, + "epoch": 0.9463399969938374, + "flos": 25056947882880.0, + "grad_norm": 2.5244123235446714, + "language_loss": 0.64885396, + "learning_rate": 3.009587421648363e-08, + "loss": 0.67324954, + "num_input_tokens_seen": 339548820, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19006348, + "step": 15740, + "time_per_iteration": 2.8383712768554688 + }, + { + "auxiliary_loss_clip": 0.01385785, + "auxiliary_loss_mlp": 0.0103498, + "balance_loss_clip": 1.23039365, + "balance_loss_mlp": 1.01534581, + "epoch": 0.9464001202465053, + "flos": 24363234322560.0, + "grad_norm": 1.723577229975546, + "language_loss": 0.67379045, + "learning_rate": 3.0028601777301045e-08, + "loss": 0.69799805, + "num_input_tokens_seen": 339566775, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.19641113, + "step": 15741, + "time_per_iteration": 2.865550994873047 + }, + { + "auxiliary_loss_clip": 0.0140169, + "auxiliary_loss_mlp": 0.01033947, + "balance_loss_clip": 1.24207783, + "balance_loss_mlp": 1.01507664, + "epoch": 0.9464602434991733, + "flos": 17174346796800.0, + "grad_norm": 2.364179052279632, + "language_loss": 0.76452076, + "learning_rate": 2.9961404039630987e-08, + "loss": 0.78887713, + "num_input_tokens_seen": 339581905, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.1887207, + "step": 15742, + "time_per_iteration": 2.8343098163604736 + }, + { + "auxiliary_loss_clip": 0.01385571, + "auxiliary_loss_mlp": 0.01030134, + "balance_loss_clip": 1.22939515, + "balance_loss_mlp": 1.01172769, + "epoch": 0.9465203667518413, + "flos": 19947662714880.0, + "grad_norm": 2.6438642321854475, + "language_loss": 0.73206103, + "learning_rate": 2.989428100602187e-08, + "loss": 0.75621808, + "num_input_tokens_seen": 339599870, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.1842041, + "step": 15743, + "time_per_iteration": 2.946211576461792 + }, + { + "auxiliary_loss_clip": 0.01397013, + "auxiliary_loss_mlp": 0.01031069, + "balance_loss_clip": 1.23563015, + "balance_loss_mlp": 1.01228166, + "epoch": 0.9465804900045093, + "flos": 20129864100480.0, + "grad_norm": 2.136463010203987, + "language_loss": 0.80655074, + "learning_rate": 2.982723267901943e-08, + "loss": 0.83083159, + "num_input_tokens_seen": 339620250, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18786621, + "step": 15744, + "time_per_iteration": 2.8593456745147705 + }, + { + "auxiliary_loss_clip": 0.01400515, + "auxiliary_loss_mlp": 0.01034653, + "balance_loss_clip": 1.23910093, + "balance_loss_mlp": 1.01517415, + "epoch": 0.9466406132571772, + "flos": 23921317929600.0, + "grad_norm": 1.6946146879387574, + "language_loss": 0.78808087, + "learning_rate": 2.9760259061165417e-08, + "loss": 0.81243253, + "num_input_tokens_seen": 339639900, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19470215, + "step": 15745, + "time_per_iteration": 2.8633596897125244 + }, + { + "auxiliary_loss_clip": 0.01406032, + "auxiliary_loss_mlp": 0.01031445, + "balance_loss_clip": 1.24312449, + "balance_loss_mlp": 1.0123837, + "epoch": 0.9467007365098452, + "flos": 19941509422080.0, + "grad_norm": 2.3672414154130217, + "language_loss": 0.70904249, + "learning_rate": 2.9693360155000014e-08, + "loss": 0.73341727, + "num_input_tokens_seen": 339658970, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19055176, + "step": 15746, + "time_per_iteration": 2.8703107833862305 + }, + { + "auxiliary_loss_clip": 0.0139636, + "auxiliary_loss_mlp": 0.01032475, + "balance_loss_clip": 1.23763919, + "balance_loss_mlp": 1.0135448, + "epoch": 0.9467608597625131, + "flos": 19317979825920.0, + "grad_norm": 1.8504150373339785, + "language_loss": 0.57108778, + "learning_rate": 2.962653596305964e-08, + "loss": 0.59537613, + "num_input_tokens_seen": 339675600, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18933105, + "step": 15747, + "time_per_iteration": 2.7869908809661865 + }, + { + "auxiliary_loss_clip": 0.01177906, + "auxiliary_loss_mlp": 0.01038202, + "balance_loss_clip": 1.09203506, + "balance_loss_mlp": 1.015028, + "epoch": 0.9468209830151811, + "flos": 69661792258560.0, + "grad_norm": 0.6700392142477466, + "language_loss": 0.53288186, + "learning_rate": 2.955978648787871e-08, + "loss": 0.55504298, + "num_input_tokens_seen": 339744505, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.23144531, + "step": 15748, + "time_per_iteration": 3.5311431884765625 + }, + { + "auxiliary_loss_clip": 0.01401776, + "auxiliary_loss_mlp": 0.01035479, + "balance_loss_clip": 1.24107289, + "balance_loss_mlp": 1.01592851, + "epoch": 0.946881106267849, + "flos": 27028107158400.0, + "grad_norm": 1.611726546872284, + "language_loss": 0.67064095, + "learning_rate": 2.9493111731988096e-08, + "loss": 0.69501346, + "num_input_tokens_seen": 339765810, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19555664, + "step": 15749, + "time_per_iteration": 2.90354061126709 + }, + { + "auxiliary_loss_clip": 0.01395499, + "auxiliary_loss_mlp": 0.01034033, + "balance_loss_clip": 1.23526788, + "balance_loss_mlp": 1.01463723, + "epoch": 0.9469412295205171, + "flos": 20198826455040.0, + "grad_norm": 1.9729554667103444, + "language_loss": 0.77180886, + "learning_rate": 2.942651169791621e-08, + "loss": 0.79610419, + "num_input_tokens_seen": 339784125, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.1940918, + "step": 15750, + "time_per_iteration": 2.8536465167999268 + }, + { + "auxiliary_loss_clip": 0.01393544, + "auxiliary_loss_mlp": 0.01030168, + "balance_loss_clip": 1.23606682, + "balance_loss_mlp": 1.01068926, + "epoch": 0.947001352773185, + "flos": 21334908856320.0, + "grad_norm": 1.6377370319713205, + "language_loss": 0.68525589, + "learning_rate": 2.9359986388188372e-08, + "loss": 0.70949298, + "num_input_tokens_seen": 339803450, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.19470215, + "step": 15751, + "time_per_iteration": 2.884308099746704 + }, + { + "auxiliary_loss_clip": 0.01404431, + "auxiliary_loss_mlp": 0.01033284, + "balance_loss_clip": 1.2430582, + "balance_loss_mlp": 1.01418686, + "epoch": 0.947061476025853, + "flos": 21953913972480.0, + "grad_norm": 2.581568927232058, + "language_loss": 0.66142124, + "learning_rate": 2.929353580532723e-08, + "loss": 0.68579841, + "num_input_tokens_seen": 339823215, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19091797, + "step": 15752, + "time_per_iteration": 2.8411879539489746 + }, + { + "auxiliary_loss_clip": 0.01393, + "auxiliary_loss_mlp": 0.01030488, + "balance_loss_clip": 1.23468065, + "balance_loss_mlp": 1.01125932, + "epoch": 0.947121599278521, + "flos": 21403916455680.0, + "grad_norm": 1.8541845574484335, + "language_loss": 0.72094238, + "learning_rate": 2.9227159951852764e-08, + "loss": 0.74517727, + "num_input_tokens_seen": 339842230, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.19213867, + "step": 15753, + "time_per_iteration": 4.2640626430511475 + }, + { + "auxiliary_loss_clip": 0.01411461, + "auxiliary_loss_mlp": 0.0103775, + "balance_loss_clip": 1.24797535, + "balance_loss_mlp": 1.01774645, + "epoch": 0.9471817225311889, + "flos": 23086060076160.0, + "grad_norm": 1.8216298975229803, + "language_loss": 0.71347249, + "learning_rate": 2.9160858830281855e-08, + "loss": 0.73796457, + "num_input_tokens_seen": 339861640, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.20007324, + "step": 15754, + "time_per_iteration": 2.8627097606658936 + }, + { + "auxiliary_loss_clip": 0.01395821, + "auxiliary_loss_mlp": 0.01030217, + "balance_loss_clip": 1.23439288, + "balance_loss_mlp": 1.01256227, + "epoch": 0.947241845783857, + "flos": 11918087959680.0, + "grad_norm": 2.2229130899783724, + "language_loss": 0.79637361, + "learning_rate": 2.9094632443129153e-08, + "loss": 0.82063401, + "num_input_tokens_seen": 339878210, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.17651367, + "step": 15755, + "time_per_iteration": 2.803941011428833 + }, + { + "auxiliary_loss_clip": 0.01415313, + "auxiliary_loss_mlp": 0.01035332, + "balance_loss_clip": 1.24870276, + "balance_loss_mlp": 1.01619935, + "epoch": 0.9473019690365249, + "flos": 20750407539840.0, + "grad_norm": 2.223633994387574, + "language_loss": 0.76015782, + "learning_rate": 2.9028480792904876e-08, + "loss": 0.78466427, + "num_input_tokens_seen": 339894255, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.19128418, + "step": 15756, + "time_per_iteration": 4.248815298080444 + }, + { + "auxiliary_loss_clip": 0.01397444, + "auxiliary_loss_mlp": 0.01030026, + "balance_loss_clip": 1.23779464, + "balance_loss_mlp": 1.011608, + "epoch": 0.9473620922891929, + "flos": 17648957197440.0, + "grad_norm": 2.391702943088744, + "language_loss": 0.75972533, + "learning_rate": 2.8962403882118347e-08, + "loss": 0.78399992, + "num_input_tokens_seen": 339912425, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18408203, + "step": 15757, + "time_per_iteration": 2.851130247116089 + }, + { + "auxiliary_loss_clip": 0.01398249, + "auxiliary_loss_mlp": 0.01031146, + "balance_loss_clip": 1.23666596, + "balance_loss_mlp": 1.01194167, + "epoch": 0.9474222155418608, + "flos": 23560444252800.0, + "grad_norm": 2.4404414147231517, + "language_loss": 0.80579758, + "learning_rate": 2.889640171327512e-08, + "loss": 0.8300916, + "num_input_tokens_seen": 339929635, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.1920166, + "step": 15758, + "time_per_iteration": 2.822061061859131 + }, + { + "auxiliary_loss_clip": 0.01384485, + "auxiliary_loss_mlp": 0.01034624, + "balance_loss_clip": 1.22729111, + "balance_loss_mlp": 1.01504946, + "epoch": 0.9474823387945288, + "flos": 27101277279360.0, + "grad_norm": 1.5281963497547446, + "language_loss": 0.7247898, + "learning_rate": 2.8830474288877638e-08, + "loss": 0.74898088, + "num_input_tokens_seen": 339951200, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.19592285, + "step": 15759, + "time_per_iteration": 2.9126381874084473 + }, + { + "auxiliary_loss_clip": 0.0138684, + "auxiliary_loss_mlp": 0.01029169, + "balance_loss_clip": 1.23359597, + "balance_loss_mlp": 1.01098967, + "epoch": 0.9475424620471967, + "flos": 22976893077120.0, + "grad_norm": 1.5901159313326272, + "language_loss": 0.76475561, + "learning_rate": 2.8764621611426344e-08, + "loss": 0.78891563, + "num_input_tokens_seen": 339971820, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.1817627, + "step": 15760, + "time_per_iteration": 2.839984893798828 + }, + { + "auxiliary_loss_clip": 0.01397933, + "auxiliary_loss_mlp": 0.01031918, + "balance_loss_clip": 1.23920298, + "balance_loss_mlp": 1.01302338, + "epoch": 0.9476025852998647, + "flos": 20057282161920.0, + "grad_norm": 1.8035871762148665, + "language_loss": 0.73398268, + "learning_rate": 2.8698843683418128e-08, + "loss": 0.75828117, + "num_input_tokens_seen": 339989420, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18884277, + "step": 15761, + "time_per_iteration": 2.8317642211914062 + }, + { + "auxiliary_loss_clip": 0.01398967, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.24178565, + "balance_loss_mlp": 1.01987672, + "epoch": 0.9476627085525327, + "flos": 14983586668800.0, + "grad_norm": 1.9453223892309939, + "language_loss": 0.73020256, + "learning_rate": 2.863314050734722e-08, + "loss": 0.75458062, + "num_input_tokens_seen": 340006690, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18981934, + "step": 15762, + "time_per_iteration": 2.8151895999908447 + }, + { + "auxiliary_loss_clip": 0.01421153, + "auxiliary_loss_mlp": 0.01037236, + "balance_loss_clip": 1.25549376, + "balance_loss_mlp": 1.01749539, + "epoch": 0.9477228318052007, + "flos": 18707073528960.0, + "grad_norm": 1.8751145951343542, + "language_loss": 0.67644703, + "learning_rate": 2.856751208570518e-08, + "loss": 0.70103085, + "num_input_tokens_seen": 340025480, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.1973877, + "step": 15763, + "time_per_iteration": 4.241209030151367 + }, + { + "auxiliary_loss_clip": 0.01403615, + "auxiliary_loss_mlp": 0.01034507, + "balance_loss_clip": 1.24297142, + "balance_loss_mlp": 1.01546896, + "epoch": 0.9477829550578686, + "flos": 23884732869120.0, + "grad_norm": 1.7014201153292459, + "language_loss": 0.71214592, + "learning_rate": 2.8501958420980466e-08, + "loss": 0.7365272, + "num_input_tokens_seen": 340043785, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19030762, + "step": 15764, + "time_per_iteration": 4.19794225692749 + }, + { + "auxiliary_loss_clip": 0.01382525, + "auxiliary_loss_mlp": 0.01028505, + "balance_loss_clip": 1.23120761, + "balance_loss_mlp": 1.01037312, + "epoch": 0.9478430783105366, + "flos": 22572557130240.0, + "grad_norm": 1.63488240519942, + "language_loss": 0.71902966, + "learning_rate": 2.8436479515659306e-08, + "loss": 0.74313998, + "num_input_tokens_seen": 340064360, + "router_z_loss_clip": 1.51269531, + "router_z_loss_mlp": 0.18139648, + "step": 15765, + "time_per_iteration": 2.8548567295074463 + }, + { + "auxiliary_loss_clip": 0.01177731, + "auxiliary_loss_mlp": 0.01032486, + "balance_loss_clip": 1.09149528, + "balance_loss_mlp": 1.01322174, + "epoch": 0.9479032015632046, + "flos": 60883051800960.0, + "grad_norm": 0.8174986119513635, + "language_loss": 0.59207821, + "learning_rate": 2.8371075372224384e-08, + "loss": 0.61418033, + "num_input_tokens_seen": 340114425, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.19238281, + "step": 15766, + "time_per_iteration": 3.156442165374756 + }, + { + "auxiliary_loss_clip": 0.01393696, + "auxiliary_loss_mlp": 0.01037911, + "balance_loss_clip": 1.23517704, + "balance_loss_mlp": 1.01915967, + "epoch": 0.9479633248158725, + "flos": 14691992060160.0, + "grad_norm": 1.780997234906845, + "language_loss": 0.75163603, + "learning_rate": 2.8305745993155938e-08, + "loss": 0.7759521, + "num_input_tokens_seen": 340132200, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18762207, + "step": 15767, + "time_per_iteration": 2.8255720138549805 + }, + { + "auxiliary_loss_clip": 0.01409258, + "auxiliary_loss_mlp": 0.01030876, + "balance_loss_clip": 1.24615407, + "balance_loss_mlp": 1.01249409, + "epoch": 0.9480234480685406, + "flos": 20342090050560.0, + "grad_norm": 1.96443628141305, + "language_loss": 0.73548818, + "learning_rate": 2.8240491380931096e-08, + "loss": 0.7598896, + "num_input_tokens_seen": 340149175, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.18383789, + "step": 15768, + "time_per_iteration": 2.846597909927368 + }, + { + "auxiliary_loss_clip": 0.0118261, + "auxiliary_loss_mlp": 0.01027775, + "balance_loss_clip": 1.0933274, + "balance_loss_mlp": 1.01079988, + "epoch": 0.9480835713212085, + "flos": 70326386150400.0, + "grad_norm": 0.7320028041834035, + "language_loss": 0.5530504, + "learning_rate": 2.8175311538024326e-08, + "loss": 0.57515424, + "num_input_tokens_seen": 340208155, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.16992188, + "step": 15769, + "time_per_iteration": 3.2899396419525146 + }, + { + "auxiliary_loss_clip": 0.01397106, + "auxiliary_loss_mlp": 0.01030292, + "balance_loss_clip": 1.23633838, + "balance_loss_mlp": 1.01229143, + "epoch": 0.9481436945738765, + "flos": 25461329074560.0, + "grad_norm": 1.3479992376320484, + "language_loss": 0.7776159, + "learning_rate": 2.8110206466907428e-08, + "loss": 0.8018899, + "num_input_tokens_seen": 340229275, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18005371, + "step": 15770, + "time_per_iteration": 2.940720319747925 + }, + { + "auxiliary_loss_clip": 0.01402614, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.24353433, + "balance_loss_mlp": 1.01349688, + "epoch": 0.9482038178265444, + "flos": 26991295873920.0, + "grad_norm": 1.8934317623970707, + "language_loss": 0.80599856, + "learning_rate": 2.8045176170049313e-08, + "loss": 0.83035505, + "num_input_tokens_seen": 340248920, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.19555664, + "step": 15771, + "time_per_iteration": 2.9939322471618652 + }, + { + "auxiliary_loss_clip": 0.01398891, + "auxiliary_loss_mlp": 0.01031143, + "balance_loss_clip": 1.24174428, + "balance_loss_mlp": 1.01231956, + "epoch": 0.9482639410792124, + "flos": 17794437788160.0, + "grad_norm": 2.067358629902965, + "language_loss": 0.70740634, + "learning_rate": 2.7980220649915566e-08, + "loss": 0.73170668, + "num_input_tokens_seen": 340266775, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.18823242, + "step": 15772, + "time_per_iteration": 2.9077634811401367 + }, + { + "auxiliary_loss_clip": 0.01400146, + "auxiliary_loss_mlp": 0.01031742, + "balance_loss_clip": 1.24129844, + "balance_loss_mlp": 1.01306224, + "epoch": 0.9483240643318803, + "flos": 21006638697600.0, + "grad_norm": 1.5366063191493367, + "language_loss": 0.74435246, + "learning_rate": 2.7915339908969327e-08, + "loss": 0.76867133, + "num_input_tokens_seen": 340285295, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18676758, + "step": 15773, + "time_per_iteration": 2.8676867485046387 + }, + { + "auxiliary_loss_clip": 0.01407691, + "auxiliary_loss_mlp": 0.01037159, + "balance_loss_clip": 1.24465799, + "balance_loss_mlp": 1.01787138, + "epoch": 0.9483841875845483, + "flos": 20092690857600.0, + "grad_norm": 1.9779610025387027, + "language_loss": 0.63633311, + "learning_rate": 2.7850533949671072e-08, + "loss": 0.66078162, + "num_input_tokens_seen": 340304265, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19287109, + "step": 15774, + "time_per_iteration": 2.8057992458343506 + }, + { + "auxiliary_loss_clip": 0.01398622, + "auxiliary_loss_mlp": 0.01031808, + "balance_loss_clip": 1.2380352, + "balance_loss_mlp": 1.01231694, + "epoch": 0.9484443108372163, + "flos": 20823396681600.0, + "grad_norm": 3.0798775825444613, + "language_loss": 0.61039037, + "learning_rate": 2.7785802774478396e-08, + "loss": 0.63469464, + "num_input_tokens_seen": 340323690, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19482422, + "step": 15775, + "time_per_iteration": 2.8179807662963867 + }, + { + "auxiliary_loss_clip": 0.0139823, + "auxiliary_loss_mlp": 0.0103255, + "balance_loss_clip": 1.23803401, + "balance_loss_mlp": 1.01360822, + "epoch": 0.9485044340898843, + "flos": 36442892039040.0, + "grad_norm": 1.8022802857631814, + "language_loss": 0.6229167, + "learning_rate": 2.772114638584555e-08, + "loss": 0.64722443, + "num_input_tokens_seen": 340345830, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.18945312, + "step": 15776, + "time_per_iteration": 2.9442999362945557 + }, + { + "auxiliary_loss_clip": 0.01404762, + "auxiliary_loss_mlp": 0.01032569, + "balance_loss_clip": 1.24242353, + "balance_loss_mlp": 1.01292288, + "epoch": 0.9485645573425522, + "flos": 22613033243520.0, + "grad_norm": 1.874860089152283, + "language_loss": 0.74459827, + "learning_rate": 2.765656478622458e-08, + "loss": 0.7689715, + "num_input_tokens_seen": 340365910, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19641113, + "step": 15777, + "time_per_iteration": 2.8325893878936768 + }, + { + "auxiliary_loss_clip": 0.0143676, + "auxiliary_loss_mlp": 0.01032748, + "balance_loss_clip": 1.26717639, + "balance_loss_mlp": 1.01331663, + "epoch": 0.9486246805952202, + "flos": 22027853255040.0, + "grad_norm": 2.7600869119629645, + "language_loss": 0.73097014, + "learning_rate": 2.759205797806441e-08, + "loss": 0.75566524, + "num_input_tokens_seen": 340383935, + "router_z_loss_clip": 1.69628906, + "router_z_loss_mlp": 0.19433594, + "step": 15778, + "time_per_iteration": 2.8414289951324463 + }, + { + "auxiliary_loss_clip": 0.01378298, + "auxiliary_loss_mlp": 0.01033446, + "balance_loss_clip": 1.22673368, + "balance_loss_mlp": 1.01529086, + "epoch": 0.9486848038478882, + "flos": 16517399276160.0, + "grad_norm": 1.7439916659147217, + "language_loss": 0.7050333, + "learning_rate": 2.7527625963810865e-08, + "loss": 0.72915071, + "num_input_tokens_seen": 340402760, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.18151855, + "step": 15779, + "time_per_iteration": 2.8414008617401123 + }, + { + "auxiliary_loss_clip": 0.01405534, + "auxiliary_loss_mlp": 0.01034112, + "balance_loss_clip": 1.24533057, + "balance_loss_mlp": 1.01457345, + "epoch": 0.9487449271005561, + "flos": 19253632440960.0, + "grad_norm": 1.9790834061069276, + "language_loss": 0.79624867, + "learning_rate": 2.7463268745907542e-08, + "loss": 0.82064509, + "num_input_tokens_seen": 340422105, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19543457, + "step": 15780, + "time_per_iteration": 2.7985169887542725 + }, + { + "auxiliary_loss_clip": 0.01396657, + "auxiliary_loss_mlp": 0.01037976, + "balance_loss_clip": 1.23795354, + "balance_loss_mlp": 1.0196892, + "epoch": 0.9488050503532242, + "flos": 21772798462080.0, + "grad_norm": 1.7590676331994415, + "language_loss": 0.66827118, + "learning_rate": 2.7398986326794494e-08, + "loss": 0.69261742, + "num_input_tokens_seen": 340441160, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18286133, + "step": 15781, + "time_per_iteration": 2.8563005924224854 + }, + { + "auxiliary_loss_clip": 0.01387523, + "auxiliary_loss_mlp": 0.01030833, + "balance_loss_clip": 1.23090696, + "balance_loss_mlp": 1.01212907, + "epoch": 0.9488651736058921, + "flos": 18377988963840.0, + "grad_norm": 2.1839346005472042, + "language_loss": 0.80937147, + "learning_rate": 2.733477870890999e-08, + "loss": 0.83355498, + "num_input_tokens_seen": 340458200, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.18713379, + "step": 15782, + "time_per_iteration": 2.861647844314575 + }, + { + "auxiliary_loss_clip": 0.01180522, + "auxiliary_loss_mlp": 0.01019518, + "balance_loss_clip": 1.09265041, + "balance_loss_mlp": 1.00063539, + "epoch": 0.9489252968585601, + "flos": 70119861045120.0, + "grad_norm": 0.7373543638753183, + "language_loss": 0.59768558, + "learning_rate": 2.7270645894688082e-08, + "loss": 0.61968601, + "num_input_tokens_seen": 340526420, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.18847656, + "step": 15783, + "time_per_iteration": 3.426539897918701 + }, + { + "auxiliary_loss_clip": 0.01401402, + "auxiliary_loss_mlp": 0.01035304, + "balance_loss_clip": 1.24111176, + "balance_loss_mlp": 1.0156703, + "epoch": 0.948985420111228, + "flos": 27867029840640.0, + "grad_norm": 1.6217456847434788, + "language_loss": 0.74158055, + "learning_rate": 2.720658788656105e-08, + "loss": 0.76594758, + "num_input_tokens_seen": 340546325, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19641113, + "step": 15784, + "time_per_iteration": 2.938406467437744 + }, + { + "auxiliary_loss_clip": 0.0140472, + "auxiliary_loss_mlp": 0.01033854, + "balance_loss_clip": 1.24263847, + "balance_loss_mlp": 1.01401806, + "epoch": 0.949045543363896, + "flos": 24326649262080.0, + "grad_norm": 1.8602291714428074, + "language_loss": 0.70511377, + "learning_rate": 2.714260468695806e-08, + "loss": 0.72949952, + "num_input_tokens_seen": 340565145, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.19836426, + "step": 15785, + "time_per_iteration": 2.8837757110595703 + }, + { + "auxiliary_loss_clip": 0.01410767, + "auxiliary_loss_mlp": 0.01035223, + "balance_loss_clip": 1.24749851, + "balance_loss_mlp": 1.01628029, + "epoch": 0.9491056666165639, + "flos": 24251895573120.0, + "grad_norm": 1.8085407489955077, + "language_loss": 0.76761508, + "learning_rate": 2.707869629830495e-08, + "loss": 0.79207504, + "num_input_tokens_seen": 340585465, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.18933105, + "step": 15786, + "time_per_iteration": 3.060150623321533 + }, + { + "auxiliary_loss_clip": 0.01393808, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.23595786, + "balance_loss_mlp": 1.01511574, + "epoch": 0.949165789869232, + "flos": 24540956472960.0, + "grad_norm": 1.790760280641059, + "language_loss": 0.80036819, + "learning_rate": 2.7014862723025335e-08, + "loss": 0.82464314, + "num_input_tokens_seen": 340606010, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.18591309, + "step": 15787, + "time_per_iteration": 2.9672884941101074 + }, + { + "auxiliary_loss_clip": 0.01401674, + "auxiliary_loss_mlp": 0.01033923, + "balance_loss_clip": 1.24535918, + "balance_loss_mlp": 1.01582682, + "epoch": 0.9492259131218999, + "flos": 22245146622720.0, + "grad_norm": 1.5514380728509336, + "language_loss": 0.76673585, + "learning_rate": 2.6951103963540388e-08, + "loss": 0.7910918, + "num_input_tokens_seen": 340626135, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.1809082, + "step": 15788, + "time_per_iteration": 4.391999006271362 + }, + { + "auxiliary_loss_clip": 0.01406888, + "auxiliary_loss_mlp": 0.01035734, + "balance_loss_clip": 1.24559879, + "balance_loss_mlp": 1.01675653, + "epoch": 0.9492860363745679, + "flos": 22976621608320.0, + "grad_norm": 2.1244555136726424, + "language_loss": 0.72606969, + "learning_rate": 2.6887420022266848e-08, + "loss": 0.75049585, + "num_input_tokens_seen": 340644870, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18994141, + "step": 15789, + "time_per_iteration": 2.8260793685913086 + }, + { + "auxiliary_loss_clip": 0.01399814, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.24037433, + "balance_loss_mlp": 1.01166165, + "epoch": 0.9493461596272358, + "flos": 18379708266240.0, + "grad_norm": 1.738112218492504, + "language_loss": 0.73820436, + "learning_rate": 2.682381090161989e-08, + "loss": 0.76253247, + "num_input_tokens_seen": 340663695, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.21313477, + "step": 15790, + "time_per_iteration": 2.865224599838257 + }, + { + "auxiliary_loss_clip": 0.0142051, + "auxiliary_loss_mlp": 0.01032833, + "balance_loss_clip": 1.25652993, + "balance_loss_mlp": 1.0141052, + "epoch": 0.9494062828799038, + "flos": 20021466263040.0, + "grad_norm": 1.714648973119075, + "language_loss": 0.78281081, + "learning_rate": 2.6760276604012033e-08, + "loss": 0.8073442, + "num_input_tokens_seen": 340682970, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.18737793, + "step": 15791, + "time_per_iteration": 4.27386212348938 + }, + { + "auxiliary_loss_clip": 0.01432423, + "auxiliary_loss_mlp": 0.01036448, + "balance_loss_clip": 1.26607025, + "balance_loss_mlp": 1.01758897, + "epoch": 0.9494664061325718, + "flos": 27238297092480.0, + "grad_norm": 2.115245603811823, + "language_loss": 0.74898154, + "learning_rate": 2.6696817131852234e-08, + "loss": 0.77367026, + "num_input_tokens_seen": 340702275, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.1887207, + "step": 15792, + "time_per_iteration": 2.9051811695098877 + }, + { + "auxiliary_loss_clip": 0.01390478, + "auxiliary_loss_mlp": 0.01032268, + "balance_loss_clip": 1.23183441, + "balance_loss_mlp": 1.01398146, + "epoch": 0.9495265293852397, + "flos": 18379120083840.0, + "grad_norm": 1.9113605786553118, + "language_loss": 0.78926134, + "learning_rate": 2.663343248754679e-08, + "loss": 0.81348884, + "num_input_tokens_seen": 340719060, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18286133, + "step": 15793, + "time_per_iteration": 2.8375725746154785 + }, + { + "auxiliary_loss_clip": 0.01384663, + "auxiliary_loss_mlp": 0.01032636, + "balance_loss_clip": 1.22645855, + "balance_loss_mlp": 1.01458788, + "epoch": 0.9495866526379078, + "flos": 23086105320960.0, + "grad_norm": 1.6862000774852333, + "language_loss": 0.7862463, + "learning_rate": 2.6570122673499562e-08, + "loss": 0.81041932, + "num_input_tokens_seen": 340737815, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18041992, + "step": 15794, + "time_per_iteration": 2.8928539752960205 + }, + { + "auxiliary_loss_clip": 0.01406856, + "auxiliary_loss_mlp": 0.01034405, + "balance_loss_clip": 1.2444458, + "balance_loss_mlp": 1.01498556, + "epoch": 0.9496467758905757, + "flos": 17538794812800.0, + "grad_norm": 2.3107987342278435, + "language_loss": 0.61636692, + "learning_rate": 2.650688769211107e-08, + "loss": 0.64077955, + "num_input_tokens_seen": 340756035, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19445801, + "step": 15795, + "time_per_iteration": 2.818861722946167 + }, + { + "auxiliary_loss_clip": 0.01386085, + "auxiliary_loss_mlp": 0.010382, + "balance_loss_clip": 1.23121226, + "balance_loss_mlp": 1.01757693, + "epoch": 0.9497068991432437, + "flos": 24144538366080.0, + "grad_norm": 1.7650589457316432, + "language_loss": 0.80157, + "learning_rate": 2.644372754577895e-08, + "loss": 0.82581282, + "num_input_tokens_seen": 340775620, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.20617676, + "step": 15796, + "time_per_iteration": 2.8484439849853516 + }, + { + "auxiliary_loss_clip": 0.01396259, + "auxiliary_loss_mlp": 0.01033074, + "balance_loss_clip": 1.23653817, + "balance_loss_mlp": 1.01428664, + "epoch": 0.9497670223959116, + "flos": 20313060871680.0, + "grad_norm": 3.491004559911205, + "language_loss": 0.76256061, + "learning_rate": 2.6380642236898398e-08, + "loss": 0.78685391, + "num_input_tokens_seen": 340794510, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18774414, + "step": 15797, + "time_per_iteration": 2.830131769180298 + }, + { + "auxiliary_loss_clip": 0.01404984, + "auxiliary_loss_mlp": 0.01033192, + "balance_loss_clip": 1.24518085, + "balance_loss_mlp": 1.01410663, + "epoch": 0.9498271456485796, + "flos": 13706819625600.0, + "grad_norm": 2.4190522789368725, + "language_loss": 0.67297417, + "learning_rate": 2.6317631767861727e-08, + "loss": 0.69735593, + "num_input_tokens_seen": 340812955, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.19104004, + "step": 15798, + "time_per_iteration": 4.251521587371826 + }, + { + "auxiliary_loss_clip": 0.01418381, + "auxiliary_loss_mlp": 0.01036255, + "balance_loss_clip": 1.25496316, + "balance_loss_mlp": 1.01731217, + "epoch": 0.9498872689012475, + "flos": 20823849129600.0, + "grad_norm": 3.751364691808705, + "language_loss": 0.77673006, + "learning_rate": 2.6254696141058575e-08, + "loss": 0.80127633, + "num_input_tokens_seen": 340829200, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.18920898, + "step": 15799, + "time_per_iteration": 4.258679628372192 + }, + { + "auxiliary_loss_clip": 0.01390174, + "auxiliary_loss_mlp": 0.01030701, + "balance_loss_clip": 1.23573565, + "balance_loss_mlp": 1.01234293, + "epoch": 0.9499473921539155, + "flos": 21042545086080.0, + "grad_norm": 1.7562194181987858, + "language_loss": 0.72060138, + "learning_rate": 2.6191835358874814e-08, + "loss": 0.7448101, + "num_input_tokens_seen": 340848035, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.18347168, + "step": 15800, + "time_per_iteration": 2.8042502403259277 + }, + { + "auxiliary_loss_clip": 0.01402869, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.24371457, + "balance_loss_mlp": 1.01478529, + "epoch": 0.9500075154065835, + "flos": 21009172406400.0, + "grad_norm": 1.569945636456981, + "language_loss": 0.7235446, + "learning_rate": 2.6129049423694315e-08, + "loss": 0.74790585, + "num_input_tokens_seen": 340870025, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18493652, + "step": 15801, + "time_per_iteration": 2.859426736831665 + }, + { + "auxiliary_loss_clip": 0.01391765, + "auxiliary_loss_mlp": 0.01033125, + "balance_loss_clip": 1.23288178, + "balance_loss_mlp": 1.0143137, + "epoch": 0.9500676386592515, + "flos": 25133601853440.0, + "grad_norm": 1.807566186072682, + "language_loss": 0.81541741, + "learning_rate": 2.6066338337898508e-08, + "loss": 0.83966637, + "num_input_tokens_seen": 340892290, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18811035, + "step": 15802, + "time_per_iteration": 2.937368869781494 + }, + { + "auxiliary_loss_clip": 0.01417215, + "auxiliary_loss_mlp": 0.01037488, + "balance_loss_clip": 1.25452459, + "balance_loss_mlp": 1.01809287, + "epoch": 0.9501277619119194, + "flos": 27534235201920.0, + "grad_norm": 1.8013964980122885, + "language_loss": 0.6893121, + "learning_rate": 2.60037021038646e-08, + "loss": 0.71385908, + "num_input_tokens_seen": 340912260, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.1940918, + "step": 15803, + "time_per_iteration": 2.8811376094818115 + }, + { + "auxiliary_loss_clip": 0.01399101, + "auxiliary_loss_mlp": 0.01034936, + "balance_loss_clip": 1.24028385, + "balance_loss_mlp": 1.01550484, + "epoch": 0.9501878851645874, + "flos": 20823803884800.0, + "grad_norm": 1.6107051053520052, + "language_loss": 0.76467121, + "learning_rate": 2.5941140723968247e-08, + "loss": 0.7890116, + "num_input_tokens_seen": 340928930, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19433594, + "step": 15804, + "time_per_iteration": 2.833137273788452 + }, + { + "auxiliary_loss_clip": 0.01414545, + "auxiliary_loss_mlp": 0.01037288, + "balance_loss_clip": 1.25272918, + "balance_loss_mlp": 1.01853621, + "epoch": 0.9502480084172553, + "flos": 18378577146240.0, + "grad_norm": 3.1194717956273603, + "language_loss": 0.73713839, + "learning_rate": 2.5878654200581775e-08, + "loss": 0.76165676, + "num_input_tokens_seen": 340946615, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18737793, + "step": 15805, + "time_per_iteration": 2.8037145137786865 + }, + { + "auxiliary_loss_clip": 0.01408403, + "auxiliary_loss_mlp": 0.01036611, + "balance_loss_clip": 1.24830508, + "balance_loss_mlp": 1.01801491, + "epoch": 0.9503081316699233, + "flos": 23559946560000.0, + "grad_norm": 1.9202977215113957, + "language_loss": 0.80663365, + "learning_rate": 2.5816242536074618e-08, + "loss": 0.83108377, + "num_input_tokens_seen": 340967545, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18615723, + "step": 15806, + "time_per_iteration": 2.9057319164276123 + }, + { + "auxiliary_loss_clip": 0.01409, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.24723291, + "balance_loss_mlp": 1.01346684, + "epoch": 0.9503682549225914, + "flos": 18049402091520.0, + "grad_norm": 2.1290884176423415, + "language_loss": 0.83554029, + "learning_rate": 2.5753905732813108e-08, + "loss": 0.8599506, + "num_input_tokens_seen": 340984955, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18554688, + "step": 15807, + "time_per_iteration": 2.8105506896972656 + }, + { + "auxiliary_loss_clip": 0.01395249, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.2371484, + "balance_loss_mlp": 1.01116693, + "epoch": 0.9504283781752593, + "flos": 25897454133120.0, + "grad_norm": 1.7701684829867035, + "language_loss": 0.72542846, + "learning_rate": 2.5691643793161355e-08, + "loss": 0.74968415, + "num_input_tokens_seen": 341007300, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.19165039, + "step": 15808, + "time_per_iteration": 2.888897657394409 + }, + { + "auxiliary_loss_clip": 0.01393211, + "auxiliary_loss_mlp": 0.01029747, + "balance_loss_clip": 1.23576629, + "balance_loss_mlp": 1.01044679, + "epoch": 0.9504885014279273, + "flos": 22133310180480.0, + "grad_norm": 1.6088710638316248, + "language_loss": 0.70138514, + "learning_rate": 2.562945671948058e-08, + "loss": 0.72561467, + "num_input_tokens_seen": 341026695, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.1932373, + "step": 15809, + "time_per_iteration": 2.808933973312378 + }, + { + "auxiliary_loss_clip": 0.01390886, + "auxiliary_loss_mlp": 0.01031806, + "balance_loss_clip": 1.23234797, + "balance_loss_mlp": 1.01208901, + "epoch": 0.9505486246805952, + "flos": 21625824792960.0, + "grad_norm": 2.2153647544438644, + "language_loss": 0.75765753, + "learning_rate": 2.5567344514128452e-08, + "loss": 0.78188443, + "num_input_tokens_seen": 341047080, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.19714355, + "step": 15810, + "time_per_iteration": 2.868792772293091 + }, + { + "auxiliary_loss_clip": 0.01397203, + "auxiliary_loss_mlp": 0.01038511, + "balance_loss_clip": 1.23667502, + "balance_loss_mlp": 1.02000976, + "epoch": 0.9506087479332632, + "flos": 22538370044160.0, + "grad_norm": 1.3750164478914175, + "language_loss": 0.80469942, + "learning_rate": 2.5505307179460643e-08, + "loss": 0.8290565, + "num_input_tokens_seen": 341067310, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18493652, + "step": 15811, + "time_per_iteration": 2.8515336513519287 + }, + { + "auxiliary_loss_clip": 0.01404647, + "auxiliary_loss_mlp": 0.01032806, + "balance_loss_clip": 1.2451005, + "balance_loss_mlp": 1.01383948, + "epoch": 0.9506688711859311, + "flos": 27538940661120.0, + "grad_norm": 1.9825853193609115, + "language_loss": 0.70809698, + "learning_rate": 2.5443344717829495e-08, + "loss": 0.73247147, + "num_input_tokens_seen": 341085110, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18969727, + "step": 15812, + "time_per_iteration": 2.9683945178985596 + }, + { + "auxiliary_loss_clip": 0.01412201, + "auxiliary_loss_mlp": 0.01036122, + "balance_loss_clip": 1.24973083, + "balance_loss_mlp": 1.01694107, + "epoch": 0.9507289944385992, + "flos": 19875487979520.0, + "grad_norm": 1.705879315730569, + "language_loss": 0.66300708, + "learning_rate": 2.538145713158446e-08, + "loss": 0.68749034, + "num_input_tokens_seen": 341103190, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19165039, + "step": 15813, + "time_per_iteration": 2.835587978363037 + }, + { + "auxiliary_loss_clip": 0.01403776, + "auxiliary_loss_mlp": 0.01035724, + "balance_loss_clip": 1.2430234, + "balance_loss_mlp": 1.01651978, + "epoch": 0.9507891176912671, + "flos": 25204509734400.0, + "grad_norm": 1.4205683952538237, + "language_loss": 0.70815432, + "learning_rate": 2.5319644423072327e-08, + "loss": 0.73254937, + "num_input_tokens_seen": 341125695, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.1920166, + "step": 15814, + "time_per_iteration": 2.8905835151672363 + }, + { + "auxiliary_loss_clip": 0.01391398, + "auxiliary_loss_mlp": 0.01029528, + "balance_loss_clip": 1.23441291, + "balance_loss_mlp": 1.011253, + "epoch": 0.9508492409439351, + "flos": 24910245682560.0, + "grad_norm": 2.3435473507056117, + "language_loss": 0.64183295, + "learning_rate": 2.5257906594637445e-08, + "loss": 0.66604227, + "num_input_tokens_seen": 341143930, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.18286133, + "step": 15815, + "time_per_iteration": 2.9033846855163574 + }, + { + "auxiliary_loss_clip": 0.01402004, + "auxiliary_loss_mlp": 0.01032054, + "balance_loss_clip": 1.24176466, + "balance_loss_mlp": 1.01368356, + "epoch": 0.950909364196603, + "flos": 29794862580480.0, + "grad_norm": 3.293243324781059, + "language_loss": 0.59508169, + "learning_rate": 2.519624364862061e-08, + "loss": 0.61942226, + "num_input_tokens_seen": 341164280, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18383789, + "step": 15816, + "time_per_iteration": 2.907811164855957 + }, + { + "auxiliary_loss_clip": 0.01408328, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.24882472, + "balance_loss_mlp": 1.01917291, + "epoch": 0.950969487449271, + "flos": 24728587234560.0, + "grad_norm": 1.4257737992236463, + "language_loss": 0.74179173, + "learning_rate": 2.513465558735994e-08, + "loss": 0.76626056, + "num_input_tokens_seen": 341183670, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19396973, + "step": 15817, + "time_per_iteration": 2.888936996459961 + }, + { + "auxiliary_loss_clip": 0.01398219, + "auxiliary_loss_mlp": 0.01038477, + "balance_loss_clip": 1.23649979, + "balance_loss_mlp": 1.01743615, + "epoch": 0.9510296107019389, + "flos": 13707136339200.0, + "grad_norm": 1.8279991964036522, + "language_loss": 0.60439467, + "learning_rate": 2.5073142413190918e-08, + "loss": 0.62876159, + "num_input_tokens_seen": 341201900, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.21044922, + "step": 15818, + "time_per_iteration": 2.8409926891326904 + }, + { + "auxiliary_loss_clip": 0.01400437, + "auxiliary_loss_mlp": 0.01034088, + "balance_loss_clip": 1.24108624, + "balance_loss_mlp": 1.0154078, + "epoch": 0.9510897339546069, + "flos": 17320913262720.0, + "grad_norm": 1.7200299650161295, + "language_loss": 0.70512557, + "learning_rate": 2.5011704128446552e-08, + "loss": 0.72947091, + "num_input_tokens_seen": 341218340, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18676758, + "step": 15819, + "time_per_iteration": 2.798652172088623 + }, + { + "auxiliary_loss_clip": 0.01404004, + "auxiliary_loss_mlp": 0.01032292, + "balance_loss_clip": 1.24230027, + "balance_loss_mlp": 1.01376712, + "epoch": 0.951149857207275, + "flos": 14802244934400.0, + "grad_norm": 3.7549307142888084, + "language_loss": 0.7460345, + "learning_rate": 2.49503407354561e-08, + "loss": 0.77039748, + "num_input_tokens_seen": 341235885, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.18530273, + "step": 15820, + "time_per_iteration": 2.8260083198547363 + }, + { + "auxiliary_loss_clip": 0.01418916, + "auxiliary_loss_mlp": 0.01041475, + "balance_loss_clip": 1.25651503, + "balance_loss_mlp": 1.02229381, + "epoch": 0.9512099804599429, + "flos": 19400651354880.0, + "grad_norm": 1.723660158467129, + "language_loss": 0.78877544, + "learning_rate": 2.4889052236546804e-08, + "loss": 0.81337941, + "num_input_tokens_seen": 341255280, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19189453, + "step": 15821, + "time_per_iteration": 2.795001745223999 + }, + { + "auxiliary_loss_clip": 0.01392737, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.23459148, + "balance_loss_mlp": 1.01258147, + "epoch": 0.9512701037126109, + "flos": 36771750380160.0, + "grad_norm": 1.3756033746789327, + "language_loss": 0.71290839, + "learning_rate": 2.4827838634042586e-08, + "loss": 0.7371577, + "num_input_tokens_seen": 341279055, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.19616699, + "step": 15822, + "time_per_iteration": 2.958003282546997 + }, + { + "auxiliary_loss_clip": 0.01395571, + "auxiliary_loss_mlp": 0.0102979, + "balance_loss_clip": 1.23737264, + "balance_loss_mlp": 1.01132429, + "epoch": 0.9513302269652788, + "flos": 22648577673600.0, + "grad_norm": 1.7415098651582226, + "language_loss": 0.66867238, + "learning_rate": 2.47666999302647e-08, + "loss": 0.69292593, + "num_input_tokens_seen": 341298560, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18481445, + "step": 15823, + "time_per_iteration": 4.274961948394775 + }, + { + "auxiliary_loss_clip": 0.01393681, + "auxiliary_loss_mlp": 0.01035477, + "balance_loss_clip": 1.23577714, + "balance_loss_mlp": 1.01666594, + "epoch": 0.9513903502179468, + "flos": 22903496732160.0, + "grad_norm": 1.7267961411417065, + "language_loss": 0.77947271, + "learning_rate": 2.4705636127531292e-08, + "loss": 0.80376428, + "num_input_tokens_seen": 341316650, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18811035, + "step": 15824, + "time_per_iteration": 2.825993537902832 + }, + { + "auxiliary_loss_clip": 0.0141107, + "auxiliary_loss_mlp": 0.01031523, + "balance_loss_clip": 1.2469039, + "balance_loss_mlp": 1.01213932, + "epoch": 0.9514504734706147, + "flos": 27940290451200.0, + "grad_norm": 2.335071347555368, + "language_loss": 0.74802017, + "learning_rate": 2.4644647228158065e-08, + "loss": 0.77244604, + "num_input_tokens_seen": 341336185, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.19396973, + "step": 15825, + "time_per_iteration": 2.903411865234375 + }, + { + "auxiliary_loss_clip": 0.01181852, + "auxiliary_loss_mlp": 0.0104367, + "balance_loss_clip": 1.09236121, + "balance_loss_mlp": 1.02059102, + "epoch": 0.9515105967232828, + "flos": 67397364316800.0, + "grad_norm": 0.8184035734412601, + "language_loss": 0.53480101, + "learning_rate": 2.458373323445806e-08, + "loss": 0.55705625, + "num_input_tokens_seen": 341395795, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.23046875, + "step": 15826, + "time_per_iteration": 4.726433277130127 + }, + { + "auxiliary_loss_clip": 0.01403398, + "auxiliary_loss_mlp": 0.01030872, + "balance_loss_clip": 1.24225616, + "balance_loss_mlp": 1.01229906, + "epoch": 0.9515707199759507, + "flos": 25857023264640.0, + "grad_norm": 1.7716214275134703, + "language_loss": 0.74123257, + "learning_rate": 2.452289414874076e-08, + "loss": 0.76557529, + "num_input_tokens_seen": 341415675, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18566895, + "step": 15827, + "time_per_iteration": 2.8961915969848633 + }, + { + "auxiliary_loss_clip": 0.01396782, + "auxiliary_loss_mlp": 0.01038794, + "balance_loss_clip": 1.23704314, + "balance_loss_mlp": 1.01896942, + "epoch": 0.9516308432286187, + "flos": 21837372071040.0, + "grad_norm": 2.217928880338013, + "language_loss": 0.75614274, + "learning_rate": 2.4462129973313207e-08, + "loss": 0.7804985, + "num_input_tokens_seen": 341432990, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19812012, + "step": 15828, + "time_per_iteration": 2.8720791339874268 + }, + { + "auxiliary_loss_clip": 0.01391073, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.23453641, + "balance_loss_mlp": 1.01497197, + "epoch": 0.9516909664812866, + "flos": 27280492508160.0, + "grad_norm": 1.764120127527729, + "language_loss": 0.74227762, + "learning_rate": 2.440144071047978e-08, + "loss": 0.76651871, + "num_input_tokens_seen": 341454100, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.18066406, + "step": 15829, + "time_per_iteration": 3.081106662750244 + }, + { + "auxiliary_loss_clip": 0.01404548, + "auxiliary_loss_mlp": 0.01033366, + "balance_loss_clip": 1.2446965, + "balance_loss_mlp": 1.01577091, + "epoch": 0.9517510897339546, + "flos": 21225289409280.0, + "grad_norm": 1.9739716390437276, + "language_loss": 0.62214696, + "learning_rate": 2.4340826362541533e-08, + "loss": 0.6465261, + "num_input_tokens_seen": 341472955, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.17602539, + "step": 15830, + "time_per_iteration": 2.861830711364746 + }, + { + "auxiliary_loss_clip": 0.01402904, + "auxiliary_loss_mlp": 0.01036154, + "balance_loss_clip": 1.24137306, + "balance_loss_mlp": 1.01584113, + "epoch": 0.9518112129866225, + "flos": 18743206141440.0, + "grad_norm": 2.1865024992478674, + "language_loss": 0.73704255, + "learning_rate": 2.428028693179729e-08, + "loss": 0.76143312, + "num_input_tokens_seen": 341490165, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.203125, + "step": 15831, + "time_per_iteration": 2.8334529399871826 + }, + { + "auxiliary_loss_clip": 0.01389597, + "auxiliary_loss_mlp": 0.01027682, + "balance_loss_clip": 1.23222792, + "balance_loss_mlp": 1.00974107, + "epoch": 0.9518713362392905, + "flos": 16772363579520.0, + "grad_norm": 1.6592494589716154, + "language_loss": 0.66626954, + "learning_rate": 2.4219822420542545e-08, + "loss": 0.69044232, + "num_input_tokens_seen": 341508055, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.17956543, + "step": 15832, + "time_per_iteration": 2.809551954269409 + }, + { + "auxiliary_loss_clip": 0.01389001, + "auxiliary_loss_mlp": 0.01035162, + "balance_loss_clip": 1.23421383, + "balance_loss_mlp": 1.01668477, + "epoch": 0.9519314594919586, + "flos": 15238777196160.0, + "grad_norm": 1.7656075427158238, + "language_loss": 0.78643501, + "learning_rate": 2.4159432831070135e-08, + "loss": 0.81067657, + "num_input_tokens_seen": 341526155, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.18469238, + "step": 15833, + "time_per_iteration": 4.205423593521118 + }, + { + "auxiliary_loss_clip": 0.01384171, + "auxiliary_loss_mlp": 0.01032577, + "balance_loss_clip": 1.22747827, + "balance_loss_mlp": 1.01396847, + "epoch": 0.9519915827446265, + "flos": 19361985033600.0, + "grad_norm": 1.9809653312821383, + "language_loss": 0.75855625, + "learning_rate": 2.4099118165670007e-08, + "loss": 0.78272372, + "num_input_tokens_seen": 341540450, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.18615723, + "step": 15834, + "time_per_iteration": 4.206353664398193 + }, + { + "auxiliary_loss_clip": 0.01410976, + "auxiliary_loss_mlp": 0.01034425, + "balance_loss_clip": 1.24670386, + "balance_loss_mlp": 1.0146606, + "epoch": 0.9520517059972945, + "flos": 22274673494400.0, + "grad_norm": 3.6831644204758764, + "language_loss": 0.76891243, + "learning_rate": 2.4038878426629216e-08, + "loss": 0.79336643, + "num_input_tokens_seen": 341557865, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.19750977, + "step": 15835, + "time_per_iteration": 2.828402519226074 + }, + { + "auxiliary_loss_clip": 0.01399833, + "auxiliary_loss_mlp": 0.01035576, + "balance_loss_clip": 1.23851514, + "balance_loss_mlp": 1.015728, + "epoch": 0.9521118292499624, + "flos": 14869442741760.0, + "grad_norm": 1.9665536170314728, + "language_loss": 0.67058933, + "learning_rate": 2.397871361623238e-08, + "loss": 0.69494343, + "num_input_tokens_seen": 341573890, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.19836426, + "step": 15836, + "time_per_iteration": 2.8135793209075928 + }, + { + "auxiliary_loss_clip": 0.01394407, + "auxiliary_loss_mlp": 0.01031604, + "balance_loss_clip": 1.23798418, + "balance_loss_mlp": 1.01239967, + "epoch": 0.9521719525026304, + "flos": 23518022613120.0, + "grad_norm": 1.8206042837961223, + "language_loss": 0.70516396, + "learning_rate": 2.391862373676057e-08, + "loss": 0.72942412, + "num_input_tokens_seen": 341593770, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.1920166, + "step": 15837, + "time_per_iteration": 2.869736671447754 + }, + { + "auxiliary_loss_clip": 0.01388649, + "auxiliary_loss_mlp": 0.01032919, + "balance_loss_clip": 1.22859156, + "balance_loss_mlp": 1.01279664, + "epoch": 0.9522320757552983, + "flos": 19723311158400.0, + "grad_norm": 1.8070168510197444, + "language_loss": 0.73867226, + "learning_rate": 2.3858608790492617e-08, + "loss": 0.76288795, + "num_input_tokens_seen": 341612065, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.20129395, + "step": 15838, + "time_per_iteration": 2.8493638038635254 + }, + { + "auxiliary_loss_clip": 0.01395604, + "auxiliary_loss_mlp": 0.01032483, + "balance_loss_clip": 1.23667979, + "balance_loss_mlp": 1.01399362, + "epoch": 0.9522921990079664, + "flos": 25932274646400.0, + "grad_norm": 2.1051847073750327, + "language_loss": 0.7846902, + "learning_rate": 2.379866877970449e-08, + "loss": 0.80897111, + "num_input_tokens_seen": 341631365, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18481445, + "step": 15839, + "time_per_iteration": 2.894648313522339 + }, + { + "auxiliary_loss_clip": 0.01403517, + "auxiliary_loss_mlp": 0.01038383, + "balance_loss_clip": 1.24257481, + "balance_loss_mlp": 1.02062106, + "epoch": 0.9523523222606343, + "flos": 19217409338880.0, + "grad_norm": 1.6627950181055369, + "language_loss": 0.80526948, + "learning_rate": 2.3738803706668585e-08, + "loss": 0.82968843, + "num_input_tokens_seen": 341650300, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.1776123, + "step": 15840, + "time_per_iteration": 2.8819143772125244 + }, + { + "auxiliary_loss_clip": 0.01383773, + "auxiliary_loss_mlp": 0.01029494, + "balance_loss_clip": 1.22774398, + "balance_loss_mlp": 1.01244712, + "epoch": 0.9524124455133023, + "flos": 20930572909440.0, + "grad_norm": 2.3001645434840494, + "language_loss": 0.74039638, + "learning_rate": 2.3679013573655314e-08, + "loss": 0.76452905, + "num_input_tokens_seen": 341667680, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.17041016, + "step": 15841, + "time_per_iteration": 2.8201746940612793 + }, + { + "auxiliary_loss_clip": 0.01390405, + "auxiliary_loss_mlp": 0.01032783, + "balance_loss_clip": 1.2357496, + "balance_loss_mlp": 1.01419842, + "epoch": 0.9524725687659702, + "flos": 18852825588480.0, + "grad_norm": 1.7616392570715682, + "language_loss": 0.79587424, + "learning_rate": 2.3619298382931972e-08, + "loss": 0.82010615, + "num_input_tokens_seen": 341685760, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.18566895, + "step": 15842, + "time_per_iteration": 2.8824925422668457 + }, + { + "auxiliary_loss_clip": 0.01401764, + "auxiliary_loss_mlp": 0.01037346, + "balance_loss_clip": 1.24313951, + "balance_loss_mlp": 1.01780736, + "epoch": 0.9525326920186382, + "flos": 22684981754880.0, + "grad_norm": 1.6498454966399745, + "language_loss": 0.73002362, + "learning_rate": 2.3559658136762973e-08, + "loss": 0.75441474, + "num_input_tokens_seen": 341705300, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.1953125, + "step": 15843, + "time_per_iteration": 2.8462376594543457 + }, + { + "auxiliary_loss_clip": 0.01399173, + "auxiliary_loss_mlp": 0.01033906, + "balance_loss_clip": 1.23800015, + "balance_loss_mlp": 1.01426029, + "epoch": 0.9525928152713061, + "flos": 22095684489600.0, + "grad_norm": 1.6185058095164766, + "language_loss": 0.78854185, + "learning_rate": 2.3500092837409612e-08, + "loss": 0.81287265, + "num_input_tokens_seen": 341724565, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.1965332, + "step": 15844, + "time_per_iteration": 2.8755979537963867 + }, + { + "auxiliary_loss_clip": 0.01401243, + "auxiliary_loss_mlp": 0.01036726, + "balance_loss_clip": 1.23824084, + "balance_loss_mlp": 1.0154829, + "epoch": 0.9526529385239741, + "flos": 20714591640960.0, + "grad_norm": 3.7671344324570604, + "language_loss": 0.71276975, + "learning_rate": 2.3440602487130977e-08, + "loss": 0.73714948, + "num_input_tokens_seen": 341743605, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.21240234, + "step": 15845, + "time_per_iteration": 2.8293771743774414 + }, + { + "auxiliary_loss_clip": 0.01405552, + "auxiliary_loss_mlp": 0.01037138, + "balance_loss_clip": 1.24296498, + "balance_loss_mlp": 1.01819611, + "epoch": 0.9527130617766422, + "flos": 23378559580800.0, + "grad_norm": 1.4128378466578928, + "language_loss": 0.75774336, + "learning_rate": 2.338118708818282e-08, + "loss": 0.7821703, + "num_input_tokens_seen": 341763475, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18945312, + "step": 15846, + "time_per_iteration": 2.8706579208374023 + }, + { + "auxiliary_loss_clip": 0.01392297, + "auxiliary_loss_mlp": 0.0103024, + "balance_loss_clip": 1.23292136, + "balance_loss_mlp": 1.0120368, + "epoch": 0.9527731850293101, + "flos": 18994324636800.0, + "grad_norm": 1.779746147458878, + "language_loss": 0.79106748, + "learning_rate": 2.3321846642817998e-08, + "loss": 0.81529284, + "num_input_tokens_seen": 341781265, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18212891, + "step": 15847, + "time_per_iteration": 2.864745616912842 + }, + { + "auxiliary_loss_clip": 0.01386835, + "auxiliary_loss_mlp": 0.01033959, + "balance_loss_clip": 1.23019195, + "balance_loss_mlp": 1.01526737, + "epoch": 0.9528333082819781, + "flos": 19327481233920.0, + "grad_norm": 1.711001791839283, + "language_loss": 0.78402835, + "learning_rate": 2.326258115328672e-08, + "loss": 0.80823624, + "num_input_tokens_seen": 341798825, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.18688965, + "step": 15848, + "time_per_iteration": 2.8164916038513184 + }, + { + "auxiliary_loss_clip": 0.01411394, + "auxiliary_loss_mlp": 0.01037086, + "balance_loss_clip": 1.24762917, + "balance_loss_mlp": 1.01832259, + "epoch": 0.952893431534646, + "flos": 23961884532480.0, + "grad_norm": 1.7550925466356884, + "language_loss": 0.72996259, + "learning_rate": 2.320339062183674e-08, + "loss": 0.7544474, + "num_input_tokens_seen": 341819480, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.18762207, + "step": 15849, + "time_per_iteration": 2.8822686672210693 + }, + { + "auxiliary_loss_clip": 0.01418626, + "auxiliary_loss_mlp": 0.01036412, + "balance_loss_clip": 1.25387025, + "balance_loss_mlp": 1.01770771, + "epoch": 0.952953554787314, + "flos": 21039830398080.0, + "grad_norm": 1.5098453610117843, + "language_loss": 0.76005483, + "learning_rate": 2.314427505071226e-08, + "loss": 0.78460521, + "num_input_tokens_seen": 341838035, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.18713379, + "step": 15850, + "time_per_iteration": 2.828068256378174 + }, + { + "auxiliary_loss_clip": 0.01397882, + "auxiliary_loss_mlp": 0.0103476, + "balance_loss_clip": 1.23832369, + "balance_loss_mlp": 1.01680708, + "epoch": 0.9530136780399819, + "flos": 22393115677440.0, + "grad_norm": 3.0411330371860292, + "language_loss": 0.7333231, + "learning_rate": 2.308523444215482e-08, + "loss": 0.75764954, + "num_input_tokens_seen": 341855895, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.17944336, + "step": 15851, + "time_per_iteration": 2.909142017364502 + }, + { + "auxiliary_loss_clip": 0.01396554, + "auxiliary_loss_mlp": 0.01032133, + "balance_loss_clip": 1.23753262, + "balance_loss_mlp": 1.01329803, + "epoch": 0.95307380129265, + "flos": 22168673631360.0, + "grad_norm": 2.4796112619253754, + "language_loss": 0.80360329, + "learning_rate": 2.3026268798403525e-08, + "loss": 0.82789016, + "num_input_tokens_seen": 341875240, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18847656, + "step": 15852, + "time_per_iteration": 2.831716537475586 + }, + { + "auxiliary_loss_clip": 0.01401864, + "auxiliary_loss_mlp": 0.01037809, + "balance_loss_clip": 1.24156606, + "balance_loss_mlp": 1.01889038, + "epoch": 0.9531339245453179, + "flos": 44039599361280.0, + "grad_norm": 1.6346945477542283, + "language_loss": 0.60878903, + "learning_rate": 2.2967378121694138e-08, + "loss": 0.63318574, + "num_input_tokens_seen": 341901020, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18908691, + "step": 15853, + "time_per_iteration": 3.1342570781707764 + }, + { + "auxiliary_loss_clip": 0.01373562, + "auxiliary_loss_mlp": 0.01034092, + "balance_loss_clip": 1.21872437, + "balance_loss_mlp": 1.01581717, + "epoch": 0.9531940477979859, + "flos": 20276385321600.0, + "grad_norm": 4.960787403956611, + "language_loss": 0.72953123, + "learning_rate": 2.290856241425998e-08, + "loss": 0.75360775, + "num_input_tokens_seen": 341919365, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.18273926, + "step": 15854, + "time_per_iteration": 2.842256784439087 + }, + { + "auxiliary_loss_clip": 0.01406482, + "auxiliary_loss_mlp": 0.01032421, + "balance_loss_clip": 1.24564135, + "balance_loss_mlp": 1.01326394, + "epoch": 0.9532541710506538, + "flos": 25346099272320.0, + "grad_norm": 1.9728021402115516, + "language_loss": 0.69007266, + "learning_rate": 2.284982167833127e-08, + "loss": 0.71446168, + "num_input_tokens_seen": 341939985, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19165039, + "step": 15855, + "time_per_iteration": 2.8700456619262695 + }, + { + "auxiliary_loss_clip": 0.01387976, + "auxiliary_loss_mlp": 0.01031234, + "balance_loss_clip": 1.22954166, + "balance_loss_mlp": 1.01303113, + "epoch": 0.9533142943033218, + "flos": 26480598105600.0, + "grad_norm": 1.6558309682016235, + "language_loss": 0.77379066, + "learning_rate": 2.279115591613556e-08, + "loss": 0.79798281, + "num_input_tokens_seen": 341959255, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18212891, + "step": 15856, + "time_per_iteration": 2.8652262687683105 + }, + { + "auxiliary_loss_clip": 0.01396076, + "auxiliary_loss_mlp": 0.01031216, + "balance_loss_clip": 1.23777831, + "balance_loss_mlp": 1.01295292, + "epoch": 0.9533744175559897, + "flos": 23666760829440.0, + "grad_norm": 1.6880225201583716, + "language_loss": 0.78347957, + "learning_rate": 2.2732565129897075e-08, + "loss": 0.80775249, + "num_input_tokens_seen": 341977205, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18273926, + "step": 15857, + "time_per_iteration": 2.848681926727295 + }, + { + "auxiliary_loss_clip": 0.01179669, + "auxiliary_loss_mlp": 0.01021778, + "balance_loss_clip": 1.09272265, + "balance_loss_mlp": 1.00337172, + "epoch": 0.9534345408086577, + "flos": 61080319704960.0, + "grad_norm": 0.707449838416559, + "language_loss": 0.62671852, + "learning_rate": 2.267404932183803e-08, + "loss": 0.64873302, + "num_input_tokens_seen": 342038545, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.18359375, + "step": 15858, + "time_per_iteration": 4.750405550003052 + }, + { + "auxiliary_loss_clip": 0.0138986, + "auxiliary_loss_mlp": 0.01034684, + "balance_loss_clip": 1.23232436, + "balance_loss_mlp": 1.01599193, + "epoch": 0.9534946640613258, + "flos": 18960635243520.0, + "grad_norm": 1.6898571591385187, + "language_loss": 0.5762378, + "learning_rate": 2.2615608494177097e-08, + "loss": 0.6004833, + "num_input_tokens_seen": 342058195, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18701172, + "step": 15859, + "time_per_iteration": 2.8759772777557373 + }, + { + "auxiliary_loss_clip": 0.01381308, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.22729206, + "balance_loss_mlp": 1.01325619, + "epoch": 0.9535547873139937, + "flos": 16662970356480.0, + "grad_norm": 2.11819030642061, + "language_loss": 0.82721043, + "learning_rate": 2.2557242649130504e-08, + "loss": 0.8513338, + "num_input_tokens_seen": 342075025, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.17773438, + "step": 15860, + "time_per_iteration": 2.8158483505249023 + }, + { + "auxiliary_loss_clip": 0.01405244, + "auxiliary_loss_mlp": 0.01032154, + "balance_loss_clip": 1.24383092, + "balance_loss_mlp": 1.01372457, + "epoch": 0.9536149105666617, + "flos": 20677508887680.0, + "grad_norm": 1.8281519004467497, + "language_loss": 0.67025012, + "learning_rate": 2.249895178891159e-08, + "loss": 0.69462407, + "num_input_tokens_seen": 342094595, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18444824, + "step": 15861, + "time_per_iteration": 4.2691521644592285 + }, + { + "auxiliary_loss_clip": 0.0140867, + "auxiliary_loss_mlp": 0.01032747, + "balance_loss_clip": 1.24768555, + "balance_loss_mlp": 1.01379287, + "epoch": 0.9536750338193296, + "flos": 30712610983680.0, + "grad_norm": 2.555519967214081, + "language_loss": 0.66251743, + "learning_rate": 2.244073591573037e-08, + "loss": 0.68693161, + "num_input_tokens_seen": 342115970, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18945312, + "step": 15862, + "time_per_iteration": 2.922919273376465 + }, + { + "auxiliary_loss_clip": 0.0138757, + "auxiliary_loss_mlp": 0.01031472, + "balance_loss_clip": 1.23332739, + "balance_loss_mlp": 1.01275611, + "epoch": 0.9537351570719976, + "flos": 20413359889920.0, + "grad_norm": 1.5090424569901963, + "language_loss": 0.6865226, + "learning_rate": 2.238259503179485e-08, + "loss": 0.71071303, + "num_input_tokens_seen": 342134080, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.18725586, + "step": 15863, + "time_per_iteration": 2.849214553833008 + }, + { + "auxiliary_loss_clip": 0.01400595, + "auxiliary_loss_mlp": 0.01033456, + "balance_loss_clip": 1.24158835, + "balance_loss_mlp": 1.01447797, + "epoch": 0.9537952803246655, + "flos": 29939845478400.0, + "grad_norm": 1.7785916815490503, + "language_loss": 0.79125869, + "learning_rate": 2.2324529139309267e-08, + "loss": 0.81559914, + "num_input_tokens_seen": 342154725, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18969727, + "step": 15864, + "time_per_iteration": 2.9445626735687256 + }, + { + "auxiliary_loss_clip": 0.01389486, + "auxiliary_loss_mlp": 0.01032184, + "balance_loss_clip": 1.23339915, + "balance_loss_mlp": 1.0141716, + "epoch": 0.9538554035773336, + "flos": 20530987666560.0, + "grad_norm": 2.9700435384055743, + "language_loss": 0.60170639, + "learning_rate": 2.226653824047586e-08, + "loss": 0.6259231, + "num_input_tokens_seen": 342172275, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.18005371, + "step": 15865, + "time_per_iteration": 2.8739097118377686 + }, + { + "auxiliary_loss_clip": 0.01394806, + "auxiliary_loss_mlp": 0.01032642, + "balance_loss_clip": 1.23521757, + "balance_loss_mlp": 1.01352119, + "epoch": 0.9539155268300015, + "flos": 18415886123520.0, + "grad_norm": 1.7476345161124003, + "language_loss": 0.71298218, + "learning_rate": 2.2208622337493765e-08, + "loss": 0.73725665, + "num_input_tokens_seen": 342190880, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.19104004, + "step": 15866, + "time_per_iteration": 2.8247647285461426 + }, + { + "auxiliary_loss_clip": 0.01406831, + "auxiliary_loss_mlp": 0.01035806, + "balance_loss_clip": 1.24698305, + "balance_loss_mlp": 1.01629102, + "epoch": 0.9539756500826695, + "flos": 26224366947840.0, + "grad_norm": 2.452044262528461, + "language_loss": 0.8577137, + "learning_rate": 2.215078143255855e-08, + "loss": 0.88214004, + "num_input_tokens_seen": 342208165, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.1953125, + "step": 15867, + "time_per_iteration": 2.9008405208587646 + }, + { + "auxiliary_loss_clip": 0.01181084, + "auxiliary_loss_mlp": 0.01032184, + "balance_loss_clip": 1.0931251, + "balance_loss_mlp": 1.00586259, + "epoch": 0.9540357733353374, + "flos": 68322578112000.0, + "grad_norm": 0.7576184641944876, + "language_loss": 0.61811602, + "learning_rate": 2.2093015527864024e-08, + "loss": 0.64024872, + "num_input_tokens_seen": 342277110, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.26367188, + "step": 15868, + "time_per_iteration": 4.815367937088013 + }, + { + "auxiliary_loss_clip": 0.01396548, + "auxiliary_loss_mlp": 0.01033723, + "balance_loss_clip": 1.23690403, + "balance_loss_mlp": 1.01491189, + "epoch": 0.9540958965880054, + "flos": 21298233306240.0, + "grad_norm": 1.9207224596212447, + "language_loss": 0.61131704, + "learning_rate": 2.2035324625600425e-08, + "loss": 0.6356197, + "num_input_tokens_seen": 342294695, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18798828, + "step": 15869, + "time_per_iteration": 4.2893900871276855 + }, + { + "auxiliary_loss_clip": 0.01402326, + "auxiliary_loss_mlp": 0.01035811, + "balance_loss_clip": 1.2433064, + "balance_loss_mlp": 1.01857376, + "epoch": 0.9541560198406733, + "flos": 19759805729280.0, + "grad_norm": 2.1062487875905265, + "language_loss": 0.71989089, + "learning_rate": 2.197770872795579e-08, + "loss": 0.74427223, + "num_input_tokens_seen": 342314970, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.17248535, + "step": 15870, + "time_per_iteration": 2.8657803535461426 + }, + { + "auxiliary_loss_clip": 0.01398742, + "auxiliary_loss_mlp": 0.01031868, + "balance_loss_clip": 1.24047995, + "balance_loss_mlp": 1.01281857, + "epoch": 0.9542161430933414, + "flos": 24725872546560.0, + "grad_norm": 1.868446974620835, + "language_loss": 0.77609062, + "learning_rate": 2.1920167837114368e-08, + "loss": 0.80039668, + "num_input_tokens_seen": 342334255, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.19042969, + "step": 15871, + "time_per_iteration": 2.9891533851623535 + }, + { + "auxiliary_loss_clip": 0.01400104, + "auxiliary_loss_mlp": 0.01036376, + "balance_loss_clip": 1.24089837, + "balance_loss_mlp": 1.01748109, + "epoch": 0.9542762663460094, + "flos": 31078144874880.0, + "grad_norm": 1.8158997744210454, + "language_loss": 0.5890249, + "learning_rate": 2.1862701955258634e-08, + "loss": 0.61338973, + "num_input_tokens_seen": 342354730, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18896484, + "step": 15872, + "time_per_iteration": 2.959434986114502 + }, + { + "auxiliary_loss_clip": 0.01405536, + "auxiliary_loss_mlp": 0.01037599, + "balance_loss_clip": 1.24323893, + "balance_loss_mlp": 1.01817989, + "epoch": 0.9543363895986773, + "flos": 20786359173120.0, + "grad_norm": 1.8143159291535895, + "language_loss": 0.75694871, + "learning_rate": 2.1805311084567514e-08, + "loss": 0.78138006, + "num_input_tokens_seen": 342374565, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19433594, + "step": 15873, + "time_per_iteration": 2.8488261699676514 + }, + { + "auxiliary_loss_clip": 0.01399105, + "auxiliary_loss_mlp": 0.01032002, + "balance_loss_clip": 1.23792398, + "balance_loss_mlp": 1.01246393, + "epoch": 0.9543965128513453, + "flos": 24473577686400.0, + "grad_norm": 6.029658357416853, + "language_loss": 0.62882364, + "learning_rate": 2.1747995227217265e-08, + "loss": 0.6531347, + "num_input_tokens_seen": 342394590, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.1953125, + "step": 15874, + "time_per_iteration": 2.8589696884155273 + }, + { + "auxiliary_loss_clip": 0.01383961, + "auxiliary_loss_mlp": 0.0103528, + "balance_loss_clip": 1.22771561, + "balance_loss_mlp": 1.01596808, + "epoch": 0.9544566361040132, + "flos": 15268213578240.0, + "grad_norm": 1.934679503874201, + "language_loss": 0.90013587, + "learning_rate": 2.169075438538104e-08, + "loss": 0.92432833, + "num_input_tokens_seen": 342410445, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.19311523, + "step": 15875, + "time_per_iteration": 2.847459316253662 + }, + { + "auxiliary_loss_clip": 0.01413104, + "auxiliary_loss_mlp": 0.01033128, + "balance_loss_clip": 1.24942029, + "balance_loss_mlp": 1.01332712, + "epoch": 0.9545167593566812, + "flos": 25929107510400.0, + "grad_norm": 1.6522967240216369, + "language_loss": 0.68559206, + "learning_rate": 2.1633588561229765e-08, + "loss": 0.71005434, + "num_input_tokens_seen": 342430970, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19799805, + "step": 15876, + "time_per_iteration": 2.8786566257476807 + }, + { + "auxiliary_loss_clip": 0.01409826, + "auxiliary_loss_mlp": 0.01032562, + "balance_loss_clip": 1.24729729, + "balance_loss_mlp": 1.01363182, + "epoch": 0.9545768826093491, + "flos": 25638779756160.0, + "grad_norm": 1.8570534509737187, + "language_loss": 0.70344943, + "learning_rate": 2.1576497756931267e-08, + "loss": 0.72787333, + "num_input_tokens_seen": 342449505, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.18920898, + "step": 15877, + "time_per_iteration": 2.901604413986206 + }, + { + "auxiliary_loss_clip": 0.01410607, + "auxiliary_loss_mlp": 0.01030514, + "balance_loss_clip": 1.24965453, + "balance_loss_mlp": 1.01132131, + "epoch": 0.9546370058620172, + "flos": 22501558759680.0, + "grad_norm": 1.6254123941452787, + "language_loss": 0.71379542, + "learning_rate": 2.1519481974650035e-08, + "loss": 0.73820662, + "num_input_tokens_seen": 342470390, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19177246, + "step": 15878, + "time_per_iteration": 2.870429277420044 + }, + { + "auxiliary_loss_clip": 0.01392309, + "auxiliary_loss_mlp": 0.01032532, + "balance_loss_clip": 1.23436975, + "balance_loss_mlp": 1.01392293, + "epoch": 0.9546971291146851, + "flos": 24620687089920.0, + "grad_norm": 1.2839911303183849, + "language_loss": 0.68647313, + "learning_rate": 2.1462541216548335e-08, + "loss": 0.71072155, + "num_input_tokens_seen": 342492560, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18615723, + "step": 15879, + "time_per_iteration": 2.9011917114257812 + }, + { + "auxiliary_loss_clip": 0.01390249, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.23346448, + "balance_loss_mlp": 1.01126862, + "epoch": 0.9547572523673531, + "flos": 28669457952000.0, + "grad_norm": 1.88028076060204, + "language_loss": 0.85623682, + "learning_rate": 2.1405675484785334e-08, + "loss": 0.88043857, + "num_input_tokens_seen": 342512315, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.18640137, + "step": 15880, + "time_per_iteration": 2.913128614425659 + }, + { + "auxiliary_loss_clip": 0.01394798, + "auxiliary_loss_mlp": 0.0103366, + "balance_loss_clip": 1.23480177, + "balance_loss_mlp": 1.01464641, + "epoch": 0.954817375620021, + "flos": 33815735383680.0, + "grad_norm": 1.976869660491038, + "language_loss": 0.72489411, + "learning_rate": 2.134888478151753e-08, + "loss": 0.74917865, + "num_input_tokens_seen": 342533060, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.19006348, + "step": 15881, + "time_per_iteration": 2.9521148204803467 + }, + { + "auxiliary_loss_clip": 0.01397973, + "auxiliary_loss_mlp": 0.01036225, + "balance_loss_clip": 1.24083328, + "balance_loss_mlp": 1.01739049, + "epoch": 0.954877498872689, + "flos": 14436213350400.0, + "grad_norm": 2.2614401133578395, + "language_loss": 0.72647333, + "learning_rate": 2.1292169108898083e-08, + "loss": 0.75081533, + "num_input_tokens_seen": 342550830, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.18835449, + "step": 15882, + "time_per_iteration": 2.822021007537842 + }, + { + "auxiliary_loss_clip": 0.01409776, + "auxiliary_loss_mlp": 0.01036682, + "balance_loss_clip": 1.24819458, + "balance_loss_mlp": 1.01746583, + "epoch": 0.9549376221253569, + "flos": 59289126837120.0, + "grad_norm": 1.6343625639803911, + "language_loss": 0.66972554, + "learning_rate": 2.1235528469078168e-08, + "loss": 0.69419014, + "num_input_tokens_seen": 342575070, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19213867, + "step": 15883, + "time_per_iteration": 3.209477424621582 + }, + { + "auxiliary_loss_clip": 0.01402918, + "auxiliary_loss_mlp": 0.01037048, + "balance_loss_clip": 1.24280405, + "balance_loss_mlp": 1.01724744, + "epoch": 0.954997745378025, + "flos": 17283197082240.0, + "grad_norm": 4.170039004690641, + "language_loss": 0.78890288, + "learning_rate": 2.1178962864205175e-08, + "loss": 0.81330252, + "num_input_tokens_seen": 342592215, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.19812012, + "step": 15884, + "time_per_iteration": 2.7932369709014893 + }, + { + "auxiliary_loss_clip": 0.01410589, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.24755156, + "balance_loss_mlp": 1.0125885, + "epoch": 0.955057868630693, + "flos": 13014282430080.0, + "grad_norm": 1.6362507025605773, + "language_loss": 0.78353941, + "learning_rate": 2.1122472296424054e-08, + "loss": 0.8079679, + "num_input_tokens_seen": 342610030, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19641113, + "step": 15885, + "time_per_iteration": 2.826247453689575 + }, + { + "auxiliary_loss_clip": 0.0140108, + "auxiliary_loss_mlp": 0.01030417, + "balance_loss_clip": 1.2411449, + "balance_loss_mlp": 1.01197577, + "epoch": 0.9551179918833609, + "flos": 22647853756800.0, + "grad_norm": 2.1301254857593714, + "language_loss": 0.70891702, + "learning_rate": 2.1066056767877317e-08, + "loss": 0.73323202, + "num_input_tokens_seen": 342626475, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18444824, + "step": 15886, + "time_per_iteration": 2.8400673866271973 + }, + { + "auxiliary_loss_clip": 0.01409128, + "auxiliary_loss_mlp": 0.01037965, + "balance_loss_clip": 1.24548864, + "balance_loss_mlp": 1.01854599, + "epoch": 0.9551781151360289, + "flos": 21552609427200.0, + "grad_norm": 1.6900708472766617, + "language_loss": 0.73697108, + "learning_rate": 2.1009716280703916e-08, + "loss": 0.76144195, + "num_input_tokens_seen": 342646645, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19421387, + "step": 15887, + "time_per_iteration": 2.9240052700042725 + }, + { + "auxiliary_loss_clip": 0.01377792, + "auxiliary_loss_mlp": 0.01030218, + "balance_loss_clip": 1.22266269, + "balance_loss_mlp": 1.01160932, + "epoch": 0.9552382383886968, + "flos": 20711469749760.0, + "grad_norm": 6.862932880003928, + "language_loss": 0.57441646, + "learning_rate": 2.0953450837040364e-08, + "loss": 0.59849656, + "num_input_tokens_seen": 342663615, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.18603516, + "step": 15888, + "time_per_iteration": 2.8672680854797363 + }, + { + "auxiliary_loss_clip": 0.01182454, + "auxiliary_loss_mlp": 0.01020011, + "balance_loss_clip": 1.09457421, + "balance_loss_mlp": 0.99893492, + "epoch": 0.9552983616413648, + "flos": 67800931102080.0, + "grad_norm": 0.7101981580259676, + "language_loss": 0.57902008, + "learning_rate": 2.0897260439020514e-08, + "loss": 0.60104471, + "num_input_tokens_seen": 342728275, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.2109375, + "step": 15889, + "time_per_iteration": 3.4000484943389893 + }, + { + "auxiliary_loss_clip": 0.01406596, + "auxiliary_loss_mlp": 0.01032169, + "balance_loss_clip": 1.24331808, + "balance_loss_mlp": 1.01253545, + "epoch": 0.9553584848940327, + "flos": 21589963649280.0, + "grad_norm": 1.3936224899378897, + "language_loss": 0.67988598, + "learning_rate": 2.084114508877466e-08, + "loss": 0.70427364, + "num_input_tokens_seen": 342748860, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19628906, + "step": 15890, + "time_per_iteration": 2.8923583030700684 + }, + { + "auxiliary_loss_clip": 0.01393251, + "auxiliary_loss_mlp": 0.01029567, + "balance_loss_clip": 1.23614001, + "balance_loss_mlp": 1.01225758, + "epoch": 0.9554186081467008, + "flos": 24219156320640.0, + "grad_norm": 1.4654916007782082, + "language_loss": 0.74561226, + "learning_rate": 2.0785104788430874e-08, + "loss": 0.76984036, + "num_input_tokens_seen": 342769705, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.17297363, + "step": 15891, + "time_per_iteration": 2.8617002964019775 + }, + { + "auxiliary_loss_clip": 0.01387799, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.23370886, + "balance_loss_mlp": 1.01152372, + "epoch": 0.9554787313993687, + "flos": 16259765529600.0, + "grad_norm": 1.9842574461789808, + "language_loss": 0.78733993, + "learning_rate": 2.072913954011435e-08, + "loss": 0.81151277, + "num_input_tokens_seen": 342787000, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.17956543, + "step": 15892, + "time_per_iteration": 2.861238479614258 + }, + { + "auxiliary_loss_clip": 0.01403395, + "auxiliary_loss_mlp": 0.01038633, + "balance_loss_clip": 1.2449832, + "balance_loss_mlp": 1.01872551, + "epoch": 0.9555388546520367, + "flos": 23414556458880.0, + "grad_norm": 1.4626697580634043, + "language_loss": 0.70621395, + "learning_rate": 2.0673249345947386e-08, + "loss": 0.73063421, + "num_input_tokens_seen": 342807795, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.19921875, + "step": 15893, + "time_per_iteration": 4.308905601501465 + }, + { + "auxiliary_loss_clip": 0.01391286, + "auxiliary_loss_mlp": 0.01036234, + "balance_loss_clip": 1.23428226, + "balance_loss_mlp": 1.01546812, + "epoch": 0.9555989779047046, + "flos": 14802380668800.0, + "grad_norm": 2.001226949858358, + "language_loss": 0.66305614, + "learning_rate": 2.0617434208048955e-08, + "loss": 0.68733132, + "num_input_tokens_seen": 342825490, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.2076416, + "step": 15894, + "time_per_iteration": 2.8000314235687256 + }, + { + "auxiliary_loss_clip": 0.01400653, + "auxiliary_loss_mlp": 0.01033232, + "balance_loss_clip": 1.23822999, + "balance_loss_mlp": 1.01421833, + "epoch": 0.9556591011573726, + "flos": 22246911169920.0, + "grad_norm": 1.8250761635407013, + "language_loss": 0.82334942, + "learning_rate": 2.056169412853581e-08, + "loss": 0.8476882, + "num_input_tokens_seen": 342844965, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19018555, + "step": 15895, + "time_per_iteration": 2.8413095474243164 + }, + { + "auxiliary_loss_clip": 0.01394551, + "auxiliary_loss_mlp": 0.01030182, + "balance_loss_clip": 1.23384762, + "balance_loss_mlp": 1.01060796, + "epoch": 0.9557192244100405, + "flos": 27866848861440.0, + "grad_norm": 1.5514381710825422, + "language_loss": 0.73076057, + "learning_rate": 2.0506029109521593e-08, + "loss": 0.75500786, + "num_input_tokens_seen": 342865915, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19567871, + "step": 15896, + "time_per_iteration": 4.343982458114624 + }, + { + "auxiliary_loss_clip": 0.01388182, + "auxiliary_loss_mlp": 0.01031104, + "balance_loss_clip": 1.23079884, + "balance_loss_mlp": 1.01280546, + "epoch": 0.9557793476627086, + "flos": 17611693464960.0, + "grad_norm": 2.4316381135997718, + "language_loss": 0.80515492, + "learning_rate": 2.045043915311706e-08, + "loss": 0.82934785, + "num_input_tokens_seen": 342884000, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.18286133, + "step": 15897, + "time_per_iteration": 2.7978665828704834 + }, + { + "auxiliary_loss_clip": 0.01393696, + "auxiliary_loss_mlp": 0.01033139, + "balance_loss_clip": 1.2341367, + "balance_loss_mlp": 1.0138272, + "epoch": 0.9558394709153766, + "flos": 23885275806720.0, + "grad_norm": 2.307662601744314, + "language_loss": 0.73452127, + "learning_rate": 2.03949242614303e-08, + "loss": 0.75878954, + "num_input_tokens_seen": 342903095, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.19311523, + "step": 15898, + "time_per_iteration": 2.9069690704345703 + }, + { + "auxiliary_loss_clip": 0.01183108, + "auxiliary_loss_mlp": 0.01036385, + "balance_loss_clip": 1.0945462, + "balance_loss_mlp": 1.0096823, + "epoch": 0.9558995941680445, + "flos": 53708371142400.0, + "grad_norm": 0.9118544183450444, + "language_loss": 0.5241133, + "learning_rate": 2.033948443656652e-08, + "loss": 0.54630822, + "num_input_tokens_seen": 342958155, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 0.26757812, + "step": 15899, + "time_per_iteration": 3.2890727519989014 + }, + { + "auxiliary_loss_clip": 0.01426617, + "auxiliary_loss_mlp": 0.01035887, + "balance_loss_clip": 1.26141846, + "balance_loss_mlp": 1.01606202, + "epoch": 0.9559597174207125, + "flos": 13770488338560.0, + "grad_norm": 2.0689336657193773, + "language_loss": 0.69098878, + "learning_rate": 2.028411968062782e-08, + "loss": 0.71561378, + "num_input_tokens_seen": 342972500, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.19824219, + "step": 15900, + "time_per_iteration": 2.8987224102020264 + }, + { + "auxiliary_loss_clip": 0.01397829, + "auxiliary_loss_mlp": 0.01031408, + "balance_loss_clip": 1.23650563, + "balance_loss_mlp": 1.01135731, + "epoch": 0.9560198406733804, + "flos": 19945445719680.0, + "grad_norm": 1.9944706489616386, + "language_loss": 0.83425725, + "learning_rate": 2.0228829995713627e-08, + "loss": 0.85854959, + "num_input_tokens_seen": 342989035, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.20031738, + "step": 15901, + "time_per_iteration": 2.8445117473602295 + }, + { + "auxiliary_loss_clip": 0.01186245, + "auxiliary_loss_mlp": 0.01030408, + "balance_loss_clip": 1.09758437, + "balance_loss_mlp": 1.00866449, + "epoch": 0.9560799639260484, + "flos": 57315678059520.0, + "grad_norm": 0.7101445764899141, + "language_loss": 0.54384083, + "learning_rate": 2.0173615383920485e-08, + "loss": 0.56600738, + "num_input_tokens_seen": 343051675, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 0.21777344, + "step": 15902, + "time_per_iteration": 3.3556182384490967 + }, + { + "auxiliary_loss_clip": 0.01382513, + "auxiliary_loss_mlp": 0.01031117, + "balance_loss_clip": 1.22898197, + "balance_loss_mlp": 1.01306832, + "epoch": 0.9561400871787163, + "flos": 18926629136640.0, + "grad_norm": 1.6322659766613274, + "language_loss": 0.8524884, + "learning_rate": 2.01184758473425e-08, + "loss": 0.8766247, + "num_input_tokens_seen": 343068895, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.18041992, + "step": 15903, + "time_per_iteration": 4.26308274269104 + }, + { + "auxiliary_loss_clip": 0.01397699, + "auxiliary_loss_mlp": 0.01028308, + "balance_loss_clip": 1.23766565, + "balance_loss_mlp": 1.01074862, + "epoch": 0.9562002104313844, + "flos": 18047049361920.0, + "grad_norm": 2.2340023302275522, + "language_loss": 0.81101918, + "learning_rate": 2.0063411388070217e-08, + "loss": 0.83527923, + "num_input_tokens_seen": 343087115, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.17553711, + "step": 15904, + "time_per_iteration": 4.240625858306885 + }, + { + "auxiliary_loss_clip": 0.01406775, + "auxiliary_loss_mlp": 0.01030734, + "balance_loss_clip": 1.24506521, + "balance_loss_mlp": 1.01176798, + "epoch": 0.9562603336840523, + "flos": 24728451500160.0, + "grad_norm": 2.101843115390627, + "language_loss": 0.60522354, + "learning_rate": 2.0008422008191972e-08, + "loss": 0.62959862, + "num_input_tokens_seen": 343105575, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18969727, + "step": 15905, + "time_per_iteration": 2.92730712890625 + }, + { + "auxiliary_loss_clip": 0.01396968, + "auxiliary_loss_mlp": 0.01030366, + "balance_loss_clip": 1.23869002, + "balance_loss_mlp": 1.01243651, + "epoch": 0.9563204569367203, + "flos": 21186713577600.0, + "grad_norm": 2.037146351835429, + "language_loss": 0.71417058, + "learning_rate": 1.995350770979254e-08, + "loss": 0.73844391, + "num_input_tokens_seen": 343123025, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.17919922, + "step": 15906, + "time_per_iteration": 2.8485374450683594 + }, + { + "auxiliary_loss_clip": 0.01414479, + "auxiliary_loss_mlp": 0.01033528, + "balance_loss_clip": 1.25092053, + "balance_loss_mlp": 1.01468122, + "epoch": 0.9563805801893882, + "flos": 20239302568320.0, + "grad_norm": 1.6636157776593996, + "language_loss": 0.71975827, + "learning_rate": 1.9898668494954473e-08, + "loss": 0.74423838, + "num_input_tokens_seen": 343141625, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.18835449, + "step": 15907, + "time_per_iteration": 2.8828699588775635 + }, + { + "auxiliary_loss_clip": 0.0137969, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.22367084, + "balance_loss_mlp": 1.01484823, + "epoch": 0.9564407034420562, + "flos": 25421984081280.0, + "grad_norm": 2.0041098637770327, + "language_loss": 0.7092272, + "learning_rate": 1.9843904365757447e-08, + "loss": 0.733356, + "num_input_tokens_seen": 343161300, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.18347168, + "step": 15908, + "time_per_iteration": 2.8588943481445312 + }, + { + "auxiliary_loss_clip": 0.01396499, + "auxiliary_loss_mlp": 0.01031581, + "balance_loss_clip": 1.23777831, + "balance_loss_mlp": 1.01328254, + "epoch": 0.9565008266947241, + "flos": 18632908022400.0, + "grad_norm": 2.0101121520559073, + "language_loss": 0.83753681, + "learning_rate": 1.978921532427802e-08, + "loss": 0.86181754, + "num_input_tokens_seen": 343177815, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.1829834, + "step": 15909, + "time_per_iteration": 2.837265729904175 + }, + { + "auxiliary_loss_clip": 0.01392089, + "auxiliary_loss_mlp": 0.01033437, + "balance_loss_clip": 1.23393071, + "balance_loss_mlp": 1.01481628, + "epoch": 0.9565609499473922, + "flos": 24872574746880.0, + "grad_norm": 2.671253757447132, + "language_loss": 0.69059706, + "learning_rate": 1.9734601372590086e-08, + "loss": 0.71485233, + "num_input_tokens_seen": 343198140, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18640137, + "step": 15910, + "time_per_iteration": 2.895603895187378 + }, + { + "auxiliary_loss_clip": 0.01414818, + "auxiliary_loss_mlp": 0.01035236, + "balance_loss_clip": 1.2507689, + "balance_loss_mlp": 1.01604295, + "epoch": 0.9566210732000601, + "flos": 21808388136960.0, + "grad_norm": 1.885149761328476, + "language_loss": 0.75551373, + "learning_rate": 1.968006251276444e-08, + "loss": 0.78001428, + "num_input_tokens_seen": 343218280, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.1920166, + "step": 15911, + "time_per_iteration": 2.8970866203308105 + }, + { + "auxiliary_loss_clip": 0.01397158, + "auxiliary_loss_mlp": 0.01034083, + "balance_loss_clip": 1.23740411, + "balance_loss_mlp": 1.01543832, + "epoch": 0.9566811964527281, + "flos": 18706847304960.0, + "grad_norm": 2.1654944345425133, + "language_loss": 0.70007735, + "learning_rate": 1.9625598746869198e-08, + "loss": 0.72438973, + "num_input_tokens_seen": 343236850, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18664551, + "step": 15912, + "time_per_iteration": 2.875248670578003 + }, + { + "auxiliary_loss_clip": 0.0140795, + "auxiliary_loss_mlp": 0.01034926, + "balance_loss_clip": 1.24821234, + "balance_loss_mlp": 1.0167706, + "epoch": 0.9567413197053961, + "flos": 13007043262080.0, + "grad_norm": 2.3164393436225943, + "language_loss": 0.7330451, + "learning_rate": 1.95712100769696e-08, + "loss": 0.75747383, + "num_input_tokens_seen": 343253065, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18164062, + "step": 15913, + "time_per_iteration": 2.7933006286621094 + }, + { + "auxiliary_loss_clip": 0.01395637, + "auxiliary_loss_mlp": 0.01030606, + "balance_loss_clip": 1.23695731, + "balance_loss_mlp": 1.01239085, + "epoch": 0.956801442958064, + "flos": 19728785779200.0, + "grad_norm": 1.7103164058743772, + "language_loss": 0.7361275, + "learning_rate": 1.9516896505128444e-08, + "loss": 0.76038992, + "num_input_tokens_seen": 343270330, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18212891, + "step": 15914, + "time_per_iteration": 2.914193630218506 + }, + { + "auxiliary_loss_clip": 0.01394114, + "auxiliary_loss_mlp": 0.01029292, + "balance_loss_clip": 1.23613596, + "balance_loss_mlp": 1.01117206, + "epoch": 0.956861566210732, + "flos": 18231829701120.0, + "grad_norm": 1.6059951152801648, + "language_loss": 0.67772067, + "learning_rate": 1.9462658033404965e-08, + "loss": 0.70195466, + "num_input_tokens_seen": 343289625, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18115234, + "step": 15915, + "time_per_iteration": 2.844977617263794 + }, + { + "auxiliary_loss_clip": 0.01387934, + "auxiliary_loss_mlp": 0.0103145, + "balance_loss_clip": 1.23226702, + "balance_loss_mlp": 1.01267481, + "epoch": 0.9569216894634, + "flos": 22206254077440.0, + "grad_norm": 1.7405566212283887, + "language_loss": 0.64775282, + "learning_rate": 1.9408494663855967e-08, + "loss": 0.67194664, + "num_input_tokens_seen": 343309200, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.18762207, + "step": 15916, + "time_per_iteration": 2.8971574306488037 + }, + { + "auxiliary_loss_clip": 0.01383544, + "auxiliary_loss_mlp": 0.01032046, + "balance_loss_clip": 1.23026597, + "balance_loss_mlp": 1.01344967, + "epoch": 0.956981812716068, + "flos": 21699266382720.0, + "grad_norm": 2.211904581265226, + "language_loss": 0.81215155, + "learning_rate": 1.935440639853536e-08, + "loss": 0.83630753, + "num_input_tokens_seen": 343326270, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.18579102, + "step": 15917, + "time_per_iteration": 2.8474903106689453 + }, + { + "auxiliary_loss_clip": 0.01391793, + "auxiliary_loss_mlp": 0.01034532, + "balance_loss_clip": 1.2345835, + "balance_loss_mlp": 1.01574445, + "epoch": 0.9570419359687359, + "flos": 13998911927040.0, + "grad_norm": 2.3907464067577164, + "language_loss": 0.73953831, + "learning_rate": 1.9300393239494172e-08, + "loss": 0.76380157, + "num_input_tokens_seen": 343344430, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18798828, + "step": 15918, + "time_per_iteration": 2.8160483837127686 + }, + { + "auxiliary_loss_clip": 0.01182809, + "auxiliary_loss_mlp": 0.01032522, + "balance_loss_clip": 1.09442639, + "balance_loss_mlp": 1.01039696, + "epoch": 0.9571020592214039, + "flos": 65231986521600.0, + "grad_norm": 0.6622437235541858, + "language_loss": 0.53172708, + "learning_rate": 1.924645518878032e-08, + "loss": 0.55388039, + "num_input_tokens_seen": 343416155, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.22167969, + "step": 15919, + "time_per_iteration": 3.482100009918213 + }, + { + "auxiliary_loss_clip": 0.01404293, + "auxiliary_loss_mlp": 0.01034823, + "balance_loss_clip": 1.24132168, + "balance_loss_mlp": 1.01529682, + "epoch": 0.9571621824740718, + "flos": 17392952263680.0, + "grad_norm": 2.4026765964196604, + "language_loss": 0.7709893, + "learning_rate": 1.919259224843972e-08, + "loss": 0.79538047, + "num_input_tokens_seen": 343431715, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.1953125, + "step": 15920, + "time_per_iteration": 2.826139450073242 + }, + { + "auxiliary_loss_clip": 0.01411195, + "auxiliary_loss_mlp": 0.01031162, + "balance_loss_clip": 1.24931574, + "balance_loss_mlp": 1.01248217, + "epoch": 0.9572223057267398, + "flos": 14546330490240.0, + "grad_norm": 1.7082478410590913, + "language_loss": 0.80244678, + "learning_rate": 1.9138804420514298e-08, + "loss": 0.82687032, + "num_input_tokens_seen": 343450425, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.18701172, + "step": 15921, + "time_per_iteration": 2.878636598587036 + }, + { + "auxiliary_loss_clip": 0.01408382, + "auxiliary_loss_mlp": 0.01031629, + "balance_loss_clip": 1.24338651, + "balance_loss_mlp": 1.01284218, + "epoch": 0.9572824289794077, + "flos": 33960763526400.0, + "grad_norm": 1.8985959038551643, + "language_loss": 0.51774931, + "learning_rate": 1.9085091707044197e-08, + "loss": 0.54214942, + "num_input_tokens_seen": 343470445, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.18786621, + "step": 15922, + "time_per_iteration": 2.98093843460083 + }, + { + "auxiliary_loss_clip": 0.01407375, + "auxiliary_loss_mlp": 0.01035257, + "balance_loss_clip": 1.24690211, + "balance_loss_mlp": 1.01637423, + "epoch": 0.9573425522320758, + "flos": 18703453944960.0, + "grad_norm": 1.9618971157845593, + "language_loss": 0.84303558, + "learning_rate": 1.903145411006557e-08, + "loss": 0.86746192, + "num_input_tokens_seen": 343485200, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18884277, + "step": 15923, + "time_per_iteration": 2.868478298187256 + }, + { + "auxiliary_loss_clip": 0.01393947, + "auxiliary_loss_mlp": 0.01032139, + "balance_loss_clip": 1.23502922, + "balance_loss_mlp": 1.01370955, + "epoch": 0.9574026754847437, + "flos": 28521579386880.0, + "grad_norm": 1.8861003329535757, + "language_loss": 0.75826204, + "learning_rate": 1.8977891631613008e-08, + "loss": 0.78252292, + "num_input_tokens_seen": 343505080, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18432617, + "step": 15924, + "time_per_iteration": 2.90383243560791 + }, + { + "auxiliary_loss_clip": 0.01396227, + "auxiliary_loss_mlp": 0.01036992, + "balance_loss_clip": 1.23566914, + "balance_loss_mlp": 1.01742959, + "epoch": 0.9574627987374117, + "flos": 24362962853760.0, + "grad_norm": 1.866327912443195, + "language_loss": 0.86801052, + "learning_rate": 1.892440427371711e-08, + "loss": 0.89234269, + "num_input_tokens_seen": 343523995, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19567871, + "step": 15925, + "time_per_iteration": 2.863368272781372 + }, + { + "auxiliary_loss_clip": 0.01407749, + "auxiliary_loss_mlp": 0.01030646, + "balance_loss_clip": 1.24531174, + "balance_loss_mlp": 1.01095295, + "epoch": 0.9575229219900797, + "flos": 23520375342720.0, + "grad_norm": 1.672995618428822, + "language_loss": 0.76187968, + "learning_rate": 1.8870992038406474e-08, + "loss": 0.78626364, + "num_input_tokens_seen": 343542015, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.19702148, + "step": 15926, + "time_per_iteration": 2.871328115463257 + }, + { + "auxiliary_loss_clip": 0.01401797, + "auxiliary_loss_mlp": 0.01032614, + "balance_loss_clip": 1.2417661, + "balance_loss_mlp": 1.01457775, + "epoch": 0.9575830452427476, + "flos": 22685162734080.0, + "grad_norm": 2.1373921311083284, + "language_loss": 0.78319335, + "learning_rate": 1.8817654927706373e-08, + "loss": 0.80753738, + "num_input_tokens_seen": 343561680, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.18041992, + "step": 15927, + "time_per_iteration": 2.867051362991333 + }, + { + "auxiliary_loss_clip": 0.01411456, + "auxiliary_loss_mlp": 0.01034958, + "balance_loss_clip": 1.24881935, + "balance_loss_mlp": 1.01452541, + "epoch": 0.9576431684954156, + "flos": 30498258528000.0, + "grad_norm": 1.5949741542920641, + "language_loss": 0.69831818, + "learning_rate": 1.8764392943639183e-08, + "loss": 0.72278231, + "num_input_tokens_seen": 343585290, + "router_z_loss_clip": 1.62597656, + "router_z_loss_mlp": 0.2043457, + "step": 15928, + "time_per_iteration": 4.338857173919678 + }, + { + "auxiliary_loss_clip": 0.01413673, + "auxiliary_loss_mlp": 0.01033196, + "balance_loss_clip": 1.25203776, + "balance_loss_mlp": 1.01390767, + "epoch": 0.9577032917480836, + "flos": 21696868408320.0, + "grad_norm": 2.7080583133661835, + "language_loss": 0.82586855, + "learning_rate": 1.871120608822485e-08, + "loss": 0.85033727, + "num_input_tokens_seen": 343604045, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19287109, + "step": 15929, + "time_per_iteration": 2.831333875656128 + }, + { + "auxiliary_loss_clip": 0.01416215, + "auxiliary_loss_mlp": 0.01040525, + "balance_loss_clip": 1.25052547, + "balance_loss_mlp": 1.02153492, + "epoch": 0.9577634150007516, + "flos": 29035082332800.0, + "grad_norm": 1.386325477089829, + "language_loss": 0.72703505, + "learning_rate": 1.8658094363480202e-08, + "loss": 0.75160247, + "num_input_tokens_seen": 343626595, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.18994141, + "step": 15930, + "time_per_iteration": 2.8823630809783936 + }, + { + "auxiliary_loss_clip": 0.01394673, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.23662257, + "balance_loss_mlp": 1.01216757, + "epoch": 0.9578235382534195, + "flos": 19291846314240.0, + "grad_norm": 1.4323384932987289, + "language_loss": 0.63404047, + "learning_rate": 1.8605057771419185e-08, + "loss": 0.65829313, + "num_input_tokens_seen": 343646195, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.1842041, + "step": 15931, + "time_per_iteration": 2.8245046138763428 + }, + { + "auxiliary_loss_clip": 0.01398722, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.24339294, + "balance_loss_mlp": 1.01222146, + "epoch": 0.9578836615060875, + "flos": 13707272073600.0, + "grad_norm": 1.7747104208642654, + "language_loss": 0.70464432, + "learning_rate": 1.8552096314052633e-08, + "loss": 0.72893941, + "num_input_tokens_seen": 343663665, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.18591309, + "step": 15932, + "time_per_iteration": 4.391408920288086 + }, + { + "auxiliary_loss_clip": 0.01414632, + "auxiliary_loss_mlp": 0.01039932, + "balance_loss_clip": 1.25004125, + "balance_loss_mlp": 1.02082288, + "epoch": 0.9579437847587554, + "flos": 17063008047360.0, + "grad_norm": 1.7337414930143524, + "language_loss": 0.7610358, + "learning_rate": 1.849920999338961e-08, + "loss": 0.78558147, + "num_input_tokens_seen": 343682145, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.19104004, + "step": 15933, + "time_per_iteration": 2.8614706993103027 + }, + { + "auxiliary_loss_clip": 0.01184144, + "auxiliary_loss_mlp": 0.0103566, + "balance_loss_clip": 1.09506178, + "balance_loss_mlp": 1.01029193, + "epoch": 0.9580039080114234, + "flos": 60597248526720.0, + "grad_norm": 0.6953348197317655, + "language_loss": 0.57300317, + "learning_rate": 1.8446398811434948e-08, + "loss": 0.59520125, + "num_input_tokens_seen": 343744685, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.25390625, + "step": 15934, + "time_per_iteration": 3.4744327068328857 + }, + { + "auxiliary_loss_clip": 0.01182072, + "auxiliary_loss_mlp": 0.01037453, + "balance_loss_clip": 1.09280562, + "balance_loss_mlp": 1.01408792, + "epoch": 0.9580640312640913, + "flos": 66265580171520.0, + "grad_norm": 0.9104009963431124, + "language_loss": 0.66008008, + "learning_rate": 1.8393662770191277e-08, + "loss": 0.6822753, + "num_input_tokens_seen": 343801835, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.23339844, + "step": 15935, + "time_per_iteration": 3.2301127910614014 + }, + { + "auxiliary_loss_clip": 0.01181466, + "auxiliary_loss_mlp": 0.01027187, + "balance_loss_clip": 1.09412766, + "balance_loss_mlp": 1.00582504, + "epoch": 0.9581241545167594, + "flos": 62246155201920.0, + "grad_norm": 0.7738931572379062, + "language_loss": 0.57114339, + "learning_rate": 1.8341001871658546e-08, + "loss": 0.59322989, + "num_input_tokens_seen": 343861515, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.21386719, + "step": 15936, + "time_per_iteration": 3.2709760665893555 + }, + { + "auxiliary_loss_clip": 0.01400376, + "auxiliary_loss_mlp": 0.01035283, + "balance_loss_clip": 1.23915136, + "balance_loss_mlp": 1.01545835, + "epoch": 0.9581842777694273, + "flos": 23778371047680.0, + "grad_norm": 1.9338743362812587, + "language_loss": 0.78869581, + "learning_rate": 1.8288416117833825e-08, + "loss": 0.81305242, + "num_input_tokens_seen": 343881240, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19836426, + "step": 15937, + "time_per_iteration": 2.8671751022338867 + }, + { + "auxiliary_loss_clip": 0.01403604, + "auxiliary_loss_mlp": 0.01037687, + "balance_loss_clip": 1.24439406, + "balance_loss_mlp": 1.01866138, + "epoch": 0.9582444010220953, + "flos": 21222438986880.0, + "grad_norm": 1.55967268721307, + "language_loss": 0.68627918, + "learning_rate": 1.8235905510710636e-08, + "loss": 0.71069211, + "num_input_tokens_seen": 343900885, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.19042969, + "step": 15938, + "time_per_iteration": 4.316569805145264 + }, + { + "auxiliary_loss_clip": 0.01404027, + "auxiliary_loss_mlp": 0.01033613, + "balance_loss_clip": 1.24284661, + "balance_loss_mlp": 1.01490927, + "epoch": 0.9583045242747633, + "flos": 23815589535360.0, + "grad_norm": 2.442810033534897, + "language_loss": 0.67877328, + "learning_rate": 1.8183470052280712e-08, + "loss": 0.70314968, + "num_input_tokens_seen": 343918460, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18701172, + "step": 15939, + "time_per_iteration": 4.309950590133667 + }, + { + "auxiliary_loss_clip": 0.0139138, + "auxiliary_loss_mlp": 0.01030321, + "balance_loss_clip": 1.23267221, + "balance_loss_mlp": 1.01179576, + "epoch": 0.9583646475274312, + "flos": 24141461719680.0, + "grad_norm": 1.5979306433430995, + "language_loss": 0.74492502, + "learning_rate": 1.8131109744532025e-08, + "loss": 0.76914203, + "num_input_tokens_seen": 343938030, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18505859, + "step": 15940, + "time_per_iteration": 2.883789539337158 + }, + { + "auxiliary_loss_clip": 0.01420263, + "auxiliary_loss_mlp": 0.01035705, + "balance_loss_clip": 1.25852203, + "balance_loss_mlp": 1.01677489, + "epoch": 0.9584247707800992, + "flos": 20896431068160.0, + "grad_norm": 1.8366086333890226, + "language_loss": 0.7332601, + "learning_rate": 1.8078824589450535e-08, + "loss": 0.75781977, + "num_input_tokens_seen": 343956635, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18920898, + "step": 15941, + "time_per_iteration": 2.8464715480804443 + }, + { + "auxiliary_loss_clip": 0.01398089, + "auxiliary_loss_mlp": 0.01032022, + "balance_loss_clip": 1.23833036, + "balance_loss_mlp": 1.01319861, + "epoch": 0.9584848940327672, + "flos": 26078705377920.0, + "grad_norm": 1.592446167468445, + "language_loss": 0.72714889, + "learning_rate": 1.8026614589018442e-08, + "loss": 0.75144994, + "num_input_tokens_seen": 343976625, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18823242, + "step": 15942, + "time_per_iteration": 3.0324931144714355 + }, + { + "auxiliary_loss_clip": 0.01408431, + "auxiliary_loss_mlp": 0.01034108, + "balance_loss_clip": 1.24639702, + "balance_loss_mlp": 1.01501036, + "epoch": 0.9585450172854352, + "flos": 34505557891200.0, + "grad_norm": 1.4885761095876966, + "language_loss": 0.72401917, + "learning_rate": 1.797447974521571e-08, + "loss": 0.74844456, + "num_input_tokens_seen": 343997790, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.19104004, + "step": 15943, + "time_per_iteration": 2.936403751373291 + }, + { + "auxiliary_loss_clip": 0.01409588, + "auxiliary_loss_mlp": 0.01033447, + "balance_loss_clip": 1.24766779, + "balance_loss_mlp": 1.01432598, + "epoch": 0.9586051405381031, + "flos": 23120699610240.0, + "grad_norm": 1.6621538680556884, + "language_loss": 0.6951552, + "learning_rate": 1.792242006001965e-08, + "loss": 0.7195856, + "num_input_tokens_seen": 344016935, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19116211, + "step": 15944, + "time_per_iteration": 2.884065628051758 + }, + { + "auxiliary_loss_clip": 0.0140076, + "auxiliary_loss_mlp": 0.01034318, + "balance_loss_clip": 1.23932624, + "balance_loss_mlp": 1.01616299, + "epoch": 0.9586652637907711, + "flos": 19611972408960.0, + "grad_norm": 2.2198086291913905, + "language_loss": 0.66675007, + "learning_rate": 1.7870435535403795e-08, + "loss": 0.69110084, + "num_input_tokens_seen": 344035590, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18164062, + "step": 15945, + "time_per_iteration": 2.85005521774292 + }, + { + "auxiliary_loss_clip": 0.0118102, + "auxiliary_loss_mlp": 0.01038627, + "balance_loss_clip": 1.09343112, + "balance_loss_mlp": 1.01077986, + "epoch": 0.958725387043439, + "flos": 72105118715520.0, + "grad_norm": 0.7454506615052396, + "language_loss": 0.61904407, + "learning_rate": 1.7818526173339678e-08, + "loss": 0.64124048, + "num_input_tokens_seen": 344100845, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.27929688, + "step": 15946, + "time_per_iteration": 3.4205920696258545 + }, + { + "auxiliary_loss_clip": 0.01387702, + "auxiliary_loss_mlp": 0.01031686, + "balance_loss_clip": 1.23210859, + "balance_loss_mlp": 1.01336396, + "epoch": 0.958785510296107, + "flos": 28923381624960.0, + "grad_norm": 1.6881506503338013, + "language_loss": 0.76116586, + "learning_rate": 1.7766691975795723e-08, + "loss": 0.78535974, + "num_input_tokens_seen": 344121780, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.18347168, + "step": 15947, + "time_per_iteration": 2.897128105163574 + }, + { + "auxiliary_loss_clip": 0.01389611, + "auxiliary_loss_mlp": 0.01032479, + "balance_loss_clip": 1.23168349, + "balance_loss_mlp": 1.01296401, + "epoch": 0.958845633548775, + "flos": 18485979598080.0, + "grad_norm": 2.512468760970459, + "language_loss": 0.70596319, + "learning_rate": 1.771493294473747e-08, + "loss": 0.73018402, + "num_input_tokens_seen": 344140150, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.1953125, + "step": 15948, + "time_per_iteration": 2.8244049549102783 + }, + { + "auxiliary_loss_clip": 0.01393967, + "auxiliary_loss_mlp": 0.01034798, + "balance_loss_clip": 1.23464656, + "balance_loss_mlp": 1.01615393, + "epoch": 0.958905756801443, + "flos": 24217572752640.0, + "grad_norm": 2.0803339232198454, + "language_loss": 0.79563522, + "learning_rate": 1.7663249082127574e-08, + "loss": 0.8199228, + "num_input_tokens_seen": 344158200, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.18640137, + "step": 15949, + "time_per_iteration": 2.859264373779297 + }, + { + "auxiliary_loss_clip": 0.01401422, + "auxiliary_loss_mlp": 0.01034662, + "balance_loss_clip": 1.24024856, + "balance_loss_mlp": 1.01465833, + "epoch": 0.9589658800541109, + "flos": 25018372051200.0, + "grad_norm": 1.803976312416274, + "language_loss": 0.69472629, + "learning_rate": 1.761164038992602e-08, + "loss": 0.71908712, + "num_input_tokens_seen": 344174720, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.20007324, + "step": 15950, + "time_per_iteration": 2.851726531982422 + }, + { + "auxiliary_loss_clip": 0.01391801, + "auxiliary_loss_mlp": 0.01031166, + "balance_loss_clip": 1.23188996, + "balance_loss_mlp": 1.0130465, + "epoch": 0.9590260033067789, + "flos": 23525261781120.0, + "grad_norm": 1.9939726331259033, + "language_loss": 0.86792141, + "learning_rate": 1.7560106870089687e-08, + "loss": 0.89215112, + "num_input_tokens_seen": 344192580, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18127441, + "step": 15951, + "time_per_iteration": 2.8604893684387207 + }, + { + "auxiliary_loss_clip": 0.01418709, + "auxiliary_loss_mlp": 0.0103826, + "balance_loss_clip": 1.25431406, + "balance_loss_mlp": 1.01931763, + "epoch": 0.9590861265594469, + "flos": 25531332059520.0, + "grad_norm": 2.5258364043318497, + "language_loss": 0.8100276, + "learning_rate": 1.7508648524572568e-08, + "loss": 0.83459729, + "num_input_tokens_seen": 344210345, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.18933105, + "step": 15952, + "time_per_iteration": 2.8647520542144775 + }, + { + "auxiliary_loss_clip": 0.01398734, + "auxiliary_loss_mlp": 0.01030793, + "balance_loss_clip": 1.2397604, + "balance_loss_mlp": 1.01170802, + "epoch": 0.9591462498121148, + "flos": 21189383020800.0, + "grad_norm": 1.6014553577876367, + "language_loss": 0.70063311, + "learning_rate": 1.7457265355326434e-08, + "loss": 0.72492838, + "num_input_tokens_seen": 344229540, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19104004, + "step": 15953, + "time_per_iteration": 2.8547143936157227 + }, + { + "auxiliary_loss_clip": 0.01392893, + "auxiliary_loss_mlp": 0.010326, + "balance_loss_clip": 1.23391533, + "balance_loss_mlp": 1.01402688, + "epoch": 0.9592063730647828, + "flos": 21732322348800.0, + "grad_norm": 2.5279392940826577, + "language_loss": 0.59876215, + "learning_rate": 1.7405957364299285e-08, + "loss": 0.62301701, + "num_input_tokens_seen": 344247830, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18554688, + "step": 15954, + "time_per_iteration": 2.87784743309021 + }, + { + "auxiliary_loss_clip": 0.01405579, + "auxiliary_loss_mlp": 0.01033573, + "balance_loss_clip": 1.24349678, + "balance_loss_mlp": 1.01441658, + "epoch": 0.9592664963174508, + "flos": 29901631605120.0, + "grad_norm": 2.2290566881046723, + "language_loss": 0.74644601, + "learning_rate": 1.7354724553437117e-08, + "loss": 0.77083755, + "num_input_tokens_seen": 344267760, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19152832, + "step": 15955, + "time_per_iteration": 2.893137216567993 + }, + { + "auxiliary_loss_clip": 0.01396901, + "auxiliary_loss_mlp": 0.01033633, + "balance_loss_clip": 1.23566556, + "balance_loss_mlp": 1.01284289, + "epoch": 0.9593266195701188, + "flos": 18007432899840.0, + "grad_norm": 1.7866900059613064, + "language_loss": 0.63345206, + "learning_rate": 1.7303566924682378e-08, + "loss": 0.65775734, + "num_input_tokens_seen": 344284905, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.20788574, + "step": 15956, + "time_per_iteration": 2.8754665851593018 + }, + { + "auxiliary_loss_clip": 0.01402352, + "auxiliary_loss_mlp": 0.01030983, + "balance_loss_clip": 1.24095881, + "balance_loss_mlp": 1.01192141, + "epoch": 0.9593867428227867, + "flos": 18846174602880.0, + "grad_norm": 3.849557730505381, + "language_loss": 0.6084286, + "learning_rate": 1.725248447997507e-08, + "loss": 0.63276196, + "num_input_tokens_seen": 344302025, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19055176, + "step": 15957, + "time_per_iteration": 2.969870090484619 + }, + { + "auxiliary_loss_clip": 0.01404801, + "auxiliary_loss_mlp": 0.01037067, + "balance_loss_clip": 1.24323189, + "balance_loss_mlp": 1.01791, + "epoch": 0.9594468660754547, + "flos": 29578247884800.0, + "grad_norm": 1.7694798450621323, + "language_loss": 0.74425328, + "learning_rate": 1.7201477221252314e-08, + "loss": 0.76867193, + "num_input_tokens_seen": 344321935, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19152832, + "step": 15958, + "time_per_iteration": 2.919109344482422 + }, + { + "auxiliary_loss_clip": 0.01393213, + "auxiliary_loss_mlp": 0.01029837, + "balance_loss_clip": 1.23486781, + "balance_loss_mlp": 1.01088274, + "epoch": 0.9595069893281226, + "flos": 20712736604160.0, + "grad_norm": 1.6711337902717764, + "language_loss": 0.75625789, + "learning_rate": 1.7150545150448116e-08, + "loss": 0.78048837, + "num_input_tokens_seen": 344340405, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.1895752, + "step": 15959, + "time_per_iteration": 2.8641583919525146 + }, + { + "auxiliary_loss_clip": 0.01409748, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.2464242, + "balance_loss_mlp": 1.01322842, + "epoch": 0.9595671125807906, + "flos": 22463299641600.0, + "grad_norm": 2.0699529468620925, + "language_loss": 0.66343296, + "learning_rate": 1.7099688269493816e-08, + "loss": 0.68785477, + "num_input_tokens_seen": 344359925, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19213867, + "step": 15960, + "time_per_iteration": 2.8574483394622803 + }, + { + "auxiliary_loss_clip": 0.01388265, + "auxiliary_loss_mlp": 0.01038084, + "balance_loss_clip": 1.23245227, + "balance_loss_mlp": 1.01853371, + "epoch": 0.9596272358334585, + "flos": 23925842409600.0, + "grad_norm": 1.7701016185738538, + "language_loss": 0.78398645, + "learning_rate": 1.7048906580318544e-08, + "loss": 0.80824995, + "num_input_tokens_seen": 344379100, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.19555664, + "step": 15961, + "time_per_iteration": 2.858928680419922 + }, + { + "auxiliary_loss_clip": 0.01391831, + "auxiliary_loss_mlp": 0.01028633, + "balance_loss_clip": 1.23468435, + "balance_loss_mlp": 1.01053667, + "epoch": 0.9596873590861266, + "flos": 17680610574720.0, + "grad_norm": 1.8887806422533424, + "language_loss": 0.76267207, + "learning_rate": 1.699820008484698e-08, + "loss": 0.78687668, + "num_input_tokens_seen": 344396895, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.1809082, + "step": 15962, + "time_per_iteration": 4.3035361766815186 + }, + { + "auxiliary_loss_clip": 0.0140293, + "auxiliary_loss_mlp": 0.01031455, + "balance_loss_clip": 1.24148118, + "balance_loss_mlp": 1.01207137, + "epoch": 0.9597474823387945, + "flos": 25818628412160.0, + "grad_norm": 2.183847415857662, + "language_loss": 0.72265053, + "learning_rate": 1.6947568785002698e-08, + "loss": 0.74699438, + "num_input_tokens_seen": 344415115, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19396973, + "step": 15963, + "time_per_iteration": 2.8858559131622314 + }, + { + "auxiliary_loss_clip": 0.01374335, + "auxiliary_loss_mlp": 0.01033311, + "balance_loss_clip": 1.22170353, + "balance_loss_mlp": 1.01383257, + "epoch": 0.9598076055914625, + "flos": 23778913985280.0, + "grad_norm": 1.481896721075402, + "language_loss": 0.75072271, + "learning_rate": 1.689701268270527e-08, + "loss": 0.77479917, + "num_input_tokens_seen": 344435185, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.19470215, + "step": 15964, + "time_per_iteration": 2.8512539863586426 + }, + { + "auxiliary_loss_clip": 0.01178844, + "auxiliary_loss_mlp": 0.01028557, + "balance_loss_clip": 1.09113884, + "balance_loss_mlp": 1.00671768, + "epoch": 0.9598677288441305, + "flos": 56539971642240.0, + "grad_norm": 0.8710990217012085, + "language_loss": 0.57619071, + "learning_rate": 1.684653177987161e-08, + "loss": 0.59826469, + "num_input_tokens_seen": 344488950, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.21875, + "step": 15965, + "time_per_iteration": 3.2855350971221924 + }, + { + "auxiliary_loss_clip": 0.01403334, + "auxiliary_loss_mlp": 0.01034022, + "balance_loss_clip": 1.2432059, + "balance_loss_mlp": 1.01525867, + "epoch": 0.9599278520967984, + "flos": 23006419948800.0, + "grad_norm": 1.8572832731289972, + "language_loss": 0.79721653, + "learning_rate": 1.6796126078416627e-08, + "loss": 0.82159007, + "num_input_tokens_seen": 344506740, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.1875, + "step": 15966, + "time_per_iteration": 4.299907684326172 + }, + { + "auxiliary_loss_clip": 0.01397244, + "auxiliary_loss_mlp": 0.01030414, + "balance_loss_clip": 1.23802936, + "balance_loss_mlp": 1.01169825, + "epoch": 0.9599879753494664, + "flos": 23050108442880.0, + "grad_norm": 1.5012240842705795, + "language_loss": 0.80380636, + "learning_rate": 1.674579558025102e-08, + "loss": 0.82808292, + "num_input_tokens_seen": 344526670, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.18737793, + "step": 15967, + "time_per_iteration": 2.852555513381958 + }, + { + "auxiliary_loss_clip": 0.01404695, + "auxiliary_loss_mlp": 0.01035377, + "balance_loss_clip": 1.2426753, + "balance_loss_mlp": 1.01508808, + "epoch": 0.9600480986021344, + "flos": 16399183317120.0, + "grad_norm": 2.46502102878593, + "language_loss": 0.81535822, + "learning_rate": 1.669554028728348e-08, + "loss": 0.83975893, + "num_input_tokens_seen": 344541995, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.20288086, + "step": 15968, + "time_per_iteration": 2.7934648990631104 + }, + { + "auxiliary_loss_clip": 0.01410795, + "auxiliary_loss_mlp": 0.01038572, + "balance_loss_clip": 1.24650359, + "balance_loss_mlp": 1.01786506, + "epoch": 0.9601082218548024, + "flos": 24286218393600.0, + "grad_norm": 2.2344310254419444, + "language_loss": 0.6811446, + "learning_rate": 1.6645360201420044e-08, + "loss": 0.70563829, + "num_input_tokens_seen": 344559980, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.20703125, + "step": 15969, + "time_per_iteration": 2.8926937580108643 + }, + { + "auxiliary_loss_clip": 0.0140053, + "auxiliary_loss_mlp": 0.01032945, + "balance_loss_clip": 1.24131739, + "balance_loss_mlp": 1.01419377, + "epoch": 0.9601683451074703, + "flos": 19619528290560.0, + "grad_norm": 3.0912404825478172, + "language_loss": 0.80122375, + "learning_rate": 1.6595255324563186e-08, + "loss": 0.82555854, + "num_input_tokens_seen": 344577765, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.1875, + "step": 15970, + "time_per_iteration": 2.81402850151062 + }, + { + "auxiliary_loss_clip": 0.01386909, + "auxiliary_loss_mlp": 0.01030661, + "balance_loss_clip": 1.23114932, + "balance_loss_mlp": 1.01022863, + "epoch": 0.9602284683601383, + "flos": 26662301798400.0, + "grad_norm": 1.4866225334588137, + "language_loss": 0.78130984, + "learning_rate": 1.654522565861316e-08, + "loss": 0.80548555, + "num_input_tokens_seen": 344597650, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.20422363, + "step": 15971, + "time_per_iteration": 2.8932201862335205 + }, + { + "auxiliary_loss_clip": 0.01400068, + "auxiliary_loss_mlp": 0.01036141, + "balance_loss_clip": 1.23644066, + "balance_loss_mlp": 1.01592255, + "epoch": 0.9602885916128062, + "flos": 15561029796480.0, + "grad_norm": 1.955736514098828, + "language_loss": 0.68345261, + "learning_rate": 1.64952712054669e-08, + "loss": 0.70781463, + "num_input_tokens_seen": 344613580, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.20227051, + "step": 15972, + "time_per_iteration": 2.82707142829895 + }, + { + "auxiliary_loss_clip": 0.01397022, + "auxiliary_loss_mlp": 0.01032008, + "balance_loss_clip": 1.23848403, + "balance_loss_mlp": 1.01387608, + "epoch": 0.9603487148654742, + "flos": 16509436191360.0, + "grad_norm": 2.1569287544145817, + "language_loss": 0.77221322, + "learning_rate": 1.644539196701844e-08, + "loss": 0.79650354, + "num_input_tokens_seen": 344626910, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18127441, + "step": 15973, + "time_per_iteration": 4.229241132736206 + }, + { + "auxiliary_loss_clip": 0.014009, + "auxiliary_loss_mlp": 0.01034518, + "balance_loss_clip": 1.24378216, + "balance_loss_mlp": 1.01514685, + "epoch": 0.9604088381181421, + "flos": 20853873694080.0, + "grad_norm": 1.6048251477629363, + "language_loss": 0.69508505, + "learning_rate": 1.639558794515983e-08, + "loss": 0.71943927, + "num_input_tokens_seen": 344644330, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.19372559, + "step": 15974, + "time_per_iteration": 2.8519973754882812 + }, + { + "auxiliary_loss_clip": 0.0140661, + "auxiliary_loss_mlp": 0.01030912, + "balance_loss_clip": 1.24479866, + "balance_loss_mlp": 1.01158869, + "epoch": 0.9604689613708102, + "flos": 19692698411520.0, + "grad_norm": 3.5610303733479025, + "language_loss": 0.68930328, + "learning_rate": 1.6345859141779105e-08, + "loss": 0.71367848, + "num_input_tokens_seen": 344663910, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.1932373, + "step": 15975, + "time_per_iteration": 4.398802042007446 + }, + { + "auxiliary_loss_clip": 0.0137718, + "auxiliary_loss_mlp": 0.01028685, + "balance_loss_clip": 1.22415066, + "balance_loss_mlp": 1.01025486, + "epoch": 0.9605290846234781, + "flos": 24108224774400.0, + "grad_norm": 2.5586393148955056, + "language_loss": 0.57039666, + "learning_rate": 1.6296205558762322e-08, + "loss": 0.59445536, + "num_input_tokens_seen": 344682320, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.18432617, + "step": 15976, + "time_per_iteration": 2.85956072807312 + }, + { + "auxiliary_loss_clip": 0.01382521, + "auxiliary_loss_mlp": 0.01028463, + "balance_loss_clip": 1.22730196, + "balance_loss_mlp": 1.0096637, + "epoch": 0.9605892078761461, + "flos": 27133156880640.0, + "grad_norm": 1.641260426351821, + "language_loss": 0.68752092, + "learning_rate": 1.624662719799219e-08, + "loss": 0.71163082, + "num_input_tokens_seen": 344701355, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.18798828, + "step": 15977, + "time_per_iteration": 2.986905336380005 + }, + { + "auxiliary_loss_clip": 0.01392457, + "auxiliary_loss_mlp": 0.01034691, + "balance_loss_clip": 1.23271763, + "balance_loss_mlp": 1.01504529, + "epoch": 0.9606493311288141, + "flos": 14144980700160.0, + "grad_norm": 1.8260555952152562, + "language_loss": 0.82995707, + "learning_rate": 1.6197124061348766e-08, + "loss": 0.8542285, + "num_input_tokens_seen": 344717980, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.1965332, + "step": 15978, + "time_per_iteration": 2.9257116317749023 + }, + { + "auxiliary_loss_clip": 0.01406212, + "auxiliary_loss_mlp": 0.01033737, + "balance_loss_clip": 1.24303102, + "balance_loss_mlp": 1.01325703, + "epoch": 0.960709454381482, + "flos": 15821332986240.0, + "grad_norm": 2.1624252069353407, + "language_loss": 0.84053147, + "learning_rate": 1.614769615070921e-08, + "loss": 0.86493099, + "num_input_tokens_seen": 344733480, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.20483398, + "step": 15979, + "time_per_iteration": 2.9144721031188965 + }, + { + "auxiliary_loss_clip": 0.01406714, + "auxiliary_loss_mlp": 0.010344, + "balance_loss_clip": 1.24407423, + "balance_loss_mlp": 1.01596999, + "epoch": 0.96076957763415, + "flos": 22575724266240.0, + "grad_norm": 4.470369402352826, + "language_loss": 0.79661286, + "learning_rate": 1.6098343467947805e-08, + "loss": 0.821024, + "num_input_tokens_seen": 344752130, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.1842041, + "step": 15980, + "time_per_iteration": 2.9412145614624023 + }, + { + "auxiliary_loss_clip": 0.01403375, + "auxiliary_loss_mlp": 0.0103533, + "balance_loss_clip": 1.2410351, + "balance_loss_mlp": 1.0165782, + "epoch": 0.960829700886818, + "flos": 24692047418880.0, + "grad_norm": 1.9901399290245663, + "language_loss": 0.68785155, + "learning_rate": 1.6049066014935942e-08, + "loss": 0.71223855, + "num_input_tokens_seen": 344771195, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18762207, + "step": 15981, + "time_per_iteration": 2.8771586418151855 + }, + { + "auxiliary_loss_clip": 0.01390632, + "auxiliary_loss_mlp": 0.01032638, + "balance_loss_clip": 1.23234844, + "balance_loss_mlp": 1.01356411, + "epoch": 0.960889824139486, + "flos": 26553542002560.0, + "grad_norm": 1.526048710064427, + "language_loss": 0.69958884, + "learning_rate": 1.5999863793542344e-08, + "loss": 0.72382152, + "num_input_tokens_seen": 344793150, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.19067383, + "step": 15982, + "time_per_iteration": 2.9642622470855713 + }, + { + "auxiliary_loss_clip": 0.01180969, + "auxiliary_loss_mlp": 0.01029194, + "balance_loss_clip": 1.09285784, + "balance_loss_mlp": 1.00811791, + "epoch": 0.9609499473921539, + "flos": 71143953287040.0, + "grad_norm": 0.6672956639404675, + "language_loss": 0.53278017, + "learning_rate": 1.595073680563286e-08, + "loss": 0.55488181, + "num_input_tokens_seen": 344852855, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.2109375, + "step": 15983, + "time_per_iteration": 3.4245212078094482 + }, + { + "auxiliary_loss_clip": 0.01396644, + "auxiliary_loss_mlp": 0.01035949, + "balance_loss_clip": 1.23756957, + "balance_loss_mlp": 1.01689887, + "epoch": 0.9610100706448219, + "flos": 20560650272640.0, + "grad_norm": 2.410711465134529, + "language_loss": 0.69204432, + "learning_rate": 1.5901685053070212e-08, + "loss": 0.71637022, + "num_input_tokens_seen": 344869830, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.19042969, + "step": 15984, + "time_per_iteration": 2.8337159156799316 + }, + { + "auxiliary_loss_clip": 0.01377243, + "auxiliary_loss_mlp": 0.01033722, + "balance_loss_clip": 1.22361732, + "balance_loss_mlp": 1.01436234, + "epoch": 0.9610701938974898, + "flos": 14072308272000.0, + "grad_norm": 1.5827946436066016, + "language_loss": 0.68228281, + "learning_rate": 1.5852708537714477e-08, + "loss": 0.70639247, + "num_input_tokens_seen": 344888905, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.19372559, + "step": 15985, + "time_per_iteration": 2.9058902263641357 + }, + { + "auxiliary_loss_clip": 0.01407051, + "auxiliary_loss_mlp": 0.01034648, + "balance_loss_clip": 1.24598026, + "balance_loss_mlp": 1.015908, + "epoch": 0.9611303171501578, + "flos": 20239619281920.0, + "grad_norm": 1.7758498990169005, + "language_loss": 0.79142952, + "learning_rate": 1.580380726142283e-08, + "loss": 0.81584656, + "num_input_tokens_seen": 344907160, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.1875, + "step": 15986, + "time_per_iteration": 2.8711938858032227 + }, + { + "auxiliary_loss_clip": 0.01399686, + "auxiliary_loss_mlp": 0.01037644, + "balance_loss_clip": 1.24084091, + "balance_loss_mlp": 1.01713967, + "epoch": 0.9611904404028258, + "flos": 20958651947520.0, + "grad_norm": 2.03413228771132, + "language_loss": 0.64612526, + "learning_rate": 1.5754981226049792e-08, + "loss": 0.67049855, + "num_input_tokens_seen": 344922400, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.20507812, + "step": 15987, + "time_per_iteration": 2.8308238983154297 + }, + { + "auxiliary_loss_clip": 0.01384306, + "auxiliary_loss_mlp": 0.01033633, + "balance_loss_clip": 1.22988915, + "balance_loss_mlp": 1.01591897, + "epoch": 0.9612505636554938, + "flos": 24838704374400.0, + "grad_norm": 1.7213289184450784, + "language_loss": 0.67827511, + "learning_rate": 1.5706230433446544e-08, + "loss": 0.70245451, + "num_input_tokens_seen": 344941910, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.17724609, + "step": 15988, + "time_per_iteration": 2.866257667541504 + }, + { + "auxiliary_loss_clip": 0.01400171, + "auxiliary_loss_mlp": 0.01034757, + "balance_loss_clip": 1.24018681, + "balance_loss_mlp": 1.01660156, + "epoch": 0.9613106869081617, + "flos": 17173758614400.0, + "grad_norm": 1.7861436296946636, + "language_loss": 0.74731576, + "learning_rate": 1.5657554885462055e-08, + "loss": 0.77166504, + "num_input_tokens_seen": 344960020, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18151855, + "step": 15989, + "time_per_iteration": 2.809638500213623 + }, + { + "auxiliary_loss_clip": 0.01182175, + "auxiliary_loss_mlp": 0.01037227, + "balance_loss_clip": 1.09305155, + "balance_loss_mlp": 1.01281261, + "epoch": 0.9613708101608297, + "flos": 61592465306880.0, + "grad_norm": 0.8042047945905217, + "language_loss": 0.63164431, + "learning_rate": 1.5608954583941737e-08, + "loss": 0.65383834, + "num_input_tokens_seen": 345018290, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.24414062, + "step": 15990, + "time_per_iteration": 3.2551937103271484 + }, + { + "auxiliary_loss_clip": 0.01398663, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.23808396, + "balance_loss_mlp": 1.01303911, + "epoch": 0.9614309334134977, + "flos": 27429456948480.0, + "grad_norm": 2.227518834361908, + "language_loss": 0.78560829, + "learning_rate": 1.5560429530729003e-08, + "loss": 0.80990243, + "num_input_tokens_seen": 345040235, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.17724609, + "step": 15991, + "time_per_iteration": 2.9074718952178955 + }, + { + "auxiliary_loss_clip": 0.01406594, + "auxiliary_loss_mlp": 0.01034741, + "balance_loss_clip": 1.24179506, + "balance_loss_mlp": 1.01455855, + "epoch": 0.9614910566661656, + "flos": 22829105001600.0, + "grad_norm": 2.3416633439933277, + "language_loss": 0.8637175, + "learning_rate": 1.5511979727663493e-08, + "loss": 0.88813078, + "num_input_tokens_seen": 345054540, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.20178223, + "step": 15992, + "time_per_iteration": 2.9261627197265625 + }, + { + "auxiliary_loss_clip": 0.01400616, + "auxiliary_loss_mlp": 0.01032713, + "balance_loss_clip": 1.240273, + "balance_loss_mlp": 1.01380658, + "epoch": 0.9615511799188337, + "flos": 20677554132480.0, + "grad_norm": 2.2591688381544666, + "language_loss": 0.73345906, + "learning_rate": 1.5463605176582406e-08, + "loss": 0.75779235, + "num_input_tokens_seen": 345074035, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18908691, + "step": 15993, + "time_per_iteration": 2.841919422149658 + }, + { + "auxiliary_loss_clip": 0.01406368, + "auxiliary_loss_mlp": 0.01034586, + "balance_loss_clip": 1.24461031, + "balance_loss_mlp": 1.01586986, + "epoch": 0.9616113031715016, + "flos": 33163040874240.0, + "grad_norm": 1.5413374031714386, + "language_loss": 0.68559992, + "learning_rate": 1.5415305879320716e-08, + "loss": 0.71000946, + "num_input_tokens_seen": 345099270, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18713379, + "step": 15994, + "time_per_iteration": 2.9672930240631104 + }, + { + "auxiliary_loss_clip": 0.01396994, + "auxiliary_loss_mlp": 0.01034214, + "balance_loss_clip": 1.23891163, + "balance_loss_mlp": 1.0149858, + "epoch": 0.9616714264241696, + "flos": 25020724780800.0, + "grad_norm": 5.098119938757766, + "language_loss": 0.84931862, + "learning_rate": 1.5367081837709183e-08, + "loss": 0.8736307, + "num_input_tokens_seen": 345116975, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.19226074, + "step": 15995, + "time_per_iteration": 2.8672475814819336 + }, + { + "auxiliary_loss_clip": 0.01419722, + "auxiliary_loss_mlp": 0.01033547, + "balance_loss_clip": 1.25600839, + "balance_loss_mlp": 1.01459241, + "epoch": 0.9617315496768375, + "flos": 13553918887680.0, + "grad_norm": 1.9063716280793492, + "language_loss": 0.76525986, + "learning_rate": 1.5318933053576788e-08, + "loss": 0.78979254, + "num_input_tokens_seen": 345133645, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.1895752, + "step": 15996, + "time_per_iteration": 2.910810708999634 + }, + { + "auxiliary_loss_clip": 0.01395587, + "auxiliary_loss_mlp": 0.01032086, + "balance_loss_clip": 1.23539066, + "balance_loss_mlp": 1.01290536, + "epoch": 0.9617916729295055, + "flos": 11261819111040.0, + "grad_norm": 24.072643676263425, + "language_loss": 0.78075838, + "learning_rate": 1.52708595287494e-08, + "loss": 0.80503511, + "num_input_tokens_seen": 345150740, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19177246, + "step": 15997, + "time_per_iteration": 4.235259294509888 + }, + { + "auxiliary_loss_clip": 0.01383344, + "auxiliary_loss_mlp": 0.01030953, + "balance_loss_clip": 1.22746539, + "balance_loss_mlp": 1.01241648, + "epoch": 0.9618517961821734, + "flos": 22829828918400.0, + "grad_norm": 1.6169604607343846, + "language_loss": 0.67969465, + "learning_rate": 1.522286126505001e-08, + "loss": 0.70383763, + "num_input_tokens_seen": 345170365, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.18554688, + "step": 15998, + "time_per_iteration": 2.8584611415863037 + }, + { + "auxiliary_loss_clip": 0.01390346, + "auxiliary_loss_mlp": 0.01029375, + "balance_loss_clip": 1.23241305, + "balance_loss_mlp": 1.01058793, + "epoch": 0.9619119194348414, + "flos": 16625887603200.0, + "grad_norm": 1.6734145679292511, + "language_loss": 0.73078656, + "learning_rate": 1.5174938264298498e-08, + "loss": 0.75498378, + "num_input_tokens_seen": 345188930, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18798828, + "step": 15999, + "time_per_iteration": 2.959156036376953 + }, + { + "auxiliary_loss_clip": 0.01385244, + "auxiliary_loss_mlp": 0.01032024, + "balance_loss_clip": 1.22968173, + "balance_loss_mlp": 1.01416612, + "epoch": 0.9619720426875094, + "flos": 24545842911360.0, + "grad_norm": 1.6895435310920486, + "language_loss": 0.65791494, + "learning_rate": 1.5127090528312514e-08, + "loss": 0.68208766, + "num_input_tokens_seen": 345209615, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.1784668, + "step": 16000, + "time_per_iteration": 2.951401948928833 + }, + { + "auxiliary_loss_clip": 0.01402939, + "auxiliary_loss_mlp": 0.01032592, + "balance_loss_clip": 1.24252486, + "balance_loss_mlp": 1.01350641, + "epoch": 0.9620321659401774, + "flos": 20642190681600.0, + "grad_norm": 1.5221211518010416, + "language_loss": 0.75490326, + "learning_rate": 1.5079318058905723e-08, + "loss": 0.77925861, + "num_input_tokens_seen": 345229175, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19091797, + "step": 16001, + "time_per_iteration": 4.2572386264801025 + }, + { + "auxiliary_loss_clip": 0.0139844, + "auxiliary_loss_mlp": 0.01034387, + "balance_loss_clip": 1.23865616, + "balance_loss_mlp": 1.01550484, + "epoch": 0.9620922891928453, + "flos": 18524510184960.0, + "grad_norm": 1.7311813682608408, + "language_loss": 0.68933237, + "learning_rate": 1.5031620857890447e-08, + "loss": 0.71366072, + "num_input_tokens_seen": 345247815, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18884277, + "step": 16002, + "time_per_iteration": 2.9166886806488037 + }, + { + "auxiliary_loss_clip": 0.01389924, + "auxiliary_loss_mlp": 0.01034843, + "balance_loss_clip": 1.23237109, + "balance_loss_mlp": 1.01531625, + "epoch": 0.9621524124455133, + "flos": 28779167888640.0, + "grad_norm": 1.2774157547709442, + "language_loss": 0.65430784, + "learning_rate": 1.4983998927074804e-08, + "loss": 0.67855561, + "num_input_tokens_seen": 345269935, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.1953125, + "step": 16003, + "time_per_iteration": 2.9828546047210693 + }, + { + "auxiliary_loss_clip": 0.01405118, + "auxiliary_loss_mlp": 0.0103513, + "balance_loss_clip": 1.24460697, + "balance_loss_mlp": 1.01610422, + "epoch": 0.9622125356981813, + "flos": 19108197095040.0, + "grad_norm": 3.9925021330368673, + "language_loss": 0.76384306, + "learning_rate": 1.493645226826512e-08, + "loss": 0.78824556, + "num_input_tokens_seen": 345288310, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19018555, + "step": 16004, + "time_per_iteration": 2.894174098968506 + }, + { + "auxiliary_loss_clip": 0.01392852, + "auxiliary_loss_mlp": 0.01031207, + "balance_loss_clip": 1.23506188, + "balance_loss_mlp": 1.0123477, + "epoch": 0.9622726589508492, + "flos": 20312427444480.0, + "grad_norm": 2.53832814683334, + "language_loss": 0.80449772, + "learning_rate": 1.4888980883263958e-08, + "loss": 0.82873833, + "num_input_tokens_seen": 345306615, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18859863, + "step": 16005, + "time_per_iteration": 2.81652569770813 + }, + { + "auxiliary_loss_clip": 0.01391363, + "auxiliary_loss_mlp": 0.01030664, + "balance_loss_clip": 1.23442054, + "balance_loss_mlp": 1.01188827, + "epoch": 0.9623327822035173, + "flos": 54948851856000.0, + "grad_norm": 2.4166621132818884, + "language_loss": 0.68462819, + "learning_rate": 1.4841584773871652e-08, + "loss": 0.70884842, + "num_input_tokens_seen": 345331935, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.18786621, + "step": 16006, + "time_per_iteration": 3.1488258838653564 + }, + { + "auxiliary_loss_clip": 0.01385694, + "auxiliary_loss_mlp": 0.01034418, + "balance_loss_clip": 1.23150992, + "balance_loss_mlp": 1.01646543, + "epoch": 0.9623929054561852, + "flos": 21768183492480.0, + "grad_norm": 2.143214790351003, + "language_loss": 0.78400731, + "learning_rate": 1.479426394188521e-08, + "loss": 0.80820847, + "num_input_tokens_seen": 345351510, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.17956543, + "step": 16007, + "time_per_iteration": 2.8847358226776123 + }, + { + "auxiliary_loss_clip": 0.0141274, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.25064039, + "balance_loss_mlp": 1.01393592, + "epoch": 0.9624530287088532, + "flos": 17940642295680.0, + "grad_norm": 1.9084261105175964, + "language_loss": 0.68418062, + "learning_rate": 1.4747018389099198e-08, + "loss": 0.70865321, + "num_input_tokens_seen": 345367750, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.20581055, + "step": 16008, + "time_per_iteration": 4.1960060596466064 + }, + { + "auxiliary_loss_clip": 0.01398112, + "auxiliary_loss_mlp": 0.01034905, + "balance_loss_clip": 1.23739696, + "balance_loss_mlp": 1.01583171, + "epoch": 0.9625131519615211, + "flos": 23262967820160.0, + "grad_norm": 2.647211146549221, + "language_loss": 0.74487531, + "learning_rate": 1.469984811730529e-08, + "loss": 0.76920545, + "num_input_tokens_seen": 345384790, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.19067383, + "step": 16009, + "time_per_iteration": 2.859495162963867 + }, + { + "auxiliary_loss_clip": 0.01396088, + "auxiliary_loss_mlp": 0.01030572, + "balance_loss_clip": 1.23799062, + "balance_loss_mlp": 1.01267838, + "epoch": 0.9625732752141891, + "flos": 18925950464640.0, + "grad_norm": 1.8007479504161117, + "language_loss": 0.76022148, + "learning_rate": 1.4652753128292061e-08, + "loss": 0.78448808, + "num_input_tokens_seen": 345403390, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.17883301, + "step": 16010, + "time_per_iteration": 4.194318056106567 + }, + { + "auxiliary_loss_clip": 0.01422641, + "auxiliary_loss_mlp": 0.01039926, + "balance_loss_clip": 1.2565341, + "balance_loss_mlp": 1.01853991, + "epoch": 0.962633398466857, + "flos": 16261484832000.0, + "grad_norm": 1.956551726351416, + "language_loss": 0.70148027, + "learning_rate": 1.4605733423845635e-08, + "loss": 0.72610593, + "num_input_tokens_seen": 345418685, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.21374512, + "step": 16011, + "time_per_iteration": 2.83772611618042 + }, + { + "auxiliary_loss_clip": 0.01392448, + "auxiliary_loss_mlp": 0.01032184, + "balance_loss_clip": 1.23439968, + "balance_loss_mlp": 1.01398087, + "epoch": 0.962693521719525, + "flos": 54215657568000.0, + "grad_norm": 1.74628570331771, + "language_loss": 0.69141024, + "learning_rate": 1.4558789005748585e-08, + "loss": 0.71565658, + "num_input_tokens_seen": 345442380, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.18200684, + "step": 16012, + "time_per_iteration": 3.1629652976989746 + }, + { + "auxiliary_loss_clip": 0.01416274, + "auxiliary_loss_mlp": 0.01036185, + "balance_loss_clip": 1.25133395, + "balance_loss_mlp": 1.01557338, + "epoch": 0.962753644972193, + "flos": 33117814056960.0, + "grad_norm": 2.096593727016843, + "language_loss": 0.73315084, + "learning_rate": 1.4511919875781264e-08, + "loss": 0.75767541, + "num_input_tokens_seen": 345463815, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.20605469, + "step": 16013, + "time_per_iteration": 2.9411020278930664 + }, + { + "auxiliary_loss_clip": 0.01398287, + "auxiliary_loss_mlp": 0.01029743, + "balance_loss_clip": 1.23883057, + "balance_loss_mlp": 1.01046705, + "epoch": 0.962813768224861, + "flos": 42245936012160.0, + "grad_norm": 2.105317744420225, + "language_loss": 0.65139604, + "learning_rate": 1.4465126035720698e-08, + "loss": 0.67567635, + "num_input_tokens_seen": 345484525, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19287109, + "step": 16014, + "time_per_iteration": 3.0462684631347656 + }, + { + "auxiliary_loss_clip": 0.01379889, + "auxiliary_loss_mlp": 0.01033432, + "balance_loss_clip": 1.22626615, + "balance_loss_mlp": 1.01566958, + "epoch": 0.9628738914775289, + "flos": 43960185457920.0, + "grad_norm": 1.491104231154019, + "language_loss": 0.72766364, + "learning_rate": 1.4418407487341688e-08, + "loss": 0.75179678, + "num_input_tokens_seen": 345508295, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.17749023, + "step": 16015, + "time_per_iteration": 3.030846118927002 + }, + { + "auxiliary_loss_clip": 0.01391022, + "auxiliary_loss_mlp": 0.01028309, + "balance_loss_clip": 1.23265839, + "balance_loss_mlp": 1.00976014, + "epoch": 0.9629340147301969, + "flos": 15604582556160.0, + "grad_norm": 2.1645476463263744, + "language_loss": 0.78356373, + "learning_rate": 1.4371764232415707e-08, + "loss": 0.80775708, + "num_input_tokens_seen": 345525155, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.1854248, + "step": 16016, + "time_per_iteration": 2.8288121223449707 + }, + { + "auxiliary_loss_clip": 0.01183048, + "auxiliary_loss_mlp": 0.01028889, + "balance_loss_clip": 1.09356427, + "balance_loss_mlp": 1.00533283, + "epoch": 0.9629941379828649, + "flos": 62980209141120.0, + "grad_norm": 0.8192636355727563, + "language_loss": 0.63149297, + "learning_rate": 1.4325196272711337e-08, + "loss": 0.65361238, + "num_input_tokens_seen": 345578905, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.23535156, + "step": 16017, + "time_per_iteration": 3.245300769805908 + }, + { + "auxiliary_loss_clip": 0.01400219, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.24014378, + "balance_loss_mlp": 1.01290536, + "epoch": 0.9630542612355328, + "flos": 29911359237120.0, + "grad_norm": 1.8683551748974374, + "language_loss": 0.67502987, + "learning_rate": 1.4278703609994502e-08, + "loss": 0.6993463, + "num_input_tokens_seen": 345598965, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.1854248, + "step": 16018, + "time_per_iteration": 2.911592960357666 + }, + { + "auxiliary_loss_clip": 0.01404479, + "auxiliary_loss_mlp": 0.01037926, + "balance_loss_clip": 1.24415267, + "balance_loss_mlp": 1.01931739, + "epoch": 0.9631143844882009, + "flos": 17903514297600.0, + "grad_norm": 2.2267148029316743, + "language_loss": 0.80340934, + "learning_rate": 1.4232286246028457e-08, + "loss": 0.82783335, + "num_input_tokens_seen": 345617945, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.1862793, + "step": 16019, + "time_per_iteration": 2.8536434173583984 + }, + { + "auxiliary_loss_clip": 0.01383649, + "auxiliary_loss_mlp": 0.0103102, + "balance_loss_clip": 1.22733104, + "balance_loss_mlp": 1.01226878, + "epoch": 0.9631745077408688, + "flos": 26149929972480.0, + "grad_norm": 1.3892395474262773, + "language_loss": 0.71898079, + "learning_rate": 1.4185944182572907e-08, + "loss": 0.74312758, + "num_input_tokens_seen": 345637920, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.1875, + "step": 16020, + "time_per_iteration": 2.8727340698242188 + }, + { + "auxiliary_loss_clip": 0.01397817, + "auxiliary_loss_mlp": 0.01029415, + "balance_loss_clip": 1.23729444, + "balance_loss_mlp": 1.01166463, + "epoch": 0.9632346309935368, + "flos": 24984637413120.0, + "grad_norm": 24.189682681961685, + "language_loss": 0.78207594, + "learning_rate": 1.4139677421385331e-08, + "loss": 0.8063482, + "num_input_tokens_seen": 345656195, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.17749023, + "step": 16021, + "time_per_iteration": 2.8867135047912598 + }, + { + "auxiliary_loss_clip": 0.0142381, + "auxiliary_loss_mlp": 0.01036158, + "balance_loss_clip": 1.25880527, + "balance_loss_mlp": 1.01658392, + "epoch": 0.9632947542462047, + "flos": 23626194226560.0, + "grad_norm": 1.8261671278683849, + "language_loss": 0.66148818, + "learning_rate": 1.4093485964220331e-08, + "loss": 0.68608785, + "num_input_tokens_seen": 345676700, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.19567871, + "step": 16022, + "time_per_iteration": 2.8831191062927246 + }, + { + "auxiliary_loss_clip": 0.01392211, + "auxiliary_loss_mlp": 0.01034631, + "balance_loss_clip": 1.23591518, + "balance_loss_mlp": 1.01710725, + "epoch": 0.9633548774988727, + "flos": 26406704067840.0, + "grad_norm": 2.1038859693408676, + "language_loss": 0.74477017, + "learning_rate": 1.4047369812829168e-08, + "loss": 0.76903856, + "num_input_tokens_seen": 345696725, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.17529297, + "step": 16023, + "time_per_iteration": 2.867810010910034 + }, + { + "auxiliary_loss_clip": 0.01384859, + "auxiliary_loss_mlp": 0.01029194, + "balance_loss_clip": 1.22798777, + "balance_loss_mlp": 1.01033473, + "epoch": 0.9634150007515406, + "flos": 23777918599680.0, + "grad_norm": 1.4493365009877741, + "language_loss": 0.82034123, + "learning_rate": 1.4001328968960891e-08, + "loss": 0.84448177, + "num_input_tokens_seen": 345716245, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.18859863, + "step": 16024, + "time_per_iteration": 2.852630138397217 + }, + { + "auxiliary_loss_clip": 0.01416079, + "auxiliary_loss_mlp": 0.01032527, + "balance_loss_clip": 1.25140285, + "balance_loss_mlp": 1.01284564, + "epoch": 0.9634751240042086, + "flos": 24146348158080.0, + "grad_norm": 1.364178515386651, + "language_loss": 0.81780016, + "learning_rate": 1.3955363434361212e-08, + "loss": 0.84228623, + "num_input_tokens_seen": 345739060, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19665527, + "step": 16025, + "time_per_iteration": 2.862863063812256 + }, + { + "auxiliary_loss_clip": 0.01410437, + "auxiliary_loss_mlp": 0.0103289, + "balance_loss_clip": 1.24749243, + "balance_loss_mlp": 1.01366174, + "epoch": 0.9635352472568766, + "flos": 24358800332160.0, + "grad_norm": 1.711063748084618, + "language_loss": 0.77226341, + "learning_rate": 1.3909473210773181e-08, + "loss": 0.79669666, + "num_input_tokens_seen": 345758325, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.19226074, + "step": 16026, + "time_per_iteration": 2.888176918029785 + }, + { + "auxiliary_loss_clip": 0.0139464, + "auxiliary_loss_mlp": 0.01035625, + "balance_loss_clip": 1.23463392, + "balance_loss_mlp": 1.0165273, + "epoch": 0.9635953705095446, + "flos": 23994533295360.0, + "grad_norm": 3.3421535207488295, + "language_loss": 0.63884413, + "learning_rate": 1.3863658299936965e-08, + "loss": 0.66314679, + "num_input_tokens_seen": 345778530, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.1907959, + "step": 16027, + "time_per_iteration": 2.876574993133545 + }, + { + "auxiliary_loss_clip": 0.01406912, + "auxiliary_loss_mlp": 0.01032889, + "balance_loss_clip": 1.24527872, + "balance_loss_mlp": 1.01456654, + "epoch": 0.9636554937622125, + "flos": 19837907533440.0, + "grad_norm": 2.9488264736675465, + "language_loss": 0.87677842, + "learning_rate": 1.3817918703589837e-08, + "loss": 0.90117639, + "num_input_tokens_seen": 345796535, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18322754, + "step": 16028, + "time_per_iteration": 2.9124183654785156 + }, + { + "auxiliary_loss_clip": 0.01183832, + "auxiliary_loss_mlp": 0.0103245, + "balance_loss_clip": 1.09450841, + "balance_loss_mlp": 1.01423502, + "epoch": 0.9637156170148805, + "flos": 67465014572160.0, + "grad_norm": 0.7052144500943459, + "language_loss": 0.53126049, + "learning_rate": 1.3772254423466412e-08, + "loss": 0.55342335, + "num_input_tokens_seen": 345859700, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.18261719, + "step": 16029, + "time_per_iteration": 3.3107709884643555 + }, + { + "auxiliary_loss_clip": 0.01400981, + "auxiliary_loss_mlp": 0.01030966, + "balance_loss_clip": 1.23931754, + "balance_loss_mlp": 1.01264381, + "epoch": 0.9637757402675484, + "flos": 20310029470080.0, + "grad_norm": 1.6249489933846686, + "language_loss": 0.74431372, + "learning_rate": 1.372666546129797e-08, + "loss": 0.76863319, + "num_input_tokens_seen": 345878760, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18310547, + "step": 16030, + "time_per_iteration": 2.8293304443359375 + }, + { + "auxiliary_loss_clip": 0.01389369, + "auxiliary_loss_mlp": 0.01031399, + "balance_loss_clip": 1.23320341, + "balance_loss_mlp": 1.01349413, + "epoch": 0.9638358635202164, + "flos": 27245807729280.0, + "grad_norm": 2.093787792652627, + "language_loss": 0.66680384, + "learning_rate": 1.3681151818813575e-08, + "loss": 0.69101155, + "num_input_tokens_seen": 345900445, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.17907715, + "step": 16031, + "time_per_iteration": 2.992018699645996 + }, + { + "auxiliary_loss_clip": 0.01176513, + "auxiliary_loss_mlp": 0.01028731, + "balance_loss_clip": 1.09013486, + "balance_loss_mlp": 1.00584245, + "epoch": 0.9638959867728845, + "flos": 70321228243200.0, + "grad_norm": 0.8371232373477131, + "language_loss": 0.60727292, + "learning_rate": 1.3635713497738955e-08, + "loss": 0.62932533, + "num_input_tokens_seen": 345961020, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.22851562, + "step": 16032, + "time_per_iteration": 3.3319199085235596 + }, + { + "auxiliary_loss_clip": 0.01374783, + "auxiliary_loss_mlp": 0.01033435, + "balance_loss_clip": 1.22204709, + "balance_loss_mlp": 1.01520848, + "epoch": 0.9639561100255524, + "flos": 25417595335680.0, + "grad_norm": 1.6691500056667177, + "language_loss": 0.67084432, + "learning_rate": 1.3590350499796954e-08, + "loss": 0.6949265, + "num_input_tokens_seen": 345980210, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.18225098, + "step": 16033, + "time_per_iteration": 4.330109119415283 + }, + { + "auxiliary_loss_clip": 0.01397, + "auxiliary_loss_mlp": 0.01030935, + "balance_loss_clip": 1.23902178, + "balance_loss_mlp": 1.01143217, + "epoch": 0.9640162332782204, + "flos": 18122979415680.0, + "grad_norm": 1.8773509454065906, + "language_loss": 0.66662908, + "learning_rate": 1.3545062826707976e-08, + "loss": 0.69090843, + "num_input_tokens_seen": 345998280, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.19494629, + "step": 16034, + "time_per_iteration": 2.865243434906006 + }, + { + "auxiliary_loss_clip": 0.01392839, + "auxiliary_loss_mlp": 0.010336, + "balance_loss_clip": 1.23549557, + "balance_loss_mlp": 1.01493144, + "epoch": 0.9640763565308883, + "flos": 23450643826560.0, + "grad_norm": 2.027984328479228, + "language_loss": 0.75185227, + "learning_rate": 1.3499850480189313e-08, + "loss": 0.77611661, + "num_input_tokens_seen": 346015545, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18664551, + "step": 16035, + "time_per_iteration": 2.866504669189453 + }, + { + "auxiliary_loss_clip": 0.01406614, + "auxiliary_loss_mlp": 0.0103423, + "balance_loss_clip": 1.24893212, + "balance_loss_mlp": 1.01516831, + "epoch": 0.9641364797835563, + "flos": 22429610248320.0, + "grad_norm": 2.1962156040267815, + "language_loss": 0.82991862, + "learning_rate": 1.3454713461955591e-08, + "loss": 0.85432708, + "num_input_tokens_seen": 346034055, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.19067383, + "step": 16036, + "time_per_iteration": 4.267319202423096 + }, + { + "auxiliary_loss_clip": 0.01397198, + "auxiliary_loss_mlp": 0.01029907, + "balance_loss_clip": 1.23895216, + "balance_loss_mlp": 1.01148963, + "epoch": 0.9641966030362242, + "flos": 30633106590720.0, + "grad_norm": 1.8232519018781073, + "language_loss": 0.70932561, + "learning_rate": 1.340965177371789e-08, + "loss": 0.73359668, + "num_input_tokens_seen": 346054130, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.1842041, + "step": 16037, + "time_per_iteration": 2.8957197666168213 + }, + { + "auxiliary_loss_clip": 0.01404201, + "auxiliary_loss_mlp": 0.01031401, + "balance_loss_clip": 1.24238455, + "balance_loss_mlp": 1.01222074, + "epoch": 0.9642567262888923, + "flos": 20961230901120.0, + "grad_norm": 1.5938753831062864, + "language_loss": 0.63400072, + "learning_rate": 1.3364665417185506e-08, + "loss": 0.65835673, + "num_input_tokens_seen": 346072990, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19177246, + "step": 16038, + "time_per_iteration": 2.889617443084717 + }, + { + "auxiliary_loss_clip": 0.01418694, + "auxiliary_loss_mlp": 0.01035539, + "balance_loss_clip": 1.2559526, + "balance_loss_mlp": 1.01749015, + "epoch": 0.9643168495415602, + "flos": 22650025507200.0, + "grad_norm": 1.7415228426359832, + "language_loss": 0.71563911, + "learning_rate": 1.3319754394064187e-08, + "loss": 0.74018139, + "num_input_tokens_seen": 346093745, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.18054199, + "step": 16039, + "time_per_iteration": 2.930497884750366 + }, + { + "auxiliary_loss_clip": 0.01395917, + "auxiliary_loss_mlp": 0.01028285, + "balance_loss_clip": 1.236606, + "balance_loss_mlp": 1.00923526, + "epoch": 0.9643769727942282, + "flos": 20275842384000.0, + "grad_norm": 2.1608363944778404, + "language_loss": 0.74465489, + "learning_rate": 1.327491870605657e-08, + "loss": 0.76889682, + "num_input_tokens_seen": 346110115, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19042969, + "step": 16040, + "time_per_iteration": 2.8745458126068115 + }, + { + "auxiliary_loss_clip": 0.01405566, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.24380219, + "balance_loss_mlp": 1.01682377, + "epoch": 0.9644370960468961, + "flos": 13889925907200.0, + "grad_norm": 2.321486562992873, + "language_loss": 0.74187231, + "learning_rate": 1.3230158354863296e-08, + "loss": 0.7662788, + "num_input_tokens_seen": 346127165, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18273926, + "step": 16041, + "time_per_iteration": 2.8038291931152344 + }, + { + "auxiliary_loss_clip": 0.01382062, + "auxiliary_loss_mlp": 0.01031056, + "balance_loss_clip": 1.22944748, + "balance_loss_mlp": 1.01242328, + "epoch": 0.9644972192995641, + "flos": 17247064469760.0, + "grad_norm": 3.07677314917852, + "language_loss": 0.72835428, + "learning_rate": 1.3185473342181674e-08, + "loss": 0.75248545, + "num_input_tokens_seen": 346145950, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.18640137, + "step": 16042, + "time_per_iteration": 2.8355727195739746 + }, + { + "auxiliary_loss_clip": 0.01397532, + "auxiliary_loss_mlp": 0.01028629, + "balance_loss_clip": 1.23556674, + "balance_loss_mlp": 1.0117017, + "epoch": 0.964557342552232, + "flos": 23850455293440.0, + "grad_norm": 1.7072749632147377, + "language_loss": 0.81947196, + "learning_rate": 1.3140863669705683e-08, + "loss": 0.84373361, + "num_input_tokens_seen": 346165005, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.16931152, + "step": 16043, + "time_per_iteration": 2.848682165145874 + }, + { + "auxiliary_loss_clip": 0.0140122, + "auxiliary_loss_mlp": 0.01029744, + "balance_loss_clip": 1.24226213, + "balance_loss_mlp": 1.01180303, + "epoch": 0.9646174658049, + "flos": 21663179015040.0, + "grad_norm": 1.5557709309004166, + "language_loss": 0.72031879, + "learning_rate": 1.3096329339127522e-08, + "loss": 0.74462843, + "num_input_tokens_seen": 346185095, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.17956543, + "step": 16044, + "time_per_iteration": 4.191645383834839 + }, + { + "auxiliary_loss_clip": 0.01394936, + "auxiliary_loss_mlp": 0.01029881, + "balance_loss_clip": 1.23714876, + "balance_loss_mlp": 1.01092708, + "epoch": 0.9646775890575681, + "flos": 17138621387520.0, + "grad_norm": 1.8663212364405923, + "language_loss": 0.70350218, + "learning_rate": 1.3051870352135397e-08, + "loss": 0.7277503, + "num_input_tokens_seen": 346202580, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.1895752, + "step": 16045, + "time_per_iteration": 4.241312026977539 + }, + { + "auxiliary_loss_clip": 0.01398139, + "auxiliary_loss_mlp": 0.0103299, + "balance_loss_clip": 1.23923552, + "balance_loss_mlp": 1.01417875, + "epoch": 0.964737712310236, + "flos": 13013468023680.0, + "grad_norm": 2.2766953623574255, + "language_loss": 0.76677215, + "learning_rate": 1.3007486710415737e-08, + "loss": 0.79108346, + "num_input_tokens_seen": 346219395, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18823242, + "step": 16046, + "time_per_iteration": 2.812091827392578 + }, + { + "auxiliary_loss_clip": 0.01404237, + "auxiliary_loss_mlp": 0.01032184, + "balance_loss_clip": 1.24176741, + "balance_loss_mlp": 1.01293159, + "epoch": 0.964797835562904, + "flos": 24289747488000.0, + "grad_norm": 1.7060753024147461, + "language_loss": 0.63180441, + "learning_rate": 1.2963178415651199e-08, + "loss": 0.65616864, + "num_input_tokens_seen": 346239715, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19250488, + "step": 16047, + "time_per_iteration": 2.8381309509277344 + }, + { + "auxiliary_loss_clip": 0.01397387, + "auxiliary_loss_mlp": 0.01036695, + "balance_loss_clip": 1.23840761, + "balance_loss_mlp": 1.01763296, + "epoch": 0.9648579588155719, + "flos": 20532616479360.0, + "grad_norm": 2.4484657861196175, + "language_loss": 0.70019633, + "learning_rate": 1.2918945469521992e-08, + "loss": 0.72453713, + "num_input_tokens_seen": 346258500, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19067383, + "step": 16048, + "time_per_iteration": 2.8083038330078125 + }, + { + "auxiliary_loss_clip": 0.01403948, + "auxiliary_loss_mlp": 0.01031992, + "balance_loss_clip": 1.24228299, + "balance_loss_mlp": 1.01243019, + "epoch": 0.9649180820682399, + "flos": 32166014302080.0, + "grad_norm": 1.7247879888080815, + "language_loss": 0.64505494, + "learning_rate": 1.2874787873705662e-08, + "loss": 0.6694144, + "num_input_tokens_seen": 346279110, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19567871, + "step": 16049, + "time_per_iteration": 2.892364263534546 + }, + { + "auxiliary_loss_clip": 0.01410476, + "auxiliary_loss_mlp": 0.01032709, + "balance_loss_clip": 1.25082731, + "balance_loss_mlp": 1.01394558, + "epoch": 0.9649782053209078, + "flos": 20532480744960.0, + "grad_norm": 1.6338836085244222, + "language_loss": 0.71921837, + "learning_rate": 1.2830705629876427e-08, + "loss": 0.7436502, + "num_input_tokens_seen": 346297860, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18762207, + "step": 16050, + "time_per_iteration": 2.862340211868286 + }, + { + "auxiliary_loss_clip": 0.01414291, + "auxiliary_loss_mlp": 0.01031336, + "balance_loss_clip": 1.25006008, + "balance_loss_mlp": 1.01211905, + "epoch": 0.9650383285735759, + "flos": 43082958412800.0, + "grad_norm": 2.374355767803438, + "language_loss": 0.70185375, + "learning_rate": 1.278669873970606e-08, + "loss": 0.72631001, + "num_input_tokens_seen": 346319860, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19226074, + "step": 16051, + "time_per_iteration": 3.0700838565826416 + }, + { + "auxiliary_loss_clip": 0.01180881, + "auxiliary_loss_mlp": 0.01029801, + "balance_loss_clip": 1.09315455, + "balance_loss_mlp": 1.0027163, + "epoch": 0.9650984518262438, + "flos": 61777471870080.0, + "grad_norm": 0.8511159233821218, + "language_loss": 0.5909369, + "learning_rate": 1.2742767204863004e-08, + "loss": 0.61304373, + "num_input_tokens_seen": 346379025, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.27148438, + "step": 16052, + "time_per_iteration": 3.4554381370544434 + }, + { + "auxiliary_loss_clip": 0.01396351, + "auxiliary_loss_mlp": 0.01031247, + "balance_loss_clip": 1.23880386, + "balance_loss_mlp": 1.01226854, + "epoch": 0.9651585750789118, + "flos": 29801468321280.0, + "grad_norm": 1.5560824135024054, + "language_loss": 0.75258124, + "learning_rate": 1.2698911027013482e-08, + "loss": 0.7768572, + "num_input_tokens_seen": 346402250, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18981934, + "step": 16053, + "time_per_iteration": 2.963743209838867 + }, + { + "auxiliary_loss_clip": 0.01411749, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.2507689, + "balance_loss_mlp": 1.01371741, + "epoch": 0.9652186983315797, + "flos": 16881756802560.0, + "grad_norm": 3.049320886176013, + "language_loss": 0.68906248, + "learning_rate": 1.2655130207820386e-08, + "loss": 0.7135098, + "num_input_tokens_seen": 346419555, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.19262695, + "step": 16054, + "time_per_iteration": 2.818467140197754 + }, + { + "auxiliary_loss_clip": 0.01395446, + "auxiliary_loss_mlp": 0.01031539, + "balance_loss_clip": 1.23772156, + "balance_loss_mlp": 1.01337147, + "epoch": 0.9652788215842477, + "flos": 31662284232960.0, + "grad_norm": 1.5507393783315537, + "language_loss": 0.63151813, + "learning_rate": 1.2611424748943944e-08, + "loss": 0.655788, + "num_input_tokens_seen": 346441245, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.1817627, + "step": 16055, + "time_per_iteration": 2.948437452316284 + }, + { + "auxiliary_loss_clip": 0.01397028, + "auxiliary_loss_mlp": 0.01031796, + "balance_loss_clip": 1.24019289, + "balance_loss_mlp": 1.01325893, + "epoch": 0.9653389448369156, + "flos": 24765081805440.0, + "grad_norm": 1.9315706217738529, + "language_loss": 0.77519083, + "learning_rate": 1.2567794652041719e-08, + "loss": 0.79947907, + "num_input_tokens_seen": 346460065, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.1854248, + "step": 16056, + "time_per_iteration": 2.868941068649292 + }, + { + "auxiliary_loss_clip": 0.01396874, + "auxiliary_loss_mlp": 0.0103425, + "balance_loss_clip": 1.23691416, + "balance_loss_mlp": 1.0155108, + "epoch": 0.9653990680895836, + "flos": 20305414500480.0, + "grad_norm": 2.1749772143858124, + "language_loss": 0.72682083, + "learning_rate": 1.2524239918767498e-08, + "loss": 0.75113201, + "num_input_tokens_seen": 346478005, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.1875, + "step": 16057, + "time_per_iteration": 2.8377623558044434 + }, + { + "auxiliary_loss_clip": 0.01403132, + "auxiliary_loss_mlp": 0.01033764, + "balance_loss_clip": 1.24479079, + "balance_loss_mlp": 1.01564407, + "epoch": 0.9654591913422517, + "flos": 22539048716160.0, + "grad_norm": 2.1404854692958617, + "language_loss": 0.72196496, + "learning_rate": 1.2480760550773295e-08, + "loss": 0.74633396, + "num_input_tokens_seen": 346497575, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18103027, + "step": 16058, + "time_per_iteration": 2.896942377090454 + }, + { + "auxiliary_loss_clip": 0.01380624, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.22551751, + "balance_loss_mlp": 1.01349044, + "epoch": 0.9655193145949196, + "flos": 26774862157440.0, + "grad_norm": 1.4073953808185482, + "language_loss": 0.74392498, + "learning_rate": 1.2437356549708011e-08, + "loss": 0.76804936, + "num_input_tokens_seen": 346520000, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.18322754, + "step": 16059, + "time_per_iteration": 2.8779640197753906 + }, + { + "auxiliary_loss_clip": 0.01412924, + "auxiliary_loss_mlp": 0.01037993, + "balance_loss_clip": 1.25029981, + "balance_loss_mlp": 1.01981342, + "epoch": 0.9655794378475876, + "flos": 41984727926400.0, + "grad_norm": 1.803246955502041, + "language_loss": 0.7408154, + "learning_rate": 1.239402791721722e-08, + "loss": 0.76532459, + "num_input_tokens_seen": 346541605, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.1817627, + "step": 16060, + "time_per_iteration": 3.008495330810547 + }, + { + "auxiliary_loss_clip": 0.01384403, + "auxiliary_loss_mlp": 0.01033044, + "balance_loss_clip": 1.23052382, + "balance_loss_mlp": 1.01460266, + "epoch": 0.9656395611002555, + "flos": 27720961067520.0, + "grad_norm": 1.881596441117576, + "language_loss": 0.76975977, + "learning_rate": 1.2350774654944273e-08, + "loss": 0.79393423, + "num_input_tokens_seen": 346560955, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.18444824, + "step": 16061, + "time_per_iteration": 2.927568197250366 + }, + { + "auxiliary_loss_clip": 0.01185495, + "auxiliary_loss_mlp": 0.01020096, + "balance_loss_clip": 1.09585142, + "balance_loss_mlp": 0.99644536, + "epoch": 0.9656996843529235, + "flos": 68998646200320.0, + "grad_norm": 0.727033092487765, + "language_loss": 0.64126086, + "learning_rate": 1.2307596764528749e-08, + "loss": 0.66331679, + "num_input_tokens_seen": 346621615, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.23632812, + "step": 16062, + "time_per_iteration": 3.408754348754883 + }, + { + "auxiliary_loss_clip": 0.0137369, + "auxiliary_loss_mlp": 0.01031112, + "balance_loss_clip": 1.22000718, + "balance_loss_mlp": 1.01262236, + "epoch": 0.9657598076055914, + "flos": 20641059561600.0, + "grad_norm": 2.3631622365421867, + "language_loss": 0.94178599, + "learning_rate": 1.226449424760867e-08, + "loss": 0.96583396, + "num_input_tokens_seen": 346637460, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.18493652, + "step": 16063, + "time_per_iteration": 2.8354737758636475 + }, + { + "auxiliary_loss_clip": 0.01399106, + "auxiliary_loss_mlp": 0.01031338, + "balance_loss_clip": 1.23937798, + "balance_loss_mlp": 1.01323009, + "epoch": 0.9658199308582595, + "flos": 20458179504000.0, + "grad_norm": 1.690817006508775, + "language_loss": 0.8282131, + "learning_rate": 1.2221467105818062e-08, + "loss": 0.85251749, + "num_input_tokens_seen": 346655625, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18127441, + "step": 16064, + "time_per_iteration": 2.7917416095733643 + }, + { + "auxiliary_loss_clip": 0.01401103, + "auxiliary_loss_mlp": 0.01033728, + "balance_loss_clip": 1.24466538, + "balance_loss_mlp": 1.01623976, + "epoch": 0.9658800541109274, + "flos": 24728903948160.0, + "grad_norm": 1.567842774870903, + "language_loss": 0.84299988, + "learning_rate": 1.2178515340788731e-08, + "loss": 0.86734819, + "num_input_tokens_seen": 346675220, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.17504883, + "step": 16065, + "time_per_iteration": 2.8806357383728027 + }, + { + "auxiliary_loss_clip": 0.01393394, + "auxiliary_loss_mlp": 0.01033455, + "balance_loss_clip": 1.23435438, + "balance_loss_mlp": 1.01355922, + "epoch": 0.9659401773635954, + "flos": 21618359400960.0, + "grad_norm": 1.5445005614496579, + "language_loss": 0.68508101, + "learning_rate": 1.2135638954149151e-08, + "loss": 0.70934951, + "num_input_tokens_seen": 346694710, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.19897461, + "step": 16066, + "time_per_iteration": 2.8352322578430176 + }, + { + "auxiliary_loss_clip": 0.01394899, + "auxiliary_loss_mlp": 0.01034153, + "balance_loss_clip": 1.23746753, + "balance_loss_mlp": 1.01525831, + "epoch": 0.9660003006162633, + "flos": 20310391428480.0, + "grad_norm": 1.7659189597314102, + "language_loss": 0.82363594, + "learning_rate": 1.209283794752558e-08, + "loss": 0.8479265, + "num_input_tokens_seen": 346712645, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18884277, + "step": 16067, + "time_per_iteration": 4.2677271366119385 + }, + { + "auxiliary_loss_clip": 0.013969, + "auxiliary_loss_mlp": 0.01035438, + "balance_loss_clip": 1.23878217, + "balance_loss_mlp": 1.01696074, + "epoch": 0.9660604238689313, + "flos": 24472582300800.0, + "grad_norm": 3.146586954365524, + "language_loss": 0.70140421, + "learning_rate": 1.2050112322540496e-08, + "loss": 0.72572756, + "num_input_tokens_seen": 346732375, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18481445, + "step": 16068, + "time_per_iteration": 2.8764517307281494 + }, + { + "auxiliary_loss_clip": 0.01367164, + "auxiliary_loss_mlp": 0.01031302, + "balance_loss_clip": 1.2166903, + "balance_loss_mlp": 1.01377845, + "epoch": 0.9661205471215992, + "flos": 19873270984320.0, + "grad_norm": 1.6426073384449833, + "language_loss": 0.68628967, + "learning_rate": 1.20074620808146e-08, + "loss": 0.71027428, + "num_input_tokens_seen": 346750430, + "router_z_loss_clip": 1.5078125, + "router_z_loss_mlp": 0.1751709, + "step": 16069, + "time_per_iteration": 2.83490252494812 + }, + { + "auxiliary_loss_clip": 0.01391617, + "auxiliary_loss_mlp": 0.01032005, + "balance_loss_clip": 1.23392177, + "balance_loss_mlp": 1.01327729, + "epoch": 0.9661806703742672, + "flos": 20567482237440.0, + "grad_norm": 1.7967375478551182, + "language_loss": 0.89603812, + "learning_rate": 1.1964887223964826e-08, + "loss": 0.92027426, + "num_input_tokens_seen": 346768455, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.18725586, + "step": 16070, + "time_per_iteration": 2.912762403488159 + }, + { + "auxiliary_loss_clip": 0.01406899, + "auxiliary_loss_mlp": 0.01038123, + "balance_loss_clip": 1.24670386, + "balance_loss_mlp": 1.01892972, + "epoch": 0.9662407936269353, + "flos": 21440094312960.0, + "grad_norm": 2.866960382325993, + "language_loss": 0.78016722, + "learning_rate": 1.1922387753605878e-08, + "loss": 0.8046174, + "num_input_tokens_seen": 346786530, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19213867, + "step": 16071, + "time_per_iteration": 4.310982704162598 + }, + { + "auxiliary_loss_clip": 0.01396944, + "auxiliary_loss_mlp": 0.01038368, + "balance_loss_clip": 1.23986697, + "balance_loss_mlp": 1.01804304, + "epoch": 0.9663009168796032, + "flos": 14911095219840.0, + "grad_norm": 7.891129304577265, + "language_loss": 0.66860777, + "learning_rate": 1.1879963671349137e-08, + "loss": 0.69296092, + "num_input_tokens_seen": 346804635, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.20324707, + "step": 16072, + "time_per_iteration": 2.8181912899017334 + }, + { + "auxiliary_loss_clip": 0.01412186, + "auxiliary_loss_mlp": 0.01036444, + "balance_loss_clip": 1.25047922, + "balance_loss_mlp": 1.01745343, + "epoch": 0.9663610401322712, + "flos": 24320134010880.0, + "grad_norm": 3.5441121813183813, + "language_loss": 0.78116304, + "learning_rate": 1.1837614978803534e-08, + "loss": 0.80564928, + "num_input_tokens_seen": 346823070, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19006348, + "step": 16073, + "time_per_iteration": 2.9015071392059326 + }, + { + "auxiliary_loss_clip": 0.01414311, + "auxiliary_loss_mlp": 0.01039167, + "balance_loss_clip": 1.25173163, + "balance_loss_mlp": 1.01953316, + "epoch": 0.9664211633849391, + "flos": 17646378243840.0, + "grad_norm": 2.8967082570350016, + "language_loss": 0.76206237, + "learning_rate": 1.1795341677574677e-08, + "loss": 0.78659713, + "num_input_tokens_seen": 346841180, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.19641113, + "step": 16074, + "time_per_iteration": 2.8201541900634766 + }, + { + "auxiliary_loss_clip": 0.01398503, + "auxiliary_loss_mlp": 0.01034063, + "balance_loss_clip": 1.240448, + "balance_loss_mlp": 1.01465559, + "epoch": 0.9664812866376071, + "flos": 29801558810880.0, + "grad_norm": 5.190174682403722, + "language_loss": 0.76045108, + "learning_rate": 1.1753143769265728e-08, + "loss": 0.78477675, + "num_input_tokens_seen": 346864250, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.1940918, + "step": 16075, + "time_per_iteration": 2.8968615531921387 + }, + { + "auxiliary_loss_clip": 0.01401851, + "auxiliary_loss_mlp": 0.01030391, + "balance_loss_clip": 1.24146771, + "balance_loss_mlp": 1.01240182, + "epoch": 0.966541409890275, + "flos": 14290280311680.0, + "grad_norm": 2.6137444300275634, + "language_loss": 0.79649419, + "learning_rate": 1.171102125547696e-08, + "loss": 0.82081664, + "num_input_tokens_seen": 346881955, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.17993164, + "step": 16076, + "time_per_iteration": 2.842754602432251 + }, + { + "auxiliary_loss_clip": 0.01392258, + "auxiliary_loss_mlp": 0.0103683, + "balance_loss_clip": 1.23238659, + "balance_loss_mlp": 1.01721978, + "epoch": 0.9666015331429431, + "flos": 19868746504320.0, + "grad_norm": 1.6638666795591646, + "language_loss": 0.72288322, + "learning_rate": 1.166897413780532e-08, + "loss": 0.74717414, + "num_input_tokens_seen": 346900445, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.19604492, + "step": 16077, + "time_per_iteration": 2.864861249923706 + }, + { + "auxiliary_loss_clip": 0.01391806, + "auxiliary_loss_mlp": 0.01033911, + "balance_loss_clip": 1.23314345, + "balance_loss_mlp": 1.01396728, + "epoch": 0.966661656395611, + "flos": 27137409891840.0, + "grad_norm": 2.397937271229835, + "language_loss": 0.60043252, + "learning_rate": 1.1627002417845533e-08, + "loss": 0.6246897, + "num_input_tokens_seen": 346920135, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.19934082, + "step": 16078, + "time_per_iteration": 4.27131462097168 + }, + { + "auxiliary_loss_clip": 0.01410467, + "auxiliary_loss_mlp": 0.01035643, + "balance_loss_clip": 1.24875784, + "balance_loss_mlp": 1.01683211, + "epoch": 0.966721779648279, + "flos": 21518512830720.0, + "grad_norm": 1.8714424519790533, + "language_loss": 0.72992164, + "learning_rate": 1.158510609718899e-08, + "loss": 0.75438273, + "num_input_tokens_seen": 346940450, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.18798828, + "step": 16079, + "time_per_iteration": 2.8669281005859375 + }, + { + "auxiliary_loss_clip": 0.01384069, + "auxiliary_loss_mlp": 0.01030619, + "balance_loss_clip": 1.22903657, + "balance_loss_mlp": 1.01238036, + "epoch": 0.9667819029009469, + "flos": 23888578677120.0, + "grad_norm": 1.6065279740342007, + "language_loss": 0.72426254, + "learning_rate": 1.1543285177424644e-08, + "loss": 0.74840939, + "num_input_tokens_seen": 346960935, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.18249512, + "step": 16080, + "time_per_iteration": 4.300507545471191 + }, + { + "auxiliary_loss_clip": 0.01389961, + "auxiliary_loss_mlp": 0.01028954, + "balance_loss_clip": 1.23128414, + "balance_loss_mlp": 1.01069069, + "epoch": 0.9668420261536149, + "flos": 21517245976320.0, + "grad_norm": 1.9575334703326828, + "language_loss": 0.74683875, + "learning_rate": 1.1501539660138115e-08, + "loss": 0.7710278, + "num_input_tokens_seen": 346980100, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18261719, + "step": 16081, + "time_per_iteration": 2.8782808780670166 + }, + { + "auxiliary_loss_clip": 0.01394003, + "auxiliary_loss_mlp": 0.01030965, + "balance_loss_clip": 1.2350806, + "balance_loss_mlp": 1.01180768, + "epoch": 0.9669021494062828, + "flos": 26698389166080.0, + "grad_norm": 1.7711745995963897, + "language_loss": 0.67920214, + "learning_rate": 1.145986954691236e-08, + "loss": 0.70345175, + "num_input_tokens_seen": 347001250, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.19165039, + "step": 16082, + "time_per_iteration": 2.9073731899261475 + }, + { + "auxiliary_loss_clip": 0.0139184, + "auxiliary_loss_mlp": 0.01029148, + "balance_loss_clip": 1.23406208, + "balance_loss_mlp": 1.00938284, + "epoch": 0.9669622726589508, + "flos": 29836967506560.0, + "grad_norm": 1.5678568485635798, + "language_loss": 0.77284157, + "learning_rate": 1.141827483932789e-08, + "loss": 0.79705149, + "num_input_tokens_seen": 347022975, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.19763184, + "step": 16083, + "time_per_iteration": 2.8845417499542236 + }, + { + "auxiliary_loss_clip": 0.01402087, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.24211657, + "balance_loss_mlp": 1.01668096, + "epoch": 0.9670223959116189, + "flos": 22930897098240.0, + "grad_norm": 1.9185206390232092, + "language_loss": 0.80190635, + "learning_rate": 1.1376755538961669e-08, + "loss": 0.82628369, + "num_input_tokens_seen": 347038780, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18969727, + "step": 16084, + "time_per_iteration": 2.92252254486084 + }, + { + "auxiliary_loss_clip": 0.01403231, + "auxiliary_loss_mlp": 0.01030454, + "balance_loss_clip": 1.24102437, + "balance_loss_mlp": 1.01195228, + "epoch": 0.9670825191642868, + "flos": 18633677184000.0, + "grad_norm": 3.047341761112542, + "language_loss": 0.69247562, + "learning_rate": 1.1335311647387991e-08, + "loss": 0.71681243, + "num_input_tokens_seen": 347056705, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18493652, + "step": 16085, + "time_per_iteration": 2.837934732437134 + }, + { + "auxiliary_loss_clip": 0.01419796, + "auxiliary_loss_mlp": 0.01035301, + "balance_loss_clip": 1.25535727, + "balance_loss_mlp": 1.01602435, + "epoch": 0.9671426424169548, + "flos": 24508443444480.0, + "grad_norm": 1.794054034540556, + "language_loss": 0.69164264, + "learning_rate": 1.1293943166178709e-08, + "loss": 0.71619362, + "num_input_tokens_seen": 347075710, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.19274902, + "step": 16086, + "time_per_iteration": 2.9011118412017822 + }, + { + "auxiliary_loss_clip": 0.01407803, + "auxiliary_loss_mlp": 0.01033236, + "balance_loss_clip": 1.2481966, + "balance_loss_mlp": 1.01324487, + "epoch": 0.9672027656696227, + "flos": 20379625251840.0, + "grad_norm": 2.1726257373206925, + "language_loss": 0.7907986, + "learning_rate": 1.125265009690235e-08, + "loss": 0.81520903, + "num_input_tokens_seen": 347092325, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19970703, + "step": 16087, + "time_per_iteration": 2.8651180267333984 + }, + { + "auxiliary_loss_clip": 0.01389397, + "auxiliary_loss_mlp": 0.01032477, + "balance_loss_clip": 1.23081517, + "balance_loss_mlp": 1.01495337, + "epoch": 0.9672628889222907, + "flos": 18889501138560.0, + "grad_norm": 1.9089972818311673, + "language_loss": 0.72262824, + "learning_rate": 1.1211432441124769e-08, + "loss": 0.74684697, + "num_input_tokens_seen": 347110595, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.17529297, + "step": 16088, + "time_per_iteration": 2.8264026641845703 + }, + { + "auxiliary_loss_clip": 0.01392544, + "auxiliary_loss_mlp": 0.01032172, + "balance_loss_clip": 1.23721957, + "balance_loss_mlp": 1.01357532, + "epoch": 0.9673230121749586, + "flos": 28707219377280.0, + "grad_norm": 1.4205636674899553, + "language_loss": 0.7136848, + "learning_rate": 1.117029020040916e-08, + "loss": 0.73793197, + "num_input_tokens_seen": 347131625, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.1862793, + "step": 16089, + "time_per_iteration": 2.9089529514312744 + }, + { + "auxiliary_loss_clip": 0.01409302, + "auxiliary_loss_mlp": 0.01031523, + "balance_loss_clip": 1.24848247, + "balance_loss_mlp": 1.01310563, + "epoch": 0.9673831354276267, + "flos": 20493904913280.0, + "grad_norm": 2.714491278621544, + "language_loss": 0.75332355, + "learning_rate": 1.1129223376315167e-08, + "loss": 0.77773184, + "num_input_tokens_seen": 347147910, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18432617, + "step": 16090, + "time_per_iteration": 2.8405590057373047 + }, + { + "auxiliary_loss_clip": 0.01417124, + "auxiliary_loss_mlp": 0.01029712, + "balance_loss_clip": 1.25173104, + "balance_loss_mlp": 1.01081753, + "epoch": 0.9674432586802946, + "flos": 26808913509120.0, + "grad_norm": 1.7070007585083067, + "language_loss": 0.69380897, + "learning_rate": 1.1088231970400653e-08, + "loss": 0.71827734, + "num_input_tokens_seen": 347168805, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.18908691, + "step": 16091, + "time_per_iteration": 2.8920960426330566 + }, + { + "auxiliary_loss_clip": 0.01390514, + "auxiliary_loss_mlp": 0.01037039, + "balance_loss_clip": 1.23389161, + "balance_loss_mlp": 1.01673734, + "epoch": 0.9675033819329626, + "flos": 22321438634880.0, + "grad_norm": 2.4619670106862106, + "language_loss": 0.77802789, + "learning_rate": 1.1047315984219484e-08, + "loss": 0.80230343, + "num_input_tokens_seen": 347189455, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.20300293, + "step": 16092, + "time_per_iteration": 2.927084445953369 + }, + { + "auxiliary_loss_clip": 0.01391954, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.23437786, + "balance_loss_mlp": 1.0143342, + "epoch": 0.9675635051856305, + "flos": 12682935624960.0, + "grad_norm": 2.7871743926161265, + "language_loss": 0.77777374, + "learning_rate": 1.1006475419323313e-08, + "loss": 0.8020162, + "num_input_tokens_seen": 347206030, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.1796875, + "step": 16093, + "time_per_iteration": 2.8511133193969727 + }, + { + "auxiliary_loss_clip": 0.01394537, + "auxiliary_loss_mlp": 0.01029204, + "balance_loss_clip": 1.23539352, + "balance_loss_mlp": 1.0098567, + "epoch": 0.9676236284382985, + "flos": 24619284501120.0, + "grad_norm": 2.7401319165621247, + "language_loss": 0.69869143, + "learning_rate": 1.096571027726112e-08, + "loss": 0.72292888, + "num_input_tokens_seen": 347226250, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.19348145, + "step": 16094, + "time_per_iteration": 2.871044397354126 + }, + { + "auxiliary_loss_clip": 0.01404664, + "auxiliary_loss_mlp": 0.01033543, + "balance_loss_clip": 1.24201417, + "balance_loss_mlp": 1.01481533, + "epoch": 0.9676837516909664, + "flos": 23377428460800.0, + "grad_norm": 1.741388968864354, + "language_loss": 0.76560444, + "learning_rate": 1.0925020559578557e-08, + "loss": 0.78998649, + "num_input_tokens_seen": 347247350, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.18725586, + "step": 16095, + "time_per_iteration": 2.8633015155792236 + }, + { + "auxiliary_loss_clip": 0.01406385, + "auxiliary_loss_mlp": 0.01033054, + "balance_loss_clip": 1.24272156, + "balance_loss_mlp": 1.01418293, + "epoch": 0.9677438749436345, + "flos": 20496981559680.0, + "grad_norm": 1.8779131269138423, + "language_loss": 0.7188561, + "learning_rate": 1.0884406267818392e-08, + "loss": 0.74325049, + "num_input_tokens_seen": 347266870, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.1887207, + "step": 16096, + "time_per_iteration": 2.887155055999756 + }, + { + "auxiliary_loss_clip": 0.0141363, + "auxiliary_loss_mlp": 0.01035711, + "balance_loss_clip": 1.25219941, + "balance_loss_mlp": 1.01642299, + "epoch": 0.9678039981963025, + "flos": 47572107344640.0, + "grad_norm": 1.7124421337653342, + "language_loss": 0.72476614, + "learning_rate": 1.0843867403520946e-08, + "loss": 0.74925959, + "num_input_tokens_seen": 347290120, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19287109, + "step": 16097, + "time_per_iteration": 3.060173749923706 + }, + { + "auxiliary_loss_clip": 0.01395796, + "auxiliary_loss_mlp": 0.01033086, + "balance_loss_clip": 1.23867369, + "balance_loss_mlp": 1.01403642, + "epoch": 0.9678641214489704, + "flos": 25049889694080.0, + "grad_norm": 1.7947219427114232, + "language_loss": 0.78669643, + "learning_rate": 1.0803403968223434e-08, + "loss": 0.81098521, + "num_input_tokens_seen": 347308785, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.19055176, + "step": 16098, + "time_per_iteration": 2.875985860824585 + }, + { + "auxiliary_loss_clip": 0.01381474, + "auxiliary_loss_mlp": 0.01033524, + "balance_loss_clip": 1.22628915, + "balance_loss_mlp": 1.01575017, + "epoch": 0.9679242447016384, + "flos": 19249288940160.0, + "grad_norm": 1.8230476040076373, + "language_loss": 0.90509903, + "learning_rate": 1.0763015963459965e-08, + "loss": 0.92924899, + "num_input_tokens_seen": 347326375, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.17785645, + "step": 16099, + "time_per_iteration": 2.8328845500946045 + }, + { + "auxiliary_loss_clip": 0.0141464, + "auxiliary_loss_mlp": 0.01034649, + "balance_loss_clip": 1.25151038, + "balance_loss_mlp": 1.01434779, + "epoch": 0.9679843679543063, + "flos": 33267502414080.0, + "grad_norm": 1.8034124267981217, + "language_loss": 0.66840422, + "learning_rate": 1.0722703390762643e-08, + "loss": 0.69289708, + "num_input_tokens_seen": 347348250, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.20300293, + "step": 16100, + "time_per_iteration": 2.952063798904419 + }, + { + "auxiliary_loss_clip": 0.01390149, + "auxiliary_loss_mlp": 0.01028952, + "balance_loss_clip": 1.2314775, + "balance_loss_mlp": 1.01059389, + "epoch": 0.9680444912069743, + "flos": 22793832040320.0, + "grad_norm": 1.808082566413832, + "language_loss": 0.73791677, + "learning_rate": 1.0682466251659584e-08, + "loss": 0.76210773, + "num_input_tokens_seen": 347367400, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18371582, + "step": 16101, + "time_per_iteration": 2.852747917175293 + }, + { + "auxiliary_loss_clip": 0.01393444, + "auxiliary_loss_mlp": 0.01031454, + "balance_loss_clip": 1.23463738, + "balance_loss_mlp": 1.01257122, + "epoch": 0.9681046144596422, + "flos": 24034602205440.0, + "grad_norm": 3.02758435764318, + "language_loss": 0.74028927, + "learning_rate": 1.0642304547676672e-08, + "loss": 0.76453829, + "num_input_tokens_seen": 347387600, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.1887207, + "step": 16102, + "time_per_iteration": 4.311879873275757 + }, + { + "auxiliary_loss_clip": 0.01407473, + "auxiliary_loss_mlp": 0.01036792, + "balance_loss_clip": 1.24674416, + "balance_loss_mlp": 1.01664543, + "epoch": 0.9681647377123103, + "flos": 23451548722560.0, + "grad_norm": 1.8157065613753776, + "language_loss": 0.78513151, + "learning_rate": 1.0602218280337139e-08, + "loss": 0.80957419, + "num_input_tokens_seen": 347406915, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.20129395, + "step": 16103, + "time_per_iteration": 2.8557939529418945 + }, + { + "auxiliary_loss_clip": 0.01406784, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.24689305, + "balance_loss_mlp": 1.01443434, + "epoch": 0.9682248609649782, + "flos": 22685479447680.0, + "grad_norm": 1.5325443979707467, + "language_loss": 0.80476439, + "learning_rate": 1.0562207451160655e-08, + "loss": 0.82916617, + "num_input_tokens_seen": 347425140, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18969727, + "step": 16104, + "time_per_iteration": 2.8791444301605225 + }, + { + "auxiliary_loss_clip": 0.0138878, + "auxiliary_loss_mlp": 0.01032777, + "balance_loss_clip": 1.23126054, + "balance_loss_mlp": 1.01429939, + "epoch": 0.9682849842176462, + "flos": 24439797803520.0, + "grad_norm": 1.53026226268362, + "language_loss": 0.78326106, + "learning_rate": 1.0522272061664672e-08, + "loss": 0.80747664, + "num_input_tokens_seen": 347446350, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.18481445, + "step": 16105, + "time_per_iteration": 2.9193274974823 + }, + { + "auxiliary_loss_clip": 0.01183552, + "auxiliary_loss_mlp": 0.0104321, + "balance_loss_clip": 1.09482217, + "balance_loss_mlp": 1.01383686, + "epoch": 0.9683451074703141, + "flos": 60023424983040.0, + "grad_norm": 0.8211507215538079, + "language_loss": 0.56731719, + "learning_rate": 1.0482412113363536e-08, + "loss": 0.58958477, + "num_input_tokens_seen": 347510135, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 0.29296875, + "step": 16106, + "time_per_iteration": 4.7999937534332275 + }, + { + "auxiliary_loss_clip": 0.01182267, + "auxiliary_loss_mlp": 0.01039631, + "balance_loss_clip": 1.09502363, + "balance_loss_mlp": 1.0196991, + "epoch": 0.9684052307229821, + "flos": 52722067587840.0, + "grad_norm": 0.8824386913191324, + "language_loss": 0.6166116, + "learning_rate": 1.0442627607768707e-08, + "loss": 0.63883054, + "num_input_tokens_seen": 347562505, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.19921875, + "step": 16107, + "time_per_iteration": 3.1996593475341797 + }, + { + "auxiliary_loss_clip": 0.01399733, + "auxiliary_loss_mlp": 0.01037214, + "balance_loss_clip": 1.2394917, + "balance_loss_mlp": 1.01877189, + "epoch": 0.96846535397565, + "flos": 22794103509120.0, + "grad_norm": 3.012271564964, + "language_loss": 0.74582839, + "learning_rate": 1.040291854638875e-08, + "loss": 0.77019787, + "num_input_tokens_seen": 347579150, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18457031, + "step": 16108, + "time_per_iteration": 2.8605902194976807 + }, + { + "auxiliary_loss_clip": 0.01404369, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.24251032, + "balance_loss_mlp": 1.01263022, + "epoch": 0.968525477228318, + "flos": 23332518357120.0, + "grad_norm": 3.8223461836127783, + "language_loss": 0.58252043, + "learning_rate": 1.0363284930729576e-08, + "loss": 0.60688484, + "num_input_tokens_seen": 347596705, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19433594, + "step": 16109, + "time_per_iteration": 2.857447862625122 + }, + { + "auxiliary_loss_clip": 0.01179933, + "auxiliary_loss_mlp": 0.01028911, + "balance_loss_clip": 1.0928483, + "balance_loss_mlp": 1.00611782, + "epoch": 0.9685856004809861, + "flos": 67915075029120.0, + "grad_norm": 0.6689177113830288, + "language_loss": 0.54293251, + "learning_rate": 1.0323726762294205e-08, + "loss": 0.56502104, + "num_input_tokens_seen": 347661870, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.22753906, + "step": 16110, + "time_per_iteration": 3.2365925312042236 + }, + { + "auxiliary_loss_clip": 0.01412373, + "auxiliary_loss_mlp": 0.01034004, + "balance_loss_clip": 1.24893773, + "balance_loss_mlp": 1.01398838, + "epoch": 0.968645723733654, + "flos": 33960899260800.0, + "grad_norm": 1.4633542249169573, + "language_loss": 0.63075864, + "learning_rate": 1.0284244042582325e-08, + "loss": 0.65522242, + "num_input_tokens_seen": 347684295, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.20007324, + "step": 16111, + "time_per_iteration": 2.9124701023101807 + }, + { + "auxiliary_loss_clip": 0.01396243, + "auxiliary_loss_mlp": 0.01032167, + "balance_loss_clip": 1.2381264, + "balance_loss_mlp": 1.01334381, + "epoch": 0.968705846986322, + "flos": 18560642797440.0, + "grad_norm": 1.8129688968730455, + "language_loss": 0.75007284, + "learning_rate": 1.024483677309118e-08, + "loss": 0.77435696, + "num_input_tokens_seen": 347702585, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18811035, + "step": 16112, + "time_per_iteration": 2.9002394676208496 + }, + { + "auxiliary_loss_clip": 0.01384106, + "auxiliary_loss_mlp": 0.01033874, + "balance_loss_clip": 1.22871792, + "balance_loss_mlp": 1.01564729, + "epoch": 0.9687659702389899, + "flos": 17429899282560.0, + "grad_norm": 1.8416067503416018, + "language_loss": 0.67547351, + "learning_rate": 1.020550495531558e-08, + "loss": 0.69965339, + "num_input_tokens_seen": 347721810, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.18249512, + "step": 16113, + "time_per_iteration": 4.313498020172119 + }, + { + "auxiliary_loss_clip": 0.01177266, + "auxiliary_loss_mlp": 0.01019765, + "balance_loss_clip": 1.09033465, + "balance_loss_mlp": 0.99859315, + "epoch": 0.9688260934916579, + "flos": 62078296417920.0, + "grad_norm": 0.6960541649119036, + "language_loss": 0.56644547, + "learning_rate": 1.0166248590746329e-08, + "loss": 0.5884158, + "num_input_tokens_seen": 347782330, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.21191406, + "step": 16114, + "time_per_iteration": 3.281243085861206 + }, + { + "auxiliary_loss_clip": 0.01398876, + "auxiliary_loss_mlp": 0.01034858, + "balance_loss_clip": 1.23995805, + "balance_loss_mlp": 1.01582074, + "epoch": 0.9688862167443258, + "flos": 15082483098240.0, + "grad_norm": 1.9554561667894377, + "language_loss": 0.82984507, + "learning_rate": 1.0127067680872458e-08, + "loss": 0.85418248, + "num_input_tokens_seen": 347794835, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.19055176, + "step": 16115, + "time_per_iteration": 4.160271644592285 + }, + { + "auxiliary_loss_clip": 0.01379058, + "auxiliary_loss_mlp": 0.01029583, + "balance_loss_clip": 1.22554719, + "balance_loss_mlp": 1.01127243, + "epoch": 0.9689463399969939, + "flos": 19947798449280.0, + "grad_norm": 1.574532342687924, + "language_loss": 0.72517997, + "learning_rate": 1.0087962227179448e-08, + "loss": 0.74926639, + "num_input_tokens_seen": 347814320, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.18322754, + "step": 16116, + "time_per_iteration": 2.8470888137817383 + }, + { + "auxiliary_loss_clip": 0.01403556, + "auxiliary_loss_mlp": 0.0102817, + "balance_loss_clip": 1.24248838, + "balance_loss_mlp": 1.00981164, + "epoch": 0.9690064632496618, + "flos": 19582445537280.0, + "grad_norm": 1.9864058054616247, + "language_loss": 0.76725149, + "learning_rate": 1.0048932231150553e-08, + "loss": 0.79156876, + "num_input_tokens_seen": 347832125, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18371582, + "step": 16117, + "time_per_iteration": 2.801406145095825 + }, + { + "auxiliary_loss_clip": 0.01397075, + "auxiliary_loss_mlp": 0.0103339, + "balance_loss_clip": 1.23622835, + "balance_loss_mlp": 1.01506805, + "epoch": 0.9690665865023298, + "flos": 21882463153920.0, + "grad_norm": 2.3239879576729674, + "language_loss": 0.77942777, + "learning_rate": 1.000997769426548e-08, + "loss": 0.80373245, + "num_input_tokens_seen": 347850765, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18322754, + "step": 16118, + "time_per_iteration": 2.841005802154541 + }, + { + "auxiliary_loss_clip": 0.01414636, + "auxiliary_loss_mlp": 0.01033165, + "balance_loss_clip": 1.25415361, + "balance_loss_mlp": 1.01453269, + "epoch": 0.9691267097549977, + "flos": 21003878764800.0, + "grad_norm": 1.674769416682097, + "language_loss": 0.78506756, + "learning_rate": 9.971098618001272e-09, + "loss": 0.80954552, + "num_input_tokens_seen": 347870125, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.1862793, + "step": 16119, + "time_per_iteration": 2.849072217941284 + }, + { + "auxiliary_loss_clip": 0.01385104, + "auxiliary_loss_mlp": 0.0103255, + "balance_loss_clip": 1.23083854, + "balance_loss_mlp": 1.01444268, + "epoch": 0.9691868330076657, + "flos": 24289611753600.0, + "grad_norm": 1.3802769080867991, + "language_loss": 0.76099694, + "learning_rate": 9.932295003832747e-09, + "loss": 0.78517354, + "num_input_tokens_seen": 347890615, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.18115234, + "step": 16120, + "time_per_iteration": 2.900120973587036 + }, + { + "auxiliary_loss_clip": 0.01410677, + "auxiliary_loss_mlp": 0.01031037, + "balance_loss_clip": 1.25086176, + "balance_loss_mlp": 1.0127387, + "epoch": 0.9692469562603336, + "flos": 17684592117120.0, + "grad_norm": 1.7856261502677935, + "language_loss": 0.70341176, + "learning_rate": 9.89356685323095e-09, + "loss": 0.72782892, + "num_input_tokens_seen": 347908685, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.1829834, + "step": 16121, + "time_per_iteration": 2.8468096256256104 + }, + { + "auxiliary_loss_clip": 0.01389538, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.23152781, + "balance_loss_mlp": 1.01064229, + "epoch": 0.9693070795130017, + "flos": 26845860528000.0, + "grad_norm": 1.8274096114105445, + "language_loss": 0.69463056, + "learning_rate": 9.854914167664486e-09, + "loss": 0.71882081, + "num_input_tokens_seen": 347926385, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18835449, + "step": 16122, + "time_per_iteration": 2.882077217102051 + }, + { + "auxiliary_loss_clip": 0.01403914, + "auxiliary_loss_mlp": 0.01029646, + "balance_loss_clip": 1.24356437, + "balance_loss_mlp": 1.01193142, + "epoch": 0.9693672027656697, + "flos": 18086349110400.0, + "grad_norm": 1.743857138804114, + "language_loss": 0.76238042, + "learning_rate": 9.81633694859907e-09, + "loss": 0.7867161, + "num_input_tokens_seen": 347945290, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.17724609, + "step": 16123, + "time_per_iteration": 2.8360776901245117 + }, + { + "auxiliary_loss_clip": 0.01397069, + "auxiliary_loss_mlp": 0.01031252, + "balance_loss_clip": 1.23545039, + "balance_loss_mlp": 1.01201177, + "epoch": 0.9694273260183376, + "flos": 21773205665280.0, + "grad_norm": 1.5874243530975334, + "language_loss": 0.75206399, + "learning_rate": 9.777835197497753e-09, + "loss": 0.77634716, + "num_input_tokens_seen": 347966330, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19250488, + "step": 16124, + "time_per_iteration": 2.8346760272979736 + }, + { + "auxiliary_loss_clip": 0.01403048, + "auxiliary_loss_mlp": 0.01033994, + "balance_loss_clip": 1.24231815, + "balance_loss_mlp": 1.01588643, + "epoch": 0.9694874492710056, + "flos": 24436902136320.0, + "grad_norm": 2.5709807418418706, + "language_loss": 0.75441802, + "learning_rate": 9.739408915820258e-09, + "loss": 0.77878845, + "num_input_tokens_seen": 347982590, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18115234, + "step": 16125, + "time_per_iteration": 2.8585658073425293 + }, + { + "auxiliary_loss_clip": 0.01183467, + "auxiliary_loss_mlp": 0.01028926, + "balance_loss_clip": 1.09442663, + "balance_loss_mlp": 1.00527525, + "epoch": 0.9695475725236735, + "flos": 67679367027840.0, + "grad_norm": 0.919609345773292, + "language_loss": 0.61539143, + "learning_rate": 9.70105810502364e-09, + "loss": 0.63751537, + "num_input_tokens_seen": 348043310, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.23632812, + "step": 16126, + "time_per_iteration": 3.289154291152954 + }, + { + "auxiliary_loss_clip": 0.01381031, + "auxiliary_loss_mlp": 0.0103468, + "balance_loss_clip": 1.22632694, + "balance_loss_mlp": 1.01450992, + "epoch": 0.9696076957763415, + "flos": 19137090539520.0, + "grad_norm": 1.6480419698716133, + "language_loss": 0.75184226, + "learning_rate": 9.662782766562738e-09, + "loss": 0.77599937, + "num_input_tokens_seen": 348062200, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.20153809, + "step": 16127, + "time_per_iteration": 2.831860303878784 + }, + { + "auxiliary_loss_clip": 0.01406961, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.24321866, + "balance_loss_mlp": 1.01385784, + "epoch": 0.9696678190290094, + "flos": 15495234577920.0, + "grad_norm": 1.908256089925537, + "language_loss": 0.69864392, + "learning_rate": 9.62458290188839e-09, + "loss": 0.72304416, + "num_input_tokens_seen": 348080685, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19226074, + "step": 16128, + "time_per_iteration": 2.8462777137756348 + }, + { + "auxiliary_loss_clip": 0.01394924, + "auxiliary_loss_mlp": 0.01034118, + "balance_loss_clip": 1.23734891, + "balance_loss_mlp": 1.01477027, + "epoch": 0.9697279422816775, + "flos": 36221933842560.0, + "grad_norm": 1.5783371409266866, + "language_loss": 0.65741873, + "learning_rate": 9.586458512449213e-09, + "loss": 0.68170905, + "num_input_tokens_seen": 348102500, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.19335938, + "step": 16129, + "time_per_iteration": 2.953537940979004 + }, + { + "auxiliary_loss_clip": 0.01421164, + "auxiliary_loss_mlp": 0.0103283, + "balance_loss_clip": 1.25727057, + "balance_loss_mlp": 1.0136497, + "epoch": 0.9697880655343454, + "flos": 25495018467840.0, + "grad_norm": 1.9727734268645263, + "language_loss": 0.63628972, + "learning_rate": 9.548409599691166e-09, + "loss": 0.66082966, + "num_input_tokens_seen": 348122515, + "router_z_loss_clip": 1.63964844, + "router_z_loss_mlp": 0.19165039, + "step": 16130, + "time_per_iteration": 2.9029581546783447 + }, + { + "auxiliary_loss_clip": 0.01417108, + "auxiliary_loss_mlp": 0.01033921, + "balance_loss_clip": 1.25224972, + "balance_loss_mlp": 1.0148232, + "epoch": 0.9698481887870134, + "flos": 15339981110400.0, + "grad_norm": 2.6441444161847203, + "language_loss": 0.71769357, + "learning_rate": 9.510436165056867e-09, + "loss": 0.74220395, + "num_input_tokens_seen": 348138775, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.1907959, + "step": 16131, + "time_per_iteration": 2.8254919052124023 + }, + { + "auxiliary_loss_clip": 0.01417085, + "auxiliary_loss_mlp": 0.01038751, + "balance_loss_clip": 1.25430155, + "balance_loss_mlp": 1.01898623, + "epoch": 0.9699083120396813, + "flos": 21992580293760.0, + "grad_norm": 1.6359206513231148, + "language_loss": 0.77282333, + "learning_rate": 9.472538209986058e-09, + "loss": 0.79738176, + "num_input_tokens_seen": 348157115, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19763184, + "step": 16132, + "time_per_iteration": 2.9412343502044678 + }, + { + "auxiliary_loss_clip": 0.01400854, + "auxiliary_loss_mlp": 0.01038095, + "balance_loss_clip": 1.24147415, + "balance_loss_mlp": 1.0193553, + "epoch": 0.9699684352923493, + "flos": 15671146936320.0, + "grad_norm": 2.8994340530886795, + "language_loss": 0.79560006, + "learning_rate": 9.434715735916477e-09, + "loss": 0.81998956, + "num_input_tokens_seen": 348173035, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18737793, + "step": 16133, + "time_per_iteration": 2.815883159637451 + }, + { + "auxiliary_loss_clip": 0.01393352, + "auxiliary_loss_mlp": 0.0103421, + "balance_loss_clip": 1.23781514, + "balance_loss_mlp": 1.01572108, + "epoch": 0.9700285585450172, + "flos": 21918460032000.0, + "grad_norm": 1.5773113009625812, + "language_loss": 0.65481627, + "learning_rate": 9.396968744281863e-09, + "loss": 0.67909193, + "num_input_tokens_seen": 348192960, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.18493652, + "step": 16134, + "time_per_iteration": 2.8525664806365967 + }, + { + "auxiliary_loss_clip": 0.01402447, + "auxiliary_loss_mlp": 0.01032055, + "balance_loss_clip": 1.24238205, + "balance_loss_mlp": 1.01346993, + "epoch": 0.9700886817976853, + "flos": 23925435206400.0, + "grad_norm": 2.207490908421783, + "language_loss": 0.81528616, + "learning_rate": 9.359297236513519e-09, + "loss": 0.8396312, + "num_input_tokens_seen": 348212805, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.18591309, + "step": 16135, + "time_per_iteration": 2.9292211532592773 + }, + { + "auxiliary_loss_clip": 0.01407368, + "auxiliary_loss_mlp": 0.0103435, + "balance_loss_clip": 1.24484921, + "balance_loss_mlp": 1.01432252, + "epoch": 0.9701488050503532, + "flos": 25458931100160.0, + "grad_norm": 2.046627198563364, + "language_loss": 0.7422033, + "learning_rate": 9.321701214040079e-09, + "loss": 0.76662052, + "num_input_tokens_seen": 348232900, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.20031738, + "step": 16136, + "time_per_iteration": 2.871307134628296 + }, + { + "auxiliary_loss_clip": 0.01385396, + "auxiliary_loss_mlp": 0.01032547, + "balance_loss_clip": 1.22839832, + "balance_loss_mlp": 1.01426053, + "epoch": 0.9702089283030212, + "flos": 20599542817920.0, + "grad_norm": 1.7169481804383706, + "language_loss": 0.77227473, + "learning_rate": 9.28418067828729e-09, + "loss": 0.79645419, + "num_input_tokens_seen": 348253065, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.18273926, + "step": 16137, + "time_per_iteration": 2.8432352542877197 + }, + { + "auxiliary_loss_clip": 0.01179202, + "auxiliary_loss_mlp": 0.01021802, + "balance_loss_clip": 1.0921309, + "balance_loss_mlp": 1.00377798, + "epoch": 0.9702690515556892, + "flos": 70683233040000.0, + "grad_norm": 0.8191662807832778, + "language_loss": 0.54930115, + "learning_rate": 9.246735630678015e-09, + "loss": 0.57131118, + "num_input_tokens_seen": 348316075, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.18066406, + "step": 16138, + "time_per_iteration": 4.854604721069336 + }, + { + "auxiliary_loss_clip": 0.01407801, + "auxiliary_loss_mlp": 0.01028055, + "balance_loss_clip": 1.2469399, + "balance_loss_mlp": 1.01032889, + "epoch": 0.9703291748083571, + "flos": 35894885293440.0, + "grad_norm": 1.6580180868063579, + "language_loss": 0.71960104, + "learning_rate": 9.209366072632007e-09, + "loss": 0.74395961, + "num_input_tokens_seen": 348337605, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.17736816, + "step": 16139, + "time_per_iteration": 2.9524242877960205 + }, + { + "auxiliary_loss_clip": 0.0140734, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.24648309, + "balance_loss_mlp": 1.01316893, + "epoch": 0.9703892980610251, + "flos": 24327192199680.0, + "grad_norm": 1.4788404097581702, + "language_loss": 0.72954118, + "learning_rate": 9.172072005566134e-09, + "loss": 0.75394487, + "num_input_tokens_seen": 348359430, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.1986084, + "step": 16140, + "time_per_iteration": 2.887711524963379 + }, + { + "auxiliary_loss_clip": 0.01396881, + "auxiliary_loss_mlp": 0.01037874, + "balance_loss_clip": 1.23664236, + "balance_loss_mlp": 1.01763225, + "epoch": 0.970449421313693, + "flos": 18012636051840.0, + "grad_norm": 2.202862269899796, + "language_loss": 0.69261819, + "learning_rate": 9.13485343089504e-09, + "loss": 0.71696573, + "num_input_tokens_seen": 348377890, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.20239258, + "step": 16141, + "time_per_iteration": 4.231178045272827 + }, + { + "auxiliary_loss_clip": 0.01388179, + "auxiliary_loss_mlp": 0.01032844, + "balance_loss_clip": 1.23304176, + "balance_loss_mlp": 1.01387811, + "epoch": 0.9705095445663611, + "flos": 25348949694720.0, + "grad_norm": 1.9367402813633896, + "language_loss": 0.69335878, + "learning_rate": 9.097710350029597e-09, + "loss": 0.71756899, + "num_input_tokens_seen": 348396550, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.1895752, + "step": 16142, + "time_per_iteration": 2.8643553256988525 + }, + { + "auxiliary_loss_clip": 0.01398158, + "auxiliary_loss_mlp": 0.01031202, + "balance_loss_clip": 1.23768139, + "balance_loss_mlp": 1.0124265, + "epoch": 0.970569667819029, + "flos": 26844865142400.0, + "grad_norm": 1.8066971458512866, + "language_loss": 0.56038988, + "learning_rate": 9.060642764378457e-09, + "loss": 0.58468348, + "num_input_tokens_seen": 348417120, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18774414, + "step": 16143, + "time_per_iteration": 2.9029433727264404 + }, + { + "auxiliary_loss_clip": 0.01406651, + "auxiliary_loss_mlp": 0.01034578, + "balance_loss_clip": 1.24456954, + "balance_loss_mlp": 1.0159452, + "epoch": 0.970629791071697, + "flos": 25859692707840.0, + "grad_norm": 10.353866906179231, + "language_loss": 0.68830967, + "learning_rate": 9.023650675347382e-09, + "loss": 0.71272194, + "num_input_tokens_seen": 348437750, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.18640137, + "step": 16144, + "time_per_iteration": 2.8985683917999268 + }, + { + "auxiliary_loss_clip": 0.01386336, + "auxiliary_loss_mlp": 0.01032929, + "balance_loss_clip": 1.22983944, + "balance_loss_mlp": 1.01310444, + "epoch": 0.9706899143243649, + "flos": 36554321278080.0, + "grad_norm": 3.0737535430619163, + "language_loss": 0.7219826, + "learning_rate": 8.986734084339253e-09, + "loss": 0.74617529, + "num_input_tokens_seen": 348460935, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.19836426, + "step": 16145, + "time_per_iteration": 2.9813120365142822 + }, + { + "auxiliary_loss_clip": 0.01399379, + "auxiliary_loss_mlp": 0.01033731, + "balance_loss_clip": 1.23782706, + "balance_loss_mlp": 1.01472855, + "epoch": 0.9707500375770329, + "flos": 12273758484480.0, + "grad_norm": 3.0910936234215103, + "language_loss": 0.80543458, + "learning_rate": 8.949892992753395e-09, + "loss": 0.82976562, + "num_input_tokens_seen": 348474480, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18994141, + "step": 16146, + "time_per_iteration": 2.8732059001922607 + }, + { + "auxiliary_loss_clip": 0.01181166, + "auxiliary_loss_mlp": 0.01036012, + "balance_loss_clip": 1.0917027, + "balance_loss_mlp": 1.00778353, + "epoch": 0.9708101608297008, + "flos": 60886417898880.0, + "grad_norm": 0.7701317981547353, + "language_loss": 0.54704314, + "learning_rate": 8.91312740198713e-09, + "loss": 0.56921494, + "num_input_tokens_seen": 348541220, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.28320312, + "step": 16147, + "time_per_iteration": 3.4482955932617188 + }, + { + "auxiliary_loss_clip": 0.01407818, + "auxiliary_loss_mlp": 0.0103608, + "balance_loss_clip": 1.2457298, + "balance_loss_mlp": 1.0172683, + "epoch": 0.9708702840823689, + "flos": 27135916813440.0, + "grad_norm": 3.274700762556912, + "language_loss": 0.62400049, + "learning_rate": 8.876437313434682e-09, + "loss": 0.64843941, + "num_input_tokens_seen": 348559230, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18811035, + "step": 16148, + "time_per_iteration": 4.28760552406311 + }, + { + "auxiliary_loss_clip": 0.01398654, + "auxiliary_loss_mlp": 0.01032402, + "balance_loss_clip": 1.24106729, + "balance_loss_mlp": 1.01379323, + "epoch": 0.9709304073350368, + "flos": 20787580782720.0, + "grad_norm": 1.7725379314285796, + "language_loss": 0.74760616, + "learning_rate": 8.839822728487155e-09, + "loss": 0.77191675, + "num_input_tokens_seen": 348577850, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.18615723, + "step": 16149, + "time_per_iteration": 2.8908510208129883 + }, + { + "auxiliary_loss_clip": 0.01390795, + "auxiliary_loss_mlp": 0.01039129, + "balance_loss_clip": 1.23239422, + "balance_loss_mlp": 1.02024579, + "epoch": 0.9709905305877048, + "flos": 41948640558720.0, + "grad_norm": 2.184690880535293, + "language_loss": 0.76594621, + "learning_rate": 8.803283648533222e-09, + "loss": 0.79024541, + "num_input_tokens_seen": 348598345, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18884277, + "step": 16150, + "time_per_iteration": 4.370849370956421 + }, + { + "auxiliary_loss_clip": 0.01420061, + "auxiliary_loss_mlp": 0.01037856, + "balance_loss_clip": 1.25405979, + "balance_loss_mlp": 1.01636231, + "epoch": 0.9710506538403728, + "flos": 17174527776000.0, + "grad_norm": 2.0281235289193464, + "language_loss": 0.74495029, + "learning_rate": 8.766820074958214e-09, + "loss": 0.76952946, + "num_input_tokens_seen": 348616300, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.21508789, + "step": 16151, + "time_per_iteration": 2.901977300643921 + }, + { + "auxiliary_loss_clip": 0.01387558, + "auxiliary_loss_mlp": 0.01031316, + "balance_loss_clip": 1.23293078, + "balance_loss_mlp": 1.01171803, + "epoch": 0.9711107770930407, + "flos": 21182008118400.0, + "grad_norm": 2.3745417442155614, + "language_loss": 0.75615978, + "learning_rate": 8.730432009145027e-09, + "loss": 0.78034854, + "num_input_tokens_seen": 348633845, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.19604492, + "step": 16152, + "time_per_iteration": 2.894869804382324 + }, + { + "auxiliary_loss_clip": 0.01407166, + "auxiliary_loss_mlp": 0.01034182, + "balance_loss_clip": 1.24797797, + "balance_loss_mlp": 1.01501369, + "epoch": 0.9711709003457087, + "flos": 22247318373120.0, + "grad_norm": 1.6008248734439288, + "language_loss": 0.67648774, + "learning_rate": 8.694119452473448e-09, + "loss": 0.70090127, + "num_input_tokens_seen": 348653070, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.19177246, + "step": 16153, + "time_per_iteration": 2.8772382736206055 + }, + { + "auxiliary_loss_clip": 0.01399873, + "auxiliary_loss_mlp": 0.0103241, + "balance_loss_clip": 1.2403096, + "balance_loss_mlp": 1.01277685, + "epoch": 0.9712310235983767, + "flos": 26225090864640.0, + "grad_norm": 1.748182695136443, + "language_loss": 0.71511364, + "learning_rate": 8.65788240632037e-09, + "loss": 0.73943651, + "num_input_tokens_seen": 348672145, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.19628906, + "step": 16154, + "time_per_iteration": 2.942561149597168 + }, + { + "auxiliary_loss_clip": 0.014095, + "auxiliary_loss_mlp": 0.01037983, + "balance_loss_clip": 1.24635613, + "balance_loss_mlp": 1.01894534, + "epoch": 0.9712911468510447, + "flos": 20678006580480.0, + "grad_norm": 1.8528823554855698, + "language_loss": 0.81855738, + "learning_rate": 8.621720872059812e-09, + "loss": 0.84303218, + "num_input_tokens_seen": 348690615, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.19042969, + "step": 16155, + "time_per_iteration": 2.863548755645752 + }, + { + "auxiliary_loss_clip": 0.01406141, + "auxiliary_loss_mlp": 0.01033671, + "balance_loss_clip": 1.24223876, + "balance_loss_mlp": 1.01290476, + "epoch": 0.9713512701037126, + "flos": 13560750852480.0, + "grad_norm": 11.38890371699834, + "language_loss": 0.67992437, + "learning_rate": 8.58563485106334e-09, + "loss": 0.70432246, + "num_input_tokens_seen": 348708665, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.20788574, + "step": 16156, + "time_per_iteration": 2.7993052005767822 + }, + { + "auxiliary_loss_clip": 0.0140316, + "auxiliary_loss_mlp": 0.01036659, + "balance_loss_clip": 1.23986244, + "balance_loss_mlp": 1.01789546, + "epoch": 0.9714113933563806, + "flos": 25859828442240.0, + "grad_norm": 2.9592552297067627, + "language_loss": 0.91744024, + "learning_rate": 8.54962434469919e-09, + "loss": 0.94183838, + "num_input_tokens_seen": 348726105, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.18774414, + "step": 16157, + "time_per_iteration": 2.8777596950531006 + }, + { + "auxiliary_loss_clip": 0.01412301, + "auxiliary_loss_mlp": 0.01032829, + "balance_loss_clip": 1.25053835, + "balance_loss_mlp": 1.01489949, + "epoch": 0.9714715166090485, + "flos": 12748866577920.0, + "grad_norm": 2.016914392239951, + "language_loss": 0.73006833, + "learning_rate": 8.513689354332721e-09, + "loss": 0.75451958, + "num_input_tokens_seen": 348743360, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.17932129, + "step": 16158, + "time_per_iteration": 2.8247272968292236 + }, + { + "auxiliary_loss_clip": 0.01403615, + "auxiliary_loss_mlp": 0.01037553, + "balance_loss_clip": 1.24500358, + "balance_loss_mlp": 1.01913476, + "epoch": 0.9715316398617165, + "flos": 18414664513920.0, + "grad_norm": 1.9648794728798589, + "language_loss": 0.61782014, + "learning_rate": 8.477829881326836e-09, + "loss": 0.64223182, + "num_input_tokens_seen": 348759045, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.1842041, + "step": 16159, + "time_per_iteration": 2.848052501678467 + }, + { + "auxiliary_loss_clip": 0.01392013, + "auxiliary_loss_mlp": 0.01031525, + "balance_loss_clip": 1.23718333, + "balance_loss_mlp": 1.01342916, + "epoch": 0.9715917631143844, + "flos": 28925734354560.0, + "grad_norm": 1.6737759484025454, + "language_loss": 0.79725099, + "learning_rate": 8.44204592704112e-09, + "loss": 0.82148641, + "num_input_tokens_seen": 348779910, + "router_z_loss_clip": 1.54882812, + "router_z_loss_mlp": 0.18115234, + "step": 16160, + "time_per_iteration": 2.935985803604126 + }, + { + "auxiliary_loss_clip": 0.01177131, + "auxiliary_loss_mlp": 0.0102432, + "balance_loss_clip": 1.09064627, + "balance_loss_mlp": 1.00295734, + "epoch": 0.9716518863670525, + "flos": 65968872900480.0, + "grad_norm": 0.7710650595495714, + "language_loss": 0.54338998, + "learning_rate": 8.406337492832704e-09, + "loss": 0.56540447, + "num_input_tokens_seen": 348838995, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.21386719, + "step": 16161, + "time_per_iteration": 3.372110605239868 + }, + { + "auxiliary_loss_clip": 0.01387921, + "auxiliary_loss_mlp": 0.01032687, + "balance_loss_clip": 1.23148799, + "balance_loss_mlp": 1.01351786, + "epoch": 0.9717120096197204, + "flos": 17721810604800.0, + "grad_norm": 1.8500687523991368, + "language_loss": 0.72437346, + "learning_rate": 8.3707045800554e-09, + "loss": 0.7485795, + "num_input_tokens_seen": 348858090, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.19177246, + "step": 16162, + "time_per_iteration": 2.8383305072784424 + }, + { + "auxiliary_loss_clip": 0.01391795, + "auxiliary_loss_mlp": 0.01030878, + "balance_loss_clip": 1.23374653, + "balance_loss_mlp": 1.01222146, + "epoch": 0.9717721328723884, + "flos": 24474437337600.0, + "grad_norm": 4.972124470773038, + "language_loss": 0.79969335, + "learning_rate": 8.335147190060787e-09, + "loss": 0.82392001, + "num_input_tokens_seen": 348877885, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.18664551, + "step": 16163, + "time_per_iteration": 2.878923177719116 + }, + { + "auxiliary_loss_clip": 0.01393493, + "auxiliary_loss_mlp": 0.01029119, + "balance_loss_clip": 1.23625863, + "balance_loss_mlp": 1.01122582, + "epoch": 0.9718322561250564, + "flos": 20786042459520.0, + "grad_norm": 1.5062141666143194, + "language_loss": 0.73581594, + "learning_rate": 8.299665324196903e-09, + "loss": 0.76004201, + "num_input_tokens_seen": 348897720, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.17907715, + "step": 16164, + "time_per_iteration": 2.9071035385131836 + }, + { + "auxiliary_loss_clip": 0.01405978, + "auxiliary_loss_mlp": 0.01036296, + "balance_loss_clip": 1.24471903, + "balance_loss_mlp": 1.01749718, + "epoch": 0.9718923793777243, + "flos": 19035207953280.0, + "grad_norm": 2.2859496888682056, + "language_loss": 0.85102439, + "learning_rate": 8.264258983809114e-09, + "loss": 0.87544703, + "num_input_tokens_seen": 348915410, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18798828, + "step": 16165, + "time_per_iteration": 2.873432159423828 + }, + { + "auxiliary_loss_clip": 0.01400748, + "auxiliary_loss_mlp": 0.01031745, + "balance_loss_clip": 1.2422173, + "balance_loss_mlp": 1.01492453, + "epoch": 0.9719525026303923, + "flos": 21881739237120.0, + "grad_norm": 2.444406364067279, + "language_loss": 0.80002278, + "learning_rate": 8.228928170240345e-09, + "loss": 0.82434773, + "num_input_tokens_seen": 348934335, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.16821289, + "step": 16166, + "time_per_iteration": 2.9322948455810547 + }, + { + "auxiliary_loss_clip": 0.0139746, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.23839879, + "balance_loss_mlp": 1.01165032, + "epoch": 0.9720126258830603, + "flos": 14437570694400.0, + "grad_norm": 1.7064912762865116, + "language_loss": 0.71911716, + "learning_rate": 8.193672884830195e-09, + "loss": 0.74339104, + "num_input_tokens_seen": 348952405, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18273926, + "step": 16167, + "time_per_iteration": 2.919938802719116 + }, + { + "auxiliary_loss_clip": 0.01396609, + "auxiliary_loss_mlp": 0.01038008, + "balance_loss_clip": 1.23905206, + "balance_loss_mlp": 1.01718211, + "epoch": 0.9720727491357283, + "flos": 26262626065920.0, + "grad_norm": 1.438060051156298, + "language_loss": 0.76344848, + "learning_rate": 8.158493128915812e-09, + "loss": 0.78779465, + "num_input_tokens_seen": 348973580, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.20825195, + "step": 16168, + "time_per_iteration": 2.92417049407959 + }, + { + "auxiliary_loss_clip": 0.01407037, + "auxiliary_loss_mlp": 0.01036222, + "balance_loss_clip": 1.2452122, + "balance_loss_mlp": 1.01679134, + "epoch": 0.9721328723883962, + "flos": 22684257838080.0, + "grad_norm": 2.1096614632035418, + "language_loss": 0.74741149, + "learning_rate": 8.123388903830797e-09, + "loss": 0.77184415, + "num_input_tokens_seen": 348992035, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19421387, + "step": 16169, + "time_per_iteration": 2.8938534259796143 + }, + { + "auxiliary_loss_clip": 0.01416715, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.2526319, + "balance_loss_mlp": 1.01161814, + "epoch": 0.9721929956410642, + "flos": 28085771041920.0, + "grad_norm": 2.0189384694925256, + "language_loss": 0.58227628, + "learning_rate": 8.088360210906309e-09, + "loss": 0.60675323, + "num_input_tokens_seen": 349013160, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.19360352, + "step": 16170, + "time_per_iteration": 3.005073308944702 + }, + { + "auxiliary_loss_clip": 0.01391393, + "auxiliary_loss_mlp": 0.0103314, + "balance_loss_clip": 1.23195815, + "balance_loss_mlp": 1.0136131, + "epoch": 0.9722531188937321, + "flos": 21006095760000.0, + "grad_norm": 1.7957431193786442, + "language_loss": 0.7191757, + "learning_rate": 8.053407051471062e-09, + "loss": 0.74342096, + "num_input_tokens_seen": 349033485, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19519043, + "step": 16171, + "time_per_iteration": 2.9933855533599854 + }, + { + "auxiliary_loss_clip": 0.01399601, + "auxiliary_loss_mlp": 0.01037285, + "balance_loss_clip": 1.23979509, + "balance_loss_mlp": 1.01878405, + "epoch": 0.9723132421464001, + "flos": 16079147712000.0, + "grad_norm": 1.7035127794905218, + "language_loss": 0.69373339, + "learning_rate": 8.018529426850218e-09, + "loss": 0.71810222, + "num_input_tokens_seen": 349051705, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18493652, + "step": 16172, + "time_per_iteration": 2.8956363201141357 + }, + { + "auxiliary_loss_clip": 0.01392457, + "auxiliary_loss_mlp": 0.0103113, + "balance_loss_clip": 1.2361145, + "balance_loss_mlp": 1.0119493, + "epoch": 0.972373365399068, + "flos": 27757274659200.0, + "grad_norm": 1.7455273221982708, + "language_loss": 0.87024099, + "learning_rate": 7.983727338366274e-09, + "loss": 0.89447689, + "num_input_tokens_seen": 349070825, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.19177246, + "step": 16173, + "time_per_iteration": 4.394887208938599 + }, + { + "auxiliary_loss_clip": 0.01423447, + "auxiliary_loss_mlp": 0.01037756, + "balance_loss_clip": 1.25771356, + "balance_loss_mlp": 1.018659, + "epoch": 0.9724334886517361, + "flos": 23013297158400.0, + "grad_norm": 1.8232491244186737, + "language_loss": 0.65366638, + "learning_rate": 7.949000787339289e-09, + "loss": 0.67827839, + "num_input_tokens_seen": 349089730, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.19104004, + "step": 16174, + "time_per_iteration": 2.871086835861206 + }, + { + "auxiliary_loss_clip": 0.0139429, + "auxiliary_loss_mlp": 0.0103521, + "balance_loss_clip": 1.23761308, + "balance_loss_mlp": 1.0166254, + "epoch": 0.972493611904404, + "flos": 25457935714560.0, + "grad_norm": 1.4266629316949673, + "language_loss": 0.78825581, + "learning_rate": 7.914349775085538e-09, + "loss": 0.81255078, + "num_input_tokens_seen": 349111315, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.18591309, + "step": 16175, + "time_per_iteration": 2.9415509700775146 + }, + { + "auxiliary_loss_clip": 0.0139138, + "auxiliary_loss_mlp": 0.01034387, + "balance_loss_clip": 1.23313189, + "balance_loss_mlp": 1.01549196, + "epoch": 0.972553735157072, + "flos": 16991919187200.0, + "grad_norm": 2.337708969734513, + "language_loss": 0.58365387, + "learning_rate": 7.879774302919307e-09, + "loss": 0.60791153, + "num_input_tokens_seen": 349129495, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.18896484, + "step": 16176, + "time_per_iteration": 2.8378376960754395 + }, + { + "auxiliary_loss_clip": 0.01410109, + "auxiliary_loss_mlp": 0.01029163, + "balance_loss_clip": 1.25174236, + "balance_loss_mlp": 1.0109601, + "epoch": 0.97261385840974, + "flos": 26115923865600.0, + "grad_norm": 2.398149540032479, + "language_loss": 0.72868901, + "learning_rate": 7.845274372151545e-09, + "loss": 0.75308174, + "num_input_tokens_seen": 349148850, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18188477, + "step": 16177, + "time_per_iteration": 4.339169025421143 + }, + { + "auxiliary_loss_clip": 0.01399962, + "auxiliary_loss_mlp": 0.01031335, + "balance_loss_clip": 1.23931813, + "balance_loss_mlp": 1.01301301, + "epoch": 0.9726739816624079, + "flos": 25458976344960.0, + "grad_norm": 1.688287123052307, + "language_loss": 0.6960299, + "learning_rate": 7.810849984090984e-09, + "loss": 0.72034287, + "num_input_tokens_seen": 349167620, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18322754, + "step": 16178, + "time_per_iteration": 2.9026010036468506 + }, + { + "auxiliary_loss_clip": 0.0140856, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.24605691, + "balance_loss_mlp": 1.01218247, + "epoch": 0.972734104915076, + "flos": 29024721273600.0, + "grad_norm": 2.912708929601516, + "language_loss": 0.6825428, + "learning_rate": 7.776501140042358e-09, + "loss": 0.70693535, + "num_input_tokens_seen": 349185845, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.18530273, + "step": 16179, + "time_per_iteration": 2.8754777908325195 + }, + { + "auxiliary_loss_clip": 0.01386497, + "auxiliary_loss_mlp": 0.01034309, + "balance_loss_clip": 1.23008192, + "balance_loss_mlp": 1.01590347, + "epoch": 0.9727942281677439, + "flos": 23447295711360.0, + "grad_norm": 1.9384759626842905, + "language_loss": 0.78078777, + "learning_rate": 7.742227841308624e-09, + "loss": 0.80499578, + "num_input_tokens_seen": 349204525, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.18408203, + "step": 16180, + "time_per_iteration": 2.8491365909576416 + }, + { + "auxiliary_loss_clip": 0.01409768, + "auxiliary_loss_mlp": 0.0103257, + "balance_loss_clip": 1.2443186, + "balance_loss_mlp": 1.01292467, + "epoch": 0.9728543514204119, + "flos": 31737626104320.0, + "grad_norm": 1.6002077579190386, + "language_loss": 0.77564657, + "learning_rate": 7.708030089189188e-09, + "loss": 0.80006993, + "num_input_tokens_seen": 349228075, + "router_z_loss_clip": 1.65429688, + "router_z_loss_mlp": 0.1965332, + "step": 16181, + "time_per_iteration": 2.9964916706085205 + }, + { + "auxiliary_loss_clip": 0.01397109, + "auxiliary_loss_mlp": 0.01036364, + "balance_loss_clip": 1.23837626, + "balance_loss_mlp": 1.01755261, + "epoch": 0.9729144746730798, + "flos": 16297934158080.0, + "grad_norm": 1.4442251740758172, + "language_loss": 0.64438868, + "learning_rate": 7.67390788498079e-09, + "loss": 0.6687234, + "num_input_tokens_seen": 349246990, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18798828, + "step": 16182, + "time_per_iteration": 2.870171070098877 + }, + { + "auxiliary_loss_clip": 0.0140339, + "auxiliary_loss_mlp": 0.01031173, + "balance_loss_clip": 1.24295688, + "balance_loss_mlp": 1.01251698, + "epoch": 0.9729745979257478, + "flos": 25051066058880.0, + "grad_norm": 4.186218822970891, + "language_loss": 0.6372323, + "learning_rate": 7.639861229977507e-09, + "loss": 0.66157794, + "num_input_tokens_seen": 349265890, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18664551, + "step": 16183, + "time_per_iteration": 4.293447732925415 + }, + { + "auxiliary_loss_clip": 0.01390044, + "auxiliary_loss_mlp": 0.01035024, + "balance_loss_clip": 1.23275256, + "balance_loss_mlp": 1.01531875, + "epoch": 0.9730347211784157, + "flos": 22649165856000.0, + "grad_norm": 1.605978408181502, + "language_loss": 0.78499818, + "learning_rate": 7.605890125470527e-09, + "loss": 0.80924892, + "num_input_tokens_seen": 349285275, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.19714355, + "step": 16184, + "time_per_iteration": 2.884580135345459 + }, + { + "auxiliary_loss_clip": 0.01393506, + "auxiliary_loss_mlp": 0.01030422, + "balance_loss_clip": 1.23500478, + "balance_loss_mlp": 1.0125649, + "epoch": 0.9730948444310837, + "flos": 11005723687680.0, + "grad_norm": 2.192501169555215, + "language_loss": 0.80106372, + "learning_rate": 7.571994572747709e-09, + "loss": 0.82530302, + "num_input_tokens_seen": 349301515, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.17858887, + "step": 16185, + "time_per_iteration": 4.429924964904785 + }, + { + "auxiliary_loss_clip": 0.01413406, + "auxiliary_loss_mlp": 0.01036718, + "balance_loss_clip": 1.25150871, + "balance_loss_mlp": 1.01757264, + "epoch": 0.9731549676837516, + "flos": 16807998499200.0, + "grad_norm": 1.6620523291253477, + "language_loss": 0.78685713, + "learning_rate": 7.538174573094469e-09, + "loss": 0.81135845, + "num_input_tokens_seen": 349319590, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19140625, + "step": 16186, + "time_per_iteration": 2.887716293334961 + }, + { + "auxiliary_loss_clip": 0.01399059, + "auxiliary_loss_mlp": 0.01034118, + "balance_loss_clip": 1.24032736, + "balance_loss_mlp": 1.01533055, + "epoch": 0.9732150909364197, + "flos": 21151531105920.0, + "grad_norm": 1.7076633108454966, + "language_loss": 0.66242653, + "learning_rate": 7.504430127793337e-09, + "loss": 0.68675828, + "num_input_tokens_seen": 349339230, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18774414, + "step": 16187, + "time_per_iteration": 2.878744602203369 + }, + { + "auxiliary_loss_clip": 0.01396341, + "auxiliary_loss_mlp": 0.01030536, + "balance_loss_clip": 1.23905444, + "balance_loss_mlp": 1.01177239, + "epoch": 0.9732752141890876, + "flos": 33739850574720.0, + "grad_norm": 1.6480616104857082, + "language_loss": 0.80716914, + "learning_rate": 7.47076123812418e-09, + "loss": 0.83143795, + "num_input_tokens_seen": 349361155, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18737793, + "step": 16188, + "time_per_iteration": 3.018159866333008 + }, + { + "auxiliary_loss_clip": 0.01387847, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.23287892, + "balance_loss_mlp": 1.01278472, + "epoch": 0.9733353374417556, + "flos": 23414873172480.0, + "grad_norm": 1.846467092536917, + "language_loss": 0.79017508, + "learning_rate": 7.437167905363084e-09, + "loss": 0.81436121, + "num_input_tokens_seen": 349379335, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.17980957, + "step": 16189, + "time_per_iteration": 2.9806342124938965 + }, + { + "auxiliary_loss_clip": 0.0138764, + "auxiliary_loss_mlp": 0.01029878, + "balance_loss_clip": 1.22873425, + "balance_loss_mlp": 1.01155591, + "epoch": 0.9733954606944236, + "flos": 39180256323840.0, + "grad_norm": 1.9181958403720214, + "language_loss": 0.51775908, + "learning_rate": 7.403650130784367e-09, + "loss": 0.54193425, + "num_input_tokens_seen": 349401575, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18322754, + "step": 16190, + "time_per_iteration": 2.9808578491210938 + }, + { + "auxiliary_loss_clip": 0.01396786, + "auxiliary_loss_mlp": 0.01033766, + "balance_loss_clip": 1.23768115, + "balance_loss_mlp": 1.01460946, + "epoch": 0.9734555839470915, + "flos": 21992037356160.0, + "grad_norm": 1.6935672236447623, + "language_loss": 0.81525922, + "learning_rate": 7.3702079156590105e-09, + "loss": 0.83956474, + "num_input_tokens_seen": 349420650, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.19165039, + "step": 16191, + "time_per_iteration": 2.888597249984741 + }, + { + "auxiliary_loss_clip": 0.01405075, + "auxiliary_loss_mlp": 0.01031313, + "balance_loss_clip": 1.24598646, + "balance_loss_mlp": 1.01305008, + "epoch": 0.9735157071997596, + "flos": 16582696801920.0, + "grad_norm": 1.6776807692401956, + "language_loss": 0.83014822, + "learning_rate": 7.336841261255111e-09, + "loss": 0.8545121, + "num_input_tokens_seen": 349436830, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18249512, + "step": 16192, + "time_per_iteration": 2.7913994789123535 + }, + { + "auxiliary_loss_clip": 0.01423379, + "auxiliary_loss_mlp": 0.01032507, + "balance_loss_clip": 1.26255071, + "balance_loss_mlp": 1.01419616, + "epoch": 0.9735758304524275, + "flos": 20231113259520.0, + "grad_norm": 8.688198350150344, + "language_loss": 0.75586408, + "learning_rate": 7.303550168837658e-09, + "loss": 0.78042293, + "num_input_tokens_seen": 349454325, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.1829834, + "step": 16193, + "time_per_iteration": 2.8233678340911865 + }, + { + "auxiliary_loss_clip": 0.01386178, + "auxiliary_loss_mlp": 0.01029272, + "balance_loss_clip": 1.23112178, + "balance_loss_mlp": 1.01203442, + "epoch": 0.9736359537050955, + "flos": 23662236349440.0, + "grad_norm": 2.471811740770899, + "language_loss": 0.85558796, + "learning_rate": 7.270334639669417e-09, + "loss": 0.8797425, + "num_input_tokens_seen": 349470230, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.17248535, + "step": 16194, + "time_per_iteration": 2.8938090801239014 + }, + { + "auxiliary_loss_clip": 0.01378681, + "auxiliary_loss_mlp": 0.01032352, + "balance_loss_clip": 1.22568142, + "balance_loss_mlp": 1.01363635, + "epoch": 0.9736960769577634, + "flos": 15568721412480.0, + "grad_norm": 1.706143341946828, + "language_loss": 0.76520085, + "learning_rate": 7.237194675009828e-09, + "loss": 0.78931117, + "num_input_tokens_seen": 349486250, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.18701172, + "step": 16195, + "time_per_iteration": 2.8149304389953613 + }, + { + "auxiliary_loss_clip": 0.01181568, + "auxiliary_loss_mlp": 0.01029609, + "balance_loss_clip": 1.09322906, + "balance_loss_mlp": 1.00548124, + "epoch": 0.9737562002104314, + "flos": 65379711369600.0, + "grad_norm": 0.7074616399574122, + "language_loss": 0.52466005, + "learning_rate": 7.204130276115439e-09, + "loss": 0.54677182, + "num_input_tokens_seen": 349545865, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.24121094, + "step": 16196, + "time_per_iteration": 3.28844952583313 + }, + { + "auxiliary_loss_clip": 0.01406918, + "auxiliary_loss_mlp": 0.01028448, + "balance_loss_clip": 1.24657178, + "balance_loss_mlp": 1.01041186, + "epoch": 0.9738163234630993, + "flos": 27207639100800.0, + "grad_norm": 1.8495417814219874, + "language_loss": 0.76825392, + "learning_rate": 7.171141444240136e-09, + "loss": 0.79260755, + "num_input_tokens_seen": 349566080, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18041992, + "step": 16197, + "time_per_iteration": 2.933150291442871 + }, + { + "auxiliary_loss_clip": 0.01415841, + "auxiliary_loss_mlp": 0.01031425, + "balance_loss_clip": 1.25017273, + "balance_loss_mlp": 1.01235175, + "epoch": 0.9738764467157673, + "flos": 21079265880960.0, + "grad_norm": 2.2631821713131135, + "language_loss": 0.68064821, + "learning_rate": 7.13822818063492e-09, + "loss": 0.70512092, + "num_input_tokens_seen": 349585665, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.19067383, + "step": 16198, + "time_per_iteration": 2.8958942890167236 + }, + { + "auxiliary_loss_clip": 0.01396805, + "auxiliary_loss_mlp": 0.01032996, + "balance_loss_clip": 1.23626709, + "balance_loss_mlp": 1.01417303, + "epoch": 0.9739365699684353, + "flos": 21371177203200.0, + "grad_norm": 1.7695226381433766, + "language_loss": 0.78979659, + "learning_rate": 7.10539048654768e-09, + "loss": 0.81409454, + "num_input_tokens_seen": 349605125, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18811035, + "step": 16199, + "time_per_iteration": 2.8626868724823 + }, + { + "auxiliary_loss_clip": 0.01401144, + "auxiliary_loss_mlp": 0.01034395, + "balance_loss_clip": 1.24122453, + "balance_loss_mlp": 1.01564324, + "epoch": 0.9739966932211033, + "flos": 21910496947200.0, + "grad_norm": 1.6931131618083486, + "language_loss": 0.79610562, + "learning_rate": 7.072628363223865e-09, + "loss": 0.82046103, + "num_input_tokens_seen": 349623360, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18737793, + "step": 16200, + "time_per_iteration": 2.8704047203063965 + }, + { + "auxiliary_loss_clip": 0.01425846, + "auxiliary_loss_mlp": 0.01033701, + "balance_loss_clip": 1.25831926, + "balance_loss_mlp": 1.01434135, + "epoch": 0.9740568164737712, + "flos": 24838206681600.0, + "grad_norm": 2.4207356307047596, + "language_loss": 0.69446558, + "learning_rate": 7.039941811905592e-09, + "loss": 0.71906108, + "num_input_tokens_seen": 349644390, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.19360352, + "step": 16201, + "time_per_iteration": 2.9265036582946777 + }, + { + "auxiliary_loss_clip": 0.01403276, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.24258566, + "balance_loss_mlp": 1.01595306, + "epoch": 0.9741169397264392, + "flos": 23634202556160.0, + "grad_norm": 1.472758522532452, + "language_loss": 0.73378354, + "learning_rate": 7.0073308338325364e-09, + "loss": 0.75816476, + "num_input_tokens_seen": 349663200, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18908691, + "step": 16202, + "time_per_iteration": 2.8630495071411133 + }, + { + "auxiliary_loss_clip": 0.01403997, + "auxiliary_loss_mlp": 0.01032291, + "balance_loss_clip": 1.24271309, + "balance_loss_mlp": 1.01327658, + "epoch": 0.9741770629791072, + "flos": 18849794186880.0, + "grad_norm": 1.9187673095500724, + "language_loss": 0.73372418, + "learning_rate": 6.974795430241265e-09, + "loss": 0.75808704, + "num_input_tokens_seen": 349681975, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.19018555, + "step": 16203, + "time_per_iteration": 2.9457216262817383 + }, + { + "auxiliary_loss_clip": 0.01396368, + "auxiliary_loss_mlp": 0.01032767, + "balance_loss_clip": 1.23582304, + "balance_loss_mlp": 1.01400304, + "epoch": 0.9742371862317751, + "flos": 22356440127360.0, + "grad_norm": 1.6933981274254448, + "language_loss": 0.7791431, + "learning_rate": 6.942335602365235e-09, + "loss": 0.80343443, + "num_input_tokens_seen": 349701185, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18762207, + "step": 16204, + "time_per_iteration": 2.852281093597412 + }, + { + "auxiliary_loss_clip": 0.01399442, + "auxiliary_loss_mlp": 0.01033724, + "balance_loss_clip": 1.23916149, + "balance_loss_mlp": 1.01388788, + "epoch": 0.9742973094844432, + "flos": 21772888951680.0, + "grad_norm": 1.9565868481430708, + "language_loss": 0.80493987, + "learning_rate": 6.909951351435905e-09, + "loss": 0.82927155, + "num_input_tokens_seen": 349720360, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19812012, + "step": 16205, + "time_per_iteration": 2.868943452835083 + }, + { + "auxiliary_loss_clip": 0.01381985, + "auxiliary_loss_mlp": 0.01033046, + "balance_loss_clip": 1.2244494, + "balance_loss_mlp": 1.01480746, + "epoch": 0.9743574327371111, + "flos": 26259549419520.0, + "grad_norm": 1.6140809808490482, + "language_loss": 0.75416082, + "learning_rate": 6.87764267868074e-09, + "loss": 0.77831113, + "num_input_tokens_seen": 349741040, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.18225098, + "step": 16206, + "time_per_iteration": 2.890082836151123 + }, + { + "auxiliary_loss_clip": 0.01396937, + "auxiliary_loss_mlp": 0.01031911, + "balance_loss_clip": 1.23538339, + "balance_loss_mlp": 1.01293325, + "epoch": 0.9744175559897791, + "flos": 12356972951040.0, + "grad_norm": 2.616995658724922, + "language_loss": 0.85339075, + "learning_rate": 6.8454095853252015e-09, + "loss": 0.87767923, + "num_input_tokens_seen": 349758895, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.18981934, + "step": 16207, + "time_per_iteration": 2.8434536457061768 + }, + { + "auxiliary_loss_clip": 0.01387325, + "auxiliary_loss_mlp": 0.01034675, + "balance_loss_clip": 1.23129988, + "balance_loss_mlp": 1.01638842, + "epoch": 0.974477679242447, + "flos": 28408295111040.0, + "grad_norm": 1.8354607390285151, + "language_loss": 0.71170712, + "learning_rate": 6.813252072591425e-09, + "loss": 0.7359271, + "num_input_tokens_seen": 349779740, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.18286133, + "step": 16208, + "time_per_iteration": 2.980886459350586 + }, + { + "auxiliary_loss_clip": 0.01387756, + "auxiliary_loss_mlp": 0.01030006, + "balance_loss_clip": 1.23510623, + "balance_loss_mlp": 1.01200497, + "epoch": 0.974537802495115, + "flos": 17794347298560.0, + "grad_norm": 1.8465179202310829, + "language_loss": 0.77952158, + "learning_rate": 6.781170141698878e-09, + "loss": 0.80369914, + "num_input_tokens_seen": 349796820, + "router_z_loss_clip": 1.52539062, + "router_z_loss_mlp": 0.18017578, + "step": 16209, + "time_per_iteration": 4.369826316833496 + }, + { + "auxiliary_loss_clip": 0.01401165, + "auxiliary_loss_mlp": 0.01033295, + "balance_loss_clip": 1.2389369, + "balance_loss_mlp": 1.01363707, + "epoch": 0.9745979257477829, + "flos": 23852943757440.0, + "grad_norm": 1.7628109002935457, + "language_loss": 0.80297321, + "learning_rate": 6.749163793864144e-09, + "loss": 0.82731783, + "num_input_tokens_seen": 349816550, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19665527, + "step": 16210, + "time_per_iteration": 2.9210939407348633 + }, + { + "auxiliary_loss_clip": 0.01395293, + "auxiliary_loss_mlp": 0.01035212, + "balance_loss_clip": 1.23562253, + "balance_loss_mlp": 1.01677001, + "epoch": 0.9746580490004509, + "flos": 27027518976000.0, + "grad_norm": 5.833557764098546, + "language_loss": 0.7836495, + "learning_rate": 6.7172330303009176e-09, + "loss": 0.80795455, + "num_input_tokens_seen": 349834350, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18444824, + "step": 16211, + "time_per_iteration": 4.362037420272827 + }, + { + "auxiliary_loss_clip": 0.01419458, + "auxiliary_loss_mlp": 0.0103759, + "balance_loss_clip": 1.25431347, + "balance_loss_mlp": 1.01786065, + "epoch": 0.9747181722531189, + "flos": 19801503452160.0, + "grad_norm": 2.656654433560433, + "language_loss": 0.78689891, + "learning_rate": 6.685377852219787e-09, + "loss": 0.81146944, + "num_input_tokens_seen": 349853460, + "router_z_loss_clip": 1.64941406, + "router_z_loss_mlp": 0.19726562, + "step": 16212, + "time_per_iteration": 2.8346025943756104 + }, + { + "auxiliary_loss_clip": 0.01385796, + "auxiliary_loss_mlp": 0.01031969, + "balance_loss_clip": 1.23035717, + "balance_loss_mlp": 1.01333702, + "epoch": 0.9747782955057869, + "flos": 31443000094080.0, + "grad_norm": 5.171784234346739, + "language_loss": 0.81228447, + "learning_rate": 6.653598260829118e-09, + "loss": 0.83646214, + "num_input_tokens_seen": 349874830, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.18640137, + "step": 16213, + "time_per_iteration": 2.9361579418182373 + }, + { + "auxiliary_loss_clip": 0.01387757, + "auxiliary_loss_mlp": 0.01028312, + "balance_loss_clip": 1.22946405, + "balance_loss_mlp": 1.00948846, + "epoch": 0.9748384187584548, + "flos": 15969844978560.0, + "grad_norm": 2.125745941676953, + "language_loss": 0.67558849, + "learning_rate": 6.6218942573335044e-09, + "loss": 0.69974911, + "num_input_tokens_seen": 349893690, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18823242, + "step": 16214, + "time_per_iteration": 2.84736967086792 + }, + { + "auxiliary_loss_clip": 0.01407096, + "auxiliary_loss_mlp": 0.01035245, + "balance_loss_clip": 1.24332762, + "balance_loss_mlp": 1.0158143, + "epoch": 0.9748985420111228, + "flos": 20568522867840.0, + "grad_norm": 1.6699869323972791, + "language_loss": 0.74999416, + "learning_rate": 6.5902658429355386e-09, + "loss": 0.77441764, + "num_input_tokens_seen": 349912480, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19433594, + "step": 16215, + "time_per_iteration": 2.8607492446899414 + }, + { + "auxiliary_loss_clip": 0.01393204, + "auxiliary_loss_mlp": 0.01031189, + "balance_loss_clip": 1.23356605, + "balance_loss_mlp": 1.01224637, + "epoch": 0.9749586652637908, + "flos": 36735527278080.0, + "grad_norm": 1.8037485293047941, + "language_loss": 0.6773867, + "learning_rate": 6.558713018834483e-09, + "loss": 0.70163065, + "num_input_tokens_seen": 349932470, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18945312, + "step": 16216, + "time_per_iteration": 2.9663803577423096 + }, + { + "auxiliary_loss_clip": 0.01411899, + "auxiliary_loss_mlp": 0.01035204, + "balance_loss_clip": 1.24894452, + "balance_loss_mlp": 1.01586819, + "epoch": 0.9750187885164587, + "flos": 11006492849280.0, + "grad_norm": 2.4189545047443866, + "language_loss": 0.72628176, + "learning_rate": 6.527235786226937e-09, + "loss": 0.75075281, + "num_input_tokens_seen": 349949060, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19335938, + "step": 16217, + "time_per_iteration": 2.847182512283325 + }, + { + "auxiliary_loss_clip": 0.01387582, + "auxiliary_loss_mlp": 0.01030427, + "balance_loss_clip": 1.22993314, + "balance_loss_mlp": 1.01212835, + "epoch": 0.9750789117691268, + "flos": 25750616198400.0, + "grad_norm": 1.5486590404623812, + "language_loss": 0.78813589, + "learning_rate": 6.495834146306167e-09, + "loss": 0.81231606, + "num_input_tokens_seen": 349968010, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18310547, + "step": 16218, + "time_per_iteration": 2.899005174636841 + }, + { + "auxiliary_loss_clip": 0.01381324, + "auxiliary_loss_mlp": 0.01034417, + "balance_loss_clip": 1.22496283, + "balance_loss_mlp": 1.01542664, + "epoch": 0.9751390350217947, + "flos": 13341738182400.0, + "grad_norm": 1.965379575628166, + "language_loss": 0.78334588, + "learning_rate": 6.464508100263222e-09, + "loss": 0.80750328, + "num_input_tokens_seen": 349985270, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.18981934, + "step": 16219, + "time_per_iteration": 4.202998876571655 + }, + { + "auxiliary_loss_clip": 0.0140014, + "auxiliary_loss_mlp": 0.0103413, + "balance_loss_clip": 1.23970747, + "balance_loss_mlp": 1.01580787, + "epoch": 0.9751991582744627, + "flos": 22830960038400.0, + "grad_norm": 1.7172042899904458, + "language_loss": 0.82175243, + "learning_rate": 6.433257649285817e-09, + "loss": 0.84609509, + "num_input_tokens_seen": 350003935, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18334961, + "step": 16220, + "time_per_iteration": 4.2648303508758545 + }, + { + "auxiliary_loss_clip": 0.01386374, + "auxiliary_loss_mlp": 0.01033493, + "balance_loss_clip": 1.22951591, + "balance_loss_mlp": 1.01503992, + "epoch": 0.9752592815271306, + "flos": 19655660903040.0, + "grad_norm": 2.3936098096802625, + "language_loss": 0.7584582, + "learning_rate": 6.402082794559227e-09, + "loss": 0.78265691, + "num_input_tokens_seen": 350023595, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.18457031, + "step": 16221, + "time_per_iteration": 2.858649253845215 + }, + { + "auxiliary_loss_clip": 0.01388027, + "auxiliary_loss_mlp": 0.01032488, + "balance_loss_clip": 1.23152876, + "balance_loss_mlp": 1.012079, + "epoch": 0.9753194047797986, + "flos": 26702415953280.0, + "grad_norm": 1.5485150335561055, + "language_loss": 0.66974056, + "learning_rate": 6.370983537265395e-09, + "loss": 0.69394565, + "num_input_tokens_seen": 350045920, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.20422363, + "step": 16222, + "time_per_iteration": 2.9350554943084717 + }, + { + "auxiliary_loss_clip": 0.01395326, + "auxiliary_loss_mlp": 0.01031294, + "balance_loss_clip": 1.23777139, + "balance_loss_mlp": 1.01309037, + "epoch": 0.9753795280324665, + "flos": 23232174094080.0, + "grad_norm": 2.4871438006927558, + "language_loss": 0.89199436, + "learning_rate": 6.3399598785836004e-09, + "loss": 0.91626048, + "num_input_tokens_seen": 350063925, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.18200684, + "step": 16223, + "time_per_iteration": 2.919252872467041 + }, + { + "auxiliary_loss_clip": 0.01392785, + "auxiliary_loss_mlp": 0.0103718, + "balance_loss_clip": 1.23706627, + "balance_loss_mlp": 1.01851213, + "epoch": 0.9754396512851345, + "flos": 19473278538240.0, + "grad_norm": 1.7031893695150015, + "language_loss": 0.75100362, + "learning_rate": 6.309011819690457e-09, + "loss": 0.7753033, + "num_input_tokens_seen": 350080900, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.18664551, + "step": 16224, + "time_per_iteration": 2.8274567127227783 + }, + { + "auxiliary_loss_clip": 0.01181938, + "auxiliary_loss_mlp": 0.01022529, + "balance_loss_clip": 1.0939827, + "balance_loss_mlp": 1.00030851, + "epoch": 0.9754997745378025, + "flos": 68489667734400.0, + "grad_norm": 0.809790010652841, + "language_loss": 0.59106421, + "learning_rate": 6.278139361759249e-09, + "loss": 0.61310887, + "num_input_tokens_seen": 350144550, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.22265625, + "step": 16225, + "time_per_iteration": 3.369819402694702 + }, + { + "auxiliary_loss_clip": 0.01393989, + "auxiliary_loss_mlp": 0.01035795, + "balance_loss_clip": 1.23639035, + "balance_loss_mlp": 1.0163641, + "epoch": 0.9755598977904705, + "flos": 26406161130240.0, + "grad_norm": 1.660901904940292, + "language_loss": 0.69417632, + "learning_rate": 6.247342505960818e-09, + "loss": 0.71847409, + "num_input_tokens_seen": 350164050, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.1940918, + "step": 16226, + "time_per_iteration": 2.9045512676239014 + }, + { + "auxiliary_loss_clip": 0.01393198, + "auxiliary_loss_mlp": 0.01033338, + "balance_loss_clip": 1.23428881, + "balance_loss_mlp": 1.01486015, + "epoch": 0.9756200210431384, + "flos": 16626294806400.0, + "grad_norm": 1.6553515189316224, + "language_loss": 0.83610439, + "learning_rate": 6.216621253462894e-09, + "loss": 0.86036968, + "num_input_tokens_seen": 350181350, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18481445, + "step": 16227, + "time_per_iteration": 2.8254475593566895 + }, + { + "auxiliary_loss_clip": 0.01395432, + "auxiliary_loss_mlp": 0.01031823, + "balance_loss_clip": 1.23790789, + "balance_loss_mlp": 1.01254702, + "epoch": 0.9756801442958064, + "flos": 23633523884160.0, + "grad_norm": 2.1377362337290817, + "language_loss": 0.78233647, + "learning_rate": 6.185975605430549e-09, + "loss": 0.80660903, + "num_input_tokens_seen": 350199765, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.19262695, + "step": 16228, + "time_per_iteration": 2.894794225692749 + }, + { + "auxiliary_loss_clip": 0.01183604, + "auxiliary_loss_mlp": 0.01047832, + "balance_loss_clip": 1.09400022, + "balance_loss_mlp": 1.01884079, + "epoch": 0.9757402675484744, + "flos": 61651609539840.0, + "grad_norm": 0.8433921415933072, + "language_loss": 0.55795377, + "learning_rate": 6.155405563025962e-09, + "loss": 0.5802682, + "num_input_tokens_seen": 350256420, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.2890625, + "step": 16229, + "time_per_iteration": 3.2694520950317383 + }, + { + "auxiliary_loss_clip": 0.0140099, + "auxiliary_loss_mlp": 0.01035517, + "balance_loss_clip": 1.24201107, + "balance_loss_mlp": 1.01656318, + "epoch": 0.9758003908011423, + "flos": 24069060760320.0, + "grad_norm": 1.9137128941278372, + "language_loss": 0.75584525, + "learning_rate": 6.124911127407984e-09, + "loss": 0.78021032, + "num_input_tokens_seen": 350276270, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.1895752, + "step": 16230, + "time_per_iteration": 2.872739315032959 + }, + { + "auxiliary_loss_clip": 0.0138069, + "auxiliary_loss_mlp": 0.01031521, + "balance_loss_clip": 1.22612679, + "balance_loss_mlp": 1.01385438, + "epoch": 0.9758605140538104, + "flos": 17501893038720.0, + "grad_norm": 1.9213139971288582, + "language_loss": 0.72959417, + "learning_rate": 6.094492299733245e-09, + "loss": 0.75371623, + "num_input_tokens_seen": 350295000, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.17663574, + "step": 16231, + "time_per_iteration": 2.8657314777374268 + }, + { + "auxiliary_loss_clip": 0.01417613, + "auxiliary_loss_mlp": 0.01030861, + "balance_loss_clip": 1.25340331, + "balance_loss_mlp": 1.01210928, + "epoch": 0.9759206373064783, + "flos": 24837708988800.0, + "grad_norm": 1.7597077618921848, + "language_loss": 0.77006012, + "learning_rate": 6.064149081155267e-09, + "loss": 0.79454488, + "num_input_tokens_seen": 350314980, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.1875, + "step": 16232, + "time_per_iteration": 2.920801877975464 + }, + { + "auxiliary_loss_clip": 0.01179753, + "auxiliary_loss_mlp": 0.01024766, + "balance_loss_clip": 1.09231985, + "balance_loss_mlp": 1.00750434, + "epoch": 0.9759807605591463, + "flos": 68189838572160.0, + "grad_norm": 0.7413605561446295, + "language_loss": 0.53795445, + "learning_rate": 6.033881472824465e-09, + "loss": 0.5599997, + "num_input_tokens_seen": 350371985, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.17285156, + "step": 16233, + "time_per_iteration": 3.132338285446167 + }, + { + "auxiliary_loss_clip": 0.01390204, + "auxiliary_loss_mlp": 0.01030215, + "balance_loss_clip": 1.23274112, + "balance_loss_mlp": 1.01171327, + "epoch": 0.9760408838118142, + "flos": 18998487158400.0, + "grad_norm": 1.727365289369547, + "language_loss": 0.72425121, + "learning_rate": 6.003689475888807e-09, + "loss": 0.74845541, + "num_input_tokens_seen": 350390590, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18493652, + "step": 16234, + "time_per_iteration": 2.863232135772705 + }, + { + "auxiliary_loss_clip": 0.01407126, + "auxiliary_loss_mlp": 0.01032248, + "balance_loss_clip": 1.24245358, + "balance_loss_mlp": 1.0134728, + "epoch": 0.9761010070644822, + "flos": 17134096907520.0, + "grad_norm": 2.4193343479413674, + "language_loss": 0.80037463, + "learning_rate": 5.973573091493156e-09, + "loss": 0.82476836, + "num_input_tokens_seen": 350403770, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.18774414, + "step": 16235, + "time_per_iteration": 2.818143844604492 + }, + { + "auxiliary_loss_clip": 0.01393661, + "auxiliary_loss_mlp": 0.01033012, + "balance_loss_clip": 1.23678589, + "balance_loss_mlp": 1.01379573, + "epoch": 0.9761611303171501, + "flos": 22062266565120.0, + "grad_norm": 1.840192077337459, + "language_loss": 0.7751019, + "learning_rate": 5.943532320779265e-09, + "loss": 0.79936862, + "num_input_tokens_seen": 350421870, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.19226074, + "step": 16236, + "time_per_iteration": 2.8439910411834717 + }, + { + "auxiliary_loss_clip": 0.01399687, + "auxiliary_loss_mlp": 0.01033141, + "balance_loss_clip": 1.24014604, + "balance_loss_mlp": 1.01518774, + "epoch": 0.9762212535698181, + "flos": 21766283210880.0, + "grad_norm": 1.8844866378489251, + "language_loss": 0.76393127, + "learning_rate": 5.913567164886446e-09, + "loss": 0.78825957, + "num_input_tokens_seen": 350440025, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.17944336, + "step": 16237, + "time_per_iteration": 2.8670175075531006 + }, + { + "auxiliary_loss_clip": 0.01396019, + "auxiliary_loss_mlp": 0.01035566, + "balance_loss_clip": 1.23503792, + "balance_loss_mlp": 1.01617074, + "epoch": 0.9762813768224861, + "flos": 25932681849600.0, + "grad_norm": 1.825793769446491, + "language_loss": 0.73374444, + "learning_rate": 5.8836776249509e-09, + "loss": 0.75806034, + "num_input_tokens_seen": 350459435, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.19384766, + "step": 16238, + "time_per_iteration": 2.8664515018463135 + }, + { + "auxiliary_loss_clip": 0.01397738, + "auxiliary_loss_mlp": 0.01033958, + "balance_loss_clip": 1.23776472, + "balance_loss_mlp": 1.01537311, + "epoch": 0.9763415000751541, + "flos": 24060102289920.0, + "grad_norm": 2.093123273615315, + "language_loss": 0.85013598, + "learning_rate": 5.8538637021063875e-09, + "loss": 0.87445295, + "num_input_tokens_seen": 350472655, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18591309, + "step": 16239, + "time_per_iteration": 2.8302385807037354 + }, + { + "auxiliary_loss_clip": 0.01402226, + "auxiliary_loss_mlp": 0.01032375, + "balance_loss_clip": 1.24071336, + "balance_loss_mlp": 1.01392126, + "epoch": 0.976401623327822, + "flos": 17027327882880.0, + "grad_norm": 2.5934576512047225, + "language_loss": 0.60408556, + "learning_rate": 5.824125397483115e-09, + "loss": 0.62843168, + "num_input_tokens_seen": 350488160, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18457031, + "step": 16240, + "time_per_iteration": 2.8622426986694336 + }, + { + "auxiliary_loss_clip": 0.01396649, + "auxiliary_loss_mlp": 0.01031399, + "balance_loss_clip": 1.24004614, + "balance_loss_mlp": 1.01287413, + "epoch": 0.97646174658049, + "flos": 16115461303680.0, + "grad_norm": 1.7835460985714808, + "language_loss": 0.83519226, + "learning_rate": 5.7944627122088474e-09, + "loss": 0.85947275, + "num_input_tokens_seen": 350506065, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.18505859, + "step": 16241, + "time_per_iteration": 2.9623098373413086 + }, + { + "auxiliary_loss_clip": 0.01391964, + "auxiliary_loss_mlp": 0.01039349, + "balance_loss_clip": 1.23174107, + "balance_loss_mlp": 1.02059793, + "epoch": 0.9765218698331579, + "flos": 21262915100160.0, + "grad_norm": 8.495715279383578, + "language_loss": 0.84197247, + "learning_rate": 5.764875647408463e-09, + "loss": 0.86628556, + "num_input_tokens_seen": 350524495, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18762207, + "step": 16242, + "time_per_iteration": 2.9112656116485596 + }, + { + "auxiliary_loss_clip": 0.01400583, + "auxiliary_loss_mlp": 0.01030441, + "balance_loss_clip": 1.24086618, + "balance_loss_mlp": 1.01208282, + "epoch": 0.9765819930858259, + "flos": 18597227857920.0, + "grad_norm": 1.5390580955374455, + "language_loss": 0.76907206, + "learning_rate": 5.7353642042037294e-09, + "loss": 0.79338229, + "num_input_tokens_seen": 350544185, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18359375, + "step": 16243, + "time_per_iteration": 2.865661859512329 + }, + { + "auxiliary_loss_clip": 0.01389975, + "auxiliary_loss_mlp": 0.01036998, + "balance_loss_clip": 1.23138714, + "balance_loss_mlp": 1.01751935, + "epoch": 0.976642116338494, + "flos": 20276566300800.0, + "grad_norm": 1.6996965514701532, + "language_loss": 0.70855284, + "learning_rate": 5.705928383713754e-09, + "loss": 0.73282254, + "num_input_tokens_seen": 350562675, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.19482422, + "step": 16244, + "time_per_iteration": 4.36298680305481 + }, + { + "auxiliary_loss_clip": 0.0140427, + "auxiliary_loss_mlp": 0.0103414, + "balance_loss_clip": 1.24185324, + "balance_loss_mlp": 1.01501894, + "epoch": 0.9767022395911619, + "flos": 25559818300800.0, + "grad_norm": 1.8130657932466356, + "language_loss": 0.84038353, + "learning_rate": 5.676568187055197e-09, + "loss": 0.86476755, + "num_input_tokens_seen": 350581535, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19116211, + "step": 16245, + "time_per_iteration": 2.90073561668396 + }, + { + "auxiliary_loss_clip": 0.01394927, + "auxiliary_loss_mlp": 0.01032099, + "balance_loss_clip": 1.23677707, + "balance_loss_mlp": 1.01370478, + "epoch": 0.9767623628438299, + "flos": 21772753217280.0, + "grad_norm": 1.304184589534147, + "language_loss": 0.79026985, + "learning_rate": 5.647283615340726e-09, + "loss": 0.81454009, + "num_input_tokens_seen": 350601615, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18395996, + "step": 16246, + "time_per_iteration": 4.307973384857178 + }, + { + "auxiliary_loss_clip": 0.01371437, + "auxiliary_loss_mlp": 0.01030135, + "balance_loss_clip": 1.22127235, + "balance_loss_mlp": 1.01280165, + "epoch": 0.9768224860964978, + "flos": 15859456369920.0, + "grad_norm": 1.3224820187807351, + "language_loss": 0.74769616, + "learning_rate": 5.6180746696812275e-09, + "loss": 0.77171189, + "num_input_tokens_seen": 350619580, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.17346191, + "step": 16247, + "time_per_iteration": 2.8672025203704834 + }, + { + "auxiliary_loss_clip": 0.01400293, + "auxiliary_loss_mlp": 0.0103098, + "balance_loss_clip": 1.24097741, + "balance_loss_mlp": 1.01257455, + "epoch": 0.9768826093491658, + "flos": 25160911729920.0, + "grad_norm": 1.5486620970180436, + "language_loss": 0.80441558, + "learning_rate": 5.58894135118404e-09, + "loss": 0.82872832, + "num_input_tokens_seen": 350640015, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.18408203, + "step": 16248, + "time_per_iteration": 2.9788942337036133 + }, + { + "auxiliary_loss_clip": 0.01410111, + "auxiliary_loss_mlp": 0.01045071, + "balance_loss_clip": 1.2480197, + "balance_loss_mlp": 1.02443624, + "epoch": 0.9769427326018337, + "flos": 22977526504320.0, + "grad_norm": 3.9066961865791074, + "language_loss": 0.79958737, + "learning_rate": 5.559883660954278e-09, + "loss": 0.82413918, + "num_input_tokens_seen": 350659155, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.2064209, + "step": 16249, + "time_per_iteration": 2.9217536449432373 + }, + { + "auxiliary_loss_clip": 0.01391741, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.23632061, + "balance_loss_mlp": 1.01560783, + "epoch": 0.9770028558545018, + "flos": 15271697427840.0, + "grad_norm": 2.4152044783732194, + "language_loss": 0.67622209, + "learning_rate": 5.530901600093507e-09, + "loss": 0.70048332, + "num_input_tokens_seen": 350676615, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.18786621, + "step": 16250, + "time_per_iteration": 2.823637008666992 + }, + { + "auxiliary_loss_clip": 0.01182215, + "auxiliary_loss_mlp": 0.01026509, + "balance_loss_clip": 1.09327328, + "balance_loss_mlp": 1.00171304, + "epoch": 0.9770629791071697, + "flos": 71481272405760.0, + "grad_norm": 0.7854921579859312, + "language_loss": 0.59936517, + "learning_rate": 5.501995169700846e-09, + "loss": 0.62145245, + "num_input_tokens_seen": 350736805, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.24707031, + "step": 16251, + "time_per_iteration": 3.383244752883911 + }, + { + "auxiliary_loss_clip": 0.01401337, + "auxiliary_loss_mlp": 0.01036197, + "balance_loss_clip": 1.2419765, + "balance_loss_mlp": 1.01641989, + "epoch": 0.9771231023598377, + "flos": 22421420939520.0, + "grad_norm": 1.728371907105927, + "language_loss": 0.79088134, + "learning_rate": 5.473164370872307e-09, + "loss": 0.81525666, + "num_input_tokens_seen": 350753600, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.19775391, + "step": 16252, + "time_per_iteration": 2.846653699874878 + }, + { + "auxiliary_loss_clip": 0.01388662, + "auxiliary_loss_mlp": 0.01032886, + "balance_loss_clip": 1.23144317, + "balance_loss_mlp": 1.01363349, + "epoch": 0.9771832256125056, + "flos": 19035253198080.0, + "grad_norm": 3.8430409177887026, + "language_loss": 0.65023601, + "learning_rate": 5.444409204701461e-09, + "loss": 0.67445147, + "num_input_tokens_seen": 350771225, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.19250488, + "step": 16253, + "time_per_iteration": 2.8675317764282227 + }, + { + "auxiliary_loss_clip": 0.014093, + "auxiliary_loss_mlp": 0.0103496, + "balance_loss_clip": 1.24931467, + "balance_loss_mlp": 1.01464713, + "epoch": 0.9772433488651736, + "flos": 17831203827840.0, + "grad_norm": 3.3532893645595387, + "language_loss": 0.77566469, + "learning_rate": 5.415729672278324e-09, + "loss": 0.8001073, + "num_input_tokens_seen": 350789100, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.203125, + "step": 16254, + "time_per_iteration": 4.289041996002197 + }, + { + "auxiliary_loss_clip": 0.01401249, + "auxiliary_loss_mlp": 0.01035115, + "balance_loss_clip": 1.23959076, + "balance_loss_mlp": 1.01624405, + "epoch": 0.9773034721178415, + "flos": 37642914622080.0, + "grad_norm": 3.074352354795752, + "language_loss": 0.64868402, + "learning_rate": 5.387125774690471e-09, + "loss": 0.67304766, + "num_input_tokens_seen": 350811085, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.1887207, + "step": 16255, + "time_per_iteration": 4.5104851722717285 + }, + { + "auxiliary_loss_clip": 0.01405521, + "auxiliary_loss_mlp": 0.01033752, + "balance_loss_clip": 1.24209261, + "balance_loss_mlp": 1.01281917, + "epoch": 0.9773635953705095, + "flos": 20311974996480.0, + "grad_norm": 1.5835759663280684, + "language_loss": 0.75949782, + "learning_rate": 5.358597513023033e-09, + "loss": 0.78389055, + "num_input_tokens_seen": 350831065, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.20922852, + "step": 16256, + "time_per_iteration": 2.835880756378174 + }, + { + "auxiliary_loss_clip": 0.01393416, + "auxiliary_loss_mlp": 0.01039481, + "balance_loss_clip": 1.23732936, + "balance_loss_mlp": 1.01957285, + "epoch": 0.9774237186231776, + "flos": 22319312129280.0, + "grad_norm": 2.34766087713655, + "language_loss": 0.78350526, + "learning_rate": 5.330144888357369e-09, + "loss": 0.80783415, + "num_input_tokens_seen": 350849675, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.19909668, + "step": 16257, + "time_per_iteration": 2.8585243225097656 + }, + { + "auxiliary_loss_clip": 0.01391353, + "auxiliary_loss_mlp": 0.01033108, + "balance_loss_clip": 1.23297632, + "balance_loss_mlp": 1.01433265, + "epoch": 0.9774838418758455, + "flos": 24215039043840.0, + "grad_norm": 1.6719429121673341, + "language_loss": 0.75457954, + "learning_rate": 5.301767901772391e-09, + "loss": 0.77882421, + "num_input_tokens_seen": 350868955, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18774414, + "step": 16258, + "time_per_iteration": 2.8921289443969727 + }, + { + "auxiliary_loss_clip": 0.01182335, + "auxiliary_loss_mlp": 0.01039249, + "balance_loss_clip": 1.09375525, + "balance_loss_mlp": 1.01617038, + "epoch": 0.9775439651285135, + "flos": 66390582850560.0, + "grad_norm": 0.6827868734239715, + "language_loss": 0.59880227, + "learning_rate": 5.273466554344353e-09, + "loss": 0.62101811, + "num_input_tokens_seen": 350935110, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 0.23046875, + "step": 16259, + "time_per_iteration": 3.444063186645508 + }, + { + "auxiliary_loss_clip": 0.01412937, + "auxiliary_loss_mlp": 0.01036337, + "balance_loss_clip": 1.25008583, + "balance_loss_mlp": 1.01696563, + "epoch": 0.9776040883811814, + "flos": 22611811633920.0, + "grad_norm": 1.72763536980597, + "language_loss": 0.74273479, + "learning_rate": 5.2452408471461705e-09, + "loss": 0.76722753, + "num_input_tokens_seen": 350953220, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19372559, + "step": 16260, + "time_per_iteration": 2.901092290878296 + }, + { + "auxiliary_loss_clip": 0.01393638, + "auxiliary_loss_mlp": 0.01033974, + "balance_loss_clip": 1.23596776, + "balance_loss_mlp": 1.01399422, + "epoch": 0.9776642116338494, + "flos": 18451340064000.0, + "grad_norm": 4.705948366504652, + "language_loss": 0.79720318, + "learning_rate": 5.2170907812485456e-09, + "loss": 0.82147932, + "num_input_tokens_seen": 350971915, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.1998291, + "step": 16261, + "time_per_iteration": 2.8695197105407715 + }, + { + "auxiliary_loss_clip": 0.01415709, + "auxiliary_loss_mlp": 0.01031404, + "balance_loss_clip": 1.25335109, + "balance_loss_mlp": 1.01306987, + "epoch": 0.9777243348865173, + "flos": 22648803897600.0, + "grad_norm": 2.603208981775957, + "language_loss": 0.75115401, + "learning_rate": 5.189016357718845e-09, + "loss": 0.77562511, + "num_input_tokens_seen": 350990470, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.18334961, + "step": 16262, + "time_per_iteration": 2.882128953933716 + }, + { + "auxiliary_loss_clip": 0.01407047, + "auxiliary_loss_mlp": 0.0103282, + "balance_loss_clip": 1.24638486, + "balance_loss_mlp": 1.01307917, + "epoch": 0.9777844581391854, + "flos": 31333833095040.0, + "grad_norm": 2.039669284673078, + "language_loss": 0.70707583, + "learning_rate": 5.16101757762133e-09, + "loss": 0.73147446, + "num_input_tokens_seen": 351010755, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.19750977, + "step": 16263, + "time_per_iteration": 2.926440477371216 + }, + { + "auxiliary_loss_clip": 0.01408276, + "auxiliary_loss_mlp": 0.01033307, + "balance_loss_clip": 1.24738753, + "balance_loss_mlp": 1.01499629, + "epoch": 0.9778445813918533, + "flos": 23049384526080.0, + "grad_norm": 1.5599774349677595, + "language_loss": 0.66591328, + "learning_rate": 5.133094442018038e-09, + "loss": 0.69032913, + "num_input_tokens_seen": 351029965, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.18310547, + "step": 16264, + "time_per_iteration": 2.869438648223877 + }, + { + "auxiliary_loss_clip": 0.01411716, + "auxiliary_loss_mlp": 0.01030873, + "balance_loss_clip": 1.24635291, + "balance_loss_mlp": 1.01215708, + "epoch": 0.9779047046445213, + "flos": 17575244138880.0, + "grad_norm": 7.776731454926342, + "language_loss": 0.74027693, + "learning_rate": 5.105246951967679e-09, + "loss": 0.7647028, + "num_input_tokens_seen": 351046205, + "router_z_loss_clip": 1.65527344, + "router_z_loss_mlp": 0.18725586, + "step": 16265, + "time_per_iteration": 2.8278932571411133 + }, + { + "auxiliary_loss_clip": 0.01388681, + "auxiliary_loss_mlp": 0.01035719, + "balance_loss_clip": 1.23267794, + "balance_loss_mlp": 1.01687205, + "epoch": 0.9779648278971892, + "flos": 20750995722240.0, + "grad_norm": 1.7885565132865664, + "language_loss": 0.69299114, + "learning_rate": 5.077475108526297e-09, + "loss": 0.71723515, + "num_input_tokens_seen": 351065390, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.18859863, + "step": 16266, + "time_per_iteration": 2.895054817199707 + }, + { + "auxiliary_loss_clip": 0.01382117, + "auxiliary_loss_mlp": 0.01029265, + "balance_loss_clip": 1.22856843, + "balance_loss_mlp": 1.0114429, + "epoch": 0.9780249511498572, + "flos": 21035305918080.0, + "grad_norm": 1.5747832703190197, + "language_loss": 0.87301457, + "learning_rate": 5.049778912747049e-09, + "loss": 0.89712834, + "num_input_tokens_seen": 351084355, + "router_z_loss_clip": 1.53613281, + "router_z_loss_mlp": 0.17822266, + "step": 16267, + "time_per_iteration": 2.8854143619537354 + }, + { + "auxiliary_loss_clip": 0.01405007, + "auxiliary_loss_mlp": 0.01034112, + "balance_loss_clip": 1.24290586, + "balance_loss_mlp": 1.01410842, + "epoch": 0.9780850744025251, + "flos": 30786550266240.0, + "grad_norm": 2.2147559568329256, + "language_loss": 0.710051, + "learning_rate": 5.022158365679985e-09, + "loss": 0.73444217, + "num_input_tokens_seen": 351105870, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.20019531, + "step": 16268, + "time_per_iteration": 2.9728877544403076 + }, + { + "auxiliary_loss_clip": 0.01402904, + "auxiliary_loss_mlp": 0.01029316, + "balance_loss_clip": 1.2433989, + "balance_loss_mlp": 1.01049244, + "epoch": 0.9781451976551931, + "flos": 20312653668480.0, + "grad_norm": 2.419186640616248, + "language_loss": 0.74268174, + "learning_rate": 4.994613468372711e-09, + "loss": 0.76700389, + "num_input_tokens_seen": 351124760, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18823242, + "step": 16269, + "time_per_iteration": 2.872086524963379 + }, + { + "auxiliary_loss_clip": 0.01400546, + "auxiliary_loss_mlp": 0.01037991, + "balance_loss_clip": 1.23992479, + "balance_loss_mlp": 1.01777339, + "epoch": 0.9782053209078612, + "flos": 24327192199680.0, + "grad_norm": 1.7754473883981208, + "language_loss": 0.71359897, + "learning_rate": 4.967144221869501e-09, + "loss": 0.73798436, + "num_input_tokens_seen": 351142820, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.20214844, + "step": 16270, + "time_per_iteration": 2.877346992492676 + }, + { + "auxiliary_loss_clip": 0.01400866, + "auxiliary_loss_mlp": 0.01031868, + "balance_loss_clip": 1.24066472, + "balance_loss_mlp": 1.01314068, + "epoch": 0.9782654441605291, + "flos": 32502292790400.0, + "grad_norm": 1.7363722386286664, + "language_loss": 0.6492542, + "learning_rate": 4.939750627212191e-09, + "loss": 0.6735816, + "num_input_tokens_seen": 351164805, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18725586, + "step": 16271, + "time_per_iteration": 2.9763550758361816 + }, + { + "auxiliary_loss_clip": 0.01388013, + "auxiliary_loss_mlp": 0.01035262, + "balance_loss_clip": 1.23356962, + "balance_loss_mlp": 1.01676047, + "epoch": 0.9783255674131971, + "flos": 26990255243520.0, + "grad_norm": 1.4493076697749112, + "language_loss": 0.70685893, + "learning_rate": 4.912432685439505e-09, + "loss": 0.73109168, + "num_input_tokens_seen": 351187005, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.18505859, + "step": 16272, + "time_per_iteration": 2.8863015174865723 + }, + { + "auxiliary_loss_clip": 0.01400979, + "auxiliary_loss_mlp": 0.01034062, + "balance_loss_clip": 1.23959529, + "balance_loss_mlp": 1.0157516, + "epoch": 0.978385690665865, + "flos": 23122599891840.0, + "grad_norm": 1.9122153088387575, + "language_loss": 0.67346054, + "learning_rate": 4.88519039758728e-09, + "loss": 0.69781095, + "num_input_tokens_seen": 351208450, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18322754, + "step": 16273, + "time_per_iteration": 2.8705954551696777 + }, + { + "auxiliary_loss_clip": 0.01399683, + "auxiliary_loss_mlp": 0.01031227, + "balance_loss_clip": 1.23985147, + "balance_loss_mlp": 1.01221299, + "epoch": 0.978445813918533, + "flos": 25420310023680.0, + "grad_norm": 1.6421877777147773, + "language_loss": 0.74522758, + "learning_rate": 4.85802376468869e-09, + "loss": 0.76953673, + "num_input_tokens_seen": 351229585, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.19006348, + "step": 16274, + "time_per_iteration": 3.0399856567382812 + }, + { + "auxiliary_loss_clip": 0.01396188, + "auxiliary_loss_mlp": 0.01035654, + "balance_loss_clip": 1.23755348, + "balance_loss_mlp": 1.01737905, + "epoch": 0.9785059371712009, + "flos": 23560715721600.0, + "grad_norm": 1.5648886168141092, + "language_loss": 0.78297043, + "learning_rate": 4.830932787773579e-09, + "loss": 0.80728883, + "num_input_tokens_seen": 351249525, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.18286133, + "step": 16275, + "time_per_iteration": 2.9320003986358643 + }, + { + "auxiliary_loss_clip": 0.01400434, + "auxiliary_loss_mlp": 0.01030808, + "balance_loss_clip": 1.24000216, + "balance_loss_mlp": 1.01199675, + "epoch": 0.978566060423869, + "flos": 34364782759680.0, + "grad_norm": 1.5556579106374215, + "language_loss": 0.71560961, + "learning_rate": 4.803917467869567e-09, + "loss": 0.73992199, + "num_input_tokens_seen": 351272530, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18798828, + "step": 16276, + "time_per_iteration": 2.9695611000061035 + }, + { + "auxiliary_loss_clip": 0.0137354, + "auxiliary_loss_mlp": 0.01032544, + "balance_loss_clip": 1.21849656, + "balance_loss_mlp": 1.01378024, + "epoch": 0.9786261836765369, + "flos": 11626131392640.0, + "grad_norm": 2.001994431329105, + "language_loss": 0.86880094, + "learning_rate": 4.776977806000726e-09, + "loss": 0.89286178, + "num_input_tokens_seen": 351288530, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.18762207, + "step": 16277, + "time_per_iteration": 2.800727367401123 + }, + { + "auxiliary_loss_clip": 0.01392511, + "auxiliary_loss_mlp": 0.01033544, + "balance_loss_clip": 1.23637843, + "balance_loss_mlp": 1.01443458, + "epoch": 0.9786863069292049, + "flos": 17429944527360.0, + "grad_norm": 1.6643018985640836, + "language_loss": 0.71468437, + "learning_rate": 4.7501138031891264e-09, + "loss": 0.73894489, + "num_input_tokens_seen": 351305890, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.19091797, + "step": 16278, + "time_per_iteration": 2.850175142288208 + }, + { + "auxiliary_loss_clip": 0.01395832, + "auxiliary_loss_mlp": 0.01032082, + "balance_loss_clip": 1.23797011, + "balance_loss_mlp": 1.01299644, + "epoch": 0.9787464301818728, + "flos": 20853330756480.0, + "grad_norm": 2.1174342178591363, + "language_loss": 0.84468228, + "learning_rate": 4.723325460453065e-09, + "loss": 0.86896133, + "num_input_tokens_seen": 351325010, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.19091797, + "step": 16279, + "time_per_iteration": 4.308953046798706 + }, + { + "auxiliary_loss_clip": 0.01393878, + "auxiliary_loss_mlp": 0.01030338, + "balance_loss_clip": 1.23454273, + "balance_loss_mlp": 1.01178932, + "epoch": 0.9788065534345408, + "flos": 18231965435520.0, + "grad_norm": 1.8716598543840146, + "language_loss": 0.79555768, + "learning_rate": 4.696612778808395e-09, + "loss": 0.81979978, + "num_input_tokens_seen": 351343060, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18530273, + "step": 16280, + "time_per_iteration": 2.856933355331421 + }, + { + "auxiliary_loss_clip": 0.01378991, + "auxiliary_loss_mlp": 0.01035935, + "balance_loss_clip": 1.22493112, + "balance_loss_mlp": 1.01735044, + "epoch": 0.9788666766872087, + "flos": 21587746654080.0, + "grad_norm": 1.7199167863160605, + "language_loss": 0.79863888, + "learning_rate": 4.669975759268085e-09, + "loss": 0.82278812, + "num_input_tokens_seen": 351363260, + "router_z_loss_clip": 1.54101562, + "router_z_loss_mlp": 0.18591309, + "step": 16281, + "time_per_iteration": 4.477910757064819 + }, + { + "auxiliary_loss_clip": 0.01400145, + "auxiliary_loss_mlp": 0.01032195, + "balance_loss_clip": 1.23970044, + "balance_loss_mlp": 1.01387262, + "epoch": 0.9789267999398767, + "flos": 24911422047360.0, + "grad_norm": 1.6273026971620754, + "language_loss": 0.81003582, + "learning_rate": 4.643414402842216e-09, + "loss": 0.83435917, + "num_input_tokens_seen": 351382610, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18334961, + "step": 16282, + "time_per_iteration": 3.020148754119873 + }, + { + "auxiliary_loss_clip": 0.01385973, + "auxiliary_loss_mlp": 0.0103626, + "balance_loss_clip": 1.22814369, + "balance_loss_mlp": 1.017735, + "epoch": 0.9789869231925448, + "flos": 19582536026880.0, + "grad_norm": 3.3881743940661178, + "language_loss": 0.83797395, + "learning_rate": 4.616928710538204e-09, + "loss": 0.86219627, + "num_input_tokens_seen": 351401075, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18530273, + "step": 16283, + "time_per_iteration": 2.8674137592315674 + }, + { + "auxiliary_loss_clip": 0.01398831, + "auxiliary_loss_mlp": 0.01035363, + "balance_loss_clip": 1.23967314, + "balance_loss_mlp": 1.01676643, + "epoch": 0.9790470464452127, + "flos": 16804424160000.0, + "grad_norm": 1.8438729125356759, + "language_loss": 0.72703159, + "learning_rate": 4.590518683360134e-09, + "loss": 0.75137359, + "num_input_tokens_seen": 351419275, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.18591309, + "step": 16284, + "time_per_iteration": 2.8400633335113525 + }, + { + "auxiliary_loss_clip": 0.01389368, + "auxiliary_loss_mlp": 0.01034335, + "balance_loss_clip": 1.23391747, + "balance_loss_mlp": 1.01551163, + "epoch": 0.9791071696978807, + "flos": 18378531901440.0, + "grad_norm": 1.620209670331508, + "language_loss": 0.65028942, + "learning_rate": 4.56418432230965e-09, + "loss": 0.67452645, + "num_input_tokens_seen": 351437375, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.18847656, + "step": 16285, + "time_per_iteration": 2.8428094387054443 + }, + { + "auxiliary_loss_clip": 0.01391727, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.23452771, + "balance_loss_mlp": 1.01287806, + "epoch": 0.9791672929505486, + "flos": 24180942447360.0, + "grad_norm": 1.4773658128214413, + "language_loss": 0.71537113, + "learning_rate": 4.537925628385286e-09, + "loss": 0.73960221, + "num_input_tokens_seen": 351457810, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18493652, + "step": 16286, + "time_per_iteration": 2.8957693576812744 + }, + { + "auxiliary_loss_clip": 0.01380172, + "auxiliary_loss_mlp": 0.01033671, + "balance_loss_clip": 1.22505486, + "balance_loss_mlp": 1.01553917, + "epoch": 0.9792274162032166, + "flos": 24365134604160.0, + "grad_norm": 1.4819171197318963, + "language_loss": 0.59570032, + "learning_rate": 4.511742602582691e-09, + "loss": 0.61983871, + "num_input_tokens_seen": 351478825, + "router_z_loss_clip": 1.54980469, + "router_z_loss_mlp": 0.18151855, + "step": 16287, + "time_per_iteration": 2.9106900691986084 + }, + { + "auxiliary_loss_clip": 0.01391968, + "auxiliary_loss_mlp": 0.01033044, + "balance_loss_clip": 1.23482251, + "balance_loss_mlp": 1.01314747, + "epoch": 0.9792875394558845, + "flos": 26406930291840.0, + "grad_norm": 1.7182501958454888, + "language_loss": 0.82415587, + "learning_rate": 4.485635245894626e-09, + "loss": 0.84840596, + "num_input_tokens_seen": 351498785, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.19885254, + "step": 16288, + "time_per_iteration": 2.8720617294311523 + }, + { + "auxiliary_loss_clip": 0.01397198, + "auxiliary_loss_mlp": 0.01034482, + "balance_loss_clip": 1.23821235, + "balance_loss_mlp": 1.01527715, + "epoch": 0.9793476627085526, + "flos": 28159846058880.0, + "grad_norm": 1.392899788630742, + "language_loss": 0.72237962, + "learning_rate": 4.459603559311631e-09, + "loss": 0.74669641, + "num_input_tokens_seen": 351520235, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.1920166, + "step": 16289, + "time_per_iteration": 2.9140384197235107 + }, + { + "auxiliary_loss_clip": 0.01399165, + "auxiliary_loss_mlp": 0.0103362, + "balance_loss_clip": 1.24110365, + "balance_loss_mlp": 1.01533318, + "epoch": 0.9794077859612205, + "flos": 16772680293120.0, + "grad_norm": 2.186455798757513, + "language_loss": 0.76440865, + "learning_rate": 4.43364754382003e-09, + "loss": 0.78873646, + "num_input_tokens_seen": 351538900, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18286133, + "step": 16290, + "time_per_iteration": 5.616847276687622 + }, + { + "auxiliary_loss_clip": 0.01402242, + "auxiliary_loss_mlp": 0.01032807, + "balance_loss_clip": 1.23894954, + "balance_loss_mlp": 1.01312518, + "epoch": 0.9794679092138885, + "flos": 19290081767040.0, + "grad_norm": 1.5662991233947143, + "language_loss": 0.67486227, + "learning_rate": 4.4077672004048105e-09, + "loss": 0.69921279, + "num_input_tokens_seen": 351558715, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.19702148, + "step": 16291, + "time_per_iteration": 2.8482697010040283 + }, + { + "auxiliary_loss_clip": 0.0140756, + "auxiliary_loss_mlp": 0.01033576, + "balance_loss_clip": 1.24499488, + "balance_loss_mlp": 1.01435947, + "epoch": 0.9795280324665564, + "flos": 32168276542080.0, + "grad_norm": 1.6048190983064579, + "language_loss": 0.62354088, + "learning_rate": 4.3819625300467456e-09, + "loss": 0.6479522, + "num_input_tokens_seen": 351578450, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.19213867, + "step": 16292, + "time_per_iteration": 2.9441723823547363 + }, + { + "auxiliary_loss_clip": 0.01391748, + "auxiliary_loss_mlp": 0.0103064, + "balance_loss_clip": 1.23321724, + "balance_loss_mlp": 1.01174593, + "epoch": 0.9795881557192244, + "flos": 19069892732160.0, + "grad_norm": 2.027644088307165, + "language_loss": 0.74357748, + "learning_rate": 4.356233533724829e-09, + "loss": 0.7678014, + "num_input_tokens_seen": 351597195, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18908691, + "step": 16293, + "time_per_iteration": 2.8586339950561523 + }, + { + "auxiliary_loss_clip": 0.01415149, + "auxiliary_loss_mlp": 0.01035112, + "balance_loss_clip": 1.25321293, + "balance_loss_mlp": 1.01644373, + "epoch": 0.9796482789718923, + "flos": 28341685486080.0, + "grad_norm": 1.6874131800020478, + "language_loss": 0.85017979, + "learning_rate": 4.330580212414503e-09, + "loss": 0.87468231, + "num_input_tokens_seen": 351617460, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18676758, + "step": 16294, + "time_per_iteration": 2.8975167274475098 + }, + { + "auxiliary_loss_clip": 0.01377776, + "auxiliary_loss_mlp": 0.01035472, + "balance_loss_clip": 1.22491884, + "balance_loss_mlp": 1.01625562, + "epoch": 0.9797084022245603, + "flos": 17977046376960.0, + "grad_norm": 2.284729616993463, + "language_loss": 0.72993577, + "learning_rate": 4.305002567088767e-09, + "loss": 0.75406832, + "num_input_tokens_seen": 351635900, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.19213867, + "step": 16295, + "time_per_iteration": 2.8519980907440186 + }, + { + "auxiliary_loss_clip": 0.01408836, + "auxiliary_loss_mlp": 0.01038595, + "balance_loss_clip": 1.24683952, + "balance_loss_mlp": 1.0190686, + "epoch": 0.9797685254772284, + "flos": 20276430566400.0, + "grad_norm": 1.6793868298773194, + "language_loss": 0.80939591, + "learning_rate": 4.2795005987170674e-09, + "loss": 0.83387017, + "num_input_tokens_seen": 351655400, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.19519043, + "step": 16296, + "time_per_iteration": 2.8413708209991455 + }, + { + "auxiliary_loss_clip": 0.0139827, + "auxiliary_loss_mlp": 0.01032585, + "balance_loss_clip": 1.24080193, + "balance_loss_mlp": 1.01404762, + "epoch": 0.9798286487298963, + "flos": 26918578200960.0, + "grad_norm": 1.7431190160309271, + "language_loss": 0.75663209, + "learning_rate": 4.254074308266853e-09, + "loss": 0.78094065, + "num_input_tokens_seen": 351675505, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.18530273, + "step": 16297, + "time_per_iteration": 2.9021573066711426 + }, + { + "auxiliary_loss_clip": 0.01401074, + "auxiliary_loss_mlp": 0.01034545, + "balance_loss_clip": 1.2385236, + "balance_loss_mlp": 1.01585293, + "epoch": 0.9798887719825643, + "flos": 27172185160320.0, + "grad_norm": 2.736842327065104, + "language_loss": 0.78904635, + "learning_rate": 4.228723696702019e-09, + "loss": 0.81340253, + "num_input_tokens_seen": 351697920, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.18676758, + "step": 16298, + "time_per_iteration": 2.937523126602173 + }, + { + "auxiliary_loss_clip": 0.01389679, + "auxiliary_loss_mlp": 0.01028399, + "balance_loss_clip": 1.23440218, + "balance_loss_mlp": 1.00977886, + "epoch": 0.9799488952352322, + "flos": 20678323294080.0, + "grad_norm": 1.6884137906118983, + "language_loss": 0.7355001, + "learning_rate": 4.203448764984019e-09, + "loss": 0.75968087, + "num_input_tokens_seen": 351717615, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.1862793, + "step": 16299, + "time_per_iteration": 2.832148313522339 + }, + { + "auxiliary_loss_clip": 0.01399617, + "auxiliary_loss_mlp": 0.0103148, + "balance_loss_clip": 1.23832512, + "balance_loss_mlp": 1.01288378, + "epoch": 0.9800090184879002, + "flos": 21991268194560.0, + "grad_norm": 2.0957449246246074, + "language_loss": 0.90152174, + "learning_rate": 4.178249514071419e-09, + "loss": 0.92583275, + "num_input_tokens_seen": 351735260, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.18591309, + "step": 16300, + "time_per_iteration": 2.8857336044311523 + }, + { + "auxiliary_loss_clip": 0.01405069, + "auxiliary_loss_mlp": 0.01031628, + "balance_loss_clip": 1.24176288, + "balance_loss_mlp": 1.01253021, + "epoch": 0.9800691417405681, + "flos": 21298414285440.0, + "grad_norm": 2.0932251788503846, + "language_loss": 0.7942636, + "learning_rate": 4.1531259449194555e-09, + "loss": 0.81863058, + "num_input_tokens_seen": 351755800, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.19091797, + "step": 16301, + "time_per_iteration": 2.879556894302368 + }, + { + "auxiliary_loss_clip": 0.01401244, + "auxiliary_loss_mlp": 0.01037144, + "balance_loss_clip": 1.2425729, + "balance_loss_mlp": 1.01770067, + "epoch": 0.9801292649932362, + "flos": 18448444396800.0, + "grad_norm": 1.87574907858267, + "language_loss": 0.76540327, + "learning_rate": 4.128078058480921e-09, + "loss": 0.78978711, + "num_input_tokens_seen": 351774790, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.19421387, + "step": 16302, + "time_per_iteration": 2.857126235961914 + }, + { + "auxiliary_loss_clip": 0.01402291, + "auxiliary_loss_mlp": 0.01034189, + "balance_loss_clip": 1.24362671, + "balance_loss_mlp": 1.01518726, + "epoch": 0.9801893882459041, + "flos": 25057309841280.0, + "grad_norm": 3.349300968912559, + "language_loss": 0.80016255, + "learning_rate": 4.103105855705724e-09, + "loss": 0.82452738, + "num_input_tokens_seen": 351792855, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.18994141, + "step": 16303, + "time_per_iteration": 2.900641918182373 + }, + { + "auxiliary_loss_clip": 0.0140967, + "auxiliary_loss_mlp": 0.01033912, + "balance_loss_clip": 1.24669874, + "balance_loss_mlp": 1.01537466, + "epoch": 0.9802495114985721, + "flos": 18519714236160.0, + "grad_norm": 2.3117070308959518, + "language_loss": 0.83624637, + "learning_rate": 4.078209337540883e-09, + "loss": 0.86068213, + "num_input_tokens_seen": 351811450, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.1854248, + "step": 16304, + "time_per_iteration": 2.8329830169677734 + }, + { + "auxiliary_loss_clip": 0.01387626, + "auxiliary_loss_mlp": 0.01031965, + "balance_loss_clip": 1.23341894, + "balance_loss_mlp": 1.01392829, + "epoch": 0.98030963475124, + "flos": 21479620285440.0, + "grad_norm": 1.7510404733696563, + "language_loss": 0.71509999, + "learning_rate": 4.053388504930089e-09, + "loss": 0.7392959, + "num_input_tokens_seen": 351831960, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.18017578, + "step": 16305, + "time_per_iteration": 2.9039056301116943 + }, + { + "auxiliary_loss_clip": 0.01411074, + "auxiliary_loss_mlp": 0.01034432, + "balance_loss_clip": 1.24917269, + "balance_loss_mlp": 1.01484561, + "epoch": 0.980369758003908, + "flos": 20421911157120.0, + "grad_norm": 2.645134043452221, + "language_loss": 0.7271477, + "learning_rate": 4.028643358815032e-09, + "loss": 0.75160277, + "num_input_tokens_seen": 351851585, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19580078, + "step": 16306, + "time_per_iteration": 2.840883493423462 + }, + { + "auxiliary_loss_clip": 0.01378388, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.2228663, + "balance_loss_mlp": 1.0131135, + "epoch": 0.9804298812565759, + "flos": 23408312676480.0, + "grad_norm": 1.9032003177788999, + "language_loss": 0.74548829, + "learning_rate": 4.00397390013385e-09, + "loss": 0.76958561, + "num_input_tokens_seen": 351871085, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.18225098, + "step": 16307, + "time_per_iteration": 2.9179160594940186 + }, + { + "auxiliary_loss_clip": 0.013737, + "auxiliary_loss_mlp": 0.01030493, + "balance_loss_clip": 1.22109962, + "balance_loss_mlp": 1.01336288, + "epoch": 0.980490004509244, + "flos": 23302539037440.0, + "grad_norm": 1.7652662549700575, + "language_loss": 0.74874175, + "learning_rate": 3.979380129822018e-09, + "loss": 0.7727837, + "num_input_tokens_seen": 351891775, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.17138672, + "step": 16308, + "time_per_iteration": 2.8911659717559814 + }, + { + "auxiliary_loss_clip": 0.01180885, + "auxiliary_loss_mlp": 0.01039228, + "balance_loss_clip": 1.09327066, + "balance_loss_mlp": 1.00890124, + "epoch": 0.980550127761912, + "flos": 56077351113600.0, + "grad_norm": 0.7601874219662592, + "language_loss": 0.57846069, + "learning_rate": 3.954862048811902e-09, + "loss": 0.60066187, + "num_input_tokens_seen": 351946770, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.30273438, + "step": 16309, + "time_per_iteration": 3.24973464012146 + }, + { + "auxiliary_loss_clip": 0.01394804, + "auxiliary_loss_mlp": 0.01032164, + "balance_loss_clip": 1.23478055, + "balance_loss_mlp": 1.01338887, + "epoch": 0.9806102510145799, + "flos": 25343022625920.0, + "grad_norm": 1.7445918279657047, + "language_loss": 0.67349899, + "learning_rate": 3.930419658033646e-09, + "loss": 0.69776863, + "num_input_tokens_seen": 351966155, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18762207, + "step": 16310, + "time_per_iteration": 2.8780481815338135 + }, + { + "auxiliary_loss_clip": 0.0117898, + "auxiliary_loss_mlp": 0.01030265, + "balance_loss_clip": 1.09201503, + "balance_loss_mlp": 1.01071513, + "epoch": 0.9806703742672479, + "flos": 67309960066560.0, + "grad_norm": 0.8210806659183517, + "language_loss": 0.54602456, + "learning_rate": 3.906052958413841e-09, + "loss": 0.56811702, + "num_input_tokens_seen": 352031655, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.1953125, + "step": 16311, + "time_per_iteration": 3.335137128829956 + }, + { + "auxiliary_loss_clip": 0.01401789, + "auxiliary_loss_mlp": 0.01032878, + "balance_loss_clip": 1.24327064, + "balance_loss_mlp": 1.01394713, + "epoch": 0.9807304975199158, + "flos": 25240008919680.0, + "grad_norm": 2.7528148001981356, + "language_loss": 0.80360031, + "learning_rate": 3.881761950876638e-09, + "loss": 0.82794696, + "num_input_tokens_seen": 352051920, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18933105, + "step": 16312, + "time_per_iteration": 2.92051362991333 + }, + { + "auxiliary_loss_clip": 0.01390162, + "auxiliary_loss_mlp": 0.01026914, + "balance_loss_clip": 1.23369527, + "balance_loss_mlp": 1.00930643, + "epoch": 0.9807906207725838, + "flos": 17465126999040.0, + "grad_norm": 2.560431084668054, + "language_loss": 0.63889706, + "learning_rate": 3.8575466363430785e-09, + "loss": 0.66306788, + "num_input_tokens_seen": 352069315, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.17602539, + "step": 16313, + "time_per_iteration": 2.8344950675964355 + }, + { + "auxiliary_loss_clip": 0.01390689, + "auxiliary_loss_mlp": 0.01035957, + "balance_loss_clip": 1.23382485, + "balance_loss_mlp": 1.01652575, + "epoch": 0.9808507440252517, + "flos": 21042183127680.0, + "grad_norm": 1.9037445823970505, + "language_loss": 0.73120725, + "learning_rate": 3.833407015731316e-09, + "loss": 0.75547373, + "num_input_tokens_seen": 352089480, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.19433594, + "step": 16314, + "time_per_iteration": 4.307093143463135 + }, + { + "auxiliary_loss_clip": 0.01178, + "auxiliary_loss_mlp": 0.01029143, + "balance_loss_clip": 1.09043956, + "balance_loss_mlp": 1.01045096, + "epoch": 0.9809108672779198, + "flos": 64073688923520.0, + "grad_norm": 0.6964784399449659, + "language_loss": 0.51786762, + "learning_rate": 3.80934308995684e-09, + "loss": 0.53993905, + "num_input_tokens_seen": 352150000, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.18652344, + "step": 16315, + "time_per_iteration": 3.298262357711792 + }, + { + "auxiliary_loss_clip": 0.01398186, + "auxiliary_loss_mlp": 0.01031996, + "balance_loss_clip": 1.23735845, + "balance_loss_mlp": 1.01357865, + "epoch": 0.9809709905305877, + "flos": 22790031477120.0, + "grad_norm": 1.7125208300800534, + "language_loss": 0.70373702, + "learning_rate": 3.785354859932033e-09, + "loss": 0.72803891, + "num_input_tokens_seen": 352170990, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.1842041, + "step": 16316, + "time_per_iteration": 4.401776313781738 + }, + { + "auxiliary_loss_clip": 0.01393144, + "auxiliary_loss_mlp": 0.01030265, + "balance_loss_clip": 1.23213053, + "balance_loss_mlp": 1.01174021, + "epoch": 0.9810311137832557, + "flos": 37027393355520.0, + "grad_norm": 2.1005508088311005, + "language_loss": 0.55808771, + "learning_rate": 3.76144232656661e-09, + "loss": 0.58232176, + "num_input_tokens_seen": 352195335, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.1854248, + "step": 16317, + "time_per_iteration": 3.016195774078369 + }, + { + "auxiliary_loss_clip": 0.01387057, + "auxiliary_loss_mlp": 0.01032194, + "balance_loss_clip": 1.23080909, + "balance_loss_mlp": 1.01348996, + "epoch": 0.9810912370359236, + "flos": 18925543261440.0, + "grad_norm": 2.5096305686801106, + "language_loss": 0.73968148, + "learning_rate": 3.737605490767404e-09, + "loss": 0.76387393, + "num_input_tokens_seen": 352214170, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.18713379, + "step": 16318, + "time_per_iteration": 2.857858180999756 + }, + { + "auxiliary_loss_clip": 0.0138169, + "auxiliary_loss_mlp": 0.01035226, + "balance_loss_clip": 1.22618926, + "balance_loss_mlp": 1.01701093, + "epoch": 0.9811513602885916, + "flos": 18450616147200.0, + "grad_norm": 2.1048247137796134, + "language_loss": 0.82531714, + "learning_rate": 3.7138443534383555e-09, + "loss": 0.84948635, + "num_input_tokens_seen": 352231470, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.18225098, + "step": 16319, + "time_per_iteration": 2.8692994117736816 + }, + { + "auxiliary_loss_clip": 0.01177028, + "auxiliary_loss_mlp": 0.01019687, + "balance_loss_clip": 1.09129691, + "balance_loss_mlp": 1.00261641, + "epoch": 0.9812114835412595, + "flos": 68089449064320.0, + "grad_norm": 0.7176720519565054, + "language_loss": 0.53626126, + "learning_rate": 3.6901589154803014e-09, + "loss": 0.55822843, + "num_input_tokens_seen": 352291770, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.17089844, + "step": 16320, + "time_per_iteration": 3.265028953552246 + }, + { + "auxiliary_loss_clip": 0.01401918, + "auxiliary_loss_mlp": 0.01032986, + "balance_loss_clip": 1.24187636, + "balance_loss_mlp": 1.01452088, + "epoch": 0.9812716067939276, + "flos": 25383589228800.0, + "grad_norm": 1.9872695036795303, + "language_loss": 0.73973334, + "learning_rate": 3.6665491777914116e-09, + "loss": 0.76408237, + "num_input_tokens_seen": 352310735, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.18469238, + "step": 16321, + "time_per_iteration": 2.9811341762542725 + }, + { + "auxiliary_loss_clip": 0.01389594, + "auxiliary_loss_mlp": 0.01031882, + "balance_loss_clip": 1.23382437, + "balance_loss_mlp": 1.01390481, + "epoch": 0.9813317300465956, + "flos": 22867092650880.0, + "grad_norm": 1.6440586777003658, + "language_loss": 0.79545784, + "learning_rate": 3.6430151412669698e-09, + "loss": 0.81967252, + "num_input_tokens_seen": 352329545, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.1796875, + "step": 16322, + "time_per_iteration": 2.8677475452423096 + }, + { + "auxiliary_loss_clip": 0.01390874, + "auxiliary_loss_mlp": 0.01035188, + "balance_loss_clip": 1.23178005, + "balance_loss_mlp": 1.01609039, + "epoch": 0.9813918532992635, + "flos": 23597527006080.0, + "grad_norm": 1.8544480247367832, + "language_loss": 0.81633532, + "learning_rate": 3.619556806799595e-09, + "loss": 0.84059596, + "num_input_tokens_seen": 352352080, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.19091797, + "step": 16323, + "time_per_iteration": 2.8989927768707275 + }, + { + "auxiliary_loss_clip": 0.01405556, + "auxiliary_loss_mlp": 0.01034218, + "balance_loss_clip": 1.24270999, + "balance_loss_mlp": 1.01482272, + "epoch": 0.9814519765519315, + "flos": 19614913320960.0, + "grad_norm": 6.298577167984119, + "language_loss": 0.85500985, + "learning_rate": 3.596174175278799e-09, + "loss": 0.87940764, + "num_input_tokens_seen": 352366455, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.19396973, + "step": 16324, + "time_per_iteration": 4.24781346321106 + }, + { + "auxiliary_loss_clip": 0.01396656, + "auxiliary_loss_mlp": 0.01031398, + "balance_loss_clip": 1.23775065, + "balance_loss_mlp": 1.01297975, + "epoch": 0.9815120998045994, + "flos": 33958863244800.0, + "grad_norm": 1.353887722845431, + "language_loss": 0.75175053, + "learning_rate": 3.5728672475909827e-09, + "loss": 0.77603108, + "num_input_tokens_seen": 352386090, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18408203, + "step": 16325, + "time_per_iteration": 4.434406757354736 + }, + { + "auxiliary_loss_clip": 0.01383666, + "auxiliary_loss_mlp": 0.01034666, + "balance_loss_clip": 1.2302556, + "balance_loss_mlp": 1.01609325, + "epoch": 0.9815722230572674, + "flos": 20859755518080.0, + "grad_norm": 1.6261691932796574, + "language_loss": 0.76900005, + "learning_rate": 3.5496360246201063e-09, + "loss": 0.79318333, + "num_input_tokens_seen": 352404000, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.18579102, + "step": 16326, + "time_per_iteration": 2.855483293533325 + }, + { + "auxiliary_loss_clip": 0.01407485, + "auxiliary_loss_mlp": 0.01033773, + "balance_loss_clip": 1.24645972, + "balance_loss_mlp": 1.01450849, + "epoch": 0.9816323463099353, + "flos": 22905125544960.0, + "grad_norm": 1.7289096153173702, + "language_loss": 0.6859405, + "learning_rate": 3.5264805072470205e-09, + "loss": 0.71035308, + "num_input_tokens_seen": 352423540, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.19250488, + "step": 16327, + "time_per_iteration": 2.861799955368042 + }, + { + "auxiliary_loss_clip": 0.01413633, + "auxiliary_loss_mlp": 0.01034465, + "balance_loss_clip": 1.24842799, + "balance_loss_mlp": 1.01474786, + "epoch": 0.9816924695626034, + "flos": 31551714645120.0, + "grad_norm": 1.4978965196358367, + "language_loss": 0.73972952, + "learning_rate": 3.5034006963501337e-09, + "loss": 0.76421046, + "num_input_tokens_seen": 352445530, + "router_z_loss_clip": 1.65234375, + "router_z_loss_mlp": 0.19726562, + "step": 16328, + "time_per_iteration": 2.927619218826294 + }, + { + "auxiliary_loss_clip": 0.01435894, + "auxiliary_loss_mlp": 0.01037964, + "balance_loss_clip": 1.26747417, + "balance_loss_mlp": 1.01930737, + "epoch": 0.9817525928152713, + "flos": 21516838773120.0, + "grad_norm": 1.778962727529593, + "language_loss": 0.82399046, + "learning_rate": 3.4803965928040802e-09, + "loss": 0.84872901, + "num_input_tokens_seen": 352466325, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.18664551, + "step": 16329, + "time_per_iteration": 2.8557119369506836 + }, + { + "auxiliary_loss_clip": 0.01412318, + "auxiliary_loss_mlp": 0.01033117, + "balance_loss_clip": 1.24899113, + "balance_loss_mlp": 1.01420999, + "epoch": 0.9818127160679393, + "flos": 25559863545600.0, + "grad_norm": 1.8466984731399236, + "language_loss": 0.77100396, + "learning_rate": 3.4574681974817168e-09, + "loss": 0.79545832, + "num_input_tokens_seen": 352485505, + "router_z_loss_clip": 1.63183594, + "router_z_loss_mlp": 0.18920898, + "step": 16330, + "time_per_iteration": 2.9195094108581543 + }, + { + "auxiliary_loss_clip": 0.01430675, + "auxiliary_loss_mlp": 0.01033301, + "balance_loss_clip": 1.25955737, + "balance_loss_mlp": 1.01302314, + "epoch": 0.9818728393206072, + "flos": 28815390990720.0, + "grad_norm": 2.2335169401631267, + "language_loss": 0.66965783, + "learning_rate": 3.434615511252126e-09, + "loss": 0.69429767, + "num_input_tokens_seen": 352505360, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.20288086, + "step": 16331, + "time_per_iteration": 2.885392427444458 + }, + { + "auxiliary_loss_clip": 0.01395363, + "auxiliary_loss_mlp": 0.01031883, + "balance_loss_clip": 1.23685288, + "balance_loss_mlp": 1.01321447, + "epoch": 0.9819329625732752, + "flos": 23232762276480.0, + "grad_norm": 1.6956179259388642, + "language_loss": 0.74645454, + "learning_rate": 3.411838534981948e-09, + "loss": 0.77072704, + "num_input_tokens_seen": 352524035, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18652344, + "step": 16332, + "time_per_iteration": 2.8638341426849365 + }, + { + "auxiliary_loss_clip": 0.01390591, + "auxiliary_loss_mlp": 0.01028435, + "balance_loss_clip": 1.23213017, + "balance_loss_mlp": 1.01103091, + "epoch": 0.9819930858259431, + "flos": 17539473484800.0, + "grad_norm": 2.74879588440532, + "language_loss": 0.77271914, + "learning_rate": 3.389137269534936e-09, + "loss": 0.79690945, + "num_input_tokens_seen": 352543210, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.17407227, + "step": 16333, + "time_per_iteration": 2.807950496673584 + }, + { + "auxiliary_loss_clip": 0.0139534, + "auxiliary_loss_mlp": 0.01031027, + "balance_loss_clip": 1.23599482, + "balance_loss_mlp": 1.01264524, + "epoch": 0.9820532090786112, + "flos": 12537138320640.0, + "grad_norm": 2.0958041519038164, + "language_loss": 0.74219966, + "learning_rate": 3.366511715771958e-09, + "loss": 0.76646334, + "num_input_tokens_seen": 352559770, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.18383789, + "step": 16334, + "time_per_iteration": 2.817504644393921 + }, + { + "auxiliary_loss_clip": 0.0140219, + "auxiliary_loss_mlp": 0.01036798, + "balance_loss_clip": 1.24142075, + "balance_loss_mlp": 1.01798677, + "epoch": 0.9821133323312792, + "flos": 18848572577280.0, + "grad_norm": 2.779586204860118, + "language_loss": 0.79093903, + "learning_rate": 3.3439618745509934e-09, + "loss": 0.81532896, + "num_input_tokens_seen": 352577690, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18798828, + "step": 16335, + "time_per_iteration": 2.849583387374878 + }, + { + "auxiliary_loss_clip": 0.01405568, + "auxiliary_loss_mlp": 0.01038161, + "balance_loss_clip": 1.24272227, + "balance_loss_mlp": 1.01850319, + "epoch": 0.9821734555839471, + "flos": 34837357144320.0, + "grad_norm": 36.7807906505841, + "language_loss": 0.65588474, + "learning_rate": 3.3214877467271362e-09, + "loss": 0.68032199, + "num_input_tokens_seen": 352598850, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.19665527, + "step": 16336, + "time_per_iteration": 2.9764039516448975 + }, + { + "auxiliary_loss_clip": 0.01404347, + "auxiliary_loss_mlp": 0.01037373, + "balance_loss_clip": 1.23972666, + "balance_loss_mlp": 1.01747704, + "epoch": 0.9822335788366151, + "flos": 17136856840320.0, + "grad_norm": 1.798625558581924, + "language_loss": 0.74381649, + "learning_rate": 3.299089333152372e-09, + "loss": 0.76823378, + "num_input_tokens_seen": 352616130, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19885254, + "step": 16337, + "time_per_iteration": 2.8278439044952393 + }, + { + "auxiliary_loss_clip": 0.01394576, + "auxiliary_loss_mlp": 0.01037897, + "balance_loss_clip": 1.23308909, + "balance_loss_mlp": 1.01769078, + "epoch": 0.982293702089283, + "flos": 20822898988800.0, + "grad_norm": 1.594928490681411, + "language_loss": 0.73806381, + "learning_rate": 3.2767666346764645e-09, + "loss": 0.76238853, + "num_input_tokens_seen": 352636885, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.20214844, + "step": 16338, + "time_per_iteration": 2.867870569229126 + }, + { + "auxiliary_loss_clip": 0.01399886, + "auxiliary_loss_mlp": 0.01040494, + "balance_loss_clip": 1.23954272, + "balance_loss_mlp": 1.02140856, + "epoch": 0.982353825341951, + "flos": 24691504481280.0, + "grad_norm": 1.7318630898245948, + "language_loss": 0.82135737, + "learning_rate": 3.2545196521454045e-09, + "loss": 0.84576118, + "num_input_tokens_seen": 352657905, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.1907959, + "step": 16339, + "time_per_iteration": 2.9090890884399414 + }, + { + "auxiliary_loss_clip": 0.01386984, + "auxiliary_loss_mlp": 0.01031109, + "balance_loss_clip": 1.23142767, + "balance_loss_mlp": 1.0121665, + "epoch": 0.982413948594619, + "flos": 20860343700480.0, + "grad_norm": 1.6831310070160217, + "language_loss": 0.62820965, + "learning_rate": 3.232348386403405e-09, + "loss": 0.6523906, + "num_input_tokens_seen": 352676320, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.1895752, + "step": 16340, + "time_per_iteration": 2.8368945121765137 + }, + { + "auxiliary_loss_clip": 0.01407641, + "auxiliary_loss_mlp": 0.01030467, + "balance_loss_clip": 1.24622583, + "balance_loss_mlp": 1.01109552, + "epoch": 0.982474071847287, + "flos": 15385750865280.0, + "grad_norm": 2.980301633031861, + "language_loss": 0.86782217, + "learning_rate": 3.2102528382904613e-09, + "loss": 0.89220321, + "num_input_tokens_seen": 352692665, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19372559, + "step": 16341, + "time_per_iteration": 2.9089434146881104 + }, + { + "auxiliary_loss_clip": 0.01380327, + "auxiliary_loss_mlp": 0.01029545, + "balance_loss_clip": 1.22555661, + "balance_loss_mlp": 1.01084125, + "epoch": 0.9825341950999549, + "flos": 23786741335680.0, + "grad_norm": 2.9144734842749567, + "language_loss": 0.67235738, + "learning_rate": 3.188233008645014e-09, + "loss": 0.69645607, + "num_input_tokens_seen": 352716130, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.18701172, + "step": 16342, + "time_per_iteration": 2.9485669136047363 + }, + { + "auxiliary_loss_clip": 0.01402154, + "auxiliary_loss_mlp": 0.01029666, + "balance_loss_clip": 1.2407583, + "balance_loss_mlp": 1.01022315, + "epoch": 0.9825943183526229, + "flos": 22756251594240.0, + "grad_norm": 1.7507930586633487, + "language_loss": 0.77544034, + "learning_rate": 3.16628889830195e-09, + "loss": 0.79975855, + "num_input_tokens_seen": 352734705, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19445801, + "step": 16343, + "time_per_iteration": 2.8317768573760986 + }, + { + "auxiliary_loss_clip": 0.01397975, + "auxiliary_loss_mlp": 0.01031886, + "balance_loss_clip": 1.24077928, + "balance_loss_mlp": 1.01368237, + "epoch": 0.9826544416052908, + "flos": 27721639739520.0, + "grad_norm": 1.5365773974449652, + "language_loss": 0.75830626, + "learning_rate": 3.1444205080932707e-09, + "loss": 0.78260481, + "num_input_tokens_seen": 352756225, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18188477, + "step": 16344, + "time_per_iteration": 2.887556791305542 + }, + { + "auxiliary_loss_clip": 0.01412385, + "auxiliary_loss_mlp": 0.01035542, + "balance_loss_clip": 1.25207484, + "balance_loss_mlp": 1.01726723, + "epoch": 0.9827145648579588, + "flos": 26952584307840.0, + "grad_norm": 2.122693524862932, + "language_loss": 0.67261279, + "learning_rate": 3.122627838848313e-09, + "loss": 0.69709206, + "num_input_tokens_seen": 352776210, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18286133, + "step": 16345, + "time_per_iteration": 2.8643970489501953 + }, + { + "auxiliary_loss_clip": 0.01385296, + "auxiliary_loss_mlp": 0.01030047, + "balance_loss_clip": 1.22998023, + "balance_loss_mlp": 1.01168847, + "epoch": 0.9827746881106267, + "flos": 21875223985920.0, + "grad_norm": 2.2036909212656606, + "language_loss": 0.80049175, + "learning_rate": 3.1009108913933045e-09, + "loss": 0.82464516, + "num_input_tokens_seen": 352795455, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.18347168, + "step": 16346, + "time_per_iteration": 2.870227336883545 + }, + { + "auxiliary_loss_clip": 0.01423699, + "auxiliary_loss_mlp": 0.01033197, + "balance_loss_clip": 1.25728202, + "balance_loss_mlp": 1.01381326, + "epoch": 0.9828348113632948, + "flos": 20860705658880.0, + "grad_norm": 2.021880516125609, + "language_loss": 0.76154137, + "learning_rate": 3.079269666552031e-09, + "loss": 0.78611028, + "num_input_tokens_seen": 352812895, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.19384766, + "step": 16347, + "time_per_iteration": 2.8142223358154297 + }, + { + "auxiliary_loss_clip": 0.01379999, + "auxiliary_loss_mlp": 0.01036692, + "balance_loss_clip": 1.2249949, + "balance_loss_mlp": 1.01809525, + "epoch": 0.9828949346159628, + "flos": 34582664309760.0, + "grad_norm": 1.708137141615956, + "language_loss": 0.68188095, + "learning_rate": 3.0577041651449474e-09, + "loss": 0.70604777, + "num_input_tokens_seen": 352835470, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.18591309, + "step": 16348, + "time_per_iteration": 2.9624292850494385 + }, + { + "auxiliary_loss_clip": 0.01393618, + "auxiliary_loss_mlp": 0.01034029, + "balance_loss_clip": 1.23579335, + "balance_loss_mlp": 1.01422811, + "epoch": 0.9829550578686307, + "flos": 24466926700800.0, + "grad_norm": 3.5809937518734167, + "language_loss": 0.70032036, + "learning_rate": 3.0362143879898437e-09, + "loss": 0.72459674, + "num_input_tokens_seen": 352854295, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.19812012, + "step": 16349, + "time_per_iteration": 4.2957799434661865 + }, + { + "auxiliary_loss_clip": 0.01369055, + "auxiliary_loss_mlp": 0.0103025, + "balance_loss_clip": 1.21643162, + "balance_loss_mlp": 1.01173651, + "epoch": 0.9830151811212987, + "flos": 16918930045440.0, + "grad_norm": 1.8589675914643207, + "language_loss": 0.75835854, + "learning_rate": 3.0148003359014018e-09, + "loss": 0.78235161, + "num_input_tokens_seen": 352869695, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.18518066, + "step": 16350, + "time_per_iteration": 2.8139514923095703 + }, + { + "auxiliary_loss_clip": 0.0140304, + "auxiliary_loss_mlp": 0.01035517, + "balance_loss_clip": 1.24248648, + "balance_loss_mlp": 1.01557326, + "epoch": 0.9830753043739666, + "flos": 21298278551040.0, + "grad_norm": 2.516635311641066, + "language_loss": 0.8511833, + "learning_rate": 2.9934620096920826e-09, + "loss": 0.87556893, + "num_input_tokens_seen": 352887430, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.19934082, + "step": 16351, + "time_per_iteration": 4.262692213058472 + }, + { + "auxiliary_loss_clip": 0.01393776, + "auxiliary_loss_mlp": 0.01030232, + "balance_loss_clip": 1.23487878, + "balance_loss_mlp": 1.01144421, + "epoch": 0.9831354276266346, + "flos": 31736268760320.0, + "grad_norm": 1.5599381867491418, + "language_loss": 0.68913257, + "learning_rate": 2.972199410170795e-09, + "loss": 0.71337265, + "num_input_tokens_seen": 352907555, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18786621, + "step": 16352, + "time_per_iteration": 2.9057857990264893 + }, + { + "auxiliary_loss_clip": 0.01394509, + "auxiliary_loss_mlp": 0.01035125, + "balance_loss_clip": 1.23653221, + "balance_loss_mlp": 1.01609898, + "epoch": 0.9831955508793025, + "flos": 21629715845760.0, + "grad_norm": 1.35473531271265, + "language_loss": 0.67110133, + "learning_rate": 2.951012538143782e-09, + "loss": 0.69539773, + "num_input_tokens_seen": 352928670, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.19030762, + "step": 16353, + "time_per_iteration": 2.845715045928955 + }, + { + "auxiliary_loss_clip": 0.01380957, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.22571826, + "balance_loss_mlp": 1.01553202, + "epoch": 0.9832556741319706, + "flos": 22979019582720.0, + "grad_norm": 1.500469136580036, + "language_loss": 0.75142503, + "learning_rate": 2.9299013944144025e-09, + "loss": 0.7755748, + "num_input_tokens_seen": 352948345, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.18505859, + "step": 16354, + "time_per_iteration": 2.904158353805542 + }, + { + "auxiliary_loss_clip": 0.01388183, + "auxiliary_loss_mlp": 0.01032444, + "balance_loss_clip": 1.23123801, + "balance_loss_mlp": 1.01439595, + "epoch": 0.9833157973846385, + "flos": 21333642001920.0, + "grad_norm": 2.4553363274230398, + "language_loss": 0.78386176, + "learning_rate": 2.9088659797835702e-09, + "loss": 0.80806804, + "num_input_tokens_seen": 352967250, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.18066406, + "step": 16355, + "time_per_iteration": 2.8519859313964844 + }, + { + "auxiliary_loss_clip": 0.01395761, + "auxiliary_loss_mlp": 0.01034345, + "balance_loss_clip": 1.23781109, + "balance_loss_mlp": 1.01573634, + "epoch": 0.9833759206373065, + "flos": 21078451474560.0, + "grad_norm": 4.805718133736267, + "language_loss": 0.73910356, + "learning_rate": 2.8879062950484256e-09, + "loss": 0.76340461, + "num_input_tokens_seen": 352984725, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18615723, + "step": 16356, + "time_per_iteration": 2.8277242183685303 + }, + { + "auxiliary_loss_clip": 0.01396647, + "auxiliary_loss_mlp": 0.0103028, + "balance_loss_clip": 1.23926568, + "balance_loss_mlp": 1.01123023, + "epoch": 0.9834360438899744, + "flos": 18706485346560.0, + "grad_norm": 2.0552353896871205, + "language_loss": 0.76903188, + "learning_rate": 2.8670223410041104e-09, + "loss": 0.79330111, + "num_input_tokens_seen": 353003480, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.19042969, + "step": 16357, + "time_per_iteration": 2.824057102203369 + }, + { + "auxiliary_loss_clip": 0.01394216, + "auxiliary_loss_mlp": 0.01030482, + "balance_loss_clip": 1.23582339, + "balance_loss_mlp": 1.01082444, + "epoch": 0.9834961671426424, + "flos": 21114991290240.0, + "grad_norm": 2.125172688605168, + "language_loss": 0.8134706, + "learning_rate": 2.846214118442436e-09, + "loss": 0.83771765, + "num_input_tokens_seen": 353021425, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.19665527, + "step": 16358, + "time_per_iteration": 2.8243868350982666 + }, + { + "auxiliary_loss_clip": 0.01403278, + "auxiliary_loss_mlp": 0.01030344, + "balance_loss_clip": 1.24454403, + "balance_loss_mlp": 1.01217651, + "epoch": 0.9835562903953103, + "flos": 26698841614080.0, + "grad_norm": 2.1338256830293956, + "language_loss": 0.68725592, + "learning_rate": 2.8254816281523263e-09, + "loss": 0.71159214, + "num_input_tokens_seen": 353039870, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.1817627, + "step": 16359, + "time_per_iteration": 4.44164776802063 + }, + { + "auxiliary_loss_clip": 0.01389552, + "auxiliary_loss_mlp": 0.01030921, + "balance_loss_clip": 1.23226833, + "balance_loss_mlp": 1.01293278, + "epoch": 0.9836164136479784, + "flos": 22100118480000.0, + "grad_norm": 2.284236987211495, + "language_loss": 0.7000975, + "learning_rate": 2.804824870920264e-09, + "loss": 0.72430223, + "num_input_tokens_seen": 353059750, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.17980957, + "step": 16360, + "time_per_iteration": 4.3412346839904785 + }, + { + "auxiliary_loss_clip": 0.01403123, + "auxiliary_loss_mlp": 0.01034994, + "balance_loss_clip": 1.24288607, + "balance_loss_mlp": 1.01479983, + "epoch": 0.9836765369006463, + "flos": 23888940635520.0, + "grad_norm": 1.6961491125983499, + "language_loss": 0.84604245, + "learning_rate": 2.7842438475293996e-09, + "loss": 0.87042356, + "num_input_tokens_seen": 353079940, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.2019043, + "step": 16361, + "time_per_iteration": 3.0159943103790283 + }, + { + "auxiliary_loss_clip": 0.01392089, + "auxiliary_loss_mlp": 0.01029642, + "balance_loss_clip": 1.23346114, + "balance_loss_mlp": 1.0115943, + "epoch": 0.9837366601533143, + "flos": 25855122983040.0, + "grad_norm": 1.6044526467163782, + "language_loss": 0.76481867, + "learning_rate": 2.76373855876022e-09, + "loss": 0.78903604, + "num_input_tokens_seen": 353099990, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18054199, + "step": 16362, + "time_per_iteration": 2.928985595703125 + }, + { + "auxiliary_loss_clip": 0.01399211, + "auxiliary_loss_mlp": 0.01034072, + "balance_loss_clip": 1.23991418, + "balance_loss_mlp": 1.01541543, + "epoch": 0.9837967834059823, + "flos": 21367150416000.0, + "grad_norm": 1.7239013868153286, + "language_loss": 0.71955121, + "learning_rate": 2.7433090053901043e-09, + "loss": 0.74388403, + "num_input_tokens_seen": 353118710, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18676758, + "step": 16363, + "time_per_iteration": 2.8677234649658203 + }, + { + "auxiliary_loss_clip": 0.01381781, + "auxiliary_loss_mlp": 0.01031252, + "balance_loss_clip": 1.22852707, + "balance_loss_mlp": 1.01343012, + "epoch": 0.9838569066586502, + "flos": 18525143612160.0, + "grad_norm": 1.6162151351248686, + "language_loss": 0.63508821, + "learning_rate": 2.7229551881937653e-09, + "loss": 0.65921849, + "num_input_tokens_seen": 353136415, + "router_z_loss_clip": 1.53320312, + "router_z_loss_mlp": 0.17822266, + "step": 16364, + "time_per_iteration": 2.808624029159546 + }, + { + "auxiliary_loss_clip": 0.01398042, + "auxiliary_loss_mlp": 0.01029486, + "balance_loss_clip": 1.23892069, + "balance_loss_mlp": 1.01218867, + "epoch": 0.9839170299113182, + "flos": 22461761318400.0, + "grad_norm": 1.6708738058559187, + "language_loss": 0.75448823, + "learning_rate": 2.702677107943252e-09, + "loss": 0.77876347, + "num_input_tokens_seen": 353154650, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.17297363, + "step": 16365, + "time_per_iteration": 2.877235174179077 + }, + { + "auxiliary_loss_clip": 0.01389255, + "auxiliary_loss_mlp": 0.01030836, + "balance_loss_clip": 1.23222971, + "balance_loss_mlp": 1.01326513, + "epoch": 0.9839771531639862, + "flos": 27904022104320.0, + "grad_norm": 1.604459488239463, + "language_loss": 0.76975167, + "learning_rate": 2.6824747654072832e-09, + "loss": 0.79395252, + "num_input_tokens_seen": 353174065, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.17590332, + "step": 16366, + "time_per_iteration": 2.9044857025146484 + }, + { + "auxiliary_loss_clip": 0.01385916, + "auxiliary_loss_mlp": 0.01032984, + "balance_loss_clip": 1.22960329, + "balance_loss_mlp": 1.01464963, + "epoch": 0.9840372764166542, + "flos": 28224691136640.0, + "grad_norm": 1.7194003972455227, + "language_loss": 0.77541733, + "learning_rate": 2.662348161352357e-09, + "loss": 0.79960632, + "num_input_tokens_seen": 353193560, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.18334961, + "step": 16367, + "time_per_iteration": 2.872946262359619 + }, + { + "auxiliary_loss_clip": 0.01398219, + "auxiliary_loss_mlp": 0.01035586, + "balance_loss_clip": 1.24057591, + "balance_loss_mlp": 1.01676273, + "epoch": 0.9840973996693221, + "flos": 23414375479680.0, + "grad_norm": 2.696641913280829, + "language_loss": 0.61740375, + "learning_rate": 2.642297296540974e-09, + "loss": 0.64174175, + "num_input_tokens_seen": 353213525, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18835449, + "step": 16368, + "time_per_iteration": 2.904874563217163 + }, + { + "auxiliary_loss_clip": 0.01382313, + "auxiliary_loss_mlp": 0.01033766, + "balance_loss_clip": 1.22839928, + "balance_loss_mlp": 1.01613498, + "epoch": 0.9841575229219901, + "flos": 21405364289280.0, + "grad_norm": 2.4962731170714374, + "language_loss": 0.66281772, + "learning_rate": 2.6223221717340816e-09, + "loss": 0.68697852, + "num_input_tokens_seen": 353234000, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.17651367, + "step": 16369, + "time_per_iteration": 2.8288822174072266 + }, + { + "auxiliary_loss_clip": 0.01393065, + "auxiliary_loss_mlp": 0.01032102, + "balance_loss_clip": 1.23325992, + "balance_loss_mlp": 1.01321971, + "epoch": 0.984217646174658, + "flos": 24475251744000.0, + "grad_norm": 1.6031460700587432, + "language_loss": 0.68616229, + "learning_rate": 2.6024227876886295e-09, + "loss": 0.71041393, + "num_input_tokens_seen": 353254940, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18884277, + "step": 16370, + "time_per_iteration": 2.8937289714813232 + }, + { + "auxiliary_loss_clip": 0.01404385, + "auxiliary_loss_mlp": 0.01034008, + "balance_loss_clip": 1.24368763, + "balance_loss_mlp": 1.01427865, + "epoch": 0.984277769427326, + "flos": 16443595728000.0, + "grad_norm": 2.087007185534773, + "language_loss": 0.74735892, + "learning_rate": 2.582599145159792e-09, + "loss": 0.77174282, + "num_input_tokens_seen": 353272590, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.1973877, + "step": 16371, + "time_per_iteration": 2.8109121322631836 + }, + { + "auxiliary_loss_clip": 0.01183102, + "auxiliary_loss_mlp": 0.01053289, + "balance_loss_clip": 1.09419525, + "balance_loss_mlp": 1.02467859, + "epoch": 0.9843378926799939, + "flos": 64563003884160.0, + "grad_norm": 0.781229367967472, + "language_loss": 0.65256512, + "learning_rate": 2.562851244898745e-09, + "loss": 0.67492902, + "num_input_tokens_seen": 353334380, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.28515625, + "step": 16372, + "time_per_iteration": 3.348527669906616 + }, + { + "auxiliary_loss_clip": 0.01386593, + "auxiliary_loss_mlp": 0.01028718, + "balance_loss_clip": 1.22959805, + "balance_loss_mlp": 1.010849, + "epoch": 0.984398015932662, + "flos": 17391368695680.0, + "grad_norm": 1.7264321663506492, + "language_loss": 0.71189058, + "learning_rate": 2.5431790876544456e-09, + "loss": 0.73604369, + "num_input_tokens_seen": 353351640, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.17871094, + "step": 16373, + "time_per_iteration": 2.8333990573883057 + }, + { + "auxiliary_loss_clip": 0.0138492, + "auxiliary_loss_mlp": 0.01030509, + "balance_loss_clip": 1.22872269, + "balance_loss_mlp": 1.01180482, + "epoch": 0.9844581391853299, + "flos": 23889800286720.0, + "grad_norm": 1.6993230314119452, + "language_loss": 0.82217801, + "learning_rate": 2.523582674173186e-09, + "loss": 0.84633231, + "num_input_tokens_seen": 353372555, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.18713379, + "step": 16374, + "time_per_iteration": 2.94132137298584 + }, + { + "auxiliary_loss_clip": 0.01390964, + "auxiliary_loss_mlp": 0.01033234, + "balance_loss_clip": 1.23189664, + "balance_loss_mlp": 1.01433945, + "epoch": 0.9845182624379979, + "flos": 19874945041920.0, + "grad_norm": 1.7374782127016828, + "language_loss": 0.69914925, + "learning_rate": 2.504062005197927e-09, + "loss": 0.72339123, + "num_input_tokens_seen": 353391385, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.18896484, + "step": 16375, + "time_per_iteration": 2.922139883041382 + }, + { + "auxiliary_loss_clip": 0.0141248, + "auxiliary_loss_mlp": 0.01037309, + "balance_loss_clip": 1.25130475, + "balance_loss_mlp": 1.01791394, + "epoch": 0.9845783856906659, + "flos": 28265800677120.0, + "grad_norm": 1.8013628094840444, + "language_loss": 0.81601644, + "learning_rate": 2.484617081468521e-09, + "loss": 0.84051436, + "num_input_tokens_seen": 353411630, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.19372559, + "step": 16376, + "time_per_iteration": 2.9252190589904785 + }, + { + "auxiliary_loss_clip": 0.0139016, + "auxiliary_loss_mlp": 0.0103305, + "balance_loss_clip": 1.23375154, + "balance_loss_mlp": 1.01452506, + "epoch": 0.9846385089433338, + "flos": 28339739959680.0, + "grad_norm": 1.4930984954441606, + "language_loss": 0.63119018, + "learning_rate": 2.4652479037228224e-09, + "loss": 0.65542233, + "num_input_tokens_seen": 353432895, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.18530273, + "step": 16377, + "time_per_iteration": 2.9014320373535156 + }, + { + "auxiliary_loss_clip": 0.01403505, + "auxiliary_loss_mlp": 0.01033399, + "balance_loss_clip": 1.24304891, + "balance_loss_mlp": 1.0147543, + "epoch": 0.9846986321960018, + "flos": 24327554158080.0, + "grad_norm": 1.7073433586366624, + "language_loss": 0.74081969, + "learning_rate": 2.445954472695133e-09, + "loss": 0.76518875, + "num_input_tokens_seen": 353454195, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18640137, + "step": 16378, + "time_per_iteration": 2.8990511894226074 + }, + { + "auxiliary_loss_clip": 0.01397814, + "auxiliary_loss_mlp": 0.01032298, + "balance_loss_clip": 1.23867548, + "balance_loss_mlp": 1.01396322, + "epoch": 0.9847587554486698, + "flos": 27283523909760.0, + "grad_norm": 1.6439877443565571, + "language_loss": 0.71049571, + "learning_rate": 2.426736789116868e-09, + "loss": 0.73479688, + "num_input_tokens_seen": 353475125, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18322754, + "step": 16379, + "time_per_iteration": 2.9230332374572754 + }, + { + "auxiliary_loss_clip": 0.0140022, + "auxiliary_loss_mlp": 0.01032217, + "balance_loss_clip": 1.23968482, + "balance_loss_mlp": 1.01339364, + "epoch": 0.9848188787013378, + "flos": 16550817200640.0, + "grad_norm": 1.819288466970461, + "language_loss": 0.6921615, + "learning_rate": 2.407594853716999e-09, + "loss": 0.71648586, + "num_input_tokens_seen": 353493265, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18811035, + "step": 16380, + "time_per_iteration": 2.8730764389038086 + }, + { + "auxiliary_loss_clip": 0.01408244, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.24382401, + "balance_loss_mlp": 1.01386571, + "epoch": 0.9848790019540057, + "flos": 20203305690240.0, + "grad_norm": 1.9020148737699714, + "language_loss": 0.79697436, + "learning_rate": 2.38852866722139e-09, + "loss": 0.82139087, + "num_input_tokens_seen": 353511650, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.1953125, + "step": 16381, + "time_per_iteration": 2.8677265644073486 + }, + { + "auxiliary_loss_clip": 0.01402657, + "auxiliary_loss_mlp": 0.01031662, + "balance_loss_clip": 1.2415123, + "balance_loss_mlp": 1.01274383, + "epoch": 0.9849391252066737, + "flos": 28272180193920.0, + "grad_norm": 1.4761322258319163, + "language_loss": 0.83083123, + "learning_rate": 2.3695382303527965e-09, + "loss": 0.85517442, + "num_input_tokens_seen": 353534035, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.18920898, + "step": 16382, + "time_per_iteration": 2.9514949321746826 + }, + { + "auxiliary_loss_clip": 0.01411039, + "auxiliary_loss_mlp": 0.01031195, + "balance_loss_clip": 1.24681437, + "balance_loss_mlp": 1.01194251, + "epoch": 0.9849992484593416, + "flos": 22464973699200.0, + "grad_norm": 1.8069972822303069, + "language_loss": 0.74937087, + "learning_rate": 2.3506235438315316e-09, + "loss": 0.77379322, + "num_input_tokens_seen": 353549950, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.19250488, + "step": 16383, + "time_per_iteration": 2.8254432678222656 + }, + { + "auxiliary_loss_clip": 0.0140342, + "auxiliary_loss_mlp": 0.0103305, + "balance_loss_clip": 1.24305797, + "balance_loss_mlp": 1.01377344, + "epoch": 0.9850593717120096, + "flos": 34510851532800.0, + "grad_norm": 1.7567342576549105, + "language_loss": 0.67192638, + "learning_rate": 2.3317846083750203e-09, + "loss": 0.69629109, + "num_input_tokens_seen": 353573745, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19274902, + "step": 16384, + "time_per_iteration": 4.360185623168945 + }, + { + "auxiliary_loss_clip": 0.01406865, + "auxiliary_loss_mlp": 0.01035956, + "balance_loss_clip": 1.24487042, + "balance_loss_mlp": 1.01675141, + "epoch": 0.9851194949646775, + "flos": 38851624206720.0, + "grad_norm": 1.747995428822308, + "language_loss": 0.71622294, + "learning_rate": 2.313021424697359e-09, + "loss": 0.74065107, + "num_input_tokens_seen": 353595335, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.1920166, + "step": 16385, + "time_per_iteration": 2.983834981918335 + }, + { + "auxiliary_loss_clip": 0.01409948, + "auxiliary_loss_mlp": 0.01032779, + "balance_loss_clip": 1.24913669, + "balance_loss_mlp": 1.01388419, + "epoch": 0.9851796182173456, + "flos": 17721403401600.0, + "grad_norm": 1.8843443149609969, + "language_loss": 0.82167351, + "learning_rate": 2.294333993509978e-09, + "loss": 0.84610087, + "num_input_tokens_seen": 353614270, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.18896484, + "step": 16386, + "time_per_iteration": 4.316385269165039 + }, + { + "auxiliary_loss_clip": 0.0140934, + "auxiliary_loss_mlp": 0.01036677, + "balance_loss_clip": 1.2483269, + "balance_loss_mlp": 1.01717424, + "epoch": 0.9852397414700135, + "flos": 27465861029760.0, + "grad_norm": 2.0028294475262167, + "language_loss": 0.68844473, + "learning_rate": 2.2757223155216442e-09, + "loss": 0.71290493, + "num_input_tokens_seen": 353634900, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.19519043, + "step": 16387, + "time_per_iteration": 2.8710286617279053 + }, + { + "auxiliary_loss_clip": 0.01379441, + "auxiliary_loss_mlp": 0.0103253, + "balance_loss_clip": 1.22604513, + "balance_loss_mlp": 1.0144937, + "epoch": 0.9852998647226815, + "flos": 18305949962880.0, + "grad_norm": 2.530714127502147, + "language_loss": 0.75028926, + "learning_rate": 2.257186391438237e-09, + "loss": 0.77440894, + "num_input_tokens_seen": 353652890, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.18041992, + "step": 16388, + "time_per_iteration": 2.824612855911255 + }, + { + "auxiliary_loss_clip": 0.0139284, + "auxiliary_loss_mlp": 0.01031388, + "balance_loss_clip": 1.23440742, + "balance_loss_mlp": 1.01335168, + "epoch": 0.9853599879753495, + "flos": 19651091178240.0, + "grad_norm": 2.836748547456495, + "language_loss": 0.82969445, + "learning_rate": 2.238726221962528e-09, + "loss": 0.85393667, + "num_input_tokens_seen": 353671295, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.18041992, + "step": 16389, + "time_per_iteration": 2.824563503265381 + }, + { + "auxiliary_loss_clip": 0.01390799, + "auxiliary_loss_mlp": 0.01028919, + "balance_loss_clip": 1.23236752, + "balance_loss_mlp": 1.00969088, + "epoch": 0.9854201112280174, + "flos": 23852491309440.0, + "grad_norm": 4.504824147853173, + "language_loss": 0.67813128, + "learning_rate": 2.2203418077946234e-09, + "loss": 0.7023285, + "num_input_tokens_seen": 353690560, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.19226074, + "step": 16390, + "time_per_iteration": 2.8358211517333984 + }, + { + "auxiliary_loss_clip": 0.01417146, + "auxiliary_loss_mlp": 0.01038451, + "balance_loss_clip": 1.25562644, + "balance_loss_mlp": 1.01929426, + "epoch": 0.9854802344806854, + "flos": 30092610481920.0, + "grad_norm": 1.767111895931276, + "language_loss": 0.77645439, + "learning_rate": 2.2020331496312994e-09, + "loss": 0.80101037, + "num_input_tokens_seen": 353710660, + "router_z_loss_clip": 1.61621094, + "router_z_loss_mlp": 0.19165039, + "step": 16391, + "time_per_iteration": 2.9367012977600098 + }, + { + "auxiliary_loss_clip": 0.0137723, + "auxiliary_loss_mlp": 0.01035527, + "balance_loss_clip": 1.22404671, + "balance_loss_mlp": 1.01677513, + "epoch": 0.9855403577333534, + "flos": 21917238422400.0, + "grad_norm": 3.719820029087447, + "language_loss": 0.69277596, + "learning_rate": 2.1838002481673333e-09, + "loss": 0.71690357, + "num_input_tokens_seen": 353730440, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.18762207, + "step": 16392, + "time_per_iteration": 2.8416285514831543 + }, + { + "auxiliary_loss_clip": 0.01419583, + "auxiliary_loss_mlp": 0.01033204, + "balance_loss_clip": 1.25276935, + "balance_loss_mlp": 1.01278389, + "epoch": 0.9856004809860214, + "flos": 15422154946560.0, + "grad_norm": 7.0796075759043156, + "language_loss": 0.57301009, + "learning_rate": 2.1656431040937286e-09, + "loss": 0.59753799, + "num_input_tokens_seen": 353748360, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.2043457, + "step": 16393, + "time_per_iteration": 2.8223114013671875 + }, + { + "auxiliary_loss_clip": 0.01399772, + "auxiliary_loss_mlp": 0.0103249, + "balance_loss_clip": 1.2355479, + "balance_loss_mlp": 1.01385808, + "epoch": 0.9856606042386893, + "flos": 13658697141120.0, + "grad_norm": 3.411685874541053, + "language_loss": 0.79819244, + "learning_rate": 2.1475617180990444e-09, + "loss": 0.82251513, + "num_input_tokens_seen": 353760880, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.18615723, + "step": 16394, + "time_per_iteration": 2.779979944229126 + }, + { + "auxiliary_loss_clip": 0.01402366, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.24078119, + "balance_loss_mlp": 1.0136342, + "epoch": 0.9857207274913573, + "flos": 23490124554240.0, + "grad_norm": 1.4687254398454892, + "language_loss": 0.7671209, + "learning_rate": 2.129556090869178e-09, + "loss": 0.79146796, + "num_input_tokens_seen": 353782255, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.18701172, + "step": 16395, + "time_per_iteration": 4.313923120498657 + }, + { + "auxiliary_loss_clip": 0.01400235, + "auxiliary_loss_mlp": 0.01031735, + "balance_loss_clip": 1.24263501, + "balance_loss_mlp": 1.01274538, + "epoch": 0.9857808507440252, + "flos": 21074379442560.0, + "grad_norm": 1.8700402348234162, + "language_loss": 0.7590099, + "learning_rate": 2.1116262230866933e-09, + "loss": 0.78332961, + "num_input_tokens_seen": 353803580, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18994141, + "step": 16396, + "time_per_iteration": 4.341885328292847 + }, + { + "auxiliary_loss_clip": 0.0139445, + "auxiliary_loss_mlp": 0.01032268, + "balance_loss_clip": 1.23705721, + "balance_loss_mlp": 1.01344526, + "epoch": 0.9858409739966932, + "flos": 25312274144640.0, + "grad_norm": 1.4407267834413362, + "language_loss": 0.71642005, + "learning_rate": 2.0937721154317133e-09, + "loss": 0.74068719, + "num_input_tokens_seen": 353824200, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18835449, + "step": 16397, + "time_per_iteration": 2.9244003295898438 + }, + { + "auxiliary_loss_clip": 0.01386336, + "auxiliary_loss_mlp": 0.01032595, + "balance_loss_clip": 1.23293519, + "balance_loss_mlp": 1.01464248, + "epoch": 0.9859010972493611, + "flos": 20568568112640.0, + "grad_norm": 1.6054343834615512, + "language_loss": 0.7226454, + "learning_rate": 2.0759937685810304e-09, + "loss": 0.7468347, + "num_input_tokens_seen": 353843350, + "router_z_loss_clip": 1.53417969, + "router_z_loss_mlp": 0.17944336, + "step": 16398, + "time_per_iteration": 2.880894660949707 + }, + { + "auxiliary_loss_clip": 0.01401069, + "auxiliary_loss_mlp": 0.01032681, + "balance_loss_clip": 1.24321866, + "balance_loss_mlp": 1.01409674, + "epoch": 0.9859612205020292, + "flos": 24765986701440.0, + "grad_norm": 1.5097557565261428, + "language_loss": 0.74801075, + "learning_rate": 2.058291183208771e-09, + "loss": 0.77234828, + "num_input_tokens_seen": 353864520, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18591309, + "step": 16399, + "time_per_iteration": 2.921234369277954 + }, + { + "auxiliary_loss_clip": 0.0139886, + "auxiliary_loss_mlp": 0.01035005, + "balance_loss_clip": 1.23905742, + "balance_loss_mlp": 1.01489472, + "epoch": 0.9860213437546971, + "flos": 21115760451840.0, + "grad_norm": 1.8494235383744735, + "language_loss": 0.58158511, + "learning_rate": 2.0406643599863993e-09, + "loss": 0.60592377, + "num_input_tokens_seen": 353882240, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.20117188, + "step": 16400, + "time_per_iteration": 2.8620755672454834 + }, + { + "auxiliary_loss_clip": 0.01422207, + "auxiliary_loss_mlp": 0.01034365, + "balance_loss_clip": 1.25506163, + "balance_loss_mlp": 1.01473093, + "epoch": 0.9860814670073651, + "flos": 19145596561920.0, + "grad_norm": 2.1286804973289697, + "language_loss": 0.81499112, + "learning_rate": 2.023113299582491e-09, + "loss": 0.83955681, + "num_input_tokens_seen": 353901590, + "router_z_loss_clip": 1.67285156, + "router_z_loss_mlp": 0.19641113, + "step": 16401, + "time_per_iteration": 2.914907693862915 + }, + { + "auxiliary_loss_clip": 0.01386358, + "auxiliary_loss_mlp": 0.01031611, + "balance_loss_clip": 1.22993231, + "balance_loss_mlp": 1.01233459, + "epoch": 0.9861415902600331, + "flos": 17245616636160.0, + "grad_norm": 1.7411081070866785, + "language_loss": 0.79070008, + "learning_rate": 2.005638002662069e-09, + "loss": 0.81487978, + "num_input_tokens_seen": 353918785, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.19262695, + "step": 16402, + "time_per_iteration": 2.902890205383301 + }, + { + "auxiliary_loss_clip": 0.01395688, + "auxiliary_loss_mlp": 0.01033933, + "balance_loss_clip": 1.23590302, + "balance_loss_mlp": 1.01474071, + "epoch": 0.986201713512701, + "flos": 27794176433280.0, + "grad_norm": 1.775030176844202, + "language_loss": 0.7114929, + "learning_rate": 1.9882384698881596e-09, + "loss": 0.73578906, + "num_input_tokens_seen": 353940390, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.1920166, + "step": 16403, + "time_per_iteration": 2.9123497009277344 + }, + { + "auxiliary_loss_clip": 0.01398904, + "auxiliary_loss_mlp": 0.01028582, + "balance_loss_clip": 1.24031365, + "balance_loss_mlp": 1.01048601, + "epoch": 0.986261836765369, + "flos": 28742039890560.0, + "grad_norm": 1.7582831087140398, + "language_loss": 0.75262415, + "learning_rate": 1.9709147019204566e-09, + "loss": 0.77689898, + "num_input_tokens_seen": 353962180, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.18103027, + "step": 16404, + "time_per_iteration": 2.896562337875366 + }, + { + "auxiliary_loss_clip": 0.01404024, + "auxiliary_loss_mlp": 0.01035969, + "balance_loss_clip": 1.24263072, + "balance_loss_mlp": 1.01740813, + "epoch": 0.986321960018037, + "flos": 34326749865600.0, + "grad_norm": 1.8570842492651563, + "language_loss": 0.70821142, + "learning_rate": 1.953666699415768e-09, + "loss": 0.73261136, + "num_input_tokens_seen": 353984305, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.1854248, + "step": 16405, + "time_per_iteration": 2.9488749504089355 + }, + { + "auxiliary_loss_clip": 0.01393794, + "auxiliary_loss_mlp": 0.01031093, + "balance_loss_clip": 1.2380873, + "balance_loss_mlp": 1.01303256, + "epoch": 0.986382083270705, + "flos": 25200075744000.0, + "grad_norm": 1.7805660764713538, + "language_loss": 0.70745975, + "learning_rate": 1.93649446302846e-09, + "loss": 0.73170865, + "num_input_tokens_seen": 354004495, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.18066406, + "step": 16406, + "time_per_iteration": 2.8927483558654785 + }, + { + "auxiliary_loss_clip": 0.013874, + "auxiliary_loss_mlp": 0.01032396, + "balance_loss_clip": 1.23113787, + "balance_loss_mlp": 1.0138948, + "epoch": 0.9864422065233729, + "flos": 11030635589760.0, + "grad_norm": 11.122142521354993, + "language_loss": 0.74835587, + "learning_rate": 1.9193979934095663e-09, + "loss": 0.7725538, + "num_input_tokens_seen": 354015985, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.18505859, + "step": 16407, + "time_per_iteration": 2.7870326042175293 + }, + { + "auxiliary_loss_clip": 0.01387695, + "auxiliary_loss_mlp": 0.01030443, + "balance_loss_clip": 1.23030782, + "balance_loss_mlp": 1.01171517, + "epoch": 0.9865023297760409, + "flos": 16554436784640.0, + "grad_norm": 5.132109332118595, + "language_loss": 0.77706742, + "learning_rate": 1.9023772912072357e-09, + "loss": 0.80124879, + "num_input_tokens_seen": 354033260, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18725586, + "step": 16408, + "time_per_iteration": 2.8352224826812744 + }, + { + "auxiliary_loss_clip": 0.01416348, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.25211167, + "balance_loss_mlp": 1.01324081, + "epoch": 0.9865624530287088, + "flos": 18889501138560.0, + "grad_norm": 1.7703462783372068, + "language_loss": 0.69054627, + "learning_rate": 1.8854323570669515e-09, + "loss": 0.71503001, + "num_input_tokens_seen": 354052825, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.18811035, + "step": 16409, + "time_per_iteration": 2.8402485847473145 + }, + { + "auxiliary_loss_clip": 0.01180813, + "auxiliary_loss_mlp": 0.01028396, + "balance_loss_clip": 1.09420919, + "balance_loss_mlp": 1.00941837, + "epoch": 0.9866225762813768, + "flos": 68915838936960.0, + "grad_norm": 0.8048945630231783, + "language_loss": 0.61097121, + "learning_rate": 1.8685631916313118e-09, + "loss": 0.63306332, + "num_input_tokens_seen": 354113920, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.18945312, + "step": 16410, + "time_per_iteration": 3.405085325241089 + }, + { + "auxiliary_loss_clip": 0.01397049, + "auxiliary_loss_mlp": 0.01032651, + "balance_loss_clip": 1.23698974, + "balance_loss_mlp": 1.01289773, + "epoch": 0.9866826995340447, + "flos": 29035037088000.0, + "grad_norm": 2.0900844003183536, + "language_loss": 0.668715, + "learning_rate": 1.8517697955400258e-09, + "loss": 0.693012, + "num_input_tokens_seen": 354134210, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.19750977, + "step": 16411, + "time_per_iteration": 3.032343626022339 + }, + { + "auxiliary_loss_clip": 0.01181314, + "auxiliary_loss_mlp": 0.01038858, + "balance_loss_clip": 1.09372211, + "balance_loss_mlp": 1.01549268, + "epoch": 0.9867428227867128, + "flos": 65411246995200.0, + "grad_norm": 0.7232335389489192, + "language_loss": 0.56308901, + "learning_rate": 1.8350521694299182e-09, + "loss": 0.58529079, + "num_input_tokens_seen": 354198010, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.23339844, + "step": 16412, + "time_per_iteration": 3.370803117752075 + }, + { + "auxiliary_loss_clip": 0.01418032, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.2531724, + "balance_loss_mlp": 1.01243305, + "epoch": 0.9868029460393807, + "flos": 26517771348480.0, + "grad_norm": 1.9019440255109836, + "language_loss": 0.73458463, + "learning_rate": 1.818410313934926e-09, + "loss": 0.75908792, + "num_input_tokens_seen": 354220000, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.1986084, + "step": 16413, + "time_per_iteration": 2.9063198566436768 + }, + { + "auxiliary_loss_clip": 0.0139342, + "auxiliary_loss_mlp": 0.01029892, + "balance_loss_clip": 1.23302853, + "balance_loss_mlp": 1.01159334, + "epoch": 0.9868630692920487, + "flos": 22977843217920.0, + "grad_norm": 1.7042336294599283, + "language_loss": 0.72071773, + "learning_rate": 1.8018442296858782e-09, + "loss": 0.74495083, + "num_input_tokens_seen": 354240910, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.1829834, + "step": 16414, + "time_per_iteration": 2.892103433609009 + }, + { + "auxiliary_loss_clip": 0.01384885, + "auxiliary_loss_mlp": 0.0103234, + "balance_loss_clip": 1.23120296, + "balance_loss_mlp": 1.01346922, + "epoch": 0.9869231925447167, + "flos": 19838224247040.0, + "grad_norm": 1.8028580724795036, + "language_loss": 0.71473396, + "learning_rate": 1.7853539173111608e-09, + "loss": 0.73890626, + "num_input_tokens_seen": 354259430, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.1887207, + "step": 16415, + "time_per_iteration": 2.826204776763916 + }, + { + "auxiliary_loss_clip": 0.01374679, + "auxiliary_loss_mlp": 0.0102856, + "balance_loss_clip": 1.22148752, + "balance_loss_mlp": 1.01095319, + "epoch": 0.9869833157973846, + "flos": 20205296461440.0, + "grad_norm": 1.6140461473247256, + "language_loss": 0.76107764, + "learning_rate": 1.7689393774362737e-09, + "loss": 0.78511006, + "num_input_tokens_seen": 354279490, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.17614746, + "step": 16416, + "time_per_iteration": 2.931623935699463 + }, + { + "auxiliary_loss_clip": 0.01392706, + "auxiliary_loss_mlp": 0.01028598, + "balance_loss_clip": 1.23634624, + "balance_loss_mlp": 1.00947666, + "epoch": 0.9870434390500527, + "flos": 16106095630080.0, + "grad_norm": 1.8242613570742483, + "language_loss": 0.71594393, + "learning_rate": 1.7526006106833858e-09, + "loss": 0.74015701, + "num_input_tokens_seen": 354295080, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.19116211, + "step": 16417, + "time_per_iteration": 2.8050966262817383 + }, + { + "auxiliary_loss_clip": 0.01419023, + "auxiliary_loss_mlp": 0.01031195, + "balance_loss_clip": 1.25330114, + "balance_loss_mlp": 1.01158464, + "epoch": 0.9871035623027206, + "flos": 21770355242880.0, + "grad_norm": 2.162598400949832, + "language_loss": 0.71517164, + "learning_rate": 1.7363376176720013e-09, + "loss": 0.73967385, + "num_input_tokens_seen": 354314610, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.19604492, + "step": 16418, + "time_per_iteration": 2.880129098892212 + }, + { + "auxiliary_loss_clip": 0.01181082, + "auxiliary_loss_mlp": 0.01038478, + "balance_loss_clip": 1.09310055, + "balance_loss_mlp": 1.01482701, + "epoch": 0.9871636855553886, + "flos": 70252718336640.0, + "grad_norm": 0.6621746989465043, + "language_loss": 0.53711224, + "learning_rate": 1.7201503990189603e-09, + "loss": 0.55930781, + "num_input_tokens_seen": 354383115, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.23632812, + "step": 16419, + "time_per_iteration": 5.01503586769104 + }, + { + "auxiliary_loss_clip": 0.01399927, + "auxiliary_loss_mlp": 0.01037803, + "balance_loss_clip": 1.23767471, + "balance_loss_mlp": 1.01758528, + "epoch": 0.9872238088080565, + "flos": 25056631169280.0, + "grad_norm": 1.7688479174860716, + "language_loss": 0.78800833, + "learning_rate": 1.7040389553382162e-09, + "loss": 0.81238562, + "num_input_tokens_seen": 354403115, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.20227051, + "step": 16420, + "time_per_iteration": 2.8838114738464355 + }, + { + "auxiliary_loss_clip": 0.01391352, + "auxiliary_loss_mlp": 0.01033165, + "balance_loss_clip": 1.23584116, + "balance_loss_mlp": 1.0140202, + "epoch": 0.9872839320607245, + "flos": 19475812247040.0, + "grad_norm": 1.8500805285590918, + "language_loss": 0.71105075, + "learning_rate": 1.6880032872403916e-09, + "loss": 0.73529595, + "num_input_tokens_seen": 354424520, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.19152832, + "step": 16421, + "time_per_iteration": 4.341374635696411 + }, + { + "auxiliary_loss_clip": 0.01411209, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.24770212, + "balance_loss_mlp": 1.01401067, + "epoch": 0.9873440553133924, + "flos": 26954167875840.0, + "grad_norm": 1.876757470214039, + "language_loss": 0.82905984, + "learning_rate": 1.6720433953338886e-09, + "loss": 0.85350633, + "num_input_tokens_seen": 354444800, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19433594, + "step": 16422, + "time_per_iteration": 2.945460557937622 + }, + { + "auxiliary_loss_clip": 0.0139971, + "auxiliary_loss_mlp": 0.01033944, + "balance_loss_clip": 1.24330842, + "balance_loss_mlp": 1.01519239, + "epoch": 0.9874041785660604, + "flos": 19071476300160.0, + "grad_norm": 1.767852906874787, + "language_loss": 0.87122798, + "learning_rate": 1.656159280223779e-09, + "loss": 0.89556456, + "num_input_tokens_seen": 354464590, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.1875, + "step": 16423, + "time_per_iteration": 2.849529504776001 + }, + { + "auxiliary_loss_clip": 0.01401092, + "auxiliary_loss_mlp": 0.01033906, + "balance_loss_clip": 1.24025822, + "balance_loss_mlp": 1.01348543, + "epoch": 0.9874643018187284, + "flos": 21115715207040.0, + "grad_norm": 3.1053685746657527, + "language_loss": 0.71658301, + "learning_rate": 1.6403509425122475e-09, + "loss": 0.740933, + "num_input_tokens_seen": 354484145, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.20397949, + "step": 16424, + "time_per_iteration": 2.8494362831115723 + }, + { + "auxiliary_loss_clip": 0.01402442, + "auxiliary_loss_mlp": 0.01032486, + "balance_loss_clip": 1.24115956, + "balance_loss_mlp": 1.01337731, + "epoch": 0.9875244250713964, + "flos": 24436811646720.0, + "grad_norm": 2.343569889449802, + "language_loss": 0.81452322, + "learning_rate": 1.6246183827990366e-09, + "loss": 0.83887249, + "num_input_tokens_seen": 354502475, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.19116211, + "step": 16425, + "time_per_iteration": 2.894390821456909 + }, + { + "auxiliary_loss_clip": 0.01392291, + "auxiliary_loss_mlp": 0.01030717, + "balance_loss_clip": 1.23306966, + "balance_loss_mlp": 1.01086855, + "epoch": 0.9875845483240643, + "flos": 25128127232640.0, + "grad_norm": 2.0917833937464385, + "language_loss": 0.80520761, + "learning_rate": 1.6089616016803364e-09, + "loss": 0.82943773, + "num_input_tokens_seen": 354521855, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.19848633, + "step": 16426, + "time_per_iteration": 2.855056047439575 + }, + { + "auxiliary_loss_clip": 0.01389911, + "auxiliary_loss_mlp": 0.01035739, + "balance_loss_clip": 1.23250341, + "balance_loss_mlp": 1.01617718, + "epoch": 0.9876446715767323, + "flos": 16590569397120.0, + "grad_norm": 1.8828727462905686, + "language_loss": 0.85659063, + "learning_rate": 1.593380599750338e-09, + "loss": 0.88084716, + "num_input_tokens_seen": 354539535, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.19555664, + "step": 16427, + "time_per_iteration": 2.8632853031158447 + }, + { + "auxiliary_loss_clip": 0.01407055, + "auxiliary_loss_mlp": 0.01033014, + "balance_loss_clip": 1.24988639, + "balance_loss_mlp": 1.01455998, + "epoch": 0.9877047948294003, + "flos": 21626051016960.0, + "grad_norm": 1.6868841474262293, + "language_loss": 0.71219558, + "learning_rate": 1.577875377599458e-09, + "loss": 0.73659629, + "num_input_tokens_seen": 354557430, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.18444824, + "step": 16428, + "time_per_iteration": 2.8641908168792725 + }, + { + "auxiliary_loss_clip": 0.01390004, + "auxiliary_loss_mlp": 0.01029761, + "balance_loss_clip": 1.23307657, + "balance_loss_mlp": 1.01136732, + "epoch": 0.9877649180820682, + "flos": 21188342390400.0, + "grad_norm": 2.869681706012376, + "language_loss": 0.81181949, + "learning_rate": 1.5624459358158926e-09, + "loss": 0.83601713, + "num_input_tokens_seen": 354574735, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.18395996, + "step": 16429, + "time_per_iteration": 2.876523971557617 + }, + { + "auxiliary_loss_clip": 0.01403493, + "auxiliary_loss_mlp": 0.01028641, + "balance_loss_clip": 1.24455023, + "balance_loss_mlp": 1.01095068, + "epoch": 0.9878250413347363, + "flos": 39763762254720.0, + "grad_norm": 1.5745816376620567, + "language_loss": 0.63061202, + "learning_rate": 1.5470922749845073e-09, + "loss": 0.65493333, + "num_input_tokens_seen": 354597050, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.17700195, + "step": 16430, + "time_per_iteration": 4.526405334472656 + }, + { + "auxiliary_loss_clip": 0.01392664, + "auxiliary_loss_mlp": 0.01032992, + "balance_loss_clip": 1.2333566, + "balance_loss_mlp": 1.01385903, + "epoch": 0.9878851645874042, + "flos": 29437970446080.0, + "grad_norm": 1.2603608363570549, + "language_loss": 0.73316455, + "learning_rate": 1.531814395687725e-09, + "loss": 0.75742114, + "num_input_tokens_seen": 354619095, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.19128418, + "step": 16431, + "time_per_iteration": 4.360586166381836 + }, + { + "auxiliary_loss_clip": 0.01398731, + "auxiliary_loss_mlp": 0.01031663, + "balance_loss_clip": 1.23906994, + "balance_loss_mlp": 1.01305485, + "epoch": 0.9879452878400722, + "flos": 15812419760640.0, + "grad_norm": 2.19100921825087, + "language_loss": 0.81681073, + "learning_rate": 1.5166122985048602e-09, + "loss": 0.84111464, + "num_input_tokens_seen": 354633790, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18615723, + "step": 16432, + "time_per_iteration": 2.826491117477417 + }, + { + "auxiliary_loss_clip": 0.01385667, + "auxiliary_loss_mlp": 0.01030803, + "balance_loss_clip": 1.22922778, + "balance_loss_mlp": 1.01313591, + "epoch": 0.9880054110927401, + "flos": 22243336830720.0, + "grad_norm": 1.4329380653035315, + "language_loss": 0.80867302, + "learning_rate": 1.5014859840123405e-09, + "loss": 0.8328377, + "num_input_tokens_seen": 354653180, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.17651367, + "step": 16433, + "time_per_iteration": 2.893446445465088 + }, + { + "auxiliary_loss_clip": 0.01391285, + "auxiliary_loss_mlp": 0.01031576, + "balance_loss_clip": 1.23467445, + "balance_loss_mlp": 1.01260972, + "epoch": 0.9880655343454081, + "flos": 28774688653440.0, + "grad_norm": 2.263975873388379, + "language_loss": 0.65694475, + "learning_rate": 1.4864354527837075e-09, + "loss": 0.68117332, + "num_input_tokens_seen": 354669900, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.18969727, + "step": 16434, + "time_per_iteration": 2.8816497325897217 + }, + { + "auxiliary_loss_clip": 0.01404736, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.24238777, + "balance_loss_mlp": 1.01253581, + "epoch": 0.988125657598076, + "flos": 32866107379200.0, + "grad_norm": 1.7228588235009938, + "language_loss": 0.69990021, + "learning_rate": 1.4714607053896154e-09, + "loss": 0.72425652, + "num_input_tokens_seen": 354693165, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.18347168, + "step": 16435, + "time_per_iteration": 2.9447977542877197 + }, + { + "auxiliary_loss_clip": 0.01393156, + "auxiliary_loss_mlp": 0.01032215, + "balance_loss_clip": 1.23494029, + "balance_loss_mlp": 1.01324892, + "epoch": 0.988185780850744, + "flos": 19400063172480.0, + "grad_norm": 1.7985508229044551, + "language_loss": 0.76087976, + "learning_rate": 1.4565617423980548e-09, + "loss": 0.78513348, + "num_input_tokens_seen": 354711915, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.18969727, + "step": 16436, + "time_per_iteration": 2.864062786102295 + }, + { + "auxiliary_loss_clip": 0.01402619, + "auxiliary_loss_mlp": 0.01031077, + "balance_loss_clip": 1.24296403, + "balance_loss_mlp": 1.01139593, + "epoch": 0.988245904103412, + "flos": 22538596268160.0, + "grad_norm": 2.108242118773648, + "language_loss": 0.75229537, + "learning_rate": 1.4417385643741286e-09, + "loss": 0.77663231, + "num_input_tokens_seen": 354729135, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19677734, + "step": 16437, + "time_per_iteration": 2.8416621685028076 + }, + { + "auxiliary_loss_clip": 0.01374109, + "auxiliary_loss_mlp": 0.01034168, + "balance_loss_clip": 1.22071767, + "balance_loss_mlp": 1.01597691, + "epoch": 0.98830602735608, + "flos": 28671267744000.0, + "grad_norm": 1.6034908463391344, + "language_loss": 0.60430837, + "learning_rate": 1.4269911718796103e-09, + "loss": 0.62839109, + "num_input_tokens_seen": 354752530, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.18188477, + "step": 16438, + "time_per_iteration": 2.9804015159606934 + }, + { + "auxiliary_loss_clip": 0.01392746, + "auxiliary_loss_mlp": 0.01031765, + "balance_loss_clip": 1.23495388, + "balance_loss_mlp": 1.01264405, + "epoch": 0.9883661506087479, + "flos": 21005824291200.0, + "grad_norm": 3.280548525338781, + "language_loss": 0.7300126, + "learning_rate": 1.4123195654738295e-09, + "loss": 0.75425768, + "num_input_tokens_seen": 354771135, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.19116211, + "step": 16439, + "time_per_iteration": 2.8490633964538574 + }, + { + "auxiliary_loss_clip": 0.01383008, + "auxiliary_loss_mlp": 0.01034723, + "balance_loss_clip": 1.2276144, + "balance_loss_mlp": 1.01595998, + "epoch": 0.9884262738614159, + "flos": 32718455038080.0, + "grad_norm": 1.622422454468921, + "language_loss": 0.60882771, + "learning_rate": 1.3977237457134528e-09, + "loss": 0.63300502, + "num_input_tokens_seen": 354791800, + "router_z_loss_clip": 1.5546875, + "router_z_loss_mlp": 0.18774414, + "step": 16440, + "time_per_iteration": 2.960610866546631 + }, + { + "auxiliary_loss_clip": 0.01402042, + "auxiliary_loss_mlp": 0.01035071, + "balance_loss_clip": 1.24079323, + "balance_loss_mlp": 1.01615226, + "epoch": 0.9884863971140839, + "flos": 17573162878080.0, + "grad_norm": 2.5844537566043315, + "language_loss": 0.7728591, + "learning_rate": 1.3832037131513707e-09, + "loss": 0.7972303, + "num_input_tokens_seen": 354809200, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18908691, + "step": 16441, + "time_per_iteration": 2.898624897003174 + }, + { + "auxiliary_loss_clip": 0.01404912, + "auxiliary_loss_mlp": 0.01035993, + "balance_loss_clip": 1.24498153, + "balance_loss_mlp": 1.0170629, + "epoch": 0.9885465203667518, + "flos": 40567411975680.0, + "grad_norm": 2.1140195426228128, + "language_loss": 0.68566346, + "learning_rate": 1.3687594683386982e-09, + "loss": 0.71007252, + "num_input_tokens_seen": 354829945, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.18933105, + "step": 16442, + "time_per_iteration": 3.069356918334961 + }, + { + "auxiliary_loss_clip": 0.01382653, + "auxiliary_loss_mlp": 0.01029879, + "balance_loss_clip": 1.22685969, + "balance_loss_mlp": 1.01140141, + "epoch": 0.9886066436194199, + "flos": 13815805645440.0, + "grad_norm": 2.6738773239888247, + "language_loss": 0.75557512, + "learning_rate": 1.3543910118227753e-09, + "loss": 0.7797004, + "num_input_tokens_seen": 354845055, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.18457031, + "step": 16443, + "time_per_iteration": 2.842747449874878 + }, + { + "auxiliary_loss_clip": 0.01401267, + "auxiliary_loss_mlp": 0.01032966, + "balance_loss_clip": 1.24202538, + "balance_loss_mlp": 1.01444137, + "epoch": 0.9886667668720878, + "flos": 23333966190720.0, + "grad_norm": 1.6541528053125119, + "language_loss": 0.74732929, + "learning_rate": 1.3400983441487213e-09, + "loss": 0.77167165, + "num_input_tokens_seen": 354864680, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.18530273, + "step": 16444, + "time_per_iteration": 2.865488290786743 + }, + { + "auxiliary_loss_clip": 0.01397773, + "auxiliary_loss_mlp": 0.01032299, + "balance_loss_clip": 1.24002981, + "balance_loss_mlp": 1.01327372, + "epoch": 0.9887268901247558, + "flos": 22715006319360.0, + "grad_norm": 1.8009080877633203, + "language_loss": 0.69415748, + "learning_rate": 1.325881465858547e-09, + "loss": 0.71845818, + "num_input_tokens_seen": 354885685, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.19018555, + "step": 16445, + "time_per_iteration": 2.938490390777588 + }, + { + "auxiliary_loss_clip": 0.01387474, + "auxiliary_loss_mlp": 0.01032133, + "balance_loss_clip": 1.2296648, + "balance_loss_mlp": 1.01328564, + "epoch": 0.9887870133774237, + "flos": 13048786229760.0, + "grad_norm": 3.745076057305797, + "language_loss": 0.61128354, + "learning_rate": 1.311740377491155e-09, + "loss": 0.63547963, + "num_input_tokens_seen": 354901505, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18859863, + "step": 16446, + "time_per_iteration": 2.858058214187622 + }, + { + "auxiliary_loss_clip": 0.01382388, + "auxiliary_loss_mlp": 0.01033229, + "balance_loss_clip": 1.22605908, + "balance_loss_mlp": 1.01516891, + "epoch": 0.9888471366300917, + "flos": 15167281132800.0, + "grad_norm": 1.9250539361817742, + "language_loss": 0.71721661, + "learning_rate": 1.297675079582783e-09, + "loss": 0.74137282, + "num_input_tokens_seen": 354920060, + "router_z_loss_clip": 1.56445312, + "router_z_loss_mlp": 0.18066406, + "step": 16447, + "time_per_iteration": 2.8022687435150146 + }, + { + "auxiliary_loss_clip": 0.01387822, + "auxiliary_loss_mlp": 0.0103107, + "balance_loss_clip": 1.23129034, + "balance_loss_mlp": 1.01378465, + "epoch": 0.9889072598827596, + "flos": 25129620311040.0, + "grad_norm": 1.9978546576799325, + "language_loss": 0.84051061, + "learning_rate": 1.2836855726667818e-09, + "loss": 0.86469948, + "num_input_tokens_seen": 354938690, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.17297363, + "step": 16448, + "time_per_iteration": 2.8684353828430176 + }, + { + "auxiliary_loss_clip": 0.01383265, + "auxiliary_loss_mlp": 0.01028887, + "balance_loss_clip": 1.22688961, + "balance_loss_mlp": 1.01147032, + "epoch": 0.9889673831354276, + "flos": 16737724045440.0, + "grad_norm": 1.6317993720618438, + "language_loss": 0.71057594, + "learning_rate": 1.26977185727406e-09, + "loss": 0.73469746, + "num_input_tokens_seen": 354956955, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.17407227, + "step": 16449, + "time_per_iteration": 2.8257665634155273 + }, + { + "auxiliary_loss_clip": 0.01414778, + "auxiliary_loss_mlp": 0.01029614, + "balance_loss_clip": 1.25260293, + "balance_loss_mlp": 1.01092148, + "epoch": 0.9890275063880956, + "flos": 35597408860800.0, + "grad_norm": 2.0699314500929042, + "language_loss": 0.74887818, + "learning_rate": 1.25593393393153e-09, + "loss": 0.77332217, + "num_input_tokens_seen": 354976800, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.18713379, + "step": 16450, + "time_per_iteration": 3.007493734359741 + }, + { + "auxiliary_loss_clip": 0.01400375, + "auxiliary_loss_mlp": 0.0103156, + "balance_loss_clip": 1.23737669, + "balance_loss_mlp": 1.0135355, + "epoch": 0.9890876296407636, + "flos": 18961494894720.0, + "grad_norm": 2.404630838383014, + "language_loss": 0.79718059, + "learning_rate": 1.242171803164549e-09, + "loss": 0.82149994, + "num_input_tokens_seen": 354996625, + "router_z_loss_clip": 1.62890625, + "router_z_loss_mlp": 0.18017578, + "step": 16451, + "time_per_iteration": 2.872495651245117 + }, + { + "auxiliary_loss_clip": 0.01409139, + "auxiliary_loss_mlp": 0.01034584, + "balance_loss_clip": 1.2453289, + "balance_loss_mlp": 1.01551056, + "epoch": 0.9891477528934315, + "flos": 23779909370880.0, + "grad_norm": 1.9179648386048196, + "language_loss": 0.7119683, + "learning_rate": 1.2284854654946996e-09, + "loss": 0.73640549, + "num_input_tokens_seen": 355014535, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.19067383, + "step": 16452, + "time_per_iteration": 2.8751890659332275 + }, + { + "auxiliary_loss_clip": 0.0138509, + "auxiliary_loss_mlp": 0.01027635, + "balance_loss_clip": 1.23113811, + "balance_loss_mlp": 1.00986123, + "epoch": 0.9892078761460995, + "flos": 20781563224320.0, + "grad_norm": 1.698320310580232, + "language_loss": 0.74397731, + "learning_rate": 1.2148749214409004e-09, + "loss": 0.76810461, + "num_input_tokens_seen": 355033280, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.17773438, + "step": 16453, + "time_per_iteration": 2.85638165473938 + }, + { + "auxiliary_loss_clip": 0.01402047, + "auxiliary_loss_mlp": 0.01037332, + "balance_loss_clip": 1.23991764, + "balance_loss_mlp": 1.01898539, + "epoch": 0.9892679993987675, + "flos": 23378288112000.0, + "grad_norm": 2.760773229420982, + "language_loss": 0.71657538, + "learning_rate": 1.2013401715191828e-09, + "loss": 0.74096918, + "num_input_tokens_seen": 355053320, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18334961, + "step": 16454, + "time_per_iteration": 4.341542959213257 + }, + { + "auxiliary_loss_clip": 0.01379647, + "auxiliary_loss_mlp": 0.01030187, + "balance_loss_clip": 1.22476852, + "balance_loss_mlp": 1.01119685, + "epoch": 0.9893281226514354, + "flos": 22714237157760.0, + "grad_norm": 2.003869021726763, + "language_loss": 0.75248092, + "learning_rate": 1.1878812162433583e-09, + "loss": 0.77657926, + "num_input_tokens_seen": 355070230, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.18981934, + "step": 16455, + "time_per_iteration": 2.886852502822876 + }, + { + "auxiliary_loss_clip": 0.01378936, + "auxiliary_loss_mlp": 0.01033232, + "balance_loss_clip": 1.2237978, + "balance_loss_mlp": 1.0135982, + "epoch": 0.9893882459041035, + "flos": 21806125896960.0, + "grad_norm": 1.7557437973468615, + "language_loss": 0.65884215, + "learning_rate": 1.1744980561230188e-09, + "loss": 0.68296385, + "num_input_tokens_seen": 355090125, + "router_z_loss_clip": 1.55078125, + "router_z_loss_mlp": 0.19641113, + "step": 16456, + "time_per_iteration": 4.329211950302124 + }, + { + "auxiliary_loss_clip": 0.01416976, + "auxiliary_loss_mlp": 0.01035619, + "balance_loss_clip": 1.25480092, + "balance_loss_mlp": 1.01717687, + "epoch": 0.9894483691567714, + "flos": 18122843681280.0, + "grad_norm": 2.036478350333511, + "language_loss": 0.74480867, + "learning_rate": 1.161190691666203e-09, + "loss": 0.76933467, + "num_input_tokens_seen": 355107890, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.18444824, + "step": 16457, + "time_per_iteration": 2.8383498191833496 + }, + { + "auxiliary_loss_clip": 0.01401614, + "auxiliary_loss_mlp": 0.01034242, + "balance_loss_clip": 1.24011433, + "balance_loss_mlp": 1.01540744, + "epoch": 0.9895084924094394, + "flos": 31223942179200.0, + "grad_norm": 2.093606851004012, + "language_loss": 0.69823956, + "learning_rate": 1.1479591233773954e-09, + "loss": 0.72259808, + "num_input_tokens_seen": 355126340, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18847656, + "step": 16458, + "time_per_iteration": 2.9327542781829834 + }, + { + "auxiliary_loss_clip": 0.01379539, + "auxiliary_loss_mlp": 0.01030451, + "balance_loss_clip": 1.2230705, + "balance_loss_mlp": 1.01210451, + "epoch": 0.9895686156621073, + "flos": 19686318894720.0, + "grad_norm": 1.7901048071240322, + "language_loss": 0.8027972, + "learning_rate": 1.1348033517581956e-09, + "loss": 0.82689708, + "num_input_tokens_seen": 355144025, + "router_z_loss_clip": 1.56640625, + "router_z_loss_mlp": 0.18334961, + "step": 16459, + "time_per_iteration": 2.8448216915130615 + }, + { + "auxiliary_loss_clip": 0.0140258, + "auxiliary_loss_mlp": 0.01031269, + "balance_loss_clip": 1.2410562, + "balance_loss_mlp": 1.01250577, + "epoch": 0.9896287389147753, + "flos": 23590876020480.0, + "grad_norm": 2.1433783289280077, + "language_loss": 0.71858865, + "learning_rate": 1.1217233773075373e-09, + "loss": 0.74292719, + "num_input_tokens_seen": 355163125, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.18774414, + "step": 16460, + "time_per_iteration": 2.865595579147339 + }, + { + "auxiliary_loss_clip": 0.01397539, + "auxiliary_loss_mlp": 0.01035608, + "balance_loss_clip": 1.23684239, + "balance_loss_mlp": 1.01660657, + "epoch": 0.9896888621674432, + "flos": 29617004695680.0, + "grad_norm": 1.586384198624689, + "language_loss": 0.87840587, + "learning_rate": 1.1087192005214685e-09, + "loss": 0.90273732, + "num_input_tokens_seen": 355184060, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.18994141, + "step": 16461, + "time_per_iteration": 2.9141829013824463 + }, + { + "auxiliary_loss_clip": 0.01404078, + "auxiliary_loss_mlp": 0.01034964, + "balance_loss_clip": 1.24478531, + "balance_loss_mlp": 1.01573527, + "epoch": 0.9897489854201112, + "flos": 23705291416320.0, + "grad_norm": 1.5701458460334479, + "language_loss": 0.63970709, + "learning_rate": 1.09579082189315e-09, + "loss": 0.66409743, + "num_input_tokens_seen": 355204505, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.19226074, + "step": 16462, + "time_per_iteration": 2.8678526878356934 + }, + { + "auxiliary_loss_clip": 0.01399647, + "auxiliary_loss_mlp": 0.01031472, + "balance_loss_clip": 1.24181867, + "balance_loss_mlp": 1.01192212, + "epoch": 0.9898091086727792, + "flos": 13233068876160.0, + "grad_norm": 1.7279517559795192, + "language_loss": 0.73509479, + "learning_rate": 1.0829382419126343e-09, + "loss": 0.75940597, + "num_input_tokens_seen": 355223055, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.19543457, + "step": 16463, + "time_per_iteration": 2.8295462131500244 + }, + { + "auxiliary_loss_clip": 0.01407236, + "auxiliary_loss_mlp": 0.01031939, + "balance_loss_clip": 1.24681115, + "balance_loss_mlp": 1.01246023, + "epoch": 0.9898692319254472, + "flos": 22940850954240.0, + "grad_norm": 1.653460971191012, + "language_loss": 0.70744181, + "learning_rate": 1.0701614610675314e-09, + "loss": 0.73183358, + "num_input_tokens_seen": 355242000, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.19470215, + "step": 16464, + "time_per_iteration": 2.850431203842163 + }, + { + "auxiliary_loss_clip": 0.0140841, + "auxiliary_loss_mlp": 0.01034031, + "balance_loss_clip": 1.24684548, + "balance_loss_mlp": 1.01479042, + "epoch": 0.9899293551781151, + "flos": 12465913726080.0, + "grad_norm": 1.9898458384307425, + "language_loss": 0.73888767, + "learning_rate": 1.0574604798421204e-09, + "loss": 0.7633121, + "num_input_tokens_seen": 355260175, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19226074, + "step": 16465, + "time_per_iteration": 4.209473133087158 + }, + { + "auxiliary_loss_clip": 0.01384509, + "auxiliary_loss_mlp": 0.01032032, + "balance_loss_clip": 1.22909737, + "balance_loss_mlp": 1.01494896, + "epoch": 0.9899894784307831, + "flos": 26882671812480.0, + "grad_norm": 1.607359367118123, + "language_loss": 0.87545979, + "learning_rate": 1.0448352987182386e-09, + "loss": 0.89962518, + "num_input_tokens_seen": 355281930, + "router_z_loss_clip": 1.55371094, + "router_z_loss_mlp": 0.17102051, + "step": 16466, + "time_per_iteration": 4.298904180526733 + }, + { + "auxiliary_loss_clip": 0.01398971, + "auxiliary_loss_mlp": 0.01029934, + "balance_loss_clip": 1.23993695, + "balance_loss_mlp": 1.01089668, + "epoch": 0.990049601683451, + "flos": 21551704531200.0, + "grad_norm": 1.6326239016675728, + "language_loss": 0.7244764, + "learning_rate": 1.0322859181743915e-09, + "loss": 0.74876547, + "num_input_tokens_seen": 355301555, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.19042969, + "step": 16467, + "time_per_iteration": 2.8738081455230713 + }, + { + "auxiliary_loss_clip": 0.0138917, + "auxiliary_loss_mlp": 0.01034964, + "balance_loss_clip": 1.23118985, + "balance_loss_mlp": 1.01689136, + "epoch": 0.990109724936119, + "flos": 28784732999040.0, + "grad_norm": 1.3341094880140978, + "language_loss": 0.65413356, + "learning_rate": 1.019812338686643e-09, + "loss": 0.67837489, + "num_input_tokens_seen": 355324925, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.18054199, + "step": 16468, + "time_per_iteration": 2.9437413215637207 + }, + { + "auxiliary_loss_clip": 0.01403145, + "auxiliary_loss_mlp": 0.01031497, + "balance_loss_clip": 1.23946595, + "balance_loss_mlp": 1.01256657, + "epoch": 0.9901698481887871, + "flos": 29285160197760.0, + "grad_norm": 1.8267849045015225, + "language_loss": 0.62419486, + "learning_rate": 1.0074145607281704e-09, + "loss": 0.64854133, + "num_input_tokens_seen": 355343875, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.18908691, + "step": 16469, + "time_per_iteration": 2.929666519165039 + }, + { + "auxiliary_loss_clip": 0.01396111, + "auxiliary_loss_mlp": 0.0103235, + "balance_loss_clip": 1.23722196, + "balance_loss_mlp": 1.01364589, + "epoch": 0.990229971441455, + "flos": 15966994556160.0, + "grad_norm": 3.323835233958882, + "language_loss": 0.730088, + "learning_rate": 9.950925847685976e-10, + "loss": 0.7543726, + "num_input_tokens_seen": 355358835, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18701172, + "step": 16470, + "time_per_iteration": 2.829166889190674 + }, + { + "auxiliary_loss_clip": 0.01178066, + "auxiliary_loss_mlp": 0.01037027, + "balance_loss_clip": 1.08966398, + "balance_loss_mlp": 1.01690471, + "epoch": 0.990290094694123, + "flos": 69812928449280.0, + "grad_norm": 0.6650812193149377, + "language_loss": 0.55529261, + "learning_rate": 9.828464112755509e-10, + "loss": 0.57744348, + "num_input_tokens_seen": 355431225, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.20117188, + "step": 16471, + "time_per_iteration": 3.547701835632324 + }, + { + "auxiliary_loss_clip": 0.01388742, + "auxiliary_loss_mlp": 0.01034296, + "balance_loss_clip": 1.23070765, + "balance_loss_mlp": 1.01453066, + "epoch": 0.9903502179467909, + "flos": 16260896649600.0, + "grad_norm": 2.139720554473198, + "language_loss": 0.85772669, + "learning_rate": 9.706760407131032e-10, + "loss": 0.88195705, + "num_input_tokens_seen": 355448250, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.19750977, + "step": 16472, + "time_per_iteration": 2.8637189865112305 + }, + { + "auxiliary_loss_clip": 0.01408928, + "auxiliary_loss_mlp": 0.01029884, + "balance_loss_clip": 1.24744034, + "balance_loss_mlp": 1.0113349, + "epoch": 0.9904103411994589, + "flos": 21698135262720.0, + "grad_norm": 2.0223331569882794, + "language_loss": 0.86558306, + "learning_rate": 9.585814735431075e-10, + "loss": 0.88997114, + "num_input_tokens_seen": 355467040, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.1854248, + "step": 16473, + "time_per_iteration": 2.8790690898895264 + }, + { + "auxiliary_loss_clip": 0.01404886, + "auxiliary_loss_mlp": 0.01033068, + "balance_loss_clip": 1.24535537, + "balance_loss_mlp": 1.0151391, + "epoch": 0.9904704644521268, + "flos": 25750525708800.0, + "grad_norm": 1.6207762338520475, + "language_loss": 0.85330868, + "learning_rate": 9.465627102240859e-10, + "loss": 0.87768823, + "num_input_tokens_seen": 355487825, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.17956543, + "step": 16474, + "time_per_iteration": 2.8954622745513916 + }, + { + "auxiliary_loss_clip": 0.01386321, + "auxiliary_loss_mlp": 0.01036473, + "balance_loss_clip": 1.22899103, + "balance_loss_mlp": 1.0178169, + "epoch": 0.9905305877047949, + "flos": 21918414787200.0, + "grad_norm": 1.645497899949222, + "language_loss": 0.77228153, + "learning_rate": 9.346197512116738e-10, + "loss": 0.79650944, + "num_input_tokens_seen": 355507445, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.18640137, + "step": 16475, + "time_per_iteration": 2.8597652912139893 + }, + { + "auxiliary_loss_clip": 0.0139722, + "auxiliary_loss_mlp": 0.01031382, + "balance_loss_clip": 1.23778069, + "balance_loss_mlp": 1.0123204, + "epoch": 0.9905907109574628, + "flos": 21401020788480.0, + "grad_norm": 1.6839590429351796, + "language_loss": 0.75867951, + "learning_rate": 9.227525969588423e-10, + "loss": 0.78296554, + "num_input_tokens_seen": 355527205, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19042969, + "step": 16476, + "time_per_iteration": 2.898207664489746 + }, + { + "auxiliary_loss_clip": 0.01423458, + "auxiliary_loss_mlp": 0.01034328, + "balance_loss_clip": 1.25669169, + "balance_loss_mlp": 1.01414585, + "epoch": 0.9906508342101308, + "flos": 20531259135360.0, + "grad_norm": 2.8645962914513667, + "language_loss": 0.68827063, + "learning_rate": 9.109612479154538e-10, + "loss": 0.71284848, + "num_input_tokens_seen": 355544740, + "router_z_loss_clip": 1.66796875, + "router_z_loss_mlp": 0.20178223, + "step": 16477, + "time_per_iteration": 2.8452680110931396 + }, + { + "auxiliary_loss_clip": 0.01411565, + "auxiliary_loss_mlp": 0.01032548, + "balance_loss_clip": 1.24863696, + "balance_loss_mlp": 1.01286626, + "epoch": 0.9907109574627987, + "flos": 21371177203200.0, + "grad_norm": 1.903865867995358, + "language_loss": 0.73229289, + "learning_rate": 8.992457045289282e-10, + "loss": 0.75673401, + "num_input_tokens_seen": 355564385, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.19677734, + "step": 16478, + "time_per_iteration": 2.857356071472168 + }, + { + "auxiliary_loss_clip": 0.01409177, + "auxiliary_loss_mlp": 0.0103827, + "balance_loss_clip": 1.24802005, + "balance_loss_mlp": 1.01833844, + "epoch": 0.9907710807154667, + "flos": 17345146492800.0, + "grad_norm": 2.577267954066346, + "language_loss": 0.8196044, + "learning_rate": 8.876059672433545e-10, + "loss": 0.8440789, + "num_input_tokens_seen": 355579260, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.19921875, + "step": 16479, + "time_per_iteration": 2.820232391357422 + }, + { + "auxiliary_loss_clip": 0.01407593, + "auxiliary_loss_mlp": 0.01031408, + "balance_loss_clip": 1.2464112, + "balance_loss_mlp": 1.01306164, + "epoch": 0.9908312039681346, + "flos": 28633732542720.0, + "grad_norm": 1.5398043999805664, + "language_loss": 0.67177993, + "learning_rate": 8.760420364999355e-10, + "loss": 0.69616997, + "num_input_tokens_seen": 355599790, + "router_z_loss_clip": 1.61230469, + "router_z_loss_mlp": 0.18334961, + "step": 16480, + "time_per_iteration": 2.8810153007507324 + }, + { + "auxiliary_loss_clip": 0.01385061, + "auxiliary_loss_mlp": 0.01031606, + "balance_loss_clip": 1.22792697, + "balance_loss_mlp": 1.01250887, + "epoch": 0.9908913272208026, + "flos": 35783727523200.0, + "grad_norm": 1.7812081684523362, + "language_loss": 0.72949809, + "learning_rate": 8.645539127374313e-10, + "loss": 0.75366473, + "num_input_tokens_seen": 355620925, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.19091797, + "step": 16481, + "time_per_iteration": 3.0511796474456787 + }, + { + "auxiliary_loss_clip": 0.01394311, + "auxiliary_loss_mlp": 0.01029744, + "balance_loss_clip": 1.23707116, + "balance_loss_mlp": 1.01146901, + "epoch": 0.9909514504734707, + "flos": 19911892060800.0, + "grad_norm": 1.8811230247096633, + "language_loss": 0.78440297, + "learning_rate": 8.531415963912713e-10, + "loss": 0.80864352, + "num_input_tokens_seen": 355639165, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.1829834, + "step": 16482, + "time_per_iteration": 2.841183662414551 + }, + { + "auxiliary_loss_clip": 0.01399591, + "auxiliary_loss_mlp": 0.01030638, + "balance_loss_clip": 1.23884821, + "balance_loss_mlp": 1.01170802, + "epoch": 0.9910115737261386, + "flos": 20012734016640.0, + "grad_norm": 1.7590114459948827, + "language_loss": 0.76143408, + "learning_rate": 8.418050878944427e-10, + "loss": 0.78573644, + "num_input_tokens_seen": 355657320, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.18933105, + "step": 16483, + "time_per_iteration": 2.840317487716675 + }, + { + "auxiliary_loss_clip": 0.01176153, + "auxiliary_loss_mlp": 0.01023479, + "balance_loss_clip": 1.08978796, + "balance_loss_mlp": 1.00631285, + "epoch": 0.9910716969788066, + "flos": 70720270548480.0, + "grad_norm": 0.679474959086763, + "language_loss": 0.53681815, + "learning_rate": 8.305443876768237e-10, + "loss": 0.55881453, + "num_input_tokens_seen": 355726370, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.171875, + "step": 16484, + "time_per_iteration": 3.488264322280884 + }, + { + "auxiliary_loss_clip": 0.01385447, + "auxiliary_loss_mlp": 0.0103097, + "balance_loss_clip": 1.22988439, + "balance_loss_mlp": 1.01236165, + "epoch": 0.9911318202314745, + "flos": 21443985365760.0, + "grad_norm": 2.3040782270122313, + "language_loss": 0.82507575, + "learning_rate": 8.19359496165184e-10, + "loss": 0.84923989, + "num_input_tokens_seen": 355745840, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.18615723, + "step": 16485, + "time_per_iteration": 2.834007740020752 + }, + { + "auxiliary_loss_clip": 0.01388744, + "auxiliary_loss_mlp": 0.01035586, + "balance_loss_clip": 1.23362505, + "balance_loss_mlp": 1.0161432, + "epoch": 0.9911919434841425, + "flos": 19835871517440.0, + "grad_norm": 1.6110321341730882, + "language_loss": 0.82295537, + "learning_rate": 8.082504137836288e-10, + "loss": 0.84719861, + "num_input_tokens_seen": 355763385, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.19433594, + "step": 16486, + "time_per_iteration": 2.845529556274414 + }, + { + "auxiliary_loss_clip": 0.01411214, + "auxiliary_loss_mlp": 0.0103386, + "balance_loss_clip": 1.2489965, + "balance_loss_mlp": 1.01521552, + "epoch": 0.9912520667368104, + "flos": 41734378592640.0, + "grad_norm": 1.3190888344923042, + "language_loss": 0.66571963, + "learning_rate": 7.972171409538209e-10, + "loss": 0.69017041, + "num_input_tokens_seen": 355786075, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18652344, + "step": 16487, + "time_per_iteration": 3.059370279312134 + }, + { + "auxiliary_loss_clip": 0.01383994, + "auxiliary_loss_mlp": 0.01031245, + "balance_loss_clip": 1.22863436, + "balance_loss_mlp": 1.01289916, + "epoch": 0.9913121899894785, + "flos": 23780361818880.0, + "grad_norm": 1.6049698259702623, + "language_loss": 0.77607274, + "learning_rate": 7.862596780936481e-10, + "loss": 0.80022514, + "num_input_tokens_seen": 355806295, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.18359375, + "step": 16488, + "time_per_iteration": 2.963336706161499 + }, + { + "auxiliary_loss_clip": 0.0142136, + "auxiliary_loss_mlp": 0.0103215, + "balance_loss_clip": 1.25554657, + "balance_loss_mlp": 1.01310027, + "epoch": 0.9913723132421464, + "flos": 23780361818880.0, + "grad_norm": 2.13476345238607, + "language_loss": 0.69987619, + "learning_rate": 7.753780256190001e-10, + "loss": 0.72441131, + "num_input_tokens_seen": 355825730, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.19042969, + "step": 16489, + "time_per_iteration": 4.311330795288086 + }, + { + "auxiliary_loss_clip": 0.01176003, + "auxiliary_loss_mlp": 0.01017812, + "balance_loss_clip": 1.08860373, + "balance_loss_mlp": 0.99902481, + "epoch": 0.9914324364948144, + "flos": 71298166124160.0, + "grad_norm": 0.6069470711894912, + "language_loss": 0.5259797, + "learning_rate": 7.645721839424357e-10, + "loss": 0.5479179, + "num_input_tokens_seen": 355891545, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.1875, + "step": 16490, + "time_per_iteration": 3.4537580013275146 + }, + { + "auxiliary_loss_clip": 0.01419717, + "auxiliary_loss_mlp": 0.01034024, + "balance_loss_clip": 1.25509071, + "balance_loss_mlp": 1.0141871, + "epoch": 0.9914925597474823, + "flos": 23705789109120.0, + "grad_norm": 1.8880501379519958, + "language_loss": 0.76428998, + "learning_rate": 7.538421534734052e-10, + "loss": 0.78882742, + "num_input_tokens_seen": 355909920, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.19848633, + "step": 16491, + "time_per_iteration": 4.393556118011475 + }, + { + "auxiliary_loss_clip": 0.01417968, + "auxiliary_loss_mlp": 0.01032526, + "balance_loss_clip": 1.25507259, + "balance_loss_mlp": 1.01298785, + "epoch": 0.9915526830001503, + "flos": 13439141533440.0, + "grad_norm": 2.328640926333991, + "language_loss": 0.70915425, + "learning_rate": 7.431879346191383e-10, + "loss": 0.73365921, + "num_input_tokens_seen": 355923130, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.19543457, + "step": 16492, + "time_per_iteration": 2.7919111251831055 + }, + { + "auxiliary_loss_clip": 0.01391072, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.23310721, + "balance_loss_mlp": 1.01304746, + "epoch": 0.9916128062528182, + "flos": 20750769498240.0, + "grad_norm": 2.2430412454193216, + "language_loss": 0.69184566, + "learning_rate": 7.326095277837563e-10, + "loss": 0.71608174, + "num_input_tokens_seen": 355941960, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.19494629, + "step": 16493, + "time_per_iteration": 2.8960349559783936 + }, + { + "auxiliary_loss_clip": 0.01418285, + "auxiliary_loss_mlp": 0.01034919, + "balance_loss_clip": 1.25453639, + "balance_loss_mlp": 1.01520181, + "epoch": 0.9916729295054862, + "flos": 22495993649280.0, + "grad_norm": 3.1657004917122684, + "language_loss": 0.71964538, + "learning_rate": 7.221069333678276e-10, + "loss": 0.7441774, + "num_input_tokens_seen": 355961640, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.19714355, + "step": 16494, + "time_per_iteration": 2.8948168754577637 + }, + { + "auxiliary_loss_clip": 0.01401996, + "auxiliary_loss_mlp": 0.01031978, + "balance_loss_clip": 1.24029148, + "balance_loss_mlp": 1.01268959, + "epoch": 0.9917330527581543, + "flos": 14800389897600.0, + "grad_norm": 2.062521391693983, + "language_loss": 0.68706441, + "learning_rate": 7.116801517701443e-10, + "loss": 0.7114042, + "num_input_tokens_seen": 355977980, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.19287109, + "step": 16495, + "time_per_iteration": 2.8999719619750977 + }, + { + "auxiliary_loss_clip": 0.01177333, + "auxiliary_loss_mlp": 0.01038454, + "balance_loss_clip": 1.09052444, + "balance_loss_mlp": 1.01537502, + "epoch": 0.9917931760108222, + "flos": 59219458548480.0, + "grad_norm": 0.7214530020154722, + "language_loss": 0.53546178, + "learning_rate": 7.013291833859458e-10, + "loss": 0.55761969, + "num_input_tokens_seen": 356042900, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.23046875, + "step": 16496, + "time_per_iteration": 3.497404098510742 + }, + { + "auxiliary_loss_clip": 0.01394402, + "auxiliary_loss_mlp": 0.01031882, + "balance_loss_clip": 1.23400974, + "balance_loss_mlp": 1.0130949, + "epoch": 0.9918532992634902, + "flos": 26773142855040.0, + "grad_norm": 1.5957293644959956, + "language_loss": 0.72271848, + "learning_rate": 6.91054028607585e-10, + "loss": 0.74698132, + "num_input_tokens_seen": 356063000, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.18786621, + "step": 16497, + "time_per_iteration": 2.90838885307312 + }, + { + "auxiliary_loss_clip": 0.01419059, + "auxiliary_loss_mlp": 0.01034619, + "balance_loss_clip": 1.25439215, + "balance_loss_mlp": 1.01505661, + "epoch": 0.9919134225161581, + "flos": 14983903382400.0, + "grad_norm": 2.0803313713119294, + "language_loss": 0.82872969, + "learning_rate": 6.808546878249721e-10, + "loss": 0.85326648, + "num_input_tokens_seen": 356078130, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.19555664, + "step": 16498, + "time_per_iteration": 2.8506102561950684 + }, + { + "auxiliary_loss_clip": 0.0140043, + "auxiliary_loss_mlp": 0.01035918, + "balance_loss_clip": 1.24064124, + "balance_loss_mlp": 1.01637959, + "epoch": 0.9919735457688261, + "flos": 27829585128960.0, + "grad_norm": 1.749854798797655, + "language_loss": 0.68954647, + "learning_rate": 6.707311614246869e-10, + "loss": 0.71390992, + "num_input_tokens_seen": 356101655, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.1953125, + "step": 16499, + "time_per_iteration": 4.499104976654053 + }, + { + "auxiliary_loss_clip": 0.01420257, + "auxiliary_loss_mlp": 0.0103343, + "balance_loss_clip": 1.25667357, + "balance_loss_mlp": 1.01546526, + "epoch": 0.992033669021494, + "flos": 22572421395840.0, + "grad_norm": 3.000668238451244, + "language_loss": 0.82858789, + "learning_rate": 6.606834497904223e-10, + "loss": 0.85312474, + "num_input_tokens_seen": 356121425, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.17956543, + "step": 16500, + "time_per_iteration": 4.324865341186523 + }, + { + "auxiliary_loss_clip": 0.01394538, + "auxiliary_loss_mlp": 0.01034937, + "balance_loss_clip": 1.23536587, + "balance_loss_mlp": 1.01587522, + "epoch": 0.9920937922741621, + "flos": 25385580000000.0, + "grad_norm": 1.758729680230694, + "language_loss": 0.82542479, + "learning_rate": 6.507115533036511e-10, + "loss": 0.84971952, + "num_input_tokens_seen": 356140710, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.1907959, + "step": 16501, + "time_per_iteration": 2.8914644718170166 + }, + { + "auxiliary_loss_clip": 0.01397375, + "auxiliary_loss_mlp": 0.0103243, + "balance_loss_clip": 1.23737943, + "balance_loss_mlp": 1.01379764, + "epoch": 0.99215391552683, + "flos": 22064619294720.0, + "grad_norm": 2.2470686268854663, + "language_loss": 0.78135002, + "learning_rate": 6.408154723420711e-10, + "loss": 0.80564809, + "num_input_tokens_seen": 356159835, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18615723, + "step": 16502, + "time_per_iteration": 2.8514018058776855 + }, + { + "auxiliary_loss_clip": 0.01434712, + "auxiliary_loss_mlp": 0.01035882, + "balance_loss_clip": 1.27188778, + "balance_loss_mlp": 1.01636696, + "epoch": 0.992214038779498, + "flos": 15422335925760.0, + "grad_norm": 2.1818542820195654, + "language_loss": 0.72347337, + "learning_rate": 6.309952072811597e-10, + "loss": 0.74817932, + "num_input_tokens_seen": 356177555, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.19519043, + "step": 16503, + "time_per_iteration": 2.829491376876831 + }, + { + "auxiliary_loss_clip": 0.01179298, + "auxiliary_loss_mlp": 0.01030598, + "balance_loss_clip": 1.08943892, + "balance_loss_mlp": 1.00656593, + "epoch": 0.9922741620321659, + "flos": 62046597795840.0, + "grad_norm": 0.6338681167257824, + "language_loss": 0.55120569, + "learning_rate": 6.212507584932858e-10, + "loss": 0.57330465, + "num_input_tokens_seen": 356244975, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.24023438, + "step": 16504, + "time_per_iteration": 3.4515397548675537 + }, + { + "auxiliary_loss_clip": 0.013981, + "auxiliary_loss_mlp": 0.01028662, + "balance_loss_clip": 1.23816514, + "balance_loss_mlp": 1.01072073, + "epoch": 0.9923342852848339, + "flos": 17174392041600.0, + "grad_norm": 1.7033882690323705, + "language_loss": 0.70095128, + "learning_rate": 6.115821263481536e-10, + "loss": 0.72521889, + "num_input_tokens_seen": 356262605, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.17944336, + "step": 16505, + "time_per_iteration": 2.8334648609161377 + }, + { + "auxiliary_loss_clip": 0.01402717, + "auxiliary_loss_mlp": 0.01031412, + "balance_loss_clip": 1.23855162, + "balance_loss_mlp": 1.01088405, + "epoch": 0.9923944085375018, + "flos": 23193055324800.0, + "grad_norm": 1.9164489564145357, + "language_loss": 0.6610375, + "learning_rate": 6.019893112119146e-10, + "loss": 0.68537879, + "num_input_tokens_seen": 356278935, + "router_z_loss_clip": 1.640625, + "router_z_loss_mlp": 0.2052002, + "step": 16506, + "time_per_iteration": 2.8972232341766357 + }, + { + "auxiliary_loss_clip": 0.01387218, + "auxiliary_loss_mlp": 0.01032182, + "balance_loss_clip": 1.22835624, + "balance_loss_mlp": 1.01281071, + "epoch": 0.9924545317901698, + "flos": 20823668150400.0, + "grad_norm": 19.864694976176192, + "language_loss": 0.63956892, + "learning_rate": 5.924723134487219e-10, + "loss": 0.66376287, + "num_input_tokens_seen": 356295675, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.19372559, + "step": 16507, + "time_per_iteration": 2.8689470291137695 + }, + { + "auxiliary_loss_clip": 0.01400924, + "auxiliary_loss_mlp": 0.0103423, + "balance_loss_clip": 1.24098682, + "balance_loss_mlp": 1.01428628, + "epoch": 0.9925146550428379, + "flos": 20092871836800.0, + "grad_norm": 1.9666558464110555, + "language_loss": 0.7361086, + "learning_rate": 5.830311334193983e-10, + "loss": 0.76046014, + "num_input_tokens_seen": 356312885, + "router_z_loss_clip": 1.59863281, + "router_z_loss_mlp": 0.19946289, + "step": 16508, + "time_per_iteration": 2.842477321624756 + }, + { + "auxiliary_loss_clip": 0.01394687, + "auxiliary_loss_mlp": 0.0103188, + "balance_loss_clip": 1.23469424, + "balance_loss_mlp": 1.01268697, + "epoch": 0.9925747782955058, + "flos": 24984727902720.0, + "grad_norm": 1.4782602042961164, + "language_loss": 0.71069813, + "learning_rate": 5.736657714818793e-10, + "loss": 0.73496377, + "num_input_tokens_seen": 356334070, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.1920166, + "step": 16509, + "time_per_iteration": 2.8958420753479004 + }, + { + "auxiliary_loss_clip": 0.01399668, + "auxiliary_loss_mlp": 0.01033146, + "balance_loss_clip": 1.23820674, + "balance_loss_mlp": 1.01372671, + "epoch": 0.9926349015481738, + "flos": 60492271311360.0, + "grad_norm": 1.6978387685509295, + "language_loss": 0.6861819, + "learning_rate": 5.643762279912146e-10, + "loss": 0.71051002, + "num_input_tokens_seen": 356359410, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.1940918, + "step": 16510, + "time_per_iteration": 3.2113988399505615 + }, + { + "auxiliary_loss_clip": 0.01410033, + "auxiliary_loss_mlp": 0.01034555, + "balance_loss_clip": 1.24769092, + "balance_loss_mlp": 1.01496863, + "epoch": 0.9926950248008417, + "flos": 20751719639040.0, + "grad_norm": 2.146345013020012, + "language_loss": 0.82617652, + "learning_rate": 5.551625032997886e-10, + "loss": 0.85062242, + "num_input_tokens_seen": 356378345, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19592285, + "step": 16511, + "time_per_iteration": 2.863591432571411 + }, + { + "auxiliary_loss_clip": 0.01387466, + "auxiliary_loss_mlp": 0.01032666, + "balance_loss_clip": 1.23033953, + "balance_loss_mlp": 1.01405728, + "epoch": 0.9927551480535097, + "flos": 24363686770560.0, + "grad_norm": 2.8046324104572484, + "language_loss": 0.92675954, + "learning_rate": 5.460245977570998e-10, + "loss": 0.95096087, + "num_input_tokens_seen": 356397345, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.18615723, + "step": 16512, + "time_per_iteration": 2.90809965133667 + }, + { + "auxiliary_loss_clip": 0.01176633, + "auxiliary_loss_mlp": 0.01020354, + "balance_loss_clip": 1.0903877, + "balance_loss_mlp": 0.99899215, + "epoch": 0.9928152713061776, + "flos": 71308436693760.0, + "grad_norm": 0.6957966975924443, + "language_loss": 0.55233419, + "learning_rate": 5.369625117095378e-10, + "loss": 0.57430398, + "num_input_tokens_seen": 356459160, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.21386719, + "step": 16513, + "time_per_iteration": 3.543395757675171 + }, + { + "auxiliary_loss_clip": 0.0139687, + "auxiliary_loss_mlp": 0.01031275, + "balance_loss_clip": 1.23847234, + "balance_loss_mlp": 1.01230931, + "epoch": 0.9928753945588457, + "flos": 57828665329920.0, + "grad_norm": 1.571783858715082, + "language_loss": 0.65665495, + "learning_rate": 5.279762455006054e-10, + "loss": 0.68093646, + "num_input_tokens_seen": 356486405, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.18969727, + "step": 16514, + "time_per_iteration": 3.189713716506958 + }, + { + "auxiliary_loss_clip": 0.01403666, + "auxiliary_loss_mlp": 0.0103398, + "balance_loss_clip": 1.24184728, + "balance_loss_mlp": 1.01355958, + "epoch": 0.9929355178115136, + "flos": 19577332874880.0, + "grad_norm": 1.7857685573101947, + "language_loss": 0.73790419, + "learning_rate": 5.190657994713632e-10, + "loss": 0.7622807, + "num_input_tokens_seen": 356502905, + "router_z_loss_clip": 1.62011719, + "router_z_loss_mlp": 0.20410156, + "step": 16515, + "time_per_iteration": 2.8217029571533203 + }, + { + "auxiliary_loss_clip": 0.0140158, + "auxiliary_loss_mlp": 0.01035613, + "balance_loss_clip": 1.24170542, + "balance_loss_mlp": 1.01571703, + "epoch": 0.9929956410641816, + "flos": 22974540347520.0, + "grad_norm": 1.9306838252089873, + "language_loss": 0.78132933, + "learning_rate": 5.102311739593191e-10, + "loss": 0.80570126, + "num_input_tokens_seen": 356523830, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.19909668, + "step": 16516, + "time_per_iteration": 2.961782217025757 + }, + { + "auxiliary_loss_clip": 0.01392465, + "auxiliary_loss_mlp": 0.01032861, + "balance_loss_clip": 1.23428917, + "balance_loss_mlp": 1.01432419, + "epoch": 0.9930557643168495, + "flos": 22576991120640.0, + "grad_norm": 1.8202685866577524, + "language_loss": 0.78752828, + "learning_rate": 5.014723692997602e-10, + "loss": 0.81178159, + "num_input_tokens_seen": 356543965, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.1854248, + "step": 16517, + "time_per_iteration": 2.8611419200897217 + }, + { + "auxiliary_loss_clip": 0.01417403, + "auxiliary_loss_mlp": 0.01037054, + "balance_loss_clip": 1.25249112, + "balance_loss_mlp": 1.01664567, + "epoch": 0.9931158875695175, + "flos": 17209710247680.0, + "grad_norm": 2.4135607643736523, + "language_loss": 0.68739825, + "learning_rate": 4.927893858248655e-10, + "loss": 0.71194279, + "num_input_tokens_seen": 356561530, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.20397949, + "step": 16518, + "time_per_iteration": 2.81862473487854 + }, + { + "auxiliary_loss_clip": 0.0117825, + "auxiliary_loss_mlp": 0.01037639, + "balance_loss_clip": 1.09114695, + "balance_loss_mlp": 1.02028263, + "epoch": 0.9931760108221854, + "flos": 63739039248000.0, + "grad_norm": 0.747765155854921, + "language_loss": 0.53420138, + "learning_rate": 4.84182223863483e-10, + "loss": 0.55636024, + "num_input_tokens_seen": 356616845, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.17382812, + "step": 16519, + "time_per_iteration": 3.273587465286255 + }, + { + "auxiliary_loss_clip": 0.01393179, + "auxiliary_loss_mlp": 0.010313, + "balance_loss_clip": 1.23513544, + "balance_loss_mlp": 1.01295424, + "epoch": 0.9932361340748534, + "flos": 15313259416320.0, + "grad_norm": 1.7024020232275632, + "language_loss": 0.60703689, + "learning_rate": 4.756508837426842e-10, + "loss": 0.63128173, + "num_input_tokens_seen": 356633560, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.18347168, + "step": 16520, + "time_per_iteration": 2.8711400032043457 + }, + { + "auxiliary_loss_clip": 0.0140051, + "auxiliary_loss_mlp": 0.01035541, + "balance_loss_clip": 1.24095845, + "balance_loss_mlp": 1.01616895, + "epoch": 0.9932962573275215, + "flos": 36078670247040.0, + "grad_norm": 1.644182003131036, + "language_loss": 0.62296605, + "learning_rate": 4.671953657853223e-10, + "loss": 0.64732659, + "num_input_tokens_seen": 356657600, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.19360352, + "step": 16521, + "time_per_iteration": 3.0150258541107178 + }, + { + "auxiliary_loss_clip": 0.0140811, + "auxiliary_loss_mlp": 0.01030996, + "balance_loss_clip": 1.24604917, + "balance_loss_mlp": 1.01084995, + "epoch": 0.9933563805801894, + "flos": 21480479936640.0, + "grad_norm": 5.014743160059916, + "language_loss": 0.75039792, + "learning_rate": 4.5881567031225145e-10, + "loss": 0.77478892, + "num_input_tokens_seen": 356675880, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.20141602, + "step": 16522, + "time_per_iteration": 2.8488998413085938 + }, + { + "auxiliary_loss_clip": 0.01392194, + "auxiliary_loss_mlp": 0.0103466, + "balance_loss_clip": 1.23597908, + "balance_loss_mlp": 1.01559806, + "epoch": 0.9934165038328574, + "flos": 23996750290560.0, + "grad_norm": 1.6409053270800151, + "language_loss": 0.74109495, + "learning_rate": 4.5051179764143964e-10, + "loss": 0.76536345, + "num_input_tokens_seen": 356696000, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.19067383, + "step": 16523, + "time_per_iteration": 2.8624582290649414 + }, + { + "auxiliary_loss_clip": 0.01395485, + "auxiliary_loss_mlp": 0.01030632, + "balance_loss_clip": 1.23693252, + "balance_loss_mlp": 1.01198769, + "epoch": 0.9934766270855253, + "flos": 21917736115200.0, + "grad_norm": 3.3505942369369377, + "language_loss": 0.71684539, + "learning_rate": 4.422837480875241e-10, + "loss": 0.74110651, + "num_input_tokens_seen": 356716845, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.18640137, + "step": 16524, + "time_per_iteration": 4.320645093917847 + }, + { + "auxiliary_loss_clip": 0.01406824, + "auxiliary_loss_mlp": 0.01032825, + "balance_loss_clip": 1.24593532, + "balance_loss_mlp": 1.01396573, + "epoch": 0.9935367503381933, + "flos": 17138078449920.0, + "grad_norm": 2.10892538392591, + "language_loss": 0.80006212, + "learning_rate": 4.341315219624775e-10, + "loss": 0.82445866, + "num_input_tokens_seen": 356732100, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18847656, + "step": 16525, + "time_per_iteration": 2.8166589736938477 + }, + { + "auxiliary_loss_clip": 0.01396361, + "auxiliary_loss_mlp": 0.01029637, + "balance_loss_clip": 1.23867989, + "balance_loss_mlp": 1.01082623, + "epoch": 0.9935968735908612, + "flos": 22356440127360.0, + "grad_norm": 1.661834005953681, + "language_loss": 0.75919795, + "learning_rate": 4.2605511957582995e-10, + "loss": 0.78345799, + "num_input_tokens_seen": 356751480, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18823242, + "step": 16526, + "time_per_iteration": 4.30532693862915 + }, + { + "auxiliary_loss_clip": 0.0138303, + "auxiliary_loss_mlp": 0.01033518, + "balance_loss_clip": 1.22880554, + "balance_loss_mlp": 1.01599383, + "epoch": 0.9936569968435293, + "flos": 29472474245760.0, + "grad_norm": 1.4781374473587419, + "language_loss": 0.73051143, + "learning_rate": 4.180545412333369e-10, + "loss": 0.75467688, + "num_input_tokens_seen": 356772650, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.1751709, + "step": 16527, + "time_per_iteration": 2.917006015777588 + }, + { + "auxiliary_loss_clip": 0.01396405, + "auxiliary_loss_mlp": 0.01032554, + "balance_loss_clip": 1.23486137, + "balance_loss_mlp": 1.01362395, + "epoch": 0.9937171200961972, + "flos": 16552491258240.0, + "grad_norm": 2.017788879356323, + "language_loss": 0.76140904, + "learning_rate": 4.1012978723875547e-10, + "loss": 0.78569865, + "num_input_tokens_seen": 356788510, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.18933105, + "step": 16528, + "time_per_iteration": 2.8136661052703857 + }, + { + "auxiliary_loss_clip": 0.01407881, + "auxiliary_loss_mlp": 0.01032275, + "balance_loss_clip": 1.24581707, + "balance_loss_mlp": 1.01295114, + "epoch": 0.9937772433488652, + "flos": 24401221971840.0, + "grad_norm": 2.3106751846310747, + "language_loss": 0.68531692, + "learning_rate": 4.022808578922898e-10, + "loss": 0.70971847, + "num_input_tokens_seen": 356809115, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.1932373, + "step": 16529, + "time_per_iteration": 2.971472978591919 + }, + { + "auxiliary_loss_clip": 0.01407296, + "auxiliary_loss_mlp": 0.01034018, + "balance_loss_clip": 1.24378955, + "balance_loss_mlp": 1.01469457, + "epoch": 0.9938373666015331, + "flos": 15678521838720.0, + "grad_norm": 2.443467747913877, + "language_loss": 0.65763223, + "learning_rate": 3.9450775349170186e-10, + "loss": 0.68204534, + "num_input_tokens_seen": 356826410, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.19335938, + "step": 16530, + "time_per_iteration": 2.8498618602752686 + }, + { + "auxiliary_loss_clip": 0.01413041, + "auxiliary_loss_mlp": 0.01030929, + "balance_loss_clip": 1.25200319, + "balance_loss_mlp": 1.01282108, + "epoch": 0.9938974898542011, + "flos": 19504569957120.0, + "grad_norm": 11.542830933465353, + "language_loss": 0.71749127, + "learning_rate": 3.8681047433186676e-10, + "loss": 0.74193102, + "num_input_tokens_seen": 356844990, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.18115234, + "step": 16531, + "time_per_iteration": 2.7993905544281006 + }, + { + "auxiliary_loss_clip": 0.01394957, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.23459375, + "balance_loss_mlp": 1.01420522, + "epoch": 0.993957613106869, + "flos": 26918261487360.0, + "grad_norm": 1.5497651055396993, + "language_loss": 0.74781752, + "learning_rate": 3.791890207045512e-10, + "loss": 0.77209246, + "num_input_tokens_seen": 356866530, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18334961, + "step": 16532, + "time_per_iteration": 2.9319751262664795 + }, + { + "auxiliary_loss_clip": 0.01378712, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.22603512, + "balance_loss_mlp": 1.01372528, + "epoch": 0.994017736359537, + "flos": 14947861259520.0, + "grad_norm": 1.731568180411468, + "language_loss": 0.71301651, + "learning_rate": 3.7164339289885717e-10, + "loss": 0.73712599, + "num_input_tokens_seen": 356884660, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.18518066, + "step": 16533, + "time_per_iteration": 2.8201966285705566 + }, + { + "auxiliary_loss_clip": 0.01407913, + "auxiliary_loss_mlp": 0.0103509, + "balance_loss_clip": 1.24653554, + "balance_loss_mlp": 1.01537323, + "epoch": 0.9940778596122051, + "flos": 15386384292480.0, + "grad_norm": 3.583756756501591, + "language_loss": 0.85090053, + "learning_rate": 3.641735912007782e-10, + "loss": 0.87533057, + "num_input_tokens_seen": 356900895, + "router_z_loss_clip": 1.61523438, + "router_z_loss_mlp": 0.19714355, + "step": 16534, + "time_per_iteration": 4.2584614753723145 + }, + { + "auxiliary_loss_clip": 0.01376684, + "auxiliary_loss_mlp": 0.01032096, + "balance_loss_clip": 1.22428846, + "balance_loss_mlp": 1.01318944, + "epoch": 0.994137982864873, + "flos": 25238923044480.0, + "grad_norm": 2.3466282904023372, + "language_loss": 0.66988671, + "learning_rate": 3.567796158934211e-10, + "loss": 0.69397449, + "num_input_tokens_seen": 356920985, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.18896484, + "step": 16535, + "time_per_iteration": 4.344003200531006 + }, + { + "auxiliary_loss_clip": 0.0140103, + "auxiliary_loss_mlp": 0.0103224, + "balance_loss_clip": 1.24277771, + "balance_loss_mlp": 1.01352406, + "epoch": 0.994198106117541, + "flos": 18451294819200.0, + "grad_norm": 1.524482674255288, + "language_loss": 0.65694427, + "learning_rate": 3.4946146725767235e-10, + "loss": 0.68127698, + "num_input_tokens_seen": 356939800, + "router_z_loss_clip": 1.58300781, + "router_z_loss_mlp": 0.18725586, + "step": 16536, + "time_per_iteration": 2.8118183612823486 + }, + { + "auxiliary_loss_clip": 0.01396418, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.2385689, + "balance_loss_mlp": 1.01301503, + "epoch": 0.9942582293702089, + "flos": 16662744132480.0, + "grad_norm": 1.9814036551872931, + "language_loss": 0.79632115, + "learning_rate": 3.4221914557064357e-10, + "loss": 0.82060653, + "num_input_tokens_seen": 356957780, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.19104004, + "step": 16537, + "time_per_iteration": 2.8669991493225098 + }, + { + "auxiliary_loss_clip": 0.01420741, + "auxiliary_loss_mlp": 0.01032326, + "balance_loss_clip": 1.25438643, + "balance_loss_mlp": 1.01275134, + "epoch": 0.9943183526228769, + "flos": 21954592644480.0, + "grad_norm": 1.8344150213236963, + "language_loss": 0.69376463, + "learning_rate": 3.35052651107004e-10, + "loss": 0.71829534, + "num_input_tokens_seen": 356979185, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.19580078, + "step": 16538, + "time_per_iteration": 2.852799654006958 + }, + { + "auxiliary_loss_clip": 0.01377108, + "auxiliary_loss_mlp": 0.01031376, + "balance_loss_clip": 1.22151411, + "balance_loss_mlp": 1.01220751, + "epoch": 0.9943784758755448, + "flos": 23853260471040.0, + "grad_norm": 2.2013402973233265, + "language_loss": 0.76024354, + "learning_rate": 3.2796198413853614e-10, + "loss": 0.7843284, + "num_input_tokens_seen": 356997735, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.19165039, + "step": 16539, + "time_per_iteration": 2.8497817516326904 + }, + { + "auxiliary_loss_clip": 0.01407007, + "auxiliary_loss_mlp": 0.01032187, + "balance_loss_clip": 1.24459803, + "balance_loss_mlp": 1.01328039, + "epoch": 0.9944385991282129, + "flos": 21479801264640.0, + "grad_norm": 2.3366954031232274, + "language_loss": 0.71337545, + "learning_rate": 3.209471449341361e-10, + "loss": 0.7377674, + "num_input_tokens_seen": 357015660, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.18896484, + "step": 16540, + "time_per_iteration": 2.8357419967651367 + }, + { + "auxiliary_loss_clip": 0.01381318, + "auxiliary_loss_mlp": 0.01026917, + "balance_loss_clip": 1.22563505, + "balance_loss_mlp": 1.00953639, + "epoch": 0.9944987223808808, + "flos": 22936597943040.0, + "grad_norm": 2.1395800030861327, + "language_loss": 0.76214004, + "learning_rate": 3.140081337600353e-10, + "loss": 0.78622246, + "num_input_tokens_seen": 357034800, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.17370605, + "step": 16541, + "time_per_iteration": 2.8595893383026123 + }, + { + "auxiliary_loss_clip": 0.01393737, + "auxiliary_loss_mlp": 0.01035314, + "balance_loss_clip": 1.2342937, + "balance_loss_mlp": 1.0157516, + "epoch": 0.9945588456335488, + "flos": 22393296656640.0, + "grad_norm": 1.6260254176434317, + "language_loss": 0.77466238, + "learning_rate": 3.0714495087891255e-10, + "loss": 0.79895294, + "num_input_tokens_seen": 357053785, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.19567871, + "step": 16542, + "time_per_iteration": 2.8494691848754883 + }, + { + "auxiliary_loss_clip": 0.01403848, + "auxiliary_loss_mlp": 0.01030338, + "balance_loss_clip": 1.24286032, + "balance_loss_mlp": 1.01076388, + "epoch": 0.9946189688862167, + "flos": 21407445550080.0, + "grad_norm": 2.2253198757971435, + "language_loss": 0.75790095, + "learning_rate": 3.0035759655122615e-10, + "loss": 0.78224277, + "num_input_tokens_seen": 357072025, + "router_z_loss_clip": 1.609375, + "router_z_loss_mlp": 0.19580078, + "step": 16543, + "time_per_iteration": 2.8462002277374268 + }, + { + "auxiliary_loss_clip": 0.01426161, + "auxiliary_loss_mlp": 0.0103167, + "balance_loss_clip": 1.26197004, + "balance_loss_mlp": 1.01293063, + "epoch": 0.9946790921388847, + "flos": 12422722924800.0, + "grad_norm": 2.1362217633574323, + "language_loss": 0.82573128, + "learning_rate": 2.9364607103454785e-10, + "loss": 0.85030955, + "num_input_tokens_seen": 357086960, + "router_z_loss_clip": 1.64160156, + "router_z_loss_mlp": 0.18737793, + "step": 16544, + "time_per_iteration": 2.806331157684326 + }, + { + "auxiliary_loss_clip": 0.01404687, + "auxiliary_loss_mlp": 0.01031262, + "balance_loss_clip": 1.24503183, + "balance_loss_mlp": 1.01259434, + "epoch": 0.9947392153915526, + "flos": 19066544616960.0, + "grad_norm": 13.504331585187186, + "language_loss": 0.79688418, + "learning_rate": 2.870103745831187e-10, + "loss": 0.82124364, + "num_input_tokens_seen": 357105095, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.18676758, + "step": 16545, + "time_per_iteration": 2.8460733890533447 + }, + { + "auxiliary_loss_clip": 0.01399431, + "auxiliary_loss_mlp": 0.01033941, + "balance_loss_clip": 1.23758841, + "balance_loss_mlp": 1.01490295, + "epoch": 0.9947993386442207, + "flos": 27320425683840.0, + "grad_norm": 1.6076735466552663, + "language_loss": 0.72715509, + "learning_rate": 2.8045050744873733e-10, + "loss": 0.7514888, + "num_input_tokens_seen": 357125065, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.19042969, + "step": 16546, + "time_per_iteration": 2.9006974697113037 + }, + { + "auxiliary_loss_clip": 0.01387393, + "auxiliary_loss_mlp": 0.01030658, + "balance_loss_clip": 1.23037589, + "balance_loss_mlp": 1.01309824, + "epoch": 0.9948594618968887, + "flos": 20814392966400.0, + "grad_norm": 4.968101643896007, + "language_loss": 0.77799022, + "learning_rate": 2.739664698798716e-10, + "loss": 0.80217075, + "num_input_tokens_seen": 357141600, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.17578125, + "step": 16547, + "time_per_iteration": 2.877582550048828 + }, + { + "auxiliary_loss_clip": 0.01405144, + "auxiliary_loss_mlp": 0.01032465, + "balance_loss_clip": 1.2442528, + "balance_loss_mlp": 1.01500118, + "epoch": 0.9949195851495566, + "flos": 23302900995840.0, + "grad_norm": 2.2761940095884055, + "language_loss": 0.70516878, + "learning_rate": 2.67558262122769e-10, + "loss": 0.72954488, + "num_input_tokens_seen": 357157880, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.17456055, + "step": 16548, + "time_per_iteration": 2.8586854934692383 + }, + { + "auxiliary_loss_clip": 0.01399428, + "auxiliary_loss_mlp": 0.01028166, + "balance_loss_clip": 1.24195158, + "balance_loss_mlp": 1.00993884, + "epoch": 0.9949797084022246, + "flos": 18524736408960.0, + "grad_norm": 2.4604662499845253, + "language_loss": 0.76166636, + "learning_rate": 2.6122588442012427e-10, + "loss": 0.78594226, + "num_input_tokens_seen": 357176705, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.18225098, + "step": 16549, + "time_per_iteration": 2.8484036922454834 + }, + { + "auxiliary_loss_clip": 0.01406805, + "auxiliary_loss_mlp": 0.01033591, + "balance_loss_clip": 1.24459398, + "balance_loss_mlp": 1.01348031, + "epoch": 0.9950398316548925, + "flos": 30419523296640.0, + "grad_norm": 1.6203421638132818, + "language_loss": 0.75008857, + "learning_rate": 2.5496933701241177e-10, + "loss": 0.7744925, + "num_input_tokens_seen": 357197630, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.20092773, + "step": 16550, + "time_per_iteration": 3.0100960731506348 + }, + { + "auxiliary_loss_clip": 0.01396902, + "auxiliary_loss_mlp": 0.01034217, + "balance_loss_clip": 1.23761773, + "balance_loss_mlp": 1.01647866, + "epoch": 0.9950999549075605, + "flos": 19910218003200.0, + "grad_norm": 31.58311427129161, + "language_loss": 0.78467917, + "learning_rate": 2.4878862013655297e-10, + "loss": 0.80899036, + "num_input_tokens_seen": 357215445, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.17724609, + "step": 16551, + "time_per_iteration": 2.824270248413086 + }, + { + "auxiliary_loss_clip": 0.01368453, + "auxiliary_loss_mlp": 0.01031678, + "balance_loss_clip": 1.21710515, + "balance_loss_mlp": 1.01446462, + "epoch": 0.9951600781602284, + "flos": 17612643605760.0, + "grad_norm": 1.3660177939649691, + "language_loss": 0.67153275, + "learning_rate": 2.426837340270271e-10, + "loss": 0.69553405, + "num_input_tokens_seen": 357234285, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.17224121, + "step": 16552, + "time_per_iteration": 2.871899366378784 + }, + { + "auxiliary_loss_clip": 0.01390036, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.23126912, + "balance_loss_mlp": 1.01227832, + "epoch": 0.9952202014128965, + "flos": 28962771863040.0, + "grad_norm": 1.3483168737158358, + "language_loss": 0.81571096, + "learning_rate": 2.3665467891520465e-10, + "loss": 0.83992481, + "num_input_tokens_seen": 357257565, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 0.19055176, + "step": 16553, + "time_per_iteration": 2.9152684211730957 + }, + { + "auxiliary_loss_clip": 0.01178448, + "auxiliary_loss_mlp": 0.01020555, + "balance_loss_clip": 1.09008861, + "balance_loss_mlp": 0.99871588, + "epoch": 0.9952803246655644, + "flos": 70845499451520.0, + "grad_norm": 0.7164974794391568, + "language_loss": 0.57347679, + "learning_rate": 2.3070145503001348e-10, + "loss": 0.59546685, + "num_input_tokens_seen": 357320205, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.21875, + "step": 16554, + "time_per_iteration": 3.439408779144287 + }, + { + "auxiliary_loss_clip": 0.01400658, + "auxiliary_loss_mlp": 0.01032992, + "balance_loss_clip": 1.23995447, + "balance_loss_mlp": 1.01540852, + "epoch": 0.9953404479182324, + "flos": 21809157298560.0, + "grad_norm": 1.629216354804764, + "language_loss": 0.77565455, + "learning_rate": 2.24824062597051e-10, + "loss": 0.79999113, + "num_input_tokens_seen": 357340695, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.17578125, + "step": 16555, + "time_per_iteration": 2.872360944747925 + }, + { + "auxiliary_loss_clip": 0.01396867, + "auxiliary_loss_mlp": 0.0103111, + "balance_loss_clip": 1.23773015, + "balance_loss_mlp": 1.01180971, + "epoch": 0.9954005711709003, + "flos": 21945543684480.0, + "grad_norm": 1.8263083780306948, + "language_loss": 0.86491007, + "learning_rate": 2.1902250183902793e-10, + "loss": 0.88918984, + "num_input_tokens_seen": 357357505, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.19299316, + "step": 16556, + "time_per_iteration": 2.8362884521484375 + }, + { + "auxiliary_loss_clip": 0.01386884, + "auxiliary_loss_mlp": 0.01031494, + "balance_loss_clip": 1.2299782, + "balance_loss_mlp": 1.0130049, + "epoch": 0.9954606944235683, + "flos": 19364021049600.0, + "grad_norm": 1.6052679986389666, + "language_loss": 0.73544562, + "learning_rate": 2.132967729762125e-10, + "loss": 0.75962937, + "num_input_tokens_seen": 357375395, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.18493652, + "step": 16557, + "time_per_iteration": 2.829871654510498 + }, + { + "auxiliary_loss_clip": 0.01390554, + "auxiliary_loss_mlp": 0.01032921, + "balance_loss_clip": 1.23435366, + "balance_loss_mlp": 1.01472962, + "epoch": 0.9955208176762362, + "flos": 30530816801280.0, + "grad_norm": 1.8559815414510208, + "language_loss": 0.76889741, + "learning_rate": 2.0764687622554233e-10, + "loss": 0.79313219, + "num_input_tokens_seen": 357397375, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.18188477, + "step": 16558, + "time_per_iteration": 2.9124436378479004 + }, + { + "auxiliary_loss_clip": 0.01394311, + "auxiliary_loss_mlp": 0.01034582, + "balance_loss_clip": 1.23384953, + "balance_loss_mlp": 1.01540112, + "epoch": 0.9955809409289043, + "flos": 30020435746560.0, + "grad_norm": 1.9371249323555864, + "language_loss": 0.64146626, + "learning_rate": 2.0207281180129044e-10, + "loss": 0.66575515, + "num_input_tokens_seen": 357418880, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19189453, + "step": 16559, + "time_per_iteration": 4.340106010437012 + }, + { + "auxiliary_loss_clip": 0.01392463, + "auxiliary_loss_mlp": 0.0103218, + "balance_loss_clip": 1.23363566, + "balance_loss_mlp": 1.01386976, + "epoch": 0.9956410641815723, + "flos": 21553197609600.0, + "grad_norm": 2.2257896302363105, + "language_loss": 0.75287116, + "learning_rate": 1.965745799148433e-10, + "loss": 0.77711761, + "num_input_tokens_seen": 357438310, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.1829834, + "step": 16560, + "time_per_iteration": 2.964780330657959 + }, + { + "auxiliary_loss_clip": 0.01395852, + "auxiliary_loss_mlp": 0.01030271, + "balance_loss_clip": 1.237746, + "balance_loss_mlp": 1.01230633, + "epoch": 0.9957011874342402, + "flos": 21699492606720.0, + "grad_norm": 1.8621997286815193, + "language_loss": 0.80222762, + "learning_rate": 1.9115218077470073e-10, + "loss": 0.82648885, + "num_input_tokens_seen": 357457155, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.17956543, + "step": 16561, + "time_per_iteration": 4.256886005401611 + }, + { + "auxiliary_loss_clip": 0.01392285, + "auxiliary_loss_mlp": 0.01030499, + "balance_loss_clip": 1.23596239, + "balance_loss_mlp": 1.01158071, + "epoch": 0.9957613106869082, + "flos": 17709096816000.0, + "grad_norm": 2.1983330892316446, + "language_loss": 0.66828388, + "learning_rate": 1.8580561458647614e-10, + "loss": 0.69251168, + "num_input_tokens_seen": 357468060, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.18933105, + "step": 16562, + "time_per_iteration": 2.7956676483154297 + }, + { + "auxiliary_loss_clip": 0.01413583, + "auxiliary_loss_mlp": 0.01037013, + "balance_loss_clip": 1.24991703, + "balance_loss_mlp": 1.01641369, + "epoch": 0.9958214339395761, + "flos": 30568442492160.0, + "grad_norm": 1.6603827982878892, + "language_loss": 0.64787471, + "learning_rate": 1.805348815528962e-10, + "loss": 0.67238069, + "num_input_tokens_seen": 357489665, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.20593262, + "step": 16563, + "time_per_iteration": 2.9021146297454834 + }, + { + "auxiliary_loss_clip": 0.01388072, + "auxiliary_loss_mlp": 0.0103451, + "balance_loss_clip": 1.23055601, + "balance_loss_mlp": 1.01560318, + "epoch": 0.9958815571922441, + "flos": 24179494613760.0, + "grad_norm": 8.367077508876621, + "language_loss": 0.65243554, + "learning_rate": 1.7533998187380105e-10, + "loss": 0.67666137, + "num_input_tokens_seen": 357511975, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18908691, + "step": 16564, + "time_per_iteration": 2.8833401203155518 + }, + { + "auxiliary_loss_clip": 0.01394622, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.23707473, + "balance_loss_mlp": 1.0115118, + "epoch": 0.995941680444912, + "flos": 15495460801920.0, + "grad_norm": 1.9619007843621288, + "language_loss": 0.74881655, + "learning_rate": 1.7022091574636633e-10, + "loss": 0.77306956, + "num_input_tokens_seen": 357529345, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.19165039, + "step": 16565, + "time_per_iteration": 2.8087565898895264 + }, + { + "auxiliary_loss_clip": 0.01390181, + "auxiliary_loss_mlp": 0.01028636, + "balance_loss_clip": 1.23087716, + "balance_loss_mlp": 1.01032579, + "epoch": 0.9960018036975801, + "flos": 18629559907200.0, + "grad_norm": 1.8826164932645824, + "language_loss": 0.79443848, + "learning_rate": 1.6517768336443694e-10, + "loss": 0.81862664, + "num_input_tokens_seen": 357547615, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.18310547, + "step": 16566, + "time_per_iteration": 2.8383727073669434 + }, + { + "auxiliary_loss_clip": 0.01390328, + "auxiliary_loss_mlp": 0.01031932, + "balance_loss_clip": 1.23396802, + "balance_loss_mlp": 1.01344299, + "epoch": 0.996061926950248, + "flos": 20093731488000.0, + "grad_norm": 1.790822228282713, + "language_loss": 0.71623898, + "learning_rate": 1.6021028491941535e-10, + "loss": 0.74046159, + "num_input_tokens_seen": 357567380, + "router_z_loss_clip": 1.56347656, + "router_z_loss_mlp": 0.18481445, + "step": 16567, + "time_per_iteration": 2.837270975112915 + }, + { + "auxiliary_loss_clip": 0.01407763, + "auxiliary_loss_mlp": 0.01034653, + "balance_loss_clip": 1.24551928, + "balance_loss_mlp": 1.01466179, + "epoch": 0.996122050202916, + "flos": 24357216764160.0, + "grad_norm": 2.2269843901567383, + "language_loss": 0.79630184, + "learning_rate": 1.5531872059959538e-10, + "loss": 0.82072598, + "num_input_tokens_seen": 357586435, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19995117, + "step": 16568, + "time_per_iteration": 2.875387668609619 + }, + { + "auxiliary_loss_clip": 0.01376494, + "auxiliary_loss_mlp": 0.01026522, + "balance_loss_clip": 1.22247648, + "balance_loss_mlp": 1.00873554, + "epoch": 0.9961821734555839, + "flos": 24207980855040.0, + "grad_norm": 1.677873999686422, + "language_loss": 0.82046902, + "learning_rate": 1.5050299059060634e-10, + "loss": 0.84449911, + "num_input_tokens_seen": 357604720, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.17785645, + "step": 16569, + "time_per_iteration": 4.355791807174683 + }, + { + "auxiliary_loss_clip": 0.01381217, + "auxiliary_loss_mlp": 0.01032244, + "balance_loss_clip": 1.22702801, + "balance_loss_mlp": 1.01405251, + "epoch": 0.9962422967082519, + "flos": 22642876828800.0, + "grad_norm": 2.8244713516959354, + "language_loss": 0.71636248, + "learning_rate": 1.457630950747468e-10, + "loss": 0.74049711, + "num_input_tokens_seen": 357622345, + "router_z_loss_clip": 1.54003906, + "router_z_loss_mlp": 0.18188477, + "step": 16570, + "time_per_iteration": 4.341177225112915 + }, + { + "auxiliary_loss_clip": 0.01389352, + "auxiliary_loss_mlp": 0.01029744, + "balance_loss_clip": 1.23162413, + "balance_loss_mlp": 1.01110005, + "epoch": 0.9963024199609198, + "flos": 26407247005440.0, + "grad_norm": 1.5296586335916629, + "language_loss": 0.75319993, + "learning_rate": 1.4109903423209502e-10, + "loss": 0.77739084, + "num_input_tokens_seen": 357642710, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18640137, + "step": 16571, + "time_per_iteration": 3.0083255767822266 + }, + { + "auxiliary_loss_clip": 0.01395412, + "auxiliary_loss_mlp": 0.01035145, + "balance_loss_clip": 1.23737574, + "balance_loss_mlp": 1.01509392, + "epoch": 0.9963625432135879, + "flos": 16590252683520.0, + "grad_norm": 1.968891648190455, + "language_loss": 0.81085801, + "learning_rate": 1.3651080823939843e-10, + "loss": 0.83516359, + "num_input_tokens_seen": 357659870, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.20043945, + "step": 16572, + "time_per_iteration": 2.8354341983795166 + }, + { + "auxiliary_loss_clip": 0.01398146, + "auxiliary_loss_mlp": 0.01029306, + "balance_loss_clip": 1.23982239, + "balance_loss_mlp": 1.01061392, + "epoch": 0.9964226664662559, + "flos": 26479647964800.0, + "grad_norm": 1.9142900637593965, + "language_loss": 0.7127713, + "learning_rate": 1.3199841727074e-10, + "loss": 0.73704582, + "num_input_tokens_seen": 357677075, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.18688965, + "step": 16573, + "time_per_iteration": 2.885568857192993 + }, + { + "auxiliary_loss_clip": 0.01421204, + "auxiliary_loss_mlp": 0.01035964, + "balance_loss_clip": 1.25591803, + "balance_loss_mlp": 1.01593733, + "epoch": 0.9964827897189238, + "flos": 27458305148160.0, + "grad_norm": 2.309299391862734, + "language_loss": 0.64140701, + "learning_rate": 1.275618614968721e-10, + "loss": 0.66597867, + "num_input_tokens_seen": 357696715, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.20031738, + "step": 16574, + "time_per_iteration": 2.8985610008239746 + }, + { + "auxiliary_loss_clip": 0.01425131, + "auxiliary_loss_mlp": 0.01034447, + "balance_loss_clip": 1.26037836, + "balance_loss_mlp": 1.01521873, + "epoch": 0.9965429129715918, + "flos": 11727109082880.0, + "grad_norm": 2.1496306052284884, + "language_loss": 0.77342445, + "learning_rate": 1.2320114108654856e-10, + "loss": 0.79802024, + "num_input_tokens_seen": 357712345, + "router_z_loss_clip": 1.64648438, + "router_z_loss_mlp": 0.19238281, + "step": 16575, + "time_per_iteration": 2.809224843978882 + }, + { + "auxiliary_loss_clip": 0.01390726, + "auxiliary_loss_mlp": 0.01031615, + "balance_loss_clip": 1.23156977, + "balance_loss_mlp": 1.01279211, + "epoch": 0.9966030362242597, + "flos": 19765597063680.0, + "grad_norm": 2.9846587427359714, + "language_loss": 0.71143079, + "learning_rate": 1.1891625620474855e-10, + "loss": 0.73565412, + "num_input_tokens_seen": 357731815, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18823242, + "step": 16576, + "time_per_iteration": 2.863509178161621 + }, + { + "auxiliary_loss_clip": 0.01384904, + "auxiliary_loss_mlp": 0.01032784, + "balance_loss_clip": 1.23074245, + "balance_loss_mlp": 1.01357901, + "epoch": 0.9966631594769277, + "flos": 23925797164800.0, + "grad_norm": 1.5790313537424852, + "language_loss": 0.7326262, + "learning_rate": 1.1470720701400871e-10, + "loss": 0.75680315, + "num_input_tokens_seen": 357751640, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.1920166, + "step": 16577, + "time_per_iteration": 2.8518576622009277 + }, + { + "auxiliary_loss_clip": 0.01397947, + "auxiliary_loss_mlp": 0.01033463, + "balance_loss_clip": 1.23760796, + "balance_loss_mlp": 1.01464009, + "epoch": 0.9967232827295956, + "flos": 15567952250880.0, + "grad_norm": 3.0455610229032692, + "language_loss": 0.79107273, + "learning_rate": 1.1057399367397912e-10, + "loss": 0.81538683, + "num_input_tokens_seen": 357769850, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18811035, + "step": 16578, + "time_per_iteration": 2.823291301727295 + }, + { + "auxiliary_loss_clip": 0.01395038, + "auxiliary_loss_mlp": 0.01030959, + "balance_loss_clip": 1.2361629, + "balance_loss_mlp": 1.01322067, + "epoch": 0.9967834059822637, + "flos": 20822627520000.0, + "grad_norm": 1.7639218622557369, + "language_loss": 0.76658219, + "learning_rate": 1.0651661634142328e-10, + "loss": 0.79084218, + "num_input_tokens_seen": 357789550, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.17749023, + "step": 16579, + "time_per_iteration": 2.835808753967285 + }, + { + "auxiliary_loss_clip": 0.01412097, + "auxiliary_loss_mlp": 0.01033837, + "balance_loss_clip": 1.25088692, + "balance_loss_mlp": 1.01380956, + "epoch": 0.9968435292349316, + "flos": 36732541121280.0, + "grad_norm": 2.679600034790059, + "language_loss": 0.70266658, + "learning_rate": 1.0253507516999604e-10, + "loss": 0.72712588, + "num_input_tokens_seen": 357809525, + "router_z_loss_clip": 1.61328125, + "router_z_loss_mlp": 0.20019531, + "step": 16580, + "time_per_iteration": 3.009549379348755 + }, + { + "auxiliary_loss_clip": 0.0140181, + "auxiliary_loss_mlp": 0.01030468, + "balance_loss_clip": 1.24139857, + "balance_loss_mlp": 1.01274133, + "epoch": 0.9969036524875996, + "flos": 26772237959040.0, + "grad_norm": 1.6966015252597766, + "language_loss": 0.79821157, + "learning_rate": 9.862937031113184e-11, + "loss": 0.82253438, + "num_input_tokens_seen": 357829795, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.17724609, + "step": 16581, + "time_per_iteration": 2.9062957763671875 + }, + { + "auxiliary_loss_clip": 0.01390278, + "auxiliary_loss_mlp": 0.01030386, + "balance_loss_clip": 1.23375285, + "balance_loss_mlp": 1.01299334, + "epoch": 0.9969637757402675, + "flos": 24838070947200.0, + "grad_norm": 1.797496817652888, + "language_loss": 0.81369692, + "learning_rate": 9.479950191249031e-11, + "loss": 0.83790356, + "num_input_tokens_seen": 357851655, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.17382812, + "step": 16582, + "time_per_iteration": 2.8926329612731934 + }, + { + "auxiliary_loss_clip": 0.0138706, + "auxiliary_loss_mlp": 0.01032292, + "balance_loss_clip": 1.23246086, + "balance_loss_mlp": 1.01387453, + "epoch": 0.9970238989929355, + "flos": 23048932078080.0, + "grad_norm": 1.7024672963279959, + "language_loss": 0.61400491, + "learning_rate": 9.104547011951069e-11, + "loss": 0.63819844, + "num_input_tokens_seen": 357871205, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.18432617, + "step": 16583, + "time_per_iteration": 2.8819990158081055 + }, + { + "auxiliary_loss_clip": 0.01399315, + "auxiliary_loss_mlp": 0.01029634, + "balance_loss_clip": 1.23956358, + "balance_loss_mlp": 1.01151371, + "epoch": 0.9970840222456034, + "flos": 25309061763840.0, + "grad_norm": 1.7140377300823582, + "language_loss": 0.78340805, + "learning_rate": 8.736727507452357e-11, + "loss": 0.80769747, + "num_input_tokens_seen": 357892145, + "router_z_loss_clip": 1.59765625, + "router_z_loss_mlp": 0.18115234, + "step": 16584, + "time_per_iteration": 2.8664238452911377 + }, + { + "auxiliary_loss_clip": 0.01393075, + "auxiliary_loss_mlp": 0.01034068, + "balance_loss_clip": 1.2362448, + "balance_loss_mlp": 1.01634145, + "epoch": 0.9971441454982715, + "flos": 21625372344960.0, + "grad_norm": 1.618210996073375, + "language_loss": 0.6976645, + "learning_rate": 8.376491691697297e-11, + "loss": 0.72193593, + "num_input_tokens_seen": 357911205, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.17724609, + "step": 16585, + "time_per_iteration": 2.8620927333831787 + }, + { + "auxiliary_loss_clip": 0.01391839, + "auxiliary_loss_mlp": 0.01033468, + "balance_loss_clip": 1.23420978, + "balance_loss_mlp": 1.01483548, + "epoch": 0.9972042687509394, + "flos": 14983767648000.0, + "grad_norm": 2.894940839137588, + "language_loss": 0.82285881, + "learning_rate": 8.023839578363834e-11, + "loss": 0.84711188, + "num_input_tokens_seen": 357928190, + "router_z_loss_clip": 1.57714844, + "router_z_loss_mlp": 0.18640137, + "step": 16586, + "time_per_iteration": 2.8479483127593994 + }, + { + "auxiliary_loss_clip": 0.01403443, + "auxiliary_loss_mlp": 0.01030809, + "balance_loss_clip": 1.24305916, + "balance_loss_mlp": 1.01246226, + "epoch": 0.9972643920036074, + "flos": 25816275682560.0, + "grad_norm": 1.6613899031229435, + "language_loss": 0.79136521, + "learning_rate": 7.678771180796851e-11, + "loss": 0.81570768, + "num_input_tokens_seen": 357946985, + "router_z_loss_clip": 1.60546875, + "router_z_loss_mlp": 0.18359375, + "step": 16587, + "time_per_iteration": 2.8912594318389893 + }, + { + "auxiliary_loss_clip": 0.01411361, + "auxiliary_loss_mlp": 0.01037451, + "balance_loss_clip": 1.25063825, + "balance_loss_mlp": 1.0190804, + "epoch": 0.9973245152562754, + "flos": 23335368779520.0, + "grad_norm": 2.2102800654999966, + "language_loss": 0.73601741, + "learning_rate": 7.341286512074773e-11, + "loss": 0.76050556, + "num_input_tokens_seen": 357966720, + "router_z_loss_clip": 1.60839844, + "router_z_loss_mlp": 0.18371582, + "step": 16588, + "time_per_iteration": 2.880502223968506 + }, + { + "auxiliary_loss_clip": 0.01415396, + "auxiliary_loss_mlp": 0.01034244, + "balance_loss_clip": 1.25046158, + "balance_loss_mlp": 1.01593387, + "epoch": 0.9973846385089433, + "flos": 12173640445440.0, + "grad_norm": 2.718181350972074, + "language_loss": 0.83705145, + "learning_rate": 7.011385585031781e-11, + "loss": 0.86154789, + "num_input_tokens_seen": 357981375, + "router_z_loss_clip": 1.64746094, + "router_z_loss_mlp": 0.18310547, + "step": 16589, + "time_per_iteration": 2.816628932952881 + }, + { + "auxiliary_loss_clip": 0.0141155, + "auxiliary_loss_mlp": 0.01035526, + "balance_loss_clip": 1.24609351, + "balance_loss_mlp": 1.01592755, + "epoch": 0.9974447617616113, + "flos": 20054115025920.0, + "grad_norm": 1.9981506705331358, + "language_loss": 0.71551174, + "learning_rate": 6.689068412168986e-11, + "loss": 0.73998255, + "num_input_tokens_seen": 358000290, + "router_z_loss_clip": 1.65332031, + "router_z_loss_mlp": 0.19592285, + "step": 16590, + "time_per_iteration": 2.908003330230713 + }, + { + "auxiliary_loss_clip": 0.01408014, + "auxiliary_loss_mlp": 0.01032915, + "balance_loss_clip": 1.2456429, + "balance_loss_mlp": 1.01354373, + "epoch": 0.9975048850142793, + "flos": 32027229941760.0, + "grad_norm": 1.8417622543140382, + "language_loss": 0.63712478, + "learning_rate": 6.374335005676634e-11, + "loss": 0.66153407, + "num_input_tokens_seen": 358022075, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.19372559, + "step": 16591, + "time_per_iteration": 2.944382429122925 + }, + { + "auxiliary_loss_clip": 0.01391563, + "auxiliary_loss_mlp": 0.0103177, + "balance_loss_clip": 1.23115504, + "balance_loss_mlp": 1.01255369, + "epoch": 0.9975650082669473, + "flos": 36945943436160.0, + "grad_norm": 1.901774205437116, + "language_loss": 0.74192953, + "learning_rate": 6.067185377522933e-11, + "loss": 0.76616287, + "num_input_tokens_seen": 358043940, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.19213867, + "step": 16592, + "time_per_iteration": 2.9663896560668945 + }, + { + "auxiliary_loss_clip": 0.01403864, + "auxiliary_loss_mlp": 0.0103556, + "balance_loss_clip": 1.24385607, + "balance_loss_mlp": 1.01674843, + "epoch": 0.9976251315196152, + "flos": 16480814215680.0, + "grad_norm": 1.4712718225120045, + "language_loss": 0.85653269, + "learning_rate": 5.767619539343016e-11, + "loss": 0.88092691, + "num_input_tokens_seen": 358062720, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18811035, + "step": 16593, + "time_per_iteration": 2.838838815689087 + }, + { + "auxiliary_loss_clip": 0.0139547, + "auxiliary_loss_mlp": 0.01031455, + "balance_loss_clip": 1.23801255, + "balance_loss_mlp": 1.01357341, + "epoch": 0.9976852547722832, + "flos": 19656475309440.0, + "grad_norm": 2.012454126069002, + "language_loss": 0.70616508, + "learning_rate": 5.4756375024833656e-11, + "loss": 0.7304343, + "num_input_tokens_seen": 358081560, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.17883301, + "step": 16594, + "time_per_iteration": 4.3265392780303955 + }, + { + "auxiliary_loss_clip": 0.01409871, + "auxiliary_loss_mlp": 0.01030199, + "balance_loss_clip": 1.24752963, + "balance_loss_mlp": 1.01117289, + "epoch": 0.9977453780249511, + "flos": 20457998524800.0, + "grad_norm": 2.2252551533356257, + "language_loss": 0.73193485, + "learning_rate": 5.1912392780462113e-11, + "loss": 0.75633556, + "num_input_tokens_seen": 358099065, + "router_z_loss_clip": 1.62402344, + "router_z_loss_mlp": 0.19018555, + "step": 16595, + "time_per_iteration": 2.827493667602539 + }, + { + "auxiliary_loss_clip": 0.01178743, + "auxiliary_loss_mlp": 0.01024461, + "balance_loss_clip": 1.09158635, + "balance_loss_mlp": 1.00185931, + "epoch": 0.9978055012776191, + "flos": 65481458221440.0, + "grad_norm": 0.792229855630669, + "language_loss": 0.60304224, + "learning_rate": 4.9144248768007156e-11, + "loss": 0.62507427, + "num_input_tokens_seen": 358156095, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.22558594, + "step": 16596, + "time_per_iteration": 4.636451005935669 + }, + { + "auxiliary_loss_clip": 0.01403396, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.24274564, + "balance_loss_mlp": 1.01459837, + "epoch": 0.997865624530287, + "flos": 20641285785600.0, + "grad_norm": 1.85641297136318, + "language_loss": 0.78347909, + "learning_rate": 4.645194309227385e-11, + "loss": 0.80784285, + "num_input_tokens_seen": 358175230, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.18383789, + "step": 16597, + "time_per_iteration": 2.836442470550537 + }, + { + "auxiliary_loss_clip": 0.01399435, + "auxiliary_loss_mlp": 0.01032412, + "balance_loss_clip": 1.23872483, + "balance_loss_mlp": 1.01294494, + "epoch": 0.9979257477829551, + "flos": 29398896921600.0, + "grad_norm": 1.8146283617060635, + "language_loss": 0.8280766, + "learning_rate": 4.383547585562475e-11, + "loss": 0.85239506, + "num_input_tokens_seen": 358197075, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.19470215, + "step": 16598, + "time_per_iteration": 2.8840250968933105 + }, + { + "auxiliary_loss_clip": 0.0142017, + "auxiliary_loss_mlp": 0.01036716, + "balance_loss_clip": 1.25348186, + "balance_loss_mlp": 1.016927, + "epoch": 0.997985871035623, + "flos": 22644550886400.0, + "grad_norm": 1.809576199169239, + "language_loss": 0.64948046, + "learning_rate": 4.129484715709175e-11, + "loss": 0.67404926, + "num_input_tokens_seen": 358215925, + "router_z_loss_clip": 1.66699219, + "router_z_loss_mlp": 0.19787598, + "step": 16599, + "time_per_iteration": 2.858654260635376 + }, + { + "auxiliary_loss_clip": 0.01180519, + "auxiliary_loss_mlp": 0.01027584, + "balance_loss_clip": 1.09288085, + "balance_loss_mlp": 1.00202513, + "epoch": 0.998045994288291, + "flos": 61832498826240.0, + "grad_norm": 0.8619244498028018, + "language_loss": 0.62409866, + "learning_rate": 3.8830057093264256e-11, + "loss": 0.64617968, + "num_input_tokens_seen": 358269035, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.25585938, + "step": 16600, + "time_per_iteration": 3.198683023452759 + }, + { + "auxiliary_loss_clip": 0.01395378, + "auxiliary_loss_mlp": 0.01031235, + "balance_loss_clip": 1.23751843, + "balance_loss_mlp": 1.01281726, + "epoch": 0.998106117540959, + "flos": 19255080274560.0, + "grad_norm": 1.5956996145999227, + "language_loss": 0.79177636, + "learning_rate": 3.644110575717896e-11, + "loss": 0.81604254, + "num_input_tokens_seen": 358287680, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.18432617, + "step": 16601, + "time_per_iteration": 2.8326947689056396 + }, + { + "auxiliary_loss_clip": 0.01404637, + "auxiliary_loss_mlp": 0.01032689, + "balance_loss_clip": 1.24252868, + "balance_loss_mlp": 1.01420021, + "epoch": 0.9981662407936269, + "flos": 21116212899840.0, + "grad_norm": 1.8422130828921273, + "language_loss": 0.82738602, + "learning_rate": 3.412799323987414e-11, + "loss": 0.85175931, + "num_input_tokens_seen": 358304080, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.18493652, + "step": 16602, + "time_per_iteration": 2.878427267074585 + }, + { + "auxiliary_loss_clip": 0.01406451, + "auxiliary_loss_mlp": 0.01032002, + "balance_loss_clip": 1.24720526, + "balance_loss_mlp": 1.01292861, + "epoch": 0.998226364046295, + "flos": 24327644647680.0, + "grad_norm": 1.9545090990409062, + "language_loss": 0.63167262, + "learning_rate": 3.189071962883538e-11, + "loss": 0.65605712, + "num_input_tokens_seen": 358323670, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.1907959, + "step": 16603, + "time_per_iteration": 2.860781192779541 + }, + { + "auxiliary_loss_clip": 0.01400942, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.24061799, + "balance_loss_mlp": 1.01485372, + "epoch": 0.9982864872989629, + "flos": 23845478365440.0, + "grad_norm": 1.8614833045719852, + "language_loss": 0.71905941, + "learning_rate": 2.972928500866168e-11, + "loss": 0.74341702, + "num_input_tokens_seen": 358341980, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.19958496, + "step": 16604, + "time_per_iteration": 2.877634048461914 + }, + { + "auxiliary_loss_clip": 0.01397329, + "auxiliary_loss_mlp": 0.01028447, + "balance_loss_clip": 1.23838484, + "balance_loss_mlp": 1.01007736, + "epoch": 0.9983466105516309, + "flos": 18342354044160.0, + "grad_norm": 1.7367280129996039, + "language_loss": 0.65306723, + "learning_rate": 2.7643689461953613e-11, + "loss": 0.67732495, + "num_input_tokens_seen": 358360400, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18371582, + "step": 16605, + "time_per_iteration": 4.352726459503174 + }, + { + "auxiliary_loss_clip": 0.01392571, + "auxiliary_loss_mlp": 0.01033673, + "balance_loss_clip": 1.23534393, + "balance_loss_mlp": 1.01438451, + "epoch": 0.9984067338042988, + "flos": 17245254677760.0, + "grad_norm": 1.635113885871471, + "language_loss": 0.71749115, + "learning_rate": 2.5633933067092938e-11, + "loss": 0.74175358, + "num_input_tokens_seen": 358378990, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.19299316, + "step": 16606, + "time_per_iteration": 4.232144594192505 + }, + { + "auxiliary_loss_clip": 0.01394121, + "auxiliary_loss_mlp": 0.01034281, + "balance_loss_clip": 1.23600972, + "balance_loss_mlp": 1.01351452, + "epoch": 0.9984668570569668, + "flos": 20677644622080.0, + "grad_norm": 2.0419980330408642, + "language_loss": 0.82525891, + "learning_rate": 2.370001590090709e-11, + "loss": 0.84954298, + "num_input_tokens_seen": 358395970, + "router_z_loss_clip": 1.58007812, + "router_z_loss_mlp": 0.20739746, + "step": 16607, + "time_per_iteration": 2.8524351119995117 + }, + { + "auxiliary_loss_clip": 0.01403551, + "auxiliary_loss_mlp": 0.01035996, + "balance_loss_clip": 1.24010694, + "balance_loss_mlp": 1.01549208, + "epoch": 0.9985269803096347, + "flos": 30274495153920.0, + "grad_norm": 1.557093765990386, + "language_loss": 0.67675805, + "learning_rate": 2.184193803622669e-11, + "loss": 0.70115358, + "num_input_tokens_seen": 358417355, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.20507812, + "step": 16608, + "time_per_iteration": 2.961256742477417 + }, + { + "auxiliary_loss_clip": 0.01400924, + "auxiliary_loss_mlp": 0.01028543, + "balance_loss_clip": 1.24183714, + "balance_loss_mlp": 1.01030397, + "epoch": 0.9985871035623027, + "flos": 10568015061120.0, + "grad_norm": 1.9038565287429294, + "language_loss": 0.81254333, + "learning_rate": 2.0059699543883978e-11, + "loss": 0.83683801, + "num_input_tokens_seen": 358434345, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.18237305, + "step": 16609, + "time_per_iteration": 2.813807964324951 + }, + { + "auxiliary_loss_clip": 0.0139348, + "auxiliary_loss_mlp": 0.01035191, + "balance_loss_clip": 1.23352456, + "balance_loss_mlp": 1.01662993, + "epoch": 0.9986472268149706, + "flos": 16882254495360.0, + "grad_norm": 1.612761275562252, + "language_loss": 0.63325381, + "learning_rate": 1.8353300491158462e-11, + "loss": 0.6575405, + "num_input_tokens_seen": 358452870, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.18554688, + "step": 16610, + "time_per_iteration": 2.8661773204803467 + }, + { + "auxiliary_loss_clip": 0.01398896, + "auxiliary_loss_mlp": 0.01035133, + "balance_loss_clip": 1.23960698, + "balance_loss_mlp": 1.01545191, + "epoch": 0.9987073500676387, + "flos": 22064709784320.0, + "grad_norm": 16.767731120781075, + "language_loss": 0.68186361, + "learning_rate": 1.672274094288717e-11, + "loss": 0.70620394, + "num_input_tokens_seen": 358472210, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.19677734, + "step": 16611, + "time_per_iteration": 2.816735029220581 + }, + { + "auxiliary_loss_clip": 0.01398712, + "auxiliary_loss_mlp": 0.01034502, + "balance_loss_clip": 1.23761225, + "balance_loss_mlp": 1.01451087, + "epoch": 0.9987674733203066, + "flos": 30495272371200.0, + "grad_norm": 1.4330105718083253, + "language_loss": 0.70623183, + "learning_rate": 1.5168020961020544e-11, + "loss": 0.730564, + "num_input_tokens_seen": 358493840, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.1998291, + "step": 16612, + "time_per_iteration": 2.9174954891204834 + }, + { + "auxiliary_loss_clip": 0.01389505, + "auxiliary_loss_mlp": 0.01031761, + "balance_loss_clip": 1.23344541, + "balance_loss_mlp": 1.01336682, + "epoch": 0.9988275965729746, + "flos": 27756595987200.0, + "grad_norm": 1.5849791146017618, + "language_loss": 0.74494529, + "learning_rate": 1.3689140604400407e-11, + "loss": 0.76915789, + "num_input_tokens_seen": 358515060, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.18383789, + "step": 16613, + "time_per_iteration": 2.8861424922943115 + }, + { + "auxiliary_loss_clip": 0.01389624, + "auxiliary_loss_mlp": 0.01032459, + "balance_loss_clip": 1.22996783, + "balance_loss_mlp": 1.01315928, + "epoch": 0.9988877198256426, + "flos": 17531600889600.0, + "grad_norm": 2.1873032954509215, + "language_loss": 0.74338591, + "learning_rate": 1.2286099928981996e-11, + "loss": 0.76760674, + "num_input_tokens_seen": 358528200, + "router_z_loss_clip": 1.59667969, + "router_z_loss_mlp": 0.19299316, + "step": 16614, + "time_per_iteration": 2.801055669784546 + }, + { + "auxiliary_loss_clip": 0.01398696, + "auxiliary_loss_mlp": 0.01033891, + "balance_loss_clip": 1.23942804, + "balance_loss_mlp": 1.01443589, + "epoch": 0.9989478430783105, + "flos": 21006412473600.0, + "grad_norm": 2.065331199839602, + "language_loss": 0.73211551, + "learning_rate": 1.0958898988278065e-11, + "loss": 0.75644135, + "num_input_tokens_seen": 358548360, + "router_z_loss_clip": 1.59375, + "router_z_loss_mlp": 0.19445801, + "step": 16615, + "time_per_iteration": 2.957162857055664 + }, + { + "auxiliary_loss_clip": 0.01408518, + "auxiliary_loss_mlp": 0.01029028, + "balance_loss_clip": 1.24497175, + "balance_loss_mlp": 1.01065826, + "epoch": 0.9990079663309785, + "flos": 13378458977280.0, + "grad_norm": 3.056121369958587, + "language_loss": 0.79146641, + "learning_rate": 9.70753783247069e-12, + "loss": 0.81584185, + "num_input_tokens_seen": 358566270, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.18359375, + "step": 16616, + "time_per_iteration": 2.8193466663360596 + }, + { + "auxiliary_loss_clip": 0.0140043, + "auxiliary_loss_mlp": 0.01033065, + "balance_loss_clip": 1.24193227, + "balance_loss_mlp": 1.01364553, + "epoch": 0.9990680895836465, + "flos": 17318877246720.0, + "grad_norm": 1.8849169299726047, + "language_loss": 0.83677983, + "learning_rate": 8.532016508855378e-12, + "loss": 0.86111474, + "num_input_tokens_seen": 358584710, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.1940918, + "step": 16617, + "time_per_iteration": 2.8249504566192627 + }, + { + "auxiliary_loss_clip": 0.01393575, + "auxiliary_loss_mlp": 0.01029055, + "balance_loss_clip": 1.23639393, + "balance_loss_mlp": 1.01130497, + "epoch": 0.9991282128363145, + "flos": 24218930096640.0, + "grad_norm": 1.9948768914063952, + "language_loss": 0.79220641, + "learning_rate": 7.43233506206309e-12, + "loss": 0.81643271, + "num_input_tokens_seen": 358606750, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.1776123, + "step": 16618, + "time_per_iteration": 2.879115104675293 + }, + { + "auxiliary_loss_clip": 0.01392465, + "auxiliary_loss_mlp": 0.01032661, + "balance_loss_clip": 1.23491514, + "balance_loss_mlp": 1.01414728, + "epoch": 0.9991883360889824, + "flos": 21184315603200.0, + "grad_norm": 1.627230997868662, + "language_loss": 0.75904751, + "learning_rate": 6.408493534060255e-12, + "loss": 0.78329885, + "num_input_tokens_seen": 358624675, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.18518066, + "step": 16619, + "time_per_iteration": 2.8527066707611084 + }, + { + "auxiliary_loss_clip": 0.01386741, + "auxiliary_loss_mlp": 0.01031895, + "balance_loss_clip": 1.2310704, + "balance_loss_mlp": 1.01328635, + "epoch": 0.9992484593416504, + "flos": 19910579961600.0, + "grad_norm": 1.9238489520353774, + "language_loss": 0.87078863, + "learning_rate": 5.460491963260594e-12, + "loss": 0.89497507, + "num_input_tokens_seen": 358640715, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.18603516, + "step": 16620, + "time_per_iteration": 2.851719617843628 + }, + { + "auxiliary_loss_clip": 0.01389313, + "auxiliary_loss_mlp": 0.01027935, + "balance_loss_clip": 1.23224902, + "balance_loss_mlp": 1.010638, + "epoch": 0.9993085825943183, + "flos": 24867145370880.0, + "grad_norm": 1.7690788998381917, + "language_loss": 0.7305612, + "learning_rate": 4.58833038607942e-12, + "loss": 0.75473368, + "num_input_tokens_seen": 358659630, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.17297363, + "step": 16621, + "time_per_iteration": 2.89024019241333 + }, + { + "auxiliary_loss_clip": 0.01179163, + "auxiliary_loss_mlp": 0.01029627, + "balance_loss_clip": 1.09071612, + "balance_loss_mlp": 1.00349665, + "epoch": 0.9993687058469863, + "flos": 71319657404160.0, + "grad_norm": 0.7382580761413181, + "language_loss": 0.56546247, + "learning_rate": 3.79200883515729e-12, + "loss": 0.5875504, + "num_input_tokens_seen": 358727840, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.26171875, + "step": 16622, + "time_per_iteration": 3.567823886871338 + }, + { + "auxiliary_loss_clip": 0.01402878, + "auxiliary_loss_mlp": 0.01033021, + "balance_loss_clip": 1.24267018, + "balance_loss_mlp": 1.01430511, + "epoch": 0.9994288290996542, + "flos": 12206198718720.0, + "grad_norm": 1.9077223565094001, + "language_loss": 0.71944743, + "learning_rate": 3.071527340914315e-12, + "loss": 0.74380636, + "num_input_tokens_seen": 358744125, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.18725586, + "step": 16623, + "time_per_iteration": 2.854400634765625 + }, + { + "auxiliary_loss_clip": 0.01390473, + "auxiliary_loss_mlp": 0.01029027, + "balance_loss_clip": 1.23234987, + "balance_loss_mlp": 1.00983417, + "epoch": 0.9994889523523223, + "flos": 17897677718400.0, + "grad_norm": 1.8034415668210102, + "language_loss": 0.75078022, + "learning_rate": 2.4268859304399368e-12, + "loss": 0.77497524, + "num_input_tokens_seen": 358761420, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.19189453, + "step": 16624, + "time_per_iteration": 2.897861957550049 + }, + { + "auxiliary_loss_clip": 0.01393988, + "auxiliary_loss_mlp": 0.01029158, + "balance_loss_clip": 1.23479736, + "balance_loss_mlp": 1.00940514, + "epoch": 0.9995490756049902, + "flos": 26590262797440.0, + "grad_norm": 1.5451847888440806, + "language_loss": 0.74692374, + "learning_rate": 1.8580846286031514e-12, + "loss": 0.77115524, + "num_input_tokens_seen": 358782600, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.19750977, + "step": 16625, + "time_per_iteration": 2.9265923500061035 + }, + { + "auxiliary_loss_clip": 0.0138045, + "auxiliary_loss_mlp": 0.0103054, + "balance_loss_clip": 1.22587824, + "balance_loss_mlp": 1.01153851, + "epoch": 0.9996091988576582, + "flos": 22210145130240.0, + "grad_norm": 2.0296810382562644, + "language_loss": 0.78008509, + "learning_rate": 1.3651234567202408e-12, + "loss": 0.80419505, + "num_input_tokens_seen": 358801220, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.19006348, + "step": 16626, + "time_per_iteration": 2.8440189361572266 + }, + { + "auxiliary_loss_clip": 0.01395946, + "auxiliary_loss_mlp": 0.010303, + "balance_loss_clip": 1.23829889, + "balance_loss_mlp": 1.01222777, + "epoch": 0.9996693221103262, + "flos": 27382601318400.0, + "grad_norm": 2.281150078360747, + "language_loss": 0.82095659, + "learning_rate": 9.480024334429515e-13, + "loss": 0.84521902, + "num_input_tokens_seen": 358819190, + "router_z_loss_clip": 1.578125, + "router_z_loss_mlp": 0.18066406, + "step": 16627, + "time_per_iteration": 2.9165232181549072 + }, + { + "auxiliary_loss_clip": 0.01403332, + "auxiliary_loss_mlp": 0.01033187, + "balance_loss_clip": 1.24057233, + "balance_loss_mlp": 1.01290917, + "epoch": 0.9997294453629941, + "flos": 26881766916480.0, + "grad_norm": 2.217513637178297, + "language_loss": 0.72398841, + "learning_rate": 6.067215747584952e-13, + "loss": 0.7483536, + "num_input_tokens_seen": 358839850, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.20263672, + "step": 16628, + "time_per_iteration": 2.9542076587677 + }, + { + "auxiliary_loss_clip": 0.01392562, + "auxiliary_loss_mlp": 0.01031561, + "balance_loss_clip": 1.23247933, + "balance_loss_mlp": 1.01277328, + "epoch": 0.9997895686156621, + "flos": 23487409866240.0, + "grad_norm": 1.796385392838348, + "language_loss": 0.7607643, + "learning_rate": 3.4128089332341456e-13, + "loss": 0.78500557, + "num_input_tokens_seen": 358859805, + "router_z_loss_clip": 1.6015625, + "router_z_loss_mlp": 0.18786621, + "step": 16629, + "time_per_iteration": 4.279237747192383 + }, + { + "auxiliary_loss_clip": 0.01418061, + "auxiliary_loss_mlp": 0.01033411, + "balance_loss_clip": 1.25414956, + "balance_loss_mlp": 1.01437283, + "epoch": 0.9998496918683301, + "flos": 20233285009920.0, + "grad_norm": 1.6544386008431677, + "language_loss": 0.6107229, + "learning_rate": 1.5168039935176126e-13, + "loss": 0.63523757, + "num_input_tokens_seen": 358877900, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.19030762, + "step": 16630, + "time_per_iteration": 2.8854622840881348 + }, + { + "auxiliary_loss_clip": 0.01399146, + "auxiliary_loss_mlp": 0.01033547, + "balance_loss_clip": 1.23930359, + "balance_loss_mlp": 1.01429462, + "epoch": 0.9999098151209981, + "flos": 21662907546240.0, + "grad_norm": 2.1908451151571398, + "language_loss": 0.61663115, + "learning_rate": 3.792010017100722e-14, + "loss": 0.64095813, + "num_input_tokens_seen": 358897285, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.19262695, + "step": 16631, + "time_per_iteration": 4.266470432281494 + }, + { + "auxiliary_loss_clip": 0.01384094, + "auxiliary_loss_mlp": 0.01028967, + "balance_loss_clip": 1.22959709, + "balance_loss_mlp": 1.01159823, + "epoch": 0.999969938373666, + "flos": 11551513438080.0, + "grad_norm": 2.3658797066110053, + "language_loss": 0.73203969, + "learning_rate": 0.0, + "loss": 0.75617027, + "num_input_tokens_seen": 358911570, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.17358398, + "step": 16632, + "time_per_iteration": 2.78379225730896 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 358911570, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3998867231602115e+18, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +}